diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def --- a/clang/include/clang/Basic/BuiltinsX86.def +++ b/clang/include/clang/Basic/BuiltinsX86.def @@ -157,8 +157,8 @@ TARGET_BUILTIN(__builtin_ia32_pmulhuw, "V4sV4sV4s", "ncV:64:", "mmx,sse") TARGET_BUILTIN(__builtin_ia32_psadbw, "V4sV8cV8c", "ncV:64:", "mmx,sse") TARGET_BUILTIN(__builtin_ia32_pshufw, "V4sV4sIc", "ncV:64:", "mmx,sse") -TARGET_BUILTIN(__builtin_ia32_vec_ext_v4hi, "iV4sIi", "ncV:64:", "mmx,sse") -TARGET_BUILTIN(__builtin_ia32_vec_set_v4hi, "V4sV4siIi", "ncV:64:", "mmx,sse") +TARGET_BUILTIN(__builtin_ia32_vec_ext_v4hi, "sV4sIi", "ncV:64:", "sse") +TARGET_BUILTIN(__builtin_ia32_vec_set_v4hi, "V4sV4ssIi", "ncV:64:", "sse") // MMX+SSE2 TARGET_BUILTIN(__builtin_ia32_cvtpd2pi, "V2iV2d", "ncV:64:", "mmx,sse2") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -12097,6 +12097,7 @@ case X86::BI__builtin_ia32_vec_init_v2si: return Builder.CreateBitCast(BuildVector(Ops), llvm::Type::getX86_MMXTy(getLLVMContext())); + case X86::BI__builtin_ia32_vec_ext_v4hi: case X86::BI__builtin_ia32_vec_ext_v2si: case X86::BI__builtin_ia32_vec_ext_v16qi: case X86::BI__builtin_ia32_vec_ext_v8hi: @@ -12115,6 +12116,7 @@ // Otherwise we could just do this in the header file. return Builder.CreateExtractElement(Ops[0], Index); } + case X86::BI__builtin_ia32_vec_set_v4hi: case X86::BI__builtin_ia32_vec_set_v16qi: case X86::BI__builtin_ia32_vec_set_v8hi: case X86::BI__builtin_ia32_vec_set_v4si: diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h --- a/clang/lib/Headers/emmintrin.h +++ b/clang/lib/Headers/emmintrin.h @@ -35,7 +35,9 @@ /* Define the default attributes for the functions in this file. */ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2"), __min_vector_width__(128))) -#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse2"), __min_vector_width__(64))) + +#define __trunc64(x) (__m64)__builtin_shufflevector((__v2di)(x), __extension__ (__v2di){}, 0) +#define __anyext128(x) (__m128i)__builtin_shufflevector((__v2si)(x), __extension__ (__v2si){}, 0, 1, -1, -1) /// Adds lower double-precision values in both operands and returns the /// sum in the lower 64 bits of the result. The upper 64 bits of the result @@ -1504,10 +1506,10 @@ /// \param __a /// A 128-bit vector of [2 x double]. /// \returns A 64-bit vector of [2 x i32] containing the converted values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtpd_pi32(__m128d __a) { - return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a); + return __trunc64(__builtin_ia32_cvtpd2dq((__v2df)__a)); } /// Converts the two double-precision floating-point elements of a @@ -1524,10 +1526,10 @@ /// \param __a /// A 128-bit vector of [2 x double]. /// \returns A 64-bit vector of [2 x i32] containing the converted values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvttpd_pi32(__m128d __a) { - return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a); + return __trunc64(__builtin_ia32_cvttpd2dq((__v2df)__a)); } /// Converts the two signed 32-bit integer elements of a 64-bit vector of @@ -1541,10 +1543,10 @@ /// \param __a /// A 64-bit vector of [2 x i32]. /// \returns A 128-bit vector of [2 x double] containing the converted values. 
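/* Editorial sketch, not part of the patch: the pattern used throughout these
   headers is "widen the __m64 operand into an XMM register, run the SSE2 form
   of the instruction, keep only the low 64 bits".  For the conversions above
   this is exact: CVTPD2DQ writes the same two 32-bit results (rounded per
   MXCSR) into bits [63:0] that CVTPD2PI wrote into an MMX register, and zeroes
   bits [127:64], so __trunc64() loses nothing. */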
-static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX +static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtpi32_pd(__m64 __a) { - return __builtin_ia32_cvtpi2pd((__v2si)__a); + return (__m128d) __builtin_convertvector((__v2si)__a, __v2df); } /// Returns the low-order element of a 128-bit vector of [2 x double] as @@ -2175,10 +2177,10 @@ /// \param __b /// A 64-bit integer. /// \returns A 64-bit integer containing the sum of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_add_si64(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b); + return (__m64)(((unsigned long long)__a) + ((unsigned long long)__b)); } /// Adds the corresponding elements of two 128-bit vectors of [2 x i64], @@ -2507,10 +2509,11 @@ /// \param __b /// A 64-bit integer containing one of the source operands. /// \returns A 64-bit integer vector containing the product of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mul_su32(__m64 __a, __m64 __b) { - return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b); + return __trunc64(__builtin_ia32_pmuludq128((__v4si)__anyext128(__a), + (__v4si)__anyext128(__b))); } /// Multiplies 32-bit unsigned integer values contained in the lower @@ -2621,10 +2624,10 @@ /// A 64-bit integer vector containing the subtrahend. /// \returns A 64-bit integer vector containing the difference of the values in /// the operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sub_si64(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b); + return (__m64)((unsigned long long)__a - (unsigned long long)__b); } /// Subtracts the corresponding elements of two [2 x i64] vectors. @@ -4965,8 +4968,10 @@ #if defined(__cplusplus) } // extern "C" #endif + +#undef __anyext128 +#undef __trunc64 #undef __DEFAULT_FN_ATTRS -#undef __DEFAULT_FN_ATTRS_MMX #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) diff --git a/clang/lib/Headers/mmintrin.h b/clang/lib/Headers/mmintrin.h --- a/clang/lib/Headers/mmintrin.h +++ b/clang/lib/Headers/mmintrin.h @@ -17,8 +17,29 @@ typedef short __v4hi __attribute__((__vector_size__(8))); typedef char __v8qi __attribute__((__vector_size__(8))); +/* Unsigned types */ +typedef unsigned long long __v1du __attribute__ ((__vector_size__ (8))); +typedef unsigned int __v2su __attribute__ ((__vector_size__ (8))); +typedef unsigned short __v4hu __attribute__((__vector_size__(8))); +typedef unsigned char __v8qu __attribute__((__vector_size__(8))); + +/* We need an explicitly signed variant for char. Note that this shouldn't + * appear in the interface though. */ +typedef signed char __v8qs __attribute__((__vector_size__(8))); + +/* SSE/SSE2 types */ +typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16))); +typedef long long __v2di __attribute__ ((__vector_size__ (16))); +typedef int __v4si __attribute__((__vector_size__(16))); +typedef short __v8hi __attribute__((__vector_size__(16))); +typedef char __v16qi __attribute__((__vector_size__(16))); + /* Define the default attributes for the functions in this file. 
*/ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mmx"), __min_vector_width__(64))) +#define __DEFAULT_FN_ATTRS_SSE2 __attribute__((__always_inline__, __nodebug__, __target__("sse2"), __min_vector_width__(64))) + +#define __trunc64(x) (__m64)__builtin_shufflevector((__v2di)(x), __extension__ (__v2di){}, 0) +#define __anyext128(x) (__m128i)__builtin_shufflevector((__v2si)(x), __extension__ (__v2si){}, 0, 1, -1, -1) +#define __extract2_32(a) (__m64)__builtin_shufflevector((__v4si)(a), __extension__ (__v4si){}, 0, 2); /// Clears the MMX state by setting the state of the x87 stack registers /// to empty. @@ -44,10 +65,10 @@ /// A 32-bit integer value. /// \returns A 64-bit integer vector. The lower 32 bits contain the value of the /// parameter. The upper 32 bits are set to 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtsi32_si64(int __i) { - return (__m64)__builtin_ia32_vec_init_v2si(__i, 0); + return __extension__ (__m64)(__v2si){__i, 0}; } /// Returns the lower 32 bits of a 64-bit integer vector as a 32-bit @@ -61,10 +82,10 @@ /// A 64-bit integer vector. /// \returns A 32-bit signed integer value containing the lower 32 bits of the /// parameter. -static __inline__ int __DEFAULT_FN_ATTRS +static __inline__ int __DEFAULT_FN_ATTRS_SSE2 _mm_cvtsi64_si32(__m64 __m) { - return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0); + return ((__v2si)__m)[0]; } /// Casts a 64-bit signed integer value into a 64-bit integer vector. @@ -77,7 +98,7 @@ /// A 64-bit signed integer. /// \returns A 64-bit integer vector containing the same bitwise pattern as the /// parameter. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtsi64_m64(long long __i) { return (__m64)__i; @@ -93,7 +114,7 @@ /// A 64-bit integer vector. /// \returns A 64-bit signed integer containing the same bitwise pattern as the /// parameter. -static __inline__ long long __DEFAULT_FN_ATTRS +static __inline__ long long __DEFAULT_FN_ATTRS_SSE2 _mm_cvtm64_si64(__m64 __m) { return (long long)__m; @@ -123,10 +144,11 @@ /// [4 x i8] values are written to the upper 32 bits of the result. /// \returns A 64-bit integer vector of [8 x i8] containing the converted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_packs_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2); + return __extract2_32(__builtin_ia32_packsswb128((__v8hi)__anyext128(__m1), + (__v8hi)__anyext128(__m2))); } /// Converts 32-bit signed integers from both 64-bit integer vector @@ -153,10 +175,11 @@ /// [2 x i16] values are written to the upper 32 bits of the result. /// \returns A 64-bit integer vector of [4 x i16] containing the converted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_packs_pi32(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2); + return __extract2_32(__builtin_ia32_packssdw128((__v4si)__anyext128(__m1), + (__v4si)__anyext128(__m2))); } /// Converts 16-bit signed integers from both 64-bit integer vector @@ -183,10 +206,11 @@ /// the upper 32 bits of the result. /// \returns A 64-bit integer vector of [8 x i8] containing the converted /// values. 
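/* Editorial sketch, not part of the patch: written out without the macros,
   __anyext128() places the 64-bit value into lanes 0-1 of an XMM vector and
   leaves the upper lanes undefined (shuffle index -1), and __trunc64() keeps
   only the low 64-bit lane.  Assuming SSE2 and the types defined above, the
   round trip looks like this (hypothetical helper, for illustration only): */
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
__mmx_roundtrip_demo(__m64 __m)
{
  __m128i __wide = (__m128i)__builtin_shufflevector(
      (__v2si)__m, __extension__ (__v2si){}, 0, 1, -1, -1);  /* __anyext128 */
  return (__m64)__builtin_shufflevector(
      (__v2di)__wide, __extension__ (__v2di){}, 0);          /* __trunc64 */
}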
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_packs_pu16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2); + return __extract2_32(__builtin_ia32_packuswb128((__v8hi)__anyext128(__m1), + (__v8hi)__anyext128(__m2))); } /// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8] @@ -210,10 +234,11 @@ /// Bits [63:56] are written to bits [63:56] of the result. /// \returns A 64-bit integer vector of [8 x i8] containing the interleaved /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2); + return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2, + 4, 12, 5, 13, 6, 14, 7, 15); } /// Unpacks the upper 32 bits from two 64-bit integer vectors of @@ -233,10 +258,11 @@ /// Bits [63:48] are written to bits [63:48] of the result. /// \returns A 64-bit integer vector of [4 x i16] containing the interleaved /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2); + return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2, + 2, 6, 3, 7); } /// Unpacks the upper 32 bits from two 64-bit integer vectors of @@ -254,10 +280,10 @@ /// the upper 32 bits of the result. /// \returns A 64-bit integer vector of [2 x i32] containing the interleaved /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2); + return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 1, 3); } /// Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8] @@ -281,10 +307,11 @@ /// Bits [31:24] are written to bits [63:56] of the result. /// \returns A 64-bit integer vector of [8 x i8] containing the interleaved /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2); + return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2, + 0, 8, 1, 9, 2, 10, 3, 11); } /// Unpacks the lower 32 bits from two 64-bit integer vectors of @@ -304,10 +331,11 @@ /// Bits [31:16] are written to bits [63:48] of the result. /// \returns A 64-bit integer vector of [4 x i16] containing the interleaved /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2); + return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2, + 0, 4, 1, 5); } /// Unpacks the lower 32 bits from two 64-bit integer vectors of @@ -325,10 +353,10 @@ /// the upper 32 bits of the result. /// \returns A 64-bit integer vector of [2 x i32] containing the interleaved /// values. 
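/* Editorial note, not part of the patch: with __builtin_shufflevector the
   interleaving is spelled out in the index list.  Indices 0-3 select from the
   first operand and 4-7 from the second, so for _mm_unpacklo_pi16 above,
   {0, 4, 1, 5} turns m1 = {a0,a1,a2,a3}, m2 = {b0,b1,b2,b3} into
   {a0, b0, a1, b1}, i.e. the PUNPCKLWD result, with no builtin needed. */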
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2); + return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 0, 2); } /// Adds each 8-bit integer element of the first 64-bit integer vector @@ -346,10 +374,10 @@ /// A 64-bit integer vector of [8 x i8]. /// \returns A 64-bit integer vector of [8 x i8] containing the sums of both /// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_add_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2); + return (__m64)(((__v8qu)__m1) + ((__v8qu)__m2)); } /// Adds each 16-bit integer element of the first 64-bit integer vector @@ -367,10 +395,10 @@ /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the sums of both /// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_add_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2); + return (__m64)(((__v4hu)__m1) + ((__v4hu)__m2)); } /// Adds each 32-bit integer element of the first 64-bit integer vector @@ -388,10 +416,10 @@ /// A 64-bit integer vector of [2 x i32]. /// \returns A 64-bit integer vector of [2 x i32] containing the sums of both /// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_add_pi32(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2); + return (__m64)(((__v2su)__m1) + ((__v2su)__m2)); } /// Adds each 8-bit signed integer element of the first 64-bit integer @@ -410,10 +438,11 @@ /// A 64-bit integer vector of [8 x i8]. /// \returns A 64-bit integer vector of [8 x i8] containing the saturated sums /// of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_adds_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2); + return __trunc64(__builtin_ia32_paddsb128((__v16qi)__anyext128(__m1), + (__v16qi)__anyext128(__m2))); } /// Adds each 16-bit signed integer element of the first 64-bit integer @@ -433,10 +462,11 @@ /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the saturated sums /// of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_adds_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2); + return __trunc64(__builtin_ia32_paddsw128((__v8hi)__anyext128(__m1), + (__v8hi)__anyext128(__m2))); } /// Adds each 8-bit unsigned integer element of the first 64-bit integer @@ -455,10 +485,11 @@ /// A 64-bit integer vector of [8 x i8]. /// \returns A 64-bit integer vector of [8 x i8] containing the saturated /// unsigned sums of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_adds_pu8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2); + return __trunc64(__builtin_ia32_paddusb128((__v16qi)__anyext128(__m1), + (__v16qi)__anyext128(__m2))); } /// Adds each 16-bit unsigned integer element of the first 64-bit integer @@ -477,10 +508,11 @@ /// A 64-bit integer vector of [4 x i16]. 
/// \returns A 64-bit integer vector of [4 x i16] containing the saturated /// unsigned sums of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_adds_pu16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2); + return __trunc64(__builtin_ia32_paddusw128((__v8hi)__anyext128(__m1), + (__v8hi)__anyext128(__m2))); } /// Subtracts each 8-bit integer element of the second 64-bit integer @@ -498,10 +530,10 @@ /// A 64-bit integer vector of [8 x i8] containing the subtrahends. /// \returns A 64-bit integer vector of [8 x i8] containing the differences of /// both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sub_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2); + return (__m64)(((__v8qu)__m1) - ((__v8qu)__m2)); } /// Subtracts each 16-bit integer element of the second 64-bit integer @@ -519,10 +551,10 @@ /// A 64-bit integer vector of [4 x i16] containing the subtrahends. /// \returns A 64-bit integer vector of [4 x i16] containing the differences of /// both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sub_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2); + return (__m64)(((__v4hu)__m1) - ((__v4hu)__m2)); } /// Subtracts each 32-bit integer element of the second 64-bit integer @@ -540,10 +572,10 @@ /// A 64-bit integer vector of [2 x i32] containing the subtrahends. /// \returns A 64-bit integer vector of [2 x i32] containing the differences of /// both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sub_pi32(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2); + return (__m64)(((__v2su)__m1) - ((__v2su)__m2)); } /// Subtracts each 8-bit signed integer element of the second 64-bit @@ -563,10 +595,11 @@ /// A 64-bit integer vector of [8 x i8] containing the subtrahends. /// \returns A 64-bit integer vector of [8 x i8] containing the saturated /// differences of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_subs_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2); + return __trunc64(__builtin_ia32_psubsb128((__v16qi)__anyext128(__m1), + (__v16qi)__anyext128(__m2))); } /// Subtracts each 16-bit signed integer element of the second 64-bit @@ -586,10 +619,11 @@ /// A 64-bit integer vector of [4 x i16] containing the subtrahends. /// \returns A 64-bit integer vector of [4 x i16] containing the saturated /// differences of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_subs_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2); + return __trunc64(__builtin_ia32_psubsw128((__v8hi)__anyext128(__m1), + (__v8hi)__anyext128(__m2))); } /// Subtracts each 8-bit unsigned integer element of the second 64-bit @@ -610,10 +644,11 @@ /// A 64-bit integer vector of [8 x i8] containing the subtrahends. /// \returns A 64-bit integer vector of [8 x i8] containing the saturated /// differences of both parameters. 
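/* Editorial note, not part of the patch: the non-saturating add/sub forms cast
   to the unsigned element types (__v8qu, __v4hu, __v2su, __v1du) before using
   the C operators because unsigned wrap-around is well defined, while signed
   overflow is undefined behaviour; the resulting bit pattern is identical
   either way, so the behaviour still matches PADDB/PSUBB and friends.  For
   example, in _mm_add_pi8 a lane holding 0x7f plus 0x01 simply wraps to 0x80. */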
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_subs_pu8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2); + return __trunc64(__builtin_ia32_psubusb128((__v16qi)__anyext128(__m1), + (__v16qi)__anyext128(__m2))); } /// Subtracts each 16-bit unsigned integer element of the second 64-bit @@ -634,10 +669,11 @@ /// A 64-bit integer vector of [4 x i16] containing the subtrahends. /// \returns A 64-bit integer vector of [4 x i16] containing the saturated /// differences of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_subs_pu16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2); + return __trunc64(__builtin_ia32_psubusw128((__v8hi)__anyext128(__m1), + (__v8hi)__anyext128(__m2))); } /// Multiplies each 16-bit signed integer element of the first 64-bit @@ -661,10 +697,11 @@ /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [2 x i32] containing the sums of /// products of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_madd_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2); + return __trunc64(__builtin_ia32_pmaddwd128((__v8hi)__anyext128(__m1), + (__v8hi)__anyext128(__m2))); } /// Multiplies each 16-bit signed integer element of the first 64-bit @@ -682,10 +719,11 @@ /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the upper 16 bits /// of the products of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_mulhi_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2); + return __trunc64(__builtin_ia32_pmulhw128((__v8hi)__anyext128(__m1), + (__v8hi)__anyext128(__m2))); } /// Multiplies each 16-bit signed integer element of the first 64-bit @@ -703,10 +741,10 @@ /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the lower 16 bits /// of the products of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_mullo_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2); + return (__m64)(((__v4hu)__m1) * ((__v4hu)__m2)); } /// Left-shifts each 16-bit signed integer element of the first @@ -726,10 +764,11 @@ /// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted /// values. If \a __count is greater or equal to 16, the result is set to all /// 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sll_pi16(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count); + return __trunc64(__builtin_ia32_psllw128((__v8hi)__anyext128(__m), + (__v8hi)__anyext128(__count))); } /// Left-shifts each 16-bit signed integer element of a 64-bit integer @@ -748,10 +787,11 @@ /// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted /// values. If \a __count is greater or equal to 16, the result is set to all /// 0. 
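/* Editorial note, not part of the patch: for the _mm_sll/_mm_srl/_mm_sra forms
   that take the count as an __m64, the SSE2 shift instructions read the count
   from bits [63:0] of their second XMM operand, which are exactly the bits
   __anyext128(__count) preserves; the undefined upper lanes introduced by the
   widening are never examined, so the count needs no zero-extension. */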
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_slli_pi16(__m64 __m, int __count) { - return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count); + return __trunc64(__builtin_ia32_psllwi128((__v8hi)__anyext128(__m), + __count)); } /// Left-shifts each 32-bit signed integer element of the first @@ -771,10 +811,11 @@ /// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted /// values. If \a __count is greater or equal to 32, the result is set to all /// 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sll_pi32(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_pslld((__v2si)__m, __count); + return __trunc64(__builtin_ia32_pslld128((__v4si)__anyext128(__m), + (__v4si)__anyext128(__count))); } /// Left-shifts each 32-bit signed integer element of a 64-bit integer @@ -793,10 +834,11 @@ /// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted /// values. If \a __count is greater or equal to 32, the result is set to all /// 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_slli_pi32(__m64 __m, int __count) { - return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count); + return __trunc64(__builtin_ia32_pslldi128((__v4si)__anyext128(__m), + __count)); } /// Left-shifts the first 64-bit integer parameter by the number of bits @@ -813,10 +855,11 @@ /// A 64-bit integer vector interpreted as a single 64-bit integer. /// \returns A 64-bit integer vector containing the left-shifted value. If /// \a __count is greater or equal to 64, the result is set to 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sll_si64(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psllq((__v1di)__m, __count); + return __trunc64(__builtin_ia32_psllq128((__v2di)__anyext128(__m), + __anyext128(__count))); } /// Left-shifts the first parameter, which is a 64-bit integer, by the @@ -833,10 +876,11 @@ /// A 32-bit integer value. /// \returns A 64-bit integer vector containing the left-shifted value. If /// \a __count is greater or equal to 64, the result is set to 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_slli_si64(__m64 __m, int __count) { - return (__m64)__builtin_ia32_psllqi((__v1di)__m, __count); + return __trunc64(__builtin_ia32_psllqi128((__v2di)__anyext128(__m), + __count)); } /// Right-shifts each 16-bit integer element of the first parameter, @@ -857,10 +901,11 @@ /// A 64-bit integer vector interpreted as a single 64-bit integer. /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sra_pi16(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count); + return __trunc64(__builtin_ia32_psraw128((__v8hi)__anyext128(__m), + (__v8hi)__anyext128(__count))); } /// Right-shifts each 16-bit integer element of a 64-bit integer vector @@ -880,10 +925,11 @@ /// A 32-bit integer value. /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted /// values. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srai_pi16(__m64 __m, int __count) { - return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count); + return __trunc64(__builtin_ia32_psrawi128((__v8hi)__anyext128(__m), + __count)); } /// Right-shifts each 32-bit integer element of the first parameter, @@ -904,10 +950,11 @@ /// A 64-bit integer vector interpreted as a single 64-bit integer. /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sra_pi32(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psrad((__v2si)__m, __count); + return __trunc64(__builtin_ia32_psrad128((__v4si)__anyext128(__m), + (__v4si)__anyext128(__count))); } /// Right-shifts each 32-bit integer element of a 64-bit integer vector @@ -927,10 +974,11 @@ /// A 32-bit integer value. /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srai_pi32(__m64 __m, int __count) { - return (__m64)__builtin_ia32_psradi((__v2si)__m, __count); + return __trunc64(__builtin_ia32_psradi128((__v4si)__anyext128(__m), + __count)); } /// Right-shifts each 16-bit integer element of the first parameter, @@ -950,10 +998,11 @@ /// A 64-bit integer vector interpreted as a single 64-bit integer. /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srl_pi16(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count); + return __trunc64(__builtin_ia32_psrlw128((__v8hi)__anyext128(__m), + (__v8hi)__anyext128(__count))); } /// Right-shifts each 16-bit integer element of a 64-bit integer vector @@ -972,10 +1021,11 @@ /// A 32-bit integer value. /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srli_pi16(__m64 __m, int __count) { - return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count); + return __trunc64(__builtin_ia32_psrlwi128((__v8hi)__anyext128(__m), + __count)); } /// Right-shifts each 32-bit integer element of the first parameter, @@ -995,10 +1045,11 @@ /// A 64-bit integer vector interpreted as a single 64-bit integer. /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srl_pi32(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psrld((__v2si)__m, __count); + return __trunc64(__builtin_ia32_psrld128((__v4si)__anyext128(__m), + (__v4si)__anyext128(__count))); } /// Right-shifts each 32-bit integer element of a 64-bit integer vector @@ -1017,10 +1068,11 @@ /// A 32-bit integer value. /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted /// values. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srli_pi32(__m64 __m, int __count) { - return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count); + return __trunc64(__builtin_ia32_psrldi128((__v4si)__anyext128(__m), + __count)); } /// Right-shifts the first 64-bit integer parameter by the number of bits @@ -1037,10 +1089,11 @@ /// \param __count /// A 64-bit integer vector interpreted as a single 64-bit integer. /// \returns A 64-bit integer vector containing the right-shifted value. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srl_si64(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psrlq((__v1di)__m, __count); + return __trunc64(__builtin_ia32_psrlq128((__v2di)__anyext128(__m), + __anyext128(__count))); } /// Right-shifts the first parameter, which is a 64-bit integer, by the @@ -1058,10 +1111,11 @@ /// \param __count /// A 32-bit integer value. /// \returns A 64-bit integer vector containing the right-shifted value. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srli_si64(__m64 __m, int __count) { - return (__m64)__builtin_ia32_psrlqi((__v1di)__m, __count); + return __trunc64(__builtin_ia32_psrlqi128((__v2di)__anyext128(__m), + __count)); } /// Performs a bitwise AND of two 64-bit integer vectors. @@ -1076,10 +1130,10 @@ /// A 64-bit integer vector. /// \returns A 64-bit integer vector containing the bitwise AND of both /// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_and_si64(__m64 __m1, __m64 __m2) { - return __builtin_ia32_pand((__v1di)__m1, (__v1di)__m2); + return (__m64)(((__v1du)__m1) & ((__v1du)__m2)); } /// Performs a bitwise NOT of the first 64-bit integer vector, and then @@ -1097,10 +1151,10 @@ /// A 64-bit integer vector. /// \returns A 64-bit integer vector containing the bitwise AND of the second /// parameter and the one's complement of the first parameter. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_andnot_si64(__m64 __m1, __m64 __m2) { - return __builtin_ia32_pandn((__v1di)__m1, (__v1di)__m2); + return (__m64)(~((__v1du)__m1) & ((__v1du)__m2)); } /// Performs a bitwise OR of two 64-bit integer vectors. @@ -1115,10 +1169,10 @@ /// A 64-bit integer vector. /// \returns A 64-bit integer vector containing the bitwise OR of both /// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_or_si64(__m64 __m1, __m64 __m2) { - return __builtin_ia32_por((__v1di)__m1, (__v1di)__m2); + return (__m64)(((__v1du)__m1) | ((__v1du)__m2)); } /// Performs a bitwise exclusive OR of two 64-bit integer vectors. @@ -1133,10 +1187,10 @@ /// A 64-bit integer vector. /// \returns A 64-bit integer vector containing the bitwise exclusive OR of both /// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_xor_si64(__m64 __m1, __m64 __m2) { - return __builtin_ia32_pxor((__v1di)__m1, (__v1di)__m2); + return (__m64)(((__v1du)__m1) ^ ((__v1du)__m2)); } /// Compares the 8-bit integer elements of two 64-bit integer vectors of @@ -1155,10 +1209,10 @@ /// A 64-bit integer vector of [8 x i8]. /// \returns A 64-bit integer vector of [8 x i8] containing the comparison /// results. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2); + return (__m64)(((__v8qi)__m1) == ((__v8qi)__m2)); } /// Compares the 16-bit integer elements of two 64-bit integer vectors of @@ -1177,10 +1231,10 @@ /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the comparison /// results. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2); + return (__m64)(((__v4hi)__m1) == ((__v4hi)__m2)); } /// Compares the 32-bit integer elements of two 64-bit integer vectors of @@ -1199,10 +1253,10 @@ /// A 64-bit integer vector of [2 x i32]. /// \returns A 64-bit integer vector of [2 x i32] containing the comparison /// results. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2); + return (__m64)(((__v2si)__m1) == ((__v2si)__m2)); } /// Compares the 8-bit integer elements of two 64-bit integer vectors of @@ -1221,10 +1275,12 @@ /// A 64-bit integer vector of [8 x i8]. /// \returns A 64-bit integer vector of [8 x i8] containing the comparison /// results. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2); + /* This function always performs a signed comparison, but __v8qi is a char + which may be signed or unsigned, so use __v8qs. */ + return (__m64)((__v8qs)__m1 > (__v8qs)__m2); } /// Compares the 16-bit integer elements of two 64-bit integer vectors of @@ -1243,10 +1299,10 @@ /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the comparison /// results. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2); + return (__m64)((__v4hi)__m1 > (__v4hi)__m2); } /// Compares the 32-bit integer elements of two 64-bit integer vectors of @@ -1265,10 +1321,10 @@ /// A 64-bit integer vector of [2 x i32]. /// \returns A 64-bit integer vector of [2 x i32] containing the comparison /// results. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2); + return (__m64)((__v2si)__m1 > (__v2si)__m2); } /// Constructs a 64-bit integer vector initialized to zero. @@ -1278,7 +1334,7 @@ /// This intrinsic corresponds to the PXOR instruction. /// /// \returns An initialized 64-bit integer vector with all elements set to zero. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_setzero_si64(void) { return __extension__ (__m64){ 0LL }; @@ -1299,10 +1355,10 @@ /// A 32-bit integer value used to initialize the lower 32 bits of the /// result. /// \returns An initialized 64-bit integer vector. 
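/* Editorial sketch, not part of the patch: the C vector comparisons above
   yield 0 / all-ones lanes, the same masks PCMPEQ/PCMPGT produced (__v8qs is
   used for the byte compare because plain char, and hence __v8qi, may be
   unsigned under -funsigned-char).  A hypothetical signed-byte max written
   against this header, assuming SSE2, shows how such masks combine with the
   bitwise helpers: */
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
__mmx_max_pi8_demo(__m64 __a, __m64 __b)
{
  __m64 __mask = _mm_cmpgt_pi8(__a, __b);            /* 0xff where __a > __b */
  return _mm_or_si64(_mm_and_si64(__mask, __a),      /* __a under the mask   */
                     _mm_andnot_si64(__mask, __b));  /* __b elsewhere        */
}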
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_set_pi32(int __i1, int __i0) { - return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1); + return __extension__ (__m64)(__v2si){__i0, __i1}; } /// Constructs a 64-bit integer vector initialized with the specified @@ -1322,10 +1378,10 @@ /// \param __s0 /// A 16-bit integer value used to initialize bits [15:0] of the result. /// \returns An initialized 64-bit integer vector. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_set_pi16(short __s3, short __s2, short __s1, short __s0) { - return (__m64)__builtin_ia32_vec_init_v4hi(__s0, __s1, __s2, __s3); + return __extension__ (__m64)(__v4hi){__s0, __s1, __s2, __s3}; } /// Constructs a 64-bit integer vector initialized with the specified @@ -1353,12 +1409,12 @@ /// \param __b0 /// An 8-bit integer value used to initialize bits [7:0] of the result. /// \returns An initialized 64-bit integer vector. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0) { - return (__m64)__builtin_ia32_vec_init_v8qi(__b0, __b1, __b2, __b3, - __b4, __b5, __b6, __b7); + return __extension__ (__m64)(__v8qi){__b0, __b1, __b2, __b3, + __b4, __b5, __b6, __b7}; } /// Constructs a 64-bit integer vector of [2 x i32], with each of the @@ -1374,7 +1430,7 @@ /// A 32-bit integer value used to initialize each vector element of the /// result. /// \returns An initialized 64-bit integer vector of [2 x i32]. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_set1_pi32(int __i) { return _mm_set_pi32(__i, __i); @@ -1393,7 +1449,7 @@ /// A 16-bit integer value used to initialize each vector element of the /// result. /// \returns An initialized 64-bit integer vector of [4 x i16]. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_set1_pi16(short __w) { return _mm_set_pi16(__w, __w, __w, __w); @@ -1411,7 +1467,7 @@ /// An 8-bit integer value used to initialize each vector element of the /// result. /// \returns An initialized 64-bit integer vector of [8 x i8]. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_set1_pi8(char __b) { return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b); @@ -1432,7 +1488,7 @@ /// A 32-bit integer value used to initialize the upper 32 bits of the /// result. /// \returns An initialized 64-bit integer vector. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_setr_pi32(int __i0, int __i1) { return _mm_set_pi32(__i1, __i0); @@ -1455,7 +1511,7 @@ /// \param __w3 /// A 16-bit integer value used to initialize bits [63:48] of the result. /// \returns An initialized 64-bit integer vector. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) { return _mm_set_pi16(__w3, __w2, __w1, __w0); @@ -1486,14 +1542,17 @@ /// \param __b7 /// An 8-bit integer value used to initialize bits [63:56] of the result. /// \returns An initialized 64-bit integer vector. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7) { return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0); } -#undef __DEFAULT_FN_ATTRS +#undef __extract2_32 +#undef __anyext128 +#undef __trunc64 +#undef __DEFAULT_FN_ATTRS_SSE2 /* Aliases for compatibility. */ #define _m_empty _mm_empty diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h --- a/clang/lib/Headers/tmmintrin.h +++ b/clang/lib/Headers/tmmintrin.h @@ -14,7 +14,10 @@ /* Define the default attributes for the functions in this file. */ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3"), __min_vector_width__(64))) -#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,ssse3"), __min_vector_width__(64))) + +#define __trunc64(x) (__m64)__builtin_shufflevector((__v2di)(x), __extension__ (__v2di){}, 0) +#define __anyext128(x) (__m128i)__builtin_shufflevector((__v2si)(x), __extension__ (__v2si){}, 0, 1, -1, -1) +#define __extract2_32(a) (__m64)__builtin_shufflevector((__v4si)(a), __extension__ (__v4si){}, 0, 2); /// Computes the absolute value of each of the packed 8-bit signed /// integers in the source operand and stores the 8-bit unsigned integer @@ -28,10 +31,10 @@ /// A 64-bit vector of [8 x i8]. /// \returns A 64-bit integer vector containing the absolute values of the /// elements in the operand. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_abs_pi8(__m64 __a) { - return (__m64)__builtin_ia32_pabsb((__v8qi)__a); + return __trunc64(__builtin_ia32_pabsb128((__v16qi)__anyext128(__a))); } /// Computes the absolute value of each of the packed 8-bit signed @@ -64,10 +67,10 @@ /// A 64-bit vector of [4 x i16]. /// \returns A 64-bit integer vector containing the absolute values of the /// elements in the operand. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_abs_pi16(__m64 __a) { - return (__m64)__builtin_ia32_pabsw((__v4hi)__a); + return __trunc64(__builtin_ia32_pabsw128((__v8hi)__anyext128(__a))); } /// Computes the absolute value of each of the packed 16-bit signed @@ -100,10 +103,10 @@ /// A 64-bit vector of [2 x i32]. /// \returns A 64-bit integer vector containing the absolute values of the /// elements in the operand. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_abs_pi32(__m64 __a) { - return (__m64)__builtin_ia32_pabsd((__v2si)__a); + return __trunc64(__builtin_ia32_pabsd128((__v4si)__anyext128(__a))); } /// Computes the absolute value of each of the packed 32-bit signed @@ -168,7 +171,10 @@ /// \returns A 64-bit integer vector containing the concatenated right-shifted /// value. #define _mm_alignr_pi8(a, b, n) \ - (__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)) + (__m64)__builtin_shufflevector( \ + __builtin_ia32_psrldqi128_byteshift( \ + __builtin_shufflevector((__v1di)(a), (__v1di)(b), 1, 0), \ + (n)), __extension__ (__v2di){}, 0) /// Horizontally adds the adjacent pairs of values contained in 2 packed /// 128-bit vectors of [8 x i16]. @@ -233,10 +239,11 @@ /// destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both /// operands. 
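/* Editorial note, not part of the patch: the _mm_alignr_pi8 macro above builds
   a 128-bit value with b in the low 8 bytes and a in the high 8 bytes (the
   inner shuffle of two __v1di values), byte-shifts the whole thing right by n
   with PSRLDQ, and keeps the low 64 bits.  For n == 3 the byte stream
   [b0..b7, a0..a7] becomes [b3..b7, a0, a1, a2], which is exactly what
   PALIGNR mm, mm, 3 produced from the a:b concatenation. */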
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_hadd_pi16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b); + return __extract2_32(__builtin_ia32_phaddw128((__v8hi)__anyext128(__a), + (__v8hi)__anyext128(__b))); } /// Horizontally adds the adjacent pairs of values contained in 2 packed @@ -256,10 +263,11 @@ /// destination. /// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both /// operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_hadd_pi32(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b); + return __extract2_32(__builtin_ia32_phaddd128((__v4si)__anyext128(__a), + (__v4si)__anyext128(__b))); } /// Horizontally adds the adjacent pairs of values contained in 2 packed @@ -306,10 +314,11 @@ /// destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated /// sums of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_hadds_pi16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b); + return __extract2_32(__builtin_ia32_phaddsw128((__v8hi)__anyext128(__a), + (__v8hi)__anyext128(__b))); } /// Horizontally subtracts the adjacent pairs of values contained in 2 @@ -375,10 +384,11 @@ /// the destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal differences /// of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_hsub_pi16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b); + return __extract2_32(__builtin_ia32_phsubw128((__v8hi)__anyext128(__a), + (__v8hi)__anyext128(__b))); } /// Horizontally subtracts the adjacent pairs of values contained in 2 @@ -398,10 +408,11 @@ /// the destination. /// \returns A 64-bit vector of [2 x i32] containing the horizontal differences /// of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_hsub_pi32(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b); + return __extract2_32(__builtin_ia32_phsubd128((__v4si)__anyext128(__a), + (__v4si)__anyext128(__b))); } /// Horizontally subtracts the adjacent pairs of values contained in 2 @@ -448,10 +459,11 @@ /// the destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated /// differences of both operands. 
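/* Editorial note, not part of the patch: for the horizontal operations the
   lanes of interest are not contiguous.  With both __m64 inputs widened into
   the low halves of XMM registers, PHADDW yields, viewed as 32-bit lanes,
   {a0+a1, a2+a3}, {undef}, {b0+b1, b2+b3}, {undef}; __extract2_32 therefore
   picks dwords 0 and 2 to reassemble the 64-bit result
   {a0+a1, a2+a3, b0+b1, b2+b3} that the MMX PHADDW produced. */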
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_hsubs_pi16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b); + return __extract2_32(__builtin_ia32_phsubsw128((__v8hi)__anyext128(__a), + (__v8hi)__anyext128(__b))); } /// Multiplies corresponding pairs of packed 8-bit unsigned integer @@ -512,10 +524,11 @@ /// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n /// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n /// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_maddubs_pi16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b); + return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__anyext128(__a), + (__v16qi)__anyext128(__b))); } /// Multiplies packed 16-bit signed integer values, truncates the 32-bit @@ -552,10 +565,11 @@ /// A 64-bit vector of [4 x i16] containing one of the source operands. /// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled /// products of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mulhrs_pi16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b); + return __trunc64(__builtin_ia32_pmulhrsw128((__v8hi)__anyext128(__a), + (__v8hi)__anyext128(__b))); } /// Copies the 8-bit integers from a 128-bit integer vector to the @@ -601,12 +615,15 @@ /// 1: Clear the corresponding byte in the destination. \n /// 0: Copy the selected source byte to the corresponding byte in the /// destination. \n -/// Bits [3:0] select the source byte to be copied. +/// Bits [2:0] select the source byte to be copied. /// \returns A 64-bit integer vector containing the copied or cleared values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_shuffle_pi8(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b); + return __trunc64(__builtin_ia32_pshufb128( + (__v16qi)__builtin_shufflevector( + (__v2si)(__a), __extension__ (__v2si){}, 0, 1, 0, 1), + (__v16qi)__anyext128(__b))); } /// For each 8-bit integer in the first source operand, perform one of @@ -707,10 +724,11 @@ /// A 64-bit integer vector containing control bytes corresponding to /// positions in the destination. /// \returns A 64-bit integer vector containing the resultant values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sign_pi8(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b); + return __trunc64(__builtin_ia32_psignb128((__v16qi)__anyext128(__a), + (__v16qi)__anyext128(__b))); } /// For each 16-bit integer in the first source operand, perform one of @@ -733,10 +751,11 @@ /// A 64-bit integer vector containing control words corresponding to /// positions in the destination. /// \returns A 64-bit integer vector containing the resultant values. 
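/* Editorial note, not part of the patch: in _mm_shuffle_pi8 above, the first
   operand is broadcast into both halves of the XMM source (__v2si indices
   0, 1, 0, 1) rather than merely any-extended.  PSHUFB on XMM uses control
   bits [3:0] to index 16 source bytes, while the MMX form only honours bits
   [2:0]; with the 8 source bytes duplicated, a control index of 8+k selects
   the copy of byte k, so a stray bit 3 cannot change the result. */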
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sign_pi16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b); + return __trunc64(__builtin_ia32_psignw128((__v8hi)__anyext128(__a), + (__v8hi)__anyext128(__b))); } /// For each 32-bit integer in the first source operand, perform one of @@ -759,13 +778,16 @@ /// A 64-bit integer vector containing two control doublewords corresponding /// to positions in the destination. /// \returns A 64-bit integer vector containing the resultant values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sign_pi32(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b); + return __trunc64(__builtin_ia32_psignd128((__v4si)__anyext128(__a), + (__v4si)__anyext128(__b))); } +#undef __extract2_32 +#undef __anyext128 +#undef __trunc64 #undef __DEFAULT_FN_ATTRS -#undef __DEFAULT_FN_ATTRS_MMX #endif /* __TMMINTRIN_H */ diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h --- a/clang/lib/Headers/xmmintrin.h +++ b/clang/lib/Headers/xmmintrin.h @@ -29,7 +29,12 @@ /* Define the default attributes for the functions in this file. */ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse"), __min_vector_width__(128))) -#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse"), __min_vector_width__(64))) +#define __DEFAULT_FN_ATTRS_SSE2 __attribute__((__always_inline__, __nodebug__, __target__("sse2"), __min_vector_width__(64))) + +#define __trunc64(x) (__m64)__builtin_shufflevector((__v2di)(x), __extension__ (__v2di){}, 0) +#define __zext128(x) (__m128i)__builtin_shufflevector((__v2si)(x), __extension__ (__v2si){}, 0, 1, 2, 3) +#define __anyext128(x) (__m128i)__builtin_shufflevector((__v2si)(x), __extension__ (__v2si){}, 0, 1, -1, -1) +#define __zeroupper64(x) (__m128i)__builtin_shufflevector((__v4si)(x), __extension__ (__v4si){}, 0, 1, 4, 5) /// Adds the 32-bit float values in the low-order bits of the operands. /// @@ -1354,10 +1359,10 @@ /// \param __a /// A 128-bit vector of [4 x float]. /// \returns A 64-bit integer vector containing the converted values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtps_pi32(__m128 __a) { - return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a); + return __trunc64(__builtin_ia32_cvtps2dq((__v4sf)__zeroupper64(__a))); } /// Converts two low-order float values in a 128-bit vector of @@ -1370,7 +1375,7 @@ /// \param __a /// A 128-bit vector of [4 x float]. /// \returns A 64-bit integer vector containing the converted values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvt_ps2pi(__m128 __a) { return _mm_cvtps_pi32(__a); @@ -1447,10 +1452,10 @@ /// \param __a /// A 128-bit vector of [4 x float]. /// \returns A 64-bit integer vector containing the converted values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvttps_pi32(__m128 __a) { - return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a); + return __trunc64(__builtin_ia32_cvttps2dq((__v4sf)__zeroupper64(__a))); } /// Converts two low-order float values in a 128-bit vector of [4 x @@ -1464,7 +1469,7 @@ /// \param __a /// A 128-bit vector of [4 x float]. /// \returns A 64-bit integer vector containing the converted values. 
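/* Editorial note, not part of the patch: three widening helpers are defined
   here because the safe choice depends on the instruction.  __anyext128 leaves
   the upper lanes undefined and is used when they cannot influence the kept
   low 64 bits.  __zext128 zeroes them where defined upper lanes are wanted:
   the high mask bits of PMOVMSKB must read as zero, and _mm_sad_pu8 likewise
   zero-extends so the unused upper SAD sum stays well defined.  __zeroupper64
   keeps the two low floats and zeroes the rest so CVTPS2DQ/CVTTPS2DQ never
   convert garbage lanes, which could otherwise raise spurious floating-point
   exceptions. */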
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtt_ps2pi(__m128 __a) { return _mm_cvttps_pi32(__a); @@ -1559,10 +1564,13 @@ /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the /// converted value of the second operand. The upper 64 bits are copied from /// the upper 64 bits of the first operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtpi32_ps(__m128 __a, __m64 __b) { - return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b); + return (__m128)__builtin_shufflevector( + (__v4sf)__a, + __builtin_convertvector((__v4si)__zext128(__b), __v4sf), + 4, 5, 2, 3); } /// Converts two elements of a 64-bit vector of [2 x i32] into two @@ -1582,7 +1590,7 @@ /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the /// converted value from the second operand. The upper 64 bits are copied /// from the upper 64 bits of the first operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvt_pi2ps(__m128 __a, __m64 __b) { return _mm_cvtpi32_ps(__a, __b); @@ -2116,10 +2124,10 @@ /// A pointer to an aligned memory location used to store the register value. /// \param __a /// A 64-bit integer containing the value to be stored. -static __inline__ void __DEFAULT_FN_ATTRS_MMX +static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pi(__m64 *__p, __m64 __a) { - __builtin_ia32_movntq(__p, __a); + __builtin_nontemporal_store(__a, __p); } /// Moves packed float values from a 128-bit vector of [4 x float] to a @@ -2181,7 +2189,7 @@ /// 3: Bits [63:48] are copied to the destination. /// \returns A 16-bit integer containing the extracted 16 bits of packed data. #define _mm_extract_pi16(a, n) \ - (int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n) + (int)(unsigned short)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n) /// Copies data from the 64-bit vector of [4 x i16] to the destination, /// and inserts the lower 16-bits of an integer operand at the 16-bit offset @@ -2227,10 +2235,11 @@ /// \param __b /// A 64-bit integer vector containing one of the source operands. /// \returns A 64-bit integer vector containing the comparison results. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_max_pi16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b); + return __trunc64(__builtin_ia32_pmaxsw128((__v8hi)__anyext128(__a), + (__v8hi)__anyext128(__b))); } /// Compares each of the corresponding packed 8-bit unsigned integer @@ -2246,10 +2255,11 @@ /// \param __b /// A 64-bit integer vector containing one of the source operands. /// \returns A 64-bit integer vector containing the comparison results. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_max_pu8(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b); + return __trunc64(__builtin_ia32_pmaxub128((__v16qi)__anyext128(__a), + (__v16qi)__anyext128(__b))); } /// Compares each of the corresponding packed 16-bit integer values of @@ -2265,10 +2275,11 @@ /// \param __b /// A 64-bit integer vector containing one of the source operands. /// \returns A 64-bit integer vector containing the comparison results. 
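/* Editorial note, not part of the patch: with the builtin's return type
   narrowed to short (prototype "sV4sIi"), the extra (unsigned short) cast in
   _mm_extract_pi16 above zero-extends the selected field, matching PEXTRW,
   which always zero-extends into the 32-bit destination.  For example,
   _mm_extract_pi16(_mm_set_pi16(0, 0, 0, -1), 0) is 0xFFFF (65535), not -1. */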
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_min_pi16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b); + return __trunc64(__builtin_ia32_pminsw128((__v8hi)__anyext128(__a), + (__v8hi)__anyext128(__b))); } /// Compares each of the corresponding packed 8-bit unsigned integer @@ -2284,10 +2295,11 @@ /// \param __b /// A 64-bit integer vector containing one of the source operands. /// \returns A 64-bit integer vector containing the comparison results. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_min_pu8(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b); + return __trunc64(__builtin_ia32_pminub128((__v16qi)__anyext128(__a), + (__v16qi)__anyext128(__b))); } /// Takes the most significant bit from each 8-bit element in a 64-bit @@ -2302,10 +2314,10 @@ /// A 64-bit integer vector containing the values with bits to be extracted. /// \returns The most significant bit from each 8-bit element in \a __a, /// written to bits [7:0]. -static __inline__ int __DEFAULT_FN_ATTRS_MMX +static __inline__ int __DEFAULT_FN_ATTRS_SSE2 _mm_movemask_pi8(__m64 __a) { - return __builtin_ia32_pmovmskb((__v8qi)__a); + return __builtin_ia32_pmovmskb128((__v16qi)__zext128(__a)); } /// Multiplies packed 16-bit unsigned integer values and writes the @@ -2321,10 +2333,11 @@ /// \param __b /// A 64-bit integer vector containing one of the source operands. /// \returns A 64-bit integer vector containing the products of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_mulhi_pu16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b); + return __trunc64(__builtin_ia32_pmulhuw128((__v8hi)__anyext128(__a), + (__v8hi)__anyext128(__b))); } /// Shuffles the 4 16-bit integers from a 64-bit integer vector to the @@ -2359,7 +2372,9 @@ /// 11: assigned from bits [63:48] of \a a. /// \returns A 64-bit integer vector containing the shuffled values. #define _mm_shuffle_pi16(a, n) \ - (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)) + (__m64)__builtin_shufflevector((__v4hi)(__m64)(a), __extension__ (__v4hi){}, \ + (n) & 0x3, ((n) >> 2) & 0x3, \ + ((n) >> 4) & 0x3, ((n) >> 6) & 0x3) /// Conditionally copies the values from each 8-bit element in the first /// 64-bit integer vector operand to the specified memory location, as @@ -2384,10 +2399,25 @@ /// A pointer to a 64-bit memory location that will receive the conditionally /// copied integer values. The address of the memory location does not have /// to be aligned. -static __inline__ void __DEFAULT_FN_ATTRS_MMX +static __inline__ void __DEFAULT_FN_ATTRS_SSE2 _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p) { - __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p); + // This is complex, because we need to support the case where __p is pointing + // within the last 15 to 8 bytes of a page. In that case, using a 128-bit + // write might cause a trap where a 64-bit maskmovq would not. (Memory + // locations not selected by the mask bits might still cause traps.) + __m128i __d128 = __anyext128(__d); + __m128i __n128 = __zext128(__n); + if (((__SIZE_TYPE__)__p & 0xfff) >= 4096-15 && + ((__SIZE_TYPE__)__p & 0xfff) <= 4096-8) { + // If there's a risk of spurious trap due to a 128-bit write, back up the + // pointer by 8 bytes and shift values in registers to match. 
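    // (Editorial elaboration, not part of the patch: the 16-byte store covers
    // [__p, __p+16), so only page offsets in [4096-15, 4096-8] can make it
    // touch a page that the legitimate 8-byte window [__p, __p+8) would not.
    // After __p -= 8 the 16-byte window becomes [__p-8, __p+8), which ends on
    // or before the page boundary, and the 8-byte left shift moves the real
    // data and mask into the upper half, leaving zeroed mask bytes over the
    // extra low addresses so they are never written.)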
+ __p -= 8; + __d128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__d128, 8); + __n128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__n128, 8); + } + + __builtin_ia32_maskmovdqu((__v16qi)__d128, (__v16qi)__n128, __p); } /// Computes the rounded averages of the packed unsigned 8-bit integer @@ -2403,10 +2433,11 @@ /// \param __b /// A 64-bit integer vector containing one of the source operands. /// \returns A 64-bit integer vector containing the averages of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_avg_pu8(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b); + return __trunc64(__builtin_ia32_pavgb128((__v16qi)__anyext128(__a), + (__v16qi)__anyext128(__b))); } /// Computes the rounded averages of the packed unsigned 16-bit integer @@ -2422,10 +2453,11 @@ /// \param __b /// A 64-bit integer vector containing one of the source operands. /// \returns A 64-bit integer vector containing the averages of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_avg_pu16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b); + return __trunc64(__builtin_ia32_pavgw128((__v8hi)__anyext128(__a), + (__v8hi)__anyext128(__b))); } /// Subtracts the corresponding 8-bit unsigned integer values of the two @@ -2444,10 +2476,11 @@ /// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the /// sets of absolute differences between both operands. The upper bits are /// cleared. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sad_pu8(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b); + return __trunc64(__builtin_ia32_psadbw128((__v16qi)__zext128(__a), + (__v16qi)__zext128(__b))); } #if defined(__cplusplus) @@ -2725,22 +2758,10 @@ /// from the corresponding elements in this operand. /// \returns A 128-bit vector of [4 x float] containing the copied and converted /// values from the operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtpi16_ps(__m64 __a) { - __m64 __b, __c; - __m128 __r; - - __b = _mm_setzero_si64(); - __b = _mm_cmpgt_pi16(__b, __a); - __c = _mm_unpackhi_pi16(__a, __b); - __r = _mm_setzero_ps(); - __r = _mm_cvtpi32_ps(__r, __c); - __r = _mm_movelh_ps(__r, __r); - __c = _mm_unpacklo_pi16(__a, __b); - __r = _mm_cvtpi32_ps(__r, __c); - - return __r; + return __builtin_convertvector((__v4hi)__a, __v4sf); } /// Converts a 64-bit vector of 16-bit unsigned integer values into a @@ -2755,21 +2776,10 @@ /// destination are copied from the corresponding elements in this operand. /// \returns A 128-bit vector of [4 x float] containing the copied and converted /// values from the operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtpu16_ps(__m64 __a) { - __m64 __b, __c; - __m128 __r; - - __b = _mm_setzero_si64(); - __c = _mm_unpackhi_pi16(__a, __b); - __r = _mm_setzero_ps(); - __r = _mm_cvtpi32_ps(__r, __c); - __r = _mm_movelh_ps(__r, __r); - __c = _mm_unpacklo_pi16(__a, __b); - __r = _mm_cvtpi32_ps(__r, __c); - - return __r; + return __builtin_convertvector((__v4hu)__a, __v4sf); } /// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8] @@ -2784,16 +2794,12 @@ /// from the corresponding lower 4 elements in this operand. 
/// \returns A 128-bit vector of [4 x float] containing the copied and converted /// values from the operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtpi8_ps(__m64 __a) { - __m64 __b; - - __b = _mm_setzero_si64(); - __b = _mm_cmpgt_pi8(__b, __a); - __b = _mm_unpacklo_pi8(__a, __b); - - return _mm_cvtpi16_ps(__b); + return __builtin_convertvector( + __builtin_shufflevector((__v8qs)__a, __extension__ (__v8qs){}, + 0, 1, 2, 3), __v4sf); } /// Converts the lower four unsigned 8-bit integer values from a 64-bit @@ -2809,15 +2815,12 @@ /// operand. /// \returns A 128-bit vector of [4 x float] containing the copied and converted /// values from the source operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtpu8_ps(__m64 __a) { - __m64 __b; - - __b = _mm_setzero_si64(); - __b = _mm_unpacklo_pi8(__a, __b); - - return _mm_cvtpi16_ps(__b); + return __builtin_convertvector( + __builtin_shufflevector((__v8qu)__a, __extension__ (__v8qu){}, + 0, 1, 2, 3), __v4sf); } /// Converts the two 32-bit signed integer values from each 64-bit vector @@ -2836,16 +2839,12 @@ /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the /// copied and converted values from the first operand. The upper 64 bits /// contain the copied and converted values from the second operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b) { - __m128 __c; - - __c = _mm_setzero_ps(); - __c = _mm_cvtpi32_ps(__c, __b); - __c = _mm_movelh_ps(__c, __c); - - return _mm_cvtpi32_ps(__c, __a); + return __builtin_convertvector( + __builtin_shufflevector((__v2si)__a, (__v2si)__b, + 0, 1, 2, 3), __v4sf); } /// Converts each single-precision floating-point element of a 128-bit @@ -2865,16 +2864,11 @@ /// A 128-bit floating-point vector of [4 x float]. /// \returns A 64-bit integer vector of [4 x i16] containing the converted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtps_pi16(__m128 __a) { - __m64 __b, __c; - - __b = _mm_cvtps_pi32(__a); - __a = _mm_movehl_ps(__a, __a); - __c = _mm_cvtps_pi32(__a); - - return _mm_packs_pi32(__b, __c); + return __trunc64(__builtin_ia32_packssdw128( + (__v4si)__builtin_ia32_cvtps2dq((__v4sf)__a), (__v4si)_mm_setzero_ps())); } /// Converts each single-precision floating-point element of a 128-bit @@ -2895,7 +2889,7 @@ /// 128-bit floating-point vector of [4 x float]. /// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the /// converted values and the uppper 32 bits are set to zero. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtps_pi8(__m128 __a) { __m64 __b, __c; @@ -2997,8 +2991,12 @@ #define _m_ _mm_ #define _m_ _mm_ +#undef __trunc64 +#undef __zext128 +#undef __anyext128 +#undef __zeroupper64 #undef __DEFAULT_FN_ATTRS -#undef __DEFAULT_FN_ATTRS_MMX +#undef __DEFAULT_FN_ATTRS_SSE2 /* Ugly hack for backwards-compatibility (compatible with gcc) */ #if defined(__SSE2__) && !__building_module(_Builtin_intrinsics) diff --git a/clang/test/CodeGen/X86/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c --- a/clang/test/CodeGen/X86/mmx-builtins.c +++ b/clang/test/CodeGen/X86/mmx-builtins.c @@ -1,193 +1,200 @@ -// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +ssse3 -emit-llvm -o - -Wall -Werror | FileCheck %s -// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +ssse3 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +ssse3 -emit-llvm -o - -Wall -Werror | FileCheck %s --implicit-check-not=x86mmx +// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +ssse3 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --implicit-check-not=x86mmx #include __m64 test_mm_abs_pi8(__m64 a) { // CHECK-LABEL: test_mm_abs_pi8 - // CHECK: call x86_mmx @llvm.x86.ssse3.pabs.b + // CHECK: call <16 x i8> @llvm.abs.v16i8( return _mm_abs_pi8(a); } __m64 test_mm_abs_pi16(__m64 a) { // CHECK-LABEL: test_mm_abs_pi16 - // CHECK: call x86_mmx @llvm.x86.ssse3.pabs.w + // CHECK: call <8 x i16> @llvm.abs.v8i16( return _mm_abs_pi16(a); } __m64 test_mm_abs_pi32(__m64 a) { // CHECK-LABEL: test_mm_abs_pi32 - // CHECK: call x86_mmx @llvm.x86.ssse3.pabs.d + // CHECK: call <4 x i32> @llvm.abs.v4i32( return _mm_abs_pi32(a); } __m64 test_mm_add_pi8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_add_pi8 - // CHECK: call x86_mmx @llvm.x86.mmx.padd.b + // CHECK: add <8 x i8> {{%.*}}, {{%.*}} return _mm_add_pi8(a, b); } __m64 test_mm_add_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_add_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.padd.w + // CHECK: add <4 x i16> {{%.*}}, {{%.*}} return _mm_add_pi16(a, b); } __m64 test_mm_add_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_add_pi32 - // CHECK: call x86_mmx @llvm.x86.mmx.padd.d + // CHECK: add <2 x i32> {{%.*}}, {{%.*}} return _mm_add_pi32(a, b); } __m64 test_mm_add_si64(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_add_si64 - // CHECK: call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %{{.*}}, x86_mmx %{{.*}}) + // CHECK: add i64 {{%.*}}, {{%.*}} return _mm_add_si64(a, b); } __m64 test_mm_adds_pi8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_adds_pi8 - // CHECK: call x86_mmx @llvm.x86.mmx.padds.b + // CHECK: call <16 x i8> @llvm.sadd.sat.v16i8( return _mm_adds_pi8(a, b); } __m64 test_mm_adds_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_adds_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.padds.w + // CHECK: call <8 x i16> @llvm.sadd.sat.v8i16( return _mm_adds_pi16(a, b); } __m64 test_mm_adds_pu8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_adds_pu8 - // CHECK: call x86_mmx @llvm.x86.mmx.paddus.b + // CHECK: call <16 x i8> @llvm.uadd.sat.v16i8( return _mm_adds_pu8(a, b); } __m64 test_mm_adds_pu16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_adds_pu16 - // CHECK: call x86_mmx @llvm.x86.mmx.paddus.w + // CHECK: call <8 
x i16> @llvm.uadd.sat.v8i16( return _mm_adds_pu16(a, b); } __m64 test_mm_alignr_pi8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_alignr_pi8 - // CHECK: call x86_mmx @llvm.x86.mmx.palignr.b + // CHECK: shufflevector <16 x i8> {{%.*}}, <16 x i8> zeroinitializer, <16 x i32> return _mm_alignr_pi8(a, b, 2); } __m64 test_mm_and_si64(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_and_si64 - // CHECK: call x86_mmx @llvm.x86.mmx.pand + // CHECK: and <1 x i64> {{%.*}}, {{%.*}} return _mm_and_si64(a, b); } __m64 test_mm_andnot_si64(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_andnot_si64 - // CHECK: call x86_mmx @llvm.x86.mmx.pandn + // CHECK: [[TMP:%.*]] = xor <1 x i64> {{%.*}}, + // CHECK: and <1 x i64> [[TMP]], {{%.*}} return _mm_andnot_si64(a, b); } __m64 test_mm_avg_pu8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_avg_pu8 - // CHECK: call x86_mmx @llvm.x86.mmx.pavg.b + // CHECK: call <16 x i8> @llvm.x86.sse2.pavg.b( return _mm_avg_pu8(a, b); } __m64 test_mm_avg_pu16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_avg_pu16 - // CHECK: call x86_mmx @llvm.x86.mmx.pavg.w + // CHECK: call <8 x i16> @llvm.x86.sse2.pavg.w( return _mm_avg_pu16(a, b); } __m64 test_mm_cmpeq_pi8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_cmpeq_pi8 - // CHECK: call x86_mmx @llvm.x86.mmx.pcmpeq.b + // CHECK: [[CMP:%.*]] = icmp eq <8 x i8> {{%.*}}, {{%.*}} + // CHECK-NEXT: {{%.*}} = sext <8 x i1> [[CMP]] to <8 x i8> return _mm_cmpeq_pi8(a, b); } __m64 test_mm_cmpeq_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_cmpeq_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.pcmpeq.w + // CHECK: [[CMP:%.*]] = icmp eq <4 x i16> {{%.*}}, {{%.*}} + // CHECK-NEXT: {{%.*}} = sext <4 x i1> [[CMP]] to <4 x i16> return _mm_cmpeq_pi16(a, b); } __m64 test_mm_cmpeq_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_cmpeq_pi32 - // CHECK: call x86_mmx @llvm.x86.mmx.pcmpeq.d + // CHECK: [[CMP:%.*]] = icmp eq <2 x i32> {{%.*}}, {{%.*}} + // CHECK-NEXT: {{%.*}} = sext <2 x i1> [[CMP]] to <2 x i32> return _mm_cmpeq_pi32(a, b); } __m64 test_mm_cmpgt_pi8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_cmpgt_pi8 - // CHECK: call x86_mmx @llvm.x86.mmx.pcmpgt.b + // CHECK: [[CMP:%.*]] = icmp sgt <8 x i8> {{%.*}}, {{%.*}} + // CHECK-NEXT: {{%.*}} = sext <8 x i1> [[CMP]] to <8 x i8> return _mm_cmpgt_pi8(a, b); } __m64 test_mm_cmpgt_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_cmpgt_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.pcmpgt.w + // CHECK: [[CMP:%.*]] = icmp sgt <4 x i16> {{%.*}}, {{%.*}} + // CHECK-NEXT: {{%.*}} = sext <4 x i1> [[CMP]] to <4 x i16> return _mm_cmpgt_pi16(a, b); } __m64 test_mm_cmpgt_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_cmpgt_pi32 - // CHECK: call x86_mmx @llvm.x86.mmx.pcmpgt.d + // CHECK: [[CMP:%.*]] = icmp sgt <2 x i32> {{%.*}}, {{%.*}} + // CHECK-NEXT: {{%.*}} = sext <2 x i1> [[CMP]] to <2 x i32> return _mm_cmpgt_pi32(a, b); } __m128 test_mm_cvt_pi2ps(__m128 a, __m64 b) { // CHECK-LABEL: test_mm_cvt_pi2ps - // CHECK: <4 x float> @llvm.x86.sse.cvtpi2ps + // CHECK: sitofp <4 x i32> {{%.*}} to <4 x float> return _mm_cvt_pi2ps(a, b); } __m64 test_mm_cvt_ps2pi(__m128 a) { // CHECK-LABEL: test_mm_cvt_ps2pi - // CHECK: call x86_mmx @llvm.x86.sse.cvtps2pi + // CHECK: call <4 x i32> @llvm.x86.sse2.cvtps2dq( return _mm_cvt_ps2pi(a); } __m64 test_mm_cvtpd_pi32(__m128d a) { // CHECK-LABEL: test_mm_cvtpd_pi32 - // CHECK: call x86_mmx @llvm.x86.sse.cvtpd2pi + // CHECK: call <4 x i32> @llvm.x86.sse2.cvtpd2dq( return _mm_cvtpd_pi32(a); } __m128 test_mm_cvtpi16_ps(__m64 a) { // CHECK-LABEL: test_mm_cvtpi16_ps - // CHECK: call <4 x float> 
@llvm.x86.sse.cvtpi2ps + // CHECK: sitofp <4 x i16> {{%.*}} to <4 x float> return _mm_cvtpi16_ps(a); } __m128d test_mm_cvtpi32_pd(__m64 a) { // CHECK-LABEL: test_mm_cvtpi32_pd - // CHECK: call <2 x double> @llvm.x86.sse.cvtpi2pd + // CHECK: sitofp <2 x i32> {{%.*}} to <2 x double> return _mm_cvtpi32_pd(a); } __m128 test_mm_cvtpi32_ps(__m128 a, __m64 b) { // CHECK-LABEL: test_mm_cvtpi32_ps - // CHECK: call <4 x float> @llvm.x86.sse.cvtpi2ps + // CHECK: sitofp <4 x i32> {{%.*}} to <4 x float> return _mm_cvtpi32_ps(a, b); } __m128 test_mm_cvtpi32x2_ps(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_cvtpi32x2_ps - // CHECK: call <4 x float> @llvm.x86.sse.cvtpi2ps - // CHECK: call <4 x float> @llvm.x86.sse.cvtpi2ps + // CHECK: sitofp <4 x i32> {{%.*}} to <4 x float> return _mm_cvtpi32x2_ps(a, b); } __m64 test_mm_cvtps_pi16(__m128 a) { // CHECK-LABEL: test_mm_cvtps_pi16 - // CHECK: call x86_mmx @llvm.x86.sse.cvtps2pi + // CHECK: [[TMP0:%.*]] = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> {{%.*}}) + // CHECK: call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> [[TMP0]], return _mm_cvtps_pi16(a); } __m64 test_mm_cvtps_pi32(__m128 a) { // CHECK-LABEL: test_mm_cvtps_pi32 - // CHECK: call x86_mmx @llvm.x86.sse.cvtps2pi + // CHECK: call <4 x i32> @llvm.x86.sse2.cvtps2dq( return _mm_cvtps_pi32(a); } @@ -205,19 +212,19 @@ __m64 test_mm_cvttpd_pi32(__m128d a) { // CHECK-LABEL: test_mm_cvttpd_pi32 - // CHECK: call x86_mmx @llvm.x86.sse.cvttpd2pi + // CHECK: call <4 x i32> @llvm.x86.sse2.cvttpd2dq( return _mm_cvttpd_pi32(a); } __m64 test_mm_cvttps_pi32(__m128 a) { // CHECK-LABEL: test_mm_cvttps_pi32 - // CHECK: call x86_mmx @llvm.x86.sse.cvttps2pi + // CHECK: call <4 x i32> @llvm.x86.sse2.cvttps2dq( return _mm_cvttps_pi32(a); } int test_mm_extract_pi16(__m64 a) { // CHECK-LABEL: test_mm_extract_pi16 - // CHECK: call i32 @llvm.x86.mmx.pextr.w + // CHECK: extractelement <4 x i16> {{%.*}}, i64 2 return _mm_extract_pi16(a, 2); } @@ -235,151 +242,153 @@ __m64 test_mm_hadd_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_hadd_pi16 - // CHECK: call x86_mmx @llvm.x86.ssse3.phadd.w + // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.w.128( return _mm_hadd_pi16(a, b); } __m64 test_mm_hadd_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_hadd_pi32 - // CHECK: call x86_mmx @llvm.x86.ssse3.phadd.d + // CHECK: call <4 x i32> @llvm.x86.ssse3.phadd.d.128( return _mm_hadd_pi32(a, b); } __m64 test_mm_hadds_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_hadds_pi16 - // CHECK: call x86_mmx @llvm.x86.ssse3.phadd.sw + // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.sw.128( return _mm_hadds_pi16(a, b); } __m64 test_mm_hsub_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_hsub_pi16 - // CHECK: call x86_mmx @llvm.x86.ssse3.phsub.w + // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.w.128( return _mm_hsub_pi16(a, b); } __m64 test_mm_hsub_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_hsub_pi32 - // CHECK: call x86_mmx @llvm.x86.ssse3.phsub.d + // CHECK: call <4 x i32> @llvm.x86.ssse3.phsub.d.128( return _mm_hsub_pi32(a, b); } __m64 test_mm_hsubs_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_hsubs_pi16 - // CHECK: call x86_mmx @llvm.x86.ssse3.phsub.sw + // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.sw.128( return _mm_hsubs_pi16(a, b); } __m64 test_mm_insert_pi16(__m64 a, int d) { // CHECK-LABEL: test_mm_insert_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.pinsr.w + // CHECK: insertelement <4 x i16> return _mm_insert_pi16(a, d, 2); } __m64 test_mm_madd_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_madd_pi16 - // CHECK: call 
x86_mmx @llvm.x86.mmx.pmadd.wd + // CHECK: call <4 x i32> @llvm.x86.sse2.pmadd.wd( return _mm_madd_pi16(a, b); } __m64 test_mm_maddubs_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_maddubs_pi16 - // CHECK: call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw + // CHECK: call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128( return _mm_maddubs_pi16(a, b); } void test_mm_maskmove_si64(__m64 d, __m64 n, char *p) { // CHECK-LABEL: test_mm_maskmove_si64 - // CHECK: call void @llvm.x86.mmx.maskmovq + // CHECK: call void @llvm.x86.sse2.maskmov.dqu( _mm_maskmove_si64(d, n, p); } __m64 test_mm_max_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_max_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.pmaxs.w + // CHECK: call <8 x i16> @llvm.smax.v8i16( return _mm_max_pi16(a, b); } __m64 test_mm_max_pu8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_max_pu8 - // CHECK: call x86_mmx @llvm.x86.mmx.pmaxu.b + // CHECK: call <16 x i8> @llvm.umax.v16i8( return _mm_max_pu8(a, b); } __m64 test_mm_min_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_min_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.pmins.w + // CHECK: call <8 x i16> @llvm.smin.v8i16( return _mm_min_pi16(a, b); } __m64 test_mm_min_pu8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_min_pu8 - // CHECK: call x86_mmx @llvm.x86.mmx.pminu.b + // CHECK: call <16 x i8> @llvm.umin.v16i8( return _mm_min_pu8(a, b); } int test_mm_movemask_pi8(__m64 a) { // CHECK-LABEL: test_mm_movemask_pi8 - // CHECK: call i32 @llvm.x86.mmx.pmovmskb + // CHECK: call i32 @llvm.x86.sse2.pmovmskb.128( return _mm_movemask_pi8(a); } __m64 test_mm_mul_su32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_mul_su32 - // CHECK: call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %{{.*}}, x86_mmx %{{.*}}) + // CHECK: and <2 x i64> {{%.*}}, + // CHECK: and <2 x i64> {{%.*}}, + // CHECK: mul <2 x i64> %{{.*}}, %{{.*}} return _mm_mul_su32(a, b); } __m64 test_mm_mulhi_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_mulhi_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.pmulh.w + // CHECK: call <8 x i16> @llvm.x86.sse2.pmulh.w( return _mm_mulhi_pi16(a, b); } __m64 test_mm_mulhi_pu16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_mulhi_pu16 - // CHECK: call x86_mmx @llvm.x86.mmx.pmulhu.w + // CHECK: call <8 x i16> @llvm.x86.sse2.pmulhu.w( return _mm_mulhi_pu16(a, b); } __m64 test_mm_mulhrs_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_mulhrs_pi16 - // CHECK: call x86_mmx @llvm.x86.ssse3.pmul.hr.sw + // CHECK: call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128( return _mm_mulhrs_pi16(a, b); } __m64 test_mm_mullo_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_mullo_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.pmull.w + // CHECK: mul <4 x i16> {{%.*}}, {{%.*}} return _mm_mullo_pi16(a, b); } __m64 test_mm_or_si64(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_or_si64 - // CHECK: call x86_mmx @llvm.x86.mmx.por + // CHECK: or <1 x i64> {{%.*}}, {{%.*}} return _mm_or_si64(a, b); } __m64 test_mm_packs_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_packs_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.packsswb + // CHECK: call <16 x i8> @llvm.x86.sse2.packsswb.128( return _mm_packs_pi16(a, b); } __m64 test_mm_packs_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_packs_pi32 - // CHECK: call x86_mmx @llvm.x86.mmx.packssdw + // CHECK: call <8 x i16> @llvm.x86.sse2.packssdw.128( return _mm_packs_pi32(a, b); } __m64 test_mm_packs_pu16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_packs_pu16 - // CHECK: call x86_mmx @llvm.x86.mmx.packuswb + // CHECK: call <16 x i8> @llvm.x86.sse2.packuswb.128( return _mm_packs_pu16(a, b); } __m64 test_mm_sad_pu8(__m64 a, 
__m64 b) { // CHECK-LABEL: test_mm_sad_pu8 - // CHECK: call x86_mmx @llvm.x86.mmx.psad.bw + // CHECK: call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> return _mm_sad_pu8(a, b); } @@ -472,181 +481,181 @@ __m64 test_mm_shuffle_pi8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_shuffle_pi8 - // CHECK: call x86_mmx @llvm.x86.ssse3.pshuf.b + // CHECK: call <16 x i8> @llvm.x86.ssse3.pshuf.b.128( return _mm_shuffle_pi8(a, b); } __m64 test_mm_shuffle_pi16(__m64 a) { // CHECK-LABEL: test_mm_shuffle_pi16 - // CHECK: call x86_mmx @llvm.x86.sse.pshuf.w + // CHECK: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <4 x i32> return _mm_shuffle_pi16(a, 3); } __m64 test_mm_sign_pi8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sign_pi8 - // CHECK: call x86_mmx @llvm.x86.ssse3.psign.b + // CHECK: call <16 x i8> @llvm.x86.ssse3.psign.b.128( return _mm_sign_pi8(a, b); } __m64 test_mm_sign_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sign_pi16 - // CHECK: call x86_mmx @llvm.x86.ssse3.psign.w + // CHECK: call <8 x i16> @llvm.x86.ssse3.psign.w.128( return _mm_sign_pi16(a, b); } __m64 test_mm_sign_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sign_pi32 - // CHECK: call x86_mmx @llvm.x86.ssse3.psign.d + // CHECK: call <4 x i32> @llvm.x86.ssse3.psign.d.128( return _mm_sign_pi32(a, b); } __m64 test_mm_sll_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sll_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.psll.w + // CHECK: call <8 x i16> @llvm.x86.sse2.psll.w( return _mm_sll_pi16(a, b); } __m64 test_mm_sll_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sll_pi32 - // CHECK: call x86_mmx @llvm.x86.mmx.psll.d + // CHECK: call <4 x i32> @llvm.x86.sse2.psll.d( return _mm_sll_pi32(a, b); } __m64 test_mm_sll_si64(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sll_si64 - // CHECK: call x86_mmx @llvm.x86.mmx.psll.q + // CHECK: call <2 x i64> @llvm.x86.sse2.psll.q( return _mm_sll_si64(a, b); } __m64 test_mm_slli_pi16(__m64 a) { // CHECK-LABEL: test_mm_slli_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.pslli.w + // CHECK: call <8 x i16> @llvm.x86.sse2.pslli.w( return _mm_slli_pi16(a, 3); } __m64 test_mm_slli_pi32(__m64 a) { // CHECK-LABEL: test_mm_slli_pi32 - // CHECK: call x86_mmx @llvm.x86.mmx.pslli.d + // CHECK: call <4 x i32> @llvm.x86.sse2.pslli.d( return _mm_slli_pi32(a, 3); } __m64 test_mm_slli_si64(__m64 a) { // CHECK-LABEL: test_mm_slli_si64 - // CHECK: call x86_mmx @llvm.x86.mmx.pslli.q + // CHECK: call <2 x i64> @llvm.x86.sse2.pslli.q( return _mm_slli_si64(a, 3); } __m64 test_mm_sra_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sra_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.psra.w + // CHECK: call <8 x i16> @llvm.x86.sse2.psra.w( return _mm_sra_pi16(a, b); } __m64 test_mm_sra_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sra_pi32 - // CHECK: call x86_mmx @llvm.x86.mmx.psra.d + // CHECK: call <4 x i32> @llvm.x86.sse2.psra.d( return _mm_sra_pi32(a, b); } __m64 test_mm_srai_pi16(__m64 a) { // CHECK-LABEL: test_mm_srai_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.psrai.w + // CHECK: call <8 x i16> @llvm.x86.sse2.psrai.w( return _mm_srai_pi16(a, 3); } __m64 test_mm_srai_pi32(__m64 a) { // CHECK-LABEL: test_mm_srai_pi32 - // CHECK: call x86_mmx @llvm.x86.mmx.psrai.d + // CHECK: call <4 x i32> @llvm.x86.sse2.psrai.d( return _mm_srai_pi32(a, 3); } __m64 test_mm_srl_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_srl_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.psrl.w + // CHECK: call <8 x i16> @llvm.x86.sse2.psrl.w( return _mm_srl_pi16(a, b); } __m64 test_mm_srl_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_srl_pi32 - 
// CHECK: call x86_mmx @llvm.x86.mmx.psrl.d + // CHECK: call <4 x i32> @llvm.x86.sse2.psrl.d( return _mm_srl_pi32(a, b); } __m64 test_mm_srl_si64(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_srl_si64 - // CHECK: call x86_mmx @llvm.x86.mmx.psrl.q + // CHECK: call <2 x i64> @llvm.x86.sse2.psrl.q( return _mm_srl_si64(a, b); } __m64 test_mm_srli_pi16(__m64 a) { // CHECK-LABEL: test_mm_srli_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.psrli.w + // CHECK: call <8 x i16> @llvm.x86.sse2.psrli.w( return _mm_srli_pi16(a, 3); } __m64 test_mm_srli_pi32(__m64 a) { // CHECK-LABEL: test_mm_srli_pi32 - // CHECK: call x86_mmx @llvm.x86.mmx.psrli.d + // CHECK: call <4 x i32> @llvm.x86.sse2.psrli.d( return _mm_srli_pi32(a, 3); } __m64 test_mm_srli_si64(__m64 a) { // CHECK-LABEL: test_mm_srli_si64 - // CHECK: call x86_mmx @llvm.x86.mmx.psrli.q + // CHECK: call <2 x i64> @llvm.x86.sse2.psrli.q( return _mm_srli_si64(a, 3); } void test_mm_stream_pi(__m64 *p, __m64 a) { // CHECK-LABEL: test_mm_stream_pi - // CHECK: call void @llvm.x86.mmx.movnt.dq + // CHECK: store <1 x i64> {{%.*}}, <1 x i64>* {{%.*}}, align 8, !nontemporal _mm_stream_pi(p, a); } __m64 test_mm_sub_pi8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sub_pi8 - // CHECK: call x86_mmx @llvm.x86.mmx.psub.b + // CHECK: sub <8 x i8> {{%.*}}, {{%.*}} return _mm_sub_pi8(a, b); } __m64 test_mm_sub_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sub_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.psub.w + // CHECK: sub <4 x i16> {{%.*}}, {{%.*}} return _mm_sub_pi16(a, b); } __m64 test_mm_sub_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sub_pi32 - // CHECK: call x86_mmx @llvm.x86.mmx.psub.d + // CHECK: sub <2 x i32> {{%.*}}, {{%.*}} return _mm_sub_pi32(a, b); } __m64 test_mm_sub_si64(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sub_si64 - // CHECK: call x86_mmx @llvm.x86.mmx.psub.q(x86_mmx %{{.*}}, x86_mmx %{{.*}}) + // CHECK: sub i64 {{%.*}}, {{%.*}} return _mm_sub_si64(a, b); } __m64 test_mm_subs_pi8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_subs_pi8 - // CHECK: call x86_mmx @llvm.x86.mmx.psubs.b + // CHECK: call <16 x i8> @llvm.ssub.sat.v16i8( return _mm_subs_pi8(a, b); } __m64 test_mm_subs_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_subs_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.psubs.w + // CHECK: call <8 x i16> @llvm.ssub.sat.v8i16( return _mm_subs_pi16(a, b); } __m64 test_mm_subs_pu8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_subs_pu8 - // CHECK: call x86_mmx @llvm.x86.mmx.psubus.b + // CHECK: call <16 x i8> @llvm.usub.sat.v16i8( return _mm_subs_pu8(a, b); } __m64 test_mm_subs_pu16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_subs_pu16 - // CHECK: call x86_mmx @llvm.x86.mmx.psubus.w + // CHECK: call <8 x i16> @llvm.usub.sat.v8i16( return _mm_subs_pu16(a, b); } @@ -664,42 +673,42 @@ __m64 test_mm_unpackhi_pi8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_unpackhi_pi8 - // CHECK: call x86_mmx @llvm.x86.mmx.punpckhbw + // CHECK: shufflevector <8 x i8> {{%.*}}, <8 x i8> {{%.*}}, <8 x i32> return _mm_unpackhi_pi8(a, b); } __m64 test_mm_unpackhi_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_unpackhi_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.punpckhwd + // CHECK: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <4 x i32> return _mm_unpackhi_pi16(a, b); } __m64 test_mm_unpackhi_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_unpackhi_pi32 - // CHECK: call x86_mmx @llvm.x86.mmx.punpckhdq + // CHECK: shufflevector <2 x i32> {{%.*}}, <2 x i32> {{%.*}}, <2 x i32> return _mm_unpackhi_pi32(a, b); } __m64 test_mm_unpacklo_pi8(__m64 a, __m64 b) { // 
CHECK-LABEL: test_mm_unpacklo_pi8 - // CHECK: call x86_mmx @llvm.x86.mmx.punpcklbw + // CHECK: shufflevector <8 x i8> {{%.*}}, <8 x i8> {{%.*}}, <8 x i32> return _mm_unpacklo_pi8(a, b); } __m64 test_mm_unpacklo_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_unpacklo_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.punpcklwd + // CHECK: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <4 x i32> return _mm_unpacklo_pi16(a, b); } __m64 test_mm_unpacklo_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_unpacklo_pi32 - // CHECK: call x86_mmx @llvm.x86.mmx.punpckldq + // CHECK: shufflevector <2 x i32> {{%.*}}, <2 x i32> {{%.*}}, <2 x i32> return _mm_unpacklo_pi32(a, b); } __m64 test_mm_xor_si64(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_xor_si64 - // CHECK: call x86_mmx @llvm.x86.mmx.pxor + // CHECK: xor <1 x i64> {{%.*}}, {{%.*}} return _mm_xor_si64(a, b); } diff --git a/clang/test/CodeGen/X86/mmx-shift-with-immediate.c b/clang/test/CodeGen/X86/mmx-shift-with-immediate.c --- a/clang/test/CodeGen/X86/mmx-shift-with-immediate.c +++ b/clang/test/CodeGen/X86/mmx-shift-with-immediate.c @@ -2,22 +2,22 @@ #include void shift(__m64 a, __m64 b, int c) { - // CHECK: x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx %{{.*}}, i32 {{.*}}) + // CHECK: <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %{{.*}}, i32 {{.*}}) _mm_slli_pi16(a, c); - // CHECK: x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx %{{.*}}, i32 {{.*}}) + // CHECK: <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %{{.*}}, i32 {{.*}}) _mm_slli_pi32(a, c); - // CHECK: x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %{{.*}}, i32 {{.*}}) + // CHECK: <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %{{.*}}, i32 {{.*}}) _mm_slli_si64(a, c); - // CHECK: x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx %{{.*}}, i32 {{.*}}) + // CHECK: <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %{{.*}}, i32 {{.*}}) _mm_srli_pi16(a, c); - // CHECK: x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx %{{.*}}, i32 {{.*}}) + // CHECK: <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %{{.*}}, i32 {{.*}}) _mm_srli_pi32(a, c); - // CHECK: x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %{{.*}}, i32 {{.*}}) + // CHECK: <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %{{.*}}, i32 {{.*}}) _mm_srli_si64(a, c); - // CHECK: x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx %{{.*}}, i32 {{.*}}) + // CHECK: <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %{{.*}}, i32 {{.*}}) _mm_srai_pi16(a, c); - // CHECK: x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx %{{.*}}, i32 {{.*}}) + // CHECK: <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %{{.*}}, i32 {{.*}}) _mm_srai_pi32(a, c); } diff --git a/clang/test/CodeGen/attr-target-x86-mmx.c b/clang/test/CodeGen/attr-target-x86-mmx.c --- a/clang/test/CodeGen/attr-target-x86-mmx.c +++ b/clang/test/CodeGen/attr-target-x86-mmx.c @@ -1,12 +1,11 @@ // RUN: %clang_cc1 -triple i386-linux-gnu -emit-llvm %s -o - | FileCheck %s -// Picking a cpu that doesn't have mmx or sse by default so we can enable it later. +// Picking a cpu that doesn't have sse by default so we can enable it later. #define __MM_MALLOC_H #include -// Verify that when we turn on sse that we also turn on mmx. 
-void __attribute__((target("sse"))) shift(__m64 a, __m64 b, int c) { +void __attribute__((target("sse2"))) shift(__m64 a, __m64 b, int c) { _mm_slli_pi16(a, c); _mm_slli_pi32(a, c); _mm_slli_si64(a, c); @@ -19,4 +18,4 @@ _mm_srai_pi32(a, c); } -// CHECK: "target-features"="+cx8,+mmx,+sse,+x87" +// CHECK: "target-features"="+cx8,+mmx,+sse,+sse2,+x87" diff --git a/clang/test/Headers/xmmintrin.c b/clang/test/Headers/xmmintrin.c --- a/clang/test/Headers/xmmintrin.c +++ b/clang/test/Headers/xmmintrin.c @@ -14,7 +14,7 @@ // checking that clang emits PACKSSDW instead of PACKSSWB. // CHECK: define{{.*}} i64 @test_mm_cvtps_pi16 -// CHECK: call x86_mmx @llvm.x86.mmx.packssdw +// CHECK: call <8 x i16> @llvm.x86.sse2.packssdw.128 __m64 test_mm_cvtps_pi16(__m128 a) { return _mm_cvtps_pi16(a); diff --git a/clang/test/Sema/x86-builtin-palignr.c b/clang/test/Sema/x86-builtin-palignr.c --- a/clang/test/Sema/x86-builtin-palignr.c +++ b/clang/test/Sema/x86-builtin-palignr.c @@ -4,5 +4,5 @@ #include __m64 test1(__m64 a, __m64 b, int c) { - return _mm_alignr_pi8(a, b, c); // expected-error {{argument to '__builtin_ia32_palignr' must be a constant integer}} + return _mm_alignr_pi8(a, b, c); // expected-error {{argument to '__builtin_ia32_psrldqi128_byteshift' must be a constant integer}} } diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -2424,11 +2424,11 @@ Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; - def int_x86_mmx_pextr_w : GCCBuiltin<"__builtin_ia32_vec_ext_v4hi">, + def int_x86_mmx_pextr_w : Intrinsic<[llvm_i32_ty], [llvm_x86mmx_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_mmx_pinsr_w : GCCBuiltin<"__builtin_ia32_vec_set_v4hi">, + def int_x86_mmx_pinsr_w : Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; } diff --git a/mmx-tests/Makefile b/mmx-tests/Makefile new file mode 100644 --- /dev/null +++ b/mmx-tests/Makefile @@ -0,0 +1,29 @@ +USE_XMM= +#USE_XMM=--use-xmm + +OLDCC ?= clang-10 +NEWCC ?= ../build/bin/clang +TESTCC=$(OLDCC) +COPTS ?= + +gen_orig.c: mmx-tests.py + ./mmx-tests.py --kind=wrapper --wrapper-prefix=orig $(USE_XMM) > $@ +gen_orig.h: mmx-tests.py + ./mmx-tests.py --kind=wrapper_h --wrapper-prefix=orig $(USE_XMM) > $@ +gen_new.c: mmx-tests.py + ./mmx-tests.py --kind=wrapper --wrapper-prefix=new $(USE_XMM) > $@ +gen_new.h: mmx-tests.py + ./mmx-tests.py --kind=wrapper_h --wrapper-prefix=new $(USE_XMM) > $@ +gen_test.inc: mmx-tests.py + ./mmx-tests.py --kind=test $(USE_XMM) > $@ +gen_orig.o: gen_orig.c + $(OLDCC) -c $(COPTS) -O2 -o $@ $^ +gen_new.o: gen_new.c + $(NEWCC) -c $(COPTS) -O2 -o $@ $^ +test.o: test.c gen_test.inc gen_orig.h gen_new.h + $(TESTCC) -c $(COPTS) -o $@ test.c +test: test.o gen_orig.o gen_new.o + $(TESTCC) $(COPTS) -o $@ $^ -lm + +clean: + rm -f gen_orig.c gen_orig.h gen_new.c gen_new.h gen_test.inc gen_orig.o gen_new.o test.o test diff --git a/mmx-tests/mmx-tests.py b/mmx-tests/mmx-tests.py new file mode 100755 --- /dev/null +++ b/mmx-tests/mmx-tests.py @@ -0,0 +1,301 @@ +#!/usr/bin/python3 + +import argparse +import sys + +# This is a list of all intel functions and macros which take or +# return an __m64. 
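The wrapper files the Makefile builds (gen_orig.c and gen_new.c) contain one small forwarding function per entry in the list defined by do_mmx() below. A sketch of one generated wrapper, assuming the "orig" prefix, no --use-xmm, and that immintrin.h supplies the intrinsics:

#include <immintrin.h>

/* Shape of one emitted wrapper; the exact text comes from define_wrappers(). */
__m64 orig__mm_add_pi8(__m64 arg_0, __m64 arg_1)
{ return _mm_add_pi8(arg_0, arg_1); }

With --use-xmm, the same wrapper instead takes and returns __m128 values and marshals them through the m128_to_m64/m64_to_m128 macros, presumably so that differences in how the two compilers pass a raw __m64 do not affect the comparison.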
+def do_mmx(fn): + # mmintrin.h + fn("_mm_cvtsi32_si64", "__m64", ("int", )) + fn("_mm_cvtsi64_si32", "int", ("__m64", )) + fn("_mm_cvtsi64_m64", "__m64", ("long long", ), condition='defined(__X86_64__) || defined(__clang__)') + fn("_mm_cvtm64_si64", "long long", ("__m64", ), condition='defined(__X86_64__) || defined(__clang__)') + fn("_mm_packs_pi16", "__m64", ("__m64", "__m64", )) + fn("_mm_packs_pi32", "__m64", ("__m64", "__m64", )) + fn("_mm_packs_pu16", "__m64", ("__m64", "__m64", )) + fn("_mm_unpackhi_pi8", "__m64", ("__m64", "__m64", )) + fn("_mm_unpackhi_pi16", "__m64", ("__m64", "__m64", )) + fn("_mm_unpackhi_pi32", "__m64", ("__m64", "__m64", )) + fn("_mm_unpacklo_pi8", "__m64", ("__m64", "__m64", )) + fn("_mm_unpacklo_pi16", "__m64", ("__m64", "__m64", )) + fn("_mm_unpacklo_pi32", "__m64", ("__m64", "__m64", )) + fn("_mm_add_pi8", "__m64", ("__m64", "__m64", )) + fn("_mm_add_pi16", "__m64", ("__m64", "__m64", )) + fn("_mm_add_pi32", "__m64", ("__m64", "__m64", )) + fn("_mm_adds_pi8", "__m64", ("__m64", "__m64", )) + fn("_mm_adds_pi16", "__m64", ("__m64", "__m64", )) + fn("_mm_adds_pu8", "__m64", ("__m64", "__m64", )) + fn("_mm_adds_pu16", "__m64", ("__m64", "__m64", )) + fn("_mm_sub_pi8", "__m64", ("__m64", "__m64", )) + fn("_mm_sub_pi16", "__m64", ("__m64", "__m64", )) + fn("_mm_sub_pi32", "__m64", ("__m64", "__m64", )) + fn("_mm_subs_pi8", "__m64", ("__m64", "__m64", )) + fn("_mm_subs_pi16", "__m64", ("__m64", "__m64", )) + fn("_mm_subs_pu8", "__m64", ("__m64", "__m64", )) + fn("_mm_subs_pu16", "__m64", ("__m64", "__m64", )) + fn("_mm_madd_pi16", "__m64", ("__m64", "__m64", )) + fn("_mm_mulhi_pi16", "__m64", ("__m64", "__m64", )) + fn("_mm_mullo_pi16", "__m64", ("__m64", "__m64", )) + fn("_mm_sll_pi16", "__m64", ("__m64", "__m64", )) + fn("_mm_slli_pi16", "__m64", ("__m64", "int", )) + fn("_mm_sll_pi32", "__m64", ("__m64", "__m64", )) + fn("_mm_slli_pi32", "__m64", ("__m64", "int", )) + fn("_mm_sll_si64", "__m64", ("__m64", "__m64", )) + fn("_mm_slli_si64", "__m64", ("__m64", "int", )) + fn("_mm_sra_pi16", "__m64", ("__m64", "__m64", )) + fn("_mm_srai_pi16", "__m64", ("__m64", "int", )) + fn("_mm_sra_pi32", "__m64", ("__m64", "__m64", )) + fn("_mm_srai_pi32", "__m64", ("__m64", "int", )) + fn("_mm_srl_pi16", "__m64", ("__m64", "__m64", )) + fn("_mm_srli_pi16", "__m64", ("__m64", "int", )) + fn("_mm_srl_pi32", "__m64", ("__m64", "__m64", )) + fn("_mm_srli_pi32", "__m64", ("__m64", "int", )) + fn("_mm_srl_si64", "__m64", ("__m64", "__m64", )) + fn("_mm_srli_si64", "__m64", ("__m64", "int", )) + fn("_mm_and_si64", "__m64", ("__m64", "__m64", )) + fn("_mm_andnot_si64", "__m64", ("__m64", "__m64", )) + fn("_mm_or_si64", "__m64", ("__m64", "__m64", )) + fn("_mm_xor_si64", "__m64", ("__m64", "__m64", )) + fn("_mm_cmpeq_pi8", "__m64", ("__m64", "__m64", )) + fn("_mm_cmpeq_pi16", "__m64", ("__m64", "__m64", )) + fn("_mm_cmpeq_pi32", "__m64", ("__m64", "__m64", )) + fn("_mm_cmpgt_pi8", "__m64", ("__m64", "__m64", )) + fn("_mm_cmpgt_pi16", "__m64", ("__m64", "__m64", )) + fn("_mm_cmpgt_pi32", "__m64", ("__m64", "__m64", )) + fn("_mm_setzero_si64", "__m64", ()) + fn("_mm_set_pi32", "__m64", ("int", "int", )) + fn("_mm_set_pi16", "__m64", ("short", "short", "short", "short", )) + fn("_mm_set_pi8", "__m64", ("char", "char", "char", "char", "char", "char", "char", "char", )) + fn("_mm_set1_pi32", "__m64", ("int", )) + fn("_mm_set1_pi16", "__m64", ("short", )) + fn("_mm_set1_pi8", "__m64", ("char", )) + fn("_mm_setr_pi32", "__m64", ("int", "int", )) + fn("_mm_setr_pi16", "__m64", ("short", 
"short", "short", "short", )) + fn("_mm_setr_pi8", "__m64", ("char", "char", "char", "char", "char", "char", "char", "char", )) + + # xmmintrin.h + fn("_mm_cvtps_pi32", "__m64", ("__m128", )) + fn("_mm_cvt_ps2pi", "__m64", ("__m128", )) + fn("_mm_cvttps_pi32", "__m64", ("__m128", )) + fn("_mm_cvtt_ps2pi", "__m64", ("__m128", )) + fn("_mm_cvtpi32_ps", "__m128", ("__m128", "__m64", )) + fn("_mm_cvt_pi2ps", "__m128", ("__m128", "__m64", )) + fn("_mm_loadh_pi", "__m128", ("__m128", "const __m64 *", )) + fn("_mm_loadl_pi", "__m128", ("__m128", "const __m64 *", )) + fn("_mm_storeh_pi", "void", ("__m64 *", "__m128", )) + fn("_mm_storel_pi", "void", ("__m64 *", "__m128", )) + fn("_mm_stream_pi", "void", ("__m64 *", "__m64", )) + fn("_mm_max_pi16", "__m64", ("__m64", "__m64", )) + fn("_mm_max_pu8", "__m64", ("__m64", "__m64", )) + fn("_mm_min_pi16", "__m64", ("__m64", "__m64", )) + fn("_mm_min_pu8", "__m64", ("__m64", "__m64", )) + fn("_mm_movemask_pi8", "int", ("__m64", )) + fn("_mm_mulhi_pu16", "__m64", ("__m64", "__m64", )) + fn("_mm_maskmove_si64", "void", ("__m64", "__m64", "char *", )) + fn("_mm_avg_pu8", "__m64", ("__m64", "__m64", )) + fn("_mm_avg_pu16", "__m64", ("__m64", "__m64", )) + fn("_mm_sad_pu8", "__m64", ("__m64", "__m64", )) + fn("_mm_cvtpi16_ps", "__m128", ("__m64", )) + fn("_mm_cvtpu16_ps", "__m128", ("__m64", )) + fn("_mm_cvtpi8_ps", "__m128", ("__m64", )) + fn("_mm_cvtpu8_ps", "__m128", ("__m64", )) + fn("_mm_cvtpi32x2_ps", "__m128", ("__m64", "__m64", )) + fn("_mm_cvtps_pi16", "__m64", ("__m128", )) + fn("_mm_cvtps_pi8", "__m64", ("__m128", )) + + fn("_mm_extract_pi16", "int", ("__m64", "int", ), imm_range=(0, 3)) + fn("_mm_insert_pi16", "__m64", ("__m64", "int", "int", ), imm_range=(0, 3)) + fn("_mm_shuffle_pi16", "__m64", ("__m64", "int", ), imm_range=(0, 255)) + + # emmintrin.h + fn("_mm_cvtpd_pi32", "__m64", ("__m128d", )) + fn("_mm_cvttpd_pi32", "__m64", ("__m128d", )) + fn("_mm_cvtpi32_pd", "__m128d", ("__m64", )) + fn("_mm_add_si64", "__m64", ("__m64", "__m64", )) + fn("_mm_mul_su32", "__m64", ("__m64", "__m64", )) + fn("_mm_sub_si64", "__m64", ("__m64", "__m64", )) + fn("_mm_set_epi64", "__m128i", ("__m64", "__m64", )) + fn("_mm_set1_epi64", "__m128i", ("__m64", )) + fn("_mm_setr_epi64", "__m128i", ("__m64", "__m64", )) + fn("_mm_movepi64_pi64", "__m64", ("__m128i", )) + fn("_mm_movpi64_epi64", "__m128i", ("__m64", )) + + # tmmintrin.h + fn("_mm_abs_pi8", "__m64", ("__m64", ), target='ssse3') + fn("_mm_abs_pi16", "__m64", ("__m64", ), target='ssse3') + fn("_mm_abs_pi32", "__m64", ("__m64", ), target='ssse3') + fn("_mm_hadd_pi16", "__m64", ("__m64", "__m64", ), target='ssse3') + fn("_mm_hadd_pi32", "__m64", ("__m64", "__m64", ), target='ssse3') + fn("_mm_hadds_pi16", "__m64", ("__m64", "__m64", ), target='ssse3') + fn("_mm_hsub_pi16", "__m64", ("__m64", "__m64", ), target='ssse3') + fn("_mm_hsub_pi32", "__m64", ("__m64", "__m64", ), target='ssse3') + fn("_mm_hsubs_pi16", "__m64", ("__m64", "__m64", ), target='ssse3') + fn("_mm_maddubs_pi16", "__m64", ("__m64", "__m64", ), target='ssse3') + fn("_mm_mulhrs_pi16", "__m64", ("__m64", "__m64", ), target='ssse3') + fn("_mm_shuffle_pi8", "__m64", ("__m64", "__m64", ), target='ssse3') + fn("_mm_sign_pi8", "__m64", ("__m64", "__m64", ), target='ssse3') + fn("_mm_sign_pi16", "__m64", ("__m64", "__m64", ), target='ssse3') + fn("_mm_sign_pi32", "__m64", ("__m64", "__m64", ), target='ssse3') + fn("_mm_alignr_pi8", "__m64", ("__m64", "__m64", "int", ), imm_range=(0, 18), target='ssse3') + +# Generate a file full of wrapper functions 
for each of the above mmx +# functions. +# +# If use_xmm is set, pass/return arguments as __m128 rather than of +# __m64. +def define_wrappers(prefix, use_xmm=True, header=False): + if header: + print('#pragma once') + + print('#include ') + if use_xmm and not header: + print('#define m128_to_m64(x) ((__m64)((__v2di)(x))[0])') + print('#define m64_to_m128(x) ((__m128)(__v2di){(long long)(__m64)(x), 0})') + + def fn(name, ret_ty, arg_tys, imm_range=None, target=None, condition=None): + if condition: + print(f'#if {condition}') + convert_ret = False + if use_xmm and ret_ty == '__m64': + ret_ty = '__v2di' + convert_ret = True + + if target: + attr = f'__attribute__((target("{target}"))) ' + else: + attr = '' + + if imm_range: + arg_tys = arg_tys[:-1] + def translate_type(t): + if use_xmm and t == '__m64': + return '__m128' + return t + def translate_arg(t, a): + if use_xmm and t == '__m64': + return f'm128_to_m64({a})' + return a + + arg_decl = ', '.join(f'{translate_type(v[1])} arg_{v[0]}' for v in enumerate(arg_tys)) or 'void' + call_args = ', '.join(translate_arg(v[1], f'arg_{v[0]}') for v in enumerate(arg_tys)) + + def create_fn(suffix, extraarg): + if header: + print(f'{ret_ty} {prefix}_{name}{suffix}({arg_decl});') + else: + print(f'{attr}{ret_ty} {prefix}_{name}{suffix}({arg_decl})') + if use_xmm and convert_ret: + print(f'{{ return ({ret_ty})m64_to_m128({name}({call_args}{extraarg})); }}') + else: + print(f'{{ return {name}({call_args}{extraarg}); }}') + + if imm_range: + for i in range(imm_range[0], imm_range[1]+1): + create_fn(f'_{i}', f', {i}') + else: + create_fn('', '') + if condition: + print('#endif') + + do_mmx(fn) + + +# Create a C file that tests an "orig" set of wrappers against a "new" +# set of wrappers. +def define_tests(use_xmm=False): + def fn(name, ret_ty, arg_tys, imm_range=None, target=None, condition=None): + if condition: + print(f'#if {condition}') + arg_decl = ', '.join(f'{v[1]} arg_{v[0]}' for v in enumerate(arg_tys)) or 'void' + print(f' // {ret_ty} {name}({arg_decl});') + + if imm_range: + for i in range(imm_range[0], imm_range[1]+1): + fn(name + f'_{i}', ret_ty, arg_tys[:-1], target=target) + return + + convert_pre = convert_post = '' + if use_xmm and ret_ty == '__m64': + convert_pre = 'm128_to_m64(' + convert_post = ')' + + args=[] + loops=[] + printf_fmts = [] + printf_args = [] + for arg_ty in arg_tys: + v=len(loops) + if arg_ty in ('char', 'short'): + loops.append(f' for(int l{v} = 0; l{v} < arraysize(short_vals); ++l{v}) {{') + args.append(f'({arg_ty})short_vals[l{v}]') + printf_fmts.append('%016x') + printf_args.append(f'short_vals[l{v}]') + elif arg_ty in ('int', 'long long'): + loops.append(f' for(int l{v} = 0; l{v} < arraysize(mmx_vals); ++l{v}) {{') + args.append(f'({arg_ty})mmx_vals[l{v}]') + printf_fmts.append('%016llx') + printf_args.append(f'mmx_vals[l{v}]') + elif arg_ty == '__m64': + loops.append(f' for(int l{v} = 0; l{v} < arraysize(mmx_vals); ++l{v}) {{') + if use_xmm: + loops.append(f' for(int l{v+1} = 0; l{v+1} < arraysize(padding_mmx_vals); ++l{v+1}) {{') + args.append(f'(__m128)(__m128i){{mmx_vals[l{v}], padding_mmx_vals[l{v+1}]}}') + printf_fmts.append('(__m128i){%016llx, %016llx}') + printf_args.append(f'mmx_vals[l{v}], padding_mmx_vals[l{v+1}]') + else: + args.append(f'({arg_ty})mmx_vals[l{v}]') + printf_fmts.append('%016llx') + printf_args.append(f'mmx_vals[l{v}]') + elif arg_ty in ('__m128', '__m128i', '__m128d'): + loops.append(f' for(int l{v} = 0; l{v} < arraysize(mmx_vals); ++l{v}) {{') + loops.append(f' for(int l{v+1} = 0; 
l{v+1} < arraysize(mmx_vals); ++l{v+1}) {{') + args.append(f'({arg_ty})(__m128i){{mmx_vals[l{v}], mmx_vals[l{v+1}]}}') + printf_fmts.append('(__m128i){%016llx, %016llx}') + printf_args.append(f'mmx_vals[l{v}], mmx_vals[l{v+1}]') + elif arg_ty == 'const __m64 *': + loops.append(f' for(int l{v} = 0; l{v} < arraysize(mmx_vals); ++l{v}) {{\n' + + f' mem.m64 = (__m64)mmx_vals[l{v}];') + args.append(f'&mem.m64') + printf_fmts.append('&mem.m64 /* %016llx */') + printf_args.append(f'(long long)mem.m64') + else: + print(' // -> UNSUPPORTED') + return + + printf_fmt_str = '"' + ', '.join(printf_fmts) + '"' + if printf_args: + printf_arg_str = ', ' + ','.join(printf_args) + else: + printf_arg_str = '' + + print('\n'.join(loops)) + print(f''' + clear_exc_flags(); + {ret_ty} orig_res = {convert_pre}orig_{name}({", ".join(args)}){convert_post}; + int orig_exc = get_exc_flags(); + clear_exc_flags(); + {ret_ty} new_res = {convert_pre}new_{name}({", ".join(args)}){convert_post}; + int new_exc = get_exc_flags(); + check_mismatch("{name}", orig_exc, new_exc, &orig_res, &new_res, sizeof(orig_res), {printf_fmt_str}{printf_arg_str}); +''') + print(' }\n' * len(loops)) + print() + if condition: + print('#endif') + + do_mmx(fn) + + +parser = argparse.ArgumentParser(description='Generate mmx test code.') +parser.add_argument('--kind', choices=['wrapper', 'wrapper_h', 'test']) +parser.add_argument('--wrapper-prefix', default='orig') +parser.add_argument('--use-xmm', action='store_true') + +args = parser.parse_args() +if args.kind == 'wrapper': + define_wrappers(args.wrapper_prefix, use_xmm=args.use_xmm, header=False) +elif args.kind == 'wrapper_h': + define_wrappers(args.wrapper_prefix, use_xmm=args.use_xmm, header=True) +elif args.kind == 'test': + define_tests(use_xmm=args.use_xmm) diff --git a/mmx-tests/test.c b/mmx-tests/test.c new file mode 100644 --- /dev/null +++ b/mmx-tests/test.c @@ -0,0 +1,237 @@ +#include +#include +#include +#include +#include +#include + +#include "gen_orig.h" +#include "gen_new.h" + + +// A bunch of helper functions for the code in gen_test.inc +#define m128_to_m64(x) (__m64)((__v2di)(x))[0] + +#define arraysize(a) (sizeof(a) / sizeof(*a)) + +static void dump_mem(void *ptr, int nbytes) { + for (int i = 0; i < nbytes; ++i) { + printf(" %02x", ((unsigned char*)ptr)[i]); + } + printf("\n"); +} + +static int get_exc_flags() { + return fetestexcept(FE_ALL_EXCEPT | __FE_DENORM); +} + +static void clear_exc_flags() { + feclearexcept(FE_ALL_EXCEPT | __FE_DENORM); +} + +static void dump_exc_flags(int exc_flags) { + printf("%x", exc_flags); + if (exc_flags & FE_INEXACT) + printf(" inexact"); + if (exc_flags & FE_DIVBYZERO) + printf(" divbyzero"); + if (exc_flags & FE_UNDERFLOW) + printf(" underflow"); + if (exc_flags & FE_OVERFLOW) + printf(" overflow"); + if (exc_flags & FE_INVALID) + printf(" invalid"); + if (exc_flags & __FE_DENORM) + printf(" denormal"); +} + +static void dump_result(int orig_exc, int new_exc, void *orig_data, void *new_data, int nbytes) { + printf(" orig_exc = "); + dump_exc_flags(orig_exc); + printf(" new_exc = "); + dump_exc_flags(new_exc); + printf("\n"); + printf(" orig"); + dump_mem(orig_data, nbytes); + printf(" new "); + dump_mem(new_data, nbytes); +} + +static void check_mismatch(const char *name, int orig_exc, int new_exc, + void *orig_data, void *new_data, int nbytes, + const char *printf_fmt, ...) 
{ + if (orig_exc != new_exc || memcmp(orig_data, new_data, nbytes)) { + va_list args; + va_start(args, printf_fmt); + printf("mismatch %s(", name); + vprintf(printf_fmt, args); + printf("):\n"); + dump_result(orig_exc, new_exc, orig_data, new_data, nbytes); + va_end(args); + } +} + +unsigned short short_vals[] = { + 0x0000, + 0x0001, + 0xffee, + 0xffff, +}; + +unsigned long long padding_mmx_vals[] = { + 0x0000000000000000LL, + 0xffffffffffffffffLL, + 0x7fc000007fc00000LL, // float nan nan + 0xfff8000000000000LL, // -nan +}; + +unsigned long long mmx_vals[] = { + 0x0000000000000000LL, + 0x0000000000000001LL, + 0x0000000000000002LL, + 0x0000000000000003LL, + 0x0000000000000004LL, + 0x0000000000000005LL, + 0x0000000000000006LL, + 0x0000000000000007LL, + 0x0000000000000008LL, + 0x0000000000000009LL, + 0x000000000000000aLL, + 0x000000000000000bLL, + 0x000000000000000cLL, + 0x000000000000000dLL, + 0x000000000000000eLL, + 0x000000000000000fLL, + 0x0000000000000100LL, + 0x0000000000010000LL, + 0x0000000001000000LL, + 0x0000000100000000LL, + 0x0000010000000000LL, + 0x0001000000000000LL, + 0x0100000000000000LL, + 0x0101010101010101LL, + 0x0102030405060708LL, + 0x1234567890abcdefLL, + 0x007f007f007f007fLL, + 0x7f007f007f007f00LL, + 0x7f7f7f7f7f7f7f7fLL, + 0x8000800080008000LL, + 0x0080008000800080LL, + 0x8080808080808080LL, + 0x7fff7fff7fff7fffLL, + 0x8000800080008000LL, + 0x7fffffff7fffffffLL, + 0x8000000080000000LL, + 0x0000777700006666LL, + 0x7777000066660000LL, + 0x0000ffff0000eeeeLL, + 0xffff0000eeee0000LL, + 0x7700660055004400LL, + 0x0077006600550044LL, + 0xff00ee00dd00cc00LL, + 0x00ff00ee00dd00ccLL, + 0xffffffffffffffffLL, + 0x3ff0000000000000LL, // 1.0 + 0x3ff8000000000000LL, // 1.5 + 0x4000000000000000LL, // 2.0 + 0x3f8000003fc00000LL, // float 1.0 1.5 + 0x3fc0000040000000LL, // float 1.5 2.0 + 0x7ff0000000000000LL, // inf + 0x7f8000007f800000LL, // float inf inf + 0xfff0000000000000LL, // -inf + 0xff800000ff800000LL, // float -inf -inf + 0x7ff8000000000000LL, // nan + 0x7fc000007fc00000LL, // float nan nan + 0xfff8000000000000LL, // -nan + 0xffc00000ffc00000LL, // float -nan -nan +}; + +struct __attribute__((aligned(sizeof(__m128)))) Mem { + __m64 dummy; + __m64 m64; +} mem, mem2; + +// These 3 could be autogenerated...but I didn't add support for stores to the generator. 
+void test_stores() {
+  // void _mm_storeh_pi(__m64 * arg_0, __m128 arg_1);
+  for(int l0 = 0; l0 < arraysize(mmx_vals); ++l0) {
+    for(int l1 = 0; l1 < arraysize(mmx_vals); ++l1) {
+      clear_exc_flags();
+      orig__mm_storeh_pi(&mem.m64, (__m128)(__m128i){mmx_vals[l0], mmx_vals[l1]});
+      int orig_exc = get_exc_flags();
+      clear_exc_flags();
+      new__mm_storeh_pi(&mem2.m64, (__m128)(__m128i){mmx_vals[l0], mmx_vals[l1]});
+      int new_exc = get_exc_flags();
+      check_mismatch("_mm_storeh_pi", orig_exc, new_exc, &mem.m64, &mem2.m64, sizeof(__m64),
+                     "&mem.m64, (__m128i){%016llx, %016llx},", mmx_vals[l0], mmx_vals[l1]);
+    }
+  }
+
+  // void _mm_storel_pi(__m64 * arg_0, __m128 arg_1);
+  for(int l0 = 0; l0 < arraysize(mmx_vals); ++l0) {
+    for(int l1 = 0; l1 < arraysize(mmx_vals); ++l1) {
+      clear_exc_flags();
+      orig__mm_storel_pi(&mem.m64, (__m128)(__m128i){mmx_vals[l0], mmx_vals[l1]});
+      int orig_exc = get_exc_flags();
+      clear_exc_flags();
+      new__mm_storel_pi(&mem2.m64, (__m128)(__m128i){mmx_vals[l0], mmx_vals[l1]});
+      int new_exc = get_exc_flags();
+      check_mismatch("_mm_storel_pi", orig_exc, new_exc, &mem.m64, &mem2.m64, sizeof(__m64),
+                     "&mem.m64, (__m128i){%016llx, %016llx},", mmx_vals[l0], mmx_vals[l1]);
+    }
+  }
+
+  // void _mm_stream_pi(__m64 * arg_0, __m64 arg_1);
+  for(int l0 = 0; l0 < arraysize(mmx_vals); ++l0) {
+    clear_exc_flags();
+    orig__mm_stream_pi(&mem.m64, (__m64)mmx_vals[l0]);
+    int orig_exc = get_exc_flags();
+    clear_exc_flags();
+    new__mm_stream_pi(&mem2.m64, (__m64)mmx_vals[l0]);
+    int new_exc = get_exc_flags();
+    check_mismatch("_mm_stream_pi", orig_exc, new_exc, &mem.m64, &mem2.m64, sizeof(__m64),
+                   "&mem.m64, %016llx,", mmx_vals[l0]);
+  }
+}
+
+// Test that the nominally 64-bit maskmove doesn't trap at the edges of
+// non-writable memory, despite being implemented by a 128-bit write.
+void test_maskmove() {
+  // Create a page of memory with an inaccessible page on either side.
+  char *map = mmap(0, 3 * 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+  if (map == MAP_FAILED)
+    abort();
+  if (mprotect(map, 4096, PROT_NONE))
+    abort();
+  if (mprotect(map + 4096 * 2, 4096, PROT_NONE))
+    abort();
+
+  long long init_val = 0xffeeddccbbaa9900;
+  long long expected = 0x11ee3344bb669900;
+  for (int offset = 0; offset < 16+9; ++offset) {
+    char *copy_location = map + 4096 + (offset > 16 ? 4096 - 32 + offset : offset);
+    memcpy(copy_location, &init_val, 8);
+    new__mm_maskmove_si64((__m64)0x1122334455667788LL, (__m64)0x8000808000800000, copy_location);
+    long long result;
+    memcpy(&result, copy_location, 8);
+    if (memcmp(&expected, &result, 8) != 0) {
+      printf("test_maskmove: wrong value was stored %llx vs %llx\n", result, expected);
+      return;
+    }
+  }
+}
+
+void test_generated() {
+  #include "gen_test.inc"
+}
+
+int main() {
+  int rounding[] = {FE_TONEAREST, FE_UPWARD, FE_DOWNWARD, FE_TOWARDZERO};
+  for (int i = 0; i < 4; ++i)
+  {
+    fesetround(rounding[i]);
+
+    test_maskmove();
+    test_stores();
+    test_generated();
+  }
+}
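For reference, gen_test.inc (pulled in by test_generated() above) is one loop nest per intrinsic, emitted by define_tests() in mmx-tests.py. Roughly what the generated block for _mm_add_pi8 looks like without --use-xmm, relying on the mmx_vals table and helpers defined in test.c; the exact formatting comes from the generator:

  // __m64 _mm_add_pi8(__m64 arg_0, __m64 arg_1);
  for (int l0 = 0; l0 < arraysize(mmx_vals); ++l0) {
    for (int l1 = 0; l1 < arraysize(mmx_vals); ++l1) {
      clear_exc_flags();
      __m64 orig_res = orig__mm_add_pi8((__m64)mmx_vals[l0], (__m64)mmx_vals[l1]);
      int orig_exc = get_exc_flags();
      clear_exc_flags();
      __m64 new_res = new__mm_add_pi8((__m64)mmx_vals[l0], (__m64)mmx_vals[l1]);
      int new_exc = get_exc_flags();
      check_mismatch("_mm_add_pi8", orig_exc, new_exc, &orig_res, &new_res, sizeof(orig_res),
                     "%016llx, %016llx", mmx_vals[l0], mmx_vals[l1]);
    }
  }

Each intrinsic is evaluated through both the old-compiler and new-compiler wrappers, and any difference in the result bytes or in the raised FP exception flags is reported by check_mismatch().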