diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -157,8 +157,8 @@
TARGET_BUILTIN(__builtin_ia32_pmulhuw, "V4sV4sV4s", "ncV:64:", "mmx,sse")
TARGET_BUILTIN(__builtin_ia32_psadbw, "V4sV8cV8c", "ncV:64:", "mmx,sse")
TARGET_BUILTIN(__builtin_ia32_pshufw, "V4sV4sIc", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_vec_ext_v4hi, "iV4sIi", "ncV:64:", "mmx,sse")
-TARGET_BUILTIN(__builtin_ia32_vec_set_v4hi, "V4sV4siIi", "ncV:64:", "mmx,sse")
+TARGET_BUILTIN(__builtin_ia32_vec_ext_v4hi, "sV4sIi", "ncV:64:", "sse")
+TARGET_BUILTIN(__builtin_ia32_vec_set_v4hi, "V4sV4ssIi", "ncV:64:", "sse")
// MMX+SSE2
TARGET_BUILTIN(__builtin_ia32_cvtpd2pi, "V2iV2d", "ncV:64:", "mmx,sse2")
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -12097,6 +12097,7 @@
case X86::BI__builtin_ia32_vec_init_v2si:
return Builder.CreateBitCast(BuildVector(Ops),
llvm::Type::getX86_MMXTy(getLLVMContext()));
+ case X86::BI__builtin_ia32_vec_ext_v4hi:
case X86::BI__builtin_ia32_vec_ext_v2si:
case X86::BI__builtin_ia32_vec_ext_v16qi:
case X86::BI__builtin_ia32_vec_ext_v8hi:
@@ -12115,6 +12116,7 @@
// Otherwise we could just do this in the header file.
return Builder.CreateExtractElement(Ops[0], Index);
}
+ case X86::BI__builtin_ia32_vec_set_v4hi:
case X86::BI__builtin_ia32_vec_set_v16qi:
case X86::BI__builtin_ia32_vec_set_v8hi:
case X86::BI__builtin_ia32_vec_set_v4si:
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -35,7 +35,9 @@
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse2"), __min_vector_width__(64)))
+
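+/* Helpers for implementing the old 64-bit (MMX) intrinsics on top of SSE2:
+   __trunc64 returns the low 64 bits of a 128-bit vector as an __m64, and
+   __anyext128 places an __m64 in the low half of a 128-bit vector, leaving
+   the upper half undefined. */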
+#define __trunc64(x) (__m64)__builtin_shufflevector((__v2di)(x), __extension__ (__v2di){}, 0)
+#define __anyext128(x) (__m128i)__builtin_shufflevector((__v2si)(x), __extension__ (__v2si){}, 0, 1, -1, -1)
/// Adds lower double-precision values in both operands and returns the
/// sum in the lower 64 bits of the result. The upper 64 bits of the result
@@ -1504,10 +1506,10 @@
/// \param __a
/// A 128-bit vector of [2 x double].
/// \returns A 64-bit vector of [2 x i32] containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_cvtpd_pi32(__m128d __a)
{
- return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
+ return __trunc64(__builtin_ia32_cvtpd2dq((__v2df)__a));
}
/// Converts the two double-precision floating-point elements of a
@@ -1524,10 +1526,10 @@
/// \param __a
/// A 128-bit vector of [2 x double].
/// \returns A 64-bit vector of [2 x i32] containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_cvttpd_pi32(__m128d __a)
{
- return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
+ return __trunc64(__builtin_ia32_cvttpd2dq((__v2df)__a));
}
/// Converts the two signed 32-bit integer elements of a 64-bit vector of
@@ -1541,10 +1543,10 @@
/// \param __a
/// A 64-bit vector of [2 x i32].
/// \returns A 128-bit vector of [2 x double] containing the converted values.
-static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtpi32_pd(__m64 __a)
{
- return __builtin_ia32_cvtpi2pd((__v2si)__a);
+ return (__m128d) __builtin_convertvector((__v2si)__a, __v2df);
}
/// Returns the low-order element of a 128-bit vector of [2 x double] as
@@ -2175,10 +2177,10 @@
/// \param __b
/// A 64-bit integer.
/// \returns A 64-bit integer containing the sum of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_add_si64(__m64 __a, __m64 __b)
{
- return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
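+  // Reinterpret each __m64 as a plain 64-bit integer; unsigned arithmetic
+  // gives the same wraparound behavior as the old paddq builtin without
+  // signed-overflow undefined behavior.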
+ return (__m64)(((unsigned long long)__a) + ((unsigned long long)__b));
}
/// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
@@ -2507,10 +2509,11 @@
/// \param __b
/// A 64-bit integer containing one of the source operands.
/// \returns A 64-bit integer vector containing the product of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_mul_su32(__m64 __a, __m64 __b)
{
- return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
+ return __trunc64(__builtin_ia32_pmuludq128((__v4si)__anyext128(__a),
+ (__v4si)__anyext128(__b)));
}
/// Multiplies 32-bit unsigned integer values contained in the lower
@@ -2621,10 +2624,10 @@
/// A 64-bit integer vector containing the subtrahend.
/// \returns A 64-bit integer vector containing the difference of the values in
/// the operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_sub_si64(__m64 __a, __m64 __b)
{
- return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
+ return (__m64)((unsigned long long)__a - (unsigned long long)__b);
}
/// Subtracts the corresponding elements of two [2 x i64] vectors.
@@ -4965,8 +4968,10 @@
#if defined(__cplusplus)
} // extern "C"
#endif
+
+#undef __anyext128
+#undef __trunc64
#undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS_MMX
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
diff --git a/clang/lib/Headers/mmintrin.h b/clang/lib/Headers/mmintrin.h
--- a/clang/lib/Headers/mmintrin.h
+++ b/clang/lib/Headers/mmintrin.h
@@ -17,8 +17,29 @@
typedef short __v4hi __attribute__((__vector_size__(8)));
typedef char __v8qi __attribute__((__vector_size__(8)));
+/* Unsigned types */
+typedef unsigned long long __v1du __attribute__((__vector_size__(8)));
+typedef unsigned int __v2su __attribute__((__vector_size__(8)));
+typedef unsigned short __v4hu __attribute__((__vector_size__(8)));
+typedef unsigned char __v8qu __attribute__((__vector_size__(8)));
+
+/* We need an explicitly signed variant for char. Note that this shouldn't
+ * appear in the interface though. */
+typedef signed char __v8qs __attribute__((__vector_size__(8)));
+
+/* SSE/SSE2 types */
+typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));
+typedef long long __v2di __attribute__((__vector_size__(16)));
+typedef int __v4si __attribute__((__vector_size__(16)));
+typedef short __v8hi __attribute__((__vector_size__(16)));
+typedef char __v16qi __attribute__((__vector_size__(16)));
+
/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mmx"), __min_vector_width__(64)))
+#define __DEFAULT_FN_ATTRS_SSE2 __attribute__((__always_inline__, __nodebug__, __target__("sse2"), __min_vector_width__(64)))
+
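+/* __trunc64 and __anyext128 move a value between an __m64 and the low half of
+   a 128-bit vector (the upper half is undefined on the way in). __extract2_32
+   keeps doublewords 0 and 2 of a 128-bit result, which is where the 128-bit
+   pack and horizontal instructions leave the two useful 32-bit chunks when
+   their inputs are widened __m64 values. */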
+#define __trunc64(x) (__m64)__builtin_shufflevector((__v2di)(x), __extension__ (__v2di){}, 0)
+#define __anyext128(x) (__m128i)__builtin_shufflevector((__v2si)(x), __extension__ (__v2si){}, 0, 1, -1, -1)
+#define __extract2_32(a) (__m64)__builtin_shufflevector((__v4si)(a), __extension__ (__v4si){}, 0, 2)
/// Clears the MMX state by setting the state of the x87 stack registers
/// to empty.
@@ -44,10 +65,10 @@
/// A 32-bit integer value.
/// \returns A 64-bit integer vector. The lower 32 bits contain the value of the
/// parameter. The upper 32 bits are set to 0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtsi32_si64(int __i)
{
- return (__m64)__builtin_ia32_vec_init_v2si(__i, 0);
+ return __extension__ (__m64)(__v2si){__i, 0};
}
/// Returns the lower 32 bits of a 64-bit integer vector as a 32-bit
@@ -61,10 +82,10 @@
/// A 64-bit integer vector.
/// \returns A 32-bit signed integer value containing the lower 32 bits of the
/// parameter.
-static __inline__ int __DEFAULT_FN_ATTRS
+static __inline__ int __DEFAULT_FN_ATTRS_SSE2
_mm_cvtsi64_si32(__m64 __m)
{
- return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0);
+ return ((__v2si)__m)[0];
}
/// Casts a 64-bit signed integer value into a 64-bit integer vector.
@@ -77,7 +98,7 @@
/// A 64-bit signed integer.
/// \returns A 64-bit integer vector containing the same bitwise pattern as the
/// parameter.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtsi64_m64(long long __i)
{
return (__m64)__i;
@@ -93,7 +114,7 @@
/// A 64-bit integer vector.
/// \returns A 64-bit signed integer containing the same bitwise pattern as the
/// parameter.
-static __inline__ long long __DEFAULT_FN_ATTRS
+static __inline__ long long __DEFAULT_FN_ATTRS_SSE2
_mm_cvtm64_si64(__m64 __m)
{
return (long long)__m;
@@ -123,10 +144,11 @@
/// [4 x i8] values are written to the upper 32 bits of the result.
/// \returns A 64-bit integer vector of [8 x i8] containing the converted
/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_packs_pi16(__m64 __m1, __m64 __m2)
{
- return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2);
+ return __extract2_32(__builtin_ia32_packsswb128((__v8hi)__anyext128(__m1),
+ (__v8hi)__anyext128(__m2)));
}
/// Converts 32-bit signed integers from both 64-bit integer vector
@@ -153,10 +175,11 @@
/// [2 x i16] values are written to the upper 32 bits of the result.
/// \returns A 64-bit integer vector of [4 x i16] containing the converted
/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_packs_pi32(__m64 __m1, __m64 __m2)
{
- return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2);
+ return __extract2_32(__builtin_ia32_packssdw128((__v4si)__anyext128(__m1),
+ (__v4si)__anyext128(__m2)));
}
/// Converts 16-bit signed integers from both 64-bit integer vector
@@ -183,10 +206,11 @@
/// the upper 32 bits of the result.
/// \returns A 64-bit integer vector of [8 x i8] containing the converted
/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_packs_pu16(__m64 __m1, __m64 __m2)
{
- return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2);
+ return __extract2_32(__builtin_ia32_packuswb128((__v8hi)__anyext128(__m1),
+ (__v8hi)__anyext128(__m2)));
}
/// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8]
@@ -210,10 +234,11 @@
/// Bits [63:56] are written to bits [63:56] of the result.
/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
{
- return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2);
+ return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2,
+ 4, 12, 5, 13, 6, 14, 7, 15);
}
/// Unpacks the upper 32 bits from two 64-bit integer vectors of
@@ -233,10 +258,11 @@
/// Bits [63:48] are written to bits [63:48] of the result.
/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
{
- return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2);
+ return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2,
+ 2, 6, 3, 7);
}
/// Unpacks the upper 32 bits from two 64-bit integer vectors of
@@ -254,10 +280,10 @@
/// the upper 32 bits of the result.
/// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
{
- return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2);
+ return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 1, 3);
}
/// Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8]
@@ -281,10 +307,11 @@
/// Bits [31:24] are written to bits [63:56] of the result.
/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
{
- return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2);
+ return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2,
+ 0, 8, 1, 9, 2, 10, 3, 11);
}
/// Unpacks the lower 32 bits from two 64-bit integer vectors of
@@ -304,10 +331,11 @@
/// Bits [31:16] are written to bits [63:48] of the result.
/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
{
- return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2);
+ return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2,
+ 0, 4, 1, 5);
}
/// Unpacks the lower 32 bits from two 64-bit integer vectors of
@@ -325,10 +353,10 @@
/// the upper 32 bits of the result.
/// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
{
- return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2);
+ return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 0, 2);
}
/// Adds each 8-bit integer element of the first 64-bit integer vector
@@ -346,10 +374,10 @@
/// A 64-bit integer vector of [8 x i8].
/// \returns A 64-bit integer vector of [8 x i8] containing the sums of both
/// parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_add_pi8(__m64 __m1, __m64 __m2)
{
- return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2);
+ return (__m64)(((__v8qu)__m1) + ((__v8qu)__m2));
}
/// Adds each 16-bit integer element of the first 64-bit integer vector
@@ -367,10 +395,10 @@
/// A 64-bit integer vector of [4 x i16].
/// \returns A 64-bit integer vector of [4 x i16] containing the sums of both
/// parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_add_pi16(__m64 __m1, __m64 __m2)
{
- return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2);
+ return (__m64)(((__v4hu)__m1) + ((__v4hu)__m2));
}
/// Adds each 32-bit integer element of the first 64-bit integer vector
@@ -388,10 +416,10 @@
/// A 64-bit integer vector of [2 x i32].
/// \returns A 64-bit integer vector of [2 x i32] containing the sums of both
/// parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_add_pi32(__m64 __m1, __m64 __m2)
{
- return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2);
+ return (__m64)(((__v2su)__m1) + ((__v2su)__m2));
}
/// Adds each 8-bit signed integer element of the first 64-bit integer
@@ -410,10 +438,11 @@
/// A 64-bit integer vector of [8 x i8].
/// \returns A 64-bit integer vector of [8 x i8] containing the saturated sums
/// of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_adds_pi8(__m64 __m1, __m64 __m2)
{
- return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2);
+ return __trunc64(__builtin_ia32_paddsb128((__v16qi)__anyext128(__m1),
+ (__v16qi)__anyext128(__m2)));
}
/// Adds each 16-bit signed integer element of the first 64-bit integer
@@ -433,10 +462,11 @@
/// A 64-bit integer vector of [4 x i16].
/// \returns A 64-bit integer vector of [4 x i16] containing the saturated sums
/// of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_adds_pi16(__m64 __m1, __m64 __m2)
{
- return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);
+ return __trunc64(__builtin_ia32_paddsw128((__v8hi)__anyext128(__m1),
+ (__v8hi)__anyext128(__m2)));
}
/// Adds each 8-bit unsigned integer element of the first 64-bit integer
@@ -455,10 +485,11 @@
/// A 64-bit integer vector of [8 x i8].
/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
/// unsigned sums of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_adds_pu8(__m64 __m1, __m64 __m2)
{
- return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2);
+ return __trunc64(__builtin_ia32_paddusb128((__v16qi)__anyext128(__m1),
+ (__v16qi)__anyext128(__m2)));
}
/// Adds each 16-bit unsigned integer element of the first 64-bit integer
@@ -477,10 +508,11 @@
/// A 64-bit integer vector of [4 x i16].
/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
/// unsigned sums of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_adds_pu16(__m64 __m1, __m64 __m2)
{
- return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2);
+ return __trunc64(__builtin_ia32_paddusw128((__v8hi)__anyext128(__m1),
+ (__v8hi)__anyext128(__m2)));
}
/// Subtracts each 8-bit integer element of the second 64-bit integer
@@ -498,10 +530,10 @@
/// A 64-bit integer vector of [8 x i8] containing the subtrahends.
/// \returns A 64-bit integer vector of [8 x i8] containing the differences of
/// both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sub_pi8(__m64 __m1, __m64 __m2)
{
- return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2);
+ return (__m64)(((__v8qu)__m1) - ((__v8qu)__m2));
}
/// Subtracts each 16-bit integer element of the second 64-bit integer
@@ -519,10 +551,10 @@
/// A 64-bit integer vector of [4 x i16] containing the subtrahends.
/// \returns A 64-bit integer vector of [4 x i16] containing the differences of
/// both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sub_pi16(__m64 __m1, __m64 __m2)
{
- return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2);
+ return (__m64)(((__v4hu)__m1) - ((__v4hu)__m2));
}
/// Subtracts each 32-bit integer element of the second 64-bit integer
@@ -540,10 +572,10 @@
/// A 64-bit integer vector of [2 x i32] containing the subtrahends.
/// \returns A 64-bit integer vector of [2 x i32] containing the differences of
/// both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sub_pi32(__m64 __m1, __m64 __m2)
{
- return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2);
+ return (__m64)(((__v2su)__m1) - ((__v2su)__m2));
}
/// Subtracts each 8-bit signed integer element of the second 64-bit
@@ -563,10 +595,11 @@
/// A 64-bit integer vector of [8 x i8] containing the subtrahends.
/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
/// differences of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_subs_pi8(__m64 __m1, __m64 __m2)
{
- return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2);
+ return __trunc64(__builtin_ia32_psubsb128((__v16qi)__anyext128(__m1),
+ (__v16qi)__anyext128(__m2)));
}
/// Subtracts each 16-bit signed integer element of the second 64-bit
@@ -586,10 +619,11 @@
/// A 64-bit integer vector of [4 x i16] containing the subtrahends.
/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
/// differences of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_subs_pi16(__m64 __m1, __m64 __m2)
{
- return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2);
+ return __trunc64(__builtin_ia32_psubsw128((__v8hi)__anyext128(__m1),
+ (__v8hi)__anyext128(__m2)));
}
/// Subtracts each 8-bit unsigned integer element of the second 64-bit
@@ -610,10 +644,11 @@
/// A 64-bit integer vector of [8 x i8] containing the subtrahends.
/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
/// differences of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_subs_pu8(__m64 __m1, __m64 __m2)
{
- return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2);
+ return __trunc64(__builtin_ia32_psubusb128((__v16qi)__anyext128(__m1),
+ (__v16qi)__anyext128(__m2)));
}
/// Subtracts each 16-bit unsigned integer element of the second 64-bit
@@ -634,10 +669,11 @@
/// A 64-bit integer vector of [4 x i16] containing the subtrahends.
/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
/// differences of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_subs_pu16(__m64 __m1, __m64 __m2)
{
- return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2);
+ return __trunc64(__builtin_ia32_psubusw128((__v8hi)__anyext128(__m1),
+ (__v8hi)__anyext128(__m2)));
}
/// Multiplies each 16-bit signed integer element of the first 64-bit
@@ -661,10 +697,11 @@
/// A 64-bit integer vector of [4 x i16].
/// \returns A 64-bit integer vector of [2 x i32] containing the sums of
/// products of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_madd_pi16(__m64 __m1, __m64 __m2)
{
- return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2);
+ return __trunc64(__builtin_ia32_pmaddwd128((__v8hi)__anyext128(__m1),
+ (__v8hi)__anyext128(__m2)));
}
/// Multiplies each 16-bit signed integer element of the first 64-bit
@@ -682,10 +719,11 @@
/// A 64-bit integer vector of [4 x i16].
/// \returns A 64-bit integer vector of [4 x i16] containing the upper 16 bits
/// of the products of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_mulhi_pi16(__m64 __m1, __m64 __m2)
{
- return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2);
+ return __trunc64(__builtin_ia32_pmulhw128((__v8hi)__anyext128(__m1),
+ (__v8hi)__anyext128(__m2)));
}
/// Multiplies each 16-bit signed integer element of the first 64-bit
@@ -703,10 +741,10 @@
/// A 64-bit integer vector of [4 x i16].
/// \returns A 64-bit integer vector of [4 x i16] containing the lower 16 bits
/// of the products of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_mullo_pi16(__m64 __m1, __m64 __m2)
{
- return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2);
+ return (__m64)(((__v4hu)__m1) * ((__v4hu)__m2));
}
/// Left-shifts each 16-bit signed integer element of the first
@@ -726,10 +764,11 @@
/// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
/// values. If \a __count is greater or equal to 16, the result is set to all
/// 0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sll_pi16(__m64 __m, __m64 __count)
{
- return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count);
+ return __trunc64(__builtin_ia32_psllw128((__v8hi)__anyext128(__m),
+ (__v8hi)__anyext128(__count)));
}
/// Left-shifts each 16-bit signed integer element of a 64-bit integer
@@ -748,10 +787,11 @@
/// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
/// values. If \a __count is greater or equal to 16, the result is set to all
/// 0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_slli_pi16(__m64 __m, int __count)
{
- return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count);
+ return __trunc64(__builtin_ia32_psllwi128((__v8hi)__anyext128(__m),
+ __count));
}
/// Left-shifts each 32-bit signed integer element of the first
@@ -771,10 +811,11 @@
/// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
/// values. If \a __count is greater or equal to 32, the result is set to all
/// 0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sll_pi32(__m64 __m, __m64 __count)
{
- return (__m64)__builtin_ia32_pslld((__v2si)__m, __count);
+ return __trunc64(__builtin_ia32_pslld128((__v4si)__anyext128(__m),
+ (__v4si)__anyext128(__count)));
}
/// Left-shifts each 32-bit signed integer element of a 64-bit integer
@@ -793,10 +834,11 @@
/// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
/// values. If \a __count is greater or equal to 32, the result is set to all
/// 0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_slli_pi32(__m64 __m, int __count)
{
- return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count);
+ return __trunc64(__builtin_ia32_pslldi128((__v4si)__anyext128(__m),
+ __count));
}
/// Left-shifts the first 64-bit integer parameter by the number of bits
@@ -813,10 +855,11 @@
/// A 64-bit integer vector interpreted as a single 64-bit integer.
/// \returns A 64-bit integer vector containing the left-shifted value. If
/// \a __count is greater or equal to 64, the result is set to 0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sll_si64(__m64 __m, __m64 __count)
{
- return (__m64)__builtin_ia32_psllq((__v1di)__m, __count);
+ return __trunc64(__builtin_ia32_psllq128((__v2di)__anyext128(__m),
+ __anyext128(__count)));
}
/// Left-shifts the first parameter, which is a 64-bit integer, by the
@@ -833,10 +876,11 @@
/// A 32-bit integer value.
/// \returns A 64-bit integer vector containing the left-shifted value. If
/// \a __count is greater or equal to 64, the result is set to 0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_slli_si64(__m64 __m, int __count)
{
- return (__m64)__builtin_ia32_psllqi((__v1di)__m, __count);
+ return __trunc64(__builtin_ia32_psllqi128((__v2di)__anyext128(__m),
+ __count));
}
/// Right-shifts each 16-bit integer element of the first parameter,
@@ -857,10 +901,11 @@
/// A 64-bit integer vector interpreted as a single 64-bit integer.
/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sra_pi16(__m64 __m, __m64 __count)
{
- return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count);
+ return __trunc64(__builtin_ia32_psraw128((__v8hi)__anyext128(__m),
+ (__v8hi)__anyext128(__count)));
}
/// Right-shifts each 16-bit integer element of a 64-bit integer vector
@@ -880,10 +925,11 @@
/// A 32-bit integer value.
/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srai_pi16(__m64 __m, int __count)
{
- return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count);
+ return __trunc64(__builtin_ia32_psrawi128((__v8hi)__anyext128(__m),
+ __count));
}
/// Right-shifts each 32-bit integer element of the first parameter,
@@ -904,10 +950,11 @@
/// A 64-bit integer vector interpreted as a single 64-bit integer.
/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sra_pi32(__m64 __m, __m64 __count)
{
- return (__m64)__builtin_ia32_psrad((__v2si)__m, __count);
+ return __trunc64(__builtin_ia32_psrad128((__v4si)__anyext128(__m),
+ (__v4si)__anyext128(__count)));
}
/// Right-shifts each 32-bit integer element of a 64-bit integer vector
@@ -927,10 +974,11 @@
/// A 32-bit integer value.
/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srai_pi32(__m64 __m, int __count)
{
- return (__m64)__builtin_ia32_psradi((__v2si)__m, __count);
+ return __trunc64(__builtin_ia32_psradi128((__v4si)__anyext128(__m),
+ __count));
}
/// Right-shifts each 16-bit integer element of the first parameter,
@@ -950,10 +998,11 @@
/// A 64-bit integer vector interpreted as a single 64-bit integer.
/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srl_pi16(__m64 __m, __m64 __count)
{
- return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count);
+ return __trunc64(__builtin_ia32_psrlw128((__v8hi)__anyext128(__m),
+ (__v8hi)__anyext128(__count)));
}
/// Right-shifts each 16-bit integer element of a 64-bit integer vector
@@ -972,10 +1021,11 @@
/// A 32-bit integer value.
/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srli_pi16(__m64 __m, int __count)
{
- return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count);
+ return __trunc64(__builtin_ia32_psrlwi128((__v8hi)__anyext128(__m),
+ __count));
}
/// Right-shifts each 32-bit integer element of the first parameter,
@@ -995,10 +1045,11 @@
/// A 64-bit integer vector interpreted as a single 64-bit integer.
/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srl_pi32(__m64 __m, __m64 __count)
{
- return (__m64)__builtin_ia32_psrld((__v2si)__m, __count);
+ return __trunc64(__builtin_ia32_psrld128((__v4si)__anyext128(__m),
+ (__v4si)__anyext128(__count)));
}
/// Right-shifts each 32-bit integer element of a 64-bit integer vector
@@ -1017,10 +1068,11 @@
/// A 32-bit integer value.
/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srli_pi32(__m64 __m, int __count)
{
- return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count);
+ return __trunc64(__builtin_ia32_psrldi128((__v4si)__anyext128(__m),
+ __count));
}
/// Right-shifts the first 64-bit integer parameter by the number of bits
@@ -1037,10 +1089,11 @@
/// \param __count
/// A 64-bit integer vector interpreted as a single 64-bit integer.
/// \returns A 64-bit integer vector containing the right-shifted value.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srl_si64(__m64 __m, __m64 __count)
{
- return (__m64)__builtin_ia32_psrlq((__v1di)__m, __count);
+ return __trunc64(__builtin_ia32_psrlq128((__v2di)__anyext128(__m),
+ __anyext128(__count)));
}
/// Right-shifts the first parameter, which is a 64-bit integer, by the
@@ -1058,10 +1111,11 @@
/// \param __count
/// A 32-bit integer value.
/// \returns A 64-bit integer vector containing the right-shifted value.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_srli_si64(__m64 __m, int __count)
{
- return (__m64)__builtin_ia32_psrlqi((__v1di)__m, __count);
+ return __trunc64(__builtin_ia32_psrlqi128((__v2di)__anyext128(__m),
+ __count));
}
/// Performs a bitwise AND of two 64-bit integer vectors.
@@ -1076,10 +1130,10 @@
/// A 64-bit integer vector.
/// \returns A 64-bit integer vector containing the bitwise AND of both
/// parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_and_si64(__m64 __m1, __m64 __m2)
{
- return __builtin_ia32_pand((__v1di)__m1, (__v1di)__m2);
+ return (__m64)(((__v1du)__m1) & ((__v1du)__m2));
}
/// Performs a bitwise NOT of the first 64-bit integer vector, and then
@@ -1097,10 +1151,10 @@
/// A 64-bit integer vector.
/// \returns A 64-bit integer vector containing the bitwise AND of the second
/// parameter and the one's complement of the first parameter.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_andnot_si64(__m64 __m1, __m64 __m2)
{
- return __builtin_ia32_pandn((__v1di)__m1, (__v1di)__m2);
+ return (__m64)(~((__v1du)__m1) & ((__v1du)__m2));
}
/// Performs a bitwise OR of two 64-bit integer vectors.
@@ -1115,10 +1169,10 @@
/// A 64-bit integer vector.
/// \returns A 64-bit integer vector containing the bitwise OR of both
/// parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_or_si64(__m64 __m1, __m64 __m2)
{
- return __builtin_ia32_por((__v1di)__m1, (__v1di)__m2);
+ return (__m64)(((__v1du)__m1) | ((__v1du)__m2));
}
/// Performs a bitwise exclusive OR of two 64-bit integer vectors.
@@ -1133,10 +1187,10 @@
/// A 64-bit integer vector.
/// \returns A 64-bit integer vector containing the bitwise exclusive OR of both
/// parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_xor_si64(__m64 __m1, __m64 __m2)
{
- return __builtin_ia32_pxor((__v1di)__m1, (__v1di)__m2);
+ return (__m64)(((__v1du)__m1) ^ ((__v1du)__m2));
}
/// Compares the 8-bit integer elements of two 64-bit integer vectors of
@@ -1155,10 +1209,10 @@
/// A 64-bit integer vector of [8 x i8].
/// \returns A 64-bit integer vector of [8 x i8] containing the comparison
/// results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
{
- return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2);
+ return (__m64)(((__v8qi)__m1) == ((__v8qi)__m2));
}
/// Compares the 16-bit integer elements of two 64-bit integer vectors of
@@ -1177,10 +1231,10 @@
/// A 64-bit integer vector of [4 x i16].
/// \returns A 64-bit integer vector of [4 x i16] containing the comparison
/// results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
{
- return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2);
+ return (__m64)(((__v4hi)__m1) == ((__v4hi)__m2));
}
/// Compares the 32-bit integer elements of two 64-bit integer vectors of
@@ -1199,10 +1253,10 @@
/// A 64-bit integer vector of [2 x i32].
/// \returns A 64-bit integer vector of [2 x i32] containing the comparison
/// results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
{
- return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2);
+ return (__m64)(((__v2si)__m1) == ((__v2si)__m2));
}
/// Compares the 8-bit integer elements of two 64-bit integer vectors of
@@ -1221,10 +1275,12 @@
/// A 64-bit integer vector of [8 x i8].
/// \returns A 64-bit integer vector of [8 x i8] containing the comparison
/// results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
{
- return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2);
+ /* This function always performs a signed comparison, but __v8qi is a char
+ which may be signed or unsigned, so use __v8qs. */
+ return (__m64)((__v8qs)__m1 > (__v8qs)__m2);
}
/// Compares the 16-bit integer elements of two 64-bit integer vectors of
@@ -1243,10 +1299,10 @@
/// A 64-bit integer vector of [4 x i16].
/// \returns A 64-bit integer vector of [4 x i16] containing the comparison
/// results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
{
- return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2);
+ return (__m64)((__v4hi)__m1 > (__v4hi)__m2);
}
/// Compares the 32-bit integer elements of two 64-bit integer vectors of
@@ -1265,10 +1321,10 @@
/// A 64-bit integer vector of [2 x i32].
/// \returns A 64-bit integer vector of [2 x i32] containing the comparison
/// results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
{
- return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2);
+ return (__m64)((__v2si)__m1 > (__v2si)__m2);
}
/// Constructs a 64-bit integer vector initialized to zero.
@@ -1278,7 +1334,7 @@
/// This intrinsic corresponds to the PXOR instruction.
///
/// \returns An initialized 64-bit integer vector with all elements set to zero.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_setzero_si64(void)
{
return __extension__ (__m64){ 0LL };
@@ -1299,10 +1355,10 @@
/// A 32-bit integer value used to initialize the lower 32 bits of the
/// result.
/// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_set_pi32(int __i1, int __i0)
{
- return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1);
+ return __extension__ (__m64)(__v2si){__i0, __i1};
}
/// Constructs a 64-bit integer vector initialized with the specified
@@ -1322,10 +1378,10 @@
/// \param __s0
/// A 16-bit integer value used to initialize bits [15:0] of the result.
/// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
{
- return (__m64)__builtin_ia32_vec_init_v4hi(__s0, __s1, __s2, __s3);
+ return __extension__ (__m64)(__v4hi){__s0, __s1, __s2, __s3};
}
/// Constructs a 64-bit integer vector initialized with the specified
@@ -1353,12 +1409,12 @@
/// \param __b0
/// An 8-bit integer value used to initialize bits [7:0] of the result.
/// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
char __b1, char __b0)
{
- return (__m64)__builtin_ia32_vec_init_v8qi(__b0, __b1, __b2, __b3,
- __b4, __b5, __b6, __b7);
+ return __extension__ (__m64)(__v8qi){__b0, __b1, __b2, __b3,
+ __b4, __b5, __b6, __b7};
}
/// Constructs a 64-bit integer vector of [2 x i32], with each of the
@@ -1374,7 +1430,7 @@
/// A 32-bit integer value used to initialize each vector element of the
/// result.
/// \returns An initialized 64-bit integer vector of [2 x i32].
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_set1_pi32(int __i)
{
return _mm_set_pi32(__i, __i);
@@ -1393,7 +1449,7 @@
/// A 16-bit integer value used to initialize each vector element of the
/// result.
/// \returns An initialized 64-bit integer vector of [4 x i16].
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_set1_pi16(short __w)
{
return _mm_set_pi16(__w, __w, __w, __w);
@@ -1411,7 +1467,7 @@
/// An 8-bit integer value used to initialize each vector element of the
/// result.
/// \returns An initialized 64-bit integer vector of [8 x i8].
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_set1_pi8(char __b)
{
return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
@@ -1432,7 +1488,7 @@
/// A 32-bit integer value used to initialize the upper 32 bits of the
/// result.
/// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_setr_pi32(int __i0, int __i1)
{
return _mm_set_pi32(__i1, __i0);
@@ -1455,7 +1511,7 @@
/// \param __w3
/// A 16-bit integer value used to initialize bits [63:48] of the result.
/// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
{
return _mm_set_pi16(__w3, __w2, __w1, __w0);
@@ -1486,14 +1542,17 @@
/// \param __b7
/// An 8-bit integer value used to initialize bits [63:56] of the result.
/// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
char __b6, char __b7)
{
return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}
-#undef __DEFAULT_FN_ATTRS
+#undef __extract2_32
+#undef __anyext128
+#undef __trunc64
+#undef __DEFAULT_FN_ATTRS_SSE2
/* Aliases for compatibility. */
#define _m_empty _mm_empty
diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h
--- a/clang/lib/Headers/tmmintrin.h
+++ b/clang/lib/Headers/tmmintrin.h
@@ -14,7 +14,10 @@
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3"), __min_vector_width__(64)))
-#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,ssse3"), __min_vector_width__(64)))
+
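+/* Same helpers as in mmintrin.h/emmintrin.h: widen an __m64 into the low half
+   of a 128-bit vector, run the SSSE3 form of the instruction, and narrow the
+   result back to 64 bits with __trunc64 or __extract2_32. */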
+#define __trunc64(x) (__m64)__builtin_shufflevector((__v2di)(x), __extension__ (__v2di){}, 0)
+#define __anyext128(x) (__m128i)__builtin_shufflevector((__v2si)(x), __extension__ (__v2si){}, 0, 1, -1, -1)
+#define __extract2_32(a) (__m64)__builtin_shufflevector((__v4si)(a), __extension__ (__v4si){}, 0, 2)
/// Computes the absolute value of each of the packed 8-bit signed
/// integers in the source operand and stores the 8-bit unsigned integer
@@ -28,10 +31,10 @@
/// A 64-bit vector of [8 x i8].
/// \returns A 64-bit integer vector containing the absolute values of the
/// elements in the operand.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_abs_pi8(__m64 __a)
{
- return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
+ return __trunc64(__builtin_ia32_pabsb128((__v16qi)__anyext128(__a)));
}
/// Computes the absolute value of each of the packed 8-bit signed
@@ -64,10 +67,10 @@
/// A 64-bit vector of [4 x i16].
/// \returns A 64-bit integer vector containing the absolute values of the
/// elements in the operand.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_abs_pi16(__m64 __a)
{
- return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
+ return __trunc64(__builtin_ia32_pabsw128((__v8hi)__anyext128(__a)));
}
/// Computes the absolute value of each of the packed 16-bit signed
@@ -100,10 +103,10 @@
/// A 64-bit vector of [2 x i32].
/// \returns A 64-bit integer vector containing the absolute values of the
/// elements in the operand.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_abs_pi32(__m64 __a)
{
- return (__m64)__builtin_ia32_pabsd((__v2si)__a);
+ return __trunc64(__builtin_ia32_pabsd128((__v4si)__anyext128(__a)));
}
/// Computes the absolute value of each of the packed 32-bit signed
@@ -168,7 +171,10 @@
/// \returns A 64-bit integer vector containing the concatenated right-shifted
/// value.
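+// Concatenate the two 64-bit operands into one 128-bit value (a in the high
+// half, b in the low half, as PALIGNR expects), shift right by n bytes, and
+// return the low 64 bits of the result.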
#define _mm_alignr_pi8(a, b, n) \
- (__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n))
+ (__m64)__builtin_shufflevector( \
+ __builtin_ia32_psrldqi128_byteshift( \
+ __builtin_shufflevector((__v1di)(a), (__v1di)(b), 1, 0), \
+ (n)), __extension__ (__v2di){}, 0)
/// Horizontally adds the adjacent pairs of values contained in 2 packed
/// 128-bit vectors of [8 x i16].
@@ -233,10 +239,11 @@
/// destination.
/// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
/// operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_hadd_pi16(__m64 __a, __m64 __b)
{
- return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
+ return __extract2_32(__builtin_ia32_phaddw128((__v8hi)__anyext128(__a),
+ (__v8hi)__anyext128(__b)));
}
/// Horizontally adds the adjacent pairs of values contained in 2 packed
@@ -256,10 +263,11 @@
/// destination.
/// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
/// operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_hadd_pi32(__m64 __a, __m64 __b)
{
- return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
+ return __extract2_32(__builtin_ia32_phaddd128((__v4si)__anyext128(__a),
+ (__v4si)__anyext128(__b)));
}
/// Horizontally adds the adjacent pairs of values contained in 2 packed
@@ -306,10 +314,11 @@
/// destination.
/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
/// sums of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_hadds_pi16(__m64 __a, __m64 __b)
{
- return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
+ return __extract2_32(__builtin_ia32_phaddsw128((__v8hi)__anyext128(__a),
+ (__v8hi)__anyext128(__b)));
}
/// Horizontally subtracts the adjacent pairs of values contained in 2
@@ -375,10 +384,11 @@
/// the destination.
/// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
/// of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_hsub_pi16(__m64 __a, __m64 __b)
{
- return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
+ return __extract2_32(__builtin_ia32_phsubw128((__v8hi)__anyext128(__a),
+ (__v8hi)__anyext128(__b)));
}
/// Horizontally subtracts the adjacent pairs of values contained in 2
@@ -398,10 +408,11 @@
/// the destination.
/// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
/// of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_hsub_pi32(__m64 __a, __m64 __b)
{
- return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
+ return __extract2_32(__builtin_ia32_phsubd128((__v4si)__anyext128(__a),
+ (__v4si)__anyext128(__b)));
}
/// Horizontally subtracts the adjacent pairs of values contained in 2
@@ -448,10 +459,11 @@
/// the destination.
/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
/// differences of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_hsubs_pi16(__m64 __a, __m64 __b)
{
- return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
+ return __extract2_32(__builtin_ia32_phsubsw128((__v8hi)__anyext128(__a),
+ (__v8hi)__anyext128(__b)));
}
/// Multiplies corresponding pairs of packed 8-bit unsigned integer
@@ -512,10 +524,11 @@
/// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
/// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
/// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_maddubs_pi16(__m64 __a, __m64 __b)
{
- return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
+ return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__anyext128(__a),
+ (__v16qi)__anyext128(__b)));
}
/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
@@ -552,10 +565,11 @@
/// A 64-bit vector of [4 x i16] containing one of the source operands.
/// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
/// products of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_mulhrs_pi16(__m64 __a, __m64 __b)
{
- return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
+ return __trunc64(__builtin_ia32_pmulhrsw128((__v8hi)__anyext128(__a),
+ (__v8hi)__anyext128(__b)));
}
/// Copies the 8-bit integers from a 128-bit integer vector to the
@@ -601,12 +615,15 @@
/// 1: Clear the corresponding byte in the destination. \n
/// 0: Copy the selected source byte to the corresponding byte in the
/// destination. \n
-/// Bits [3:0] select the source byte to be copied.
+/// Bits [2:0] select the source byte to be copied.
/// \returns A 64-bit integer vector containing the copied or cleared values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_shuffle_pi8(__m64 __a, __m64 __b)
{
- return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
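+  // pshufb selects among 16 source bytes using bits [3:0] of each control
+  // byte; duplicating the 8-byte source into both halves makes bit 3 a
+  // don't-care, matching the 64-bit form, which only honors bits [2:0].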
+ return __trunc64(__builtin_ia32_pshufb128(
+ (__v16qi)__builtin_shufflevector(
+ (__v2si)(__a), __extension__ (__v2si){}, 0, 1, 0, 1),
+ (__v16qi)__anyext128(__b)));
}
/// For each 8-bit integer in the first source operand, perform one of
@@ -707,10 +724,11 @@
/// A 64-bit integer vector containing control bytes corresponding to
/// positions in the destination.
/// \returns A 64-bit integer vector containing the resultant values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_sign_pi8(__m64 __a, __m64 __b)
{
- return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
+ return __trunc64(__builtin_ia32_psignb128((__v16qi)__anyext128(__a),
+ (__v16qi)__anyext128(__b)));
}
/// For each 16-bit integer in the first source operand, perform one of
@@ -733,10 +751,11 @@
/// A 64-bit integer vector containing control words corresponding to
/// positions in the destination.
/// \returns A 64-bit integer vector containing the resultant values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_sign_pi16(__m64 __a, __m64 __b)
{
- return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
+ return __trunc64(__builtin_ia32_psignw128((__v8hi)__anyext128(__a),
+ (__v8hi)__anyext128(__b)));
}
/// For each 32-bit integer in the first source operand, perform one of
@@ -759,13 +778,16 @@
/// A 64-bit integer vector containing two control doublewords corresponding
/// to positions in the destination.
/// \returns A 64-bit integer vector containing the resultant values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_sign_pi32(__m64 __a, __m64 __b)
{
- return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
+ return __trunc64(__builtin_ia32_psignd128((__v4si)__anyext128(__a),
+ (__v4si)__anyext128(__b)));
}
+#undef __extract2_32
+#undef __anyext128
+#undef __trunc64
#undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS_MMX
#endif /* __TMMINTRIN_H */
diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h
--- a/clang/lib/Headers/xmmintrin.h
+++ b/clang/lib/Headers/xmmintrin.h
@@ -29,7 +29,12 @@
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse"), __min_vector_width__(64)))
+#define __DEFAULT_FN_ATTRS_SSE2 __attribute__((__always_inline__, __nodebug__, __target__("sse2"), __min_vector_width__(64)))
+
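+/* __anyext128 leaves the upper half of the widened vector undefined, which is
+   fine when the 128-bit instruction ignores it; __zext128 zero-extends an
+   __m64 instead, and __zeroupper64 clears the upper 64 bits of a 128-bit
+   vector, for cases where stray data in the upper lanes could change the
+   result or raise exceptions. */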
+#define __trunc64(x) (__m64)__builtin_shufflevector((__v2di)(x), __extension__ (__v2di){}, 0)
+#define __zext128(x) (__m128i)__builtin_shufflevector((__v2si)(x), __extension__ (__v2si){}, 0, 1, 2, 3)
+#define __anyext128(x) (__m128i)__builtin_shufflevector((__v2si)(x), __extension__ (__v2si){}, 0, 1, -1, -1)
+#define __zeroupper64(x) (__m128i)__builtin_shufflevector((__v4si)(x), __extension__ (__v4si){}, 0, 1, 4, 5)
/// Adds the 32-bit float values in the low-order bits of the operands.
///
@@ -1354,10 +1359,10 @@
/// \param __a
/// A 128-bit vector of [4 x float].
/// \returns A 64-bit integer vector containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtps_pi32(__m128 __a)
{
- return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
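+  // Zero the upper two floats before cvtps2dq so converting the unused lanes
+  // cannot raise floating-point exceptions; only the low two results are kept.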
+ return __trunc64(__builtin_ia32_cvtps2dq((__v4sf)__zeroupper64(__a)));
}
/// Converts two low-order float values in a 128-bit vector of
@@ -1370,7 +1375,7 @@
/// \param __a
/// A 128-bit vector of [4 x float].
/// \returns A 64-bit integer vector containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cvt_ps2pi(__m128 __a)
{
return _mm_cvtps_pi32(__a);
@@ -1447,10 +1452,10 @@
/// \param __a
/// A 128-bit vector of [4 x float].
/// \returns A 64-bit integer vector containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cvttps_pi32(__m128 __a)
{
- return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
+ return __trunc64(__builtin_ia32_cvttps2dq((__v4sf)__zeroupper64(__a)));
}
/// Converts two low-order float values in a 128-bit vector of [4 x
@@ -1464,7 +1469,7 @@
/// \param __a
/// A 128-bit vector of [4 x float].
/// \returns A 64-bit integer vector containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtt_ps2pi(__m128 __a)
{
return _mm_cvttps_pi32(__a);
@@ -1559,10 +1564,13 @@
/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
/// converted value of the second operand. The upper 64 bits are copied from
/// the upper 64 bits of the first operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtpi32_ps(__m128 __a, __m64 __b)
{
- return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
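+  // Zero-extend __b to four lanes, convert to float, then take the two
+  // converted values as the low half and keep the upper half of __a.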
+ return (__m128)__builtin_shufflevector(
+ (__v4sf)__a,
+ __builtin_convertvector((__v4si)__zext128(__b), __v4sf),
+ 4, 5, 2, 3);
}
/// Converts two elements of a 64-bit vector of [2 x i32] into two
@@ -1582,7 +1590,7 @@
/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
/// converted value from the second operand. The upper 64 bits are copied
/// from the upper 64 bits of the first operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvt_pi2ps(__m128 __a, __m64 __b)
{
return _mm_cvtpi32_ps(__a, __b);
@@ -2116,10 +2124,10 @@
/// A pointer to an aligned memory location used to store the register value.
/// \param __a
/// A 64-bit integer containing the value to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS_MMX
+static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_pi(__m64 *__p, __m64 __a)
{
- __builtin_ia32_movntq(__p, __a);
+ __builtin_nontemporal_store(__a, __p);
}
/// Moves packed float values from a 128-bit vector of [4 x float] to a
@@ -2181,7 +2189,7 @@
/// 3: Bits [63:48] are copied to the destination.
/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
#define _mm_extract_pi16(a, n) \
- (int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n)
+ (int)(unsigned short)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n)
/// Copies data from the 64-bit vector of [4 x i16] to the destination,
/// and inserts the lower 16-bits of an integer operand at the 16-bit offset
@@ -2227,10 +2235,11 @@
/// \param __b
/// A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the comparison results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_max_pi16(__m64 __a, __m64 __b)
{
- return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
+ return __trunc64(__builtin_ia32_pmaxsw128((__v8hi)__anyext128(__a),
+ (__v8hi)__anyext128(__b)));
}
/// Compares each of the corresponding packed 8-bit unsigned integer
@@ -2246,10 +2255,11 @@
/// \param __b
/// A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the comparison results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_max_pu8(__m64 __a, __m64 __b)
{
- return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
+ return __trunc64(__builtin_ia32_pmaxub128((__v16qi)__anyext128(__a),
+ (__v16qi)__anyext128(__b)));
}
/// Compares each of the corresponding packed 16-bit integer values of
@@ -2265,10 +2275,11 @@
/// \param __b
/// A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the comparison results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_min_pi16(__m64 __a, __m64 __b)
{
- return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
+ return __trunc64(__builtin_ia32_pminsw128((__v8hi)__anyext128(__a),
+ (__v8hi)__anyext128(__b)));
}
/// Compares each of the corresponding packed 8-bit unsigned integer
@@ -2284,10 +2295,11 @@
/// \param __b
/// A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the comparison results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_min_pu8(__m64 __a, __m64 __b)
{
- return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
+ return __trunc64(__builtin_ia32_pminub128((__v16qi)__anyext128(__a),
+ (__v16qi)__anyext128(__b)));
}
/// Takes the most significant bit from each 8-bit element in a 64-bit
@@ -2302,10 +2314,10 @@
/// A 64-bit integer vector containing the values with bits to be extracted.
/// \returns The most significant bit from each 8-bit element in \a __a,
/// written to bits [7:0].
-static __inline__ int __DEFAULT_FN_ATTRS_MMX
+static __inline__ int __DEFAULT_FN_ATTRS_SSE2
_mm_movemask_pi8(__m64 __a)
{
- return __builtin_ia32_pmovmskb((__v8qi)__a);
+ return __builtin_ia32_pmovmskb128((__v16qi)__zext128(__a));
}
/// Multiplies packed 16-bit unsigned integer values and writes the
@@ -2321,10 +2333,11 @@
/// \param __b
/// A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the products of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_mulhi_pu16(__m64 __a, __m64 __b)
{
- return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
+ return __trunc64(__builtin_ia32_pmulhuw128((__v8hi)__anyext128(__a),
+ (__v8hi)__anyext128(__b)));
}
/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
@@ -2359,7 +2372,9 @@
/// 11: assigned from bits [63:48] of \a a.
/// \returns A 64-bit integer vector containing the shuffled values.
#define _mm_shuffle_pi16(a, n) \
- (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n))
+ (__m64)__builtin_shufflevector((__v4hi)(__m64)(a), __extension__ (__v4hi){}, \
+ (n) & 0x3, ((n) >> 2) & 0x3, \
+ ((n) >> 4) & 0x3, ((n) >> 6) & 0x3)
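Each two-bit field of the immediate picks the source lane for one destination lane, exactly as pshufw encodes it. A worked example with hypothetical values:

  // n = 0x1B = 0b00011011 selects source lanes 3, 2, 1, 0 for destination lanes 0..3
  __m64 r = _mm_shuffle_pi16(_mm_set_pi16(4, 3, 2, 1), 0x1B);
  // r holds, from low lane to high: 4, 3, 2, 1 (the input reversed)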
/// Conditionally copies the values from each 8-bit element in the first
/// 64-bit integer vector operand to the specified memory location, as
@@ -2384,10 +2399,25 @@
/// A pointer to a 64-bit memory location that will receive the conditionally
/// copied integer values. The address of the memory location does not have
/// to be aligned.
-static __inline__ void __DEFAULT_FN_ATTRS_MMX
+static __inline__ void __DEFAULT_FN_ATTRS_SSE2
_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
{
- __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
+ // This is complex, because we need to support the case where __p is pointing
+ // within the last 15 to 8 bytes of a page. In that case, using a 128-bit
+ // write might cause a trap where a 64-bit maskmovq would not. (Memory
+ // locations not selected by the mask bits might still cause traps.)
+ __m128i __d128 = __anyext128(__d);
+ __m128i __n128 = __zext128(__n);
+ if (((__SIZE_TYPE__)__p & 0xfff) >= 4096-15 &&
+ ((__SIZE_TYPE__)__p & 0xfff) <= 4096-8) {
+ // If there's a risk of spurious trap due to a 128-bit write, back up the
+ // pointer by 8 bytes and shift values in registers to match.
+ __p -= 8;
+ __d128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__d128, 8);
+ __n128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__n128, 8);
+ }
+
+ __builtin_ia32_maskmovdqu((__v16qi)__d128, (__v16qi)__n128, __p);
}
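To make the adjustment concrete, suppose __p sits at page offset 4088 (0xff8), inside the risky window:

  // A 16-byte maskmovdqu at __p would touch offsets 4088..4103 and cross into
  // the next page; the original 8-byte maskmovq only touched 4088..4095.
  // After __p -= 8 and the 8-byte left shift of __d128/__n128, the store spans
  // offsets 4080..4095, the live mask bytes land on 4088..4095, and the
  // shifted-in zero mask bytes leave 4080..4087 unwritten, so the access stays
  // within the original page.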
/// Computes the rounded averages of the packed unsigned 8-bit integer
@@ -2403,10 +2433,11 @@
/// \param __b
/// A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the averages of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_avg_pu8(__m64 __a, __m64 __b)
{
- return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
+ return __trunc64(__builtin_ia32_pavgb128((__v16qi)__anyext128(__a),
+ (__v16qi)__anyext128(__b)));
}
/// Computes the rounded averages of the packed unsigned 16-bit integer
@@ -2422,10 +2453,11 @@
/// \param __b
/// A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the averages of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_avg_pu16(__m64 __a, __m64 __b)
{
- return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
+ return __trunc64(__builtin_ia32_pavgw128((__v8hi)__anyext128(__a),
+ (__v8hi)__anyext128(__b)));
}
/// Subtracts the corresponding 8-bit unsigned integer values of the two
@@ -2444,10 +2476,11 @@
/// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
/// sets of absolute differences between both operands. The upper bits are
/// cleared.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sad_pu8(__m64 __a, __m64 __b)
{
- return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
+ return __trunc64(__builtin_ia32_psadbw128((__v16qi)__zext128(__a),
+ (__v16qi)__zext128(__b)));
}
#if defined(__cplusplus)
@@ -2725,22 +2758,10 @@
/// from the corresponding elements in this operand.
/// \returns A 128-bit vector of [4 x float] containing the copied and converted
/// values from the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtpi16_ps(__m64 __a)
{
- __m64 __b, __c;
- __m128 __r;
-
- __b = _mm_setzero_si64();
- __b = _mm_cmpgt_pi16(__b, __a);
- __c = _mm_unpackhi_pi16(__a, __b);
- __r = _mm_setzero_ps();
- __r = _mm_cvtpi32_ps(__r, __c);
- __r = _mm_movelh_ps(__r, __r);
- __c = _mm_unpacklo_pi16(__a, __b);
- __r = _mm_cvtpi32_ps(__r, __c);
-
- return __r;
+ return __builtin_convertvector((__v4hi)__a, __v4sf);
}
/// Converts a 64-bit vector of 16-bit unsigned integer values into a
@@ -2755,21 +2776,10 @@
/// destination are copied from the corresponding elements in this operand.
/// \returns A 128-bit vector of [4 x float] containing the copied and converted
/// values from the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtpu16_ps(__m64 __a)
{
- __m64 __b, __c;
- __m128 __r;
-
- __b = _mm_setzero_si64();
- __c = _mm_unpackhi_pi16(__a, __b);
- __r = _mm_setzero_ps();
- __r = _mm_cvtpi32_ps(__r, __c);
- __r = _mm_movelh_ps(__r, __r);
- __c = _mm_unpacklo_pi16(__a, __b);
- __r = _mm_cvtpi32_ps(__r, __c);
-
- return __r;
+ return __builtin_convertvector((__v4hu)__a, __v4sf);
}
/// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
@@ -2784,16 +2794,12 @@
/// from the corresponding lower 4 elements in this operand.
/// \returns A 128-bit vector of [4 x float] containing the copied and converted
/// values from the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtpi8_ps(__m64 __a)
{
- __m64 __b;
-
- __b = _mm_setzero_si64();
- __b = _mm_cmpgt_pi8(__b, __a);
- __b = _mm_unpacklo_pi8(__a, __b);
-
- return _mm_cvtpi16_ps(__b);
+ return __builtin_convertvector(
+ __builtin_shufflevector((__v8qs)__a, __extension__ (__v8qs){},
+ 0, 1, 2, 3), __v4sf);
}
/// Converts the lower four unsigned 8-bit integer values from a 64-bit
@@ -2809,15 +2815,12 @@
/// operand.
/// \returns A 128-bit vector of [4 x float] containing the copied and converted
/// values from the source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtpu8_ps(__m64 __a)
{
- __m64 __b;
-
- __b = _mm_setzero_si64();
- __b = _mm_unpacklo_pi8(__a, __b);
-
- return _mm_cvtpi16_ps(__b);
+ return __builtin_convertvector(
+ __builtin_shufflevector((__v8qu)__a, __extension__ (__v8qu){},
+ 0, 1, 2, 3), __v4sf);
}
/// Converts the two 32-bit signed integer values from each 64-bit vector
@@ -2836,16 +2839,12 @@
/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
/// copied and converted values from the first operand. The upper 64 bits
/// contain the copied and converted values from the second operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
{
- __m128 __c;
-
- __c = _mm_setzero_ps();
- __c = _mm_cvtpi32_ps(__c, __b);
- __c = _mm_movelh_ps(__c, __c);
-
- return _mm_cvtpi32_ps(__c, __a);
+ return __builtin_convertvector(
+ __builtin_shufflevector((__v2si)__a, (__v2si)__b,
+ 0, 1, 2, 3), __v4sf);
}
/// Converts each single-precision floating-point element of a 128-bit
@@ -2865,16 +2864,11 @@
/// A 128-bit floating-point vector of [4 x float].
/// \returns A 64-bit integer vector of [4 x i16] containing the converted
/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtps_pi16(__m128 __a)
{
- __m64 __b, __c;
-
- __b = _mm_cvtps_pi32(__a);
- __a = _mm_movehl_ps(__a, __a);
- __c = _mm_cvtps_pi32(__a);
-
- return _mm_packs_pi32(__b, __c);
+ return __trunc64(__builtin_ia32_packssdw128(
+ (__v4si)__builtin_ia32_cvtps2dq((__v4sf)__a), (__v4si)_mm_setzero_ps()));
}
/// Converts each single-precision floating-point element of a 128-bit
@@ -2895,7 +2889,7 @@
/// 128-bit floating-point vector of [4 x float].
/// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
/// converted values and the upper 32 bits are set to zero.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtps_pi8(__m128 __a)
{
__m64 __b, __c;
@@ -2997,8 +2991,12 @@
#define _m_ _mm_
#define _m_ _mm_
+#undef __trunc64
+#undef __zext128
+#undef __anyext128
+#undef __zeroupper64
#undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS_MMX
+#undef __DEFAULT_FN_ATTRS_SSE2
/* Ugly hack for backwards-compatibility (compatible with gcc) */
#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
diff --git a/clang/test/CodeGen/X86/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c
--- a/clang/test/CodeGen/X86/mmx-builtins.c
+++ b/clang/test/CodeGen/X86/mmx-builtins.c
@@ -1,193 +1,200 @@
-// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +ssse3 -emit-llvm -o - -Wall -Werror | FileCheck %s
-// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +ssse3 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +ssse3 -emit-llvm -o - -Wall -Werror | FileCheck %s --implicit-check-not=x86mmx
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +ssse3 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --implicit-check-not=x86mmx
#include <immintrin.h>
__m64 test_mm_abs_pi8(__m64 a) {
// CHECK-LABEL: test_mm_abs_pi8
- // CHECK: call x86_mmx @llvm.x86.ssse3.pabs.b
+ // CHECK: call <16 x i8> @llvm.abs.v16i8(
return _mm_abs_pi8(a);
}
__m64 test_mm_abs_pi16(__m64 a) {
// CHECK-LABEL: test_mm_abs_pi16
- // CHECK: call x86_mmx @llvm.x86.ssse3.pabs.w
+ // CHECK: call <8 x i16> @llvm.abs.v8i16(
return _mm_abs_pi16(a);
}
__m64 test_mm_abs_pi32(__m64 a) {
// CHECK-LABEL: test_mm_abs_pi32
- // CHECK: call x86_mmx @llvm.x86.ssse3.pabs.d
+ // CHECK: call <4 x i32> @llvm.abs.v4i32(
return _mm_abs_pi32(a);
}
__m64 test_mm_add_pi8(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_add_pi8
- // CHECK: call x86_mmx @llvm.x86.mmx.padd.b
+ // CHECK: add <8 x i8> {{%.*}}, {{%.*}}
return _mm_add_pi8(a, b);
}
__m64 test_mm_add_pi16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_add_pi16
- // CHECK: call x86_mmx @llvm.x86.mmx.padd.w
+ // CHECK: add <4 x i16> {{%.*}}, {{%.*}}
return _mm_add_pi16(a, b);
}
__m64 test_mm_add_pi32(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_add_pi32
- // CHECK: call x86_mmx @llvm.x86.mmx.padd.d
+ // CHECK: add <2 x i32> {{%.*}}, {{%.*}}
return _mm_add_pi32(a, b);
}
__m64 test_mm_add_si64(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_add_si64
- // CHECK: call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %{{.*}}, x86_mmx %{{.*}})
+ // CHECK: add i64 {{%.*}}, {{%.*}}
return _mm_add_si64(a, b);
}
__m64 test_mm_adds_pi8(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_adds_pi8
- // CHECK: call x86_mmx @llvm.x86.mmx.padds.b
+ // CHECK: call <16 x i8> @llvm.sadd.sat.v16i8(
return _mm_adds_pi8(a, b);
}
__m64 test_mm_adds_pi16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_adds_pi16
- // CHECK: call x86_mmx @llvm.x86.mmx.padds.w
+ // CHECK: call <8 x i16> @llvm.sadd.sat.v8i16(
return _mm_adds_pi16(a, b);
}
__m64 test_mm_adds_pu8(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_adds_pu8
- // CHECK: call x86_mmx @llvm.x86.mmx.paddus.b
+ // CHECK: call <16 x i8> @llvm.uadd.sat.v16i8(
return _mm_adds_pu8(a, b);
}
__m64 test_mm_adds_pu16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_adds_pu16
- // CHECK: call x86_mmx @llvm.x86.mmx.paddus.w
+ // CHECK: call <8 x i16> @llvm.uadd.sat.v8i16(
return _mm_adds_pu16(a, b);
}
__m64 test_mm_alignr_pi8(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_alignr_pi8
- // CHECK: call x86_mmx @llvm.x86.mmx.palignr.b
+ // CHECK: shufflevector <16 x i8> {{%.*}}, <16 x i8> zeroinitializer, <16 x i32>
return _mm_alignr_pi8(a, b, 2);
}
__m64 test_mm_and_si64(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_and_si64
- // CHECK: call x86_mmx @llvm.x86.mmx.pand
+ // CHECK: and <1 x i64> {{%.*}}, {{%.*}}
return _mm_and_si64(a, b);
}
__m64 test_mm_andnot_si64(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_andnot_si64
- // CHECK: call x86_mmx @llvm.x86.mmx.pandn
+ // CHECK: [[TMP:%.*]] = xor <1 x i64> {{%.*}},
+ // CHECK: and <1 x i64> [[TMP]], {{%.*}}
return _mm_andnot_si64(a, b);
}
__m64 test_mm_avg_pu8(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_avg_pu8
- // CHECK: call x86_mmx @llvm.x86.mmx.pavg.b
+ // CHECK: call <16 x i8> @llvm.x86.sse2.pavg.b(
return _mm_avg_pu8(a, b);
}
__m64 test_mm_avg_pu16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_avg_pu16
- // CHECK: call x86_mmx @llvm.x86.mmx.pavg.w
+ // CHECK: call <8 x i16> @llvm.x86.sse2.pavg.w(
return _mm_avg_pu16(a, b);
}
__m64 test_mm_cmpeq_pi8(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_cmpeq_pi8
- // CHECK: call x86_mmx @llvm.x86.mmx.pcmpeq.b
+ // CHECK: [[CMP:%.*]] = icmp eq <8 x i8> {{%.*}}, {{%.*}}
+ // CHECK-NEXT: {{%.*}} = sext <8 x i1> [[CMP]] to <8 x i8>
return _mm_cmpeq_pi8(a, b);
}
__m64 test_mm_cmpeq_pi16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_cmpeq_pi16
- // CHECK: call x86_mmx @llvm.x86.mmx.pcmpeq.w
+ // CHECK: [[CMP:%.*]] = icmp eq <4 x i16> {{%.*}}, {{%.*}}
+ // CHECK-NEXT: {{%.*}} = sext <4 x i1> [[CMP]] to <4 x i16>
return _mm_cmpeq_pi16(a, b);
}
__m64 test_mm_cmpeq_pi32(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_cmpeq_pi32
- // CHECK: call x86_mmx @llvm.x86.mmx.pcmpeq.d
+ // CHECK: [[CMP:%.*]] = icmp eq <2 x i32> {{%.*}}, {{%.*}}
+ // CHECK-NEXT: {{%.*}} = sext <2 x i1> [[CMP]] to <2 x i32>
return _mm_cmpeq_pi32(a, b);
}
__m64 test_mm_cmpgt_pi8(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_cmpgt_pi8
- // CHECK: call x86_mmx @llvm.x86.mmx.pcmpgt.b
+ // CHECK: [[CMP:%.*]] = icmp sgt <8 x i8> {{%.*}}, {{%.*}}
+ // CHECK-NEXT: {{%.*}} = sext <8 x i1> [[CMP]] to <8 x i8>
return _mm_cmpgt_pi8(a, b);
}
__m64 test_mm_cmpgt_pi16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_cmpgt_pi16
- // CHECK: call x86_mmx @llvm.x86.mmx.pcmpgt.w
+ // CHECK: [[CMP:%.*]] = icmp sgt <4 x i16> {{%.*}}, {{%.*}}
+ // CHECK-NEXT: {{%.*}} = sext <4 x i1> [[CMP]] to <4 x i16>
return _mm_cmpgt_pi16(a, b);
}
__m64 test_mm_cmpgt_pi32(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_cmpgt_pi32
- // CHECK: call x86_mmx @llvm.x86.mmx.pcmpgt.d
+ // CHECK: [[CMP:%.*]] = icmp sgt <2 x i32> {{%.*}}, {{%.*}}
+ // CHECK-NEXT: {{%.*}} = sext <2 x i1> [[CMP]] to <2 x i32>
return _mm_cmpgt_pi32(a, b);
}
__m128 test_mm_cvt_pi2ps(__m128 a, __m64 b) {
// CHECK-LABEL: test_mm_cvt_pi2ps
- // CHECK: <4 x float> @llvm.x86.sse.cvtpi2ps
+ // CHECK: sitofp <4 x i32> {{%.*}} to <4 x float>
return _mm_cvt_pi2ps(a, b);
}
__m64 test_mm_cvt_ps2pi(__m128 a) {
// CHECK-LABEL: test_mm_cvt_ps2pi
- // CHECK: call x86_mmx @llvm.x86.sse.cvtps2pi
+ // CHECK: call <4 x i32> @llvm.x86.sse2.cvtps2dq(
return _mm_cvt_ps2pi(a);
}
__m64 test_mm_cvtpd_pi32(__m128d a) {
// CHECK-LABEL: test_mm_cvtpd_pi32
- // CHECK: call x86_mmx @llvm.x86.sse.cvtpd2pi
+ // CHECK: call <4 x i32> @llvm.x86.sse2.cvtpd2dq(
return _mm_cvtpd_pi32(a);
}
__m128 test_mm_cvtpi16_ps(__m64 a) {
// CHECK-LABEL: test_mm_cvtpi16_ps
- // CHECK: call <4 x float> @llvm.x86.sse.cvtpi2ps
+ // CHECK: sitofp <4 x i16> {{%.*}} to <4 x float>
return _mm_cvtpi16_ps(a);
}
__m128d test_mm_cvtpi32_pd(__m64 a) {
// CHECK-LABEL: test_mm_cvtpi32_pd
- // CHECK: call <2 x double> @llvm.x86.sse.cvtpi2pd
+ // CHECK: sitofp <2 x i32> {{%.*}} to <2 x double>
return _mm_cvtpi32_pd(a);
}
__m128 test_mm_cvtpi32_ps(__m128 a, __m64 b) {
// CHECK-LABEL: test_mm_cvtpi32_ps
- // CHECK: call <4 x float> @llvm.x86.sse.cvtpi2ps
+ // CHECK: sitofp <4 x i32> {{%.*}} to <4 x float>
return _mm_cvtpi32_ps(a, b);
}
__m128 test_mm_cvtpi32x2_ps(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_cvtpi32x2_ps
- // CHECK: call <4 x float> @llvm.x86.sse.cvtpi2ps
- // CHECK: call <4 x float> @llvm.x86.sse.cvtpi2ps
+ // CHECK: sitofp <4 x i32> {{%.*}} to <4 x float>
return _mm_cvtpi32x2_ps(a, b);
}
__m64 test_mm_cvtps_pi16(__m128 a) {
// CHECK-LABEL: test_mm_cvtps_pi16
- // CHECK: call x86_mmx @llvm.x86.sse.cvtps2pi
+ // CHECK: [[TMP0:%.*]] = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> {{%.*}})
+ // CHECK: call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> [[TMP0]],
return _mm_cvtps_pi16(a);
}
__m64 test_mm_cvtps_pi32(__m128 a) {
// CHECK-LABEL: test_mm_cvtps_pi32
- // CHECK: call x86_mmx @llvm.x86.sse.cvtps2pi
+ // CHECK: call <4 x i32> @llvm.x86.sse2.cvtps2dq(
return _mm_cvtps_pi32(a);
}
@@ -205,19 +212,19 @@
__m64 test_mm_cvttpd_pi32(__m128d a) {
// CHECK-LABEL: test_mm_cvttpd_pi32
- // CHECK: call x86_mmx @llvm.x86.sse.cvttpd2pi
+ // CHECK: call <4 x i32> @llvm.x86.sse2.cvttpd2dq(
return _mm_cvttpd_pi32(a);
}
__m64 test_mm_cvttps_pi32(__m128 a) {
// CHECK-LABEL: test_mm_cvttps_pi32
- // CHECK: call x86_mmx @llvm.x86.sse.cvttps2pi
+ // CHECK: call <4 x i32> @llvm.x86.sse2.cvttps2dq(
return _mm_cvttps_pi32(a);
}
int test_mm_extract_pi16(__m64 a) {
// CHECK-LABEL: test_mm_extract_pi16
- // CHECK: call i32 @llvm.x86.mmx.pextr.w
+ // CHECK: extractelement <4 x i16> {{%.*}}, i64 2
return _mm_extract_pi16(a, 2);
}
@@ -235,151 +242,153 @@
__m64 test_mm_hadd_pi16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_hadd_pi16
- // CHECK: call x86_mmx @llvm.x86.ssse3.phadd.w
+ // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.w.128(
return _mm_hadd_pi16(a, b);
}
__m64 test_mm_hadd_pi32(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_hadd_pi32
- // CHECK: call x86_mmx @llvm.x86.ssse3.phadd.d
+ // CHECK: call <4 x i32> @llvm.x86.ssse3.phadd.d.128(
return _mm_hadd_pi32(a, b);
}
__m64 test_mm_hadds_pi16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_hadds_pi16
- // CHECK: call x86_mmx @llvm.x86.ssse3.phadd.sw
+ // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(
return _mm_hadds_pi16(a, b);
}
__m64 test_mm_hsub_pi16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_hsub_pi16
- // CHECK: call x86_mmx @llvm.x86.ssse3.phsub.w
+ // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.w.128(
return _mm_hsub_pi16(a, b);
}
__m64 test_mm_hsub_pi32(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_hsub_pi32
- // CHECK: call x86_mmx @llvm.x86.ssse3.phsub.d
+ // CHECK: call <4 x i32> @llvm.x86.ssse3.phsub.d.128(
return _mm_hsub_pi32(a, b);
}
__m64 test_mm_hsubs_pi16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_hsubs_pi16
- // CHECK: call x86_mmx @llvm.x86.ssse3.phsub.sw
+ // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(
return _mm_hsubs_pi16(a, b);
}
__m64 test_mm_insert_pi16(__m64 a, int d) {
// CHECK-LABEL: test_mm_insert_pi16
- // CHECK: call x86_mmx @llvm.x86.mmx.pinsr.w
+ // CHECK: insertelement <4 x i16>
return _mm_insert_pi16(a, d, 2);
}
__m64 test_mm_madd_pi16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_madd_pi16
- // CHECK: call x86_mmx @llvm.x86.mmx.pmadd.wd
+ // CHECK: call <4 x i32> @llvm.x86.sse2.pmadd.wd(
return _mm_madd_pi16(a, b);
}
__m64 test_mm_maddubs_pi16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_maddubs_pi16
- // CHECK: call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw
+ // CHECK: call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(
return _mm_maddubs_pi16(a, b);
}
void test_mm_maskmove_si64(__m64 d, __m64 n, char *p) {
// CHECK-LABEL: test_mm_maskmove_si64
- // CHECK: call void @llvm.x86.mmx.maskmovq
+ // CHECK: call void @llvm.x86.sse2.maskmov.dqu(
_mm_maskmove_si64(d, n, p);
}
__m64 test_mm_max_pi16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_max_pi16
- // CHECK: call x86_mmx @llvm.x86.mmx.pmaxs.w
+ // CHECK: call <8 x i16> @llvm.smax.v8i16(
return _mm_max_pi16(a, b);
}
__m64 test_mm_max_pu8(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_max_pu8
- // CHECK: call x86_mmx @llvm.x86.mmx.pmaxu.b
+ // CHECK: call <16 x i8> @llvm.umax.v16i8(
return _mm_max_pu8(a, b);
}
__m64 test_mm_min_pi16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_min_pi16
- // CHECK: call x86_mmx @llvm.x86.mmx.pmins.w
+ // CHECK: call <8 x i16> @llvm.smin.v8i16(
return _mm_min_pi16(a, b);
}
__m64 test_mm_min_pu8(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_min_pu8
- // CHECK: call x86_mmx @llvm.x86.mmx.pminu.b
+ // CHECK: call <16 x i8> @llvm.umin.v16i8(
return _mm_min_pu8(a, b);
}
int test_mm_movemask_pi8(__m64 a) {
// CHECK-LABEL: test_mm_movemask_pi8
- // CHECK: call i32 @llvm.x86.mmx.pmovmskb
+ // CHECK: call i32 @llvm.x86.sse2.pmovmskb.128(
return _mm_movemask_pi8(a);
}
__m64 test_mm_mul_su32(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_mul_su32
- // CHECK: call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %{{.*}}, x86_mmx %{{.*}})
+ // CHECK: and <2 x i64> {{%.*}},
+ // CHECK: and <2 x i64> {{%.*}},
+ // CHECK: mul <2 x i64> %{{.*}}, %{{.*}}
return _mm_mul_su32(a, b);
}
__m64 test_mm_mulhi_pi16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_mulhi_pi16
- // CHECK: call x86_mmx @llvm.x86.mmx.pmulh.w
+ // CHECK: call <8 x i16> @llvm.x86.sse2.pmulh.w(
return _mm_mulhi_pi16(a, b);
}
__m64 test_mm_mulhi_pu16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_mulhi_pu16
- // CHECK: call x86_mmx @llvm.x86.mmx.pmulhu.w
+ // CHECK: call <8 x i16> @llvm.x86.sse2.pmulhu.w(
return _mm_mulhi_pu16(a, b);
}
__m64 test_mm_mulhrs_pi16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_mulhrs_pi16
- // CHECK: call x86_mmx @llvm.x86.ssse3.pmul.hr.sw
+ // CHECK: call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(
return _mm_mulhrs_pi16(a, b);
}
__m64 test_mm_mullo_pi16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_mullo_pi16
- // CHECK: call x86_mmx @llvm.x86.mmx.pmull.w
+ // CHECK: mul <4 x i16> {{%.*}}, {{%.*}}
return _mm_mullo_pi16(a, b);
}
__m64 test_mm_or_si64(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_or_si64
- // CHECK: call x86_mmx @llvm.x86.mmx.por
+ // CHECK: or <1 x i64> {{%.*}}, {{%.*}}
return _mm_or_si64(a, b);
}
__m64 test_mm_packs_pi16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_packs_pi16
- // CHECK: call x86_mmx @llvm.x86.mmx.packsswb
+ // CHECK: call <16 x i8> @llvm.x86.sse2.packsswb.128(
return _mm_packs_pi16(a, b);
}
__m64 test_mm_packs_pi32(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_packs_pi32
- // CHECK: call x86_mmx @llvm.x86.mmx.packssdw
+ // CHECK: call <8 x i16> @llvm.x86.sse2.packssdw.128(
return _mm_packs_pi32(a, b);
}
__m64 test_mm_packs_pu16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_packs_pu16
- // CHECK: call x86_mmx @llvm.x86.mmx.packuswb
+ // CHECK: call <16 x i8> @llvm.x86.sse2.packuswb.128(
return _mm_packs_pu16(a, b);
}
__m64 test_mm_sad_pu8(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_sad_pu8
- // CHECK: call x86_mmx @llvm.x86.mmx.psad.bw
+ // CHECK: call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>
return _mm_sad_pu8(a, b);
}
@@ -472,181 +481,181 @@
__m64 test_mm_shuffle_pi8(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_shuffle_pi8
- // CHECK: call x86_mmx @llvm.x86.ssse3.pshuf.b
+ // CHECK: call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(
return _mm_shuffle_pi8(a, b);
}
__m64 test_mm_shuffle_pi16(__m64 a) {
// CHECK-LABEL: test_mm_shuffle_pi16
- // CHECK: call x86_mmx @llvm.x86.sse.pshuf.w
+ // CHECK: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <4 x i32>
return _mm_shuffle_pi16(a, 3);
}
__m64 test_mm_sign_pi8(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_sign_pi8
- // CHECK: call x86_mmx @llvm.x86.ssse3.psign.b
+ // CHECK: call <16 x i8> @llvm.x86.ssse3.psign.b.128(
return _mm_sign_pi8(a, b);
}
__m64 test_mm_sign_pi16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_sign_pi16
- // CHECK: call x86_mmx @llvm.x86.ssse3.psign.w
+ // CHECK: call <8 x i16> @llvm.x86.ssse3.psign.w.128(
return _mm_sign_pi16(a, b);
}
__m64 test_mm_sign_pi32(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_sign_pi32
- // CHECK: call x86_mmx @llvm.x86.ssse3.psign.d
+ // CHECK: call <4 x i32> @llvm.x86.ssse3.psign.d.128(
return _mm_sign_pi32(a, b);
}
__m64 test_mm_sll_pi16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_sll_pi16
- // CHECK: call x86_mmx @llvm.x86.mmx.psll.w
+ // CHECK: call <8 x i16> @llvm.x86.sse2.psll.w(
return _mm_sll_pi16(a, b);
}
__m64 test_mm_sll_pi32(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_sll_pi32
- // CHECK: call x86_mmx @llvm.x86.mmx.psll.d
+ // CHECK: call <4 x i32> @llvm.x86.sse2.psll.d(
return _mm_sll_pi32(a, b);
}
__m64 test_mm_sll_si64(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_sll_si64
- // CHECK: call x86_mmx @llvm.x86.mmx.psll.q
+ // CHECK: call <2 x i64> @llvm.x86.sse2.psll.q(
return _mm_sll_si64(a, b);
}
__m64 test_mm_slli_pi16(__m64 a) {
// CHECK-LABEL: test_mm_slli_pi16
- // CHECK: call x86_mmx @llvm.x86.mmx.pslli.w
+ // CHECK: call <8 x i16> @llvm.x86.sse2.pslli.w(
return _mm_slli_pi16(a, 3);
}
__m64 test_mm_slli_pi32(__m64 a) {
// CHECK-LABEL: test_mm_slli_pi32
- // CHECK: call x86_mmx @llvm.x86.mmx.pslli.d
+ // CHECK: call <4 x i32> @llvm.x86.sse2.pslli.d(
return _mm_slli_pi32(a, 3);
}
__m64 test_mm_slli_si64(__m64 a) {
// CHECK-LABEL: test_mm_slli_si64
- // CHECK: call x86_mmx @llvm.x86.mmx.pslli.q
+ // CHECK: call <2 x i64> @llvm.x86.sse2.pslli.q(
return _mm_slli_si64(a, 3);
}
__m64 test_mm_sra_pi16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_sra_pi16
- // CHECK: call x86_mmx @llvm.x86.mmx.psra.w
+ // CHECK: call <8 x i16> @llvm.x86.sse2.psra.w(
return _mm_sra_pi16(a, b);
}
__m64 test_mm_sra_pi32(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_sra_pi32
- // CHECK: call x86_mmx @llvm.x86.mmx.psra.d
+ // CHECK: call <4 x i32> @llvm.x86.sse2.psra.d(
return _mm_sra_pi32(a, b);
}
__m64 test_mm_srai_pi16(__m64 a) {
// CHECK-LABEL: test_mm_srai_pi16
- // CHECK: call x86_mmx @llvm.x86.mmx.psrai.w
+ // CHECK: call <8 x i16> @llvm.x86.sse2.psrai.w(
return _mm_srai_pi16(a, 3);
}
__m64 test_mm_srai_pi32(__m64 a) {
// CHECK-LABEL: test_mm_srai_pi32
- // CHECK: call x86_mmx @llvm.x86.mmx.psrai.d
+ // CHECK: call <4 x i32> @llvm.x86.sse2.psrai.d(
return _mm_srai_pi32(a, 3);
}
__m64 test_mm_srl_pi16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_srl_pi16
- // CHECK: call x86_mmx @llvm.x86.mmx.psrl.w
+ // CHECK: call <8 x i16> @llvm.x86.sse2.psrl.w(
return _mm_srl_pi16(a, b);
}
__m64 test_mm_srl_pi32(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_srl_pi32
- // CHECK: call x86_mmx @llvm.x86.mmx.psrl.d
+ // CHECK: call <4 x i32> @llvm.x86.sse2.psrl.d(
return _mm_srl_pi32(a, b);
}
__m64 test_mm_srl_si64(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_srl_si64
- // CHECK: call x86_mmx @llvm.x86.mmx.psrl.q
+ // CHECK: call <2 x i64> @llvm.x86.sse2.psrl.q(
return _mm_srl_si64(a, b);
}
__m64 test_mm_srli_pi16(__m64 a) {
// CHECK-LABEL: test_mm_srli_pi16
- // CHECK: call x86_mmx @llvm.x86.mmx.psrli.w
+ // CHECK: call <8 x i16> @llvm.x86.sse2.psrli.w(
return _mm_srli_pi16(a, 3);
}
__m64 test_mm_srli_pi32(__m64 a) {
// CHECK-LABEL: test_mm_srli_pi32
- // CHECK: call x86_mmx @llvm.x86.mmx.psrli.d
+ // CHECK: call <4 x i32> @llvm.x86.sse2.psrli.d(
return _mm_srli_pi32(a, 3);
}
__m64 test_mm_srli_si64(__m64 a) {
// CHECK-LABEL: test_mm_srli_si64
- // CHECK: call x86_mmx @llvm.x86.mmx.psrli.q
+ // CHECK: call <2 x i64> @llvm.x86.sse2.psrli.q(
return _mm_srli_si64(a, 3);
}
void test_mm_stream_pi(__m64 *p, __m64 a) {
// CHECK-LABEL: test_mm_stream_pi
- // CHECK: call void @llvm.x86.mmx.movnt.dq
+ // CHECK: store <1 x i64> {{%.*}}, <1 x i64>* {{%.*}}, align 8, !nontemporal
_mm_stream_pi(p, a);
}
__m64 test_mm_sub_pi8(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_sub_pi8
- // CHECK: call x86_mmx @llvm.x86.mmx.psub.b
+ // CHECK: sub <8 x i8> {{%.*}}, {{%.*}}
return _mm_sub_pi8(a, b);
}
__m64 test_mm_sub_pi16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_sub_pi16
- // CHECK: call x86_mmx @llvm.x86.mmx.psub.w
+ // CHECK: sub <4 x i16> {{%.*}}, {{%.*}}
return _mm_sub_pi16(a, b);
}
__m64 test_mm_sub_pi32(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_sub_pi32
- // CHECK: call x86_mmx @llvm.x86.mmx.psub.d
+ // CHECK: sub <2 x i32> {{%.*}}, {{%.*}}
return _mm_sub_pi32(a, b);
}
__m64 test_mm_sub_si64(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_sub_si64
- // CHECK: call x86_mmx @llvm.x86.mmx.psub.q(x86_mmx %{{.*}}, x86_mmx %{{.*}})
+ // CHECK: sub i64 {{%.*}}, {{%.*}}
return _mm_sub_si64(a, b);
}
__m64 test_mm_subs_pi8(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_subs_pi8
- // CHECK: call x86_mmx @llvm.x86.mmx.psubs.b
+ // CHECK: call <16 x i8> @llvm.ssub.sat.v16i8(
return _mm_subs_pi8(a, b);
}
__m64 test_mm_subs_pi16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_subs_pi16
- // CHECK: call x86_mmx @llvm.x86.mmx.psubs.w
+ // CHECK: call <8 x i16> @llvm.ssub.sat.v8i16(
return _mm_subs_pi16(a, b);
}
__m64 test_mm_subs_pu8(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_subs_pu8
- // CHECK: call x86_mmx @llvm.x86.mmx.psubus.b
+ // CHECK: call <16 x i8> @llvm.usub.sat.v16i8(
return _mm_subs_pu8(a, b);
}
__m64 test_mm_subs_pu16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_subs_pu16
- // CHECK: call x86_mmx @llvm.x86.mmx.psubus.w
+ // CHECK: call <8 x i16> @llvm.usub.sat.v8i16(
return _mm_subs_pu16(a, b);
}
@@ -664,42 +673,42 @@
__m64 test_mm_unpackhi_pi8(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_unpackhi_pi8
- // CHECK: call x86_mmx @llvm.x86.mmx.punpckhbw
+ // CHECK: shufflevector <8 x i8> {{%.*}}, <8 x i8> {{%.*}}, <8 x i32>
return _mm_unpackhi_pi8(a, b);
}
__m64 test_mm_unpackhi_pi16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_unpackhi_pi16
- // CHECK: call x86_mmx @llvm.x86.mmx.punpckhwd
+ // CHECK: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <4 x i32>
return _mm_unpackhi_pi16(a, b);
}
__m64 test_mm_unpackhi_pi32(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_unpackhi_pi32
- // CHECK: call x86_mmx @llvm.x86.mmx.punpckhdq
+ // CHECK: shufflevector <2 x i32> {{%.*}}, <2 x i32> {{%.*}}, <2 x i32>
return _mm_unpackhi_pi32(a, b);
}
__m64 test_mm_unpacklo_pi8(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_unpacklo_pi8
- // CHECK: call x86_mmx @llvm.x86.mmx.punpcklbw
+ // CHECK: shufflevector <8 x i8> {{%.*}}, <8 x i8> {{%.*}}, <8 x i32>
return _mm_unpacklo_pi8(a, b);
}
__m64 test_mm_unpacklo_pi16(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_unpacklo_pi16
- // CHECK: call x86_mmx @llvm.x86.mmx.punpcklwd
+ // CHECK: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <4 x i32>
return _mm_unpacklo_pi16(a, b);
}
__m64 test_mm_unpacklo_pi32(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_unpacklo_pi32
- // CHECK: call x86_mmx @llvm.x86.mmx.punpckldq
+ // CHECK: shufflevector <2 x i32> {{%.*}}, <2 x i32> {{%.*}}, <2 x i32>
return _mm_unpacklo_pi32(a, b);
}
__m64 test_mm_xor_si64(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_xor_si64
- // CHECK: call x86_mmx @llvm.x86.mmx.pxor
+ // CHECK: xor <1 x i64> {{%.*}}, {{%.*}}
return _mm_xor_si64(a, b);
}
diff --git a/clang/test/CodeGen/X86/mmx-shift-with-immediate.c b/clang/test/CodeGen/X86/mmx-shift-with-immediate.c
--- a/clang/test/CodeGen/X86/mmx-shift-with-immediate.c
+++ b/clang/test/CodeGen/X86/mmx-shift-with-immediate.c
@@ -2,22 +2,22 @@
#include <mmintrin.h>
void shift(__m64 a, __m64 b, int c) {
- // CHECK: x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx %{{.*}}, i32 {{.*}})
+ // CHECK: <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %{{.*}}, i32 {{.*}})
_mm_slli_pi16(a, c);
- // CHECK: x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx %{{.*}}, i32 {{.*}})
+ // CHECK: <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %{{.*}}, i32 {{.*}})
_mm_slli_pi32(a, c);
- // CHECK: x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %{{.*}}, i32 {{.*}})
+ // CHECK: <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %{{.*}}, i32 {{.*}})
_mm_slli_si64(a, c);
- // CHECK: x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx %{{.*}}, i32 {{.*}})
+ // CHECK: <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %{{.*}}, i32 {{.*}})
_mm_srli_pi16(a, c);
- // CHECK: x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx %{{.*}}, i32 {{.*}})
+ // CHECK: <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %{{.*}}, i32 {{.*}})
_mm_srli_pi32(a, c);
- // CHECK: x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %{{.*}}, i32 {{.*}})
+ // CHECK: <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %{{.*}}, i32 {{.*}})
_mm_srli_si64(a, c);
- // CHECK: x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx %{{.*}}, i32 {{.*}})
+ // CHECK: <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %{{.*}}, i32 {{.*}})
_mm_srai_pi16(a, c);
- // CHECK: x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx %{{.*}}, i32 {{.*}})
+ // CHECK: <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %{{.*}}, i32 {{.*}})
_mm_srai_pi32(a, c);
}
diff --git a/clang/test/CodeGen/attr-target-x86-mmx.c b/clang/test/CodeGen/attr-target-x86-mmx.c
--- a/clang/test/CodeGen/attr-target-x86-mmx.c
+++ b/clang/test/CodeGen/attr-target-x86-mmx.c
@@ -1,12 +1,11 @@
// RUN: %clang_cc1 -triple i386-linux-gnu -emit-llvm %s -o - | FileCheck %s
-// Picking a cpu that doesn't have mmx or sse by default so we can enable it later.
+// Picking a cpu that doesn't have sse by default so we can enable it later.
#define __MM_MALLOC_H
#include <immintrin.h>
-// Verify that when we turn on sse that we also turn on mmx.
-void __attribute__((target("sse"))) shift(__m64 a, __m64 b, int c) {
+void __attribute__((target("sse2"))) shift(__m64 a, __m64 b, int c) {
_mm_slli_pi16(a, c);
_mm_slli_pi32(a, c);
_mm_slli_si64(a, c);
@@ -19,4 +18,4 @@
_mm_srai_pi32(a, c);
}
-// CHECK: "target-features"="+cx8,+mmx,+sse,+x87"
+// CHECK: "target-features"="+cx8,+mmx,+sse,+sse2,+x87"
diff --git a/clang/test/Headers/xmmintrin.c b/clang/test/Headers/xmmintrin.c
--- a/clang/test/Headers/xmmintrin.c
+++ b/clang/test/Headers/xmmintrin.c
@@ -14,7 +14,7 @@
// checking that clang emits PACKSSDW instead of PACKSSWB.
// CHECK: define{{.*}} i64 @test_mm_cvtps_pi16
-// CHECK: call x86_mmx @llvm.x86.mmx.packssdw
+// CHECK: call <8 x i16> @llvm.x86.sse2.packssdw.128
__m64 test_mm_cvtps_pi16(__m128 a) {
return _mm_cvtps_pi16(a);
diff --git a/clang/test/Sema/x86-builtin-palignr.c b/clang/test/Sema/x86-builtin-palignr.c
--- a/clang/test/Sema/x86-builtin-palignr.c
+++ b/clang/test/Sema/x86-builtin-palignr.c
@@ -4,5 +4,5 @@
#include <tmmintrin.h>
__m64 test1(__m64 a, __m64 b, int c) {
- return _mm_alignr_pi8(a, b, c); // expected-error {{argument to '__builtin_ia32_palignr' must be a constant integer}}
+ return _mm_alignr_pi8(a, b, c); // expected-error {{argument to '__builtin_ia32_psrldqi128_byteshift' must be a constant integer}}
}
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -2424,11 +2424,11 @@
Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
llvm_x86mmx_ty, llvm_i8_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
- def int_x86_mmx_pextr_w : GCCBuiltin<"__builtin_ia32_vec_ext_v4hi">,
+ def int_x86_mmx_pextr_w :
Intrinsic<[llvm_i32_ty], [llvm_x86mmx_ty, llvm_i32_ty],
[IntrNoMem, ImmArg<ArgIndex<1>>]>;
- def int_x86_mmx_pinsr_w : GCCBuiltin<"__builtin_ia32_vec_set_v4hi">,
+ def int_x86_mmx_pinsr_w :
Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
}
diff --git a/mmx-tests/Makefile b/mmx-tests/Makefile
new file mode 100644
--- /dev/null
+++ b/mmx-tests/Makefile
@@ -0,0 +1,29 @@
+USE_XMM=
+#USE_XMM=--use-xmm
+
+OLDCC ?= clang-10
+NEWCC ?= ../build/bin/clang
+TESTCC=$(OLDCC)
+COPTS ?=
+
+gen_orig.c: mmx-tests.py
+ ./mmx-tests.py --kind=wrapper --wrapper-prefix=orig $(USE_XMM) > $@
+gen_orig.h: mmx-tests.py
+ ./mmx-tests.py --kind=wrapper_h --wrapper-prefix=orig $(USE_XMM) > $@
+gen_new.c: mmx-tests.py
+ ./mmx-tests.py --kind=wrapper --wrapper-prefix=new $(USE_XMM) > $@
+gen_new.h: mmx-tests.py
+ ./mmx-tests.py --kind=wrapper_h --wrapper-prefix=new $(USE_XMM) > $@
+gen_test.inc: mmx-tests.py
+ ./mmx-tests.py --kind=test $(USE_XMM) > $@
+gen_orig.o: gen_orig.c
+ $(OLDCC) -c $(COPTS) -O2 -o $@ $^
+gen_new.o: gen_new.c
+ $(NEWCC) -c $(COPTS) -O2 -o $@ $^
+test.o: test.c gen_test.inc gen_orig.h gen_new.h
+ $(TESTCC) -c $(COPTS) -o $@ test.c
+test: test.o gen_orig.o gen_new.o
+ $(TESTCC) $(COPTS) -o $@ $^ -lm
+
+clean:
+ rm -f gen_orig.c gen_orig.h gen_new.c gen_new.h gen_test.inc gen_orig.o gen_new.o test.o test
diff --git a/mmx-tests/mmx-tests.py b/mmx-tests/mmx-tests.py
new file mode 100755
--- /dev/null
+++ b/mmx-tests/mmx-tests.py
@@ -0,0 +1,301 @@
+#!/usr/bin/python3
+
+import argparse
+import sys
+
+# This is a list of all Intel functions and macros which take or
+# return an __m64.
+def do_mmx(fn):
+ # mmintrin.h
+ fn("_mm_cvtsi32_si64", "__m64", ("int", ))
+ fn("_mm_cvtsi64_si32", "int", ("__m64", ))
+ fn("_mm_cvtsi64_m64", "__m64", ("long long", ), condition='defined(__X86_64__) || defined(__clang__)')
+ fn("_mm_cvtm64_si64", "long long", ("__m64", ), condition='defined(__X86_64__) || defined(__clang__)')
+ fn("_mm_packs_pi16", "__m64", ("__m64", "__m64", ))
+ fn("_mm_packs_pi32", "__m64", ("__m64", "__m64", ))
+ fn("_mm_packs_pu16", "__m64", ("__m64", "__m64", ))
+ fn("_mm_unpackhi_pi8", "__m64", ("__m64", "__m64", ))
+ fn("_mm_unpackhi_pi16", "__m64", ("__m64", "__m64", ))
+ fn("_mm_unpackhi_pi32", "__m64", ("__m64", "__m64", ))
+ fn("_mm_unpacklo_pi8", "__m64", ("__m64", "__m64", ))
+ fn("_mm_unpacklo_pi16", "__m64", ("__m64", "__m64", ))
+ fn("_mm_unpacklo_pi32", "__m64", ("__m64", "__m64", ))
+ fn("_mm_add_pi8", "__m64", ("__m64", "__m64", ))
+ fn("_mm_add_pi16", "__m64", ("__m64", "__m64", ))
+ fn("_mm_add_pi32", "__m64", ("__m64", "__m64", ))
+ fn("_mm_adds_pi8", "__m64", ("__m64", "__m64", ))
+ fn("_mm_adds_pi16", "__m64", ("__m64", "__m64", ))
+ fn("_mm_adds_pu8", "__m64", ("__m64", "__m64", ))
+ fn("_mm_adds_pu16", "__m64", ("__m64", "__m64", ))
+ fn("_mm_sub_pi8", "__m64", ("__m64", "__m64", ))
+ fn("_mm_sub_pi16", "__m64", ("__m64", "__m64", ))
+ fn("_mm_sub_pi32", "__m64", ("__m64", "__m64", ))
+ fn("_mm_subs_pi8", "__m64", ("__m64", "__m64", ))
+ fn("_mm_subs_pi16", "__m64", ("__m64", "__m64", ))
+ fn("_mm_subs_pu8", "__m64", ("__m64", "__m64", ))
+ fn("_mm_subs_pu16", "__m64", ("__m64", "__m64", ))
+ fn("_mm_madd_pi16", "__m64", ("__m64", "__m64", ))
+ fn("_mm_mulhi_pi16", "__m64", ("__m64", "__m64", ))
+ fn("_mm_mullo_pi16", "__m64", ("__m64", "__m64", ))
+ fn("_mm_sll_pi16", "__m64", ("__m64", "__m64", ))
+ fn("_mm_slli_pi16", "__m64", ("__m64", "int", ))
+ fn("_mm_sll_pi32", "__m64", ("__m64", "__m64", ))
+ fn("_mm_slli_pi32", "__m64", ("__m64", "int", ))
+ fn("_mm_sll_si64", "__m64", ("__m64", "__m64", ))
+ fn("_mm_slli_si64", "__m64", ("__m64", "int", ))
+ fn("_mm_sra_pi16", "__m64", ("__m64", "__m64", ))
+ fn("_mm_srai_pi16", "__m64", ("__m64", "int", ))
+ fn("_mm_sra_pi32", "__m64", ("__m64", "__m64", ))
+ fn("_mm_srai_pi32", "__m64", ("__m64", "int", ))
+ fn("_mm_srl_pi16", "__m64", ("__m64", "__m64", ))
+ fn("_mm_srli_pi16", "__m64", ("__m64", "int", ))
+ fn("_mm_srl_pi32", "__m64", ("__m64", "__m64", ))
+ fn("_mm_srli_pi32", "__m64", ("__m64", "int", ))
+ fn("_mm_srl_si64", "__m64", ("__m64", "__m64", ))
+ fn("_mm_srli_si64", "__m64", ("__m64", "int", ))
+ fn("_mm_and_si64", "__m64", ("__m64", "__m64", ))
+ fn("_mm_andnot_si64", "__m64", ("__m64", "__m64", ))
+ fn("_mm_or_si64", "__m64", ("__m64", "__m64", ))
+ fn("_mm_xor_si64", "__m64", ("__m64", "__m64", ))
+ fn("_mm_cmpeq_pi8", "__m64", ("__m64", "__m64", ))
+ fn("_mm_cmpeq_pi16", "__m64", ("__m64", "__m64", ))
+ fn("_mm_cmpeq_pi32", "__m64", ("__m64", "__m64", ))
+ fn("_mm_cmpgt_pi8", "__m64", ("__m64", "__m64", ))
+ fn("_mm_cmpgt_pi16", "__m64", ("__m64", "__m64", ))
+ fn("_mm_cmpgt_pi32", "__m64", ("__m64", "__m64", ))
+ fn("_mm_setzero_si64", "__m64", ())
+ fn("_mm_set_pi32", "__m64", ("int", "int", ))
+ fn("_mm_set_pi16", "__m64", ("short", "short", "short", "short", ))
+ fn("_mm_set_pi8", "__m64", ("char", "char", "char", "char", "char", "char", "char", "char", ))
+ fn("_mm_set1_pi32", "__m64", ("int", ))
+ fn("_mm_set1_pi16", "__m64", ("short", ))
+ fn("_mm_set1_pi8", "__m64", ("char", ))
+ fn("_mm_setr_pi32", "__m64", ("int", "int", ))
+ fn("_mm_setr_pi16", "__m64", ("short", "short", "short", "short", ))
+ fn("_mm_setr_pi8", "__m64", ("char", "char", "char", "char", "char", "char", "char", "char", ))
+
+ # xmmintrin.h
+ fn("_mm_cvtps_pi32", "__m64", ("__m128", ))
+ fn("_mm_cvt_ps2pi", "__m64", ("__m128", ))
+ fn("_mm_cvttps_pi32", "__m64", ("__m128", ))
+ fn("_mm_cvtt_ps2pi", "__m64", ("__m128", ))
+ fn("_mm_cvtpi32_ps", "__m128", ("__m128", "__m64", ))
+ fn("_mm_cvt_pi2ps", "__m128", ("__m128", "__m64", ))
+ fn("_mm_loadh_pi", "__m128", ("__m128", "const __m64 *", ))
+ fn("_mm_loadl_pi", "__m128", ("__m128", "const __m64 *", ))
+ fn("_mm_storeh_pi", "void", ("__m64 *", "__m128", ))
+ fn("_mm_storel_pi", "void", ("__m64 *", "__m128", ))
+ fn("_mm_stream_pi", "void", ("__m64 *", "__m64", ))
+ fn("_mm_max_pi16", "__m64", ("__m64", "__m64", ))
+ fn("_mm_max_pu8", "__m64", ("__m64", "__m64", ))
+ fn("_mm_min_pi16", "__m64", ("__m64", "__m64", ))
+ fn("_mm_min_pu8", "__m64", ("__m64", "__m64", ))
+ fn("_mm_movemask_pi8", "int", ("__m64", ))
+ fn("_mm_mulhi_pu16", "__m64", ("__m64", "__m64", ))
+ fn("_mm_maskmove_si64", "void", ("__m64", "__m64", "char *", ))
+ fn("_mm_avg_pu8", "__m64", ("__m64", "__m64", ))
+ fn("_mm_avg_pu16", "__m64", ("__m64", "__m64", ))
+ fn("_mm_sad_pu8", "__m64", ("__m64", "__m64", ))
+ fn("_mm_cvtpi16_ps", "__m128", ("__m64", ))
+ fn("_mm_cvtpu16_ps", "__m128", ("__m64", ))
+ fn("_mm_cvtpi8_ps", "__m128", ("__m64", ))
+ fn("_mm_cvtpu8_ps", "__m128", ("__m64", ))
+ fn("_mm_cvtpi32x2_ps", "__m128", ("__m64", "__m64", ))
+ fn("_mm_cvtps_pi16", "__m64", ("__m128", ))
+ fn("_mm_cvtps_pi8", "__m64", ("__m128", ))
+
+ fn("_mm_extract_pi16", "int", ("__m64", "int", ), imm_range=(0, 3))
+ fn("_mm_insert_pi16", "__m64", ("__m64", "int", "int", ), imm_range=(0, 3))
+ fn("_mm_shuffle_pi16", "__m64", ("__m64", "int", ), imm_range=(0, 255))
+
+ # emmintrin.h
+ fn("_mm_cvtpd_pi32", "__m64", ("__m128d", ))
+ fn("_mm_cvttpd_pi32", "__m64", ("__m128d", ))
+ fn("_mm_cvtpi32_pd", "__m128d", ("__m64", ))
+ fn("_mm_add_si64", "__m64", ("__m64", "__m64", ))
+ fn("_mm_mul_su32", "__m64", ("__m64", "__m64", ))
+ fn("_mm_sub_si64", "__m64", ("__m64", "__m64", ))
+ fn("_mm_set_epi64", "__m128i", ("__m64", "__m64", ))
+ fn("_mm_set1_epi64", "__m128i", ("__m64", ))
+ fn("_mm_setr_epi64", "__m128i", ("__m64", "__m64", ))
+ fn("_mm_movepi64_pi64", "__m64", ("__m128i", ))
+ fn("_mm_movpi64_epi64", "__m128i", ("__m64", ))
+
+ # tmmintrin.h
+ fn("_mm_abs_pi8", "__m64", ("__m64", ), target='ssse3')
+ fn("_mm_abs_pi16", "__m64", ("__m64", ), target='ssse3')
+ fn("_mm_abs_pi32", "__m64", ("__m64", ), target='ssse3')
+ fn("_mm_hadd_pi16", "__m64", ("__m64", "__m64", ), target='ssse3')
+ fn("_mm_hadd_pi32", "__m64", ("__m64", "__m64", ), target='ssse3')
+ fn("_mm_hadds_pi16", "__m64", ("__m64", "__m64", ), target='ssse3')
+ fn("_mm_hsub_pi16", "__m64", ("__m64", "__m64", ), target='ssse3')
+ fn("_mm_hsub_pi32", "__m64", ("__m64", "__m64", ), target='ssse3')
+ fn("_mm_hsubs_pi16", "__m64", ("__m64", "__m64", ), target='ssse3')
+ fn("_mm_maddubs_pi16", "__m64", ("__m64", "__m64", ), target='ssse3')
+ fn("_mm_mulhrs_pi16", "__m64", ("__m64", "__m64", ), target='ssse3')
+ fn("_mm_shuffle_pi8", "__m64", ("__m64", "__m64", ), target='ssse3')
+ fn("_mm_sign_pi8", "__m64", ("__m64", "__m64", ), target='ssse3')
+ fn("_mm_sign_pi16", "__m64", ("__m64", "__m64", ), target='ssse3')
+ fn("_mm_sign_pi32", "__m64", ("__m64", "__m64", ), target='ssse3')
+ fn("_mm_alignr_pi8", "__m64", ("__m64", "__m64", "int", ), imm_range=(0, 18), target='ssse3')
+
+# Generate a file full of wrapper functions for each of the above mmx
+# functions.
+#
+# If use_xmm is set, pass/return arguments as __m128 rather than as
+# __m64.
+def define_wrappers(prefix, use_xmm=True, header=False):
+ if header:
+ print('#pragma once')
+
+ print('#include <immintrin.h>')
+ if use_xmm and not header:
+ print('#define m128_to_m64(x) ((__m64)((__v2di)(x))[0])')
+ print('#define m64_to_m128(x) ((__m128)(__v2di){(long long)(__m64)(x), 0})')
+
+ def fn(name, ret_ty, arg_tys, imm_range=None, target=None, condition=None):
+ if condition:
+ print(f'#if {condition}')
+ convert_ret = False
+ if use_xmm and ret_ty == '__m64':
+ ret_ty = '__v2di'
+ convert_ret = True
+
+ if target:
+ attr = f'__attribute__((target("{target}"))) '
+ else:
+ attr = ''
+
+ if imm_range:
+ arg_tys = arg_tys[:-1]
+ def translate_type(t):
+ if use_xmm and t == '__m64':
+ return '__m128'
+ return t
+ def translate_arg(t, a):
+ if use_xmm and t == '__m64':
+ return f'm128_to_m64({a})'
+ return a
+
+ arg_decl = ', '.join(f'{translate_type(v[1])} arg_{v[0]}' for v in enumerate(arg_tys)) or 'void'
+ call_args = ', '.join(translate_arg(v[1], f'arg_{v[0]}') for v in enumerate(arg_tys))
+
+ def create_fn(suffix, extraarg):
+ if header:
+ print(f'{ret_ty} {prefix}_{name}{suffix}({arg_decl});')
+ else:
+ print(f'{attr}{ret_ty} {prefix}_{name}{suffix}({arg_decl})')
+ if use_xmm and convert_ret:
+ print(f'{{ return ({ret_ty})m64_to_m128({name}({call_args}{extraarg})); }}')
+ else:
+ print(f'{{ return {name}({call_args}{extraarg}); }}')
+
+ if imm_range:
+ for i in range(imm_range[0], imm_range[1]+1):
+ create_fn(f'_{i}', f', {i}')
+ else:
+ create_fn('', '')
+ if condition:
+ print('#endif')
+
+ do_mmx(fn)
+
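For reference, with the Makefile's default configuration (no --use-xmm) the generated wrapper for a typical two-argument entry is just a thin pass-through, roughly:

  __m64 orig__mm_add_pi8(__m64 arg_0, __m64 arg_1)
  { return _mm_add_pi8(arg_0, arg_1); }

With --use-xmm, arguments and return values instead travel as __m128/__v2di and m128_to_m64/m64_to_m128 convert at the call boundary, which lets the test driver place deliberate garbage in the upper 64 bits.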
+
+# Create a C file that tests an "orig" set of wrappers against a "new"
+# set of wrappers.
+def define_tests(use_xmm=False):
+ def fn(name, ret_ty, arg_tys, imm_range=None, target=None, condition=None):
+ if condition:
+ print(f'#if {condition}')
+ arg_decl = ', '.join(f'{v[1]} arg_{v[0]}' for v in enumerate(arg_tys)) or 'void'
+ print(f' // {ret_ty} {name}({arg_decl});')
+
+ if imm_range:
+ for i in range(imm_range[0], imm_range[1]+1):
+ fn(name + f'_{i}', ret_ty, arg_tys[:-1], target=target)
+ return
+
+ convert_pre = convert_post = ''
+ if use_xmm and ret_ty == '__m64':
+ convert_pre = 'm128_to_m64('
+ convert_post = ')'
+
+ args=[]
+ loops=[]
+ printf_fmts = []
+ printf_args = []
+ for arg_ty in arg_tys:
+ v=len(loops)
+ if arg_ty in ('char', 'short'):
+ loops.append(f' for(int l{v} = 0; l{v} < arraysize(short_vals); ++l{v}) {{')
+ args.append(f'({arg_ty})short_vals[l{v}]')
+ printf_fmts.append('%016x')
+ printf_args.append(f'short_vals[l{v}]')
+ elif arg_ty in ('int', 'long long'):
+ loops.append(f' for(int l{v} = 0; l{v} < arraysize(mmx_vals); ++l{v}) {{')
+ args.append(f'({arg_ty})mmx_vals[l{v}]')
+ printf_fmts.append('%016llx')
+ printf_args.append(f'mmx_vals[l{v}]')
+ elif arg_ty == '__m64':
+ loops.append(f' for(int l{v} = 0; l{v} < arraysize(mmx_vals); ++l{v}) {{')
+ if use_xmm:
+ loops.append(f' for(int l{v+1} = 0; l{v+1} < arraysize(padding_mmx_vals); ++l{v+1}) {{')
+ args.append(f'(__m128)(__m128i){{mmx_vals[l{v}], padding_mmx_vals[l{v+1}]}}')
+ printf_fmts.append('(__m128i){%016llx, %016llx}')
+ printf_args.append(f'mmx_vals[l{v}], padding_mmx_vals[l{v+1}]')
+ else:
+ args.append(f'({arg_ty})mmx_vals[l{v}]')
+ printf_fmts.append('%016llx')
+ printf_args.append(f'mmx_vals[l{v}]')
+ elif arg_ty in ('__m128', '__m128i', '__m128d'):
+ loops.append(f' for(int l{v} = 0; l{v} < arraysize(mmx_vals); ++l{v}) {{')
+ loops.append(f' for(int l{v+1} = 0; l{v+1} < arraysize(mmx_vals); ++l{v+1}) {{')
+ args.append(f'({arg_ty})(__m128i){{mmx_vals[l{v}], mmx_vals[l{v+1}]}}')
+ printf_fmts.append('(__m128i){%016llx, %016llx}')
+ printf_args.append(f'mmx_vals[l{v}], mmx_vals[l{v+1}]')
+ elif arg_ty == 'const __m64 *':
+ loops.append(f' for(int l{v} = 0; l{v} < arraysize(mmx_vals); ++l{v}) {{\n' +
+ f' mem.m64 = (__m64)mmx_vals[l{v}];')
+ args.append(f'&mem.m64')
+ printf_fmts.append('&mem.m64 /* %016llx */')
+ printf_args.append(f'(long long)mem.m64')
+ else:
+ print(' // -> UNSUPPORTED')
+ return
+
+ printf_fmt_str = '"' + ', '.join(printf_fmts) + '"'
+ if printf_args:
+ printf_arg_str = ', ' + ','.join(printf_args)
+ else:
+ printf_arg_str = ''
+
+ print('\n'.join(loops))
+ print(f'''
+ clear_exc_flags();
+ {ret_ty} orig_res = {convert_pre}orig_{name}({", ".join(args)}){convert_post};
+ int orig_exc = get_exc_flags();
+ clear_exc_flags();
+ {ret_ty} new_res = {convert_pre}new_{name}({", ".join(args)}){convert_post};
+ int new_exc = get_exc_flags();
+ check_mismatch("{name}", orig_exc, new_exc, &orig_res, &new_res, sizeof(orig_res), {printf_fmt_str}{printf_arg_str});
+''')
+ print(' }\n' * len(loops))
+ print()
+ if condition:
+ print('#endif')
+
+ do_mmx(fn)
+
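Each supported entry in gen_test.inc expands into nested loops over mmx_vals that call the old and new wrappers with identical inputs and compare both the result bytes and the FP exception flags. For a two-argument __m64 function, the generated stanza looks roughly like:

  // __m64 _mm_add_pi8(__m64 arg_0, __m64 arg_1);
  for (int l0 = 0; l0 < arraysize(mmx_vals); ++l0) {
    for (int l1 = 0; l1 < arraysize(mmx_vals); ++l1) {
      clear_exc_flags();
      __m64 orig_res = orig__mm_add_pi8((__m64)mmx_vals[l0], (__m64)mmx_vals[l1]);
      int orig_exc = get_exc_flags();
      clear_exc_flags();
      __m64 new_res = new__mm_add_pi8((__m64)mmx_vals[l0], (__m64)mmx_vals[l1]);
      int new_exc = get_exc_flags();
      check_mismatch("_mm_add_pi8", orig_exc, new_exc, &orig_res, &new_res,
                     sizeof(orig_res), "%016llx, %016llx",
                     mmx_vals[l0], mmx_vals[l1]);
    }
  }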
+
+parser = argparse.ArgumentParser(description='Generate mmx test code.')
+parser.add_argument('--kind', choices=['wrapper', 'wrapper_h', 'test'])
+parser.add_argument('--wrapper-prefix', default='orig')
+parser.add_argument('--use-xmm', action='store_true')
+
+args = parser.parse_args()
+if args.kind == 'wrapper':
+ define_wrappers(args.wrapper_prefix, use_xmm=args.use_xmm, header=False)
+elif args.kind == 'wrapper_h':
+ define_wrappers(args.wrapper_prefix, use_xmm=args.use_xmm, header=True)
+elif args.kind == 'test':
+ define_tests(use_xmm=args.use_xmm)
diff --git a/mmx-tests/test.c b/mmx-tests/test.c
new file mode 100644
--- /dev/null
+++ b/mmx-tests/test.c
@@ -0,0 +1,237 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <fenv.h>
+#include <sys/mman.h>
+
+#include "gen_orig.h"
+#include "gen_new.h"
+
+
+// A bunch of helper functions for the code in gen_test.inc
+#define m128_to_m64(x) (__m64)((__v2di)(x))[0]
+
+#define arraysize(a) (sizeof(a) / sizeof(*a))
+
+static void dump_mem(void *ptr, int nbytes) {
+ for (int i = 0; i < nbytes; ++i) {
+ printf(" %02x", ((unsigned char*)ptr)[i]);
+ }
+ printf("\n");
+}
+
+static int get_exc_flags() {
+ return fetestexcept(FE_ALL_EXCEPT | __FE_DENORM);
+}
+
+static void clear_exc_flags() {
+ feclearexcept(FE_ALL_EXCEPT | __FE_DENORM);
+}
+
+static void dump_exc_flags(int exc_flags) {
+ printf("%x", exc_flags);
+ if (exc_flags & FE_INEXACT)
+ printf(" inexact");
+ if (exc_flags & FE_DIVBYZERO)
+ printf(" divbyzero");
+ if (exc_flags & FE_UNDERFLOW)
+ printf(" underflow");
+ if (exc_flags & FE_OVERFLOW)
+ printf(" overflow");
+ if (exc_flags & FE_INVALID)
+ printf(" invalid");
+ if (exc_flags & __FE_DENORM)
+ printf(" denormal");
+}
+
+static void dump_result(int orig_exc, int new_exc, void *orig_data, void *new_data, int nbytes) {
+ printf(" orig_exc = ");
+ dump_exc_flags(orig_exc);
+ printf(" new_exc = ");
+ dump_exc_flags(new_exc);
+ printf("\n");
+ printf(" orig");
+ dump_mem(orig_data, nbytes);
+ printf(" new ");
+ dump_mem(new_data, nbytes);
+}
+
+static void check_mismatch(const char *name, int orig_exc, int new_exc,
+ void *orig_data, void *new_data, int nbytes,
+ const char *printf_fmt, ...) {
+ if (orig_exc != new_exc || memcmp(orig_data, new_data, nbytes)) {
+ va_list args;
+ va_start(args, printf_fmt);
+ printf("mismatch %s(", name);
+ vprintf(printf_fmt, args);
+ printf("):\n");
+ dump_result(orig_exc, new_exc, orig_data, new_data, nbytes);
+ va_end(args);
+ }
+}
+
+unsigned short short_vals[] = {
+ 0x0000,
+ 0x0001,
+ 0xffee,
+ 0xffff,
+};
+
+unsigned long long padding_mmx_vals[] = {
+ 0x0000000000000000LL,
+ 0xffffffffffffffffLL,
+ 0x7fc000007fc00000LL, // float nan nan
+ 0xfff8000000000000LL, // -nan
+};
+
+unsigned long long mmx_vals[] = {
+ 0x0000000000000000LL,
+ 0x0000000000000001LL,
+ 0x0000000000000002LL,
+ 0x0000000000000003LL,
+ 0x0000000000000004LL,
+ 0x0000000000000005LL,
+ 0x0000000000000006LL,
+ 0x0000000000000007LL,
+ 0x0000000000000008LL,
+ 0x0000000000000009LL,
+ 0x000000000000000aLL,
+ 0x000000000000000bLL,
+ 0x000000000000000cLL,
+ 0x000000000000000dLL,
+ 0x000000000000000eLL,
+ 0x000000000000000fLL,
+ 0x0000000000000100LL,
+ 0x0000000000010000LL,
+ 0x0000000001000000LL,
+ 0x0000000100000000LL,
+ 0x0000010000000000LL,
+ 0x0001000000000000LL,
+ 0x0100000000000000LL,
+ 0x0101010101010101LL,
+ 0x0102030405060708LL,
+ 0x1234567890abcdefLL,
+ 0x007f007f007f007fLL,
+ 0x7f007f007f007f00LL,
+ 0x7f7f7f7f7f7f7f7fLL,
+ 0x8000800080008000LL,
+ 0x0080008000800080LL,
+ 0x8080808080808080LL,
+ 0x7fff7fff7fff7fffLL,
+ 0x8000800080008000LL,
+ 0x7fffffff7fffffffLL,
+ 0x8000000080000000LL,
+ 0x0000777700006666LL,
+ 0x7777000066660000LL,
+ 0x0000ffff0000eeeeLL,
+ 0xffff0000eeee0000LL,
+ 0x7700660055004400LL,
+ 0x0077006600550044LL,
+ 0xff00ee00dd00cc00LL,
+ 0x00ff00ee00dd00ccLL,
+ 0xffffffffffffffffLL,
+ 0x3ff0000000000000LL, // 1.0
+ 0x3ff8000000000000LL, // 1.5
+ 0x4000000000000000LL, // 2.0
+ 0x3f8000003fc00000LL, // float 1.0 1.5
+ 0x3fc0000040000000LL, // float 1.5 2.0
+ 0x7ff0000000000000LL, // inf
+ 0x7f8000007f800000LL, // float inf inf
+ 0xfff0000000000000LL, // -inf
+ 0xff800000ff800000LL, // float -inf -inf
+ 0x7ff8000000000000LL, // nan
+ 0x7fc000007fc00000LL, // float nan nan
+ 0xfff8000000000000LL, // -nan
+ 0xffc00000ffc00000LL, // float -nan -nan
+};
+
+struct __attribute__((aligned(sizeof(__m128)))) Mem {
+ __m64 dummy;
+ __m64 m64;
+} mem, mem2;
+
+// These 3 could be autogenerated...but I didn't add support for stores to the generator.
+void test_stores() {
+ // void _mm_storeh_pi(__m64 * arg_0, __m128 arg_1);
+ for(int l0 = 0; l0 < arraysize(mmx_vals); ++l0) {
+ for(int l1 = 0; l1 < arraysize(mmx_vals); ++l1) {
+ clear_exc_flags();
+ orig__mm_storeh_pi(&mem.m64, (__m128)(__m128i){mmx_vals[l0], mmx_vals[l1]});
+ int orig_exc = get_exc_flags();
+ clear_exc_flags();
+ new__mm_storeh_pi(&mem2.m64, (__m128)(__m128i){mmx_vals[l0], mmx_vals[l1]});
+ int new_exc = get_exc_flags();
+ check_mismatch("_mm_storeh_pi", orig_exc, new_exc, &mem.m64, &mem2.m64, sizeof(__m64),
+ "&mem.m64, (__m128i){%016llx, %016llx},", mmx_vals[l0], mmx_vals[l1]);
+ }
+ }
+
+ // void _mm_storel_pi(__m64 * arg_0, __m128 arg_1);
+ for(int l0 = 0; l0 < arraysize(mmx_vals); ++l0) {
+ for(int l1 = 0; l1 < arraysize(mmx_vals); ++l1) {
+ clear_exc_flags();
+ orig__mm_storel_pi(&mem.m64, (__m128)(__m128i){mmx_vals[l0], mmx_vals[l1]});
+ int orig_exc = get_exc_flags();
+ clear_exc_flags();
+ new__mm_storel_pi(&mem2.m64, (__m128)(__m128i){mmx_vals[l0], mmx_vals[l1]});
+ int new_exc = get_exc_flags();
+ check_mismatch("_mm_storeh_pi", orig_exc, new_exc, &mem.m64, &mem2.m64, sizeof(__m64),
+ "&mem.m64, (__m128i){%016llx, %016llx},", mmx_vals[l0], mmx_vals[l1]);
+ }
+ }
+
+ // void _mm_stream_pi(__m64 * arg_0, __m64 arg_1);
+ for(int l0 = 0; l0 < arraysize(mmx_vals); ++l0) {
+ clear_exc_flags();
+ orig__mm_stream_pi(&mem.m64, (__m64)mmx_vals[l0]);
+ int orig_exc = get_exc_flags();
+ clear_exc_flags();
+ new__mm_stream_pi(&mem2.m64, (__m64)mmx_vals[l0]);
+ int new_exc = get_exc_flags();
+ check_mismatch("_mm_stream_pi", orig_exc, new_exc, &mem.m64, &mem2.m64, sizeof(__m64),
+ "&mem.m64, %016llx,", mmx_vals[l0]);
+ }
+}
+
+// Test that the nominally 64-bit maskmove doesn't trap at the edges of
+// non-writable memory, despite being implemented by a 128-bit write.
+void test_maskmove() {
+ // Create a page of memory with an inaccessible page on either side.
+ char *map = mmap(0, 3 * 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+ if (map == MAP_FAILED)
+ abort();
+ if (mprotect(map, 4096, PROT_NONE))
+ abort();
+ if (mprotect(map + 4096 * 2, 4096, PROT_NONE))
+ abort();
+ long long init_val = 0xffeeddccbbaa9900;
+ long long expected = 0x11ee3344bb669900;
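  // How 'expected' is derived: maskmove writes a data byte where the mask
  // byte's high bit is set and leaves the init byte alone otherwise. The mask
  // bytes (low to high) are 00 00 80 00 80 80 00 80, so data bytes 2, 4, 5 and 7
  // of 0x1122334455667788 replace those of 0xffeeddccbbaa9900, giving
  // 0x11ee3344bb669900.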
+ for (int offset = 0; offset < 16+9; ++offset) {
+ char *copy_location = map + 4096 + (offset > 16 ? 4096 - 32 + offset : offset);
+ memcpy(copy_location, &init_val, 8);
+ new__mm_maskmove_si64((__m64)0x1122334455667788LL, (__m64)0x8000808000800000, copy_location);
+ long long result;
+ memcpy(&result, copy_location, 8);
+ if (memcmp(&expected, &result, 8) != 0) {
+ printf("test_maskmove: wrong value was stored %llx vs %llx\n", result, expected);
+ return;
+ }
+ }
+}
+
+void test_generated() {
+ #include "gen_test.inc"
+}
+
+int main() {
+ int rounding[] = {FE_TONEAREST, FE_UPWARD, FE_DOWNWARD, FE_TOWARDZERO};
+ for (int i = 0; i < 4; ++i)
+ {
+ fesetround(rounding[i]);
+
+ test_maskmove();
+ test_stores();
+ test_generated();
+ }
+}