diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def --- a/clang/include/clang/Basic/BuiltinsX86.def +++ b/clang/include/clang/Basic/BuiltinsX86.def @@ -157,8 +157,8 @@ TARGET_BUILTIN(__builtin_ia32_pmulhuw, "V4sV4sV4s", "ncV:64:", "mmx,sse") TARGET_BUILTIN(__builtin_ia32_psadbw, "V4sV8cV8c", "ncV:64:", "mmx,sse") TARGET_BUILTIN(__builtin_ia32_pshufw, "V4sV4sIc", "ncV:64:", "mmx,sse") -TARGET_BUILTIN(__builtin_ia32_vec_ext_v4hi, "iV4sIi", "ncV:64:", "mmx,sse") -TARGET_BUILTIN(__builtin_ia32_vec_set_v4hi, "V4sV4siIi", "ncV:64:", "mmx,sse") +TARGET_BUILTIN(__builtin_ia32_vec_ext_v4hi, "sV4sIi", "ncV:64:", "sse") +TARGET_BUILTIN(__builtin_ia32_vec_set_v4hi, "V4sV4ssIi", "ncV:64:", "sse") // MMX+SSE2 TARGET_BUILTIN(__builtin_ia32_cvtpd2pi, "V2iV2d", "ncV:64:", "mmx,sse2") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -12097,6 +12097,7 @@ case X86::BI__builtin_ia32_vec_init_v2si: return Builder.CreateBitCast(BuildVector(Ops), llvm::Type::getX86_MMXTy(getLLVMContext())); + case X86::BI__builtin_ia32_vec_ext_v4hi: case X86::BI__builtin_ia32_vec_ext_v2si: case X86::BI__builtin_ia32_vec_ext_v16qi: case X86::BI__builtin_ia32_vec_ext_v8hi: @@ -12115,6 +12116,7 @@ // Otherwise we could just do this in the header file. return Builder.CreateExtractElement(Ops[0], Index); } + case X86::BI__builtin_ia32_vec_set_v4hi: case X86::BI__builtin_ia32_vec_set_v16qi: case X86::BI__builtin_ia32_vec_set_v8hi: case X86::BI__builtin_ia32_vec_set_v4si: diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h --- a/clang/lib/Headers/emmintrin.h +++ b/clang/lib/Headers/emmintrin.h @@ -35,7 +35,9 @@ /* Define the default attributes for the functions in this file. */ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2"), __min_vector_width__(128))) -#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse2"), __min_vector_width__(64))) + +#define __trunc64(x) (__m64)__builtin_shufflevector((__v2di)(x), __extension__ (__v2di){}, 0) +#define __anyext128(x) (__m128i)__builtin_shufflevector((__v2si)(x), __extension__ (__v2si){}, 0, 1, -1, -1) /// Adds lower double-precision values in both operands and returns the /// sum in the lower 64 bits of the result. The upper 64 bits of the result @@ -1504,10 +1506,10 @@ /// \param __a /// A 128-bit vector of [2 x double]. /// \returns A 64-bit vector of [2 x i32] containing the converted values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtpd_pi32(__m128d __a) { - return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a); + return __trunc64(__builtin_ia32_cvtpd2dq((__v2df)__a)); } /// Converts the two double-precision floating-point elements of a @@ -1524,10 +1526,10 @@ /// \param __a /// A 128-bit vector of [2 x double]. /// \returns A 64-bit vector of [2 x i32] containing the converted values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvttpd_pi32(__m128d __a) { - return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a); + return __trunc64(__builtin_ia32_cvttpd2dq((__v2df)__a)); } /// Converts the two signed 32-bit integer elements of a 64-bit vector of @@ -1541,10 +1543,10 @@ /// \param __a /// A 64-bit vector of [2 x i32]. /// \returns A 128-bit vector of [2 x double] containing the converted values. 
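/* Editorial sketch, not part of the patch: the pattern used throughout these
   headers is "widen the __m64 operand into an XMM register, run the SSE2 form
   of the instruction, keep only the low 64 bits".  For the conversions above
   this is exact: CVTPD2DQ writes the same two 32-bit results (rounded per
   MXCSR) into bits [63:0] that CVTPD2PI wrote into an MMX register, and zeroes
   bits [127:64], so __trunc64() loses nothing. */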
-static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX +static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtpi32_pd(__m64 __a) { - return __builtin_ia32_cvtpi2pd((__v2si)__a); + return (__m128d) __builtin_convertvector((__v2si)__a, __v2df); } /// Returns the low-order element of a 128-bit vector of [2 x double] as @@ -2175,10 +2177,10 @@ /// \param __b /// A 64-bit integer. /// \returns A 64-bit integer containing the sum of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_add_si64(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b); + return (__m64)(((unsigned long long)__a) + ((unsigned long long)__b)); } /// Adds the corresponding elements of two 128-bit vectors of [2 x i64], @@ -2507,10 +2509,11 @@ /// \param __b /// A 64-bit integer containing one of the source operands. /// \returns A 64-bit integer vector containing the product of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mul_su32(__m64 __a, __m64 __b) { - return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b); + return __trunc64(__builtin_ia32_pmuludq128((__v4si)__anyext128(__a), + (__v4si)__anyext128(__b))); } /// Multiplies 32-bit unsigned integer values contained in the lower @@ -2621,10 +2624,10 @@ /// A 64-bit integer vector containing the subtrahend. /// \returns A 64-bit integer vector containing the difference of the values in /// the operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sub_si64(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b); + return (__m64)((unsigned long long)__a - (unsigned long long)__b); } /// Subtracts the corresponding elements of two [2 x i64] vectors. @@ -4965,8 +4968,10 @@ #if defined(__cplusplus) } // extern "C" #endif + +#undef __anyext128 +#undef __trunc64 #undef __DEFAULT_FN_ATTRS -#undef __DEFAULT_FN_ATTRS_MMX #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) diff --git a/clang/lib/Headers/mmintrin.h b/clang/lib/Headers/mmintrin.h --- a/clang/lib/Headers/mmintrin.h +++ b/clang/lib/Headers/mmintrin.h @@ -17,8 +17,29 @@ typedef short __v4hi __attribute__((__vector_size__(8))); typedef char __v8qi __attribute__((__vector_size__(8))); +/* Unsigned types */ +typedef unsigned long long __v1du __attribute__ ((__vector_size__ (8))); +typedef unsigned int __v2su __attribute__ ((__vector_size__ (8))); +typedef unsigned short __v4hu __attribute__((__vector_size__(8))); +typedef unsigned char __v8qu __attribute__((__vector_size__(8))); + +/* We need an explicitly signed variant for char. Note that this shouldn't + * appear in the interface though. */ +typedef signed char __v8qs __attribute__((__vector_size__(8))); + +/* SSE/SSE2 types */ +typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16))); +typedef long long __v2di __attribute__ ((__vector_size__ (16))); +typedef int __v4si __attribute__((__vector_size__(16))); +typedef short __v8hi __attribute__((__vector_size__(16))); +typedef char __v16qi __attribute__((__vector_size__(16))); + /* Define the default attributes for the functions in this file. 
*/ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mmx"), __min_vector_width__(64))) +#define __DEFAULT_FN_ATTRS_SSE2 __attribute__((__always_inline__, __nodebug__, __target__("sse2"), __min_vector_width__(64))) + +#define __trunc64(x) (__m64)__builtin_shufflevector((__v2di)(x), __extension__ (__v2di){}, 0) +#define __anyext128(x) (__m128i)__builtin_shufflevector((__v2si)(x), __extension__ (__v2si){}, 0, 1, -1, -1) +#define __extract2_32(a) (__m64)__builtin_shufflevector((__v4si)(a), __extension__ (__v4si){}, 0, 2); /// Clears the MMX state by setting the state of the x87 stack registers /// to empty. @@ -44,10 +65,10 @@ /// A 32-bit integer value. /// \returns A 64-bit integer vector. The lower 32 bits contain the value of the /// parameter. The upper 32 bits are set to 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtsi32_si64(int __i) { - return (__m64)__builtin_ia32_vec_init_v2si(__i, 0); + return __extension__ (__m64)(__v2si){__i, 0}; } /// Returns the lower 32 bits of a 64-bit integer vector as a 32-bit @@ -61,10 +82,10 @@ /// A 64-bit integer vector. /// \returns A 32-bit signed integer value containing the lower 32 bits of the /// parameter. -static __inline__ int __DEFAULT_FN_ATTRS +static __inline__ int __DEFAULT_FN_ATTRS_SSE2 _mm_cvtsi64_si32(__m64 __m) { - return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0); + return ((__v2si)__m)[0]; } /// Casts a 64-bit signed integer value into a 64-bit integer vector. @@ -77,7 +98,7 @@ /// A 64-bit signed integer. /// \returns A 64-bit integer vector containing the same bitwise pattern as the /// parameter. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtsi64_m64(long long __i) { return (__m64)__i; @@ -93,7 +114,7 @@ /// A 64-bit integer vector. /// \returns A 64-bit signed integer containing the same bitwise pattern as the /// parameter. -static __inline__ long long __DEFAULT_FN_ATTRS +static __inline__ long long __DEFAULT_FN_ATTRS_SSE2 _mm_cvtm64_si64(__m64 __m) { return (long long)__m; @@ -123,10 +144,11 @@ /// [4 x i8] values are written to the upper 32 bits of the result. /// \returns A 64-bit integer vector of [8 x i8] containing the converted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_packs_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2); + return __extract2_32(__builtin_ia32_packsswb128((__v8hi)__anyext128(__m1), + (__v8hi)__anyext128(__m2))); } /// Converts 32-bit signed integers from both 64-bit integer vector @@ -153,10 +175,11 @@ /// [2 x i16] values are written to the upper 32 bits of the result. /// \returns A 64-bit integer vector of [4 x i16] containing the converted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_packs_pi32(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2); + return __extract2_32(__builtin_ia32_packssdw128((__v4si)__anyext128(__m1), + (__v4si)__anyext128(__m2))); } /// Converts 16-bit signed integers from both 64-bit integer vector @@ -183,10 +206,11 @@ /// the upper 32 bits of the result. /// \returns A 64-bit integer vector of [8 x i8] containing the converted /// values. 
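/* Editorial sketch, not part of the patch: written out without the macros,
   __anyext128() places the 64-bit value into lanes 0-1 of an XMM vector and
   leaves the upper lanes undefined (shuffle index -1), and __trunc64() keeps
   only the low 64-bit lane.  Assuming SSE2 and the types defined above, the
   round trip looks like this (hypothetical helper, for illustration only): */
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
__mmx_roundtrip_demo(__m64 __m)
{
  __m128i __wide = (__m128i)__builtin_shufflevector(
      (__v2si)__m, __extension__ (__v2si){}, 0, 1, -1, -1);  /* __anyext128 */
  return (__m64)__builtin_shufflevector(
      (__v2di)__wide, __extension__ (__v2di){}, 0);          /* __trunc64 */
}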
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_packs_pu16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2); + return __extract2_32(__builtin_ia32_packuswb128((__v8hi)__anyext128(__m1), + (__v8hi)__anyext128(__m2))); } /// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8] @@ -210,10 +234,11 @@ /// Bits [63:56] are written to bits [63:56] of the result. /// \returns A 64-bit integer vector of [8 x i8] containing the interleaved /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2); + return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2, + 4, 12, 5, 13, 6, 14, 7, 15); } /// Unpacks the upper 32 bits from two 64-bit integer vectors of @@ -233,10 +258,11 @@ /// Bits [63:48] are written to bits [63:48] of the result. /// \returns A 64-bit integer vector of [4 x i16] containing the interleaved /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2); + return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2, + 2, 6, 3, 7); } /// Unpacks the upper 32 bits from two 64-bit integer vectors of @@ -254,10 +280,10 @@ /// the upper 32 bits of the result. /// \returns A 64-bit integer vector of [2 x i32] containing the interleaved /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2); + return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 1, 3); } /// Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8] @@ -281,10 +307,11 @@ /// Bits [31:24] are written to bits [63:56] of the result. /// \returns A 64-bit integer vector of [8 x i8] containing the interleaved /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2); + return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2, + 0, 8, 1, 9, 2, 10, 3, 11); } /// Unpacks the lower 32 bits from two 64-bit integer vectors of @@ -304,10 +331,11 @@ /// Bits [31:16] are written to bits [63:48] of the result. /// \returns A 64-bit integer vector of [4 x i16] containing the interleaved /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2); + return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2, + 0, 4, 1, 5); } /// Unpacks the lower 32 bits from two 64-bit integer vectors of @@ -325,10 +353,10 @@ /// the upper 32 bits of the result. /// \returns A 64-bit integer vector of [2 x i32] containing the interleaved /// values. 
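/* Editorial note, not part of the patch: with __builtin_shufflevector the
   interleaving is spelled out in the index list.  Indices 0-3 select from the
   first operand and 4-7 from the second, so for _mm_unpacklo_pi16 above,
   {0, 4, 1, 5} turns m1 = {a0,a1,a2,a3}, m2 = {b0,b1,b2,b3} into
   {a0, b0, a1, b1}, i.e. the PUNPCKLWD result, with no builtin needed. */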
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2); + return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 0, 2); } /// Adds each 8-bit integer element of the first 64-bit integer vector @@ -346,10 +374,10 @@ /// A 64-bit integer vector of [8 x i8]. /// \returns A 64-bit integer vector of [8 x i8] containing the sums of both /// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_add_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2); + return (__m64)(((__v8qu)__m1) + ((__v8qu)__m2)); } /// Adds each 16-bit integer element of the first 64-bit integer vector @@ -367,10 +395,10 @@ /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the sums of both /// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_add_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2); + return (__m64)(((__v4hu)__m1) + ((__v4hu)__m2)); } /// Adds each 32-bit integer element of the first 64-bit integer vector @@ -388,10 +416,10 @@ /// A 64-bit integer vector of [2 x i32]. /// \returns A 64-bit integer vector of [2 x i32] containing the sums of both /// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_add_pi32(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2); + return (__m64)(((__v2su)__m1) + ((__v2su)__m2)); } /// Adds each 8-bit signed integer element of the first 64-bit integer @@ -410,10 +438,11 @@ /// A 64-bit integer vector of [8 x i8]. /// \returns A 64-bit integer vector of [8 x i8] containing the saturated sums /// of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_adds_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2); + return __trunc64(__builtin_ia32_paddsb128((__v16qi)__anyext128(__m1), + (__v16qi)__anyext128(__m2))); } /// Adds each 16-bit signed integer element of the first 64-bit integer @@ -433,10 +462,11 @@ /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the saturated sums /// of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_adds_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2); + return __trunc64(__builtin_ia32_paddsw128((__v8hi)__anyext128(__m1), + (__v8hi)__anyext128(__m2))); } /// Adds each 8-bit unsigned integer element of the first 64-bit integer @@ -455,10 +485,11 @@ /// A 64-bit integer vector of [8 x i8]. /// \returns A 64-bit integer vector of [8 x i8] containing the saturated /// unsigned sums of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_adds_pu8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2); + return __trunc64(__builtin_ia32_paddusb128((__v16qi)__anyext128(__m1), + (__v16qi)__anyext128(__m2))); } /// Adds each 16-bit unsigned integer element of the first 64-bit integer @@ -477,10 +508,11 @@ /// A 64-bit integer vector of [4 x i16]. 
/// \returns A 64-bit integer vector of [4 x i16] containing the saturated /// unsigned sums of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_adds_pu16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2); + return __trunc64(__builtin_ia32_paddusw128((__v8hi)__anyext128(__m1), + (__v8hi)__anyext128(__m2))); } /// Subtracts each 8-bit integer element of the second 64-bit integer @@ -498,10 +530,10 @@ /// A 64-bit integer vector of [8 x i8] containing the subtrahends. /// \returns A 64-bit integer vector of [8 x i8] containing the differences of /// both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sub_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2); + return (__m64)(((__v8qu)__m1) - ((__v8qu)__m2)); } /// Subtracts each 16-bit integer element of the second 64-bit integer @@ -519,10 +551,10 @@ /// A 64-bit integer vector of [4 x i16] containing the subtrahends. /// \returns A 64-bit integer vector of [4 x i16] containing the differences of /// both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sub_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2); + return (__m64)(((__v4hu)__m1) - ((__v4hu)__m2)); } /// Subtracts each 32-bit integer element of the second 64-bit integer @@ -540,10 +572,10 @@ /// A 64-bit integer vector of [2 x i32] containing the subtrahends. /// \returns A 64-bit integer vector of [2 x i32] containing the differences of /// both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sub_pi32(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2); + return (__m64)(((__v2su)__m1) - ((__v2su)__m2)); } /// Subtracts each 8-bit signed integer element of the second 64-bit @@ -563,10 +595,11 @@ /// A 64-bit integer vector of [8 x i8] containing the subtrahends. /// \returns A 64-bit integer vector of [8 x i8] containing the saturated /// differences of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_subs_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2); + return __trunc64(__builtin_ia32_psubsb128((__v16qi)__anyext128(__m1), + (__v16qi)__anyext128(__m2))); } /// Subtracts each 16-bit signed integer element of the second 64-bit @@ -586,10 +619,11 @@ /// A 64-bit integer vector of [4 x i16] containing the subtrahends. /// \returns A 64-bit integer vector of [4 x i16] containing the saturated /// differences of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_subs_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2); + return __trunc64(__builtin_ia32_psubsw128((__v8hi)__anyext128(__m1), + (__v8hi)__anyext128(__m2))); } /// Subtracts each 8-bit unsigned integer element of the second 64-bit @@ -610,10 +644,11 @@ /// A 64-bit integer vector of [8 x i8] containing the subtrahends. /// \returns A 64-bit integer vector of [8 x i8] containing the saturated /// differences of both parameters. 
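/* Editorial note, not part of the patch: the non-saturating add/sub forms cast
   to the unsigned element types (__v8qu, __v4hu, __v2su, __v1du) before using
   the C operators because unsigned wrap-around is well defined, while signed
   overflow is undefined behaviour; the resulting bit pattern is identical
   either way, so the behaviour still matches PADDB/PSUBB and friends.  For
   example, in _mm_add_pi8 a lane holding 0x7f plus 0x01 simply wraps to 0x80. */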
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_subs_pu8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2); + return __trunc64(__builtin_ia32_psubusb128((__v16qi)__anyext128(__m1), + (__v16qi)__anyext128(__m2))); } /// Subtracts each 16-bit unsigned integer element of the second 64-bit @@ -634,10 +669,11 @@ /// A 64-bit integer vector of [4 x i16] containing the subtrahends. /// \returns A 64-bit integer vector of [4 x i16] containing the saturated /// differences of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_subs_pu16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2); + return __trunc64(__builtin_ia32_psubusw128((__v8hi)__anyext128(__m1), + (__v8hi)__anyext128(__m2))); } /// Multiplies each 16-bit signed integer element of the first 64-bit @@ -661,10 +697,11 @@ /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [2 x i32] containing the sums of /// products of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_madd_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2); + return __trunc64(__builtin_ia32_pmaddwd128((__v8hi)__anyext128(__m1), + (__v8hi)__anyext128(__m2))); } /// Multiplies each 16-bit signed integer element of the first 64-bit @@ -682,10 +719,11 @@ /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the upper 16 bits /// of the products of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_mulhi_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2); + return __trunc64(__builtin_ia32_pmulhw128((__v8hi)__anyext128(__m1), + (__v8hi)__anyext128(__m2))); } /// Multiplies each 16-bit signed integer element of the first 64-bit @@ -703,10 +741,10 @@ /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the lower 16 bits /// of the products of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_mullo_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2); + return (__m64)(((__v4hu)__m1) * ((__v4hu)__m2)); } /// Left-shifts each 16-bit signed integer element of the first @@ -726,10 +764,11 @@ /// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted /// values. If \a __count is greater or equal to 16, the result is set to all /// 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sll_pi16(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count); + return __trunc64(__builtin_ia32_psllw128((__v8hi)__anyext128(__m), + (__v8hi)__anyext128(__count))); } /// Left-shifts each 16-bit signed integer element of a 64-bit integer @@ -748,10 +787,11 @@ /// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted /// values. If \a __count is greater or equal to 16, the result is set to all /// 0. 
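/* Editorial note, not part of the patch: for the _mm_sll/_mm_srl/_mm_sra forms
   that take the count as an __m64, the SSE2 shift instructions read the count
   from bits [63:0] of their second XMM operand, which are exactly the bits
   __anyext128(__count) preserves; the undefined upper lanes introduced by the
   widening are never examined, so the count needs no zero-extension. */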
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_slli_pi16(__m64 __m, int __count) { - return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count); + return __trunc64(__builtin_ia32_psllwi128((__v8hi)__anyext128(__m), + __count)); } /// Left-shifts each 32-bit signed integer element of the first @@ -771,10 +811,11 @@ /// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted /// values. If \a __count is greater or equal to 32, the result is set to all /// 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sll_pi32(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_pslld((__v2si)__m, __count); + return __trunc64(__builtin_ia32_pslld128((__v4si)__anyext128(__m), + (__v4si)__anyext128(__count))); } /// Left-shifts each 32-bit signed integer element of a 64-bit integer @@ -793,10 +834,11 @@ /// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted /// values. If \a __count is greater or equal to 32, the result is set to all /// 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_slli_pi32(__m64 __m, int __count) { - return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count); + return __trunc64(__builtin_ia32_pslldi128((__v4si)__anyext128(__m), + __count)); } /// Left-shifts the first 64-bit integer parameter by the number of bits @@ -813,10 +855,11 @@ /// A 64-bit integer vector interpreted as a single 64-bit integer. /// \returns A 64-bit integer vector containing the left-shifted value. If /// \a __count is greater or equal to 64, the result is set to 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sll_si64(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psllq((__v1di)__m, __count); + return __trunc64(__builtin_ia32_psllq128((__v2di)__anyext128(__m), + __anyext128(__count))); } /// Left-shifts the first parameter, which is a 64-bit integer, by the @@ -833,10 +876,11 @@ /// A 32-bit integer value. /// \returns A 64-bit integer vector containing the left-shifted value. If /// \a __count is greater or equal to 64, the result is set to 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_slli_si64(__m64 __m, int __count) { - return (__m64)__builtin_ia32_psllqi((__v1di)__m, __count); + return __trunc64(__builtin_ia32_psllqi128((__v2di)__anyext128(__m), + __count)); } /// Right-shifts each 16-bit integer element of the first parameter, @@ -857,10 +901,11 @@ /// A 64-bit integer vector interpreted as a single 64-bit integer. /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sra_pi16(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count); + return __trunc64(__builtin_ia32_psraw128((__v8hi)__anyext128(__m), + (__v8hi)__anyext128(__count))); } /// Right-shifts each 16-bit integer element of a 64-bit integer vector @@ -880,10 +925,11 @@ /// A 32-bit integer value. /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted /// values. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srai_pi16(__m64 __m, int __count) { - return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count); + return __trunc64(__builtin_ia32_psrawi128((__v8hi)__anyext128(__m), + __count)); } /// Right-shifts each 32-bit integer element of the first parameter, @@ -904,10 +950,11 @@ /// A 64-bit integer vector interpreted as a single 64-bit integer. /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sra_pi32(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psrad((__v2si)__m, __count); + return __trunc64(__builtin_ia32_psrad128((__v4si)__anyext128(__m), + (__v4si)__anyext128(__count))); } /// Right-shifts each 32-bit integer element of a 64-bit integer vector @@ -927,10 +974,11 @@ /// A 32-bit integer value. /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srai_pi32(__m64 __m, int __count) { - return (__m64)__builtin_ia32_psradi((__v2si)__m, __count); + return __trunc64(__builtin_ia32_psradi128((__v4si)__anyext128(__m), + __count)); } /// Right-shifts each 16-bit integer element of the first parameter, @@ -950,10 +998,11 @@ /// A 64-bit integer vector interpreted as a single 64-bit integer. /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srl_pi16(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count); + return __trunc64(__builtin_ia32_psrlw128((__v8hi)__anyext128(__m), + (__v8hi)__anyext128(__count))); } /// Right-shifts each 16-bit integer element of a 64-bit integer vector @@ -972,10 +1021,11 @@ /// A 32-bit integer value. /// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srli_pi16(__m64 __m, int __count) { - return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count); + return __trunc64(__builtin_ia32_psrlwi128((__v8hi)__anyext128(__m), + __count)); } /// Right-shifts each 32-bit integer element of the first parameter, @@ -995,10 +1045,11 @@ /// A 64-bit integer vector interpreted as a single 64-bit integer. /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srl_pi32(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psrld((__v2si)__m, __count); + return __trunc64(__builtin_ia32_psrld128((__v4si)__anyext128(__m), + (__v4si)__anyext128(__count))); } /// Right-shifts each 32-bit integer element of a 64-bit integer vector @@ -1017,10 +1068,11 @@ /// A 32-bit integer value. /// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted /// values. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srli_pi32(__m64 __m, int __count) { - return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count); + return __trunc64(__builtin_ia32_psrldi128((__v4si)__anyext128(__m), + __count)); } /// Right-shifts the first 64-bit integer parameter by the number of bits @@ -1037,10 +1089,11 @@ /// \param __count /// A 64-bit integer vector interpreted as a single 64-bit integer. /// \returns A 64-bit integer vector containing the right-shifted value. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srl_si64(__m64 __m, __m64 __count) { - return (__m64)__builtin_ia32_psrlq((__v1di)__m, __count); + return __trunc64(__builtin_ia32_psrlq128((__v2di)__anyext128(__m), + __anyext128(__count))); } /// Right-shifts the first parameter, which is a 64-bit integer, by the @@ -1058,10 +1111,11 @@ /// \param __count /// A 32-bit integer value. /// \returns A 64-bit integer vector containing the right-shifted value. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_srli_si64(__m64 __m, int __count) { - return (__m64)__builtin_ia32_psrlqi((__v1di)__m, __count); + return __trunc64(__builtin_ia32_psrlqi128((__v2di)__anyext128(__m), + __count)); } /// Performs a bitwise AND of two 64-bit integer vectors. @@ -1076,10 +1130,10 @@ /// A 64-bit integer vector. /// \returns A 64-bit integer vector containing the bitwise AND of both /// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_and_si64(__m64 __m1, __m64 __m2) { - return __builtin_ia32_pand((__v1di)__m1, (__v1di)__m2); + return (__m64)(((__v1du)__m1) & ((__v1du)__m2)); } /// Performs a bitwise NOT of the first 64-bit integer vector, and then @@ -1097,10 +1151,10 @@ /// A 64-bit integer vector. /// \returns A 64-bit integer vector containing the bitwise AND of the second /// parameter and the one's complement of the first parameter. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_andnot_si64(__m64 __m1, __m64 __m2) { - return __builtin_ia32_pandn((__v1di)__m1, (__v1di)__m2); + return (__m64)(~((__v1du)__m1) & ((__v1du)__m2)); } /// Performs a bitwise OR of two 64-bit integer vectors. @@ -1115,10 +1169,10 @@ /// A 64-bit integer vector. /// \returns A 64-bit integer vector containing the bitwise OR of both /// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_or_si64(__m64 __m1, __m64 __m2) { - return __builtin_ia32_por((__v1di)__m1, (__v1di)__m2); + return (__m64)(((__v1du)__m1) | ((__v1du)__m2)); } /// Performs a bitwise exclusive OR of two 64-bit integer vectors. @@ -1133,10 +1187,10 @@ /// A 64-bit integer vector. /// \returns A 64-bit integer vector containing the bitwise exclusive OR of both /// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_xor_si64(__m64 __m1, __m64 __m2) { - return __builtin_ia32_pxor((__v1di)__m1, (__v1di)__m2); + return (__m64)(((__v1du)__m1) ^ ((__v1du)__m2)); } /// Compares the 8-bit integer elements of two 64-bit integer vectors of @@ -1155,10 +1209,10 @@ /// A 64-bit integer vector of [8 x i8]. /// \returns A 64-bit integer vector of [8 x i8] containing the comparison /// results. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2); + return (__m64)(((__v8qi)__m1) == ((__v8qi)__m2)); } /// Compares the 16-bit integer elements of two 64-bit integer vectors of @@ -1177,10 +1231,10 @@ /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the comparison /// results. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2); + return (__m64)(((__v4hi)__m1) == ((__v4hi)__m2)); } /// Compares the 32-bit integer elements of two 64-bit integer vectors of @@ -1199,10 +1253,10 @@ /// A 64-bit integer vector of [2 x i32]. /// \returns A 64-bit integer vector of [2 x i32] containing the comparison /// results. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2); + return (__m64)(((__v2si)__m1) == ((__v2si)__m2)); } /// Compares the 8-bit integer elements of two 64-bit integer vectors of @@ -1221,10 +1275,12 @@ /// A 64-bit integer vector of [8 x i8]. /// \returns A 64-bit integer vector of [8 x i8] containing the comparison /// results. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2); + /* This function always performs a signed comparison, but __v8qi is a char + which may be signed or unsigned, so use __v8qs. */ + return (__m64)((__v8qs)__m1 > (__v8qs)__m2); } /// Compares the 16-bit integer elements of two 64-bit integer vectors of @@ -1243,10 +1299,10 @@ /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the comparison /// results. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2); + return (__m64)((__v4hi)__m1 > (__v4hi)__m2); } /// Compares the 32-bit integer elements of two 64-bit integer vectors of @@ -1265,10 +1321,10 @@ /// A 64-bit integer vector of [2 x i32]. /// \returns A 64-bit integer vector of [2 x i32] containing the comparison /// results. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2); + return (__m64)((__v2si)__m1 > (__v2si)__m2); } /// Constructs a 64-bit integer vector initialized to zero. @@ -1278,7 +1334,7 @@ /// This intrinsic corresponds to the PXOR instruction. /// /// \returns An initialized 64-bit integer vector with all elements set to zero. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_setzero_si64(void) { return __extension__ (__m64){ 0LL }; @@ -1299,10 +1355,10 @@ /// A 32-bit integer value used to initialize the lower 32 bits of the /// result. /// \returns An initialized 64-bit integer vector. 
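/* Editorial sketch, not part of the patch: the C vector comparisons above
   yield 0 / all-ones lanes, the same masks PCMPEQ/PCMPGT produced (__v8qs is
   used for the byte compare because plain char, and hence __v8qi, may be
   unsigned under -funsigned-char).  A hypothetical signed-byte max written
   against this header, assuming SSE2, shows how such masks combine with the
   bitwise helpers: */
static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
__mmx_max_pi8_demo(__m64 __a, __m64 __b)
{
  __m64 __mask = _mm_cmpgt_pi8(__a, __b);            /* 0xff where __a > __b */
  return _mm_or_si64(_mm_and_si64(__mask, __a),      /* __a under the mask   */
                     _mm_andnot_si64(__mask, __b));  /* __b elsewhere        */
}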
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_set_pi32(int __i1, int __i0) { - return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1); + return __extension__ (__m64)(__v2si){__i0, __i1}; } /// Constructs a 64-bit integer vector initialized with the specified @@ -1322,10 +1378,10 @@ /// \param __s0 /// A 16-bit integer value used to initialize bits [15:0] of the result. /// \returns An initialized 64-bit integer vector. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_set_pi16(short __s3, short __s2, short __s1, short __s0) { - return (__m64)__builtin_ia32_vec_init_v4hi(__s0, __s1, __s2, __s3); + return __extension__ (__m64)(__v4hi){__s0, __s1, __s2, __s3}; } /// Constructs a 64-bit integer vector initialized with the specified @@ -1353,12 +1409,12 @@ /// \param __b0 /// An 8-bit integer value used to initialize bits [7:0] of the result. /// \returns An initialized 64-bit integer vector. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0) { - return (__m64)__builtin_ia32_vec_init_v8qi(__b0, __b1, __b2, __b3, - __b4, __b5, __b6, __b7); + return __extension__ (__m64)(__v8qi){__b0, __b1, __b2, __b3, + __b4, __b5, __b6, __b7}; } /// Constructs a 64-bit integer vector of [2 x i32], with each of the @@ -1374,7 +1430,7 @@ /// A 32-bit integer value used to initialize each vector element of the /// result. /// \returns An initialized 64-bit integer vector of [2 x i32]. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_set1_pi32(int __i) { return _mm_set_pi32(__i, __i); @@ -1393,7 +1449,7 @@ /// A 16-bit integer value used to initialize each vector element of the /// result. /// \returns An initialized 64-bit integer vector of [4 x i16]. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_set1_pi16(short __w) { return _mm_set_pi16(__w, __w, __w, __w); @@ -1411,7 +1467,7 @@ /// An 8-bit integer value used to initialize each vector element of the /// result. /// \returns An initialized 64-bit integer vector of [8 x i8]. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_set1_pi8(char __b) { return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b); @@ -1432,7 +1488,7 @@ /// A 32-bit integer value used to initialize the upper 32 bits of the /// result. /// \returns An initialized 64-bit integer vector. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_setr_pi32(int __i0, int __i1) { return _mm_set_pi32(__i1, __i0); @@ -1455,7 +1511,7 @@ /// \param __w3 /// A 16-bit integer value used to initialize bits [63:48] of the result. /// \returns An initialized 64-bit integer vector. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) { return _mm_set_pi16(__w3, __w2, __w1, __w0); @@ -1486,14 +1542,17 @@ /// \param __b7 /// An 8-bit integer value used to initialize bits [63:56] of the result. /// \returns An initialized 64-bit integer vector. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7) { return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0); } -#undef __DEFAULT_FN_ATTRS +#undef __extract2_32 +#undef __anyext128 +#undef __trunc64 +#undef __DEFAULT_FN_ATTRS_SSE2 /* Aliases for compatibility. */ #define _m_empty _mm_empty diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h --- a/clang/lib/Headers/tmmintrin.h +++ b/clang/lib/Headers/tmmintrin.h @@ -14,7 +14,10 @@ /* Define the default attributes for the functions in this file. */ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3"), __min_vector_width__(64))) -#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,ssse3"), __min_vector_width__(64))) + +#define __trunc64(x) (__m64)__builtin_shufflevector((__v2di)(x), __extension__ (__v2di){}, 0) +#define __anyext128(x) (__m128i)__builtin_shufflevector((__v2si)(x), __extension__ (__v2si){}, 0, 1, -1, -1) +#define __extract2_32(a) (__m64)__builtin_shufflevector((__v4si)(a), __extension__ (__v4si){}, 0, 2); /// Computes the absolute value of each of the packed 8-bit signed /// integers in the source operand and stores the 8-bit unsigned integer @@ -28,10 +31,10 @@ /// A 64-bit vector of [8 x i8]. /// \returns A 64-bit integer vector containing the absolute values of the /// elements in the operand. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_abs_pi8(__m64 __a) { - return (__m64)__builtin_ia32_pabsb((__v8qi)__a); + return __trunc64(__builtin_ia32_pabsb128((__v16qi)__anyext128(__a))); } /// Computes the absolute value of each of the packed 8-bit signed @@ -64,10 +67,10 @@ /// A 64-bit vector of [4 x i16]. /// \returns A 64-bit integer vector containing the absolute values of the /// elements in the operand. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_abs_pi16(__m64 __a) { - return (__m64)__builtin_ia32_pabsw((__v4hi)__a); + return __trunc64(__builtin_ia32_pabsw128((__v8hi)__anyext128(__a))); } /// Computes the absolute value of each of the packed 16-bit signed @@ -100,10 +103,10 @@ /// A 64-bit vector of [2 x i32]. /// \returns A 64-bit integer vector containing the absolute values of the /// elements in the operand. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_abs_pi32(__m64 __a) { - return (__m64)__builtin_ia32_pabsd((__v2si)__a); + return __trunc64(__builtin_ia32_pabsd128((__v4si)__anyext128(__a))); } /// Computes the absolute value of each of the packed 32-bit signed @@ -168,7 +171,10 @@ /// \returns A 64-bit integer vector containing the concatenated right-shifted /// value. #define _mm_alignr_pi8(a, b, n) \ - (__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)) + (__m64)__builtin_shufflevector( \ + __builtin_ia32_psrldqi128_byteshift( \ + __builtin_shufflevector((__v1di)(a), (__v1di)(b), 1, 0), \ + (n)), __extension__ (__v2di){}, 0) /// Horizontally adds the adjacent pairs of values contained in 2 packed /// 128-bit vectors of [8 x i16]. @@ -233,10 +239,11 @@ /// destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both /// operands. 
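/* Editorial note, not part of the patch: the _mm_alignr_pi8 macro above builds
   a 128-bit value with b in the low 8 bytes and a in the high 8 bytes (the
   inner shuffle of two __v1di values), byte-shifts the whole thing right by n
   with PSRLDQ, and keeps the low 64 bits.  For n == 3 the byte stream
   [b0..b7, a0..a7] becomes [b3..b7, a0, a1, a2], which is exactly what
   PALIGNR mm, mm, 3 produced from the a:b concatenation. */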
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_hadd_pi16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b); + return __extract2_32(__builtin_ia32_phaddw128((__v8hi)__anyext128(__a), + (__v8hi)__anyext128(__b))); } /// Horizontally adds the adjacent pairs of values contained in 2 packed @@ -256,10 +263,11 @@ /// destination. /// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both /// operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_hadd_pi32(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b); + return __extract2_32(__builtin_ia32_phaddd128((__v4si)__anyext128(__a), + (__v4si)__anyext128(__b))); } /// Horizontally adds the adjacent pairs of values contained in 2 packed @@ -306,10 +314,11 @@ /// destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated /// sums of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_hadds_pi16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b); + return __extract2_32(__builtin_ia32_phaddsw128((__v8hi)__anyext128(__a), + (__v8hi)__anyext128(__b))); } /// Horizontally subtracts the adjacent pairs of values contained in 2 @@ -375,10 +384,11 @@ /// the destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal differences /// of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_hsub_pi16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b); + return __extract2_32(__builtin_ia32_phsubw128((__v8hi)__anyext128(__a), + (__v8hi)__anyext128(__b))); } /// Horizontally subtracts the adjacent pairs of values contained in 2 @@ -398,10 +408,11 @@ /// the destination. /// \returns A 64-bit vector of [2 x i32] containing the horizontal differences /// of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_hsub_pi32(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b); + return __extract2_32(__builtin_ia32_phsubd128((__v4si)__anyext128(__a), + (__v4si)__anyext128(__b))); } /// Horizontally subtracts the adjacent pairs of values contained in 2 @@ -448,10 +459,11 @@ /// the destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated /// differences of both operands. 
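/* Editorial note, not part of the patch: for the horizontal operations the
   lanes of interest are not contiguous.  With both __m64 inputs widened into
   the low halves of XMM registers, PHADDW yields, viewed as 32-bit lanes,
   {a0+a1, a2+a3}, {undef}, {b0+b1, b2+b3}, {undef}; __extract2_32 therefore
   picks dwords 0 and 2 to reassemble the 64-bit result
   {a0+a1, a2+a3, b0+b1, b2+b3} that the MMX PHADDW produced. */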
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_hsubs_pi16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b); + return __extract2_32(__builtin_ia32_phsubsw128((__v8hi)__anyext128(__a), + (__v8hi)__anyext128(__b))); } /// Multiplies corresponding pairs of packed 8-bit unsigned integer @@ -512,10 +524,11 @@ /// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n /// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n /// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_maddubs_pi16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b); + return __trunc64(__builtin_ia32_pmaddubsw128((__v16qi)__anyext128(__a), + (__v16qi)__anyext128(__b))); } /// Multiplies packed 16-bit signed integer values, truncates the 32-bit @@ -552,10 +565,11 @@ /// A 64-bit vector of [4 x i16] containing one of the source operands. /// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled /// products of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mulhrs_pi16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b); + return __trunc64(__builtin_ia32_pmulhrsw128((__v8hi)__anyext128(__a), + (__v8hi)__anyext128(__b))); } /// Copies the 8-bit integers from a 128-bit integer vector to the @@ -601,12 +615,15 @@ /// 1: Clear the corresponding byte in the destination. \n /// 0: Copy the selected source byte to the corresponding byte in the /// destination. \n -/// Bits [3:0] select the source byte to be copied. +/// Bits [2:0] select the source byte to be copied. /// \returns A 64-bit integer vector containing the copied or cleared values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_shuffle_pi8(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b); + return __trunc64(__builtin_ia32_pshufb128( + (__v16qi)__builtin_shufflevector( + (__v2si)(__a), __extension__ (__v2si){}, 0, 1, 0, 1), + (__v16qi)__anyext128(__b))); } /// For each 8-bit integer in the first source operand, perform one of @@ -707,10 +724,11 @@ /// A 64-bit integer vector containing control bytes corresponding to /// positions in the destination. /// \returns A 64-bit integer vector containing the resultant values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sign_pi8(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b); + return __trunc64(__builtin_ia32_psignb128((__v16qi)__anyext128(__a), + (__v16qi)__anyext128(__b))); } /// For each 16-bit integer in the first source operand, perform one of @@ -733,10 +751,11 @@ /// A 64-bit integer vector containing control words corresponding to /// positions in the destination. /// \returns A 64-bit integer vector containing the resultant values. 
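/* Editorial note, not part of the patch: in _mm_shuffle_pi8 above, the first
   operand is broadcast into both halves of the XMM source (__v2si indices
   0, 1, 0, 1) rather than merely any-extended.  PSHUFB on XMM uses control
   bits [3:0] to index 16 source bytes, while the MMX form only honours bits
   [2:0]; with the 8 source bytes duplicated, a control index of 8+k selects
   the copy of byte k, so a stray bit 3 cannot change the result. */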
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sign_pi16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b); + return __trunc64(__builtin_ia32_psignw128((__v8hi)__anyext128(__a), + (__v8hi)__anyext128(__b))); } /// For each 32-bit integer in the first source operand, perform one of @@ -759,13 +778,16 @@ /// A 64-bit integer vector containing two control doublewords corresponding /// to positions in the destination. /// \returns A 64-bit integer vector containing the resultant values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sign_pi32(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b); + return __trunc64(__builtin_ia32_psignd128((__v4si)__anyext128(__a), + (__v4si)__anyext128(__b))); } +#undef __extract2_32 +#undef __anyext128 +#undef __trunc64 #undef __DEFAULT_FN_ATTRS -#undef __DEFAULT_FN_ATTRS_MMX #endif /* __TMMINTRIN_H */ diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h --- a/clang/lib/Headers/xmmintrin.h +++ b/clang/lib/Headers/xmmintrin.h @@ -29,7 +29,12 @@ /* Define the default attributes for the functions in this file. */ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse"), __min_vector_width__(128))) -#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse"), __min_vector_width__(64))) +#define __DEFAULT_FN_ATTRS_SSE2 __attribute__((__always_inline__, __nodebug__, __target__("sse2"), __min_vector_width__(64))) + +#define __trunc64(x) (__m64)__builtin_shufflevector((__v2di)(x), __extension__ (__v2di){}, 0) +#define __zext128(x) (__m128i)__builtin_shufflevector((__v2si)(x), __extension__ (__v2si){}, 0, 1, 2, 3) +#define __anyext128(x) (__m128i)__builtin_shufflevector((__v2si)(x), __extension__ (__v2si){}, 0, 1, -1, -1) +#define __zeroupper64(x) (__m128i)__builtin_shufflevector((__v4si)(x), __extension__ (__v4si){}, 0, 1, 4, 5) /// Adds the 32-bit float values in the low-order bits of the operands. /// @@ -1354,10 +1359,10 @@ /// \param __a /// A 128-bit vector of [4 x float]. /// \returns A 64-bit integer vector containing the converted values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtps_pi32(__m128 __a) { - return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a); + return __trunc64(__builtin_ia32_cvtps2dq((__v4sf)__zeroupper64(__a))); } /// Converts two low-order float values in a 128-bit vector of @@ -1370,7 +1375,7 @@ /// \param __a /// A 128-bit vector of [4 x float]. /// \returns A 64-bit integer vector containing the converted values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvt_ps2pi(__m128 __a) { return _mm_cvtps_pi32(__a); @@ -1447,10 +1452,10 @@ /// \param __a /// A 128-bit vector of [4 x float]. /// \returns A 64-bit integer vector containing the converted values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvttps_pi32(__m128 __a) { - return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a); + return __trunc64(__builtin_ia32_cvttps2dq((__v4sf)__zeroupper64(__a))); } /// Converts two low-order float values in a 128-bit vector of [4 x @@ -1464,7 +1469,7 @@ /// \param __a /// A 128-bit vector of [4 x float]. /// \returns A 64-bit integer vector containing the converted values. 
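/* Editorial note, not part of the patch: three widening helpers are defined
   here because the safe choice depends on the instruction.  __anyext128 leaves
   the upper lanes undefined and is used when they cannot influence the kept
   low 64 bits.  __zext128 zeroes them where defined upper lanes are wanted:
   the high mask bits of PMOVMSKB must read as zero, and _mm_sad_pu8 likewise
   zero-extends so the unused upper SAD sum stays well defined.  __zeroupper64
   keeps the two low floats and zeroes the rest so CVTPS2DQ/CVTTPS2DQ never
   convert garbage lanes, which could otherwise raise spurious floating-point
   exceptions. */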
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtt_ps2pi(__m128 __a) { return _mm_cvttps_pi32(__a); @@ -1559,10 +1564,13 @@ /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the /// converted value of the second operand. The upper 64 bits are copied from /// the upper 64 bits of the first operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtpi32_ps(__m128 __a, __m64 __b) { - return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b); + return (__m128)__builtin_shufflevector( + (__v4sf)__a, + __builtin_convertvector((__v4si)__zext128(__b), __v4sf), + 4, 5, 2, 3); } /// Converts two elements of a 64-bit vector of [2 x i32] into two @@ -1582,7 +1590,7 @@ /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the /// converted value from the second operand. The upper 64 bits are copied /// from the upper 64 bits of the first operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvt_pi2ps(__m128 __a, __m64 __b) { return _mm_cvtpi32_ps(__a, __b); @@ -2116,10 +2124,10 @@ /// A pointer to an aligned memory location used to store the register value. /// \param __a /// A 64-bit integer containing the value to be stored. -static __inline__ void __DEFAULT_FN_ATTRS_MMX +static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pi(__m64 *__p, __m64 __a) { - __builtin_ia32_movntq(__p, __a); + __builtin_nontemporal_store(__a, __p); } /// Moves packed float values from a 128-bit vector of [4 x float] to a @@ -2181,7 +2189,7 @@ /// 3: Bits [63:48] are copied to the destination. /// \returns A 16-bit integer containing the extracted 16 bits of packed data. #define _mm_extract_pi16(a, n) \ - (int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n) + (int)(unsigned short)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n) /// Copies data from the 64-bit vector of [4 x i16] to the destination, /// and inserts the lower 16-bits of an integer operand at the 16-bit offset @@ -2227,10 +2235,11 @@ /// \param __b /// A 64-bit integer vector containing one of the source operands. /// \returns A 64-bit integer vector containing the comparison results. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_max_pi16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b); + return __trunc64(__builtin_ia32_pmaxsw128((__v8hi)__anyext128(__a), + (__v8hi)__anyext128(__b))); } /// Compares each of the corresponding packed 8-bit unsigned integer @@ -2246,10 +2255,11 @@ /// \param __b /// A 64-bit integer vector containing one of the source operands. /// \returns A 64-bit integer vector containing the comparison results. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_max_pu8(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b); + return __trunc64(__builtin_ia32_pmaxub128((__v16qi)__anyext128(__a), + (__v16qi)__anyext128(__b))); } /// Compares each of the corresponding packed 16-bit integer values of @@ -2265,10 +2275,11 @@ /// \param __b /// A 64-bit integer vector containing one of the source operands. /// \returns A 64-bit integer vector containing the comparison results. 
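/* Editorial note, not part of the patch: with the builtin's return type
   narrowed to short (prototype "sV4sIi"), the extra (unsigned short) cast in
   _mm_extract_pi16 above zero-extends the selected field, matching PEXTRW,
   which always zero-extends into the 32-bit destination.  For example,
   _mm_extract_pi16(_mm_set_pi16(0, 0, 0, -1), 0) is 0xFFFF (65535), not -1. */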
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_min_pi16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b); + return __trunc64(__builtin_ia32_pminsw128((__v8hi)__anyext128(__a), + (__v8hi)__anyext128(__b))); } /// Compares each of the corresponding packed 8-bit unsigned integer @@ -2284,10 +2295,11 @@ /// \param __b /// A 64-bit integer vector containing one of the source operands. /// \returns A 64-bit integer vector containing the comparison results. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_min_pu8(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b); + return __trunc64(__builtin_ia32_pminub128((__v16qi)__anyext128(__a), + (__v16qi)__anyext128(__b))); } /// Takes the most significant bit from each 8-bit element in a 64-bit @@ -2302,10 +2314,10 @@ /// A 64-bit integer vector containing the values with bits to be extracted. /// \returns The most significant bit from each 8-bit element in \a __a, /// written to bits [7:0]. -static __inline__ int __DEFAULT_FN_ATTRS_MMX +static __inline__ int __DEFAULT_FN_ATTRS_SSE2 _mm_movemask_pi8(__m64 __a) { - return __builtin_ia32_pmovmskb((__v8qi)__a); + return __builtin_ia32_pmovmskb128((__v16qi)__zext128(__a)); } /// Multiplies packed 16-bit unsigned integer values and writes the @@ -2321,10 +2333,11 @@ /// \param __b /// A 64-bit integer vector containing one of the source operands. /// \returns A 64-bit integer vector containing the products of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_mulhi_pu16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b); + return __trunc64(__builtin_ia32_pmulhuw128((__v8hi)__anyext128(__a), + (__v8hi)__anyext128(__b))); } /// Shuffles the 4 16-bit integers from a 64-bit integer vector to the @@ -2359,7 +2372,9 @@ /// 11: assigned from bits [63:48] of \a a. /// \returns A 64-bit integer vector containing the shuffled values. #define _mm_shuffle_pi16(a, n) \ - (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)) + (__m64)__builtin_shufflevector((__v4hi)(__m64)(a), __extension__ (__v4hi){}, \ + (n) & 0x3, ((n) >> 2) & 0x3, \ + ((n) >> 4) & 0x3, ((n) >> 6) & 0x3) /// Conditionally copies the values from each 8-bit element in the first /// 64-bit integer vector operand to the specified memory location, as @@ -2384,10 +2399,25 @@ /// A pointer to a 64-bit memory location that will receive the conditionally /// copied integer values. The address of the memory location does not have /// to be aligned. -static __inline__ void __DEFAULT_FN_ATTRS_MMX +static __inline__ void __DEFAULT_FN_ATTRS_SSE2 _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p) { - __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p); + // This is complex, because we need to support the case where __p is pointing + // within the last 15 to 8 bytes of a page. In that case, using a 128-bit + // write might cause a trap where a 64-bit maskmovq would not. (Memory + // locations not selected by the mask bits might still cause traps.) + __m128i __d128 = __anyext128(__d); + __m128i __n128 = __zext128(__n); + if (((__SIZE_TYPE__)__p & 0xfff) >= 4096-15 && + ((__SIZE_TYPE__)__p & 0xfff) <= 4096-8) { + // If there's a risk of spurious trap due to a 128-bit write, back up the + // pointer by 8 bytes and shift values in registers to match. 
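    // (Editorial elaboration, not part of the patch: the 16-byte store covers
    // [__p, __p+16), so only page offsets in [4096-15, 4096-8] can make it
    // touch a page that the legitimate 8-byte window [__p, __p+8) would not.
    // After __p -= 8 the 16-byte window becomes [__p-8, __p+8), which ends on
    // or before the page boundary, and the 8-byte left shift moves the real
    // data and mask into the upper half, leaving zeroed mask bytes over the
    // extra low addresses so they are never written.)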
+ __p -= 8; + __d128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__d128, 8); + __n128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__n128, 8); + } + + __builtin_ia32_maskmovdqu((__v16qi)__d128, (__v16qi)__n128, __p); } /// Computes the rounded averages of the packed unsigned 8-bit integer @@ -2403,10 +2433,11 @@ /// \param __b /// A 64-bit integer vector containing one of the source operands. /// \returns A 64-bit integer vector containing the averages of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_avg_pu8(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b); + return __trunc64(__builtin_ia32_pavgb128((__v16qi)__anyext128(__a), + (__v16qi)__anyext128(__b))); } /// Computes the rounded averages of the packed unsigned 16-bit integer @@ -2422,10 +2453,11 @@ /// \param __b /// A 64-bit integer vector containing one of the source operands. /// \returns A 64-bit integer vector containing the averages of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_avg_pu16(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b); + return __trunc64(__builtin_ia32_pavgw128((__v8hi)__anyext128(__a), + (__v8hi)__anyext128(__b))); } /// Subtracts the corresponding 8-bit unsigned integer values of the two @@ -2444,10 +2476,11 @@ /// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the /// sets of absolute differences between both operands. The upper bits are /// cleared. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_sad_pu8(__m64 __a, __m64 __b) { - return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b); + return __trunc64(__builtin_ia32_psadbw128((__v16qi)__zext128(__a), + (__v16qi)__zext128(__b))); } #if defined(__cplusplus) @@ -2725,22 +2758,10 @@ /// from the corresponding elements in this operand. /// \returns A 128-bit vector of [4 x float] containing the copied and converted /// values from the operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtpi16_ps(__m64 __a) { - __m64 __b, __c; - __m128 __r; - - __b = _mm_setzero_si64(); - __b = _mm_cmpgt_pi16(__b, __a); - __c = _mm_unpackhi_pi16(__a, __b); - __r = _mm_setzero_ps(); - __r = _mm_cvtpi32_ps(__r, __c); - __r = _mm_movelh_ps(__r, __r); - __c = _mm_unpacklo_pi16(__a, __b); - __r = _mm_cvtpi32_ps(__r, __c); - - return __r; + return __builtin_convertvector((__v4hi)__a, __v4sf); } /// Converts a 64-bit vector of 16-bit unsigned integer values into a @@ -2755,21 +2776,10 @@ /// destination are copied from the corresponding elements in this operand. /// \returns A 128-bit vector of [4 x float] containing the copied and converted /// values from the operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtpu16_ps(__m64 __a) { - __m64 __b, __c; - __m128 __r; - - __b = _mm_setzero_si64(); - __c = _mm_unpackhi_pi16(__a, __b); - __r = _mm_setzero_ps(); - __r = _mm_cvtpi32_ps(__r, __c); - __r = _mm_movelh_ps(__r, __r); - __c = _mm_unpacklo_pi16(__a, __b); - __r = _mm_cvtpi32_ps(__r, __c); - - return __r; + return __builtin_convertvector((__v4hu)__a, __v4sf); } /// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8] @@ -2784,16 +2794,12 @@ /// from the corresponding lower 4 elements in this operand. 
/// \returns A 128-bit vector of [4 x float] containing the copied and converted /// values from the operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtpi8_ps(__m64 __a) { - __m64 __b; - - __b = _mm_setzero_si64(); - __b = _mm_cmpgt_pi8(__b, __a); - __b = _mm_unpacklo_pi8(__a, __b); - - return _mm_cvtpi16_ps(__b); + return __builtin_convertvector( + __builtin_shufflevector((__v8qs)__a, __extension__ (__v8qs){}, + 0, 1, 2, 3), __v4sf); } /// Converts the lower four unsigned 8-bit integer values from a 64-bit @@ -2809,15 +2815,12 @@ /// operand. /// \returns A 128-bit vector of [4 x float] containing the copied and converted /// values from the source operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtpu8_ps(__m64 __a) { - __m64 __b; - - __b = _mm_setzero_si64(); - __b = _mm_unpacklo_pi8(__a, __b); - - return _mm_cvtpi16_ps(__b); + return __builtin_convertvector( + __builtin_shufflevector((__v8qu)__a, __extension__ (__v8qu){}, + 0, 1, 2, 3), __v4sf); } /// Converts the two 32-bit signed integer values from each 64-bit vector @@ -2836,16 +2839,12 @@ /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the /// copied and converted values from the first operand. The upper 64 bits /// contain the copied and converted values from the second operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b) { - __m128 __c; - - __c = _mm_setzero_ps(); - __c = _mm_cvtpi32_ps(__c, __b); - __c = _mm_movelh_ps(__c, __c); - - return _mm_cvtpi32_ps(__c, __a); + return __builtin_convertvector( + __builtin_shufflevector((__v2si)__a, (__v2si)__b, + 0, 1, 2, 3), __v4sf); } /// Converts each single-precision floating-point element of a 128-bit @@ -2865,16 +2864,11 @@ /// A 128-bit floating-point vector of [4 x float]. /// \returns A 64-bit integer vector of [4 x i16] containing the converted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtps_pi16(__m128 __a) { - __m64 __b, __c; - - __b = _mm_cvtps_pi32(__a); - __a = _mm_movehl_ps(__a, __a); - __c = _mm_cvtps_pi32(__a); - - return _mm_packs_pi32(__b, __c); + return __trunc64(__builtin_ia32_packssdw128( + (__v4si)__builtin_ia32_cvtps2dq((__v4sf)__a), (__v4si)_mm_setzero_ps())); } /// Converts each single-precision floating-point element of a 128-bit @@ -2895,7 +2889,7 @@ /// 128-bit floating-point vector of [4 x float]. /// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the /// converted values and the uppper 32 bits are set to zero. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 _mm_cvtps_pi8(__m128 __a) { __m64 __b, __c; @@ -2997,8 +2991,12 @@ #define _m_ _mm_ #define _m_ _mm_ +#undef __trunc64 +#undef __zext128 +#undef __anyext128 +#undef __zeroupper64 #undef __DEFAULT_FN_ATTRS -#undef __DEFAULT_FN_ATTRS_MMX +#undef __DEFAULT_FN_ATTRS_SSE2 /* Ugly hack for backwards-compatibility (compatible with gcc) */ #if defined(__SSE2__) && !__building_module(_Builtin_intrinsics) diff --git a/clang/test/CodeGen/X86/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c --- a/clang/test/CodeGen/X86/mmx-builtins.c +++ b/clang/test/CodeGen/X86/mmx-builtins.c @@ -1,193 +1,200 @@ -// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +ssse3 -emit-llvm -o - -Wall -Werror | FileCheck %s -// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +ssse3 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +ssse3 -emit-llvm -o - -Wall -Werror | FileCheck %s --implicit-check-not=x86mmx +// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +ssse3 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --implicit-check-not=x86mmx #include __m64 test_mm_abs_pi8(__m64 a) { // CHECK-LABEL: test_mm_abs_pi8 - // CHECK: call x86_mmx @llvm.x86.ssse3.pabs.b + // CHECK: call <16 x i8> @llvm.abs.v16i8( return _mm_abs_pi8(a); } __m64 test_mm_abs_pi16(__m64 a) { // CHECK-LABEL: test_mm_abs_pi16 - // CHECK: call x86_mmx @llvm.x86.ssse3.pabs.w + // CHECK: call <8 x i16> @llvm.abs.v8i16( return _mm_abs_pi16(a); } __m64 test_mm_abs_pi32(__m64 a) { // CHECK-LABEL: test_mm_abs_pi32 - // CHECK: call x86_mmx @llvm.x86.ssse3.pabs.d + // CHECK: call <4 x i32> @llvm.abs.v4i32( return _mm_abs_pi32(a); } __m64 test_mm_add_pi8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_add_pi8 - // CHECK: call x86_mmx @llvm.x86.mmx.padd.b + // CHECK: add <8 x i8> {{%.*}}, {{%.*}} return _mm_add_pi8(a, b); } __m64 test_mm_add_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_add_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.padd.w + // CHECK: add <4 x i16> {{%.*}}, {{%.*}} return _mm_add_pi16(a, b); } __m64 test_mm_add_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_add_pi32 - // CHECK: call x86_mmx @llvm.x86.mmx.padd.d + // CHECK: add <2 x i32> {{%.*}}, {{%.*}} return _mm_add_pi32(a, b); } __m64 test_mm_add_si64(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_add_si64 - // CHECK: call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %{{.*}}, x86_mmx %{{.*}}) + // CHECK: add i64 {{%.*}}, {{%.*}} return _mm_add_si64(a, b); } __m64 test_mm_adds_pi8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_adds_pi8 - // CHECK: call x86_mmx @llvm.x86.mmx.padds.b + // CHECK: call <16 x i8> @llvm.sadd.sat.v16i8( return _mm_adds_pi8(a, b); } __m64 test_mm_adds_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_adds_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.padds.w + // CHECK: call <8 x i16> @llvm.sadd.sat.v8i16( return _mm_adds_pi16(a, b); } __m64 test_mm_adds_pu8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_adds_pu8 - // CHECK: call x86_mmx @llvm.x86.mmx.paddus.b + // CHECK: call <16 x i8> @llvm.uadd.sat.v16i8( return _mm_adds_pu8(a, b); } __m64 test_mm_adds_pu16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_adds_pu16 - // CHECK: call x86_mmx @llvm.x86.mmx.paddus.w + // CHECK: call <8 
x i16> @llvm.uadd.sat.v8i16( return _mm_adds_pu16(a, b); } __m64 test_mm_alignr_pi8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_alignr_pi8 - // CHECK: call x86_mmx @llvm.x86.mmx.palignr.b + // CHECK: shufflevector <16 x i8> {{%.*}}, <16 x i8> zeroinitializer, <16 x i32> return _mm_alignr_pi8(a, b, 2); } __m64 test_mm_and_si64(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_and_si64 - // CHECK: call x86_mmx @llvm.x86.mmx.pand + // CHECK: and <1 x i64> {{%.*}}, {{%.*}} return _mm_and_si64(a, b); } __m64 test_mm_andnot_si64(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_andnot_si64 - // CHECK: call x86_mmx @llvm.x86.mmx.pandn + // CHECK: [[TMP:%.*]] = xor <1 x i64> {{%.*}}, + // CHECK: and <1 x i64> [[TMP]], {{%.*}} return _mm_andnot_si64(a, b); } __m64 test_mm_avg_pu8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_avg_pu8 - // CHECK: call x86_mmx @llvm.x86.mmx.pavg.b + // CHECK: call <16 x i8> @llvm.x86.sse2.pavg.b( return _mm_avg_pu8(a, b); } __m64 test_mm_avg_pu16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_avg_pu16 - // CHECK: call x86_mmx @llvm.x86.mmx.pavg.w + // CHECK: call <8 x i16> @llvm.x86.sse2.pavg.w( return _mm_avg_pu16(a, b); } __m64 test_mm_cmpeq_pi8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_cmpeq_pi8 - // CHECK: call x86_mmx @llvm.x86.mmx.pcmpeq.b + // CHECK: [[CMP:%.*]] = icmp eq <8 x i8> {{%.*}}, {{%.*}} + // CHECK-NEXT: {{%.*}} = sext <8 x i1> [[CMP]] to <8 x i8> return _mm_cmpeq_pi8(a, b); } __m64 test_mm_cmpeq_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_cmpeq_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.pcmpeq.w + // CHECK: [[CMP:%.*]] = icmp eq <4 x i16> {{%.*}}, {{%.*}} + // CHECK-NEXT: {{%.*}} = sext <4 x i1> [[CMP]] to <4 x i16> return _mm_cmpeq_pi16(a, b); } __m64 test_mm_cmpeq_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_cmpeq_pi32 - // CHECK: call x86_mmx @llvm.x86.mmx.pcmpeq.d + // CHECK: [[CMP:%.*]] = icmp eq <2 x i32> {{%.*}}, {{%.*}} + // CHECK-NEXT: {{%.*}} = sext <2 x i1> [[CMP]] to <2 x i32> return _mm_cmpeq_pi32(a, b); } __m64 test_mm_cmpgt_pi8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_cmpgt_pi8 - // CHECK: call x86_mmx @llvm.x86.mmx.pcmpgt.b + // CHECK: [[CMP:%.*]] = icmp sgt <8 x i8> {{%.*}}, {{%.*}} + // CHECK-NEXT: {{%.*}} = sext <8 x i1> [[CMP]] to <8 x i8> return _mm_cmpgt_pi8(a, b); } __m64 test_mm_cmpgt_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_cmpgt_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.pcmpgt.w + // CHECK: [[CMP:%.*]] = icmp sgt <4 x i16> {{%.*}}, {{%.*}} + // CHECK-NEXT: {{%.*}} = sext <4 x i1> [[CMP]] to <4 x i16> return _mm_cmpgt_pi16(a, b); } __m64 test_mm_cmpgt_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_cmpgt_pi32 - // CHECK: call x86_mmx @llvm.x86.mmx.pcmpgt.d + // CHECK: [[CMP:%.*]] = icmp sgt <2 x i32> {{%.*}}, {{%.*}} + // CHECK-NEXT: {{%.*}} = sext <2 x i1> [[CMP]] to <2 x i32> return _mm_cmpgt_pi32(a, b); } __m128 test_mm_cvt_pi2ps(__m128 a, __m64 b) { // CHECK-LABEL: test_mm_cvt_pi2ps - // CHECK: <4 x float> @llvm.x86.sse.cvtpi2ps + // CHECK: sitofp <4 x i32> {{%.*}} to <4 x float> return _mm_cvt_pi2ps(a, b); } __m64 test_mm_cvt_ps2pi(__m128 a) { // CHECK-LABEL: test_mm_cvt_ps2pi - // CHECK: call x86_mmx @llvm.x86.sse.cvtps2pi + // CHECK: call <4 x i32> @llvm.x86.sse2.cvtps2dq( return _mm_cvt_ps2pi(a); } __m64 test_mm_cvtpd_pi32(__m128d a) { // CHECK-LABEL: test_mm_cvtpd_pi32 - // CHECK: call x86_mmx @llvm.x86.sse.cvtpd2pi + // CHECK: call <4 x i32> @llvm.x86.sse2.cvtpd2dq( return _mm_cvtpd_pi32(a); } __m128 test_mm_cvtpi16_ps(__m64 a) { // CHECK-LABEL: test_mm_cvtpi16_ps - // CHECK: call <4 x float> 
@llvm.x86.sse.cvtpi2ps + // CHECK: sitofp <4 x i16> {{%.*}} to <4 x float> return _mm_cvtpi16_ps(a); } __m128d test_mm_cvtpi32_pd(__m64 a) { // CHECK-LABEL: test_mm_cvtpi32_pd - // CHECK: call <2 x double> @llvm.x86.sse.cvtpi2pd + // CHECK: sitofp <2 x i32> {{%.*}} to <2 x double> return _mm_cvtpi32_pd(a); } __m128 test_mm_cvtpi32_ps(__m128 a, __m64 b) { // CHECK-LABEL: test_mm_cvtpi32_ps - // CHECK: call <4 x float> @llvm.x86.sse.cvtpi2ps + // CHECK: sitofp <4 x i32> {{%.*}} to <4 x float> return _mm_cvtpi32_ps(a, b); } __m128 test_mm_cvtpi32x2_ps(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_cvtpi32x2_ps - // CHECK: call <4 x float> @llvm.x86.sse.cvtpi2ps - // CHECK: call <4 x float> @llvm.x86.sse.cvtpi2ps + // CHECK: sitofp <4 x i32> {{%.*}} to <4 x float> return _mm_cvtpi32x2_ps(a, b); } __m64 test_mm_cvtps_pi16(__m128 a) { // CHECK-LABEL: test_mm_cvtps_pi16 - // CHECK: call x86_mmx @llvm.x86.sse.cvtps2pi + // CHECK: [[TMP0:%.*]] = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> {{%.*}}) + // CHECK: call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> [[TMP0]], return _mm_cvtps_pi16(a); } __m64 test_mm_cvtps_pi32(__m128 a) { // CHECK-LABEL: test_mm_cvtps_pi32 - // CHECK: call x86_mmx @llvm.x86.sse.cvtps2pi + // CHECK: call <4 x i32> @llvm.x86.sse2.cvtps2dq( return _mm_cvtps_pi32(a); } @@ -205,19 +212,19 @@ __m64 test_mm_cvttpd_pi32(__m128d a) { // CHECK-LABEL: test_mm_cvttpd_pi32 - // CHECK: call x86_mmx @llvm.x86.sse.cvttpd2pi + // CHECK: call <4 x i32> @llvm.x86.sse2.cvttpd2dq( return _mm_cvttpd_pi32(a); } __m64 test_mm_cvttps_pi32(__m128 a) { // CHECK-LABEL: test_mm_cvttps_pi32 - // CHECK: call x86_mmx @llvm.x86.sse.cvttps2pi + // CHECK: call <4 x i32> @llvm.x86.sse2.cvttps2dq( return _mm_cvttps_pi32(a); } int test_mm_extract_pi16(__m64 a) { // CHECK-LABEL: test_mm_extract_pi16 - // CHECK: call i32 @llvm.x86.mmx.pextr.w + // CHECK: extractelement <4 x i16> {{%.*}}, i64 2 return _mm_extract_pi16(a, 2); } @@ -235,151 +242,153 @@ __m64 test_mm_hadd_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_hadd_pi16 - // CHECK: call x86_mmx @llvm.x86.ssse3.phadd.w + // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.w.128( return _mm_hadd_pi16(a, b); } __m64 test_mm_hadd_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_hadd_pi32 - // CHECK: call x86_mmx @llvm.x86.ssse3.phadd.d + // CHECK: call <4 x i32> @llvm.x86.ssse3.phadd.d.128( return _mm_hadd_pi32(a, b); } __m64 test_mm_hadds_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_hadds_pi16 - // CHECK: call x86_mmx @llvm.x86.ssse3.phadd.sw + // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.sw.128( return _mm_hadds_pi16(a, b); } __m64 test_mm_hsub_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_hsub_pi16 - // CHECK: call x86_mmx @llvm.x86.ssse3.phsub.w + // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.w.128( return _mm_hsub_pi16(a, b); } __m64 test_mm_hsub_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_hsub_pi32 - // CHECK: call x86_mmx @llvm.x86.ssse3.phsub.d + // CHECK: call <4 x i32> @llvm.x86.ssse3.phsub.d.128( return _mm_hsub_pi32(a, b); } __m64 test_mm_hsubs_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_hsubs_pi16 - // CHECK: call x86_mmx @llvm.x86.ssse3.phsub.sw + // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.sw.128( return _mm_hsubs_pi16(a, b); } __m64 test_mm_insert_pi16(__m64 a, int d) { // CHECK-LABEL: test_mm_insert_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.pinsr.w + // CHECK: insertelement <4 x i16> return _mm_insert_pi16(a, d, 2); } __m64 test_mm_madd_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_madd_pi16 - // CHECK: call 
x86_mmx @llvm.x86.mmx.pmadd.wd + // CHECK: call <4 x i32> @llvm.x86.sse2.pmadd.wd( return _mm_madd_pi16(a, b); } __m64 test_mm_maddubs_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_maddubs_pi16 - // CHECK: call x86_mmx @llvm.x86.ssse3.pmadd.ub.sw + // CHECK: call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128( return _mm_maddubs_pi16(a, b); } void test_mm_maskmove_si64(__m64 d, __m64 n, char *p) { // CHECK-LABEL: test_mm_maskmove_si64 - // CHECK: call void @llvm.x86.mmx.maskmovq + // CHECK: call void @llvm.x86.sse2.maskmov.dqu( _mm_maskmove_si64(d, n, p); } __m64 test_mm_max_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_max_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.pmaxs.w + // CHECK: call <8 x i16> @llvm.smax.v8i16( return _mm_max_pi16(a, b); } __m64 test_mm_max_pu8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_max_pu8 - // CHECK: call x86_mmx @llvm.x86.mmx.pmaxu.b + // CHECK: call <16 x i8> @llvm.umax.v16i8( return _mm_max_pu8(a, b); } __m64 test_mm_min_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_min_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.pmins.w + // CHECK: call <8 x i16> @llvm.smin.v8i16( return _mm_min_pi16(a, b); } __m64 test_mm_min_pu8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_min_pu8 - // CHECK: call x86_mmx @llvm.x86.mmx.pminu.b + // CHECK: call <16 x i8> @llvm.umin.v16i8( return _mm_min_pu8(a, b); } int test_mm_movemask_pi8(__m64 a) { // CHECK-LABEL: test_mm_movemask_pi8 - // CHECK: call i32 @llvm.x86.mmx.pmovmskb + // CHECK: call i32 @llvm.x86.sse2.pmovmskb.128( return _mm_movemask_pi8(a); } __m64 test_mm_mul_su32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_mul_su32 - // CHECK: call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %{{.*}}, x86_mmx %{{.*}}) + // CHECK: and <2 x i64> {{%.*}}, + // CHECK: and <2 x i64> {{%.*}}, + // CHECK: mul <2 x i64> %{{.*}}, %{{.*}} return _mm_mul_su32(a, b); } __m64 test_mm_mulhi_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_mulhi_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.pmulh.w + // CHECK: call <8 x i16> @llvm.x86.sse2.pmulh.w( return _mm_mulhi_pi16(a, b); } __m64 test_mm_mulhi_pu16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_mulhi_pu16 - // CHECK: call x86_mmx @llvm.x86.mmx.pmulhu.w + // CHECK: call <8 x i16> @llvm.x86.sse2.pmulhu.w( return _mm_mulhi_pu16(a, b); } __m64 test_mm_mulhrs_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_mulhrs_pi16 - // CHECK: call x86_mmx @llvm.x86.ssse3.pmul.hr.sw + // CHECK: call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128( return _mm_mulhrs_pi16(a, b); } __m64 test_mm_mullo_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_mullo_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.pmull.w + // CHECK: mul <4 x i16> {{%.*}}, {{%.*}} return _mm_mullo_pi16(a, b); } __m64 test_mm_or_si64(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_or_si64 - // CHECK: call x86_mmx @llvm.x86.mmx.por + // CHECK: or <1 x i64> {{%.*}}, {{%.*}} return _mm_or_si64(a, b); } __m64 test_mm_packs_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_packs_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.packsswb + // CHECK: call <16 x i8> @llvm.x86.sse2.packsswb.128( return _mm_packs_pi16(a, b); } __m64 test_mm_packs_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_packs_pi32 - // CHECK: call x86_mmx @llvm.x86.mmx.packssdw + // CHECK: call <8 x i16> @llvm.x86.sse2.packssdw.128( return _mm_packs_pi32(a, b); } __m64 test_mm_packs_pu16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_packs_pu16 - // CHECK: call x86_mmx @llvm.x86.mmx.packuswb + // CHECK: call <16 x i8> @llvm.x86.sse2.packuswb.128( return _mm_packs_pu16(a, b); } __m64 test_mm_sad_pu8(__m64 a, 
__m64 b) { // CHECK-LABEL: test_mm_sad_pu8 - // CHECK: call x86_mmx @llvm.x86.mmx.psad.bw + // CHECK: call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> return _mm_sad_pu8(a, b); } @@ -472,181 +481,181 @@ __m64 test_mm_shuffle_pi8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_shuffle_pi8 - // CHECK: call x86_mmx @llvm.x86.ssse3.pshuf.b + // CHECK: call <16 x i8> @llvm.x86.ssse3.pshuf.b.128( return _mm_shuffle_pi8(a, b); } __m64 test_mm_shuffle_pi16(__m64 a) { // CHECK-LABEL: test_mm_shuffle_pi16 - // CHECK: call x86_mmx @llvm.x86.sse.pshuf.w + // CHECK: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <4 x i32> return _mm_shuffle_pi16(a, 3); } __m64 test_mm_sign_pi8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sign_pi8 - // CHECK: call x86_mmx @llvm.x86.ssse3.psign.b + // CHECK: call <16 x i8> @llvm.x86.ssse3.psign.b.128( return _mm_sign_pi8(a, b); } __m64 test_mm_sign_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sign_pi16 - // CHECK: call x86_mmx @llvm.x86.ssse3.psign.w + // CHECK: call <8 x i16> @llvm.x86.ssse3.psign.w.128( return _mm_sign_pi16(a, b); } __m64 test_mm_sign_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sign_pi32 - // CHECK: call x86_mmx @llvm.x86.ssse3.psign.d + // CHECK: call <4 x i32> @llvm.x86.ssse3.psign.d.128( return _mm_sign_pi32(a, b); } __m64 test_mm_sll_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sll_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.psll.w + // CHECK: call <8 x i16> @llvm.x86.sse2.psll.w( return _mm_sll_pi16(a, b); } __m64 test_mm_sll_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sll_pi32 - // CHECK: call x86_mmx @llvm.x86.mmx.psll.d + // CHECK: call <4 x i32> @llvm.x86.sse2.psll.d( return _mm_sll_pi32(a, b); } __m64 test_mm_sll_si64(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sll_si64 - // CHECK: call x86_mmx @llvm.x86.mmx.psll.q + // CHECK: call <2 x i64> @llvm.x86.sse2.psll.q( return _mm_sll_si64(a, b); } __m64 test_mm_slli_pi16(__m64 a) { // CHECK-LABEL: test_mm_slli_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.pslli.w + // CHECK: call <8 x i16> @llvm.x86.sse2.pslli.w( return _mm_slli_pi16(a, 3); } __m64 test_mm_slli_pi32(__m64 a) { // CHECK-LABEL: test_mm_slli_pi32 - // CHECK: call x86_mmx @llvm.x86.mmx.pslli.d + // CHECK: call <4 x i32> @llvm.x86.sse2.pslli.d( return _mm_slli_pi32(a, 3); } __m64 test_mm_slli_si64(__m64 a) { // CHECK-LABEL: test_mm_slli_si64 - // CHECK: call x86_mmx @llvm.x86.mmx.pslli.q + // CHECK: call <2 x i64> @llvm.x86.sse2.pslli.q( return _mm_slli_si64(a, 3); } __m64 test_mm_sra_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sra_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.psra.w + // CHECK: call <8 x i16> @llvm.x86.sse2.psra.w( return _mm_sra_pi16(a, b); } __m64 test_mm_sra_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sra_pi32 - // CHECK: call x86_mmx @llvm.x86.mmx.psra.d + // CHECK: call <4 x i32> @llvm.x86.sse2.psra.d( return _mm_sra_pi32(a, b); } __m64 test_mm_srai_pi16(__m64 a) { // CHECK-LABEL: test_mm_srai_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.psrai.w + // CHECK: call <8 x i16> @llvm.x86.sse2.psrai.w( return _mm_srai_pi16(a, 3); } __m64 test_mm_srai_pi32(__m64 a) { // CHECK-LABEL: test_mm_srai_pi32 - // CHECK: call x86_mmx @llvm.x86.mmx.psrai.d + // CHECK: call <4 x i32> @llvm.x86.sse2.psrai.d( return _mm_srai_pi32(a, 3); } __m64 test_mm_srl_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_srl_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.psrl.w + // CHECK: call <8 x i16> @llvm.x86.sse2.psrl.w( return _mm_srl_pi16(a, b); } __m64 test_mm_srl_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_srl_pi32 - 
// CHECK: call x86_mmx @llvm.x86.mmx.psrl.d + // CHECK: call <4 x i32> @llvm.x86.sse2.psrl.d( return _mm_srl_pi32(a, b); } __m64 test_mm_srl_si64(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_srl_si64 - // CHECK: call x86_mmx @llvm.x86.mmx.psrl.q + // CHECK: call <2 x i64> @llvm.x86.sse2.psrl.q( return _mm_srl_si64(a, b); } __m64 test_mm_srli_pi16(__m64 a) { // CHECK-LABEL: test_mm_srli_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.psrli.w + // CHECK: call <8 x i16> @llvm.x86.sse2.psrli.w( return _mm_srli_pi16(a, 3); } __m64 test_mm_srli_pi32(__m64 a) { // CHECK-LABEL: test_mm_srli_pi32 - // CHECK: call x86_mmx @llvm.x86.mmx.psrli.d + // CHECK: call <4 x i32> @llvm.x86.sse2.psrli.d( return _mm_srli_pi32(a, 3); } __m64 test_mm_srli_si64(__m64 a) { // CHECK-LABEL: test_mm_srli_si64 - // CHECK: call x86_mmx @llvm.x86.mmx.psrli.q + // CHECK: call <2 x i64> @llvm.x86.sse2.psrli.q( return _mm_srli_si64(a, 3); } void test_mm_stream_pi(__m64 *p, __m64 a) { // CHECK-LABEL: test_mm_stream_pi - // CHECK: call void @llvm.x86.mmx.movnt.dq + // CHECK: store <1 x i64> {{%.*}}, <1 x i64>* {{%.*}}, align 8, !nontemporal _mm_stream_pi(p, a); } __m64 test_mm_sub_pi8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sub_pi8 - // CHECK: call x86_mmx @llvm.x86.mmx.psub.b + // CHECK: sub <8 x i8> {{%.*}}, {{%.*}} return _mm_sub_pi8(a, b); } __m64 test_mm_sub_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sub_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.psub.w + // CHECK: sub <4 x i16> {{%.*}}, {{%.*}} return _mm_sub_pi16(a, b); } __m64 test_mm_sub_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sub_pi32 - // CHECK: call x86_mmx @llvm.x86.mmx.psub.d + // CHECK: sub <2 x i32> {{%.*}}, {{%.*}} return _mm_sub_pi32(a, b); } __m64 test_mm_sub_si64(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sub_si64 - // CHECK: call x86_mmx @llvm.x86.mmx.psub.q(x86_mmx %{{.*}}, x86_mmx %{{.*}}) + // CHECK: sub i64 {{%.*}}, {{%.*}} return _mm_sub_si64(a, b); } __m64 test_mm_subs_pi8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_subs_pi8 - // CHECK: call x86_mmx @llvm.x86.mmx.psubs.b + // CHECK: call <16 x i8> @llvm.ssub.sat.v16i8( return _mm_subs_pi8(a, b); } __m64 test_mm_subs_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_subs_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.psubs.w + // CHECK: call <8 x i16> @llvm.ssub.sat.v8i16( return _mm_subs_pi16(a, b); } __m64 test_mm_subs_pu8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_subs_pu8 - // CHECK: call x86_mmx @llvm.x86.mmx.psubus.b + // CHECK: call <16 x i8> @llvm.usub.sat.v16i8( return _mm_subs_pu8(a, b); } __m64 test_mm_subs_pu16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_subs_pu16 - // CHECK: call x86_mmx @llvm.x86.mmx.psubus.w + // CHECK: call <8 x i16> @llvm.usub.sat.v8i16( return _mm_subs_pu16(a, b); } @@ -664,42 +673,42 @@ __m64 test_mm_unpackhi_pi8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_unpackhi_pi8 - // CHECK: call x86_mmx @llvm.x86.mmx.punpckhbw + // CHECK: shufflevector <8 x i8> {{%.*}}, <8 x i8> {{%.*}}, <8 x i32> return _mm_unpackhi_pi8(a, b); } __m64 test_mm_unpackhi_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_unpackhi_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.punpckhwd + // CHECK: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <4 x i32> return _mm_unpackhi_pi16(a, b); } __m64 test_mm_unpackhi_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_unpackhi_pi32 - // CHECK: call x86_mmx @llvm.x86.mmx.punpckhdq + // CHECK: shufflevector <2 x i32> {{%.*}}, <2 x i32> {{%.*}}, <2 x i32> return _mm_unpackhi_pi32(a, b); } __m64 test_mm_unpacklo_pi8(__m64 a, __m64 b) { // 
CHECK-LABEL: test_mm_unpacklo_pi8 - // CHECK: call x86_mmx @llvm.x86.mmx.punpcklbw + // CHECK: shufflevector <8 x i8> {{%.*}}, <8 x i8> {{%.*}}, <8 x i32> return _mm_unpacklo_pi8(a, b); } __m64 test_mm_unpacklo_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_unpacklo_pi16 - // CHECK: call x86_mmx @llvm.x86.mmx.punpcklwd + // CHECK: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <4 x i32> return _mm_unpacklo_pi16(a, b); } __m64 test_mm_unpacklo_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_unpacklo_pi32 - // CHECK: call x86_mmx @llvm.x86.mmx.punpckldq + // CHECK: shufflevector <2 x i32> {{%.*}}, <2 x i32> {{%.*}}, <2 x i32> return _mm_unpacklo_pi32(a, b); } __m64 test_mm_xor_si64(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_xor_si64 - // CHECK: call x86_mmx @llvm.x86.mmx.pxor + // CHECK: xor <1 x i64> {{%.*}}, {{%.*}} return _mm_xor_si64(a, b); } diff --git a/clang/test/CodeGen/X86/mmx-shift-with-immediate.c b/clang/test/CodeGen/X86/mmx-shift-with-immediate.c --- a/clang/test/CodeGen/X86/mmx-shift-with-immediate.c +++ b/clang/test/CodeGen/X86/mmx-shift-with-immediate.c @@ -2,22 +2,22 @@ #include void shift(__m64 a, __m64 b, int c) { - // CHECK: x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx %{{.*}}, i32 {{.*}}) + // CHECK: <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %{{.*}}, i32 {{.*}}) _mm_slli_pi16(a, c); - // CHECK: x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx %{{.*}}, i32 {{.*}}) + // CHECK: <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %{{.*}}, i32 {{.*}}) _mm_slli_pi32(a, c); - // CHECK: x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %{{.*}}, i32 {{.*}}) + // CHECK: <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %{{.*}}, i32 {{.*}}) _mm_slli_si64(a, c); - // CHECK: x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx %{{.*}}, i32 {{.*}}) + // CHECK: <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %{{.*}}, i32 {{.*}}) _mm_srli_pi16(a, c); - // CHECK: x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx %{{.*}}, i32 {{.*}}) + // CHECK: <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %{{.*}}, i32 {{.*}}) _mm_srli_pi32(a, c); - // CHECK: x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %{{.*}}, i32 {{.*}}) + // CHECK: <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %{{.*}}, i32 {{.*}}) _mm_srli_si64(a, c); - // CHECK: x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx %{{.*}}, i32 {{.*}}) + // CHECK: <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %{{.*}}, i32 {{.*}}) _mm_srai_pi16(a, c); - // CHECK: x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx %{{.*}}, i32 {{.*}}) + // CHECK: <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %{{.*}}, i32 {{.*}}) _mm_srai_pi32(a, c); } diff --git a/clang/test/CodeGen/attr-target-x86-mmx.c b/clang/test/CodeGen/attr-target-x86-mmx.c --- a/clang/test/CodeGen/attr-target-x86-mmx.c +++ b/clang/test/CodeGen/attr-target-x86-mmx.c @@ -1,12 +1,11 @@ // RUN: %clang_cc1 -triple i386-linux-gnu -emit-llvm %s -o - | FileCheck %s -// Picking a cpu that doesn't have mmx or sse by default so we can enable it later. +// Picking a cpu that doesn't have sse by default so we can enable it later. #define __MM_MALLOC_H #include -// Verify that when we turn on sse that we also turn on mmx. 
-void __attribute__((target("sse"))) shift(__m64 a, __m64 b, int c) { +void __attribute__((target("sse2"))) shift(__m64 a, __m64 b, int c) { _mm_slli_pi16(a, c); _mm_slli_pi32(a, c); _mm_slli_si64(a, c); @@ -19,4 +18,4 @@ _mm_srai_pi32(a, c); } -// CHECK: "target-features"="+cx8,+mmx,+sse,+x87" +// CHECK: "target-features"="+cx8,+mmx,+sse,+sse2,+x87" diff --git a/clang/test/Headers/xmmintrin.c b/clang/test/Headers/xmmintrin.c --- a/clang/test/Headers/xmmintrin.c +++ b/clang/test/Headers/xmmintrin.c @@ -14,7 +14,7 @@ // checking that clang emits PACKSSDW instead of PACKSSWB. // CHECK: define{{.*}} i64 @test_mm_cvtps_pi16 -// CHECK: call x86_mmx @llvm.x86.mmx.packssdw +// CHECK: call <8 x i16> @llvm.x86.sse2.packssdw.128 __m64 test_mm_cvtps_pi16(__m128 a) { return _mm_cvtps_pi16(a); diff --git a/clang/test/Sema/x86-builtin-palignr.c b/clang/test/Sema/x86-builtin-palignr.c --- a/clang/test/Sema/x86-builtin-palignr.c +++ b/clang/test/Sema/x86-builtin-palignr.c @@ -4,5 +4,5 @@ #include __m64 test1(__m64 a, __m64 b, int c) { - return _mm_alignr_pi8(a, b, c); // expected-error {{argument to '__builtin_ia32_palignr' must be a constant integer}} + return _mm_alignr_pi8(a, b, c); // expected-error {{argument to '__builtin_ia32_psrldqi128_byteshift' must be a constant integer}} } diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -2424,11 +2424,11 @@ Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; - def int_x86_mmx_pextr_w : GCCBuiltin<"__builtin_ia32_vec_ext_v4hi">, + def int_x86_mmx_pextr_w : Intrinsic<[llvm_i32_ty], [llvm_x86mmx_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_x86_mmx_pinsr_w : GCCBuiltin<"__builtin_ia32_vec_set_v4hi">, + def int_x86_mmx_pinsr_w : Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; } diff --git a/mmx-tests/Makefile b/mmx-tests/Makefile new file mode 100644 --- /dev/null +++ b/mmx-tests/Makefile @@ -0,0 +1,29 @@ +USE_XMM= +#USE_XMM=--use-xmm + +OLDCC ?= clang-10 +NEWCC ?= ../build/bin/clang +TESTCC=$(OLDCC) +COPTS ?= + +gen_orig.c: mmx-tests.py + ./mmx-tests.py --kind=wrapper --wrapper-prefix=orig $(USE_XMM) > $@ +gen_orig.h: mmx-tests.py + ./mmx-tests.py --kind=wrapper_h --wrapper-prefix=orig $(USE_XMM) > $@ +gen_new.c: mmx-tests.py + ./mmx-tests.py --kind=wrapper --wrapper-prefix=new $(USE_XMM) > $@ +gen_new.h: mmx-tests.py + ./mmx-tests.py --kind=wrapper_h --wrapper-prefix=new $(USE_XMM) > $@ +gen_test.inc: mmx-tests.py + ./mmx-tests.py --kind=test $(USE_XMM) > $@ +gen_orig.o: gen_orig.c + $(OLDCC) -c $(COPTS) -O2 -o $@ $^ +gen_new.o: gen_new.c + $(NEWCC) -c $(COPTS) -O2 -o $@ $^ +test.o: test.c gen_test.inc gen_orig.h gen_new.h + $(TESTCC) -c $(COPTS) -o $@ test.c +test: test.o gen_orig.o gen_new.o + $(TESTCC) $(COPTS) -o $@ $^ -lm + +clean: + rm -f gen_orig.c gen_orig.h gen_new.c gen_new.h gen_test.inc gen_orig.o gen_new.o test.o test diff --git a/mmx-tests/mmx-tests.py b/mmx-tests/mmx-tests.py new file mode 100755 --- /dev/null +++ b/mmx-tests/mmx-tests.py @@ -0,0 +1,301 @@ +#!/usr/bin/python3 + +import argparse +import sys + +# This is a list of all intel functions and macros which take or +# return an __m64. 
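The wrapper files the Makefile builds (gen_orig.c and gen_new.c) contain one small forwarding function per entry in the list defined by do_mmx() below. A sketch of one generated wrapper, assuming the "orig" prefix, no --use-xmm, and that immintrin.h supplies the intrinsics:

#include <immintrin.h>

/* Shape of one emitted wrapper; the exact text comes from define_wrappers(). */
__m64 orig__mm_add_pi8(__m64 arg_0, __m64 arg_1)
{ return _mm_add_pi8(arg_0, arg_1); }

With --use-xmm, the same wrapper instead takes and returns __m128 values and marshals them through the m128_to_m64/m64_to_m128 macros, presumably so that differences in how the two compilers pass a raw __m64 do not affect the comparison.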
+def do_mmx(fn): + # mmintrin.h + fn("_mm_cvtsi32_si64", "__m64", ("int", )) + fn("_mm_cvtsi64_si32", "int", ("__m64", )) + fn("_mm_cvtsi64_m64", "__m64", ("long long", ), condition='defined(__X86_64__) || defined(__clang__)') + fn("_mm_cvtm64_si64", "long long", ("__m64", ), condition='defined(__X86_64__) || defined(__clang__)') + fn("_mm_packs_pi16", "__m64", ("__m64", "__m64", )) + fn("_mm_packs_pi32", "__m64", ("__m64", "__m64", )) + fn("_mm_packs_pu16", "__m64", ("__m64", "__m64", )) + fn("_mm_unpackhi_pi8", "__m64", ("__m64", "__m64", )) + fn("_mm_unpackhi_pi16", "__m64", ("__m64", "__m64", )) + fn("_mm_unpackhi_pi32", "__m64", ("__m64", "__m64", )) + fn("_mm_unpacklo_pi8", "__m64", ("__m64", "__m64", )) + fn("_mm_unpacklo_pi16", "__m64", ("__m64", "__m64", )) + fn("_mm_unpacklo_pi32", "__m64", ("__m64", "__m64", )) + fn("_mm_add_pi8", "__m64", ("__m64", "__m64", )) + fn("_mm_add_pi16", "__m64", ("__m64", "__m64", )) + fn("_mm_add_pi32", "__m64", ("__m64", "__m64", )) + fn("_mm_adds_pi8", "__m64", ("__m64", "__m64", )) + fn("_mm_adds_pi16", "__m64", ("__m64", "__m64", )) + fn("_mm_adds_pu8", "__m64", ("__m64", "__m64", )) + fn("_mm_adds_pu16", "__m64", ("__m64", "__m64", )) + fn("_mm_sub_pi8", "__m64", ("__m64", "__m64", )) + fn("_mm_sub_pi16", "__m64", ("__m64", "__m64", )) + fn("_mm_sub_pi32", "__m64", ("__m64", "__m64", )) + fn("_mm_subs_pi8", "__m64", ("__m64", "__m64", )) + fn("_mm_subs_pi16", "__m64", ("__m64", "__m64", )) + fn("_mm_subs_pu8", "__m64", ("__m64", "__m64", )) + fn("_mm_subs_pu16", "__m64", ("__m64", "__m64", )) + fn("_mm_madd_pi16", "__m64", ("__m64", "__m64", )) + fn("_mm_mulhi_pi16", "__m64", ("__m64", "__m64", )) + fn("_mm_mullo_pi16", "__m64", ("__m64", "__m64", )) + fn("_mm_sll_pi16", "__m64", ("__m64", "__m64", )) + fn("_mm_slli_pi16", "__m64", ("__m64", "int", )) + fn("_mm_sll_pi32", "__m64", ("__m64", "__m64", )) + fn("_mm_slli_pi32", "__m64", ("__m64", "int", )) + fn("_mm_sll_si64", "__m64", ("__m64", "__m64", )) + fn("_mm_slli_si64", "__m64", ("__m64", "int", )) + fn("_mm_sra_pi16", "__m64", ("__m64", "__m64", )) + fn("_mm_srai_pi16", "__m64", ("__m64", "int", )) + fn("_mm_sra_pi32", "__m64", ("__m64", "__m64", )) + fn("_mm_srai_pi32", "__m64", ("__m64", "int", )) + fn("_mm_srl_pi16", "__m64", ("__m64", "__m64", )) + fn("_mm_srli_pi16", "__m64", ("__m64", "int", )) + fn("_mm_srl_pi32", "__m64", ("__m64", "__m64", )) + fn("_mm_srli_pi32", "__m64", ("__m64", "int", )) + fn("_mm_srl_si64", "__m64", ("__m64", "__m64", )) + fn("_mm_srli_si64", "__m64", ("__m64", "int", )) + fn("_mm_and_si64", "__m64", ("__m64", "__m64", )) + fn("_mm_andnot_si64", "__m64", ("__m64", "__m64", )) + fn("_mm_or_si64", "__m64", ("__m64", "__m64", )) + fn("_mm_xor_si64", "__m64", ("__m64", "__m64", )) + fn("_mm_cmpeq_pi8", "__m64", ("__m64", "__m64", )) + fn("_mm_cmpeq_pi16", "__m64", ("__m64", "__m64", )) + fn("_mm_cmpeq_pi32", "__m64", ("__m64", "__m64", )) + fn("_mm_cmpgt_pi8", "__m64", ("__m64", "__m64", )) + fn("_mm_cmpgt_pi16", "__m64", ("__m64", "__m64", )) + fn("_mm_cmpgt_pi32", "__m64", ("__m64", "__m64", )) + fn("_mm_setzero_si64", "__m64", ()) + fn("_mm_set_pi32", "__m64", ("int", "int", )) + fn("_mm_set_pi16", "__m64", ("short", "short", "short", "short", )) + fn("_mm_set_pi8", "__m64", ("char", "char", "char", "char", "char", "char", "char", "char", )) + fn("_mm_set1_pi32", "__m64", ("int", )) + fn("_mm_set1_pi16", "__m64", ("short", )) + fn("_mm_set1_pi8", "__m64", ("char", )) + fn("_mm_setr_pi32", "__m64", ("int", "int", )) + fn("_mm_setr_pi16", "__m64", ("short", 
"short", "short", "short", )) + fn("_mm_setr_pi8", "__m64", ("char", "char", "char", "char", "char", "char", "char", "char", )) + + # xmmintrin.h + fn("_mm_cvtps_pi32", "__m64", ("__m128", )) + fn("_mm_cvt_ps2pi", "__m64", ("__m128", )) + fn("_mm_cvttps_pi32", "__m64", ("__m128", )) + fn("_mm_cvtt_ps2pi", "__m64", ("__m128", )) + fn("_mm_cvtpi32_ps", "__m128", ("__m128", "__m64", )) + fn("_mm_cvt_pi2ps", "__m128", ("__m128", "__m64", )) + fn("_mm_loadh_pi", "__m128", ("__m128", "const __m64 *", )) + fn("_mm_loadl_pi", "__m128", ("__m128", "const __m64 *", )) + fn("_mm_storeh_pi", "void", ("__m64 *", "__m128", )) + fn("_mm_storel_pi", "void", ("__m64 *", "__m128", )) + fn("_mm_stream_pi", "void", ("__m64 *", "__m64", )) + fn("_mm_max_pi16", "__m64", ("__m64", "__m64", )) + fn("_mm_max_pu8", "__m64", ("__m64", "__m64", )) + fn("_mm_min_pi16", "__m64", ("__m64", "__m64", )) + fn("_mm_min_pu8", "__m64", ("__m64", "__m64", )) + fn("_mm_movemask_pi8", "int", ("__m64", )) + fn("_mm_mulhi_pu16", "__m64", ("__m64", "__m64", )) + fn("_mm_maskmove_si64", "void", ("__m64", "__m64", "char *", )) + fn("_mm_avg_pu8", "__m64", ("__m64", "__m64", )) + fn("_mm_avg_pu16", "__m64", ("__m64", "__m64", )) + fn("_mm_sad_pu8", "__m64", ("__m64", "__m64", )) + fn("_mm_cvtpi16_ps", "__m128", ("__m64", )) + fn("_mm_cvtpu16_ps", "__m128", ("__m64", )) + fn("_mm_cvtpi8_ps", "__m128", ("__m64", )) + fn("_mm_cvtpu8_ps", "__m128", ("__m64", )) + fn("_mm_cvtpi32x2_ps", "__m128", ("__m64", "__m64", )) + fn("_mm_cvtps_pi16", "__m64", ("__m128", )) + fn("_mm_cvtps_pi8", "__m64", ("__m128", )) + + fn("_mm_extract_pi16", "int", ("__m64", "int", ), imm_range=(0, 3)) + fn("_mm_insert_pi16", "__m64", ("__m64", "int", "int", ), imm_range=(0, 3)) + fn("_mm_shuffle_pi16", "__m64", ("__m64", "int", ), imm_range=(0, 255)) + + # emmintrin.h + fn("_mm_cvtpd_pi32", "__m64", ("__m128d", )) + fn("_mm_cvttpd_pi32", "__m64", ("__m128d", )) + fn("_mm_cvtpi32_pd", "__m128d", ("__m64", )) + fn("_mm_add_si64", "__m64", ("__m64", "__m64", )) + fn("_mm_mul_su32", "__m64", ("__m64", "__m64", )) + fn("_mm_sub_si64", "__m64", ("__m64", "__m64", )) + fn("_mm_set_epi64", "__m128i", ("__m64", "__m64", )) + fn("_mm_set1_epi64", "__m128i", ("__m64", )) + fn("_mm_setr_epi64", "__m128i", ("__m64", "__m64", )) + fn("_mm_movepi64_pi64", "__m64", ("__m128i", )) + fn("_mm_movpi64_epi64", "__m128i", ("__m64", )) + + # tmmintrin.h + fn("_mm_abs_pi8", "__m64", ("__m64", ), target='ssse3') + fn("_mm_abs_pi16", "__m64", ("__m64", ), target='ssse3') + fn("_mm_abs_pi32", "__m64", ("__m64", ), target='ssse3') + fn("_mm_hadd_pi16", "__m64", ("__m64", "__m64", ), target='ssse3') + fn("_mm_hadd_pi32", "__m64", ("__m64", "__m64", ), target='ssse3') + fn("_mm_hadds_pi16", "__m64", ("__m64", "__m64", ), target='ssse3') + fn("_mm_hsub_pi16", "__m64", ("__m64", "__m64", ), target='ssse3') + fn("_mm_hsub_pi32", "__m64", ("__m64", "__m64", ), target='ssse3') + fn("_mm_hsubs_pi16", "__m64", ("__m64", "__m64", ), target='ssse3') + fn("_mm_maddubs_pi16", "__m64", ("__m64", "__m64", ), target='ssse3') + fn("_mm_mulhrs_pi16", "__m64", ("__m64", "__m64", ), target='ssse3') + fn("_mm_shuffle_pi8", "__m64", ("__m64", "__m64", ), target='ssse3') + fn("_mm_sign_pi8", "__m64", ("__m64", "__m64", ), target='ssse3') + fn("_mm_sign_pi16", "__m64", ("__m64", "__m64", ), target='ssse3') + fn("_mm_sign_pi32", "__m64", ("__m64", "__m64", ), target='ssse3') + fn("_mm_alignr_pi8", "__m64", ("__m64", "__m64", "int", ), imm_range=(0, 18), target='ssse3') + +# Generate a file full of wrapper functions 
for each of the above mmx +# functions. +# +# If use_xmm is set, pass/return arguments as __m128 rather than of +# __m64. +def define_wrappers(prefix, use_xmm=True, header=False): + if header: + print('#pragma once') + + print('#include ') + if use_xmm and not header: + print('#define m128_to_m64(x) ((__m64)((__v2di)(x))[0])') + print('#define m64_to_m128(x) ((__m128)(__v2di){(long long)(__m64)(x), 0})') + + def fn(name, ret_ty, arg_tys, imm_range=None, target=None, condition=None): + if condition: + print(f'#if {condition}') + convert_ret = False + if use_xmm and ret_ty == '__m64': + ret_ty = '__v2di' + convert_ret = True + + if target: + attr = f'__attribute__((target("{target}"))) ' + else: + attr = '' + + if imm_range: + arg_tys = arg_tys[:-1] + def translate_type(t): + if use_xmm and t == '__m64': + return '__m128' + return t + def translate_arg(t, a): + if use_xmm and t == '__m64': + return f'm128_to_m64({a})' + return a + + arg_decl = ', '.join(f'{translate_type(v[1])} arg_{v[0]}' for v in enumerate(arg_tys)) or 'void' + call_args = ', '.join(translate_arg(v[1], f'arg_{v[0]}') for v in enumerate(arg_tys)) + + def create_fn(suffix, extraarg): + if header: + print(f'{ret_ty} {prefix}_{name}{suffix}({arg_decl});') + else: + print(f'{attr}{ret_ty} {prefix}_{name}{suffix}({arg_decl})') + if use_xmm and convert_ret: + print(f'{{ return ({ret_ty})m64_to_m128({name}({call_args}{extraarg})); }}') + else: + print(f'{{ return {name}({call_args}{extraarg}); }}') + + if imm_range: + for i in range(imm_range[0], imm_range[1]+1): + create_fn(f'_{i}', f', {i}') + else: + create_fn('', '') + if condition: + print('#endif') + + do_mmx(fn) + + +# Create a C file that tests an "orig" set of wrappers against a "new" +# set of wrappers. +def define_tests(use_xmm=False): + def fn(name, ret_ty, arg_tys, imm_range=None, target=None, condition=None): + if condition: + print(f'#if {condition}') + arg_decl = ', '.join(f'{v[1]} arg_{v[0]}' for v in enumerate(arg_tys)) or 'void' + print(f' // {ret_ty} {name}({arg_decl});') + + if imm_range: + for i in range(imm_range[0], imm_range[1]+1): + fn(name + f'_{i}', ret_ty, arg_tys[:-1], target=target) + return + + convert_pre = convert_post = '' + if use_xmm and ret_ty == '__m64': + convert_pre = 'm128_to_m64(' + convert_post = ')' + + args=[] + loops=[] + printf_fmts = [] + printf_args = [] + for arg_ty in arg_tys: + v=len(loops) + if arg_ty in ('char', 'short'): + loops.append(f' for(int l{v} = 0; l{v} < arraysize(short_vals); ++l{v}) {{') + args.append(f'({arg_ty})short_vals[l{v}]') + printf_fmts.append('%016x') + printf_args.append(f'short_vals[l{v}]') + elif arg_ty in ('int', 'long long'): + loops.append(f' for(int l{v} = 0; l{v} < arraysize(mmx_vals); ++l{v}) {{') + args.append(f'({arg_ty})mmx_vals[l{v}]') + printf_fmts.append('%016llx') + printf_args.append(f'mmx_vals[l{v}]') + elif arg_ty == '__m64': + loops.append(f' for(int l{v} = 0; l{v} < arraysize(mmx_vals); ++l{v}) {{') + if use_xmm: + loops.append(f' for(int l{v+1} = 0; l{v+1} < arraysize(padding_mmx_vals); ++l{v+1}) {{') + args.append(f'(__m128)(__m128i){{mmx_vals[l{v}], padding_mmx_vals[l{v+1}]}}') + printf_fmts.append('(__m128i){%016llx, %016llx}') + printf_args.append(f'mmx_vals[l{v}], padding_mmx_vals[l{v+1}]') + else: + args.append(f'({arg_ty})mmx_vals[l{v}]') + printf_fmts.append('%016llx') + printf_args.append(f'mmx_vals[l{v}]') + elif arg_ty in ('__m128', '__m128i', '__m128d'): + loops.append(f' for(int l{v} = 0; l{v} < arraysize(mmx_vals); ++l{v}) {{') + loops.append(f' for(int l{v+1} = 0; 
l{v+1} < arraysize(mmx_vals); ++l{v+1}) {{') + args.append(f'({arg_ty})(__m128i){{mmx_vals[l{v}], mmx_vals[l{v+1}]}}') + printf_fmts.append('(__m128i){%016llx, %016llx}') + printf_args.append(f'mmx_vals[l{v}], mmx_vals[l{v+1}]') + elif arg_ty == 'const __m64 *': + loops.append(f' for(int l{v} = 0; l{v} < arraysize(mmx_vals); ++l{v}) {{\n' + + f' mem.m64 = (__m64)mmx_vals[l{v}];') + args.append(f'&mem.m64') + printf_fmts.append('&mem.m64 /* %016llx */') + printf_args.append(f'(long long)mem.m64') + else: + print(' // -> UNSUPPORTED') + return + + printf_fmt_str = '"' + ', '.join(printf_fmts) + '"' + if printf_args: + printf_arg_str = ', ' + ','.join(printf_args) + else: + printf_arg_str = '' + + print('\n'.join(loops)) + print(f''' + clear_exc_flags(); + {ret_ty} orig_res = {convert_pre}orig_{name}({", ".join(args)}){convert_post}; + int orig_exc = get_exc_flags(); + clear_exc_flags(); + {ret_ty} new_res = {convert_pre}new_{name}({", ".join(args)}){convert_post}; + int new_exc = get_exc_flags(); + check_mismatch("{name}", orig_exc, new_exc, &orig_res, &new_res, sizeof(orig_res), {printf_fmt_str}{printf_arg_str}); +''') + print(' }\n' * len(loops)) + print() + if condition: + print('#endif') + + do_mmx(fn) + + +parser = argparse.ArgumentParser(description='Generate mmx test code.') +parser.add_argument('--kind', choices=['wrapper', 'wrapper_h', 'test']) +parser.add_argument('--wrapper-prefix', default='orig') +parser.add_argument('--use-xmm', action='store_true') + +args = parser.parse_args() +if args.kind == 'wrapper': + define_wrappers(args.wrapper_prefix, use_xmm=args.use_xmm, header=False) +elif args.kind == 'wrapper_h': + define_wrappers(args.wrapper_prefix, use_xmm=args.use_xmm, header=True) +elif args.kind == 'test': + define_tests(use_xmm=args.use_xmm) diff --git a/mmx-tests/test.c b/mmx-tests/test.c new file mode 100644 --- /dev/null +++ b/mmx-tests/test.c @@ -0,0 +1,237 @@ +#include +#include +#include +#include +#include +#include + +#include "gen_orig.h" +#include "gen_new.h" + + +// A bunch of helper functions for the code in gen_test.inc +#define m128_to_m64(x) (__m64)((__v2di)(x))[0] + +#define arraysize(a) (sizeof(a) / sizeof(*a)) + +static void dump_mem(void *ptr, int nbytes) { + for (int i = 0; i < nbytes; ++i) { + printf(" %02x", ((unsigned char*)ptr)[i]); + } + printf("\n"); +} + +static int get_exc_flags() { + return fetestexcept(FE_ALL_EXCEPT | __FE_DENORM); +} + +static void clear_exc_flags() { + feclearexcept(FE_ALL_EXCEPT | __FE_DENORM); +} + +static void dump_exc_flags(int exc_flags) { + printf("%x", exc_flags); + if (exc_flags & FE_INEXACT) + printf(" inexact"); + if (exc_flags & FE_DIVBYZERO) + printf(" divbyzero"); + if (exc_flags & FE_UNDERFLOW) + printf(" underflow"); + if (exc_flags & FE_OVERFLOW) + printf(" overflow"); + if (exc_flags & FE_INVALID) + printf(" invalid"); + if (exc_flags & __FE_DENORM) + printf(" denormal"); +} + +static void dump_result(int orig_exc, int new_exc, void *orig_data, void *new_data, int nbytes) { + printf(" orig_exc = "); + dump_exc_flags(orig_exc); + printf(" new_exc = "); + dump_exc_flags(new_exc); + printf("\n"); + printf(" orig"); + dump_mem(orig_data, nbytes); + printf(" new "); + dump_mem(new_data, nbytes); +} + +static void check_mismatch(const char *name, int orig_exc, int new_exc, + void *orig_data, void *new_data, int nbytes, + const char *printf_fmt, ...) 
{ + if (orig_exc != new_exc || memcmp(orig_data, new_data, nbytes)) { + va_list args; + va_start(args, printf_fmt); + printf("mismatch %s(", name); + vprintf(printf_fmt, args); + printf("):\n"); + dump_result(orig_exc, new_exc, orig_data, new_data, nbytes); + va_end(args); + } +} + +unsigned short short_vals[] = { + 0x0000, + 0x0001, + 0xffee, + 0xffff, +}; + +unsigned long long padding_mmx_vals[] = { + 0x0000000000000000LL, + 0xffffffffffffffffLL, + 0x7fc000007fc00000LL, // float nan nan + 0xfff8000000000000LL, // -nan +}; + +unsigned long long mmx_vals[] = { + 0x0000000000000000LL, + 0x0000000000000001LL, + 0x0000000000000002LL, + 0x0000000000000003LL, + 0x0000000000000004LL, + 0x0000000000000005LL, + 0x0000000000000006LL, + 0x0000000000000007LL, + 0x0000000000000008LL, + 0x0000000000000009LL, + 0x000000000000000aLL, + 0x000000000000000bLL, + 0x000000000000000cLL, + 0x000000000000000dLL, + 0x000000000000000eLL, + 0x000000000000000fLL, + 0x0000000000000100LL, + 0x0000000000010000LL, + 0x0000000001000000LL, + 0x0000000100000000LL, + 0x0000010000000000LL, + 0x0001000000000000LL, + 0x0100000000000000LL, + 0x0101010101010101LL, + 0x0102030405060708LL, + 0x1234567890abcdefLL, + 0x007f007f007f007fLL, + 0x7f007f007f007f00LL, + 0x7f7f7f7f7f7f7f7fLL, + 0x8000800080008000LL, + 0x0080008000800080LL, + 0x8080808080808080LL, + 0x7fff7fff7fff7fffLL, + 0x8000800080008000LL, + 0x7fffffff7fffffffLL, + 0x8000000080000000LL, + 0x0000777700006666LL, + 0x7777000066660000LL, + 0x0000ffff0000eeeeLL, + 0xffff0000eeee0000LL, + 0x7700660055004400LL, + 0x0077006600550044LL, + 0xff00ee00dd00cc00LL, + 0x00ff00ee00dd00ccLL, + 0xffffffffffffffffLL, + 0x3ff0000000000000LL, // 1.0 + 0x3ff8000000000000LL, // 1.5 + 0x4000000000000000LL, // 2.0 + 0x3f8000003fc00000LL, // float 1.0 1.5 + 0x3fc0000040000000LL, // float 1.5 2.0 + 0x7ff0000000000000LL, // inf + 0x7f8000007f800000LL, // float inf inf + 0xfff0000000000000LL, // -inf + 0xff800000ff800000LL, // float -inf -inf + 0x7ff8000000000000LL, // nan + 0x7fc000007fc00000LL, // float nan nan + 0xfff8000000000000LL, // -nan + 0xffc00000ffc00000LL, // float -nan -nan +}; + +struct __attribute__((aligned(sizeof(__m128)))) Mem { + __m64 dummy; + __m64 m64; +} mem, mem2; + +// These 3 could be autogenerated...but I didn't add support for stores to the generator. 
+void test_stores() {
+  // void _mm_storeh_pi(__m64 * arg_0, __m128 arg_1);
+  for(int l0 = 0; l0 < arraysize(mmx_vals); ++l0) {
+    for(int l1 = 0; l1 < arraysize(mmx_vals); ++l1) {
+      clear_exc_flags();
+      orig__mm_storeh_pi(&mem.m64, (__m128)(__m128i){mmx_vals[l0], mmx_vals[l1]});
+      int orig_exc = get_exc_flags();
+      clear_exc_flags();
+      new__mm_storeh_pi(&mem2.m64, (__m128)(__m128i){mmx_vals[l0], mmx_vals[l1]});
+      int new_exc = get_exc_flags();
+      check_mismatch("_mm_storeh_pi", orig_exc, new_exc, &mem.m64, &mem2.m64, sizeof(__m64),
+                     "&mem.m64, (__m128i){%016llx, %016llx},", mmx_vals[l0], mmx_vals[l1]);
+    }
+  }
+
+  // void _mm_storel_pi(__m64 * arg_0, __m128 arg_1);
+  for(int l0 = 0; l0 < arraysize(mmx_vals); ++l0) {
+    for(int l1 = 0; l1 < arraysize(mmx_vals); ++l1) {
+      clear_exc_flags();
+      orig__mm_storel_pi(&mem.m64, (__m128)(__m128i){mmx_vals[l0], mmx_vals[l1]});
+      int orig_exc = get_exc_flags();
+      clear_exc_flags();
+      new__mm_storel_pi(&mem2.m64, (__m128)(__m128i){mmx_vals[l0], mmx_vals[l1]});
+      int new_exc = get_exc_flags();
+      check_mismatch("_mm_storel_pi", orig_exc, new_exc, &mem.m64, &mem2.m64, sizeof(__m64),
+                     "&mem.m64, (__m128i){%016llx, %016llx},", mmx_vals[l0], mmx_vals[l1]);
+    }
+  }
+
+  // void _mm_stream_pi(__m64 * arg_0, __m64 arg_1);
+  for(int l0 = 0; l0 < arraysize(mmx_vals); ++l0) {
+    clear_exc_flags();
+    orig__mm_stream_pi(&mem.m64, (__m64)mmx_vals[l0]);
+    int orig_exc = get_exc_flags();
+    clear_exc_flags();
+    new__mm_stream_pi(&mem2.m64, (__m64)mmx_vals[l0]);
+    int new_exc = get_exc_flags();
+    check_mismatch("_mm_stream_pi", orig_exc, new_exc, &mem.m64, &mem2.m64, sizeof(__m64),
+                   "&mem.m64, %016llx,", mmx_vals[l0]);
+  }
+}
+
+// Test that the nominally 64-bit maskmove doesn't trap at the edges of
+// non-writable memory, despite being implemented by a 128-bit write.
+void test_maskmove() {
+  // Create a page of memory with an inaccessible page on either side.
+  char *map = mmap(0, 3 * 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+  if (map == MAP_FAILED)
+    abort();
+  if (mprotect(map, 4096, PROT_NONE))
+    abort();
+  if (mprotect(map + 4096 * 2, 4096, PROT_NONE))
+    abort();
+
+  long long init_val = 0xffeeddccbbaa9900;
+  long long expected = 0x11ee3344bb669900;
+  for (int offset = 0; offset < 16+9; ++offset) {
+    char *copy_location = map + 4096 + (offset > 16 ? 4096 - 32 + offset : offset);
+    memcpy(copy_location, &init_val, 8);
+    new__mm_maskmove_si64((__m64)0x1122334455667788LL, (__m64)0x8000808000800000, copy_location);
+    long long result;
+    memcpy(&result, copy_location, 8);
+    if (memcmp(&expected, &result, 8) != 0) {
+      printf("test_maskmove: wrong value was stored %llx vs %llx\n", result, expected);
+      return;
+    }
+  }
+}
+
+void test_generated() {
+  #include "gen_test.inc"
+}
+
+int main() {
+  int rounding[] = {FE_TONEAREST, FE_UPWARD, FE_DOWNWARD, FE_TOWARDZERO};
+  for (int i = 0; i < 4; ++i)
+  {
+    fesetround(rounding[i]);
+
+    test_maskmove();
+    test_stores();
+    test_generated();
+  }
+}
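For reference, gen_test.inc (pulled in by test_generated() above) is one loop nest per intrinsic, emitted by define_tests() in mmx-tests.py. Roughly what the generated block for _mm_add_pi8 looks like without --use-xmm, relying on the mmx_vals table and helpers defined in test.c; the exact formatting comes from the generator:

  // __m64 _mm_add_pi8(__m64 arg_0, __m64 arg_1);
  for (int l0 = 0; l0 < arraysize(mmx_vals); ++l0) {
    for (int l1 = 0; l1 < arraysize(mmx_vals); ++l1) {
      clear_exc_flags();
      __m64 orig_res = orig__mm_add_pi8((__m64)mmx_vals[l0], (__m64)mmx_vals[l1]);
      int orig_exc = get_exc_flags();
      clear_exc_flags();
      __m64 new_res = new__mm_add_pi8((__m64)mmx_vals[l0], (__m64)mmx_vals[l1]);
      int new_exc = get_exc_flags();
      check_mismatch("_mm_add_pi8", orig_exc, new_exc, &orig_res, &new_res, sizeof(orig_res),
                     "%016llx, %016llx", mmx_vals[l0], mmx_vals[l1]);
    }
  }

Each intrinsic is evaluated through both the old-compiler and new-compiler wrappers, and any difference in the result bytes or in the raised FP exception flags is reported by check_mismatch().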