Index: lib/Headers/__wmmintrin_aes.h =================================================================== --- lib/Headers/__wmmintrin_aes.h +++ lib/Headers/__wmmintrin_aes.h @@ -28,36 +28,120 @@ /* Define the default attributes for the functions in this file. */ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("aes"))) +/// \brief Performs a single round of AES encryption, transforming the state +/// value from the first source operand using a round key value contained +/// in the second source operand, and writes the result to the +/// destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VAESENC instruction. +/// +/// \param __V +/// A 128-bit integer vector containing the state value. +/// \param __R +/// A 128-bit integer vector containing the round key value. +/// \returns A 128-bit integer vector containing the encrypted value. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_aesenc_si128(__m128i __V, __m128i __R) { return (__m128i)__builtin_ia32_aesenc128(__V, __R); } +/// \brief Performs the final round of AES encryption, transforming the state +/// value from the first source operand using a round key value contained +/// in the second source operand, and writes the result to the +/// destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VAESENCLAST instruction. +/// +/// \param __V +/// A 128-bit integer vector containing the state value. +/// \param __R +/// A 128-bit integer vector containing the round key value. +/// \returns A 128-bit integer vector containing the encrypted value. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_aesenclast_si128(__m128i __V, __m128i __R) { return (__m128i)__builtin_ia32_aesenclast128(__V, __R); } +/// \brief Performs a single round of AES decryption, transforming the state +/// value from the first source operand using a round key value contained +/// in the second source operand, and writes the result to the +/// destination. 
+/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VAESDEC instruction. +/// +/// \param __V +/// A 128-bit integer vector containing the state value. +/// \param __R +/// A 128-bit integer vector containing the round key value. +/// \returns A 128-bit integer vector containing the decrypted value. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_aesdec_si128(__m128i __V, __m128i __R) { return (__m128i)__builtin_ia32_aesdec128(__V, __R); } +/// \brief Performs the final round of AES decryption, transforming the state +/// value from the first source operand using a round key value contained +/// in the second source operand, and writes the result to the +/// destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VAESDECLAST instruction. +/// +/// \param __V +/// A 128-bit integer vector containing the state value. +/// \param __R +/// A 128-bit integer vector containing the round key value. +/// \returns A 128-bit integer vector containing the decrypted value. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_aesdeclast_si128(__m128i __V, __m128i __R) { return (__m128i)__builtin_ia32_aesdeclast128(__V, __R); } +/// \brief Applies the AES InvMixColumns() transformation to an expanded key +/// contained in the source operand, and writes the result to the +/// destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VAESIMC instruction. +/// +/// \param __V +/// A 128-bit integer vector containing the expanded key. +/// \returns A 128-bit integer vector containing the transformed value. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_aesimc_si128(__m128i __V) { return (__m128i)__builtin_ia32_aesimc128(__V); } +/// \brief Expands the round key value contained in the first source operand +/// using a round constant specified by the second source operand, and +/// writes the result to the destination. 
+/// +/// \headerfile +/// +/// \code +/// __m128i _mm_aeskeygenassist_si128(__m128i C, const int R); +/// \endcode +/// +/// This intrinsic corresponds to \c AESKEYGENASSIST instruction. +/// +/// \param C +/// A 128-bit integer vector containing the round key value. +/// \param R +/// An 8-bit integer containing the round constant. +/// \returns A 128-bit integer vector containing the expanded round key value. #define _mm_aeskeygenassist_si128(C, R) \ (__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(C), (int)(R)) Index: lib/Headers/__wmmintrin_pclmul.h =================================================================== --- lib/Headers/__wmmintrin_pclmul.h +++ lib/Headers/__wmmintrin_pclmul.h @@ -23,6 +23,36 @@ #ifndef _WMMINTRIN_PCLMUL_H #define _WMMINTRIN_PCLMUL_H +/// \brief Multiplies two 64-bit integer values, selected from the operands +/// using the immediate value operand. The multiplication is a carry-less +/// multiplication, and the 128-bit integer product is stored in the +/// destination. +/// +/// \headerfile +/// +/// \code +/// __m128i _mm_clmulepi64_si128(__m128i __X, __m128i __Y, const int __I); +/// \endcode +/// +/// This intrinsic corresponds to \c VPCLMULQDQ instruction. +/// +/// \param __X +/// A 128-bit vector of [2 x i64] containing one of the source +/// operands. +/// \param __Y +/// A 128-bit vector of [2 x i64] containing one of the source +/// operands. +/// \param __I +/// An immediate value specifying which 64-bit values to select +/// from the operands. +/// Bit 0 is used to select a value from operand __X, +/// and bit 4 is used to select a value from operand __Y: +/// Bit[0]=0 indicates that bits[63:0] of operand __X are used. +/// Bit[0]=1 indicates that bits[127:64] of operand __X are used. +/// Bit[4]=0 indicates that bits[63:0] of operand __Y are used. +/// Bit[4]=1 indicates that bits[127:64] of operand __Y are used. 
+/// \returns The 128-bit integer vector containing the result of the carry-less +/// multiplication of the selected 64-bit values. #define _mm_clmulepi64_si128(__X, __Y, __I) \ ((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(__X), \ (__v2di)(__m128i)(__Y), (char)(__I))) Index: lib/Headers/avxintrin.h =================================================================== --- lib/Headers/avxintrin.h +++ lib/Headers/avxintrin.h @@ -47,117 +47,412 @@ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx"))) /* Arithmetic */ +/// \brief Adds 2 packed 256-bit vectors of [4 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VADDPD instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double] containing one of the +/// source operands. +/// \param __b +/// A 256-bit vector of [4 x double] containing one of the +/// source operands. +/// \returns A 256-bit vector of [4 x double] containing the sums of both operands. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_add_pd(__m256d __a, __m256d __b) { return __a+__b; } +/// \brief Adds 2 packed 256-bit vectors of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VADDPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float] containing one of the source +/// operands. +/// \param __b +/// A 256-bit vector of [8 x float] containing one of the source +/// operands. +/// \returns A 256-bit vector of [8 x float] containing the sums of both operands. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_add_ps(__m256 __a, __m256 __b) { return __a+__b; } +/// \brief Subtracts 2 packed 256-bit vectors of [4 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VSUBPD instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double] containing the minuend. +/// \param __b +/// A 256-bit vector of [4 x double] containing the subtrahend. 
+/// \returns A 256-bit vector of [4 x double] containing the differences +/// between both operands. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_sub_pd(__m256d __a, __m256d __b) { return __a-__b; } +/// \brief Subtracts 2 packed 256-bit vectors of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VSUBPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float] containing the minuend. +/// \param __b +/// A 256-bit vector of [8 x float] containing the subtrahend. +/// \returns A 256-bit vector of [8 x float] containing the differences between +/// both operands. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_sub_ps(__m256 __a, __m256 __b) { return __a-__b; } +/// \brief Adds the even-indexed values and subtracts the odd-indexed values of +/// 2 packed 256-bit vectors of [4 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VADDSUBPD instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double] containing the left source +/// operand. +/// \param __b +/// A 256-bit vector of [4 x double] containing the right source +/// operand. +/// \returns A 256-bit vector of [4 x double] containing the alternating sums +/// and differences between both operands. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_addsub_pd(__m256d __a, __m256d __b) { return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b); } +/// \brief Adds the even-indexed values and subtracts the odd-indexed values of +/// 2 packed 256-bit vectors of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VADDSUBPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float] containing the left source +/// operand. +/// \param __b +/// A 256-bit vector of [8 x float] containing the right source +/// operand. +/// \returns A 256-bit vector of [8 x float] containing the alternating sums +/// and differences between both operands. 
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_addsub_ps(__m256 __a, __m256 __b) { return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b); } +/// \brief Divides 2 packed 256-bit vectors of [4 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VDIVPD instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double] containing the dividend. +/// \param __b +/// A 256-bit vector of [4 x double] containing the divisor. +/// \returns A 256-bit vector of [4 x double] containing the quotients between +/// both operands. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_div_pd(__m256d __a, __m256d __b) { return __a / __b; } +/// \brief Divides 2 packed 256-bit vectors of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VDIVPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float] containing the dividend. +/// \param __b +/// A 256-bit vector of [8 x float] containing the divisor. +/// \returns A 256-bit vector of [8 x float] containing the quotients between +/// both operands. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_div_ps(__m256 __a, __m256 __b) { return __a / __b; } +/// \brief Compares 2 packed 256-bit vectors of [4 x double] and stores the +/// greater of each pair of values. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMAXPD instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double] containing one of the +/// operands. +/// \param __b +/// A 256-bit vector of [4 x double] containing one of the +/// operands. +/// \returns A 256-bit vector of [4 x double] containing the maximum values +/// between both operands. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_max_pd(__m256d __a, __m256d __b) { return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b); } +/// \brief Compares 2 packed 256-bit vectors of [8 x float] and stores the +/// greater of each pair of values. 
+/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMAXPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float] containing one of the +/// operands. +/// \param __b +/// A 256-bit vector of [8 x float] containing one of the +/// operands. +/// \returns A 256-bit vector of [8 x float] containing the maximum values +/// between both operands. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_max_ps(__m256 __a, __m256 __b) { return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b); } +/// \brief Compares 2 packed 256-bit vectors of [4 x double] and stores the +/// lesser of each pair of values. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMINPD instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double] containing one of the +/// operands. +/// \param __b +/// A 256-bit vector of [4 x double] containing one of the +/// operands. +/// \returns A 256-bit vector of [4 x double] containing the minimum values +/// between both operands. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_min_pd(__m256d __a, __m256d __b) { return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b); } +/// \brief Compares 2 packed 256-bit vectors of [8 x float] and stores the +/// lesser of each pair of values. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMINPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float] containing one of the +/// operands. +/// \param __b +/// A 256-bit vector of [8 x float] containing one of the +/// operands. +/// \returns A 256-bit vector of [8 x float] containing the minimum values +/// between both operands. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_min_ps(__m256 __a, __m256 __b) { return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b); } +/// \brief Multiplies 2 packed 256-bit vectors of [4 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMULPD instruction. 
+/// +/// \param __a +/// A 256-bit vector of [4 x double] containing one of the +/// operands. +/// \param __b +/// A 256-bit vector of [4 x double] containing one of the +/// operands. +/// \returns A 256-bit vector of [4 x double] containing the products between +/// both operands. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_mul_pd(__m256d __a, __m256d __b) { return __a * __b; } +/// \brief Multiplies 2 packed 256-bit vectors of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMULPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float] containing one of the +/// operands. +/// \param __b +/// A 256-bit vector of [8 x float] containing one of the +/// operands. +/// \returns A 256-bit vector of [8 x float] containing the products between +/// both operands. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_mul_ps(__m256 __a, __m256 __b) { return __a * __b; } +/// \brief Calculates the square roots of the values stored in a packed 256-bit +/// vector of [4 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VSQRTPD instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double] values. +/// \returns A 256-bit vector of [4 x double] containing the square roots of +/// the values in the operand. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_sqrt_pd(__m256d __a) { return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a); } +/// \brief Calculates the square roots of the values stored in a packed 256-bit +/// vector of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VSQRTPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float] values. +/// \returns A 256-bit vector of [8 x float] containing the square roots of the +/// values in the operand. 
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_sqrt_ps(__m256 __a) { return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a); } +/// \brief Calculates the reciprocal square roots of the values stored in a +/// packed 256-bit vector of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VRSQRTPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float] values. +/// \returns A 256-bit vector of [8 x float] containing the reciprocal square +/// roots of the values in the operand. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_rsqrt_ps(__m256 __a) { return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a); } +/// \brief Calculates the reciprocals of the values stored in a packed 256-bit +/// vector of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VRCPPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float] values. +/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the +/// values in the operand. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_rcp_ps(__m256 __a) { return (__m256)__builtin_ia32_rcpps256((__v8sf)__a); } +/// \brief Rounds the values stored in a packed 256-bit vector of [4 x double] +/// as specified by the byte operand. The source values are rounded to +/// integer values and returned as 64-bit double-precision floating-point +/// values. +/// +/// \headerfile +/// +/// \code +/// __m256d _mm256_round_pd(__m256d V, const int M); +/// \endcode +/// +/// This intrinsic corresponds to \c VROUNDPD instruction. +/// +/// \param V +/// A 256-bit vector of [4 x double] values. +/// \param M +/// An integer value that specifies the rounding operation. +/// Bits [7:4] are reserved. 
+/// Bit [3] is a precision exception value: +/// 0: A normal PE exception is used +/// 1: The PE field is not updated +/// Bit [2] is the rounding control source: +/// 0: Use bits [1:0] of M +/// 1: Use the current MXCSR setting +/// Bits [1:0] contain the rounding control definition: +/// 00: Nearest +/// 01: Downward (toward negative infinity) +/// 10: Upward (toward positive infinity) +/// 11: Truncated +/// \returns A 256-bit vector of [4 x double] containing the rounded values. #define _mm256_round_pd(V, M) __extension__ ({ \ (__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)); }) +/// \brief Rounds the values stored in a packed 256-bit vector of [8 x float] +/// as specified by the byte operand. The source values are rounded to +/// integer values and returned as floating-point values. +/// +/// \headerfile +/// +/// \code +/// __m256 _mm256_round_ps(__m256 V, const int M); +/// \endcode +/// +/// This intrinsic corresponds to \c VROUNDPS instruction. +/// +/// \param V +/// A 256-bit vector of [8 x float] values. +/// \param M +/// An integer value that specifies the rounding operation. +/// Bits [7:4] are reserved. +/// Bit [3] is a precision exception value: +/// 0: A normal PE exception is used +/// 1: The PE field is not updated +/// Bit [2] is the rounding control source: +/// 0: Use bits [1:0] of M +/// 1: Use the current MXCSR setting +/// Bits [1:0] contain the rounding control definition: +/// 00: Nearest +/// 01: Downward (toward negative infinity) +/// 10: Upward (toward positive infinity) +/// 11: Truncated +/// \returns A 256-bit vector of [8 x float] containing the rounded values. #define _mm256_round_ps(V, M) __extension__ ({ \ (__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)); }) @@ -167,48 +462,165 @@ #define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR) /* Logical */ +/// \brief Performs a bitwise AND of 2 packed 256-bit vectors of [4 x double]. 
+/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VANDPD instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double] containing one of the +/// source operands. +/// \param __b +/// A 256-bit vector of [4 x double] containing one of the +/// source operands. +/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the +/// values between both operands. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_and_pd(__m256d __a, __m256d __b) { return (__m256d)((__v4di)__a & (__v4di)__b); } +/// \brief Performs a bitwise AND of 2 packed 256-bit vectors of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VANDPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float] containing one of the source +/// operands. +/// \param __b +/// A 256-bit vector of [8 x float] containing one of the source +/// operands. +/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the +/// values between both operands. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_and_ps(__m256 __a, __m256 __b) { return (__m256)((__v8si)__a & (__v8si)__b); } +/// \brief Performs a bitwise AND of 2 packed 256-bit vectors of [4 x double], +/// using the ones-complement of the values contained in the first +/// source operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VANDNPD instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double] containing the left source +/// operand. The ones complement of this value is used in +/// the bitwise AND. +/// \param __b +/// A 256-bit vector of [4 x double] containing the right source +/// operand. +/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the +/// values of the second operand and the ones-complement of the +/// first operand. 
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_andnot_pd(__m256d __a, __m256d __b) { return (__m256d)(~(__v4di)__a & (__v4di)__b); } +/// \brief Performs a bitwise AND of 2 packed 256-bit vectors of [8 x float], +/// using the ones-complement of the values contained in the first +/// source operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VANDNPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float] containing the left source +/// operand. The ones complement of this value is used in +/// the bitwise AND. +/// \param __b +/// A 256-bit vector of [8 x float] containing the right source +/// operand. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_andnot_ps(__m256 __a, __m256 __b) { return (__m256)(~(__v8si)__a & (__v8si)__b); } +/// \brief Performs a bitwise OR of 2 packed 256-bit vectors of [4 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VORPD instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double] containing one of the +/// source operands. +/// \param __b +/// A 256-bit vector of [4 x double] containing one of the +/// source operands. +/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the +/// values between both operands. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_or_pd(__m256d __a, __m256d __b) { return (__m256d)((__v4di)__a | (__v4di)__b); } +/// \brief Performs a bitwise OR of 2 packed 256-bit vectors of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VORPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float] containing one of the source +/// operands. +/// \param __b +/// A 256-bit vector of [8 x float] containing one of the source +/// operands. +/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the +/// values between both operands. 
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_or_ps(__m256 __a, __m256 __b) { return (__m256)((__v8si)__a | (__v8si)__b); } +/// \brief Performs a bitwise XOR of 2 packed 256-bit vectors of [4 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VXORPD instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double] containing one of the +/// source operands. +/// \param __b +/// A 256-bit vector of [4 x double] containing one of the +/// source operands. +/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the +/// values between both operands. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_xor_pd(__m256d __a, __m256d __b) { return (__m256d)((__v4di)__a ^ (__v4di)__b); } +/// \brief Performs a bitwise XOR of 2 packed 256-bit vectors of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VXORPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float] containing one of the source +/// operands. +/// \param __b +/// A 256-bit vector of [8 x float] containing one of the source +/// operands. +/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the +/// values between both operands. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_xor_ps(__m256 __a, __m256 __b) { @@ -216,24 +628,100 @@ } /* Horizontal arithmetic */ +/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed +/// 256-bit vectors of [4 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VHADDPD instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double] containing one of the +/// source operands. The horizontal sums of the values are +/// stored in the low-order (even-indexed) elements of the +/// destination. +/// \param __b +/// A 256-bit vector of [4 x double] containing one of the +/// source operands. The horizontal sums of the values are +/// stored in the high-order (odd-indexed) elements of the +/// destination. 
+/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of +/// both operands. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_hadd_pd(__m256d __a, __m256d __b) { return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b); } +/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed +/// 256-bit vectors of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VHADDPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float] containing one of the source +/// operands. The horizontal sums of the values are stored in +/// the low-order elements (index 0, 1, 4, 5) of the +/// destination. +/// \param __b +/// A 256-bit vector of [8 x float] containing one of the source +/// operands. The horizontal sums of the values are stored in +/// the high-order elements (index 2, 3, 6, 7) of the +/// destination. +/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of +/// both operands. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_hadd_ps(__m256 __a, __m256 __b) { return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b); } +/// \brief Horizontally subtracts the adjacent pairs of values contained in 2 +/// packed 256-bit vectors of [4 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VHSUBPD instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double] containing one of the +/// source operands. The horizontal differences between the +/// values are stored in the low-order (even-indexed) elements +/// of the destination. +/// \param __b +/// A 256-bit vector of [4 x double] containing one of the +/// source operands. The horizontal differences between the +/// values are stored in the high-order (odd-indexed) elements +/// of the destination. +/// \returns A 256-bit vector of [4 x double] containing the horizontal +/// differences of both operands. 
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_hsub_pd(__m256d __a, __m256d __b) { return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b); } +/// \brief Horizontally subtracts the adjacent pairs of values contained in 2 +/// packed 256-bit vectors of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VHSUBPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float] containing one of the source +/// operands. The horizontal differences between the values are +/// stored in the low-order elements (index 0, 1, 4, 5) of the +/// destination. +/// \param __b +/// A 256-bit vector of [8 x float] containing one of the source +/// operands. The horizontal differences between the values are +/// stored in the high-order elements (index 2, 3, 6, 7) of the +/// destination. +/// \returns A 256-bit vector of [8 x float] containing the horizontal +/// differences of both operands. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_hsub_ps(__m256 __a, __m256 __b) { @@ -241,35 +729,289 @@ } /* Vector permutations */ +/// \brief Copies the values stored in a packed 128-bit vector of [2 x double] +/// as specified by the 128-bit integer vector operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPERMILPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] values. +/// \param __c +/// A 128-bit integer vector operand specifying how the values +/// are to be copied. +/// Bit [1]: +/// 0: Bits [63:0] of the source are copied to bits [63:0] of +/// the destination +/// 1: Bits [127:64] of the source are copied to bits [63:0] of +/// the destination +/// Bit [65]: +/// 0: Bits [63:0] of the source are copied to bits [127:64] of +/// the destination +/// 1: Bits [127:64] of the source are copied to bits [127:64] +/// of the destination +/// \returns A 128-bit vector of [2 x double] containing the copied values. 
static __inline __m128d __DEFAULT_FN_ATTRS _mm_permutevar_pd(__m128d __a, __m128i __c) { return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c); } +/// \brief Copies the values stored in a packed 256-bit vector of [4 x double] +/// as specified by the 256-bit integer vector operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPERMILPD instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double] values. +/// \param __c +/// A 256-bit integer vector operand specifying how the values +/// are to be copied. +/// Bit [1]: +/// 0: Bits [63:0] of the source are copied to bits [63:0] of +/// the destination +/// 1: Bits [127:64] of the source are copied to bits [63:0] of +/// the destination +/// Bit [65]: +/// 0: Bits [63:0] of the source are copied to bits [127:64] of +/// the destination +/// 1: Bits [127:64] of the source are copied to bits [127:64] +/// of the destination +/// Bit [129]: +/// 0: Bits [191:128] of the source are copied to bits [191:128] +/// of the destination +/// 1: Bits [255:192] of the source are copied to bits [191:128] +/// of the destination +/// Bit [193]: +/// 0: Bits [191:128] of the source are copied to bits [255:192] +/// of the destination +/// 1: Bits [255:192] of the source are copied to bits [255:192] +/// of the destination +/// \returns A 256-bit vector of [4 x double] containing the copied values. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_permutevar_pd(__m256d __a, __m256i __c) { return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c); } +/// \brief Copies the values stored in a packed 128-bit vector of [4 x float] +/// as specified by the 128-bit integer vector operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPERMILPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] values. +/// \param __c +/// A 128-bit integer vector operand specifying how the values +/// are to be copied. 
+/// Bits [1:0]: +/// 00: Bits [31:0] of the source are copied to bits [31:0] of +/// the destination +/// 01: Bits [63:32] of the source are copied to bits [31:0] of +/// the destination +/// 10: Bits [95:64] of the source are copied to bits [31:0] of +/// the destination +/// 11: Bits [127:96] of the source are copied to bits [31:0] of +/// the destination +/// Bits [33:32]: +/// 00: Bits [31:0] of the source are copied to bits [63:32] of +/// the destination +/// 01: Bits [63:32] of the source are copied to bits [63:32] of +/// the destination +/// 10: Bits [95:64] of the source are copied to bits [63:32] of +/// the destination +/// 11: Bits [127:96] of the source are copied to bits [63:32] +/// of the destination +/// Bits [65:64]: +/// 00: Bits [31:0] of the source are copied to bits [95:64] of +/// the destination +/// 01: Bits [63:32] of the source are copied to bits [95:64] of +/// the destination +/// 10: Bits [95:64] of the source are copied to bits [95:64] of +/// the destination +/// 11: Bits [127:96] of the source are copied to bits [95:64] +/// of the destination +/// Bits [97:96]: +/// 00: Bits [31:0] of the source are copied to bits [127:96] of +/// the destination +/// 01: Bits [63:32] of the source are copied to bits [127:96] +/// of the destination +/// 10: Bits [95:64] of the source are copied to bits [127:96] +/// of the destination +/// 11: Bits [127:96] of the source are copied to bits [127:96] +/// of the destination +/// \returns A 128-bit vector of [4 x float] containing the copied values. static __inline __m128 __DEFAULT_FN_ATTRS _mm_permutevar_ps(__m128 __a, __m128i __c) { return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c); } +/// \brief Copies the values stored in a packed 256-bit vector of [8 x float] +/// as specified by the 256-bit integer vector operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPERMILPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float] values. 
+/// \param __c +/// A 256-bit integer vector operand specifying how the values +/// are to be copied. +/// Bits [1:0]: +/// 00: Bits [31:0] of the source are copied to bits [31:0] of +/// the destination +/// 01: Bits [63:32] of the source are copied to bits [31:0] of +/// the destination +/// 10: Bits [95:64] of the source are copied to bits [31:0] of +/// the destination +/// 11: Bits [127:96] of the source are copied to bits [31:0] of +/// the destination +/// Bits [33:32]: +/// 00: Bits [31:0] of the source are copied to bits [63:32] of +/// the destination +/// 01: Bits [63:32] of the source are copied to bits [63:32] of +/// the destination +/// 10: Bits [95:64] of the source are copied to bits [63:32] of +/// the destination +/// 11: Bits [127:96] of the source are copied to bits [63:32] +/// of the destination +/// Bits [65:64]: +/// 00: Bits [31:0] of the source are copied to bits [95:64] of +/// the destination +/// 01: Bits [63:32] of the source are copied to bits [95:64] of +/// the destination +/// 10: Bits [95:64] of the source are copied to bits [95:64] of +/// the destination +/// 11: Bits [127:96] of the source are copied to bits [95:64] +/// of the destination +/// Bits [97:96]: +/// 00: Bits [31:0] of the source are copied to bits [127:96] of +/// the destination +/// 01: Bits [63:32] of the source are copied to bits [127:96] +/// of the destination +/// 10: Bits [95:64] of the source are copied to bits [127:96] +/// of the destination +/// 11: Bits [127:96] of the source are copied to bits [127:96] +/// of the destination +/// Bits [129:128]: +/// 00: Bits [159:128] of the source are copied to bits +/// [159:128] of the destination +/// 01: Bits [191:160] of the source are copied to bits +/// [159:128] of the destination +/// 10: Bits [223:192] of the source are copied to bits +/// [159:128] of the destination +/// 11: Bits [255:224] of the source are copied to bits +/// [159:128] of the destination +/// Bits [161:160]: +/// 00: Bits [159:128] 
of the source are copied to bits +/// [191:160] of the destination +/// 01: Bits [191:160] of the source are copied to bits +/// [191:160] of the destination +/// 10: Bits [223:192] of the source are copied to bits +/// [191:160] of the destination +/// 11: Bits [255:224] of the source are copied to bits +/// [191:160] of the destination +/// Bits [193:192]: +/// 00: Bits [159:128] of the source are copied to bits +/// [223:192] of the destination +/// 01: Bits [191:160] of the source are copied to bits +/// [223:192] of the destination +/// 10: Bits [223:192] of the source are copied to bits +/// [223:192] of the destination +/// 11: Bits [255:224] of the source are copied to bits +/// [223:192] of the destination +/// Bits [225:224]: +/// 00: Bits [159:128] of the source are copied to bits +/// [255:224] of the destination +/// 01: Bits [191:160] of the source are copied to bits +/// [255:224] of the destination +/// 10: Bits [223:192] of the source are copied to bits +/// [255:224] of the destination +/// 11: Bits [255:224] of the source are copied to bits +/// [255:224] of the destination +/// \returns A 256-bit vector of [8 x float] containing the copied values. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_permutevar_ps(__m256 __a, __m256i __c) { return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c); } +/// \brief Copies the values stored in a packed 128-bit vector of [2 x double] +/// as specified by the integer operand. +/// +/// \headerfile +/// +/// \code +/// __m128d _mm_permute_pd(__m128d A, const int C); +/// \endcode +/// +/// This intrinsic corresponds to \c VPERMILPD instruction. +/// +/// \param A +/// A 128-bit vector of [2 x double] values. +/// \param C +/// An immediate integer operand specifying how the values are +/// to be copied. 
+/// Bit [0]: +/// 0: Bits [63:0] of the source are copied to bits [63:0] of +/// the destination +/// 1: Bits [127:64] of the source are copied to bits [63:0] of +/// the destination +/// Bit [1]: +/// 0: Bits [63:0] of the source are copied to bits [127:64] of +/// the destination +/// 1: Bits [127:64] of the source are copied to bits [127:64] +/// of the destination +/// \returns A 128-bit vector of [2 x double] containing the copied values. #define _mm_permute_pd(A, C) __extension__ ({ \ (__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \ (__v2df)_mm_setzero_pd(), \ (C) & 0x1, ((C) & 0x2) >> 1); }) +/// \brief Copies the values stored in a packed 256-bit vector of [4 x double] +/// as specified by the integer operand. +/// +/// \headerfile +/// +/// \code +/// __m256d _mm256_permute_pd(__m256d A, const int C); +/// \endcode +/// +/// This intrinsic corresponds to \c VPERMILPD instruction. +/// +/// \param A +/// A 256-bit vector of [4 x double] values. +/// \param C +/// An immediate integer operand specifying how the values are +/// to be copied. +/// Bit [0]: +/// 0: Bits [63:0] of the source are copied to bits [63:0] of +/// the destination +/// 1: Bits [127:64] of the source are copied to bits [63:0] of +/// the destination +/// Bit [1]: +/// 0: Bits [63:0] of the source are copied to bits [127:64] of +/// the destination +/// 1: Bits [127:64] of the source are copied to bits [127:64] +/// of the destination +/// Bit [2]: +/// 0: Bits [191:128] of the source are copied to bits [191:128] +/// of the destination +/// 1: Bits [255:192] of the source are copied to bits [191:128] +/// of the destination +/// Bit [3]: +/// 0: Bits [191:128] of the source are copied to bits [255:192] +/// of the destination +/// 1: Bits [255:192] of the source are copied to bits [255:192] +/// of the destination +/// \returns A 256-bit vector of [4 x double] containing the copied values. 
#define _mm256_permute_pd(A, C) __extension__ ({ \ (__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \ (__v4df)_mm256_setzero_pd(), \ @@ -277,12 +1019,154 @@ 2 + (((C) & 0x4) >> 2), \ 2 + (((C) & 0x8) >> 3)); }) +/// \brief Copies the values stored in a packed 128-bit vector of [4 x float] +/// as specified by the integer operand. +/// +/// \headerfile +/// +/// \code +/// __m128 _mm_permute_ps(__m128 A, const int C); +/// \endcode +/// +/// This intrinsic corresponds to \c VPERMILPS instruction. +/// +/// \param A +/// A 128-bit vector of [4 x float] values. +/// \param C +/// An immediate integer operand specifying how the values are +/// to be copied. +/// Bits [1:0]: +/// 00: Bits [31:0] of the source are copied to bits [31:0] of +/// the destination +/// 01: Bits [63:32] of the source are copied to bits [31:0] of +/// the destination +/// 10: Bits [95:64] of the source are copied to bits [31:0] of +/// the destination +/// 11: Bits [127:96] of the source are copied to bits [31:0] of +/// the destination +/// Bits [3:2]: +/// 00: Bits [31:0] of the source are copied to bits [63:32] of +/// the destination +/// 01: Bits [63:32] of the source are copied to bits [63:32] of +/// the destination +/// 10: Bits [95:64] of the source are copied to bits [63:32] of +/// the destination +/// 11: Bits [127:96] of the source are copied to bits [63:32] +/// of the destination +/// Bits [5:4]: +/// 00: Bits [31:0] of the source are copied to bits [95:64] of +/// the destination +/// 01: Bits [63:32] of the source are copied to bits [95:64] of +/// the destination +/// 10: Bits [95:64] of the source are copied to bits [95:64] of +/// the destination +/// 11: Bits [127:96] of the source are copied to bits [95:64] +/// of the destination +/// Bits [7:6]: +/// 00: Bits [31:0] of the source are copied to bits [127:96] of +/// the destination +/// 01: Bits [63:32] of the source are copied to bits [127:96] +/// of the destination +/// 10: Bits [95:64] of the source are copied 
to bits [127:96] +/// of the destination +/// 11: Bits [127:96] of the source are copied to bits [127:96] +/// of the destination +/// \returns A 128-bit vector of [4 x float] containing the copied values. #define _mm_permute_ps(A, C) __extension__ ({ \ (__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \ (__v4sf)_mm_setzero_ps(), \ (C) & 0x3, ((C) & 0xc) >> 2, \ ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6); }) +/// \brief Copies the values stored in a packed 256-bit vector of [8 x float] +/// as specified by the integer operand. +/// +/// \headerfile +/// +/// \code +/// __m256 _mm256_permute_ps(__m256 A, const int C); +/// \endcode +/// +/// This intrinsic corresponds to \c VPERMILPS instruction. +/// +/// \param A +/// A 256-bit vector of [8 x float] values. +/// \param C +/// An immediate integer operand specifying how the values are +/// to be copied. +/// Bits [1:0]: +/// 00: Bits [31:0] of the source are copied to bits [31:0] of +/// the destination +/// 01: Bits [63:32] of the source are copied to bits [31:0] of +/// the destination +/// 10: Bits [95:64] of the source are copied to bits [31:0] of +/// the destination +/// 11: Bits [127:96] of the source are copied to bits [31:0] of +/// the destination +/// Bits [3:2]: +/// 00: Bits [31:0] of the source are copied to bits [63:32] of +/// the destination +/// 01: Bits [63:32] of the source are copied to bits [63:32] of +/// the destination +/// 10: Bits [95:64] of the source are copied to bits [63:32] of +/// the destination +/// 11: Bits [127:96] of the source are copied to bits [63:32] +/// of the destination +/// Bits [5:4]: +/// 00: Bits [31:0] of the source are copied to bits [95:64] of +/// the destination +/// 01: Bits [63:32] of the source are copied to bits [95:64] of +/// the destination +/// 10: Bits [95:64] of the source are copied to bits [95:64] of +/// the destination +/// 11: Bits [127:96] of the source are copied to bits [95:64] +/// of the destination +/// Bits [7:6]: +/// 00: Bits [31:0] of the 
source are copied to bits [127:96] of +/// the destination +/// 01: Bits [63:32] of the source are copied to bits [127:96] +/// of the destination +/// 10: Bits [95:64] of the source are copied to bits [127:96] +/// of the destination +/// 11: Bits [127:96] of the source are copied to bits [127:96] +/// of the destination +/// Bits [1:0] (upper 128 bits): +/// 00: Bits [159:128] of the source are copied to bits +/// [159:128] of the destination +/// 01: Bits [191:160] of the source are copied to bits +/// [159:128] of the destination +/// 10: Bits [223:192] of the source are copied to bits +/// [159:128] of the destination +/// 11: Bits [255:224] of the source are copied to bits +/// [159:128] of the destination +/// Bits [3:2] (upper 128 bits): +/// 00: Bits [159:128] of the source are copied to bits +/// [191:160] of the destination +/// 01: Bits [191:160] of the source are copied to bits +/// [191:160] of the destination +/// 10: Bits [223:192] of the source are copied to bits +/// [191:160] of the destination +/// 11: Bits [255:224] of the source are copied to bits +/// [191:160] of the destination +/// Bits [5:4] (upper 128 bits): +/// 00: Bits [159:128] of the source are copied to bits +/// [223:192] of the destination +/// 01: Bits [191:160] of the source are copied to bits +/// [223:192] of the destination +/// 10: Bits [223:192] of the source are copied to bits +/// [223:192] of the destination +/// 11: Bits [255:224] of the source are copied to bits +/// [223:192] of the destination +/// Bits [7:6] (upper 128 bits): +/// 00: Bits [159:128] of the source are copied to bits +/// [255:224] of the destination +/// 01: Bits [191:160] of the source are copied to bits +/// [255:224] of the destination +/// 10: Bits [223:192] of the source are copied to bits +/// [255:224] of the destination +/// 11: Bits [255:224] of the source are copied to bits +/// [255:224] of the destination +/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute_ps(A, C) __extension__ ({ \ (__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \ (__v8sf)_mm256_setzero_ps(), \ @@ -293,19 +1177,156 @@ 4 + (((C) & 0x30) >> 4), \ 4 + (((C) & 0xc0) >> 6)); }) +/// \brief Copies 128-bit data values stored in two packed 256-bit vectors of +/// [4 x double], as specified by the integer operand. +/// +/// \headerfile +/// +/// \code +/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M); +/// \endcode +/// +/// This intrinsic corresponds to \c VPERM2F128 instruction. +/// +/// \param V1 +/// A 256-bit vector of [4 x double] values. +/// \param V2 +/// A 256-bit vector of [4 x double] values. +/// \param M +/// An immediate integer operand specifying how the values are +/// to be copied. +/// Bits [1:0]: +/// 00: Bits [127:0] of operand V1 are +/// copied to bits [127:0] of the destination +/// 01: Bits [255:128] of operand V1 are +/// copied to bits [127:0] of the destination +/// 10: Bits [127:0] of operand V2 are +/// copied to bits [127:0] of the destination +/// 11: Bits [255:128] of operand V2 are +/// copied to bits [127:0] of the destination +/// Bits [5:4]: +/// 00: Bits [127:0] of operand V1 are +/// copied to bits [255:128] of the destination +/// 01: Bits [255:128] of operand V1 are +/// copied to bits [255:128] of the destination +/// 10: Bits [127:0] of operand V2 are +/// copied to bits [255:128] of the destination +/// 11: Bits [255:128] of operand V2 are +/// copied to bits [255:128] of the destination +/// \returns A 256-bit vector of [4 x double] containing the copied values. #define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \ (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \ (__v4df)(__m256d)(V2), (M)); }) +/// \brief Copies 128-bit data values stored in two packed 256-bit vectors of +/// [8 x float], as specified by the integer operand. 
+/// +/// \headerfile +/// +/// \code +/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M); +/// \endcode +/// +/// This intrinsic corresponds to \c VPERM2F128 instruction. +/// +/// \param V1 +/// A 256-bit vector of [8 x float] values. +/// \param V2 +/// A 256-bit vector of [8 x float] values. +/// \param M +/// An immediate integer operand specifying how the values are +/// to be copied. +/// Bits [1:0]: +/// 00: Bits [127:0] of operand V1 are +/// copied to bits [127:0] of the destination +/// 01: Bits [255:128] of operand V1 are +/// copied to bits [127:0] of the destination +/// 10: Bits [127:0] of operand V2 are +/// copied to bits [127:0] of the destination +/// 11: Bits [255:128] of operand V2 are +/// copied to bits [127:0] of the destination +/// Bits [5:4]: +/// 00: Bits [127:0] of operand V1 are +/// copied to bits [255:128] of the destination +/// 01: Bits [255:128] of operand V1 are +/// copied to bits [255:128] of the destination +/// 10: Bits [127:0] of operand V2 are +/// copied to bits [255:128] of the destination +/// 11: Bits [255:128] of operand V2 are +/// copied to bits [255:128] of the destination +/// \returns A 256-bit vector of [8 x float] containing the copied values. #define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \ (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \ (__v8sf)(__m256)(V2), (M)); }) +/// \brief Copies 128-bit data values stored in two packed 256-bit integer +/// vectors, as specified by the integer operand. +/// +/// \headerfile +/// +/// \code +/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M); +/// \endcode +/// +/// This intrinsic corresponds to \c VPERM2F128 instruction. +/// +/// \param V1 +/// A 256-bit integer vector. +/// \param V2 +/// A 256-bit integer vector. +/// \param M +/// An immediate integer operand specifying how the values are +/// to be copied. 
+/// Bits [1:0]: +/// 00: Bits [127:0] of operand V1 are +/// copied to bits [127:0] of the destination +/// 01: Bits [255:128] of operand V1 are +/// copied to bits [127:0] of the destination +/// 10: Bits [127:0] of operand V2 are +/// copied to bits [127:0] of the destination +/// 11: Bits [255:128] of operand V2 are +/// copied to bits [127:0] of the destination +/// Bits [5:4]: +/// 00: Bits [127:0] of operand V1 are +/// copied to bits [255:128] of the destination +/// 01: Bits [255:128] of operand V1 are +/// copied to bits [255:128] of the destination +/// 10: Bits [127:0] of operand V2 are +/// copied to bits [255:128] of the destination +/// 11: Bits [255:128] of operand V2 are +/// copied to bits [255:128] of the destination +/// \returns A 256-bit integer vector containing the copied values. #define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \ (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \ (__v8si)(__m256i)(V2), (M)); }) /* Vector Blend */ +/// \brief Copies 64-bit double-precision data values stored in either of the +/// two packed 256-bit vectors of [4 x double], as specified by the integer +/// operand. +/// +/// \headerfile +/// +/// \code +/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M); +/// \endcode +/// +/// This intrinsic corresponds to \c VBLENDPD instruction. +/// +/// \param V1 +/// A 256-bit vector of [4 x double] values. +/// \param V2 +/// A 256-bit vector of [4 x double] values. +/// \param M +/// An immediate integer operand, with mask bits [3:0] +/// specifying how the values are to be copied. The position of +/// the mask bit corresponds to the index of a copied value. +/// When a mask bit is 0, the corresponding 64-bit element in +/// operand V1 is copied to the same +/// position in the destination. When a mask bit is 1, the +/// corresponding 64-bit element in operand V2 +/// is copied to the same position in the destination. 
+/// \returns A 256-bit vector of [4 x double] containing the copied values. #define _mm256_blend_pd(V1, V2, M) __extension__ ({ \ (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \ (__v4df)(__m256d)(V2), \ @@ -314,6 +1335,31 @@ (((M) & 0x04) ? 6 : 2), \ (((M) & 0x08) ? 7 : 3)); }) +/// \brief Copies 32-bit single-precision data values stored in either of the +/// two packed 256-bit vectors of [8 x float], as specified by the integer +/// operand. +/// +/// \headerfile +/// +/// \code +/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M); +/// \endcode +/// +/// This intrinsic corresponds to \c VBLENDPS instruction. +/// +/// \param V1 +/// A 256-bit vector of [8 x float] values. +/// \param V2 +/// A 256-bit vector of [8 x float] values. +/// \param M +/// An immediate integer operand, with mask bits [7:0] +/// specifying how the values are to be copied. The position of +/// the mask bit corresponds to the index of a copied value. +/// When a mask bit is 0, the corresponding 32-bit element in +/// operand V1 is copied to the same position in the destination. +/// When a mask bit is 1, the corresponding 32-bit element in +/// operand V2 is copied to the same position in the destination. +/// \returns A 256-bit vector of [8 x float] containing the copied values. #define _mm256_blend_ps(V1, V2, M) __extension__ ({ \ (__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \ (__v8sf)(__m256)(V2), \ @@ -326,6 +1372,28 @@ (((M) & 0x40) ? 14 : 6), \ (((M) & 0x80) ? 15 : 7)); }) +/// \brief Copies 64-bit double-precision data values stored in either of the +/// two packed 256-bit vectors of [4 x double], as specified by the 256-bit +/// vector operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VBLENDVPD instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double] values. +/// \param __b +/// A 256-bit vector of [4 x double] values. +/// \param __c +/// A 256-bit vector operand, with mask bits 255, 191, 127, and +/// 63 specifying how the values are to be copied.
The position +/// of the mask bit corresponds to the most significant bit of a +/// copied value. When a mask bit is 0, the corresponding 64-bit +/// element in operand __a is copied to the +/// same position in the destination. When a mask bit is 1, the +/// corresponding 64-bit element in operand __b +/// is copied to the same position in the destination. +/// \returns A 256-bit vector of [4 x double] containing the copied values. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c) { @@ -333,6 +1401,29 @@ (__v4df)__a, (__v4df)__b, (__v4df)__c); } +/// \brief Copies 32-bit single-precision data values stored in either of the +/// two packed 256-bit vectors of [8 x float], as specified by the 256-bit +/// vector operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VBLENDVPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float] values. +/// \param __b +/// A 256-bit vector of [8 x float] values. +/// \param __c +/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, +/// 127, 95, 63, and 31 specifying how the values are to be +/// copied. The position of the mask bit corresponds to the most +/// significant bit of a copied value. When a mask bit is 0, the +/// corresponding 32-bit element in operand __a +/// is copied to the same position in the destination. When a +/// mask bit is 1, the corresponding 32-bit element in operand +/// __b is copied to the same position in +/// the destination. +/// \returns A 256-bit vector of [8 x float] containing the copied values. 
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) { @@ -341,11 +1432,98 @@ } /* Vector Dot Product */ +/// \brief Computes two dot products: one dot product is computed using the +/// lower 128 bits of the two packed 256-bit vectors of [8 x float], and the +/// other dot product is computed using the upper 128 bits of the two +/// packed 256-bit vectors of [8 x float]. Both dot products are computed +/// as specified by the integer operand. +/// +/// \headerfile +/// +/// \code +/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M); +/// \endcode +/// +/// This intrinsic corresponds to \c VDPPS instruction. +/// +/// \param V1 +/// A 256-bit vector of [8 x float] values. +/// \param V2 +/// A 256-bit vector of [8 x float] values. +/// \param M +/// An immediate integer operand. Mask bits [7:4] are used to +/// select 32-bit segments of the source operands. If a mask bit +/// is 1, the corresponding bits are used in the dot product +/// calculation: +/// Bit [7]: selects bits [127:96] or bits [255:224] +/// Bit [6]: selects bits [95:64] or bits [223:192] +/// Bit [5]: selects bits [63:32] or bits [191:160] +/// Bit [4]: selects bits [31:0] or bits [159:128] +/// Bits [3:0] select which bits within the destination will be +/// used to store the 32-bit sum. +/// \returns A 256-bit vector of [8 x float] containing the two dot products. #define _mm256_dp_ps(V1, V2, M) __extension__ ({ \ (__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \ (__v8sf)(__m256)(V2), (M)); }) /* Vector shuffle */ +/// \brief Selects 8 float values from the 256-bit operands of [8 x float], as +/// specified by the immediate value operand. The four selected elements +/// in each operand are copied to the destination according to the bits +/// specified in the immediate operand. 
The selected elements from the +/// first 256-bit operand are copied to bits [63:0] and bits [191:128] of +/// the destination, and the selected elements from the second 256-bit +/// operand are copied to bits [127:64] and bits [255:192] of the +/// destination. For example, if bits [7:0] of the immediate operand +/// contain a value of 0xFF, the 256-bit destination vector would contain +/// the following values: +/// b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3] +/// +/// \headerfile +/// +/// \code +/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask); +/// \endcode +/// +/// This intrinsic corresponds to \c VSHUFPS instruction. +/// +/// \param a +/// A 256-bit vector of [8 x float]. The four selected elements +/// in this operand are copied to bits [63:0] and bits [191:128] +/// in the destination, according to the bits specified in the +/// immediate operand. +/// \param b +/// A 256-bit vector of [8 x float]. The four selected elements +/// in this operand are copied to bits [127:64] and bits +/// [255:192] in the destination, according to the bits +/// specified in the immediate operand. +/// \param mask +/// An immediate value containing an 8-bit value specifying +/// which elements to copy from a and b. Bits [3:0] specify the values +/// copied +/// from operand a. Bits [7:4] specify the +/// values copied from operand b. +/// The destinations within the 256-bit destination are assigned +/// values as follows, according to the bit value assignments +/// described further below: +/// Bits [1:0] are used to assign values to bits [31:0] and +/// [159:128] in the destination. +/// Bits [3:2] are used to assign values to bits [63:32] and +/// [191:160] in the destination. +/// Bits [5:4] are used to assign values to bits [95:64] and +/// [223:192] in the destination. +/// Bits [7:6] are used to assign values to bits [127:96] and +/// [255:224] in the destination. 
+/// Bit value assignments: +/// 00: Bits [31:0] and [159:128] are copied from the selected +/// operand. +/// 01: Bits [63:32] and [191:160] are copied from the selected +/// operand. +/// 10: Bits [95:64] and [223:192] are copied from the selected +/// operand. +/// 11: Bits [127:96] and [255:224] are copied from the selected +/// operand. +/// \returns A 256-bit vector of [8 x float] containing the shuffled values. #define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \ (__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \ (__v8sf)(__m256)(b), \ @@ -358,6 +1536,49 @@ (((mask) & 0x30) >> 4) + 12, \ (((mask) & 0xc0) >> 6) + 12); }) +/// \brief Selects four double-precision values from the 256-bit operands of [4 +/// x double], as specified by the immediate value operand. The selected +/// elements from the first 256-bit operand are copied to bits [63:0] and +/// bits [191:128] in the destination, and the selected elements from the +/// second 256-bit operand are copied to bits [127:64] and bits [255:192] +/// in the destination. For example, if bits [3:0] of the immediate +/// operand contain a value of 0xF, the 256-bit destination vector would +/// contain the following values: +/// b[3], a[3], b[1], +/// a[1] +/// +/// \headerfile +/// +/// \code +/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask); +/// \endcode +/// +/// This intrinsic corresponds to \c VSHUFPD instruction. +/// +/// \param a +/// A 256-bit vector of [4 x double]. +/// \param b +/// A 256-bit vector of [4 x double]. +/// \param mask +/// An immediate value containing 8-bit values specifying which +/// elements to copy from a and b: +/// Bit [0]=0: Bits [63:0] are copied from a +/// to bits [63:0] of the destination. +/// Bit [0]=1: Bits [127:64] are copied from a +/// to bits [63:0] of the destination. +/// Bit [1]=0: Bits [63:0] are copied from b +/// to bits [127:64] of the destination. 
+/// Bit [1]=1: Bits [127:64] are copied from b +/// to bits [127:64] of the destination. +/// Bit [2]=0: Bits [191:128] are copied from a +/// to bits [191:128] of the destination. +/// Bit [2]=1: Bits [255:192] are copied from a +/// to bits [191:128] of the destination. +/// Bit [3]=0: Bits [191:128] are copied from b +/// to bits [255:192] of the destination. +/// Bit [3]=1: Bits [255:192] are copied from b +/// to bits [255:192] of the destination. +/// \returns A 256-bit vector of [4 x double] containing the shuffled values. #define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \ (__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \ (__v4df)(__m256d)(b), \ @@ -400,30 +1621,244 @@ #define _CMP_GT_OQ 0x1e /* Greater-than (ordered, non-signaling) */ #define _CMP_TRUE_US 0x1f /* True (unordered, signaling) */ +/// \brief Compares each of the corresponding packed double-precision values of +/// the 128-bit vectors of [2 x double], using the operation specified by +/// the integer operand. If the result is true, all 64 bits of the +/// destination vector are set; otherwise they are cleared. +/// +/// \headerfile +/// +/// \code +/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c); +/// \endcode +/// +/// This intrinsic corresponds to \c VCMPPD instruction. +/// +/// \param a +/// A 128-bit vector of [2 x double] values. +/// \param b +/// A 128-bit vector of [2 x double] values. 
+/// \param c +/// An immediate integer operand, with bits [4:0] specifying +/// which comparison operation to use (see the _CMP_* macros): +/// 00h, 08h, 10h, 18h: Equal +/// 01h, 09h, 11h, 19h: Less than +/// 02h, 0Ah, 12h, 1Ah: Less than or equal / +/// Greater than or equal (swapped operands) +/// 03h, 13h: Unordered; 0Bh, 1Bh: False +/// 04h, 0Ch, 14h, 1Ch: Not equal +/// 05h, 0Dh, 15h, 1Dh: Not less than / +/// Not greater than (swapped operands) +/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / +/// Not greater than or equal (swapped operands) +/// 07h, 17h: Ordered; 0Fh, 1Fh: True +/// \returns A 128-bit vector of [2 x double] containing the comparison results. #define _mm_cmp_pd(a, b, c) __extension__ ({ \ (__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \ (__v2df)(__m128d)(b), (c)); }) +/// \brief Compares each of the corresponding packed values of the 128-bit +/// vectors of [4 x float], using the operation specified by the integer +/// operand. If the result is true, all 32 bits of the corresponding +/// destination element are set; otherwise they are cleared. +/// +/// \headerfile +/// +/// \code +/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c); +/// \endcode +/// +/// This intrinsic corresponds to \c VCMPPS instruction. +/// +/// \param a +/// A 128-bit vector of [4 x float] values. +/// \param b +/// A 128-bit vector of [4 x float] values. +/// \param c +/// An immediate integer operand, with bits [4:0] specifying +/// which comparison operation to use (see the _CMP_* macros): +/// 00h, 08h, 10h, 18h: Equal +/// 01h, 09h, 11h, 19h: Less than +/// 02h, 0Ah, 12h, 1Ah: Less than or equal / +/// Greater than or equal (swapped operands) +/// 03h, 13h: Unordered; 0Bh, 1Bh: False +/// 04h, 0Ch, 14h, 1Ch: Not equal +/// 05h, 0Dh, 15h, 1Dh: Not less than / +/// Not greater than (swapped operands) +/// 06h, 0Eh, 16h, 1Eh: Not less than or equal / +/// Not greater than or equal (swapped operands) +/// 07h, 17h: Ordered; 0Fh, 1Fh: True +/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c) __extension__ ({ \ (__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \ (__v4sf)(__m128)(b), (c)); }) +/// \brief Compares each of the corresponding packed double-precision values of +/// the 256-bit vectors of [4 x double], using the operation specified by +/// the integer operand. If the result is true, all 64 bits of the +/// destination vector are set; otherwise they are cleared. +/// +/// \headerfile +/// +/// \code +/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c); +/// \endcode +/// +/// This intrinsic corresponds to \c VCMPPD instruction. +/// +/// \param a +/// A 256-bit vector of [4 x double] values. +/// \param b +/// A 256-bit vector of [4 x double] values. +/// \param c +/// An immediate integer operand, with bits [4:0] specifying +/// which comparison operation to use: +/// 00h, 08h, 10h, 18h: Equal +/// 01h, 09h, 11h, 19h:Less than +/// 02h, 0Ah, 12h, 1Ah:Less than or equal +/// Greater than or equal (swapped operands) +/// 03h, 0Bh, 13h, 1Bh:Unordered +/// 04h, 0Ch, 14h, 1Ch:Not equal +/// 05h, 0Dh, 15h, 1Dh:Not less than +/// Not greater than (swapped operands) +/// 06h, 0Eh, 16h, 1Eh:Not less than or equal +/// Not greater than or equal (swapped operands) +/// 07h, 0Fh, 17h, 1Fh:Ordered +/// \returns A 256-bit vector of [4 x double] containing the comparison results. #define _mm256_cmp_pd(a, b, c) __extension__ ({ \ (__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \ (__v4df)(__m256d)(b), (c)); }) +/// \brief Compares each of the corresponding packed values of the 256-bit +/// vectors of [8 x float], using the operation specified by the integer +/// operand. If the result is true, all 32 bits of the destination vector +/// are set; otherwise they are cleared. +/// +/// \headerfile +/// +/// \code +/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c); +/// \endcode +/// +/// This intrinsic corresponds to \c VCMPPS instruction. +/// +/// \param a +/// A 256-bit vector of [8 x float] values. 
+/// \param b +/// A 256-bit vector of [8 x float] values. +/// \param c +/// An immediate integer operand, with bits [4:0] specifying +/// which comparison operation to use: +/// 00h, 08h, 10h, 18h: Equal +/// 01h, 09h, 11h, 19h:Less than +/// 02h, 0Ah, 12h, 1Ah:Less than or equal +/// Greater than or equal (swapped operands) +/// 03h, 0Bh, 13h, 1Bh:Unordered +/// 04h, 0Ch, 14h, 1Ch:Not equal +/// 05h, 0Dh, 15h, 1Dh:Not less than +/// Not greater than (swapped operands) +/// 06h, 0Eh, 16h, 1Eh:Not less than or equal +/// Not greater than or equal (swapped operands) +/// 07h, 0Fh, 17h, 1Fh:Ordered +/// \returns A 256-bit vector of [8 x float] containing the comparison results. #define _mm256_cmp_ps(a, b, c) __extension__ ({ \ (__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \ (__v8sf)(__m256)(b), (c)); }) +/// \brief Compares each of the corresponding scalar double-precision values of +/// the 128-bit [2 x double] operands, using the operation specified by +/// the integer operand. If the result is true, all 64 bits of the +/// destination vector are set; otherwise they are cleared. +/// +/// \headerfile +/// +/// \code +/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c); +/// \endcode +/// +/// This intrinsic corresponds to \c VCMPSD instruction. +/// +/// \param a +/// A 128-bit vector of [2 x double] values. +/// \param b +/// A 128-bit vector of [2 x double] values. 
+/// \param c +/// An immediate integer operand, with bits [4:0] specifying +/// which comparison operation to use: +/// 00h, 08h, 10h, 18h: Equal +/// 01h, 09h, 11h, 19h:Less than +/// 02h, 0Ah, 12h, 1Ah:Less than or equal +/// Greater than or equal (swapped operands) +/// 03h, 0Bh, 13h, 1Bh:Unordered +/// 04h, 0Ch, 14h, 1Ch:Not equal +/// 05h, 0Dh, 15h, 1Dh:Not less than +/// Not greater than (swapped operands) +/// 06h, 0Eh, 16h, 1Eh:Not less than or equal +/// Not greater than or equal (swapped operands) +/// 07h, 0Fh, 17h, 1Fh:Ordered +/// \returns A 128-bit vector of [2 x double] containing the comparison results. #define _mm_cmp_sd(a, b, c) __extension__ ({ \ (__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \ (__v2df)(__m128d)(b), (c)); }) +/// \brief Compares each of the corresponding scalar values of the 128-bit +/// vectors of [4 x float], using the operation specified by the integer +/// operand. If the result is true, all 32 bits of the destination vector +/// are set; otherwise they are cleared. +/// +/// \headerfile +/// +/// \code +/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c); +/// \endcode +/// +/// This intrinsic corresponds to \c VCMPSS instruction. +/// +/// \param a +/// A 128-bit vector of [4 x float] values. +/// \param b +/// A 128-bit vector of [4 x float] values. +/// \param c +/// An immediate integer operand, with bits [4:0] specifying +/// which comparison operation to use: +/// 00h, 08h, 10h, 18h: Equal +/// 01h, 09h, 11h, 19h:Less than +/// 02h, 0Ah, 12h, 1Ah:Less than or equal +/// Greater than or equal (swapped operands) +/// 03h, 0Bh, 13h, 1Bh:Unordered +/// 04h, 0Ch, 14h, 1Ch:Not equal +/// 05h, 0Dh, 15h, 1Dh:Not less than +/// Not greater than (swapped operands) +/// 06h, 0Eh, 16h, 1Eh:Not less than or equal +/// Not greater than or equal (swapped operands) +/// 07h, 0Fh, 17h, 1Fh:Ordered +/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
#define _mm_cmp_ss(a, b, c) __extension__ ({ \ (__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \ (__v4sf)(__m128)(b), (c)); }) +/// \brief Extracts 32 bits of extended packed data from a 256-bit integer +/// vector and copies it to the destination, as specified by the integer +/// operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VEXTRACTF128+COMPOSITE instruction. +/// +/// \param __a +/// A 256-bit integer vector. +/// \param __imm +/// Determines which bits are extracted using bits [2:0]: +/// 000: Bits [31:0] are copied to the destination. +/// 001: Bits [63:32] are copied to the destination. +/// 010: Bits [95:64] are copied to the destination. +/// 011: Bits [127:96] are copied to the destination. +/// 100: Bits [159:128] are copied to the destination. +/// 101: Bits [191:160] are copied to the destination. +/// 110: Bits [223:192] are copied to the destination. +/// 111: Bits [255:224] are copied to the destination. +/// \returns A 32-bit integer containing the extracted 32 bits of extended +/// packed data. static __inline int __DEFAULT_FN_ATTRS _mm256_extract_epi32(__m256i __a, const int __imm) { @@ -431,6 +1866,36 @@ return __b[__imm & 7]; } +/// \brief Extracts 16 bits of extended packed data from a 256-bit integer +/// vector and copies it to the destination, as specified by the integer +/// operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VEXTRACTF128+COMPOSITE instruction. +/// +/// \param __a +/// A 256-bit integer vector. +/// \param __imm +/// Determines which bits are extracted using bits [3:0]: +/// 0000: Bits [15:0] are copied to the destination. +/// 0001: Bits [31:16] are copied to the destination. +/// 0010: Bits [47:32] are copied to the destination. +/// 0011: Bits [63:48] are copied to the destination. +/// 0100: Bits [79:64] are copied to the destination. +/// 0101: Bits [95:80] are copied to the destination. +/// 0110: Bits [111:96] are copied to the destination.
+/// 0111: Bits [127:112] are copied to the destination. +/// 1000: Bits [143:128] are copied to the destination. +/// 1001: Bits [159:144] are copied to the destination. +/// 1010: Bits [175:160] are copied to the destination. +/// 1011: Bits [191:176] are copied to the destination. +/// 1100: Bits [207:192] are copied to the destination. +/// 1101: Bits [223:208] are copied to the destination. +/// 1110: Bits [239:224] are copied to the destination. +/// 1111: Bits [255:240] are copied to the destination. +/// \returns A 32-bit integer containing the extracted 16 bits of extended +/// packed data. static __inline int __DEFAULT_FN_ATTRS _mm256_extract_epi16(__m256i __a, const int __imm) { @@ -438,6 +1903,52 @@ return __b[__imm & 15]; } +/// \brief Extracts 8 bits of extended packed data from a 256-bit integer +/// vector and copies it to the destination, as specified by the integer +/// operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VEXTRACTF128+COMPOSITE instruction. +/// +/// \param __a +/// A 256-bit integer vector. +/// \param __imm +/// Determines which bits are extracted using bits [4:0]: +/// 00000: Bits [7:0] are copied to the destination. +/// 00001: Bits [15:8] are copied to the destination. +/// 00010: Bits [23:16] are copied to the destination. +/// 00011: Bits [31:24] are copied to the destination. +/// 00100: Bits [39:32] are copied to the destination. +/// 00101: Bits [47:40] are copied to the destination. +/// 00110: Bits [55:48] are copied to the destination. +/// 00111: Bits [63:56] are copied to the destination. +/// 01000: Bits [71:64] are copied to the destination. +/// 01001: Bits [79:72] are copied to the destination. +/// 01010: Bits [87:80] are copied to the destination. +/// 01011: Bits [95:88] are copied to the destination. +/// 01100: Bits [103:96] are copied to the destination. +/// 01101: Bits [111:104] are copied to the destination. +/// 01110: Bits [119:112] are copied to the destination. 
+/// 01111: Bits [127:120] are copied to the destination. +/// 10000: Bits [135:128] are copied to the destination. +/// 10001: Bits [143:136] are copied to the destination. +/// 10010: Bits [151:144] are copied to the destination. +/// 10011: Bits [159:152] are copied to the destination. +/// 10100: Bits [167:160] are copied to the destination. +/// 10101: Bits [175:168] are copied to the destination. +/// 10110: Bits [183:176] are copied to the destination. +/// 10111: Bits [191:184] are copied to the destination. +/// 11000: Bits [199:192] are copied to the destination. +/// 11001: Bits [207:200] are copied to the destination. +/// 11010: Bits [215:208] are copied to the destination. +/// 11011: Bits [223:216] are copied to the destination. +/// 11100: Bits [231:224] are copied to the destination. +/// 11101: Bits [239:232] are copied to the destination. +/// 11110: Bits [247:240] are copied to the destination. +/// 11111: Bits [255:248] are copied to the destination. +/// \returns A 32-bit integer containing the extracted 8 bits of extended +/// packed data. static __inline int __DEFAULT_FN_ATTRS _mm256_extract_epi8(__m256i __a, const int __imm) { @@ -446,6 +1957,24 @@ } #ifdef __x86_64__ +/// \brief Extracts 64 bits of extended packed data from a 256-bit integer +/// vector and copies it to the destination, as specified by the integer +/// operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VEXTRACTF128+COMPOSITE instruction. +/// +/// \param __a +/// A 256-bit integer vector. +/// \param __imm +/// Determines which bits are extracted using bits [1:0]: +/// 00: Bits [63:0] are copied to the destination. +/// 01: Bits [127:64] are copied to the destination. +/// 10: Bits [191:128] are copied to the destination. +/// 11: Bits [255:192] are copied to the destination. +/// \returns A 64-bit integer containing the extracted 64 bits of extended +/// packed data.
static __inline long long __DEFAULT_FN_ATTRS _mm256_extract_epi64(__m256i __a, const int __imm) { @@ -454,6 +1983,35 @@ } #endif +/// \brief Combines 224 bits of extended packed data from the 256-bit integer +/// vector operand with 32 bits of extended packed data from the 32-bit +/// integer operand and copies them to the destination, using the offset +/// specified by the integer operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VINSERTF128+COMPOSITE instruction. +/// +/// \param __a +/// A 256-bit integer vector. The remaining bits in the +/// destination are copied from the corresponding bits in this +/// operand. +/// \param __b +/// An integer. The bits of this operand are written to the +/// destination beginning at the offset specified by operand __imm. +/// \param __imm +/// An immediate integer used to determine which bits in the +/// destination are used when copying the bits from operand __b: +/// If c is 0, bits [31:0] are used in the destination. +/// If c is 1, bits [63:32] are used in the destination. +/// If c is 2, bits [95:64] are used in the destination. +/// If c is 3, bits [127:96] are used in the destination. +/// If c is 4, bits [159:128] are used in the destination. +/// If c is 5, bits [191:160] are used in the destination. +/// If c is 6, bits [223:192] are used in the destination. +/// If c is 7, bits [255:224] are used in the destination. +/// \returns A 256-bit integer vector containing the copied extended packed +/// data from the operands. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_insert_epi32(__m256i __a, int __b, int const __imm) { @@ -462,6 +2020,43 @@ return (__m256i)__c; } +/// \brief Combines 240 bits of extended packed data from the 256-bit integer +/// vector operand with 16 bits of extended packed data from the 16-bit +/// integer operand and copies them to the destination, using the offset +/// specified by the integer operand. 
+/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VINSERTF128+COMPOSITE instruction. +/// +/// \param __a +/// A 256-bit integer vector. The remaining bits in the +/// destination are copied from the corresponding bits in this +/// operand. +/// \param __b +/// An integer. The bits of this operand are written to the +/// destination beginning at the offset specified by operand __imm. +/// \param __imm +/// An immediate integer used to determine which bits in the +/// destination are used when copying the bits from operand __b: +/// If c is 0, bits [15:0] are used in the destination. +/// If c is 1, bits [31:16] are used in the destination. +/// If c is 2, bits [47:32] are used in the destination. +/// If c is 3, bits [63:48] are used in the destination. +/// If c is 4, bits [79:64] are used in the destination. +/// If c is 5, bits [95:80] are used in the destination. +/// If c is 6, bits [111:96] are used in the destination. +/// If c is 7, bits [127:112] are used in the destination. +/// If c is 8, bits [143:128] are used in the destination. +/// If c is 9, bits [159:144] are used in the destination. +/// If c is 10, bits [175:160] are used in the destination. +/// If c is 11, bits [191:176] are used in the destination. +/// If c is 12, bits [207:192] are used in the destination. +/// If c is 13, bits [223:208] are used in the destination. +/// If c is 14, bits [239:224] are used in the destination. +/// If c is 15, bits [255:240] are used in the destination. +/// \returns A 256-bit integer vector containing the copied extended packed +/// data from the operands. 
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_insert_epi16(__m256i __a, int __b, int const __imm) { @@ -470,6 +2065,29 @@ return (__m256i)__c; } +/// \brief Combines 248 bits of extended packed data from the 256-bit integer +/// vector operand with 8 bits of extended packed data from the 8-bit +/// integer operand and copies them to the destination, using the offset +/// specified by the integer operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VINSERTF128+COMPOSITE instruction. +/// +/// \param __a +/// A 256-bit integer vector. The remaining bits in the +/// destination are copied from the corresponding bits in this +/// operand. +/// \param __b +/// An integer. The bits of this operand are written to the +/// destination beginning at the offset specified by operand __imm. +/// \param __imm +/// An immediate integer used to determine which bits in the +/// destination are used when copying the bits from operand __b. Bits +/// [8*c+7:8*c] are used +/// in the destination, where c is a value from [0-31]. +/// \returns A 256-bit integer vector containing the copied extended packed +/// data from the operands. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_insert_epi8(__m256i __a, int __b, int const __imm) { @@ -479,6 +2097,58 @@ } #ifdef __x86_64__ +/// \brief Combines 192 bits of extended packed data from the 256-bit integer +/// vector operand with 64 bits of extended packed data from the 64-bit +/// integer operand and copies them to the destination, using the offset +/// specified by the integer operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VINSERTF128+COMPOSITE instruction. +/// +/// \param __a +/// A 256-bit integer vector. The remaining bits in the +/// destination are copied from the corresponding bits in this +/// operand. +/// \param __b +/// A 64-bit integer. The bits of this operand are written to +/// the destination beginning at the offset specified by operand +/// __imm.
+/// \param __imm +/// An immediate integer used to determine which bits in the +/// destination are used when copying the bits from operand __b: +/// If c is 0, bits [63:0] are used in the destination. +/// If c is 1, bits [127:64] are used in the destination. +/// If c is 2, bits [191:128] are used in the destination. +/// If c is 3, bits [255:192] are used in the destination. +/// \returns A 256-bit integer vector containing the copied extended packed +/// data from the operands. +/// \brief Combines 192 bits of extended packed data from the 256-bit integer +/// vector operand with 64 bits of extended packed data from the 64-bit +/// integer operand and copies them to the destination, using the offset +/// specified by the integer operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VINSERTF128+COMPOSITE instruction. +/// +/// \param __a +/// A 256-bit integer vector. The remaining bits in the +/// destination are copied from the corresponding bits in this +/// operand. +/// \param __b +/// A 64-bit integer. The bits of this operand are written to +/// the destination beginning at the offset specified by operand +/// __imm. +/// \param __imm +/// An immediate integer used to determine which bits in the +/// destination are used when copying the bits from operand __b: +/// If c is 0, bits [63:0] are used in the destination. +/// If c is 1, bits [127:64] are used in the destination. +/// If c is 2, bits [191:128] are used in the destination. +/// If c is 3, bits [255:192] are used in the destination. +/// \returns A 256-bit integer vector containing the copied extended packed +/// data from the operands. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_insert_epi64(__m256i __a, long long __b, int const __imm) { @@ -489,48 +2159,125 @@ #endif /* Conversion */ +/// \brief Converts a vector of [4 x i32] into a vector of [4 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCVTDQ2PD instruction. 
+/// +/// \param __a +/// A 128-bit integer vector of [4 x i32]. +/// \returns A 256-bit vector of [4 x double] containing the converted values. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_cvtepi32_pd(__m128i __a) { return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) __a); } +/// \brief Converts a vector of [8 x i32] into a vector of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCVTDQ2PS instruction. +/// +/// \param __a +/// A 256-bit integer vector. +/// \returns A 256-bit vector of [8 x float] containing the converted values. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_cvtepi32_ps(__m256i __a) { return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a); } +/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of +/// [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCVTPD2PS instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double]. +/// \returns A 128-bit vector of [4 x float] containing the converted values. static __inline __m128 __DEFAULT_FN_ATTRS _mm256_cvtpd_ps(__m256d __a) { return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a); } +/// \brief Converts a vector of [8 x float] into a vector of [8 x i32]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCVTPS2DQ instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float]. +/// \returns A 256-bit integer vector containing the converted values. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_cvtps_epi32(__m256 __a) { return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a); } +/// \brief Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4 +/// x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCVTPS2PD instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \returns A 256-bit vector of [4 x double] containing the converted values. 
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_cvtps_pd(__m128 __a) { return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) __a); } +/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of +/// [4 x i32], truncating the result when it is inexact. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCVTTPD2DQ instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double]. +/// \returns A 128-bit integer vector containing the converted values. static __inline __m128i __DEFAULT_FN_ATTRS _mm256_cvttpd_epi32(__m256d __a) { return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a); } +/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of +/// [4 x i32]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCVTPD2DQ instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double]. +/// \returns A 128-bit integer vector containing the converted values. static __inline __m128i __DEFAULT_FN_ATTRS _mm256_cvtpd_epi32(__m256d __a) { return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a); } +/// \brief Converts a vector of [8 x float] into a vector of [8 x i32], +/// truncating the result when it is inexact. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCVTTPS2DQ instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float]. +/// \returns A 256-bit integer vector containing the converted values. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_cvttps_epi32(__m256 __a) { @@ -538,18 +2285,74 @@ } /* Vector replicate */ +/// \brief Moves and duplicates high-order (odd-indexed) values from a 256-bit +/// vector of [8 x float] to float values stored in a packed 256-bit +/// vector of [8 x float]. +/// Bits [255:224] of the source are written to bits [255:224] and +/// [223:192] of the destination. +/// Bits [191:160] of the source are written to bits [191:160] and +/// [159:128] of the destination.
+/// Bits [127:96] of the source are written to bits [127:96] and [95:64] +/// of the destination. +/// Bits [63:32] of the source are written to bits [63:32] and [31:0] of +/// the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVSHDUP instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float]. +/// \returns A 256-bit vector of [8 x float] containing the moved and +/// duplicated values. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_movehdup_ps(__m256 __a) { return __builtin_shufflevector(__a, __a, 1, 1, 3, 3, 5, 5, 7, 7); } +/// \brief Moves and duplicates low-order (even-indexed) values from a 256-bit +/// vector of [8 x float] to float values stored in a packed 256-bit +/// vector of [8 x float]. +/// Bits [223:192] of the source are written to bits [255:224] and +/// [223:192] of the destination. +/// Bits [159:128] of the source are written to bits [191:160] and +/// [159:128] of the destination. +/// Bits [95:64] of the source are written to bits [127:96] and [95:64] of +/// the destination. +/// Bits [31:0] of the source are written to bits [63:32] and [31:0] of +/// the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVSLDUP instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float]. +/// \returns A 256-bit vector of [8 x float] containing the moved and +/// duplicated values. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_moveldup_ps(__m256 __a) { return __builtin_shufflevector(__a, __a, 0, 0, 2, 2, 4, 4, 6, 6); } +/// \brief Moves and duplicates double-precision values from a 256-bit vector +/// of [4 x double] to double-precision values stored in a packed 256-bit +/// vector of [4 x double]. +/// Bits [63:0] of the source are written to bits [127:64] and [63:0] of +/// the destination. +/// Bits [191:128] of the source are written to bits [255:192] and +/// [191:128] of the destination. 
+/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVDDUP instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double]. +/// \returns A 256-bit vector of [4 x double] containing the moved and +/// duplicated values. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_movedup_pd(__m256d __a) { @@ -557,24 +2360,118 @@ } /* Unpack and Interleave */ +/// \brief Unpacks the high-order (odd-indexed) double-precision values from +/// two 256-bit vectors of [4 x double] and interleaves them into a packed +/// 256-bit vector of [4 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VUNPCKHPD instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double]. +/// Bits [127:64] are written to bits [63:0] of the destination. +/// Bits [255:192] are written to bits [191:128] of the +/// destination. +/// \param __b +/// A 256-bit vector of [4 x double]. +/// Bits [127:64] are written to bits [127:64] of the +/// destination. +/// Bits [255:192] are written to bits [255:192] of the +/// destination. +/// \returns A 256-bit vector of [4 x double] containing the interleaved values. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_unpackhi_pd(__m256d __a, __m256d __b) { return __builtin_shufflevector(__a, __b, 1, 5, 1+2, 5+2); } +/// \brief Unpacks the low-order (even-indexed) double-precision values from +/// two 256-bit vectors of [4 x double] and interleaves them into a packed +/// 256-bit vector of [4 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VUNPCKLPD instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double]. +/// Bits [63:0] are written to bits [63:0] of the destination. +/// Bits [191:128] are written to bits [191:128] of the +/// destination. +/// \param __b +/// A 256-bit vector of [4 x double]. +/// Bits [63:0] are written to bits [127:64] of the destination. +/// Bits [191:128] are written to bits [255:192] of the +/// destination.
+/// \returns A 256-bit vector of [4 x double] containing the interleaved values. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_unpacklo_pd(__m256d __a, __m256d __b) { return __builtin_shufflevector(__a, __b, 0, 4, 0+2, 4+2); } +/// \brief Unpacks the high-order (index 2,3,6,7) values from two 256-bit +/// vectors of [8 x float] and interleaves them into a packed 256-bit vector +/// of [8 +/// x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VUNPCKHPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float]. +/// Bits [95:64] are written to bits [31:0] of the destination. +/// Bits [127:96] are written to bits [95:64] of the +/// destination. +/// Bits [223:192] are written to bits [159:128] of the +/// destination. +/// Bits [255:224] are written to bits [223:192] of the +/// destination. +/// \param __b +/// A 256-bit vector of [8 x float]. +/// Bits [95:64] are written to bits [63:32] of the destination. +/// Bits [127:96] are written to bits [127:96] of the +/// destination. +/// Bits [223:192] are written to bits [191:160] of the +/// destination. +/// Bits [255:224] are written to bits [255:224] of the +/// destination. +/// \returns A 256-bit vector of [8 x float] containing the interleaved values. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_unpackhi_ps(__m256 __a, __m256 __b) { return __builtin_shufflevector(__a, __b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1); } +/// \brief Unpacks the low-order (index 0,1,4,5) values from two 256-bit +/// vectors of [8 x float] and interleaves them into a packed 256-bit vector +/// of [8 +/// x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VUNPCKLPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float]. +/// Bits [31:0] are written to bits [31:0] of the destination. +/// Bits [63:32] are written to bits [95:64] of the destination. +/// Bits [159:128] are written to bits [159:128] of the +/// destination. 
+/// Bits [191:160] are written to bits [223:192] of the +/// destination. +/// \param __b +/// A 256-bit vector of [8 x float]. +/// Bits [31:0] are written to bits [63:32] of the destination. +/// Bits [63:32] are written to bits [127:96] of the +/// destination. +/// Bits [159:128] are written to bits [191:160] of the +/// destination. +/// Bits [191:160] are written to bits [255:224] of the +/// destination. +/// \returns A 256-bit vector of [8 x float] containing the interleaved values. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_unpacklo_ps(__m256 __a, __m256 __b) { @@ -582,90 +2479,302 @@ } /* Bit Test */ +/// \brief Tests whether the specified sign bits in a 128-bit vector of [2 x +/// double] are all zeros. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VTESTPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing the sign bits to +/// be tested. +/// \param __b +/// A 128-bit vector of [2 x double] selecting which sign bits +/// to test in operand __a. +/// \returns TRUE if the specified sign bits are all zeros; FALSE otherwise. static __inline int __DEFAULT_FN_ATTRS _mm_testz_pd(__m128d __a, __m128d __b) { return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b); } +/// \brief Tests whether the specified sign bits in a 128-bit vector of [2 x +/// double] are all ones. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VTESTPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing the sign bits to +/// be tested. +/// \param __b +/// A 128-bit vector of [2 x double] selecting which sign bits +/// to test in operand __a. +/// \returns TRUE if the specified sign bits are all ones; FALSE otherwise. static __inline int __DEFAULT_FN_ATTRS _mm_testc_pd(__m128d __a, __m128d __b) { return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b); } +/// \brief Tests whether the specified sign bits in a 128-bit vector of [2 x +/// double] are neither all zeros nor all ones. 
+/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VTESTPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing the sign bits to +/// be tested. +/// \param __b +/// A 128-bit vector of [2 x double] selecting which sign bits +/// to test in operand __a. +/// \returns TRUE if the specified sign bits are neither all zeros nor all +/// ones; FALSE otherwise. static __inline int __DEFAULT_FN_ATTRS _mm_testnzc_pd(__m128d __a, __m128d __b) { return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b); } +/// \brief Tests whether the specified sign bits in a 128-bit vector of [4 x +/// float] are all zeros. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VTESTPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing the sign bits to +/// be tested. +/// \param __b +/// A 128-bit vector of [4 x float] selecting which sign bits to +/// test in operand __a. +/// \returns TRUE if the specified sign bits are all zeros; FALSE otherwise. static __inline int __DEFAULT_FN_ATTRS _mm_testz_ps(__m128 __a, __m128 __b) { return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b); } +/// \brief Tests whether the specified sign bits in a 128-bit vector of [4 x +/// float] are all ones. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VTESTPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing the sign bits to +/// be tested. +/// \param __b +/// A 128-bit vector of [4 x float] selecting which sign bits to +/// test in operand __a. +/// \returns TRUE if the specified sign bits are all ones; FALSE otherwise. static __inline int __DEFAULT_FN_ATTRS _mm_testc_ps(__m128 __a, __m128 __b) { return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b); } +/// \brief Tests whether the specified sign bits in a 128-bit vector of [4 x +/// float] are neither all zeros nor all ones. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VTESTPS instruction. 
+/// +/// \param __a +/// A 128-bit vector of [4 x float] containing the sign bits to +/// be tested. +/// \param __b +/// A 128-bit vector of [4 x float] selecting which sign bits to +/// test in operand __a. +/// \returns TRUE if the specified sign bits are neither all zeros nor all +/// ones; FALSE otherwise. static __inline int __DEFAULT_FN_ATTRS _mm_testnzc_ps(__m128 __a, __m128 __b) { return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b); } +/// \brief Tests whether the specified sign bits in a 256-bit vector of [4 x +/// double] are all zeros. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VTESTPD instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double] containing the sign bits to +/// be tested. +/// \param __b +/// A 256-bit vector of [4 x double] selecting which sign bits +/// to test in operand __a. +/// \returns TRUE if the specified sign bits are all zeros; FALSE otherwise. static __inline int __DEFAULT_FN_ATTRS _mm256_testz_pd(__m256d __a, __m256d __b) { return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b); } +/// \brief Tests whether the specified sign bits in a 256-bit vector of [4 x +/// double] are all ones. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VTESTPD instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double] containing the sign bits to +/// be tested. +/// \param __b +/// A 256-bit vector of [4 x double] selecting which sign bits +/// to test in operand __a. +/// \returns TRUE if the specified sign bits are all ones; FALSE otherwise. static __inline int __DEFAULT_FN_ATTRS _mm256_testc_pd(__m256d __a, __m256d __b) { return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b); } +/// \brief Tests whether the specified sign bits in a 256-bit vector of [4 x +/// double] are neither all zeros nor all ones. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VTESTPD instruction. 
+/// +/// \param __a +/// A 256-bit vector of [4 x double] containing the sign bits to +/// be tested. +/// \param __b +/// A 256-bit vector of [4 x double] selecting which sign bits +/// to test in operand __a. +/// \returns TRUE if the specified sign bits are neither all zeros nor all +/// ones; FALSE otherwise. static __inline int __DEFAULT_FN_ATTRS _mm256_testnzc_pd(__m256d __a, __m256d __b) { return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b); } +/// \brief Tests whether the specified sign bits in a 256-bit vector of [8 x +/// float] are all zeros. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VTESTPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float] containing the sign bits to +/// be tested. +/// \param __b +/// A 256-bit vector of [8 x float] selecting which sign bits to +/// test in operand __a. +/// \returns TRUE if the specified sign bits are all zeros; FALSE otherwise. static __inline int __DEFAULT_FN_ATTRS _mm256_testz_ps(__m256 __a, __m256 __b) { return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b); } +/// \brief Tests whether the specified sign bits in a 256-bit vector of [8 x +/// float] are all ones. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VTESTPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float] containing the sign bits to +/// be tested. +/// \param __b +/// A 256-bit vector of [8 x float] selecting which sign bits to +/// test in operand __a. +/// \returns TRUE if the specified sign bits are all ones; FALSE otherwise. static __inline int __DEFAULT_FN_ATTRS _mm256_testc_ps(__m256 __a, __m256 __b) { return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b); } +/// \brief Tests whether the specified sign bits in a 256-bit vector of [8 x +/// float] are neither all zeros nor all ones. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VTESTPS instruction. 
+/// +/// \param __a +/// A 256-bit vector of [8 x float] containing the sign bits to +/// be tested. +/// \param __b +/// A 256-bit vector of [8 x float] selecting which sign bits to +/// test in operand __a. +/// \returns TRUE if the specified sign bits are neither all zeros nor all +/// ones; FALSE otherwise. static __inline int __DEFAULT_FN_ATTRS _mm256_testnzc_ps(__m256 __a, __m256 __b) { return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b); } +/// \brief Tests whether the specified bits in a 256-bit integer vector are all +/// zeros. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPTEST instruction. +/// +/// \param __a +/// A 256-bit integer vector containing the bits to be tested. +/// \param __b +/// A 256-bit integer vector selecting which bits to test in +/// operand __a. +/// \returns TRUE if the specified bits are all zeros; FALSE otherwise. static __inline int __DEFAULT_FN_ATTRS _mm256_testz_si256(__m256i __a, __m256i __b) { return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b); } +/// \brief Tests whether the specified bits in a 256-bit integer vector are all +/// ones. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPTEST instruction. +/// +/// \param __a +/// A 256-bit integer vector containing the bits to be tested. +/// \param __b +/// A 256-bit integer vector selecting which bits to test in +/// operand __a. +/// \returns TRUE if the specified bits are all ones; FALSE otherwise. static __inline int __DEFAULT_FN_ATTRS _mm256_testc_si256(__m256i __a, __m256i __b) { return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b); } +/// \brief Tests whether the specified bits in a 256-bit integer vector are +/// neither all zeros nor all ones. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPTEST instruction. +/// +/// \param __a +/// A 256-bit integer vector containing the bits to be tested. +/// \param __b +/// A 256-bit integer vector selecting which bits to test in +/// operand __a. 
+/// \returns TRUE if the specified bits are neither all zeros nor all ones; +/// FALSE otherwise. static __inline int __DEFAULT_FN_ATTRS _mm256_testnzc_si256(__m256i __a, __m256i __b) { @@ -673,12 +2782,38 @@ } /* Vector extract sign mask */ +/// \brief Extracts the sign bits of packed double-precision values in a +/// 256-bit vector of [4 x double] and writes them to the lower order bits of +/// the +/// destination. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VMOVMSKPD instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double] containing the +/// double-precision values with sign bits to be extracted. +/// \returns The sign bits from the operand, written to bits [3:0]. static __inline int __DEFAULT_FN_ATTRS _mm256_movemask_pd(__m256d __a) { return __builtin_ia32_movmskpd256((__v4df)__a); } +/// \brief Extracts the sign bits of packed single-precision values in a +/// 256-bit vector of [8 x float] and writes them to the lower order bits of +/// the +/// destination. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VMOVMSKPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float] containing the +/// single-precision values with sign bits to be extracted. +/// \returns The sign bits from the operand, written to bits [7:0]. static __inline int __DEFAULT_FN_ATTRS _mm256_movemask_ps(__m256 __a) { @@ -686,12 +2821,24 @@ } /* Vector __zero */ +/// \brief Clears all the YMM registers. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VZEROALL instruction. +/// static __inline void __DEFAULT_FN_ATTRS _mm256_zeroall(void) { __builtin_ia32_vzeroall(); } +/// \brief Clears the upper octword of all the YMM registers. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VZEROUPPER instruction. 
+/// static __inline void __DEFAULT_FN_ATTRS _mm256_zeroupper(void) { @@ -699,6 +2846,17 @@ } /* Vector load with broadcast */ +/// \brief Loads a float value and writes it to 32-bit elements in a 128-bit +/// vector of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VBROADCASTSS instruction. +/// +/// \param __a +/// The float value to be broadcast. +/// \returns A 128-bit vector of [4 x float] whose 32-bit values each contain +/// the broadcast value. static __inline __m128 __DEFAULT_FN_ATTRS _mm_broadcast_ss(float const *__a) { @@ -706,6 +2864,17 @@ return (__m128)(__v4sf){ __f, __f, __f, __f }; } +/// \brief Loads a double-precision value and writes it to 64-bit elements in a +/// 256-bit vector of [4 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VBROADCASTSD instruction. +/// +/// \param __a +/// The double-precision value to be broadcast. +/// \returns A 256-bit vector of [4 x double] whose 64-bit values each contain +/// the broadcast value. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_broadcast_sd(double const *__a) { @@ -713,6 +2882,17 @@ return (__m256d)(__v4df){ __d, __d, __d, __d }; } +/// \brief Loads a float value and writes it to 32-bit elements in a 256-bit +/// vector of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VBROADCASTSS instruction. +/// +/// \param __a +/// The float value to be broadcast. +/// \returns A 256-bit vector of [8 x float] whose 32-bit values each contain +/// the broadcast value. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_broadcast_ss(float const *__a) { @@ -720,12 +2900,34 @@ return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f }; } +/// \brief Loads the data from a 128-bit vector of [2 x double] and writes it +/// to 128-bit elements in a 256-bit vector of [4 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VBROADCASTF128 instruction. 
+/// +/// \param __a +/// The 128-bit vector of [2 x double] to be broadcast. +/// \returns A 256-bit vector of [4 x double] whose 128-bit elements each +/// contain the broadcast value. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_broadcast_pd(__m128d const *__a) { return (__m256d)__builtin_ia32_vbroadcastf128_pd256(__a); } +/// \brief Loads the data from a 128-bit vector of [4 x float] and writes it to +/// 128-bit elements in a 256-bit vector of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VBROADCASTF128 instruction. +/// +/// \param __a +/// The 128-bit vector of [4 x float] to be broadcast. +/// \returns A 256-bit vector of [8 x float] whose 128-bit elements each +/// contain the broadcast value. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_broadcast_ps(__m128 const *__a) { @@ -733,18 +2935,51 @@ } /* SIMD load ops */ +/// \brief Moves packed double-precision values from an aligned memory location +/// to 64-bit elements in a 256-bit vector of [4 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVAPD instruction. +/// +/// \param __p +/// A 32-byte aligned pointer to a memory location containing +/// double-precision values. +/// \returns A 256-bit vector of [4 x double] containing the moved values. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_load_pd(double const *__p) { return *(__m256d *)__p; } +/// \brief Moves packed float values from an aligned memory location to 32-bit +/// elements in a 256-bit vector of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVAPS instruction. +/// +/// \param __p +/// A 32-byte aligned pointer to a memory location containing +/// float values. +/// \returns A 256-bit vector of [8 x float] containing the moved values. 
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_load_ps(float const *__p) { return *(__m256 *)__p; } +/// \brief Moves packed double-precision values from an unaligned memory +/// location to 64-bit elements in a 256-bit vector of [4 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVUPD instruction. +/// +/// \param __p +/// A pointer to a memory location containing double-precision +/// values. +/// \returns A 256-bit vector of [4 x double] containing the moved values. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_loadu_pd(double const *__p) { @@ -754,6 +2989,16 @@ return ((struct __loadu_pd*)__p)->__v; } +/// \brief Moves packed float values from an unaligned memory location to +/// 32-bit elements in a 256-bit vector of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVUPS instruction. +/// +/// \param __p +/// A pointer to a memory location containing float values. +/// \returns A 256-bit vector of [8 x float] containing the moved values. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_loadu_ps(float const *__p) { @@ -763,12 +3008,34 @@ return ((struct __loadu_ps*)__p)->__v; } +/// \brief Moves integer values from an aligned memory location to elements in +/// a 256-bit integer vector. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVDQA instruction. +/// +/// \param __p +/// A 32-byte aligned pointer to a 256-bit integer vector +/// containing integer values. +/// \returns A 256-bit integer vector containing the moved values. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_load_si256(__m256i const *__p) { return *__p; } +/// \brief Moves integer values from an unaligned memory location to elements +/// in a 256-bit integer vector. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVDQU instruction. +/// +/// \param __p +/// A pointer to a 256-bit integer vector containing integer +/// values. 
+/// \returns A 256-bit integer vector containing the moved values. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_loadu_si256(__m256i const *__p) { @@ -778,6 +3045,18 @@ return ((struct __loadu_si256*)__p)->__v; } +/// \brief Moves integer values from an unaligned memory location to elements +/// in a 256-bit integer vector. The instruction may read 32 bytes to +/// retrieve either or both of the first and second parts of the operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VLDDQU instruction. +/// +/// \param __p +/// A pointer to a 256-bit integer vector containing integer +/// values. +/// \returns A 256-bit integer vector containing the moved values. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_lddqu_si256(__m256i const *__p) { @@ -785,36 +3064,112 @@ } /* SIMD store ops */ +/// \brief Moves packed double-precision values from a 256-bit vector of [4 x +/// double] to an aligned memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVAPD instruction. +/// +/// \param __p +/// A 32-byte aligned pointer to a memory location that will +/// receive the double-precision values. +/// \param __a +/// A 256-bit vector of [4 x double] containing the values to be +/// moved. static __inline void __DEFAULT_FN_ATTRS _mm256_store_pd(double *__p, __m256d __a) { *(__m256d *)__p = __a; } +/// \brief Moves packed float values from a 256-bit vector of [8 x float] to an +/// aligned memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVAPS instruction. +/// +/// \param __p +/// A 32-byte aligned pointer to a memory location that will +/// receive the float values. +/// \param __a +/// A 256-bit vector of [8 x float] containing the values to be +/// moved. static __inline void __DEFAULT_FN_ATTRS _mm256_store_ps(float *__p, __m256 __a) { *(__m256 *)__p = __a; } +/// \brief Moves packed double-precision values from a 256-bit vector of [4 x +/// double] to an unaligned memory location. 
+/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVUPD instruction. +/// +/// \param __p +/// A pointer to a memory location that will receive the +/// double-precision values. +/// \param __a +/// A 256-bit vector of [4 x double] containing the values to be +/// moved. static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_pd(double *__p, __m256d __a) { __builtin_ia32_storeupd256(__p, (__v4df)__a); } +/// \brief Moves packed float values from a 256-bit vector of [8 x float] to an +/// unaligned memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVUPS instruction. +/// +/// \param __p +/// A pointer to a memory location that will receive the float +/// values. +/// \param __a +/// A 256-bit vector of [8 x float] containing the values to be +/// moved. static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_ps(float *__p, __m256 __a) { __builtin_ia32_storeups256(__p, (__v8sf)__a); } +/// \brief Moves integer values from a 256-bit integer vector to an aligned +/// memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVDQA instruction. +/// +/// \param __p +/// A 32-byte aligned pointer to a memory location that will +/// receive the integer values. +/// \param __a +/// A 256-bit integer vector containing the values to be moved. static __inline void __DEFAULT_FN_ATTRS _mm256_store_si256(__m256i *__p, __m256i __a) { *__p = __a; } +/// \brief Moves integer values from a 256-bit integer vector to an unaligned +/// memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVDQU instruction. +/// +/// \param __p +/// A pointer to a memory location that will receive the integer +/// values. +/// \param __a +/// A 256-bit integer vector containing the values to be moved. 
static __inline void __DEFAULT_FN_ATTRS _mm256_storeu_si256(__m256i *__p, __m256i __a) { @@ -822,12 +3177,48 @@ } /* Conditional load ops */ +/// \brief Loads packed double-precision values from a memory location storing +/// 64-bit double-precision values to a 128-bit vector of [2 x double], +/// according to the specified mask. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VMASKMOVPD instruction. +/// +/// \param __p +/// A pointer to a memory location that contains the +/// double-precision values. +/// \param __m +/// A 128-bit integer vector of [2 x i64] containing the mask. The +/// most significant bit of each data element represents the +/// mask bits. If a mask bit is zero, the corresponding value in +/// the memory location is not loaded and the corresponding +/// field in the destination vector is set to zero. +/// \returns A 128-bit vector of [2 x double] containing the loaded values. static __inline __m128d __DEFAULT_FN_ATTRS _mm_maskload_pd(double const *__p, __m128i __m) { return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m); } +/// \brief Loads packed double-precision values from a memory location storing +/// 64-bit double-precision values to a 256-bit vector of [4 x double], +/// according to the specified mask. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VMASKMOVPD instruction. +/// +/// \param __p +/// A pointer to a memory location that contains the +/// double-precision values. +/// \param __m +/// A 256-bit integer vector of [4 x i64] containing the mask. The +/// most significant bit of each data element represents the +/// mask bits. If a mask bit is zero, the corresponding value in +/// the memory location is not loaded and the corresponding +/// field in the destination vector is set to zero. +/// \returns A 256-bit vector of [4 x double] containing the loaded values. 
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_maskload_pd(double const *__p, __m256i __m) { @@ -835,12 +3226,50 @@ (__v4di)__m); } +/// \brief Loads packed float values from a memory location storing 32-bit +/// float values to a 128-bit vector of [4 x float], according to the +/// specified +/// mask. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VMASKMOVPS instruction. +/// +/// \param __p +/// A pointer to a memory location that contains the float +/// values. +/// \param __m +/// A 128-bit integer vector of [4 x i32] containing the mask. The +/// most significant bit of each data element represents the +/// mask bits. If a mask bit is zero, the corresponding value in +/// the memory location is not loaded and the corresponding +/// field in the destination vector is set to zero. +/// \returns A 128-bit vector of [4 x float] containing the loaded values. static __inline __m128 __DEFAULT_FN_ATTRS _mm_maskload_ps(float const *__p, __m128i __m) { return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m); } +/// \brief Loads packed float values from a memory location storing 32-bit +/// float values to a 256-bit vector of [8 x float], according to the +/// specified +/// mask. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VMASKMOVPS instruction. +/// +/// \param __p +/// A pointer to a memory location that contains the float +/// values. +/// \param __m +/// A 256-bit integer vector of [8 x i32] containing the mask. The +/// most significant bit of each data element represents the +/// mask bits. If a mask bit is zero, the corresponding value in +/// the memory location is not loaded and the corresponding +/// field in the destination vector is set to zero. +/// \returns A 256-bit vector of [8 x float] containing the loaded values. 
static __inline __m256 __DEFAULT_FN_ATTRS _mm256_maskload_ps(float const *__p, __m256i __m) { @@ -848,24 +3277,104 @@ } /* Conditional store ops */ +/// \brief Moves packed float values from a 256-bit vector of [8 x float] to a +/// memory location, according to the specified mask. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VMASKMOVPS instruction. +/// +/// \param __p +/// A pointer to a memory location that will receive the float +/// values. +/// \param __m +/// A 256-bit integer vector of [8 x i32] containing the mask. The +/// most significant bit of each field in the mask vector +/// represents the mask bits. If a mask bit is zero, the +/// corresponding value from vector __a is +/// not stored and the corresponding field in the destination +/// memory location is not changed. +/// \param __a +/// A 256-bit vector of [8 x float] containing the values to be +/// stored. static __inline void __DEFAULT_FN_ATTRS _mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a) { __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a); } +/// \brief Moves packed double-precision values from a 128-bit vector of [2 x +/// double] to a memory location, according to the specified mask. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VMASKMOVPD instruction. +/// +/// \param __p +/// A pointer to a memory location that will receive the +/// double-precision values. +/// \param __m +/// A 128-bit integer vector of [2 x i64] containing the mask. The +/// most significant bit of each field in the mask vector +/// represents the mask bits. If a mask bit is zero, the +/// corresponding value from vector __a is +/// not stored and the corresponding field in the destination +/// memory location is not changed. +/// \param __a +/// A 128-bit vector of [2 x double] containing the values to be +/// stored. 
static __inline void __DEFAULT_FN_ATTRS _mm_maskstore_pd(double *__p, __m128i __m, __m128d __a) { __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a); } +/// \brief Moves packed double-precision values from a 256-bit vector of [4 x +/// double] to a memory location, according to the specified mask. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VMASKMOVPD instruction. +/// +/// \param __p +/// A pointer to a memory location that will receive the +/// double-precision values. +/// \param __m +/// A 256-bit integer vector of [4 x i64] containing the mask. The +/// most significant bit of each field in the mask vector +/// represents the mask bits. If a mask bit is zero, the +/// corresponding value from vector __a is +/// not stored and the corresponding field in the destination +/// memory location is not changed. +/// \param __a +/// A 256-bit vector of [4 x double] containing the values to be +/// stored. static __inline void __DEFAULT_FN_ATTRS _mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a) { __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a); } +/// \brief Moves packed float values from a 128-bit vector of [4 x float] to a +/// memory location, according to the specified mask. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VMASKMOVPS instruction. +/// +/// \param __p +/// A pointer to a memory location that will receive the float +/// values. +/// \param __m +/// A 128-bit integer vector of [4 x i32] containing the mask. The +/// most significant bit of each field in the mask vector +/// represents the mask bits. If a mask bit is zero, the +/// corresponding value from vector __a is +/// not stored and the corresponding field in the destination +/// memory location is not changed. +/// \param __a +/// A 128-bit vector of [4 x float] containing the values to be +/// stored. 
static __inline void __DEFAULT_FN_ATTRS _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a) { @@ -873,18 +3382,60 @@ } /* Cacheability support ops */ +/// \brief Moves packed integer values from a 256-bit integer vector to a +/// 256-bit aligned memory location. To minimize caching, the data is flagged +/// as +/// non-temporal (unlikely to be used again soon). +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VMOVNTDQ instruction. +/// +/// \param __a +/// A 256-bit aligned pointer to a memory location that will +/// receive the integer values. +/// \param __b +/// A 256-bit integer vector containing the values to be moved. static __inline void __DEFAULT_FN_ATTRS _mm256_stream_si256(__m256i *__a, __m256i __b) { __builtin_ia32_movntdq256((__v4di *)__a, (__v4di)__b); } +/// \brief Moves packed double-precision values from a 256-bit vector of [4 x +/// double] to a 256-bit aligned memory location. To minimize caching, the +/// data is flagged as non-temporal (unlikely to be used again soon). +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VMOVNTPD instruction. +/// +/// \param __a +/// A 256-bit aligned pointer to a memory location that will +/// receive the double-precision values. +/// \param __b +/// A 256-bit vector of [4 x double] containing the values to be +/// moved. static __inline void __DEFAULT_FN_ATTRS _mm256_stream_pd(double *__a, __m256d __b) { __builtin_ia32_movntpd256(__a, (__v4df)__b); } +/// \brief Moves packed float values from a 256-bit vector of [8 x float] to a +/// 256-bit aligned memory location. To minimize caching, the data is +/// flagged as non-temporal (unlikely to be used again soon). +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VMOVNTPS instruction. +/// +/// \param __p +/// A 256-bit aligned pointer to a memory location that will +/// receive the float values. +/// \param __a +/// A 256-bit vector of [8 x float] containing the values to be +/// moved. 
static __inline void __DEFAULT_FN_ATTRS _mm256_stream_ps(float *__p, __m256 __a) { @@ -910,12 +3461,62 @@ return (__m256i)__builtin_ia32_undef256(); } +/// \brief Initializes a 256-bit vector of [4 x double] with the specified +/// 64-bit double-precision values. +/// +/// \headerfile +/// +/// \param __a +/// A double-precision value used to initialize bits [255:192] +/// of the destination vector of [4 x double]. +/// \param __b +/// A double-precision value used to initialize bits [191:128] +/// of the destination vector of [4 x double]. +/// \param __c +/// A double-precision value used to initialize bits [127:64] of +/// the destination vector of [4 x double]. +/// \param __d +/// A double-precision value used to initialize bits [63:0] of +/// the destination vector of [4 x double]. +/// \returns An initialized 256-bit vector of [4 x double] containing the +/// values provided in the operands. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_set_pd(double __a, double __b, double __c, double __d) { return (__m256d){ __d, __c, __b, __a }; } +/// \brief Initializes a 256-bit vector of [8 x float] with the specified +/// 32-bit float values. +/// +/// \headerfile +/// +/// \param __a +/// A float value used to initialize the bits [255:224] of the +/// destination vector of [8 x float]. +/// \param __b +/// A float value used to initialize the bits [223:192] of the +/// destination vector of [8 x float]. +/// \param __c +/// A float value used to initialize the bits [191:160] of the +/// destination vector of [8 x float]. +/// \param __d +/// A float value used to initialize the bits [159:128] of the +/// destination vector of [8 x float]. +/// \param __e +/// A float value used to initialize the bits [127:96] of the +/// destination vector of [8 x float]. +/// \param __f +/// A float value used to initialize the bits [95:64] of the +/// destination vector of [8 x float]. 
+/// \param __g +/// A float value used to initialize the bits [63:32] of the +/// destination vector of [8 x float]. +/// \param __h +/// A float value used to initialize the bits [31:0] of the +/// destination vector of [8 x float]. +/// \returns An initialized 256-bit vector of [8 x float] containing the values +/// provided in the operands. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_set_ps(float __a, float __b, float __c, float __d, float __e, float __f, float __g, float __h) @@ -923,6 +3524,37 @@ return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a }; } +/// \brief Initializes a 256-bit integer vector with the specified integer +/// values. +/// +/// \headerfile +/// +/// \param __i0 +/// A 32-bit integer value used to initialize bits [255:224] of +/// the destination vector. +/// \param __i1 +/// A 32-bit integer value used to initialize bits [223:192] of +/// the destination vector. +/// \param __i2 +/// A 32-bit integer value used to initialize bits [191:160] of +/// the destination vector. +/// \param __i3 +/// A 32-bit integer value used to initialize bits [159:128] of +/// the destination vector. +/// \param __i4 +/// A 32-bit integer value used to initialize bits [127:96] of +/// the destination vector. +/// \param __i5 +/// A 32-bit integer value used to initialize bits [95:64] of +/// the destination vector. +/// \param __i6 +/// A 32-bit integer value used to initialize bits [63:32] of +/// the destination vector. +/// \param __i7 +/// A 32-bit integer value used to initialize bits [31:0] of the +/// destination vector. +/// \returns An initialized 256-bit integer vector containing the values +/// provided in the operands. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_epi32(int __i0, int __i1, int __i2, int __i3, int __i4, int __i5, int __i6, int __i7) @@ -930,6 +3562,60 @@ return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 }; } +/// \brief Initializes a 256-bit integer vector with the specified short values. 
+/// +/// \headerfile +/// +/// \param __w15 +/// A 16-bit integer value used to initialize bits [255:240] of +/// the destination vector. +/// \param __w14 +/// A 16-bit integer value used to initialize bits [239:224] of +/// the destination vector. +/// \param __w13 +/// A 16-bit integer value used to initialize bits [223:208] of +/// the destination vector. +/// \param __w12 +/// A 16-bit integer value used to initialize bits [207:192] of +/// the destination vector. +/// \param __w11 +/// A 16-bit integer value used to initialize bits [191:176] of +/// the destination vector. +/// \param __w10 +/// A 16-bit integer value used to initialize bits [175:160] of +/// the destination vector. +/// \param __w09 +/// A 16-bit integer value used to initialize bits [159:144] of +/// the destination vector. +/// \param __w08 +/// A 16-bit integer value used to initialize bits [143:128] of +/// the destination vector. +/// \param __w07 +/// A 16-bit integer value used to initialize bits [127:112] of +/// the destination vector. +/// \param __w06 +/// A 16-bit integer value used to initialize bits [111:96] of +/// the destination vector. +/// \param __w05 +/// A 16-bit integer value used to initialize bits [95:80] of +/// the destination vector. +/// \param __w04 +/// A 16-bit integer value used to initialize bits [79:64] of +/// the destination vector. +/// \param __w03 +/// A 16-bit integer value used to initialize bits [63:48] of +/// the destination vector. +/// \param __w02 +/// A 16-bit integer value used to initialize bits [47:32] of +/// the destination vector. +/// \param __w01 +/// A 16-bit integer value used to initialize bits [31:16] of +/// the destination vector. +/// \param __w00 +/// A 16-bit integer value used to initialize bits [15:0] of the +/// destination vector. +/// \returns An initialized 256-bit integer vector containing the values +/// provided in the operands. 
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12, short __w11, short __w10, short __w09, short __w08, @@ -940,6 +3626,76 @@ __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 }; } +/// \brief Initializes a 256-bit integer vector with the specified char values. +/// +/// \headerfile +/// +/// \param __b31 +/// Initializes bits [255:248] of the destination vector. +/// \param __b30 +/// Initializes bits [247:240] of the destination vector. +/// \param __b29 +/// Initializes bits [239:232] of the destination vector. +/// \param __b28 +/// Initializes bits [231:224] of the destination vector. +/// \param __b27 +/// Initializes bits [223:216] of the destination vector. +/// \param __b26 +/// Initializes bits [215:208] of the destination vector. +/// \param __b25 +/// Initializes bits [207:200] of the destination vector. +/// \param __b24 +/// Initializes bits [199:192] of the destination vector. +/// \param __b23 +/// Initializes bits [191:184] of the destination vector. +/// \param __b22 +/// Initializes bits [183:176] of the destination vector. +/// \param __b21 +/// Initializes bits [175:168] of the destination vector. +/// \param __b20 +/// Initializes bits [167:160] of the destination vector. +/// \param __b19 +/// Initializes bits [159:152] of the destination vector. +/// \param __b18 +/// Initializes bits [151:144] of the destination vector. +/// \param __b17 +/// Initializes bits [143:136] of the destination vector. +/// \param __b16 +/// Initializes bits [135:128] of the destination vector. +/// \param __b15 +/// Initializes bits [127:120] of the destination vector. +/// \param __b14 +/// Initializes bits [119:112] of the destination vector. +/// \param __b13 +/// Initializes bits [111:104] of the destination vector. +/// \param __b12 +/// Initializes bits [103:96] of the destination vector. +/// \param __b11 +/// Initializes bits [95:88] of the destination vector. 
+/// \param __b10 +/// Initializes bits [87:80] of the destination vector. +/// \param __b09 +/// Initializes bits [79:72] of the destination vector. +/// \param __b08 +/// Initializes bits [71:64] of the destination vector. +/// \param __b07 +/// Initializes bits [63:56] of the destination vector. +/// \param __b06 +/// Initializes bits [55:48] of the destination vector. +/// \param __b05 +/// Initializes bits [47:40] of the destination vector. +/// \param __b04 +/// Initializes bits [39:32] of the destination vector. +/// \param __b03 +/// Initializes bits [31:24] of the destination vector. +/// \param __b02 +/// Initializes bits [23:16] of the destination vector. +/// \param __b01 +/// Initializes bits [15:8] of the destination vector. +/// \param __b00 +/// Initializes bits [7:0] of the destination vector. +/// \returns An initialized 256-bit integer vector containing the values +/// provided in the operands. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28, char __b27, char __b26, char __b25, char __b24, @@ -958,6 +3714,25 @@ }; } +/// \brief Initializes a 256-bit integer vector with the specified 64-bit +/// integer values. +/// +/// \headerfile +/// +/// \param __a +/// A 64-bit integer value used to initialize bits [255:192] of +/// the destination vector of [4 x i64]. +/// \param __b +/// A 64-bit integer value used to initialize bits [191:128] of +/// the destination vector of [4 x i64]. +/// \param __c +/// A 64-bit integer value used to initialize bits [127:64] of +/// the destination vector of [4 x i64]. +/// \param __d +/// A 64-bit integer value used to initialize bits [63:0] of the +/// destination vector of [4 x i64]. +/// \returns An initialized 256-bit integer vector containing the values +/// provided in the operands. 
static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d) { @@ -965,12 +3740,66 @@ } /* Create vectors with elements in reverse order */ +/// \brief Initializes a 256-bit vector of [4 x double] with the specified +/// 64-bit double-precision values, storing the first two operands in the +/// lower +/// bits and the second two operands in the upper bits. +/// +/// \headerfile +/// +/// \param __a +/// A double-precision value used to initialize bits [63:0] of +/// the destination vector of [4 x double]. +/// \param __b +/// A double-precision value used to initialize bits [127:64] of +/// the destination vector of [4 x double]. +/// \param __c +/// A double-precision value used to initialize bits [191:128] +/// of the destination vector of [4 x double]. +/// \param __d +/// A double-precision value used to initialize bits [255:192] +/// of the destination vector of [4 x double]. +/// \returns An initialized 256-bit vector of [4 x double] containing the +/// values provided in the operands. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_setr_pd(double __a, double __b, double __c, double __d) { return (__m256d){ __a, __b, __c, __d }; } +/// \brief Initializes a 256-bit vector of [8 x float] with the specified +/// 32-bit float values, storing the first 4 operands in the lower bits and +/// the +/// second 4 operands in the upper bits. +/// +/// \headerfile +/// +/// \param __a +/// A float value used to initialize the bits [31:0] of the +/// destination vector of [8 x float]. +/// \param __b +/// A float value used to initialize the bits [63:32] of the +/// destination vector of [8 x float]. +/// \param __c +/// A float value used to initialize the bits [95:64] of the +/// destination vector of [8 x float]. +/// \param __d +/// A float value used to initialize the bits [127:96] of the +/// destination vector of [8 x float]. 
+/// \param __e +/// A float value used to initialize the bits [159:128] of the +/// destination vector of [8 x float]. +/// \param __f +/// A float value used to initialize the bits [191:160] of the +/// destination vector of [8 x float]. +/// \param __g +/// A float value used to initialize the bits [223:192] of the +/// destination vector of [8 x float]. +/// \param __h +/// A float value used to initialize the bits [255:224] of the +/// destination vector of [8 x float]. +/// \returns An initialized 256-bit vector of [8 x float] containing the values +/// provided in the operands. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_setr_ps(float __a, float __b, float __c, float __d, float __e, float __f, float __g, float __h) @@ -978,6 +3807,38 @@ return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h }; } +/// \brief Initializes a 256-bit integer vector with the specified integer +/// values, storing the first 4 operands in the lower bits and the second +/// 4 operands in the upper bits. +/// +/// \headerfile +/// +/// \param __i0 +/// A 32-bit integer value used to initialize bits [31:0] of the +/// destination vector. +/// \param __i1 +/// A 32-bit integer value used to initialize bits [63:32] of +/// the destination vector. +/// \param __i2 +/// A 32-bit integer value used to initialize bits [95:64] of +/// the destination vector. +/// \param __i3 +/// A 32-bit integer value used to initialize bits [127:96] of +/// the destination vector. +/// \param __i4 +/// A 32-bit integer value used to initialize bits [159:128] of +/// the destination vector. +/// \param __i5 +/// A 32-bit integer value used to initialize bits [191:160] of +/// the destination vector. +/// \param __i6 +/// A 32-bit integer value used to initialize bits [223:192] of +/// the destination vector. +/// \param __i7 +/// A 32-bit integer value used to initialize bits [255:224] of +/// the destination vector. 
+/// \returns An initialized 256-bit integer vector containing the values +/// provided in the operands. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3, int __i4, int __i5, int __i6, int __i7) @@ -985,6 +3846,62 @@ return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 }; } +/// \brief Initializes a 256-bit integer vector with the specified short values, +/// storing the first 8 operands in the lower bits and the second 8 +/// operands in the upper bits. +/// +/// \headerfile +/// +/// \param __w15 +/// A 16-bit integer value used to initialize bits [15:0] of the +/// destination vector. +/// \param __w14 +/// A 16-bit integer value used to initialize bits [31:16] of +/// the destination vector. +/// \param __w13 +/// A 16-bit integer value used to initialize bits [47:32] of +/// the destination vector. +/// \param __w12 +/// A 16-bit integer value used to initialize bits [63:48] of +/// the destination vector. +/// \param __w11 +/// A 16-bit integer value used to initialize bits [79:64] of +/// the destination vector. +/// \param __w10 +/// A 16-bit integer value used to initialize bits [95:80] of +/// the destination vector. +/// \param __w09 +/// A 16-bit integer value used to initialize bits [111:96] of +/// the destination vector. +/// \param __w08 +/// A 16-bit integer value used to initialize bits [127:112] of +/// the destination vector. +/// \param __w07 +/// A 16-bit integer value used to initialize bits [143:128] of +/// the destination vector. +/// \param __w06 +/// A 16-bit integer value used to initialize bits [159:144] of +/// the destination vector. +/// \param __w05 +/// A 16-bit integer value used to initialize bits [175:160] of +/// the destination vector. +/// \param __w04 +/// A 16-bit integer value used to initialize bits [191:176] of +/// the destination vector. +/// \param __w03 +/// A 16-bit integer value used to initialize bits [207:192] of +/// the destination vector. 
+/// \param __w02 +/// A 16-bit integer value used to initialize bits [223:208] of +/// the destination vector. +/// \param __w01 +/// A 16-bit integer value used to initialize bits [239:224] of +/// the destination vector. +/// \param __w00 +/// A 16-bit integer value used to initialize bits [255:240] of +/// the destination vector. +/// \returns An initialized 256-bit integer vector containing the values +/// provided in the operands. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12, short __w11, short __w10, short __w09, short __w08, @@ -995,6 +3912,78 @@ __w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 }; } +/// \brief Initializes a 256-bit integer vector with the specified char values, +/// storing the first 16 operands in the lower bits and the second 16 +/// operands in the upper bits. +/// +/// \headerfile +/// +/// \param __b31 +/// Initializes bits [7:0] of the destination vector. +/// \param __b30 +/// Initializes bits [15:8] of the destination vector. +/// \param __b29 +/// Initializes bits [23:16] of the destination vector. +/// \param __b28 +/// Initializes bits [31:24] of the destination vector. +/// \param __b27 +/// Initializes bits [39:32] of the destination vector. +/// \param __b26 +/// Initializes bits [47:40] of the destination vector. +/// \param __b25 +/// Initializes bits [55:48] of the destination vector. +/// \param __b24 +/// Initializes bits [63:56] of the destination vector. +/// \param __b23 +/// Initializes bits [71:64] of the destination vector. +/// \param __b22 +/// Initializes bits [79:72] of the destination vector. +/// \param __b21 +/// Initializes bits [87:80] of the destination vector. +/// \param __b20 +/// Initializes bits [95:88] of the destination vector. +/// \param __b19 +/// Initializes bits [103:96] of the destination vector. +/// \param __b18 +/// Initializes bits [111:104] of the destination vector. 
+/// \param __b17 +/// Initializes bits [119:112] of the destination vector. +/// \param __b16 +/// Initializes bits [127:120] of the destination vector. +/// \param __b15 +/// Initializes bits [135:128] of the destination vector. +/// \param __b14 +/// Initializes bits [143:136] of the destination vector. +/// \param __b13 +/// Initializes bits [151:144] of the destination vector. +/// \param __b12 +/// Initializes bits [159:152] of the destination vector. +/// \param __b11 +/// Initializes bits [167:160] of the destination vector. +/// \param __b10 +/// Initializes bits [175:168] of the destination vector. +/// \param __b09 +/// Initializes bits [183:176] of the destination vector. +/// \param __b08 +/// Initializes bits [191:184] of the destination vector. +/// \param __b07 +/// Initializes bits [199:192] of the destination vector. +/// \param __b06 +/// Initializes bits [207:200] of the destination vector. +/// \param __b05 +/// Initializes bits [215:208] of the destination vector. +/// \param __b04 +/// Initializes bits [223:216] of the destination vector. +/// \param __b03 +/// Initializes bits [231:224] of the destination vector. +/// \param __b02 +/// Initializes bits [239:232] of the destination vector. +/// \param __b01 +/// Initializes bits [247:240] of the destination vector. +/// \param __b00 +/// Initializes bits [255:248] of the destination vector. +/// \returns An initialized 256-bit integer vector containing the values +/// provided in the operands. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28, char __b27, char __b26, char __b25, char __b24, @@ -1012,6 +4001,26 @@ __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 }; } +/// \brief Initializes a 256-bit integer vector with the specified 64-bit +/// integer values, storing the first two operands in the lower bits and the +/// second two operands in the upper bits. 
+/// +/// \headerfile +/// +/// \param __a +/// A 64-bit integer value used to initialize bits [63:0] of the +/// destination vector of [4 x i64]. +/// \param __b +/// A 64-bit integer value used to initialize bits [127:64] of +/// the destination vector of [4 x i64]. +/// \param __c +/// A 64-bit integer value used to initialize bits [191:128] of +/// the destination vector of [4 x i64]. +/// \param __d +/// A 64-bit integer value used to initialize bits [255:192] of +/// the destination vector of [4 x i64]. +/// \returns An initialized 256-bit integer vector containing the values +/// provided in the operands. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d) { @@ -1019,24 +4028,62 @@ } /* Create vectors with repeated elements */ +/// \brief Initializes a 256-bit vector of [4 x double] with the specified +/// 64-bit double-precision value. +/// +/// \headerfile +/// +/// \param __w +/// Double-precision value used to initialize the destination +/// vector of [4 x double]. +/// \returns An initialized 256-bit vector of [4 x double] containing the value +/// provided in the operand. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_set1_pd(double __w) { return (__m256d){ __w, __w, __w, __w }; } +/// \brief Initializes a 256-bit vector of [8 x float] with the specified +/// 32-bit float value. +/// +/// \headerfile +/// +/// \param __w +/// Float value used to initialize the destination vector of [8 +/// x float]. +/// \returns An initialized 256-bit vector of [8 x float] containing the value +/// provided in the operand. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_set1_ps(float __w) { return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w }; } +/// \brief Initializes a 256-bit integer vector with the specified integer value. +/// +/// \headerfile +/// +/// \param __i +/// Integer value used to initialize the destination integer +/// vector. 
+/// \returns An initialized 256-bit integer vector containing the value provided +/// in the operand. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi32(int __i) { return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i }; } +/// \brief Initializes a 256-bit integer vector with the specified short value. +/// +/// \headerfile +/// +/// \param __w +/// Short value used to initialize the destination integer +/// vector. +/// \returns An initialized 256-bit integer vector containing the value provided +/// in the operand. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi16(short __w) { @@ -1044,6 +4091,15 @@ __w, __w, __w, __w, __w, __w }; } +/// \brief Initializes a 256-bit integer vector with the specified char value. +/// +/// \headerfile +/// +/// \param __b +/// Char value used to initialize the destination integer +/// vector. +/// \returns An initialized 256-bit integer vector containing the value provided +/// in the operand. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi8(char __b) { @@ -1052,6 +4108,16 @@ __b, __b, __b, __b, __b, __b, __b }; } +/// \brief Initializes a 256-bit integer vector with the specified 64-bit +/// integer value. +/// +/// \headerfile +/// +/// \param __q +/// 64-bit integer value used to initialize the destination +/// integer vector. +/// \returns An initialized 256-bit integer vector containing the value provided +/// in the operand. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi64x(long long __q) { @@ -1059,18 +4125,38 @@ } /* Create __zeroed vectors */ +/// \brief Sets the 256-bit YMM register to zero, or creates a 256-bit vector +/// of [4 x double] with all elements initialized to zero. +/// +/// \headerfile +/// +/// \returns An initialized 256-bit vector of [4 x double] with all elements set +/// to zero.
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_setzero_pd(void) { return (__m256d){ 0, 0, 0, 0 }; } +/// \brief Sets the 256-bit YMM register to zero, or creates a 256-bit vector +/// of [8 x float] with all elements initialized to zero. +/// +/// \headerfile +/// +/// \returns An initialized 256-bit vector of [8 x float] with all elements set +/// to zero. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_setzero_ps(void) { return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 }; } +/// \brief Sets the 256-bit YMM register to zero, or creates a 256-bit vector +/// of [4 x i64] with all elements initialized to zero. +/// +/// \headerfile +/// +/// \returns An initialized 256-bit integer vector with all elements set to zero. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setzero_si256(void) { @@ -1078,72 +4164,183 @@ } /* Cast between vector types */ +/// \brief Casts 64-bit double-precision values as 32-bit float values. +/// +/// \headerfile +/// +/// \param __a +/// A 256-bit vector of [4 x double] to be cast as float values. +/// \returns A 256-bit vector of [8 x float] containing the typecast values +/// provided in the operand. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_castpd_ps(__m256d __a) { return (__m256)__a; } +/// \brief Casts 64-bit double-precision values as integer values. +/// +/// \headerfile +/// +/// \param __a +/// A 256-bit vector of [4 x double] to be cast as integer +/// values. +/// \returns A 256-bit integer vector containing the typecast values provided +/// in the operand. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_castpd_si256(__m256d __a) { return (__m256i)__a; } +/// \brief Casts 32-bit float values as 64-bit double-precision values. +/// +/// \headerfile +/// +/// \param __a +/// A 256-bit vector of [8 x float] to be cast as +/// double-precision values. +/// \returns A 256-bit vector of [4 x double] containing the typecast values +/// provided in the operand. 
static __inline __m256d __DEFAULT_FN_ATTRS _mm256_castps_pd(__m256 __a) { return (__m256d)__a; } +/// \brief Casts 32-bit float values as integer values. +/// +/// \headerfile +/// +/// \param __a +/// A 256-bit vector of [8 x float] to be cast as integer +/// values. +/// \returns A 256-bit integer vector containing the typecast values provided +/// in the operand. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_castps_si256(__m256 __a) { return (__m256i)__a; } +/// \brief Casts integer values as 32-bit float values. +/// +/// \headerfile +/// +/// \param __a +/// A 256-bit integer vector to be cast as float values. +/// \returns A 256-bit vector of [8 x float] containing the typecast values +/// provided in the operand. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_castsi256_ps(__m256i __a) { return (__m256)__a; } +/// \brief Casts integer values as 64-bit double-precision values. +/// +/// \headerfile +/// +/// \param __a +/// A 256-bit integer vector to be cast as double-precision +/// values. +/// \returns A 256-bit vector of [4 x double] containing the typecast values +/// provided in the operand. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_castsi256_pd(__m256i __a) { return (__m256d)__a; } +/// \brief Casts a 256-bit vector of [4 x double] as a 128-bit vector of [2 x +/// double]. +/// +/// \headerfile +/// +/// \param __a +/// A 256-bit vector of [4 x double] to be cast as double +/// values. The lower 128 bits of this vector are used. +/// \returns A 128-bit vector of [2 x double] containing the typecast values +/// provided in the operand. static __inline __m128d __DEFAULT_FN_ATTRS _mm256_castpd256_pd128(__m256d __a) { return __builtin_shufflevector(__a, __a, 0, 1); } +/// \brief Casts a 256-bit vector of [8 x float] as a 128-bit vector of [4 x +/// float]. +/// +/// \headerfile +/// +/// \param __a +/// A 256-bit vector of [8 x float] to be cast as float values. +/// The lower 128 bits of this vector are used.
+/// \returns A 128-bit vector of [4 x float] containing the typecast values +/// provided in the operand. static __inline __m128 __DEFAULT_FN_ATTRS _mm256_castps256_ps128(__m256 __a) { return __builtin_shufflevector(__a, __a, 0, 1, 2, 3); } +/// \brief Casts a 256-bit integer vector as a 128-bit integer vector. +/// +/// \headerfile +/// +/// \param __a +/// A 256-bit integer vector to be cast as integer values. +/// The lower 128 bits of this vector are used. +/// \returns A 128-bit integer vector containing the typecast values provided +/// in the operand. static __inline __m128i __DEFAULT_FN_ATTRS _mm256_castsi256_si128(__m256i __a) { return __builtin_shufflevector(__a, __a, 0, 1); } +/// \brief Casts a 128-bit vector of [2 x double] as a 256-bit vector of [4 x +/// double]. +/// +/// \headerfile +/// +/// \param __a +/// A 128-bit vector of [2 x double] to be cast as double +/// values. The upper 128 bits of the destination vector are +/// undefined. +/// \returns A 256-bit vector of [4 x double] containing the typecast values +/// provided in the operand in the lower 128 bits. static __inline __m256d __DEFAULT_FN_ATTRS _mm256_castpd128_pd256(__m128d __a) { return __builtin_shufflevector(__a, __a, 0, 1, -1, -1); } +/// \brief Casts a 128-bit vector of [4 x float] as a 256-bit vector of [8 x +/// float]. +/// +/// \headerfile +/// +/// \param __a +/// A 128-bit vector of [4 x float] to be cast as float values. +/// The upper 128 bits of the destination vector are undefined. +/// \returns A 256-bit vector of [8 x float] containing the typecast values +/// provided in the operand in the lower 128 bits. static __inline __m256 __DEFAULT_FN_ATTRS _mm256_castps128_ps256(__m128 __a) { return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1); } +/// \brief Casts a 128-bit integer vector as a 256-bit integer vector. +/// +/// \headerfile +/// +/// \param __a +/// A 128-bit integer vector to be cast as integer values.
The +/// upper 128 bits of the destination vector are undefined. +/// \returns A 256-bit integer vector containing the typecast values provided +/// in the operand in the lower 128 bits. static __inline __m256i __DEFAULT_FN_ATTRS _mm256_castsi128_si256(__m128i __a) { @@ -1155,6 +4352,34 @@ We use macros rather than inlines because we only want to accept invocations where the immediate M is a constant expression. */ +/// \brief Combines 128 bits of packed data from the 256-bit vector operand of +/// [8 x float] with 128 bits of packed data from the 128-bit vector operand +/// of [4 x float], using the offset specified by the integer operand, and +/// copies them to the destination. +/// +/// \headerfile +/// +/// \code +/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M); +/// \endcode +/// +/// This intrinsic corresponds to \c VINSERTF128 instruction. +/// +/// \param V1 +/// A 256-bit vector of [8 x float] values. The remaining bits +/// in the destination are copied from the corresponding bits in +/// this operand. +/// \param V2 +/// A 128-bit vector of [4 x float] values. The bits of this +/// operand are written to the destination beginning at the +/// offset specified by operand M. +/// \param M +/// An immediate integer used to determine which bits in the +/// destination are used when copying the bits from operand V2: +/// If bit [0] is 0, bits [127:0] are used in the destination. +/// If bit [0] is 1, bits [255:128] are used in the destination. +/// \returns A 256-bit vector of [8 x float] containing the copied packed data +/// from the operands. #define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \ (__m256)__builtin_shufflevector( \ (__v8sf)(__m256)(V1), \ @@ -1168,6 +4393,35 @@ (((M) & 1) ? 10 : 6), \ (((M) & 1) ? 
11 : 7) );}) +/// \brief Combines 128 bits of packed data from the 256-bit vector operand of +/// [4 x double] with 128 bits of packed data from the 128-bit vector +/// operand +/// of [2 x double], using the offset specified by the integer operand, +/// and copies them to the destination. +/// +/// \headerfile +/// +/// \code +/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M); +/// \endcode +/// +/// This intrinsic corresponds to \c VINSERTF128 instruction. +/// +/// \param V1 +/// A 256-bit vector of [4 x double] values. The remaining bits +/// in the destination are copied from the corresponding bits in +/// this operand. +/// \param V2 +/// A 128-bit vector of [2 x double] values. The bits of this +/// operand are written to the destination beginning at the +/// offset specified by operand M. +/// \param M +/// An immediate integer used to determine which bits in the +/// destination are used when copying the bits from operand V2: +/// If bit [0] is 0, bits [127:0] are used in the destination. +/// If bit [0] is 1, bits [255:128] are used in the destination. +/// \returns A 256-bit vector of [4 x double] containing the copied packed data +/// from the operands. #define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \ (__m256d)__builtin_shufflevector( \ (__v4df)(__m256d)(V1), \ @@ -1177,6 +4431,34 @@ (((M) & 1) ? 4 : 2), \ (((M) & 1) ? 5 : 3) );}) +/// \brief Combines 128 bits of packed data from the 256-bit integer vector +/// operand with 128 bits of packed data from the 128-bit integer vector +/// operand, using the offset specified by the integer operand, and copies +/// them to the destination. +/// +/// \headerfile +/// +/// \code +/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M); +/// \endcode +/// +/// This intrinsic corresponds to \c VINSERTF128 instruction. +/// +/// \param V1 +/// A 256-bit integer vector. 
The remaining bits in the +/// destination are copied from the corresponding bits in this +/// operand. +/// \param V2 +/// A 128-bit integer vector. The bits of this operand are +/// written to the destination beginning at the offset specified +/// by operand M. +/// \param M +/// An immediate integer used to determine which bits in the +/// destination are used when copying the bits from operand V2: +/// If bit [0] is 0, bits [127:0] are used in the destination. +/// If bit [0] is 1, bits [255:128] are used in the destination. +/// \returns A 256-bit integer vector containing the copied packed data from +/// the operands. #define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \ (__m256i)__builtin_shufflevector( \ (__v4di)(__m256i)(V1), \ @@ -1191,6 +4473,27 @@ We use macros rather than inlines because we only want to accept invocations where the immediate M is a constant expression. */ +/// \brief Extracts 128 bits of packed data from a 256-bit vector of [8 x float] +/// and copies it to the destination, as specified by the integer operand. +/// +/// \headerfile +/// +/// \code +/// __m128 _mm256_extractf128_ps(__m256 V, const int M); +/// \endcode +/// +/// This intrinsic corresponds to \c VEXTRACTF128 instruction. +/// +/// \param V +/// A 256-bit vector of [8 x float] values. +/// \param M +/// An immediate integer used to determine which bits are +/// extracted: +/// If bit [0] is 0, bits [127:0] are copied to the destination. +/// If bit [0] is 1, bits [255:128] are copied to the +/// destination. +/// \returns A 128-bit vector of [4 x float] containing the extracted 128 bits +/// of packed data. #define _mm256_extractf128_ps(V, M) __extension__ ({ \ (__m128)__builtin_shufflevector( \ (__v8sf)(__m256)(V), \ @@ -1200,6 +4503,27 @@ (((M) & 1) ? 6 : 2), \ (((M) & 1) ? 7 : 3) );}) +/// \brief Extracts 128 bits of packed data from a 256-bit vector of [4 x double] +/// and copies it to the destination, as specified by the integer operand. 
+/// +/// \headerfile +/// +/// \code +/// __m128d _mm256_extractf128_pd(__m256d V, const int M); +/// \endcode +/// +/// This intrinsic corresponds to \c VEXTRACTF128 instruction. +/// +/// \param V +/// A 256-bit vector of [4 x double] values. +/// \param M +/// An immediate integer used to determine which bits are +/// extracted: +/// If bit [0] is 0, bits [127:0] are copied to the destination. +/// If bit [0] is 1, bits [255:128] are copied to the +/// destination. +/// \returns A 128-bit vector of [2 x double] containing the extracted 128 bits +/// of packed data. #define _mm256_extractf128_pd(V, M) __extension__ ({ \ (__m128d)__builtin_shufflevector( \ (__v4df)(__m256d)(V), \ @@ -1207,6 +4531,27 @@ (((M) & 1) ? 2 : 0), \ (((M) & 1) ? 3 : 1) );}) +/// \brief Extracts 128 bits of packed data from a 256-bit integer vector and +/// copies it to the destination, as specified by the integer operand. +/// +/// \headerfile +/// +/// \code +/// __m128i _mm256_extractf128_si256(__m256i V, const int M); +/// \endcode +/// +/// This intrinsic corresponds to \c VEXTRACTF128 instruction. +/// +/// \param V +/// A 256-bit integer vector. +/// \param M +/// An immediate integer used to determine which bits are +/// extracted: +/// If bit [0] is 0, bits [127:0] are copied to the destination. +/// If bit [0] is 1, bits [255:128] are copied to the +/// destination. +/// \returns A 128-bit integer vector containing the extracted 128 bits of +/// packed data. #define _mm256_extractf128_si256(V, M) __extension__ ({ \ (__m128i)__builtin_shufflevector( \ (__v4di)(__m256i)(V), \ Index: lib/Headers/bmiintrin.h =================================================================== --- lib/Headers/bmiintrin.h +++ lib/Headers/bmiintrin.h @@ -44,12 +44,36 @@ to use it as a potentially faster version of BSF. */ #define __RELAXED_FN_ATTRS __attribute__((__always_inline__, __nodebug__)) +/// \brief Counts the number of trailing zero bits in the operand. 
+/// +/// \headerfile +/// +/// This intrinsic corresponds to \c TZCNT instruction. +/// +/// \param __X +/// An unsigned 16-bit integer whose trailing zeros are to be +/// counted. +/// \returns An unsigned 16-bit integer containing the number of trailing zero +/// bits in the operand. static __inline__ unsigned short __RELAXED_FN_ATTRS __tzcnt_u16(unsigned short __X) { return __X ? __builtin_ctzs(__X) : 16; } +/// \brief Performs a bitwise AND of the second operand with the ones +/// complement of the first operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c ANDN instruction. +/// +/// \param __X +/// An unsigned integer containing one of the operands. +/// \param __Y +/// An unsigned integer containing one of the operands. +/// \returns An unsigned integer containing the bitwise AND of the second +/// operand with the ones complement of the first operand. static __inline__ unsigned int __DEFAULT_FN_ATTRS __andn_u32(unsigned int __X, unsigned int __Y) { @@ -57,6 +81,22 @@ } /* AMD-specified, double-leading-underscore version of BEXTR */ +/// \brief Extracts the specified bits from the first operand and puts them +/// into the least significant bits of the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c BEXTR instruction. +/// +/// \param __X +/// An unsigned integer whose bits are to be extracted. +/// \param __Y +/// An unsigned integer used to specify which bits are +/// extracted. Bits [7:0] specify the index of the least +/// significant bit. Bits [15:8] specify the number of bits to +/// be extracted. +/// \returns An unsigned integer whose least significant bits contain the +/// extracted bits. 
static __inline__ unsigned int __DEFAULT_FN_ATTRS __bextr_u32(unsigned int __X, unsigned int __Y) { @@ -64,30 +104,93 @@ } /* Intel-specified, single-leading-underscore version of BEXTR */ +/// \brief Extracts the specified bits from the first operand and puts them +/// into the least significant bits of the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c BEXTR instruction. +/// +/// \param __X +/// An unsigned integer whose bits are to be extracted. +/// \param __Y +/// An unsigned integer used to specify the index of the least +/// significant bit for the bits to be extracted. Bits [7:0] +/// specify the index. +/// \param __Z +/// An unsigned integer used to specify the number of bits to be +/// extracted. Bits [7:0] specify the number of bits. +/// \returns An unsigned integer whose least significant bits contain the +/// extracted bits. static __inline__ unsigned int __DEFAULT_FN_ATTRS _bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z) { return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8))); } +/// \brief Clears all bits in the source except for the least significant bit +/// containing a value of 1, and writes the result to the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c BLSI instruction. +/// +/// \param __X +/// An unsigned integer whose bits are to be cleared. +/// \returns An unsigned integer containing the result of clearing the bits +/// from the source operand. static __inline__ unsigned int __DEFAULT_FN_ATTRS __blsi_u32(unsigned int __X) { return __X & -__X; } +/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and +/// including the least significant bit that is set to 1 in the source +/// operand, and writes the result to the destination. For example, __X ^ +/// (__X-1). +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c BLSMSK instruction. +/// +/// \param __X +/// An unsigned integer used to create the mask.
+/// \returns An unsigned integer containing the newly created mask. static __inline__ unsigned int __DEFAULT_FN_ATTRS __blsmsk_u32(unsigned int __X) { return __X ^ (__X - 1); } +/// \brief Clears the least significant bit that is set to 1 in the source +/// operand, and writes the result to the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c BLSR instruction. +/// +/// \param __X +/// An unsigned integer containing the operand to be cleared. +/// \returns An unsigned integer containing the result of clearing the source +/// operand. static __inline__ unsigned int __DEFAULT_FN_ATTRS __blsr_u32(unsigned int __X) { return __X & (__X - 1); } +/// \brief Counts the number of trailing zero bits in the operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c TZCNT instruction. +/// +/// \param __X +/// An unsigned 32-bit integer whose trailing zeros are to be +/// counted. +/// \returns An unsigned 32-bit integer containing the number of trailing zero +/// bits in the operand. static __inline__ unsigned int __RELAXED_FN_ATTRS __tzcnt_u32(unsigned int __X) { @@ -103,6 +206,19 @@ #define _blsr_u64(a) (__blsr_u64((a))) #define _tzcnt_u64(a) (__tzcnt_u64((a))) +/// \brief Performs a bitwise AND of the second operand with the ones +/// complement of the first operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c ANDN instruction. +/// +/// \param __X +/// An unsigned 64-bit integer containing one of the operands. +/// \param __Y +/// An unsigned 64-bit integer containing one of the operands. +/// \returns An unsigned 64-bit integer containing the bitwise AND of the +/// second operand with the ones complement of the first operand.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS __andn_u64 (unsigned long long __X, unsigned long long __Y) { @@ -110,6 +226,22 @@ } /* AMD-specified, double-leading-underscore version of BEXTR */ +/// \brief Extracts the specified bits from the first operand and puts them +/// into the least significant bits of the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c BEXTR instruction. +/// +/// \param __X +/// An unsigned 64-bit integer whose bits are to be extracted. +/// \param __Y +/// An unsigned 64-bit integer used to specify which bits are +/// extracted. Bits [7:0] specify the index of the least +/// significant bit. Bits [15:8] specify the number of bits to +/// be extracted. +/// \returns An unsigned 64-bit integer whose least significant bits contain +/// the extracted bits. static __inline__ unsigned long long __DEFAULT_FN_ATTRS __bextr_u64(unsigned long long __X, unsigned long long __Y) { @@ -117,30 +249,93 @@ } /* Intel-specified, single-leading-underscore version of BEXTR */ +/// \brief Extracts the specified bits from the first operand and puts them +/// into the least significant bits of the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c BEXTR instruction. +/// +/// \param __X +/// An unsigned 64-bit integer whose bits are to be extracted. +/// \param __Y +/// An unsigned integer used to specify the index of the least +/// significant bit for the bits to be extracted. Bits [7:0] +/// specify the index. +/// \param __Z +/// An unsigned integer used to specify the number of bits to be +/// extracted. Bits [7:0] specify the number of bits. +/// \returns An unsigned 64-bit integer whose least significant bits contain +/// the extracted bits. 
static __inline__ unsigned long long __DEFAULT_FN_ATTRS _bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z) { return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8))); } +/// \brief Clears all bits in the source except for the least significant bit +/// containing a value of 1, and writes the result to the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c BLSI instruction. +/// +/// \param __X +/// An unsigned 64-bit integer whose bits are to be cleared. +/// \returns An unsigned 64-bit integer containing the result of clearing the +/// bits from the source operand. static __inline__ unsigned long long __DEFAULT_FN_ATTRS __blsi_u64(unsigned long long __X) { return __X & -__X; } +/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and +/// including the least significant bit that is set to 1 in the source +/// operand, and writes the result to the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c BLSMSK instruction. +/// +/// \param __X +/// An unsigned 64-bit integer used to create the mask. +/// \returns An unsigned 64-bit integer containing the newly created mask. static __inline__ unsigned long long __DEFAULT_FN_ATTRS __blsmsk_u64(unsigned long long __X) { return __X ^ (__X - 1); } +/// \brief Clears the least significant bit that is set to 1 in the source +/// operand, and writes the result to the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c BLSR instruction. +/// +/// \param __X +/// An unsigned 64-bit integer containing the operand to be +/// cleared. +/// \returns An unsigned 64-bit integer containing the result of clearing the +/// source operand. static __inline__ unsigned long long __DEFAULT_FN_ATTRS __blsr_u64(unsigned long long __X) { return __X & (__X - 1); } +/// \brief Counts the number of trailing zero bits in the operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c TZCNT instruction.
+/// +/// \param __X +/// An unsigned 64-bit integer whose trailing zeros are to be +/// counted. +/// \returns An unsigned 64-bit integer containing the number of trailing zero +/// bits in the operand. static __inline__ unsigned long long __RELAXED_FN_ATTRS __tzcnt_u64(unsigned long long __X) { Index: lib/Headers/emmintrin.h =================================================================== --- lib/Headers/emmintrin.h +++ lib/Headers/emmintrin.h @@ -44,6 +44,22 @@ /* Define the default attributes for the functions in this file. */ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2"))) +/// \brief Adds the 64-bit double-precision scalar values in the low-order bits +/// of the operands. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VADDSD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the +/// source operands. +/// \param __b +/// A 128-bit vector of [2 x double] containing one of the +/// source operands. +/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the +/// sum of the lower 64 bits of both operands. The upper 64 bits are copied +/// from the upper 64 bits of the first source operand. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a, __m128d __b) { @@ -51,12 +67,39 @@ return __a; } +/// \brief Adds 2 packed 128-bit vectors of [2 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VADDPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the +/// source operands. +/// \param __b +/// A 128-bit vector of [2 x double] containing one of the +/// source operands. +/// \returns A 128-bit vector of [2 x double] containing the sums of both operands. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a, __m128d __b) { return __a + __b; } +/// \brief Subtracts the 64-bit double-precision values in the low-order bits +/// of the operands.
+/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VSUBSD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing the minuend. +/// \param __b +/// A 128-bit vector of [2 x double] containing the subtrahend. +/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the +/// difference of the lower 64 bits of both operands. The upper 64 bits +/// are copied from the upper 64 bits of the first source operand. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a, __m128d __b) { @@ -64,12 +107,40 @@ return __a; } +/// \brief Subtracts 2 packed 128-bit vectors of [2 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VSUBPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing the minuend. +/// \param __b +/// A 128-bit vector of [2 x double] containing the subtrahend. +/// \returns A 128-bit vector of [2 x double] containing the differences +/// between both operands. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a, __m128d __b) { return __a - __b; } +/// \brief Multiplies the 64-bit double-precision values in the low-order bits +/// of the operands. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMULSD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the +/// source operands. +/// \param __b +/// A 128-bit vector of [2 x double] containing one of the +/// source operands. +/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the +/// product of the lower 64 bits of both operands. The upper 64 bits are +/// copied from the upper 64 bits of the first source operand. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a, __m128d __b) { @@ -77,12 +148,40 @@ return __a; } +/// \brief Multiplies 2 packed 128-bit vectors of [2 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMULPD instruction.
+/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the +/// operands. +/// \param __b +/// A 128-bit vector of [2 x double] containing one of the +/// operands. +/// \returns A 128-bit vector of [2 x double] containing the products between +/// both operands. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a, __m128d __b) { return __a * __b; } +/// \brief Divides the 64-bit double-precision values in the low-order bits of +/// the operands. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VDIVSD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing the dividend. +/// \param __b +/// A 128-bit vector of [2 x double] containing the divisor. +/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the +/// quotient of the lower 64 bits of both operands. The upper 64 bits are +/// copied from the upper 64 bits of the first source operand. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a, __m128d __b) { @@ -90,12 +189,45 @@ return __a; } +/// \brief Divides 2 packed 128-bit vectors of [2 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VDIVPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing the dividend. +/// \param __b +/// A 128-bit vector of [2 x double] containing the divisor. +/// \returns A 128-bit vector of [2 x double] containing the quotients between +/// both operands. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a, __m128d __b) { return __a / __b; } +/// \brief Calculates the square root of the 64-bit double-precision value in +/// the low-order bits of the second operand, copying the upper 64 bits of +/// the +/// first operand to bits [127:64] of the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VSQRTSD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the +/// operands.
The upper 64 bits of this operand are copied to +/// the upper 64 bits of the destination. +/// \param __b +/// A 128-bit vector of [2 x double] containing one of the +/// operands. The square root is calculated using the lower 64 +/// bits of this operand. +/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the +/// square root of the lower 64 bits of operand __b, +/// and whose upper 64 bits are copied from the upper 64 bits of operand +/// __a. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a, __m128d __b) { @@ -103,150 +235,499 @@ return (__m128d) { __c[0], __a[1] }; } +/// \brief Calculates the square roots of the values stored in a packed 128-bit +/// vector of [2 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VSQRTPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] values. +/// \returns A 128-bit vector of [2 x double] containing the square roots of +/// the values in the operand. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) { return __builtin_ia32_sqrtpd(__a); } +/// \brief Compares 2 64-bit double-precision values in the low-order bits of +/// both operands, and stores the lesser of the pair of values in the +/// lower 64 bits of the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMINSD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the +/// operands. The lower 64 bits of this operand are used in the +/// comparison. +/// \param __b +/// A 128-bit vector of [2 x double] containing one of the +/// operands. The lower 64 bits of this operand are used in the +/// comparison. +/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the +/// minimum value between both operands. The upper 64 bits are copied from +/// the upper 64 bits of the first source operand. 
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a, __m128d __b) { return __builtin_ia32_minsd(__a, __b); } +/// \brief Compares 2 packed 128-bit vectors of [2 x double] and stores the +/// lesser of each pair of values. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMINPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the +/// operands. +/// \param __b +/// A 128-bit vector of [2 x double] containing one of the +/// operands. +/// \returns A 128-bit vector of [2 x double] containing the minimum values +/// between both operands. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a, __m128d __b) { return __builtin_ia32_minpd(__a, __b); } +/// \brief Compares 2 64-bit double-precision values in the low-order bits of +/// both operands, and stores the greater of the pair of values in the +/// lower 64 bits of the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMAXSD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the +/// operands. The lower 64 bits of this operand are used in the +/// comparison. +/// \param __b +/// A 128-bit vector of [2 x double] containing one of the +/// operands. The lower 64 bits of this operand are used in the +/// comparison. +/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the +/// maximum value between both operands. The upper 64 bits are copied from +/// the upper 64 bits of the first source operand. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a, __m128d __b) { return __builtin_ia32_maxsd(__a, __b); } +/// \brief Compares 2 packed 128-bit vectors of [2 x double] and stores the +/// greater of each pair of values. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMAXPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the +/// operands. 
+/// \param __b +/// A 128-bit vector of [2 x double] containing one of the +/// operands. +/// \returns A 128-bit vector of [2 x double] containing the maximum values +/// between both operands. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a, __m128d __b) { return __builtin_ia32_maxpd(__a, __b); } +/// \brief Performs a bitwise AND of 2 packed 128-bit vectors of [2 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPAND instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the +/// source operands. +/// \param __b +/// A 128-bit vector of [2 x double] containing one of the +/// source operands. +/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the +/// values between both operands. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a, __m128d __b) { return (__m128d)((__v4si)__a & (__v4si)__b); } +/// \brief Performs a bitwise AND of 2 packed 128-bit vectors of [2 x double], +/// using the ones-complement of the values contained in the first +/// source operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPANDN instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing the left source +/// operand. The ones complement of this value is used in +/// the bitwise AND. +/// \param __b +/// A 128-bit vector of [2 x double] containing the right source +/// operand. +/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the +/// values in the second operand and the ones-complement of the +/// first operand. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a, __m128d __b) { return (__m128d)(~(__v4si)__a & (__v4si)__b); } +/// \brief Performs a bitwise OR of 2 packed 128-bit vectors of [2 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPOR instruction. 
+/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the +/// source operands. +/// \param __b +/// A 128-bit vector of [2 x double] containing one of the +/// source operands. +/// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the +/// values between both operands. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a, __m128d __b) { return (__m128d)((__v4si)__a | (__v4si)__b); } +/// \brief Performs a bitwise XOR of 2 packed 128-bit vectors of [2 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPXOR instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the +/// source operands. +/// \param __b +/// A 128-bit vector of [2 x double] containing one of the +/// source operands. +/// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the +/// values between both operands. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a, __m128d __b) { return (__m128d)((__v4si)__a ^ (__v4si)__b); } +/// \brief Compares each of the corresponding packed double-precision values of +/// the 128-bit vectors of [2 x double] for equality. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPEQPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] values. +/// \param __b +/// A 128-bit vector of [2 x double] values. +/// \returns A 128-bit vector of [2 x double] containing the comparison results. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmpeqpd(__a, __b); } +/// \brief Compares each of the corresponding packed double-precision values of +/// the 128-bit vectors of [2 x double] to determine if the values in the +/// first operand are less than those in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPLTPD instruction. 
+/// +/// \param __a +/// A 128-bit vector of [2 x double] values. +/// \param __b +/// A 128-bit vector of [2 x double] values. +/// \returns A 128-bit vector of [2 x double] containing the comparison results. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmpltpd(__a, __b); } +/// \brief Compares each of the corresponding packed double-precision values of +/// the 128-bit vectors of [2 x double] to determine if the values in the +/// first operand are less than or equal to those in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPLEPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] values. +/// \param __b +/// A 128-bit vector of [2 x double] values. +/// \returns A 128-bit vector of [2 x double] containing the comparison results. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmplepd(__a, __b); } +/// \brief Compares each of the corresponding packed double-precision values of +/// the 128-bit vectors of [2 x double] to determine if the values in the +/// first operand are greater than those in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPLTPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] values. +/// \param __b +/// A 128-bit vector of [2 x double] values. +/// \returns A 128-bit vector of [2 x double] containing the comparison results. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmpltpd(__b, __a); } +/// \brief Compares each of the corresponding packed double-precision values of +/// the 128-bit vectors of [2 x double] to determine if the values in the +/// first operand are greater than or equal to those in the second +/// operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPLEPD instruction. 
+/// +/// \param __a +/// A 128-bit vector of [2 x double] values. +/// \param __b +/// A 128-bit vector of [2 x double] values. +/// \returns A 128-bit vector of [2 x double] containing the comparison results. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmplepd(__b, __a); } +/// \brief Compares each of the corresponding packed double-precision values of +/// the 128-bit vectors of [2 x double] to determine if the values in the +/// first operand are ordered with respect to those in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPORDPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] values. +/// \param __b +/// A 128-bit vector of [2 x double] values. +/// \returns A 128-bit vector of [2 x double] containing the comparison results. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmpordpd(__a, __b); } +/// \brief Compares each of the corresponding packed double-precision values of +/// the 128-bit vectors of [2 x double] to determine if the values in the +/// first operand are unordered with respect to those in the second +/// operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPUNORDPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] values. +/// \param __b +/// A 128-bit vector of [2 x double] values. +/// \returns A 128-bit vector of [2 x double] containing the comparison results. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmpunordpd(__a, __b); } +/// \brief Compares each of the corresponding packed double-precision values of +/// the 128-bit vectors of [2 x double] to determine if the values in the +/// first operand are unequal to those in the second operand. 
+/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPNEQPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] values. +/// \param __b +/// A 128-bit vector of [2 x double] values. +/// \returns A 128-bit vector of [2 x double] containing the comparison results. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmpneqpd(__a, __b); } +/// \brief Compares each of the corresponding packed double-precision values of +/// the 128-bit vectors of [2 x double] to determine if the values in the +/// first operand are not less than those in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPNLTPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] values. +/// \param __b +/// A 128-bit vector of [2 x double] values. +/// \returns A 128-bit vector of [2 x double] containing the comparison results. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmpnltpd(__a, __b); } +/// \brief Compares each of the corresponding packed double-precision values of +/// the 128-bit vectors of [2 x double] to determine if the values in the +/// first operand are not less than or equal to those in the second +/// operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPNLEPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] values. +/// \param __b +/// A 128-bit vector of [2 x double] values. +/// \returns A 128-bit vector of [2 x double] containing the comparison results. 
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmpnlepd(__a, __b); } +/// \brief Compares each of the corresponding packed double-precision values of +/// the 128-bit vectors of [2 x double] to determine if the values in the +/// first operand are not greater than those in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPNLTPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] values. +/// \param __b +/// A 128-bit vector of [2 x double] values. +/// \returns A 128-bit vector of [2 x double] containing the comparison results. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmpnltpd(__b, __a); } +/// \brief Compares each of the corresponding packed double-precision values of +/// the 128-bit vectors of [2 x double] to determine if the values in the +/// first operand are not greater than or equal to those in the second +/// operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPNLEPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] values. +/// \param __b +/// A 128-bit vector of [2 x double] values. +/// \returns A 128-bit vector of [2 x double] containing the comparison results. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmpnlepd(__b, __a); } +/// \brief Compares the lower double-precision values of +/// the 128-bit vectors of [2 x double] for equality. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPEQSD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] values. +/// \param __b +/// A 128-bit vector of [2 x double] values. +/// \returns A 128-bit vector of [2 x double] containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmpeqsd(__a, __b); } +/// \brief Compares the lower double-precision values of +/// the 128-bit vectors of [2 x double] to determine if the value in the +/// first operand is less than that in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPLTSD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] values. +/// \param __b +/// A 128-bit vector of [2 x double] values. +/// \returns A 128-bit vector of [2 x double] containing the comparison results. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmpltsd(__a, __b); } +/// \brief Compares the lower double-precision values of +/// the 128-bit vectors of [2 x double] to determine if the value in the +/// first operand is less than or equal to that in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPLESD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] values. +/// \param __b +/// A 128-bit vector of [2 x double] values. +/// \returns A 128-bit vector of [2 x double] containing the comparison results. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmplesd(__a, __b); } +/// \brief Compares the lower double-precision values of +/// the 128-bit vectors of [2 x double] to determine if the value in the +/// first operand is greater than that in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPLTSD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] values. +/// \param __b +/// A 128-bit vector of [2 x double] values. +/// \returns A 128-bit vector of [2 x double] containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a, __m128d __b) { @@ -254,6 +735,20 @@ return (__m128d) { __c[0], __a[1] }; } +/// \brief Compares the lower double-precision values of the two 128-bit +/// vectors of [2 x double] to determine if the value in the first operand +/// is greater than or equal to the value in the second +/// operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPLESD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] values. +/// \param __b +/// A 128-bit vector of [2 x double] values. +/// \returns A 128-bit vector of [2 x double] containing the comparison results. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a, __m128d __b) { @@ -261,36 +756,116 @@ return (__m128d) { __c[0], __a[1] }; } +/// \brief Compares the lower double-precision values of the two 128-bit +/// vectors of [2 x double] to determine if the value in the first operand +/// is ordered with respect to the value in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPORDSD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] values. +/// \param __b +/// A 128-bit vector of [2 x double] values. +/// \returns A 128-bit vector of [2 x double] containing the comparison results. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmpordsd(__a, __b); } +/// \brief Compares the lower double-precision values of the two 128-bit +/// vectors of [2 x double] to determine if the value in the first operand +/// is unordered with respect to the value in the second +/// operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPUNORDSD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] values. +/// \param __b +/// A 128-bit vector of [2 x double] values.
+/// \returns A 128-bit vector of [2 x double] containing the comparison results. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmpunordsd(__a, __b); } +/// \brief Compares the lower double-precision values of the two 128-bit +/// vectors of [2 x double] to determine if the value in the first operand +/// is unequal to the value in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPNEQSD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] values. +/// \param __b +/// A 128-bit vector of [2 x double] values. +/// \returns A 128-bit vector of [2 x double] containing the comparison results. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmpneqsd(__a, __b); } +/// \brief Compares the lower double-precision values of the two 128-bit +/// vectors of [2 x double] to determine if the value in the first operand +/// is not less than the value in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPNLTSD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] values. +/// \param __b +/// A 128-bit vector of [2 x double] values. +/// \returns A 128-bit vector of [2 x double] containing the comparison results. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmpnltsd(__a, __b); } +/// \brief Compares the lower double-precision values of the two 128-bit +/// vectors of [2 x double] to determine if the value in the first operand +/// is not less than or equal to the value in the second +/// operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPNLESD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] values. +/// \param __b +/// A 128-bit vector of [2 x double] values.
+/// \returns A 128-bit vector of [2 x double] containing the comparison results. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a, __m128d __b) { return (__m128d)__builtin_ia32_cmpnlesd(__a, __b); } +/// \brief Compares the lower double-precision values of the two 128-bit +/// vectors of [2 x double] to determine if the value in the first operand +/// is not greater than the value in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPNLTSD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] values. +/// \param __b +/// A 128-bit vector of [2 x double] values. +/// \returns A 128-bit vector of [2 x double] containing the comparison results. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a, __m128d __b) { @@ -298,6 +873,20 @@ return (__m128d) { __c[0], __a[1] }; } +/// \brief Compares the lower double-precision values of the two 128-bit +/// vectors of [2 x double] to determine if the value in the first operand +/// is not greater than or equal to the value in the second +/// operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPNLESD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] values. +/// \param __b +/// A 128-bit vector of [2 x double] values. +/// \returns A 128-bit vector of [2 x double] containing the comparison results. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a, __m128d __b) { @@ -305,24 +894,79 @@ return (__m128d) { __c[0], __a[1] }; } +/// \brief Compares 2 64-bit double-precision values in the low-order bits of +/// both operands for equality, and stores the result of the comparison in +/// the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCOMISD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] values. +/// \param __b +/// A 128-bit vector of [2 x double] values.
+/// \returns An integer containing the comparison results. static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a, __m128d __b) { return __builtin_ia32_comisdeq(__a, __b); } +/// \brief Compares 2 64-bit double-precision values in the low-order bits of +/// both operands to determine if the first operand is less than the +/// second operand, and stores the result of the comparison in the +/// destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCOMISD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] values. +/// \param __b +/// A 128-bit vector of [2 x double] values. +/// \returns An integer containing the comparison results. static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a, __m128d __b) { return __builtin_ia32_comisdlt(__a, __b); } +/// \brief Compares 2 64-bit double-precision values in the low-order bits of +/// both operands to determine if the first operand is less than or equal +/// to the second operand, and stores the result of the comparison in the +/// destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCOMISD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] values. +/// \param __b +/// A 128-bit vector of [2 x double] values. +/// \returns An integer containing the comparison results. static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a, __m128d __b) { return __builtin_ia32_comisdle(__a, __b); } +/// \brief Compares 2 64-bit double-precision values in the low-order bits of +/// both operands to determine if the first operand is greater than the +/// second operand, and stores the result of the comparison in the +/// destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCOMISD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] values. +/// \param __b +/// A 128-bit vector of [2 x double] values. +/// \returns An integer containing the comparison results. 
static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a, __m128d __b) { @@ -335,30 +979,98 @@ return __builtin_ia32_comisdge(__a, __b); } +/// \brief Compares 2 64-bit double-precision values in the low-order bits of +/// both operands for inequality, and stores the result of the comparison +/// in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCOMISD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] values. +/// \param __b +/// A 128-bit vector of [2 x double] values. +/// \returns An integer containing the comparison results. static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a, __m128d __b) { return __builtin_ia32_comisdneq(__a, __b); } +/// \brief Performs an unordered comparison of 2 64-bit double-precision values +/// using the low-order bits of both operands to determine equality, and +/// stores the result of the comparison in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VUCOMISD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] values. +/// \param __b +/// A 128-bit vector of [2 x double] values. +/// \returns An integer containing the comparison results. static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a, __m128d __b) { return __builtin_ia32_ucomisdeq(__a, __b); } +/// \brief Performs an unordered comparison of 2 64-bit double-precision values +/// using the low-order bits of both operands to determine if the first +/// operand is less than the second operand, and stores the result of the +/// comparison in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VUCOMISD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] values. +/// \param __b +/// A 128-bit vector of [2 x double] values. +/// \returns An integer containing the comparison results. 
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a, __m128d __b) { return __builtin_ia32_ucomisdlt(__a, __b); } +/// \brief Performs an unordered comparison of 2 64-bit double-precision values +/// using the low-order bits of both operands to determine if the first +/// operand is less than or equal to the second operand, and stores the +/// result of the comparison in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VUCOMISD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] values. +/// \param __b +/// A 128-bit vector of [2 x double] values. +/// \returns An integer containing the comparison results. static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a, __m128d __b) { return __builtin_ia32_ucomisdle(__a, __b); } +/// \brief Performs an unordered comparison of 2 64-bit double-precision values +/// using the low-order bits of both operands to determine if the first +/// operand is greater than the second operand, and stores the result of +/// the comparison in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VUCOMISD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] values. +/// \param __b +/// A 128-bit vector of [2 x double] values. +/// \returns An integer containing the comparison results. static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a, __m128d __b) { @@ -371,42 +1083,124 @@ return __builtin_ia32_ucomisdge(__a, __b); } +/// \brief Performs an unordered comparison of 2 64-bit double-precision values +/// using the low-order bits of both operands to determine inequality, and +/// stores the result of the comparison in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VUCOMISD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] values. +/// \param __b +/// A 128-bit vector of [2 x double] values. 
+/// \returns An integer containing the comparison results. static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a, __m128d __b) { return __builtin_ia32_ucomisdneq(__a, __b); } +/// \brief Converts a 128-bit vector of [2 x double] into a 128-bit vector of +/// [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCVTPD2PS instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] values. +/// \returns A 128-bit vector of [4 x float] containing the converted values. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a) { return __builtin_ia32_cvtpd2ps(__a); } +/// \brief Converts a 128-bit vector of [4 x float] into a 128-bit vector of [2 +/// x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCVTPS2PD instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \returns A 128-bit vector of [2 x double] containing the converted values. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a) { return __builtin_ia32_cvtps2pd(__a); } +/// \brief Converts a vector of [4 x i32] into a vector of [2 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCVTDQ2PD instruction. +/// +/// \param __a +/// A 128-bit integer vector. +/// \returns A 128-bit vector of [2 x double] containing the converted values. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a) { return __builtin_ia32_cvtdq2pd((__v4si)__a); } +/// \brief Converts a 128-bit vector of [2 x double] into a 128-bit vector of +/// [4 x i32]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCVTPD2DQ instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \returns A 128-bit integer vector containing the converted values. 
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a) { return __builtin_ia32_cvtpd2dq(__a); } +/// \brief Converts a vector of [2 x double] into a 32-bit signed integer value, +/// using the lower 64 bits of the operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCVTSD2SI instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower 64 bits are used +/// in the conversion. +/// \returns A 32-bit signed integer containing the converted value. static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a) { return __builtin_ia32_cvtsd2si(__a); } +/// \brief Converts a vector of [2 x double] into a vector of [4 x float], +/// using the lower 64 bits of the operand. The result is written to the +/// lower +/// 32 bits of the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCVTSD2SS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The upper 96 bits of this +/// parameter are copied to the destination. +/// \param __b +/// A 128-bit vector of [2 x double] operand containing a double +/// value to be converted. The lower 64 bits of this operand are +/// used in the conversion. +/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the +/// converted value from the second operand. The upper 96 bits are copied +/// from the upper 96 bits of the first operand. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a, __m128d __b) { @@ -414,6 +1208,22 @@ return __a; } +/// \brief Converts a 32-bit signed integer value into a vector of [2 x double], +/// writing the result to the lower 64 bits of the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCVTSI2SD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The upper 64 bits of this +/// parameter are copied to the destination. +/// \param __b +/// A 32-bit signed integer operand containing the value to be +/// converted. 
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the +/// converted value from the second operand. The upper 64 bits are copied +/// from the upper 64 bits of the first operand. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a, int __b) { @@ -421,6 +1231,25 @@ return __a; } +/// \brief Converts the float value contained in the lower 32 bits of a +/// vector of [4 x float] into a double-precision value. +/// The result is written to the lower 64 bits of the +/// destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCVTSS2SD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The upper 64 bits are +/// copied to the destination. +/// \param __b +/// A 128-bit vector of [4 x float] operand containing a float +/// value to be converted. The lower 32 bits of this operand are +/// used in the conversion. +/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the +/// converted value from the second operand. The upper 64 bits are copied +/// from the upper 64 bits of the first operand. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a, __m128 __b) { @@ -428,48 +1257,133 @@ return __a; } +/// \brief Converts a 128-bit vector of [2 x double] into a 128-bit vector of +/// [4 x i32], truncating the result when it is inexact. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCVTTPD2DQ instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \returns A 128-bit vector of [4 x i32] containing the converted values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a) { return (__m128i)__builtin_ia32_cvttpd2dq(__a); } +/// \brief Converts a vector of [2 x double] into a 32-bit signed integer value, +/// using the lower 64 bits of the operand, truncating the result when it +/// is inexact.
+/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCVTTSD2SI instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower 64 bits are used +/// in the conversion. +/// \returns A 32-bit signed integer containing the converted value. static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) { return __a[0]; } +/// \brief Converts a 128-bit vector of [2 x double] into a 64-bit vector of [2 +/// x i32]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c CVTPD2PI instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \returns A 64-bit vector of [2 x i32] containing the converted values. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtpd_pi32(__m128d __a) { return (__m64)__builtin_ia32_cvtpd2pi(__a); } +/// \brief Converts a 128-bit vector of [2 x double] into a 64-bit vector of [2 +/// x i32], truncating the result when it is inexact. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c CVTTPD2PI instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \returns A 64-bit vector of [2 x i32] containing the converted values. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvttpd_pi32(__m128d __a) { return (__m64)__builtin_ia32_cvttpd2pi(__a); } +/// \brief Converts a 64-bit vector of [2 x i32] into a 128-bit vector of [2 x +/// double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c CVTPI2PD instruction. +/// +/// \param __a +/// A 64-bit vector of [2 x i32]. +/// \returns A 128-bit vector of [2 x double] containing the converted values. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtpi32_pd(__m64 __a) { return __builtin_ia32_cvtpi2pd((__v2si)__a); } +/// \brief Extracts a double-precision value from a vector of [2 x double] into +/// a double-precision value, using the lower 64 bits of the operand.
+/// +/// \headerfile +/// +/// \param __a +/// A 128-bit vector of [2 x double] operand containing a double +/// value to be extracted. The lower 64 bits of this operand are +/// used in the extraction. +/// \returns A double-precision value extracted from the lower 64 bits of the +/// operand. static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a) { return __a[0]; } +/// \brief Moves packed double-precision values from an aligned memory location +/// to 64-bit elements in a 128-bit vector of [2 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVAPD instruction. +/// +/// \param __dp +/// A 16-byte aligned pointer to a memory location containing +/// double-precision values. +/// \returns A 128-bit vector of [2 x double] containing the moved values. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp) { return *(__m128d*)__dp; } +/// \brief Moves and duplicates one double-precision value to double-precision +/// values stored in a packed 128-bit vector of [2 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVDDUP instruction. +/// +/// \param __dp +/// A pointer to a double-precision value to be moved and duplicated. +/// \returns A 128-bit vector of [2 x double] containing the moved and +/// duplicated values. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp) { @@ -482,6 +1396,18 @@ #define _mm_load_pd1(dp) _mm_load1_pd(dp) +/// \brief Loads two double-precision values in reverse order into a packed +/// 128-bit vector of [2 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVAPD+shuffling instruction. +/// +/// \param __dp +/// An array of double-precision values to be loaded in reverse +/// order. +/// \returns A 128-bit vector of [2 x double] containing the reversed loaded +/// values.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp) { @@ -489,6 +1415,17 @@ return __builtin_shufflevector(__u, __u, 1, 0); } +/// \brief Moves packed double-precision values from an unaligned memory +/// location to 64-bit elements in a 128-bit vector of [2 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVUPD instruction. +/// +/// \param __dp +/// A pointer to a memory location containing double-precision +/// values. +/// \returns A 128-bit vector of [2 x double] containing the moved values. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp) { @@ -498,6 +1435,18 @@ return ((struct __loadu_pd*)__dp)->__v; } +/// \brief Moves a packed double-precision value to the lower 64 bits of a +/// 128-bit vector of [2 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c MOVSD instruction. +/// +/// \param __dp +/// A pointer to a memory location containing a double-precision +/// value. +/// \returns A 128-bit vector of [2 x double] containing the moved value in the +/// lower 64 bits, with a value of 0 assigned to the upper 64 bits. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp) { @@ -508,6 +1457,21 @@ return (__m128d){ __u, 0 }; } +/// \brief Loads a double-precision value into the high-order bits of a 128-bit +/// vector of [2 x double]. The low-order bits are copied from the +/// low-order bits of the first operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVHPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// Bits [63:0] are written to bits [63:0] of the destination. +/// \param __dp +/// A pointer to a double-precision value. +/// Bits [63:0] are written to bits [127:64] of the destination. +/// \returns A 128-bit vector of [2 x double] containing the interleaved values. 
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a, double const *__dp) { @@ -518,6 +1482,21 @@ return (__m128d){ __a[0], __u }; } +/// \brief Loads a double-precision value into the low-order bits of a 128-bit +/// vector of [2 x double]. The high-order bits are copied from the +/// high-order bits of the first operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVLPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// Bits [127:64] are written to bits [127:64] of the destination. +/// \param __dp +/// A pointer to a double-precision value. +/// Bits [63:0] are written to bits [63:0] of the destination. +/// \returns A 128-bit vector of [2 x double] containing the moved values. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a, double const *__dp) { @@ -534,42 +1513,124 @@ return (__m128d)__builtin_ia32_undef128(); } +/// \brief Initializes a 128-bit vector of [2 x double] with the specified +/// 64-bit double-precision value. +/// +/// \headerfile +/// +/// \param __w +/// A double-precision value used to initialize the lower 64 +/// bits of the destination vector of [2 x double]. The upper +/// bits of the destination are set to zero. +/// \returns An initialized 128-bit vector of [2 x double] containing the value +/// provided in the operand. The upper bits of the destination are set to +/// zero. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w) { return (__m128d){ __w, 0 }; } +/// \brief Initializes both double-precision values in a 128-bit vector of [2 x +/// double] with the specified 64-bit double-precision value. +/// +/// \headerfile +/// +/// \param __w +/// A double-precision value used to initialize both 64-bit +/// double-precision values of the destination vector of [2 x +/// double]. +/// \returns An initialized 128-bit vector of [2 x double] containing the value +/// provided in the operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w) { return (__m128d){ __w, __w }; } +/// \brief Initializes both double-precision values in a 128-bit vector of [2 x +/// double] with the specified 64-bit double-precision values. +/// +/// \headerfile +/// +/// \param __w +/// A double-precision value used to initialize the upper 64 +/// bits of the destination vector of [2 x double]. +/// \param __x +/// A double-precision value used to initialize the lower 64 +/// bits of the destination vector of [2 x double]. +/// \returns An initialized 128-bit vector of [2 x double] containing the +/// values provided in the operand. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w, double __x) { return (__m128d){ __x, __w }; } +/// \brief Initializes both double-precision values in a 128-bit vector of [2 x +/// double] with the specified 64-bit double-precision values. +/// +/// \headerfile +/// +/// \param __w +/// A double-precision value used to initialize the lower 64 +/// bits of the destination vector of [2 x double]. +/// \param __x +/// A double-precision value used to initialize the upper 64 +/// bits of the destination vector of [2 x double]. +/// \returns An initialized 128-bit vector of [2 x double] containing the +/// values provided in the operand. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w, double __x) { return (__m128d){ __w, __x }; } +/// \brief Sets the 64-bit double-precision registers to zero. +/// +/// \headerfile +/// +/// \returns An initialized 128-bit vector of [2 x double] with all elements set +/// to zero. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void) { return (__m128d){ 0, 0 }; } +/// \brief Moves two double-precision values into a packed 128-bit vector of [2 +/// x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVSD instruction. +/// +/// \param __a +/// 128-bit vector of [2 x double]. 
The upper 64 bits of this +/// operand are copied to the upper 64 bits of the destination. +/// \param __b +/// 128-bit vector of [2 x double]. The lower 64 bits of this +/// operand are copied to the lower 64 bits of the destination. +/// \returns A 128-bit vector of [2 x double] containing the moved values. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a, __m128d __b) { return (__m128d){ __b[0], __a[1] }; } +/// \brief Moves the lower 64 bits of a 128-bit vector of [2 x double] to a +/// memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVSD instruction. +/// +/// \param __dp +/// A pointer to a memory location that will receive the +/// double-precision value. +/// \param __a +/// A 128-bit vector of [2 x double] whose lower 64 bits are +/// moved. static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp, __m128d __a) { @@ -579,6 +1640,19 @@ ((struct __mm_store_sd_struct*)__dp)->__u = __a[0]; } +/// \brief Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to +/// the upper and lower 64 bits of a memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVSD instruction. +/// +/// \param __dp +/// A pointer to a memory location that can store 2 +/// double-precision values. +/// \param __a +/// A 128-bit vector of [2 x double] whose lower 64 bits are +/// copied to each of the values in __dp. static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp, __m128d __a) { @@ -589,18 +1663,57 @@ ((struct __mm_store1_pd_struct*)__dp)->__u[1] = __a[0]; } +/// \brief Moves packed double-precision values from a 128-bit vector of [2 x +/// double] to a memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVAPD instruction. +/// +/// \param __dp +/// A pointer to an aligned memory location that can store 2 +/// double-precision values.
+/// \param __a +/// A packed 128-bit vector of [2 x double] containing the +/// values to be moved. static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp, __m128d __a) { *(__m128d *)__dp = __a; } +/// \brief Moves packed double-precision values from a 128-bit vector of [2 x +/// double] to an unaligned memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVUPD instruction. +/// +/// \param __dp +/// A pointer to an unaligned memory location that can store 2 +/// double-precision values. +/// \param __a +/// A packed 128-bit vector of [2 x double] containing the +/// values to be moved. static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, __m128d __a) { __builtin_ia32_storeupd(__dp, __a); } +/// \brief Moves packed double-precision values, in reverse order, from a +/// 128-bit vector of [2 x double] to a memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVAPD + shuffling instruction. +/// +/// \param __dp +/// A pointer to an aligned memory location that can store 2 +/// double-precision values in reverse order. +/// \param __a +/// A packed 128-bit vector of [2 x double] containing the +/// values to be moved. static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp, __m128d __a) { @@ -608,6 +1721,19 @@ *(__m128d *)__dp = __a; } +/// \brief Moves a packed double-precision value from the upper 64 bits of a +/// 128-bit vector of [2 x double] to a memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVHPD instruction. +/// +/// \param __dp +/// A pointer to a memory location that will receive the +/// double-precision value. +/// \param __a +/// A packed 128-bit vector of [2 x double] containing the value +/// to be moved. 
static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp, __m128d __a) { @@ -617,6 +1743,19 @@ ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1]; } +/// \brief Moves a packed double-precision value from the lower 64 bits of a +/// 128-bit vector of [2 x double] to a memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVLPD instruction. +/// +/// \param __dp +/// A pointer to a memory location that will receive the +/// double-precision value. +/// \param __a +/// A packed 128-bit vector of [2 x double] containing the value +/// to be moved. static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp, __m128d __a) { @@ -626,216 +1765,747 @@ ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0]; } +/// \brief Adds packed 8-bit integer values and writes the sums to the +/// corresponding bits in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPADDB instruction. +/// +/// \param __a +/// A 128-bit integer vector containing one of the source +/// operands. +/// \param __b +/// A 128-bit integer vector containing one of the source +/// operands. +/// \returns A 128-bit integer vector containing the sums of both operands. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a, __m128i __b) { return (__m128i)((__v16qi)__a + (__v16qi)__b); } +/// \brief Adds packed 16-bit integer values and writes the sums to the +/// corresponding bits in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPADDW instruction. +/// +/// \param __a +/// A 128-bit integer vector containing one of the source +/// operands. +/// \param __b +/// A 128-bit integer vector containing one of the source +/// operands. +/// \returns A 128-bit integer vector containing the sums of both operands. 
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a, __m128i __b) { return (__m128i)((__v8hi)__a + (__v8hi)__b); } +/// \brief Adds packed 32-bit integer values and writes the sums to the +/// corresponding bits in the destination. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPADDD instruction. +/// +/// \param __a +/// A 128-bit integer vector containing one of the source +/// operands. +/// \param __b +/// A 128-bit integer vector containing one of the source +/// operands. +/// \returns A 128-bit integer vector containing the sums of both operands. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a, __m128i __b) { return (__m128i)((__v4si)__a + (__v4si)__b); } +/// \brief Adds signed or unsigned 64-bit integer values and writes the sum to +/// the corresponding bits in the destination. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c PADDQ instruction. +/// +/// \param __a +/// A 64-bit integer containing one of the source operands. +/// \param __b +/// A 64-bit integer containing one of the source operands. +/// \returns A 64-bit integer containing the sum of both operands. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_add_si64(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_paddq(__a, __b); } +/// \brief Adds packed 64-bit integer values and writes the sums to the +/// corresponding bits in the destination. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPADDQ instruction. +/// +/// \param __a +/// A 128-bit integer vector containing one of the source +/// operands. +/// \param __b +/// A 128-bit integer vector containing one of the source +/// operands. +/// \returns A 128-bit integer vector containing the sums of both operands. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a, __m128i __b) { return __a + __b; } +/// \brief Adds packed 8-bit integer values and writes the sums to the +/// corresponding bits in the destination.
Positive sums greater than +/// 7Fh are saturated to 7Fh. Negative sums less than 80h are saturated to +/// 80h. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPADDSB instruction. +/// +/// \param __a +/// A 128-bit integer vector containing one of the source +/// operands. +/// \param __b +/// A 128-bit integer vector containing one of the source +/// operands. +/// \returns A 128-bit integer vector containing the sums of both operands. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b); } +/// \brief Adds packed 16-bit integer values and writes the sums to the +/// corresponding bits in the destination. Positive sums greater than +/// 7FFFh are saturated to 7FFFh. Negative sums less than 8000h are saturated +/// to 8000h. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPADDSW instruction. +/// +/// \param __a +/// A 128-bit integer vector containing one of the source +/// operands. +/// \param __b +/// A 128-bit integer vector containing one of the source +/// operands. +/// \returns A 128-bit integer vector containing the sums of both operands. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b); } +/// \brief Adds packed 8-bit integer values and writes the unsigned integer +/// sums to the corresponding bits in the destination. Sums greater than +/// FFh are saturated to FFh. Negative sums less than 00h are saturated to +/// 00h. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPADDUSB instruction. +/// +/// \param __a +/// A 128-bit integer vector containing one of the source +/// operands. +/// \param __b +/// A 128-bit integer vector containing one of the source +/// operands. +/// \returns A 128-bit integer vector containing the unsigned integer sums of +/// both operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b); } +/// \brief Adds packed 16-bit integer values and writes the unsigned integer +/// sums to the corresponding bits in the destination. Positive sums greater +/// than FFFFh are saturated to +/// FFFFh. Negative sums less than +/// 0000h are saturated to 0000h. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPADDUSW instruction. +/// +/// \param __a +/// A 128-bit integer vector containing one of the source +/// operands. +/// \param __b +/// A 128-bit integer vector containing one of the source +/// operands. +/// \returns A 128-bit integer vector containing the unsigned integer sums of +/// both operands. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b); } +/// \brief Computes the rounded averages of the packed unsigned 8-bit integer +/// values and writes the averages to the corresponding bits in the +/// destination. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPAVGB instruction. +/// +/// \param __a +/// A 128-bit integer vector containing one of the source +/// operands. +/// \param __b +/// A 128-bit integer vector containing one of the source +/// operands. +/// \returns A 128-bit integer vector containing the averages of both operands. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b); } +/// \brief Computes the rounded averages of the packed unsigned 16-bit integer +/// values and writes the averages to the corresponding bits in the +/// destination. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPAVGW instruction. +/// +/// \param __a +/// A 128-bit integer vector containing one of the source +/// operands.
+/// \param __b +/// A 128-bit integer vector containing one of the source +/// operands. +/// \returns A 128-bit integer vector containing the averages of both operands. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b); } +/// \brief Multiplies corresponding pairs of packed 16-bit signed integer values, +/// adds pairs of contiguous products, and writes the 32-bit sums to the +/// corresponding bits in the destination. For example, bits [15:0] of +/// both operands are multiplied, bits [31:16] of both operands are +/// multiplied, and the sum of both results is written to bits [31:0] of +/// the destination. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPMADDWD instruction. +/// +/// \param __a +/// A 128-bit integer vector containing one of the source +/// operands. +/// \param __b +/// A 128-bit integer vector containing one of the source +/// operands. +/// \returns A 128-bit integer vector containing the sums of products of both +/// operands. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b); } +/// \brief Compares each of the corresponding packed 16-bit signed integer +/// values of the 128-bit integer vectors, and writes the greater value to +/// the corresponding bits in the destination. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPMAXSW instruction. +/// +/// \param __a +/// A 128-bit integer vector containing one of the source +/// operands. +/// \param __b +/// A 128-bit integer vector containing one of the source +/// operands. +/// \returns A 128-bit integer vector containing the comparison results.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b); } +/// \brief Compares each of the corresponding packed 8-bit unsigned integer +/// values of the 128-bit integer vectors, and writes the greater value to +/// the corresponding bits in the destination. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPMAXUB instruction. +/// +/// \param __a +/// A 128-bit integer vector containing one of the source +/// operands. +/// \param __b +/// A 128-bit integer vector containing one of the source +/// operands. +/// \returns A 128-bit integer vector containing the comparison results. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b); } +/// \brief Compares each of the corresponding packed 16-bit signed integer +/// values of the 128-bit integer vectors, and writes the lesser value to +/// the corresponding bits in the destination. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPMINSW instruction. +/// +/// \param __a +/// A 128-bit integer vector containing one of the source +/// operands. +/// \param __b +/// A 128-bit integer vector containing one of the source +/// operands. +/// \returns A 128-bit integer vector containing the comparison results. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b); } +/// \brief Compares each of the corresponding packed 8-bit unsigned integer +/// values of the 128-bit integer vectors, and writes the lesser value to +/// the corresponding bits in the destination. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPMINUB instruction. +/// +/// \param __a +/// A 128-bit integer vector containing one of the source +/// operands.
+/// \param __b +/// A 128-bit integer vector containing one of the source +/// operands. +/// \returns A 128-bit integer vector containing the comparison results. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b); } +/// \brief Multiplies packed 16-bit signed integer values and writes the +/// high-order 16 bits of each 32-bit product to the corresponding bits in +/// the destination. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPMULHW instruction. +/// +/// \param __a +/// A 128-bit integer vector containing one of the source +/// operands. +/// \param __b +/// A 128-bit integer vector containing one of the source +/// operands. +/// \returns A 128-bit integer vector containing the products of both operands. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b); } +/// \brief Multiplies packed 16-bit unsigned integer values and writes the +/// high-order 16 bits of each 32-bit product to the corresponding bits in +/// the destination. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPMULHUW instruction. +/// +/// \param __a +/// A 128-bit integer vector containing one of the source +/// operands. +/// \param __b +/// A 128-bit integer vector containing one of the source +/// operands. +/// \returns A 128-bit integer vector containing the products of both operands. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b); } +/// \brief Multiplies packed 16-bit integer values and writes the low-order 16 +/// bits of each 32-bit product to the corresponding bits in the +/// destination. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPMULLW instruction.
+/// +/// \param __a +/// A 128-bit integer vector containing one of the source +/// operands. +/// \param __b +/// A 128-bit integer vector containing one of the source +/// operands. +/// \returns A 128-bit integer vector containing the products of both operands. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a, __m128i __b) { return (__m128i)((__v8hi)__a * (__v8hi)__b); } +/// \brief Multiplies 32-bit unsigned integer values contained in the lower +/// bits of the two 64-bit integer vectors, and writes the 64-bit unsigned +/// product to the destination. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c PMULUDQ instruction. +/// +/// \param __a +/// A 64-bit integer containing one of the source operands. +/// \param __b +/// A 64-bit integer containing one of the source operands. +/// \returns A 64-bit integer vector containing the product of both operands. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mul_su32(__m64 __a, __m64 __b) { return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b); } +/// \brief Multiplies the even-indexed packed 32-bit unsigned integer values +/// contained in the two 128-bit integer vectors and writes the 64-bit +/// unsigned products to the destination. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPMULUDQ instruction. +/// +/// \param __a +/// A 128-bit integer vector containing one of the source +/// operands. +/// \param __b +/// A 128-bit integer vector containing one of the source +/// operands. +/// \returns A 128-bit integer vector containing the products of both operands. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a, __m128i __b) { return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b); } +/// \brief Computes the absolute differences of the corresponding packed +/// 8-bit unsigned integer values of both operands.
+/// Then sums of the absolute differences for the lower 8 source bytes and +/// the upper 8 source bytes are computed, and written to bits [15:0] and +/// [79:64] of the destination, respectively. The remaining bits in the +/// destination are cleared. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPSADBW instruction. +/// +/// \param __a +/// A 128-bit integer vector containing one of the source +/// operands. +/// \param __b +/// A 128-bit integer vector containing one of the source +/// operands. +/// \returns A 128-bit integer vector containing the sums of the sets of +/// absolute differences between both operands. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a, __m128i __b) { return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b); } +/// \brief Subtracts the 8-bit integer values in the operands. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPSUBB instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the minuends. +/// \param __b +/// A 128-bit integer vector containing the subtrahends. +/// \returns A 128-bit integer vector containing the differences of the values +/// in the operands. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a, __m128i __b) { return (__m128i)((__v16qi)__a - (__v16qi)__b); } +/// \brief Subtracts the 16-bit integer values in the operands. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPSUBW instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the minuends. +/// \param __b +/// A 128-bit integer vector containing the subtrahends. +/// \returns A 128-bit integer vector containing the differences of the values +/// in the operands. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a, __m128i __b) { return (__m128i)((__v8hi)__a - (__v8hi)__b); } +/// \brief Subtracts the 32-bit integer values in the operands.
+/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPSUBD instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the minuends. +/// \param __b +/// A 128-bit integer vector containing the subtrahends. +/// \returns A 128-bit integer vector containing the differences of the values +/// in the operands. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a, __m128i __b) { return (__m128i)((__v4si)__a - (__v4si)__b); } +/// \brief Subtracts signed or unsigned 64-bit integer values and writes the +/// difference to the corresponding bits in the destination. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c PSUBQ instruction. +/// +/// \param __a +/// A 64-bit integer vector containing the minuend. +/// \param __b +/// A 64-bit integer vector containing the subtrahend. +/// \returns A 64-bit integer vector containing the difference of the values in +/// the operands. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sub_si64(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_psubq(__a, __b); } +/// \brief Subtracts the 64-bit integer values in the operands. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPSUBQ instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the minuends. +/// \param __b +/// A 128-bit integer vector containing the subtrahends. +/// \returns A 128-bit integer vector containing the differences of the values +/// in the operands. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a, __m128i __b) { return __a - __b; } +/// \brief Subtracts packed 8-bit integer values and writes the differences to +/// the corresponding bits in the destination. Values greater than the +/// largest signed 8-bit integer are saturated to 7Fh, and values less than +/// the smallest +/// signed 8-bit integer are saturated to 80h. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPSUBSB instruction.
+/// +/// \param __a +/// A 128-bit integer vector containing the minuends. +/// \param __b +/// A 128-bit integer vector containing the subtrahends. +/// \returns A 128-bit integer vector containing the differences of the values +/// in the operands. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b); } +/// \brief Subtracts packed 16-bit integer values and writes the differences to +/// the corresponding bits in the destination. Values greater than 7FFFh are +/// saturated to 7FFFh, and values less than 8000h are saturated to 8000h. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPSUBSW instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the minuends. +/// \param __b +/// A 128-bit integer vector containing the subtrahends. +/// \returns A 128-bit integer vector containing the differences of the values +/// in the operands. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b); } +/// \brief Subtracts packed 8-bit integer values and writes the unsigned +/// integer differences to the corresponding bits in the destination. Values +/// less +/// than 00h are saturated to 00h. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPSUBUSB instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the minuends. +/// \param __b +/// A 128-bit integer vector containing the subtrahends. +/// \returns A 128-bit integer vector containing the unsigned integer +/// differences of the values in the operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b); } +/// \brief Subtracts packed 16-bit integer values and writes the unsigned +/// integer differences to the corresponding bits in the destination. Values +/// greater than FFFFh are saturated +/// to FFFFh, and values less than 0000h are saturated to 0000h. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPSUBUSW instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the minuends. +/// \param __b +/// A 128-bit integer vector containing the subtrahends. +/// \returns A 128-bit integer vector containing the unsigned integer +/// differences of the values in the operands. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b); } +/// \brief Performs a bitwise AND of 2 packed 128-bit integer vectors. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPAND instruction. +/// +/// \param __a +/// A 128-bit integer vector containing one of the source +/// operands. +/// \param __b +/// A 128-bit integer vector containing one of the source +/// operands. +/// \returns A 128-bit integer vector containing the bitwise AND of the values +/// between both operands. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a, __m128i __b) { return __a & __b; } +/// \brief Performs a bitwise AND of 2 packed 128-bit integer vectors, using +/// the ones-complement of the values contained in the first source +/// operand. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPANDN instruction. +/// +/// \param __a +/// A 128-bit vector containing the left source operand. The ones +/// complement of this value is used in the bitwise AND. +/// \param __b +/// A 128-bit vector containing the right source operand.
+/// \returns A 128-bit integer vector containing the bitwise AND of the +/// ones-complement of the first operand and the values in the second operand. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a, __m128i __b) { return ~__a & __b; } +/// \brief Performs a bitwise OR of 2 packed 128-bit integer vectors. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPOR instruction. +/// +/// \param __a +/// A 128-bit integer vector containing one of the source +/// operands. +/// \param __b +/// A 128-bit integer vector containing one of the source +/// operands. +/// \returns A 128-bit integer vector containing the bitwise OR of the values +/// between both operands. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a, __m128i __b) { return __a | __b; } +/// \brief Performs a bitwise exclusive OR of 2 packed 128-bit integer vectors. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPXOR instruction. +/// +/// \param __a +/// A 128-bit integer vector containing one of the source +/// operands. +/// \param __b +/// A 128-bit integer vector containing one of the source +/// operands. +/// \returns A 128-bit integer vector containing the bitwise exclusive OR of +/// the values between both operands. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a, __m128i __b) { return __a ^ __b; } +/// \brief Left-shifts the 128-bit integer vector operand by the specified +/// number of bytes. Low-order bits are cleared. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m128i _mm_slli_si128(__m128i a, const int imm); +/// \endcode +/// +/// This intrinsic corresponds to \c VPSLLDQ instruction. +/// +/// \param a +/// A 128-bit integer vector containing the source operand. +/// \param imm +/// An immediate value specifying the number of bytes to +/// left-shift operand a. +/// \returns A 128-bit integer vector containing the left-shifted value.
#define _mm_slli_si128(a, imm) __extension__ ({ \ (__m128i)__builtin_shufflevector((__v16qi)_mm_setzero_si128(), \ (__v16qi)(__m128i)(a), \ @@ -859,66 +2529,217 @@ #define _mm_bslli_si128(a, imm) \ _mm_slli_si128((a), (imm)) +/// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand +/// by the specified number of bits. Low-order bits are cleared. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPSLLW instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the source operand. +/// \param __count +/// An integer value specifying the number of bits to left-shift +/// each value in operand __a. +/// \returns A 128-bit integer vector containing the left-shifted value. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a, int __count) { return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count); } +/// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand +/// by the specified number of bits. Low-order bits are cleared. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPSLLW instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the source operand. +/// \param __count +/// A 128-bit integer vector in which bits [63:0] specify the +/// number of bits to left-shift each value in operand __a. +/// \returns A 128-bit integer vector containing the left-shifted value. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a, __m128i __count) { return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count); } +/// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand +/// by the specified number of bits. Low-order bits are cleared. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPSLLD instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the source operand.
+/// \param __count +/// An integer value specifying the number of bits to left-shift +/// each value in operand __a. +/// \returns A 128-bit integer vector containing the left-shifted value. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a, int __count) { return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count); } +/// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand +/// by the specified number of bits. Low-order bits are cleared. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPSLLD instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the source operand. +/// \param __count +/// A 128-bit integer vector in which bits [63:0] specify the +/// number of bits to left-shift each value in operand __a. +/// \returns A 128-bit integer vector containing the left-shifted value. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a, __m128i __count) { return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count); } +/// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand +/// by the specified number of bits. Low-order bits are cleared. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPSLLQ instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the source operand. +/// \param __count +/// An integer value specifying the number of bits to left-shift +/// each value in operand __a. +/// \returns A 128-bit integer vector containing the left-shifted value. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a, int __count) { return __builtin_ia32_psllqi128(__a, __count); } +/// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand +/// by the specified number of bits. Low-order bits are cleared. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPSLLQ instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the source operand.
+/// \param __count +/// A 128-bit integer vector in which bits [63:0] specify the +/// number of bits to left-shift each value in operand __a. +/// \returns A 128-bit integer vector containing the left-shifted value. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a, __m128i __count) { return __builtin_ia32_psllq128(__a, __count); } +/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand +/// by the specified number of bits. High-order bits are filled with the +/// sign bit of the initial value. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPSRAW instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the source operand. +/// \param __count +/// An integer value specifying the number of bits to +/// right-shift each value in operand __a. +/// \returns A 128-bit integer vector containing the right-shifted value. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a, int __count) { return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count); } +/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand +/// by the specified number of bits. High-order bits are filled with the +/// sign bit of the initial value. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPSRAW instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the source operand. +/// \param __count +/// A 128-bit integer vector in which bits [63:0] specify the +/// number of bits to right-shift each value in operand __a. +/// \returns A 128-bit integer vector containing the right-shifted value. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a, __m128i __count) { return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count); } +/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand +/// by the specified number of bits. High-order bits are filled with the +/// sign bit of the initial value.
+/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPSRAD instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the source operand. +/// \param __count +/// An integer value specifying the number of bits to +/// right-shift each value in operand __a. +/// \returns A 128-bit integer vector containing the right-shifted value. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a, int __count) { return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count); } +/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand +/// by the specified number of bits. High-order bits are filled with the +/// sign bit of the initial value. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to \c VPSRAD instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the source operand. +/// \param __count +/// A 128-bit integer vector in which bits [63:0] specify the +/// number of bits to right-shift each value in operand __a. +/// \returns A 128-bit integer vector containing the right-shifted value. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a, __m128i __count) { return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count); } +/// \brief Right-shifts the 128-bit integer vector operand by the specified +/// number of bytes. High-order bits are cleared. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m128i _mm_srli_si128(__m128i a, const int imm); +/// \endcode +/// +/// This intrinsic corresponds to \c VPSRLDQ instruction. +/// +/// \param a +/// A 128-bit integer vector containing the source operand. +/// \param imm +/// An immediate value specifying the number of bytes to +/// right-shift operand a. +/// \returns A 128-bit integer vector containing the right-shifted value.
#define _mm_srli_si128(a, imm) __extension__ ({ \ (__m128i)__builtin_shufflevector((__v16qi)(__m128i)(a), \ (__v16qi)_mm_setzero_si128(), \ @@ -942,60 +2763,188 @@ #define _mm_bsrli_si128(a, imm) \ _mm_srli_si128((a), (imm)) +/// \brief Right-shifts each packed 16-bit value in the 128-bit integer vector +/// operand by the specified number of bits. High-order bits are cleared. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPSRLW instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the source operand. +/// \param __count +/// An integer value specifying the number of bits to +/// right-shift each value in operand __a. +/// \returns A 128-bit integer vector containing the right-shifted value. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a, int __count) { return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count); } +/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand +/// by the specified number of bits. High-order bits are cleared. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPSRLW instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the source operand. +/// \param __count +/// A 128-bit integer vector in which bits[63:0] specify the +/// number of bits to right-shift each value in operand __a. +/// \returns A 128-bit integer vector containing the right-shifted value. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a, __m128i __count) { return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count); } +/// \brief Right-shifts each packed 32-bit value in the 128-bit integer vector +/// operand by the specified number of bits. High-order bits are cleared. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPSRLD instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the source operand. 
+/// \param __count +/// An integer value specifying the number of bits to +/// right-shift each value in operand __a. +/// \returns A 128-bit integer vector containing the right-shifted value. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a, int __count) { return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count); } +/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand +/// by the specified number of bits. High-order bits are cleared. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPSRLD instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the source operand. +/// \param __count +/// A 128-bit integer vector in which bits[63:0] specify the +/// number of bits to right-shift each value in operand __a. +/// \returns A 128-bit integer vector containing the right-shifted value. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a, __m128i __count) { return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count); } +/// \brief Right-shifts each packed 64-bit value in the 128-bit integer vector +/// operand by the specified number of bits. High-order bits are cleared. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPSRLQ instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the source operand. +/// \param __count +/// An integer value specifying the number of bits to +/// right-shift each value in operand __a. +/// \returns A 128-bit integer vector containing the right-shifted value. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a, int __count) { return __builtin_ia32_psrlqi128(__a, __count); } +/// \brief Right-shifts each 64-bit value in the 128-bit integer vector operand +/// by the specified number of bits. High-order bits are cleared. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPSRLQ instruction. 
+/// +/// \param __a +/// A 128-bit integer vector containing the source operand. +/// \param __count +/// A 128-bit integer vector in which bits[63:0] specify the +/// number of bits to right-shift each value in operand __a. +/// \returns A 128-bit integer vector containing the right-shifted value. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a, __m128i __count) { return __builtin_ia32_psrlq128(__a, __count); } +/// \brief Compares each of the corresponding packed 8-bit values of the +/// 128-bit integer vectors for equality. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPCMPEQB instruction. +/// +/// \param __a +/// A 128-bit integer vector. +/// \param __b +/// A 128-bit integer vector. +/// \returns A 128-bit integer vector containing the comparison results. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a, __m128i __b) { return (__m128i)((__v16qi)__a == (__v16qi)__b); } +/// \brief Compares each of the corresponding packed 16-bit values of the +/// 128-bit integer vectors for equality. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPCMPEQW instruction. +/// +/// \param __a +/// A 128-bit integer vector. +/// \param __b +/// A 128-bit integer vector. +/// \returns A 128-bit integer vector containing the comparison results. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a, __m128i __b) { return (__m128i)((__v8hi)__a == (__v8hi)__b); } +/// \brief Compares each of the corresponding packed 32-bit values of the +/// 128-bit integer vectors for equality. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPCMPEQD instruction. +/// +/// \param __a +/// A 128-bit integer vector. +/// \param __b +/// A 128-bit integer vector. +/// \returns A 128-bit integer vector containing the comparison results. 
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a, __m128i __b) { return (__m128i)((__v4si)__a == (__v4si)__b); } +/// \brief Compares each of the corresponding packed 8-bit values of the +/// 128-bit integer vectors to determine if the values in the first operand +/// are +/// greater than those in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPCMPGTB instruction. +/// +/// \param __a +/// A 128-bit integer vector. +/// \param __b +/// A 128-bit integer vector. +/// \returns A 128-bit integer vector containing the comparison results. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a, __m128i __b) { @@ -1004,30 +2953,100 @@ return (__m128i)((__v16qs)__a > (__v16qs)__b); } +/// \brief Compares each of the corresponding packed 16-bit values of the +/// 128-bit integer vectors to determine if the values in the first operand +/// are +/// greater than those in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPCMPGTW instruction. +/// +/// \param __a +/// A 128-bit integer vector. +/// \param __b +/// A 128-bit integer vector. +/// \returns A 128-bit integer vector containing the comparison results. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a, __m128i __b) { return (__m128i)((__v8hi)__a > (__v8hi)__b); } +/// \brief Compares each of the corresponding packed 32-bit values of the +/// 128-bit integer vectors to determine if the values in the first operand +/// are +/// greater than those in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPCMPGTD instruction. +/// +/// \param __a +/// A 128-bit integer vector. +/// \param __b +/// A 128-bit integer vector. +/// \returns A 128-bit integer vector containing the comparison results. 
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a, __m128i __b) { return (__m128i)((__v4si)__a > (__v4si)__b); } +/// \brief Compares each of the corresponding packed 8-bit values of the +/// 128-bit integer vectors to determine if the values in the first operand +/// are +/// less than those in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPCMPGTB instruction. +/// +/// \param __a +/// A 128-bit integer vector. +/// \param __b +/// A 128-bit integer vector. +/// \returns A 128-bit integer vector containing the comparison results. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a, __m128i __b) { return _mm_cmpgt_epi8(__b, __a); } +/// \brief Compares each of the corresponding packed 16-bit values of the +/// 128-bit integer vectors to determine if the values in the first operand +/// are +/// less than those in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPCMPGTW instruction. +/// +/// \param __a +/// A 128-bit integer vector. +/// \param __b +/// A 128-bit integer vector. +/// \returns A 128-bit integer vector containing the comparison results. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a, __m128i __b) { return _mm_cmpgt_epi16(__b, __a); } +/// \brief Compares each of the corresponding packed 32-bit values of the +/// 128-bit integer vectors to determine if the values in the first operand +/// are +/// less than those in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPCMPGTD instruction. +/// +/// \param __a +/// A 128-bit integer vector. +/// \param __b +/// A 128-bit integer vector. +/// \returns A 128-bit integer vector containing the comparison results. 
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a, __m128i __b) { @@ -1035,6 +3054,24 @@ } #ifdef __x86_64__ +/// \brief Converts a 64-bit signed integer value into a double-precision value, +/// writing the result to the lower 64 bits of the destination. The upper +/// 64 bits of the first operand are copied to the upper 64 bits of the +/// destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCVTSI2SD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The upper 64 bits of this +/// operand are copied to the upper 64 bits of the destination. +/// \param __b +/// A 64-bit signed integer operand containing the value to be +/// converted. +/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the +/// converted value of the second operand. The upper 64 bits are copied +/// from the upper 64 bits of the first operand. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi64_sd(__m128d __a, long long __b) { @@ -1042,12 +3079,36 @@ return __a; } +/// \brief Converts a vector of [2 x double] into a 64-bit signed integer value, +/// using the lower 64 bits of the operand, rounding the result according +/// to the current rounding mode when it is inexact. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCVTSD2SI instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower 64 bits are used +/// in the conversion. +/// \returns A 64-bit signed integer containing the converted value. static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsd_si64(__m128d __a) { return __builtin_ia32_cvtsd2si64(__a); } +/// \brief Converts a vector of [2 x double] into a 64-bit signed integer value, +/// using the lower 64 bits of the operand, truncating the result when it +/// is inexact. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCVTTSD2SI instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower 64 bits are used +/// in the conversion.
+/// \returns A 64-bit signed integer containing the converted value. static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvttsd_si64(__m128d __a) { @@ -1055,24 +3116,62 @@ } #endif +/// \brief Converts a vector of [4 x i32] into a vector of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCVTDQ2PS instruction. +/// +/// \param __a +/// A 128-bit integer vector. +/// \returns A 128-bit vector of [4 x float] containing the converted values. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a) { return __builtin_ia32_cvtdq2ps((__v4si)__a); } +/// \brief Converts a 128-bit vector of [4 x float] into a 128-bit vector of [4 +/// x i32]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCVTPS2DQ instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \returns A 128-bit integer vector containing the converted values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a) { return (__m128i)__builtin_ia32_cvtps2dq(__a); } +/// \brief Converts a 128-bit vector of [4 x float] into a 128-bit vector of [4 +/// x i32], truncating the result when it is inexact. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCVTTPS2DQ instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \returns A 128-bit vector of [4 x i32] containing the converted values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a) { return (__m128i)__builtin_ia32_cvttps2dq(__a); } +/// \brief Converts a 32-bit signed integer value into a vector of [4 x i32], +/// writing the result to the lower 32 bits of the destination. +/// +/// \headerfile +/// +/// \param __a +/// A 32-bit signed integer operand containing the value to be +/// converted. +/// \returns A 128-bit vector of [4 x i32] containing the converted value. 
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a) { @@ -1080,6 +3179,15 @@ } #ifdef __x86_64__ +/// \brief Converts a 64-bit signed integer value into a vector of [2 x i64], +/// writing the result to the lower 64 bits of the destination. +/// +/// \headerfile +/// +/// \param __a +/// A 64-bit signed integer operand containing the value to be +/// converted. +/// \returns A 128-bit vector of [2 x i64] containing the converted value. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a) { @@ -1087,6 +3195,15 @@ } #endif +/// \brief Moves the least significant 32 bits of a vector of [4 x i32] to a +/// 32-bit signed integer value. +/// +/// \headerfile +/// +/// \param __a +/// A 128-bit vector of [4 x i32] operand. The least significant +/// 32 bits are moved to the destination. +/// \returns A 32-bit signed integer containing the moved value. static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a) { @@ -1095,6 +3212,15 @@ } #ifdef __x86_64__ +/// \brief Moves the least significant 64 bits of a vector of [2 x i64] to a +/// 64-bit signed integer value. +/// +/// \headerfile +/// +/// \param __a +/// A 128-bit vector of [2 x i64] operand. The least significant +/// 64 bits are moved to the destination. +/// \returns A 64-bit signed integer containing the moved value. static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a) { @@ -1102,12 +3228,33 @@ } #endif +/// \brief Moves packed integer values from an aligned 128-bit memory location +/// to elements in a 128-bit integer vector. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVDQA instruction. +/// +/// \param __p +/// An aligned pointer to a memory location containing integer +/// values. +/// \returns A 128-bit integer vector containing the moved values. 
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_load_si128(__m128i const *__p) { return *__p; } +/// \brief Moves packed integer values from an unaligned 128-bit memory +/// location to elements in a 128-bit integer vector. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVDQU instruction. +/// +/// \param __p +/// A pointer to a memory location containing integer values. +/// \returns A 128-bit integer vector containing the moved values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i const *__p) { @@ -1117,6 +3264,16 @@ return ((struct __loadu_si128*)__p)->__v; } +/// \brief Moves the packed low-order integer values from a 128-bit source +/// operand of [2 x i64] to the corresponding bits in the destination. +/// +/// \headerfile +/// +/// \param __p +/// A pointer to a 128-bit vector of [2 x i64]. +/// Bits [63:0] are written to bits [63:0] of the destination. +/// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the +/// moved value. The higher order bits are cleared. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadl_epi64(__m128i const *__p) { @@ -1132,114 +3289,444 @@ return (__m128i)__builtin_ia32_undef128(); } +/// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with +/// the specified 64-bit integer values. +/// +/// \headerfile +/// +/// \param __q1 +/// A 64-bit integer value used to initialize the upper 64 bits +/// of the destination vector of [2 x i64]. +/// \param __q0 +/// A 64-bit integer value used to initialize the lower 64 bits +/// of the destination vector of [2 x i64]. +/// \returns An initialized 128-bit vector of [2 x i64] containing the values +/// provided in the operands. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1, long long __q0) { return (__m128i){ __q0, __q1 }; } +/// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with +/// the specified 64-bit integer values.
+/// +/// \headerfile +/// +/// \param __q1 +/// A 64-bit integer value used to initialize the upper 64 bits +/// of the destination vector of [2 x i64]. +/// \param __q0 +/// A 64-bit integer value used to initialize the lower 64 bits +/// of the destination vector of [2 x i64]. +/// \returns An initialized 128-bit vector of [2 x i64] containing the values +/// provided in the operands. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1, __m64 __q0) { return (__m128i){ (long long)__q0, (long long)__q1 }; } +/// \brief Initializes the 32-bit values in a 128-bit vector of [4 x i32] with +/// the specified 32-bit integer values. +/// +/// \headerfile +/// +/// \param __i3 +/// A 32-bit integer value used to initialize bits [127:96] of +/// the destination vector. +/// \param __i2 +/// A 32-bit integer value used to initialize bits [95:64] of +/// the destination vector. +/// \param __i1 +/// A 32-bit integer value used to initialize bits [63:32] of +/// the destination vector. +/// \param __i0 +/// A 32-bit integer value used to initialize bits [31:0] of the +/// destination vector. +/// \returns An initialized 128-bit vector of [4 x i32] containing the values +/// provided in the operands. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2, int __i1, int __i0) { return (__m128i)(__v4si){ __i0, __i1, __i2, __i3}; } +/// \brief Initializes the 16-bit values in a 128-bit vector of [8 x i16] with +/// the specified 16-bit integer values. +/// +/// \headerfile +/// +/// \param __w7 +/// A 16-bit integer value used to initialize bits [127:112] of +/// the destination vector. +/// \param __w6 +/// A 16-bit integer value used to initialize bits [111:96] of +/// the destination vector. +/// \param __w5 +/// A 16-bit integer value used to initialize bits [95:80] of +/// the destination vector. +/// \param __w4 +/// A 16-bit integer value used to initialize bits [79:64] of +/// the destination vector. 
+/// \param __w3 +/// A 16-bit integer value used to initialize bits [63:48] of +/// the destination vector. +/// \param __w2 +/// A 16-bit integer value used to initialize bits [47:32] of +/// the destination vector. +/// \param __w1 +/// A 16-bit integer value used to initialize bits [31:16] of +/// the destination vector. +/// \param __w0 +/// A 16-bit integer value used to initialize bits [15:0] of the +/// destination vector. +/// \returns An initialized 128-bit vector of [8 x i16] containing the values +/// provided in the operands. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0) { return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 }; } +/// \brief Initializes the 8-bit values in a 128-bit vector of [16 x i8] with +/// the specified 8-bit integer values. +/// +/// \headerfile +/// +/// \param __b15 +/// Initializes bits [127:120] of the destination vector. +/// \param __b14 +/// Initializes bits [119:112] of the destination vector. +/// \param __b13 +/// Initializes bits [111:104] of the destination vector. +/// \param __b12 +/// Initializes bits [103:96] of the destination vector. +/// \param __b11 +/// Initializes bits [95:88] of the destination vector. +/// \param __b10 +/// Initializes bits [87:80] of the destination vector. +/// \param __b9 +/// Initializes bits [79:72] of the destination vector. +/// \param __b8 +/// Initializes bits [71:64] of the destination vector. +/// \param __b7 +/// Initializes bits [63:56] of the destination vector. +/// \param __b6 +/// Initializes bits [55:48] of the destination vector. +/// \param __b5 +/// Initializes bits [47:40] of the destination vector. +/// \param __b4 +/// Initializes bits [39:32] of the destination vector. +/// \param __b3 +/// Initializes bits [31:24] of the destination vector. +/// \param __b2 +/// Initializes bits [23:16] of the destination vector. 
+/// \param __b1 +/// Initializes bits [15:8] of the destination vector. +/// \param __b0 +/// Initializes bits [7:0] of the destination vector. +/// \returns An initialized 128-bit vector of [16 x i8] containing the values +/// provided in the operands. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0) { return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 }; } +/// \brief Initializes both values in a 128-bit integer vector with the +/// specified 64-bit integer value. +/// +/// \headerfile +/// +/// \param __q +/// Integer value used to initialize the elements of the +/// destination integer vector. +/// \returns An initialized 128-bit integer vector with all elements containing +/// the value provided in the operand. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q) { return (__m128i){ __q, __q }; } +/// \brief Initializes both values in a 128-bit vector of [2 x i64] with the +/// specified 64-bit value. +/// +/// \headerfile +/// +/// \param __q +/// A 64-bit value used to initialize the elements of the +/// destination integer vector. +/// \returns An initialized 128-bit vector of [2 x i64] with all elements +/// containing the value provided in the operand. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q) { return (__m128i){ (long long)__q, (long long)__q }; } +/// \brief Initializes all values in a 128-bit vector of [4 x i32] with the +/// specified 32-bit value. +/// +/// \headerfile +/// +/// \param __i +/// A 32-bit value used to initialize the elements of the +/// destination integer vector. +/// \returns An initialized 128-bit vector of [4 x i32] with all elements +/// containing the value provided in the operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i) { return (__m128i)(__v4si){ __i, __i, __i, __i }; } +/// \brief Initializes all values in a 128-bit vector of [8 x i16] with the +/// specified 16-bit value. +/// +/// \headerfile +/// +/// \param __w +/// A 16-bit value used to initialize the elements of the +/// destination integer vector. +/// \returns An initialized 128-bit vector of [8 x i16] with all elements +/// containing the value provided in the operand. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w) { return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w }; } +/// \brief Initializes all values in a 128-bit vector of [16 x i8] with the +/// specified 8-bit value. +/// +/// \headerfile +/// +/// \param __b +/// An 8-bit value used to initialize the elements of the +/// destination integer vector. +/// \returns An initialized 128-bit vector of [16 x i8] with all elements +/// containing the value provided in the operand. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b) { return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b }; } +/// \brief Initializes both 64-bit integer values in a 128-bit vector of [2 x +/// i64] in reverse order, using the specified 64-bit values. +/// +/// \headerfile +/// +/// \param __q0 +/// A 64-bit value used to initialize the lower 64 bits of the +/// destination vector. +/// \param __q1 +/// A 64-bit value used to initialize the upper 64 bits of the +/// destination vector. +/// \returns An initialized 128-bit vector of [2 x i64] containing the values +/// provided in the operand. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0, __m64 __q1) { return (__m128i){ (long long)__q0, (long long)__q1 }; } +/// \brief Initializes all 32-bit integer values in a 128-bit vector of [4 x i32] +/// in reverse order, using the specified 32-bit integer values. 
+/// +/// \headerfile +/// +/// \param __i0 +/// A 32-bit integer value used to initialize bits [31:0] of the +/// destination vector. +/// \param __i1 +/// A 32-bit integer value used to initialize bits [63:32] of +/// the destination vector. +/// \param __i2 +/// A 32-bit integer value used to initialize bits [95:64] of +/// the destination vector. +/// \param __i3 +/// A 32-bit integer value used to initialize bits [127:96] of +/// the destination vector. +/// \returns An initialized 128-bit vector of [4 x i32] containing the values +/// provided in the operand. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3) { return (__m128i)(__v4si){ __i0, __i1, __i2, __i3}; } +/// \brief Initializes all 16-bit integer values in a 128-bit vector of [8 x i16] +/// in reverse order, using the specified 16-bit integer values. +/// +/// \headerfile +/// +/// \param __w0 +/// A 16-bit integer value used to initialize bits [15:0] of the +/// destination vector. +/// \param __w1 +/// A 16-bit integer value used to initialize bits [31:16] of +/// the destination vector. +/// \param __w2 +/// A 16-bit integer value used to initialize bits [47:32] of +/// the destination vector. +/// \param __w3 +/// A 16-bit integer value used to initialize bits [63:48] of +/// the destination vector. +/// \param __w4 +/// A 16-bit integer value used to initialize bits [79:64] of +/// the destination vector. +/// \param __w5 +/// A 16-bit integer value used to initialize bits [95:80] of +/// the destination vector. +/// \param __w6 +/// A 16-bit integer value used to initialize bits [111:96] of +/// the destination vector. +/// \param __w7 +/// A 16-bit integer value used to initialize bits [127:112] of +/// the destination vector. +/// \returns An initialized 128-bit vector of [8 x i16] containing the values +/// provided in the operand. 
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7) { return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 }; } +/// \brief Initializes all 8-bit integer values in a 128-bit vector of [16 x i8] +/// in reverse order, using the specified 8-bit integer values. +/// +/// \headerfile +/// +/// \param __b0 +/// Initializes bits [7:0] of the destination vector. +/// \param __b1 +/// Initializes bits [15:8] of the destination vector. +/// \param __b2 +/// Initializes bits [23:16] of the destination vector. +/// \param __b3 +/// Initializes bits [31:24] of the destination vector. +/// \param __b4 +/// Initializes bits [39:32] of the destination vector. +/// \param __b5 +/// Initializes bits [47:40] of the destination vector. +/// \param __b6 +/// Initializes bits [55:48] of the destination vector. +/// \param __b7 +/// Initializes bits [63:56] of the destination vector. +/// \param __b8 +/// Initializes bits [71:64] of the destination vector. +/// \param __b9 +/// Initializes bits [79:72] of the destination vector. +/// \param __b10 +/// Initializes bits [87:80] of the destination vector. +/// \param __b11 +/// Initializes bits [95:88] of the destination vector. +/// \param __b12 +/// Initializes bits [103:96] of the destination vector. +/// \param __b13 +/// Initializes bits [111:104] of the destination vector. +/// \param __b14 +/// Initializes bits [119:112] of the destination vector. +/// \param __b15 +/// Initializes bits [127:120] of the destination vector. +/// \returns An initialized 128-bit vector of [16 x i8] containing the values +/// provided in the operand.
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15) { return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 }; } +/// \brief Sets the 128-bit integer registers to zero. +/// +/// \headerfile +/// +/// \returns An initialized 128-bit integer vector with all elements set to zero. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void) { return (__m128i){ 0LL, 0LL }; } +/// \brief Moves packed integer values from a 128-bit integer vector to an +/// aligned memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVDQA instruction. +/// +/// \param __p +/// A pointer to an aligned memory location that will receive +/// the integer values. +/// \param __b +/// A packed 128-bit integer vector containing the values to be +/// moved. static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p, __m128i __b) { *__p = __b; } +/// \brief Moves packed integer values from a 128-bit integer vector to an +/// unaligned memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVDQU instruction. +/// +/// \param __p +/// A pointer to a memory location that will receive the integer +/// values. +/// \param __b +/// A packed 128-bit integer vector containing the values to be +/// moved. static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i *__p, __m128i __b) { __builtin_ia32_storedqu((char *)__p, (__v16qi)__b); } +/// \brief Moves bytes selected by the mask from the first operand to the +/// specified unaligned memory location. When a mask bit is 1, the +/// corresponding byte is written, otherwise it is not written. Exception +/// and trap behavior for elements not selected for storage to memory are +/// implementation dependent. 
+/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMASKMOVDQU instruction. +/// +/// \param __d +/// A 128-bit integer vector containing the values to be moved. +/// \param __n +/// A 128-bit integer vector containing the mask. The most +/// significant bit of each byte represents the mask bits. +/// \param __p +/// A 128-bit unaligned memory location where the specified +/// values are moved. static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p) { __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p); } +/// \brief Moves a packed 64-bit integer value from the lower 64 bits of a +/// 128-bit vector of [2 x i64] to a 128-bit integer vector memory +/// location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVQ instruction. +/// +/// \param __p +/// A pointer to a memory location that will receive the 64-bit +/// integer value. +/// \param __a +/// A packed 128-bit vector of [2 x i64]. The lower 64 bits +/// contain the value to be stored. static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i *__p, __m128i __a) { @@ -1249,18 +3736,58 @@ ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0]; } +/// \brief Stores double-precision values in a 128-bit memory location. To +/// minimize caching, the data is flagged as non-temporal (unlikely to be +/// used again soon). +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVNTPD instruction. +/// +/// \param __p +/// The 128-bit memory location used to store the value. +/// \param __a +/// A vector of [2 x double] containing the 64-bit values to be +/// stored. static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(double *__p, __m128d __a) { __builtin_ia32_movntpd(__p, __a); } +/// \brief Moves packed integer values from a 128-bit integer vector to a +/// 128-bit aligned memory location. To minimize caching, the data is flagged +/// as +/// non-temporal (unlikely to be used again soon). 
+/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVNTDQ instruction. +/// +/// \param __p +/// A 128-bit aligned pointer to a memory location that will +/// receive the integer values. +/// \param __a +/// A 128-bit integer vector containing the values to be moved. static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(__m128i *__p, __m128i __a) { __builtin_ia32_movntdq(__p, __a); } +/// \brief Stores a 32-bit integer value in the specified aligned memory +/// location. To minimize caching, the data is flagged as non-temporal +/// (unlikely to be used again soon). +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c MOVNTI instruction. +/// +/// \param __p +/// The aligned memory location used to store the register +/// value. +/// \param __a +/// A 32-bit integer containing the value to be stored. static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si32(int *__p, int __a) { @@ -1275,42 +3802,143 @@ } #endif +/// \brief The cache line containing __p is flushed +/// and invalidated from all caches in the coherency domain. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c CLFLUSH instruction. +/// +/// \param __p +/// The memory location used to identify the cache line to be +/// flushed. static __inline__ void __DEFAULT_FN_ATTRS _mm_clflush(void const *__p) { __builtin_ia32_clflush(__p); } +/// \brief Forces strong memory ordering (serialization) between load +/// instructions preceding this instruction and load instructions +/// following this instruction, assuring the system completes all previous +/// loads before executing subsequent loads. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c LFENCE instruction. 
+/// static __inline__ void __DEFAULT_FN_ATTRS _mm_lfence(void) { __builtin_ia32_lfence(); } +/// \brief Forces strong memory ordering (serialization) between load and store +/// instructions preceding this instruction and load and store +/// instructions following this instruction, assuring that the system +/// completes all previous memory accesses before executing subsequent +/// memory accesses. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c MFENCE instruction. +/// static __inline__ void __DEFAULT_FN_ATTRS _mm_mfence(void) { __builtin_ia32_mfence(); } +/// \brief Converts 16-bit signed integers from both 128-bit integer vector +/// operands into 8-bit signed integers, and packs the results into the +/// destination. Positive values greater than 7Fh are saturated to 7Fh. +/// Negative values less than 80h are saturated to 80h. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPACKSSWB instruction. +/// +/// \param __a +/// A 128-bit vector of [8 x i16]. The converted values are +/// written to the lower order bits of the destination. +/// \param __b +/// A 128-bit vector of [8 x i16]. The converted values are +/// written to the upper order bits of the destination. +/// \returns A 128-bit vector of [16 x i8] containing the converted values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b); } +/// \brief Converts 32-bit signed integers from both 128-bit integer vector +/// operands into 16-bit signed integers, and packs the results into the +/// destination. Positive values greater than 7FFFh are saturated to 7FFFh. +/// Negative values less than 8000h are saturated to 8000h. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPACKSSDW instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x i32]. The converted values are +/// written to the lower order bits of the destination. 
+/// \param __b +/// A 128-bit vector of [4 x i32]. The converted values are +/// written to the upper order bits of the destination. +/// \returns A 128-bit vector of [8 x i16] containing the converted values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b); } +/// \brief Converts 16-bit signed integers from both 128-bit integer vector +/// operands into 8-bit unsigned integers, and packs the results into the +/// destination. Values greater than FFh +/// are saturated to FFh. Values +/// less than 00h are saturated to +/// 00h. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPACKUSWB instruction. +/// +/// \param __a +/// A 128-bit vector of [8 x i16]. The converted values are +/// written to the lower order bits of the destination. +/// \param __b +/// A 128-bit vector of [8 x i16]. The converted values are +/// written to the upper order bits of the destination. +/// \returns A 128-bit vector of [16 x i8] containing the converted values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b); } +/// \brief Extracts 16 bits of extended packed data from a 128-bit integer +/// vector and copies it to the destination, as specified by the integer +/// operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPEXTRW instruction. +/// +/// \param __a +/// A 128-bit integer vector. +/// \param __imm +/// Determines which bits are extracted using bits [2:0]: +/// 000: Bits [15:0] are copied to the destination. +/// 001: Bits [31:16] are copied to the destination. +/// 010: Bits [47:32] are copied to the destination. +/// 011: Bits [63:48] are copied to the destination. +/// 100: Bits [79:64] are copied to the destination. +/// 101: Bits [95:80] are copied to the destination. 
+/// 110: Bits [111:96] are copied to the destination. +/// 111: Bits [127:112] are copied to the destination. +/// \returns A 16-bit integer containing the extracted 16 bits of extended +/// packed data. static __inline__ int __DEFAULT_FN_ATTRS _mm_extract_epi16(__m128i __a, int __imm) { @@ -1318,6 +3946,27 @@ return (unsigned short)__b[__imm & 7]; } +/// \brief Copies extended packed data from the 128-bit integer vector operand +/// to the destination and inserts the lower 16-bits of an integer operand, +/// using the offset specified by the immediate operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPINSRW instruction. +/// +/// \param __a +/// A 128-bit integer vector. The remaining bits in the +/// destination are copied from the corresponding bits in this +/// operand. +/// \param __b +/// An integer. The bits of this operand are written to the +/// destination beginning at the offset specified by operand __imm. +/// \param __imm +/// Specifies the bit offset to be used in the destination. The +/// remaining bits in the destination are copied from the +/// corresponding bits in operand __a. +/// \returns A 128-bit integer vector containing the copied extended packed +/// data from the operands. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_insert_epi16(__m128i __a, int __b, int __imm) { @@ -1326,18 +3975,69 @@ return (__m128i)__c; } +/// \brief Copies the values of the most significant bits from each 8-bit +/// element in a 128-bit integer vector to create a 16-bit mask value, +/// zero-extends the value, and writes it to the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPMOVMSKB instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the values with bits to +/// be extracted. +/// \returns The most significant bits from each 8-bit element in the operand, +/// written to bits [15:0]. 
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a) { return __builtin_ia32_pmovmskb128((__v16qi)__a); } +/// \brief Shuffles the 4 32-bit integers from a 128-bit integer vector to the +/// destination, as specified by the immediate value operand. +/// +/// \headerfile +/// +/// \code +/// __m128i _mm_shuffle_epi32(__m128i a, const int imm); +/// \endcode +/// +/// This intrinsic corresponds to \c VPSHUFD instruction. +/// +/// \param a +/// A 128-bit integer vector containing the values to be copied. +/// \param imm +/// An 8-bit immediate value containing four 2-bit selectors. +/// Bits [1:0], [3:2], [5:4] and [7:6] select which 32-bit element +/// of a is copied to bits [31:0], [63:32], [95:64] and [127:96] +/// of the destination, respectively. +/// \returns A 128-bit integer vector containing the shuffled values. #define _mm_shuffle_epi32(a, imm) __extension__ ({ \ (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \ (__v4si)_mm_setzero_si128(), \ (imm) & 0x3, ((imm) & 0xc) >> 2, \ ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); }) +/// \brief Shuffles the lower 4 16-bit integers from a 128-bit integer vector +/// to the destination, as specified by the immediate value operand. +/// +/// \headerfile +/// +/// \code +/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm); +/// \endcode +/// +/// This intrinsic corresponds to \c VPSHUFLW instruction. +/// +/// \param a +/// A 128-bit integer vector containing the values to be copied. +/// \param imm +/// An 8-bit immediate value containing four 2-bit selectors. +/// Bits [1:0], [3:2], [5:4] and [7:6] each select one of the four +/// lower 16-bit elements of a to copy to bits [15:0], [31:16], +/// [47:32] and [63:48] of the destination, respectively. +/// \returns A 128-bit integer vector containing the shuffled values. 
#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \ (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \ (__v8hi)_mm_setzero_si128(), \ @@ -1345,6 +4045,25 @@ ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \ 4, 5, 6, 7); }) +/// \brief Shuffles the upper 4 16-bit integers from a 128-bit integer vector +/// to the destination, as specified by the immediate value operand. +/// +/// \headerfile +/// +/// \code +/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm); +/// \endcode +/// +/// This intrinsic corresponds to \c VPSHUFHW instruction. +/// +/// \param a +/// A 128-bit integer vector containing the values to be copied. +/// \param imm +/// An 8-bit immediate value containing four 2-bit selectors. +/// Bits [1:0], [3:2], [5:4] and [7:6] each select one of the four +/// upper 16-bit elements of a to copy to bits [79:64], [95:80], +/// [111:96] and [127:112] of the destination, respectively. +/// \returns A 128-bit integer vector containing the shuffled values. #define _mm_shufflehi_epi16(a, imm) __extension__ ({ \ (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \ (__v8hi)_mm_setzero_si128(), \ @@ -1354,130 +4073,442 @@ 4 + (((imm) & 0x30) >> 4), \ 4 + (((imm) & 0xc0) >> 6)); }) +/// \brief Unpacks the high-order (index 8-15) values from two 128-bit vectors +/// of [16 x i8] and interleaves them into a packed 128-bit vector of [16 x +/// i8]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPUNPCKHBW instruction. +/// +/// \param __a +/// A 128-bit vector of [16 x i8]. +/// Bits [71:64] are written to bits [7:0] of the destination. +/// This pattern continues until: +/// Bits [127:120] are written to bits [119:112] of the +/// destination. +/// \param __b +/// A 128-bit vector of [16 x i8]. +/// Bits [71:64] are written to bits [15:8] of the destination. +/// This pattern continues until: +/// Bits [127:120] are written to bits [127:120] of the +/// destination. +/// \returns A 128-bit vector of [16 x i8] containing the interleaved values. 
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a, __m128i __b) { return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); } +/// \brief Unpacks the high-order (index 4-7) values from two 128-bit vectors +/// of [8 x i16] and interleaves them into a packed 128-bit vector of [8 x +/// i16]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPUNPCKHWD instruction. +/// +/// \param __a +/// A 128-bit vector of [8 x i16]. +/// Bits [79:64] are written to bits [15:0] of the destination. +/// This pattern continues until: +/// Bits [127:112] are written to bits [111:96] of the +/// destination. +/// \param __b +/// A 128-bit vector of [8 x i16]. +/// Bits [79:64] are written to bits [31:16] of the destination. +/// This pattern continues until: +/// Bits [127:112] are written to bits [127:112] of the +/// destination. +/// \returns A 128-bit vector of [8 x i16] containing the interleaved values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7); } +/// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors +/// of [4 x i32] and interleaves them into a packed 128-bit vector of [4 x +/// i32]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPUNPCKHDQ instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x i32]. +/// Bits [95:64] are written to bits [31:0] of the destination. +/// Bits [127:96] are written to bits [95:64] of the +/// destination. +/// \param __b +/// A 128-bit vector of [4 x i32]. +/// Bits [95:64] are written to bits [63:32] of the destination. +/// Bits [127:96] are written to bits [127:96] of the +/// destination. +/// \returns A 128-bit vector of [4 x i32] containing the interleaved values. 
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a, __m128i __b) { return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3); } +/// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors +/// of [2 x i64] and interleaves them into a packed 128-bit vector of [2 x +/// i64]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPUNPCKHQDQ instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x i64]. +/// Bits [127:64] are written to bits [63:0] of the destination. +/// \param __b +/// A 128-bit vector of [2 x i64]. +/// Bits [127:64] are written to bits [127:64] of the +/// destination. +/// \returns A 128-bit vector of [2 x i64] containing the interleaved values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a, __m128i __b) { return (__m128i)__builtin_shufflevector(__a, __b, 1, 2+1); } +/// \brief Unpacks the low-order (index 0-7) values from two 128-bit vectors of +/// [16 x i8] and interleaves them into a packed 128-bit vector of [16 x +/// i8]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPUNPCKLBW instruction. +/// +/// \param __a +/// A 128-bit vector of [16 x i8]. +/// Bits [7:0] are written to bits [7:0] of the destination. +/// This pattern continues until: +/// Bits [63:56] are written to bits [119:112] of the +/// destination. +/// \param __b +/// A 128-bit vector of [16 x i8]. +/// Bits [7:0] are written to bits [15:8] of the destination. +/// This pattern continues until: +/// Bits [63:56] are written to bits [127:120] of the +/// destination. +/// \returns A 128-bit vector of [16 x i8] containing the interleaved values. 
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a, __m128i __b) { return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7); } +/// \brief Unpacks the low-order (index 0-3) values from two 128-bit vectors of +/// [8 x i16] and interleaves them into a packed 128-bit vector of [8 x +/// i16]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPUNPCKLWD instruction. +/// +/// \param __a +/// A 128-bit vector of [8 x i16]. +/// Bits [15:0] are written to bits [15:0] of the destination. +/// This pattern continues until: +/// Bits [63:48] are written to bits [111:96] of the +/// destination. +/// \param __b +/// A 128-bit vector of [8 x i16]. +/// Bits [15:0] are written to bits [31:16] of the destination. +/// This pattern continues until: +/// Bits [63:48] are written to bits [127:112] of the +/// destination. +/// \returns A 128-bit vector of [8 x i16] containing the interleaved values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3); } +/// \brief Unpacks the low-order (index 0,1) values from two 128-bit vectors of +/// [4 x i32] and interleaves them into a packed 128-bit vector of [4 x +/// i32]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPUNPCKLDQ instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x i32]. +/// Bits [31:0] are written to bits [31:0] of the destination. +/// Bits [63:32] are written to bits [95:64] of the destination. +/// \param __b +/// A 128-bit vector of [4 x i32]. +/// Bits [31:0] are written to bits [63:32] of the destination. +/// Bits [63:32] are written to bits [127:96] of the +/// destination. +/// \returns A 128-bit vector of [4 x i32] containing the interleaved values. 
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a, __m128i __b) { return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1); } +/// \brief Unpacks the low-order (even-indexed) values from two 128-bit vectors +/// of [2 x i64] and interleaves them into a packed 128-bit vector of [2 x +/// i64]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPUNPCKLQDQ instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x i64]. +/// Bits [63:0] are written to bits [63:0] of the destination. +/// \param __b +/// A 128-bit vector of [2 x i64]. +/// Bits [63:0] are written to bits [127:64] of the destination. +/// \returns A 128-bit vector of [2 x i64] containing the interleaved values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a, __m128i __b) { return (__m128i)__builtin_shufflevector(__a, __b, 0, 2+0); } +/// \brief Moves the lower 64 bits of a 128-bit integer vector to a 64-bit +/// register. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c MOVDQ2Q instruction. +/// +/// \param __a +/// A 128-bit integer vector operand. The lower 64 bits are +/// moved to the destination. +/// \returns A 64-bit register containing the lower 64 bits of the operand. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a) { return (__m64)__a[0]; } +/// \brief Moves the 64-bit operand to a 128-bit integer vector, zeroing the +/// upper bits. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c MOVQ2DQ instruction. +/// +/// \param __a +/// A 64-bit value. +/// \returns A 128-bit integer vector. The lower 64 bits contain the value from +/// the operand. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a) { return (__m128i){ (long long)__a, 0 }; } +/// \brief Moves the lower 64 bits of a 128-bit integer vector to a 128-bit +/// integer vector, zeroing the upper bits. 
+/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVQ instruction. +/// +/// \param __a +/// A 128-bit integer vector operand. The lower 64 bits are +/// moved to the destination. +/// \returns A 128-bit integer vector. The lower 64 bits contain the value from +/// the operand. The upper 64 bits are assigned zeros. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a) { return __builtin_shufflevector(__a, (__m128i){ 0 }, 0, 2); } +/// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors +/// of [2 x double] and interleaves them into a packed 128-bit vector of +/// [2 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VUNPCKHPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// Bits [127:64] are written to bits [63:0] of the destination. +/// \param __b +/// A 128-bit vector of [2 x double]. +/// Bits [127:64] are written to bits [127:64] of the +/// destination. +/// \returns A 128-bit vector of [2 x double] containing the interleaved values. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a, __m128d __b) { return __builtin_shufflevector(__a, __b, 1, 2+1); } +/// \brief Unpacks the low-order (even-indexed) values from two 128-bit vectors +/// of [2 x double] and interleaves them into a packed 128-bit vector of +/// [2 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VUNPCKLPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// Bits [63:0] are written to bits [63:0] of the destination. +/// \param __b +/// A 128-bit vector of [2 x double]. +/// Bits [63:0] are written to bits [127:64] of the destination. +/// \returns A 128-bit vector of [2 x double] containing the interleaved values. 
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a, __m128d __b) { return __builtin_shufflevector(__a, __b, 0, 2+0); } +/// \brief Extracts the sign bits of the packed double-precision values in the +/// 128-bit vector of [2 x double], zero-extends the value, and writes it +/// to the low-order bits of the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVMSKPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing the values with +/// sign bits to be extracted. +/// \returns The sign bits from the operand, written to bits [1:0]. The +/// remaining bits are assigned values of zero. static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a) { return __builtin_ia32_movmskpd(__a); } +/// \brief Selects two double-precision values from the 128-bit operands of [2 +/// x double], as specified by the immediate value operand. +/// +/// \headerfile +/// +/// \code +/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i); +/// \endcode +/// +/// This intrinsic corresponds to \c VSHUFPD instruction. +/// +/// \param a +/// A 128-bit vector of [2 x double]. +/// \param b +/// A 128-bit vector of [2 x double]. +/// \param i +/// An immediate value containing 8-bit values specifying which +/// elements to copy from a and b: +/// Bit [0]=0: Bits [63:0] are copied from a. +/// Bit [0]=1: Bits [127:64] are copied from a. +/// Bit [1]=0: Bits [63:0] are copied from b. +/// Bit [1]=1: Bits [127:64] are copied from b. +/// \returns A 128-bit vector of [2 x double] containing the shuffled values. #define _mm_shuffle_pd(a, b, i) __extension__ ({ \ (__m128d)__builtin_shufflevector((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \ (i) & 1, (((i) & 2) >> 1) + 2); }) +/// \brief Casts 64-bit double-precision values as packed 32-bit float values. +/// +/// \headerfile +/// +/// \param __a +/// A 128-bit vector of [2 x double] to be cast as float values. 
+/// \returns A 128-bit vector of [4 x float] containing the typecast values +/// provided in the operand. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a) { return (__m128)__a; } +/// \brief Casts 64-bit double-precision values as integer values. +/// +/// \headerfile +/// +/// \param __a +/// A 128-bit vector of [2 x double] to be cast as integer +/// values. +/// \returns A 128-bit integer vector containing the typecast values provided +/// in the operand. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a) { return (__m128i)__a; } +/// \brief Casts 32-bit float values as 64-bit double-precision values. +/// +/// \headerfile +/// +/// \param __a +/// A 128-bit vector of [4 x float] to be cast as +/// double-precision values. +/// \returns A 128-bit vector of [2 x double] containing the typecast values +/// provided in the operand. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a) { return (__m128d)__a; } +/// \brief Casts 32-bit float values as integer values. +/// +/// \headerfile +/// +/// \param __a +/// A 128-bit vector of [4 x float] to be cast as integer +/// values. +/// \returns A 128-bit integer vector containing the typecast values provided +/// in the operand. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a) { return (__m128i)__a; } +/// \brief Casts integer values as 32-bit float values. +/// +/// \headerfile +/// +/// \param __a +/// A 128-bit integer vector to be cast as float values. +/// \returns A 128-bit vector of [4 x float] containing the typecast values +/// provided in the operand. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a) { return (__m128)__a; } +/// \brief Casts integer values as 64-bit double-precision values. +/// +/// \headerfile +/// +/// \param __a +/// A 128-bit integer vector to be cast as double-precision +/// values. 
+/// \returns A 128-bit vector of [2 x double] containing the typecast values +/// provided in the operand. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a) { return (__m128d)__a; } +/// \brief Indicates that a spin loop is being executed for the purposes of +/// optimizing power consumption during the loop. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PAUSE instruction. +/// static __inline__ void __DEFAULT_FN_ATTRS _mm_pause(void) { Index: lib/Headers/f16cintrin.h =================================================================== --- lib/Headers/f16cintrin.h +++ lib/Headers/f16cintrin.h @@ -31,9 +31,43 @@ /* Define the default attributes for the functions in this file. */ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("f16c"))) +/// \brief Converts a 128-bit vector containing 32-bit float values into a +/// 128-bit vector containing 16-bit half-precision float values. +/// +/// \headerfile +/// +/// \code +/// __m128i _mm_cvtps_ph(__m128 a, const int imm); +/// \endcode +/// +/// This intrinsic corresponds to \c VCVTPS2PH instruction. +/// +/// \param a +/// A 128-bit vector containing 32-bit float values. +/// \param imm +/// An immediate value controlling rounding using bits [2:0]: +/// 000: Nearest +/// 001: Down +/// 010: Up +/// 011: Truncate +/// 1XX: Use MXCSR.RC for rounding +/// \returns A 128-bit vector containing converted 16-bit half-precision float +/// values. The lower 64 bits are used to store the converted 16-bit +/// half-precision floating-point values. #define _mm_cvtps_ph(a, imm) __extension__ ({ \ (__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)); }) +/// \brief Converts a 128-bit vector containing 16-bit half-precision float +/// values into a 128-bit vector containing 32-bit float values. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCVTPH2PS instruction. 
+/// +/// \param __a +/// A 128-bit vector containing 16-bit half-precision float +/// values. The lower 64 bits are used in the conversion. +/// \returns A 128-bit vector of [4 x float] containing converted float values. static __inline __m128 __DEFAULT_FN_ATTRS _mm_cvtph_ps(__m128i __a) { Index: lib/Headers/immintrin.h =================================================================== --- lib/Headers/immintrin.h +++ lib/Headers/immintrin.h @@ -46,9 +46,46 @@ Intel documents these as being in immintrin.h, and they depend on typedefs from avxintrin.h. */ +/// \brief Converts a 256-bit vector of [8 x float] into a 128-bit vector +/// containing 16-bit half-precision float values. +/// +/// \headerfile +/// +/// \code +/// __m128i _mm256_cvtps_ph(__m256 a, const int imm); +/// \endcode +/// +/// This intrinsic corresponds to \c VCVTPS2PH instruction. +/// +/// \param a +/// A 256-bit vector containing 32-bit single-precision float +/// values to be converted to 16-bit half-precision float +/// values. +/// \param imm +/// An immediate value controlling rounding using bits [2:0]: +/// 000: Nearest +/// 001: Down +/// 010: Up +/// 011: Truncate +/// 1XX: Use MXCSR.RC for rounding +/// \returns A 128-bit vector containing the converted 16-bit half-precision +/// float values. #define _mm256_cvtps_ph(a, imm) __extension__ ({ \ (__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm)); }) +/// \brief Converts a 128-bit vector containing 16-bit half-precision float +/// values into a 256-bit vector of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCVTPH2PS instruction. +/// +/// \param __a +/// A 128-bit vector containing 16-bit half-precision float +/// values to be converted to 32-bit single-precision float +/// values. +/// \returns A vector of [8 x float] containing the converted 32-bit +/// single-precision float values. 
static __inline __m256 __attribute__((__always_inline__, __nodebug__, __target__("f16c"))) _mm256_cvtph_ps(__m128i __a) { Index: lib/Headers/mmintrin.h =================================================================== --- lib/Headers/mmintrin.h +++ lib/Headers/mmintrin.h @@ -33,366 +33,1229 @@ /* Define the default attributes for the functions in this file. */ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mmx"))) +/// \brief Clears the MMX state by setting the state of the stack registers to +/// empty. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c EMMS instruction. +/// static __inline__ void __DEFAULT_FN_ATTRS _mm_empty(void) { __builtin_ia32_emms(); } +/// \brief Converts a 32-bit signed integer value into a 64-bit vector, writing +/// the result to the lower 32 bits of the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVD instruction. +/// +/// \param __i +/// A 32-bit signed integer operand containing the value to be +/// converted. +/// \returns A 64-bit vector containing the converted value. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtsi32_si64(int __i) { return (__m64)__builtin_ia32_vec_init_v2si(__i, 0); } +/// \brief Converts a 64-bit vector into a 32-bit signed integer value, using +/// the lower 32 bits of the operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVD instruction. +/// +/// \param __m +/// A 64-bit vector. The lower 32 bits are used in the +/// conversion. +/// \returns A 32-bit signed integer containing the converted value. static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi64_si32(__m64 __m) { return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0); } +/// \brief Converts a 64-bit signed integer value into a 64-bit vector. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c MOVQ instruction. +/// +/// \param __i +/// A 64-bit signed integer operand containing the value to be +/// converted. 
+/// \returns A 64-bit vector containing the converted value of operand __i. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtsi64_m64(long long __i) { return (__m64)__i; } +/// \brief Converts a 64-bit vector into a 64-bit signed integer value. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c MOVQ instruction. +/// +/// \param __m +/// A 64-bit vector operand containing the value to be +/// converted. +/// \returns A 64-bit signed integer containing the converted value of operand __m. static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtm64_si64(__m64 __m) { return (long long)__m; } +/// \brief Converts 16-bit signed integers from both 64-bit vector operands +/// into 8-bit signed integer values, and packs the results into the +/// destination. Positive values greater than 7Fh are saturated to 7Fh. +/// Negative values less than 80h are saturated to 80h. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PACKSSWB instruction. +/// +/// \param __m1 +/// A 64-bit vector of [4 x i16]. The converted values are +/// written to the lower order bits of the destination. +/// \param __m2 +/// A 64-bit vector of [4 x i16]. The converted values are +/// written to the upper order bits of the destination. +/// \returns A 64-bit vector of [8 x i8] containing the converted values. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_packs_pi16(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2); } +/// \brief Converts 32-bit signed integers from both 64-bit integer vector +/// operands into 16-bit signed integers, and packs the results into the +/// destination. Positive values greater than 7FFFh are saturated to 7FFFh. +/// Negative values less than 8000h are saturated to 8000h. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PACKSSDW instruction. +/// +/// \param __m1 +/// A 64-bit vector of [2 x i32]. The converted values are +/// written to the lower order bits of the destination.
+/// \param __m2 +/// A 64-bit vector of [2 x i32]. The converted values are +/// written to the upper order bits of the destination. +/// \returns A 64-bit vector of [4 x i16] containing the converted values. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_packs_pi32(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2); } +/// \brief Converts 16-bit signed integers from both 64-bit integer vector +/// operands into 8-bit unsigned integers, and packs the results into the +/// destination. Values greater than FFh +/// are saturated to FFh. Values +/// less than 00h are saturated to +/// 00h. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PACKUSWB instruction. +/// +/// \param __m1 +/// A 64-bit vector of [4 x i16]. The converted values are +/// written to the lower order bits of the destination. +/// \param __m2 +/// A 64-bit vector of [4 x i16]. The converted values are +/// written to the upper order bits of the destination. +/// \returns A 64-bit vector of [8 x i8] containing the converted values. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_packs_pu16(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2); } +/// \brief Unpacks the high-order (index 4-7) values from two 64-bit vectors of +/// [8 x i8] and interleaves them into a packed 64-bit vector of [8 x i8]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PUNPCKHBW instruction. +/// +/// \param __m1 +/// A 64-bit vector of [8 x i8]. +/// Bits [39:32] are written to bits [7:0] of the destination. +/// This pattern continues until: +/// Bits [63:56] are written to bits [55:48] of the destination. +/// \param __m2 +/// A 64-bit vector of [8 x i8]. +/// Bits [39:32] are written to bits [15:8] of the destination. +/// This pattern continues until: +/// Bits [63:56] are written to bits [63:56] of the destination. +/// \returns A 64-bit vector of [8 x i8] containing the interleaved values.
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2); } +/// \brief Unpacks the high-order (index 2,3) values from two 64-bit vectors of +/// [4 x i16] and interleaves them into a packed 64-bit vector of [4 x +/// i16]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PUNPCKHWD instruction. +/// +/// \param __m1 +/// A 64-bit vector of [4 x i16]. +/// Bits [47:32] are written to bits [15:0] of the destination. +/// Bits [63:48] are written to bits [47:32] of the destination. +/// \param __m2 +/// A 64-bit vector of [4 x i16]. +/// Bits [47:32] are written to bits [31:16] of the destination. +/// Bits [63:48] are written to bits [63:48] of the destination. +/// \returns A 64-bit vector of [4 x i16] containing the interleaved values. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2); } +/// \brief Unpacks the high-order (odd-indexed) values from two 64-bit vectors +/// of [2 x i32] and interleaves them into a packed 64-bit vector of [2 x +/// i32]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PUNPCKHDQ instruction. +/// +/// \param __m1 +/// A 64-bit vector of [2 x i32]. +/// Bits [63:32] are written to bits [31:0] of the destination. +/// \param __m2 +/// A 64-bit vector of [2 x i32]. +/// Bits [63:32] are written to bits [63:32] of the destination. +/// \returns A 64-bit vector of [2 x i32] containing the interleaved values. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2); } +/// \brief Unpacks the low-order (index 0-3) values from two 64-bit vectors of +/// [8 x i8] and interleaves them into a packed 64-bit vector of [8 x i8]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PUNPCKLBW instruction. 
+/// +/// \param __m1 +/// A 64-bit vector of [8 x i8]. +/// Bits [7:0] are written to bits [7:0] of the destination. +/// This pattern continues until: +/// Bits [31:24] are written to bits [55:48] of the destination. +/// \param __m2 +/// A 64-bit vector of [8 x i8]. +/// Bits [7:0] are written to bits [15:8] of the destination. +/// This pattern continues until: +/// Bits [31:24] are written to bits [63:56] of the destination. +/// \returns A 64-bit vector of [8 x i8] containing the interleaved values. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2); } +/// \brief Unpacks the low-order (index 0,1) values from two 64-bit vectors of +/// [4 x i16] and interleaves them into a packed 64-bit vector of [4 x i16]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PUNPCKLWD instruction. +/// +/// \param __m1 +/// A 64-bit vector of [4 x i16]. +/// Bits [15:0] are written to bits [15:0] of the destination. +/// Bits [31:16] are written to bits [47:32] of the destination. +/// \param __m2 +/// A 64-bit vector of [4 x i16]. +/// Bits [15:0] are written to bits [31:16] of the destination. +/// Bits [31:16] are written to bits [63:48] of the destination. +/// \returns A 64-bit vector of [4 x i16] containing the interleaved values. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2); } +/// \brief Unpacks the low-order (even-indexed) values from two 64-bit vectors +/// of [2 x i32] and interleaves them into a packed 64-bit vector of [2 x +/// i32]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PUNPCKLDQ instruction. +/// +/// \param __m1 +/// A 64-bit vector of [2 x i32]. +/// Bits [31:0] are written to bits [31:0] of the destination. +/// \param __m2 +/// A 64-bit vector of [2 x i32]. 
+/// Bits [31:0] are written to bits [63:32] of the destination. +/// \returns A 64-bit vector of [2 x i32] containing the interleaved values. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2); } +/// \brief Adds packed 8-bit integer values and writes the sums to the +/// corresponding bits in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PADDB instruction. +/// +/// \param __m1 +/// A 64-bit integer vector containing one of the source +/// operands. +/// \param __m2 +/// A 64-bit integer vector containing one of the source +/// operands. +/// \returns A 64-bit integer vector containing the sums of both operands. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_add_pi8(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2); } +/// \brief Adds packed 16-bit integer values and writes the sums to the +/// corresponding bits in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PADDW instruction. +/// +/// \param __m1 +/// A 64-bit integer vector containing one of the source +/// operands. +/// \param __m2 +/// A 64-bit integer vector containing one of the source +/// operands. +/// \returns A 64-bit integer vector containing the sums of both operands. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_add_pi16(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2); } +/// \brief Adds packed 32-bit integer values and writes the sums to the +/// corresponding bits in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PADDD instruction. +/// +/// \param __m1 +/// A 64-bit integer vector containing one of the source +/// operands. +/// \param __m2 +/// A 64-bit integer vector containing one of the source +/// operands. +/// \returns A 64-bit integer vector containing the sums of both operands. 
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_add_pi32(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2); } +/// \brief Adds packed 8-bit integer values and writes the sums to the +/// corresponding bits in the destination. Positive sums greater than +/// 7Fh are saturated to 7Fh. Negative sums less than 80h are saturated to +/// 80h. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PADDSB instruction. +/// +/// \param __m1 +/// A 64-bit integer vector containing one of the source +/// operands. +/// \param __m2 +/// A 64-bit integer vector containing one of the source +/// operands. +/// \returns A 64-bit integer vector containing the sums of both operands. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_adds_pi8(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2); } +/// \brief Adds packed 16-bit integer values and writes the sums to the +/// corresponding bits in the destination. Positive sums greater than +/// 7FFFh are saturated to 7FFFh. Negative sums less than 8000h are saturated +/// to 8000h. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PADDSW instruction. +/// +/// \param __m1 +/// A 64-bit integer vector containing one of the source +/// operands. +/// \param __m2 +/// A 64-bit integer vector containing one of the source +/// operands. +/// \returns A 64-bit integer vector containing the sums of both operands. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_adds_pi16(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2); } +/// \brief Adds packed 8-bit integer values and writes the unsigned integer +/// sums to the corresponding bits in the destination. Sums greater than +/// FFh are saturated to FFh. The operands are treated as unsigned, so sums +/// cannot be negative. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PADDUSB instruction.
+/// +/// \param __m1 +/// A 64-bit integer vector containing one of the source +/// operands. +/// \param __m2 +/// A 64-bit integer vector containing one of the source +/// operands. +/// \returns A 64-bit integer vector containing the unsigned integer sums of +/// both operands. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_adds_pu8(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2); } +/// \brief Adds packed 16-bit integer values and writes the unsigned integer +/// sums to the corresponding bits in the destination. Sums greater +/// than FFFFh are saturated to +/// FFFFh. The operands are treated as +/// unsigned, so sums cannot be negative. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PADDUSW instruction. +/// +/// \param __m1 +/// A 64-bit integer vector containing one of the source +/// operands. +/// \param __m2 +/// A 64-bit integer vector containing one of the source +/// operands. +/// \returns A 64-bit integer vector containing the unsigned integer sums of +/// both operands. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_adds_pu16(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2); } +/// \brief Subtracts the 8-bit integer values in the operands. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PSUBB instruction. +/// +/// \param __m1 +/// A 64-bit integer vector containing the minuends. +/// \param __m2 +/// A 64-bit integer vector containing the subtrahends. +/// \returns A 64-bit integer vector containing the differences of the values +/// in the operands. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sub_pi8(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2); } +/// \brief Subtracts the 16-bit integer values in the operands. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PSUBW instruction. +/// +/// \param __m1 +/// A 64-bit integer vector containing the minuends.
+/// \param __m2 +/// A 64-bit integer vector containing the subtrahends. +/// \returns A 64-bit integer vector containing the differences of the values +/// in the operands. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sub_pi16(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2); } +/// \brief Subtracts the 32-bit integer values in the operands. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PSUBD instruction. +/// +/// \param __m1 +/// A 64-bit integer vector containing the minuends. +/// \param __m2 +/// A 64-bit integer vector containing the subtrahends. +/// \returns A 64-bit integer vector containing the differences of the values +/// in the operands. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sub_pi32(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2); } +/// \brief Subtracts packed 8-bit integer values and writes the differences to +/// the corresponding bits in the destination. Values greater than the +/// largest signed 8-bit integer are saturated to 7Fh, and values less than +/// the smallest +/// signed 8-bit integer are saturated to 80h. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PSUBSB instruction. +/// +/// \param __m1 +/// A 64-bit integer vector containing the minuends. +/// \param __m2 +/// A 64-bit integer vector containing the subtrahends. +/// \returns A 64-bit integer vector containing the differences of the values +/// in the operands. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_subs_pi8(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2); } +/// \brief Subtracts packed 16-bit integer values and writes the differences to +/// the corresponding bits in the destination. Values greater than 7FFFh are +/// saturated to 7FFFh, and values less than 8000h are saturated to 8000h. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PSUBSW instruction. 
+/// +/// \param __m1 +/// A 64-bit integer vector containing the minuends. +/// \param __m2 +/// A 64-bit integer vector containing the subtrahends. +/// \returns A 64-bit integer vector containing the differences of the values +/// in the operands. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_subs_pi16(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2); } +/// \brief Subtracts packed 8-bit integer values and writes the unsigned +/// integer differences to the corresponding bits in the destination. Values +/// less +/// than 00h are saturated to 00h. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PSUBUSB instruction. +/// +/// \param __m1 +/// A 64-bit integer vector containing the minuends. +/// \param __m2 +/// A 64-bit integer vector containing the subtrahends. +/// \returns A 64-bit integer vector containing the unsigned integer differences +/// of the values in the operands. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_subs_pu8(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2); } +/// \brief Subtracts packed 16-bit integer values and writes the unsigned +/// integer differences to the corresponding bits in the destination. Values +/// less than 0000h are saturated +/// to 0000h. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PSUBUSW instruction. +/// +/// \param __m1 +/// A 64-bit integer vector containing the minuends. +/// \param __m2 +/// A 64-bit integer vector containing the subtrahends. +/// \returns A 64-bit integer vector containing the unsigned integer differences +/// of the values in the operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_subs_pu16(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2); } +/// \brief Multiplies corresponding pairs of packed 16-bit signed integer values, +/// adds pairs of contiguous products, and writes the 32-bit sums to the +/// corresponding bits in the destination. For example, bits [15:0] of +/// both operands are multiplied, bits [31:16] of both operands are +/// multiplied, and the sum of both results is written to bits [31:0] of +/// the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PMADDWD instruction. +/// +/// \param __m1 +/// A 64-bit integer vector containing one of the source +/// operands. +/// \param __m2 +/// A 64-bit integer vector containing one of the source +/// operands. +/// \returns A 64-bit integer vector containing the sums of products of both +/// operands. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_madd_pi16(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2); } +/// \brief Multiplies packed 16-bit signed integer values and writes the +/// high-order 16 bits of each 32-bit product to the corresponding bits in +/// the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PMULHW instruction. +/// +/// \param __m1 +/// A 64-bit integer vector containing one of the source +/// operands. +/// \param __m2 +/// A 64-bit integer vector containing one of the source +/// operands. +/// \returns A 64-bit integer vector containing the products of both operands. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mulhi_pi16(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2); } +/// \brief Multiplies packed 16-bit integer values and writes the low-order 16 +/// bits of each 32-bit product to the corresponding bits in the +/// destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PMULLW instruction.
+/// +/// \param __m1 +/// A 64-bit integer vector containing one of the source +/// operands. +/// \param __m2 +/// A 64-bit integer vector containing one of the source +/// operands. +/// \returns A 64-bit integer vector containing the products of both operands. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mullo_pi16(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2); } +/// \brief Left-shifts each 16-bit value in the 64-bit integer vector operand +/// by the specified number of bits. Low-order bits are cleared. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PSLLW instruction. +/// +/// \param __m +/// A 64-bit integer vector containing the source operand. +/// \param __count +/// A 64-bit integer vector specifying the number of bits to +/// left-shift each value in operand __m. +/// \returns A 64-bit integer vector containing the left-shifted value. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sll_pi16(__m64 __m, __m64 __count) { return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count); } +/// \brief Left-shifts each 16-bit value in the 64-bit integer vector operand +/// by the specified number of bits. Low-order bits are cleared. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PSLLW instruction. +/// +/// \param __m +/// A 64-bit integer vector containing the source operand. +/// \param __count +/// An integer value specifying the number of bits to left-shift +/// each value in operand __m. +/// \returns A 64-bit integer vector containing the left-shifted value. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_slli_pi16(__m64 __m, int __count) { return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count); } +/// \brief Left-shifts each 32-bit value in the 64-bit integer vector operand +/// by the specified number of bits. Low-order bits are cleared. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PSLLD instruction. 
+/// +/// \param __m +/// A 64-bit integer vector containing the source operand. +/// \param __count +/// A 64-bit integer vector specifying the number of bits to +/// left-shift each value in operand __m. +/// \returns A 64-bit integer vector containing the left-shifted value. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sll_pi32(__m64 __m, __m64 __count) { return (__m64)__builtin_ia32_pslld((__v2si)__m, __count); } +/// \brief Left-shifts each 32-bit value in the 64-bit integer vector operand +/// by the specified number of bits. Low-order bits are cleared. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PSLLD instruction. +/// +/// \param __m +/// A 64-bit integer vector containing the source operand. +/// \param __count +/// An integer value specifying the number of bits to left-shift +/// each value in operand __m. +/// \returns A 64-bit integer vector containing the left-shifted value. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_slli_pi32(__m64 __m, int __count) { return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count); } +/// \brief Left-shifts the 64-bit integer vector operand by the specified +/// number of bits. Low-order bits are cleared. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PSLLQ instruction. +/// +/// \param __m +/// A 64-bit integer vector containing the source operand. +/// \param __count +/// A 64-bit integer vector specifying the number of bits to +/// left-shift each value in operand __m. +/// \returns A 64-bit integer vector containing the left-shifted value. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sll_si64(__m64 __m, __m64 __count) { return (__m64)__builtin_ia32_psllq(__m, __count); } +/// \brief Left-shifts the 64-bit integer vector operand by the specified +/// number of bits. Low-order bits are cleared. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PSLLQ instruction. +/// +/// \param __m +/// A 64-bit integer vector containing the source operand. 
+/// \param __count +/// An integer value specifying the number of bits to left-shift +/// each value in operand __m. +/// \returns A 64-bit integer vector containing the left-shifted value. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_slli_si64(__m64 __m, int __count) { return (__m64)__builtin_ia32_psllqi(__m, __count); } +/// \brief Right-shifts each 16-bit value in the 64-bit integer vector operand +/// by the specified number of bits. High-order bits are filled with the +/// sign +/// bit of the initial value. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PSRAW instruction. +/// +/// \param __m +/// A 64-bit integer vector containing the source operand. +/// \param __count +/// A 64-bit integer vector specifying the number of bits to +/// right-shift each value in operand __m. +/// \returns A 64-bit integer vector containing the right-shifted value. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sra_pi16(__m64 __m, __m64 __count) { return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count); } +/// \brief Right-shifts each 16-bit value in the 64-bit integer vector operand +/// by the specified number of bits. High-order bits are filled with the +/// sign +/// bit of the initial value. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PSRAW instruction. +/// +/// \param __m +/// A 64-bit integer vector containing the source operand. +/// \param __count +/// An integer value specifying the number of bits to +/// right-shift each value in operand __m. +/// \returns A 64-bit integer vector containing the right-shifted value. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_srai_pi16(__m64 __m, int __count) { return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count); } +/// \brief Right-shifts each 32-bit value in the 64-bit integer vector operand +/// by the specified number of bits. High-order bits are filled with the +/// sign +/// bit of the initial value. 
+/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PSRAD instruction. +/// +/// \param __m +/// A 64-bit integer vector containing the source operand. +/// \param __count +/// A 64-bit integer vector specifying the number of bits to +/// right-shift each value in operand __m. +/// \returns A 64-bit integer vector containing the right-shifted value. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sra_pi32(__m64 __m, __m64 __count) { return (__m64)__builtin_ia32_psrad((__v2si)__m, __count); } +/// \brief Right-shifts each 32-bit value in the 64-bit integer vector operand +/// by the specified number of bits. High-order bits are filled with the +/// sign +/// bit of the initial value. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PSRAD instruction. +/// +/// \param __m +/// A 64-bit integer vector containing the source operand. +/// \param __count +/// An integer value specifying the number of bits to +/// right-shift each value in operand __m. +/// \returns A 64-bit integer vector containing the right-shifted value. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_srai_pi32(__m64 __m, int __count) { return (__m64)__builtin_ia32_psradi((__v2si)__m, __count); } +/// \brief Right-shifts each 16-bit value in the 64-bit integer vector operand +/// by the specified number of bits. High-order bits are cleared. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PSRLW instruction. +/// +/// \param __m +/// A 64-bit integer vector containing the source operand. +/// \param __count +/// A 64-bit integer vector specifying the number of bits to +/// right-shift each value in operand __m. +/// \returns A 64-bit integer vector containing the right-shifted value. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_srl_pi16(__m64 __m, __m64 __count) { return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count); } +/// \brief Right-shifts each packed 16-bit value in the 64-bit integer vector +/// operand by the specified number of bits. 
High-order bits are cleared. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PSRLW instruction. +/// +/// \param __m +/// A 64-bit integer vector containing the source operand. +/// \param __count +/// An integer value specifying the number of bits to +/// right-shift each value in operand __m. +/// \returns A 64-bit integer vector containing the right-shifted value. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_srli_pi16(__m64 __m, int __count) { return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count); } +/// \brief Right-shifts each 32-bit value in the 64-bit integer vector operand +/// by the specified number of bits. High-order bits are cleared. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PSRLD instruction. +/// +/// \param __m +/// A 64-bit integer vector containing the source operand. +/// \param __count +/// A 64-bit integer vector specifying the number of bits to +/// right-shift each value in operand __m. +/// \returns A 64-bit integer vector containing the right-shifted value. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_srl_pi32(__m64 __m, __m64 __count) { return (__m64)__builtin_ia32_psrld((__v2si)__m, __count); } +/// \brief Right-shifts each packed 32-bit value in the 64-bit integer vector +/// operand by the specified number of bits. High-order bits are cleared. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PSRLD instruction. +/// +/// \param __m +/// A 64-bit integer vector containing the source operand. +/// \param __count +/// An integer value specifying the number of bits to +/// right-shift each value in operand __m. +/// \returns A 64-bit integer vector containing the right-shifted value. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_srli_pi32(__m64 __m, int __count) { return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count); } +/// \brief Right-shifts the 64-bit integer vector operand by the specified +/// number of bits. High-order bits are cleared. 
+/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PSRLQ instruction. +/// +/// \param __m +/// A 64-bit integer vector containing the source operand. +/// \param __count +/// A 64-bit integer vector specifying the number of bits to +/// right-shift each value in operand __m. +/// \returns A 64-bit integer vector containing the right-shifted value. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_srl_si64(__m64 __m, __m64 __count) { return (__m64)__builtin_ia32_psrlq(__m, __count); } +/// \brief Right-shifts the 64-bit integer vector operand by the specified +/// number of bits. High-order bits are cleared. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PSRLQ instruction. +/// +/// \param __m +/// A 64-bit integer vector containing the source operand. +/// \param __count +/// An integer value specifying the number of bits to +/// right-shift each value in operand __m. +/// \returns A 64-bit integer vector containing the right-shifted value. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_srli_si64(__m64 __m, int __count) { return (__m64)__builtin_ia32_psrlqi(__m, __count); } +/// \brief Performs a bitwise AND of 2 packed 64-bit integer vectors. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PAND instruction. +/// +/// \param __m1 +/// A 64-bit integer vector containing one of the source +/// operands. +/// \param __m2 +/// A 64-bit integer vector containing one of the source +/// operands. +/// \returns A 64-bit integer vector containing the bitwise AND of the values +/// between both operands. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_and_si64(__m64 __m1, __m64 __m2) { return __builtin_ia32_pand(__m1, __m2); } +/// \brief Performs a bitwise AND of 2 packed 64-bit integer vectors, using the +/// ones-complement of the values contained in the first source +/// operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PANDN instruction. 
+/// +/// \param __m1 +/// A 64-bit integer vector containing the left source operand. +/// The ones complement of this value is used in the +/// bitwise AND. +/// \param __m2 +/// A 64-bit integer vector containing the right source operand. +/// \returns A 64-bit integer vector containing the bitwise AND of the +/// ones-complement of the first operand and the values in the second operand. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_andnot_si64(__m64 __m1, __m64 __m2) { return __builtin_ia32_pandn(__m1, __m2); } +/// \brief Performs a bitwise OR of 2 packed 64-bit integer vectors. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c POR instruction. +/// +/// \param __m1 +/// A 64-bit integer vector containing one of the source +/// operands. +/// \param __m2 +/// A 64-bit integer vector containing one of the source +/// operands. +/// \returns A 64-bit integer vector containing the bitwise OR of the values +/// between both operands. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_or_si64(__m64 __m1, __m64 __m2) { return __builtin_ia32_por(__m1, __m2); } +/// \brief Performs a bitwise exclusive OR of 2 packed 64-bit integer vectors. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PXOR instruction. +/// +/// \param __m1 +/// A 64-bit integer vector containing one of the source +/// operands. +/// \param __m2 +/// A 64-bit integer vector containing one of the source +/// operands. +/// \returns A 64-bit integer vector containing the bitwise exclusive OR of the +/// values between both operands. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_xor_si64(__m64 __m1, __m64 __m2) { return __builtin_ia32_pxor(__m1, __m2); } +/// \brief Compares each of the corresponding packed 8-bit values of the 64-bit +/// integer vectors for equality. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PCMPEQB instruction. +/// +/// \param __m1 +/// A 64-bit integer vector. +/// \param __m2 +/// A 64-bit integer vector. 
+/// \returns A 64-bit integer vector containing the comparison results. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2); } +/// \brief Compares each of the corresponding packed 16-bit values of the +/// 64-bit integer vectors for equality. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PCMPEQW instruction. +/// +/// \param __m1 +/// A 64-bit integer vector. +/// \param __m2 +/// A 64-bit integer vector. +/// \returns A 64-bit integer vector containing the comparison results. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2); } +/// \brief Compares each of the corresponding packed 32-bit values of the +/// 64-bit integer vectors for equality. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PCMPEQD instruction. +/// +/// \param __m1 +/// A 64-bit integer vector. +/// \param __m2 +/// A 64-bit integer vector. +/// \returns A 64-bit integer vector containing the comparison results. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2); } +/// \brief Compares each of the corresponding packed 8-bit values of the 64-bit +/// integer vectors to determine if the values in the first operand are +/// greater than those in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PCMPGTB instruction. +/// +/// \param __m1 +/// A 64-bit integer vector. +/// \param __m2 +/// A 64-bit integer vector. +/// \returns A 64-bit integer vector containing the comparison results. 
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2); } +/// \brief Compares each of the corresponding packed 16-bit values of the +/// 64-bit integer vectors to determine if the values in the first operand +/// are +/// greater than those in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PCMPGTW instruction. +/// +/// \param __m1 +/// A 64-bit integer vector. +/// \param __m2 +/// A 64-bit integer vector. +/// \returns A 64-bit integer vector containing the comparison results. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2); } +/// \brief Compares each of the corresponding packed 32-bit values of the +/// 64-bit integer vectors to determine if the values in the first operand +/// are +/// greater than those in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PCMPGTD instruction. +/// +/// \param __m1 +/// A 64-bit integer vector. +/// \param __m2 +/// A 64-bit integer vector. +/// \returns A 64-bit integer vector containing the comparison results. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) { return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2); } +/// \brief Sets a 64-bit integer register to zero. +/// +/// \headerfile +/// +/// \returns An initialized 64-bit integer vector with all elements set to zero. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_setzero_si64(void) { return (__m64){ 0LL }; } +/// \brief Initializes the 32-bit values in a 64-bit vector of [2 x i32] with +/// the specified 32-bit integer values. +/// +/// \headerfile +/// +/// \param __i1 +/// A 32-bit integer value used to initialize bits [63:32] of +/// the destination vector. 
+/// \param __i0 +/// A 32-bit integer value used to initialize bits [31:0] of the +/// destination vector. +/// \returns An initialized 64-bit vector of [2 x i32] containing the values +/// provided in the operands. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_set_pi32(int __i1, int __i0) { return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1); } +/// \brief Initializes the 16-bit values in a 64-bit vector of [4 x i16] with +/// the specified 16-bit integer values. +/// +/// \headerfile +/// +/// \param __s3 +/// A 16-bit integer value used to initialize bits [63:48] of +/// the destination vector. +/// \param __s2 +/// A 16-bit integer value used to initialize bits [47:32] of +/// the destination vector. +/// \param __s1 +/// A 16-bit integer value used to initialize bits [31:16] of +/// the destination vector. +/// \param __s0 +/// A 16-bit integer value used to initialize bits [15:0] of the +/// destination vector. +/// \returns An initialized 64-bit vector of [4 x i16] containing the values +/// provided in the operands. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_set_pi16(short __s3, short __s2, short __s1, short __s0) { return (__m64)__builtin_ia32_vec_init_v4hi(__s0, __s1, __s2, __s3); } +/// \brief Initializes the 8-bit values in a 64-bit vector of [8 x i8] with the +/// specified 8-bit integer values. +/// +/// \headerfile +/// +/// \param __b7 +/// Initializes bits [63:56] of the destination vector. +/// \param __b6 +/// Initializes bits [55:48] of the destination vector. +/// \param __b5 +/// Initializes bits [47:40] of the destination vector. +/// \param __b4 +/// Initializes bits [39:32] of the destination vector. +/// \param __b3 +/// Initializes bits [31:24] of the destination vector. +/// \param __b2 +/// Initializes bits [23:16] of the destination vector. +/// \param __b1 +/// Initializes bits [15:8] of the destination vector. +/// \param __b0 +/// Initializes bits [7:0] of the destination vector. 
+/// \returns An initialized 64-bit vector of [8 x i8] containing the values +/// provided in the operands. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0) @@ -401,36 +1264,121 @@ __b4, __b5, __b6, __b7); } +/// \brief Initializes all values in a 64-bit vector of [2 x i32] with the +/// specified 32-bit value. +/// +/// \headerfile +/// +/// \param __i +/// A 32-bit value used to initialize the elements of the +/// destination integer vector. +/// \returns An initialized 64-bit vector of [2 x i32] with all elements +/// containing the value provided in the operand. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_set1_pi32(int __i) { return _mm_set_pi32(__i, __i); } +/// \brief Initializes all values in a 64-bit vector of [4 x i16] with the +/// specified 16-bit value. +/// +/// \headerfile +/// +/// \param __w +/// A 16-bit value used to initialize the elements of the +/// destination integer vector. +/// \returns An initialized 64-bit vector of [4 x i16] with all elements +/// containing the value provided in the operand. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_set1_pi16(short __w) { return _mm_set_pi16(__w, __w, __w, __w); } +/// \brief Initializes all values in a 64-bit vector of [8 x i8] with the +/// specified 8-bit value. +/// +/// \headerfile +/// +/// \param __b +/// An 8-bit value used to initialize the elements of the +/// destination integer vector. +/// \returns An initialized 64-bit vector of [8 x i8] with all elements +/// containing the value provided in the operand. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_set1_pi8(char __b) { return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b); } +/// \brief Initializes all 32-bit integer values in a 64-bit vector of [2 x i32] +/// in reverse order, using the specified 32-bit integer values. 
+/// +/// \headerfile +/// +/// \param __i0 +/// A 32-bit integer value used to initialize bits [31:0] of the +/// destination vector. +/// \param __i1 +/// A 32-bit integer value used to initialize bits [63:32] of +/// the destination vector. +/// \returns An initialized 64-bit vector of [2 x i32] containing the values +/// provided in the operand. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_setr_pi32(int __i0, int __i1) { return _mm_set_pi32(__i1, __i0); } +/// \brief Initializes all 16-bit integer values in a 64-bit vector of [4 x i16] +/// in reverse order, using the specified 16-bit integer values. +/// +/// \headerfile +/// +/// \param __w0 +/// A 16-bit integer value used to initialize bits [15:0] of the +/// destination vector. +/// \param __w1 +/// A 16-bit integer value used to initialize bits [31:16] of +/// the destination vector. +/// \param __w2 +/// A 16-bit integer value used to initialize bits [47:32] of +/// the destination vector. +/// \param __w3 +/// A 16-bit integer value used to initialize bits [63:48] of +/// the destination vector. +/// \returns An initialized 64-bit vector of [4 x i16] containing the values +/// provided in the operand. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) { return _mm_set_pi16(__w3, __w2, __w1, __w0); } +/// \brief Initializes all 8-bit integer values in a 64-bit vector of [8 x i8] +/// in reverse order, using the specified 8-bit integer values. +/// +/// \headerfile +/// +/// \param __b0 +/// Initializes bits [7:0] of the destination vector. +/// \param __b1 +/// Initializes bits [15:8] of the destination vector. +/// \param __b2 +/// Initializes bits [23:16] of the destination vector. +/// \param __b3 +/// Initializes bits [31:24] of the destination vector. +/// \param __b4 +/// Initializes bits [39:32] of the destination vector. +/// \param __b5 +/// Initializes bits [47:40] of the destination vector. 
+/// \param __b6 +/// Initializes bits [55:48] of the destination vector. +/// \param __b7 +/// Initializes bits [63:56] of the destination vector. +/// \returns An initialized 64-bit vector of [8 x i8] containing the values +/// provided in the operand. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7) Index: lib/Headers/pmmintrin.h =================================================================== --- lib/Headers/pmmintrin.h +++ lib/Headers/pmmintrin.h @@ -29,62 +29,233 @@ /* Define the default attributes for the functions in this file. */ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse3"))) +/// \brief Moves integer values from an unaligned memory location to elements +/// in a 128-bit integer vector. The instruction may read 16 bytes to +/// retrieve either or both of the first and second parts of the operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VLDDQU instruction. +/// +/// \param __p +/// A pointer to a 128-bit integer vector containing integer +/// values. +/// \returns A 128-bit integer vector containing the moved values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_lddqu_si128(__m128i const *__p) { return (__m128i)__builtin_ia32_lddqu((char const *)__p); } +/// \brief Subtracts the even-indexed values and adds the odd-indexed values of +/// 2 packed 128-bit vectors of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VADDSUBPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing the left source +/// operand. +/// \param __b +/// A 128-bit vector of [4 x float] containing the right source +/// operand. +/// \returns A 128-bit vector of [4 x float] containing the alternating sums +/// and differences between both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_addsub_ps(__m128 __a, __m128 __b) { return __builtin_ia32_addsubps(__a, __b); } +/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed +/// 128-bit vectors of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VHADDPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the source +/// operands. The horizontal sums of the values are stored in +/// the lower bits of the destination. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the source +/// operands. The horizontal sums of the values are stored in +/// the upper bits of the destination. +/// \returns A 128-bit vector of [4 x float] containing the horizontal sums of +/// both operands. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_hadd_ps(__m128 __a, __m128 __b) { return __builtin_ia32_haddps(__a, __b); } +/// \brief Horizontally subtracts the adjacent pairs of values contained in 2 +/// packed 128-bit vectors of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VHSUBPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the source +/// operands. The horizontal differences between the values are +/// stored in the lower bits of the destination. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the source +/// operands. The horizontal differences between the values are +/// stored in the upper bits of the destination. +/// \returns A 128-bit vector of [4 x float] containing the horizontal +/// differences of both operands. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_hsub_ps(__m128 __a, __m128 __b) { return __builtin_ia32_hsubps(__a, __b); } +/// \brief Moves and duplicates high-order (odd-indexed) values from a 128-bit +/// vector of [4 x float] to float values stored in a packed 128-bit +/// vector of [4 x float]. 
+/// Bits [127:96] of the source are written to bits [127:96] and [95:64] +/// of the destination. +/// Bits [63:32] of the source are written to bits [63:32] and [31:0] of +/// the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVSHDUP instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \returns A 128-bit vector of [4 x float] containing the moved and +/// duplicated values. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_movehdup_ps(__m128 __a) { return __builtin_shufflevector(__a, __a, 1, 1, 3, 3); } +/// \brief Moves and duplicates low-order (even-indexed) values from a 128-bit +/// vector of [4 x float] to float values stored in a packed 128-bit +/// vector of [4 x float]. +/// Bits [95:64] of the source are written to bits [127:96] and [95:64] of +/// the destination. +/// Bits [31:0] of the source are written to bits [63:32] and [31:0] of +/// the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVSLDUP instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \returns A 128-bit vector of [4 x float] containing the moved and +/// duplicated values. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_moveldup_ps(__m128 __a) { return __builtin_shufflevector(__a, __a, 0, 0, 2, 2); } +/// \brief Subtracts the even-indexed values and adds the odd-indexed values of +/// 2 packed 128-bit vectors of [2 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VADDSUBPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing the left source +/// operand. +/// \param __b +/// A 128-bit vector of [2 x double] containing the right source +/// operand. +/// \returns A 128-bit vector of [2 x double] containing the alternating sums +/// and differences between both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_addsub_pd(__m128d __a, __m128d __b) { return __builtin_ia32_addsubpd(__a, __b); } +/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed +/// 128-bit vectors of [2 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VHADDPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the +/// source operands. The horizontal sums of the values are +/// stored in the lower bits of the destination. +/// \param __b +/// A 128-bit vector of [2 x double] containing one of the +/// source operands. The horizontal sums of the values are +/// stored in the upper bits of the destination. +/// \returns A 128-bit vector of [2 x double] containing the horizontal sums of +/// both operands. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_hadd_pd(__m128d __a, __m128d __b) { return __builtin_ia32_haddpd(__a, __b); } +/// \brief Horizontally subtracts the adjacent pairs of values contained in 2 +/// packed 128-bit vectors of [2 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VHSUBPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the +/// source operands. The horizontal differences between the +/// values are stored in the lower bits of the destination. +/// \param __b +/// A 128-bit vector of [2 x double] containing one of the +/// source operands. The horizontal differences between the +/// values are stored in the upper bits of the destination. +/// \returns A 128-bit vector of [2 x double] containing the horizontal +/// differences of both operands. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_hsub_pd(__m128d __a, __m128d __b) { return __builtin_ia32_hsubpd(__a, __b); } +/// \brief Moves and duplicates one double-precision value to double-precision +/// values stored in a packed 128-bit vector of [2 x double]. 
+/// +/// \headerfile +/// +/// \code +/// __m128d _mm_loaddup_pd(double const * dp); +/// \endcode +/// +/// This intrinsic corresponds to \c VMOVDDUP instruction. +/// +/// \param dp +/// A pointer to a double-precision value to be moved and +/// duplicated. +/// \returns A 128-bit vector of [2 x double] containing the moved and +/// duplicated values. #define _mm_loaddup_pd(dp) _mm_load1_pd(dp) +/// \brief Moves and duplicates the double-precision value in the lower bits of +/// a 128-bit vector of [2 x double] to double-precision values stored in a +/// packed 128-bit vector of [2 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVDDUP instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. Bits [63:0] are written to +/// bits [127:64] and [63:0] of the destination. +/// \returns A 128-bit vector of [2 x double] containing the moved and +/// duplicated values. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_movedup_pd(__m128d __a) { @@ -99,12 +270,45 @@ #define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK) #define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x))) +/// \brief Establishes a linear address memory range to be monitored and puts +/// the processor in the monitor event pending state. Data stored in the +/// monitored address range causes the processor to exit the pending +/// state. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c MONITOR instruction. +/// +/// \param __p +/// The memory range to be monitored. The size of the range is +/// determined by CPUID function 0000_0005h. +/// \param __extensions +/// Optional extensions for the monitoring state. +/// \param __hints +/// Optional hints for the monitoring state. 
static __inline__ void __DEFAULT_FN_ATTRS _mm_monitor(void const *__p, unsigned __extensions, unsigned __hints) { __builtin_ia32_monitor((void *)__p, __extensions, __hints); } +/// \brief Used with the MONITOR instruction to wait while the processor is in +/// the monitor event pending state. Data stored in the monitored address +/// range causes the processor to exit the pending state. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c MWAIT instruction. +/// +/// \param __extensions +/// Optional extensions for the monitoring state. Only setting +/// bit 0, which allows interrupts to wake MWAIT, is supported. +/// Setting any other bits results in a General Protection +/// fault. +/// \param __hints +/// Optional hints for the monitoring state. No hints are +/// actually defined: any bits set in this value are ignored by +/// the processor. static __inline__ void __DEFAULT_FN_ATTRS _mm_mwait(unsigned __extensions, unsigned __hints) { Index: lib/Headers/popcntintrin.h =================================================================== --- lib/Headers/popcntintrin.h +++ lib/Headers/popcntintrin.h @@ -27,6 +27,15 @@ /* Define the default attributes for the functions in this file. */ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("popcnt"))) +/// \brief Counts the number of bits in the source operand having a value of 1. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c POPCNT instruction. +/// +/// \param __A +/// An unsigned 32-bit integer operand. +/// \returns A 32-bit integer containing the number of set bits in the source. static __inline__ int __DEFAULT_FN_ATTRS _mm_popcnt_u32(unsigned int __A) { @@ -40,6 +49,15 @@ } #ifdef __x86_64__ +/// \brief Counts the number of bits in the source operand having a value of 1. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c POPCNT instruction. +/// +/// \param __A +/// An unsigned 64-bit integer operand.
+/// \returns A 64-bit integer containing the number of set bits in the source. static __inline__ long long __DEFAULT_FN_ATTRS _mm_popcnt_u64(unsigned long long __A) { Index: lib/Headers/prfchwintrin.h =================================================================== --- lib/Headers/prfchwintrin.h +++ lib/Headers/prfchwintrin.h @@ -35,6 +35,28 @@ __builtin_prefetch (__P, 0, 3 /* _MM_HINT_T0 */); } +/// \brief Loads a memory sequence containing the specified memory address into +/// the L1 data cache. Data can be +/// written to the cache line without additional delay, because the data +/// is already prefetched in the modified cache-coherency state. Data can +/// also be read from the cache line without additional delay. However, +/// prefetching write data takes longer than prefetching read data if the +/// processor must wait for another caching master to first write back its +/// modified copy of the requested data to memory before the prefetch +/// request is satisfied. +/// The PREFETCHW instruction +/// provides a hint to the processor that the cache line is to be +/// modified, and is intended for use when the cache line will be written +/// to shortly after the prefetch is performed. The processor can place +/// the cache line in the modified state when it is prefetched, but before +/// it is actually written. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PREFETCHW instruction. +/// +/// \param __P +/// A pointer specifying the memory address to be prefetched.
static __inline__ void __attribute__((__always_inline__, __nodebug__)) _m_prefetchw(void *__P) { Index: lib/Headers/smmintrin.h =================================================================== --- lib/Headers/smmintrin.h +++ lib/Headers/smmintrin.h @@ -46,37 +46,352 @@ #define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION) #define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION) +/// \brief Rounds up the values stored in a packed 128-bit vector of [4 x float]. +/// The source values are rounded to integer values and returned as +/// floating-point values. +/// +/// \headerfile +/// +/// \code +/// __m128 _mm_ceil_ps(__m128 X); +/// \endcode +/// +/// \param X +/// A 128-bit vector of [4 x float] values. +/// \returns A 128-bit vector of [4 x float] containing the rounded values. #define _mm_ceil_ps(X) _mm_round_ps((X), _MM_FROUND_CEIL) +/// \brief Rounds up the values stored in a packed 128-bit vector of [2 x +/// double]. The source values are rounded to integer values and returned +/// as double-precision values. +/// +/// \headerfile +/// +/// \code +/// __m128d _mm_ceil_pd(__m128d X); +/// \endcode +/// +/// \param X +/// A 128-bit vector of [2 x double] values. +/// \returns A 128-bit vector of [2 x double] containing the rounded values. #define _mm_ceil_pd(X) _mm_round_pd((X), _MM_FROUND_CEIL) +/// \brief Copies the values stored in bits [127:32] from the first operand to +/// the destination. Rounds up the low-order value stored in bits [31:0] +/// of the second operand to an integer value and stores the result in +/// bits [31:0] of the destination. +/// +/// \headerfile +/// +/// \code +/// __m128 _mm_ceil_ss(__m128 X, __m128 Y); +/// \endcode +/// +/// \param X +/// A 128-bit vector of [4 x float] values. The values stored in +/// bits [127:32] are copied to the corresponding bits in the +/// destination. +/// \param Y +/// A 128-bit vector of [4 x float] values. 
The value stored in +/// bits [31:0] is rounded up to the nearest integer and is +/// stored in bits [31:0] of the destination. +/// \returns A 128-bit vector of [4 x float] containing the copied and rounded +/// values. #define _mm_ceil_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_CEIL) +/// \brief Copies the value stored in bits [127:64] from the first operand to +/// the destination. Rounds up the low-order value stored in bits [63:0] of +/// the second operand to an integer value and stores the result in bits +/// [63:0] of the destination. +/// +/// \headerfile +/// +/// \code +/// __m128d _mm_ceil_sd(__m128d X, __m128d Y); +/// \endcode +/// +/// \param X +/// A 128-bit vector of [2 x double] values. The values stored +/// in bits [127:64] are copied to the corresponding bits in the +/// destination. +/// \param Y +/// A 128-bit vector of [2 x double] values. The value stored in +/// bits [63:0] is rounded up to the nearest integer and is +/// stored in bits [63:0] of the destination. +/// \returns A 128-bit vector of [2 x double] containing the copied and rounded +/// values. #define _mm_ceil_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_CEIL) +/// \brief Rounds down the values stored in a packed 128-bit vector of [4 x +/// float]. The source values are rounded to integer values and returned +/// as floating-point values. +/// +/// \headerfile +/// +/// \code +/// __m128 _mm_floor_ps(__m128 X); +/// \endcode +/// +/// \param X +/// A 128-bit vector of [4 x float] values. +/// \returns A 128-bit vector of [4 x float] containing the rounded values. #define _mm_floor_ps(X) _mm_round_ps((X), _MM_FROUND_FLOOR) +/// \brief Rounds down the values stored in a packed 128-bit vector of [2 x +/// double]. The source values are rounded to integer values and returned +/// as double-precision values. +/// +/// \headerfile +/// +/// \code +/// __m128d _mm_floor_pd(__m128d X); +/// \endcode +/// +/// \param X +/// A 128-bit vector of [2 x double] values. 
+/// \returns A 128-bit vector of [2 x double] containing the rounded values. #define _mm_floor_pd(X) _mm_round_pd((X), _MM_FROUND_FLOOR) +/// \brief Copies the values stored in bits [127:32] from the first operand to +/// the destination. Rounds down the low-order value stored in bits [31:0] +/// of the second operand to an integer value and stores the result in +/// bits [31:0] of the destination. +/// +/// \headerfile +/// +/// \code +/// __m128 _mm_floor_ss(__m128 X, __m128 Y); +/// \endcode +/// +/// \param X +/// A 128-bit vector of [4 x float] values. The values stored in +/// bits [127:32] are copied to the corresponding bits in the +/// destination. +/// \param Y +/// A 128-bit vector of [4 x float] values. The value stored in +/// bits [31:0] is rounded down to the nearest integer and is +/// stored in bits [31:0] of the destination. +/// \returns A 128-bit vector of [4 x float] containing the copied and rounded +/// values. #define _mm_floor_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_FLOOR) +/// \brief Copies the value stored in bits [127:64] from the first operand to +/// the destination. Rounds down the low-order value stored in bits [63:0] +/// of +/// the second operand to an integer value and stores the result in bits +/// [63:0] of the destination. +/// +/// \headerfile +/// +/// \code +/// __m128d _mm_floor_sd(__m128d X, __m128d Y); +/// \endcode +/// +/// \param X +/// A 128-bit vector of [2 x double] values. The values stored +/// in bits [127:64] are copied to the corresponding bits in the +/// destination. +/// \param Y +/// A 128-bit vector of [2 x double] values. The value stored in +/// bits [63:0] is rounded down to the nearest integer and is +/// stored in bits [63:0] of the destination. +/// \returns A 128-bit vector of [2 x double] containing the copied and rounded +/// values. 
#define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR) +/// \brief Rounds the values stored in a packed 128-bit vector of [4 x float] +/// using the specified rounding control. The source values are rounded to +/// integer values and returned as floating-point values. +/// +/// \headerfile +/// +/// \code +/// __m128 _mm_round_ps(__m128 X, const int M); +/// \endcode +/// +/// \param X +/// A 128-bit vector of [4 x float] values. +/// \param M +/// An integer value that specifies the rounding operation. +/// Bits [7:4] are reserved. +/// Bit [3] is a precision exception value: +/// 0: A normal PE exception is used +/// 1: The PE field is not updated +/// Bit [2] is the rounding control source: +/// 0: Use bits [1:0] of M +/// 1: Use the current MXCSR setting +/// Bits [1:0] contain the rounding control definition: +/// 00: Nearest +/// 01: Downward (toward negative infinity) +/// 10: Upward (toward positive infinity) +/// 11: Truncated +/// \returns A 128-bit vector of [4 x float] containing the rounded values. #define _mm_round_ps(X, M) __extension__ ({ \ (__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)); }) +/// \brief Copies the values stored in bits [127:32] from the first operand to +/// the destination. Rounds the low-order value stored in bits [31:0] of +/// the second operand to an integer value using the specified rounding +/// control, and stores the result in bits [31:0] of the destination. +/// +/// \headerfile +/// +/// \code +/// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M); +/// \endcode +/// +/// \param X +/// A 128-bit vector of [4 x float] values. The values stored in +/// bits [127:32] are copied to the corresponding bits in the +/// destination. +/// \param Y +/// A 128-bit vector of [4 x float] values. The value stored in +/// bits [31:0] is rounded to the nearest integer using the +/// specified rounding control, and is stored in bits [31:0] of +/// the destination. 
+/// \param M +/// An integer value that specifies the rounding operation. +/// Bits [7:4] are reserved. +/// Bit [3] is a precision exception value: +/// 0: A normal PE exception is used +/// 1: The PE field is not updated +/// Bit [2] is the rounding control source: +/// 0: Use bits [1:0] of M +/// 1: Use the current MXCSR setting +/// Bits [1:0] contain the rounding control definition: +/// 00: Nearest +/// 01: Downward (toward negative infinity) +/// 10: Upward (toward positive infinity) +/// 11: Truncated +/// \returns A 128-bit vector of [4 x float] containing the copied and rounded +/// values. #define _mm_round_ss(X, Y, M) __extension__ ({ \ (__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \ (__v4sf)(__m128)(Y), (M)); }) +/// \brief Rounds the values stored in a packed 128-bit vector of [2 x double] +/// using the specified rounding control. The source values are rounded to +/// integer values and returned as double-precision values. +/// +/// \headerfile +/// +/// \code +/// __m128d _mm_round_pd(__m128d X, const int M); +/// \endcode +/// +/// \param X +/// A 128-bit vector of [2 x double] values. +/// \param M +/// An integer value that specifies the rounding operation. +/// Bits [7:4] are reserved. +/// Bit [3] is a precision exception value: +/// 0: A normal PE exception is used +/// 1: The PE field is not updated +/// Bit [2] is the rounding control source: +/// 0: Use bits [1:0] of M +/// 1: Use the current MXCSR setting +/// Bits [1:0] contain the rounding control definition: +/// 00: Nearest +/// 01: Downward (toward negative infinity) +/// 10: Upward (toward positive infinity) +/// 11: Truncated +/// \returns A 128-bit vector of [2 x double] containing the rounded values. #define _mm_round_pd(X, M) __extension__ ({ \ (__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)); }) +/// \brief Copies the value stored in bits [127:64] from the first operand to +/// the destination.
Rounds the low-order value stored in bits [63:0] of the +/// second operand to an integer value using the specified rounding +/// control, and stores the result in bits [63:0] of the destination. +/// +/// \headerfile +/// +/// \code +/// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M); +/// \endcode +/// +/// \param X +/// A 128-bit vector of [2 x double] values. The values stored +/// in bits [127:64] are copied to the corresponding bits in the +/// destination. +/// \param Y +/// A 128-bit vector of [2 x double] values. The value stored in +/// bits [63:0] is rounded to the nearest integer using the +/// specified rounding control, and is stored in bits [63:0] of +/// the destination. +/// \param M +/// An integer value that specifies the rounding operation. +/// Bits [7:4] are reserved. +/// Bit [3] is a precision exception value: +/// 0: A normal PE exception is used +/// 1: The PE field is not updated +/// Bit [2] is the rounding control source: +/// 0: Use bits [1:0] of M +/// 1: Use the current MXCSR setting +/// Bits [1:0] contain the rounding control definition: +/// 00: Nearest +/// 01: Downward (toward negative infinity) +/// 10: Upward (toward positive infinity) +/// 11: Truncated +/// \returns A 128-bit vector of [2 x double] containing the copied and rounded +/// values. #define _mm_round_sd(X, Y, M) __extension__ ({ \ (__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \ (__v2df)(__m128d)(Y), (M)); }) /* SSE4 Packed Blending Intrinsics. */ +/// \brief Copies 64-bit double-precision data values stored in either of the +/// two packed 128-bit vectors of [2 x double], as specified by the integer +/// operand. +/// +/// \headerfile +/// +/// \code +/// _mm_blend_pd( V1, V2, M); +/// \endcode +/// +/// This intrinsic corresponds to \c VBLENDPD instruction. +/// +/// \param V1 +/// A 128-bit vector of [2 x double] values. +/// \param V2 +/// A 128-bit vector of [2 x double] values. 
+/// \param M +/// An immediate integer operand, with mask bits [1:0] +/// specifying how the values are to be copied. The position of +/// the mask bit corresponds to the index of a copied value. +/// When a mask bit is 0, the corresponding 64-bit element in +/// operand V1 is copied to the same +/// position in the destination. When a mask bit is 1, the +/// corresponding 64-bit element in operand V2 +/// is copied to the same position in the destination. +/// \returns A 128-bit vector of [2 x double] containing the copied values. #define _mm_blend_pd(V1, V2, M) __extension__ ({ \ (__m128d)__builtin_shufflevector((__v2df)(__m128d)(V1), \ (__v2df)(__m128d)(V2), \ (((M) & 0x01) ? 2 : 0), \ (((M) & 0x02) ? 3 : 1)); }) +/// \brief Copies 32-bit single-precision data values stored in either of the +/// two packed 128-bit vectors of [4 x float], as specified by the integer +/// operand. +/// +/// \headerfile +/// +/// \code +/// _mm_blend_ps( V1, V2, M); +/// \endcode +/// +/// This intrinsic corresponds to \c VBLENDPS instruction. +/// +/// \param V1 +/// A 128-bit vector of [4 x float] values. +/// \param V2 +/// A 128-bit vector of [4 x float] values. +/// \param M +/// An immediate integer operand, with mask bits [3:0] +/// specifying how the values are to be copied. The position of +/// the mask bit corresponds to the index of a copied value. +/// When a mask bit is 0, the corresponding 32-bit element in +/// operand V1 is copied to the same +/// position in the destination. When a mask bit is 1, the +/// corresponding 32-bit element in operand V2 +/// is copied to the same position in the destination. +/// \returns A 128-bit vector of [4 x float] containing the copied values. #define _mm_blend_ps(V1, V2, M) __extension__ ({ \ (__m128)__builtin_shufflevector((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2), \ (((M) & 0x01) ? 4 : 0), \ @@ -84,6 +399,28 @@ (((M) & 0x04) ? 6 : 2), \ (((M) & 0x08) ? 
7 : 3)); }) +/// \brief Copies 64-bit double-precision data values stored in either of the +/// two packed 128-bit vectors of [2 x double], as specified by the 128-bit +/// vector operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VBLENDVPD instruction. +/// +/// \param __V1 +/// A 128-bit vector of [2 x double] values. +/// \param __V2 +/// A 128-bit vector of [2 x double] values. +/// \param __M +/// A 128-bit vector operand, with mask bits 127 and 63 +/// specifying how the values are to be copied. The position of +/// the mask bit corresponds to the most significant bit of a +/// copied value. When a mask bit is 0, the corresponding 64-bit +/// element in operand __V1 is copied to the +/// same position in the destination. When a mask bit is 1, the +/// corresponding 64-bit element in operand __V2 +/// is copied to the same position in the destination. +/// \returns A 128-bit vector of [2 x double] containing the copied values. static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M) { @@ -91,6 +428,28 @@ (__v2df)__M); } +/// \brief Copies 32-bit single-precision data values stored in either of the +/// two packed 128-bit vectors of [4 x float], as specified by the 128-bit +/// vector operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VBLENDVPS instruction. +/// +/// \param __V1 +/// A 128-bit vector of [4 x float] values. +/// \param __V2 +/// A 128-bit vector of [4 x float] values. +/// \param __M +/// A 128-bit vector operand, with mask bits 127, 95, 63, and 31 +/// specifying how the values are to be copied. The position of +/// the mask bit corresponds to the most significant bit of a +/// copied value. When a mask bit is 0, the corresponding 32-bit +/// element in operand __V1 is copied to the +/// same position in the destination. When a mask bit is 1, the +/// corresponding 32-bit element in operand __V2 +/// is copied to the same position in the destination. 
+/// \returns A 128-bit vector of [4 x float] containing the copied values. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M) { @@ -98,6 +457,28 @@ (__v4sf)__M); } +/// \brief Copies packed 8-bit integer data values stored in either of the two +/// packed 128-bit vectors of [16 x i8], as specified by the 128-bit +/// vector operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPBLENDVB instruction. +/// +/// \param __V1 +/// A 128-bit vector of [16 x i8] values. +/// \param __V2 +/// A 128-bit vector of [16 x i8] values. +/// \param __M +/// A 128-bit vector operand, with mask bits 127, 119, 111 ... 7 +/// specifying how the values are to be copied. The position of +/// the mask bit corresponds to the most significant bit of a +/// copied value. When a mask bit is 0, the corresponding 8-bit +/// element in operand __V1 is copied to the +/// same position in the destination. When a mask bit is 1, the +/// corresponding 8-bit element in operand __V2 +/// is copied to the same position in the destination. +/// \returns A 128-bit vector of [16 x i8] containing the copied values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M) { @@ -105,6 +486,32 @@ (__v16qi)__M); } +/// \brief Copies packed 16-bit integer data values stored in either of the two +/// packed 128-bit vectors of [8 x i16], as specified by the immediate +/// integer operand. +/// +/// \headerfile +/// +/// \code +/// _mm_blend_epi16( V1, V2, M); +/// \endcode +/// +/// This intrinsic corresponds to \c VPBLENDW instruction. +/// +/// \param V1 +/// A 128-bit vector of [8 x i16] values. +/// \param V2 +/// A 128-bit vector of [8 x i16] values. +/// \param M +/// An immediate integer operand, with mask bits [7:0] +/// specifying how the values are to be copied. The position of +/// the mask bit corresponds to the index of a copied value.
+/// When a mask bit is 0, the corresponding 16-bit element in +/// operand V1 is copied to the same +/// position in the destination. When a mask bit is 1, the +/// corresponding 16-bit element in operand V2 +/// is copied to the same position in the destination. +/// \returns A 128-bit vector of [8 x i16] containing the copied values. #define _mm_blend_epi16(V1, V2, M) __extension__ ({ \ (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(V1), \ (__v8hi)(__m128i)(V2), \ @@ -118,12 +525,44 @@ (((M) & 0x80) ? 15 : 7)); }) /* SSE4 Dword Multiply Instructions. */ +/// \brief Multiplies packed 32-bit integer values and writes the low-order 32 +/// bits of each 64-bit product to the corresponding bits in the +/// destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPMULLD instruction. +/// +/// \param __V1 +/// A 128-bit integer vector containing one of the source +/// operands. +/// \param __V2 +/// A 128-bit integer vector containing one of the source +/// operands. +/// \returns A 128-bit integer vector containing the products of both operands. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi32 (__m128i __V1, __m128i __V2) { return (__m128i) ((__v4si)__V1 * (__v4si)__V2); } +/// \brief Multiplies the even-indexed packed 32-bit signed integer values +/// contained in the two 128-bit integer vectors and writes the 64-bit +/// signed products to the destination. +/// The following code illustrates this intrinsics behavior: +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPMULDQ instruction. +/// +/// \param __V1 +/// A 128-bit vector of [4 x i32] containing one of the source +/// operands. +/// \param __V2 +/// A 128-bit vector of [4 x i32] containing one of the source +/// operands. +/// \returns A 128-bit vector of [2 x i64] containing the products of both +/// operands. 
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epi32 (__m128i __V1, __m128i __V2) { @@ -131,15 +570,81 @@ } /* SSE4 Floating Point Dot Product Instructions. */ +/// \brief Computes the dot product of the two packed 128-bit vectors of [4 x +/// float], as specified by the integer operand. +/// +/// \headerfile +/// +/// \code +/// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M); +/// \endcode +/// +/// This intrinsic corresponds to \c VDPPS instruction. +/// +/// \param X +/// A 128-bit vector of [4 x float] values. +/// \param Y +/// A 128-bit vector of [4 x float] values. +/// \param M +/// An immediate integer operand. Mask bits [7:4] are used to +/// select 32-bit segments of the source operands. If a mask bit +/// is 1, the corresponding bits are used in the dot product +/// calculation: +/// Bit [7]: selects bits [127:96] +/// Bit [6]: selects bits [95:64] +/// Bit [5]: selects bits [63:32] +/// Bit [4]: selects bits [31:0] +/// Bits [3:0] select which bits within the destination will be +/// used to store the 32-bit sum. +/// \returns A 128-bit vector of [4 x float] containing the dot product. #define _mm_dp_ps(X, Y, M) __extension__ ({ \ (__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \ (__v4sf)(__m128)(Y), (M)); }) +/// \brief Computes the dot product of the two packed 128-bit vectors of [2 x +/// double], as specified by the integer operand. +/// +/// \headerfile +/// +/// \code +/// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M); +/// \endcode +/// +/// This intrinsic corresponds to \c DPPD instruction. +/// +/// \param X +/// A 128-bit vector of [2 x double] values. +/// \param Y +/// A 128-bit vector of [2 x double] values. +/// \param M +/// An immediate integer operand. Mask bits [5:4] are used to +/// select 64-bit segments of the source operands.
If a mask bit +/// is 1, the corresponding bits are used in the dot product +/// calculation: +/// Bit [5]: selects bits [127:64] +/// Bit [4]: selects bits [63:0] +/// Bits [1:0] select which bits within the destination will be +/// used to store the 64-bit sum. +/// \returns A 128-bit vector of [2 x double] containing the dot product. #define _mm_dp_pd(X, Y, M) __extension__ ({\ (__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \ (__v2df)(__m128d)(Y), (M)); }) /* SSE4 Streaming Load Hint Instruction. */ +/// \brief Loads integer values from a 128-bit aligned memory location to a +/// 128-bit integer vector. +/// The PlayStation4 CPU does not +/// support non-temporal hints for load operations. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVNTDQA instruction. +/// +/// \param __V +/// A 128-bit aligned pointer to a memory location that contains +/// the integer values. +/// \returns A 128-bit integer vector containing the data stored at the +/// specified memory location. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_stream_load_si128 (__m128i const *__V) { @@ -147,48 +652,168 @@ } /* SSE4 Packed Integer Min/Max Instructions. */ +/// \brief Compares each of the corresponding packed 8-bit integer values of +/// the 128-bit integer vectors, and writes the lesser value to the +/// corresponding bits in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPMINSB instruction. +/// +/// \param __V1 +/// A 128-bit integer vector containing one of the source +/// operands. +/// \param __V2 +/// A 128-bit integer vector containing one of the source +/// operands. +/// \returns A 128-bit integer vector containing the comparison results. 
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi8 (__m128i __V1, __m128i __V2) { return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2); } +/// \brief Compares each of the corresponding packed 8-bit integer values of +/// the 128-bit integer vectors, and writes the greater value to the +/// corresponding bits in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPMAXSB instruction. +/// +/// \param __V1 +/// A 128-bit integer vector containing one of the source +/// operands. +/// \param __V2 +/// A 128-bit integer vector containing one of the source +/// operands. +/// \returns A 128-bit integer vector containing the comparison results. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi8 (__m128i __V1, __m128i __V2) { return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2); } +/// \brief Compares each of the corresponding packed 16-bit unsigned integer +/// values of the 128-bit integer vectors, and writes the lesser value to +/// the corresponding bits in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPMINUW instruction. +/// +/// \param __V1 +/// A 128-bit integer vector containing one of the source +/// operands. +/// \param __V2 +/// A 128-bit integer vector containing one of the source +/// operands. +/// \returns A 128-bit integer vector containing the comparison results. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu16 (__m128i __V1, __m128i __V2) { return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2); } +/// \brief Compares each of the corresponding packed 16-bit unsigned integer +/// values of the 128-bit integer vectors, and writes the greater value to +/// the corresponding bits in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPMAXUW instruction. +/// +/// \param __V1 +/// A 128-bit integer vector containing one of the source +/// operands. 
+/// \param __V2 +/// A 128-bit integer vector containing one of the source +/// operands. +/// \returns A 128-bit integer vector containing the comparison results. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu16 (__m128i __V1, __m128i __V2) { return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2); } +/// \brief Compares each of the corresponding packed 32-bit integer values of +/// the 128-bit integer vectors, and writes the lesser value to the +/// corresponding bits in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPMINSD instruction. +/// +/// \param __V1 +/// A 128-bit integer vector containing one of the source +/// operands. +/// \param __V2 +/// A 128-bit integer vector containing one of the source +/// operands. +/// \returns A 128-bit integer vector containing the comparison results. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi32 (__m128i __V1, __m128i __V2) { return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2); } +/// \brief Compares each of the corresponding packed 32-bit integer values of +/// the 128-bit integer vectors, and writes the greater value to the +/// corresponding bits in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPMAXSD instruction. +/// +/// \param __V1 +/// A 128-bit integer vector containing one of the source +/// operands. +/// \param __V2 +/// A 128-bit integer vector containing one of the source +/// operands. +/// \returns A 128-bit integer vector containing the comparison results. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi32 (__m128i __V1, __m128i __V2) { return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2); } +/// \brief Compares each of the corresponding packed 32-bit unsigned integer +/// values of the 128-bit integer vectors, and writes the lesser value to +/// the corresponding bits in the destination. 
+/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPMINUD instruction. +/// +/// \param __V1 +/// A 128-bit integer vector containing one of the source +/// operands. +/// \param __V2 +/// A 128-bit integer vector containing one of the source +/// operands. +/// \returns A 128-bit integer vector containing the comparison results. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu32 (__m128i __V1, __m128i __V2) { return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2); } +/// \brief Compares each of the corresponding packed 32-bit unsigned integer +/// values of the 128-bit integer vectors, and writes the greater value to +/// the corresponding bits in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPMAXUD instruction. +/// +/// \param __V1 +/// A 128-bit integer vector containing one of the source +/// operands. +/// \param __V2 +/// A 128-bit integer vector containing one of the source +/// operands. +/// \returns A 128-bit integer vector containing the comparison results. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu32 (__m128i __V1, __m128i __V2) { @@ -196,7 +821,78 @@ } /* SSE4 Insertion and Extraction from XMM Register Instructions. */ +/// \brief Copies 32-bit single-precision floating-point data from the 128-bit +/// vector operands to the destination, using the bit indexes specified by +/// the immediate operand. The immediate operand may indicate that +/// specific index values should be set to zero. +/// +/// \headerfile +/// +/// \code +/// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N); +/// \endcode +/// +/// This intrinsic corresponds to \c VINSERTPS instruction. +/// +/// \param X +/// A 128-bit vector source operand of [4 x float]. With the +/// exception of those bits in the destination copied from +/// parameter Y and zeroed by bits [3:0] +/// of N, all bits from this parameter +/// are copied to the destination. 
+/// \param Y +/// A 128-bit vector source operand of [4 x float]. One +/// single-precision floating-point element from this source, as +/// determined by the immediate parameter, is copied to the +/// destination. +/// \param N +/// Specifies the bits to be copied from operand Y, +/// the bits in the destination to which the selected bits from +/// operand Y are copied, and any bits in +/// the destination to be masked. The following assignments are +/// made: +/// Bits [7:6] specify the bits to copy from operand Y: +/// 00: Selects bits [31:0] from operand Y. +/// 01: Selects bits [63:32] from operand Y. +/// 10: Selects bits [95:64] from operand Y. +/// 11: Selects bits [127:96] from operand Y. +/// Bits [5:4] specify the bits in the destination to which the +/// selected bits from +/// operand Y are copied: +/// 00: Copies the selected bits from operand Y +/// to bits [31:0] of the destination. +/// 01: Copies the selected bits from operand Y +/// to bits [63:32] of the destination. +/// 10: Copies the selected bits from operand Y +/// to bits [95:64] of the destination. +/// 11: Copies the selected bits from operand Y +/// to bits [127:96] of the destination. +/// Bits[3:0]: If any of these bits are set, the corresponding +/// destination element is +/// cleared. +/// \returns A 128-bit vector of [4 x float] containing the copied float data +/// from the operands. #define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N)) +/// \brief Extracts a 32-bit integer from a 128-bit vector of [4 x float] and +/// copies it to the destination, as specified by the integer operand. +/// +/// \headerfile +/// +/// \code +/// int _mm_extract_ps(__m128 X, const int N); +/// \endcode +/// +/// This intrinsic corresponds to \c EXTRACTPS instruction. +/// +/// \param X +/// A 128-bit vector of [4 x float]. +/// \param N +/// Determines which bits are extracted using bits [1:0]: +/// 00: Bits [31:0] are copied to the destination. 
+/// 01: Bits [63:32] are copied to the destination. +/// 10: Bits [95:64] are copied to the destination. +/// 11: Bits [127:96] are copied to the destination. +/// \returns A 32-bit integer containing the extracted 32 bits of float data. #define _mm_extract_ps(X, N) (__extension__ \ ({ union { int __i; float __f; } __t; \ __v4sf __a = (__v4sf)(__m128)(X); \ @@ -217,15 +913,91 @@ _MM_MK_INSERTPS_NDX((N), 0, 0x0e)) /* Insert int into packed integer array at index. */ +/// \brief Inserts 8 bits of extended packed data from the 128-bit integer +/// vector operand and the integer operand and copies them to the destination, +/// using the specified offset. +/// +/// \headerfile +/// +/// \code +/// __m128i _mm_insert_epi8(__m128i X, int I, const int N); +/// \endcode +/// +/// This intrinsic corresponds to \c PINSRB instruction. +/// +/// \param X +/// A 128-bit integer vector. The remaining bits in the +/// destination are copied from the corresponding bits in this +/// operand. +/// \param I +/// An integer. The bits of this operand are written to the +/// destination beginning at the offset specified by operand N. +/// \param N +/// Specifies the bit offset to be used in the destination. The +/// remaining bits in the destination are copied from the +/// corresponding bits in operand X. +/// \returns A 128-bit integer vector containing the copied extended packed +/// data from the operands. #define _mm_insert_epi8(X, I, N) (__extension__ \ ({ __v16qi __a = (__v16qi)(__m128i)(X); \ __a[(N) & 15] = (I); \ __a;})) +/// \brief Inserts 32 bits of extended packed data from the 128-bit integer +/// vector operand and the integer operand and copies them to the +/// destination, using the specified offset. +/// +/// \headerfile +/// +/// \code +/// __m128i _mm_insert_epi32(__m128i X, int I, const int N); +/// \endcode +/// +/// This intrinsic corresponds to \c PINSRD instruction. +/// +/// \param X +/// A 128-bit integer vector. 
The remaining bits in the +/// destination are copied from the corresponding bits in this +/// operand. +/// \param I +/// An integer. The bits of this operand are written to the +/// destination beginning at the offset specified by operand N. +/// \param N +/// Specifies the bit offset to be used in the destination. The +/// remaining bits in the destination are copied from the +/// corresponding bits in operand X. +/// \returns A 128-bit integer vector containing the copied extended packed +/// data from the operands. #define _mm_insert_epi32(X, I, N) (__extension__ \ ({ __v4si __a = (__v4si)(__m128i)(X); \ __a[(N) & 3] = (I); \ __a;})) #ifdef __x86_64__ +/// \brief Inserts 64 bits of extended packed data from the 128-bit integer +/// vector operand and the integer operand and copies them to the +/// destination, using the specified offset. +/// +/// \headerfile +/// +/// \code +/// __m128i _mm_insert_epi64(__m128i X, long long I, const int N); +/// \endcode +/// +/// This intrinsic corresponds to \c PINSRQ instruction. +/// +/// \param X +/// A 128-bit integer vector. The remaining bits in the +/// destination are copied from the corresponding bits in this +/// operand. +/// \param I +/// A 64-bit integer. The bits of this operand are written to +/// the destination beginning at the offset specified by operand +/// N. +/// \param N +/// Specifies the bit offset to be used in the destination. The +/// remaining bits in the destination are copied from the +/// corresponding bits in operand X. +/// \returns A 128-bit integer vector containing the copied extended packed +/// data from the operands. #define _mm_insert_epi64(X, I, N) (__extension__ \ ({ __v2di __a = (__v2di)(__m128i)(X); \ __a[(N) & 1] = (I); \ @@ -235,42 +1007,198 @@ /* Extract int from packed integer array at index. This returns the element * as a zero extended value, so it is unsigned. 
*/ +/// \brief Extracts 8 bits of extended packed data from a 128-bit integer +/// vector and copies it to the destination, as specified by the integer +/// operand. +/// +/// \headerfile +/// +/// \code +/// int _mm_extract_epi8(__m128i X, const int N); +/// \endcode +/// +/// This intrinsic corresponds to \c PEXTRB instruction. +/// +/// \param X +/// A 128-bit integer vector. +/// \param N +/// Specifies which element to copy to the destination. +/// \returns An 8-bit unsigned integer containing the extracted bits of +/// extended packed data. #define _mm_extract_epi8(X, N) (__extension__ \ ({ __v16qi __a = (__v16qi)(__m128i)(X); \ (int)(unsigned char) __a[(N) & 15];})) +/// \brief Extracts 32 bits of extended packed data from a 128-bit integer +/// vector and copies it to the destination, as specified by the integer +/// operand. +/// +/// \headerfile +/// +/// \code +/// int _mm_extract_epi32(__m128i X, const int N); +/// \endcode +/// +/// This intrinsic corresponds to \c PEXTRD instruction. +/// +/// \param X +/// A 128-bit integer vector. +/// \param N +/// Specifies which element to copy to the destination. +/// \returns A 32-bit unsigned integer containing the extracted bits of +/// extended packed data. #define _mm_extract_epi32(X, N) (__extension__ \ ({ __v4si __a = (__v4si)(__m128i)(X); \ (int)__a[(N) & 3];})) #ifdef __x86_64__ +/// \brief Extracts 64 bits of extended packed data from a 128-bit integer +/// vector and copies it to the destination, as specified by the integer +/// operand. +/// +/// \headerfile +/// +/// \code +/// long long _mm_extract_epi64(__m128i X, const int N); +/// \endcode +/// +/// This intrinsic corresponds to \c PEXTRQ instruction. +/// +/// \param X +/// A 128-bit integer vector. +/// \param N +/// Specifies which element to copy to the destination. +/// \returns A 64-bit unsigned integer containing the extracted bits of +/// extended packed data. 
#define _mm_extract_epi64(X, N) (__extension__ \ ({ __v2di __a = (__v2di)(__m128i)(X); \ (long long)__a[(N) & 1];})) #endif /* __x86_64 */ /* SSE4 128-bit Packed Integer Comparisons. */ +/// \brief Tests whether the specified bits in a 128-bit integer vector are all +/// zeros. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPTEST instruction. +/// +/// \param __M +/// A 128-bit integer vector containing the bits to be tested. +/// \param __V +/// A 128-bit integer vector selecting which bits to test in +/// operand __M. +/// \returns TRUE if the specified bits are all zeros; FALSE otherwise. static __inline__ int __DEFAULT_FN_ATTRS _mm_testz_si128(__m128i __M, __m128i __V) { return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V); } +/// \brief Tests whether the specified bits in a 128-bit integer vector are all +/// ones. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPTEST instruction. +/// +/// \param __M +/// A 128-bit integer vector containing the bits to be tested. +/// \param __V +/// A 128-bit integer vector selecting which bits to test in +/// operand __M. +/// \returns TRUE if the specified bits are all ones; FALSE otherwise. static __inline__ int __DEFAULT_FN_ATTRS _mm_testc_si128(__m128i __M, __m128i __V) { return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V); } +/// \brief Tests whether the specified bits in a 128-bit integer vector are +/// neither all zeros nor all ones. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPTEST instruction. +/// +/// \param __M +/// A 128-bit integer vector containing the bits to be tested. +/// \param __V +/// A 128-bit integer vector selecting which bits to test in +/// operand __M. +/// \returns TRUE if the specified bits are neither all zeros nor all ones; +/// FALSE otherwise. 
static __inline__ int __DEFAULT_FN_ATTRS _mm_testnzc_si128(__m128i __M, __m128i __V) { return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V); } +/// \brief Tests whether the specified bits in a 128-bit integer vector are all +/// ones. +/// +/// \headerfile +/// +/// \code +/// int _mm_test_all_ones(__m128i V); +/// \endcode +/// +/// This intrinsic corresponds to \c VPTEST instruction. +/// +/// \param V +/// A 128-bit integer vector containing the bits to be tested. +/// \returns TRUE if the bits specified in the operand are all set to 1; FALSE +/// otherwise. #define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V))) +/// \brief Tests whether the specified bits in a 128-bit integer vector are +/// neither all zeros nor all ones. +/// +/// \headerfile +/// +/// \code +/// int _mm_test_mix_ones_zeros(__m128i M, __m128i V); +/// \endcode +/// +/// This intrinsic corresponds to \c VPTEST instruction. +/// +/// \param M +/// A 128-bit integer vector containing the bits to be tested. +/// \param V +/// A 128-bit integer vector selecting which bits to test in +/// operand M. +/// \returns TRUE if the specified bits are neither all zeros nor all ones; +/// FALSE otherwise. #define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V)) +/// \brief Tests whether the specified bits in a 128-bit integer vector are all +/// zeros. +/// +/// \headerfile +/// +/// \code +/// int _mm_test_all_zeros(__m128i M, __m128i V); +/// \endcode +/// +/// This intrinsic corresponds to \c VPTEST instruction. +/// +/// \param M +/// A 128-bit integer vector containing the bits to be tested. +/// \param V +/// A 128-bit integer vector selecting which bits to test in +/// operand M. +/// \returns TRUE if the specified bits are all zeros; FALSE otherwise. #define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V)) /* SSE4 64-bit Packed Integer Comparisons. 
*/ +/// \brief Compares each of the corresponding packed 64-bit values of the +/// 128-bit integer vectors for equality. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPCMPEQQ instruction. +/// +/// \param __V1 +/// A 128-bit integer vector. +/// \param __V2 +/// A 128-bit integer vector. +/// \returns A 128-bit integer vector containing the comparison results. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi64(__m128i __V1, __m128i __V2) { @@ -278,6 +1206,17 @@ } /* SSE4 Packed Integer Sign-Extension. */ +/// \brief Sign-extends each of the packed 8-bit integers in the lower bits of +/// a 128-bit integer vector to 16-bit values stored in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPMOVSXBW instruction. +/// +/// \param __V +/// A 128-bit vector of [16 x i8]. The values stored in bits +/// [63:0] are sign-extended. +/// \returns A 128-bit vector of [8 x i16] containing the sign-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi16(__m128i __V) { @@ -286,6 +1225,17 @@ return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi); } +/// \brief Sign-extends each of the packed 8-bit integers in the lower bits of +/// a 128-bit integer vector to 32-bit values stored in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPMOVSXBD instruction. +/// +/// \param __V +/// A 128-bit vector of [16 x i8]. The values stored in bits +/// [31:0] are sign-extended. +/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values. 
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi32(__m128i __V) { @@ -294,6 +1244,17 @@ return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si); } +/// \brief Sign-extends each of the packed 8-bit integers in the lower bits of +/// a 128-bit integer vector to 64-bit values stored in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPMOVSXBQ instruction. +/// +/// \param __V +/// A 128-bit vector of [16 x i8]. The values stored in bits +/// [15:0] are sign-extended. +/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi64(__m128i __V) { @@ -303,18 +1264,51 @@ return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di); } +/// \brief Sign-extends each of the packed 16-bit integers in the lower bits of +/// a 128-bit integer vector to 32-bit values stored in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPMOVSXWD instruction. +/// +/// \param __V +/// A 128-bit vector of [8 x i16]. The values stored in bits +/// [63:0] are sign-extended. +/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi32(__m128i __V) { return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si); } +/// \brief Sign-extends each of the packed 16-bit integers in the lower bits of +/// a 128-bit integer vector to 64-bit values stored in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPMOVSXWQ instruction. +/// +/// \param __V +/// A 128-bit vector of [8 x i16]. The values stored in bits +/// [31:0] are sign-extended. +/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. 
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi64(__m128i __V) { return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di); } +/// \brief Sign-extends each of the packed 32-bit integers in the lower bits of +/// a 128-bit integer vector to 64-bit values stored in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPMOVSXDQ instruction. +/// +/// \param __V +/// A 128-bit vector of [4 x i32]. The values stored in bits +/// [63:0] are sign-extended. +/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi32_epi64(__m128i __V) { @@ -322,36 +1316,102 @@ } /* SSE4 Packed Integer Zero-Extension. */ +/// \brief Zero-extends each of the packed 8-bit integers in the lower bits of +/// a 128-bit integer vector to 16-bit values stored in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPMOVZXBW instruction. +/// +/// \param __V +/// A 128-bit vector of [16 x i8]. The values stored in bits +/// [63:0] are zero-extended. +/// \returns A 128-bit vector of [8 x i16] containing the zero-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi16(__m128i __V) { return (__m128i) __builtin_ia32_pmovzxbw128((__v16qi) __V); } +/// \brief Zero-extends each of the packed 8-bit integers in the lower bits of +/// a 128-bit integer vector to 32-bit values stored in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPMOVZXBD instruction. +/// +/// \param __V +/// A 128-bit vector of [16 x i8]. The values stored in bits +/// [31:0] are zero-extended. +/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values. 
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi32(__m128i __V) { return (__m128i) __builtin_ia32_pmovzxbd128((__v16qi)__V); } +/// \brief Zero-extends each of the packed 8-bit integers in the lower bits of +/// a 128-bit integer vector to 64-bit values stored in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPMOVZXBQ instruction. +/// +/// \param __V +/// A 128-bit vector of [16 x i8]. The values stored in bits +/// [15:0] are zero-extended. +/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi64(__m128i __V) { return (__m128i) __builtin_ia32_pmovzxbq128((__v16qi)__V); } +/// \brief Zero-extends each of the packed 16-bit integers in the lower bits of +/// a 128-bit integer vector to 32-bit values stored in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPMOVZXWD instruction. +/// +/// \param __V +/// A 128-bit vector of [8 x i16]. The values stored in bits +/// [63:0] are zero-extended. +/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi32(__m128i __V) { return (__m128i) __builtin_ia32_pmovzxwd128((__v8hi)__V); } +/// \brief Zero-extends each of the packed 16-bit integers in the lower bits of +/// a 128-bit integer vector to 64-bit values stored in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPMOVZXWQ instruction. +/// +/// \param __V +/// A 128-bit vector of [8 x i16]. The values stored in bits +/// [31:0] are zero-extended. +/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. 
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi64(__m128i __V) { return (__m128i) __builtin_ia32_pmovzxwq128((__v8hi)__V); } +/// \brief Zero-extends each of the packed 32-bit integers in the lower bits of +/// a 128-bit integer vector to 64-bit values stored in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPMOVZXDQ instruction. +/// +/// \param __V +/// A 128-bit vector of [4 x i32]. The values stored in bits +/// [63:0] are zero-extended. +/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu32_epi64(__m128i __V) { @@ -359,6 +1419,26 @@ } /* SSE4 Pack with Unsigned Saturation. */ +/// \brief Converts 32-bit signed integers from both 128-bit integer vector +/// operands into 16-bit unsigned integers, and packs the results into the +/// destination. Values greater than FFFFh +/// are saturated to FFFFh. Values +/// less than 0000h are saturated to +/// 0000h. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPACKUSDW instruction. +/// +/// \param __V1 +/// A 128-bit vector of [4 x i32]. The +/// converted values are written to the lower order bits of the +/// destination. +/// \param __V2 +/// A 128-bit vector of [4 x i32]. The +/// converted values are written to the upper order bits of the +/// destination. +/// \returns A 128-bit vector of [8 x i16] containing the converted values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi32(__m128i __V1, __m128i __V2) { @@ -366,6 +1446,31 @@ } /* SSE4 Multiple Packed Sums of Absolute Difference. */ +/// \brief Subtracts packed 8-bit unsigned integer values and computes the +/// absolute values of the differences to the corresponding bits in the +/// destination. Then sums of the absolute differences are written to the +/// destination, according to the bit fields in the immediate operand. 
+/// +/// \headerfile +/// +/// \code +/// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M); +/// \endcode +/// +/// This intrinsic corresponds to \c MPSADBW instruction. +/// +/// \param X +/// A 128-bit vector of [16 x i8] containing one of the source +/// operands. +/// \param Y +/// A 128-bit vector of [16 x i8] containing one of the source +/// operands. +/// \param M +/// An 8-bit immediate operand specifying how the absolute +/// differences are to be calculated: bit [2] selects the starting byte +/// offset (0 or 4) within X, and bits [1:0] select the 4-byte block of Y. +/// \returns A 128-bit integer vector containing the sums of the sets of +/// absolute differences between both operands. #define _mm_mpsadbw_epu8(X, Y, M) __extension__ ({ \ (__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \ (__v16qi)(__m128i)(Y), (M)); }) @@ -411,61 +1516,859 @@ #define _SIDD_UNIT_MASK 0x40 /* SSE4.2 Packed Comparison Intrinsics. */ +/// \brief Uses the immediate operand M to perform a packed +/// comparison of string data with implicitly defined lengths that is +/// contained in source operands A and B. Returns a 128-bit integer vector +/// representing the +/// result mask of the comparison. +/// +/// \headerfile +/// +/// \code +/// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M); +/// \endcode +/// +/// This intrinsic corresponds to \c VPCMPISTRM instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source +/// operands to be compared. +/// \param B +/// A 128-bit integer vector containing one of the source +/// operands to be compared. +/// \param M +/// An 8-bit immediate operand specifying whether the characters +/// are bytes or words, the type of comparison to perform, and +/// the format of the return value. +/// Bits [1:0]: Determine source data format. +/// 00: 16 packed unsigned bytes +/// 01: 8 packed unsigned words +/// 10: 16 packed signed bytes +/// 11: 8 packed signed words +/// Bits [3:2]: Determine comparison type and aggregation +/// method.
+/// 00: Subset: Each character in B is +/// compared for equality with all the +/// characters in A. +/// 01: Ranges: Each character in B is +/// compared to A. The comparison basis is +/// greater than or equal for even-indexed elements in A, and less than or +/// equal for odd-indexed elements in A. +/// 10: Match: Compare each pair of corresponding characters in +/// A and B +/// for equality. +/// 11: Substring: Search B for substring +/// matches of A. +/// Bits [5:4]: Determine whether to perform a ones +/// complement on the bit +/// mask of the comparison results. +/// 00: No effect. +/// 01: Negate the bit mask. +/// 10: No effect. +/// 11: Negate the bit mask only for bits with an index less +/// than or equal to the +/// size of A or B. +/// Bit [6]: Determines whether the result is zero-extended or +/// expanded to 16 bytes. +/// 0: The result is zero-extended to 16 bytes. +/// 1: The result is expanded to 16 bytes (this expansion is +/// performed by +/// repeating each bit 8 or 16 times). +/// \returns Returns a 128-bit integer vector representing the result mask of +/// the comparison. #define _mm_cmpistrm(A, B, M) \ (__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \ (__v16qi)(__m128i)(B), (int)(M)) +/// \brief Uses the immediate operand M to perform a packed +/// comparison of string data with implicitly defined lengths that is +/// contained in source operands A and B. Returns an integer representing the +/// result index +/// of the comparison. +/// +/// \headerfile +/// +/// \code +/// int _mm_cmpistri(__m128i A, __m128i B, const int M); +/// \endcode +/// +/// This intrinsic corresponds to \c VPCMPISTRI instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source +/// operands to be compared. +/// \param B +/// A 128-bit integer vector containing one of the source +/// operands to be compared. 
+/// \param M +/// An 8-bit immediate operand specifying whether the characters +/// are bytes or words, the type of comparison to perform, and +/// the format of the return value. +/// Bits [1:0]: Determine source data format. +/// 00: 16 packed unsigned bytes +/// 01: 8 packed unsigned words +/// 10: 16 packed signed bytes +/// 11: 8 packed signed words +/// Bits [3:2]: Determine comparison type and aggregation +/// method. +/// 00: Subset: Each character in B is +/// compared for equality with all the +/// characters in A. +/// 01: Ranges: Each character in B is +/// compared to A. The comparison basis is +/// greater than or equal for even-indexed elements in A, and less than or +/// equal for odd-indexed elements in A. +/// 10: Match: Compare each pair of corresponding characters in +/// A and B +/// for equality. +/// 11: Substring: Search B for substring +/// matches of A. +/// Bits [5:4]: Determine whether to perform a ones +/// complement on the bit +/// mask of the comparison results. +/// 00: No effect. +/// 01: Negate the bit mask. +/// 10: No effect. +/// 11: Negate the bit mask only for bits with an index less +/// than or equal to the +/// size of A or B. +/// Bit [6]: Determines whether the index of the lowest set bit +/// or the highest set bit is +/// returned. +/// 0: The index of the least significant set bit. +/// 1: The index of the most significant set bit. +/// \returns Returns an integer representing the result index of the comparison. #define _mm_cmpistri(A, B, M) \ (int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \ (__v16qi)(__m128i)(B), (int)(M)) +/// \brief Uses the immediate operand M to perform a packed +/// comparison of string data with explicitly defined lengths that is +/// contained in source operands A and B. Returns a 128-bit integer vector +/// representing the +/// result mask of the comparison. 
+/// +/// \headerfile +/// +/// \code +/// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M); +/// \endcode +/// +/// This intrinsic corresponds to \c VPCMPESTRM instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source +/// operands to be compared. +/// \param LA +/// An integer that specifies the length of the string in A. +/// \param B +/// A 128-bit integer vector containing one of the source +/// operands to be compared. +/// \param LB +/// An integer that specifies the length of the string in B. +/// \param M +/// An 8-bit immediate operand specifying whether the characters +/// are bytes or words, the type of comparison to perform, and +/// the format of the return value. +/// Bits [1:0]: Determine source data format. +/// 00: 16 packed unsigned bytes +/// 01: 8 packed unsigned words +/// 10: 16 packed signed bytes +/// 11: 8 packed signed words +/// Bits [3:2]: Determine comparison type and aggregation +/// method. +/// 00: Subset: Each character in B is +/// compared for equality with all the +/// characters in A. +/// 01: Ranges: Each character in B is +/// compared to A. The comparison basis is +/// greater than or equal for even-indexed elements in A, and less than or +/// equal for odd-indexed elements in A. +/// 10: Match: Compare each pair of corresponding characters in +/// A and B +/// for equality. +/// 11: Substring: Search B for substring +/// matches of A. +/// Bits [5:4]: Determine whether to perform a ones +/// complement on the bit +/// mask of the comparison results. +/// 00: No effect. +/// 01: Negate the bit mask. +/// 10: No effect. +/// 11: Negate the bit mask only for bits with an index less +/// than or equal to the +/// size of A or B. +/// Bit [6]: Determines whether the result is zero-extended or +/// expanded to 16 bytes. +/// 0: The result is zero-extended to 16 bytes. 
+/// 1: The result is expanded to 16 bytes (this expansion is +/// performed by +/// repeating each bit 8 or 16 times). +/// \returns Returns a 128-bit integer vector representing the result mask of +/// the comparison. #define _mm_cmpestrm(A, LA, B, LB, M) \ (__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \ (__v16qi)(__m128i)(B), (int)(LB), \ (int)(M)) +/// \brief Uses the immediate operand M to perform a packed +/// comparison of string data with explicitly defined lengths that is +/// contained in source operands A and B. Returns an integer representing the +/// result index +/// of the comparison. +/// +/// \headerfile +/// +/// \code +/// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M); +/// \endcode +/// +/// This intrinsic corresponds to \c VPCMPESTRI instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source +/// operands to be compared. +/// \param LA +/// An integer that specifies the length of the string in A. +/// \param B +/// A 128-bit integer vector containing one of the source +/// operands to be compared. +/// \param LB +/// An integer that specifies the length of the string in B. +/// \param M +/// An 8-bit immediate operand specifying whether the characters +/// are bytes or words, the type of comparison to perform, and +/// the format of the return value. +/// Bits [1:0]: Determine source data format. +/// 00: 16 packed unsigned bytes +/// 01: 8 packed unsigned words +/// 10: 16 packed signed bytes +/// 11: 8 packed signed words +/// Bits [3:2]: Determine comparison type and aggregation +/// method. +/// 00: Subset: Each character in B is +/// compared for equality with all the +/// characters in A. +/// 01: Ranges: Each character in B is +/// compared to A. The comparison basis is +/// greater than or equal for even-indexed elements in A, and less than or +/// equal for odd-indexed elements in A. 
+/// 10: Match: Compare each pair of corresponding characters in +/// A and B +/// for equality. +/// 11: Substring: Search B for substring +/// matches of A. +/// Bits [5:4]: Determine whether to perform a ones +/// complement on the bit +/// mask of the comparison results. +/// 00: No effect. +/// 01: Negate the bit mask. +/// 10: No effect. +/// 11: Negate the bit mask only for bits with an index less +/// than or equal to the +/// size of A or B. +/// Bit [6]: Determines whether the index of the lowest set bit +/// or the highest set bit is +/// returned. +/// 0: The index of the least significant set bit. +/// 1: The index of the most significant set bit. +/// \returns Returns an integer representing the result index of the comparison. #define _mm_cmpestri(A, LA, B, LB, M) \ (int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \ (__v16qi)(__m128i)(B), (int)(LB), \ (int)(M)) /* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */ +/// \brief Uses the immediate operand M to perform a packed +/// comparison of string data with implicitly defined lengths that is +/// contained in source operands A and B. Returns 1 if the bit mask is +/// zero and the +/// length of the string in B is the maximum. +/// +/// \headerfile +/// +/// \code +/// int _mm_cmpistra(__m128i A, __m128i B, const int M); +/// \endcode +/// +/// This intrinsic corresponds to \c VPCMPISTRI instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source +/// operands to be compared. +/// \param B +/// A 128-bit integer vector containing one of the source +/// operands to be compared. +/// \param M +/// An 8-bit immediate operand specifying whether the characters +/// are bytes or words and the type of comparison to perform. +/// Bits [1:0]: Determine source data format.
+/// 00: 16 packed unsigned bytes +/// 01: 8 packed unsigned words +/// 10: 16 packed signed bytes +/// 11: 8 packed signed words +/// Bits [3:2]: Determine comparison type and aggregation +/// method. +/// 00: Subset: Each character in B is +/// compared for equality with all the +/// characters in A. +/// 01: Ranges: Each character in B is +/// compared to A. The comparison basis is +/// greater than or equal for even-indexed elements in A, and less than or +/// equal for odd-indexed elements in A. +/// 10: Match: Compare each pair of corresponding characters in +/// A and B +/// for equality. +/// 11: Substring: Search B for substring +/// matches of A. +/// Bits [5:4]: Determine whether to perform a ones +/// complement on the bit +/// mask of the comparison results. +/// 00: No effect. +/// 01: Negate the bit mask. +/// 10: No effect. +/// 11: Negate the bit mask only for bits with an index less +/// than or equal to the +/// size of A or B. +/// \returns Returns 1 if the bit mask is zero and the length of the string +/// in B is the maximum; otherwise returns 0. #define _mm_cmpistra(A, B, M) \ (int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \ (__v16qi)(__m128i)(B), (int)(M)) +/// \brief Uses the immediate operand M to perform a packed +/// comparison of string data with implicitly defined lengths that is +/// contained in source operands A and B. Returns 1 if the bit mask is non-zero. +/// +/// \headerfile +/// +/// \code +/// int _mm_cmpistrc(__m128i A, __m128i B, const int M); +/// \endcode +/// +/// This intrinsic corresponds to \c VPCMPISTRI instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source +/// operands to be compared. +/// \param B +/// A 128-bit integer vector containing one of the source +/// operands to be compared. +/// \param M +/// An 8-bit immediate operand specifying whether the characters +/// are bytes or words and the type of comparison to perform.
+/// Bits [1:0]: Determine source data format. +/// 00: 16 packed unsigned bytes +/// 01: 8 packed unsigned words +/// 10: 16 packed signed bytes +/// 11: 8 packed signed words +/// Bits [3:2]: Determine comparison type and aggregation +/// method. +/// 00: Subset: Each character in B is +/// compared for equality with all the +/// characters in A. +/// 01: Ranges: Each character in B is +/// compared to A. The comparison basis is +/// greater than or equal for even-indexed elements in A, and less than or +/// equal for odd-indexed elements in A. +/// 10: Match: Compare each pair of corresponding characters in +/// A and B +/// for equality. +/// 11: Substring: Search B for substring +/// matches of A. +/// Bits [5:4]: Determine whether to perform a ones +/// complement on the bit +/// mask of the comparison results. +/// 00: No effect. +/// 01: Negate the bit mask. +/// 10: No effect. +/// 11: Negate the bit mask only for bits with an index less +/// than or equal to the +/// size of A or B. +/// \returns Returns 1 if the bit mask is non-zero; otherwise returns 0. #define _mm_cmpistrc(A, B, M) \ (int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \ (__v16qi)(__m128i)(B), (int)(M)) +/// \brief Uses the immediate operand M to perform a packed +/// comparison of string data with implicitly defined lengths that is +/// contained in source operands A and B. Returns 1 if the least significant +/// bit of the bit +/// mask is 1. +/// +/// \headerfile +/// +/// \code +/// int _mm_cmpistro(__m128i A, __m128i B, const int M); +/// \endcode +/// +/// This intrinsic corresponds to \c VPCMPISTRI instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source +/// operands to be compared. +/// \param B +/// A 128-bit integer vector containing one of the source +/// operands to be compared. +/// \param M +/// An 8-bit immediate operand specifying whether the characters +/// are bytes or words and the type of comparison to perform.
+/// Bits [1:0]: Determine source data format. +/// 00: 16 packed unsigned bytes +/// 01: 8 packed unsigned words +/// 10: 16 packed signed bytes +/// 11: 8 packed signed words +/// Bits [3:2]: Determine comparison type and aggregation +/// method. +/// 00: Subset: Each character in B is +/// compared for equality with all the +/// characters in A. +/// 01: Ranges: Each character in B is +/// compared to A. The comparison basis is +/// greater than or equal for even-indexed elements in A, and less than or +/// equal for odd-indexed elements in A. +/// 10: Match: Compare each pair of corresponding characters in +/// A and B +/// for equality. +/// 11: Substring: Search B for substring +/// matches of A. +/// Bits [5:4]: Determine whether to perform a ones +/// complement on the bit +/// mask of the comparison results. +/// 00: No effect. +/// 01: Negate the bit mask. +/// 10: No effect. +/// 11: Negate the bit mask only for bits with an index less +/// than or equal to the +/// size of A or B. +/// \returns Returns 1 if the least significant bit of the bit mask is 1; +/// otherwise returns 0. #define _mm_cmpistro(A, B, M) \ (int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \ (__v16qi)(__m128i)(B), (int)(M)) +/// \brief Uses the immediate operand M to perform a packed +/// comparison of string data with implicitly defined lengths that is +/// contained in source operands A and B. Returns 1 if the length of the +/// string in A is less than the maximum. +/// +/// \headerfile +/// +/// \code +/// int _mm_cmpistrs(__m128i A, __m128i B, const int M); +/// \endcode +/// +/// This intrinsic corresponds to \c VPCMPISTRI instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source +/// operands to be compared. +/// \param B +/// A 128-bit integer vector containing one of the source +/// operands to be compared. 
+/// \param M +/// An 8-bit immediate operand specifying whether the characters +/// are bytes or words and the type of comparison to perform. +/// Bits [1:0]: Determine source data format. +/// 00: 16 packed unsigned bytes +/// 01: 8 packed unsigned words +/// 10: 16 packed signed bytes +/// 11: 8 packed signed words +/// Bits [3:2]: Determine comparison type and aggregation +/// method. +/// 00: Subset: Each character in B is +/// compared for equality with all the +/// characters in A. +/// 01: Ranges: Each character in B is +/// compared to A. The comparison basis is +/// greater than or equal for even-indexed elements in A, and less than or +/// equal for odd-indexed elements in A. +/// 10: Match: Compare each pair of corresponding characters in +/// A and B +/// for equality. +/// 11: Substring: Search B for substring +/// matches of A. +/// Bits [5:4]: Determine whether to perform a ones +/// complement on the bit +/// mask of the comparison results. +/// 00: No effect. +/// 01: Negate the bit mask. +/// 10: No effect. +/// 11: Negate the bit mask only for bits with an index less +/// than or equal to the +/// size of A or B. +/// \returns Returns 1 if the length of the string in A is +/// less than the maximum; otherwise returns 0. #define _mm_cmpistrs(A, B, M) \ (int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \ (__v16qi)(__m128i)(B), (int)(M)) +/// \brief Uses the immediate operand M to perform a packed +/// comparison of string data with implicitly defined lengths that is +/// contained in source operands A and B. Returns 1 if the length of the +/// string in B is less than the maximum. +/// +/// \headerfile +/// +/// \code +/// int _mm_cmpistrz(__m128i A, __m128i B, const int M); +/// \endcode +/// +/// This intrinsic corresponds to \c VPCMPISTRI instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source +/// operands to be compared. 
+/// \param B +/// A 128-bit integer vector containing one of the source +/// operands to be compared. +/// \param M +/// An 8-bit immediate operand specifying whether the characters +/// are bytes or words and the type of comparison to perform. +/// Bits [1:0]: Determine source data format. +/// 00: 16 packed unsigned bytes +/// 01: 8 packed unsigned words +/// 10: 16 packed signed bytes +/// 11: 8 packed signed words +/// Bits [3:2]: Determine comparison type and aggregation +/// method. +/// 00: Subset: Each character in B is +/// compared for equality with all the +/// characters in A. +/// 01: Ranges: Each character in B is +/// compared to A. The comparison basis is +/// greater than or equal for even-indexed elements in A, and less than or +/// equal for odd-indexed elements in A. +/// 10: Match: Compare each pair of corresponding characters in +/// A and B +/// for equality. +/// 11: Substring: Search B for substring +/// matches of A. +/// Bits [5:4]: Determine whether to perform a ones +/// complement on the bit +/// mask of the comparison results. +/// 00: No effect. +/// 01: Negate the bit mask. +/// 10: No effect. +/// 11: Negate the bit mask only for bits with an index less +/// than or equal to the +/// size of A or B. +/// \returns Returns 1 if the length of the string in B is +/// less than the maximum; otherwise returns 0. #define _mm_cmpistrz(A, B, M) \ (int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \ (__v16qi)(__m128i)(B), (int)(M)) +/// \brief Uses the immediate operand M to perform a packed +/// comparison of string data with explicitly defined lengths that is +/// contained in source operands A and B. Returns 1 if the bit mask is +/// zero and the +/// length of the string in B is the maximum. +/// +/// \headerfile +/// +/// \code +/// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M); +/// \endcode +/// +/// This intrinsic corresponds to \c VPCMPESTRI instruction.
+/// +/// \param A +/// A 128-bit integer vector containing one of the source +/// operands to be compared. +/// \param LA +/// An integer that specifies the length of the string in A. +/// \param B +/// A 128-bit integer vector containing one of the source +/// operands to be compared. +/// \param LB +/// An integer that specifies the length of the string in B. +/// \param M +/// An 8-bit immediate operand specifying whether the characters +/// are bytes or words and the type of comparison to perform. +/// Bits [1:0]: Determine source data format. +/// 00: 16 packed unsigned bytes +/// 01: 8 packed unsigned words +/// 10: 16 packed signed bytes +/// 11: 8 packed signed words +/// Bits [3:2]: Determine comparison type and aggregation +/// method. +/// 00: Subset: Each character in B is +/// compared for equality with all the +/// characters in A. +/// 01: Ranges: Each character in B is +/// compared to A. The comparison basis is +/// greater than or equal for even-indexed elements in A, and less than or +/// equal for odd-indexed elements in A. +/// 10: Match: Compare each pair of corresponding characters in +/// A and B +/// for equality. +/// 11: Substring: Search B for substring +/// matches of A. +/// Bits [5:4]: Determine whether to perform a ones +/// complement on the bit +/// mask of the comparison results. +/// 00: No effect. +/// 01: Negate the bit mask. +/// 10: No effect. +/// 11: Negate the bit mask only for bits with an index less +/// than or equal to the +/// size of A or B. +/// \returns Returns 1 if the bit mask is zero and the length of the string +/// in B is the maximum; otherwise returns 0. #define _mm_cmpestra(A, LA, B, LB, M) \ (int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \ (__v16qi)(__m128i)(B), (int)(LB), \ (int)(M)) +/// \brief Uses the immediate operand M to perform a packed +/// comparison of string data with explicitly defined lengths that is +/// contained in source operands A and B.
Returns 1 if the bit mask is non-zero. +/// +/// \headerfile +/// +/// \code +/// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M); +/// \endcode +/// +/// This intrinsic corresponds to \c VPCMPESTRI instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source +/// operands to be compared. +/// \param LA +/// An integer that specifies the length of the string in A. +/// \param B +/// A 128-bit integer vector containing one of the source +/// operands to be compared. +/// \param LB +/// An integer that specifies the length of the string in B. +/// \param M +/// An 8-bit immediate operand specifying whether the characters +/// are bytes or words and the type of comparison to perform. +/// Bits [1:0]: Determine source data format. +/// 00: 16 packed unsigned bytes +/// 01: 8 packed unsigned words +/// 10: 16 packed signed bytes +/// 11: 8 packed signed words +/// Bits [3:2]: Determine comparison type and aggregation +/// method. +/// 00: Subset: Each character in B is +/// compared for equality with all the +/// characters in A. +/// 01: Ranges: Each character in B is +/// compared to A. The comparison basis is +/// greater than or equal for even-indexed elements in A, and less than or +/// equal for odd-indexed elements in A. +/// 10: Match: Compare each pair of corresponding characters in +/// A and B +/// for equality. +/// 11: Substring: Search B for substring +/// matches of A. +/// Bits [5:4]: Determine whether to perform a ones +/// complement on the bit +/// mask of the comparison results. +/// 00: No effect. +/// 01: Negate the bit mask. +/// 10: No effect. +/// 11: Negate the bit mask only for bits with an index less +/// than or equal to the +/// size of A or B. +/// \returns Returns 1 if the bit mask is non-zero; otherwise returns 0.
#define _mm_cmpestrc(A, LA, B, LB, M) \ (int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \ (__v16qi)(__m128i)(B), (int)(LB), \ (int)(M)) +/// \brief Uses the immediate operand M to perform a packed +/// comparison of string data with explicitly defined lengths that is +/// contained in source operands A and B. Returns 1 if the least significant +/// bit of the bit +/// mask is 1. +/// +/// \headerfile +/// +/// \code +/// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M); +/// \endcode +/// +/// This intrinsic corresponds to \c VPCMPESTRI instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source +/// operands to be compared. +/// \param LA +/// An integer that specifies the length of the string in A. +/// \param B +/// A 128-bit integer vector containing one of the source +/// operands to be compared. +/// \param LB +/// An integer that specifies the length of the string in B. +/// \param M +/// An 8-bit immediate operand specifying whether the characters +/// are bytes or words and the type of comparison to perform. +/// Bits [1:0]: Determine source data format. +/// 00: 16 packed unsigned bytes +/// 01: 8 packed unsigned words +/// 10: 16 packed signed bytes +/// 11: 8 packed signed words +/// Bits [3:2]: Determine comparison type and aggregation +/// method. +/// 00: Subset: Each character in B is +/// compared for equality with all the +/// characters in A. +/// 01: Ranges: Each character in B is +/// compared to A. The comparison basis is +/// greater than or equal for even-indexed elements in A, and less than or +/// equal for odd-indexed elements in A. +/// 10: Match: Compare each pair of corresponding characters in +/// A and B +/// for equality. +/// 11: Substring: Search B for substring +/// matches of A. +/// Bits [5:4]: Determine whether to perform a ones +/// complement on the bit +/// mask of the comparison results. +/// 00: No effect. +/// 01: Negate the bit mask. 
+/// 10: No effect. +/// 11: Negate the bit mask only for bits with an index less +/// than or equal to the +/// size of A or B. +/// \returns Returns 1 if the least significant bit of the bit mask is 1; +/// otherwise returns 0. #define _mm_cmpestro(A, LA, B, LB, M) \ (int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \ (__v16qi)(__m128i)(B), (int)(LB), \ (int)(M)) +/// \brief Uses the immediate operand M to perform a packed +/// comparison of string data with explicitly defined lengths that is +/// contained in source operands A and B. Returns 1 if the length of the +/// string in A is less than the maximum. +/// +/// \headerfile +/// +/// \code +/// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M); +/// \endcode +/// +/// This intrinsic corresponds to \c VPCMPESTRI instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source +/// operands to be compared. +/// \param LA +/// An integer that specifies the length of the string in A. +/// \param B +/// A 128-bit integer vector containing one of the source +/// operands to be compared. +/// \param LB +/// An integer that specifies the length of the string in B. +/// \param M +/// An 8-bit immediate operand specifying whether the characters +/// are bytes or words and the type of comparison to perform. +/// Bits [1:0]: Determine source data format. +/// 00: 16 packed unsigned bytes +/// 01: 8 packed unsigned words +/// 10: 16 packed signed bytes +/// 11: 8 packed signed words +/// Bits [3:2]: Determine comparison type and aggregation +/// method. +/// 00: Subset: Each character in B is +/// compared for equality with all the +/// characters in A. +/// 01: Ranges: Each character in B is +/// compared to A. The comparison basis is +/// greater than or equal for even-indexed elements in A, and less than or +/// equal for odd-indexed elements in A. +/// 10: Match: Compare each pair of corresponding characters in +/// A and B +/// for equality. 
+/// 11: Substring: Search B for substring +/// matches of A. +/// Bits [5:4]: Determine whether to perform a ones +/// complement on the bit +/// mask of the comparison results. +/// 00: No effect. +/// 01: Negate the bit mask. +/// 10: No effect. +/// 11: Negate the bit mask only for bits with an index less +/// than or equal to the +/// size of A or B. +/// \returns Returns 1 if the length of the string in A is +/// less than the maximum; otherwise returns 0. #define _mm_cmpestrs(A, LA, B, LB, M) \ (int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \ (__v16qi)(__m128i)(B), (int)(LB), \ (int)(M)) +/// \brief Uses the immediate operand M to perform a packed +/// comparison of string data with explicitly defined lengths that is +/// contained in source operands A and B. Returns 1 if the length of the +/// string in B is less than the maximum. +/// +/// \headerfile +/// +/// \code +/// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M); +/// \endcode +/// +/// This intrinsic corresponds to \c VPCMPESTRI instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source +/// operands to be compared. +/// \param LA +/// An integer that specifies the length of the string in A. +/// \param B +/// A 128-bit integer vector containing one of the source +/// operands to be compared. +/// \param LB +/// An integer that specifies the length of the string in B. +/// \param M +/// An 8-bit immediate operand specifying whether the characters +/// are bytes or words and the type of comparison to perform. +/// Bits [1:0]: Determine source data format. +/// 00: 16 packed unsigned bytes +/// 01: 8 packed unsigned words +/// 10: 16 packed signed bytes +/// 11: 8 packed signed words +/// Bits [3:2]: Determine comparison type and aggregation +/// method. +/// 00: Subset: Each character in B is +/// compared for equality with all the +/// characters in A. +/// 01: Ranges: Each character in B is +/// compared to A. 
The comparison basis is +/// greater than or equal for even-indexed elements in A, and less than or +/// equal for odd-indexed elements in A. +/// 10: Match: Compare each pair of corresponding characters in +/// A and B +/// for equality. +/// 11: Substring: Search B for substring +/// matches of A. +/// Bits [5:4]: Determine whether to perform a ones +/// complement on the bit +/// mask of the comparison results. +/// 00: No effect. +/// 01: Negate the bit mask. +/// 10: No effect. +/// 11: Negate the bit mask only for bits with an index less +/// than or equal to the +/// size of A or B. +/// \returns Returns 1 if the length of the string in B is +/// less than the maximum; otherwise returns 0. #define _mm_cmpestrz(A, LA, B, LB, M) \ (int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \ (__v16qi)(__m128i)(B), (int)(LB), \ (int)(M)) /* SSE4.2 Compare Packed Data -- Greater Than. */ +/// \brief Compares each of the corresponding packed 64-bit values of the +/// 128-bit integer vectors to determine if the values in the first operand +/// are +/// greater than those in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPCMPGTQ instruction. +/// +/// \param __V1 +/// A 128-bit integer vector. +/// \param __V2 +/// A 128-bit integer vector. +/// \returns A 128-bit integer vector containing the comparison results. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi64(__m128i __V1, __m128i __V2) { @@ -473,18 +2376,63 @@ } /* SSE4.2 Accumulate CRC32. */ +/// \brief Adds the unsigned integer operand to the CRC-32C checksum of the +/// character operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c CRC32 instruction. +/// +/// \param __C +/// An unsigned integer operand to add to the CRC-32C checksum +/// of operand __D. +/// \param __D +/// An unsigned 8-bit integer operand used to compute the +/// CRC-32C checksum. 
+/// \returns The result of adding operand __C to the CRC-32C +/// checksum of operand __D. static __inline__ unsigned int __DEFAULT_FN_ATTRS _mm_crc32_u8(unsigned int __C, unsigned char __D) { return __builtin_ia32_crc32qi(__C, __D); } +/// \brief Adds the unsigned integer operand to the CRC-32C checksum of the +/// unsigned short operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c CRC32 instruction. +/// +/// \param __C +/// An unsigned integer operand to add to the CRC-32C checksum +/// of operand __D. +/// \param __D +/// An unsigned 16-bit integer operand used to compute the +/// CRC-32C checksum. +/// \returns The result of adding operand __C to the CRC-32C +/// checksum of operand __D. static __inline__ unsigned int __DEFAULT_FN_ATTRS _mm_crc32_u16(unsigned int __C, unsigned short __D) { return __builtin_ia32_crc32hi(__C, __D); } +/// \brief Adds the first unsigned integer operand to the CRC-32C checksum of +/// the second unsigned integer operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c CRC32 instruction. +/// +/// \param __C +/// An unsigned integer operand to add to the CRC-32C checksum +/// of operand __D. +/// \param __D +/// An unsigned 32-bit integer operand used to compute the +/// CRC-32C checksum. +/// \returns The result of adding operand __C to the CRC-32C +/// checksum of operand __D. static __inline__ unsigned int __DEFAULT_FN_ATTRS _mm_crc32_u32(unsigned int __C, unsigned int __D) { @@ -492,6 +2440,21 @@ } #ifdef __x86_64__ +/// \brief Adds the unsigned integer operand to the CRC-32C checksum of the +/// unsigned 64-bit integer operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c CRC32 instruction. +/// +/// \param __C +/// An unsigned integer operand to add to the CRC-32C checksum +/// of operand __D. +/// \param __D +/// An unsigned 64-bit integer operand used to compute the +/// CRC-32C checksum. +/// \returns The result of adding operand __C to the CRC-32C +/// checksum of operand __D.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS _mm_crc32_u64(unsigned long long __C, unsigned long long __D) { Index: lib/Headers/tmmintrin.h =================================================================== --- lib/Headers/tmmintrin.h +++ lib/Headers/tmmintrin.h @@ -29,187 +29,758 @@ /* Define the default attributes for the functions in this file. */ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3"))) +/// \brief Computes the absolute value of each of the packed 8-bit signed +/// integers in the source operand and stores the 8-bit unsigned integer +/// results in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PABSB instruction. +/// +/// \param __a +/// A 64-bit vector of [8 x i8]. +/// \returns A 64-bit integer vector containing the absolute values of the +/// elements in the operand. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_abs_pi8(__m64 __a) { return (__m64)__builtin_ia32_pabsb((__v8qi)__a); } +/// \brief Computes the absolute value of each of the packed 8-bit signed +/// integers in the source operand and stores the 8-bit unsigned integer +/// results in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPABSB instruction. +/// +/// \param __a +/// A 128-bit vector of [16 x i8]. +/// \returns A 128-bit integer vector containing the absolute values of the +/// elements in the operand. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_abs_epi8(__m128i __a) { return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a); } +/// \brief Computes the absolute value of each of the packed 16-bit signed +/// integers in the source operand and stores the 16-bit unsigned integer +/// results in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PABSW instruction. +/// +/// \param __a +/// A 64-bit vector of [4 x i16]. +/// \returns A 64-bit integer vector containing the absolute values of the +/// elements in the operand. 
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_abs_pi16(__m64 __a) { return (__m64)__builtin_ia32_pabsw((__v4hi)__a); } +/// \brief Computes the absolute value of each of the packed 16-bit signed +/// integers in the source operand and stores the 16-bit unsigned integer +/// results in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPABSW instruction. +/// +/// \param __a +/// A 128-bit vector of [8 x i16]. +/// \returns A 128-bit integer vector containing the absolute values of the +/// elements in the operand. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_abs_epi16(__m128i __a) { return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a); } +/// \brief Computes the absolute value of each of the packed 32-bit signed +/// integers in the source operand and stores the 32-bit unsigned integer +/// results in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PABSD instruction. +/// +/// \param __a +/// A 64-bit vector of [2 x i32]. +/// \returns A 64-bit integer vector containing the absolute values of the +/// elements in the operand. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_abs_pi32(__m64 __a) { return (__m64)__builtin_ia32_pabsd((__v2si)__a); } +/// \brief Computes the absolute value of each of the packed 32-bit signed +/// integers in the source operand and stores the 32-bit unsigned integer +/// results in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPABSD instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x i32]. +/// \returns A 128-bit integer vector containing the absolute values of the +/// elements in the operand. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_abs_epi32(__m128i __a) { return (__m128i)__builtin_ia32_pabsd128((__v4si)__a); } +/// \brief Concatenates the two 128-bit integer vector operands, and +/// right-shifts the result by the number of bytes specified in the immediate +/// operand. 
+/// +/// \headerfile +/// +/// \code +/// __m128i _mm_alignr_epi8(__m128i a, __m128i b, const int n); +/// \endcode +/// +/// \param a +/// A 128-bit vector of [16 x i8] containing one of the source +/// operands. +/// \param b +/// A 128-bit vector of [16 x i8] containing one of the source +/// operands. +/// \param n +/// An immediate operand specifying how many bytes to +/// right-shift the result. +/// \returns A 128-bit integer vector containing the concatenated right-shifted +/// value. #define _mm_alignr_epi8(a, b, n) __extension__ ({ \ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \ (__v16qi)(__m128i)(b), (n)); }) +/// \brief Concatenates the two 64-bit integer vector operands, and +/// right-shifts the result by the number of bytes specified in the immediate +/// operand. +/// +/// \headerfile +/// +/// \code +/// __m64 _mm_alignr_pi8(__m64 a, __m64 b, const int n); +/// \endcode +/// +/// \param a +/// A 64-bit vector of [8 x i8] containing one of the source +/// operands. +/// \param b +/// A 64-bit vector of [8 x i8] containing one of the source +/// operands. +/// \param n +/// An immediate operand specifying how many bytes to +/// right-shift the result. +/// \returns A 64-bit integer vector containing the concatenated right-shifted +/// value. #define _mm_alignr_pi8(a, b, n) __extension__ ({ \ (__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)); }) +/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed +/// 128-bit vectors of [8 x i16]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPHADDW instruction. +/// +/// \param __a +/// A 128-bit vector of [8 x i16] containing one of the source +/// operands. The horizontal sums of the values are stored in +/// the lower bits of the destination. +/// \param __b +/// A 128-bit vector of [8 x i16] containing one of the source +/// operands. The horizontal sums of the values are stored in +/// the upper bits of the destination. 
+/// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of +/// both operands. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_hadd_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b); } +/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed +/// 128-bit vectors of [4 x i32]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPHADDD instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x i32] containing one of the source +/// operands. The horizontal sums of the values are stored in +/// the lower bits of the destination. +/// \param __b +/// A 128-bit vector of [4 x i32] containing one of the source +/// operands. The horizontal sums of the values are stored in +/// the upper bits of the destination. +/// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of +/// both operands. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_hadd_epi32(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b); } +/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed +/// 64-bit vectors of [4 x i16]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PHADDW instruction. +/// +/// \param __a +/// A 64-bit vector of [4 x i16] containing one of the source +/// operands. The horizontal sums of the values are stored in +/// the lower bits of the destination. +/// \param __b +/// A 64-bit vector of [4 x i16] containing one of the source +/// operands. The horizontal sums of the values are stored in +/// the upper bits of the destination. +/// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of +/// both operands. 
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_hadd_pi16(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b); } +/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed +/// 64-bit vectors of [2 x i32]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PHADDD instruction. +/// +/// \param __a +/// A 64-bit vector of [2 x i32] containing one of the source +/// operands. The horizontal sums of the values are stored in +/// the lower bits of the destination. +/// \param __b +/// A 64-bit vector of [2 x i32] containing one of the source +/// operands. The horizontal sums of the values are stored in +/// the upper bits of the destination. +/// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of +/// both operands. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_hadd_pi32(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b); } +/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed +/// 128-bit vectors of [8 x i16]. Positive sums greater than 7FFFh are +/// saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPHADDSW instruction. +/// +/// \param __a +/// A 128-bit vector of [8 x i16] containing one of the source +/// operands. The horizontal sums of the values are stored in +/// the lower bits of the destination. +/// \param __b +/// A 128-bit vector of [8 x i16] containing one of the source +/// operands. The horizontal sums of the values are stored in +/// the upper bits of the destination. +/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated +/// sums of both operands. 
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_hadds_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b); } +/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed +/// 64-bit vectors of [4 x i16]. Positive sums greater than 7FFFh are +/// saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PHADDSW instruction. +/// +/// \param __a +/// A 64-bit vector of [4 x i16] containing one of the source +/// operands. The horizontal sums of the values are stored in +/// the lower bits of the destination. +/// \param __b +/// A 64-bit vector of [4 x i16] containing one of the source +/// operands. The horizontal sums of the values are stored in +/// the upper bits of the destination. +/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated +/// sums of both operands. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_hadds_pi16(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b); } +/// \brief Horizontally subtracts the adjacent pairs of values contained in 2 +/// packed 128-bit vectors of [8 x i16]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPHSUBW instruction. +/// +/// \param __a +/// A 128-bit vector of [8 x i16] containing one of the source +/// operands. The horizontal differences between the values are +/// stored in the lower bits of the destination. +/// \param __b +/// A 128-bit vector of [8 x i16] containing one of the source +/// operands. The horizontal differences between the values are +/// stored in the upper bits of the destination. +/// \returns A 128-bit vector of [8 x i16] containing the horizontal differences +/// of both operands. 
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_hsub_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b); } +/// \brief Horizontally subtracts the adjacent pairs of values contained in 2 +/// packed 128-bit vectors of [4 x i32]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPHSUBD instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x i32] containing one of the source +/// operands. The horizontal differences between the values are +/// stored in the lower bits of the destination. +/// \param __b +/// A 128-bit vector of [4 x i32] containing one of the source +/// operands. The horizontal differences between the values are +/// stored in the upper bits of the destination. +/// \returns A 128-bit vector of [4 x i32] containing the horizontal differences +/// of both operands. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_hsub_epi32(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b); } +/// \brief Horizontally subtracts the adjacent pairs of values contained in 2 +/// packed 64-bit vectors of [4 x i16]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PHSUBW instruction. +/// +/// \param __a +/// A 64-bit vector of [4 x i16] containing one of the source +/// operands. The horizontal differences between the values are +/// stored in the lower bits of the destination. +/// \param __b +/// A 64-bit vector of [4 x i16] containing one of the source +/// operands. The horizontal differences between the values are +/// stored in the upper bits of the destination. +/// \returns A 64-bit vector of [4 x i16] containing the horizontal differences +/// of both operands. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_hsub_pi16(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b); } +/// \brief Horizontally subtracts the adjacent pairs of values contained in 2 +/// packed 64-bit vectors of [2 x i32]. 
+/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PHSUBD instruction. +/// +/// \param __a +/// A 64-bit vector of [2 x i32] containing one of the source +/// operands. The horizontal differences between the values are +/// stored in the lower bits of the destination. +/// \param __b +/// A 64-bit vector of [2 x i32] containing one of the source +/// operands. The horizontal differences between the values are +/// stored in the upper bits of the destination. +/// \returns A 64-bit vector of [2 x i32] containing the horizontal differences +/// of both operands. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_hsub_pi32(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b); } +/// \brief Horizontally subtracts the adjacent pairs of values contained in 2 +/// packed 128-bit vectors of [8 x i16]. Positive differences greater than +/// 7FFFh are saturated to 7FFFh. Negative differences less than +/// 8000h are saturated to 8000h. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPHSUBSW instruction. +/// +/// \param __a +/// A 128-bit vector of [8 x i16] containing one of the source +/// operands. The horizontal differences between the values are +/// stored in the lower bits of the destination. +/// \param __b +/// A 128-bit vector of [8 x i16] containing one of the source +/// operands. The horizontal differences between the values are +/// stored in the upper bits of the destination. +/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated +/// differences of both operands. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_hsubs_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b); } +/// \brief Horizontally subtracts the adjacent pairs of values contained in 2 +/// packed 64-bit vectors of [4 x i16]. Positive differences greater than +/// 7FFFh are saturated to 7FFFh. Negative differences less than +/// 8000h are saturated to 8000h. 
+/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PHSUBSW instruction. +/// +/// \param __a +/// A 64-bit vector of [4 x i16] containing one of the source +/// operands. The horizontal differences between the values are +/// stored in the lower bits of the destination. +/// \param __b +/// A 64-bit vector of [4 x i16] containing one of the source +/// operands. The horizontal differences between the values are +/// stored in the upper bits of the destination. +/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated +/// differences of both operands. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_hsubs_pi16(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b); } +/// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer +/// values contained in the first source operand and packed 8-bit signed +/// integer +/// values contained in the second source operand, adds pairs of +/// contiguous products with signed saturation, and writes the 16-bit sums +/// to the corresponding bits in the destination. For example, bits [7:0] +/// of both operands are multiplied, bits [15:8] of both operands are +/// multiplied, and the sum of both results is written to bits [15:0] of +/// the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPMADDUBSW instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the first source +/// operands. +/// \param __b +/// A 128-bit integer vector containing the second source +/// operands. 
+/// \returns A 128-bit integer vector containing the sums of products of both +/// operands: +/// R0 := (a0 * b0) + (a1 * b1) +/// R1 := (a2 * b2) + (a3 * b3) +/// R2 := (a4 * b4) + (a5 * b5) +/// R3 := (a6 * b6) + (a7 * b7) +/// R4 := (a8 * b8) + (a9 * b9) +/// R5 := (a10 * b10) + (a11 * b11) +/// R6 := (a12 * b12) + (a13 * b13) +/// R7 := (a14 * b14) + (a15 * b15) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_maddubs_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b); } +/// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer +/// values contained in the first source operand and packed 8-bit signed +/// integer +/// values contained in the second source operand, adds pairs of +/// contiguous products with signed saturation, and writes the 16-bit sums +/// to the corresponding bits in the destination. For example, bits [7:0] +/// of both operands are multiplied, bits [15:8] of both operands are +/// multiplied, and the sum of both results is written to bits [15:0] of +/// the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PMADDUBSW instruction. +/// +/// \param __a +/// A 64-bit integer vector containing the first source +/// operands. +/// \param __b +/// A 64-bit integer vector containing the second source +/// operands. +/// \returns A 64-bit integer vector containing the sums of products of both +/// operands: +/// R0 := (a0 * b0) + (a1 * b1) +/// R1 := (a2 * b2) + (a3 * b3) +/// R2 := (a4 * b4) + (a5 * b5) +/// R3 := (a6 * b6) + (a7 * b7) static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_maddubs_pi16(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b); } +/// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit +/// products to the 18 most significant bits by right-shifting, rounds the +/// truncated value, and writes bits [16:1] to the destination. 
+/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPMULHRSW instruction. +/// +/// \param __a +/// A 128-bit vector of [8 x i16] containing one of the source +/// operands. +/// \param __b +/// A 128-bit vector of [8 x i16] containing one of the source +/// operands. +/// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled +/// products of both operands. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhrs_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b); } +/// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit +/// products to the 18 most significant bits by right-shifting, rounds the +/// truncated value, and writes bits [16:1] to the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PMULHRSW instruction. +/// +/// \param __a +/// A 64-bit vector of [4 x i16] containing one of the source +/// operands. +/// \param __b +/// A 64-bit vector of [4 x i16] containing one of the source +/// operands. +/// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled +/// products of both operands. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mulhrs_pi16(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b); } +/// \brief Copies the 8-bit integers from a 128-bit integer vector to the +/// destination or clears 8-bit values in the destination, as specified by +/// the second source operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPSHUFB instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the values to be copied. +/// \param __b +/// A 128-bit integer vector containing control bytes +/// corresponding to positions in the destination: +/// Bit 7: +/// 1: Clear the corresponding byte in the destination. +/// 0: Copy the selected source byte to the corresponding byte +/// in the destination. +/// Bits [6:4] Reserved. 
+/// Bits [3:0] select the source byte to be copied. +/// \returns A 128-bit integer vector containing the copied or cleared values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_shuffle_epi8(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b); } +/// \brief Copies the 8-bit integers from a 64-bit integer vector to the +/// destination or clears 8-bit values in the destination, as specified by +/// the second source operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PSHUFB instruction. +/// +/// \param __a +/// A 64-bit integer vector containing the values to be copied. +/// \param __b +/// A 64-bit integer vector containing control bytes +/// corresponding to positions in the destination: +/// Bit 7: +/// 1: Clear the corresponding byte in the destination. +/// 0: Copy the selected source byte to the corresponding byte +/// in the destination. +/// Bits [2:0] select the source byte to be copied. +/// \returns A 64-bit integer vector containing the copied or cleared values. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_shuffle_pi8(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b); } +/// \brief For each 8-bit integer in the first source operand, perform one of +/// the following actions as specified by the second source operand: +/// If the byte in the second source is negative, calculate the twos +/// complement of the corresponding byte in the first source, and write +/// that value to the destination. +/// If the byte in the second source is positive, copy the corresponding +/// byte from the first source to the destination. +/// If the byte in the second source is zero, clear the corresponding byte +/// in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPSIGNB instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the values to be copied.
+/// \param __b +/// A 128-bit integer vector containing control bytes +/// corresponding to positions in the destination. +/// \returns A 128-bit integer vector containing the resultant values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sign_epi8(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b); } +/// \brief For each 16-bit integer in the first source operand, perform one of +/// the following actions as specified by the second source operand: +/// If the word in the second source is negative, calculate the two's +/// complement of the corresponding word in the first source, and write +/// that value to the destination. +/// If the word in the second source is positive, copy the corresponding +/// word from the first source to the destination. +/// If the word in the second source is zero, clear the corresponding word +/// in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPSIGNW instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the values to be copied. +/// \param __b +/// A 128-bit integer vector containing control words +/// corresponding to positions in the destination. +/// \returns A 128-bit integer vector containing the resultant values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sign_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b); } +/// \brief For each 32-bit integer in the first source operand, perform one of +/// the following actions as specified by the second source operand: +/// If the doubleword in the second source is negative, calculate the two's +/// complement of the corresponding doubleword in the first source, and write +/// that value to the destination. +/// If the doubleword in the second source is positive, copy the +/// corresponding doubleword from the first source to the destination.
+/// If the doubleword in the second source is zero, clear the +/// corresponding doubleword in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPSIGND instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the values to be copied. +/// \param __b +/// A 128-bit integer vector containing control doublewords +/// corresponding to positions in the destination. +/// \returns A 128-bit integer vector containing the resultant values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sign_epi32(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b); } +/// \brief For each 8-bit integer in the first source operand, perform one of +/// the following actions as specified by the second source operand: +/// If the byte in the second source is negative, calculate the two's +/// complement of the corresponding byte in the first source, and write +/// that value to the destination. +/// If the byte in the second source is positive, copy the corresponding +/// byte from the first source to the destination. +/// If the byte in the second source is zero, clear the corresponding byte +/// in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PSIGNB instruction. +/// +/// \param __a +/// A 64-bit integer vector containing the values to be copied. +/// \param __b +/// A 64-bit integer vector containing control bytes +/// corresponding to positions in the destination. +/// \returns A 64-bit integer vector containing the resultant values.
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sign_pi8(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b); } +/// \brief For each 16-bit integer in the first source operand, perform one of +/// the following actions as specified by the second source operand: +/// If the word in the second source is negative, calculate the two's +/// complement of the corresponding word in the first source, and write +/// that value to the destination. +/// If the word in the second source is positive, copy the corresponding +/// word from the first source to the destination. +/// If the word in the second source is zero, clear the corresponding word +/// in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PSIGNW instruction. +/// +/// \param __a +/// A 64-bit integer vector containing the values to be copied. +/// \param __b +/// A 64-bit integer vector containing control words +/// corresponding to positions in the destination. +/// \returns A 64-bit integer vector containing the resultant values. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sign_pi16(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b); } +/// \brief For each 32-bit integer in the first source operand, perform one of +/// the following actions as specified by the second source operand: +/// If the doubleword in the second source is negative, calculate the two's +/// complement of the corresponding doubleword in the first source, and write +/// that value to the destination. +/// If the doubleword in the second source is positive, copy the +/// corresponding doubleword from the first source to the destination. +/// If the doubleword in the second source is zero, clear the +/// corresponding doubleword in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PSIGND instruction. +/// +/// \param __a +/// A 64-bit integer vector containing the values to be copied.
+/// \param __b +/// A 64-bit integer vector containing control doublewords +/// corresponding to positions in the destination. +/// \returns A 64-bit integer vector containing the resultant values. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sign_pi32(__m64 __a, __m64 __b) { Index: lib/Headers/xmmintrin.h =================================================================== --- lib/Headers/xmmintrin.h +++ lib/Headers/xmmintrin.h @@ -39,6 +39,23 @@ /* Define the default attributes for the functions in this file. */ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse"))) +/// \brief Adds the 32-bit float values in the low-order bits of the operands. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VADDSS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the source +/// operands. The lower 32 bits of this operand are used in the +/// calculation. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the source +/// operands. The lower 32 bits of this operand are used in the +/// calculation. +/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum +/// of the lower 32 bits of both operands. The upper 96 bits are copied from +/// the upper 96 bits of the first source operand. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_add_ss(__m128 __a, __m128 __b) { @@ -46,12 +63,43 @@ return __a; } +/// \brief Adds each of the values of 2 packed 128-bit vectors of [4 x float], +/// and writes the result to the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VADDPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the source +/// operands. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the source +/// operands.
+/// \returns A 128-bit vector of [4 x float] with the sums of both operands. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_add_ps(__m128 __a, __m128 __b) { return __a + __b; } +/// \brief Subtracts the 32-bit float value in the low-order bits of the second +/// operand from the corresponding value in the first operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VSUBSS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing the minuend. The +/// lower 32 bits of this operand are used in the calculation. +/// \param __b +/// A 128-bit vector of [4 x float] containing the subtrahend. +/// The lower 32 bits of this operand are used in the +/// calculation. +/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the +/// difference of the lower 32 bits of both operands. The upper 96 bits +/// are copied from the upper 96 bits of the first source operand. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sub_ss(__m128 __a, __m128 __b) { @@ -59,12 +107,44 @@ return __a; } +/// \brief Subtracts each of the values of the second operand from the first +/// operand, both of which are packed 128-bit vectors of [4 x float], +/// and writes the result to the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VSUBPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing the minuend. +/// \param __b +/// A 128-bit vector of [4 x float] containing the subtrahend. +/// \returns A 128-bit vector of [4 x float] containing the differences between +/// both operands. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sub_ps(__m128 __a, __m128 __b) { return __a - __b; } +/// \brief Multiplies 2 32-bit float values in the low-order bits of the +/// operands. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMULSS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the source +/// operands.
The lower 32 bits of this operand are used in the +/// calculation. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the source +/// operands. The lower 32 bits of this operand are used in the +/// calculation. +/// \returns A 128-bit vector of [4 x float] containing the product of the lower +/// 32 bits of both operands. The upper 96 bits are copied from the upper 96 +/// bits of the first source operand. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mul_ss(__m128 __a, __m128 __b) { @@ -72,12 +152,44 @@ return __a; } +/// \brief Multiplies each of the values of 2 packed 128-bit vectors of [4 x +/// float], and writes the result to the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMULPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the source +/// operands. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the source +/// operands. +/// \returns A 128-bit vector of [4 x float] containing the products of both +/// operands. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_mul_ps(__m128 __a, __m128 __b) { return __a * __b; } +/// \brief Divides the value in the low-order 32 bits of the first operand by +/// the corresponding value in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VDIVSS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing the dividend. The +/// lower 32 bits of this operand are used in the calculation. +/// \param __b +/// A 128-bit vector of [4 x float] containing the divisor. The +/// lower 32 bits of this operand are used in the calculation. +/// \returns A 128-bit vector of [4 x float] containing the quotients of the +/// lower 32 bits of both operands. The upper 96 bits are copied from the +/// upper +/// 96 bits of the first source operand. 
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_div_ss(__m128 __a, __m128 __b) { @@ -85,12 +197,36 @@ return __a; } +/// \brief Divides 2 packed 128-bit vectors of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VDIVPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing the dividend. +/// \param __b +/// A 128-bit vector of [4 x float] containing the divisor. +/// \returns A 128-bit vector of [4 x float] containing the quotients between +/// both operands. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_div_ps(__m128 __a, __m128 __b) { return __a / __b; } +/// \brief Calculates the square root of the value stored in the low-order bits +/// of a packed 128-bit vector of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VSQRTSS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] values. The lower 32 bits of +/// this operand are used in the calculation. +/// \returns A 128-bit vector of [4 x float] containing the square root of the +/// value in the operand in the low-order bits. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ss(__m128 __a) { @@ -98,12 +234,35 @@ return (__m128) { __c[0], __a[1], __a[2], __a[3] }; } +/// \brief Calculates the square roots of the values stored in a packed 128-bit +/// vector of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VSQRTPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] values. +/// \returns A 128-bit vector of [4 x float] containing the square roots of the +/// values in the operand. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ps(__m128 __a) { return __builtin_ia32_sqrtps(__a); } +/// \brief Calculates the approximate reciprocal of the value stored in the +/// low-order bits of a packed 128-bit vector of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VRCPSS instruction. 
+/// +/// \param __a +/// A 128-bit vector of [4 x float] values. The lower 32 bits of +/// this operand are used in the calculation. +/// \returns A 128-bit vector of [4 x float] containing the approximate +/// reciprocal of the value in the operand in the low-order bits. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rcp_ss(__m128 __a) { @@ -111,12 +270,37 @@ return (__m128) { __c[0], __a[1], __a[2], __a[3] }; } +/// \brief Calculates the approximate reciprocals of the values stored in a +/// packed 128-bit vector of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VRCPPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] values. +/// \returns A 128-bit vector of [4 x float] containing the approximate +/// reciprocals of the values in the operand. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rcp_ps(__m128 __a) { return __builtin_ia32_rcpps(__a); } +/// \brief Calculates the approximate reciprocal of the square root of the +/// value stored in the low-order bits of a packed 128-bit vector of [4 x +/// float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VRSQRTSS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] values. The lower 32 bits of +/// this operand are used in the calculation. +/// \returns A 128-bit vector of [4 x float] containing the approximate +/// reciprocal of the square root of the value in the operand in the +/// low-order bits. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rsqrt_ss(__m128 __a) { @@ -124,96 +308,349 @@ return (__m128) { __c[0], __a[1], __a[2], __a[3] }; } +/// \brief Calculates the approximate reciprocals of the square roots of the +/// values stored in a packed 128-bit vector of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VRSQRTPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] values. 
+/// \returns A 128-bit vector of [4 x float] containing the approximate +/// reciprocals of the square roots of the values in the operand. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rsqrt_ps(__m128 __a) { return __builtin_ia32_rsqrtps(__a); } +/// \brief Compares 2 32-bit float values in the low-order bits of both operands, +/// and stores the lesser of the pair of values in the low-order bits of +/// the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMINSS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the +/// operands. The lower 32 bits of this operand are used in the +/// comparison. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the +/// operands. The lower 32 bits of this operand are used in the +/// comparison. +/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the +/// minimum value between both operands. The upper 96 bits are copied from +/// the upper 96 bits of the first source operand. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_min_ss(__m128 __a, __m128 __b) { return __builtin_ia32_minss(__a, __b); } +/// \brief Compares 2 packed 128-bit vectors of [4 x float] and stores the +/// lesser of each pair of values. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMINPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the +/// operands. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the +/// operands. +/// \returns A 128-bit vector of [4 x float] containing the minimum values +/// between both operands. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_min_ps(__m128 __a, __m128 __b) { return __builtin_ia32_minps(__a, __b); } +/// \brief Compares 2 32-bit float values in the low-order bits of both operands, +/// and stores the greater of the pair of values in the low-order bits of +/// the destination. 
+/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMAXSS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the +/// operands. The lower 32 bits of this operand are used in the +/// comparison. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the +/// operands. The lower 32 bits of this operand are used in the +/// comparison. +/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the +/// maximum value between both operands. The upper 96 bits are copied from +/// the upper 96 bits of the first source operand. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_max_ss(__m128 __a, __m128 __b) { return __builtin_ia32_maxss(__a, __b); } +/// \brief Compares 2 packed 128-bit vectors of [4 x float] and stores the +/// greater of each pair of values. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMAXPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the +/// operands. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the +/// operands. +/// \returns A 128-bit vector of [4 x float] containing the maximum values +/// between both operands. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_max_ps(__m128 __a, __m128 __b) { return __builtin_ia32_maxps(__a, __b); } +/// \brief Performs a bitwise AND of 2 packed 128-bit vectors of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VANDPS instruction. +/// +/// \param __a +/// A 128-bit vector containing one of the source operands. +/// \param __b +/// A 128-bit vector containing one of the source operands. +/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the +/// values between both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_and_ps(__m128 __a, __m128 __b) { return (__m128)((__v4si)__a & (__v4si)__b); } +/// \brief Performs a bitwise AND of 2 packed 128-bit vectors of [4 x float], +/// using the one's complement of the values contained in the first +/// source operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VANDNPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing the left source +/// operand. The one's complement of this value is used in +/// the bitwise AND. +/// \param __b +/// A 128-bit vector of [4 x float] containing the right source +/// operand. +/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the +/// one's complement of the first operand and the values in the second operand. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_andnot_ps(__m128 __a, __m128 __b) { return (__m128)(~(__v4si)__a & (__v4si)__b); } +/// \brief Performs a bitwise OR of 2 packed 128-bit vectors of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VORPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the source +/// operands. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the source +/// operands. +/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the +/// values between both operands. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_or_ps(__m128 __a, __m128 __b) { return (__m128)((__v4si)__a | (__v4si)__b); } +/// \brief Performs a bitwise exclusive OR of 2 packed 128-bit vectors of [4 x +/// float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VXORPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the source +/// operands. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the source +/// operands.
+/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR +/// of the values between both operands. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_xor_ps(__m128 __a, __m128 __b) { return (__m128)((__v4si)__a ^ (__v4si)__b); } +/// \brief Compares 2 32-bit float values in the low-order bits of both +/// operands for equality, and stores the result of the comparison in the +/// low-order +/// bits of the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPEQSS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the +/// operands. The lower 32 bits of this operand are used in the +/// comparison. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the +/// operands. The lower 32 bits of this operand are used in the +/// comparison. +/// \returns A 128-bit vector of [4 x float] containing the comparison results +/// in the low-order bits. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpeq_ss(__m128 __a, __m128 __b) { return (__m128)__builtin_ia32_cmpeqss(__a, __b); } +/// \brief Compares each of the corresponding packed 32-bit float values of the +/// 128-bit vectors of [4 x float] for equality. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPEQPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] values. +/// \param __b +/// A 128-bit vector of [4 x float] values. +/// \returns A 128-bit vector of [4 x float] containing the comparison results. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpeq_ps(__m128 __a, __m128 __b) { return (__m128)__builtin_ia32_cmpeqps(__a, __b); } +/// \brief Compares 2 32-bit float values in the low-order bits of both +/// operands to determine if the value in the first operand is less than the +/// corresponding value in the second operand, and stores the result of +/// the comparison in the low-order bits of the destination. 
+/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPLTSS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the +/// operands. The lower 32 bits of this operand are used in the +/// comparison. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the +/// operands. The lower 32 bits of this operand are used in the +/// comparison. +/// \returns A 128-bit vector of [4 x float] containing the comparison results +/// in the low-order bits. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmplt_ss(__m128 __a, __m128 __b) { return (__m128)__builtin_ia32_cmpltss(__a, __b); } +/// \brief Compares each of the corresponding packed 32-bit float values of the +/// 128-bit vectors of [4 x float] to determine if the values in the first +/// operand are less than those in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPLTPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] values. +/// \param __b +/// A 128-bit vector of [4 x float] values. +/// \returns A 128-bit vector of [4 x float] containing the comparison results. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmplt_ps(__m128 __a, __m128 __b) { return (__m128)__builtin_ia32_cmpltps(__a, __b); } +/// \brief Compares 2 32-bit float values in the low-order bits of both +/// operands to determine if the value in the first operand is less than or +/// equal +/// to the corresponding value in the second operand, and stores the +/// result of the comparison in the low-order bits of the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPLESS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the +/// operands. The lower 32 bits of this operand are used in the +/// comparison. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the +/// operands. The lower 32 bits of this operand are used in the +/// comparison. 
+/// \returns A 128-bit vector of [4 x float] containing the comparison results +/// in the low-order bits. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmple_ss(__m128 __a, __m128 __b) { return (__m128)__builtin_ia32_cmpless(__a, __b); } +/// \brief Compares each of the corresponding packed 32-bit float values of the +/// 128-bit vectors of [4 x float] to determine if the values in the first +/// operand are less than or equal to those in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPLEPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] values. +/// \param __b +/// A 128-bit vector of [4 x float] values. +/// \returns A 128-bit vector of [4 x float] containing the comparison results. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmple_ps(__m128 __a, __m128 __b) { return (__m128)__builtin_ia32_cmpleps(__a, __b); } +/// \brief Compares 2 32-bit float values in the low-order bits of both +/// operands to determine if the value in the first operand is greater than +/// the +/// corresponding value in the second operand, and stores the result of +/// the comparison in the low-order bits of the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPLTSS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the +/// operands. The lower 32 bits of this operand are used in the +/// comparison. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the +/// operands. The lower 32 bits of this operand are used in the +/// comparison. +/// \returns A 128-bit vector of [4 x float] containing the comparison results +/// in the low-order bits. 
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpgt_ss(__m128 __a, __m128 __b) { @@ -222,12 +659,45 @@ 4, 1, 2, 3); } +/// \brief Compares each of the corresponding packed 32-bit float values of the +/// 128-bit vectors of [4 x float] to determine if the values in the first +/// operand are greater than those in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPLTPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] values. +/// \param __b +/// A 128-bit vector of [4 x float] values. +/// \returns A 128-bit vector of [4 x float] containing the comparison results. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpgt_ps(__m128 __a, __m128 __b) { return (__m128)__builtin_ia32_cmpltps(__b, __a); } +/// \brief Compares 2 32-bit float values in the low-order bits of both +/// operands to determine if the value in the first operand is greater than +/// or +/// equal to the corresponding value in the second operand, and stores the +/// result of the comparison in the low-order bits of the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPLESS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the +/// operands. The lower 32 bits of this operand are used in the +/// comparison. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the +/// operands. The lower 32 bits of this operand are used in the +/// comparison. +/// \returns A 128-bit vector of [4 x float] containing the comparison results +/// in the low-order bits. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpge_ss(__m128 __a, __m128 __b) { @@ -236,48 +706,177 @@ 4, 1, 2, 3); } +/// \brief Compares each of the corresponding packed 32-bit float values of the +/// 128-bit vectors of [4 x float] to determine if the values in the first +/// operand are greater than or equal to those in the second operand. 
+/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPLEPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] values. +/// \param __b +/// A 128-bit vector of [4 x float] values. +/// \returns A 128-bit vector of [4 x float] containing the comparison results. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpge_ps(__m128 __a, __m128 __b) { return (__m128)__builtin_ia32_cmpleps(__b, __a); } +/// \brief Compares 2 32-bit float values in the low-order bits of both +/// operands for inequality, and stores the result of the comparison in the +/// low-order bits of the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPNEQSS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the +/// operands. The lower 32 bits of this operand are used in the +/// comparison. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the +/// operands. The lower 32 bits of this operand are used in the +/// comparison. +/// \returns A 128-bit vector of [4 x float] containing the comparison results +/// in the low-order bits. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpneq_ss(__m128 __a, __m128 __b) { return (__m128)__builtin_ia32_cmpneqss(__a, __b); } +/// \brief Compares each of the corresponding packed 32-bit float values of the +/// 128-bit vectors of [4 x float] for inequality. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPNEQPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] values. +/// \param __b +/// A 128-bit vector of [4 x float] values. +/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpneq_ps(__m128 __a, __m128 __b) { return (__m128)__builtin_ia32_cmpneqps(__a, __b); } +/// \brief Compares 2 32-bit float values in the low-order bits of both +/// operands to determine if the value in the first operand is not less than +/// the +/// corresponding value in the second operand, and stores the result of +/// the comparison in the low-order bits of the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPNLTSS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the +/// operands. The lower 32 bits of this operand are used in the +/// comparison. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the +/// operands. The lower 32 bits of this operand are used in the +/// comparison. +/// \returns A 128-bit vector of [4 x float] containing the comparison results +/// in the low-order bits. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnlt_ss(__m128 __a, __m128 __b) { return (__m128)__builtin_ia32_cmpnltss(__a, __b); } +/// \brief Compares each of the corresponding packed 32-bit float values of the +/// 128-bit vectors of [4 x float] to determine if the values in the first +/// operand are not less than those in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPNLTPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] values. +/// \param __b +/// A 128-bit vector of [4 x float] values. +/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnlt_ps(__m128 __a, __m128 __b) { return (__m128)__builtin_ia32_cmpnltps(__a, __b); } +/// \brief Compares 2 32-bit float values in the low-order bits of both +/// operands to determine if the value in the first operand is not less than +/// or +/// equal to the corresponding value in the second operand, and stores the +/// result of the comparison in the low-order bits of the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPNLESS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the +/// operands. The lower 32 bits of this operand are used in the +/// comparison. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the +/// operands. The lower 32 bits of this operand are used in the +/// comparison. +/// \returns A 128-bit vector of [4 x float] containing the comparison results +/// in the low-order bits. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnle_ss(__m128 __a, __m128 __b) { return (__m128)__builtin_ia32_cmpnless(__a, __b); } +/// \brief Compares each of the corresponding packed 32-bit float values of the +/// 128-bit vectors of [4 x float] to determine if the values in the first +/// operand are not less than or equal to those in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPNLEPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] values. +/// \param __b +/// A 128-bit vector of [4 x float] values. +/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnle_ps(__m128 __a, __m128 __b) { return (__m128)__builtin_ia32_cmpnleps(__a, __b); } +/// \brief Compares 2 32-bit float values in the low-order bits of both +/// operands to determine if the value in the first operand is not greater +/// than the +/// corresponding value in the second operand, and stores the result of +/// the comparison in the low-order bits of the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPNLTSS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the +/// operands. The lower 32 bits of this operand are used in the +/// comparison. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the +/// operands. The lower 32 bits of this operand are used in the +/// comparison. +/// \returns A 128-bit vector of [4 x float] containing the comparison results +/// in the low-order bits. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpngt_ss(__m128 __a, __m128 __b) { @@ -286,12 +885,45 @@ 4, 1, 2, 3); } +/// \brief Compares each of the corresponding packed 32-bit float values of the +/// 128-bit vectors of [4 x float] to determine if the values in the first +/// operand are not greater than those in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPNLTPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] values. +/// \param __b +/// A 128-bit vector of [4 x float] values. +/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpngt_ps(__m128 __a, __m128 __b) { return (__m128)__builtin_ia32_cmpnltps(__b, __a); } +/// \brief Compares 2 32-bit float values in the low-order bits of both +/// operands to determine if the value in the first operand is not greater +/// than or +/// equal to the corresponding value in the second operand, and stores the +/// result of the comparison in the low-order bits of the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPNLESS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the +/// operands. The lower 32 bits of this operand are used in the +/// comparison. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the +/// operands. The lower 32 bits of this operand are used in the +/// comparison. +/// \returns A 128-bit vector of [4 x float] containing the comparison results +/// in the low-order bits. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnge_ss(__m128 __a, __m128 __b) { @@ -300,114 +932,407 @@ 4, 1, 2, 3); } +/// \brief Compares each of the corresponding packed 32-bit float values of the +/// 128-bit vectors of [4 x float] to determine if the values in the first +/// operand are not greater than or equal to those in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPNLEPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] values. +/// \param __b +/// A 128-bit vector of [4 x float] values. +/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpnge_ps(__m128 __a, __m128 __b) { return (__m128)__builtin_ia32_cmpnleps(__b, __a); } +/// \brief Compares 2 32-bit float values in the low-order bits of both +/// operands to determine if the value in the first operand is ordered with +/// respect +/// to the corresponding value in the second operand, and stores the +/// result of the comparison in the low-order bits of the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPORDSS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the +/// operands. The lower 32 bits of this operand are used in the +/// comparison. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the +/// operands. The lower 32 bits of this operand are used in the +/// comparison. +/// \returns A 128-bit vector of [4 x float] containing the comparison results +/// in the low-order bits. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpord_ss(__m128 __a, __m128 __b) { return (__m128)__builtin_ia32_cmpordss(__a, __b); } +/// \brief Compares each of the corresponding packed 32-bit float values of the +/// 128-bit vectors of [4 x float] to determine if the values in the first +/// operand are ordered with respect to those in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPORDPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] values. +/// \param __b +/// A 128-bit vector of [4 x float] values. +/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpord_ps(__m128 __a, __m128 __b) { return (__m128)__builtin_ia32_cmpordps(__a, __b); } +/// \brief Compares 2 32-bit float values in the low-order bits of both +/// operands to determine if the value in the first operand is unordered +/// with +/// respect to the corresponding value in the second operand, and stores +/// the result of the comparison in the low-order bits of the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPUNORDSS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the +/// operands. The lower 32 bits of this operand are used in the +/// comparison. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the +/// operands. The lower 32 bits of this operand are used in the +/// comparison. +/// \returns A 128-bit vector of [4 x float] containing the comparison results +/// in the low-order bits. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpunord_ss(__m128 __a, __m128 __b) { return (__m128)__builtin_ia32_cmpunordss(__a, __b); } +/// \brief Compares each of the corresponding packed 32-bit float values of the +/// 128-bit vectors of [4 x float] to determine if the values in the first +/// operand are unordered with respect to those in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCMPUNORDPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] values. +/// \param __b +/// A 128-bit vector of [4 x float] values. +/// \returns A 128-bit vector of [4 x float] containing the comparison results. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cmpunord_ps(__m128 __a, __m128 __b) { return (__m128)__builtin_ia32_cmpunordps(__a, __b); } +/// \brief Compares 2 32-bit float values in the low-order bits of both +/// operands for equality, and stores the result of the comparison in the +/// destination. 
+/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCOMISS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] values. The lower 32 bits of +/// this operand are used in the comparison. +/// \param __b +/// A 128-bit vector of [4 x float] values. The lower 32 bits of +/// this operand are used in the comparison. +/// \returns An integer containing the comparison results. static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_ss(__m128 __a, __m128 __b) { return __builtin_ia32_comieq(__a, __b); } +/// \brief Compares 2 32-bit float values in the low-order bits of both +/// operands to determine if the first operand is less than the second +/// operand, and +/// stores the result of the comparison in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCOMISS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] values. The lower 32 bits of +/// this operand are used in the comparison. +/// \param __b +/// A 128-bit vector of [4 x float] values. The lower 32 bits of +/// this operand are used in the comparison. +/// \returns An integer containing the comparison results. static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_ss(__m128 __a, __m128 __b) { return __builtin_ia32_comilt(__a, __b); } +/// \brief Compares 2 32-bit float values in the low-order bits of both +/// operands to determine if the first operand is less than or equal to the +/// second +/// operand, and stores the result of the comparison in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCOMISS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] values. The lower 32 bits of +/// this operand are used in the comparison. +/// \param __b +/// A 128-bit vector of [4 x float] values. The lower 32 bits of +/// this operand are used in the comparison. +/// \returns An integer containing the comparison results. 
static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_ss(__m128 __a, __m128 __b) { return __builtin_ia32_comile(__a, __b); } +/// \brief Compares 2 32-bit float values in the low-order bits of both +/// operands to determine if the first operand is greater than the second +/// operand, +/// and stores the result of the comparison in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCOMISS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] values. The lower 32 bits of +/// this operand are used in the comparison. +/// \param __b +/// A 128-bit vector of [4 x float] values. The lower 32 bits of +/// this operand are used in the comparison. +/// \returns An integer containing the comparison results. static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_ss(__m128 __a, __m128 __b) { return __builtin_ia32_comigt(__a, __b); } +/// \brief Compares 2 32-bit float values in the low-order bits of both +/// operands to determine if the first operand is greater than or equal to +/// the +/// second operand, and stores the result of the comparison in the +/// destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCOMISS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] values. The lower 32 bits of +/// this operand are used in the comparison. +/// \param __b +/// A 128-bit vector of [4 x float] values. The lower 32 bits of +/// this operand are used in the comparison. +/// \returns An integer containing the comparison results. static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_ss(__m128 __a, __m128 __b) { return __builtin_ia32_comige(__a, __b); } +/// \brief Compares 2 32-bit float values in the low-order bits of both +/// operands to determine if the first operand is not equal to the second +/// operand, +/// and stores the result of the comparison in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCOMISS instruction. 
+/// +/// \param __a +/// A 128-bit vector of [4 x float] values. The lower 32 bits of +/// this operand are used in the comparison. +/// \param __b +/// A 128-bit vector of [4 x float] values. The lower 32 bits of +/// this operand are used in the comparison. +/// \returns An integer containing the comparison results. static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_ss(__m128 __a, __m128 __b) { return __builtin_ia32_comineq(__a, __b); } +/// \brief Performs an unordered comparison of 2 32-bit float values using the +/// low-order bits of both operands to determine equality, and stores the +/// result of the comparison in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VUCOMISS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] values. The lower 32 bits of +/// this operand are used in the comparison. +/// \param __b +/// A 128-bit vector of [4 x float] values. The lower 32 bits of +/// this operand are used in the comparison. +/// \returns An integer containing the comparison results. static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_ss(__m128 __a, __m128 __b) { return __builtin_ia32_ucomieq(__a, __b); } +/// \brief Performs an unordered comparison of 2 32-bit float values using the +/// low-order bits of both operands to determine if the first operand is +/// less than the second operand, and stores the result of the comparison +/// in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VUCOMISS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] values. The lower 32 bits of +/// this operand are used in the comparison. +/// \param __b +/// A 128-bit vector of [4 x float] values. The lower 32 bits of +/// this operand are used in the comparison. +/// \returns An integer containing the comparison results. 
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_ss(__m128 __a, __m128 __b) { return __builtin_ia32_ucomilt(__a, __b); } +/// \brief Performs an unordered comparison of 2 32-bit float values using the +/// low-order bits of both operands to determine if the first operand is +/// less than or equal to the second operand, and stores the result of the +/// comparison in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VUCOMISS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] values. The lower 32 bits of +/// this operand are used in the comparison. +/// \param __b +/// A 128-bit vector of [4 x float] values. The lower 32 bits of +/// this operand are used in the comparison. +/// \returns An integer containing the comparison results. static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_ss(__m128 __a, __m128 __b) { return __builtin_ia32_ucomile(__a, __b); } +/// \brief Performs an unordered comparison of 2 32-bit float values using the +/// low-order bits of both operands to determine if the first operand is +/// greater than the second operand, and stores the result of the +/// comparison in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VUCOMISS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] values. The lower 32 bits of +/// this operand are used in the comparison. +/// \param __b +/// A 128-bit vector of [4 x float] values. The lower 32 bits of +/// this operand are used in the comparison. +/// \returns An integer containing the comparison results. static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_ss(__m128 __a, __m128 __b) { return __builtin_ia32_ucomigt(__a, __b); } +/// \brief Performs an unordered comparison of 2 32-bit float values using the +/// low-order bits of both operands to determine if the first operand is +/// greater than or equal to the second operand, and stores the result of +/// the comparison in the destination. 
+/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VUCOMISS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] values. The lower 32 bits of +/// this operand are used in the comparison. +/// \param __b +/// A 128-bit vector of [4 x float] values. The lower 32 bits of +/// this operand are used in the comparison. +/// \returns An integer containing the comparison results. static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_ss(__m128 __a, __m128 __b) { return __builtin_ia32_ucomige(__a, __b); } +/// \brief Performs an unordered comparison of 2 32-bit float values using the +/// low-order bits of both operands to determine inequality, and stores +/// the result of the comparison in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VUCOMISS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] values. The lower 32 bits of +/// this operand are used in the comparison. +/// \param __b +/// A 128-bit vector of [4 x float] values. The lower 32 bits of +/// this operand are used in the comparison. +/// \returns An integer containing the comparison results. static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_ss(__m128 __a, __m128 __b) { return __builtin_ia32_ucomineq(__a, __b); } +/// \brief Converts a float value contained in the lower 32 bits of a vector of +/// [4 x float] into a 32-bit integer. The result is written to the +/// destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCVTSS2SI instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The lower 32 bits of this +/// operand are used in the conversion. +/// \returns A 32-bit integer containing the converted value. static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtss_si32(__m128 __a) { return __builtin_ia32_cvtss2si(__a); } +/// \brief Converts a float value contained in the lower 32 bits of a vector of +/// [4 x float] into a 32-bit integer. The result is written to the +/// destination. 
+/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCVTSS2SI instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The lower 32 bits of this +/// operand are used in the conversion. +/// \returns A 32-bit integer containing the converted value. static __inline__ int __DEFAULT_FN_ATTRS _mm_cvt_ss2si(__m128 __a) { @@ -416,6 +1341,18 @@ #ifdef __x86_64__ +/// \brief Converts a float value contained in the lower 32 bits of a vector of +/// [4 x float] into a 64-bit integer. The result is written to the +/// destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCVTSS2SI instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The lower 32 bits of this +/// operand are used in the conversion. +/// \returns A 64-bit integer containing the converted value. static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtss_si64(__m128 __a) { @@ -424,48 +1361,145 @@ #endif +/// \brief Converts 2 low-order float values in a 128-bit vector of [4 x float] +/// into a 64-bit vector of [2 x i32]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c CVTPS2PI instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \returns A 64-bit integer vector containing the converted values. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtps_pi32(__m128 __a) { return (__m64)__builtin_ia32_cvtps2pi(__a); } +/// \brief Converts 2 low-order float values in a 128-bit vector of [4 x float] +/// into a 64-bit vector of [2 x i32]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c CVTPS2PI instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \returns A 64-bit integer vector containing the converted values. 
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvt_ps2pi(__m128 __a) { return _mm_cvtps_pi32(__a); } +/// \brief Converts a float value contained in the lower 32 bits of a vector of +/// [4 x float] into a 32-bit integer, truncating the result when it is +/// inexact. The result is written to the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCVTTSS2SI instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The lower 32 bits of this +/// operand are used in the conversion. +/// \returns A 32-bit integer containing the converted value. static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttss_si32(__m128 __a) { return __a[0]; } +/// \brief Converts a float value contained in the lower 32 bits of a vector of +/// [4 x float] into a 32-bit integer, truncating the result when it is +/// inexact. The result is written to the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCVTTSS2SI instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The lower 32 bits of this +/// operand are used in the conversion. +/// \returns A 32-bit integer containing the converted value. static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtt_ss2si(__m128 __a) { return _mm_cvttss_si32(__a); } +/// \brief Converts a float value contained in the lower 32 bits of a vector of +/// [4 x float] into a 64-bit integer, truncating the result when it is +/// inexact. The result is written to the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCVTTSS2SI instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The lower 32 bits of this +/// operand are used in the conversion. +/// \returns A 64-bit integer containing the converted value. 
static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvttss_si64(__m128 __a) { return __a[0]; } +/// \brief Converts 2 low-order float values in a 128-bit vector of [4 x float] +/// into a 64-bit vector of [2 x i32], truncating the result when it is +/// inexact. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c CVTTPS2PI instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \returns A 64-bit integer vector containing the converted values. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvttps_pi32(__m128 __a) { return (__m64)__builtin_ia32_cvttps2pi(__a); } +/// \brief Converts 2 low-order float values in a 128-bit vector of [4 x float] +/// into a 64-bit vector of [2 x i32], truncating the result when it is +/// inexact. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c CVTTPS2PI instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \returns A 64-bit integer vector containing the converted values. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtt_ps2pi(__m128 __a) { return _mm_cvttps_pi32(__a); } +/// \brief Converts a 32-bit signed integer value into a vector of [4 x float], +/// writing the result to the lower 32 bits of the destination. The +/// remaining higher order elements of the destination are copied from the +/// corresponding elements in the first operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCVTSI2SS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The upper 96 bits of the +/// destination are copied from the corresponding elements in +/// this operand. +/// \param __b +/// A 32-bit signed integer operand containing the value to be +/// converted. +/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the +/// converted value of the second operand. The upper 96 bits are copied +/// from the upper 96 bits of the first operand. 
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsi32_ss(__m128 __a, int __b) { @@ -473,6 +1507,25 @@ return __a; } +/// \brief Converts a 32-bit signed integer value into a vector of [4 x float], +/// writing the result to the lower 32 bits of the destination. The +/// remaining higher order elements of the destination are copied from the +/// corresponding elements in the first operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCVTSI2SS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The upper 96 bits of the +/// destination are copied from the corresponding elements in +/// this operand. +/// \param __b +/// A 32-bit signed integer operand containing the value to be +/// converted. +/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the +/// converted value of the second operand. The upper 96 bits are copied +/// from the upper 96 bits of the first operand. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvt_si2ss(__m128 __a, int __b) { @@ -481,6 +1534,25 @@ #ifdef __x86_64__ +/// \brief Converts a 64-bit signed integer value into a vector of [4 x float], +/// writing the result to the lower 32 bits of the destination. The +/// remaining higher order elements of the destination are copied from the +/// corresponding elements in the first operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VCVTSI2SS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The upper 96 bits of the +/// destination are copied from the corresponding elements in +/// this operand. +/// \param __b +/// A 64-bit signed integer operand containing the value to be +/// converted. +/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the +/// converted value of the second operand. The upper 96 bits are copied +/// from the upper 96 bits of the first operand. 
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsi64_ss(__m128 __a, long long __b) { @@ -490,24 +1562,85 @@ #endif +/// \brief Converts a 64-bit vector of [2 x i32] into a 128-bit vector of [4 x +/// float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c CVTPI2PS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The upper 64 bits of this +/// operand are copied to the destination. +/// \param __b +/// A 64-bit vector of [2 x i32]. The elements in this vector +/// are converted and written to the corresponding low-order +/// elements in the destination. +/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the +/// converted value of the second operand. The upper 64 bits are copied +/// from the upper 64 bits of the first operand. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpi32_ps(__m128 __a, __m64 __b) { return __builtin_ia32_cvtpi2ps(__a, (__v2si)__b); } +/// \brief Converts a 64-bit vector of [2 x i32] into a 128-bit vector of [4 x +/// float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c CVTPI2PS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The remaining higher order +/// elements of the destination are copied from the +/// corresponding elements in this operand. +/// \param __b +/// A 64-bit vector of [2 x i32]. The elements in this vector +/// are converted and written to the corresponding low-order +/// elements in the destination. +/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the +/// converted value from the second operand. The upper 64 bits are copied +/// from the upper 64 bits of the first operand. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvt_pi2ps(__m128 __a, __m64 __b) { return _mm_cvtpi32_ps(__a, __b); } +/// \brief Extracts a float value contained in the lower 32 bits of a vector of +/// [4 x float] into a 32-bit float. The result is written to the +/// destination. 
+/// +/// \headerfile +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The lower 32 bits of this +/// operand are used in the extraction. +/// \returns A 32-bit float containing the extracted value. static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtss_f32(__m128 __a) { return __a[0]; } +/// \brief Loads float values into the high-order bits of a 128-bit vector of +/// [4 x float]. The low-order bits are copied from the low-order bits of +/// the +/// first operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c MOVLHPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// Bits [63:0] are written to bits [63:0] of the destination. +/// \param __p +/// A pointer to float values. +/// Bits [63:0] are written to bits [127:64] of the destination. +/// \returns A 128-bit vector of [4 x float] containing the moved values. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadh_pi(__m128 __a, const __m64 *__p) { @@ -520,6 +1653,22 @@ return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5); } +/// \brief Loads float values into the low-order bits of a 128-bit vector of [4 +/// x float]. The high-order bits are copied from the high-order bits of the +/// first operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c MOVLPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// Bits [127:64] are written to bits [127:64] of the +/// destination. +/// \param __p +/// A pointer to float values. +/// Bits [63:0] are written to bits [63:0] of the destination. +/// \returns A 128-bit vector of [4 x float] containing the moved values. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadl_pi(__m128 __a, const __m64 *__p) { @@ -532,6 +1681,17 @@ return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3); } +/// \brief Loads a single float value to the low element in a 128-bit float +/// vector and clears the upper elements.
+/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVSS instruction. +/// +/// \param __p +/// An aligned pointer to a memory location containing a 32-bit +/// float value. +/// \returns A 128-bit vector of [4 x float] containing the moved value. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_load_ss(const float *__p) { @@ -542,6 +1702,15 @@ return (__m128){ __u, 0, 0, 0 }; } +/// \brief Moves and duplicates one float value to float values stored in a +/// packed 128-bit vector of [4 x float]. +/// +/// \headerfile +/// +/// \param __p +/// A pointer to a float value to be moved and duplicated. +/// \returns A 128-bit vector of [4 x float] containing the moved and +/// duplicated values. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_load1_ps(const float *__p) { @@ -554,12 +1723,33 @@ #define _mm_load_ps1(p) _mm_load1_ps(p) +/// \brief Moves packed float values from an aligned memory location to 32-bit +/// elements in a 128-bit vector of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVAPS instruction. +/// +/// \param __p +/// A 16-byte aligned pointer to a memory location containing +/// float values. +/// \returns A 128-bit vector of [4 x float] containing the moved values. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_load_ps(const float *__p) { return *(__m128*)__p; } +/// \brief Moves packed float values from an unaligned memory location to +/// 32-bit elements in a 128-bit vector of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVUPS instruction. +/// +/// \param __p +/// A pointer to a memory location containing float values. +/// \returns A 128-bit vector of [4 x float] containing the moved values. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadu_ps(const float *__p) { @@ -569,6 +1759,18 @@ return ((struct __loadu_ps*)__p)->__v; } +/// \brief Moves packed float values, in reverse order, from an aligned memory +/// location to 32-bit elements in a 128-bit vector of [4 x float].
+/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVAPS+shuffling instruction. +/// +/// \param __p +/// A 16-byte aligned pointer to a memory location containing +/// float values. +/// \returns A 128-bit vector of [4 x float] containing the moved values, loaded +/// in reverse order. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_loadr_ps(const float *__p) { @@ -582,12 +1784,34 @@ return (__m128)__builtin_ia32_undef128(); } +/// \brief Initializes a 128-bit vector of [4 x float] with the specified +/// 32-bit float value. +/// +/// \headerfile +/// +/// \param __w +/// A float value used to initialize the lower 32 bits of the +/// destination vector of [4 x float]. The upper bits of the +/// destination are set to zero. +/// \returns An initialized 128-bit vector of [4 x float] containing the value +/// provided in the operand. The upper bits of the destination are set to +/// zero. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_set_ss(float __w) { return (__m128){ __w, 0, 0, 0 }; } +/// \brief Initializes all elements of a 128-bit vector of [4 x float] with the +/// specified 32-bit float value. +/// +/// \headerfile +/// +/// \param __w +/// A float value used to initialize all elements of the +/// destination vector of [4 x float]. +/// \returns An initialized 128-bit vector of [4 x float] in which all elements +/// contain the value provided in the operand. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_set1_ps(float __w) { @@ -595,42 +1819,137 @@ } /* Microsoft specific. */ +/// \brief Initializes all elements of a 128-bit vector of [4 x float] with the +/// specified 32-bit float value. +/// +/// \headerfile +/// +/// \param __w +/// A float value used to initialize all elements of the +/// destination vector of [4 x float]. +/// \returns An initialized 128-bit vector of [4 x float] in which all elements +/// contain the value provided in the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_set_ps1(float __w) { return _mm_set1_ps(__w); } +/// \brief Initializes the float values in a 128-bit vector of [4 x float] with +/// the specified 32-bit float values. +/// The following code illustrates this intrinsic's behavior: +/// +/// \headerfile +/// +/// \param __z +/// A float value used to initialize the bits [127:96] of the +/// destination vector of [4 x float]. +/// \param __y +/// A float value used to initialize the bits [95:64] of the +/// destination vector of [4 x float]. +/// \param __x +/// A float value used to initialize the bits [63:32] of the +/// destination vector of [4 x float]. +/// \param __w +/// A float value used to initialize the bits [31:0] of the +/// destination vector of [4 x float]. +/// \returns An initialized 128-bit vector of [4 x float] containing the values +/// provided in the operands. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_set_ps(float __z, float __y, float __x, float __w) { return (__m128){ __w, __x, __y, __z }; } +/// \brief Initializes the float values in a 128-bit vector of [4 x float] in +/// reverse order with the specified 32-bit float values. +/// The following code illustrates this intrinsic's behavior: +/// +/// \headerfile +/// +/// \param __z +/// A float value used to initialize the bits [31:0] of the +/// destination vector of [4 x float]. +/// \param __y +/// A float value used to initialize the bits [63:32] of the +/// destination vector of [4 x float]. +/// \param __x +/// A float value used to initialize the bits [95:64] of the +/// destination vector of [4 x float]. +/// \param __w +/// A float value used to initialize the bits [127:96] of the +/// destination vector of [4 x float]. +/// \returns An initialized 128-bit vector of [4 x float] containing the values +/// provided in the operands, loaded in reverse order.
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_setr_ps(float __z, float __y, float __x, float __w) { return (__m128){ __z, __y, __x, __w }; } +/// \brief Sets the 32-bit float registers to zero. +/// +/// \headerfile +/// +/// \returns An initialized 128-bit vector of [4 x float] with all elements set +/// to zero. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_setzero_ps(void) { return (__m128){ 0, 0, 0, 0 }; } +/// \brief Moves the packed float values from the upper 64 bits of a 128-bit +/// vector of [4 x float] to a memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VPEXTRQ instruction. +/// +/// \param __p +/// A pointer to a memory location that will receive the float +/// values. +/// \param __a +/// A packed 128-bit vector of [4 x float] containing the values +/// to be moved. static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pi(__m64 *__p, __m128 __a) { __builtin_ia32_storehps((__v2si *)__p, __a); } +/// \brief Moves the packed float values from the lower 64 bits of a 128-bit +/// vector of [4 x float] to a memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVLPS instruction. +/// +/// \param __p +/// A pointer to a memory location that will receive the float +/// values. +/// \param __a +/// A packed 128-bit vector of [4 x float] containing the values +/// to be moved. static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pi(__m64 *__p, __m128 __a) { __builtin_ia32_storelps((__v2si *)__p, __a); } +/// \brief Moves the packed float value from the lower 32 bits of a 128-bit +/// vector of [4 x float] to a memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVSS instruction. +/// +/// \param __p +/// A pointer to a memory location that will receive the float +/// value. +/// \param __a +/// A packed 128-bit vector of [4 x float] containing the value +/// to be moved. 
static __inline__ void __DEFAULT_FN_ATTRS _mm_store_ss(float *__p, __m128 __a) { @@ -640,12 +1959,36 @@ ((struct __mm_store_ss_struct*)__p)->__u = __a[0]; } +/// \brief Moves packed float values from a 128-bit vector of [4 x float] to an +/// unaligned memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVUPS instruction. +/// +/// \param __p +/// A pointer to an unaligned memory location that can store 4 +/// float values. +/// \param __a +/// A packed 128-bit vector of [4 x float] containing the values +/// to be moved. static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_ps(float *__p, __m128 __a) { __builtin_ia32_storeups(__p, __a); } +/// \brief Moves the lower 32 bits of a 128-bit vector of [4 x float] four +/// times to all the elements of a memory location. +/// +/// \headerfile +/// +/// \param __p +/// A pointer to a memory location that can store 4 float +/// values. +/// \param __a +/// A 128-bit vector of [4 x float] whose lower 32 bits are +/// copied to each of the values in __p. static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_ps(float *__p, __m128 __a) { @@ -653,18 +1996,55 @@ _mm_storeu_ps(__p, __a); } +/// \brief Moves the lower 32 bits of a 128-bit vector of [4 x float] four +/// times to all the elements of a memory location. +/// +/// \headerfile +/// +/// \param __p +/// A pointer to a memory location that can store 4 float +/// values. +/// \param __a +/// A 128-bit vector of [4 x float] whose lower 32 bits are +/// copied to each of the values in __p. static __inline__ void __DEFAULT_FN_ATTRS _mm_store_ps1(float *__p, __m128 __a) { return _mm_store1_ps(__p, __a); } +/// \brief Moves packed float values from a 128-bit vector of [4 x float] to a +/// memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVAPS instruction. +/// +/// \param __p +/// A pointer to an aligned memory location that can store 4 +/// float values. 
+/// \param __a +/// A packed 128-bit vector of [4 x float] containing the values +/// to be moved. static __inline__ void __DEFAULT_FN_ATTRS _mm_store_ps(float *__p, __m128 __a) { *(__m128 *)__p = __a; } +/// \brief Moves packed float values, in reverse order, from a 128-bit vector +/// of [4 x float] to a memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVAPS+shuffling instruction. +/// +/// \param __p +/// A pointer to an aligned memory location that can store 4 +/// float values, which are loaded in reverse order. +/// \param __a +/// A packed 128-bit vector of [4 x float] containing the values +/// to be moved. static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_ps(float *__p, __m128 __a) { @@ -681,27 +2061,111 @@ /* FIXME: We have to #define this because "sel" must be a constant integer, and Sema doesn't do any form of constant propagation yet. */ +/// \brief Loads one cache line of data from the specified address to a +/// location closer to the processor. +/// +/// \headerfile +/// +/// \code +/// void _mm_prefetch(const void * a, const int sel); +/// \endcode +/// +/// This intrinsic corresponds to \c PREFETCHNTA instruction. +/// +/// \param a +/// A pointer to a memory location containing a cache line of +/// data. +/// \param sel +/// A predefined integer constant specifying the type of +/// prefetch operation: +/// _MM_HINT_NTA: Move +/// data using the non-temporal access (NTA) hint. +/// The PREFETCHNTA +/// instruction will be generated. +/// _MM_HINT_T0: Move data +/// using the T0 hint. The PREFETCHT0 +/// instruction will be generated. +/// _MM_HINT_T1: Move data +/// using the T1 hint. The PREFETCHT1 +/// instruction will be generated. +/// _MM_HINT_T2: Move data +/// using the T2 hint. The PREFETCHT2 +/// instruction will be generated. #define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel))) #endif +/// \brief Stores a 64-bit integer in the specified aligned memory location. 
To +/// minimize caching, the data is flagged as non-temporal (unlikely to be +/// used again soon). +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c MOVNTQ instruction. +/// +/// \param __p +/// The aligned memory location used to store the register +/// value. +/// \param __a +/// A 64-bit integer containing the value to be stored. static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pi(__m64 *__p, __m64 __a) { __builtin_ia32_movntq(__p, __a); } +/// \brief Moves packed float values from a 128-bit vector of [4 x float] to a +/// 128-bit aligned memory location. To minimize caching, the data is +/// flagged as non-temporal (unlikely to be used again soon). +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVNTPS instruction. +/// +/// \param __p +/// A 128-bit aligned pointer to a memory location that will +/// receive the float values. +/// \param __a +/// A 128-bit vector of [4 x float] containing the values to be +/// moved. static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_ps(float *__p, __m128 __a) { __builtin_ia32_movntps(__p, __a); } +/// \brief Forces strong memory ordering (serialization) between store +/// instructions preceding this instruction and store instructions +/// following this instruction, assuring the system completes all previous +/// stores before executing subsequent stores. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c SFENCE instruction. +/// static __inline__ void __DEFAULT_FN_ATTRS _mm_sfence(void) { __builtin_ia32_sfence(); } +/// \brief Extracts 16 bits of packed data from a 64-bit integer vector and +/// copies it to the destination, as specified by the immediate integer +/// operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PEXTRW instruction. +/// +/// \param __a +/// A 64-bit integer vector. +/// \param __n +/// An immediate integer operand that determines which bits are +/// extracted: +/// 0: Bits [15:0] are copied to the destination.
+/// 1: Bits [31:16] are copied to the destination. +/// 2: Bits [47:32] are copied to the destination. +/// 3: Bits [63:48] are copied to the destination. +/// \returns A 16-bit integer containing the extracted 16 bits of packed data. static __inline__ int __DEFAULT_FN_ATTRS _mm_extract_pi16(__m64 __a, int __n) { @@ -709,6 +2173,27 @@ return (unsigned short)__b[__n & 3]; } +/// \brief Copies packed data from the 64-bit integer vector operand to the +/// destination, and inserts the lower 16-bits of an integer operand, +/// using the offset specified by the immediate operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PINSRW instruction. +/// +/// \param __a +/// A 64-bit integer vector. The remaining bits in the +/// destination are copied from the corresponding bits in this +/// operand. +/// \param __d +/// An integer. The bits of this operand are written to the +/// destination beginning at the offset specified by operand __n. +/// \param __n +/// Specifies the bit offset to be used in the destination. The +/// remaining bits in the destination are copied from the +/// corresponding bits in operand __a. +/// \returns A 64-bit integer vector containing the copied packed data from the +/// operands. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_insert_pi16(__m64 __a, int __d, int __n) { @@ -717,117 +2202,484 @@ return (__m64)__b; } +/// \brief Compares each of the corresponding packed 16-bit integer values of +/// the 64-bit integer vectors, and writes the greater value to the +/// corresponding bits in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PMAXSW instruction. +/// +/// \param __a +/// A 64-bit integer vector containing one of the source +/// operands. +/// \param __b +/// A 64-bit integer vector containing one of the source +/// operands. +/// \returns A 64-bit integer vector containing the comparison results. 
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_max_pi16(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b); } +/// \brief Compares each of the corresponding packed 8-bit unsigned integer +/// values of the 64-bit integer vectors, and writes the greater value to +/// the corresponding bits in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PMAXUB instruction. +/// +/// \param __a +/// A 64-bit integer vector containing one of the source +/// operands. +/// \param __b +/// A 64-bit integer vector containing one of the source +/// operands. +/// \returns A 64-bit integer vector containing the comparison results. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_max_pu8(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b); } +/// \brief Compares each of the corresponding packed 16-bit integer values of +/// the 64-bit integer vectors, and writes the lesser value to the +/// corresponding bits in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PMINSW instruction. +/// +/// \param __a +/// A 64-bit integer vector containing one of the source +/// operands. +/// \param __b +/// A 64-bit integer vector containing one of the source +/// operands. +/// \returns A 64-bit integer vector containing the comparison results. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_min_pi16(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b); } +/// \brief Compares each of the corresponding packed 8-bit unsigned integer +/// values of the 64-bit integer vectors, and writes the lesser value to +/// the corresponding bits in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PMINUB instruction. +/// +/// \param __a +/// A 64-bit integer vector containing one of the source +/// operands. +/// \param __b +/// A 64-bit integer vector containing one of the source +/// operands. 
+/// \returns A 64-bit integer vector containing the comparison results. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_min_pu8(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b); } +/// \brief Copies the values of the most significant bits from each 8-bit +/// element in a 64-bit integer vector to create an 8-bit mask value, +/// zero-extends +/// the value, and writes it to the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PMOVMSKB instruction. +/// +/// \param __a +/// A 64-bit integer vector containing the values with bits to +/// be extracted. +/// \returns The most significant bits from each 8-bit element in the operand, +/// written to bits [7:0]. static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pi8(__m64 __a) { return __builtin_ia32_pmovmskb((__v8qi)__a); } +/// \brief Multiplies packed 16-bit unsigned integer values and writes the +/// high-order 16 bits of each 32-bit product to the corresponding bits in +/// the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PMULHUW instruction. +/// +/// \param __a +/// A 64-bit integer vector containing one of the source +/// operands. +/// \param __b +/// A 64-bit integer vector containing one of the source +/// operands. +/// \returns A 64-bit integer vector containing the products of both operands. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mulhi_pu16(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b); } +/// \brief Shuffles the 4 16-bit integers from a 64-bit integer vector to the +/// destination, as specified by the immediate value operand. +/// +/// \headerfile +/// +/// \code +/// __m64 _mm_shuffle_pi16(__m64 a, const int n); +/// \endcode +/// +/// \param a +/// A 64-bit integer vector containing the values to be copied. +/// \param n +/// An 8-bit immediate value specifying which 16-bit elements +/// to copy from a.
Bits [1:0], [3:2], [5:4], and +/// [7:6] select the element of a that is copied to bits +/// [15:0], [31:16], [47:32], and [63:48] of the destination, respectively. +/// \returns A 64-bit integer vector containing the shuffled values. #define _mm_shuffle_pi16(a, n) __extension__ ({ \ (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)); }) +/// \brief Conditionally copies the values from each 8-bit element in the first +/// 64-bit integer vector operand to the specified memory location, as +/// specified by the most significant bit in the corresponding element in +/// the second 64-bit integer vector operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c MASKMOVQ instruction. +/// +/// \param __d +/// A 64-bit integer vector containing the values with elements +/// to be copied. +/// \param __n +/// A 64-bit integer vector operand. The most significant bit +/// from each 8-bit element determines whether the corresponding +/// element in operand __d is copied. If the +/// most significant bit of a given element is 1, the +/// corresponding element in operand __d is +/// copied. +/// \param __p +/// A 64-bit aligned pointer to a memory location that will +/// receive the conditionally copied integer values. static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p) { __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p); } +/// \brief Computes the rounded averages of the packed unsigned 8-bit integer +/// values and writes the averages to the corresponding bits in the +/// destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PAVGB instruction. +/// +/// \param __a +/// A 64-bit integer vector containing one of the source +/// operands. +/// \param __b +/// A 64-bit integer vector containing one of the source +/// operands. +/// \returns A 64-bit integer vector containing the averages of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_avg_pu8(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b); } +/// \brief Computes the rounded averages of the packed unsigned 16-bit integer +/// values and writes the averages to the corresponding bits in the +/// destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PAVGW instruction. +/// +/// \param __a +/// A 64-bit integer vector containing one of the source +/// operands. +/// \param __b +/// A 64-bit integer vector containing one of the source +/// operands. +/// \returns A 64-bit integer vector containing the averages of both operands. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_avg_pu16(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b); } +/// \brief Subtracts 8-bit unsigned integer values and computes the absolute +/// differences to the corresponding bits in the destination. Then the sum of +/// the absolute differences is written to bits [15:0] of the destination. +/// The upper elements in the destination are cleared. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c PSADBW instruction. +/// +/// \param __a +/// A 64-bit integer vector containing one of the source +/// operands. +/// \param __b +/// A 64-bit integer vector containing one of the source +/// operands. +/// \returns A 64-bit integer vector whose lower 16 bits contain the sums of +/// the sets of absolute differences between both operands. The upper bits +/// are +/// cleared. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sad_pu8(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b); } +/// \brief Saves the content of the MXCSR register to the 32-bit unsigned +/// integer destination. +/// There are several classes of macros available with this intrinsic. +/// These macros include masks for getting or setting exceptions, rounding +/// modes, flushing, and denormalization.
+/// The following macros are exception flag masks that are used with +/// _mm_getcsr() to check if certain exceptions +/// have been raised: +/// For example, the following expression checks if an overflow exception +/// has occurred: +/// This expression checks for division by zero: +/// This expression checks if any exception has occurred: +/// The following macros are used to get or set rounding modes: +/// This expression gets the current rounding mode: +/// Note that the FLUSH and DENORMALS masks can be used to check the FZ +/// (flush to zero) and DAZ (denormals are zero) modes. +/// The following macros, which wrap _mm_getcsr() and _mm_setcsr(), are used +/// as convenience wrappers to easily get and set exception, flushing, +/// denormalization, and rounding mode states: +/// This expression gets the current rounding mode: +/// This statement checks for floating-point exceptions: +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VSTMXCSR instruction. +/// +/// \returns A 32-bit unsigned integer containing the content of the MXCSR +/// register. static __inline__ unsigned int __DEFAULT_FN_ATTRS _mm_getcsr(void) { return __builtin_ia32_stmxcsr(); } +/// \brief Loads the MXCSR register with the value stored in the 32-bit +/// unsigned integer operand. +/// There are several classes of macros available with this intrinsic. +/// These macros include masks for getting or setting exceptions, rounding +/// modes, flushing, and denormalization. +/// The following mask macros are used to set exception masks: +/// For example, the following expression sets a mask to ignore underflow +/// exceptions: +/// This expression turns underflow exceptions back on: +/// The following macros are used to get or set rounding modes: +/// For example, the following expression causes floating-point operations +/// to round up: +/// Note that the FLUSH and DENORMALS masks can be used to check the FZ +/// (flush to zero) and DAZ (denormals are zero) modes.
+/// The following macros, which wrap _mm_getcsr() and _mm_setcsr(), are used +/// as convenience wrappers to easily get and set exception, flushing, +/// denormalization, and rounding mode states: +/// For example, this code sets the DAZ and FZ flags: +/// This expression sets the rounding mode: +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VLDMXCSR instruction. +/// +/// \param __i +/// A 32-bit unsigned integer operand whose value is loaded into +/// the MXCSR register. static __inline__ void __DEFAULT_FN_ATTRS _mm_setcsr(unsigned int __i) { __builtin_ia32_ldmxcsr(__i); } +/// \brief Selects 4 float values from the 128-bit operands of [4 x float], as +/// specified by the immediate value operand. +/// +/// \headerfile +/// +/// \code +/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask); +/// \endcode +/// +/// This intrinsic corresponds to \c VSHUFPS instruction. +/// +/// \param a +/// A 128-bit vector of [4 x float]. +/// \param b +/// A 128-bit vector of [4 x float]. +/// \param mask +/// An immediate value containing an 8-bit value specifying +/// which elements to copy from a and b. Bits [3:0] specify the values +/// copied +/// from operand a. Bits [7:4] specify the +/// values copied from operand b. The +/// destinations within the 128-bit destination are assigned +/// values as follows: +/// Bits [1:0] are used to assign values to bits [31:0] in the +/// destination. +/// Bits [3:2] are used to assign values to bits [63:32] in the +/// destination. +/// Bits [5:4] are used to assign values to bits [95:64] in the +/// destination. +/// Bits [7:6] are used to assign values to bits [127:96] in the +/// destination. +/// Bit value assignments: +/// 00: Bits [31:0] copied from the specified operand. +/// 01: Bits [63:32] copied from the specified operand. +/// 10: Bits [95:64] copied from the specified operand. +/// 11: Bits [127:96] copied from the specified operand.
+/// \returns A 128-bit vector of [4 x float] containing the shuffled values. #define _mm_shuffle_ps(a, b, mask) __extension__ ({ \ (__m128)__builtin_shufflevector((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \ (mask) & 0x3, ((mask) & 0xc) >> 2, \ (((mask) & 0x30) >> 4) + 4, \ (((mask) & 0xc0) >> 6) + 4); }) +/// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors +/// of [4 x float] and interleaves them into a packed 128-bit vector of [4 x +/// float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VUNPCKHPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// Bits [95:64] are written to bits [31:0] of the destination. +/// Bits [127:96] are written to bits [95:64] of the +/// destination. +/// \param __b +/// A 128-bit vector of [4 x float]. +/// Bits [95:64] are written to bits [63:32] of the destination. +/// Bits [127:96] are written to bits [127:96] of the +/// destination. +/// \returns A 128-bit vector of [4 x float] containing the interleaved values. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_unpackhi_ps(__m128 __a, __m128 __b) { return __builtin_shufflevector(__a, __b, 2, 6, 3, 7); } +/// \brief Unpacks the low-order (index 0,1) values from two 128-bit vectors of +/// [4 x float] and interleaves them into a packed 128-bit vector of [4 x +/// float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VUNPCKLPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// Bits [31:0] are written to bits [31:0] of the destination. +/// Bits [63:32] are written to bits [95:64] of the destination. +/// \param __b +/// A 128-bit vector of [4 x float]. +/// Bits [31:0] are written to bits [63:32] of the destination. +/// Bits [63:32] are written to bits [127:96] of the +/// destination. +/// \returns A 128-bit vector of [4 x float] containing the interleaved values. 
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_unpacklo_ps(__m128 __a, __m128 __b) { return __builtin_shufflevector(__a, __b, 0, 4, 1, 5); } +/// \brief Moves the low-order 32-bit element from the second operand to the +/// low-order element of the destination, and copies the corresponding +/// upper elements from the first operand. +/// The following code illustrates this intrinsics behavior: +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVSS instruction. +/// +/// \param __a +/// 128-bit vector of [4 x float]. The upper 96 bits of this +/// operand are copied to the upper 96 bits of the destination. +/// \param __b +/// 128-bit vector of [4 x float]. The lower 32 bits of this +/// operand are copied to the lower 32 bits of the destination. +/// \returns A 128-bit vector of [4 x float] containing the moved values. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_move_ss(__m128 __a, __m128 __b) { return __builtin_shufflevector(__a, __b, 4, 1, 2, 3); } +/// \brief Moves the 2 high-order 32-bit elements from the second operand to +/// the low-order elements of the destination, and copies the corresponding +/// upper elements from the first operand. +/// The following code illustrates this intrinsics behavior: +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVHLPS instruction. +/// +/// \param __a +/// 128-bit vector of [4 x float]. The upper 64 bits of this +/// operand are copied to the upper 64 bits of the destination. +/// \param __b +/// 128-bit vector of [4 x float]. The upper 64 bits of this +/// operand are copied to the lower 64 bits of the destination. +/// \returns A 128-bit vector of [4 x float] containing the moved values. 
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_movehl_ps(__m128 __a, __m128 __b) { return __builtin_shufflevector(__a, __b, 6, 7, 2, 3); } +/// \brief Moves the 2 low-order 32-bit elements from the second operand to the +/// high-order elements of the destination, and copies the corresponding +/// lower elements from the first operand. +/// The following code illustrates this intrinsic's behavior: +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVLHPS instruction. +/// +/// \param __a +/// 128-bit vector of [4 x float]. The lower 64 bits of this +/// operand are copied to the lower 64 bits of the destination. +/// \param __b +/// 128-bit vector of [4 x float]. The lower 64 bits of this +/// operand are copied to the upper 64 bits of the destination. +/// \returns A 128-bit vector of [4 x float] containing the moved values. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_movelh_ps(__m128 __a, __m128 __b) { return __builtin_shufflevector(__a, __b, 0, 1, 4, 5); } +/// \brief Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x +/// float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c CVTPI2PS+COMPOSITE instruction. +/// +/// \param __a +/// A 64-bit vector of [4 x i16]. The elements of the +/// destination are copied from the corresponding elements in +/// this operand. +/// \returns A 128-bit vector of [4 x float] containing the copied and +/// converted values from the operand. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpi16_ps(__m64 __a) { @@ -846,6 +2698,19 @@ return __r; } +/// \brief Converts a 64-bit vector of 16-bit unsigned integer values into +/// a 128-bit vector of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c CVTPI2PS+COMPOSITE instruction. +/// +/// \param __a +/// A 64-bit vector of 16-bit unsigned integer values. The +/// elements of the destination are copied from the +/// corresponding elements in this operand.
+/// \returns A 128-bit vector of [4 x float] containing the copied and +/// converted values from the operand. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpu16_ps(__m64 __a) { @@ -863,6 +2728,19 @@ return __r; } +/// \brief Converts the lower four 8-bit values from a 64-bit vector of [8 x i8] +/// into a 128-bit vector of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c CVTPI2PS+COMPOSITE instruction. +/// +/// \param __a +/// A 64-bit vector of [8 x i8]. The elements of the destination +/// are copied from the corresponding lower 4 elements in this +/// operand. +/// \returns A 128-bit vector of [4 x float] containing the copied and +/// converted values from the operand. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpi8_ps(__m64 __a) { @@ -875,6 +2753,19 @@ return _mm_cvtpi16_ps(__b); } +/// \brief Converts the lower four unsigned 8-bit integer values from a 64-bit +/// vector into a 128-bit vector of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c CVTPI2PS+COMPOSITE instruction. +/// +/// \param __a +/// A 64-bit vector of unsigned 8-bit integer values. The +/// elements of the destination are copied from the +/// corresponding lower 4 elements in this operand. +/// \returns A 128-bit vector of [4 x float] containing the copied and +/// converted values from the operand. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpu8_ps(__m64 __a) { @@ -886,6 +2777,24 @@ return _mm_cvtpi16_ps(__b); } +/// \brief Converts the 2 32-bit signed integer values from each 64-bit vector +/// operand of [2 x i32] into a 128-bit vector of [4 x float]. +/// The following code illustrates this intrinsic's behavior: +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c CVTPI2PS+COMPOSITE instruction. +/// +/// \param __a +/// A 64-bit vector of [2 x i32]. The lower elements of the +/// destination are copied from the elements in this operand. +/// \param __b +/// A 64-bit vector of [2 x i32].
The upper elements of the +/// destination are copied from the elements in this operand. +/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the +/// copied and converted values from the first operand. The upper 64 bits +/// contain +/// the copied and converted values from the second operand. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpi32x2_ps(__m64 __a, __m64 __b) { @@ -898,6 +2807,19 @@ return _mm_cvtpi32_ps(__c, __a); } +/// \brief Converts a 128-bit vector of [4 x float] into a 64-bit vector of [4 +/// x i16]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c CVTPS2PI+COMPOSITE instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The elements of the +/// destination are copied from the corresponding elements in +/// this operand. +/// \returns A 64-bit vector of [4 x i16] containing the copied and converted +/// values from the operand. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtps_pi16(__m128 __a) { @@ -910,6 +2832,19 @@ return _mm_packs_pi32(__b, __c); } +/// \brief Converts the values in a 128-bit vector of [4 x float] to the lower +/// four 8-bit values in a 64-bit vector of [8 x i8]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c CVTPS2PI+COMPOSITE instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The corresponding lower 4 +/// elements of the destination are copied from the +/// values in this operand. +/// \returns A 64-bit vector of [8 x i8] containing the copied and converted +/// values from the operand. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtps_pi8(__m128 __a) { @@ -921,6 +2856,19 @@ return _mm_packs_pi16(__b, __c); } +/// \brief Extracts the sign bits of the packed float values in the 128-bit +/// vector of [4 x float], zero-extends the value, and writes it to the +/// low-order bits of the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to \c VMOVMSKPS instruction.
+/// +/// \param __a +/// A 128-bit vector of [4 x float] containing the values with +/// sign bits to be extracted. +/// \returns The sign bits from the operand, written to bits [3:0]. The +/// remaining bits are assigned values of zero. static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_ps(__m128 __a) {