Index: lib/Headers/__wmmintrin_aes.h
===================================================================
--- lib/Headers/__wmmintrin_aes.h
+++ lib/Headers/__wmmintrin_aes.h
@@ -28,36 +28,120 @@
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("aes")))
 
+/// \brief Performs a single round of AES encryption, transforming the state
+///    value from the first source operand using a round key value contained
+///    in the second source operand, and writes the result to the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VAESENC instruction.
+///
+/// \param __V
+///    A 128-bit integer vector containing the state value.
+/// \param __R
+///    A 128-bit integer vector containing the round key value.
+/// \returns A 128-bit integer vector containing the encrypted value.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_aesenc_si128(__m128i __V, __m128i __R)
 {
   return (__m128i)__builtin_ia32_aesenc128(__V, __R);
 }
 
+/// \brief Performs the final round of AES encryption, transforming the state
+///    value from the first source operand using a round key value contained
+///    in the second source operand, and writes the result to the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VAESENCLAST instruction.
+///
+/// \param __V
+///    A 128-bit integer vector containing the state value.
+/// \param __R
+///    A 128-bit integer vector containing the round key value.
+/// \returns A 128-bit integer vector containing the encrypted value.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_aesenclast_si128(__m128i __V, __m128i __R)
 {
   return (__m128i)__builtin_ia32_aesenclast128(__V, __R);
 }
 
+/// \brief Performs a single round of AES decryption, transforming the state
+///    value from the first source operand using a round key value contained
+///    in the second source operand, and writes the result to the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VAESDEC instruction.
+///
+/// \param __V
+///    A 128-bit integer vector containing the state value.
+/// \param __R
+///    A 128-bit integer vector containing the round key value.
+/// \returns A 128-bit integer vector containing the decrypted value.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_aesdec_si128(__m128i __V, __m128i __R)
 {
   return (__m128i)__builtin_ia32_aesdec128(__V, __R);
 }
 
+/// \brief Performs the final round of AES decryption, transforming the state
+///    value from the first source operand using a round key value contained
+///    in the second source operand, and writes the result to the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VAESDECLAST instruction.
+///
+/// \param __V
+///    A 128-bit integer vector containing the state value.
+/// \param __R
+///    A 128-bit integer vector containing the round key value.
+/// \returns A 128-bit integer vector containing the decrypted value.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_aesdeclast_si128(__m128i __V, __m128i __R)
 {
   return (__m128i)__builtin_ia32_aesdeclast128(__V, __R);
 }
 
+/// \brief Applies the AES InvMixColumns() transformation to an expanded key
+///    contained in the source operand, and writes the result to the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VAESIMC instruction.
+///
+/// \param __V
+///    A 128-bit integer vector containing the expanded key.
+/// \returns A 128-bit integer vector containing the transformed value.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_aesimc_si128(__m128i __V)
 {
   return (__m128i)__builtin_ia32_aesimc128(__V);
 }
 
+/// \brief Expands the round key value contained in the first source operand
+///    using a round constant specified by the second source operand, and
+///    writes the result to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128i _mm_aeskeygenassist_si128(__m128i C, const int R);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c AESKEYGENASSIST instruction.
+///
+/// \param C
+///    A 128-bit integer vector containing the round key value.
+/// \param R
+///    An 8-bit integer containing the round constant.
+/// \returns A 128-bit integer vector containing the expanded round key value.
 #define _mm_aeskeygenassist_si128(C, R) \
   (__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(C), (int)(R))
 
Index: lib/Headers/__wmmintrin_pclmul.h
===================================================================
--- lib/Headers/__wmmintrin_pclmul.h
+++ lib/Headers/__wmmintrin_pclmul.h
@@ -23,6 +23,36 @@
 #ifndef _WMMINTRIN_PCLMUL_H
 #define _WMMINTRIN_PCLMUL_H
 
+/// \brief Multiplies two 64-bit integer values, selected from the operands 
+///    using the immediate value operand. The multiplication is a carry-less
+///    multiplication, and the 128-bit integer product is stored in the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128i _mm_clmulepi64_si128(__m128i __X, __m128i __Y, const int __I);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VPCLMULQDQ instruction.
+///
+/// \param __X
+///    A 128-bit vector of [2 x i64] containing one of the source
+///    operands.
+/// \param __Y
+///    A 128-bit vector of [2 x i64] containing one of the source
+///    operands.
+/// \param __I
+///    An immediate value specifying which 64-bit values to select
+///    from the operands.
+///    Bit 0 is used to select a value from operand __X,
+///    and bit 4 is used to select a value from operand __Y:
+///    Bit[0]=0 indicates that bits[63:0] of operand __X are used.
+///    Bit[0]=1 indicates that bits[127:64] of operand __X are used.
+///    Bit[4]=0 indicates that bits[63:0] of operand __Y are used.
+///    Bit[4]=1 indicates that bits[127:64] of operand __Y are used.
+/// \returns The 128-bit integer vector containing the result of the carry-less
+///    multiplication of the selected 64-bit values.
 #define _mm_clmulepi64_si128(__X, __Y, __I) \
   ((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(__X), \
                                         (__v2di)(__m128i)(__Y), (char)(__I)))
Index: lib/Headers/avxintrin.h
===================================================================
--- lib/Headers/avxintrin.h
+++ lib/Headers/avxintrin.h
@@ -47,117 +47,412 @@
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx")))
 
 /* Arithmetic */
+/// \brief Adds 2 packed 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VADDPD instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing one of the
+///    source operands.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing one of the source operands.
+/// \returns A 256-bit vector of [4 x double] containing the sums of both
+///    operands.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_add_pd(__m256d __a, __m256d __b)
 {
   return __a+__b;
 }
 
+/// \brief Adds 2 packed 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VADDPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing one of the source
+///    operands.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing one of the source operands.
+/// \returns A 256-bit vector of [8 x float] containing the sums of both
+///    operands.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_add_ps(__m256 __a, __m256 __b)
 {
   return __a+__b;
 }
 
+/// \brief Subtracts 2 packed 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VSUBPD instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing the minuend.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing the subtrahend.
+/// \returns A 256-bit vector of [4 x double] containing the differences 
+///    between both operands.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_sub_pd(__m256d __a, __m256d __b)
 {
   return __a-__b;
 }
 
+/// \brief Subtracts 2 packed 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VSUBPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the minuend.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing the subtrahend.
+/// \returns A 256-bit vector of [8 x float] containing the differences between
+///    both operands.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_sub_ps(__m256 __a, __m256 __b)
 {
   return __a-__b;
 }
 
+/// \brief Subtracts the even-indexed values and adds the odd-indexed values of
+///    2 packed 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VADDSUBPD instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing the left source
+///    operand.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing the right source
+///    operand.
+/// \returns A 256-bit vector of [4 x double] containing the alternating sums 
+///    and differences between both operands.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_addsub_pd(__m256d __a, __m256d __b)
 {
   return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
 }
 
+/// \brief Subtracts the even-indexed values and adds the odd-indexed values of
+///    2 packed 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VADDSUBPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the left source
+///    operand.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing the right source
+///    operand.
+/// \returns A 256-bit vector of [8 x float] containing the alternating sums 
+///    and differences between both operands.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_addsub_ps(__m256 __a, __m256 __b)
 {
   return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
 }
 
+/// \brief Divides 2 packed 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VDIVPD instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing the dividend.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing the divisor.
+/// \returns A 256-bit vector of [4 x double] containing the quotients between 
+///    both operands.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_div_pd(__m256d __a, __m256d __b)
 {
   return __a / __b;
 }
 
+/// \brief Divides 2 packed 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VDIVPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the dividend.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing the divisor.
+/// \returns A 256-bit vector of [8 x float] containing the quotients between 
+///    both operands.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_div_ps(__m256 __a, __m256 __b)
 {
   return __a / __b;
 }
 
+/// \brief Compares 2 packed 256-bit vectors of [4 x double] and stores the
+///    greater of each pair of values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMAXPD instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing one of the
+///    operands.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing one of the
+///    operands.
+/// \returns A 256-bit vector of [4 x double] containing the maximum values 
+///    between both operands.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_max_pd(__m256d __a, __m256d __b)
 {
   return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
 }
 
+/// \brief Compares 2 packed 256-bit vectors of [8 x float] and stores the
+///    greater of each pair of values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMAXPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing one of the
+///    operands.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing one of the
+///    operands.
+/// \returns A 256-bit vector of [8 x float] containing the maximum values 
+///    between both operands.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_max_ps(__m256 __a, __m256 __b)
 {
   return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
 }
 
+/// \brief Compares 2 packed 256-bit vectors of [4 x double] and stores the
+///    lesser of each pair of values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMINPD instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing one of the
+///    operands.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing one of the
+///    operands.
+/// \returns A 256-bit vector of [4 x double] containing the minimum values 
+///    between both operands.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_min_pd(__m256d __a, __m256d __b)
 {
   return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
 }
 
+/// \brief Compares 2 packed 256-bit vectors of [8 x float] and stores the 
+///    lesser of each pair of values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMINPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing one of the
+///    operands.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing one of the
+///    operands.
+/// \returns A 256-bit vector of [8 x float] containing the minimum values 
+///    between both operands.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_min_ps(__m256 __a, __m256 __b)
 {
   return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
 }
 
+/// \brief Multiplies 2 packed 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMULPD instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing one of the
+///    operands.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing one of the
+///    operands.
+/// \returns A 256-bit vector of [4 x double] containing the products between 
+///    both operands.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_mul_pd(__m256d __a, __m256d __b)
 {
   return __a * __b;
 }
 
+/// \brief Multiplies 2 packed 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMULPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing one of the
+///    operands.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing one of the
+///    operands.
+/// \returns A 256-bit vector of [8 x float] containing the products between 
+///    both operands.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_mul_ps(__m256 __a, __m256 __b)
 {
   return __a * __b;
 }
 
+/// \brief Calculates the square roots of the values stored in a packed 256-bit
+///    vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VSQRTPD instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] values.
+/// \returns A 256-bit vector of [4 x double] containing the square roots of 
+///    the values in the operand.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_sqrt_pd(__m256d __a)
 {
   return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
 }
 
+/// \brief Calculates the square roots of the values stored in a packed 256-bit
+///    vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VSQRTPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] values.
+/// \returns A 256-bit vector of [8 x float] containing the square roots of the
+///    values in the operand.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_sqrt_ps(__m256 __a)
 {
   return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
 }
 
+/// \brief Calculates the reciprocal square roots of the values stored in a
+///    packed 256-bit vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VRSQRTPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] values.
+/// \returns A 256-bit vector of [8 x float] containing the reciprocal square 
+///    roots of the values in the operand.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_rsqrt_ps(__m256 __a)
 {
   return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
 }
 
+/// \brief Calculates the reciprocals of the values stored in a packed 256-bit
+///    vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VRCPPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] values.
+/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
+///    values in the operand.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_rcp_ps(__m256 __a)
 {
   return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
 }
 
+/// \brief Rounds the values stored in a packed 256-bit vector of [4 x double] 
+///    as specified by the byte operand. The source values are rounded to
+///    integer values and returned as 64-bit double-precision floating-point
+///    values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m256d _mm256_round_pd(__m256d V, const int M);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VROUNDPD instruction.
+///
+/// \param V
+///    A 256-bit vector of [4 x double] values.
+/// \param M
+///    An integer value that specifies the rounding operation.
+///    Bits [7:4] are reserved.
+///    Bit [3] is a precision exception value:
+///    0: A normal PE exception is used
+///    1: The PE field is not updated
+///    Bit [2] is the rounding control source:
+///    0: Use bits [1:0] of M
+///    1: Use the current MXCSR setting
+///    Bits [1:0] contain the rounding control definition:
+///    00: Nearest
+///    01: Downward (toward negative infinity)
+///    10: Upward (toward positive infinity)
+///    11: Truncated
+/// \returns A 256-bit vector of [4 x double] containing the rounded values.
 #define _mm256_round_pd(V, M) __extension__ ({ \
     (__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)); })
 
+/// \brief Rounds the values stored in a packed 256-bit vector of [8 x float] 
+///    as specified by the byte operand. The source values are rounded to
+///    integer values and returned as floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m256 _mm256_round_ps(__m256 V, const int M);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VROUNDPS instruction.
+///
+/// \param V
+///    A 256-bit vector of [8 x float] values.
+/// \param M
+///    An integer value that specifies the rounding operation.
+///    Bits [7:4] are reserved.
+///    Bit [3] is a precision exception value:
+///    0: A normal PE exception is used
+///    1: The PE field is not updated
+///    Bit [2] is the rounding control source:
+///    0: Use bits [1:0] of M
+///    1: Use the current MXCSR setting
+///    Bits [1:0] contain the rounding control definition:
+///    00: Nearest
+///    01: Downward (toward negative infinity)
+///    10: Upward (toward positive infinity)
+///    11: Truncated
+/// \returns A 256-bit vector of [8 x float] containing the rounded values.
 #define _mm256_round_ps(V, M) __extension__ ({ \
   (__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)); })
 
@@ -167,48 +462,165 @@
 #define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
 
 /* Logical */
+/// \brief Performs a bitwise AND of 2 packed 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VANDPD instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing one of the
+///    source operands.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing one of the
+///    source operands.
+/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
+///    values between both operands.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_and_pd(__m256d __a, __m256d __b)
 {
   return (__m256d)((__v4di)__a & (__v4di)__b);
 }
 
+/// \brief Performs a bitwise AND of 2 packed 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VANDPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing one of the source
+///    operands.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing one of the source
+///    operands.
+/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
+///    values between both operands.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_and_ps(__m256 __a, __m256 __b)
 {
   return (__m256)((__v8si)__a & (__v8si)__b);
 }
 
+/// \brief Performs a bitwise AND of 2 packed 256-bit vectors of [4 x double],
+///    using the ones-complement of the values contained in the first
+///    source operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VANDNPD instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing the left source
+///    operand. The ones complement of this value is used in
+///    the bitwise AND.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing the right source
+///    operand.
+/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
+///    values of the second operand and the ones-complement of the
+///    first operand.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_andnot_pd(__m256d __a, __m256d __b)
 {
   return (__m256d)(~(__v4di)__a & (__v4di)__b);
 }
 
+/// \brief Performs a bitwise AND of 2 packed 256-bit vectors of [8 x float],
+///    using the ones-complement of the values contained in the first
+///    source operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VANDNPS instruction.
+///
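+/// \param __a
+///    A 256-bit vector of [8 x float] containing the left source operand.
+///    The ones complement of this value is used in the bitwise AND.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing the right source operand.
+/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
+///    second operand and the ones-complement of the first operand.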
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_andnot_ps(__m256 __a, __m256 __b)
 {
   return (__m256)(~(__v8si)__a & (__v8si)__b);
 }
 
+/// \brief Performs a bitwise OR of 2 packed 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VORPD instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing one of the
+///    source operands.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing one of the
+///    source operands.
+/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
+///    values between both operands.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_or_pd(__m256d __a, __m256d __b)
 {
   return (__m256d)((__v4di)__a | (__v4di)__b);
 }
 
+/// \brief Performs a bitwise OR of 2 packed 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VORPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing one of the source
+///    operands.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing one of the source
+///    operands.
+/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
+///    values between both operands.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_or_ps(__m256 __a, __m256 __b)
 {
   return (__m256)((__v8si)__a | (__v8si)__b);
 }
 
+/// \brief Performs a bitwise XOR of 2 packed 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VXORPD instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing one of the
+///    source operands.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing one of the
+///    source operands.
+/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
+///    values between both operands.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_xor_pd(__m256d __a, __m256d __b)
 {
   return (__m256d)((__v4di)__a ^ (__v4di)__b);
 }
 
+/// \brief Performs a bitwise XOR of 2 packed 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VXORPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing one of the source
+///    operands.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing one of the source
+///    operands.
+/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
+///    values between both operands.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_xor_ps(__m256 __a, __m256 __b)
 {
@@ -216,24 +628,100 @@
 }
 
 /* Horizontal arithmetic */
+/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
+///    256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VHADDPD instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing one of the
+///    source operands. The horizontal sums of the values are
+///    stored in the low-order (even-indexed) elements of the
+///    destination.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing one of the
+///    source operands. The horizontal sums of the values are
+///    stored in the high-order (odd-indexed) elements of the
+///    destination.
+/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
+///    both operands.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_hadd_pd(__m256d __a, __m256d __b)
 {
   return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
 }
 
+/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
+///    256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VHADDPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing one of the source
+///    operands. The horizontal sums of the values are stored in
+///    the low-order elements (index 0, 1, 4, 5) of the
+///    destination.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing one of the source
+///    operands. The horizontal sums of the values are stored in
+///    the high-order elements (index 2, 3, 6, 7) of the
+///    destination.
+/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of 
+///    both operands.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_hadd_ps(__m256 __a, __m256 __b)
 {
   return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
 }
 
+/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+///    packed 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VHSUBPD instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing one of the
+///    source operands. The horizontal differences between the
+///    values are stored in the low-order (even-indexed) elements
+///    of the destination.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing one of the
+///    source operands. The horizontal differences between the
+///    values are stored in the high-order (odd-indexed) elements
+///    of the destination.
+/// \returns A 256-bit vector of [4 x double] containing the horizontal 
+///    differences of both operands.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_hsub_pd(__m256d __a, __m256d __b)
 {
   return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
 }
 
+/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+///    packed 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VHSUBPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing one of the source
+///    operands. The horizontal differences between the values are
+///    stored in the low-order elements (index 0, 1, 4, 5) of the
+///    destination.
+/// \param __b
+///    A 256-bit vector of [8 x float] containing one of the source
+///    operands. The horizontal differences between the values are
+///    stored in the high-order elements (index 2, 3, 6, 7) of the
+///    destination.
+/// \returns A 256-bit vector of [8 x float] containing the horizontal 
+///    differences of both operands.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_hsub_ps(__m256 __a, __m256 __b)
 {
@@ -241,35 +729,289 @@
 }
 
 /* Vector permutations */
+/// \brief Copies the values stored in a packed 128-bit vector of [2 x double] 
+///    as specified by the 128-bit integer vector operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPERMILPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] values.
+/// \param __c
+///    A 128-bit integer vector operand specifying how the values
+///    are to be copied.
+///    Bit [1]:
+///    0: Bits [63:0] of the source are copied to bits [63:0] of
+///    the destination
+///    1: Bits [127:64] of the source are copied to bits [63:0] of
+///    the destination
+///    Bit [65]:
+///    0: Bits [63:0] of the source are copied to bits [127:64] of
+///    the destination
+///    1: Bits [127:64] of the source are copied to bits [127:64]
+///    of the destination
+/// \returns A 128-bit vector of [2 x double] containing the copied values.
 static __inline __m128d __DEFAULT_FN_ATTRS
 _mm_permutevar_pd(__m128d __a, __m128i __c)
 {
   return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
 }
 
+/// \brief Copies the values stored in a packed 256-bit vector of [4 x double] 
+///    as specified by the 256-bit integer vector operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPERMILPD instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] values.
+/// \param __c
+///    A 256-bit integer vector operand specifying how the values
+///    are to be copied.
+///    Bit [1]:
+///    0: Bits [63:0] of the source are copied to bits [63:0] of
+///    the destination
+///    1: Bits [127:64] of the source are copied to bits [63:0] of
+///    the destination
+///    Bit [65]:
+///    0: Bits [63:0] of the source are copied to bits [127:64] of
+///    the destination
+///    1: Bits [127:64] of the source are copied to bits [127:64]
+///    of the destination
+///    Bit [129]:
+///    0: Bits [191:128] of the source are copied to bits [191:128]
+///    of the destination
+///    1: Bits [255:192] of the source are copied to bits [191:128]
+///    of the destination
+///    Bit [193]:
+///    0: Bits [191:128] of the source are copied to bits [255:192]
+///    of the destination
+///    1: Bits [255:192] of the source are copied to bits [255:192]
+///    of the destination
+/// \returns A 256-bit vector of [4 x double] containing the copied values.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_permutevar_pd(__m256d __a, __m256i __c)
 {
   return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
 }
 
+/// \brief Copies the values stored in a packed 128-bit vector of [4 x float] 
+///    as specified by the 128-bit integer vector operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPERMILPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] values.
+/// \param __c
+///    A 128-bit integer vector operand specifying how the values
+///    are to be copied.
+///    Bits [1:0]:
+///    00: Bits [31:0] of the source are copied to bits [31:0] of
+///    the destination
+///    01: Bits [63:32] of the source are copied to bits [31:0] of
+///    the destination
+///    10: Bits [95:64] of the source are copied to bits [31:0] of
+///    the destination
+///    11: Bits [127:96] of the source are copied to bits [31:0] of
+///    the destination
+///    Bits [33:32]:
+///    00: Bits [31:0] of the source are copied to bits [63:32] of
+///    the destination
+///    01: Bits [63:32] of the source are copied to bits [63:32] of
+///    the destination
+///    10: Bits [95:64] of the source are copied to bits [63:32] of
+///    the destination
+///    11: Bits [127:96] of the source are copied to bits [63:32]
+///    of the destination
+///    Bits [65:64]:
+///    00: Bits [31:0] of the source are copied to bits [95:64] of
+///    the destination
+///    01: Bits [63:32] of the source are copied to bits [95:64] of
+///    the destination
+///    10: Bits [95:64] of the source are copied to bits [95:64] of
+///    the destination
+///    11: Bits [127:96] of the source are copied to bits [95:64]
+///    of the destination
+///    Bits [97:96]:
+///    00: Bits [31:0] of the source are copied to bits [127:96] of
+///    the destination
+///    01: Bits [63:32] of the source are copied to bits [127:96]
+///    of the destination
+///    10: Bits [95:64] of the source are copied to bits [127:96]
+///    of the destination
+///    11: Bits [127:96] of the source are copied to bits [127:96]
+///    of the destination
+/// \returns A 128-bit vector of [4 x float] containing the copied values.
 static __inline __m128 __DEFAULT_FN_ATTRS
 _mm_permutevar_ps(__m128 __a, __m128i __c)
 {
   return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
 }
 
+/// \brief Copies the values stored in a packed 256-bit vector of [8 x float] 
+///    as specified by the 256-bit integer vector operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPERMILPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] values.
+/// \param __c
+///    A 256-bit integer vector operand specifying how the values
+///    are to be copied.
+///    Bits [1:0]:
+///    00: Bits [31:0] of the source are copied to bits [31:0] of
+///    the destination
+///    01: Bits [63:32] of the source are copied to bits [31:0] of
+///    the destination
+///    10: Bits [95:64] of the source are copied to bits [31:0] of
+///    the destination
+///    11: Bits [127:96] of the source are copied to bits [31:0] of
+///    the destination
+///    Bits [33:32]:
+///    00: Bits [31:0] of the source are copied to bits [63:32] of
+///    the destination
+///    01: Bits [63:32] of the source are copied to bits [63:32] of
+///    the destination
+///    10: Bits [95:64] of the source are copied to bits [63:32] of
+///    the destination
+///    11: Bits [127:96] of the source are copied to bits [63:32]
+///    of the destination
+///    Bits [65:64]:
+///    00: Bits [31:0] of the source are copied to bits [95:64] of
+///    the destination
+///    01: Bits [63:32] of the source are copied to bits [95:64] of
+///    the destination
+///    10: Bits [95:64] of the source are copied to bits [95:64] of
+///    the destination
+///    11: Bits [127:96] of the source are copied to bits [95:64]
+///    of the destination
+///    Bits [97:96]:
+///    00: Bits [31:0] of the source are copied to bits [127:96] of
+///    the destination
+///    01: Bits [63:32] of the source are copied to bits [127:96]
+///    of the destination
+///    10: Bits [95:64] of the source are copied to bits [127:96]
+///    of the destination
+///    11: Bits [127:96] of the source are copied to bits [127:96]
+///    of the destination
+///    Bits [129:128]:
+///    00: Bits [159:128] of the source are copied to bits
+///    [159:128] of the destination
+///    01: Bits [191:160] of the source are copied to bits
+///    [159:128] of the destination
+///    10: Bits [223:192] of the source are copied to bits
+///    [159:128] of the destination
+///    11: Bits [255:224] of the source are copied to bits
+///    [159:128] of the destination
+///    Bits [161:160]:
+///    00: Bits [159:128] of the source are copied to bits
+///    [191:160] of the destination
+///    01: Bits [191:160] of the source are copied to bits
+///    [191:160] of the destination
+///    10: Bits [223:192] of the source are copied to bits
+///    [191:160] of the destination
+///    11: Bits [255:224] of the source are copied to bits
+///    [191:160] of the destination
+///    Bits [193:192]:
+///    00: Bits [159:128] of the source are copied to bits
+///    [223:192] of the destination
+///    01: Bits [191:160] of the source are copied to bits
+///    [223:192] of the destination
+///    10: Bits [223:192] of the source are copied to bits
+///    [223:192] of the destination
+///    11: Bits [255:224] of the source are copied to bits
+///    [223:192] of the destination
+///    Bits [225:224]:
+///    00: Bits [159:128] of the source are copied to bits
+///    [255:224] of the destination
+///    01: Bits [191:160] of the source are copied to bits
+///    [255:224] of the destination
+///    10: Bits [223:192] of the source are copied to bits
+///    [255:224] of the destination
+///    11: Bits [255:224] of the source are copied to bits
+///    [255:224] of the destination
+/// \returns A 256-bit vector of [8 x float] containing the copied values.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_permutevar_ps(__m256 __a, __m256i __c)
 {
   return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
 }
 
+/// \brief Copies the values stored in a packed 128-bit vector of [2 x double] 
+///    as specified by the integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128d _mm_permute_pd(__m128d A, const int C);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VPERMILPD instruction.
+///
+/// \param A
+///    A 128-bit vector of [2 x double] values.
+/// \param C
+///    An immediate integer operand specifying how the values are
+///    to be copied.
+///    Bit [0]:
+///    0: Bits [63:0] of the source are copied to bits [63:0] of
+///    the destination
+///    1: Bits [127:64] of the source are copied to bits [63:0] of
+///    the destination
+///    Bit [1]:
+///    0: Bits [63:0] of the source are copied to bits [127:64] of
+///    the destination
+///    1: Bits [127:64] of the source are copied to bits [127:64]
+///    of the destination
+/// \returns A 128-bit vector of [2 x double] containing the copied values.
 #define _mm_permute_pd(A, C) __extension__ ({ \
   (__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \
                                    (__v2df)_mm_setzero_pd(), \
                                    (C) & 0x1, ((C) & 0x2) >> 1); })
 
+/// \brief Copies the values stored in a packed 256-bit vector of [4 x double] 
+///    as specified by the integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m256d _mm256_permute_pd(__m256d A, const int C);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VPERMILPD instruction.
+///
+/// \param A
+///    A 256-bit vector of [4 x double] values.
+/// \param C
+///    An immediate integer operand specifying how the values are
+///    to be copied.
+///    Bit [0]:
+///    0: Bits [63:0] of the source are copied to bits [63:0] of
+///    the destination
+///    1: Bits [127:64] of the source are copied to bits [63:0] of
+///    the destination
+///    Bit [1]:
+///    0: Bits [63:0] of the source are copied to bits [127:64] of
+///    the destination
+///    1: Bits [127:64] of the source are copied to bits [127:64]
+///    of the destination
+///    Bit [2]:
+///    0: Bits [191:128] of the source are copied to bits [191:128]
+///    of the destination
+///    1: Bits [255:192] of the source are copied to bits [191:128]
+///    of the destination
+///    Bit [3]:
+///    0: Bits [191:128] of the source are copied to bits [255:192]
+///    of the destination
+///    1: Bits [255:192] of the source are copied to bits [255:192]
+///    of the destination
+/// \returns A 256-bit vector of [4 x double] containing the copied values.
 #define _mm256_permute_pd(A, C) __extension__ ({ \
   (__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \
                                    (__v4df)_mm256_setzero_pd(), \
@@ -277,12 +1019,154 @@
                                    2 + (((C) & 0x4) >> 2), \
                                    2 + (((C) & 0x8) >> 3)); })
 
+/// \brief Copies the values stored in a packed 128-bit vector of [4 x float] 
+///    as specified by the integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128 _mm_permute_ps(__m128 A, const int C);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VPERMILPS instruction.
+///
+/// \param A
+///    A 128-bit vector of [4 x float] values.
+/// \param C
+///    An immediate integer operand specifying how the values are
+///    to be copied.
+///    Bits [1:0]:
+///    00: Bits [31:0] of the source are copied to bits [31:0] of
+///    the destination
+///    01: Bits [63:32] of the source are copied to bits [31:0] of
+///    the destination
+///    10: Bits [95:64] of the source are copied to bits [31:0] of
+///    the destination
+///    11: Bits [127:96] of the source are copied to bits [31:0] of
+///    the destination
+///    Bits [3:2]:
+///    00: Bits [31:0] of the source are copied to bits [63:32] of
+///    the destination
+///    01: Bits [63:32] of the source are copied to bits [63:32] of
+///    the destination
+///    10: Bits [95:64] of the source are copied to bits [63:32] of
+///    the destination
+///    11: Bits [127:96] of the source are copied to bits [63:32]
+///    of the destination
+///    Bits [5:4]:
+///    00: Bits [31:0] of the source are copied to bits [95:64] of
+///    the destination
+///    01: Bits [63:32] of the source are copied to bits [95:64] of
+///    the destination
+///    10: Bits [95:64] of the source are copied to bits [95:64] of
+///    the destination
+///    11: Bits [127:96] of the source are copied to bits [95:64]
+///    of the destination
+///    Bits [7:6]:
+///    00: Bits [31:0] of the source are copied to bits [127:96] of
+///    the destination
+///    01: Bits [63:32] of the source are copied to bits [127:96]
+///    of the destination
+///    10: Bits [95:64] of the source are copied to bits [127:96]
+///    of the destination
+///    11: Bits [127:96] of the source are copied to bits [127:96]
+///    of the destination
+/// \returns A 128-bit vector of [4 x float] containing the copied values.
 #define _mm_permute_ps(A, C) __extension__ ({ \
   (__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \
                                   (__v4sf)_mm_setzero_ps(), \
                                    (C) & 0x3, ((C) & 0xc) >> 2, \
                                    ((C) & 0x30) >> 4, ((C) & 0xc0) >> 6); })
 
+/// \brief Copies the values stored in a packed 256-bit vector of [8 x float] 
+///    as specified by the integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m256 _mm256_permute_ps(__m256 A, const int C);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VPERMILPS instruction.
+///
+/// \param A
+///    A 256-bit vector of [8 x float] values.
+/// \param C
+///    An immediate integer operand specifying how the values are
+///    to be copied.
+///    Bits [1:0]:
+///    00: Bits [31:0] of the source are copied to bits [31:0] of
+///    the destination
+///    01: Bits [63:32] of the source are copied to bits [31:0] of
+///    the destination
+///    10: Bits [95:64] of the source are copied to bits [31:0] of
+///    the destination
+///    11: Bits [127:96] of the source are copied to bits [31:0] of
+///    the destination
+///    Bits [3:2]:
+///    00: Bits [31:0] of the source are copied to bits [63:32] of
+///    the destination
+///    01: Bits [63:32] of the source are copied to bits [63:32] of
+///    the destination
+///    10: Bits [95:64] of the source are copied to bits [63:32] of
+///    the destination
+///    11: Bits [127:96] of the source are copied to bits [63:32]
+///    of the destination
+///    Bits [5:4]:
+///    00: Bits [31:0] of the source are copied to bits [95:64] of
+///    the destination
+///    01: Bits [63:32] of the source are copied to bits [95:64] of
+///    the destination
+///    10: Bits [95:64] of the source are copied to bits [95:64] of
+///    the destination
+///    11: Bits [127:96] of the source are copied to bits [95:64]
+///    of the destination
+///    Bits [7:6]:
+///    00: Bits [31:0] of the source are copied to bits [127:96] of
+///    the destination
+///    01: Bits [63:32] of the source are copied to bits [127:96]
+///    of the destination
+///    10: Bits [95:64] of the source are copied to bits [127:96]
+///    of the destination
+///    11: Bits [127:96] of the source are copied to bits [127:96]
+///    of the destination
+///    Bits [1:0] (same bits, applied to the upper 128-bit half):
+///    00: Bits [159:128] of the source are copied to bits
+///    [159:128] of the destination
+///    01: Bits [191:160] of the source are copied to bits
+///    [159:128] of the destination
+///    10: Bits [223:192] of the source are copied to bits
+///    [159:128] of the destination
+///    11: Bits [255:224] of the source are copied to bits
+///    [159:128] of the destination
+///    Bits [3:2] (same bits, applied to the upper 128-bit half):
+///    00: Bits [159:128] of the source are copied to bits
+///    [191:160] of the destination
+///    01: Bits [191:160] of the source are copied to bits
+///    [191:160] of the destination
+///    10: Bits [223:192] of the source are copied to bits
+///    [191:160] of the destination
+///    11: Bits [255:224] of the source are copied to bits
+///    [191:160] of the destination
+///    Bits [5:4] (same bits, applied to the upper 128-bit half):
+///    00: Bits [159:128] of the source are copied to bits
+///    [223:192] of the destination
+///    01: Bits [191:160] of the source are copied to bits
+///    [223:192] of the destination
+///    10: Bits [223:192] of the source are copied to bits
+///    [223:192] of the destination
+///    11: Bits [255:224] of the source are copied to bits
+///    [223:192] of the destination
+///    Bits [7:6] (same bits, applied to the upper 128-bit half):
+///    00: Bits [159:128] of the source are copied to bits
+///    [255:224] of the destination
+///    01: Bits [191:160] of the source are copied to bits
+///    [255:224] of the destination
+///    10: Bits [223:192] of the source are copied to bits
+///    [255:224] of the destination
+///    11: Bits [255:224] of the source are copied to bits
+///    [255:224] of the destination
+/// \returns A 256-bit vector of [8 x float] containing the copied values.
 #define _mm256_permute_ps(A, C) __extension__ ({ \
   (__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \
                                   (__v8sf)_mm256_setzero_ps(), \
@@ -293,19 +1177,156 @@
                                   4 + (((C) & 0x30) >> 4), \
                                   4 + (((C) & 0xc0) >> 6)); })
 
+/// \brief Copies 128-bit data values stored in two packed 256-bit vectors of 
+///    [4 x double], as specified by the integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VPERM2F128 instruction.
+///
+/// \param V1
+///    A 256-bit vector of [4 x double] values.
+/// \param V2
+///    A 256-bit vector of [4 x double] values.
+/// \param M
+///    An immediate integer operand specifying how the values are copied.
+///    Setting bit [3] zeroes bits [127:0]; bit [7] zeroes bits [255:128].
+///    Bits [1:0]:
+///    00: Bits [127:0] of operand V1 are
+///    copied to bits [127:0] of the destination
+///    01: Bits [255:128] of operand V1 are
+///    copied to bits [127:0] of the destination
+///    10: Bits [127:0] of operand V2 are
+///    copied to bits [127:0] of the destination
+///    11: Bits [255:128] of operand V2 are
+///    copied to bits [127:0] of the destination
+///    Bits [5:4]:
+///    00: Bits [127:0] of operand V1 are
+///    copied to bits [255:128] of the destination
+///    01: Bits [255:128] of operand V1 are
+///    copied to bits [255:128] of the destination
+///    10: Bits [127:0] of operand V2 are
+///    copied to bits [255:128] of the destination
+///    11: Bits [255:128] of operand V2 are
+///    copied to bits [255:128] of the destination
+/// \returns A 256-bit vector of [4 x double] containing the copied values.
 #define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \
   (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
                                            (__v4df)(__m256d)(V2), (M)); })
 
+/// \brief Copies 128-bit data values stored in two packed 256-bit vectors of 
+///    [8 x float], as specified by the integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VPERM2F128 instruction.
+///
+/// \param V1
+///    A 256-bit vector of [8 x float] values.
+/// \param V2
+///    A 256-bit vector of [8 x float] values.
+/// \param M
+///    An immediate integer operand specifying how the values are copied.
+///    Setting bit [3] zeroes bits [127:0]; bit [7] zeroes bits [255:128].
+///    Bits [1:0]:
+///    00: Bits [127:0] of operand V1 are
+///    copied to bits [127:0] of the destination
+///    01: Bits [255:128] of operand V1 are
+///    copied to bits [127:0] of the destination
+///    10: Bits [127:0] of operand V2 are
+///    copied to bits [127:0] of the destination
+///    11: Bits [255:128] of operand V2 are
+///    copied to bits [127:0] of the destination
+///    Bits [5:4]:
+///    00: Bits [127:0] of operand V1 are
+///    copied to bits [255:128] of the destination
+///    01: Bits [255:128] of operand V1 are
+///    copied to bits [255:128] of the destination
+///    10: Bits [127:0] of operand V2 are
+///    copied to bits [255:128] of the destination
+///    11: Bits [255:128] of operand V2 are
+///    copied to bits [255:128] of the destination
+/// \returns A 256-bit vector of [8 x float] containing the copied values.
 #define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
   (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
                                           (__v8sf)(__m256)(V2), (M)); })
 
+/// \brief Copies 128-bit data values stored in two packed 256-bit integer
+///    vectors, as specified by the integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VPERM2F128 instruction.
+///
+/// \param V1
+///    A 256-bit integer vector.
+/// \param V2
+///    A 256-bit integer vector.
+/// \param M
+///    An immediate integer operand specifying how the values are copied.
+///    Setting bit [3] zeroes bits [127:0]; bit [7] zeroes bits [255:128].
+///    Bits [1:0]:
+///    00: Bits [127:0] of operand V1 are
+///    copied to bits [127:0] of the destination
+///    01: Bits [255:128] of operand V1 are
+///    copied to bits [127:0] of the destination
+///    10: Bits [127:0] of operand V2 are
+///    copied to bits [127:0] of the destination
+///    11: Bits [255:128] of operand V2 are
+///    copied to bits [127:0] of the destination
+///    Bits [5:4]:
+///    00: Bits [127:0] of operand V1 are
+///    copied to bits [255:128] of the destination
+///    01: Bits [255:128] of operand V1 are
+///    copied to bits [255:128] of the destination
+///    10: Bits [127:0] of operand V2 are
+///    copied to bits [255:128] of the destination
+///    11: Bits [255:128] of operand V2 are
+///    copied to bits [255:128] of the destination
+/// \returns A 256-bit integer vector containing the copied values.
 #define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
   (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
                                            (__v8si)(__m256i)(V2), (M)); })
 
 /* Vector Blend */
+/// \brief Copies 64-bit double-precision data values stored in either of the 
+///    two packed 256-bit vectors of [4 x double], as specified by the integer
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VBLENDPD instruction.
+///
+/// \param V1
+///    A 256-bit vector of [4 x double] values.
+/// \param V2
+///    A 256-bit vector of [4 x double] values.
+/// \param M
+///    An immediate integer operand, with mask bits [3:0]
+///    specifying how the values are to be copied. The position of
+///    the mask bit corresponds to the index of a copied value.
+///    When a mask bit is 0, the corresponding 64-bit element in
+///    operand V1 is copied to the same
+///    position in the destination. When a mask bit is 1, the
+///    corresponding 64-bit element in operand V2
+///    is copied to the same position in the destination.
+/// \returns A 256-bit vector of [4 x double] containing the copied values.
 #define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
   (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \
                                    (__v4df)(__m256d)(V2), \
@@ -314,6 +1335,31 @@
                                    (((M) & 0x04) ? 6 : 2), \
                                    (((M) & 0x08) ? 7 : 3)); })
 
+/// \brief Copies 32-bit single-precision data values stored in either of the 
+///    two packed 256-bit vectors of [8 x float], as specified by the integer
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VBLENDPS instruction.
+///
+/// \param V1
+///    A 256-bit vector of [8 x float] values.
+/// \param V2
+///    A 256-bit vector of [8 x float] values.
+/// \param M
+///    An immediate integer operand, with mask bits [7:0]
+///    specifying how the values are to be copied. The position of
+///    the mask bit corresponds to the index of a copied value.
+///    When a mask bit is 0, the corresponding 32-bit element in operand
+///    V1 is copied to the same position in the destination. When a mask
+///    bit is 1, the corresponding 32-bit element in operand V2 is copied
+///    to the same position in the destination.
+/// \returns A 256-bit vector of [8 x float] containing the copied values.
 #define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
   (__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \
                                   (__v8sf)(__m256)(V2), \
@@ -326,6 +1372,28 @@
                                   (((M) & 0x40) ? 14 : 6), \
                                   (((M) & 0x80) ? 15 : 7)); })
 
+/// \brief Copies 64-bit double-precision data values stored in either of the 
+///    two packed 256-bit vectors of [4 x double], as specified by the 256-bit
+///    vector operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VBLENDVPD instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] values.
+/// \param __b
+///    A 256-bit vector of [4 x double] values.
+/// \param __c
+///    A 256-bit vector operand, with mask bits 255, 191, 127, and
+///    63 specifying how the values are to be copied. The position
+///    of the mask bit corresponds to the most significant bit of a
+///    copied value. When a mask bit is 0, the corresponding 64-bit
+///    element in operand __a is copied to the
+///    same position in the destination. When a mask bit is 1, the
+///    corresponding 64-bit element in operand __b
+///    is copied to the same position in the destination.
+/// \returns A 256-bit vector of [4 x double] containing the copied values.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
 {
@@ -333,6 +1401,29 @@
     (__v4df)__a, (__v4df)__b, (__v4df)__c);
 }
 
+/// \brief Copies 32-bit single-precision data values stored in either of the 
+///    two packed 256-bit vectors of [8 x float], as specified by the 256-bit
+///    vector operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VBLENDVPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] values.
+/// \param __b
+///    A 256-bit vector of [8 x float] values.
+/// \param __c
+///    A 256-bit vector operand, with mask bits 255, 223, 191, 159,
+///    127, 95, 63, and 31 specifying how the values are to be
+///    copied. The position of the mask bit corresponds to the most
+///    significant bit of a copied value. When a mask bit is 0, the
+///    corresponding 32-bit element in operand __a
+///    is copied to the same position in the destination. When a
+///    mask bit is 1, the corresponding 32-bit element in operand
+///    __b is copied to the same position in
+///    the destination.
+/// \returns A 256-bit vector of [8 x float] containing the copied values.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 {
@@ -341,11 +1432,98 @@
 }
 
 /* Vector Dot Product */
+/// \brief Computes two dot products: one dot product is computed using the 
+///    lower 128 bits of the two packed 256-bit vectors of [8 x float], and the
+///    other dot product is computed using the upper 128 bits of the two
+///    packed 256-bit vectors of [8 x float]. Both dot products are computed
+///    as specified by the integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VDPPS instruction.
+///
+/// \param V1
+///    A 256-bit vector of [8 x float] values.
+/// \param V2
+///    A 256-bit vector of [8 x float] values.
+/// \param M
+///    An immediate integer operand. Mask bits [7:4] are used to
+///    select 32-bit segments of the source operands. If a mask bit
+///    is 1, the corresponding bits are used in the dot product
+///    calculation:
+///    Bit [7]: selects bits [127:96] or bits [255:224]
+///    Bit [6]: selects bits [95:64] or bits [223:192]
+///    Bit [5]: selects bits [63:32] or bits [191:160]
+///    Bit [4]: selects bits [31:0] or bits [159:128]
+///    Bits [3:0]: each bit set to 1 stores the 32-bit sum in the matching
+///    element of each 128-bit half of the destination; others are zeroed.
+/// \returns A 256-bit vector of [8 x float] containing the two dot products.
 #define _mm256_dp_ps(V1, V2, M) __extension__ ({ \
   (__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
                                  (__v8sf)(__m256)(V2), (M)); })
 
 /* Vector shuffle */
+/// \brief Selects 8 float values from the 256-bit operands of [8 x float], as
+///    specified by the immediate value operand. The four selected elements
+///    in each operand are copied to the destination according to the bits
+///    specified in the immediate operand. The selected elements from the
+///    first 256-bit operand are copied to bits [63:0] and bits [191:128] of
+///    the destination, and the selected elements from the second 256-bit
+///    operand are copied to bits [127:64] and bits [255:192] of the
+///    destination. For example, if bits [7:0] of the immediate operand
+///    contain a value of 0xFF, the 256-bit destination vector would contain
+///    the following values:
+///    b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3]
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VSHUFPS instruction.
+///
+/// \param a
+///    A 256-bit vector of [8 x float]. The four selected elements
+///    in this operand are copied to bits [63:0] and bits [191:128]
+///    in the destination, according to the bits specified in the
+///    immediate operand.
+/// \param b
+///    A 256-bit vector of [8 x float]. The four selected elements
+///    in this operand are copied to bits [127:64] and bits
+///    [255:192] in the destination, according to the bits
+///    specified in the immediate operand.
+/// \param mask
+///    An immediate value containing an 8-bit value specifying
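+///    which elements to copy from a and b. Bits [3:0] specify the
+///    values copied from operand a (to bits [63:0] and [191:128] of the
+///    destination). Bits [7:4] specify the values copied from operand b
+///    (to bits [127:64] and [255:192] of the destination).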
+///    The destinations within the 256-bit destination are assigned
+///    values as follows, according to the bit value assignments
+///    described further below:
+///    Bits [1:0] are used to assign values to bits [31:0] and
+///    [159:128] in the destination.
+///    Bits [3:2] are used to assign values to bits [63:32] and
+///    [191:160] in the destination.
+///    Bits [5:4] are used to assign values to bits [95:64] and
+///    [223:192] in the destination.
+///    Bits [7:6] are used to assign values to bits [127:96] and
+///    [255:224] in the destination.
+///    Bit value assignments:
+///    00: Bits [31:0] and [159:128] are copied from the selected
+///    operand.
+///    01: Bits [63:32] and [191:160] are copied from the selected
+///    operand.
+///    10: Bits [95:64] and [223:192] are copied from the selected
+///    operand.
+///    11: Bits [127:96] and [255:224] are copied from the selected
+///    operand.
+/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
 #define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \
         (__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \
                                         (__v8sf)(__m256)(b), \
@@ -358,6 +1536,49 @@
                                         (((mask) & 0x30) >> 4) + 12, \
                                         (((mask) & 0xc0) >> 6) + 12); })
 
+/// \brief Selects four double-precision values from the 256-bit operands of [4 
+///    x double], as specified by the immediate value operand. The selected
+///    elements from the first 256-bit operand are copied to bits [63:0] and
+///    bits [191:128] in the destination, and the selected elements from the
+///    second 256-bit operand are copied to bits [127:64] and bits [255:192]
+///    in the destination. For example, if bits [3:0] of the immediate
+///    operand contain a value of 0xF, the 256-bit destination vector would
+///    contain the following values:
+///    b[3], a[3], b[1],
+///    a[1]
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VSHUFPD instruction.
+///
+/// \param a
+///    A 256-bit vector of [4 x double].
+/// \param b
+///    A 256-bit vector of [4 x double].
+/// \param mask
+///    An immediate value containing 8-bit values specifying which
+///    elements to copy from a and b:
+///    Bit [0]=0: Bits [63:0] are copied from a
+///    to bits [63:0] of the destination.
+///    Bit [0]=1: Bits [127:64] are copied from a
+///    to bits [63:0] of the destination.
+///    Bit [1]=0: Bits [63:0] are copied from b
+///    to bits [127:64] of the destination.
+///    Bit [1]=1: Bits [127:64] are copied from b
+///    to bits [127:64] of the destination.
+///    Bit [2]=0: Bits [191:128] are copied from a
+///    to bits [191:128] of the destination.
+///    Bit [2]=1: Bits [255:192] are copied from a
+///    to bits [191:128] of the destination.
+///    Bit [3]=0: Bits [191:128] are copied from b
+///    to bits [255:192] of the destination.
+///    Bit [3]=1: Bits [255:192] are copied from b
+///    to bits [255:192] of the destination.
+/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
 #define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \
         (__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \
                                          (__v4df)(__m256d)(b), \
@@ -400,30 +1621,244 @@
 #define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
 #define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
 
+/// \brief Compares each of the corresponding packed double-precision values of
+///    the 128-bit vectors of [2 x double], using the operation specified by
+///    the integer operand. If the result is true, all 64 bits of the
+///    destination vector are set; otherwise they are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VCMPPD instruction.
+///
+/// \param a
+///    A 128-bit vector of [2 x double] values.
+/// \param b
+///    A 128-bit vector of [2 x double] values.
+/// \param c
+///    An immediate integer operand, with bits [4:0] specifying
+///    which comparison operation to use:
+///    00h, 08h, 10h, 18h: Equal
+///    01h, 11h: Less than; 09h, 19h: Not greater than or equal
+///    02h, 12h: Less than or equal; 0Ah, 1Ah: Not greater than
+///    03h, 13h: Unordered; 0Bh, 1Bh: False
+///    04h, 0Ch, 14h, 1Ch: Not equal
+///    05h, 15h: Not less than; 0Dh, 1Dh: Greater than or equal
+///    06h, 16h: Not less than or equal; 0Eh, 1Eh: Greater than
+///    07h, 17h: Ordered; 0Fh, 1Fh: True
+///    Values sharing a name differ in ordered/unordered and
+///    signaling/non-signaling behavior; the _CMP_* constants
+///    defined in this header give the exact meaning of each value.
+/// \returns A 128-bit vector of [2 x double] containing the comparison results.
 #define _mm_cmp_pd(a, b, c) __extension__ ({ \
   (__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
                                 (__v2df)(__m128d)(b), (c)); })
 
+/// \brief Compares each of the corresponding packed values of the 128-bit
+///    vectors of [4 x float], using the operation specified by the integer
+///    operand. If the result is true, all 32 bits of the destination vector
+///    are set; otherwise they are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VCMPPS instruction.
+///
+/// \param a
+///    A 128-bit vector of [4 x float] values.
+/// \param b
+///    A 128-bit vector of [4 x float] values.
+/// \param c
+///    An immediate integer operand, with bits [4:0] specifying
+///    which comparison operation to use:
+///    00h, 08h, 10h, 18h: Equal
+///    01h, 11h: Less than; 09h, 19h: Not greater than or equal
+///    02h, 12h: Less than or equal; 0Ah, 1Ah: Not greater than
+///    03h, 13h: Unordered; 0Bh, 1Bh: False
+///    04h, 0Ch, 14h, 1Ch: Not equal
+///    05h, 15h: Not less than; 0Dh, 1Dh: Greater than or equal
+///    06h, 16h: Not less than or equal; 0Eh, 1Eh: Greater than
+///    07h, 17h: Ordered; 0Fh, 1Fh: True
+///    Values sharing a name differ in ordered/unordered and
+///    signaling/non-signaling behavior; the _CMP_* constants
+///    defined in this header give the exact meaning of each value.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
 #define _mm_cmp_ps(a, b, c) __extension__ ({ \
   (__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
                                (__v4sf)(__m128)(b), (c)); })
 
+/// \brief Compares each of the corresponding packed double-precision values of
+///    the 256-bit vectors of [4 x double], using the operation specified by
+///    the integer operand. If the result is true, all 64 bits of the
+///    destination vector are set; otherwise they are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VCMPPD instruction.
+///
+/// \param a
+///    A 256-bit vector of [4 x double] values.
+/// \param b
+///    A 256-bit vector of [4 x double] values.
+/// \param c
+///    An immediate integer operand, with bits [4:0] specifying
+///    which comparison operation to use:
+///    00h, 08h, 10h, 18h: Equal
+///    01h, 11h: Less than; 09h, 19h: Not greater than or equal
+///    02h, 12h: Less than or equal; 0Ah, 1Ah: Not greater than
+///    03h, 13h: Unordered; 0Bh, 1Bh: False
+///    04h, 0Ch, 14h, 1Ch: Not equal
+///    05h, 15h: Not less than; 0Dh, 1Dh: Greater than or equal
+///    06h, 16h: Not less than or equal; 0Eh, 1Eh: Greater than
+///    07h, 17h: Ordered; 0Fh, 1Fh: True
+///    Values sharing a name differ in ordered/unordered and
+///    signaling/non-signaling behavior; the _CMP_* constants
+///    defined in this header give the exact meaning of each value.
+/// \returns A 256-bit vector of [4 x double] containing the comparison results.
 #define _mm256_cmp_pd(a, b, c) __extension__ ({ \
   (__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
                                    (__v4df)(__m256d)(b), (c)); })
 
+/// \brief Compares each of the corresponding packed values of the 256-bit
+///    vectors of [8 x float], using the operation specified by the integer
+///    operand. If the result is true, all 32 bits of the destination vector
+///    are set; otherwise they are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VCMPPS instruction.
+///
+/// \param a
+///    A 256-bit vector of [8 x float] values.
+/// \param b
+///    A 256-bit vector of [8 x float] values.
+/// \param c
+///    An immediate integer operand, with bits [4:0] specifying
+///    which comparison operation to use:
+///    00h, 08h, 10h, 18h: Equal
+///    01h, 11h: Less than; 09h, 19h: Not greater than or equal
+///    02h, 12h: Less than or equal; 0Ah, 1Ah: Not greater than
+///    03h, 13h: Unordered; 0Bh, 1Bh: False
+///    04h, 0Ch, 14h, 1Ch: Not equal
+///    05h, 15h: Not less than; 0Dh, 1Dh: Greater than or equal
+///    06h, 16h: Not less than or equal; 0Eh, 1Eh: Greater than
+///    07h, 17h: Ordered; 0Fh, 1Fh: True
+///    Values sharing a name differ in ordered/unordered and
+///    signaling/non-signaling behavior; the _CMP_* constants
+///    defined in this header give the exact meaning of each value.
+/// \returns A 256-bit vector of [8 x float] containing the comparison results.
 #define _mm256_cmp_ps(a, b, c) __extension__ ({ \
   (__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
                                   (__v8sf)(__m256)(b), (c)); })
 
+/// \brief Compares the double-precision values in bits [63:0] of the two
+///    128-bit vectors of [2 x double], using the operation specified by the
+///    integer operand. If the result is true, bits [63:0] of the destination
+///    are set; otherwise they are cleared. Bits [127:64] are copied from a.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VCMPSD instruction.
+///
+/// \param a
+///    A 128-bit vector of [2 x double] values.
+/// \param b
+///    A 128-bit vector of [2 x double] values.
+/// \param c
+///    An immediate integer operand, with bits [4:0] specifying
+///    which comparison operation to use:
+///    00h, 08h, 10h, 18h: Equal
+///    01h, 11h: Less than; 09h, 19h: Not greater than or equal
+///    02h, 12h: Less than or equal; 0Ah, 1Ah: Not greater than
+///    03h, 13h: Unordered; 0Bh, 1Bh: False
+///    04h, 0Ch, 14h, 1Ch: Not equal
+///    05h, 15h: Not less than; 0Dh, 1Dh: Greater than or equal
+///    06h, 16h: Not less than or equal; 0Eh, 1Eh: Greater than
+///    07h, 17h: Ordered; 0Fh, 1Fh: True
+///    Values sharing a name differ in ordered/unordered and
+///    signaling/non-signaling behavior; the _CMP_* constants
+///    defined in this header give the exact meaning of each value.
+/// \returns A 128-bit vector of [2 x double] containing the comparison results.
 #define _mm_cmp_sd(a, b, c) __extension__ ({ \
   (__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
                                 (__v2df)(__m128d)(b), (c)); })
 
+/// \brief Compares the values in bits [31:0] of the two 128-bit vectors of
+///    [4 x float], using the operation specified by the integer operand. If
+///    the result is true, bits [31:0] of the destination are set; otherwise
+///    they are cleared. Bits [127:32] are copied from the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VCMPSS instruction.
+///
+/// \param a
+///    A 128-bit vector of [4 x float] values.
+/// \param b
+///    A 128-bit vector of [4 x float] values.
+/// \param c
+///    An immediate integer operand, with bits [4:0] specifying
+///    which comparison operation to use:
+///    00h, 08h, 10h, 18h: Equal
+///    01h, 11h: Less than; 09h, 19h: Not greater than or equal
+///    02h, 12h: Less than or equal; 0Ah, 1Ah: Not greater than
+///    03h, 13h: Unordered; 0Bh, 1Bh: False
+///    04h, 0Ch, 14h, 1Ch: Not equal
+///    05h, 15h: Not less than; 0Dh, 1Dh: Greater than or equal
+///    06h, 16h: Not less than or equal; 0Eh, 1Eh: Greater than
+///    07h, 17h: Ordered; 0Fh, 1Fh: True
+///    Values sharing a name differ in ordered/unordered and
+///    signaling/non-signaling behavior; the _CMP_* constants
+///    defined in this header give the exact meaning of each value.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
 #define _mm_cmp_ss(a, b, c) __extension__ ({ \
   (__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
                                (__v4sf)(__m128)(b), (c)); })
 
+/// \brief Extracts 32 bits of extended packed data from a 256-bit integer 
+///    vector and copies it to the destination, as specified by the integer 
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VEXTRACTF128+COMPOSITE instruction.
+///
+/// \param __a
+///    A 256-bit integer vector.
+/// \param __imm
+///    Determines which bits are extracted using bits [2:0]:
+///    000: Bits [31:0] are copied to the destination.
+///    001: Bits [63:32] are copied to the destination.
+///    010: Bits [95:64] are copied to the destination.
+///    011: Bits [127:96] are copied to the destination.
+///    100: Bits [159:128] are copied to the destination.
+///    101: Bits [191:160] are copied to the destination.
+///    110: Bits [223:192] are copied to the destination.
+///    111: Bits [255:224] are copied to the destination.
+/// \returns A 32-bit integer containing the extracted 32 bits of extended 
+///    packed data.
 static __inline int __DEFAULT_FN_ATTRS
 _mm256_extract_epi32(__m256i __a, const int __imm)
 {
@@ -431,6 +1866,36 @@
   return __b[__imm & 7];
 }
 
+/// \brief Extracts 16 bits of extended packed data from a 256-bit integer 
+///    vector and copies it to the destination, as specified by the integer 
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VEXTRACTF128+COMPOSITE instruction.
+///
+/// \param __a
+///    A 256-bit integer vector.
+/// \param __imm
+///    Determines which bits are extracted using bits [3:0]:
+///    0000: Bits [15:0] are copied to the destination.
+///    0001: Bits [31:16] are copied to the destination.
+///    0010: Bits [47:32] are copied to the destination.
+///    0011: Bits [63:48] are copied to the destination.
+///    0100: Bits [79:64] are copied to the destination.
+///    0101: Bits [95:80] are copied to the destination.
+///    0110: Bits [111:96] are copied to the destination.
+///    0111: Bits [127:112] are copied to the destination.
+///    1000: Bits [143:128] are copied to the destination.
+///    1001: Bits [159:144] are copied to the destination.
+///    1010: Bits [175:160] are copied to the destination.
+///    1011: Bits [191:176] are copied to the destination.
+///    1100: Bits [207:192] are copied to the destination.
+///    1101: Bits [223:208] are copied to the destination.
+///    1110: Bits [239:224] are copied to the destination.
+///    1111: Bits [255:240] are copied to the destination.
+/// \returns A 32-bit integer containing the extracted 16 bits of extended 
+///    packed data.
 static __inline int __DEFAULT_FN_ATTRS
 _mm256_extract_epi16(__m256i __a, const int __imm)
 {
@@ -438,6 +1903,52 @@
   return __b[__imm & 15];
 }
 
+/// \brief Extracts 8 bits of extended packed data from a 256-bit integer 
+///    vector and copies it to the destination, as specified by the integer 
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VEXTRACTF128+COMPOSITE instruction.
+///
+/// \param __a
+///    A 256-bit integer vector.
+/// \param __imm
+///    Determines which bits are extracted using bits [4:0]:
+///    00000: Bits [7:0] are copied to the destination.
+///    00001: Bits [15:8] are copied to the destination.
+///    00010: Bits [23:16] are copied to the destination.
+///    00011: Bits [31:24] are copied to the destination.
+///    00100: Bits [39:32] are copied to the destination.
+///    00101: Bits [47:40] are copied to the destination.
+///    00110: Bits [55:48] are copied to the destination.
+///    00111: Bits [63:56] are copied to the destination.
+///    01000: Bits [71:64] are copied to the destination.
+///    01001: Bits [79:72] are copied to the destination.
+///    01010: Bits [87:80] are copied to the destination.
+///    01011: Bits [95:88] are copied to the destination.
+///    01100: Bits [103:96] are copied to the destination.
+///    01101: Bits [111:104] are copied to the destination.
+///    01110: Bits [119:112] are copied to the destination.
+///    01111: Bits [127:120] are copied to the destination.
+///    10000: Bits [135:128] are copied to the destination.
+///    10001: Bits [143:136] are copied to the destination.
+///    10010: Bits [151:144] are copied to the destination.
+///    10011: Bits [159:152] are copied to the destination.
+///    10100: Bits [167:160] are copied to the destination.
+///    10101: Bits [175:168] are copied to the destination.
+///    10110: Bits [183:176] are copied to the destination.
+///    10111: Bits [191:184] are copied to the destination.
+///    11000: Bits [199:192] are copied to the destination.
+///    11001: Bits [207:200] are copied to the destination.
+///    11010: Bits [215:208] are copied to the destination.
+///    11011: Bits [223:216] are copied to the destination.
+///    11100: Bits [231:224] are copied to the destination.
+///    11101: Bits [239:232] are copied to the destination.
+///    11110: Bits [247:240] are copied to the destination.
+///    11111: Bits [255:248] are copied to the destination.
+/// \returns A 32-bit integer containing the extracted 8 bits of extended 
+///    packed data.
 static __inline int __DEFAULT_FN_ATTRS
 _mm256_extract_epi8(__m256i __a, const int __imm)
 {
@@ -446,6 +1957,24 @@
 }
 
 #ifdef __x86_64__
+/// \brief Extracts 64 bits of extended packed data from a 256-bit integer 
+///    vector and copies it to the destination, as specified by the integer 
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VEXTRACTF128+COMPOSITE instruction.
+///
+/// \param __a
+///    A 256-bit integer vector.
+/// \param __imm
+///    Determines which bits are extracted using bits [1:0]:
+///    00: Bits [63:0] are copied to the destination.
+///    01: Bits [127:64] are copied to the destination.
+///    10: Bits [191:128] are copied to the destination.
+///    11: Bits [255:192] are copied to the destination.
+/// \returns A 64-bit integer containing the extracted 64 bits of extended 
+///    packed data.
 static __inline long long  __DEFAULT_FN_ATTRS
 _mm256_extract_epi64(__m256i __a, const int __imm)
 {
@@ -454,6 +1983,35 @@
 }
 #endif
 
+/// \brief Combines 224 bits of extended packed data from the 256-bit integer
+///    vector operand with 32 bits of extended packed data from the 32-bit
+///    integer operand and copies them to the destination, using the offset
+///    specified by the integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VINSERTF128+COMPOSITE instruction.
+///
+/// \param __a
+///    A 256-bit integer vector. The remaining bits in the
+///    destination are copied from the corresponding bits in this
+///    operand.
+/// \param __b
+///    An integer. The bits of this operand are written to the
+///    destination beginning at the offset specified by operand __imm.
+/// \param __imm
+///    An immediate integer used to determine which bits in the
+///    destination are used when copying the bits from operand __b:
+///    If __imm is 0, bits [31:0] are used in the destination.
+///    If __imm is 1, bits [63:32] are used in the destination.
+///    If __imm is 2, bits [95:64] are used in the destination.
+///    If __imm is 3, bits [127:96] are used in the destination.
+///    If __imm is 4, bits [159:128] are used in the destination.
+///    If __imm is 5, bits [191:160] are used in the destination.
+///    If __imm is 6, bits [223:192] are used in the destination.
+///    If __imm is 7, bits [255:224] are used in the destination.
+/// \returns A 256-bit integer vector containing the copied extended packed 
+///    data from the operands.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_insert_epi32(__m256i __a, int __b, int const __imm)
 {
@@ -462,6 +2020,43 @@
   return (__m256i)__c;
 }
 
+/// \brief Combines 240 bits of extended packed data from the 256-bit integer
+///    vector operand with 16 bits of extended packed data from the 16-bit
+///    integer operand and copies them to the destination, using the offset
+///    specified by the integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VINSERTF128+COMPOSITE instruction.
+///
+/// \param __a
+///    A 256-bit integer vector. The remaining bits in the
+///    destination are copied from the corresponding bits in this
+///    operand.
+/// \param __b
+///    An integer. The bits of this operand are written to the
+///    destination beginning at the offset specified by operand __imm.
+/// \param __imm
+///    An immediate integer used to determine which bits in the
+///    destination are used when copying the bits from operand __b:
+///    If __imm is 0, bits [15:0] are used in the destination.
+///    If __imm is 1, bits [31:16] are used in the destination.
+///    If __imm is 2, bits [47:32] are used in the destination.
+///    If __imm is 3, bits [63:48] are used in the destination.
+///    If __imm is 4, bits [79:64] are used in the destination.
+///    If __imm is 5, bits [95:80] are used in the destination.
+///    If __imm is 6, bits [111:96] are used in the destination.
+///    If __imm is 7, bits [127:112] are used in the destination.
+///    If __imm is 8, bits [143:128] are used in the destination.
+///    If __imm is 9, bits [159:144] are used in the destination.
+///    If __imm is 10, bits [175:160] are used in the destination.
+///    If __imm is 11, bits [191:176] are used in the destination.
+///    If __imm is 12, bits [207:192] are used in the destination.
+///    If __imm is 13, bits [223:208] are used in the destination.
+///    If __imm is 14, bits [239:224] are used in the destination.
+///    If __imm is 15, bits [255:240] are used in the destination.
+/// \returns A 256-bit integer vector containing the copied extended packed 
+///    data from the operands.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_insert_epi16(__m256i __a, int __b, int const __imm)
 {
@@ -470,6 +2065,29 @@
   return (__m256i)__c;
 }
 
+/// \brief Combines 248 bits of extended packed data from the 256-bit integer
+///    vector operand with 8 bits of extended packed data from the 8-bit
+///    integer operand and copies them to the destination, using the offset
+///    specified by the integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VINSERTF128+COMPOSITE instruction.
+///
+/// \param __a
+///    A 256-bit integer vector. The remaining bits in the
+///    destination are copied from the corresponding bits in this
+///    operand.
+/// \param __b
+///    An integer. The bits of this operand are written to the
+///    destination beginning at the offset specified by operand __imm.
+/// \param __imm
+///    An immediate integer used to determine which bits in the
+///    destination are used when copying the bits from operand __b:
+///    bits [8*__imm+7:8*__imm] are used in the destination, where
+///    __imm is a value in the range [0, 31].
+/// \returns A 256-bit integer vector containing the copied extended packed 
+///    data from the operands.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_insert_epi8(__m256i __a, int __b, int const __imm)
 {
@@ -479,6 +2097,32 @@
 }
 
 #ifdef __x86_64__
+/// \brief Combines 192 bits of extended packed data from the 256-bit integer
+///    vector operand with 64 bits of extended packed data from the 64-bit
+///    integer operand and copies them to the destination, using the offset
+///    specified by the integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VINSERTF128+COMPOSITE instruction.
+///
+/// \param __a
+///    A 256-bit integer vector. The remaining bits in the
+///    destination are copied from the corresponding bits in this
+///    operand.
+/// \param __b
+///    A 64-bit integer. The bits of this operand are written to
+///    the destination beginning at the offset specified by operand
+///    __imm.
+/// \param __imm
+///    An immediate integer used to determine which bits in the
+///    destination are used when copying the bits from operand __b:
+///    If __imm is 0, bits [63:0] are used in the destination.
+///    If __imm is 1, bits [127:64] are used in the destination.
+///    If __imm is 2, bits [191:128] are used in the destination.
+///    If __imm is 3, bits [255:192] are used in the destination.
+/// \returns A 256-bit integer vector containing the copied extended packed
+///    data from the operands.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_insert_epi64(__m256i __a, long long __b, int const __imm)
 {
@@ -489,48 +2159,125 @@
 #endif
 
 /* Conversion */
+/// \brief Converts a vector of [4 x i32] into a vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCVTDQ2PD instruction.
+///
+/// \param __a
+///    A 128-bit integer vector of [4 x i32].
+/// \returns A 256-bit vector of [4 x double] containing the converted values.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_cvtepi32_pd(__m128i __a)
 {
   return (__m256d)__builtin_ia32_cvtdq2pd256((__v4si) __a);
 }
 
+/// \brief Converts a vector of [8 x i32] into a vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCVTDQ2PS instruction.
+///
+/// \param __a
+///    A 256-bit integer vector.
+/// \returns A 256-bit vector of [8 x float] containing the converted values.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_cvtepi32_ps(__m256i __a)
 {
   return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a);
 }
 
+/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of 
+///    [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCVTPD2PS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double].
+/// \returns A 128-bit vector of [4 x float] containing the converted values.
 static __inline __m128 __DEFAULT_FN_ATTRS
 _mm256_cvtpd_ps(__m256d __a)
 {
   return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
 }
 
+/// \brief Converts a vector of [8 x float] into a vector of [8 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCVTPS2DQ instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \returns A 256-bit integer vector containing the converted values.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_cvtps_epi32(__m256 __a)
 {
   return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
 }
 
+/// \brief Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4 
+///    x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCVTPS2PD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 256-bit vector of [4 x double] containing the converted values.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_cvtps_pd(__m128 __a)
 {
   return (__m256d)__builtin_ia32_cvtps2pd256((__v4sf) __a);
 }
 
+/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of 
+///    [4 x i32], truncating the result when it is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCVTTPD2DQ instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double].
+/// \returns A 128-bit integer vector containing the converted values.
 static __inline __m128i __DEFAULT_FN_ATTRS
 _mm256_cvttpd_epi32(__m256d __a)
 {
   return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
 }
 
+/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of 
+///    [4 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCVTPD2DQ instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double].
+/// \returns A 128-bit integer vector containing the converted values.
 static __inline __m128i __DEFAULT_FN_ATTRS
 _mm256_cvtpd_epi32(__m256d __a)
 {
   return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
 }
 
+/// \brief Converts a vector of [8 x float] into a vector of [8 x i32],
+///    truncating the result when it is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCVTTPS2DQ instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \returns A 256-bit integer vector containing the converted values.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_cvttps_epi32(__m256 __a)
 {
@@ -538,18 +2285,74 @@
 }
 
 /* Vector replicate */
+/// \brief Moves and duplicates high-order (odd-indexed) values from a 256-bit
+///    vector of [8 x float] to float values stored in a packed 256-bit
+///    vector of [8 x float].
+///    Bits [255:224] of the source are written to bits [255:224] and
+///    [223:192] of the destination.
+///    Bits [191:160] of the source are written to bits [191:160] and
+///    [159:128] of the destination.
+///    Bits [127:96] of the source are written to bits [127:96] and [95:64]
+///    of the destination.
+///    Bits [63:32] of the source are written to bits [63:32] and [31:0] of
+///    the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVSHDUP instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \returns A 256-bit vector of [8 x float] containing the moved and 
+///    duplicated values.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_movehdup_ps(__m256 __a)
 {
   return __builtin_shufflevector(__a, __a, 1, 1, 3, 3, 5, 5, 7, 7);
 }
 
+/// \brief Moves and duplicates low-order (even-indexed) values from a 256-bit
+///    vector of [8 x float] to float values stored in a packed 256-bit
+///    vector of [8 x float].
+///    Bits [223:192] of the source are written to bits [255:224] and
+///    [223:192] of the destination.
+///    Bits [159:128] of the source are written to bits [191:160] and
+///    [159:128] of the destination.
+///    Bits [95:64] of the source are written to bits [127:96] and [95:64] of
+///    the destination.
+///    Bits [31:0] of the source are written to bits [63:32] and [31:0] of
+///    the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVSLDUP instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \returns A 256-bit vector of [8 x float] containing the moved and 
+///    duplicated values.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_moveldup_ps(__m256 __a)
 {
   return __builtin_shufflevector(__a, __a, 0, 0, 2, 2, 4, 4, 6, 6);
 }
 
+/// \brief Moves and duplicates double-precision values from a 256-bit vector 
+///    of [4 x double] to double-precision values stored in a packed 256-bit
+///    vector of [4 x double].
+///    Bits [63:0] of the source are written to bits [127:64] and [63:0] of
+///    the destination.
+///    Bits [191:128] of the source are written to bits [255:192] and
+///    [191:128] of the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVDDUP instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double].
+/// \returns A 256-bit vector of [4 x double] containing the moved and 
+///    duplicated values.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_movedup_pd(__m256d __a)
 {
@@ -557,24 +2360,118 @@
 }
 
 /* Unpack and Interleave */
+/// \brief Unpacks the high-order (odd-indexed) double-precision values from 
+///    two 256-bit vectors of [4 x double] and interleaves them into a packed
+///    256-bit vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VUNPCKHPD instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double].
+///    Bits [127:64] are written to bits [63:0] of the destination.
+///    Bits [255:192] are written to bits [191:128] of the
+///    destination.
+/// \param __b
+///    A 256-bit vector of [4 x double].
+///    Bits [127:64] are written to bits [127:64] of the
+///    destination.
+///    Bits [255:192] are written to bits [255:192] of the
+///    destination.
+/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_unpackhi_pd(__m256d __a, __m256d __b)
 {
   return __builtin_shufflevector(__a, __b, 1, 5, 1+2, 5+2);
 }
 
+/// \brief Unpacks the low-order (even-indexed) double-precision values from 
+///    two 256-bit vectors of [4 x double] and interleaves them into a packed
+///    256-bit vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VUNPCKLPD instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double].
+///    Bits [63:0] are written to bits [63:0] of the destination.
+///    Bits [191:128] are written to bits [191:128] of the
+///    destination.
+/// \param __b
+///    A 256-bit vector of [4 x double].
+///    Bits [63:0] are written to bits [127:64] of the destination.
+///    Bits [191:128] are written to bits [255:192] of the
+///    destination.
+/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_unpacklo_pd(__m256d __a, __m256d __b)
 {
   return __builtin_shufflevector(__a, __b, 0, 4, 0+2, 4+2);
 }
 
+/// \brief Unpacks the high-order values (elements 2, 3, 6, and 7: the upper
+///    two elements of each 128-bit half) from two 256-bit vectors of
+///    [8 x float] and interleaves them into a packed 256-bit vector of
+///    [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VUNPCKHPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+///    Bits [95:64] are written to bits [31:0] of the destination.
+///    Bits [127:96] are written to bits [95:64] of the
+///    destination.
+///    Bits [223:192] are written to bits [159:128] of the
+///    destination.
+///    Bits [255:224] are written to bits [223:192] of the
+///    destination.
+/// \param __b
+///    A 256-bit vector of [8 x float].
+///    Bits [95:64] are written to bits [63:32] of the destination.
+///    Bits [127:96] are written to bits [127:96] of the
+///    destination.
+///    Bits [223:192] are written to bits [191:160] of the
+///    destination.
+///    Bits [255:224] are written to bits [255:224] of the
+///    destination.
+/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_unpackhi_ps(__m256 __a, __m256 __b)
 {
   return __builtin_shufflevector(__a, __b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
 }
 
+/// \brief Unpacks the low-order values (elements 0, 1, 4, and 5: the lower
+///    two elements of each 128-bit half) from two 256-bit vectors of
+///    [8 x float] and interleaves them into a packed 256-bit vector of
+///    [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VUNPCKLPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+///    Bits [31:0] are written to bits [31:0] of the destination.
+///    Bits [63:32] are written to bits [95:64] of the destination.
+///    Bits [159:128] are written to bits [159:128] of the
+///    destination.
+///    Bits [191:160] are written to bits [223:192] of the
+///    destination.
+/// \param __b
+///    A 256-bit vector of [8 x float].
+///    Bits [31:0] are written to bits [63:32] of the destination.
+///    Bits [63:32] are written to bits [127:96] of the
+///    destination.
+///    Bits [159:128] are written to bits [191:160] of the
+///    destination.
+///    Bits [191:160] are written to bits [255:224] of the
+///    destination.
+/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_unpacklo_ps(__m256 __a, __m256 __b)
 {
@@ -582,90 +2479,302 @@
 }
 
 /* Bit Test */
+/// \brief Tests whether the specified sign bits in a 128-bit vector of [2 x
+///    double] are all zeros.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VTESTPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the sign bits to
+///    be tested.
+/// \param __b
+///    A 128-bit vector of [2 x double] selecting which sign bits
+///    to test in operand __a.
+/// \returns TRUE if the specified sign bits are all zeros; FALSE otherwise.
 static __inline int __DEFAULT_FN_ATTRS
 _mm_testz_pd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
 }
 
+/// \brief Tests whether the specified sign bits in a 128-bit vector of [2 x
+///    double] are all ones.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VTESTPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the sign bits to
+///    be tested.
+/// \param __b
+///    A 128-bit vector of [2 x double] selecting which sign bits
+///    to test in operand __a.
+/// \returns TRUE if the specified sign bits are all ones; FALSE otherwise.
 static __inline int __DEFAULT_FN_ATTRS
 _mm_testc_pd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
 }
 
+/// \brief Tests whether the specified sign bits in a 128-bit vector of [2 x
+///    double] are neither all zeros nor all ones.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VTESTPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the sign bits to
+///    be tested.
+/// \param __b
+///    A 128-bit vector of [2 x double] selecting which sign bits
+///    to test in operand __a.
+/// \returns TRUE if the specified sign bits are neither all zeros nor all 
+///    ones; FALSE otherwise.
 static __inline int __DEFAULT_FN_ATTRS
 _mm_testnzc_pd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
 }
 
+/// \brief Tests whether the specified sign bits in a 128-bit vector of [4 x
+///    float] are all zeros.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VTESTPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the sign bits to
+///    be tested.
+/// \param __b
+///    A 128-bit vector of [4 x float] selecting which sign bits to
+///    test in operand __a.
+/// \returns TRUE if the specified sign bits are all zeros; FALSE otherwise.
 static __inline int __DEFAULT_FN_ATTRS
 _mm_testz_ps(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
 }
 
+/// \brief Tests whether the specified sign bits in a 128-bit vector of [4 x
+///    float] are all ones.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VTESTPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the sign bits to
+///    be tested.
+/// \param __b
+///    A 128-bit vector of [4 x float] selecting which sign bits to
+///    test in operand __a.
+/// \returns TRUE if the specified sign bits are all ones; FALSE otherwise.
 static __inline int __DEFAULT_FN_ATTRS
 _mm_testc_ps(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
 }
 
+/// \brief Tests whether the specified sign bits in a 128-bit vector of [4 x
+///    float] are neither all zeros nor all ones.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VTESTPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the sign bits to
+///    be tested.
+/// \param __b
+///    A 128-bit vector of [4 x float] selecting which sign bits to
+///    test in operand __a.
+/// \returns TRUE if the specified sign bits are neither all zeros nor all 
+///    ones; FALSE otherwise.
 static __inline int __DEFAULT_FN_ATTRS
 _mm_testnzc_ps(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
 }
 
+/// \brief Tests whether the specified sign bits in a 256-bit vector of [4 x
+///    double] are all zeros.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VTESTPD instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing the sign bits to
+///    be tested.
+/// \param __b
+///    A 256-bit vector of [4 x double] selecting which sign bits
+///    to test in operand __a.
+/// \returns TRUE if the specified sign bits are all zeros; FALSE otherwise.
 static __inline int __DEFAULT_FN_ATTRS
 _mm256_testz_pd(__m256d __a, __m256d __b)
 {
   return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
 }
 
+/// \brief Tests whether the specified sign bits in a 256-bit vector of [4 x
+///    double] are all ones.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VTESTPD instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing the sign bits to
+///    be tested.
+/// \param __b
+///    A 256-bit vector of [4 x double] selecting which sign bits
+///    to test in operand __a.
+/// \returns TRUE if the specified sign bits are all ones; FALSE otherwise.
 static __inline int __DEFAULT_FN_ATTRS
 _mm256_testc_pd(__m256d __a, __m256d __b)
 {
   return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
 }
 
+/// \brief Tests whether the specified sign bits in a 256-bit vector of [4 x
+///    double] are neither all zeros nor all ones.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VTESTPD instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing the sign bits to
+///    be tested.
+/// \param __b
+///    A 256-bit vector of [4 x double] selecting which sign bits
+///    to test in operand __a.
+/// \returns TRUE if the specified sign bits are neither all zeros nor all 
+///    ones; FALSE otherwise.
 static __inline int __DEFAULT_FN_ATTRS
 _mm256_testnzc_pd(__m256d __a, __m256d __b)
 {
   return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
 }
 
+/// \brief Tests whether the specified sign bits in a 256-bit vector of [8 x
+///    float] are all zeros.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VTESTPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the sign bits to
+///    be tested.
+/// \param __b
+///    A 256-bit vector of [8 x float] selecting which sign bits to
+///    test in operand __a.
+/// \returns TRUE if the specified sign bits are all zeros; FALSE otherwise.
 static __inline int __DEFAULT_FN_ATTRS
 _mm256_testz_ps(__m256 __a, __m256 __b)
 {
   return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
 }
 
+/// \brief Tests whether the specified sign bits in a 256-bit vector of [8 x
+///    float] are all ones.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VTESTPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the sign bits to
+///    be tested.
+/// \param __b
+///    A 256-bit vector of [8 x float] selecting which sign bits to
+///    test in operand __a.
+/// \returns TRUE if the specified sign bits are all ones; FALSE otherwise.
 static __inline int __DEFAULT_FN_ATTRS
 _mm256_testc_ps(__m256 __a, __m256 __b)
 {
   return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
 }
 
+/// \brief Tests whether the specified sign bits in a 256-bit vector of [8 x
+///    float] are neither all zeros nor all ones.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VTESTPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the sign bits to
+///    be tested.
+/// \param __b
+///    A 256-bit vector of [8 x float] selecting which sign bits to
+///    test in operand __a.
+/// \returns TRUE if the specified sign bits are neither all zeros nor all 
+///    ones; FALSE otherwise.
 static __inline int __DEFAULT_FN_ATTRS
 _mm256_testnzc_ps(__m256 __a, __m256 __b)
 {
   return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
 }
 
+/// \brief Tests whether the specified bits in a 256-bit integer vector are all
+///    zeros.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPTEST instruction.
+///
+/// \param __a
+///    A 256-bit integer vector containing the bits to be tested.
+/// \param __b
+///    A 256-bit integer vector selecting which bits to test in
+///    operand __a.
+/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
 static __inline int __DEFAULT_FN_ATTRS
 _mm256_testz_si256(__m256i __a, __m256i __b)
 {
   return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
 }
 
+/// \brief Tests whether the specified bits in a 256-bit integer vector are all
+///    ones.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPTEST instruction.
+///
+/// \param __a
+///    A 256-bit integer vector containing the bits to be tested.
+/// \param __b
+///    A 256-bit integer vector selecting which bits to test in
+///    operand __a.
+/// \returns TRUE if the specified bits are all ones; FALSE otherwise.
 static __inline int __DEFAULT_FN_ATTRS
 _mm256_testc_si256(__m256i __a, __m256i __b)
 {
   return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
 }
 
+/// \brief Tests whether the specified bits in a 256-bit integer vector are
+///    neither all zeros nor all ones.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPTEST instruction.
+///
+/// \param __a
+///    A 256-bit integer vector containing the bits to be tested.
+/// \param __b
+///    A 256-bit integer vector selecting which bits to test in
+///    operand __a.
+/// \returns TRUE if the specified bits are neither all zeros nor all ones; 
+///    FALSE otherwise.
 static __inline int __DEFAULT_FN_ATTRS
 _mm256_testnzc_si256(__m256i __a, __m256i __b)
 {
@@ -673,12 +2782,38 @@
 }
 
 /* Vector extract sign mask */
+/// \brief Extracts the sign bit of each packed double-precision value
+///    in a 256-bit vector of [4 x double] and writes the four
+///    extracted sign bits to the low-order bits [3:0] of the integer
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVMSKPD instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] containing the
+///    double-precision values with sign bits to be extracted.
+/// \returns The sign bits from the operand, written to bits [3:0].
 static __inline int __DEFAULT_FN_ATTRS
 _mm256_movemask_pd(__m256d __a)
 {
   return __builtin_ia32_movmskpd256((__v4df)__a);
 }
 
+/// \brief Extracts the sign bit of each packed float value in a
+///    256-bit vector of [8 x float] and writes the eight extracted
+///    sign bits to the low-order bits [7:0] of the integer
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVMSKPS instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the
+///    float values with sign bits to be extracted.
+/// \returns The sign bits from the operand, written to bits [7:0].
 static __inline int __DEFAULT_FN_ATTRS
 _mm256_movemask_ps(__m256 __a)
 {
@@ -686,12 +2821,24 @@
 }
 
 /* Vector __zero */
+/// \brief Clears all the YMM registers.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VZEROALL instruction.
+///
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_zeroall(void)
 {
   __builtin_ia32_vzeroall();
 }
 
+/// \brief Clears the upper octword of all the YMM registers.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VZEROUPPER instruction.
+///
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_zeroupper(void)
 {
@@ -699,6 +2846,17 @@
 }
 
 /* Vector load with broadcast */
+/// \brief Loads a float value and writes it to 32-bit elements in a 128-bit
+///    vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VBROADCASTSS instruction.
+///
+/// \param __a
+///    A pointer to the float value to be broadcast.
+/// \returns A 128-bit vector of [4 x float] whose 32-bit values each contain 
+///    the broadcast value.
 static __inline __m128 __DEFAULT_FN_ATTRS
 _mm_broadcast_ss(float const *__a)
 {
@@ -706,6 +2864,17 @@
   return (__m128)(__v4sf){ __f, __f, __f, __f };
 }
 
+/// \brief Loads a double-precision value and writes it to 64-bit elements in a
+///    256-bit vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VBROADCASTSD instruction.
+///
+/// \param __a
+///    A pointer to the double-precision value to be broadcast.
+/// \returns A 256-bit vector of [4 x double] whose 64-bit values each contain 
+///    the broadcast value.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_broadcast_sd(double const *__a)
 {
@@ -713,6 +2882,17 @@
   return (__m256d)(__v4df){ __d, __d, __d, __d };
 }
 
+/// \brief Loads a float value and writes it to 32-bit elements in a 256-bit
+///    vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VBROADCASTSS instruction.
+///
+/// \param __a
+///    A pointer to the float value to be broadcast.
+/// \returns A 256-bit vector of [8 x float] whose 32-bit values each contain 
+///    the broadcast value.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_broadcast_ss(float const *__a)
 {
@@ -720,12 +2900,34 @@
   return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
 }
 
+/// \brief Loads the data from a 128-bit vector of [2 x double] and writes it 
+///    to 128-bit elements in a 256-bit vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VBROADCASTF128 instruction.
+///
+/// \param __a
+///    A pointer to the 128-bit vector of [2 x double] to be broadcast.
+/// \returns A 256-bit vector of [4 x double] whose 128-bit elements each 
+///    contain the broadcast value.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_broadcast_pd(__m128d const *__a)
 {
   return (__m256d)__builtin_ia32_vbroadcastf128_pd256(__a);
 }
 
+/// \brief Loads the data from a 128-bit vector of [4 x float] and writes it to
+///    128-bit elements in a 256-bit vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VBROADCASTF128 instruction.
+///
+/// \param __a
+///    A pointer to the 128-bit vector of [4 x float] to be broadcast.
+/// \returns A 256-bit vector of [8 x float] whose 128-bit elements each 
+///    contain the broadcast value.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_broadcast_ps(__m128 const *__a)
 {
@@ -733,18 +2935,51 @@
 }
 
 /* SIMD load ops */
+/// \brief Moves packed double-precision values from an aligned memory location
+///    to 64-bit elements in a 256-bit vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVAPD instruction.
+///
+/// \param __p
+///    A 32-byte aligned pointer to a memory location containing
+///    double-precision values.
+/// \returns A 256-bit vector of [4 x double] containing the moved values.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_load_pd(double const *__p)
 {
   return *(__m256d *)__p;
 }
 
+/// \brief Moves packed float values from an aligned memory location to 32-bit
+///    elements in a 256-bit vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVAPS instruction.
+///
+/// \param __p
+///    A 32-byte aligned pointer to a memory location containing
+///    float values.
+/// \returns A 256-bit vector of [8 x float] containing the moved values.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_load_ps(float const *__p)
 {
   return *(__m256 *)__p;
 }
 
+/// \brief Moves packed double-precision values from an unaligned memory 
+///    location to 64-bit elements in a 256-bit vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVUPD instruction.
+///
+/// \param __p
+///    A pointer to a memory location containing double-precision
+///    values.
+/// \returns A 256-bit vector of [4 x double] containing the moved values.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_loadu_pd(double const *__p)
 {
@@ -754,6 +2989,16 @@
   return ((struct __loadu_pd*)__p)->__v;
 }
 
+/// \brief Moves packed float values from an unaligned memory location to 
+///    32-bit elements in a 256-bit vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVUPS instruction.
+///
+/// \param __p
+///    A pointer to a memory location containing float values.
+/// \returns A 256-bit vector of [8 x float] containing the moved values.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_loadu_ps(float const *__p)
 {
@@ -763,12 +3008,34 @@
   return ((struct __loadu_ps*)__p)->__v;
 }
 
+/// \brief Moves integer values from an aligned memory location to elements in 
+///    a 256-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVDQA instruction.
+///
+/// \param __p
+///    A 32-byte aligned pointer to a 256-bit integer vector
+///    containing integer values.
+/// \returns A 256-bit integer vector containing the moved values.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_load_si256(__m256i const *__p)
 {
   return *__p;
 }
 
+/// \brief Moves integer values from an unaligned memory location to elements 
+///    in a 256-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVDQU instruction.
+///
+/// \param __p
+///    A pointer to a 256-bit integer vector containing integer
+///    values.
+/// \returns A 256-bit integer vector containing the moved values.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_loadu_si256(__m256i const *__p)
 {
@@ -778,6 +3045,18 @@
   return ((struct __loadu_si256*)__p)->__v;
 }
 
+/// \brief Moves integer values from an unaligned memory location to elements 
+///    in a 256-bit integer vector. The instruction may read 32 bytes to
+///    retrieve either or both of the first and second parts of the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VLDDQU instruction.
+///
+/// \param __p
+///    A pointer to a 256-bit integer vector containing integer
+///    values.
+/// \returns A 256-bit integer vector containing the moved values.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_lddqu_si256(__m256i const *__p)
 {
@@ -785,36 +3064,112 @@
 }
 
 /* SIMD store ops */
+/// \brief Moves packed double-precision values from a 256-bit vector of [4 x
+///    double] to an aligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVAPD instruction.
+///
+/// \param __p
+///    A 32-byte aligned pointer to a memory location that will
+///    receive the double-precision values.
+/// \param __a
+///    A 256-bit vector of [4 x double] containing the values to be
+///    moved.
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_store_pd(double *__p, __m256d __a)
 {
   *(__m256d *)__p = __a;
 }
 
+/// \brief Moves packed float values from a 256-bit vector of [8 x float] to an
+///    aligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVAPS instruction.
+///
+/// \param __p
+///    A 32-byte aligned pointer to a memory location that will
+///    receive the float values.
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the values to be
+///    moved.
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_store_ps(float *__p, __m256 __a)
 {
   *(__m256 *)__p = __a;
 }
 
+/// \brief Moves packed double-precision values from a 256-bit vector of [4 x
+///    double] to an unaligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVUPD instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the
+///    double-precision values.
+/// \param __a
+///    A 256-bit vector of [4 x double] containing the values to be
+///    moved.
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_storeu_pd(double *__p, __m256d __a)
 {
   __builtin_ia32_storeupd256(__p, (__v4df)__a);
 }
 
+/// \brief Moves packed float values from a 256-bit vector of [8 x float] to an
+///    unaligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVUPS instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the float
+///    values.
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the values to be
+///    moved.
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_storeu_ps(float *__p, __m256 __a)
 {
   __builtin_ia32_storeups256(__p, (__v8sf)__a);
 }
 
+/// \brief Moves integer values from a 256-bit integer vector to an aligned
+///    memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVDQA instruction.
+///
+/// \param __p
+///    A 32-byte aligned pointer to a memory location that will
+///    receive the integer values.
+/// \param __a
+///    A 256-bit integer vector containing the values to be moved.
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_store_si256(__m256i *__p, __m256i __a)
 {
   *__p = __a;
 }
 
+/// \brief Moves integer values from a 256-bit integer vector to an unaligned
+///    memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVDQU instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the integer
+///    values.
+/// \param __a
+///    A 256-bit integer vector containing the values to be moved.
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_storeu_si256(__m256i *__p, __m256i __a)
 {
@@ -822,12 +3177,48 @@
 }
 
 /* Conditional load ops */
+/// \brief Loads packed double-precision values from a memory location storing
+///    64-bit double-precision values to a 128-bit vector of [2 x double],
+///    according to the specified mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMASKMOVPD instruction.
+///
+/// \param __p
+///    A pointer to a memory location that contains the
+///    double-precision values.
+/// \param __m
+///    A 128-bit integer vector containing the mask. The
+///    most significant bit of each data element represents the
+///    mask bits. If a mask bit is zero, the corresponding value in
+///    the memory location is not loaded and the corresponding
+///    field in the destination vector is set to zero.
+/// \returns A 128-bit vector of [2 x double] containing the loaded values.
 static __inline __m128d __DEFAULT_FN_ATTRS
 _mm_maskload_pd(double const *__p, __m128i __m)
 {
   return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
 }
 
+/// \brief Loads packed double-precision values from a memory location storing
+///    64-bit double-precision values to a 256-bit vector of [4 x double],
+///    according to the specified mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMASKMOVPD instruction.
+///
+/// \param __p
+///    A pointer to a memory location that contains the
+///    double-precision values.
+/// \param __m
+///    A 256-bit integer vector containing the mask. The
+///    most significant bit of each data element represents the
+///    mask bits. If a mask bit is zero, the corresponding value in
+///    the memory location is not loaded and the corresponding
+///    field in the destination vector is set to zero.
+/// \returns A 256-bit vector of [4 x double] containing the loaded values.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_maskload_pd(double const *__p, __m256i __m)
 {
@@ -835,12 +3226,50 @@
                                                (__v4di)__m);
 }
 
+/// \brief Conditionally loads packed float values from a memory location
+///    storing 32-bit float values into a 128-bit vector of [4 x float].
+///    Each element is either loaded or set to zero, according to the
+///    specified mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMASKMOVPS instruction.
+///
+/// \param __p
+///    A pointer to a memory location that contains the float
+///    values.
+/// \param __m
+///    A 128-bit integer vector containing the mask. The
+///    most significant bit of each data element represents the
+///    mask bits. If a mask bit is zero, the corresponding value in
+///    the memory location is not loaded and the corresponding
+///    field in the destination vector is set to zero.
+/// \returns A 128-bit vector of [4 x float] containing the loaded values.
 static __inline __m128 __DEFAULT_FN_ATTRS
 _mm_maskload_ps(float const *__p, __m128i __m)
 {
   return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
 }
 
+/// \brief Conditionally loads packed float values from a memory location
+///    storing 32-bit float values into a 256-bit vector of [8 x float].
+///    Each element is either loaded or set to zero, according to the
+///    specified mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMASKMOVPS instruction.
+///
+/// \param __p
+///    A pointer to a memory location that contains the float
+///    values.
+/// \param __m
+///    A 256-bit integer vector containing the mask. The
+///    most significant bit of each data element represents the
+///    mask bits. If a mask bit is zero, the corresponding value in
+///    the memory location is not loaded and the corresponding
+///    field in the destination vector is set to zero.
+/// \returns A 256-bit vector of [8 x float] containing the loaded values.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_maskload_ps(float const *__p, __m256i __m)
 {
@@ -848,24 +3277,104 @@
 }
 
 /* Conditional store ops */
+/// \brief Moves packed float values from a 256-bit vector of [8 x float] to a
+///    memory location, according to the specified mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMASKMOVPS instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the float
+///    values.
+/// \param __m
+///    A 256-bit integer vector containing the mask. The
+///    most significant bit of each field in the mask vector
+///    represents the mask bits. If a mask bit is zero, the
+///    corresponding value from vector __a is
+///    not stored and the corresponding field in the destination
+///    memory location is not changed.
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the values to be
+///    stored.
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
 {
   __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
 }
 
+/// \brief Moves packed double-precision values from a 128-bit vector of [2 x
+///    double] to a memory location, according to the specified mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMASKMOVPD instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the
+///    double-precision values.
+/// \param __m
+///    A 128-bit integer vector containing the mask. The
+///    most significant bit of each field in the mask vector
+///    represents the mask bits. If a mask bit is zero, the
+///    corresponding value from vector __a is
+///    not stored and the corresponding field in the destination
+///    memory location is not changed.
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the values to be
+///    stored.
 static __inline void __DEFAULT_FN_ATTRS
 _mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
 {
   __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
 }
 
+/// \brief Moves packed double-precision values from a 256-bit vector of [4 x
+///    double] to a memory location, according to the specified mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMASKMOVPD instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the
+///    double-precision values.
+/// \param __m
+///    A 256-bit integer vector containing the mask. The
+///    most significant bit of each field in the mask vector
+///    represents the mask bits. If a mask bit is zero, the
+///    corresponding value from vector __a is
+///    not stored and the corresponding field in the destination
+///    memory location is not changed.
+/// \param __a
+///    A 256-bit vector of [4 x double] containing the values to be
+///    stored.
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
 {
   __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
 }
 
+/// \brief Moves packed float values from a 128-bit vector of [4 x float] to a
+///    memory location, according to the specified mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMASKMOVPS instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the float
+///    values.
+/// \param __m
+///    A 128-bit integer vector containing the mask. The
+///    most significant bit of each field in the mask vector
+///    represents the mask bits. If a mask bit is zero, the
+///    corresponding value from vector __a is
+///    not stored and the corresponding field in the destination
+///    memory location is not changed.
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the values to be
+///    stored.
 static __inline void __DEFAULT_FN_ATTRS
 _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
 {
@@ -873,18 +3382,60 @@
 }
 
 /* Cacheability support ops */
+/// \brief Moves packed integer values from a 256-bit integer vector
+///    to a 256-bit aligned memory location. To minimize caching,
+///    the data is flagged as non-temporal (unlikely to be used
+///    again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVNTDQ instruction.
+///
+/// \param __a
+///    A 256-bit aligned pointer to a memory location that will
+///    receive the integer values.
+/// \param __b
+///    A 256-bit integer vector containing the values to be moved.
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_stream_si256(__m256i *__a, __m256i __b)
 {
   __builtin_ia32_movntdq256((__v4di *)__a, (__v4di)__b);
 }
 
+/// \brief Moves packed double-precision values from a 256-bit vector of [4 x
+///    double] to a 256-bit aligned memory location. To minimize caching, the
+///    data is flagged as non-temporal (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVNTPD instruction.
+///
+/// \param __a
+///    A 256-bit aligned pointer to a memory location that will
+///    receive the double-precision values.
+/// \param __b
+///    A 256-bit vector of [4 x double] containing the values to be
+///    moved.
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_stream_pd(double *__a, __m256d __b)
 {
   __builtin_ia32_movntpd256(__a, (__v4df)__b);
 }
 
+/// \brief Moves packed float values from a 256-bit vector of [8 x float] to a
+///    256-bit aligned memory location. To minimize caching, the data is
+///    flagged as non-temporal (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVNTPS instruction.
+///
+/// \param __p
+///    A 256-bit aligned pointer to a memory location that will
+///    receive the float values.
+/// \param __a
+///    A 256-bit vector of [8 x float] containing the values to be
+///    moved.
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_stream_ps(float *__p, __m256 __a)
 {
@@ -910,12 +3461,62 @@
   return (__m256i)__builtin_ia32_undef256();
 }
 
+/// \brief Initializes a 256-bit vector of [4 x double] with the specified 
+///    64-bit double-precision values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __a
+///    A double-precision value used to initialize bits [255:192]
+///    of the destination vector of [4 x double].
+/// \param __b
+///    A double-precision value used to initialize bits [191:128]
+///    of the destination vector of [4 x double].
+/// \param __c
+///    A double-precision value used to initialize bits [127:64] of
+///    the destination vector of [4 x double].
+/// \param __d
+///    A double-precision value used to initialize bits [63:0] of
+///    the destination vector of [4 x double].
+/// \returns An initialized 256-bit vector of [4 x double] containing the 
+///    values provided in the operands.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_set_pd(double __a, double __b, double __c, double __d)
 {
   return (__m256d){ __d, __c, __b, __a };
 }
 
+/// \brief Initializes a 256-bit vector of [8 x float] with the specified 
+///    32-bit float values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __a
+///    A float value used to initialize the bits [255:224] of the
+///    destination vector of [8 x float].
+/// \param __b
+///    A float value used to initialize the bits [223:192] of the
+///    destination vector of [8 x float].
+/// \param __c
+///    A float value used to initialize the bits [191:160] of the
+///    destination vector of [8 x float].
+/// \param __d
+///    A float value used to initialize the bits [159:128] of the
+///    destination vector of [8 x float].
+/// \param __e
+///    A float value used to initialize the bits [127:96] of the
+///    destination vector of [8 x float].
+/// \param __f
+///    A float value used to initialize the bits [95:64] of the
+///    destination vector of [8 x float].
+/// \param __g
+///    A float value used to initialize the bits [63:32] of the
+///    destination vector of [8 x float].
+/// \param __h
+///    A float value used to initialize the bits [31:0] of the
+///    destination vector of [8 x float].
+/// \returns An initialized 256-bit vector of [8 x float] containing the values
+///    provided in the operands.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_set_ps(float __a, float __b, float __c, float __d,
               float __e, float __f, float __g, float __h)
@@ -923,6 +3524,37 @@
   return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
 }
 
+/// \brief Initializes a 256-bit integer vector with the specified integer
+///    values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __i0
+///    A 32-bit integer value used to initialize bits [255:224] of
+///    the destination vector.
+/// \param __i1
+///    A 32-bit integer value used to initialize bits [223:192] of
+///    the destination vector.
+/// \param __i2
+///    A 32-bit integer value used to initialize bits [191:160] of
+///    the destination vector.
+/// \param __i3
+///    A 32-bit integer value used to initialize bits [159:128] of
+///    the destination vector.
+/// \param __i4
+///    A 32-bit integer value used to initialize bits [127:96] of
+///    the destination vector.
+/// \param __i5
+///    A 32-bit integer value used to initialize bits [95:64] of
+///    the destination vector.
+/// \param __i6
+///    A 32-bit integer value used to initialize bits [63:32] of
+///    the destination vector.
+/// \param __i7
+///    A 32-bit integer value used to initialize bits [31:0] of the
+///    destination vector.
+/// \returns An initialized 256-bit integer vector containing the values 
+///    provided in the operands.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
                  int __i4, int __i5, int __i6, int __i7)
@@ -930,6 +3562,60 @@
   return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
 }
 
+/// \brief Initializes a 256-bit integer vector with the specified short values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __w15
+///    A 16-bit integer value used to initialize bits [255:240] of
+///    the destination vector.
+/// \param __w14
+///    A 16-bit integer value used to initialize bits [239:224] of
+///    the destination vector.
+/// \param __w13
+///    A 16-bit integer value used to initialize bits [223:208] of
+///    the destination vector.
+/// \param __w12
+///    A 16-bit integer value used to initialize bits [207:192] of
+///    the destination vector.
+/// \param __w11
+///    A 16-bit integer value used to initialize bits [191:176] of
+///    the destination vector.
+/// \param __w10
+///    A 16-bit integer value used to initialize bits [175:160] of
+///    the destination vector.
+/// \param __w09
+///    A 16-bit integer value used to initialize bits [159:144] of
+///    the destination vector.
+/// \param __w08
+///    A 16-bit integer value used to initialize bits [143:128] of
+///    the destination vector.
+/// \param __w07
+///    A 16-bit integer value used to initialize bits [127:112] of
+///    the destination vector.
+/// \param __w06
+///    A 16-bit integer value used to initialize bits [111:96] of
+///    the destination vector.
+/// \param __w05
+///    A 16-bit integer value used to initialize bits [95:80] of
+///    the destination vector.
+/// \param __w04
+///    A 16-bit integer value used to initialize bits [79:64] of
+///    the destination vector.
+/// \param __w03
+///    A 16-bit integer value used to initialize bits [63:48] of
+///    the destination vector.
+/// \param __w02
+///    A 16-bit integer value used to initialize bits [47:32] of
+///    the destination vector.
+/// \param __w01
+///    A 16-bit integer value used to initialize bits [31:16] of
+///    the destination vector.
+/// \param __w00
+///    A 16-bit integer value used to initialize bits [15:0] of the
+///    destination vector.
+/// \returns An initialized 256-bit integer vector containing the values 
+///    provided in the operands.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
                  short __w11, short __w10, short __w09, short __w08,
@@ -940,6 +3626,76 @@
     __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
 }
 
+/// \brief Initializes a 256-bit integer vector with the specified char values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __b31
+///    Initializes bits [255:248] of the destination vector.
+/// \param __b30
+///    Initializes bits [247:240] of the destination vector.
+/// \param __b29
+///    Initializes bits [239:232] of the destination vector.
+/// \param __b28
+///    Initializes bits [231:224] of the destination vector.
+/// \param __b27
+///    Initializes bits [223:216] of the destination vector.
+/// \param __b26
+///    Initializes bits [215:208] of the destination vector.
+/// \param __b25
+///    Initializes bits [207:200] of the destination vector.
+/// \param __b24
+///    Initializes bits [199:192] of the destination vector.
+/// \param __b23
+///    Initializes bits [191:184] of the destination vector.
+/// \param __b22
+///    Initializes bits [183:176] of the destination vector.
+/// \param __b21
+///    Initializes bits [175:168] of the destination vector.
+/// \param __b20
+///    Initializes bits [167:160] of the destination vector.
+/// \param __b19
+///    Initializes bits [159:152] of the destination vector.
+/// \param __b18
+///    Initializes bits [151:144] of the destination vector.
+/// \param __b17
+///    Initializes bits [143:136] of the destination vector.
+/// \param __b16
+///    Initializes bits [135:128] of the destination vector.
+/// \param __b15
+///    Initializes bits [127:120] of the destination vector.
+/// \param __b14
+///    Initializes bits [119:112] of the destination vector.
+/// \param __b13
+///    Initializes bits [111:104] of the destination vector.
+/// \param __b12
+///    Initializes bits [103:96] of the destination vector.
+/// \param __b11
+///    Initializes bits [95:88] of the destination vector.
+/// \param __b10
+///    Initializes bits [87:80] of the destination vector.
+/// \param __b09
+///    Initializes bits [79:72] of the destination vector.
+/// \param __b08
+///    Initializes bits [71:64] of the destination vector.
+/// \param __b07
+///    Initializes bits [63:56] of the destination vector.
+/// \param __b06
+///    Initializes bits [55:48] of the destination vector.
+/// \param __b05
+///    Initializes bits [47:40] of the destination vector.
+/// \param __b04
+///    Initializes bits [39:32] of the destination vector.
+/// \param __b03
+///    Initializes bits [31:24] of the destination vector.
+/// \param __b02
+///    Initializes bits [23:16] of the destination vector.
+/// \param __b01
+///    Initializes bits [15:8] of the destination vector.
+/// \param __b00
+///    Initializes bits [7:0] of the destination vector.
+/// \returns An initialized 256-bit integer vector containing the values 
+///    provided in the operands.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
                 char __b27, char __b26, char __b25, char __b24,
@@ -958,6 +3714,25 @@
   };
 }
 
+/// \brief Initializes a 256-bit integer vector with the specified 64-bit 
+///    integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __a
+///    A 64-bit integer value used to initialize bits [255:192] of
+///    the destination vector of [4 x i64].
+/// \param __b
+///    A 64-bit integer value used to initialize bits [191:128] of
+///    the destination vector of [4 x i64].
+/// \param __c
+///    A 64-bit integer value used to initialize bits [127:64] of
+///    the destination vector of [4 x i64].
+/// \param __d
+///    A 64-bit integer value used to initialize bits [63:0] of the
+///    destination vector of [4 x i64].
+/// \returns An initialized 256-bit integer vector containing the values 
+///    provided in the operands.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
 {
@@ -965,12 +3740,66 @@
 }
 
 /* Create vectors with elements in reverse order */
+/// \brief Initializes a 256-bit vector of [4 x double] with the
+///    specified 64-bit double-precision values, storing the first
+///    two operands in the lower bits and the second two operands
+///    in the upper bits.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __a
+///    A double-precision value used to initialize bits [63:0] of
+///    the destination vector of [4 x double].
+/// \param __b
+///    A double-precision value used to initialize bits [127:64] of
+///    the destination vector of [4 x double].
+/// \param __c
+///    A double-precision value used to initialize bits [191:128]
+///    of the destination vector of [4 x double].
+/// \param __d
+///    A double-precision value used to initialize bits [255:192]
+///    of the destination vector of [4 x double].
+/// \returns An initialized 256-bit vector of [4 x double] containing the 
+///    values provided in the operands.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_setr_pd(double __a, double __b, double __c, double __d)
 {
   return (__m256d){ __a, __b, __c, __d };
 }
 
+/// \brief Initializes a 256-bit vector of [8 x float] with the
+///    specified 32-bit float values, storing the first 4 operands
+///    in the lower bits and the second 4 operands in the upper
+///    bits.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __a
+///    A float value used to initialize the bits [31:0] of the
+///    destination vector of [8 x float].
+/// \param __b
+///    A float value used to initialize the bits [63:32] of the
+///    destination vector of [8 x float].
+/// \param __c
+///    A float value used to initialize the bits [95:64] of the
+///    destination vector of [8 x float].
+/// \param __d
+///    A float value used to initialize the bits [127:96] of the
+///    destination vector of [8 x float].
+/// \param __e
+///    A float value used to initialize the bits [159:128] of the
+///    destination vector of [8 x float].
+/// \param __f
+///    A float value used to initialize the bits [191:160] of the
+///    destination vector of [8 x float].
+/// \param __g
+///    A float value used to initialize the bits [223:192] of the
+///    destination vector of [8 x float].
+/// \param __h
+///    A float value used to initialize the bits [255:224] of the
+///    destination vector of [8 x float].
+/// \returns An initialized 256-bit vector of [8 x float] containing the values
+///    provided in the operands.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_setr_ps(float __a, float __b, float __c, float __d,
                float __e, float __f, float __g, float __h)
@@ -978,6 +3807,38 @@
   return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h };
 }
 
+/// \brief Initializes a 256-bit integer vector with the specified integer
+///    values, storing the first 4 operands in the lower bits and the second
+///    4 operands in the upper bits.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __i0
+///    A 32-bit integer value used to initialize bits [31:0] of the
+///    destination vector.
+/// \param __i1
+///    A 32-bit integer value used to initialize bits [63:32] of
+///    the destination vector.
+/// \param __i2
+///    A 32-bit integer value used to initialize bits [95:64] of
+///    the destination vector.
+/// \param __i3
+///    A 32-bit integer value used to initialize bits [127:96] of
+///    the destination vector.
+/// \param __i4
+///    A 32-bit integer value used to initialize bits [159:128] of
+///    the destination vector.
+/// \param __i5
+///    A 32-bit integer value used to initialize bits [191:160] of
+///    the destination vector.
+/// \param __i6
+///    A 32-bit integer value used to initialize bits [223:192] of
+///    the destination vector.
+/// \param __i7
+///    A 32-bit integer value used to initialize bits [255:224] of
+///    the destination vector.
+/// \returns An initialized 256-bit integer vector containing the values 
+///    provided in the operands.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
                   int __i4, int __i5, int __i6, int __i7)
@@ -985,6 +3846,62 @@
   return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 };
 }
 
+/// \brief Initializes a 256-bit integer vector with the specified short values,
+///    storing the first 8 operands in the lower bits and the second 8
+///    operands in the upper bits.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __w15
+///    A 16-bit integer value used to initialize bits [15:0] of the
+///    destination vector.
+/// \param __w14
+///    A 16-bit integer value used to initialize bits [31:16] of
+///    the destination vector.
+/// \param __w13
+///    A 16-bit integer value used to initialize bits [47:32] of
+///    the destination vector.
+/// \param __w12
+///    A 16-bit integer value used to initialize bits [63:48] of
+///    the destination vector.
+/// \param __w11
+///    A 16-bit integer value used to initialize bits [79:64] of
+///    the destination vector.
+/// \param __w10
+///    A 16-bit integer value used to initialize bits [95:80] of
+///    the destination vector.
+/// \param __w09
+///    A 16-bit integer value used to initialize bits [111:96] of
+///    the destination vector.
+/// \param __w08
+///    A 16-bit integer value used to initialize bits [127:112] of
+///    the destination vector.
+/// \param __w07
+///    A 16-bit integer value used to initialize bits [143:128] of
+///    the destination vector.
+/// \param __w06
+///    A 16-bit integer value used to initialize bits [159:144] of
+///    the destination vector.
+/// \param __w05
+///    A 16-bit integer value used to initialize bits [175:160] of
+///    the destination vector.
+/// \param __w04
+///    A 16-bit integer value used to initialize bits [191:176] of
+///    the destination vector.
+/// \param __w03
+///    A 16-bit integer value used to initialize bits [207:192] of
+///    the destination vector.
+/// \param __w02
+///    A 16-bit integer value used to initialize bits [223:208] of
+///    the destination vector.
+/// \param __w01
+///    A 16-bit integer value used to initialize bits [239:224] of
+///    the destination vector.
+/// \param __w00
+///    A 16-bit integer value used to initialize bits [255:240] of
+///    the destination vector.
+/// \returns An initialized 256-bit integer vector containing the values 
+///    provided in the operands.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
        short __w11, short __w10, short __w09, short __w08,
@@ -995,6 +3912,78 @@
     __w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 };
 }
 
+/// \brief Initializes a 256-bit integer vector with the specified char values,
+///    storing the first 16 operands in the lower bits and the second 16
+///    operands in the upper bits.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __b31
+///    Initializes bits [7:0] of the destination vector.
+/// \param __b30
+///    Initializes bits [15:8] of the destination vector.
+/// \param __b29
+///    Initializes bits [23:16] of the destination vector.
+/// \param __b28
+///    Initializes bits [31:24] of the destination vector.
+/// \param __b27
+///    Initializes bits [39:32] of the destination vector.
+/// \param __b26
+///    Initializes bits [47:40] of the destination vector.
+/// \param __b25
+///    Initializes bits [55:48] of the destination vector.
+/// \param __b24
+///    Initializes bits [63:56] of the destination vector.
+/// \param __b23
+///    Initializes bits [71:64] of the destination vector.
+/// \param __b22
+///    Initializes bits [79:72] of the destination vector.
+/// \param __b21
+///    Initializes bits [87:80] of the destination vector.
+/// \param __b20
+///    Initializes bits [95:88] of the destination vector.
+/// \param __b19
+///    Initializes bits [103:96] of the destination vector.
+/// \param __b18
+///    Initializes bits [111:104] of the destination vector.
+/// \param __b17
+///    Initializes bits [119:112] of the destination vector.
+/// \param __b16
+///    Initializes bits [127:120] of the destination vector.
+/// \param __b15
+///    Initializes bits [135:128] of the destination vector.
+/// \param __b14
+///    Initializes bits [143:136] of the destination vector.
+/// \param __b13
+///    Initializes bits [151:144] of the destination vector.
+/// \param __b12
+///    Initializes bits [159:152] of the destination vector.
+/// \param __b11
+///    Initializes bits [167:160] of the destination vector.
+/// \param __b10
+///    Initializes bits [175:168] of the destination vector.
+/// \param __b09
+///    Initializes bits [183:176] of the destination vector.
+/// \param __b08
+///    Initializes bits [191:184] of the destination vector.
+/// \param __b07
+///    Initializes bits [199:192] of the destination vector.
+/// \param __b06
+///    Initializes bits [207:200] of the destination vector.
+/// \param __b05
+///    Initializes bits [215:208] of the destination vector.
+/// \param __b04
+///    Initializes bits [223:216] of the destination vector.
+/// \param __b03
+///    Initializes bits [231:224] of the destination vector.
+/// \param __b02
+///    Initializes bits [239:232] of the destination vector.
+/// \param __b01
+///    Initializes bits [247:240] of the destination vector.
+/// \param __b00
+///    Initializes bits [255:248] of the destination vector.
+/// \returns An initialized 256-bit integer vector containing the values 
+///    provided in the operands.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
                  char __b27, char __b26, char __b25, char __b24,
@@ -1012,6 +4001,26 @@
     __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 };
 }
 
+/// \brief Initializes a 256-bit integer vector with the specified 64-bit 
+///    integer values, storing the first two operands in the lower bits and the
+///    second two operands in the upper bits.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __a
+///    A 64-bit integer value used to initialize bits [63:0] of the
+///    destination vector of [4 x i64].
+/// \param __b
+///    A 64-bit integer value used to initialize bits [127:64] of
+///    the destination vector of [4 x i64].
+/// \param __c
+///    A 64-bit integer value used to initialize bits [191:128] of
+///    the destination vector of [4 x i64].
+/// \param __d
+///    A 64-bit integer value used to initialize bits [255:192] of
+///    the destination vector of [4 x i64].
+/// \returns An initialized 256-bit integer vector containing the values 
+///    provided in the operands.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
 {
@@ -1019,24 +4028,62 @@
 }
 
 /* Create vectors with repeated elements */
+/// \brief Initializes a 256-bit vector of [4 x double] with the specified 
+///    64-bit double-precision value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __w
+///    Double-precision value used to initialize the destination
+///    vector of [4 x double].
+/// \returns An initialized 256-bit vector of [4 x double] containing the value
+///    provided in the operand.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_set1_pd(double __w)
 {
   return (__m256d){ __w, __w, __w, __w };
 }
 
+/// \brief Initializes a 256-bit vector of [8 x float] with the specified 
+///    32-bit float value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __w
+///    Float value used to initialize the destination vector of [8
+///    x float].
+/// \returns An initialized 256-bit vector of [8 x float] containing the value
+///    provided in the operand.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_set1_ps(float __w)
 {
   return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w };
 }
 
+/// \brief Initializes a 256-bit integer vector with the specified integer value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __i
+///    Integer value used to initialize the destination integer
+///    vector.
+/// \returns An initialized 256-bit integer vector containing the value provided 
+///    in the operand.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_set1_epi32(int __i)
 {
   return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i };
 }
 
+/// \brief Initializes a 256-bit integer vector with the specified short value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __w
+///    Short value used to initialize the destination integer
+///    vector.
+/// \returns An initialized 256-bit integer vector containing the value provided 
+///    in the operand.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_set1_epi16(short __w)
 {
@@ -1044,6 +4091,15 @@
     __w, __w, __w, __w, __w, __w };
 }
 
+/// \brief Initializes a 256-bit integer vector with the specified char value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __b
+///    Char value used to initialize the destination integer
+///    vector.
+/// \returns An initialized 256-bit integer vector containing the value provided 
+///    in the operand.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_set1_epi8(char __b)
 {
@@ -1052,6 +4108,16 @@
     __b, __b, __b, __b, __b, __b, __b };
 }
 
+/// \brief Initializes a 256-bit integer vector with the specified 64-bit 
+///    integer value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __q
+///    64-bit integer value used to initialize the destination
+///    integer vector.
+/// \returns An initialized 256-bit integer vector containing the value provided 
+///    in the operand.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_set1_epi64x(long long __q)
 {
@@ -1059,18 +4125,38 @@
 }
 
 /* Create __zeroed vectors */
+/// \brief Sets the 256-bit YMM register to zero, or creates a 256-bit vector 
+///    of [4 x double] with all elements initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \returns An initialized 256-bit vector of [4 x double] with all elements set 
+///    to zero.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_setzero_pd(void)
 {
   return (__m256d){ 0, 0, 0, 0 };
 }
 
+/// \brief Sets the 256-bit YMM register to zero, or creates a 256-bit vector 
+///    of [8 x float] with all elements initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \returns An initialized 256-bit vector of [8 x float] with all elements set 
+///    to zero.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_setzero_ps(void)
 {
   return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
 }
 
+/// \brief Sets the 256-bit YMM register to zero, or creates a 256-bit vector 
+///    of [4 x i64] with all elements initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \returns An initialized 256-bit integer vector with all elements set to zero.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_setzero_si256(void)
 {
@@ -1078,72 +4164,183 @@
 }
 
 /* Cast between vector types */
+/// \brief Casts 64-bit double-precision values as 32-bit float values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] to be cast as float values.
+/// \returns A 256-bit vector of [8 x float] containing the typecast values
+///    provided in the operand.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_castpd_ps(__m256d __a)
 {
   return (__m256)__a;
 }
 
+/// \brief Casts 64-bit double-precision values as integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] to be cast as integer
+///    values.
+/// \returns A 256-bit integer vector containing the typecast values provided 
+///    in the operand.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_castpd_si256(__m256d __a)
 {
   return (__m256i)__a;
 }
 
+/// \brief Casts 32-bit float values as 64-bit double-precision values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] to be cast as
+///    double-precision values.
+/// \returns A 256-bit vector of [4 x double] containing the typecast values
+///    provided in the operand.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_castps_pd(__m256 __a)
 {
   return (__m256d)__a;
 }
 
+/// \brief Casts 32-bit float values as integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] to be cast as integer
+///    values.
+/// \returns A 256-bit integer vector containing the typecast values provided 
+///    in the operand.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_castps_si256(__m256 __a)
 {
   return (__m256i)__a;
 }
 
+/// \brief Casts integer values as 32-bit float values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __a
+///    A 256-bit integer vector to be cast as float values.
+/// \returns A 256-bit vector of [8 x float] containing the typecast values
+///    provided in the operand.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_castsi256_ps(__m256i __a)
 {
   return (__m256)__a;
 }
 
+/// \brief Casts integer values as 64-bit double-precision values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __a
+///    A 256-bit integer vector to be cast as double-precision
+///    values.
+/// \returns A 256-bit vector of [4 x double] containing the typecast values
+///    provided in the operand.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_castsi256_pd(__m256i __a)
 {
   return (__m256d)__a;
 }
 
+/// \brief Casts a 256-bit vector of [4 x double] as a 128-bit vector of [2 x
+///    double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __a
+///    A 256-bit vector of [4 x double] to be cast as double
+///    values. The lower 128 bits of this vector are used.
+/// \returns A 128-bit vector of [2 x double] containing the typecast values
+///    provided in the operand.
 static __inline __m128d __DEFAULT_FN_ATTRS
 _mm256_castpd256_pd128(__m256d __a)
 {
   return __builtin_shufflevector(__a, __a, 0, 1);
 }
 
+/// \brief Casts a 256-bit vector of [8 x float] as a 128-bit vector of [4 x
+///    float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __a
+///    A 256-bit vector of [8 x float] to be cast as float values.
+///    The lower 128 bits of this vector are used.
+/// \returns A 128-bit vector of [4 x float] containing the typecast values
+///    provided in the operand.
 static __inline __m128 __DEFAULT_FN_ATTRS
 _mm256_castps256_ps128(__m256 __a)
 {
   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
 }
 
+/// \brief Casts a 256-bit integer vector as a 128-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __a
+///    A 256-bit integer vector to be cast as integer values.
+///    The lower 128 bits of this vector are used.
+/// \returns A 128-bit integer vector containing the typecast values provided 
+///    in the operand.
 static __inline __m128i __DEFAULT_FN_ATTRS
 _mm256_castsi256_si128(__m256i __a)
 {
   return __builtin_shufflevector(__a, __a, 0, 1);
 }
 
+/// \brief Casts a 128-bit vector of [2 x double] as a 256-bit vector of [4 x
+///    double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] to be cast as double
+///    values. The upper 128 bits of the destination vector are
+///    undefined.
+/// \returns A 256-bit vector of [4 x double] containing the typecast values
+///    provided in the operand in the lower 128 bits.
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_castpd128_pd256(__m128d __a)
 {
   return __builtin_shufflevector(__a, __a, 0, 1, -1, -1);
 }
 
+/// \brief Casts a 128-bit vector of [4 x float] as a 256-bit vector of [8 x
+///    float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] to be cast as float values.
+///    The upper 128 bits of the destination vector are undefined.
+/// \returns A 256-bit vector of [8 x float] containing the typecast values
+///    provided in the operand in the lower 128 bits.
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_castps128_ps256(__m128 __a)
 {
   return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1);
 }
 
+/// \brief Casts a 128-bit integer vector as a 256-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __a
+///    A 128-bit integer vector to be cast as integer values. The
+///    upper 128 bits of the destination vector are undefined.
+/// \returns A 256-bit integer vector containing the typecast values provided 
+///    in the operand in the lower 128 bits.
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_castsi128_si256(__m128i __a)
 {
@@ -1155,6 +4352,34 @@
    We use macros rather than inlines because we only want to accept
    invocations where the immediate M is a constant expression.
 */
+/// \brief Combines 128 bits of packed data from the 256-bit vector operand of 
+///    [8 x float] with 128 bits of packed data from the 128-bit vector operand
+///    of [4 x float], using the offset specified by the integer operand, and
+///    copies them to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VINSERTF128 instruction.
+///
+/// \param V1
+///    A 256-bit vector of [8 x float] values. The remaining bits
+///    in the destination are copied from the corresponding bits in
+///    this operand.
+/// \param V2
+///    A 128-bit vector of [4 x float] values. The bits of this
+///    operand are written to the destination beginning at the
+///    offset specified by operand M.
+/// \param M
+///    An immediate integer used to determine which bits in the
+///    destination are used when copying the bits from operand V2:
+///    If bit [0] is 0, bits [127:0] are used in the destination.
+///    If bit [0] is 1, bits [255:128] are used in the destination.
+/// \returns A 256-bit vector of [8 x float] containing the copied packed data 
+///    from the operands.
 #define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \
   (__m256)__builtin_shufflevector( \
     (__v8sf)(__m256)(V1), \
@@ -1168,6 +4393,35 @@
     (((M) & 1) ? 10 :  6), \
     (((M) & 1) ? 11 :  7) );})
 
+/// \brief Combines 128 bits of packed data from the 256-bit vector operand of 
+///    [4 x double] with 128 bits of packed data from the 128-bit vector 
+///    operand
+///    of [2 x double], using the offset specified by the integer operand,
+///    and copies them to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VINSERTF128 instruction.
+///
+/// \param V1
+///    A 256-bit vector of [4 x double] values. The remaining bits
+///    in the destination are copied from the corresponding bits in
+///    this operand.
+/// \param V2
+///    A 128-bit vector of [2 x double] values. The bits of this
+///    operand are written to the destination beginning at the
+///    offset specified by operand M.
+/// \param M
+///    An immediate integer used to determine which bits in the
+///    destination are used when copying the bits from operand V2:
+///    If bit [0] is 0, bits [127:0] are used in the destination.
+///    If bit [0] is 1, bits [255:128] are used in the destination.
+/// \returns A 256-bit vector of [4 x double] containing the copied packed data
+///    from the operands.
 #define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \
   (__m256d)__builtin_shufflevector( \
     (__v4df)(__m256d)(V1), \
@@ -1177,6 +4431,34 @@
     (((M) & 1) ? 4 : 2), \
     (((M) & 1) ? 5 : 3) );})
 
+/// \brief Combines 128 bits of packed data from the 256-bit integer vector
+///    operand with 128 bits of packed data from the 128-bit integer vector
+///    operand, using the offset specified by the integer operand, and copies
+///    them to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VINSERTF128 instruction.
+///
+/// \param V1
+///    A 256-bit integer vector. The remaining bits in the
+///    destination are copied from the corresponding bits in this
+///    operand.
+/// \param V2
+///    A 128-bit integer vector. The bits of this operand are
+///    written to the destination beginning at the offset specified
+///    by operand M.
+/// \param M
+///    An immediate integer used to determine which bits in the
+///    destination are used when copying the bits from operand V2:
+///    If bit [0] is 0, bits [127:0] are used in the destination.
+///    If bit [0] is 1, bits [255:128] are used in the destination.
+/// \returns A 256-bit integer vector containing the copied packed data from 
+///    the operands.
 #define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \
   (__m256i)__builtin_shufflevector( \
     (__v4di)(__m256i)(V1), \
@@ -1191,6 +4473,27 @@
    We use macros rather than inlines because we only want to accept
    invocations where the immediate M is a constant expression.
 */
+/// \brief Extracts 128 bits of packed data from a 256-bit vector of [8 x float]
+///    and copies it to the destination, as specified by the integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VEXTRACTF128 instruction.
+///
+/// \param V
+///    A 256-bit vector of [8 x float] values.
+/// \param M
+///    An immediate integer used to determine which bits are
+///    extracted:
+///    If bit [0] is 0, bits [127:0] are copied to the destination.
+///    If bit [0] is 1, bits [255:128] are copied to the
+///    destination.
+/// \returns A 128-bit vector of [4 x float] containing the extracted 128 bits 
+///    of packed data.
 #define _mm256_extractf128_ps(V, M) __extension__ ({ \
   (__m128)__builtin_shufflevector( \
     (__v8sf)(__m256)(V), \
@@ -1200,6 +4503,27 @@
     (((M) & 1) ? 6 : 2), \
     (((M) & 1) ? 7 : 3) );})
 
+/// \brief Extracts 128 bits of packed data from a 256-bit vector of [4 x double]
+///    and copies it to the destination, as specified by the integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VEXTRACTF128 instruction.
+///
+/// \param V
+///    A 256-bit vector of [4 x double] values.
+/// \param M
+///    An immediate integer used to determine which bits are
+///    extracted:
+///    If bit [0] is 0, bits [127:0] are copied to the destination.
+///    If bit [0] is 1, bits [255:128] are copied to the
+///    destination.
+/// \returns A 128-bit vector of [2 x double] containing the extracted 128 bits 
+///    of packed data.
 #define _mm256_extractf128_pd(V, M) __extension__ ({ \
   (__m128d)__builtin_shufflevector( \
     (__v4df)(__m256d)(V), \
@@ -1207,6 +4531,27 @@
     (((M) & 1) ? 2 : 0), \
     (((M) & 1) ? 3 : 1) );})
 
+/// \brief Extracts 128 bits of packed data from a 256-bit integer vector and
+///    copies it to the destination, as specified by the integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VEXTRACTF128 instruction.
+///
+/// \param V
+///    A 256-bit integer vector.
+/// \param M
+///    An immediate integer used to determine which bits are
+///    extracted:
+///    If bit [0] is 0, bits [127:0] are copied to the destination.
+///    If bit [0] is 1, bits [255:128] are copied to the
+///    destination.
+/// \returns A 128-bit integer vector containing the extracted 128 bits of 
+///    packed data.
 #define _mm256_extractf128_si256(V, M) __extension__ ({ \
   (__m128i)__builtin_shufflevector( \
     (__v4di)(__m256i)(V), \
Index: lib/Headers/bmiintrin.h
===================================================================
--- lib/Headers/bmiintrin.h
+++ lib/Headers/bmiintrin.h
@@ -44,12 +44,36 @@
    to use it as a potentially faster version of BSF. */
 #define __RELAXED_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
 
+/// \brief Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c TZCNT instruction.
+///
+/// \param __X
+///    An unsigned 16-bit integer whose trailing zeros are to be
+///    counted.
+/// \returns An unsigned 16-bit integer containing the number of trailing zero 
+///    bits in the operand.
 static __inline__ unsigned short __RELAXED_FN_ATTRS
 __tzcnt_u16(unsigned short __X)
 {
   return __X ? __builtin_ctzs(__X) : 16;
 }
 
+/// \brief Performs a bitwise AND of the second operand with the ones
+///    complement of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c ANDN instruction.
+///
+/// \param __X
+///    An unsigned integer containing one of the operands.
+/// \param __Y
+///    An unsigned integer containing one of the operands.
+/// \returns An unsigned integer containing the bitwise AND of the second 
+///    operand with the ones complement of the first operand.
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 __andn_u32(unsigned int __X, unsigned int __Y)
 {
@@ -57,6 +81,22 @@
 }
 
 /* AMD-specified, double-leading-underscore version of BEXTR */
+/// \brief Extracts the specified bits from the first operand and puts them 
+///    into the least significant bits of the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c BEXTR instruction.
+///
+/// \param __X
+///    An unsigned integer whose bits are to be extracted.
+/// \param __Y
+///    An unsigned integer used to specify which bits are
+///    extracted. Bits [7:0] specify the index of the least
+///    significant bit. Bits [15:8] specify the number of bits to
+///    be extracted.
+/// \returns An unsigned integer whose least significant bits contain the 
+///    extracted bits.
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 __bextr_u32(unsigned int __X, unsigned int __Y)
 {
@@ -64,30 +104,93 @@
 }
 
 /* Intel-specified, single-leading-underscore version of BEXTR */
+/// \brief Extracts the specified bits from the first operand and puts them 
+///    into the least significant bits of the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c BEXTR instruction.
+///
+/// \param __X
+///    An unsigned integer whose bits are to be extracted.
+/// \param __Y
+///    An unsigned integer used to specify the index of the least
+///    significant bit for the bits to be extracted. Bits [7:0]
+///    specify the index.
+/// \param __Z
+///    An unsigned integer used to specify the number of bits to be
+///    extracted. Bits [7:0] specify the number of bits.
+/// \returns An unsigned integer whose least significant bits contain the 
+///    extracted bits.
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 _bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z)
 {
   return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
 }
 
+/// \brief Clears all bits in the source except for the least significant bit
+///    containing a value of 1, and writes the result to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c BLSI instruction.
+///
+/// \param __X
+///    An unsigned integer whose bits are to be cleared.
+/// \returns An unsigned integer containing the result of clearing the bits 
+///    from the source operand.
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 __blsi_u32(unsigned int __X)
 {
   return __X & -__X;
 }
 
+/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
+///    including the least significant bit that is set to 1 in the source
+///    operand, and writes the result to the destination. For example, __X ^ 
+///    (__X-1).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c BLSMSK instruction.
+///
+/// \param __X
+///    An unsigned integer used to create the mask.
+/// \returns An unsigned integer containing the newly created mask.
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 __blsmsk_u32(unsigned int __X)
 {
   return __X ^ (__X - 1);
 }
 
+/// \brief Clears the least significant bit that is set to 1 in the source
+///    operand, and writes the result to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c BLSR instruction.
+///
+/// \param __X
+///    An unsigned integer containing the operand to be cleared.
+/// \returns An unsigned integer containing the result of clearing the source
+///    operand.
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 __blsr_u32(unsigned int __X)
 {
   return __X & (__X - 1);
 }
 
+/// \brief Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c TZCNT instruction.
+///
+/// \param __X
+///    An unsigned 32-bit integer whose trailing zeros are to be
+///    counted.
+/// \returns An unsigned 32-bit integer containing the number of trailing zero 
+///    bits in the operand.
 static __inline__ unsigned int __RELAXED_FN_ATTRS
 __tzcnt_u32(unsigned int __X)
 {
@@ -103,6 +206,19 @@
 #define _blsr_u64(a)      (__blsr_u64((a)))
 #define _tzcnt_u64(a)     (__tzcnt_u64((a)))
 
+/// \brief Performs a bitwise AND of the second operand with the ones
+///    complement of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c ANDN instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer containing one of the operands.
+/// \param __Y
+///    An unsigned 64-bit integer containing one of the operands.
+/// \returns An unsigned 64-bit integer containing the bitwise AND of the 
+///    second operand with the ones complement of the first operand.
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __andn_u64 (unsigned long long __X, unsigned long long __Y)
 {
@@ -110,6 +226,22 @@
 }
 
 /* AMD-specified, double-leading-underscore version of BEXTR */
+/// \brief Extracts the specified bits from the first operand and puts them 
+///    into the least significant bits of the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c BEXTR instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose bits are to be extracted.
+/// \param __Y
+///    An unsigned 64-bit integer used to specify which bits are
+///    extracted. Bits [7:0] specify the index of the least
+///    significant bit. Bits [15:8] specify the number of bits to
+///    be extracted.
+/// \returns An unsigned 64-bit integer whose least significant bits contain 
+///    the extracted bits.
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __bextr_u64(unsigned long long __X, unsigned long long __Y)
 {
@@ -117,30 +249,93 @@
 }
 
 /* Intel-specified, single-leading-underscore version of BEXTR */
+/// \brief Extracts the specified bits from the first operand and puts them 
+///    into the least significant bits of the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c BEXTR instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose bits are to be extracted.
+/// \param __Y
+///    An unsigned integer used to specify the index of the least
+///    significant bit for the bits to be extracted. Bits [7:0]
+///    specify the index.
+/// \param __Z
+///    An unsigned integer used to specify the number of bits to be
+///    extracted. Bits [7:0] specify the number of bits.
+/// \returns An unsigned 64-bit integer whose least significant bits contain 
+///    the extracted bits.
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 _bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z)
 {
   return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
 }
 
+/// \brief Clears all bits in the source except for the least significant bit
+///    containing a value of 1, and writes the result to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c BLSI instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose bits are to be cleared.
+/// \returns An unsigned 64-bit integer containing the result of clearing the 
+///    bits from the source operand.
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __blsi_u64(unsigned long long __X)
 {
   return __X & -__X;
 }
 
+/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
+///    including the least significant bit that is set to 1 in the source
+///    operand, and writes the result to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c BLSMSK instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer used to create the mask.
+/// \returns An unsigned 64-bit integer containing the newly created mask.
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __blsmsk_u64(unsigned long long __X)
 {
   return __X ^ (__X - 1);
 }
 
+/// \brief Clears the least significant bit that is set to 1 in the source
+///    operand, and writes the result to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c BLSR instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer containing the operand to be
+///    cleared.
+/// \returns An unsigned 64-bit integer containing the result of clearing the
+///    source operand.
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __blsr_u64(unsigned long long __X)
 {
   return __X & (__X - 1);
 }
 
+/// \brief Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c TZCNT instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose trailing zeros are to be
+///    counted.
+/// \returns An unsigned 64-bit integer containing the number of trailing zero 
+///    bits in the operand.
 static __inline__ unsigned long long __RELAXED_FN_ATTRS
 __tzcnt_u64(unsigned long long __X)
 {
Index: lib/Headers/emmintrin.h
===================================================================
--- lib/Headers/emmintrin.h
+++ lib/Headers/emmintrin.h
@@ -44,6 +44,22 @@
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
 
+/// \brief Adds the 64-bit double-precision scalar values in the low-order bits
+///    of the operands.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VADDSD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the
+///    source operands.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the
+///    source operands.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 
+///    sum of the lower 64 bits of both operands. The upper 64 bits are copied
+///    from the upper 64 bits of the first source operand.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_add_sd(__m128d __a, __m128d __b)
 {
@@ -51,12 +67,39 @@
   return __a;
 }
 
+/// \brief Adds 2 packed 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VADDPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the
+///    source operands.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the
+///    source operands.
+/// \returns A 128-bit vector of [2 x double] holding the sums of both operands.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_add_pd(__m128d __a, __m128d __b)
 {
   return __a + __b;
 }
 
+/// \brief Subtracts the 64-bit double-precision values in the low-order bits 
+///    of the operands.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VSUBSD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the minuend.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing the subtrahend.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+///    difference of the lower 64 bits of both operands. The upper 64 bits
+///    are copied from the upper 64 bits of the first source operand.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_sub_sd(__m128d __a, __m128d __b)
 {
@@ -64,12 +107,40 @@
   return __a;
 }
 
+/// \brief Subtracts 2 packed 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VSUBPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the minuend.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing the subtrahend.
+/// \returns A 128-bit vector of [2 x double] containing the differences 
+///    between both operands.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_sub_pd(__m128d __a, __m128d __b)
 {
   return __a - __b;
 }
 
+/// \brief Multiplies the 64-bit double-precision values in the low-order bits 
+///    of the operands.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMULSD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the
+///    source operands.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the
+///    source operands.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+///    product of the lower 64 bits of both operands. The upper 64 bits are
+///    copied from the upper 64 bits of the first source operand.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mul_sd(__m128d __a, __m128d __b)
 {
@@ -77,12 +148,40 @@
   return __a;
 }
 
+/// \brief Multiplies 2 packed 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMULPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the
+///    operands.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the
+///    operands.
+/// \returns A 128-bit vector of [2 x double] containing the products between 
+///    both operands.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mul_pd(__m128d __a, __m128d __b)
 {
   return __a * __b;
 }
 
+/// \brief Divides the 64-bit double-precision values in the low-order bits of
+///    the operands.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VDIVSD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the dividend.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing the divisor.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+///    quotient of the lower 64 bits of both operands. The upper 64 bits are
+///    copied from the upper 64 bits of the first source operand.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_div_sd(__m128d __a, __m128d __b)
 {
@@ -90,12 +189,45 @@
   return __a;
 }
 
+/// \brief Divides 2 packed 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VDIVPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the dividend.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing the divisor.
+/// \returns A 128-bit vector of [2 x double] containing the quotients between 
+///    both operands.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_div_pd(__m128d __a, __m128d __b)
 {
   return __a / __b;
 }
 
+/// \brief Calculates the square root of the 64-bit double-precision value in 
+///    the low-order bits of the second operand, copying the upper 64 bits of 
+///    the
+///    first operand to bits [127:64] of the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VSQRTSD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the
+///    operands. The upper 64 bits of this operand are copied to
+///    the upper 64 bits of the destination.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the
+///    operands. The square root is calculated using the lower 64
+///    bits of this operand.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+///    square root of the lower 64 bits of operand __b,
+///    and whose upper 64 bits are copied from the upper 64 bits of operand
+///    __a.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_sqrt_sd(__m128d __a, __m128d __b)
 {
@@ -103,150 +235,499 @@
   return (__m128d) { __c[0], __a[1] };
 }
 
+/// \brief Calculates the square roots of the values stored in a packed 128-bit
+///    vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VSQRTPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] values.
+/// \returns A 128-bit vector of [2 x double] containing the square roots of 
+///    the values in the operand.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_sqrt_pd(__m128d __a)
 {
   return __builtin_ia32_sqrtpd(__a);
 }
 
+/// \brief Compares 2 64-bit double-precision values in the low-order bits of
+///    both operands, and stores the lesser of the pair of values in the
+///    lower 64 bits of the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMINSD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the
+///    operands. The lower 64 bits of this operand are used in the
+///    comparison.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the
+///    operands. The lower 64 bits of this operand are used in the
+///    comparison.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+///    minimum value between both operands. The upper 64 bits are copied from
+///    the upper 64 bits of the first source operand.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_min_sd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_minsd(__a, __b);
 }
 
+/// \brief Compares 2 packed 128-bit vectors of [2 x double] and stores the
+///    lesser of each pair of values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMINPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the
+///    operands.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the
+///    operands.
+/// \returns A 128-bit vector of [2 x double] containing the minimum values 
+///    between both operands.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_min_pd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_minpd(__a, __b);
 }
 
+/// \brief Compares 2 64-bit double-precision values in the low-order bits of
+///    both operands, and stores the greater of the pair of values in the
+///    lower 64 bits of the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMAXSD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the
+///    operands. The lower 64 bits of this operand are used in the
+///    comparison.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the
+///    operands. The lower 64 bits of this operand are used in the
+///    comparison.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+///    maximum value between both operands. The upper 64 bits are copied from
+///    the upper 64 bits of the first source operand.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_max_sd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_maxsd(__a, __b);
 }
 
+/// \brief Compares 2 packed 128-bit vectors of [2 x double] and stores the
+///    greater of each pair of values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMAXPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the
+///    operands.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the
+///    operands.
+/// \returns A 128-bit vector of [2 x double] containing the maximum values 
+///    between both operands.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_max_pd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_maxpd(__a, __b);
 }
 
+/// \brief Performs a bitwise AND of 2 packed 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPAND instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the
+///    source operands.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the
+///    source operands.
+/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
+///    values between both operands.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_and_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)((__v4si)__a & (__v4si)__b);
 }
 
+/// \brief Performs a bitwise AND of 2 packed 128-bit vectors of [2 x double],
+///    using the ones-complement of the values contained in the first
+///    source operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPANDN instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the left source
+///    operand. The ones-complement of this value is used in
+///    the bitwise AND.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing the right source
+///    operand.
+/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
+///    values in the second operand and the ones-complement of the
+///    first operand.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_andnot_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)(~(__v4si)__a & (__v4si)__b);
 }
 
+/// \brief Performs a bitwise OR of 2 packed 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPOR instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the
+///    source operands.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the
+///    source operands.
+/// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
+///    values between both operands.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_or_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)((__v4si)__a | (__v4si)__b);
 }
 
+/// \brief Performs a bitwise XOR of 2 packed 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPXOR instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the
+///    source operands.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the
+///    source operands.
+/// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
+///    values between both operands.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_xor_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)((__v4si)__a ^ (__v4si)__b);
 }
 
+/// \brief Compares each of the corresponding packed double-precision values of
+///    the 128-bit vectors of [2 x double] for equality.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPEQPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] values.
+/// \param __b
+///    A 128-bit vector of [2 x double] values.
+/// \returns A 128-bit vector of [2 x double] containing the comparison results.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpeq_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpeqpd(__a, __b);
 }
 
+/// \brief Compares each of the corresponding packed double-precision values of
+///    the 128-bit vectors of [2 x double] to determine if the values in the
+///    first operand are less than those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPLTPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] values.
+/// \param __b
+///    A 128-bit vector of [2 x double] values.
+/// \returns A 128-bit vector of [2 x double] containing the comparison results.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmplt_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpltpd(__a, __b);
 }
 
+/// \brief Compares each of the corresponding packed double-precision values of
+///    the 128-bit vectors of [2 x double] to determine if the values in the
+///    first operand are less than or equal to those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPLEPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] values.
+/// \param __b
+///    A 128-bit vector of [2 x double] values.
+/// \returns A 128-bit vector of [2 x double] containing the comparison results.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmple_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmplepd(__a, __b);
 }
 
+/// \brief Compares each of the corresponding packed double-precision values of
+///    the 128-bit vectors of [2 x double] to determine if the values in the
+///    first operand are greater than those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPLTPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] values.
+/// \param __b
+///    A 128-bit vector of [2 x double] values.
+/// \returns A 128-bit vector of [2 x double] containing the comparison results.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpgt_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpltpd(__b, __a);
 }
 
+/// \brief Compares each of the corresponding packed double-precision values of
+///    the 128-bit vectors of [2 x double] to determine if the values in the
+///    first operand are greater than or equal to those in the second
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPLEPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] values.
+/// \param __b
+///    A 128-bit vector of [2 x double] values.
+/// \returns A 128-bit vector of [2 x double] containing the comparison results.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpge_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmplepd(__b, __a);
 }
 
+/// \brief Compares each of the corresponding packed double-precision values of
+///    the 128-bit vectors of [2 x double] to determine if the values in the
+///    first operand are ordered with respect to those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPORDPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] values.
+/// \param __b
+///    A 128-bit vector of [2 x double] values.
+/// \returns A 128-bit vector of [2 x double] containing the comparison results.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpord_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpordpd(__a, __b);
 }
 
+/// \brief Compares each of the corresponding packed double-precision values of
+///    the 128-bit vectors of [2 x double] to determine if the values in the
+///    first operand are unordered with respect to those in the second
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPUNORDPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] values.
+/// \param __b
+///    A 128-bit vector of [2 x double] values.
+/// \returns A 128-bit vector of [2 x double] containing the comparison results.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpunord_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpunordpd(__a, __b);
 }
 
+/// \brief Compares each of the corresponding packed double-precision values of
+///    the 128-bit vectors of [2 x double] to determine if the values in the
+///    first operand are unequal to those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPNEQPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] values.
+/// \param __b
+///    A 128-bit vector of [2 x double] values.
+/// \returns A 128-bit vector of [2 x double] containing the comparison results.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpneq_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpneqpd(__a, __b);
 }
 
+/// \brief Compares each of the corresponding packed double-precision values of
+///    the 128-bit vectors of [2 x double] to determine if the values in the
+///    first operand are not less than those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPNLTPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] values.
+/// \param __b
+///    A 128-bit vector of [2 x double] values.
+/// \returns A 128-bit vector of [2 x double] containing the comparison results.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpnlt_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpnltpd(__a, __b);
 }
 
+/// \brief Compares each of the corresponding packed double-precision values of
+///    the 128-bit vectors of [2 x double] to determine if the values in the
+///    first operand are not less than or equal to those in the second
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPNLEPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] values.
+/// \param __b
+///    A 128-bit vector of [2 x double] values.
+/// \returns A 128-bit vector of [2 x double] containing the comparison results.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpnle_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpnlepd(__a, __b);
 }
 
+/// \brief Compares each of the corresponding packed double-precision values of
+///    the 128-bit vectors of [2 x double] to determine if the values in the
+///    first operand are not greater than those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPNLTPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] values.
+/// \param __b
+///    A 128-bit vector of [2 x double] values.
+/// \returns A 128-bit vector of [2 x double] containing the comparison results.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpngt_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpnltpd(__b, __a);
 }
 
+/// \brief Compares each of the corresponding packed double-precision values of
+///    the 128-bit vectors of [2 x double] to determine if the values in the
+///    first operand are not greater than or equal to those in the second
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPNLEPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] values.
+/// \param __b
+///    A 128-bit vector of [2 x double] values.
+/// \returns A 128-bit vector of [2 x double] containing the comparison results.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpnge_pd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpnlepd(__b, __a);
 }
 
+/// \brief Compares the lower double-precision values of the two 128-bit
+///    vectors of [2 x double] for equality.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPEQSD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] values.
+/// \param __b
+///    A 128-bit vector of [2 x double] values.
+/// \returns A 128-bit vector of [2 x double] containing the comparison result.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpeq_sd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpeqsd(__a, __b);
 }
 
+/// \brief Compares the lower double-precision values of the two 128-bit
+///    vectors of [2 x double] to determine if the value in the first
+///    operand is less than the value in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPLTSD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] values.
+/// \param __b
+///    A 128-bit vector of [2 x double] values.
+/// \returns A 128-bit vector of [2 x double] containing the comparison result.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmplt_sd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpltsd(__a, __b);
 }
 
+/// \brief Compares the lower double-precision values of the two 128-bit
+///    vectors of [2 x double] to determine if the value in the first
+///    operand is less than or equal to the value in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPLESD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] values.
+/// \param __b
+///    A 128-bit vector of [2 x double] values.
+/// \returns A 128-bit vector of [2 x double] containing the comparison result.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmple_sd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmplesd(__a, __b);
 }
 
+/// \brief Compares the lower double-precision values of the two 128-bit
+///    vectors of [2 x double] to determine if the value in the first
+///    operand is greater than the value in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPLTSD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] values.
+/// \param __b
+///    A 128-bit vector of [2 x double] values.
+/// \returns A 128-bit vector of [2 x double] containing the comparison result.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpgt_sd(__m128d __a, __m128d __b)
 {
@@ -254,6 +735,20 @@
   return (__m128d) { __c[0], __a[1] };
 }
 
+/// \brief Compares the lower double-precision values of the two 128-bit
+///    vectors of [2 x double] to determine if the value in the first
+///    operand is greater than or equal to the value in the second
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPLESD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] values.
+/// \param __b
+///    A 128-bit vector of [2 x double] values.
+/// \returns A 128-bit vector of [2 x double] containing the comparison result.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpge_sd(__m128d __a, __m128d __b)
 {
@@ -261,36 +756,116 @@
   return (__m128d) { __c[0], __a[1] };
 }
 
+/// \brief Compares the lower double-precision values of the two 128-bit
+///    vectors of [2 x double] to determine if the value in the first
+///    operand is ordered with respect to the value in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPORDSD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] values.
+/// \param __b
+///    A 128-bit vector of [2 x double] values.
+/// \returns A 128-bit vector of [2 x double] containing the comparison result.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpord_sd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpordsd(__a, __b);
 }
 
+/// \brief Compares the lower double-precision values of the two 128-bit
+///    vectors of [2 x double] to determine if the value in the first
+///    operand is unordered with respect to the value in the second
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPUNORDSD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] values.
+/// \param __b
+///    A 128-bit vector of [2 x double] values.
+/// \returns A 128-bit vector of [2 x double] containing the comparison result.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpunord_sd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpunordsd(__a, __b);
 }
 
+/// \brief Compares the lower double-precision values of the two 128-bit
+///    vectors of [2 x double] to determine if the value in the first
+///    operand is unequal to the value in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPNEQSD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] values.
+/// \param __b
+///    A 128-bit vector of [2 x double] values.
+/// \returns A 128-bit vector of [2 x double] containing the comparison result.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpneq_sd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpneqsd(__a, __b);
 }
 
+/// \brief Compares the lower double-precision values of the two 128-bit
+///    vectors of [2 x double] to determine if the value in the first
+///    operand is not less than the value in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPNLTSD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] values.
+/// \param __b
+///    A 128-bit vector of [2 x double] values.
+/// \returns A 128-bit vector of [2 x double] containing the comparison result.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpnlt_sd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpnltsd(__a, __b);
 }
 
+/// \brief Compares the lower double-precision values of the two 128-bit
+///    vectors of [2 x double] to determine if the value in the first
+///    operand is not less than or equal to the value in the second
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPNLESD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] values.
+/// \param __b
+///    A 128-bit vector of [2 x double] values.
+/// \returns A 128-bit vector of [2 x double] containing the comparison result.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpnle_sd(__m128d __a, __m128d __b)
 {
   return (__m128d)__builtin_ia32_cmpnlesd(__a, __b);
 }
 
+/// \brief Compares the lower double-precision values of the two 128-bit
+///    vectors of [2 x double] to determine if the value in the first
+///    operand is not greater than the value in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPNLTSD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] values.
+/// \param __b
+///    A 128-bit vector of [2 x double] values.
+/// \returns A 128-bit vector of [2 x double] containing the comparison result.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpngt_sd(__m128d __a, __m128d __b)
 {
@@ -298,6 +873,20 @@
   return (__m128d) { __c[0], __a[1] };
 }
 
+/// \brief Compares the lower double-precision values of the two 128-bit
+///    vectors of [2 x double] to determine if the value in the first
+///    operand is not greater than or equal to the value in the second
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPNLESD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] values.
+/// \param __b
+///    A 128-bit vector of [2 x double] values.
+/// \returns A 128-bit vector of [2 x double] containing the comparison result.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cmpnge_sd(__m128d __a, __m128d __b)
 {
@@ -305,24 +894,79 @@
   return (__m128d) { __c[0], __a[1] };
 }
 
+/// \brief Compares 2 64-bit double-precision values in the low-order bits of
+///    both operands for equality, and stores the result of the comparison in
+///    the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCOMISD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] values.
+/// \param __b
+///    A 128-bit vector of [2 x double] values.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comieq_sd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_comisdeq(__a, __b);
 }
 
+/// \brief Compares 2 64-bit double-precision values in the low-order bits of
+///    both operands to determine if the first operand is less than the
+///    second operand, and stores the result of the comparison in the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCOMISD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] values.
+/// \param __b
+///    A 128-bit vector of [2 x double] values.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comilt_sd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_comisdlt(__a, __b);
 }
 
+/// \brief Compares 2 64-bit double-precision values in the low-order bits of
+///    both operands to determine if the first operand is less than or equal
+///    to the second operand, and stores the result of the comparison in the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCOMISD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] values.
+/// \param __b
+///    A 128-bit vector of [2 x double] values.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comile_sd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_comisdle(__a, __b);
 }
 
+/// \brief Compares 2 64-bit double-precision values in the low-order bits of
+///    both operands to determine if the first operand is greater than the
+///    second operand, and stores the result of the comparison in the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCOMISD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] values.
+/// \param __b
+///    A 128-bit vector of [2 x double] values.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comigt_sd(__m128d __a, __m128d __b)
 {
@@ -335,30 +979,98 @@
   return __builtin_ia32_comisdge(__a, __b);
 }
 
+/// \brief Compares 2 64-bit double-precision values in the low-order bits of
+///    both operands for inequality, and stores the result of the comparison
+///    in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCOMISD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] values.
+/// \param __b
+///    A 128-bit vector of [2 x double] values.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comineq_sd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_comisdneq(__a, __b);
 }
 
+/// \brief Performs an unordered comparison of 2 64-bit double-precision values
+///    using the low-order bits of both operands to determine equality, and
+///    stores the result of the comparison in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VUCOMISD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] values.
+/// \param __b
+///    A 128-bit vector of [2 x double] values.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomieq_sd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_ucomisdeq(__a, __b);
 }
 
+/// \brief Performs an unordered comparison of 2 64-bit double-precision values
+///    using the low-order bits of both operands to determine if the first
+///    operand is less than the second operand, and stores the result of the
+///    comparison in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VUCOMISD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] values.
+/// \param __b
+///    A 128-bit vector of [2 x double] values.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomilt_sd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_ucomisdlt(__a, __b);
 }
 
+/// \brief Performs an unordered comparison of 2 64-bit double-precision values
+///    using the low-order bits of both operands to determine if the first
+///    operand is less than or equal to the second operand, and stores the
+///    result of the comparison in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VUCOMISD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] values.
+/// \param __b
+///    A 128-bit vector of [2 x double] values.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomile_sd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_ucomisdle(__a, __b);
 }
 
+/// \brief Performs an unordered comparison of 2 64-bit double-precision values
+///    using the low-order bits of both operands to determine if the first
+///    operand is greater than the second operand, and stores the result of
+///    the comparison in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VUCOMISD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] values.
+/// \param __b
+///    A 128-bit vector of [2 x double] values.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomigt_sd(__m128d __a, __m128d __b)
 {
@@ -371,42 +1083,124 @@
   return __builtin_ia32_ucomisdge(__a, __b);
 }
 
+/// \brief Performs an unordered comparison of 2 64-bit double-precision values
+///    using the low-order bits of both operands to determine inequality, and
+///    stores the result of the comparison in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VUCOMISD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] values.
+/// \param __b
+///    A 128-bit vector of [2 x double] values.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomineq_sd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_ucomisdneq(__a, __b);
 }
 
+/// \brief Converts a 128-bit vector of [2 x double] into a 128-bit vector of 
+///    [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCVTPD2PS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] values.
+/// \returns A 128-bit vector of [4 x float] containing the converted values.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvtpd_ps(__m128d __a)
 {
   return __builtin_ia32_cvtpd2ps(__a);
 }
 
+/// \brief Converts a 128-bit vector of [4 x float] into a 128-bit vector of
+///    [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCVTPS2PD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [2 x double] containing the converted values.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cvtps_pd(__m128 __a)
 {
   return __builtin_ia32_cvtps2pd(__a);
 }
 
+/// \brief Converts a vector of [4 x i32] into a vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCVTDQ2PD instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \returns A 128-bit vector of [2 x double] containing the converted values.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cvtepi32_pd(__m128i __a)
 {
   return __builtin_ia32_cvtdq2pd((__v4si)__a);
 }
 
+/// \brief Converts a 128-bit vector of [2 x double] into a 128-bit vector of 
+///    [4 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCVTPD2DQ instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit integer vector containing the converted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtpd_epi32(__m128d __a)
 {
   return __builtin_ia32_cvtpd2dq(__a);
 }
 
+/// \brief Converts a vector of [2 x double] into a 32-bit signed integer value,
+///    using the lower 64 bits of the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCVTSD2SI instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower 64 bits are used
+///    in the conversion.
+/// \returns A 32-bit signed integer containing the converted value.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_cvtsd_si32(__m128d __a)
 {
   return __builtin_ia32_cvtsd2si(__a);
 }
 
+/// \brief Converts the double-precision value in the lower 64 bits of a
+///    vector of [2 x double] into a single-precision value. The result
+///    is written to the lower 32 bits of the destination vector of
+///    [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCVTSD2SS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The upper 96 bits of this
+///    parameter are copied to the destination.
+/// \param __b
+///    A 128-bit vector of [2 x double] operand containing a double
+///    value to be converted. The lower 64 bits of this operand are
+///    used in the conversion.
+/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
+///    converted value from the second operand. The upper 96 bits are copied
+///    from the upper 96 bits of the first operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvtsd_ss(__m128 __a, __m128d __b)
 {
@@ -414,6 +1208,22 @@
   return __a;
 }
 
+/// \brief Converts a 32-bit signed integer value into a vector of [2 x double],
+///    writing the result to the lower 64 bits of the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCVTSI2SD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The upper 64 bits of this
+///    parameter are copied to the destination.
+/// \param __b
+///    A 32-bit signed integer operand containing the value to be
+///    converted.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+///    converted value from the second operand. The upper 64 bits are copied
+///    from the upper 64 bits of the first operand.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cvtsi32_sd(__m128d __a, int __b)
 {
@@ -421,6 +1231,25 @@
   return __a;
 }
 
+/// \brief Converts a float value contained in the lower 32 bits of a vector of
+///    [4 x float] into a double-precision value. The result is written to
+///    the lower 64 bits of the destination vector of [2 x double]; the upper
+///    64 bits are copied from the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCVTSS2SD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The upper 64 bits are
+///    copied to the destination.
+/// \param __b
+///    A 128-bit vector of [4 x float] operand containing a float
+///    value to be converted. The lower 32 bits of this operand are
+///    used in the conversion.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+///    converted value from the second operand. The upper 64 bits are copied
+///    from the upper 64 bits of the first operand.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cvtss_sd(__m128d __a, __m128 __b)
 {
@@ -428,48 +1257,133 @@
   return __a;
 }
 
+/// \brief Converts a 128-bit vector of [2 x double] into a 128-bit vector of 
+///    [4 x i32], truncating the result when it is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCVTTPD2DQ instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector of [4 x i32] containing the converted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvttpd_epi32(__m128d __a)
 {
   return (__m128i)__builtin_ia32_cvttpd2dq(__a);
 }
 
+/// \brief Converts a vector of [2 x double] into a 32-bit signed integer value,
+///    using the lower 64 bits of the operand, truncating the result when it
+///    is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCVTTSD2SI instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower 64 bits are used
+///    in the conversion.
+/// \returns A 32-bit signed integer containing the converted value.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_cvttsd_si32(__m128d __a)
 {
   return __a[0];
 }
 
+/// \brief Converts a 128-bit vector of [2 x double] into a 64-bit vector of [2 
+///    x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c CVTPD2PI instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \returns A 64-bit vector of [2 x i32] containing the converted values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cvtpd_pi32(__m128d __a)
 {
   return (__m64)__builtin_ia32_cvtpd2pi(__a);
 }
 
+/// \brief Converts a 128-bit vector of [2 x double] into a 64-bit vector of [2 
+///    x i32], truncating the result when it is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c CVTTPD2PI instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \returns A 64-bit vector of [2 x i32] containing the converted values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cvttpd_pi32(__m128d __a)
 {
   return (__m64)__builtin_ia32_cvttpd2pi(__a);
 }
 
+/// \brief Converts a 64-bit vector of [2 x i32] into a 128-bit vector of [2 x
+///    double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c CVTPI2PD instruction.
+///
+/// \param __a
+///    A 64-bit vector of [2 x i32].
+/// \returns A 128-bit vector of [2 x double] containing the converted values.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cvtpi32_pd(__m64 __a)
 {
   return __builtin_ia32_cvtpi2pd((__v2si)__a);
 }
 
+/// \brief Extracts a double-precision value from a vector of [2 x double] into 
+///    a double-precision value, using the lower 64 bits of the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] operand containing a double
+///    value to be extracted. The lower 64 bits of this operand are
+///    used in the extraction.
+/// \returns A double-precision value extracted from the lower 64 bits of the
+///    operand.
 static __inline__ double __DEFAULT_FN_ATTRS
 _mm_cvtsd_f64(__m128d __a)
 {
   return __a[0];
 }
 
+/// \brief Moves packed double-precision values from an aligned memory location
+///    to 64-bit elements in a 128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVAPD instruction.
+///
+/// \param __dp
+///    A 16-byte aligned pointer to a memory location containing
+///    double-precision values.
+/// \returns A 128-bit vector of [2 x double] containing the moved values.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_load_pd(double const *__dp)
 {
   return *(__m128d*)__dp;
 }
 
+/// \brief Moves and duplicates one double-precision value to double-precision
+///    values stored in a packed 128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVDDUP instruction.
+///
+/// \param __dp
+///    A pointer to a double-precision value to be moved and duplicated.
+/// \returns A 128-bit vector of [2 x double] containing the moved and 
+///    duplicated values.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_load1_pd(double const *__dp)
 {
@@ -482,6 +1396,18 @@
 
 #define        _mm_load_pd1(dp)        _mm_load1_pd(dp)
 
+/// \brief Loads two double-precision values in reverse order into a packed
+///    128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVAPD+shuffling instruction.
+///
+/// \param __dp
+///    An array of double-precision values to be loaded in reverse
+///    order.
+/// \returns A 128-bit vector of [2 x double] containing the reversed loaded
+///    values.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_loadr_pd(double const *__dp)
 {
@@ -489,6 +1415,17 @@
   return __builtin_shufflevector(__u, __u, 1, 0);
 }
 
+/// \brief Moves packed double-precision values from an unaligned memory 
+///    location to 64-bit elements in a 128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVUPD instruction.
+///
+/// \param __dp
+///    A pointer to a memory location containing double-precision
+///    values.
+/// \returns A 128-bit vector of [2 x double] containing the moved values.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_loadu_pd(double const *__dp)
 {
@@ -498,6 +1435,18 @@
   return ((struct __loadu_pd*)__dp)->__v;
 }
 
+/// \brief Moves a packed double-precision value to the lower 64 bits of a
+///    128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c MOVSD instruction.
+///
+/// \param __dp
+///    A pointer to a memory location containing a double-precision
+///    value.
+/// \returns A 128-bit vector of [2 x double] containing the moved value in the
+///    lower 64 bits, with a value of 0 assigned to the upper 64 bits.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_load_sd(double const *__dp)
 {
@@ -508,6 +1457,21 @@
   return (__m128d){ __u, 0 };
 }
 
+/// \brief Loads a double-precision value into the high-order bits of a 128-bit
+///    vector of [2 x double]. The low-order bits are copied from the
+///    low-order bits of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVHPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+///    Bits [63:0] are written to bits [63:0] of the destination.
+/// \param __dp
+///    A pointer to a double-precision value.
+///    Bits [63:0] are written to bits [127:64] of the destination.
+/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_loadh_pd(__m128d __a, double const *__dp)
 {
@@ -518,6 +1482,21 @@
   return (__m128d){ __a[0], __u };
 }
 
+/// \brief Loads a double-precision value into the low-order bits of a 128-bit
+///    vector of [2 x double]. The high-order bits are copied from the
+///    high-order bits of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVLPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+///    Bits [63:0] are written to bits [127:64] of the destination.
+/// \param __dp
+///    A pointer to a double-precision value.
+///    Bits [63:0] are written to bits [63:0] of the destination.
+/// \returns A 128-bit vector of [2 x double] containing the moved values.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_loadl_pd(__m128d __a, double const *__dp)
 {
@@ -534,42 +1513,124 @@
   return (__m128d)__builtin_ia32_undef128();
 }
 
+/// \brief Initializes a 128-bit vector of [2 x double] with the specified 
+///    64-bit double-precision value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __w
+///    A double-precision value used to initialize the lower 64
+///    bits of the destination vector of [2 x double]. The upper
+///    bits of the destination are set to zero.
+/// \returns An initialized 128-bit vector of [2 x double] containing the value
+///    provided in the operand. The upper bits of the destination are set to
+///    zero.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_set_sd(double __w)
 {
   return (__m128d){ __w, 0 };
 }
 
+/// \brief Initializes both double-precision values in a 128-bit vector of [2 x
+///    double] with the specified 64-bit double-precision value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __w
+///    A double-precision value used to initialize both 64-bit
+///    double-precision values of the destination vector of [2 x
+///    double].
+/// \returns An initialized 128-bit vector of [2 x double] containing the value
+///    provided in the operand.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_set1_pd(double __w)
 {
   return (__m128d){ __w, __w };
 }
 
+/// \brief Initializes both double-precision values in a 128-bit vector of [2 x
+///    double] with the specified 64-bit double-precision values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __w
+///    A double-precision value used to initialize the upper 64
+///    bits of the destination vector of [2 x double].
+/// \param __x
+///    A double-precision value used to initialize the lower 64
+///    bits of the destination vector of [2 x double].
+/// \returns An initialized 128-bit vector of [2 x double] containing the 
+///    values provided in the operand.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_set_pd(double __w, double __x)
 {
   return (__m128d){ __x, __w };
 }
 
+/// \brief Initializes both double-precision values in a 128-bit vector of [2 x
+///    double] with the specified 64-bit double-precision values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __w
+///    A double-precision value used to initialize the lower 64
+///    bits of the destination vector of [2 x double].
+/// \param __x
+///    A double-precision value used to initialize the upper 64
+///    bits of the destination vector of [2 x double].
+/// \returns An initialized 128-bit vector of [2 x double] containing the 
+///    values provided in the operand.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_setr_pd(double __w, double __x)
 {
   return (__m128d){ __w, __x };
 }
 
+/// \brief Returns a 128-bit vector of [2 x double] with both elements zeroed.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \returns An initialized 128-bit vector of [2 x double] with all elements set 
+///    to zero.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_setzero_pd(void)
 {
   return (__m128d){ 0, 0 };
 }
 
+/// \brief Moves two double-precision values into a packed 128-bit vector of [2 
+///    x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVSD instruction.
+///
+/// \param __a
+///    128-bit vector of [2 x double]. The upper 64 bits of this
+///    operand are copied to the upper 64 bits of the destination.
+/// \param __b
+///    128-bit vector of [2 x double]. The lower 64 bits of this
+///    operand are copied to the lower 64 bits of the destination.
+/// \returns A 128-bit vector of [2 x double] containing the moved values.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_move_sd(__m128d __a, __m128d __b)
 {
   return (__m128d){ __b[0], __a[1] };
 }
 
+/// \brief Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
+///    memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVSD instruction.
+///
+/// \param __dp
+///    A pointer to a memory location that will receive the
+///    double-precision value.
+/// \param __a
+///    A 128-bit vector of [2 x double] whose lower 64 bits are
+///    stored.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_store_sd(double *__dp, __m128d __a)
 {
@@ -579,6 +1640,19 @@
   ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
 }
 
+/// \brief Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
+///    the upper and lower 64 bits of a memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVSD instruction.
+///
+/// \param __dp
+///    A pointer to a memory location that can store 2
+///    double-precision values.
+/// \param __a
+///    A 128-bit vector of [2 x double] whose lower 64 bits are
+///    copied to each of the values in __dp.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_store1_pd(double *__dp, __m128d __a)
 {
@@ -589,18 +1663,57 @@
   ((struct __mm_store1_pd_struct*)__dp)->__u[1] = __a[0];
 }
 
+/// \brief Moves packed double-precision values from a 128-bit vector of [2 x
+///    double] to a memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVAPD instruction.
+///
+/// \param __dp
+///    A pointer to an aligned memory location that can store 2
+///    double-precision values.
+/// \param __a
+///    A packed 128-bit vector of [2 x double] containing the
+///    values to be moved.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_store_pd(double *__dp, __m128d __a)
 {
   *(__m128d *)__dp = __a;
 }
 
+/// \brief Moves packed double-precision values from a 128-bit vector of [2 x
+///    double] to an unaligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVUPD instruction.
+///
+/// \param __dp
+///    A pointer to an unaligned memory location that can store 2
+///    double-precision values.
+/// \param __a
+///    A packed 128-bit vector of [2 x double] containing the
+///    values to be moved.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_storeu_pd(double *__dp, __m128d __a)
 {
   __builtin_ia32_storeupd(__dp, __a);
 }
 
+/// \brief Moves packed double-precision values, in reverse order, from a 
+///    128-bit vector of [2 x double] to a memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVAPD + shuffling instruction.
+///
+/// \param __dp
+///    A pointer to an aligned memory location that can store 2
+///    double-precision values in reverse order.
+/// \param __a
+///    A packed 128-bit vector of [2 x double] containing the
+///    values to be moved.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_storer_pd(double *__dp, __m128d __a)
 {
@@ -608,6 +1721,19 @@
   *(__m128d *)__dp = __a;
 }
 
+/// \brief Moves a packed double-precision value from the upper 64 bits of a
+///    128-bit vector of [2 x double] to a memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVHPD instruction.
+///
+/// \param __dp
+///    A pointer to a memory location that will receive the
+///    double-precision value.
+/// \param __a
+///    A packed 128-bit vector of [2 x double] containing the value
+///    to be moved.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_storeh_pd(double *__dp, __m128d __a)
 {
@@ -617,6 +1743,19 @@
   ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
 }
 
+/// \brief Moves a packed double-precision value from the lower 64 bits of a
+///    128-bit vector of [2 x double] to a memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVLPD instruction.
+///
+/// \param __dp
+///    A pointer to a memory location that will receive the
+///    double-precision value.
+/// \param __a
+///    A packed 128-bit vector of [2 x double] containing the value
+///    to be moved.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_storel_pd(double *__dp, __m128d __a)
 {
@@ -626,216 +1765,747 @@
   ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
 }
 
+/// \brief Adds packed 8-bit integer values and writes the sums to the
+///    corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPADDB instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \returns A 128-bit integer vector containing the sums of both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_add_epi8(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v16qi)__a + (__v16qi)__b);
 }
 
+/// \brief Adds packed 16-bit integer values and writes the sums to the
+///    corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPADDW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \returns A 128-bit integer vector containing the sums of both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_add_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v8hi)__a + (__v8hi)__b);
 }
 
+/// \brief Adds packed 32-bit integer values and writes the sums to the
+///    corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPADDD instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \returns A 128-bit integer vector containing the sums of both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_add_epi32(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v4si)__a + (__v4si)__b);
 }
 
+/// \brief Adds signed or unsigned 64-bit integer values and writes the sum to
+///    the corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PADDQ instruction.
+///
+/// \param __a
+///    A 64-bit integer containing one of the source operands.
+/// \param __b
+///    A 64-bit integer containing one of the source operands.
+/// \returns A 64-bit integer containing the sum of both operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_add_si64(__m64 __a, __m64 __b)
 {
   return (__m64)__builtin_ia32_paddq(__a, __b);
 }
 
+/// \brief Adds packed 64-bit integer values and writes the sums to the
+///    corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPADDQ instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \returns A 128-bit integer vector containing the sums of both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_add_epi64(__m128i __a, __m128i __b)
 {
   return __a + __b;
 }
 
+/// \brief Adds packed signed 8-bit integer values and writes the sums to the
+///    corresponding bits in the destination. Positive sums greater than
+///    7Fh are saturated to 7Fh. Negative sums less than 80h are saturated to
+///    80h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPADDSB instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \returns A 128-bit integer vector containing the sums of both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_adds_epi8(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
 }
 
+/// \brief Adds packed signed 16-bit integer values and writes the sums to the
+///    corresponding bits in the destination. Positive sums greater than
+///    7FFFh are saturated to 7FFFh. Negative sums less than 8000h are
+///    saturated to 8000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPADDSW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \returns A 128-bit integer vector containing the sums of both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_adds_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// \brief Adds packed unsigned 8-bit integer values and writes the unsigned
+///    integer sums to the corresponding bits in the destination. Sums greater
+///    than FFh are saturated to FFh. Because the operands are unsigned, the
+///    sums are never negative.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPADDUSB instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \returns A 128-bit integer vector containing the unsigned integer sums of 
+///    both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_adds_epu8(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
 }
 
+/// \brief Adds packed unsigned 16-bit integer values and writes the unsigned
+///    integer sums to the corresponding bits in the destination. Sums
+///    greater than FFFFh are saturated to FFFFh. Because the operands are
+///    unsigned, the sums are never negative, so no lower-bound saturation
+///    occurs.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPADDUSW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \returns A 128-bit integer vector containing the unsigned integer sums of 
+///    both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_adds_epu16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// \brief Computes the rounded averages of the packed unsigned 8-bit integer
+///    values and writes the averages to the corresponding bits in the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPAVGB instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \returns A 128-bit integer vector containing the averages of both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_avg_epu8(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
 }
 
+/// \brief Computes the rounded averages of the packed unsigned 16-bit integer
+///    values and writes the averages to the corresponding bits in the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPAVGW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \returns A 128-bit integer vector containing the averages of both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_avg_epu16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// \brief Multiplies corresponding pairs of packed 16-bit signed integer values,
+///    adds pairs of contiguous products, and writes the 32-bit sums to the
+///    corresponding bits in the destination. For example, bits [15:0] of
+///    both operands are multiplied, bits [31:16] of both operands are
+///    multiplied, and the sum of both results is written to bits [31:0] of
+///    the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPMADDWD instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \returns A 128-bit integer vector containing the sums of products of both
+///    operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_madd_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// \brief Compares each of the corresponding packed 16-bit integer values of 
+///    the 128-bit integer vectors, and writes the greater value to the
+///    corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPMAXSW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \returns A 128-bit integer vector containing the comparison results.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_max_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// \brief Compares each of the corresponding packed 8-bit unsigned integer
+///    values of the 128-bit integer vectors, and writes the greater value to
+///    the corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPMAXUB instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \returns A 128-bit integer vector containing the comparison results.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_max_epu8(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
 }
 
+/// \brief Compares each of the corresponding packed 16-bit integer values of 
+///    the 128-bit integer vectors, and writes the lesser value to the
+///    corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPMINSW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \returns A 128-bit integer vector containing the comparison results.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_min_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// \brief Compares each of the corresponding packed 8-bit unsigned integer
+///    values of the 128-bit integer vectors, and writes the lesser value to
+///    the corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPMINUB instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \returns A 128-bit integer vector containing the comparison results.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_min_epu8(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
 }
 
+/// \brief Multiplies packed 16-bit signed integer values and writes the
+///    high-order 16 bits of each 32-bit product to the corresponding bits in
+///    the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPMULHW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \returns A 128-bit integer vector containing the products of both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mulhi_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// \brief Multiplies packed 16-bit unsigned integer values and writes the
+///    high-order 16 bits of each 32-bit product to the corresponding bits in
+///    the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPMULHUW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \returns A 128-bit integer vector containing the products of both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mulhi_epu16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// \brief Multiplies packed 16-bit integer values and writes the low-order 16
+///    bits of each 32-bit product to the corresponding bits in the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPMULLW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \returns A 128-bit integer vector containing the products of both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mullo_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v8hi)__a * (__v8hi)__b);
 }
 
+/// \brief Multiplies 32-bit unsigned integer values contained in the lower 
+///    bits of the two 64-bit integer vectors, and writes the 64-bit unsigned
+///    product to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PMULUDQ instruction.
+///
+/// \param __a
+///    A 64-bit integer containing one of the source operands.
+/// \param __b
+///    A 64-bit integer containing one of the source operands.
+/// \returns A 64-bit integer vector containing the product of both operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_mul_su32(__m64 __a, __m64 __b)
 {
   return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
 }
 
+/// \brief Multiplies the even-indexed packed 32-bit unsigned integer values
+///    contained in the two 128-bit integer vectors and writes the 64-bit
+///    unsigned products to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPMULUDQ instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \returns A 128-bit integer vector containing the products of both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mul_epu32(__m128i __a, __m128i __b)
 {
   return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
 }
 
+/// \brief Computes the absolute differences of the corresponding packed
+///    8-bit unsigned integer values in the two operands. The absolute
+///    differences of the lower 8 source bytes and of the upper 8 source
+///    bytes are then summed separately, and the two sums are written to bits
+///    [15:0] and [79:64] of the destination, respectively. The remaining bits
+///    in the destination are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPSADBW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \returns A 128-bit integer vector containing the sums of the sets of 
+///    absolute differences between both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sad_epu8(__m128i __a, __m128i __b)
 {
   return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
 }
 
+/// \brief Subtracts the 8-bit integer values in the operands.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPSUBB instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the differences of the values 
+///    in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sub_epi8(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v16qi)__a - (__v16qi)__b);
 }
 
+/// \brief Subtracts the 16-bit integer values in the operands.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPSUBW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the differences of the values 
+///    in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sub_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v8hi)__a - (__v8hi)__b);
 }
 
+/// \brief Subtracts the 32-bit integer values in the operands.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPSUBD instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the differences of the values 
+///    in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sub_epi32(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v4si)__a - (__v4si)__b);
 }
 
+/// \brief Subtracts signed or unsigned 64-bit integer values and writes the
+///    difference to the corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PSUBQ instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing the minuend.
+/// \param __b
+///    A 64-bit integer vector containing the subtrahend.
+/// \returns A 64-bit integer vector containing the difference of the values in 
+///    the operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sub_si64(__m64 __a, __m64 __b)
 {
   return (__m64)__builtin_ia32_psubq(__a, __b);
 }
 
+/// \brief Subtracts the 64-bit integer values in the operands.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPSUBQ instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the differences of the values 
+///    in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sub_epi64(__m128i __a, __m128i __b)
 {
   return __a - __b;
 }
 
+/// \brief Subtracts packed signed 8-bit integer values and writes the
+///    differences to the corresponding bits in the destination. Differences
+///    greater than the largest signed 8-bit integer (7Fh) are saturated to
+///    7Fh, and differences less than the smallest signed 8-bit integer (80h)
+///    are saturated to 80h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPSUBSB instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the differences of the values 
+///    in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_subs_epi8(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
 }
 
+/// \brief Subtracts packed signed 16-bit integer values and writes the
+///    differences to the corresponding bits in the destination. Differences
+///    above 7FFFh saturate to 7FFFh; differences below 8000h saturate to 8000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPSUBSW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the differences of the values 
+///    in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_subs_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// \brief Subtracts packed unsigned 8-bit integer values and writes the
+///    unsigned integer differences to the corresponding bits in the
+///    destination. Differences that would be less than 00h are saturated to
+///    00h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPSUBUSB instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the unsigned integer 
+///    differences of the values in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_subs_epu8(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
 }
 
+/// \brief Subtracts packed unsigned 16-bit integer values and writes the
+///    unsigned integer differences to the corresponding bits in the
+///    destination. Differences that would be less than 0000h are saturated to
+///    0000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPSUBUSW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the minuends.
+/// \param __b
+///    A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the unsigned integer 
+///    differences of the values in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_subs_epu16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// \brief Performs a bitwise AND of 2 packed 128-bit integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPAND instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \returns A 128-bit integer vector containing the bitwise AND of the values
+///    in both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_and_si128(__m128i __a, __m128i __b)
 {
   return __a & __b;
 }
 
+/// \brief Performs a bitwise AND of 2 packed 128-bit integer vectors, using 
+///    the ones-complement of the values contained in the first source
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPANDN instruction.
+///
+/// \param __a
+///    A 128-bit vector containing the left source operand. The ones
+///    complement of this value is used in the bitwise AND.
+/// \param __b
+///    A 128-bit vector containing the right source operand.
+/// \returns A 128-bit integer vector containing the bitwise AND of the 
+///    ones-complement of the first operand and the values in the second operand.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_andnot_si128(__m128i __a, __m128i __b)
 {
   return ~__a & __b;
 }
 
+/// \brief Performs a bitwise OR of 2 packed 128-bit integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPOR instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \returns A 128-bit integer vector containing the bitwise OR of the values
+///    in both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_or_si128(__m128i __a, __m128i __b)
 {
   return __a | __b;
 }
 
+/// \brief Performs a bitwise exclusive OR of 2 packed 128-bit integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPXOR instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \param __b
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \returns A 128-bit integer vector containing the bitwise exclusive OR of
+///    the values in both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_xor_si128(__m128i __a, __m128i __b)
 {
   return __a ^ __b;
 }
 
+/// \brief Left-shifts the 128-bit integer vector operand by the specified 
+///    number of bytes. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128i _mm_slli_si128(__m128i a, const int imm);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VPSLLDQ instruction.
+///
+/// \param a
+///    A 128-bit integer vector containing the source operand.
+/// \param imm
+///    An immediate value specifying the number of bytes to
+///    left-shift operand a.
+/// \returns A 128-bit integer vector containing the left-shifted value.
 #define _mm_slli_si128(a, imm) __extension__ ({                         \
   (__m128i)__builtin_shufflevector((__v16qi)_mm_setzero_si128(),        \
                                    (__v16qi)(__m128i)(a),               \
@@ -859,66 +2529,217 @@
 #define _mm_bslli_si128(a, imm) \
   _mm_slli_si128((a), (imm))
 
+/// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand 
+///    by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPSLLW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to left-shift
+///    each value in operand __a.
+/// \returns A 128-bit integer vector containing the left-shifted value.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_slli_epi16(__m128i __a, int __count)
 {
   return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
 }
 
+/// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand 
+///    by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPSLLW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits[63:0] specify the
+///    number of bits to left-shift each value in operand __a.
+/// \returns A 128-bit integer vector containing the left-shifted value.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sll_epi16(__m128i __a, __m128i __count)
 {
   return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
 }
 
+/// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand 
+///    by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPSLLD instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to left-shift
+///    each value in operand __a.
+/// \returns A 128-bit integer vector containing the left-shifted value.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_slli_epi32(__m128i __a, int __count)
 {
   return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
 }
 
+/// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand 
+///    by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPSLLD instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits[63:0] specify the
+///    number of bits to left-shift each value in operand __a.
+/// \returns A 128-bit integer vector containing the left-shifted value.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sll_epi32(__m128i __a, __m128i __count)
 {
   return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
 }
 
+/// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand 
+///    by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPSLLQ instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to left-shift
+///    each value in operand __a.
+/// \returns A 128-bit integer vector containing the left-shifted value.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_slli_epi64(__m128i __a, int __count)
 {
   return __builtin_ia32_psllqi128(__a, __count);
 }
 
+/// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand 
+///    by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPSLLQ instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits[63:0] specify the
+///    number of bits to left-shift each value in operand __a.
+/// \returns A 128-bit integer vector containing the left-shifted value.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sll_epi64(__m128i __a, __m128i __count)
 {
   return __builtin_ia32_psllq128(__a, __count);
 }
 
+/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. High-order bits are filled with the
+///    sign bit of the initial value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPSRAW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to
+///    right-shift each value in operand __a.
+/// \returns A 128-bit integer vector containing the right-shifted value.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srai_epi16(__m128i __a, int __count)
 {
   return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
 }
 
+/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. High-order bits are filled with the
+///    sign bit of the initial value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPSRAW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits[63:0] specify the
+///    number of bits to right-shift each value in operand __a.
+/// \returns A 128-bit integer vector containing the right-shifted value.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sra_epi16(__m128i __a, __m128i __count)
 {
   return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
 }
 
+/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. High-order bits are filled with the
+///    sign bit of the initial value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPSRAD instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to
+///    right-shift each value in operand __a.
+/// \returns A 128-bit integer vector containing the right-shifted value.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srai_epi32(__m128i __a, int __count)
 {
   return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
 }
 
+/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. High-order bits are filled with the
+///    sign bit of the initial value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPSRAD instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits[63:0] specify the
+///    number of bits to right-shift each value in operand __a.
+/// \returns A 128-bit integer vector containing the right-shifted value.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sra_epi32(__m128i __a, __m128i __count)
 {
   return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
 }
 
+/// \brief Right-shifts the 128-bit integer vector operand by the specified
+///    number of bytes. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128i _mm_srli_si128(__m128i a, const int imm);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VPSRLDQ instruction.
+///
+/// \param a
+///    A 128-bit integer vector containing the source operand.
+/// \param imm
+///    An immediate value specifying the number of bytes to
+///    right-shift operand a.
+/// \returns A 128-bit integer vector containing the right-shifted value.
 #define _mm_srli_si128(a, imm) __extension__ ({                          \
   (__m128i)__builtin_shufflevector((__v16qi)(__m128i)(a),                \
                                    (__v16qi)_mm_setzero_si128(),         \
@@ -942,60 +2763,188 @@
 #define _mm_bsrli_si128(a, imm) \
   _mm_srli_si128((a), (imm))
 
+/// \brief Right-shifts each packed 16-bit value in the 128-bit integer vector
+///    operand by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPSRLW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to
+///    right-shift each value in operand __a.
+/// \returns A 128-bit integer vector containing the right-shifted value.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srli_epi16(__m128i __a, int __count)
 {
   return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
 }
 
+/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPSRLW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits[63:0] specify the
+///    number of bits to right-shift each value in operand __a.
+/// \returns A 128-bit integer vector containing the right-shifted value.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srl_epi16(__m128i __a, __m128i __count)
 {
   return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
 }
 
+/// \brief Right-shifts each packed 32-bit value in the 128-bit integer vector
+///    operand by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPSRLD instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to
+///    right-shift each value in operand __a.
+/// \returns A 128-bit integer vector containing the right-shifted value.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srli_epi32(__m128i __a, int __count)
 {
   return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
 }
 
+/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPSRLD instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits[63:0] specify the
+///    number of bits to right-shift each value in operand __a.
+/// \returns A 128-bit integer vector containing the right-shifted value.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srl_epi32(__m128i __a, __m128i __count)
 {
   return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
 }
 
+/// \brief Right-shifts each packed 64-bit value in the 128-bit integer vector
+///    operand by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPSRLQ instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to
+///    right-shift each value in operand __a.
+/// \returns A 128-bit integer vector containing the right-shifted value.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srli_epi64(__m128i __a, int __count)
 {
   return __builtin_ia32_psrlqi128(__a, __count);
 }
 
+/// \brief Right-shifts each 64-bit value in the 128-bit integer vector operand
+///    by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPSRLQ instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the source operand.
+/// \param __count
+///    A 128-bit integer vector in which bits[63:0] specify the
+///    number of bits to right-shift each value in operand __a.
+/// \returns A 128-bit integer vector containing the right-shifted value.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_srl_epi64(__m128i __a, __m128i __count)
 {
   return __builtin_ia32_psrlq128(__a, __count);
 }
 
+/// \brief Compares each of the corresponding packed 8-bit values of the 
+///    128-bit integer vectors for equality.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPCMPEQB instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmpeq_epi8(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v16qi)__a == (__v16qi)__b);
 }
 
+/// \brief Compares each of the corresponding packed 16-bit values of the 
+///    128-bit integer vectors for equality.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPCMPEQW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmpeq_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v8hi)__a == (__v8hi)__b);
 }
 
+/// \brief Compares each of the corresponding packed 32-bit values of the 
+///    128-bit integer vectors for equality.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPCMPEQD instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmpeq_epi32(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v4si)__a == (__v4si)__b);
 }
 
+/// \brief Compares each of the corresponding packed signed 8-bit values of
+///    the two 128-bit integer vectors to determine if the values in the
+///    first operand are greater than the corresponding values in the second
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPCMPGTB instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmpgt_epi8(__m128i __a, __m128i __b)
 {
@@ -1004,30 +2953,100 @@
   return (__m128i)((__v16qs)__a > (__v16qs)__b);
 }
 
+/// \brief Compares each of the corresponding packed signed 16-bit values of
+///    the two 128-bit integer vectors to determine if the values in the
+///    first operand are greater than the corresponding values in the second
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPCMPGTW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmpgt_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v8hi)__a > (__v8hi)__b);
 }
 
+/// \brief Compares each of the corresponding packed signed 32-bit values of
+///    the two 128-bit integer vectors to determine if the values in the
+///    first operand are greater than the corresponding values in the second
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPCMPGTD instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmpgt_epi32(__m128i __a, __m128i __b)
 {
   return (__m128i)((__v4si)__a > (__v4si)__b);
 }
 
+/// \brief Compares each of the corresponding packed signed 8-bit values of
+///    the two 128-bit integer vectors to determine if the values in the
+///    first operand are less than the corresponding values in the second
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPCMPGTB instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmplt_epi8(__m128i __a, __m128i __b)
 {
   return _mm_cmpgt_epi8(__b, __a);
 }
 
+/// \brief Compares each of the corresponding packed signed 16-bit values of
+///    the two 128-bit integer vectors to determine if the values in the
+///    first operand are less than the corresponding values in the second
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPCMPGTW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmplt_epi16(__m128i __a, __m128i __b)
 {
   return _mm_cmpgt_epi16(__b, __a);
 }
 
+/// \brief Compares each of the corresponding packed signed 32-bit values of
+///    the two 128-bit integer vectors to determine if the values in the
+///    first operand are less than the corresponding values in the second
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPCMPGTD instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __b
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmplt_epi32(__m128i __a, __m128i __b)
 {
@@ -1035,6 +3054,24 @@
 }
 
 #ifdef __x86_64__
+/// \brief Converts a 64-bit signed integer value into a double-precision value,
+///    writing the result to the lower 64 bits of the destination. The upper
+///    64 bits of the first operand are copied to the upper 64 bits of the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCVTSI2SD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The upper 64 bits of this
+///    operand are copied to the upper 64 bits of the destination.
+/// \param __b
+///    A 64-bit signed integer operand containing the value to be
+///    converted.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+///    converted value of the second operand. The upper 64 bits are copied
+///    from the upper 64 bits of the first operand.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cvtsi64_sd(__m128d __a, long long __b)
 {
@@ -1042,12 +3079,36 @@
   return __a;
 }
 
+/// \brief Converts a vector of [2 x double] into a 64-bit signed integer value,
+///    using the lower 64 bits of the operand, rounding the result according
+///    to the current rounding mode when it is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCVTSD2SI instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower 64 bits are used
+///    in the conversion.
+/// \returns A 64-bit signed integer containing the converted value.
 static __inline__ long long __DEFAULT_FN_ATTRS
 _mm_cvtsd_si64(__m128d __a)
 {
   return __builtin_ia32_cvtsd2si64(__a);
 }
 
+/// \brief Converts a vector of [2 x double] into a 64-bit signed integer value,
+///    using the lower 64 bits of the operand, truncating the result when it
+///    is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCVTTSD2SI instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. The lower 64 bits are used
+///    in the conversion.
+/// \returns A 64-bit signed integer containing the converted value.
 static __inline__ long long __DEFAULT_FN_ATTRS
 _mm_cvttsd_si64(__m128d __a)
 {
@@ -1055,24 +3116,62 @@
 }
 #endif
 
+/// \brief Converts a vector of [4 x i32] into a vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCVTDQ2PS instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \returns A 128-bit vector of [4 x float] containing the converted values.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvtepi32_ps(__m128i __a)
 {
   return __builtin_ia32_cvtdq2ps((__v4si)__a);
 }
 
+/// \brief Converts a 128-bit vector of [4 x float] into a 128-bit vector of
+///    [4 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCVTPS2DQ instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit integer vector containing the converted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtps_epi32(__m128 __a)
 {
   return (__m128i)__builtin_ia32_cvtps2dq(__a);
 }
 
+/// \brief Converts a 128-bit vector of [4 x float] into a 128-bit vector of
+///    [4 x i32], truncating the result when it is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCVTTPS2DQ instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x i32] containing the converted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvttps_epi32(__m128 __a)
 {
   return (__m128i)__builtin_ia32_cvttps2dq(__a);
 }
 
+/// \brief Converts a 32-bit signed integer value into a vector of [4 x i32],
+///    writing the result to the lower 32 bits of the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __a
+///    A 32-bit signed integer operand containing the value to be
+///    converted.
+/// \returns A 128-bit vector of [4 x i32] containing the converted value.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtsi32_si128(int __a)
 {
@@ -1080,6 +3179,15 @@
 }
 
 #ifdef __x86_64__
+/// \brief Converts a 64-bit signed integer value into a vector of [2 x i64],
+///    writing the result to the lower 64 bits of the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __a
+///    A 64-bit signed integer operand containing the value to be
+///    converted.
+/// \returns A 128-bit vector of [2 x i64] containing the converted value.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtsi64_si128(long long __a)
 {
@@ -1087,6 +3195,15 @@
 }
 #endif
 
+/// \brief Moves the least significant 32 bits of a vector of [4 x i32] to a
+///    32-bit signed integer value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __a
+///    A 128-bit vector of [4 x i32] operand. The least significant
+///    32 bits are moved to the destination.
+/// \returns A 32-bit signed integer containing the moved value.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_cvtsi128_si32(__m128i __a)
 {
@@ -1095,6 +3212,15 @@
 }
 
 #ifdef __x86_64__
+/// \brief Moves the least significant 64 bits of a vector of [2 x i64] to a
+///    64-bit signed integer value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __a
+///    A 128-bit vector of [2 x i64] operand. The least significant
+///    64 bits are moved to the destination.
+/// \returns A 64-bit signed integer containing the moved value.
 static __inline__ long long __DEFAULT_FN_ATTRS
 _mm_cvtsi128_si64(__m128i __a)
 {
@@ -1102,12 +3228,33 @@
 }
 #endif
 
+/// \brief Moves packed integer values from an aligned 128-bit memory location 
+///    to elements in a 128-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVDQA instruction.
+///
+/// \param __p
+///    An aligned pointer to a memory location containing integer
+///    values.
+/// \returns A 128-bit integer vector containing the moved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_load_si128(__m128i const *__p)
 {
   return *__p;
 }
 
+/// \brief Moves packed integer values from an unaligned 128-bit memory 
+///    location to elements in a 128-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVDQU instruction.
+///
+/// \param __p
+///    A pointer to a memory location containing integer values.
+/// \returns A 128-bit integer vector containing the moved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_loadu_si128(__m128i const *__p)
 {
@@ -1117,6 +3264,16 @@
   return ((struct __loadu_si128*)__p)->__v;
 }
 
+/// \brief Moves the lower 64 bits of a 128-bit vector of [2 x i64] located
+///    in memory to the corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __p
+///    A pointer to a 128-bit vector of [2 x i64] in memory.
+///    Bits [63:0] are written to bits [63:0] of the destination.
+/// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the 
+///    moved value. The higher order bits are cleared.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_loadl_epi64(__m128i const *__p)
 {
@@ -1132,114 +3289,444 @@
   return (__m128i)__builtin_ia32_undef128();
 }
 
+/// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
+///    the specified 64-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __q1
+///    A 64-bit integer value used to initialize the upper 64 bits
+///    of the destination vector of [2 x i64].
+/// \param __q0
+///    A 64-bit integer value used to initialize the lower 64 bits
+///    of the destination vector of [2 x i64].
+/// \returns An initialized 128-bit vector of [2 x i64] containing the values
+///    provided in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set_epi64x(long long __q1, long long __q0)
 {
   return (__m128i){ __q0, __q1 };
 }
 
+/// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
+///    the specified 64-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __q1
+///    A 64-bit integer value used to initialize the upper 64 bits
+///    of the destination vector of [2 x i64].
+/// \param __q0
+///    A 64-bit integer value used to initialize the lower 64 bits
+///    of the destination vector of [2 x i64].
+/// \returns An initialized 128-bit vector of [2 x i64] containing the values
+///    provided in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set_epi64(__m64 __q1, __m64 __q0)
 {
   return (__m128i){ (long long)__q0, (long long)__q1 };
 }
 
+/// \brief Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
+///    the specified 32-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __i3
+///    A 32-bit integer value used to initialize bits [127:96] of
+///    the destination vector.
+/// \param __i2
+///    A 32-bit integer value used to initialize bits [95:64] of
+///    the destination vector.
+/// \param __i1
+///    A 32-bit integer value used to initialize bits [63:32] of
+///    the destination vector.
+/// \param __i0
+///    A 32-bit integer value used to initialize bits [31:0] of the
+///    destination vector.
+/// \returns An initialized 128-bit vector of [4 x i32] containing the values
+///    provided in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
 {
   return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
 }
 
+/// \brief Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
+///    the specified 16-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __w7
+///    A 16-bit integer value used to initialize bits [127:112] of
+///    the destination vector.
+/// \param __w6
+///    A 16-bit integer value used to initialize bits [111:96] of
+///    the destination vector.
+/// \param __w5
+///    A 16-bit integer value used to initialize bits [95:80] of
+///    the destination vector.
+/// \param __w4
+///    A 16-bit integer value used to initialize bits [79:64] of
+///    the destination vector.
+/// \param __w3
+///    A 16-bit integer value used to initialize bits [63:48] of
+///    the destination vector.
+/// \param __w2
+///    A 16-bit integer value used to initialize bits [47:32] of
+///    the destination vector.
+/// \param __w1
+///    A 16-bit integer value used to initialize bits [31:16] of
+///    the destination vector.
+/// \param __w0
+///    A 16-bit integer value used to initialize bits [15:0] of the
+///    destination vector.
+/// \returns An initialized 128-bit vector of [8 x i16] containing the values
+///    provided in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
 {
   return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
 }
 
+/// \brief Initializes the 8-bit values in a 128-bit vector of [16 x i8] with 
+///    the specified 8-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __b15
+///    Initializes bits [127:120] of the destination vector.
+/// \param __b14
+///    Initializes bits [119:112] of the destination vector.
+/// \param __b13
+///    Initializes bits [111:104] of the destination vector.
+/// \param __b12
+///    Initializes bits [103:96] of the destination vector.
+/// \param __b11
+///    Initializes bits [95:88] of the destination vector.
+/// \param __b10
+///    Initializes bits [87:80] of the destination vector.
+/// \param __b9
+///    Initializes bits [79:72] of the destination vector.
+/// \param __b8
+///    Initializes bits [71:64] of the destination vector.
+/// \param __b7
+///    Initializes bits [63:56] of the destination vector.
+/// \param __b6
+///    Initializes bits [55:48] of the destination vector.
+/// \param __b5
+///    Initializes bits [47:40] of the destination vector.
+/// \param __b4
+///    Initializes bits [39:32] of the destination vector.
+/// \param __b3
+///    Initializes bits [31:24] of the destination vector.
+/// \param __b2
+///    Initializes bits [23:16] of the destination vector.
+/// \param __b1
+///    Initializes bits [15:8] of the destination vector.
+/// \param __b0
+///    Initializes bits [7:0] of the destination vector.
+/// \returns An initialized 128-bit vector of [16 x i8] containing the values
+///    provided in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
 {
   return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
 }
 
+/// \brief Initializes both values in a 128-bit integer vector with the 
+///    specified 64-bit integer value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __q
+///    Integer value used to initialize the elements of the
+///    destination integer vector.
+/// \returns An initialized 128-bit integer vector with all elements containing 
+///    the value provided in the operand.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set1_epi64x(long long __q)
 {
   return (__m128i){ __q, __q };
 }
 
+/// \brief Initializes both values in a 128-bit vector of [2 x i64] with the
+///    specified 64-bit value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __q
+///    A 64-bit value used to initialize the elements of the
+///    destination integer vector.
+/// \returns An initialized 128-bit vector of [2 x i64] with all elements
+///    containing the value provided in the operand.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set1_epi64(__m64 __q)
 {
   return (__m128i){ (long long)__q, (long long)__q };
 }
 
+/// \brief Initializes all values in a 128-bit vector of [4 x i32] with the
+///    specified 32-bit value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __i
+///    A 32-bit value used to initialize the elements of the
+///    destination integer vector.
+/// \returns An initialized 128-bit vector of [4 x i32] with all elements
+///    containing the value provided in the operand.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set1_epi32(int __i)
 {
   return (__m128i)(__v4si){ __i, __i, __i, __i };
 }
 
+/// \brief Initializes all values in a 128-bit vector of [8 x i16] with the
+///    specified 16-bit value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __w
+///    A 16-bit value used to initialize the elements of the
+///    destination integer vector.
+/// \returns An initialized 128-bit vector of [8 x i16] with all elements
+///    containing the value provided in the operand.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set1_epi16(short __w)
 {
   return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w };
 }
 
+/// \brief Initializes all values in a 128-bit vector of [16 x i8] with the
+///    specified 8-bit value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __b
+///    An 8-bit value used to initialize the elements of the
+///    destination integer vector.
+/// \returns An initialized 128-bit vector of [16 x i8] with all elements
+///    containing the value provided in the operand.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_set1_epi8(char __b)
 {
   return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
 }
 
+/// \brief Initializes both 64-bit integer values in a 128-bit vector of [2 x
+///    i64] in reverse order, using the specified 64-bit values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __q0
+///    A 64-bit value used to initialize the lower 64 bits of the
+///    destination vector.
+/// \param __q1
+///    A 64-bit value used to initialize the upper 64 bits of the
+///    destination vector.
+/// \returns An initialized 128-bit vector of [2 x i64] containing the values
+///    provided in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_setr_epi64(__m64 __q0, __m64 __q1)
 {
   return (__m128i){ (long long)__q0, (long long)__q1 };
 }
 
+/// \brief Initializes all 32-bit integer values in a 128-bit vector of [4 x i32]
+///    in reverse order, using the specified 32-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __i0
+///    A 32-bit integer value used to initialize bits [31:0] of the
+///    destination vector.
+/// \param __i1
+///    A 32-bit integer value used to initialize bits [63:32] of
+///    the destination vector.
+/// \param __i2
+///    A 32-bit integer value used to initialize bits [95:64] of
+///    the destination vector.
+/// \param __i3
+///    A 32-bit integer value used to initialize bits [127:96] of
+///    the destination vector.
+/// \returns An initialized 128-bit vector of [4 x i32] containing the values
+///    provided in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
 {
   return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
 }
 
+/// \brief Initializes all 16-bit integer values in a 128-bit vector of [8 x i16]
+///    in reverse order, using the specified 16-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __w0
+///    A 16-bit integer value used to initialize bits [15:0] of the
+///    destination vector.
+/// \param __w1
+///    A 16-bit integer value used to initialize bits [31:16] of
+///    the destination vector.
+/// \param __w2
+///    A 16-bit integer value used to initialize bits [47:32] of
+///    the destination vector.
+/// \param __w3
+///    A 16-bit integer value used to initialize bits [63:48] of
+///    the destination vector.
+/// \param __w4
+///    A 16-bit integer value used to initialize bits [79:64] of
+///    the destination vector.
+/// \param __w5
+///    A 16-bit integer value used to initialize bits [95:80] of
+///    the destination vector.
+/// \param __w6
+///    A 16-bit integer value used to initialize bits [111:96] of
+///    the destination vector.
+/// \param __w7
+///    A 16-bit integer value used to initialize bits [127:112] of
+///    the destination vector.
+/// \returns An initialized 128-bit vector of [8 x i16] containing the values
+///    provided in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
 {
   return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
 }
 
+/// \brief Initializes all 8-bit integer values in a 128-bit vector of [16 x i8]
+///    in reverse order, using the specified 8-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __b0
+///    Initializes bits [7:0] of the destination vector.
+/// \param __b1
+///    Initializes bits [15:8] of the destination vector.
+/// \param __b2
+///    Initializes bits [23:16] of the destination vector.
+/// \param __b3
+///    Initializes bits [31:24] of the destination vector.
+/// \param __b4
+///    Initializes bits [39:32] of the destination vector.
+/// \param __b5
+///    Initializes bits [47:40] of the destination vector.
+/// \param __b6
+///    Initializes bits [55:48] of the destination vector.
+/// \param __b7
+///    Initializes bits [63:56] of the destination vector.
+/// \param __b8
+///    Initializes bits [71:64] of the destination vector.
+/// \param __b9
+///    Initializes bits [79:72] of the destination vector.
+/// \param __b10
+///    Initializes bits [87:80] of the destination vector.
+/// \param __b11
+///    Initializes bits [95:88] of the destination vector.
+/// \param __b12
+///    Initializes bits [103:96] of the destination vector.
+/// \param __b13
+///    Initializes bits [111:104] of the destination vector.
+/// \param __b14
+///    Initializes bits [119:112] of the destination vector.
+/// \param __b15
+///    Initializes bits [127:120] of the destination vector.
+/// \returns An initialized 128-bit vector of [16 x i8] containing the values
+///    provided in the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
 {
   return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
 }
 
+/// \brief Creates a 128-bit integer vector initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \returns An initialized 128-bit integer vector with all elements set to zero.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_setzero_si128(void)
 {
   return (__m128i){ 0LL, 0LL };
 }
 
+/// \brief Moves packed integer values from a 128-bit integer vector to an
+///    aligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVDQA instruction.
+///
+/// \param __p
+///    A pointer to an aligned memory location that will receive
+///    the integer values.
+/// \param __b
+///    A packed 128-bit integer vector containing the values to be
+///    moved.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_store_si128(__m128i *__p, __m128i __b)
 {
   *__p = __b;
 }
 
+/// \brief Moves packed integer values from a 128-bit integer vector to an
+///    unaligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVDQU instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the integer
+///    values.
+/// \param __b
+///    A packed 128-bit integer vector containing the values to be
+///    moved.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_storeu_si128(__m128i *__p, __m128i __b)
 {
   __builtin_ia32_storedqu((char *)__p, (__v16qi)__b);
 }
 
+/// \brief Moves bytes selected by the mask from the first operand to the
+///    specified unaligned memory location. When a mask bit is 1, the
+///    corresponding byte is written, otherwise it is not written. Exception
+///    and trap behavior for elements not selected for storage to memory are
+///    implementation dependent.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMASKMOVDQU instruction.
+///
+/// \param __d
+///    A 128-bit integer vector containing the values to be moved.
+/// \param __n
+///    A 128-bit integer vector containing the mask. The most
+///    significant bit of each byte represents the mask bits.
+/// \param __p
+///    A 128-bit unaligned memory location where the specified
+///    values are moved.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
 {
   __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
 }
 
+/// \brief Moves a packed 64-bit integer value from the lower 64 bits of a
+///    128-bit vector of [2 x i64] to a memory location. Only the lower 64
+///    bits of the vector are written.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVQ instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the 64-bit
+///    integer value.
+/// \param __a
+///    A packed 128-bit vector of [2 x i64]. The lower 64 bits
+///    contain the value to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_storel_epi64(__m128i *__p, __m128i __a)
 {
@@ -1249,18 +3736,58 @@
   ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
 }
 
+/// \brief Stores double-precision values in a 128-bit memory location. To
+///    minimize caching, the data is flagged as non-temporal (unlikely to be
+///    used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVNTPD instruction.
+///
+/// \param __p
+///    The 128-bit memory location used to store the value.
+/// \param __a
+///    A vector of [2 x double] containing the 64-bit values to be
+///    stored.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_pd(double *__p, __m128d __a)
 {
   __builtin_ia32_movntpd(__p, __a);
 }
 
+/// \brief Moves packed integer values from a 128-bit integer vector to a
+///    128-bit aligned memory location.
+///    To minimize caching, the data is flagged as non-temporal (unlikely to
+///    be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVNTDQ instruction.
+///
+/// \param __p
+///    A 128-bit aligned pointer to a memory location that will
+///    receive the integer values.
+/// \param __a
+///    A 128-bit integer vector containing the values to be moved.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_si128(__m128i *__p, __m128i __a)
 {
   __builtin_ia32_movntdq(__p, __a);
 }
 
+/// \brief Stores a 32-bit integer value in the specified aligned memory
+///    location. To minimize caching, the data is flagged as non-temporal
+///    (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c MOVNTI instruction.
+///
+/// \param __p
+///    The aligned memory location used to store the register
+///    value.
+/// \param __a
+///    A 32-bit integer containing the value to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_si32(int *__p, int __a)
 {
@@ -1275,42 +3802,143 @@
 }
 #endif
 
+/// \brief The cache line containing __p is flushed
+///    and invalidated from all caches in the coherency domain.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c CLFLUSH instruction.
+///
+/// \param __p
+///    The memory location used to identify the cache line to be
+///    flushed.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_clflush(void const *__p)
 {
   __builtin_ia32_clflush(__p);
 }
 
+/// \brief Forces strong memory ordering (serialization) between load
+///    instructions preceding this instruction and load instructions
+///    following this instruction, assuring the system completes all previous
+///    loads before executing subsequent loads.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c LFENCE instruction.
+///
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_lfence(void)
 {
   __builtin_ia32_lfence();
 }
 
+/// \brief Forces strong memory ordering (serialization) between load and store
+///    instructions preceding this instruction and load and store
+///    instructions following this instruction, assuring that the system
+///    completes all previous memory accesses before executing subsequent
+///    memory accesses.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c MFENCE instruction.
+///
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_mfence(void)
 {
   __builtin_ia32_mfence();
 }
 
+/// \brief Converts 16-bit signed integers from both 128-bit integer vector
+///    operands into 8-bit signed integers, and packs the results into the
+///    destination. Positive values greater than 7Fh are saturated to 7Fh. 
+///    Negative values less than 80h are saturated to 80h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPACKSSWB instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16]. The converted values are
+///    written to the lower order bits of the destination.
+/// \param __b
+///    A 128-bit vector of [8 x i16]. The converted values are
+///    written to the upper order bits of the destination.
+/// \returns A 128-bit vector of [16 x i8] containing the converted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_packs_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// \brief Converts 32-bit signed integers from both 128-bit integer vector
+///    operands into 16-bit signed integers, and packs the results into the
+///    destination. Positive values greater than 7FFFh are saturated to 7FFFh. 
+///    Negative values less than 8000h are saturated to 8000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPACKSSDW instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x i32]. The converted values are
+///    written to the lower order bits of the destination.
+/// \param __b
+///    A 128-bit vector of [4 x i32]. The converted values are
+///    written to the upper order bits of the destination.
+/// \returns A 128-bit vector of [8 x i16] containing the converted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_packs_epi32(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
 }
 
+/// \brief Converts 16-bit signed integers from both 128-bit integer vector
+///    operands into 8-bit unsigned integers, and packs the results into the
+///    destination.
+///    Values greater than FFh are saturated to FFh.
+///    Values less than 00h (that is, negative values) are saturated to
+///    00h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPACKUSWB instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16]. The converted values are
+///    written to the lower order bits of the destination.
+/// \param __b
+///    A 128-bit vector of [8 x i16]. The converted values are
+///    written to the upper order bits of the destination.
+/// \returns A 128-bit vector of [16 x i8] containing the converted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_packus_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// \brief Extracts 16 bits of extended packed data from a 128-bit integer 
+///    vector and copies it to the destination, as specified by the integer 
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPEXTRW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __imm
+///    Determines which bits are extracted using bits [2:0]:
+///    000: Bits [15:0] are copied to the destination.
+///    001: Bits [31:16] are copied to the destination.
+///    010: Bits [47:32] are copied to the destination.
+///    011: Bits [63:48] are copied to the destination.
+///    100: Bits [79:64] are copied to the destination.
+///    101: Bits [95:80] are copied to the destination.
+///    110: Bits [111:96] are copied to the destination.
+///    111: Bits [127:112] are copied to the destination.
+/// \returns A 16-bit integer containing the extracted 16 bits of extended 
+///    packed data.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_extract_epi16(__m128i __a, int __imm)
 {
@@ -1318,6 +3946,27 @@
   return (unsigned short)__b[__imm & 7];
 }
 
+/// \brief Copies extended packed data from the 128-bit integer vector operand 
+///    to the destination and inserts the lower 16-bits of an integer operand,
+///    using the offset specified by the immediate operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPINSRW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector. The remaining bits in the
+///    destination are copied from the corresponding bits in this
+///    operand.
+/// \param __b
+///    An integer. The bits of this operand are written to the
+///    destination beginning at the offset specified by operand __imm.
+/// \param __imm
+///    Specifies the bit offset to be used in the destination. The
+///    remaining bits in the destination are copied from the
+///    corresponding bits in operand __a.
+/// \returns A 128-bit integer vector containing the copied extended packed 
+///    data from the operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_insert_epi16(__m128i __a, int __b, int __imm)
 {
@@ -1326,18 +3975,69 @@
   return (__m128i)__c;
 }
 
+/// \brief Copies the values of the most significant bits from each 8-bit 
+///    element in a 128-bit integer vector to create a 16-bit mask value,
+///    zero-extends the value, and writes it to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPMOVMSKB instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the values with bits to
+///    be extracted.
+/// \returns The most significant bits from each 8-bit element in the operand,
+///    written to bits [15:0].
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_movemask_epi8(__m128i __a)
 {
   return __builtin_ia32_pmovmskb128((__v16qi)__a);
 }
 
+/// \brief Shuffles the 4 32-bit integers from a 128-bit integer vector to the
+///    destination, as specified by the immediate value operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VPSHUFD instruction.
+///
+/// \param a
+///    A 128-bit integer vector containing the values to be copied.
+/// \param imm
+///    An 8-bit immediate made of four 2-bit selectors. Bits [1:0], [3:2],
+///    [5:4] and [7:6] select which 32-bit element of a is copied to bits
+///    [31:0], [63:32], [95:64] and [127:96] of the destination,
+///    respectively (00 selects bits [31:0], 11 selects bits [127:96]).
+/// \returns A 128-bit integer vector containing the shuffled values.
 #define _mm_shuffle_epi32(a, imm) __extension__ ({ \
   (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \
                                    (__v4si)_mm_setzero_si128(), \
                                    (imm) & 0x3, ((imm) & 0xc) >> 2, \
                                    ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); })
 
+/// \brief Shuffles the lower 4 16-bit integers from a 128-bit integer vector 
+///    to the destination, as specified by the immediate value operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VPSHUFLW instruction.
+///
+/// \param a
+///    A 128-bit integer vector containing the values to be copied.
+/// \param imm
+///    An 8-bit immediate made of four 2-bit selectors. Bits [1:0], [3:2],
+///    [5:4] and [7:6] select which of the lower four 16-bit elements of a
+///    is copied to bits [15:0], [31:16], [47:32] and [63:48] of the
+///    destination, respectively. Bits [127:64] are copied unchanged.
+/// \returns A 128-bit integer vector containing the shuffled values.
 #define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
   (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
                                    (__v8hi)_mm_setzero_si128(), \
@@ -1345,6 +4045,25 @@
                                    ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
                                    4, 5, 6, 7); })
 
+/// \brief Shuffles the upper 4 16-bit integers from a 128-bit integer vector 
+///    to the destination, as specified by the immediate value operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VPSHUFHW instruction.
+///
+/// \param a
+///    A 128-bit integer vector containing the values to be copied.
+/// \param imm
+///    An 8-bit immediate value. Bits [1:0], [3:2], [5:4], and [7:6] each
+///    select which of the upper four 16-bit elements of a is copied to
+///    bits [79:64], [95:80], [111:96], and [127:112] of the destination,
+///    respectively. The lower 64 bits are copied unchanged.
+/// \returns A 128-bit integer vector containing the shuffled values.
 #define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
   (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
                                    (__v8hi)_mm_setzero_si128(), \
@@ -1354,130 +4073,442 @@
                                    4 + (((imm) & 0x30) >> 4), \
                                    4 + (((imm) & 0xc0) >> 6)); })
 
+/// \brief Unpacks the high-order (index 8-15) values from two 128-bit vectors 
+///    of [16 x i8] and interleaves them into a packed 128-bit vector of [16 x
+///    i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPUNPCKHBW instruction.
+///
+/// \param __a
+///    A 128-bit vector of [16 x i8].
+///    Bits [71:64] are written to bits [7:0] of the destination.
+///    This pattern continues until:
+///    Bits [127:120] are written to bits [119:112] of the
+///    destination.
+/// \param __b
+///    A 128-bit vector of [16 x i8].
+///    Bits [71:64] are written to bits [15:8] of the destination.
+///    This pattern continues until:
+///    Bits [127:120] are written to bits [127:120] of the
+///    destination.
+/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpackhi_epi8(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
 }
 
+/// \brief Unpacks the high-order (index 4-7) values from two 128-bit vectors 
+///    of [8 x i16] and interleaves them into a packed 128-bit vector of [8 x
+///    i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPUNPCKHWD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16].
+///    Bits [79:64] are written to bits [15:0] of the destination.
+///    This pattern continues until:
+///    Bits [127:112] are written to bits [111:96] of the
+///    destination.
+/// \param __b
+///    A 128-bit vector of [8 x i16].
+///    Bits [79:64] are written to bits [31:16] of the destination.
+///    This pattern continues until:
+///    Bits [127:112] are written to bits [127:112] of the
+///    destination.
+/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpackhi_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
 }
 
+/// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors 
+///    of [4 x i32] and interleaves them into a packed 128-bit vector of [4 x
+///    i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPUNPCKHDQ instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x i32].
+///    Bits [95:64] are written to bits [31:0] of the destination.
+///    Bits [127:96] are written to bits [95:64] of the
+///    destination.
+/// \param __b
+///    A 128-bit vector of [4 x i32].
+///    Bits [95:64] are written to bits [63:32] of the destination.
+///    Bits [127:96] are written to bits [127:96] of the
+///    destination.
+/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpackhi_epi32(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
 }
 
+/// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors
+///    of [2 x i64] and interleaves them into a packed 128-bit vector of [2 x
+///    i64].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPUNPCKHQDQ instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x i64].
+///    Bits [127:64] are written to bits [63:0] of the destination.
+/// \param __b
+///    A 128-bit vector of [2 x i64].
+///    Bits [127:64] are written to bits [127:64] of the
+///    destination.
+/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpackhi_epi64(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_shufflevector(__a, __b, 1, 2+1);
 }
 
+/// \brief Unpacks the low-order (index 0-7) values from two 128-bit vectors of
+///    [16 x i8] and interleaves them into a packed 128-bit vector of [16 x
+///    i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPUNPCKLBW instruction.
+///
+/// \param __a
+///    A 128-bit vector of [16 x i8].
+///    Bits [7:0] are written to bits [7:0] of the destination.
+///    This pattern continues until:
+///    Bits [63:56] are written to bits [119:112] of the
+///    destination.
+/// \param __b
+///    A 128-bit vector of [16 x i8].
+///    Bits [7:0] are written to bits [15:8] of the destination.
+///    This pattern continues until:
+///    Bits [63:56] are written to bits [127:120] of the
+///    destination.
+/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpacklo_epi8(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
 }
 
+/// \brief Unpacks the low-order (index 0-3) values from two 128-bit vectors of
+///    [8 x i16] and interleaves them into a packed 128-bit vector of [8 x
+///    i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPUNPCKLWD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16].
+///    Bits [15:0] are written to bits [15:0] of the destination.
+///    This pattern continues until:
+///    Bits [63:48] are written to bits [111:96] of the
+///    destination.
+/// \param __b
+///    A 128-bit vector of [8 x i16].
+///    Bits [15:0] are written to bits [31:16] of the destination.
+///    This pattern continues until:
+///    Bits [63:48] are written to bits [127:112] of the
+///    destination.
+/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpacklo_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
 }
 
+/// \brief Unpacks the low-order (index 0,1) values from two 128-bit vectors of
+///    [4 x i32] and interleaves them into a packed 128-bit vector of [4 x
+///    i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPUNPCKLDQ instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x i32].
+///    Bits [31:0] are written to bits [31:0] of the destination.
+///    Bits [63:32] are written to bits [95:64] of the destination.
+/// \param __b
+///    A 128-bit vector of [4 x i32].
+///    Bits [31:0] are written to bits [63:32] of the destination.
+///    Bits [63:32] are written to bits [127:96] of the
+///    destination.
+/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpacklo_epi32(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
 }
 
+/// \brief Unpacks the low-order (even-indexed) values from two 128-bit vectors
+///    of [2 x i64] and interleaves them into a packed 128-bit vector of [2 x
+///    i64].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPUNPCKLQDQ instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x i64].
+///    Bits [63:0] are written to bits [63:0] of the destination.
+/// \param __b
+///    A 128-bit vector of [2 x i64].
+///    Bits [63:0] are written to bits [127:64] of the destination.
+/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpacklo_epi64(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_shufflevector(__a, __b, 0, 2+0);
 }
 
+/// \brief Moves the lower 64 bits of a 128-bit integer vector to a 64-bit
+///    register.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c MOVDQ2Q instruction.
+///
+/// \param __a
+///    A 128-bit integer vector operand. The lower 64 bits are
+///    moved to the destination.
+/// \returns A 64-bit register containing the lower 64 bits of the operand.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_movepi64_pi64(__m128i __a)
 {
   return (__m64)__a[0];
 }
 
+/// \brief Moves the 64-bit operand to a 128-bit integer vector, zeroing the
+///    upper bits.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c MOVQ2DQ instruction.
+///
+/// \param __a
+///    A 64-bit value.
+/// \returns A 128-bit integer vector. The lower 64 bits contain the value from 
+///    the operand.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_movpi64_epi64(__m64 __a)
 {
   return (__m128i){ (long long)__a, 0 };
 }
 
+/// \brief Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
+///    integer vector, zeroing the upper bits.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVQ instruction.
+///
+/// \param __a
+///    A 128-bit integer vector operand. The lower 64 bits are
+///    moved to the destination.
+/// \returns A 128-bit integer vector. The lower 64 bits contain the value from 
+///    the operand. The upper 64 bits are assigned zeros.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_move_epi64(__m128i __a)
 {
   return __builtin_shufflevector(__a, (__m128i){ 0 }, 0, 2);
 }
 
+/// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors
+///    of [2 x double] and interleaves them into a packed 128-bit vector of
+///    [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VUNPCKHPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+///    Bits [127:64] are written to bits [63:0] of the destination.
+/// \param __b
+///    A 128-bit vector of [2 x double].
+///    Bits [127:64] are written to bits [127:64] of the
+///    destination.
+/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_unpackhi_pd(__m128d __a, __m128d __b)
 {
   return __builtin_shufflevector(__a, __b, 1, 2+1);
 }
 
+/// \brief Unpacks the low-order (even-indexed) values from two 128-bit vectors
+///    of [2 x double] and interleaves them into a packed 128-bit vector of
+///    [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VUNPCKLPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+///    Bits [63:0] are written to bits [63:0] of the destination.
+/// \param __b
+///    A 128-bit vector of [2 x double].
+///    Bits [63:0] are written to bits [127:64] of the destination.
+/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_unpacklo_pd(__m128d __a, __m128d __b)
 {
   return __builtin_shufflevector(__a, __b, 0, 2+0);
 }
 
+/// \brief Extracts the sign bits of the packed double-precision values in the
+///    128-bit vector of [2 x double], zero-extends the value, and writes it
+///    to the low-order bits of the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVMSKPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the values with
+///    sign bits to be extracted.
+/// \returns The sign bits from the operand, written to bits [1:0]. The 
+///    remaining bits are assigned values of zero.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_movemask_pd(__m128d __a)
 {
   return __builtin_ia32_movmskpd(__a);
 }
 
+/// \brief Selects two double-precision values from the 128-bit operands of [2 
+///    x double], as specified by the immediate value operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VSHUFPD instruction.
+///
+/// \param a
+///    A 128-bit vector of [2 x double].
+/// \param b
+///    A 128-bit vector of [2 x double].
+/// \param i
+///    An 8-bit immediate value. Bits [1:0] specify which elements to copy
+///    from a and b into the destination:
+///    Bit [0]=0: Bits [63:0] of a are copied to bits [63:0].
+///    Bit [0]=1: Bits [127:64] of a are copied to bits [63:0].
+///    Bit [1]=0: Bits [63:0] of b are copied to bits [127:64].
+///    Bit [1]=1: Bits [127:64] of b are copied to bits [127:64].
+/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
 #define _mm_shuffle_pd(a, b, i) __extension__ ({ \
   (__m128d)__builtin_shufflevector((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
                                    (i) & 1, (((i) & 2) >> 1) + 2); })
 
+/// \brief Casts 64-bit double-precision values as packed 32-bit float values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] to be cast as float values.
+/// \returns A 128-bit vector of [4 x float] containing the typecast values
+///    provided in the operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_castpd_ps(__m128d __a)
 {
   return (__m128)__a;
 }
 
+/// \brief Casts 64-bit double-precision values as integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] to be cast as integer
+///    values.
+/// \returns A 128-bit integer vector containing the typecast values provided 
+///    in the operand.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_castpd_si128(__m128d __a)
 {
   return (__m128i)__a;
 }
 
+/// \brief Casts 32-bit float values as 64-bit double-precision values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] to be cast as
+///    double-precision values.
+/// \returns A 128-bit vector of [2 x double] containing the typecast values
+///    provided in the operand.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_castps_pd(__m128 __a)
 {
   return (__m128d)__a;
 }
 
+/// \brief Casts 32-bit float values as integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] to be cast as integer
+///    values.
+/// \returns A 128-bit integer vector containing the typecast values provided 
+///    in the operand.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_castps_si128(__m128 __a)
 {
   return (__m128i)__a;
 }
 
+/// \brief Casts integer values as 32-bit float values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __a
+///    A 128-bit integer vector to be cast as float values.
+/// \returns A 128-bit vector of [4 x float] containing the typecast values
+///    provided in the operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_castsi128_ps(__m128i __a)
 {
   return (__m128)__a;
 }
 
+/// \brief Casts integer values as 64-bit double-precision values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __a
+///    A 128-bit integer vector to be cast as double-precision
+///    values.
+/// \returns A 128-bit vector of [2 x double] containing the typecast values
+///    provided in the operand.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_castsi128_pd(__m128i __a)
 {
   return (__m128d)__a;
 }
 
+/// \brief Indicates that a spin loop is being executed for the purposes of
+///    optimizing power consumption during the loop.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PAUSE instruction.
+///
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_pause(void)
 {
Index: lib/Headers/f16cintrin.h
===================================================================
--- lib/Headers/f16cintrin.h
+++ lib/Headers/f16cintrin.h
@@ -31,9 +31,43 @@
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("f16c")))
 
+/// \brief Converts a 128-bit vector containing 32-bit float values into a
+///    128-bit vector containing 16-bit half-precision float values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128i _mm_cvtps_ph(__m128 a, const int imm);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VCVTPS2PH instruction.
+///
+/// \param a
+///    A 128-bit vector containing 32-bit float values.
+/// \param imm
+///    An immediate value controlling rounding using bits [2:0]:
+///    000: Nearest
+///    001: Down
+///    010: Up
+///    011: Truncate
+///    1XX: Use MXCSR.RC for rounding
+/// \returns A 128-bit vector containing converted 16-bit half-precision float
+///    values. The lower 64 bits are used to store the converted 16-bit
+///    half-precision floating-point values.
 #define _mm_cvtps_ph(a, imm) __extension__ ({ \
  (__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)); })
 
+/// \brief Converts a 128-bit vector containing 16-bit half-precision float
+///    values into a 128-bit vector containing 32-bit float values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCVTPH2PS instruction.
+///
+/// \param __a
+///    A 128-bit vector containing 16-bit half-precision float
+///    values. The lower 64 bits are used in the conversion.
+/// \returns A 128-bit vector of [4 x float] containing converted float values.
 static __inline __m128 __DEFAULT_FN_ATTRS
 _mm_cvtph_ps(__m128i __a)
 {
Index: lib/Headers/immintrin.h
===================================================================
--- lib/Headers/immintrin.h
+++ lib/Headers/immintrin.h
@@ -46,9 +46,46 @@
    Intel documents these as being in immintrin.h, and
    they depend on typedefs from avxintrin.h. */
 
+/// \brief Converts a 256-bit vector of [8 x float] into a 128-bit vector
+///    containing 16-bit half-precision float values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128i _mm256_cvtps_ph(__m256 a, const int imm);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VCVTPS2PH instruction.
+///
+/// \param a
+///    A 256-bit vector containing 32-bit single-precision float
+///    values to be converted to 16-bit half-precision float
+///    values.
+/// \param imm
+///    An immediate value controlling rounding using bits [2:0]:
+///    000: Nearest
+///    001: Down
+///    010: Up
+///    011: Truncate
+///    1XX: Use MXCSR.RC for rounding
+/// \returns A 128-bit vector containing the converted 16-bit half-precision 
+///    float values.
 #define _mm256_cvtps_ph(a, imm) __extension__ ({ \
  (__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm)); })
 
+/// \brief Converts a 128-bit vector containing 16-bit half-precision float
+///    values into a 256-bit vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCVTPH2PS instruction.
+///
+/// \param __a
+///    A 128-bit vector containing 16-bit half-precision float
+///    values to be converted to 32-bit single-precision float
+///    values.
+/// \returns A vector of [8 x float] containing the converted 32-bit
+///    single-precision float values.
 static __inline __m256 __attribute__((__always_inline__, __nodebug__, __target__("f16c")))
 _mm256_cvtph_ps(__m128i __a)
 {
Index: lib/Headers/mmintrin.h
===================================================================
--- lib/Headers/mmintrin.h
+++ lib/Headers/mmintrin.h
@@ -33,366 +33,1229 @@
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mmx")))
 
+/// \brief Clears the MMX state by setting the state of the stack registers to
+///    empty.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c EMMS instruction.
+///
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_empty(void)
 {
     __builtin_ia32_emms();
 }
 
+/// \brief Converts a 32-bit signed integer value into a 64-bit vector, writing
+///    the result to the lower 32 bits of the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c MOVD instruction.
+///
+/// \param __i
+///    A 32-bit signed integer operand containing the value to be
+///    converted.
+/// \returns A 64-bit vector containing the converted value.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cvtsi32_si64(int __i)
 {
     return (__m64)__builtin_ia32_vec_init_v2si(__i, 0);
 }
 
+/// \brief Converts a 64-bit vector into a 32-bit signed integer value, using 
+///    the lower 32 bits of the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c MOVD instruction.
+///
+/// \param __m
+///    A 64-bit vector. The lower 32 bits are used in the
+///    conversion.
+/// \returns A 32-bit signed integer containing the converted value.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_cvtsi64_si32(__m64 __m)
 {
     return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0);
 }
 
+/// \brief Converts a 64-bit signed integer value into a 64-bit vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c MOVQ instruction.
+///
+/// \param __i
+///    A 64-bit signed integer operand containing the value to be
+///    converted.
+/// \returns A 64-bit vector containing the converted value of operand __i.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cvtsi64_m64(long long __i)
 {
     return (__m64)__i;
 }
 
+/// \brief Converts a 64-bit vector into a 64-bit signed integer value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c MOVQ instruction.
+///
+/// \param __m
+///    A 64-bit vector operand containing the value to be
+///    converted.
+/// \returns A 64-bit signed integer containing the converted value of __m.
 static __inline__ long long __DEFAULT_FN_ATTRS
 _mm_cvtm64_si64(__m64 __m)
 {
     return (long long)__m;
 }
 
+/// \brief Converts 16-bit signed integers from both 64-bit vector operands 
+///    into 8-bit signed integer values, and packs the results into the
+///    destination. Positive values greater than 7Fh are saturated to 7Fh. 
+///    Negative values less than 80h are saturated to 80h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PACKSSWB instruction.
+///
+/// \param __m1
+///    A 64-bit vector of [4 x i16]. The converted values are
+///    written to the lower order bits of the destination.
+/// \param __m2
+///    A 64-bit vector of [4 x i16]. The converted values are
+///    written to the upper order bits of the destination.
+/// \returns A 64-bit vector of [8 x i8] containing the converted values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_packs_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2);
 }
 
+/// \brief Converts 32-bit signed integers from both 64-bit integer vector
+///    operands into 16-bit signed integers, and packs the results into the
+///    destination. Positive values greater than 7FFFh are saturated to 7FFFh. 
+///    Negative values less than 8000h are saturated to 8000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PACKSSDW instruction.
+///
+/// \param __m1
+///    A 64-bit vector of [2 x i32]. The converted values are
+///    written to the lower order bits of the destination.
+/// \param __m2
+///    A 64-bit vector of [2 x i32]. The converted values are
+///    written to the upper order bits of the destination.
+/// \returns A 64-bit vector of [4 x i16] containing the converted values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_packs_pi32(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2);
 }
 
+/// \brief Converts 16-bit signed integers from both 64-bit integer vector
+///    operands into 8-bit unsigned integers, and packs the results into the
+///    destination. Values greater than FFh
+///    are saturated to FFh. Values
+///    less than 00h are saturated to
+///    00h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PACKUSWB instruction.
+///
+/// \param __m1
+///    A 64-bit vector of [4 x i16]. The converted values are
+///    written to the lower order bits of the destination.
+/// \param __m2
+///    A 64-bit vector of [4 x i16]. The converted values are
+///    written to the upper order bits of the destination.
+/// \returns A 64-bit vector of [8 x i8] containing the converted values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_packs_pu16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2);
 }
 
+/// \brief Unpacks the high-order (index 4-7) values from two 64-bit vectors of
+///    [8 x i8] and interleaves them into a packed 64-bit vector of [8 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PUNPCKHBW instruction.
+///
+/// \param __m1
+///    A 64-bit vector of [8 x i8].
+///    Bits [39:32] are written to bits [7:0] of the destination.
+///    This pattern continues until:
+///    Bits [63:56] are written to bits [55:48] of the destination.
+/// \param __m2
+///    A 64-bit vector of [8 x i8].
+///    Bits [39:32] are written to bits [15:8] of the destination.
+///    This pattern continues until:
+///    Bits [63:56] are written to bits [63:56] of the destination.
+/// \returns A 64-bit vector of [8 x i8] containing the interleaved values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2);
 }
 
+/// \brief Unpacks the high-order (index 2,3) values from two 64-bit vectors of
+///    [4 x i16] and interleaves them into a packed 64-bit vector of [4 x
+///    i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PUNPCKHWD instruction.
+///
+/// \param __m1
+///    A 64-bit vector of [4 x i16].
+///    Bits [47:32] are written to bits [15:0] of the destination.
+///    Bits [63:48] are written to bits [47:32] of the destination.
+/// \param __m2
+///    A 64-bit vector of [4 x i16].
+///    Bits [47:32] are written to bits [31:16] of the destination.
+///    Bits [63:48] are written to bits [63:48] of the destination.
+/// \returns A 64-bit vector of [4 x i16] containing the interleaved values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2);
 }
 
+/// \brief Unpacks the high-order (odd-indexed) values from two 64-bit vectors 
+///    of [2 x i32] and interleaves them into a packed 64-bit vector of [2 x
+///    i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PUNPCKHDQ instruction.
+///
+/// \param __m1
+///    A 64-bit vector of [2 x i32].
+///    Bits [63:32] are written to bits [31:0] of the destination.
+/// \param __m2
+///    A 64-bit vector of [2 x i32].
+///    Bits [63:32] are written to bits [63:32] of the destination.
+/// \returns A 64-bit vector of [2 x i32] containing the interleaved values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2);
 }
 
+/// \brief Unpacks the low-order (index 0-3) values from two 64-bit vectors of 
+///    [8 x i8] and interleaves them into a packed 64-bit vector of [8 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PUNPCKLBW instruction.
+///
+/// \param __m1
+///    A 64-bit vector of [8 x i8].
+///    Bits [7:0] are written to bits [7:0] of the destination.
+///    This pattern continues until:
+///    Bits [31:24] are written to bits [55:48] of the destination.
+/// \param __m2
+///    A 64-bit vector of [8 x i8].
+///    Bits [7:0] are written to bits [15:8] of the destination.
+///    This pattern continues until:
+///    Bits [31:24] are written to bits [63:56] of the destination.
+/// \returns A 64-bit vector of [8 x i8] containing the interleaved values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2);
 }
 
+/// \brief Unpacks the low-order (index 0,1) values from two 64-bit vectors of 
+///    [4 x i16] and interleaves them into a packed 64-bit vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PUNPCKLWD instruction.
+///
+/// \param __m1
+///    A 64-bit vector of [4 x i16].
+///    Bits [15:0] are written to bits [15:0] of the destination.
+///    Bits [31:16] are written to bits [47:32] of the destination.
+/// \param __m2
+///    A 64-bit vector of [4 x i16].
+///    Bits [15:0] are written to bits [31:16] of the destination.
+///    Bits [31:16] are written to bits [63:48] of the destination.
+/// \returns A 64-bit vector of [4 x i16] containing the interleaved values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2);
 }
 
+/// \brief Unpacks the low-order (even-indexed) values from two 64-bit vectors 
+///    of [2 x i32] and interleaves them into a packed 64-bit vector of [2 x
+///    i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PUNPCKLDQ instruction.
+///
+/// \param __m1
+///    A 64-bit vector of [2 x i32].
+///    Bits [31:0] are written to bits [31:0] of the destination.
+/// \param __m2
+///    A 64-bit vector of [2 x i32].
+///    Bits [31:0] are written to bits [63:32] of the destination.
+/// \returns A 64-bit vector of [2 x i32] containing the interleaved values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2);
 }
 
+/// \brief Adds packed 8-bit integer values and writes the sums to the
+///    corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PADDB instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \param __m2
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \returns A 64-bit integer vector containing the sums of both operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_add_pi8(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2);
 }
 
+/// \brief Adds packed 16-bit integer values and writes the sums to the
+///    corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PADDW instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \param __m2
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \returns A 64-bit integer vector containing the sums of both operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_add_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2);
 }
 
+/// \brief Adds packed 32-bit integer values and writes the sums to the
+///    corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PADDD instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \param __m2
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \returns A 64-bit integer vector containing the sums of both operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_add_pi32(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2);
 }
 
+/// \brief Adds packed 8-bit signed integer values and writes the sums to the
+///    corresponding bits in the destination. Positive sums greater than
+///    7Fh are saturated to 7Fh. Negative sums less than 80h are saturated to
+///    80h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PADDSB instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \param __m2
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \returns A 64-bit integer vector containing the sums of both operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_adds_pi8(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2);
 }
 
+/// \brief Adds packed 16-bit signed integer values and writes the sums to the
+///    corresponding bits in the destination. Positive sums greater than
+///    7FFFh are saturated to 7FFFh. Negative sums less than 8000h are
+///    saturated to 8000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PADDSW instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \param __m2
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \returns A 64-bit integer vector containing the sums of both operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_adds_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);
 }
 
+/// \brief Adds packed 8-bit unsigned integer values and writes the unsigned
+///    integer sums to the corresponding bits in the destination. Sums
+///    greater than FFh are saturated to FFh; unsigned sums cannot be less
+///    than 00h, so no lower saturation occurs.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PADDUSB instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \param __m2
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \returns A 64-bit integer vector containing the unsigned integer sums of 
+///    both operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_adds_pu8(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2);
 }
 
+/// \brief Adds packed 16-bit unsigned integer values and writes the unsigned
+///    integer sums to the corresponding bits in the destination. Sums
+///    greater than FFFFh are saturated to FFFFh. Because the values are
+///    unsigned, a sum cannot be less than 0000h, so no lower saturation
+///    occurs.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PADDUSW instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \param __m2
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \returns A 64-bit integer vector containing the unsigned integer sums of 
+///    both operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_adds_pu16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2);
 }
 
+/// \brief Subtracts the 8-bit integer values in the operands.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PSUBB instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector containing the minuends.
+/// \param __m2
+///    A 64-bit integer vector containing the subtrahends.
+/// \returns A 64-bit integer vector containing the differences of the values 
+///    in the operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sub_pi8(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2);
 }
 
+/// \brief Subtracts the 16-bit integer values in the operands.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PSUBW instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector containing the minuends.
+/// \param __m2
+///    A 64-bit integer vector containing the subtrahends.
+/// \returns A 64-bit integer vector containing the differences of the values 
+///    in the operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sub_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2);
 }
 
+/// \brief Subtracts the 32-bit integer values in the operands.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PSUBD instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector containing the minuends.
+/// \param __m2
+///    A 64-bit integer vector containing the subtrahends.
+/// \returns A 64-bit integer vector containing the differences of the values 
+///    in the operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sub_pi32(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2);
 }
 
+/// \brief Subtracts packed 8-bit signed integer values and writes the
+///    differences to the corresponding bits in the destination. Values
+///    greater than the largest signed 8-bit integer are saturated to 7Fh,
+///    and values less than the smallest signed 8-bit integer are saturated
+///    to 80h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PSUBSB instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector containing the minuends.
+/// \param __m2
+///    A 64-bit integer vector containing the subtrahends.
+/// \returns A 64-bit integer vector containing the differences of the values 
+///    in the operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_subs_pi8(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2);
 }
 
+/// \brief Subtracts packed 16-bit signed integer values and writes the
+///    differences to the corresponding bits in the destination. Values above
+///    7FFFh are saturated to 7FFFh, and values below 8000h to 8000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PSUBSW instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector containing the minuends.
+/// \param __m2
+///    A 64-bit integer vector containing the subtrahends.
+/// \returns A 64-bit integer vector containing the differences of the values 
+///    in the operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_subs_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2);
 }
 
+/// \brief Subtracts packed 8-bit unsigned integer values and writes the
+///    unsigned integer differences to the corresponding bits in the
+///    destination. Differences less than 00h (negative results) are
+///    saturated to 00h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PSUBUSB instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector containing the minuends.
+/// \param __m2
+///    A 64-bit integer vector containing the subtrahends.
+/// \returns A 64-bit integer vector containing the unsigned integer differences 
+///    of the values in the operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_subs_pu8(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2);
 }
 
+/// \brief Subtracts packed 16-bit unsigned integer values and writes the
+///    unsigned integer differences to the corresponding bits in the
+///    destination. Differences less than 0000h (negative results) are
+///    saturated to 0000h; an unsigned difference cannot exceed FFFFh.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PSUBUSW instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector containing the minuends.
+/// \param __m2
+///    A 64-bit integer vector containing the subtrahends.
+/// \returns A 64-bit integer vector containing the unsigned integer differences 
+///    of the values in the operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_subs_pu16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2);
 }
 
+/// \brief Multiplies corresponding pairs of packed 16-bit signed integer values,
+///    adds pairs of contiguous products, and writes the 32-bit sums to the
+///    corresponding bits in the destination. For example, bits [15:0] of
+///    both operands are multiplied, bits [31:16] of both operands are
+///    multiplied, and the sum of both results is written to bits [31:0] of
+///    the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PMADDWD instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \param __m2
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \returns A 64-bit integer vector containing the sums of products of both
+///    operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_madd_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2);
 }
 
+/// \brief Multiplies packed 16-bit signed integer values and writes the
+///    high-order 16 bits of each 32-bit product to the corresponding bits in
+///    the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PMULHW instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \param __m2
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \returns A 64-bit integer vector with the high 16 bits of each product.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_mulhi_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2);
 }
 
+/// \brief Multiplies packed 16-bit integer values and writes the low-order 16
+///    bits of each 32-bit product to the corresponding bits in the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PMULLW instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \param __m2
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \returns A 64-bit integer vector with the low 16 bits of each product.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_mullo_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2);
 }
 
+/// \brief Left-shifts each 16-bit value in the 64-bit integer vector operand 
+///    by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PSLLW instruction.
+///
+/// \param __m
+///    A 64-bit integer vector containing the source operand.
+/// \param __count
+///    A 64-bit integer vector specifying the number of bits to
+///    left-shift each value in operand __m.
+/// \returns A 64-bit integer vector containing the left-shifted value.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sll_pi16(__m64 __m, __m64 __count)
 {
     return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count);
 }
 
+/// \brief Left-shifts each 16-bit value in the 64-bit integer vector operand 
+///    by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PSLLW instruction.
+///
+/// \param __m
+///    A 64-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to left-shift
+///    each value in operand __m.
+/// \returns A 64-bit integer vector containing the left-shifted value.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_slli_pi16(__m64 __m, int __count)
 {
     return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count);
 }
 
+/// \brief Left-shifts each 32-bit value in the 64-bit integer vector operand 
+///    by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PSLLD instruction.
+///
+/// \param __m
+///    A 64-bit integer vector containing the source operand.
+/// \param __count
+///    A 64-bit integer vector specifying the number of bits to
+///    left-shift each value in operand __m.
+/// \returns A 64-bit integer vector containing the left-shifted value.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sll_pi32(__m64 __m, __m64 __count)
 {
     return (__m64)__builtin_ia32_pslld((__v2si)__m, __count);
 }
 
+/// \brief Left-shifts each 32-bit value in the 64-bit integer vector operand 
+///    by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PSLLD instruction.
+///
+/// \param __m
+///    A 64-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to left-shift
+///    each value in operand __m.
+/// \returns A 64-bit integer vector containing the left-shifted value.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_slli_pi32(__m64 __m, int __count)
 {
     return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count);
 }
 
+/// \brief Left-shifts the 64-bit integer vector operand by the specified 
+///    number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PSLLQ instruction.
+///
+/// \param __m
+///    A 64-bit integer vector containing the source operand.
+/// \param __count
+///    A 64-bit integer vector specifying the number of bits to
+///    left-shift operand __m.
+/// \returns A 64-bit integer vector containing the left-shifted value.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sll_si64(__m64 __m, __m64 __count)
 {
     return (__m64)__builtin_ia32_psllq(__m, __count);
 }
 
+/// \brief Left-shifts the 64-bit integer vector operand by the specified 
+///    number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PSLLQ instruction.
+///
+/// \param __m
+///    A 64-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to left-shift
+///    operand __m.
+/// \returns A 64-bit integer vector containing the left-shifted value.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_slli_si64(__m64 __m, int __count)
 {
     return (__m64)__builtin_ia32_psllqi(__m, __count);
 }
 
+/// \brief Right-shifts each 16-bit value in the 64-bit integer vector operand 
+///    by the specified number of bits. High-order bits are filled with the 
+///    sign
+///    bit of the initial value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PSRAW instruction.
+///
+/// \param __m
+///    A 64-bit integer vector containing the source operand.
+/// \param __count
+///    A 64-bit integer vector specifying the number of bits to
+///    right-shift each value in operand __m.
+/// \returns A 64-bit integer vector containing the right-shifted value.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sra_pi16(__m64 __m, __m64 __count)
 {
     return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count);
 }
 
+/// \brief Right-shifts each 16-bit value in the 64-bit integer vector operand 
+///    by the specified number of bits. High-order bits are filled with the 
+///    sign
+///    bit of the initial value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PSRAW instruction.
+///
+/// \param __m
+///    A 64-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to
+///    right-shift each value in operand __m.
+/// \returns A 64-bit integer vector containing the right-shifted value.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_srai_pi16(__m64 __m, int __count)
 {
     return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count);
 }
 
+/// \brief Right-shifts each 32-bit value in the 64-bit integer vector operand 
+///    by the specified number of bits. High-order bits are filled with the 
+///    sign
+///    bit of the initial value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PSRAD instruction.
+///
+/// \param __m
+///    A 64-bit integer vector containing the source operand.
+/// \param __count
+///    A 64-bit integer vector specifying the number of bits to
+///    right-shift each value in operand __m.
+/// \returns A 64-bit integer vector containing the right-shifted value.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sra_pi32(__m64 __m, __m64 __count)
 {
     return (__m64)__builtin_ia32_psrad((__v2si)__m, __count);
 }
 
+/// \brief Right-shifts each 32-bit value in the 64-bit integer vector operand 
+///    by the specified number of bits. High-order bits are filled with the 
+///    sign
+///    bit of the initial value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PSRAD instruction.
+///
+/// \param __m
+///    A 64-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to
+///    right-shift each value in operand __m.
+/// \returns A 64-bit integer vector containing the right-shifted value.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_srai_pi32(__m64 __m, int __count)
 {
     return (__m64)__builtin_ia32_psradi((__v2si)__m, __count);
 }
 
+/// \brief Right-shifts each 16-bit value in the 64-bit integer vector operand 
+///    by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PSRLW instruction.
+///
+/// \param __m
+///    A 64-bit integer vector containing the source operand.
+/// \param __count
+///    A 64-bit integer vector specifying the number of bits to
+///    right-shift each value in operand __m.
+/// \returns A 64-bit integer vector containing the right-shifted value.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_srl_pi16(__m64 __m, __m64 __count)
 {
     return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count);
 }
 
+/// \brief Right-shifts each packed 16-bit value in the 64-bit integer vector
+///    operand by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PSRLW instruction.
+///
+/// \param __m
+///    A 64-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to
+///    right-shift each value in operand __m.
+/// \returns A 64-bit integer vector containing the right-shifted value.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_srli_pi16(__m64 __m, int __count)
 {
     return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count);
 }
 
+/// \brief Right-shifts each 32-bit value in the 64-bit integer vector operand 
+///    by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PSRLD instruction.
+///
+/// \param __m
+///    A 64-bit integer vector containing the source operand.
+/// \param __count
+///    A 64-bit integer vector specifying the number of bits to
+///    right-shift each value in operand __m.
+/// \returns A 64-bit integer vector containing the right-shifted value.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_srl_pi32(__m64 __m, __m64 __count)
 {
     return (__m64)__builtin_ia32_psrld((__v2si)__m, __count);
 }
 
+/// \brief Right-shifts each packed 32-bit value in the 64-bit integer vector
+///    operand by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PSRLD instruction.
+///
+/// \param __m
+///    A 64-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to
+///    right-shift each value in operand __m.
+/// \returns A 64-bit integer vector containing the right-shifted value.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_srli_pi32(__m64 __m, int __count)
 {
     return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count);
 }
 
+/// \brief Right-shifts the 64-bit integer vector operand by the specified 
+///    number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PSRLQ instruction.
+///
+/// \param __m
+///    A 64-bit integer vector containing the source operand.
+/// \param __count
+///    A 64-bit integer vector specifying the number of bits to
+///    right-shift operand __m.
+/// \returns A 64-bit integer vector containing the right-shifted value.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_srl_si64(__m64 __m, __m64 __count)
 {
     return (__m64)__builtin_ia32_psrlq(__m, __count);
 }
 
+/// \brief Right-shifts the 64-bit integer vector operand by the specified 
+///    number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PSRLQ instruction.
+///
+/// \param __m
+///    A 64-bit integer vector containing the source operand.
+/// \param __count
+///    An integer value specifying the number of bits to
+///    right-shift operand __m.
+/// \returns A 64-bit integer vector containing the right-shifted value.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_srli_si64(__m64 __m, int __count)
 {
     return (__m64)__builtin_ia32_psrlqi(__m, __count);
 }
 
+/// \brief Performs a bitwise AND of two 64-bit integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PAND instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \param __m2
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \returns A 64-bit integer vector containing the bitwise AND of the values
+///    in both operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_and_si64(__m64 __m1, __m64 __m2)
 {
     return __builtin_ia32_pand(__m1, __m2);
 }
 
+/// \brief Performs a bitwise NOT of the first 64-bit integer vector, and then
+///    performs a bitwise AND of the intermediate result and the second 64-bit
+///    integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PANDN instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector containing the left source operand.
+///    The ones complement of this value is used in the
+///    bitwise AND.
+/// \param __m2
+///    A 64-bit integer vector containing the right source operand.
+/// \returns A 64-bit integer vector containing the bitwise AND of the second
+///    operand and the one's complement (bitwise NOT) of the first operand.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_andnot_si64(__m64 __m1, __m64 __m2)
 {
     return __builtin_ia32_pandn(__m1, __m2);
 }
 
+/// \brief Performs a bitwise OR of two 64-bit integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c POR instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \param __m2
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \returns A 64-bit integer vector containing the bitwise OR of the values
+///    in both operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_or_si64(__m64 __m1, __m64 __m2)
 {
     return __builtin_ia32_por(__m1, __m2);
 }
 
+/// \brief Performs a bitwise exclusive OR of two 64-bit integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PXOR instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \param __m2
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \returns A 64-bit integer vector containing the bitwise exclusive OR of the
+///    values in both operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_xor_si64(__m64 __m1, __m64 __m2)
 {
     return __builtin_ia32_pxor(__m1, __m2);
 }
 
+/// \brief Compares each of the corresponding packed 8-bit values of the 64-bit
+///    integer vectors for equality.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PCMPEQB instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector.
+/// \param __m2
+///    A 64-bit integer vector.
+/// \returns A 64-bit integer vector containing the comparison results.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2);
 }
 
+/// \brief Compares each of the corresponding packed 16-bit values of the 
+///    64-bit integer vectors for equality.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PCMPEQW instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector.
+/// \param __m2
+///    A 64-bit integer vector.
+/// \returns A 64-bit integer vector containing the comparison results.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2);
 }
 
+/// \brief Compares each of the corresponding packed 32-bit values of the 
+///    64-bit integer vectors for equality.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PCMPEQD instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector.
+/// \param __m2
+///    A 64-bit integer vector.
+/// \returns A 64-bit integer vector containing the comparison results.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2);
 }
 
+/// \brief Compares each of the corresponding packed 8-bit signed values of the
+///    64-bit integer vectors to determine if the values in the first operand
+///    are greater than those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PCMPGTB instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector.
+/// \param __m2
+///    A 64-bit integer vector.
+/// \returns A 64-bit integer vector containing the comparison results.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2);
 }
 
+/// \brief Compares each of the corresponding packed 16-bit values of the
+///    64-bit integer vectors, as signed integers, to determine if the
+///    values in the first operand are greater than those in the second
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PCMPGTW instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector.
+/// \param __m2
+///    A 64-bit integer vector.
+/// \returns A 64-bit integer vector containing the comparison results.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2);
 }
 
+/// \brief Compares each of the corresponding packed 32-bit values of the
+///    64-bit integer vectors, as signed integers, to determine if the
+///    values in the first operand are greater than those in the second
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PCMPGTD instruction.
+///
+/// \param __m1
+///    A 64-bit integer vector.
+/// \param __m2
+///    A 64-bit integer vector.
+/// \returns A 64-bit integer vector containing the comparison results.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
 {
     return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2);
 }
 
+/// \brief Constructs a 64-bit integer vector initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \returns An initialized 64-bit integer vector with all elements set to zero.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_setzero_si64(void)
 {
     return (__m64){ 0LL };
 }
 
+/// \brief Initializes the 32-bit values in a 64-bit vector of [2 x i32] with 
+///    the specified 32-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __i1
+///    A 32-bit integer value used to initialize bits [63:32] of
+///    the destination vector.
+/// \param __i0
+///    A 32-bit integer value used to initialize bits [31:0] of the
+///    destination vector.
+/// \returns An initialized 64-bit vector of [2 x i32] containing the values
+///    provided in the operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_set_pi32(int __i1, int __i0)
 {
     return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1);
 }
 
+/// \brief Initializes the 16-bit values in a 64-bit vector of [4 x i16] with 
+///    the specified 16-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __s3
+///    A 16-bit integer value used to initialize bits [63:48] of
+///    the destination vector.
+/// \param __s2
+///    A 16-bit integer value used to initialize bits [47:32] of
+///    the destination vector.
+/// \param __s1
+///    A 16-bit integer value used to initialize bits [31:16] of
+///    the destination vector.
+/// \param __s0
+///    A 16-bit integer value used to initialize bits [15:0] of the
+///    destination vector.
+/// \returns An initialized 64-bit vector of [4 x i16] containing the values
+///    provided in the operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
 {
     return (__m64)__builtin_ia32_vec_init_v4hi(__s0, __s1, __s2, __s3);
 }
 
+/// \brief Initializes the 8-bit values in a 64-bit vector of [8 x i8] with the
+///    specified 8-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __b7
+///    Initializes bits [63:56] of the destination vector.
+/// \param __b6
+///    Initializes bits [55:48] of the destination vector.
+/// \param __b5
+///    Initializes bits [47:40] of the destination vector.
+/// \param __b4
+///    Initializes bits [39:32] of the destination vector.
+/// \param __b3
+///    Initializes bits [31:24] of the destination vector.
+/// \param __b2
+///    Initializes bits [23:16] of the destination vector.
+/// \param __b1
+///    Initializes bits [15:8] of the destination vector.
+/// \param __b0
+///    Initializes bits [7:0] of the destination vector.
+/// \returns An initialized 64-bit vector of [8 x i8] containing the values
+///    provided in the operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
             char __b1, char __b0)
@@ -401,36 +1264,121 @@
                                                __b4, __b5, __b6, __b7);
 }
 
+/// \brief Initializes all values in a 64-bit vector of [2 x i32] with the
+///    specified 32-bit value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __i
+///    A 32-bit value used to initialize the elements of the
+///    destination integer vector.
+/// \returns An initialized 64-bit vector of [2 x i32] with all elements 
+///    containing the value provided in the operand.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_set1_pi32(int __i)
 {
     return _mm_set_pi32(__i, __i);
 }
 
+/// \brief Initializes all values in a 64-bit vector of [4 x i16] with the
+///    specified 16-bit value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __w
+///    A 16-bit value used to initialize the elements of the
+///    destination integer vector.
+/// \returns An initialized 64-bit vector of [4 x i16] with all elements 
+///    containing the value provided in the operand.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_set1_pi16(short __w)
 {
     return _mm_set_pi16(__w, __w, __w, __w);
 }
 
+/// \brief Initializes all values in a 64-bit vector of [8 x i8] with the
+///    specified 8-bit value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __b
+///    An 8-bit value used to initialize the elements of the
+///    destination integer vector.
+/// \returns An initialized 64-bit vector of [8 x i8] with all elements 
+///    containing the value provided in the operand.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_set1_pi8(char __b)
 {
     return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
 }
 
+/// \brief Initializes all 32-bit integer values in a 64-bit vector of [2 x i32]
+///    in reverse order, using the specified 32-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __i0
+///    A 32-bit integer value used to initialize bits [31:0] of the
+///    destination vector.
+/// \param __i1
+///    A 32-bit integer value used to initialize bits [63:32] of
+///    the destination vector.
+/// \returns An initialized 64-bit vector of [2 x i32] containing the values
+///    provided in the operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_setr_pi32(int __i0, int __i1)
 {
     return _mm_set_pi32(__i1, __i0);
 }
 
+/// \brief Initializes all 16-bit integer values in a 64-bit vector of [4 x i16]
+///    in reverse order, using the specified 16-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __w0
+///    A 16-bit integer value used to initialize bits [15:0] of the
+///    destination vector.
+/// \param __w1
+///    A 16-bit integer value used to initialize bits [31:16] of
+///    the destination vector.
+/// \param __w2
+///    A 16-bit integer value used to initialize bits [47:32] of
+///    the destination vector.
+/// \param __w3
+///    A 16-bit integer value used to initialize bits [63:48] of
+///    the destination vector.
+/// \returns An initialized 64-bit vector of [4 x i16] containing the values
+///    provided in the operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
 {
     return _mm_set_pi16(__w3, __w2, __w1, __w0);
 }
 
+/// \brief Initializes all 8-bit integer values in a 64-bit vector of [8 x i8] 
+///    in reverse order, using the specified 8-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __b0
+///    Initializes bits [7:0] of the destination vector.
+/// \param __b1
+///    Initializes bits [15:8] of the destination vector.
+/// \param __b2
+///    Initializes bits [23:16] of the destination vector.
+/// \param __b3
+///    Initializes bits [31:24] of the destination vector.
+/// \param __b4
+///    Initializes bits [39:32] of the destination vector.
+/// \param __b5
+///    Initializes bits [47:40] of the destination vector.
+/// \param __b6
+///    Initializes bits [55:48] of the destination vector.
+/// \param __b7
+///    Initializes bits [63:56] of the destination vector.
+/// \returns An initialized 64-bit vector of [8 x i8] containing the values
+///    provided in the operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
              char __b6, char __b7)
Index: lib/Headers/pmmintrin.h
===================================================================
--- lib/Headers/pmmintrin.h
+++ lib/Headers/pmmintrin.h
@@ -29,62 +29,233 @@
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse3")))
 
+/// \brief Moves integer values from an unaligned memory location to elements 
+///    in a 128-bit integer vector. The instruction may read 16 bytes to
+///    retrieve either or both of the first and second parts of the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VLDDQU instruction.
+///
+/// \param __p
+///    A pointer to a 128-bit integer vector containing integer
+///    values.
+/// \returns A 128-bit integer vector containing the moved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_lddqu_si128(__m128i const *__p)
 {
   return (__m128i)__builtin_ia32_lddqu((char const *)__p);
 }
 
+/// \brief Subtracts the even-indexed values and adds the odd-indexed values of
+///    2 packed 128-bit vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VADDSUBPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the left source
+///    operand.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing the right source
+///    operand.
+/// \returns A 128-bit vector of [4 x float] containing the alternating sums 
+///    and differences between both operands.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_addsub_ps(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_addsubps(__a, __b);
 }
 
+/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
+///    128-bit vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VHADDPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the source
+///    operands. The horizontal sums of the values are stored in
+///    the lower bits of the destination.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the source
+///    operands. The horizontal sums of the values are stored in
+///    the upper bits of the destination.
+/// \returns A 128-bit vector of [4 x float] containing the horizontal sums of 
+///    both operands.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_hadd_ps(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_haddps(__a, __b);
 }
 
+/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+///    packed 128-bit vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VHSUBPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the source
+///    operands. The horizontal differences between the values are
+///    stored in the lower bits of the destination.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the source
+///    operands. The horizontal differences between the values are
+///    stored in the upper bits of the destination.
+/// \returns A 128-bit vector of [4 x float] containing the horizontal 
+///    differences of both operands.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_hsub_ps(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_hsubps(__a, __b);
 }
 
+/// \brief Moves and duplicates high-order (odd-indexed) values from a 128-bit
+///    vector of [4 x float] to float values stored in a packed 128-bit
+///    vector of [4 x float].
+///    Bits [127:96] of the source are written to bits [127:96] and [95:64]
+///    of the destination.
+///    Bits [63:32] of the source are written to bits [63:32] and [31:0] of
+///    the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVHDUP instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the moved and 
+///    duplicated values.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_movehdup_ps(__m128 __a)
 {
   return __builtin_shufflevector(__a, __a, 1, 1, 3, 3);
 }
 
+/// \brief Moves and duplicates low-order (even-indexed) values from a 128-bit
+///    vector of [4 x float] to float values stored in a packed 128-bit
+///    vector of [4 x float].
+///    Bits [95:64] of the source are written to bits [127:96] and [95:64] of
+///    the destination.
+///    Bits [31:0] of the source are written to bits [63:32] and [31:0] of
+///    the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVSLDUP instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the moved and 
+///    duplicated values.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_moveldup_ps(__m128 __a)
 {
   return __builtin_shufflevector(__a, __a, 0, 0, 2, 2);
 }
 
+/// \brief Subtracts the even-indexed values and adds the odd-indexed values of
+///    2 packed 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VADDSUBPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the left source
+///    operand.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing the right source
+///    operand.
+/// \returns A 128-bit vector of [2 x double] containing the alternating sums 
+///    and differences between both operands.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_addsub_pd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_addsubpd(__a, __b);
 }
 
+/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
+///    128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VHADDPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the
+///    source operands. The horizontal sums of the values are
+///    stored in the lower bits of the destination.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the
+///    source operands. The horizontal sums of the values are
+///    stored in the upper bits of the destination.
+/// \returns A 128-bit vector of [2 x double] containing the horizontal sums of
+///    both operands.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_hadd_pd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_haddpd(__a, __b);
 }
 
+/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+///    packed 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VHSUBPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing one of the
+///    source operands. The horizontal differences between the
+///    values are stored in the lower bits of the destination.
+/// \param __b
+///    A 128-bit vector of [2 x double] containing one of the
+///    source operands. The horizontal differences between the
+///    values are stored in the upper bits of the destination.
+/// \returns A 128-bit vector of [2 x double] containing the horizontal 
+///    differences of both operands.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_hsub_pd(__m128d __a, __m128d __b)
 {
   return __builtin_ia32_hsubpd(__a, __b);
 }
 
+/// \brief Moves and duplicates one double-precision value to double-precision
+///    values stored in a packed 128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128d _mm_loaddup_pd(double const * dp);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VMOVDDUP instruction.
+///
+/// \param dp
+///    A pointer to a double-precision value to be moved and
+///    duplicated.
+/// \returns A 128-bit vector of [2 x double] containing the moved and 
+///    duplicated values.
 #define        _mm_loaddup_pd(dp)        _mm_load1_pd(dp)
 
+/// \brief Moves and duplicates the double-precision value in the lower bits of 
+///    a 128-bit vector of [2 x double] to double-precision values stored in a
+///    packed 128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVDDUP instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double]. Bits [63:0] are written to
+///    bits [127:64] and [63:0] of the destination.
+/// \returns A 128-bit vector of [2 x double] containing the moved and 
+///    duplicated values.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_movedup_pd(__m128d __a)
 {
@@ -99,12 +270,45 @@
 #define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
 #define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
 
+/// \brief Establishes a linear address memory range to be monitored and puts 
+///    the processor in the monitor event pending state. Data stored in the
+///    monitored address range causes the processor to exit the pending
+///    state.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c MONITOR instruction.
+///
+/// \param __p
+///    The memory range to be monitored. The size of the range is
+///    determined by CPUID function 0000_0005h.
+/// \param __extensions
+///    Optional extensions for the monitoring state.
+/// \param __hints
+///    Optional hints for the monitoring state.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
 {
   __builtin_ia32_monitor((void *)__p, __extensions, __hints);
 }
 
+/// \brief Used with the MONITOR instruction to wait while the processor is in
+///    the monitor event pending state. Data stored in the monitored address
+///    range causes the processor to exit the pending state.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c MWAIT instruction.
+///
+/// \param __extensions
+///    Optional extensions for the monitoring state. Only setting
+///    bit 0, which allows interrupts to wake MWAIT, is supported.
+///    Setting any other bits results in a General Protection
+///    fault.
+/// \param __hints
+///    Optional hints for the monitoring state. No hints are
+///    actually defined: any bits set in this value are ignored by
+///    the processor.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_mwait(unsigned __extensions, unsigned __hints)
 {
Index: lib/Headers/popcntintrin.h
===================================================================
--- lib/Headers/popcntintrin.h
+++ lib/Headers/popcntintrin.h
@@ -27,6 +27,15 @@
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("popcnt")))
 
+/// \brief Counts the number of bits in the source operand having a value of 1.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c POPCNT instruction.
+///
+/// \param __A
+///    An unsigned 32-bit integer operand.
+/// \returns A 32-bit integer containing the number of source bits set to 1.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_popcnt_u32(unsigned int __A)
 {
@@ -40,6 +49,15 @@
 }
 
 #ifdef __x86_64__
+/// \brief Counts the number of bits in the source operand having a value of 1.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c POPCNT instruction.
+///
+/// \param __A
+///    An unsigned 64-bit integer operand.
+/// \returns A 64-bit integer containing the number of source bits set to 1.
 static __inline__ long long __DEFAULT_FN_ATTRS
 _mm_popcnt_u64(unsigned long long __A)
 {
Index: lib/Headers/prfchwintrin.h
===================================================================
--- lib/Headers/prfchwintrin.h
+++ lib/Headers/prfchwintrin.h
@@ -35,6 +35,28 @@
   __builtin_prefetch (__P, 0, 3 /* _MM_HINT_T0 */);
 }
 
+/// \brief Loads a memory sequence containing the specified memory address into
+///    the L1 data cache. Data can be
+///    written to the cache line without additional delay, because the data
+///    is already prefetched in the modified cache-coherency state. Data can
+///    also be read from the cache line without additional delay. However,
+///    prefetching write data takes longer than prefetching read data if the
+///    processor must wait for another caching master to first write back its
+///    modified copy of the requested data to memory before the prefetch
+///    request is satisfied.
+///    The PREFETCHW instruction
+///    provides a hint to the processor that the cache line is to be
+///    modified, and is intended for use when the cache line will be written
+///    to shortly after the prefetch is performed. The processor can place
+///    the cache line in the modified state when it is prefetched, but before
+///    it is actually written.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PREFETCHW instruction.
+///
+/// \param __P
+///    A pointer specifying the memory address to be prefetched.
 static __inline__ void __attribute__((__always_inline__, __nodebug__))
 _m_prefetchw(void *__P)
 {
Index: lib/Headers/smmintrin.h
===================================================================
--- lib/Headers/smmintrin.h
+++ lib/Headers/smmintrin.h
@@ -46,37 +46,352 @@
 #define _MM_FROUND_RINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
 #define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)
 
+/// \brief Rounds up the values stored in a packed 128-bit vector of [4 x float].
+///    The source values are rounded to integer values and returned as
+///    floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128 _mm_ceil_ps(__m128 X);
+/// \endcode 
+///
+/// \param X
+///    A 128-bit vector of [4 x float] values.
+/// \returns A 128-bit vector of [4 x float] containing the rounded values.
 #define _mm_ceil_ps(X)       _mm_round_ps((X), _MM_FROUND_CEIL)
+/// \brief Rounds up the values stored in a packed 128-bit vector of [2 x
+///    double]. The source values are rounded to integer values and returned
+///    as double-precision values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128d _mm_ceil_pd(__m128d X);
+/// \endcode 
+///
+/// \param X
+///    A 128-bit vector of [2 x double] values.
+/// \returns A 128-bit vector of [2 x double] containing the rounded values.
 #define _mm_ceil_pd(X)       _mm_round_pd((X), _MM_FROUND_CEIL)
+/// \brief Copies the values stored in bits [127:32] from the first operand to
+///    the destination. Rounds up the low-order value stored in bits [31:0]
+///    of the second operand to an integer value and stores the result in
+///    bits [31:0] of the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128 _mm_ceil_ss(__m128 X, __m128 Y);
+/// \endcode 
+///
+/// \param X
+///    A 128-bit vector of [4 x float] values. The values stored in
+///    bits [127:32] are copied to the corresponding bits in the
+///    destination.
+/// \param Y
+///    A 128-bit vector of [4 x float] values. The value stored in
+///    bits [31:0] is rounded up to the nearest integer and is
+///    stored in bits [31:0] of the destination.
+/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
+///    values.
 #define _mm_ceil_ss(X, Y)    _mm_round_ss((X), (Y), _MM_FROUND_CEIL)
+/// \brief Copies the value stored in bits [127:64] from the first operand to 
+///    the destination. Rounds up the low-order value stored in bits [63:0] of
+///    the second operand to an integer value and stores the result in bits
+///    [63:0] of the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128d _mm_ceil_sd(__m128d X, __m128d Y);
+/// \endcode 
+///
+/// \param X
+///    A 128-bit vector of [2 x double] values. The value stored
+///    in bits [127:64] is copied to the corresponding bits in the
+///    destination.
+/// \param Y
+///    A 128-bit vector of [2 x double] values. The value stored in
+///    bits [63:0] is rounded up to the nearest integer and is
+///    stored in bits [63:0] of the destination.
+/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
+///    values.
 #define _mm_ceil_sd(X, Y)    _mm_round_sd((X), (Y), _MM_FROUND_CEIL)
 
+/// \brief Rounds down the values stored in a packed 128-bit vector of [4 x
+///    float]. The source values are rounded to integer values and returned
+///    as floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128 _mm_floor_ps(__m128 X);
+/// \endcode 
+///
+/// \param X
+///    A 128-bit vector of [4 x float] values.
+/// \returns A 128-bit vector of [4 x float] containing the rounded values.
 #define _mm_floor_ps(X)      _mm_round_ps((X), _MM_FROUND_FLOOR)
+/// \brief Rounds down the values stored in a packed 128-bit vector of [2 x
+///    double]. The source values are rounded to integer values and returned
+///    as double-precision values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128d _mm_floor_pd(__m128d X);
+/// \endcode 
+///
+/// \param X
+///    A 128-bit vector of [2 x double] values.
+/// \returns A 128-bit vector of [2 x double] containing the rounded values.
 #define _mm_floor_pd(X)      _mm_round_pd((X), _MM_FROUND_FLOOR)
+/// \brief Copies the values stored in bits [127:32] from the first operand to
+///    the destination. Rounds down the low-order value stored in bits [31:0]
+///    of the second operand to an integer value and stores the result in
+///    bits [31:0] of the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128 _mm_floor_ss(__m128 X, __m128 Y);
+/// \endcode 
+///
+/// \param X
+///    A 128-bit vector of [4 x float] values. The values stored in
+///    bits [127:32] are copied to the corresponding bits in the
+///    destination.
+/// \param Y
+///    A 128-bit vector of [4 x float] values. The value stored in
+///    bits [31:0] is rounded down to the nearest integer and is
+///    stored in bits [31:0] of the destination.
+/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
+///    values.
 #define _mm_floor_ss(X, Y)   _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)
+/// \brief Copies the value stored in bits [127:64] from the first
+///    operand to the destination. Rounds down the low-order value
+///    stored in bits [63:0] of the second operand to an integer
+///    value and stores the result in bits [63:0] of the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128d _mm_floor_sd(__m128d X, __m128d Y);
+/// \endcode 
+///
+/// \param X
+///    A 128-bit vector of [2 x double] values. The value stored
+///    in bits [127:64] is copied to the corresponding bits in the
+///    destination.
+/// \param Y
+///    A 128-bit vector of [2 x double] values. The value stored in
+///    bits [63:0] is rounded down to the nearest integer and is
+///    stored in bits [63:0] of the destination.
+/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
+///    values.
 #define _mm_floor_sd(X, Y)   _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)
 
+/// \brief Rounds the values stored in a packed 128-bit vector of [4 x float]
+///    using the specified rounding control. The source values are rounded to
+///    integer values and returned as floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128 _mm_round_ps(__m128 X, const int M);
+/// \endcode 
+///
+/// \param X
+///    A 128-bit vector of [4 x float] values.
+/// \param M
+///    An integer value that specifies the rounding operation.
+///    Bits [7:4] are reserved.
+///    Bit [3] is a precision exception value:
+///    0: A normal PE exception is used
+///    1: The PE field is not updated
+///    Bit [2] is the rounding control source:
+///    0: Use bits [1:0] of M
+///    1: Use the current MXCSR setting
+///    Bits [1:0] contain the rounding control definition:
+///    00: Nearest
+///    01: Downward (toward negative infinity)
+///    10: Upward (toward positive infinity)
+///    11: Truncated
+/// \returns A 128-bit vector of [4 x float] containing the rounded values.
 #define _mm_round_ps(X, M) __extension__ ({ \
   (__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)); })
 
+/// \brief Copies the values stored in bits [127:32] from the first operand to
+///    the destination. Rounds the low-order value stored in bits [31:0] of
+///    the second operand to an integer value using the specified rounding
+///    control, and stores the result in bits [31:0] of the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M);
+/// \endcode 
+///
+/// \param X
+///    A 128-bit vector of [4 x float] values. The values stored in
+///    bits [127:32] are copied to the corresponding bits in the
+///    destination.
+/// \param Y
+///    A 128-bit vector of [4 x float] values. The value stored in
+///    bits [31:0] is rounded to the nearest integer using the
+///    specified rounding control, and is stored in bits [31:0] of
+///    the destination.
+/// \param M
+///    An integer value that specifies the rounding operation.
+///    Bits [7:4] are reserved.
+///    Bit [3] is a precision exception value:
+///    0: A normal PE exception is used
+///    1: The PE field is not updated
+///    Bit [2] is the rounding control source:
+///    0: Use bits [1:0] of M
+///    1: Use the current MXCSR setting
+///    Bits [1:0] contain the rounding control definition:
+///    00: Nearest
+///    01: Downward (toward negative infinity)
+///    10: Upward (toward positive infinity)
+///    11: Truncated
+/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
+///    values.
 #define _mm_round_ss(X, Y, M) __extension__ ({ \
   (__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
                                  (__v4sf)(__m128)(Y), (M)); })
 
+/// \brief Rounds the values stored in a packed 128-bit vector of [2 x double]
+///    using the specified rounding control. The source values are rounded to
+///    integer values and returned as double-precision values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128d _mm_round_pd(__m128d X, const int M);
+/// \endcode 
+///
+/// \param X
+///    A 128-bit vector of [2 x double] values.
+/// \param M
+///    An integer value that specifies the rounding operation.
+///    Bits [7:4] are reserved.
+///    Bit [3] is a precision exception value:
+///    0: A normal PE exception is used
+///    1: The PE field is not updated
+///    Bit [2] is the rounding control source:
+///    0: Use bits [1:0] of M
+///    1: Use the current MXCSR setting
+///    Bits [1:0] contain the rounding control definition:
+///    00: Nearest
+///    01: Downward (toward negative infinity)
+///    10: Upward (toward positive infinity)
+///    11: Truncated
+/// \returns A 128-bit vector of [2 x double] containing the rounded values.
 #define _mm_round_pd(X, M) __extension__ ({ \
   (__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)); })
 
+/// \brief Copies the value stored in bits [127:64] from the first operand to 
+///    the destination. Rounds the low-order value stored in bits [63:0] of the
+///    second operand to an integer value using the specified rounding
+///    control, and stores the result in bits [63:0] of the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M);
+/// \endcode 
+///
+/// \param X
+///    A 128-bit vector of [2 x double] values. The value stored
+///    in bits [127:64] is copied to the corresponding bits in the
+///    destination.
+/// \param Y
+///    A 128-bit vector of [2 x double] values. The value stored in
+///    bits [63:0] is rounded to the nearest integer using the
+///    specified rounding control, and is stored in bits [63:0] of
+///    the destination.
+/// \param M
+///    An integer value that specifies the rounding operation.
+///    Bits [7:4] are reserved.
+///    Bit [3] is a precision exception value:
+///    0: A normal PE exception is used
+///    1: The PE field is not updated
+///    Bit [2] is the rounding control source:
+///    0: Use bits [1:0] of M
+///    1: Use the current MXCSR setting
+///    Bits [1:0] contain the rounding control definition:
+///    00: Nearest
+///    01: Downward (toward negative infinity)
+///    10: Upward (toward positive infinity)
+///    11: Truncated
+/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
+///    values.
 #define _mm_round_sd(X, Y, M) __extension__ ({ \
   (__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
                                   (__v2df)(__m128d)(Y), (M)); })
 
 /* SSE4 Packed Blending Intrinsics.  */
+/// \brief Copies 64-bit double-precision data values stored in either of the 
+///    two packed 128-bit vectors of [2 x double], as specified by the integer
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VBLENDPD instruction.
+///
+/// \param V1
+///    A 128-bit vector of [2 x double] values.
+/// \param V2
+///    A 128-bit vector of [2 x double] values.
+/// \param M
+///    An immediate integer operand, with mask bits [1:0]
+///    specifying how the values are to be copied. The position of
+///    the mask bit corresponds to the index of a copied value.
+///    When a mask bit is 0, the corresponding 64-bit element in
+///    operand V1 is copied to the same
+///    position in the destination. When a mask bit is 1, the
+///    corresponding 64-bit element in operand V2
+///    is copied to the same position in the destination.
+/// \returns A 128-bit vector of [2 x double] containing the copied values.
 #define _mm_blend_pd(V1, V2, M) __extension__ ({ \
   (__m128d)__builtin_shufflevector((__v2df)(__m128d)(V1), \
                                    (__v2df)(__m128d)(V2), \
                                    (((M) & 0x01) ? 2 : 0), \
                                    (((M) & 0x02) ? 3 : 1)); })
 
+/// \brief Copies 32-bit single-precision data values stored in either of the 
+///    two packed 128-bit vectors of [4 x float], as specified by the integer
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VBLENDPS instruction.
+///
+/// \param V1
+///    A 128-bit vector of [4 x float] values.
+/// \param V2
+///    A 128-bit vector of [4 x float] values.
+/// \param M
+///    An immediate integer operand, with mask bits [3:0]
+///    specifying how the values are to be copied. The position of
+///    the mask bit corresponds to the index of a copied value.
+///    When a mask bit is 0, the corresponding 32-bit element in
+///    operand V1 is copied to the same
+///    position in the destination. When a mask bit is 1, the
+///    corresponding 32-bit element in operand V2
+///    is copied to the same position in the destination.
+/// \returns A 128-bit vector of [4 x float] containing the copied values.
 #define _mm_blend_ps(V1, V2, M) __extension__ ({ \
   (__m128)__builtin_shufflevector((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2), \
                                   (((M) & 0x01) ? 4 : 0), \
@@ -84,6 +399,28 @@
                                   (((M) & 0x04) ? 6 : 2), \
                                   (((M) & 0x08) ? 7 : 3)); })
 
+/// \brief Copies 64-bit double-precision data values stored in either of the 
+///    two packed 128-bit vectors of [2 x double], as specified by the 128-bit
+///    vector operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VBLENDVPD instruction.
+///
+/// \param __V1
+///    A 128-bit vector of [2 x double] values.
+/// \param __V2
+///    A 128-bit vector of [2 x double] values.
+/// \param __M
+///    A 128-bit vector operand, with mask bits 127 and 63
+///    specifying how the values are to be copied. The position of
+///    the mask bit corresponds to the most significant bit of a
+///    copied value. When a mask bit is 0, the corresponding 64-bit
+///    element in operand __V1 is copied to the
+///    same position in the destination. When a mask bit is 1, the
+///    corresponding 64-bit element in operand __V2
+///    is copied to the same position in the destination.
+/// \returns A 128-bit vector of [2 x double] containing the copied values.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
 {
@@ -91,6 +428,28 @@
                                             (__v2df)__M);
 }
 
+/// \brief Copies 32-bit single-precision data values stored in either of the 
+///    two packed 128-bit vectors of [4 x float], as specified by the 128-bit
+///    vector operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VBLENDVPS instruction.
+///
+/// \param __V1
+///    A 128-bit vector of [4 x float] values.
+/// \param __V2
+///    A 128-bit vector of [4 x float] values.
+/// \param __M
+///    A 128-bit vector operand, with mask bits 127, 95, 63, and 31
+///    specifying how the values are to be copied. The position of
+///    the mask bit corresponds to the most significant bit of a
+///    copied value. When a mask bit is 0, the corresponding 32-bit
+///    element in operand __V1 is copied to the
+///    same position in the destination. When a mask bit is 1, the
+///    corresponding 32-bit element in operand __V2
+///    is copied to the same position in the destination.
+/// \returns A 128-bit vector of [4 x float] containing the copied values.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
 {
@@ -98,6 +457,28 @@
                                            (__v4sf)__M);
 }
 
+/// \brief Copies packed 8-bit integer data values stored in either of the two
+///    packed 128-bit vectors of [16 x i8], as specified by the 128-bit
+///    vector operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPBLENDVB instruction.
+///
+/// \param __V1
+///    A 128-bit vector of [16 x i8] values.
+/// \param __V2
+///    A 128-bit vector of [16 x i8] values.
+/// \param __M
+///    A 128-bit vector operand, with mask bits 127, 119, 111 ... 7
+///    specifying how the values are to be copied. The position of
+///    the mask bit corresponds to the most significant bit of a
+///    copied value. When a mask bit is 0, the corresponding 8-bit
+///    element in operand __V1 is copied to the
+///    same position in the destination. When a mask bit is 1, the
+///    corresponding 8-bit element in operand __V2
+///    is copied to the same position in the destination.
+/// \returns A 128-bit vector of [16 x i8] containing the copied values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
 {
@@ -105,6 +486,32 @@
                                                (__v16qi)__M);
 }
 
+/// \brief Copies packed 16-bit integer data values stored in either of the two
+///    packed 128-bit vectors of [8 x i16], as specified by the immediate
+///    integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// _mm_blend_epi16( V1,  V2,  M);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VPBLENDW instruction.
+///
+/// \param V1
+///    A 128-bit vector of [8 x i16] values.
+/// \param V2
+///    A 128-bit vector of [8 x i16] values.
+/// \param M
+///    An immediate integer operand, with mask bits [7:0]
+///    specifying how the values are to be copied. The position of
+///    the mask bit corresponds to the index of a copied value.
+///    When a mask bit is 0, the corresponding 16-bit element in
+///    operand V1 is copied to the same
+///    position in the destination. When a mask bit is 1, the
+///    corresponding 16-bit element in operand V2
+///    is copied to the same position in the destination.
+/// \returns A 128-bit vector of [8 x i16] containing the copied values.
 #define _mm_blend_epi16(V1, V2, M) __extension__ ({ \
   (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(V1), \
                                    (__v8hi)(__m128i)(V2), \
@@ -118,12 +525,44 @@
                                    (((M) & 0x80) ? 15 : 7)); })
 
 /* SSE4 Dword Multiply Instructions.  */
+/// \brief Multiplies packed 32-bit integer values and writes the low-order 32
+///    bits of each 64-bit product to the corresponding bits in the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPMULLD instruction.
+///
+/// \param __V1
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \param __V2
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \returns A 128-bit integer vector containing the products of both operands.
 static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_mullo_epi32 (__m128i __V1, __m128i __V2)
 {
   return (__m128i) ((__v4si)__V1 * (__v4si)__V2);
 }
 
+/// \brief Multiplies the even-indexed packed 32-bit signed integer values
+///    contained in the two 128-bit integer vectors and writes the 64-bit
+///    signed products to the destination. The even-indexed values are
+///    those stored in bits [31:0] and [95:64] of each source operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPMULDQ instruction.
+///
+/// \param __V1
+///    A 128-bit vector of [4 x i32] containing one of the source
+///    operands.
+/// \param __V2
+///    A 128-bit vector of [4 x i32] containing one of the source
+///    operands.
+/// \returns A 128-bit vector of [2 x i64] containing the products of both
+///    operands.
 static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_mul_epi32 (__m128i __V1, __m128i __V2)
 {
@@ -131,15 +570,81 @@
 }
 
 /* SSE4 Floating Point Dot Product Instructions.  */
+/// \brief Computes the dot product of the two packed 128-bit vectors of [4 x
+///    float], as specified by the integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VDPPS instruction.
+///
+/// \param X
+///    A 128-bit vector of [4 x float] values.
+/// \param Y
+///    A 128-bit vector of [4 x float] values.
+/// \param M
+///    An immediate integer operand. Mask bits [7:4] are used to
+///    select 32-bit segments of the source operands. If a mask bit
+///    is 1, the corresponding bits are used in the dot product
+///    calculation:
+///    Bit [7]: selects bits [127:96]
+///    Bit [6]: selects bits [95:64]
+///    Bit [5]: selects bits [63:32]
+///    Bit [4]: selects bits [31:0]
+///    Bits [3:0] select which bits within the destination will be
+///    used to store the 32-bit sum.
+/// \returns A 128-bit vector of [4 x float] containing the dot product.
 #define _mm_dp_ps(X, Y, M) __extension__ ({ \
   (__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
                                (__v4sf)(__m128)(Y), (M)); })
 
+/// \brief Computes the dot product of the two packed 128-bit vectors of [2 x
+///    double], as specified by the integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c DPPD instruction.
+///
+/// \param X
+///    A 128-bit vector of [2 x double] values.
+/// \param Y
+///    A 128-bit vector of [2 x double] values.
+/// \param M
+///    An immediate integer operand. Mask bits [5:4] are used to
+///    select 64-bit segments of the source operands. If a mask bit
+///    is 1, the corresponding bits are used in the dot product
+///    calculation:
+///    Bit [5]: selects bits [127:64]
+///    Bit [4]: selects bits [63:0]
+///    Bits [1:0] select which bits within the destination will be
+///    used to store the 64-bit sum.
+/// \returns A 128-bit vector of [2 x double] containing the dot product.
 #define _mm_dp_pd(X, Y, M) __extension__ ({\
   (__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
                                 (__v2df)(__m128d)(Y), (M)); })
 
 /* SSE4 Streaming Load Hint Instruction.  */
+/// \brief Loads integer values from a 128-bit aligned memory location to a
+///    128-bit integer vector.
+///    The PlayStation4 CPU does not
+///    support non-temporal hints for load operations.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVNTDQA instruction.
+///
+/// \param __V
+///    A 128-bit aligned pointer to a memory location that contains
+///    the integer values.
+/// \returns A 128-bit integer vector containing the data stored at the 
+///    specified memory location.
 static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_stream_load_si128 (__m128i const *__V)
 {
@@ -147,48 +652,168 @@
 }
 
 /* SSE4 Packed Integer Min/Max Instructions.  */
+/// \brief Compares each of the corresponding packed 8-bit signed integer
+///    values of the 128-bit integer vectors, and writes the lesser value to
+///    the corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPMINSB instruction.
+///
+/// \param __V1
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \param __V2
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \returns A 128-bit integer vector containing the lesser values.
 static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_min_epi8 (__m128i __V1, __m128i __V2)
 {
   return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2);
 }
 
+/// \brief Compares each of the corresponding packed 8-bit signed integer
+///    values of the 128-bit integer vectors, and writes the greater value to
+///    the corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPMAXSB instruction.
+///
+/// \param __V1
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \param __V2
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \returns A 128-bit integer vector containing the greater values.
 static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_max_epi8 (__m128i __V1, __m128i __V2)
 {
   return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2);
 }
 
+/// \brief Compares each of the corresponding packed 16-bit unsigned integer
+///    values of the 128-bit integer vectors, and writes the lesser value to
+///    the corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPMINUW instruction.
+///
+/// \param __V1
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \param __V2
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \returns A 128-bit integer vector containing the lesser values.
 static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_min_epu16 (__m128i __V1, __m128i __V2)
 {
   return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2);
 }
 
+/// \brief Compares each of the corresponding packed 16-bit unsigned integer
+///    values of the 128-bit integer vectors, and writes the greater value to
+///    the corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPMAXUW instruction.
+///
+/// \param __V1
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \param __V2
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \returns A 128-bit integer vector containing the greater values.
 static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_max_epu16 (__m128i __V1, __m128i __V2)
 {
   return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2);
 }
 
+/// \brief Compares each of the corresponding packed 32-bit signed integer
+///    values of the 128-bit integer vectors, and writes the lesser value to
+///    the corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPMINSD instruction.
+///
+/// \param __V1
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \param __V2
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \returns A 128-bit integer vector containing the lesser values.
 static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_min_epi32 (__m128i __V1, __m128i __V2)
 {
   return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2);
 }
 
+/// \brief Compares each of the corresponding packed 32-bit signed integer
+///    values of the 128-bit integer vectors, and writes the greater value to
+///    the corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPMAXSD instruction.
+///
+/// \param __V1
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \param __V2
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \returns A 128-bit integer vector containing the greater values.
 static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_max_epi32 (__m128i __V1, __m128i __V2)
 {
   return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2);
 }
 
+/// \brief Compares each of the corresponding packed 32-bit unsigned integer
+///    values of the 128-bit integer vectors, and writes the lesser value to
+///    the corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPMINUD instruction.
+///
+/// \param __V1
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \param __V2
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \returns A 128-bit integer vector containing the lesser values.
 static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_min_epu32 (__m128i __V1, __m128i __V2)
 {
   return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2);
 }
 
+/// \brief Compares each of the corresponding packed 32-bit unsigned integer
+///    values of the 128-bit integer vectors, and writes the greater value to
+///    the corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPMAXUD instruction.
+///
+/// \param __V1
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \param __V2
+///    A 128-bit integer vector containing one of the source
+///    operands.
+/// \returns A 128-bit integer vector containing the greater values.
 static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_max_epu32 (__m128i __V1, __m128i __V2)
 {
@@ -196,7 +821,78 @@
 }
 
 /* SSE4 Insertion and Extraction from XMM Register Instructions.  */
+/// \brief Copies 32-bit single-precision floating-point data from the 128-bit
+///    vector operands to the destination, using the bit indexes specified by
+///    the immediate operand. The immediate operand may indicate that
+///    specific index values should be set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VINSERTPS instruction.
+///
+/// \param X
+///    A 128-bit vector source operand of [4 x float]. With the
+///    exception of those bits in the destination copied from
+///    parameter Y and zeroed by bits [3:0]
+///    of N, all bits from this parameter
+///    are copied to the destination.
+/// \param Y
+///    A 128-bit vector source operand of [4 x float]. One
+///    single-precision floating-point element from this source, as
+///    determined by the immediate parameter, is copied to the
+///    destination.
+/// \param N
+///    Specifies the bits to be copied from operand Y,
+///    the bits in the destination to which the selected bits from
+///    operand Y are copied, and any bits in
+///    the destination to be masked. The following assignments are
+///    made:
+///    Bits [7:6] specify the bits to copy from operand Y:
+///    00: Selects bits [31:0] from operand Y.
+///    01: Selects bits [63:32] from operand Y.
+///    10: Selects bits [95:64] from operand Y.
+///    11: Selects bits [127:96] from operand Y.
+///    Bits [5:4] specify the bits in the destination to which the
+///    selected bits from
+///    operand Y are copied:
+///    00: Copies the selected bits from operand Y
+///    to bits [31:0] of the destination.
+///    01: Copies the selected bits from operand Y
+///    to bits [63:32] of the destination.
+///    10: Copies the selected bits from operand Y
+///    to bits [95:64] of the destination.
+///    11: Copies the selected bits from operand Y
+///    to bits [127:96] of the destination.
+///    Bits[3:0]: If any of these bits are set, the corresponding
+///    destination element is
+///    cleared.
+/// \returns A 128-bit vector of [4 x float] containing the copied float data 
+///    from the operands.
 #define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
+/// \brief Extracts a 32-bit integer from a 128-bit vector of [4 x float] and
+///    copies it to the destination, as specified by the integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// int _mm_extract_ps(__m128 X, const int N);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c EXTRACTPS instruction.
+///
+/// \param X
+///    A 128-bit vector of [4 x float].
+/// \param N
+///    Determines which bits are extracted using bits [1:0]:
+///    00: Bits [31:0] are copied to the destination.
+///    01: Bits [63:32] are copied to the destination.
+///    10: Bits [95:64] are copied to the destination.
+///    11: Bits [127:96] are copied to the destination.
+/// \returns A 32-bit integer containing the extracted 32 bits of float data.
 #define _mm_extract_ps(X, N) (__extension__                      \
                               ({ union { int __i; float __f; } __t;  \
                                  __v4sf __a = (__v4sf)(__m128)(X);       \
@@ -217,15 +913,91 @@
                                              _MM_MK_INSERTPS_NDX((N), 0, 0x0e))
 
 /* Insert int into packed integer array at index.  */
+/// \brief Copies a 128-bit integer vector to the destination, replacing one
+///    8-bit element, selected by the immediate operand, with the low-order 8
+///    bits of the integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128i _mm_insert_epi8(__m128i X, int I, const int N);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c PINSRB instruction.
+///
+/// \param X
+///    A 128-bit integer vector. The remaining bits in the
+///    destination are copied from the corresponding bits in this
+///    operand.
+/// \param I
+///    An integer. The low-order 8 bits of this operand are written to
+///    the destination element selected by operand N.
+/// \param N
+///    An immediate operand. Bits [3:0] specify the index of the 8-bit
+///    element in the destination that is replaced. The remaining
+///    elements are copied from the corresponding elements in operand X.
+/// \returns A 128-bit integer vector containing the copied extended packed 
+///    data from the operands.
 #define _mm_insert_epi8(X, I, N) (__extension__                           \
                                   ({ __v16qi __a = (__v16qi)(__m128i)(X); \
                                      __a[(N) & 15] = (I);                 \
                                      __a;}))
+/// \brief Copies a 128-bit integer vector to the destination, replacing one
+///    32-bit element, selected by the immediate operand, with the integer
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128i _mm_insert_epi32(__m128i X, int I, const int N);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c PINSRD instruction.
+///
+/// \param X
+///    A 128-bit integer vector. The remaining bits in the
+///    destination are copied from the corresponding bits in this
+///    operand.
+/// \param I
+///    A 32-bit integer. This value is written to the destination
+///    element selected by operand N.
+/// \param N
+///    An immediate operand. Bits [1:0] specify the index of the 32-bit
+///    element in the destination that is replaced. The remaining
+///    elements are copied from the corresponding elements in operand X.
+/// \returns A 128-bit integer vector containing the copied extended packed 
+///    data from the operands.
 #define _mm_insert_epi32(X, I, N) (__extension__                         \
                                    ({ __v4si __a = (__v4si)(__m128i)(X); \
                                       __a[(N) & 3] = (I);                \
                                       __a;}))
 #ifdef __x86_64__
+/// \brief Copies a 128-bit integer vector to the destination, replacing one
+///    64-bit element, selected by the immediate operand, with the 64-bit
+///    integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128i _mm_insert_epi64(__m128i X, long long I, const int N);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c PINSRQ instruction.
+///
+/// \param X
+///    A 128-bit integer vector. The remaining bits in the
+///    destination are copied from the corresponding bits in this
+///    operand.
+/// \param I
+///    A 64-bit integer. This value is written to the destination
+///    element selected by operand N.
+/// \param N
+///    An immediate operand. Bit [0] specifies the index of the 64-bit
+///    element in the destination that is replaced. The remaining
+///    element is copied from the corresponding element in operand
+///    X.
+/// \returns A 128-bit integer vector containing the copied extended packed 
+///    data from the operands.
 #define _mm_insert_epi64(X, I, N) (__extension__                         \
                                    ({ __v2di __a = (__v2di)(__m128i)(X); \
                                       __a[(N) & 1] = (I);                \
@@ -235,42 +1007,198 @@
 /* Extract int from packed integer array at index.  This returns the element
  * as a zero extended value, so it is unsigned.
  */
+/// \brief Extracts 8 bits of extended packed data from a 128-bit integer 
+///    vector and copies it to the destination, as specified by the integer 
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// int _mm_extract_epi8(__m128i X, const int N);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c PEXTRB instruction.
+///
+/// \param X
+///    A 128-bit integer vector.
+/// \param N
+///    Specifies which element to copy to the destination.
+/// \returns An integer containing the extracted 8-bit element, zero-extended
+///    to the width of the return type.
 #define _mm_extract_epi8(X, N) (__extension__                           \
                                 ({ __v16qi __a = (__v16qi)(__m128i)(X); \
                                    (int)(unsigned char) __a[(N) & 15];}))
+/// \brief Extracts 32 bits of extended packed data from a 128-bit integer 
+///    vector and copies it to the destination, as specified by the integer 
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// int _mm_extract_epi32(__m128i X, const int N);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c PEXTRD instruction.
+///
+/// \param X
+///    A 128-bit integer vector.
+/// \param N
+///    Specifies which element to copy to the destination.
+/// \returns A 32-bit integer containing the extracted bits of extended
+///    packed data.
 #define _mm_extract_epi32(X, N) (__extension__                         \
                                  ({ __v4si __a = (__v4si)(__m128i)(X); \
                                     (int)__a[(N) & 3];}))
 #ifdef __x86_64__
+/// \brief Extracts 64 bits of extended packed data from a 128-bit integer 
+///    vector and copies it to the destination, as specified by the integer 
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// long long _mm_extract_epi64(__m128i X, const int N);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c PEXTRQ instruction.
+///
+/// \param X
+///    A 128-bit integer vector.
+/// \param N
+///    Specifies which element to copy to the destination.
+/// \returns A 64-bit integer containing the extracted bits of extended
+///    packed data.
 #define _mm_extract_epi64(X, N) (__extension__                         \
                                  ({ __v2di __a = (__v2di)(__m128i)(X); \
                                     (long long)__a[(N) & 1];}))
 #endif /* __x86_64 */
 
 /* SSE4 128-bit Packed Integer Comparisons.  */
+/// \brief Tests whether the specified bits in a 128-bit integer vector are all
+///    zeros.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPTEST instruction.
+///
+/// \param __M
+///    A 128-bit integer vector containing the bits to be tested.
+/// \param __V
+///    A 128-bit integer vector selecting which bits to test in
+///    operand __M.
+/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_testz_si128(__m128i __M, __m128i __V)
 {
   return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
 }
 
+/// \brief Tests whether the specified bits in a 128-bit integer vector are all
+///    ones.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPTEST instruction.
+///
+/// \param __M
+///    A 128-bit integer vector containing the bits to be tested.
+/// \param __V
+///    A 128-bit integer vector selecting which bits to test in
+///    operand __M.
+/// \returns TRUE if the specified bits are all ones; FALSE otherwise.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_testc_si128(__m128i __M, __m128i __V)
 {
   return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
 }
 
+/// \brief Tests whether the specified bits in a 128-bit integer vector are
+///    neither all zeros nor all ones.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPTEST instruction.
+///
+/// \param __M
+///    A 128-bit integer vector containing the bits to be tested.
+/// \param __V
+///    A 128-bit integer vector selecting which bits to test in
+///    operand __M.
+/// \returns TRUE if the specified bits are neither all zeros nor all ones; 
+///    FALSE otherwise.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_testnzc_si128(__m128i __M, __m128i __V)
 {
   return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
 }
 
+/// \brief Tests whether all of the bits in a 128-bit integer vector are set
+///    to 1.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// int _mm_test_all_ones(__m128i V);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VPTEST instruction.
+///
+/// \param V
+///    A 128-bit integer vector containing the bits to be tested.
+/// \returns TRUE if all of the bits in the operand are set to 1; FALSE
+///    otherwise.
 #define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))
+/// \brief Tests whether the specified bits in a 128-bit integer vector are
+///    neither all zeros nor all ones.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// int _mm_test_mix_ones_zeros(__m128i M, __m128i V);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VPTEST instruction.
+///
+/// \param M
+///    A 128-bit integer vector containing the bits to be tested.
+/// \param V
+///    A 128-bit integer vector selecting which bits to test in
+///    operand M.
+/// \returns TRUE if the specified bits are neither all zeros nor all ones; 
+///    FALSE otherwise.
 #define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
+/// \brief Tests whether the specified bits in a 128-bit integer vector are all
+///    zeros.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// int _mm_test_all_zeros(__m128i M, __m128i V);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VPTEST instruction.
+///
+/// \param M
+///    A 128-bit integer vector containing the bits to be tested.
+/// \param V
+///    A 128-bit integer vector selecting which bits to test in
+///    operand M.
+/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
 #define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))
 
 /* SSE4 64-bit Packed Integer Comparisons.  */
+/// \brief Compares each of the corresponding packed 64-bit values of the 
+///    128-bit integer vectors for equality.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPCMPEQQ instruction.
+///
+/// \param __V1
+///    A 128-bit integer vector.
+/// \param __V2
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
 {
@@ -278,6 +1206,17 @@
 }
 
 /* SSE4 Packed Integer Sign-Extension.  */
+/// \brief Sign-extends each of the packed 8-bit integers in the lower bits of 
+///    a 128-bit integer vector to 16-bit values stored in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPMOVSXBW instruction.
+///
+/// \param __V
+///    A 128-bit vector of [16 x i8]. The values stored in bits
+///    [63:0] are sign-extended.
+/// \returns A 128-bit vector of [8 x i16] containing the sign-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi8_epi16(__m128i __V)
 {
@@ -286,6 +1225,17 @@
   return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
 }
 
+/// \brief Sign-extends each of the packed 8-bit integers in the lower bits of 
+///    a 128-bit integer vector to 32-bit values stored in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPMOVSXBD instruction.
+///
+/// \param __V
+///    A 128-bit vector of [16 x i8]. The values stored in bits
+///    [31:0] are sign-extended.
+/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi8_epi32(__m128i __V)
 {
@@ -294,6 +1244,17 @@
   return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
 }
 
+/// \brief Sign-extends each of the packed 8-bit integers in the lower bits of 
+///    a 128-bit integer vector to 64-bit values stored in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPMOVSXBQ instruction.
+///
+/// \param __V
+///    A 128-bit vector of [16 x i8]. The values stored in bits
+///    [15:0] are sign-extended.
+/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi8_epi64(__m128i __V)
 {
@@ -303,18 +1264,51 @@
   return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
 }
 
+/// \brief Sign-extends each of the packed 16-bit integers in the lower bits of 
+///    a 128-bit integer vector to 32-bit values stored in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPMOVSXWD instruction.
+///
+/// \param __V
+///    A 128-bit vector of [8 x i16]. The values stored in bits
+///    [63:0] are sign-extended.
+/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi16_epi32(__m128i __V)
 {
   return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
 }
 
+/// \brief Sign-extends each of the packed 16-bit integers in the lower bits of 
+///    a 128-bit integer vector to 64-bit values stored in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPMOVSXWQ instruction.
+///
+/// \param __V
+///    A 128-bit vector of [8 x i16]. The values stored in bits
+///    [31:0] are sign-extended.
+/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi16_epi64(__m128i __V)
 {
   return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
 }
 
+/// \brief Sign-extends each of the packed 32-bit integers in the lower bits of 
+///    a 128-bit integer vector to 64-bit values stored in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPMOVSXDQ instruction.
+///
+/// \param __V
+///    A 128-bit vector of [4 x i32]. The values stored in bits
+///    [63:0] are sign-extended.
+/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi32_epi64(__m128i __V)
 {
@@ -322,36 +1316,102 @@
 }
 
 /* SSE4 Packed Integer Zero-Extension.  */
+/// \brief Zero-extends each of the packed 8-bit integers in the lower bits of 
+///    a 128-bit integer vector to 16-bit values stored in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPMOVZXBW instruction.
+///
+/// \param __V
+///    A 128-bit vector of [16 x i8]. The values stored in bits
+///    [63:0] are zero-extended.
+/// \returns A 128-bit vector of [8 x i16] containing the zero-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu8_epi16(__m128i __V)
 {
   return (__m128i) __builtin_ia32_pmovzxbw128((__v16qi) __V);
 }
 
+/// \brief Zero-extends each of the packed 8-bit integers in the lower bits of 
+///    a 128-bit integer vector to 32-bit values stored in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPMOVZXBD instruction.
+///
+/// \param __V
+///    A 128-bit vector of [16 x i8]. The values stored in bits
+///    [31:0] are zero-extended.
+/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu8_epi32(__m128i __V)
 {
   return (__m128i) __builtin_ia32_pmovzxbd128((__v16qi)__V);
 }
 
+/// \brief Zero-extends each of the packed 8-bit integers in the lower bits of 
+///    a 128-bit integer vector to 64-bit values stored in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPMOVZXBQ instruction.
+///
+/// \param __V
+///    A 128-bit vector of [16 x i8]. The values stored in bits
+///    [15:0] are zero-extended.
+/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu8_epi64(__m128i __V)
 {
   return (__m128i) __builtin_ia32_pmovzxbq128((__v16qi)__V);
 }
 
+/// \brief Zero-extends each of the packed 16-bit integers in the lower bits of 
+///    a 128-bit integer vector to 32-bit values stored in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPMOVZXWD instruction.
+///
+/// \param __V
+///    A 128-bit vector of [8 x i16]. The values stored in bits
+///    [63:0] are zero-extended.
+/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu16_epi32(__m128i __V)
 {
   return (__m128i) __builtin_ia32_pmovzxwd128((__v8hi)__V);
 }
 
+/// \brief Zero-extends each of the packed 16-bit integers in the lower bits of 
+///    a 128-bit integer vector to 64-bit values stored in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPMOVZXWQ instruction.
+///
+/// \param __V
+///    A 128-bit vector of [8 x i16]. The values stored in bits
+///    [31:0] are zero-extended.
+/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu16_epi64(__m128i __V)
 {
   return (__m128i) __builtin_ia32_pmovzxwq128((__v8hi)__V);
 }
 
+/// \brief Zero-extends each of the packed 32-bit integers in the lower bits of 
+///    a 128-bit integer vector to 64-bit values stored in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPMOVZXDQ instruction.
+///
+/// \param __V
+///    A 128-bit vector of [4 x i32]. The values stored in bits
+///    [63:0] are zero-extended.
+/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu32_epi64(__m128i __V)
 {
@@ -359,6 +1419,26 @@
 }
 
 /* SSE4 Pack with Unsigned Saturation.  */
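+/// \brief Converts 32-bit signed integers from both 128-bit integer vector
+///    operands into 16-bit unsigned integers, and packs the results into the
+///    destination. Values greater than 0xFFFF are saturated to 0xFFFF, and
+///    values less than 0x0000 are saturated to 0x0000. The converted values
+///    from the first operand occupy the lower 64 bits of the destination and
+///    those from the second operand occupy the upper 64 bits.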
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPACKUSDW instruction.
+///
+/// \param __V1
+///    A 128-bit vector of [4 x i32]. The
+///    converted values are written to the lower order bits of the
+///    destination.
+/// \param __V2
+///    A 128-bit vector of [4 x i32]. The
+///    converted values are written to the upper order bits of the
+///    destination.
+/// \returns A 128-bit vector of [8 x i16] containing the converted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_packus_epi32(__m128i __V1, __m128i __V2)
 {
@@ -366,6 +1446,31 @@
 }
 
 /* SSE4 Multiple Packed Sums of Absolute Difference.  */
+/// \brief Computes eight sums of absolute differences between groups of four
+///    8-bit unsigned integers taken from the two source operands. The groups
+///    are selected by the bit fields in the immediate operand, and the eight
+///    16-bit sums are written to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c MPSADBW instruction.
+///
+/// \param X
+///    A 128-bit vector of [16 x i8] containing one of the source
+///    operands.
+/// \param Y
+///    A 128-bit vector of [16 x i8] containing one of the source
+///    operands.
+/// \param M
+///    An 8-bit immediate operand selecting the source bytes. With
+///    i = 4 * M[2] and j = 4 * M[1:0], each 16-bit result r[k], for k = 0..7,
+///    is the sum over t = 0..3 of abs(X[i + k + t] - Y[j + t]).
+/// \returns A 128-bit integer vector containing the sums of the sets of 
+///    absolute differences between both operands.
 #define _mm_mpsadbw_epu8(X, Y, M) __extension__ ({ \
   (__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
                                       (__v16qi)(__m128i)(Y), (M)); })
@@ -411,61 +1516,859 @@
 #define _SIDD_UNIT_MASK                 0x40
 
 /* SSE4.2 Packed Comparison Intrinsics.  */
+/// \brief Uses the immediate operand M to perform a packed
+///    comparison of string data with implicitly defined lengths that is
+///    contained in source operands A and B. Returns a 128-bit integer vector 
+///    representing the
+///    result mask of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VPCMPISTRM instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source
+///    operands to be compared.
+/// \param B
+///    A 128-bit integer vector containing one of the source
+///    operands to be compared.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters
+///    are bytes or words, the type of comparison to perform, and
+///    the format of the return value.
+///    Bits [1:0]: Determine source data format.
+///    00: 16 packed unsigned bytes
+///    01: 8 packed unsigned words
+///    10: 16 packed signed bytes
+///    11: 8 packed signed words
+///    Bits [3:2]: Determine comparison type and aggregation
+///    method.
+///    00: Subset: Each character in B is
+///    compared for equality with all the
+///    characters in A.
+///    01: Ranges: Each character in B is
+///    compared to A. The comparison basis is
+///    greater than or equal for even-indexed elements in A, and less than or
+///    equal for odd-indexed elements in A.
+///    10: Match: Compare each pair of corresponding characters in
+///    A and B
+///    for equality.
+///    11: Substring: Search B for substring
+///    matches of A.
+///    Bits [5:4]: Determine whether to perform a ones
+///    complement on the bit
+///    mask of the comparison results.
+///    00: No effect.
+///    01: Negate the bit mask.
+///    10: No effect.
+///    11: Negate the bit mask only for bits with an index less
+///    than or equal to the
+///    size of A or B.
+///    Bit [6]: Determines whether the result is zero-extended or
+///    expanded to 16 bytes.
+///    0: The result is zero-extended to 16 bytes.
+///    1: The result is expanded to 16 bytes (this expansion is
+///    performed by
+///    repeating each bit 8 or 16 times).
+/// \returns A 128-bit integer vector representing the result mask of the
+///    comparison.
 #define _mm_cmpistrm(A, B, M) \
   (__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
                                        (__v16qi)(__m128i)(B), (int)(M))
+/// \brief Uses the immediate operand M to perform a packed
+///    comparison of string data with implicitly defined lengths that is
+///    contained in source operands A and B. Returns an integer representing the 
+///    result index
+///    of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// int _mm_cmpistri(__m128i A, __m128i B, const int M);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VPCMPISTRI instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source
+///    operands to be compared.
+/// \param B
+///    A 128-bit integer vector containing one of the source
+///    operands to be compared.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters
+///    are bytes or words, the type of comparison to perform, and
+///    the format of the return value.
+///    Bits [1:0]: Determine source data format.
+///    00: 16 packed unsigned bytes
+///    01: 8 packed unsigned words
+///    10: 16 packed signed bytes
+///    11: 8 packed signed words
+///    Bits [3:2]: Determine comparison type and aggregation
+///    method.
+///    00: Subset: Each character in B is
+///    compared for equality with all the
+///    characters in A.
+///    01: Ranges: Each character in B is
+///    compared to A. The comparison basis is
+///    greater than or equal for even-indexed elements in A, and less than or
+///    equal for odd-indexed elements in A.
+///    10: Match: Compare each pair of corresponding characters in
+///    A and B
+///    for equality.
+///    11: Substring: Search B for substring
+///    matches of A.
+///    Bits [5:4]: Determine whether to perform a ones
+///    complement on the bit
+///    mask of the comparison results.
+///    00: No effect.
+///    01: Negate the bit mask.
+///    10: No effect.
+///    11: Negate the bit mask only for bits with an index less
+///    than or equal to the
+///    size of A or B.
+///    Bit [6]: Determines whether the index of the lowest set bit
+///    or the highest set bit is
+///    returned.
+///    0: The index of the least significant set bit.
+///    1: The index of the most significant set bit.
+/// \returns An integer representing the result index of the comparison.
 #define _mm_cmpistri(A, B, M) \
   (int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
                                    (__v16qi)(__m128i)(B), (int)(M))
 
+/// \brief Uses the immediate operand M to perform a packed
+///    comparison of string data with explicitly defined lengths that is
+///    contained in source operands A and B. Returns a 128-bit integer vector 
+///    representing the
+///    result mask of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VPCMPESTRM instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source
+///    operands to be compared.
+/// \param LA
+///    An integer that specifies the length of the string in A.
+/// \param B
+///    A 128-bit integer vector containing one of the source
+///    operands to be compared.
+/// \param LB
+///    An integer that specifies the length of the string in B.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters
+///    are bytes or words, the type of comparison to perform, and
+///    the format of the return value.
+///    Bits [1:0]: Determine source data format.
+///    00: 16 packed unsigned bytes
+///    01: 8 packed unsigned words
+///    10: 16 packed signed bytes
+///    11: 8 packed signed words
+///    Bits [3:2]: Determine comparison type and aggregation
+///    method.
+///    00: Subset: Each character in B is
+///    compared for equality with all the
+///    characters in A.
+///    01: Ranges: Each character in B is
+///    compared to A. The comparison basis is
+///    greater than or equal for even-indexed elements in A, and less than or
+///    equal for odd-indexed elements in A.
+///    10: Match: Compare each pair of corresponding characters in
+///    A and B
+///    for equality.
+///    11: Substring: Search B for substring
+///    matches of A.
+///    Bits [5:4]: Determine whether to perform a ones
+///    complement on the bit
+///    mask of the comparison results.
+///    00: No effect.
+///    01: Negate the bit mask.
+///    10: No effect.
+///    11: Negate the bit mask only for bits with an index less
+///    than or equal to the
+///    size of A or B.
+///    Bit [6]: Determines whether the result is zero-extended or
+///    expanded to 16 bytes.
+///    0: The result is zero-extended to 16 bytes.
+///    1: The result is expanded to 16 bytes (this expansion is
+///    performed by
+///    repeating each bit 8 or 16 times).
+/// \returns A 128-bit integer vector representing the result mask of the
+///    comparison.
 #define _mm_cmpestrm(A, LA, B, LB, M) \
   (__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
                                        (__v16qi)(__m128i)(B), (int)(LB), \
                                        (int)(M))
+/// \brief Uses the immediate operand M to perform a packed
+///    comparison of string data with explicitly defined lengths that is
+///    contained in source operands A and B. Returns an integer representing the 
+///    result index
+///    of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VPCMPESTRI instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source
+///    operands to be compared.
+/// \param LA
+///    An integer that specifies the length of the string in A.
+/// \param B
+///    A 128-bit integer vector containing one of the source
+///    operands to be compared.
+/// \param LB
+///    An integer that specifies the length of the string in B.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters
+///    are bytes or words, the type of comparison to perform, and
+///    the format of the return value.
+///    Bits [1:0]: Determine source data format.
+///    00: 16 packed unsigned bytes
+///    01: 8 packed unsigned words
+///    10: 16 packed signed bytes
+///    11: 8 packed signed words
+///    Bits [3:2]: Determine comparison type and aggregation
+///    method.
+///    00: Subset: Each character in B is
+///    compared for equality with all the
+///    characters in A.
+///    01: Ranges: Each character in B is
+///    compared to A. The comparison basis is
+///    greater than or equal for even-indexed elements in A, and less than or
+///    equal for odd-indexed elements in A.
+///    10: Match: Compare each pair of corresponding characters in
+///    A and B
+///    for equality.
+///    11: Substring: Search B for substring
+///    matches of A.
+///    Bits [5:4]: Determine whether to perform a ones
+///    complement on the bit
+///    mask of the comparison results.
+///    00: No effect.
+///    01: Negate the bit mask.
+///    10: No effect.
+///    11: Negate the bit mask only for bits with an index less
+///    than or equal to the
+///    size of A or B.
+///    Bit [6]: Determines whether the index of the lowest set bit
+///    or the highest set bit is
+///    returned.
+///    0: The index of the least significant set bit.
+///    1: The index of the most significant set bit.
+/// \returns An integer representing the result index of the comparison.
 #define _mm_cmpestri(A, LA, B, LB, M) \
   (int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
                                    (__v16qi)(__m128i)(B), (int)(LB), \
                                    (int)(M))
 
 /* SSE4.2 Packed Comparison Intrinsics and EFlag Reading.  */
+/// \brief Uses the immediate operand M to perform a packed
+///    comparison of string data with implicitly defined lengths that is
+///    contained in source operands A and B. Returns 1 if the bit mask is
+///    zero and the length of the string in B is the maximum; otherwise
+///    returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// int _mm_cmpistra(__m128i A, __m128i B, const int M);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VPCMPISTRI instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source
+///    operands to be compared.
+/// \param B
+///    A 128-bit integer vector containing one of the source
+///    operands to be compared.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters
+///    are bytes or words and the type of comparison to perform.
+///    Bits [1:0]: Determine source data format.
+///    00: 16 packed unsigned bytes
+///    01: 8 packed unsigned words
+///    10: 16 packed signed bytes
+///    11: 8 packed signed words
+///    Bits [3:2]: Determine comparison type and aggregation
+///    method.
+///    00: Subset: Each character in B is
+///    compared for equality with all the
+///    characters in A.
+///    01: Ranges: Each character in B is
+///    compared to A. The comparison basis is
+///    greater than or equal for even-indexed elements in A, and less than or
+///    equal for odd-indexed elements in A.
+///    10: Match: Compare each pair of corresponding characters in
+///    A and B
+///    for equality.
+///    11: Substring: Search B for substring
+///    matches of A.
+///    Bits [5:4]: Determine whether to perform a ones
+///    complement on the bit
+///    mask of the comparison results.
+///    00: No effect.
+///    01: Negate the bit mask.
+///    10: No effect.
+///    11: Negate the bit mask only for bits with an index less
+///    than or equal to the
+///    size of A or B.
+/// \returns 1 if the bit mask is zero and the length of the string in B is
+///    the maximum; otherwise returns 0.
 #define _mm_cmpistra(A, B, M) \
   (int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M))
+/// \brief Uses the immediate operand M to perform a packed comparison of
+///    string data with implicitly defined lengths that is contained in source
+///    operands A and B. Returns 1 if the bit mask is non-zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// int _mm_cmpistrc(__m128i A, __m128i B, const int M);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VPCMPISTRI instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source
+///    operands to be compared.
+/// \param B
+///    A 128-bit integer vector containing one of the source
+///    operands to be compared.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters
+///    are bytes or words and the type of comparison to perform.
+///    Bits [1:0]: Determine source data format.
+///    00: 16 packed unsigned bytes
+///    01: 8 packed unsigned words
+///    10: 16 packed signed bytes
+///    11: 8 packed signed words
+///    Bits [3:2]: Determine comparison type and aggregation
+///    method.
+///    00: Subset: Each character in B is
+///    compared for equality with all the
+///    characters in A.
+///    01: Ranges: Each character in B is
+///    compared to A. The comparison basis is
+///    greater than or equal for even-indexed elements in A, and less than or
+///    equal for odd-indexed elements in A.
+///    10: Match: Compare each pair of corresponding characters in
+///    A and B
+///    for equality.
+///    11: Substring: Search B for substring
+///    matches of A.
+///    Bits [5:4]: Determine whether to perform a ones
+///    complement on the bit
+///    mask of the comparison results.
+///    00: No effect.
+///    01: Negate the bit mask.
+///    10: No effect.
+///    11: Negate the bit mask only for bits with an index less
+///    than or equal to the
+///    size of A or B.
+/// \returns 1 if the bit mask is non-zero; otherwise returns 0.
 #define _mm_cmpistrc(A, B, M) \
   (int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M))
+/// \brief Uses the immediate operand M to perform a packed
+///    comparison of string data with implicitly defined lengths that is
+///    contained in source operands A and B. Returns 1 if the least significant 
+///    bit of the bit
+///    mask is 1.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// int _mm_cmpistro(__m128i A, __m128i B, const int M);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VPCMPISTRI instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source
+///    operands to be compared.
+/// \param B
+///    A 128-bit integer vector containing one of the source
+///    operands to be compared.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters
+///    are bytes or words and the type of comparison to perform.
+///    Bits [1:0]: Determine source data format.
+///    00: 16 packed unsigned bytes
+///    01: 8 packed unsigned words
+///    10: 16 packed signed bytes
+///    11: 8 packed signed words
+///    Bits [3:2]: Determine comparison type and aggregation
+///    method.
+///    00: Subset: Each character in B is
+///    compared for equality with all the
+///    characters in A.
+///    01: Ranges: Each character in B is
+///    compared to A. The comparison basis is
+///    greater than or equal for even-indexed elements in A, and less than or
+///    equal for odd-indexed elements in A.
+///    10: Match: Compare each pair of corresponding characters in
+///    A and B
+///    for equality.
+///    11: Substring: Search B for substring
+///    matches of A.
+///    Bits [5:4]: Determine whether to perform a ones
+///    complement on the bit
+///    mask of the comparison results.
+///    00: No effect.
+///    01: Negate the bit mask.
+///    10: No effect.
+///    11: Negate the bit mask only for bits with an index less
+///    than or equal to the
+///    size of A or B.
+/// \returns 1 if the least significant bit of the bit mask is 1;
+///    otherwise returns 0.
 #define _mm_cmpistro(A, B, M) \
   (int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M))
+/// \brief Uses the immediate operand M to perform a packed
+///    comparison of string data with implicitly defined lengths that is
+///    contained in source operands A and B. Returns 1 if the length of the 
+///    string in A is less than the maximum.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// int _mm_cmpistrs(__m128i A, __m128i B, const int M);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VPCMPISTRI instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source
+///    operands to be compared.
+/// \param B
+///    A 128-bit integer vector containing one of the source
+///    operands to be compared.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters
+///    are bytes or words and the type of comparison to perform.
+///    Bits [1:0]: Determine source data format.
+///    00: 16 packed unsigned bytes
+///    01: 8 packed unsigned words
+///    10: 16 packed signed bytes
+///    11: 8 packed signed words
+///    Bits [3:2]: Determine comparison type and aggregation
+///    method.
+///    00: Subset: Each character in B is
+///    compared for equality with all the
+///    characters in A.
+///    01: Ranges: Each character in B is
+///    compared to A. The comparison basis is
+///    greater than or equal for even-indexed elements in A, and less than or
+///    equal for odd-indexed elements in A.
+///    10: Match: Compare each pair of corresponding characters in
+///    A and B
+///    for equality.
+///    11: Substring: Search B for substring
+///    matches of A.
+///    Bits [5:4]: Determine whether to perform a ones
+///    complement on the bit
+///    mask of the comparison results.
+///    00: No effect.
+///    01: Negate the bit mask.
+///    10: No effect.
+///    11: Negate the bit mask only for bits with an index less
+///    than or equal to the
+///    size of A or B.
+/// \returns 1 if the length of the string in A is
+///    less than the maximum; otherwise returns 0.
 #define _mm_cmpistrs(A, B, M) \
   (int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M))
+/// \brief Uses the immediate operand M to perform a packed
+///    comparison of string data with implicitly defined lengths that is
+///    contained in source operands A and B. Returns 1 if the length of the 
+///    string in B is less than the maximum.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// int _mm_cmpistrz(__m128i A, __m128i B, const int M);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VPCMPISTRI instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source
+///    operands to be compared.
+/// \param B
+///    A 128-bit integer vector containing one of the source
+///    operands to be compared.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters
+///    are bytes or words and the type of comparison to perform.
+///    Bits [1:0]: Determine source data format.
+///    00: 16 packed unsigned bytes
+///    01: 8 packed unsigned words
+///    10: 16 packed signed bytes
+///    11: 8 packed signed words
+///    Bits [3:2]: Determine comparison type and aggregation
+///    method.
+///    00: Subset: Each character in B is
+///    compared for equality with all the
+///    characters in A.
+///    01: Ranges: Each character in B is
+///    compared to A. The comparison basis is
+///    greater than or equal for even-indexed elements in A, and less than or
+///    equal for odd-indexed elements in A.
+///    10: Match: Compare each pair of corresponding characters in
+///    A and B
+///    for equality.
+///    11: Substring: Search B for substring
+///    matches of A.
+///    Bits [5:4]: Determine whether to perform a ones
+///    complement on the bit
+///    mask of the comparison results.
+///    00: No effect.
+///    01: Negate the bit mask.
+///    10: No effect.
+///    11: Negate the bit mask only for bits with an index less
+///    than or equal to the
+///    size of A or B.
+/// \returns 1 if the length of the string in B is
+///    less than the maximum; otherwise returns 0.
 #define _mm_cmpistrz(A, B, M) \
   (int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M))
 
+/// \brief Uses the immediate operand M to perform a packed
+///    comparison of string data with explicitly defined lengths that is
+///    contained in source operands A and B. Returns 1 if the bit mask is
+///    zero and the length of the string in B is the maximum; otherwise
+///    returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VPCMPESTRI instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source
+///    operands to be compared.
+/// \param LA
+///    An integer that specifies the length of the string in A.
+/// \param B
+///    A 128-bit integer vector containing one of the source
+///    operands to be compared.
+/// \param LB
+///    An integer that specifies the length of the string in B.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters
+///    are bytes or words and the type of comparison to perform.
+///    Bits [1:0]: Determine source data format.
+///    00: 16 packed unsigned bytes
+///    01: 8 packed unsigned words
+///    10: 16 packed signed bytes
+///    11: 8 packed signed words
+///    Bits [3:2]: Determine comparison type and aggregation
+///    method.
+///    00: Subset: Each character in B is
+///    compared for equality with all the
+///    characters in A.
+///    01: Ranges: Each character in B is
+///    compared to A. The comparison basis is
+///    greater than or equal for even-indexed elements in A, and less than or
+///    equal for odd-indexed elements in A.
+///    10: Match: Compare each pair of corresponding characters in
+///    A and B
+///    for equality.
+///    11: Substring: Search B for substring
+///    matches of A.
+///    Bits [5:4]: Determine whether to perform a ones
+///    complement on the bit
+///    mask of the comparison results.
+///    00: No effect.
+///    01: Negate the bit mask.
+///    10: No effect.
+///    11: Negate the bit mask only for bits with an index less
+///    than or equal to the
+///    size of A or B.
+/// \returns 1 if the bit mask is zero and the length of the string in B is
+///    the maximum; otherwise returns 0.
 #define _mm_cmpestra(A, LA, B, LB, M) \
   (int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M))
+/// \brief Uses the immediate operand M to perform a packed comparison of
+///    string data with explicitly defined lengths that is contained in source
+///    operands A and B. Returns 1 if the bit mask is non-zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VPCMPESTRI instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source
+///    operands to be compared.
+/// \param LA
+///    An integer that specifies the length of the string in A.
+/// \param B
+///    A 128-bit integer vector containing one of the source
+///    operands to be compared.
+/// \param LB
+///    An integer that specifies the length of the string in B.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters
+///    are bytes or words and the type of comparison to perform.
+///    Bits [1:0]: Determine source data format.
+///    00: 16 packed unsigned bytes
+///    01: 8 packed unsigned words
+///    10: 16 packed signed bytes
+///    11: 8 packed signed words
+///    Bits [3:2]: Determine comparison type and aggregation
+///    method.
+///    00: Subset: Each character in B is
+///    compared for equality with all the
+///    characters in A.
+///    01: Ranges: Each character in B is
+///    compared to A. The comparison basis is
+///    greater than or equal for even-indexed elements in A, and less than or
+///    equal for odd-indexed elements in A.
+///    10: Match: Compare each pair of corresponding characters in
+///    A and B
+///    for equality.
+///    11: Substring: Search B for substring
+///    matches of A.
+///    Bits [5:4]: Determine whether to perform a ones
+///    complement on the bit
+///    mask of the comparison results.
+///    00: No effect.
+///    01: Negate the bit mask.
+///    10: No effect.
+///    11: Negate the bit mask only for bits with an index less
+///    than or equal to the
+///    size of A or B.
+/// \returns Returns 1 if the bit mask is zero; otherwise returns 0.
 #define _mm_cmpestrc(A, LA, B, LB, M) \
   (int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M))
+/// \brief Uses the immediate operand M to perform a packed
+///    comparison of string data with explicitly defined lengths that is
+///    contained in source operands A and B. Returns 1 if the least significant 
+///    bit of the bit
+///    mask is 1.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VPCMPESTRI instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source
+///    operands to be compared.
+/// \param LA
+///    An integer that specifies the length of the string in A.
+/// \param B
+///    A 128-bit integer vector containing one of the source
+///    operands to be compared.
+/// \param LB
+///    An integer that specifies the length of the string in B.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters
+///    are bytes or words and the type of comparison to perform.
+///    Bits [1:0]: Determine source data format.
+///    00: 16 packed unsigned bytes
+///    01: 8 packed unsigned words
+///    10: 16 packed signed bytes
+///    11: 8 packed signed words
+///    Bits [3:2]: Determine comparison type and aggregation
+///    method.
+///    00: Subset: Each character in B is
+///    compared for equality with all the
+///    characters in A.
+///    01: Ranges: Each character in B is
+///    compared to A. The comparison basis is
+///    greater than or equal for even-indexed elements in A, and less than or
+///    equal for odd-indexed elements in A.
+///    10: Match: Compare each pair of corresponding characters in
+///    A and B
+///    for equality.
+///    11: Substring: Search B for substring
+///    matches of A.
+///    Bits [5:4]: Determine whether to perform a one's
+///    complement on the bit
+///    mask of the comparison results.
+///    00: No effect.
+///    01: Negate the bit mask.
+///    10: No effect.
+///    11: Negate the bit mask only for bits with an index less
+///    than or equal to the
+///    size of A or B.
+/// \returns Returns 1 if the least significant bit of the bit mask is 1; 
+///    otherwise returns 0.
 #define _mm_cmpestro(A, LA, B, LB, M) \
   (int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M))
+/// \brief Uses the immediate operand M to perform a packed
+///    comparison of string data with explicitly defined lengths that is
+///    contained in source operands A and B. Returns 1 if the length of the 
+///    string in A is less than the maximum.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VPCMPESTRI instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source
+///    operands to be compared.
+/// \param LA
+///    An integer that specifies the length of the string in A.
+/// \param B
+///    A 128-bit integer vector containing one of the source
+///    operands to be compared.
+/// \param LB
+///    An integer that specifies the length of the string in B.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters
+///    are bytes or words and the type of comparison to perform.
+///    Bits [1:0]: Determine source data format.
+///    00: 16 packed unsigned bytes
+///    01: 8 packed unsigned words
+///    10: 16 packed signed bytes
+///    11: 8 packed signed words
+///    Bits [3:2]: Determine comparison type and aggregation
+///    method.
+///    00: Subset: Each character in B is
+///    compared for equality with all the
+///    characters in A.
+///    01: Ranges: Each character in B is
+///    compared to A. The comparison basis is
+///    greater than or equal for even-indexed elements in A, and less than or
+///    equal for odd-indexed elements in A.
+///    10: Match: Compare each pair of corresponding characters in
+///    A and B
+///    for equality.
+///    11: Substring: Search B for substring
+///    matches of A.
+///    Bits [5:4]: Determine whether to perform a one's
+///    complement on the bit
+///    mask of the comparison results.
+///    00: No effect.
+///    01: Negate the bit mask.
+///    10: No effect.
+///    11: Negate the bit mask only for bits with an index less
+///    than or equal to the
+///    size of A or B.
+/// \returns Returns 1 if the length of the string in A is
+///    less than the maximum; otherwise returns 0.
 #define _mm_cmpestrs(A, LA, B, LB, M) \
   (int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M))
+/// \brief Uses the immediate operand M to perform a packed
+///    comparison of string data with explicitly defined lengths that is
+///    contained in source operands A and B. Returns 1 if the length of the 
+///    string in B is less than the maximum.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VPCMPESTRI instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source
+///    operands to be compared.
+/// \param LA
+///    An integer that specifies the length of the string in A.
+/// \param B
+///    A 128-bit integer vector containing one of the source
+///    operands to be compared.
+/// \param LB
+///    An integer that specifies the length of the string in B.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters
+///    are bytes or words and the type of comparison to perform.
+///    Bits [1:0]: Determine source data format.
+///    00: 16 packed unsigned bytes
+///    01: 8 packed unsigned words
+///    10: 16 packed signed bytes
+///    11: 8 packed signed words
+///    Bits [3:2]: Determine comparison type and aggregation
+///    method.
+///    00: Subset: Each character in B is
+///    compared for equality with all the
+///    characters in A.
+///    01: Ranges: Each character in B is
+///    compared to A. The comparison basis is
+///    greater than or equal for even-indexed elements in A, and less than or
+///    equal for odd-indexed elements in A.
+///    10: Match: Compare each pair of corresponding characters in
+///    A and B
+///    for equality.
+///    11: Substring: Search B for substring
+///    matches of A.
+///    Bits [5:4]: Determine whether to perform a one's
+///    complement on the bit
+///    mask of the comparison results.
+///    00: No effect.
+///    01: Negate the bit mask.
+///    10: No effect.
+///    11: Negate the bit mask only for bits with an index less
+///    than or equal to the
+///    size of A or B.
+/// \returns Returns 1 if the length of the string in B is
+///    less than the maximum; otherwise returns 0.
 #define _mm_cmpestrz(A, LA, B, LB, M) \
   (int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M))
 
 /* SSE4.2 Compare Packed Data -- Greater Than.  */
+/// \brief Compares each of the corresponding packed 64-bit values of the
+///    128-bit integer vectors to determine if the values in the first
+///    operand are greater than the corresponding values in the second
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPCMPGTQ instruction.
+///
+/// \param __V1
+///    A 128-bit integer vector.
+/// \param __V2
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
 {
@@ -473,18 +2376,63 @@
 }
 
 /* SSE4.2 Accumulate CRC32.  */
+/// \brief Adds the unsigned integer operand to the CRC-32C checksum of the
+///    unsigned char operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c CRC32 instruction.
+///
+/// \param __C
+///    An unsigned integer operand to add to the CRC-32C checksum
+///    of operand __D.
+/// \param __D
+///    An unsigned 8-bit integer operand used to compute the
+///    CRC-32C checksum.
+/// \returns The result of adding operand __C to the CRC-32C
+///    checksum of operand __D.
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 _mm_crc32_u8(unsigned int __C, unsigned char __D)
 {
   return __builtin_ia32_crc32qi(__C, __D);
 }
 
+/// \brief Adds the unsigned integer operand to the CRC-32C checksum of the
+///    unsigned short operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c CRC32 instruction.
+///
+/// \param __C
+///    An unsigned integer operand to add to the CRC-32C checksum
+///    of operand __D.
+/// \param __D
+///    An unsigned 16-bit integer operand used to compute the
+///    CRC-32C checksum.
+/// \returns The result of adding operand __C to the CRC-32C
+///    checksum of operand __D.
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 _mm_crc32_u16(unsigned int __C, unsigned short __D)
 {
   return __builtin_ia32_crc32hi(__C, __D);
 }
 
+/// \brief Adds the first unsigned integer operand to the CRC-32C checksum of
+///    the second unsigned integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c CRC32 instruction.
+///
+/// \param __C
+///    An unsigned integer operand to add to the CRC-32C checksum
+///    of operand __D.
+/// \param __D
+///    An unsigned 32-bit integer operand used to compute the
+///    CRC-32C checksum.
+/// \returns The result of adding operand __C to the CRC-32C
+///    checksum of operand __D.
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 _mm_crc32_u32(unsigned int __C, unsigned int __D)
 {
@@ -492,6 +2440,21 @@
 }
 
 #ifdef __x86_64__
+/// \brief Adds the unsigned integer operand to the CRC-32C checksum of the
+///    unsigned 64-bit integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c CRC32 instruction.
+///
+/// \param __C
+///    An unsigned integer operand to add to the CRC-32C checksum
+///    of operand __D.
+/// \param __D
+///    An unsigned 64-bit integer operand used to compute the
+///    CRC-32C checksum.
+/// \returns The result of adding operand __C to the CRC-32C
+///    checksum of operand __D.
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 _mm_crc32_u64(unsigned long long __C, unsigned long long __D)
 {
Index: lib/Headers/tmmintrin.h
===================================================================
--- lib/Headers/tmmintrin.h
+++ lib/Headers/tmmintrin.h
@@ -29,187 +29,758 @@
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3")))
 
+/// \brief Computes the absolute value of each of the packed 8-bit signed
+///    integers in the source operand and stores the 8-bit unsigned integer
+///    results in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PABSB instruction.
+///
+/// \param __a
+///    A 64-bit vector of [8 x i8].
+/// \returns A 64-bit integer vector containing the absolute values of the 
+///    elements in the operand.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_abs_pi8(__m64 __a)
 {
     return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
 }
 
+/// \brief Computes the absolute value of each of the packed 8-bit signed
+///    integers in the source operand and stores the 8-bit unsigned integer
+///    results in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPABSB instruction.
+///
+/// \param __a
+///    A 128-bit vector of [16 x i8].
+/// \returns A 128-bit integer vector containing the absolute values of the
+///    elements in the operand.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_abs_epi8(__m128i __a)
 {
     return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a);
 }
 
+/// \brief Computes the absolute value of each of the packed 16-bit signed
+///    integers in the source operand and stores the 16-bit unsigned integer
+///    results in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PABSW instruction.
+///
+/// \param __a
+///    A 64-bit vector of [4 x i16].
+/// \returns A 64-bit integer vector containing the absolute values of the 
+///    elements in the operand.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_abs_pi16(__m64 __a)
 {
     return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
 }
 
+/// \brief Computes the absolute value of each of the packed 16-bit signed
+///    integers in the source operand and stores the 16-bit unsigned integer
+///    results in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPABSW instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16].
+/// \returns A 128-bit integer vector containing the absolute values of the
+///    elements in the operand.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_abs_epi16(__m128i __a)
 {
     return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a);
 }
 
+/// \brief Computes the absolute value of each of the packed 32-bit signed
+///    integers in the source operand and stores the 32-bit unsigned integer
+///    results in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PABSD instruction.
+///
+/// \param __a
+///    A 64-bit vector of [2 x i32].
+/// \returns A 64-bit integer vector containing the absolute values of the 
+///    elements in the operand.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_abs_pi32(__m64 __a)
 {
     return (__m64)__builtin_ia32_pabsd((__v2si)__a);
 }
 
+/// \brief Computes the absolute value of each of the packed 32-bit signed
+///    integers in the source operand and stores the 32-bit unsigned integer
+///    results in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPABSD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x i32].
+/// \returns A 128-bit integer vector containing the absolute values of the
+///    elements in the operand.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_abs_epi32(__m128i __a)
 {
     return (__m128i)__builtin_ia32_pabsd128((__v4si)__a);
 }
 
+/// \brief Concatenates the two 128-bit integer vector operands, and 
+///    right-shifts the result by the number of bytes specified in the immediate 
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128i _mm_alignr_epi8(__m128i a, __m128i b, const int n);
+/// \endcode 
+///
+/// \param a
+///    A 128-bit vector of [16 x i8] containing the source operand that
+///    forms the upper 128 bits of the concatenated value.
+/// \param b
+///    A 128-bit vector of [16 x i8] containing the source operand that
+///    forms the lower 128 bits of the concatenated value.
+/// \param n
+///    An immediate operand specifying how many bytes to
+///    right-shift the result.
+/// \returns A 128-bit integer vector containing the concatenated right-shifted
+///    value.
 #define _mm_alignr_epi8(a, b, n) __extension__ ({ \
   (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
                                      (__v16qi)(__m128i)(b), (n)); })
 
+/// \brief Concatenates the two 64-bit integer vector operands, and 
+///    right-shifts the result by the number of bytes specified in the immediate 
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m64 _mm_alignr_pi8(__m64 a, __m64 b, const int n);
+/// \endcode 
+///
+/// \param a
+///    A 64-bit vector of [8 x i8] containing the source operand that
+///    forms the upper 64 bits of the concatenated value.
+/// \param b
+///    A 64-bit vector of [8 x i8] containing the source operand that
+///    forms the lower 64 bits of the concatenated value.
+/// \param n
+///    An immediate operand specifying how many bytes to
+///    right-shift the result.
+/// \returns A 64-bit integer vector containing the concatenated right-shifted
+///    value.
 #define _mm_alignr_pi8(a, b, n) __extension__ ({ \
   (__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)); })
 
+/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
+///    128-bit vectors of [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPHADDW instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16] containing one of the source
+///    operands. The horizontal sums of the values are stored in
+///    the lower bits of the destination.
+/// \param __b
+///    A 128-bit vector of [8 x i16] containing one of the source
+///    operands. The horizontal sums of the values are stored in
+///    the upper bits of the destination.
+/// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of 
+///    both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_hadd_epi16(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
+///    128-bit vectors of [4 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPHADDD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x i32] containing one of the source
+///    operands. The horizontal sums of the values are stored in
+///    the lower bits of the destination.
+/// \param __b
+///    A 128-bit vector of [4 x i32] containing one of the source
+///    operands. The horizontal sums of the values are stored in
+///    the upper bits of the destination.
+/// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of 
+///    both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_hadd_epi32(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
 }
 
+/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
+///    64-bit vectors of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PHADDW instruction.
+///
+/// \param __a
+///    A 64-bit vector of [4 x i16] containing one of the source
+///    operands. The horizontal sums of the values are stored in
+///    the lower bits of the destination.
+/// \param __b
+///    A 64-bit vector of [4 x i16] containing one of the source
+///    operands. The horizontal sums of the values are stored in
+///    the upper bits of the destination.
+/// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of 
+///    both operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hadd_pi16(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
 }
 
+/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
+///    64-bit vectors of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PHADDD instruction.
+///
+/// \param __a
+///    A 64-bit vector of [2 x i32] containing one of the source
+///    operands. The horizontal sums of the values are stored in
+///    the lower bits of the destination.
+/// \param __b
+///    A 64-bit vector of [2 x i32] containing one of the source
+///    operands. The horizontal sums of the values are stored in
+///    the upper bits of the destination.
+/// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of 
+///    both operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hadd_pi32(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
 }
 
+/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
+///    128-bit vectors of [8 x i16]. Positive sums greater than 7FFFh are 
+///    saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPHADDSW instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16] containing one of the source
+///    operands. The horizontal sums of the values are stored in
+///    the lower bits of the destination.
+/// \param __b
+///    A 128-bit vector of [8 x i16] containing one of the source
+///    operands. The horizontal sums of the values are stored in
+///    the upper bits of the destination.
+/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated 
+///    sums of both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_hadds_epi16(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
+///    64-bit vectors of [4 x i16]. Positive sums greater than 7FFFh are 
+///    saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PHADDSW instruction.
+///
+/// \param __a
+///    A 64-bit vector of [4 x i16] containing one of the source
+///    operands. The horizontal sums of the values are stored in
+///    the lower bits of the destination.
+/// \param __b
+///    A 64-bit vector of [4 x i16] containing one of the source
+///    operands. The horizontal sums of the values are stored in
+///    the upper bits of the destination.
+/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated 
+///    sums of both operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hadds_pi16(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
 }
 
+/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+///    packed 128-bit vectors of [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPHSUBW instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16] containing one of the source
+///    operands. The horizontal differences between the values are
+///    stored in the lower bits of the destination.
+/// \param __b
+///    A 128-bit vector of [8 x i16] containing one of the source
+///    operands. The horizontal differences between the values are
+///    stored in the upper bits of the destination.
+/// \returns A 128-bit vector of [8 x i16] containing the horizontal differences 
+///    of both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_hsub_epi16(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+///    packed 128-bit vectors of [4 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPHSUBD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x i32] containing one of the source
+///    operands. The horizontal differences between the values are
+///    stored in the lower bits of the destination.
+/// \param __b
+///    A 128-bit vector of [4 x i32] containing one of the source
+///    operands. The horizontal differences between the values are
+///    stored in the upper bits of the destination.
+/// \returns A 128-bit vector of [4 x i32] containing the horizontal differences 
+///    of both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_hsub_epi32(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
 }
 
+/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+///    packed 64-bit vectors of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PHSUBW instruction.
+///
+/// \param __a
+///    A 64-bit vector of [4 x i16] containing one of the source
+///    operands. The horizontal differences between the values are
+///    stored in the lower bits of the destination.
+/// \param __b
+///    A 64-bit vector of [4 x i16] containing one of the source
+///    operands. The horizontal differences between the values are
+///    stored in the upper bits of the destination.
+/// \returns A 64-bit vector of [4 x i16] containing the horizontal differences 
+///    of both operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hsub_pi16(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
 }
 
+/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+///    packed 64-bit vectors of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PHSUBD instruction.
+///
+/// \param __a
+///    A 64-bit vector of [2 x i32] containing one of the source
+///    operands. The horizontal differences between the values are
+///    stored in the lower bits of the destination.
+/// \param __b
+///    A 64-bit vector of [2 x i32] containing one of the source
+///    operands. The horizontal differences between the values are
+///    stored in the upper bits of the destination.
+/// \returns A 64-bit vector of [2 x i32] containing the horizontal differences 
+///    of both operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hsub_pi32(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
 }
 
+/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+///    packed 128-bit vectors of [8 x i16]. Positive differences greater than
+///    7FFFh are saturated to 7FFFh. Negative differences less than
+///    8000h are saturated to 8000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPHSUBSW instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16] containing one of the source
+///    operands. The horizontal differences between the values are
+///    stored in the lower bits of the destination.
+/// \param __b
+///    A 128-bit vector of [8 x i16] containing one of the source
+///    operands. The horizontal differences between the values are
+///    stored in the upper bits of the destination.
+/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
+///    differences of both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_hsubs_epi16(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+///    packed 64-bit vectors of [4 x i16]. Positive differences greater than
+///    7FFFh are saturated to 7FFFh. Negative differences less than
+///    8000h are saturated to 8000h.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PHSUBSW instruction.
+///
+/// \param __a
+///    A 64-bit vector of [4 x i16] containing one of the source
+///    operands. The horizontal differences between the values are
+///    stored in the lower bits of the destination.
+/// \param __b
+///    A 64-bit vector of [4 x i16] containing one of the source
+///    operands. The horizontal differences between the values are
+///    stored in the upper bits of the destination.
+/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
+///    differences of both operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_hsubs_pi16(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
 }
 
+/// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer
+///    values contained in the first source operand and packed 8-bit signed
+///    integer values contained in the second source operand, adds pairs of
+///    contiguous products with signed saturation, and writes the 16-bit sums
+///    to the corresponding bits in the destination.
+///
+///    For example, bits [7:0] of both operands are multiplied, bits [15:8] of
+///    both operands are multiplied, and the sum of both results is written to
+///    bits [15:0] of the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPMADDUBSW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the first source
+///    operand.
+/// \param __b
+///    A 128-bit integer vector containing the second source
+///    operand.
+/// \returns A 128-bit integer vector containing the sums of products of both
+///    operands: \n
+///    R0 := (a0 * b0) + (a1 * b1) \n
+///    R1 := (a2 * b2) + (a3 * b3) \n
+///    R2 := (a4 * b4) + (a5 * b5) \n
+///    R3 := (a6 * b6) + (a7 * b7) \n
+///    R4 := (a8 * b8) + (a9 * b9) \n
+///    R5 := (a10 * b10) + (a11 * b11) \n
+///    R6 := (a12 * b12) + (a13 * b13) \n
+///    R7 := (a14 * b14) + (a15 * b15)
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maddubs_epi16(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
 }
 
+/// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer
+///    values contained in the first source operand and packed 8-bit signed
+///    integer values contained in the second source operand, adds pairs of
+///    contiguous products with signed saturation, and writes the 16-bit sums
+///    to the corresponding bits in the destination.
+///
+///    For example, bits [7:0] of both operands are multiplied, bits [15:8] of
+///    both operands are multiplied, and the sum of both results is written to
+///    bits [15:0] of the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PMADDUBSW instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing the first source
+///    operand.
+/// \param __b
+///    A 64-bit integer vector containing the second source
+///    operand.
+/// \returns A 64-bit integer vector containing the sums of products of both
+///    operands: \n
+///    R0 := (a0 * b0) + (a1 * b1) \n
+///    R1 := (a2 * b2) + (a3 * b3) \n
+///    R2 := (a4 * b4) + (a5 * b5) \n
+///    R3 := (a6 * b6) + (a7 * b7)
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_maddubs_pi16(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
 }
 
+/// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit
+///    products to the 18 most significant bits by right-shifting, rounds the
+///    truncated value, and writes bits [16:1] to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPMULHRSW instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16] containing one of the source
+///    operands.
+/// \param __b
+///    A 128-bit vector of [8 x i16] containing one of the source
+///    operands.
+/// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
+///    products of both operands.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_mulhrs_epi16(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit
+///    products to the 18 most significant bits by right-shifting, rounds the
+///    truncated value, and writes bits [16:1] to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PMULHRSW instruction.
+///
+/// \param __a
+///    A 64-bit vector of [4 x i16] containing one of the source
+///    operands.
+/// \param __b
+///    A 64-bit vector of [4 x i16] containing one of the source
+///    operands.
+/// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
+///    products of both operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_mulhrs_pi16(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
 }
 
+/// \brief Copies the 8-bit integers from a 128-bit integer vector to the
+///    destination or clears 8-bit values in the destination, as specified by
+///    the second source operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPSHUFB instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the values to be copied.
+/// \param __b
+///    A 128-bit integer vector containing control bytes
+///    corresponding to positions in the destination:
+///    Bit 7:
+///    1: Clear the corresponding byte in the destination.
+///    0: Copy the selected source byte to the corresponding byte
+///    in the destination.
+///    Bits [6:4] Reserved.
+///    Bits [3:0] select the source byte to be copied.
+/// \returns A 128-bit integer vector containing the copied or cleared values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_shuffle_epi8(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
 }
 
+/// \brief Copies the 8-bit integers from a 64-bit integer vector to the
+///    destination or clears 8-bit values in the destination, as specified by
+///    the second source operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PSHUFB instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing the values to be copied.
+/// \param __b
+///    A 64-bit integer vector containing control bytes
+///    corresponding to positions in the destination:
+///    Bit 7:
+///    1: Clear the corresponding byte in the destination.
+///    0: Copy the selected source byte to the corresponding byte
+///    in the destination.
+///    Bits [6:3] Reserved. Bits [2:0] select the source byte to be copied.
+/// \returns A 64-bit integer vector containing the copied or cleared values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_shuffle_pi8(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
 }
 
+/// \brief For each 8-bit integer in the first source operand, perform one of 
+///    the following actions as specified by the second source operand:
+///    If the byte in the second source is negative, calculate the two's
+///    complement of the corresponding byte in the first source, and write
+///    that value to the destination.
+///    If the byte in the second source is positive, copy the corresponding
+///    byte from the first source to the destination.
+///    If the byte in the second source is zero, clear the corresponding byte
+///    in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPSIGNB instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the values to be copied.
+/// \param __b
+///    A 128-bit integer vector containing control bytes
+///    corresponding to positions in the destination.
+/// \returns A 128-bit integer vector containing the resultant values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sign_epi8(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
 }
 
+/// \brief For each 16-bit integer in the first source operand, perform one of
+///    the following actions as specified by the second source operand:
+///    If the word in the second source is negative, calculate the two's
+///    complement of the corresponding word in the first source, and write
+///    that value to the destination.
+///    If the word in the second source is positive, copy the corresponding
+///    word from the first source to the destination.
+///    If the word in the second source is zero, clear the corresponding word
+///    in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPSIGNW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the values to be copied.
+/// \param __b
+///    A 128-bit integer vector containing control words
+///    corresponding to positions in the destination.
+/// \returns A 128-bit integer vector containing the resultant values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sign_epi16(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// \brief For each 32-bit integer in the first source operand, perform one of
+///    the following actions as specified by the second source operand:
+///    If the doubleword in the second source is negative, calculate the two's
+///    complement of the corresponding doubleword in the first source, and
+///    write that value to the destination.
+///    If the doubleword in the second source is positive, copy the
+///    corresponding doubleword from the first source to the destination.
+///    If the doubleword in the second source is zero, clear the
+///    corresponding doubleword in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VPSIGND instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the values to be copied.
+/// \param __b
+///    A 128-bit integer vector containing control doublewords
+///    corresponding to positions in the destination.
+/// \returns A 128-bit integer vector containing the resultant values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sign_epi32(__m128i __a, __m128i __b)
 {
     return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
 }
 
+/// \brief For each 8-bit integer in the first source operand, perform one of 
+///    the following actions as specified by the second source operand:
+///    If the byte in the second source is negative, calculate the two's
+///    complement of the corresponding byte in the first source, and write
+///    that value to the destination.
+///    If the byte in the second source is positive, copy the corresponding
+///    byte from the first source to the destination.
+///    If the byte in the second source is zero, clear the corresponding byte
+///    in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PSIGNB instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing the values to be copied.
+/// \param __b
+///    A 64-bit integer vector containing control bytes
+///    corresponding to positions in the destination.
+/// \returns A 64-bit integer vector containing the resultant values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sign_pi8(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
 }
 
+/// \brief For each 16-bit integer in the first source operand, perform one of
+///    the following actions as specified by the second source operand:
+///    If the word in the second source is negative, calculate the two's
+///    complement of the corresponding word in the first source, and write
+///    that value to the destination.
+///    If the word in the second source is positive, copy the corresponding
+///    word from the first source to the destination.
+///    If the word in the second source is zero, clear the corresponding word
+///    in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PSIGNW instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing the values to be copied.
+/// \param __b
+///    A 64-bit integer vector containing control words
+///    corresponding to positions in the destination.
+/// \returns A 64-bit integer vector containing the resultant values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sign_pi16(__m64 __a, __m64 __b)
 {
     return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
 }
 
+/// \brief For each 32-bit integer in the first source operand, perform one of
+///    the following actions as specified by the second source operand:
+///    If the doubleword in the second source is negative, calculate the two's
+///    complement of the corresponding doubleword in the first source, and
+///    write that value to the destination.
+///    If the doubleword in the second source is positive, copy the
+///    corresponding doubleword from the first source to the destination.
+///    If the doubleword in the second source is zero, clear the
+///    corresponding doubleword in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PSIGND instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing the values to be copied.
+/// \param __b
+///    A 64-bit integer vector containing control doublewords
+///    corresponding to positions in the destination.
+/// \returns A 64-bit integer vector containing the resultant values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sign_pi32(__m64 __a, __m64 __b)
 {
Index: lib/Headers/xmmintrin.h
===================================================================
--- lib/Headers/xmmintrin.h
+++ lib/Headers/xmmintrin.h
@@ -39,6 +39,23 @@
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse")))
 
+/// \brief Adds the 32-bit float values in the low-order bits of the operands.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VADDSS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the source
+///    operands. The lower 32 bits of this operand are used in the
+///    calculation.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the source
+///    operands. The lower 32 bits of this operand are used in the
+///    calculation.
+/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum 
+///    of the lower 32 bits of both operands. The upper 96 bits are copied from
+///    the upper 96 bits of the first source operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_add_ss(__m128 __a, __m128 __b)
 {
@@ -46,12 +63,43 @@
   return __a;
 }
 
+/// \brief Adds each of the values of 2 packed 128-bit vectors of [4 x float],
+///    and writes the result to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VADDPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the source
+///    operands.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the source
+///    operands.
+/// \returns A 128-bit vector of [4 x float] containing the element-wise sums.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_add_ps(__m128 __a, __m128 __b)
 {
   return __a + __b;
 }
 
+/// \brief Subtracts the 32-bit float value in the low-order bits of the second
+///    operand from the corresponding value in the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VSUBSS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the minuend. The
+///    lower 32 bits of this operand are used in the calculation.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing the subtrahend.
+///    The lower 32 bits of this operand are used in the
+///    calculation.
+/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
+///    difference of the lower 32 bits of both operands. The upper 96 bits
+///    are copied from the upper 96 bits of the first source operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_sub_ss(__m128 __a, __m128 __b)
 {
@@ -59,12 +107,44 @@
   return __a;
 }
 
+/// \brief Subtracts each of the values of the second operand from the first
+///    operand, both of which are 2 packed 128-bit vectors of [4 x float],
+///    and writes the result to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VSUBPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the minuend.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing the subtrahend.
+/// \returns A 128-bit vector of [4 x float] containing the differences between
+///    both operands.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_sub_ps(__m128 __a, __m128 __b)
 {
   return __a - __b;
 }
 
+/// \brief Multiplies 2 32-bit float values in the low-order bits of the
+///    operands.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMULSS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the source
+///    operands. The lower 32 bits of this operand are used in the
+///    calculation.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the source
+///    operands. The lower 32 bits of this operand are used in the
+///    calculation.
+/// \returns A 128-bit vector of [4 x float] containing the product of the lower 
+///    32 bits of both operands. The upper 96 bits are copied from the upper 96
+///    bits of the first source operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mul_ss(__m128 __a, __m128 __b)
 {
@@ -72,12 +152,44 @@
   return __a;
 }
 
+/// \brief Multiplies each of the values of 2 packed 128-bit vectors of [4 x
+///    float], and writes the result to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMULPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the source
+///    operands.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the source
+///    operands.
+/// \returns A 128-bit vector of [4 x float] containing the products of both
+///    operands.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mul_ps(__m128 __a, __m128 __b)
 {
   return __a * __b;
 }
 
+/// \brief Divides the value in the low-order 32 bits of the first operand by 
+///    the corresponding value in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VDIVSS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the dividend. The
+///    lower 32 bits of this operand are used in the calculation.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing the divisor. The
+///    lower 32 bits of this operand are used in the calculation.
+/// \returns A 128-bit vector of [4 x float] containing the
+///    quotient of the lower 32 bits of both operands. The
+///    upper 96 bits are copied from the upper 96 bits of
+///    the first source operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_div_ss(__m128 __a, __m128 __b)
 {
@@ -85,12 +197,36 @@
   return __a;
 }
 
+/// \brief Divides 2 packed 128-bit vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VDIVPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the dividend.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing the divisor.
+/// \returns A 128-bit vector of [4 x float] containing the quotients between 
+///    both operands.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_div_ps(__m128 __a, __m128 __b)
 {
   return __a / __b;
 }
 
+/// \brief Calculates the square root of the value stored in the low-order bits
+///    of a packed 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VSQRTSS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] values. The lower 32 bits of
+///    this operand are used in the calculation.
+/// \returns A 128-bit vector of [4 x float] containing the square root of the
+///    value in the operand in the low-order bits.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_sqrt_ss(__m128 __a)
 {
@@ -98,12 +234,35 @@
   return (__m128) { __c[0], __a[1], __a[2], __a[3] };
 }
 
+/// \brief Calculates the square roots of the values stored in a packed 128-bit
+///    vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VSQRTPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] values.
+/// \returns A 128-bit vector of [4 x float] containing the square roots of the
+///    values in the operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_sqrt_ps(__m128 __a)
 {
   return __builtin_ia32_sqrtps(__a);
 }
 
+/// \brief Calculates the approximate reciprocal of the value stored in the
+///    low-order bits of a packed 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VRCPSS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] values. The lower 32 bits of
+///    this operand are used in the calculation.
+/// \returns A 128-bit vector of [4 x float] containing the approximate 
+///    reciprocal of the value in the operand in the low-order bits.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_rcp_ss(__m128 __a)
 {
@@ -111,12 +270,37 @@
   return (__m128) { __c[0], __a[1], __a[2], __a[3] };
 }
 
+/// \brief Calculates the approximate reciprocals of the values stored in a
+///    packed 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VRCPPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] values.
+/// \returns A 128-bit vector of [4 x float] containing the approximate 
+///    reciprocals of the values in the operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_rcp_ps(__m128 __a)
 {
   return __builtin_ia32_rcpps(__a);
 }
 
+/// \brief Calculates the approximate reciprocal of the square root of the 
+///    value stored in the low-order bits of a packed 128-bit vector of [4 x
+///    float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VRSQRTSS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] values. The lower 32 bits of
+///    this operand are used in the calculation.
+/// \returns A 128-bit vector of [4 x float] containing the approximate 
+///    reciprocal of the square root of the value in the operand in the 
+///    low-order bits.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_rsqrt_ss(__m128 __a)
 {
@@ -124,96 +308,349 @@
   return (__m128) { __c[0], __a[1], __a[2], __a[3] };
 }
 
+/// \brief Calculates the approximate reciprocals of the square roots of the
+///    values stored in a packed 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VRSQRTPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] values.
+/// \returns A 128-bit vector of [4 x float] containing the approximate 
+///    reciprocals of the square roots of the values in the operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_rsqrt_ps(__m128 __a)
 {
   return __builtin_ia32_rsqrtps(__a);
 }
 
+/// \brief Compares 2 32-bit float values in the low-order bits of both operands,
+///    and stores the lesser of the pair of values in the low-order bits of
+///    the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMINSS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the
+///    operands. The lower 32 bits of this operand are used in the
+///    comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the
+///    operands. The lower 32 bits of this operand are used in the
+///    comparison.
+/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
+///    minimum value between both operands. The upper 96 bits are copied from
+///    the upper 96 bits of the first source operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_min_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_minss(__a, __b);
 }
 
+/// \brief Compares 2 packed 128-bit vectors of [4 x float] and stores the 
+///    lesser of each pair of values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMINPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the
+///    operands.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the
+///    operands.
+/// \returns A 128-bit vector of [4 x float] containing the minimum values 
+///    between both operands.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_min_ps(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_minps(__a, __b);
 }
 
+/// \brief Compares 2 32-bit float values in the low-order bits of both operands,
+///    and stores the greater of the pair of values in the low-order bits of
+///    the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMAXSS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the
+///    operands. The lower 32 bits of this operand are used in the
+///    comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the
+///    operands. The lower 32 bits of this operand are used in the
+///    comparison.
+/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
+///    maximum value between both operands. The upper 96 bits are copied from
+///    the upper 96 bits of the first source operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_max_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_maxss(__a, __b);
 }
 
+/// \brief Compares 2 packed 128-bit vectors of [4 x float] and stores the
+///    greater of each pair of values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMAXPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the
+///    operands.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the
+///    operands.
+/// \returns A 128-bit vector of [4 x float] containing the maximum values 
+///    between both operands.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_max_ps(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_maxps(__a, __b);
 }
 
+/// \brief Performs a bitwise AND of 2 packed 128-bit vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VANDPS instruction.
+///
+/// \param __a
+///    A 128-bit vector containing one of the source operands.
+/// \param __b
+///    A 128-bit vector containing one of the source operands.
+/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
+///    values between both operands.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_and_ps(__m128 __a, __m128 __b)
 {
   return (__m128)((__v4si)__a & (__v4si)__b);
 }
 
+/// \brief Performs a bitwise AND of 2 packed 128-bit vectors of [4 x float],
+///    using the ones-complement of the values contained in the first
+///    source operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VANDNPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the left source
+///    operand. The ones complement of this value is used in
+///    the bitwise AND.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing the right source
+///    operand.
+/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the 
+///    ones-complement of the first operand and the values in the second operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_andnot_ps(__m128 __a, __m128 __b)
 {
   return (__m128)(~(__v4si)__a & (__v4si)__b);
 }
 
+/// \brief Performs a bitwise OR of 2 packed 128-bit vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VORPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the source
+///    operands.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the source
+///    operands.
+/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
+///    values between both operands.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_or_ps(__m128 __a, __m128 __b)
 {
   return (__m128)((__v4si)__a | (__v4si)__b);
 }
 
+/// \brief Performs a bitwise exclusive OR of 2 packed 128-bit vectors of [4 x
+///    float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VXORPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the source
+///    operands.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the source
+///    operands.
+/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR 
+///    of the values between both operands.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_xor_ps(__m128 __a, __m128 __b)
 {
   return (__m128)((__v4si)__a ^ (__v4si)__b);
 }
 
+/// \brief Compares 2 32-bit float values in the low-order bits
+///    of both operands for equality, and stores the result
+///    of the comparison in the low-order bits of the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPEQSS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the
+///    operands. The lower 32 bits of this operand are used in the
+///    comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the
+///    operands. The lower 32 bits of this operand are used in the
+///    comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results 
+///    in the low-order bits.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpeq_ss(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpeqss(__a, __b);
 }
 
+/// \brief Compares each of the corresponding packed 32-bit float values of the
+///    128-bit vectors of [4 x float] for equality.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPEQPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] values.
+/// \param __b
+///    A 128-bit vector of [4 x float] values.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpeq_ps(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpeqps(__a, __b);
 }
 
+/// \brief Compares 2 32-bit float values in the low-order bits of both 
+///    operands to determine if the value in the first operand is less than the
+///    corresponding value in the second operand, and stores the result of
+///    the comparison in the low-order bits of the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPLTSS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the
+///    operands. The lower 32 bits of this operand are used in the
+///    comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the
+///    operands. The lower 32 bits of this operand are used in the
+///    comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results 
+///    in the low-order bits.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmplt_ss(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpltss(__a, __b);
 }
 
+/// \brief Compares each of the corresponding packed 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are less than those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPLTPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] values.
+/// \param __b
+///    A 128-bit vector of [4 x float] values.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmplt_ps(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpltps(__a, __b);
 }
 
+/// \brief Compares 2 32-bit float values in the low-order bits of both
+///    operands to determine if the value in the first operand is less
+///    than or equal to the corresponding value in the second operand,
+///    and stores the result of the comparison in the low-order bits of
+///    the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPLESS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the
+///    operands. The lower 32 bits of this operand are used in the
+///    comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the
+///    operands. The lower 32 bits of this operand are used in the
+///    comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results 
+///    in the low-order bits.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmple_ss(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpless(__a, __b);
 }
 
+/// \brief Compares each of the corresponding packed 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are less than or equal to those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPLEPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] values.
+/// \param __b
+///    A 128-bit vector of [4 x float] values.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmple_ps(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpleps(__a, __b);
 }
 
+/// \brief Compares 2 32-bit float values in the low-order bits of both
+///    operands to determine if the value in the first operand is greater
+///    than the corresponding value in the second operand, and stores
+///    the result of the comparison in the low-order bits of the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPLTSS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the
+///    operands. The lower 32 bits of this operand are used in the
+///    comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the
+///    operands. The lower 32 bits of this operand are used in the
+///    comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results 
+///    in the low-order bits.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpgt_ss(__m128 __a, __m128 __b)
 {
@@ -222,12 +659,45 @@
                                          4, 1, 2, 3);
 }
 
+/// \brief Compares each of the corresponding packed 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are greater than those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPLTPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] values.
+/// \param __b
+///    A 128-bit vector of [4 x float] values.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpgt_ps(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpltps(__b, __a);
 }
 
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the value in the first operand is greater
+///    than or equal to the corresponding value in the second operand, and
+///    stores the result of the comparison in the low-order bits of the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPLESS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the
+///    operands. The lower 32 bits of this operand are used in the
+///    comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the
+///    operands. The lower 32 bits of this operand are used in the
+///    comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results 
+///    in the low-order bits.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpge_ss(__m128 __a, __m128 __b)
 {
@@ -236,48 +706,177 @@
                                          4, 1, 2, 3);
 }
 
+/// \brief Compares each of the corresponding packed 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are greater than or equal to those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPLEPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] values.
+/// \param __b
+///    A 128-bit vector of [4 x float] values.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpge_ps(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpleps(__b, __a);
 }
 
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands for inequality, and stores the result of the comparison in the
+///    low-order bits of the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPNEQSS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the
+///    operands. The lower 32 bits of this operand are used in the
+///    comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the
+///    operands. The lower 32 bits of this operand are used in the
+///    comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results 
+///    in the low-order bits.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpneq_ss(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpneqss(__a, __b);
 }
 
+/// \brief Compares each of the corresponding packed 32-bit float values of the
+///    128-bit vectors of [4 x float] for inequality.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPNEQPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] values.
+/// \param __b
+///    A 128-bit vector of [4 x float] values.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpneq_ps(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpneqps(__a, __b);
 }
 
+/// \brief Compares two 32-bit float values in the low-order bits of
+///    both operands to determine if the value in the first operand is
+///    not less than the corresponding value in the second operand, and
+///    stores the result of the comparison in the low-order bits of the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPNLTSS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the
+///    operands. The lower 32 bits of this operand are used in the
+///    comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the
+///    operands. The lower 32 bits of this operand are used in the
+///    comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results 
+///    in the low-order bits.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpnlt_ss(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpnltss(__a, __b);
 }
 
+/// \brief Compares each of the corresponding packed 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are not less than those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPNLTPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] values.
+/// \param __b
+///    A 128-bit vector of [4 x float] values.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpnlt_ps(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpnltps(__a, __b);
 }
 
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the value in the first operand is not less
+///    than or equal to the corresponding value in the second operand, and
+///    stores the result of the comparison in the low-order bits of the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPNLESS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the
+///    operands. The lower 32 bits of this operand are used in the
+///    comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the
+///    operands. The lower 32 bits of this operand are used in the
+///    comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results 
+///    in the low-order bits.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpnle_ss(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpnless(__a, __b);
 }
 
+/// \brief Compares each of the corresponding packed 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are not less than or equal to those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPNLEPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] values.
+/// \param __b
+///    A 128-bit vector of [4 x float] values.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpnle_ps(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpnleps(__a, __b);
 }
 
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the value in the first operand is not
+///    greater than the corresponding value in the second operand, and
+///    stores the result of the comparison in the low-order bits of the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPNLTSS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the
+///    operands. The lower 32 bits of this operand are used in the
+///    comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the
+///    operands. The lower 32 bits of this operand are used in the
+///    comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results 
+///    in the low-order bits.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpngt_ss(__m128 __a, __m128 __b)
 {
@@ -286,12 +885,45 @@
                                          4, 1, 2, 3);
 }
 
+/// \brief Compares each of the corresponding packed 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are not greater than those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPNLTPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] values.
+/// \param __b
+///    A 128-bit vector of [4 x float] values.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpngt_ps(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpnltps(__b, __a);
 }
 
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the value in the first operand is not
+///    greater than or equal to the corresponding value in the second
+///    operand, and stores the result of the comparison in the low-order
+///    bits of the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPNLESS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the
+///    operands. The lower 32 bits of this operand are used in the
+///    comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the
+///    operands. The lower 32 bits of this operand are used in the
+///    comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results 
+///    in the low-order bits.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpnge_ss(__m128 __a, __m128 __b)
 {
@@ -300,114 +932,407 @@
                                          4, 1, 2, 3);
 }
 
+/// \brief Compares each of the corresponding packed 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are not greater than or equal to those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPNLEPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] values.
+/// \param __b
+///    A 128-bit vector of [4 x float] values.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpnge_ps(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpnleps(__b, __a);
 }
 
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the value in the first operand is ordered
+///    with respect to the corresponding value in the second operand, and
+///    stores the result of the comparison in the low-order bits of the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPORDSS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the
+///    operands. The lower 32 bits of this operand are used in the
+///    comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the
+///    operands. The lower 32 bits of this operand are used in the
+///    comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results 
+///    in the low-order bits.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpord_ss(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpordss(__a, __b);
 }
 
+/// \brief Compares each of the corresponding packed 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are ordered with respect to those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPORDPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] values.
+/// \param __b
+///    A 128-bit vector of [4 x float] values.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpord_ps(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpordps(__a, __b);
 }
 
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the value in the first operand is unordered
+///    with respect to the corresponding value in the second operand, and
+///    stores the result of the comparison in the low-order bits of the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPUNORDSS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing one of the
+///    operands. The lower 32 bits of this operand are used in the
+///    comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] containing one of the
+///    operands. The lower 32 bits of this operand are used in the
+///    comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results 
+///    in the low-order bits.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpunord_ss(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpunordss(__a, __b);
 }
 
+/// \brief Compares each of the corresponding packed 32-bit float values of the
+///    128-bit vectors of [4 x float] to determine if the values in the first
+///    operand are unordered with respect to those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCMPUNORDPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] values.
+/// \param __b
+///    A 128-bit vector of [4 x float] values.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cmpunord_ps(__m128 __a, __m128 __b)
 {
   return (__m128)__builtin_ia32_cmpunordps(__a, __b);
 }
 
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands for equality, and stores the result of the comparison in the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCOMISS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] values. The lower 32 bits of
+///    this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] values. The lower 32 bits of
+///    this operand are used in the comparison.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comieq_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_comieq(__a, __b);
 }
 
+/// \brief Compares two 32-bit float values in the low-order bits of
+///    both operands to determine if the first operand is less than
+///    the second operand, and stores the result of the comparison
+///    in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCOMISS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] values. The lower 32 bits of
+///    this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] values. The lower 32 bits of
+///    this operand are used in the comparison.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comilt_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_comilt(__a, __b);
 }
 
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the first operand is less than or equal to
+///    the second operand, and stores the result of the comparison in the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCOMISS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] values. The lower 32 bits of
+///    this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] values. The lower 32 bits of
+///    this operand are used in the comparison.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comile_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_comile(__a, __b);
 }
 
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the first operand is greater than the
+///    second operand, and stores the result of the comparison in the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCOMISS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] values. The lower 32 bits of
+///    this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] values. The lower 32 bits of
+///    this operand are used in the comparison.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comigt_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_comigt(__a, __b);
 }
 
+/// \brief Compares two 32-bit float values in the
+///    low-order bits of both operands to determine if the
+///    first operand is greater than or equal to the second
+///    operand, and stores the result of the comparison in
+///    the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCOMISS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] values. The lower 32 bits of
+///    this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] values. The lower 32 bits of
+///    this operand are used in the comparison.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comige_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_comige(__a, __b);
 }
 
+/// \brief Compares two 32-bit float values in the low-order bits of both
+///    operands to determine if the first operand is not equal to the
+///    second operand, and stores the result of the comparison in the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCOMISS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] values. The lower 32 bits of
+///    this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] values. The lower 32 bits of
+///    this operand are used in the comparison.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_comineq_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_comineq(__a, __b);
 }
 
+/// \brief Performs an unordered comparison of two 32-bit float values using the
+///    low-order bits of both operands to determine equality, and stores the
+///    result of the comparison in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VUCOMISS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] values. The lower 32 bits of
+///    this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] values. The lower 32 bits of
+///    this operand are used in the comparison.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomieq_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_ucomieq(__a, __b);
 }
 
+/// \brief Performs an unordered comparison of two 32-bit float values using the
+///    low-order bits of both operands to determine if the first operand is
+///    less than the second operand, and stores the result of the comparison
+///    in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VUCOMISS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] values. The lower 32 bits of
+///    this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] values. The lower 32 bits of
+///    this operand are used in the comparison.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomilt_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_ucomilt(__a, __b);
 }
 
+/// \brief Performs an unordered comparison of two 32-bit float values using the
+///    low-order bits of both operands to determine if the first operand is
+///    less than or equal to the second operand, and stores the result of the
+///    comparison in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VUCOMISS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] values. The lower 32 bits of
+///    this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] values. The lower 32 bits of
+///    this operand are used in the comparison.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomile_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_ucomile(__a, __b);
 }
 
+/// \brief Performs an unordered comparison of two 32-bit float values using the
+///    low-order bits of both operands to determine if the first operand is
+///    greater than the second operand, and stores the result of the
+///    comparison in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VUCOMISS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] values. The lower 32 bits of
+///    this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] values. The lower 32 bits of
+///    this operand are used in the comparison.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomigt_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_ucomigt(__a, __b);
 }
 
+/// \brief Performs an unordered comparison of two 32-bit float values using the
+///    low-order bits of both operands to determine if the first operand is
+///    greater than or equal to the second operand, and stores the result of
+///    the comparison in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VUCOMISS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] values. The lower 32 bits of
+///    this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] values. The lower 32 bits of
+///    this operand are used in the comparison.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomige_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_ucomige(__a, __b);
 }
 
+/// \brief Performs an unordered comparison of two 32-bit float values using the
+///    low-order bits of both operands to determine inequality, and stores
+///    the result of the comparison in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VUCOMISS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] values. The lower 32 bits of
+///    this operand are used in the comparison.
+/// \param __b
+///    A 128-bit vector of [4 x float] values. The lower 32 bits of
+///    this operand are used in the comparison.
+/// \returns An integer containing the comparison results.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_ucomineq_ss(__m128 __a, __m128 __b)
 {
   return __builtin_ia32_ucomineq(__a, __b);
 }
 
+/// \brief Converts a float value contained in the lower 32 bits of a vector of
+///    [4 x float] into a 32-bit integer. The result is written to the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCVTSS2SI instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this
+///    operand are used in the conversion.
+/// \returns A 32-bit integer containing the converted value.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_cvtss_si32(__m128 __a)
 {
   return __builtin_ia32_cvtss2si(__a);
 }
 
+/// \brief Converts a float value contained in the lower 32 bits of a vector of
+///    [4 x float] into a 32-bit integer. The result is written to the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCVTSS2SI instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this
+///    operand are used in the conversion.
+/// \returns A 32-bit integer containing the converted value.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_cvt_ss2si(__m128 __a)
 {
@@ -416,6 +1341,18 @@
 
 #ifdef __x86_64__
 
+/// \brief Converts a float value contained in the lower 32 bits of a vector of
+///    [4 x float] into a 64-bit integer. The result is written to the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCVTSS2SI instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this
+///    operand are used in the conversion.
+/// \returns A 64-bit integer containing the converted value.
 static __inline__ long long __DEFAULT_FN_ATTRS
 _mm_cvtss_si64(__m128 __a)
 {
@@ -424,48 +1361,145 @@
 
 #endif
 
+/// \brief Converts two low-order float values in a 128-bit vector of
+///    [4 x float] into a 64-bit vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c CVTPS2PI instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 64-bit integer vector containing the converted values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cvtps_pi32(__m128 __a)
 {
   return (__m64)__builtin_ia32_cvtps2pi(__a);
 }
 
+/// \brief Converts two low-order float values in a 128-bit vector of
+///    [4 x float] into a 64-bit vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c CVTPS2PI instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 64-bit integer vector containing the converted values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cvt_ps2pi(__m128 __a)
 {
   return _mm_cvtps_pi32(__a);
 }
 
+/// \brief Converts a float value contained in the lower 32 bits of a vector of
+///    [4 x float] into a 32-bit integer, truncating the result when it is
+///    inexact. The result is written to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCVTTSS2SI instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this
+///    operand are used in the conversion.
+/// \returns A 32-bit integer containing the converted value.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_cvttss_si32(__m128 __a)
 {
   return __a[0];
 }
 
+/// \brief Converts a float value contained in the lower 32 bits of a vector of
+///    [4 x float] into a 32-bit integer, truncating the result when it is
+///    inexact. The result is written to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCVTTSS2SI instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this
+///    operand are used in the conversion.
+/// \returns A 32-bit integer containing the converted value.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_cvtt_ss2si(__m128 __a)
 {
   return _mm_cvttss_si32(__a);
 }
 
+/// \brief Converts a float value contained in the lower 32 bits of a vector of
+///    [4 x float] into a 64-bit integer, truncating the result when it is
+///    inexact. The result is written to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCVTTSS2SI instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this
+///    operand are used in the conversion.
+/// \returns A 64-bit integer containing the converted value.
 static __inline__ long long __DEFAULT_FN_ATTRS
 _mm_cvttss_si64(__m128 __a)
 {
   return __a[0];
 }
 
+/// \brief Converts two low-order float values in a 128-bit vector of
+///    [4 x float] into a 64-bit vector of [2 x i32], truncating the result
+///    when it is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c CVTTPS2PI instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 64-bit integer vector containing the converted values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cvttps_pi32(__m128 __a)
 {
   return (__m64)__builtin_ia32_cvttps2pi(__a);
 }
 
+/// \brief Converts two low-order float values in a 128-bit vector of
+///    [4 x float] into a 64-bit vector of [2 x i32], truncating the result
+///    when it is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c CVTTPS2PI instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 64-bit integer vector containing the converted values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cvtt_ps2pi(__m128 __a)
 {
   return _mm_cvttps_pi32(__a);
 }
 
+/// \brief Converts a 32-bit signed integer value into a floating point value
+///    and writes it to the lower 32 bits of the destination. The remaining
+///    higher order elements of the destination vector are copied from the
+///    corresponding elements in the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCVTSI2SS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The upper 96 bits of the
+///    destination are copied from the corresponding elements in
+///    this operand.
+/// \param __b
+///    A 32-bit signed integer operand containing the value to be
+///    converted.
+/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
+///    converted value of the second operand. The upper 96 bits are copied
+///    from the upper 96 bits of the first operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvtsi32_ss(__m128 __a, int __b)
 {
@@ -473,6 +1507,25 @@
   return __a;
 }
 
+/// \brief Converts a 32-bit signed integer value into a floating point value
+///    and writes it to the lower 32 bits of the destination. The remaining
+///    higher order elements of the destination vector are copied from the
+///    corresponding elements in the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCVTSI2SS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The upper 96 bits of the
+///    destination are copied from the corresponding elements in
+///    this operand.
+/// \param __b
+///    A 32-bit signed integer operand containing the value to be
+///    converted.
+/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
+///    converted value of the second operand. The upper 96 bits are copied
+///    from the upper 96 bits of the first operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvt_si2ss(__m128 __a, int __b)
 {
@@ -481,6 +1534,25 @@
 
 #ifdef __x86_64__
 
+/// \brief Converts a 64-bit signed integer value into a floating point value
+///    and writes it to the lower 32 bits of the destination. The remaining
+///    higher order elements of the destination vector are copied from the
+///    corresponding elements in the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VCVTSI2SS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The upper 96 bits of the
+///    destination are copied from the corresponding elements in
+///    this operand.
+/// \param __b
+///    A 64-bit signed integer operand containing the value to be
+///    converted.
+/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
+///    converted value of the second operand. The upper 96 bits are copied
+///    from the upper 96 bits of the first operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvtsi64_ss(__m128 __a, long long __b)
 {
@@ -490,24 +1562,85 @@
 
 #endif
 
+/// \brief Converts a 64-bit vector of [2 x i32] into a 128-bit vector of [4 x
+///    float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c CVTPI2PS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The upper 64 bits of this
+///    operand are copied to the destination.
+/// \param __b
+///    A 64-bit vector of [2 x i32]. The elements in this vector
+///    are converted and written to the corresponding low-order
+///    elements in the destination.
+/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
+///    converted value of the second operand. The upper 64 bits are copied
+///    from the upper 64 bits of the first operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvtpi32_ps(__m128 __a, __m64 __b)
 {
   return __builtin_ia32_cvtpi2ps(__a, (__v2si)__b);
 }
 
+/// \brief Converts a 64-bit vector of [2 x i32] into a 128-bit vector of [4 x
+///    float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c CVTPI2PS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The remaining higher order
+///    elements of the destination are copied from the
+///    corresponding elements in this operand.
+/// \param __b
+///    A 64-bit vector of [2 x i32]. The elements in this vector
+///    are converted and written to the corresponding low-order
+///    elements in the destination.
+/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
+///    converted value from the second operand. The upper 64 bits are copied
+///    from the upper 64 bits of the first operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvt_pi2ps(__m128 __a, __m64 __b)
 {
   return _mm_cvtpi32_ps(__a, __b);
 }
 
+/// \brief Extracts a float value contained in the lower 32 bits of a vector of
+///    [4 x float] into a 32-bit float. The result is written to the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The lower 32 bits of this
+///    operand are used in the extraction.
+/// \returns A 32-bit float containing the extracted value.
 static __inline__ float __DEFAULT_FN_ATTRS
 _mm_cvtss_f32(__m128 __a)
 {
   return __a[0];
 }
 
+/// \brief Loads two float values from memory into the high-order 64 bits of
+///    a 128-bit vector of [4 x float]. The low-order 64 bits of the
+///    destination are copied from the low-order 64 bits of the first
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c MOVHPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+///    Bits [63:0] are written to bits [63:0] of the destination.
+/// \param __p
+///    A pointer to float values.
+///    Bits [63:0] are written to bits [127:64] of the destination.
+/// \returns A 128-bit vector of [4 x float] containing the moved values.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_loadh_pi(__m128 __a, const __m64 *__p)
 {
@@ -520,6 +1653,22 @@
   return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
 }
 
+/// \brief Loads two float values from memory into the low-order 64 bits of a
+///    128-bit vector of [4 x float]. The high-order 64 bits are copied from
+///    the high-order 64 bits of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c MOVLPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+///    Bits [127:64] are written to bits [127:64] of the
+///    destination.
+/// \param __p
+///    A pointer to two float values.
+///    Bits [63:0] are written to bits [63:0] of the destination.
+/// \returns A 128-bit vector of [4 x float] containing the moved values.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_loadl_pi(__m128 __a, const __m64 *__p)
 {
@@ -532,6 +1681,17 @@
   return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
 }
 
+/// \brief Loads a single float value into the low element of a 128-bit
+///    vector of [4 x float] and clears the upper elements.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVSS instruction.
+///
+/// \param __p
+///    A pointer to a memory location containing a 32-bit
+///    float value.
+/// \returns A 128-bit vector of [4 x float] containing the moved value.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_load_ss(const float *__p)
 {
@@ -542,6 +1702,15 @@
   return (__m128){ __u, 0, 0, 0 };
 }
 
+/// \brief Loads a 32-bit float value from memory and duplicates it to all
+///    four elements of a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __p
+///    A pointer to a float value to be loaded and duplicated.
+/// \returns A 128-bit vector of [4 x float] containing the loaded and
+///    duplicated values.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_load1_ps(const float *__p)
 {
@@ -554,12 +1723,33 @@
 
 #define        _mm_load_ps1(p) _mm_load1_ps(p)
 
+/// \brief Moves packed float values from an aligned memory location to 32-bit
+///    elements in a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVAPS instruction.
+///
+/// \param __p
+///    A 16-byte aligned pointer to a memory location containing
+///    float values.
+/// \returns A 128-bit vector of [4 x float] containing the moved values.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_load_ps(const float *__p)
 {
   return *(__m128*)__p;
 }
 
+/// \brief Moves packed float values from an unaligned memory location to 
+///    32-bit elements in a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVUPS instruction.
+///
+/// \param __p
+///    A pointer to a memory location containing float values.
+/// \returns A 128-bit vector of [4 x float] containing the moved values.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_loadu_ps(const float *__p)
 {
@@ -569,6 +1759,18 @@
   return ((struct __loadu_ps*)__p)->__v;
 }
 
+/// \brief Moves packed float values, in reverse order, from an aligned memory
+///    location to 32-bit elements in a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVAPS+shuffling instruction.
+///
+/// \param __p
+///    A 16-byte aligned pointer to a memory location containing
+///    float values.
+/// \returns A 128-bit vector of [4 x float] containing the moved values, loaded 
+///    in reverse order.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_loadr_ps(const float *__p)
 {
@@ -582,12 +1784,34 @@
   return (__m128)__builtin_ia32_undef128();
 }
 
+/// \brief Initializes a 128-bit vector of [4 x float] with the specified 
+///    32-bit float value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __w
+///    A float value used to initialize the lower 32 bits of the
+///    destination vector of [4 x float]. The upper bits of the
+///    destination are set to zero.
+/// \returns An initialized 128-bit vector of [4 x float] containing the value
+///    provided in the operand. The upper bits of the destination are set to
+///    zero.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_set_ss(float __w)
 {
   return (__m128){ __w, 0, 0, 0 };
 }
 
+/// \brief Initializes all elements of a 128-bit vector of [4 x float] with the
+///    specified 32-bit float value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __w
+///    A float value used to initialize all elements of the
+///    destination vector of [4 x float].
+/// \returns An initialized 128-bit vector of [4 x float] in which all elements
+///    contain the value provided in the operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_set1_ps(float __w)
 {
@@ -595,42 +1819,137 @@
 }
 
 /* Microsoft specific. */
+/// \brief Initializes all elements of a 128-bit vector of [4 x float] with the
+///    specified 32-bit float value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __w
+///    A float value used to initialize all elements of the
+///    destination vector of [4 x float].
+/// \returns An initialized 128-bit vector of [4 x float] in which all elements
+///    contain the value provided in the operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_set_ps1(float __w)
 {
     return _mm_set1_ps(__w);
 }
 
+/// \brief Initializes the float values in a 128-bit vector of [4 x float] with
+///    the specified 32-bit float values. The first argument initializes the
+///    highest-order element and the last argument the lowest-order element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __z
+///    A float value used to initialize bits [127:96] of the
+///    destination vector of [4 x float].
+/// \param __y
+///    A float value used to initialize bits [95:64] of the
+///    destination vector of [4 x float].
+/// \param __x
+///    A float value used to initialize bits [63:32] of the
+///    destination vector of [4 x float].
+/// \param __w
+///    A float value used to initialize bits [31:0] of the
+///    destination vector of [4 x float].
+/// \returns An initialized 128-bit vector of [4 x float] containing the values
+///    provided in the operands.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_set_ps(float __z, float __y, float __x, float __w)
 {
   return (__m128){ __w, __x, __y, __z };
 }
 
+/// \brief Initializes the float values in a 128-bit vector of [4 x float] in
+///    reverse order with the specified 32-bit float values: the first
+///    argument initializes the lowest-order element, the last the highest.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __z
+///    A float value used to initialize bits [31:0] of the
+///    destination vector of [4 x float].
+/// \param __y
+///    A float value used to initialize bits [63:32] of the
+///    destination vector of [4 x float].
+/// \param __x
+///    A float value used to initialize bits [95:64] of the
+///    destination vector of [4 x float].
+/// \param __w
+///    A float value used to initialize bits [127:96] of the
+///    destination vector of [4 x float].
+/// \returns An initialized 128-bit vector of [4 x float] containing the values
+///    provided in the operands, stored in reverse order.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_setr_ps(float __z, float __y, float __x, float __w)
 {
   return (__m128){ __z, __y, __x, __w };
 }
 
+/// \brief Constructs a 128-bit vector of [4 x float] initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \returns An initialized 128-bit vector of [4 x float] with all elements set 
+///    to zero.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_setzero_ps(void)
 {
   return (__m128){ 0, 0, 0, 0 };
 }
 
+/// \brief Moves the packed float values from the upper 64 bits of a 128-bit
+///    vector of [4 x float] to a memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVHPS instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the float
+///    values.
+/// \param __a
+///    A packed 128-bit vector of [4 x float] containing the values
+///    to be moved.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_storeh_pi(__m64 *__p, __m128 __a)
 {
   __builtin_ia32_storehps((__v2si *)__p, __a);
 }
 
+/// \brief Moves the packed float values from the lower 64 bits of a 128-bit
+///    vector of [4 x float] to a memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVLPS instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the float
+///    values.
+/// \param __a
+///    A packed 128-bit vector of [4 x float] containing the values
+///    to be moved.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_storel_pi(__m64 *__p, __m128 __a)
 {
   __builtin_ia32_storelps((__v2si *)__p, __a);
 }
 
+/// \brief Moves the packed float value from the lower 32 bits of a 128-bit
+///    vector of [4 x float] to a memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVSS instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the float
+///    value.
+/// \param __a
+///    A packed 128-bit vector of [4 x float] containing the value
+///    to be moved.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_store_ss(float *__p, __m128 __a)
 {
@@ -640,12 +1959,36 @@
   ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
 }
 
+/// \brief Moves packed float values from a 128-bit vector of [4 x float] to an
+///    unaligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVUPS instruction.
+///
+/// \param __p
+///    A pointer to an unaligned memory location that can store 4
+///    float values.
+/// \param __a
+///    A packed 128-bit vector of [4 x float] containing the values
+///    to be moved.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_storeu_ps(float *__p, __m128 __a)
 {
   __builtin_ia32_storeups(__p, __a);
 }
 
+/// \brief Moves the lower 32 bits of a 128-bit vector of [4 x float] four 
+///    times to all the elements of a memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __p
+///    A pointer to a memory location that can store 4 float
+///    values.
+/// \param __a
+///    A 128-bit vector of [4 x float] whose lower 32 bits are
+///    copied to each of the values in __p.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_store1_ps(float *__p, __m128 __a)
 {
@@ -653,18 +1996,55 @@
   _mm_storeu_ps(__p, __a);
 }
 
+/// \brief Moves the lower 32 bits of a 128-bit vector of [4 x float] four 
+///    times to all the elements of a memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __p
+///    A pointer to a memory location that can store 4 float
+///    values.
+/// \param __a
+///    A 128-bit vector of [4 x float] whose lower 32 bits are
+///    copied to each of the values in __p.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_store_ps1(float *__p, __m128 __a)
 {
     return _mm_store1_ps(__p, __a);
 }
 
+/// \brief Moves packed float values from a 128-bit vector of [4 x float] to a
+///    memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVAPS instruction.
+///
+/// \param __p
+///    A pointer to an aligned memory location that can store 4
+///    float values.
+/// \param __a
+///    A packed 128-bit vector of [4 x float] containing the values
+///    to be moved.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_store_ps(float *__p, __m128 __a)
 {
   *(__m128 *)__p = __a;
 }
 
+/// \brief Moves packed float values, in reverse order, from a 128-bit vector 
+///    of [4 x float] to a memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVAPS+shuffling instruction.
+///
+/// \param __p
+///    A pointer to an aligned memory location that can store 4
+///    float values, which are stored in reverse order.
+/// \param __a
+///    A packed 128-bit vector of [4 x float] containing the values
+///    to be moved.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_storer_ps(float *__p, __m128 __a)
 {
@@ -681,27 +2061,111 @@
 /* FIXME: We have to #define this because "sel" must be a constant integer, and
    Sema doesn't do any form of constant propagation yet. */
 
+/// \brief Loads one cache line of data from the specified address to a 
+///    location closer to the processor.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// void _mm_prefetch(const void * a, const int sel);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c PREFETCHNTA instruction.
+///
+/// \param a
+///    A pointer to a memory location containing a cache line of
+///    data.
+/// \param sel
+///    A predefined integer constant specifying the type of
+///    prefetch operation:
+///    _MM_HINT_NTA: Move
+///    data using the non-temporal access (NTA) hint.
+///    The PREFETCHNTA
+///    instruction will be generated.
+///    _MM_HINT_T0: Move data
+///    using the T0 hint. The PREFETCHT0
+///    instruction will be generated.
+///    _MM_HINT_T1: Move data
+///    using the T1 hint. The PREFETCHT1
+///    instruction will be generated.
+///    _MM_HINT_T2: Move data
+///    using the T2 hint. The PREFETCHT2
+///    instruction will be generated.
 #define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
 #endif
 
+/// \brief Stores a 64-bit integer in the specified aligned memory location. To
+///    minimize caching, the data is flagged as non-temporal (unlikely to be
+///    used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c MOVNTQ instruction.
+///
+/// \param __p
+///    The aligned memory location used to store the register
+///    value.
+/// \param __a
+///    A 64-bit integer containing the value to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_pi(__m64 *__p, __m64 __a)
 {
   __builtin_ia32_movntq(__p, __a);
 }
 
+/// \brief Moves packed float values from a 128-bit vector of [4 x float] to a
+///    128-bit aligned memory location. To minimize caching, the data is
+///    flagged as non-temporal (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVNTPS instruction.
+///
+/// \param __p
+///    A 128-bit aligned pointer to a memory location that will
+///    receive the float values.
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the values to be
+///    moved.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_ps(float *__p, __m128 __a)
 {
   __builtin_ia32_movntps(__p, __a);
 }
 
+/// \brief Forces strong memory ordering (serialization) between store
+///    instructions preceding this instruction and store instructions
+///    following this instruction, assuring the system completes all previous
+///    stores before executing subsequent stores.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c SFENCE instruction.
+///
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_sfence(void)
 {
   __builtin_ia32_sfence();
 }
 
+/// \brief Extracts 16 bits of packed data from a 64-bit integer vector and
+///    copies it to the destination, as specified by the immediate integer
+///    operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PEXTRW instruction.
+///
+/// \param __a
+///    A 64-bit integer vector.
+/// \param __n
+///    An immediate integer operand that determines which bits are
+///    extracted:
+///    0: Bits [15:0] are copied to the destination.
+///    1: Bits [31:16] are copied to the destination.
+///    2: Bits [47:32] are copied to the destination.
+///    3: Bits [63:48] are copied to the destination.
+/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_extract_pi16(__m64 __a, int __n)
 {
@@ -709,6 +2173,27 @@
   return (unsigned short)__b[__n & 3];
 }
 
+/// \brief Copies packed data from the 64-bit integer vector operand to the
+///    destination, and inserts the lower 16-bits of an integer operand,
+///    using the offset specified by the immediate operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PINSRW instruction.
+///
+/// \param __a
+///    A 64-bit integer vector. The remaining bits in the
+///    destination are copied from the corresponding bits in this
+///    operand.
+/// \param __d
+///    An integer. The bits of this operand are written to the
+///    destination beginning at the offset specified by operand __n.
+/// \param __n
+///    Specifies the bit offset to be used in the destination. The
+///    remaining bits in the destination are copied from the
+///    corresponding bits in operand __a.
+/// \returns A 64-bit integer vector containing the copied packed data from the
+///    operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_insert_pi16(__m64 __a, int __d, int __n)
 {
@@ -717,117 +2202,484 @@
    return (__m64)__b;
 }
 
+/// \brief Compares each of the corresponding packed 16-bit signed integer
+///    values of the 64-bit integer vectors, and writes the greater value to
+///    the corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PMAXSW instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \param __b
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \returns A 64-bit integer vector containing the comparison results.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_max_pi16(__m64 __a, __m64 __b)
 {
   return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
 }
 
+/// \brief Compares each of the corresponding packed 8-bit unsigned integer
+///    values of the 64-bit integer vectors, and writes the greater value to
+///    the corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PMAXUB instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \param __b
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \returns A 64-bit integer vector containing the comparison results.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_max_pu8(__m64 __a, __m64 __b)
 {
   return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
 }
 
+/// \brief Compares each of the corresponding packed 16-bit signed integer
+///    values of the 64-bit integer vectors, and writes the lesser value to
+///    the corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PMINSW instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \param __b
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \returns A 64-bit integer vector containing the comparison results.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_min_pi16(__m64 __a, __m64 __b)
 {
   return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
 }
 
+/// \brief Compares each of the corresponding packed 8-bit unsigned integer
+///    values of the 64-bit integer vectors, and writes the lesser value to
+///    the corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PMINUB instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \param __b
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \returns A 64-bit integer vector containing the comparison results.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_min_pu8(__m64 __a, __m64 __b)
 {
   return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
 }
 
+/// \brief Copies the values of the most significant bits from each 8-bit
+///    element in a 64-bit integer vector to create an 8-bit mask value,
+///    zero-extends the value to a 32-bit integer, and writes it to the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PMOVMSKB instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing the values with bits to
+///    be extracted.
+/// \returns The most significant bits from each 8-bit element in the operand,
+///    written to bits [7:0].
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_movemask_pi8(__m64 __a)
 {
   return __builtin_ia32_pmovmskb((__v8qi)__a);
 }
 
+/// \brief Multiplies packed 16-bit unsigned integer values and writes the
+///    high-order 16 bits of each 32-bit product to the corresponding bits in
+///    the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PMULHUW instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \param __b
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \returns A 64-bit integer vector containing the products of both operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_mulhi_pu16(__m64 __a, __m64 __b)
 {
   return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
 }
 
+/// \brief Shuffles the 4 16-bit integers from a 64-bit integer vector to the
+///    destination, as specified by the immediate value operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
+/// \endcode 
+///
+/// \param a
+///    A 64-bit integer vector containing the values to be copied.
+/// \param n
+///    An 8-bit immediate value specifying which 16-bit elements to
+///    copy from a. Each 2-bit field selects one of the four elements:
+///    bits [1:0] select the value for bits [15:0] of the destination,
+///    bits [3:2] for [31:16], bits [5:4] for [47:32], bits [7:6] for [63:48].
+/// \returns A 64-bit integer vector containing the shuffled values.
 #define _mm_shuffle_pi16(a, n) __extension__ ({ \
   (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)); })
 
+/// \brief Conditionally copies the values from each 8-bit element in the first
+///    64-bit integer vector operand to the specified memory location, as
+///    specified by the most significant bit in the corresponding element in
+///    the second 64-bit integer vector operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c MASKMOVQ instruction.
+///
+/// \param __d
+///    A 64-bit integer vector containing the values with elements
+///    to be copied.
+/// \param __n
+///    A 64-bit integer vector operand. The most significant bit
+///    from each 8-bit element determines whether the corresponding
+///    element in operand __d is copied. If the
+///    most significant bit of a given element is 1, the
+///    corresponding element in operand __d is
+///    copied.
+/// \param __p
+///    A pointer to a 64-bit memory location that will receive the
+///    conditionally copied integer values; it need not be aligned.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
 {
   __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
 }
 
+/// \brief Computes the rounded averages of the packed unsigned 8-bit integer
+///    values and writes the averages to the corresponding bits in the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PAVGB instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \param __b
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \returns A 64-bit integer vector containing the averages of both operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_avg_pu8(__m64 __a, __m64 __b)
 {
   return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
 }
 
+/// \brief Computes the rounded averages of the packed unsigned 16-bit integer
+///    values and writes the averages to the corresponding bits in the
+///    destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PAVGW instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \param __b
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \returns A 64-bit integer vector containing the averages of both operands.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_avg_pu16(__m64 __a, __m64 __b)
 {
   return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
 }
 
+/// \brief Subtracts the corresponding 8-bit unsigned integer values of the
+///    two 64-bit vector operands and computes the absolute differences. The
+///    sum of the absolute differences is written to bits [15:0] of the
+///    destination. The upper bits of the destination are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c PSADBW instruction.
+///
+/// \param __a
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \param __b
+///    A 64-bit integer vector containing one of the source
+///    operands.
+/// \returns A 64-bit integer vector whose lower 16 bits contain the sum of
+///    the absolute differences between the corresponding 8-bit unsigned
+///    integer elements of both operands. The upper 48 bits of the vector
+///    are cleared.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_sad_pu8(__m64 __a, __m64 __b)
 {
   return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
 }
 
+/// \brief Saves the content of the MXCSR register to the 32-bit unsigned 
+///    integer destination.
+///    There are several groups of macros associated with this intrinsic, for
+///    exceptions, rounding modes, flushing, and denormalization.
+///    The following macros are exception flag masks that are used with
+///    _mm_getcsr() to check if certain exceptions have been raised:
+///    _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, _MM_EXCEPT_DENORM,
+///    _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW, _MM_EXCEPT_INEXACT.
+///    For example, the following expression checks if an overflow exception
+///    has occurred: (_mm_getcsr() & _MM_EXCEPT_OVERFLOW)
+///    Similarly, (_mm_getcsr() & _MM_EXCEPT_DIV_ZERO) checks for division by
+///    zero, and (_mm_getcsr() & _MM_EXCEPT_MASK) checks for any exception.
+///    The following macros are used to get or set rounding modes:
+///    _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO.
+///    The current rounding mode is (_mm_getcsr() & _MM_ROUND_MASK).
+///    Note that the FLUSH and DENORMALS masks can be used to check the FZ
+///    (flush to zero) and DAZ (denormals are zero) modes.
+///    The following macros, which wrap _mm_getcsr() and _mm_setcsr(), are
+///    used as convenience wrappers to get exception, flushing, and rounding
+///    mode states: _MM_GET_EXCEPTION_STATE(), _MM_GET_EXCEPTION_MASK(),
+///    _MM_GET_FLUSH_ZERO_MODE(), _MM_GET_ROUNDING_MODE().
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VSTMXCSR instruction.
+///
+/// \returns A 32-bit unsigned integer containing the content of the MXCSR
+///    register.
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 _mm_getcsr(void)
 {
   return __builtin_ia32_stmxcsr();
 }
 
+/// \brief Loads the MXCSR register with the value stored in the 32-bit 
+///    unsigned integer operand.
+///    There are several groups of macros associated with this intrinsic, for
+///    exceptions, rounding modes, flushing, and denormalization.
+///    The following mask macros are used to set exception masks:
+///    _MM_MASK_INVALID, _MM_MASK_DIV_ZERO, _MM_MASK_DENORM,
+///    _MM_MASK_OVERFLOW, _MM_MASK_UNDERFLOW, _MM_MASK_INEXACT.
+///    For example, the following expression sets a mask to ignore underflow
+///    exceptions: _mm_setcsr(_mm_getcsr() | _MM_MASK_UNDERFLOW)
+///    This expression turns underflow exceptions back on:
+///    _mm_setcsr(_mm_getcsr() & ~_MM_MASK_UNDERFLOW)
+///    The following macros are used to get or set rounding modes:
+///    _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO.
+///    Note that the FLUSH and DENORMALS masks can be used to set the FZ
+///    (flush to zero) and DAZ (denormals are zero) modes.
+///    The following macros wrap _mm_getcsr() and _mm_setcsr() to set the
+///    exception, flushing, and rounding mode states:
+///    _MM_SET_EXCEPTION_STATE(), _MM_SET_EXCEPTION_MASK(),
+///    _MM_SET_FLUSH_ZERO_MODE(), _MM_SET_ROUNDING_MODE().
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VLDMXCSR instruction.
+///
+/// \param __i
+///    A 32-bit unsigned integer operand whose value is loaded into
+///    the MXCSR register.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_setcsr(unsigned int __i)
 {
   __builtin_ia32_ldmxcsr(__i);
 }
 
+/// \brief Selects 4 float values from the 128-bit operands of [4 x float], as
+///    specified by the immediate value operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code 
+/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
+/// \endcode 
+///
+/// This intrinsic corresponds to \c VSHUFPS instruction.
+///
+/// \param a
+///    A 128-bit vector of [4 x float].
+/// \param b
+///    A 128-bit vector of [4 x float].
+/// \param mask
+///    An immediate value containing an 8-bit value specifying
+///    which elements to copy from a and b.
+///    Bits [3:0] specify the values copied from operand a.
+///    Bits [7:4] specify the values copied from operand b.
+///    Each 2-bit field selects one 32-bit element of its operand.
+///    The destinations within the 128-bit destination are assigned
+///    values as follows:
+///    Bits [1:0] are used to assign values to bits [31:0] in the
+///    destination.
+///    Bits [3:2] are used to assign values to bits [63:32] in the
+///    destination.
+///    Bits [5:4] are used to assign values to bits [95:64] in the
+///    destination.
+///    Bits [7:6] are used to assign values to bits [127:96] in the
+///    destination.
+///    Bit value assignments:
+///    00: Bits [31:0] copied from the specified operand.
+///    01: Bits [63:32] copied from the specified operand.
+///    10: Bits [95:64] copied from the specified operand.
+///    11: Bits [127:96] copied from the specified operand.
+/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
 #define _mm_shuffle_ps(a, b, mask) __extension__ ({ \
   (__m128)__builtin_shufflevector((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
                                   (mask) & 0x3, ((mask) & 0xc) >> 2, \
                                   (((mask) & 0x30) >> 4) + 4, \
                                   (((mask) & 0xc0) >> 6) + 4); })
 
+/// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors 
+///    of [4 x float] and interleaves them into a packed 128-bit vector of [4 x
+///    float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VUNPCKHPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+///    Bits [95:64] are written to bits [31:0] of the destination.
+///    Bits [127:96] are written to bits [95:64] of the
+///    destination.
+/// \param __b
+///    A 128-bit vector of [4 x float].
+///    Bits [95:64] are written to bits [63:32] of the destination.
+///    Bits [127:96] are written to bits [127:96] of the
+///    destination.
+/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_unpackhi_ps(__m128 __a, __m128 __b)
 {
   return __builtin_shufflevector(__a, __b, 2, 6, 3, 7);
 }
 
+/// \brief Unpacks the low-order (index 0,1) values from two 128-bit vectors of
+///    [4 x float] and interleaves them into a packed 128-bit vector of [4 x
+///    float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VUNPCKLPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+///    Bits [31:0] are written to bits [31:0] of the destination.
+///    Bits [63:32] are written to bits [95:64] of the destination.
+/// \param __b
+///    A 128-bit vector of [4 x float].
+///    Bits [31:0] are written to bits [63:32] of the destination.
+///    Bits [63:32] are written to bits [127:96] of the
+///    destination.
+/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_unpacklo_ps(__m128 __a, __m128 __b)
 {
   return __builtin_shufflevector(__a, __b, 0, 4, 1, 5);
 }
 
+/// \brief Moves the low-order 32-bit element from the second operand to the
+///    low-order element of the destination, and copies the corresponding
+///    upper elements from the first operand.
+///    The result is { __b[0], __a[1], __a[2], __a[3] }.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVSS instruction.
+///
+/// \param __a
+///    128-bit vector of [4 x float]. The upper 96 bits of this
+///    operand are copied to the upper 96 bits of the destination.
+/// \param __b
+///    128-bit vector of [4 x float]. The lower 32 bits of this
+///    operand are copied to the lower 32 bits of the destination.
+/// \returns A 128-bit vector of [4 x float] containing the moved values.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_move_ss(__m128 __a, __m128 __b)
 {
   return __builtin_shufflevector(__a, __b, 4, 1, 2, 3);
 }
 
+/// \brief Moves the 2 high-order 32-bit elements from the second operand to
+///    the low-order elements of the destination, and copies the corresponding
+///    upper elements from the first operand.
+///    The result is { __b[2], __b[3], __a[2], __a[3] }.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVHLPS instruction.
+///
+/// \param __a
+///    128-bit vector of [4 x float]. The upper 64 bits of this
+///    operand are copied to the upper 64 bits of the destination.
+/// \param __b
+///    128-bit vector of [4 x float]. The upper 64 bits of this
+///    operand are copied to the lower 64 bits of the destination.
+/// \returns A 128-bit vector of [4 x float] containing the moved values.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_movehl_ps(__m128 __a, __m128 __b)
 {
   return __builtin_shufflevector(__a, __b, 6, 7, 2, 3);
 }
 
+/// \brief Moves the 2 low-order 32-bit elements from the second operand to the
+///    high-order elements of the destination, and copies the corresponding
+///    lower elements from the first operand.
+///    The result is { __a[0], __a[1], __b[0], __b[1] }.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVLHPS instruction.
+///
+/// \param __a
+///    128-bit vector of [4 x float]. The lower 64 bits of this
+///    operand are copied to the lower 64 bits of the destination.
+/// \param __b
+///    128-bit vector of [4 x float]. The lower 64 bits of this
+///    operand are copied to the upper 64 bits of the destination.
+/// \returns A 128-bit vector of [4 x float] containing the moved values.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_movelh_ps(__m128 __a, __m128 __b)
 {
   return __builtin_shufflevector(__a, __b, 0, 1, 4, 5);
 }
 
+/// \brief Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
+///    float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c CVTPI2PS+COMPOSITE instruction.
+///
+/// \param __a
+///    A 64-bit vector of [4 x i16]. The elements of the
+///    destination are copied from the corresponding elements in
+///    this operand.
+/// \returns A 128-bit vector of [4 x float] containing the copied and 
+///    converted values from the operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvtpi16_ps(__m64 __a)
 {
@@ -846,6 +2698,19 @@
   return __r;
 }
 
+/// \brief Converts a 64-bit vector of 16-bit unsigned integer values into
+///    a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c CVTPI2PS+COMPOSITE instruction.
+///
+/// \param __a
+///    A 64-bit vector of 16-bit unsigned integer values. The
+///    elements of the destination are copied from the
+///    corresponding elements in this operand.
+/// \returns A 128-bit vector of [4 x float] containing the copied and 
+///    converted values from the operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvtpu16_ps(__m64 __a)
 {
@@ -863,6 +2728,19 @@
   return __r;
 }
 
+/// \brief Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
+///    into a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c CVTPI2PS+COMPOSITE instruction.
+///
+/// \param __a
+///    A 64-bit vector of [8 x i8]. The elements of the destination
+///    are copied from the corresponding lower 4 elements in this
+///    operand.
+/// \returns A 128-bit vector of [4 x float] containing the copied and 
+///    converted values from the operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvtpi8_ps(__m64 __a)
 {
@@ -875,6 +2753,19 @@
   return _mm_cvtpi16_ps(__b);
 }
 
+/// \brief Converts the lower four unsigned 8-bit integer values from a 64-bit
+///    vector into a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c CVTPI2PS+COMPOSITE instruction.
+///
+/// \param __a
+///    A 64-bit vector of unsigned 8-bit integer values. The
+///    elements of the destination are copied from the
+///    corresponding lower 4 elements in this operand.
+/// \returns A 128-bit vector of [4 x float] containing the copied and 
+///    converted values from the operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvtpu8_ps(__m64 __a)
 {
@@ -886,6 +2777,24 @@
   return _mm_cvtpi16_ps(__b);
 }
 
+/// \brief Converts the two 32-bit signed integer values from each 64-bit
+///    vector operand of [2 x i32] into a 128-bit vector of [4 x float], with
+///    the values from the first operand placed in the lower 64 bits.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c CVTPI2PS+COMPOSITE instruction.
+///
+/// \param __a
+///    A 64-bit vector of [2 x i32]. The lower elements of the
+///    destination are copied from the elements in this operand.
+/// \param __b
+///    A 64-bit vector of [2 x i32]. The upper elements of the
+///    destination are copied from the elements in this operand.
+/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
+///    copied and converted values from the first operand. The upper 64 bits
+///    contain the copied and converted values from the second
+///    operand.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
 {
@@ -898,6 +2807,19 @@
   return _mm_cvtpi32_ps(__c, __a);
 }
 
+/// \brief Converts a 128-bit vector of [4 x float] into a 64-bit vector of
+///    [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c CVTPS2PI+COMPOSITE instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The elements of the
+///    destination are copied from the corresponding elements in
+///    this operand.
+/// \returns A 64-bit vector of [4 x i16] containing the copied and converted
+///    values from the operand.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cvtps_pi16(__m128 __a)
 {
@@ -910,6 +2832,19 @@
   return _mm_packs_pi32(__b, __c);
 }
 
+/// \brief Converts the values in a 128-bit vector of [4 x float] to the lower
+///    four 8-bit values in a 64-bit vector of [8 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c CVTPS2PI+COMPOSITE instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float]. The corresponding lower 4
+///    elements of the destination are copied from the
+///    values in this operand.
+/// \returns A 64-bit vector of [8 x i8] containing the copied and converted 
+///    values from the operand.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_cvtps_pi8(__m128 __a)
 {
@@ -921,6 +2856,19 @@
   return _mm_packs_pi16(__b, __c);
 }
 
+/// \brief Extracts the sign bits of the packed float values in the 128-bit
+///    vector of [4 x float], zero-extends the value, and writes it to the
+///    low-order bits of the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to \c VMOVMSKPS instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float] containing the values with
+///    sign bits to be extracted.
+/// \returns The sign bits from the operand, written to bits [3:0]. The 
+///    remaining bits are assigned values of zero.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_movemask_ps(__m128 __a)
 {