Index: ammintrin.h
===================================================================
--- ammintrin.h
+++ ammintrin.h
@@ -30,10 +30,47 @@
 
 #include <pmmintrin.h>
 
+/// \brief Extracts the specified bits from the lower 64 bits of the 128-bit
+///    integer vector operand, using the length and bit index specified in
+///    the immediate bytes.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_extracti_si64(__m128i x, const int len, const int idx);
+/// \endcode
+///
+/// \code
+/// This intrinsic corresponds to the \c EXTRQ instruction.
+/// \endcode
+///
+/// \param x
+///    The value from which bits are extracted.
+/// \param len
+///    Bits [5:0] specify the length of the field to extract.
+/// \param idx
+///    Bits [5:0] specify the index of the least significant bit of the field.
+/// \returns A 128-bit vector whose lower 64 bits contain the extracted bits.
 #define _mm_extracti_si64(x, len, idx) \
   ((__m128i)__builtin_ia32_extrqi((__v2di)(__m128i)(x), \
                                   (char)(len), (char)(idx)))
 
+/// \brief Extracts the specified bits from the lower 64 bits of the 128-bit
+///    integer vector operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// This intrinsic corresponds to the \c EXTRQ instruction.
+/// \endcode
+///
+/// \param __x
+///    The value from which bits are extracted.
+/// \param __y
+///    Bits [13:8] specify the index of the least significant bit of the
+///    extracted field; bits [5:0] specify its length.
+/// \returns A 128-bit vector whose lower 64 bits contain the bits extracted
+///    from the operand.
 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
 _mm_extract_si64(__m128i __x, __m128i __y)
 {
@@ -40,11 +77,59 @@
   return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y);
 }
 
+/// \brief Inserts bits from the lower 64 bits of the source integer vector
+///    operand into the lower 64 bits of the destination integer vector
+///    operand, using the length and bit index specified in the immediate
+///    bytes. No other bits in the lower 64 bits of the destination are
+///    modified. The upper 64 bits of the destination are undefined.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_inserti_si64(__m128i x, __m128i y, const int len,
+///                          const int idx);
+/// \endcode
+///
+/// \code
+/// This intrinsic corresponds to the \c INSERTQ instruction.
+/// \endcode
+///
+/// \param x
+///    The destination operand, whose lower 64 bits receive the inserted field.
+/// \param y
+///    The source operand; the field is taken from its lower 64 bits.
+/// \param len
+///    Bits [5:0] specify the length of the field to insert.
+/// \param idx
+///    Bits [5:0] specify the bit index in the destination at which to insert.
+/// \returns The result of inserting bits from operand y into
+///    operand x.
 #define _mm_inserti_si64(x, y, len, idx) \
   ((__m128i)__builtin_ia32_insertqi((__v2di)(__m128i)(x), \
                                     (__v2di)(__m128i)(y), \
                                     (char)(len), (char)(idx)))
 
+/// \brief Inserts bits from the lower 64 bits of the source integer vector
+///    operand into the lower 64 bits of the destination integer vector
+///    operand at the index specified by operand __y. No
+///    other bits in the lower 64 bits of the destination are modified. The
+///    upper 64 bits of the destination are undefined.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// This intrinsic corresponds to the \c INSERTQ instruction.
+/// \endcode
+///
+/// \param __x
+///    The lower 64 bits of this operand are copied to the lower 64
+///    bits of the destination.
+/// \param __y
+///    The field beginning at bit 0 of this operand with length
+///    specified by bits [69:64] is inserted into the destination
+///    at the index specified by bits [77:72].
+/// \returns The result of inserting bits from operand __y into
+///    operand __x.
 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
 _mm_insert_si64(__m128i __x, __m128i __y)
 {
@@ -51,6 +136,21 @@
   return (__m128i)__builtin_ia32_insertq((__v2di)__x, (__v2di)__y);
 }
 
+/// \brief Stores a 64-bit double-precision floating-point value in a 64-bit
+///    memory location. To minimize caching, the data is flagged as
+///    non-temporal (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// This intrinsic corresponds to the \c MOVNTSD instruction.
+/// \endcode
+///
+/// \param __p
+///    The 64-bit memory location used to store the register value.
+/// \param __a
+///    A 128-bit vector whose lower 64 bits hold the double-precision
+///    floating-point value to be stored.
 static __inline__ void __attribute__((__always_inline__, __nodebug__))
 _mm_stream_sd(double *__p, __m128d __a)
 {
@@ -57,6 +157,21 @@
   __builtin_ia32_movntsd(__p, (__v2df)__a);
 }
 
+/// \brief Stores a 32-bit single-precision floating-point value in a 32-bit
+///    memory location. To minimize caching, the data is flagged as
+///    non-temporal (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// This intrinsic corresponds to the \c MOVNTSS instruction.
+/// \endcode
+///
+/// \param __p
+///    The 32-bit memory location used to store the register value.
+/// \param __a
+///    A 128-bit vector whose lower 32 bits hold the single-precision
+///    floating-point value to be stored.
 static __inline__ void __attribute__((__always_inline__, __nodebug__))
 _mm_stream_ss(float *__p, __m128 __a)
 {