diff --git a/clang/lib/Headers/__wmmintrin_aes.h b/clang/lib/Headers/__wmmintrin_aes.h
--- a/clang/lib/Headers/__wmmintrin_aes.h
+++ b/clang/lib/Headers/__wmmintrin_aes.h
@@ -133,7 +133,7 @@
 ///    An 8-bit round constant used to generate the AES encryption key.
 /// \returns A 128-bit round key for AES encryption.
 #define _mm_aeskeygenassist_si128(C, R) \
-  (__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(C), (int)(R))
+  ((__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(C), (int)(R)))
 
 #undef __DEFAULT_FN_ATTRS
 
diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -20,8 +20,8 @@
 
 /* SSE4 Multiple Packed Sums of Absolute Difference.  */
 #define _mm256_mpsadbw_epu8(X, Y, M) \
-  (__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
-                                     (__v32qi)(__m256i)(Y), (int)(M))
+  ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
+                                      (__v32qi)(__m256i)(Y), (int)(M)))
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_abs_epi8(__m256i __a)
@@ -114,8 +114,8 @@
 }
 
 #define _mm256_alignr_epi8(a, b, n) \
-  (__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
-                                     (__v32qi)(__m256i)(b), (n))
+  ((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
+                                      (__v32qi)(__m256i)(b), (n)))
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_and_si256(__m256i __a, __m256i __b)
@@ -149,8 +149,8 @@
 }
 
 #define _mm256_blend_epi16(V1, V2, M) \
-  (__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
-                                     (__v16hi)(__m256i)(V2), (int)(M))
+  ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
+                                      (__v16hi)(__m256i)(V2), (int)(M)))
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_cmpeq_epi8(__m256i __a, __m256i __b)
@@ -467,13 +467,13 @@
 }
 
 #define _mm256_shuffle_epi32(a, imm) \
-  (__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm))
+  ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm)))
 
 #define _mm256_shufflehi_epi16(a, imm) \
-  (__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm))
+  ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm)))
 
 #define _mm256_shufflelo_epi16(a, imm) \
-  (__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm))
+  ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm)))
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_sign_epi8(__m256i __a, __m256i __b)
@@ -494,10 +494,10 @@
 }
 
 #define _mm256_slli_si256(a, imm) \
-  (__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))
+  ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
 
 #define _mm256_bslli_epi128(a, imm) \
-  (__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))
+  ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_slli_epi16(__m256i __a, int __count)
@@ -560,10 +560,10 @@
 }
 
 #define _mm256_srli_si256(a, imm) \
-  (__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))
+  ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))
 
 #define _mm256_bsrli_epi128(a, imm) \
-  (__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))
+  ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_srli_epi16(__m256i __a, int __count)
@@ -743,12 +743,12 @@
 #define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)
 
 #define _mm_blend_epi32(V1, V2, M) \
-  (__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
-                                     (__v4si)(__m128i)(V2), (int)(M))
+  ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
+                                      (__v4si)(__m128i)(V2), (int)(M)))
 
 #define _mm256_blend_epi32(V1, V2, M) \
-  (__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
-                                     (__v8si)(__m256i)(V2), (int)(M))
+  ((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
+                                      (__v8si)(__m256i)(V2), (int)(M)))
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_broadcastb_epi8(__m128i __X)
@@ -806,7 +806,7 @@
 }
 
 #define _mm256_permute4x64_pd(V, M) \
-  (__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M))
+  ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M)))
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
@@ -815,17 +815,17 @@
 }
 
 #define _mm256_permute4x64_epi64(V, M) \
-  (__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M))
+  ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M)))
 
 #define _mm256_permute2x128_si256(V1, V2, M) \
-  (__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M))
+  ((__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M)))
 
 #define _mm256_extracti128_si256(V, M) \
-  (__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M))
+  ((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M)))
 
 #define _mm256_inserti128_si256(V1, V2, M) \
-  (__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \
-                                        (__v2di)(__m128i)(V2), (int)(M))
+  ((__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \
+                                         (__v2di)(__m128i)(V2), (int)(M)))
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskload_epi32(int const *__X, __m256i __M)
@@ -936,211 +936,211 @@
 }
 
 #define _mm_mask_i32gather_pd(a, m, i, mask, s) \
-  (__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128i)(a), \
-                                     (double const *)(m), \
-                                     (__v4si)(__m128i)(i), \
-                                     (__v2df)(__m128d)(mask), (s))
+  ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128i)(a), \
+                                      (double const *)(m), \
+                                      (__v4si)(__m128i)(i), \
+                                      (__v2df)(__m128d)(mask), (s)))
 
 #define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
-  (__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
-                                        (double const *)(m), \
-                                        (__v4si)(__m128i)(i), \
-                                        (__v4df)(__m256d)(mask), (s))
+  ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
+                                         (double const *)(m), \
+                                         (__v4si)(__m128i)(i), \
+                                         (__v4df)(__m256d)(mask), (s)))
 
 #define _mm_mask_i64gather_pd(a, m, i, mask, s) \
-  (__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
-                                     (double const *)(m), \
-                                     (__v2di)(__m128i)(i), \
-                                     (__v2df)(__m128d)(mask), (s))
+  ((__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
+                                      (double const *)(m), \
+                                      (__v2di)(__m128i)(i), \
+                                      (__v2df)(__m128d)(mask), (s)))
 
 #define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
-  (__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
-                                        (double const *)(m), \
-                                        (__v4di)(__m256i)(i), \
-                                        (__v4df)(__m256d)(mask), (s))
+  ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
+                                         (double const *)(m), \
+                                         (__v4di)(__m256i)(i), \
+                                         (__v4df)(__m256d)(mask), (s)))
 
 #define _mm_mask_i32gather_ps(a, m, i, mask, s) \
-  (__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
-                                    (float const *)(m), \
-                                    (__v4si)(__m128i)(i), \
-                                    (__v4sf)(__m128)(mask), (s))
+  ((__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
+                                     (float const *)(m), \
+                                     (__v4si)(__m128i)(i), \
+                                     (__v4sf)(__m128)(mask), (s)))
 
 #define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
-  (__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
-                                       (float const *)(m), \
-                                       (__v8si)(__m256i)(i), \
-                                       (__v8sf)(__m256)(mask), (s))
+  ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
+                                        (float const *)(m), \
+                                        (__v8si)(__m256i)(i), \
+                                        (__v8sf)(__m256)(mask), (s)))
 
 #define _mm_mask_i64gather_ps(a, m, i, mask, s) \
-  (__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
-                                    (float const *)(m), \
-                                    (__v2di)(__m128i)(i), \
-                                    (__v4sf)(__m128)(mask), (s))
+  ((__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
+                                     (float const *)(m), \
+                                     (__v2di)(__m128i)(i), \
+                                     (__v4sf)(__m128)(mask), (s)))
 
 #define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
-  (__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
-                                       (float const *)(m), \
-                                       (__v4di)(__m256i)(i), \
-                                       (__v4sf)(__m128)(mask), (s))
+  ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
+                                        (float const *)(m), \
+                                        (__v4di)(__m256i)(i), \
+                                        (__v4sf)(__m128)(mask), (s)))
 
 #define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
-  (__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
-                                    (int const *)(m), \
-                                    (__v4si)(__m128i)(i), \
-                                    (__v4si)(__m128i)(mask), (s))
+  ((__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
+                                     (int const *)(m), \
+                                     (__v4si)(__m128i)(i), \
+                                     (__v4si)(__m128i)(mask), (s)))
 
 #define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
-  (__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
-                                       (int const *)(m), \
-                                       (__v8si)(__m256i)(i), \
-                                       (__v8si)(__m256i)(mask), (s))
+  ((__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
+                                        (int const *)(m), \
+                                        (__v8si)(__m256i)(i), \
+                                        (__v8si)(__m256i)(mask), (s)))
 
 #define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
-  (__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
-                                    (int const *)(m), \
-                                    (__v2di)(__m128i)(i), \
-                                    (__v4si)(__m128i)(mask), (s))
+  ((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
+                                     (int const *)(m), \
+                                     (__v2di)(__m128i)(i), \
+                                     (__v4si)(__m128i)(mask), (s)))
 
 #define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
-  (__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
-                                       (int const *)(m), \
-                                       (__v4di)(__m256i)(i), \
-                                       (__v4si)(__m128i)(mask), (s))
+  ((__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
+                                        (int const *)(m), \
+                                        (__v4di)(__m256i)(i), \
+                                        (__v4si)(__m128i)(mask), (s)))
 
 #define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
-  (__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
-                                    (long long const *)(m), \
-                                    (__v4si)(__m128i)(i), \
-                                    (__v2di)(__m128i)(mask), (s))
+  ((__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
+                                     (long long const *)(m), \
+                                     (__v4si)(__m128i)(i), \
+                                     (__v2di)(__m128i)(mask), (s)))
 
 #define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
-  (__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
-                                       (long long const *)(m), \
-                                       (__v4si)(__m128i)(i), \
-                                       (__v4di)(__m256i)(mask), (s))
+  ((__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
+                                        (long long const *)(m), \
+                                        (__v4si)(__m128i)(i), \
+                                        (__v4di)(__m256i)(mask), (s)))
 
 #define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
-  (__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
-                                    (long long const *)(m), \
-                                    (__v2di)(__m128i)(i), \
-                                    (__v2di)(__m128i)(mask), (s))
+  ((__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
+                                     (long long const *)(m), \
+                                     (__v2di)(__m128i)(i), \
+                                     (__v2di)(__m128i)(mask), (s)))
 
 #define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
-  (__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
-                                       (long long const *)(m), \
-                                       (__v4di)(__m256i)(i), \
-                                       (__v4di)(__m256i)(mask), (s))
+  ((__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
+                                        (long long const *)(m), \
+                                        (__v4di)(__m256i)(i), \
+                                        (__v4di)(__m256i)(mask), (s)))
 
 #define _mm_i32gather_pd(m, i, s) \
-  (__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
-                                     (double const *)(m), \
-                                     (__v4si)(__m128i)(i), \
-                                     (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
-                                                          _mm_setzero_pd()), \
-                                     (s))
+  ((__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
+                                      (double const *)(m), \
+                                      (__v4si)(__m128i)(i), \
+                                      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
+                                                           _mm_setzero_pd()), \
+                                      (s)))
 
 #define _mm256_i32gather_pd(m, i, s) \
-  (__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
-                                        (double const *)(m), \
-                                        (__v4si)(__m128i)(i), \
-                                        (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
-                                                              _mm256_setzero_pd(), \
-                                                              _CMP_EQ_OQ), \
-                                        (s))
+  ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
+                                         (double const *)(m), \
+                                         (__v4si)(__m128i)(i), \
+                                         (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
+                                                               _mm256_setzero_pd(), \
+                                                               _CMP_EQ_OQ), \
+                                         (s)))
 
 #define _mm_i64gather_pd(m, i, s) \
-  (__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
-                                     (double const *)(m), \
-                                     (__v2di)(__m128i)(i), \
-                                     (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
-                                                          _mm_setzero_pd()), \
-                                     (s))
+  ((__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
+                                      (double const *)(m), \
+                                      (__v2di)(__m128i)(i), \
+                                      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
+                                                           _mm_setzero_pd()), \
+                                      (s)))
 
 #define _mm256_i64gather_pd(m, i, s) \
-  (__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
-                                        (double const *)(m), \
-                                        (__v4di)(__m256i)(i), \
-                                        (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
-                                                              _mm256_setzero_pd(), \
-                                                              _CMP_EQ_OQ), \
-                                        (s))
+  ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
+                                         (double const *)(m), \
+                                         (__v4di)(__m256i)(i), \
+                                         (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
+                                                               _mm256_setzero_pd(), \
+                                                               _CMP_EQ_OQ), \
+                                         (s)))
 
 #define _mm_i32gather_ps(m, i, s) \
-  (__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
-                                    (float const *)(m), \
-                                    (__v4si)(__m128i)(i), \
-                                    (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
-                                                         _mm_setzero_ps()), \
-                                    (s))
+  ((__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
+                                     (float const *)(m), \
+                                     (__v4si)(__m128i)(i), \
+                                     (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
+                                                          _mm_setzero_ps()), \
+                                     (s)))
 
 #define _mm256_i32gather_ps(m, i, s) \
-  (__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
-                                       (float const *)(m), \
-                                       (__v8si)(__m256i)(i), \
-                                       (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
-                                                             _mm256_setzero_ps(), \
-                                                             _CMP_EQ_OQ), \
-                                       (s))
+  ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
+                                        (float const *)(m), \
+                                        (__v8si)(__m256i)(i), \
+                                        (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
+                                                              _mm256_setzero_ps(), \
+                                                              _CMP_EQ_OQ), \
+                                        (s)))
 
 #define _mm_i64gather_ps(m, i, s) \
-  (__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
-                                    (float const *)(m), \
-                                    (__v2di)(__m128i)(i), \
-                                    (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
-                                                         _mm_setzero_ps()), \
-                                    (s))
+  ((__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
+                                     (float const *)(m), \
+                                     (__v2di)(__m128i)(i), \
+                                     (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
+                                                          _mm_setzero_ps()), \
+                                     (s)))
 
 #define _mm256_i64gather_ps(m, i, s) \
-  (__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
-                                       (float const *)(m), \
-                                       (__v4di)(__m256i)(i), \
-                                       (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
-                                                            _mm_setzero_ps()), \
-                                       (s))
+  ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
+                                        (float const *)(m), \
+                                        (__v4di)(__m256i)(i), \
+                                        (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
+                                                             _mm_setzero_ps()), \
+                                        (s)))
 
 #define _mm_i32gather_epi32(m, i, s) \
-  (__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
-                                    (int const *)(m), (__v4si)(__m128i)(i), \
-                                    (__v4si)_mm_set1_epi32(-1), (s))
+  ((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
+                                     (int const *)(m), (__v4si)(__m128i)(i), \
+                                     (__v4si)_mm_set1_epi32(-1), (s)))
 
 #define _mm256_i32gather_epi32(m, i, s) \
-  (__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
-                                       (int const *)(m), (__v8si)(__m256i)(i), \
-                                       (__v8si)_mm256_set1_epi32(-1), (s))
+  ((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
+                                        (int const *)(m), (__v8si)(__m256i)(i), \
+                                        (__v8si)_mm256_set1_epi32(-1), (s)))
 
 #define _mm_i64gather_epi32(m, i, s) \
-  (__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
-                                    (int const *)(m), (__v2di)(__m128i)(i), \
-                                    (__v4si)_mm_set1_epi32(-1), (s))
+  ((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
+                                     (int const *)(m), (__v2di)(__m128i)(i), \
+                                     (__v4si)_mm_set1_epi32(-1), (s)))
 
 #define _mm256_i64gather_epi32(m, i, s) \
-  (__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
-                                       (int const *)(m), (__v4di)(__m256i)(i), \
-                                       (__v4si)_mm_set1_epi32(-1), (s))
+  ((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
+                                        (int const *)(m), (__v4di)(__m256i)(i), \
+                                        (__v4si)_mm_set1_epi32(-1), (s)))
 
 #define _mm_i32gather_epi64(m, i, s) \
-  (__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
-                                    (long long const *)(m), \
-                                    (__v4si)(__m128i)(i), \
-                                    (__v2di)_mm_set1_epi64x(-1), (s))
+  ((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
+                                     (long long const *)(m), \
+                                     (__v4si)(__m128i)(i), \
+                                     (__v2di)_mm_set1_epi64x(-1), (s)))
 
 #define _mm256_i32gather_epi64(m, i, s) \
-  (__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
-                                       (long long const *)(m), \
-                                       (__v4si)(__m128i)(i), \
-                                       (__v4di)_mm256_set1_epi64x(-1), (s))
+  ((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
+                                        (long long const *)(m), \
+                                        (__v4si)(__m128i)(i), \
+                                        (__v4di)_mm256_set1_epi64x(-1), (s)))
 
 #define _mm_i64gather_epi64(m, i, s) \
-  (__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
-                                    (long long const *)(m), \
-                                    (__v2di)(__m128i)(i), \
-                                    (__v2di)_mm_set1_epi64x(-1), (s))
+  ((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
+                                     (long long const *)(m), \
+                                     (__v2di)(__m128i)(i), \
+                                     (__v2di)_mm_set1_epi64x(-1), (s)))
 
 #define _mm256_i64gather_epi64(m, i, s) \
-  (__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
-                                       (long long const *)(m), \
-                                       (__v4di)(__m256i)(i), \
-                                       (__v4di)_mm256_set1_epi64x(-1), (s))
+  ((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
+                                        (long long const *)(m), \
+                                        (__v4di)(__m256i)(i), \
+                                        (__v4di)_mm256_set1_epi64x(-1), (s)))
 
 #undef __DEFAULT_FN_ATTRS256
 #undef __DEFAULT_FN_ATTRS128
diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h
--- a/clang/lib/Headers/avxintrin.h
+++ b/clang/lib/Headers/avxintrin.h
@@ -400,7 +400,7 @@
 ///      11: Truncated.
 /// \returns A 256-bit vector of [4 x double] containing the rounded values.
 #define _mm256_round_pd(V, M) \
-    (__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M))
+  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
 
 /// Rounds the values stored in a 256-bit vector of [8 x float] as
 ///    specified by the byte operand. The source values are rounded to integer
@@ -432,7 +432,7 @@
 ///      11: Truncated.
 /// \returns A 256-bit vector of [8 x float] containing the rounded values.
 #define _mm256_round_ps(V, M) \
-  (__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M))
+  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
 
 /// Rounds up the values stored in a 256-bit vector of [4 x double]. The
 ///    source values are rounded up to integer values and returned as 64-bit
@@ -989,7 +989,7 @@
 ///         returned vector.
 /// \returns A 128-bit vector of [2 x double] containing the copied values.
 #define _mm_permute_pd(A, C) \
-  (__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C))
+  ((__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C)))
 
 /// Copies the values in a 256-bit vector of [4 x double] as specified by
 ///    the immediate integer operand.
@@ -1029,7 +1029,7 @@
 ///         returned vector.
 /// \returns A 256-bit vector of [4 x double] containing the copied values.
 #define _mm256_permute_pd(A, C) \
-  (__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C))
+  ((__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C)))
 
 /// Copies the values in a 128-bit vector of [4 x float] as specified by
 ///    the immediate integer operand.
@@ -1085,7 +1085,7 @@
 ///          returned vector.
 /// \returns A 128-bit vector of [4 x float] containing the copied values.
 #define _mm_permute_ps(A, C) \
-  (__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C))
+  ((__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C)))
 
 /// Copies the values in a 256-bit vector of [8 x float] as specified by
 ///    the immediate integer operand.
@@ -1177,7 +1177,7 @@
 ///          returned vector.
 /// \returns A 256-bit vector of [8 x float] containing the copied values.
 #define _mm256_permute_ps(A, C) \
-  (__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C))
+  ((__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C)))
 
 /// Permutes 128-bit data values stored in two 256-bit vectors of
 ///    [4 x double], as specified by the immediate integer operand.
@@ -1217,8 +1217,8 @@
 ///          destination.
 /// \returns A 256-bit vector of [4 x double] containing the copied values.
 #define _mm256_permute2f128_pd(V1, V2, M) \
-  (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
-                                           (__v4df)(__m256d)(V2), (int)(M))
+  ((__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
+                                            (__v4df)(__m256d)(V2), (int)(M)))
 
 /// Permutes 128-bit data values stored in two 256-bit vectors of
 ///    [8 x float], as specified by the immediate integer operand.
@@ -1258,8 +1258,8 @@
 ///    destination.
 /// \returns A 256-bit vector of [8 x float] containing the copied values.
 #define _mm256_permute2f128_ps(V1, V2, M) \
-  (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
-                                          (__v8sf)(__m256)(V2), (int)(M))
+  ((__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
+                                           (__v8sf)(__m256)(V2), (int)(M)))
 
 /// Permutes 128-bit data values stored in two 256-bit integer vectors,
 ///    as specified by the immediate integer operand.
@@ -1298,8 +1298,8 @@
 ///    destination.
 /// \returns A 256-bit integer vector containing the copied values.
 #define _mm256_permute2f128_si256(V1, V2, M) \
-  (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
-                                           (__v8si)(__m256i)(V2), (int)(M))
+  ((__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
+                                            (__v8si)(__m256i)(V2), (int)(M)))
 
 /* Vector Blend */
 /// Merges 64-bit double-precision data values stored in either of the
@@ -1327,8 +1327,8 @@
 ///    operand \a V2 is copied to the same position in the destination.
 /// \returns A 256-bit vector of [4 x double] containing the copied values.
 #define _mm256_blend_pd(V1, V2, M) \
-  (__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
-                                     (__v4df)(__m256d)(V2), (int)(M))
+  ((__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
+                                      (__v4df)(__m256d)(V2), (int)(M)))
 
 /// Merges 32-bit single-precision data values stored in either of the
 ///    two 256-bit vectors of [8 x float], as specified by the immediate
@@ -1355,8 +1355,8 @@
 ///    operand \a V2 is copied to the same position in the destination.
 /// \returns A 256-bit vector of [8 x float] containing the copied values.
 #define _mm256_blend_ps(V1, V2, M) \
-  (__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
-                                    (__v8sf)(__m256)(V2), (int)(M))
+  ((__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
+                                     (__v8sf)(__m256)(V2), (int)(M)))
 
 /// Merges 64-bit double-precision data values stored in either of the
 ///    two 256-bit vectors of [4 x double], as specified by the 256-bit vector
@@ -1453,8 +1453,8 @@
 ///    two parallel dot product computations.
 /// \returns A 256-bit vector of [8 x float] containing the two dot products.
 #define _mm256_dp_ps(V1, V2, M) \
-  (__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
-                                 (__v8sf)(__m256)(V2), (M))
+  ((__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
+                                  (__v8sf)(__m256)(V2), (M)))
 
 /* Vector shuffle */
 /// Selects 8 float values from the 256-bit operands of [8 x float], as
@@ -1507,8 +1507,8 @@
 ///    11: Bits [127:96] and [255:224] are copied from the selected operand.
 /// \returns A 256-bit vector of [8 x float] containing the shuffled values.
 #define _mm256_shuffle_ps(a, b, mask) \
-  (__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
-                                   (__v8sf)(__m256)(b), (int)(mask))
+  ((__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
+                                    (__v8sf)(__m256)(b), (int)(mask)))
 
 /// Selects four double-precision values from the 256-bit operands of
 ///    [4 x double], as specified by the immediate value operand.
@@ -1553,8 +1553,8 @@
 ///    destination.
 /// \returns A 256-bit vector of [4 x double] containing the shuffled values.
 #define _mm256_shuffle_pd(a, b, mask) \
-  (__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
-                                    (__v4df)(__m256d)(b), (int)(mask))
+  ((__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
+                                     (__v4df)(__m256d)(b), (int)(mask)))
 
 /* Compare */
 #define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
@@ -1647,8 +1647,8 @@
 ///    0x1F: True (unordered, signaling)
 /// \returns A 128-bit vector of [2 x double] containing the comparison results.
 #define _mm_cmp_pd(a, b, c) \
-  (__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
-                                (__v2df)(__m128d)(b), (c))
+  ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
+                                 (__v2df)(__m128d)(b), (c)))
 
 /// Compares each of the corresponding values of two 128-bit vectors of
 ///    [4 x float], using the operation specified by the immediate integer
@@ -1707,8 +1707,8 @@
 ///    0x1F: True (unordered, signaling)
 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 #define _mm_cmp_ps(a, b, c) \
-  (__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
-                               (__v4sf)(__m128)(b), (c))
+  ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
+                                (__v4sf)(__m128)(b), (c)))
 
 /// Compares each of the corresponding double-precision values of two
 ///    256-bit vectors of [4 x double], using the operation specified by the
@@ -1767,8 +1767,8 @@
 ///    0x1F: True (unordered, signaling)
 /// \returns A 256-bit vector of [4 x double] containing the comparison results.
 #define _mm256_cmp_pd(a, b, c) \
-  (__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
-                                   (__v4df)(__m256d)(b), (c))
+  ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
+                                    (__v4df)(__m256d)(b), (c)))
 
 /// Compares each of the corresponding values of two 256-bit vectors of
 ///    [8 x float], using the operation specified by the immediate integer
@@ -1827,8 +1827,8 @@
 ///    0x1F: True (unordered, signaling)
 /// \returns A 256-bit vector of [8 x float] containing the comparison results.
 #define _mm256_cmp_ps(a, b, c) \
-  (__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
-                                  (__v8sf)(__m256)(b), (c))
+  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
+                                   (__v8sf)(__m256)(b), (c)))
 
 /// Compares each of the corresponding scalar double-precision values of
 ///    two 128-bit vectors of [2 x double], using the operation specified by the
@@ -1886,8 +1886,8 @@
 ///    0x1F: True (unordered, signaling)
 /// \returns A 128-bit vector of [2 x double] containing the comparison results.
 #define _mm_cmp_sd(a, b, c) \
-  (__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
-                                (__v2df)(__m128d)(b), (c))
+  ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
+                                 (__v2df)(__m128d)(b), (c)))
 
 /// Compares each of the corresponding scalar values of two 128-bit
 ///    vectors of [4 x float], using the operation specified by the immediate
@@ -1945,8 +1945,8 @@
 ///    0x1F: True (unordered, signaling)
 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 #define _mm_cmp_ss(a, b, c) \
-  (__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
-                               (__v4sf)(__m128)(b), (c))
+  ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
+                                (__v4sf)(__m128)(b), (c)))
 
 /// Takes a [8 x i32] vector and returns the vector element value
 ///    indexed by the immediate constant operand.
@@ -1964,7 +1964,7 @@
 /// \returns A 32-bit integer containing the extracted 32 bits of extended
 ///    packed data.
 #define _mm256_extract_epi32(X, N) \
-  (int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N))
+  ((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)))
 
 /// Takes a [16 x i16] vector and returns the vector element value
 ///    indexed by the immediate constant operand.
@@ -1982,8 +1982,8 @@
 /// \returns A 32-bit integer containing the extracted 16 bits of zero extended
 ///    packed data.
 #define _mm256_extract_epi16(X, N) \
-  (int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
-                                                    (int)(N))
+  ((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
+                                                     (int)(N)))
 
 /// Takes a [32 x i8] vector and returns the vector element value
 ///    indexed by the immediate constant operand.
@@ -2001,8 +2001,8 @@
 /// \returns A 32-bit integer containing the extracted 8 bits of zero extended
 ///    packed data.
 #define _mm256_extract_epi8(X, N) \
-  (int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
-                                                   (int)(N))
+  ((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
+                                                    (int)(N)))
 
 #ifdef __x86_64__
 /// Takes a [4 x i64] vector and returns the vector element value
@@ -2021,7 +2021,7 @@
 /// \returns A 64-bit integer containing the extracted 64 bits of extended
 ///    packed data.
 #define _mm256_extract_epi64(X, N) \
-  (long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N))
+  ((long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N)))
 #endif
 
 /// Takes a [8 x i32] vector and replaces the vector element value
@@ -2043,8 +2043,8 @@
 /// \returns A copy of vector \a __a, after replacing its element indexed by
 ///    \a __imm with \a __b.
 #define _mm256_insert_epi32(X, I, N) \
-  (__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
-                                       (int)(I), (int)(N))
+  ((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
+                                        (int)(I), (int)(N)))
 
 
 /// Takes a [16 x i16] vector and replaces the vector element value
@@ -2066,8 +2066,8 @@
 /// \returns A copy of vector \a __a, after replacing its element indexed by
 ///    \a __imm with \a __b.
 #define _mm256_insert_epi16(X, I, N) \
-  (__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
-                                        (int)(I), (int)(N))
+  ((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
+                                         (int)(I), (int)(N)))
 
 /// Takes a [32 x i8] vector and replaces the vector element value
 ///    indexed by the immediate constant operand with a new value. Returns the
@@ -2088,8 +2088,8 @@
 /// \returns A copy of vector \a __a, after replacing its element indexed by
 ///    \a __imm with \a __b.
 #define _mm256_insert_epi8(X, I, N) \
-  (__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
-                                        (int)(I), (int)(N))
+  ((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
+                                         (int)(I), (int)(N)))
 
 #ifdef __x86_64__
 /// Takes a [4 x i64] vector and replaces the vector element value
@@ -2111,8 +2111,8 @@
 /// \returns A copy of vector \a __a, after replacing its element indexed by
 ///     \a __imm with \a __b.
 #define _mm256_insert_epi64(X, I, N) \
-  (__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
-                                       (long long)(I), (int)(N))
+  ((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
+                                        (long long)(I), (int)(N)))
 #endif
 
 /* Conversion */
@@ -4592,8 +4592,8 @@
 ///    result.
 /// \returns A 256-bit vector of [8 x float] containing the interleaved values.
 #define _mm256_insertf128_ps(V1, V2, M) \
-  (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \
-                                           (__v4sf)(__m128)(V2), (int)(M))
+  ((__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \
+                                            (__v4sf)(__m128)(V2), (int)(M)))
 
 /// Constructs a new 256-bit vector of [4 x double] by first duplicating
 ///    a 256-bit vector of [4 x double] given in the first parameter, and then
@@ -4630,8 +4630,8 @@
 ///    result.
 /// \returns A 256-bit vector of [4 x double] containing the interleaved values.
 #define _mm256_insertf128_pd(V1, V2, M) \
-  (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \
-                                            (__v2df)(__m128d)(V2), (int)(M))
+  ((__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \
+                                             (__v2df)(__m128d)(V2), (int)(M)))
 
 /// Constructs a new 256-bit integer vector by first duplicating a
 ///    256-bit integer vector given in the first parameter, and then replacing
@@ -4668,8 +4668,8 @@
 ///    result.
 /// \returns A 256-bit integer vector containing the interleaved values.
 #define _mm256_insertf128_si256(V1, V2, M) \
-  (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \
-                                            (__v4si)(__m128i)(V2), (int)(M))
+  ((__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \
+                                             (__v4si)(__m128i)(V2), (int)(M)))
 
 /*
    Vector extract.
@@ -4698,7 +4698,7 @@
 ///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
 /// \returns A 128-bit vector of [4 x float] containing the extracted bits.
 #define _mm256_extractf128_ps(V, M) \
-  (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M))
+  ((__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M)))
 
 /// Extracts either the upper or the lower 128 bits from a 256-bit vector
 ///    of [4 x double], as determined by the immediate integer parameter, and
@@ -4722,7 +4722,7 @@
 ///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
 /// \returns A 128-bit vector of [2 x double] containing the extracted bits.
 #define _mm256_extractf128_pd(V, M) \
-  (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M))
+  ((__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M)))
 
 /// Extracts either the upper or the lower 128 bits from a 256-bit
 ///    integer vector, as determined by the immediate integer parameter, and
@@ -4746,7 +4746,7 @@
 ///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
 /// \returns A 128-bit integer vector containing the extracted bits.
 #define _mm256_extractf128_si256(V, M) \
-  (__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M))
+  ((__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M)))
 
 /* SIMD load ops (unaligned) */
 /// Loads two 128-bit floating-point vectors of [4 x float] from
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -2818,10 +2818,10 @@
 ///    \a a.
 /// \returns A 128-bit integer vector containing the left-shifted value.
 #define _mm_slli_si128(a, imm) \
-  (__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))
+  ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
 
 #define _mm_bslli_si128(a, imm) \
-  (__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))
+  ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
 
 /// Left-shifts each 16-bit value in the 128-bit integer vector operand
 ///    by the specified number of bits. Low-order bits are cleared.
@@ -3035,10 +3035,10 @@
 ///    \a a.
 /// \returns A 128-bit integer vector containing the right-shifted value.
 #define _mm_srli_si128(a, imm) \
-  (__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))
+  ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
 
 #define _mm_bsrli_si128(a, imm) \
-  (__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))
+  ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
 
 /// Right-shifts each of 16-bit values in the 128-bit integer vector
 ///    operand by the specified number of bits. High-order bits are cleared.
@@ -4356,8 +4356,8 @@
 /// \returns An integer, whose lower 16 bits are selected from the 128-bit
 ///    integer vector parameter and the remaining bits are assigned zeros.
 #define _mm_extract_epi16(a, imm) \
-  (int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
-                                                   (int)(imm))
+  ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
+                                                    (int)(imm)))
 
 /// Constructs a 128-bit integer vector by first making a copy of the
 ///    128-bit integer vector parameter, and then inserting the lower 16 bits
@@ -4380,8 +4380,8 @@
 ///    lower 16 bits of \a __b are written.
 /// \returns A 128-bit integer vector containing the constructed values.
 #define _mm_insert_epi16(a, b, imm) \
-  (__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \
-                                       (int)(imm))
+  ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \
+                                        (int)(imm)))
 
 /// Copies the values of the most significant bits from each 8-bit
 ///    element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
@@ -4430,7 +4430,7 @@
 ///    11: assign values from bits [127:96] of \a a.
 /// \returns A 128-bit integer vector containing the shuffled values.
 #define _mm_shuffle_epi32(a, imm) \
-  (__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm))
+  ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))
 
 /// Constructs a 128-bit integer vector by shuffling four lower 16-bit
 ///    elements of a 128-bit integer vector of [8 x i16], using the immediate
@@ -4460,7 +4460,7 @@
 ///    11: assign values from bits [63:48] of \a a. \n
 /// \returns A 128-bit integer vector containing the shuffled values.
 #define _mm_shufflelo_epi16(a, imm) \
-  (__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm))
+  ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))
 
 /// Constructs a 128-bit integer vector by shuffling four upper 16-bit
 ///    elements of a 128-bit integer vector of [8 x i16], using the immediate
@@ -4490,7 +4490,7 @@
 ///    11: assign values from bits [127:112] of \a a. \n
 /// \returns A 128-bit integer vector containing the shuffled values.
 #define _mm_shufflehi_epi16(a, imm) \
-  (__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm))
+  ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))
 
 /// Unpacks the high-order (index 8-15) values from two 128-bit vectors
 ///    of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
@@ -4844,8 +4844,8 @@
 ///    Bit[1] = 1: upper element of \a b copied to upper element of result. \n
 /// \returns A 128-bit vector of [2 x double] containing the shuffled values.
 #define _mm_shuffle_pd(a, b, i) \
-  (__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
-                                 (int)(i))
+  ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
+                                  (int)(i)))
 
 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
 ///    floating-point vector of [4 x float].
diff --git a/clang/lib/Headers/smmintrin.h b/clang/lib/Headers/smmintrin.h
--- a/clang/lib/Headers/smmintrin.h
+++ b/clang/lib/Headers/smmintrin.h
@@ -231,7 +231,7 @@
 ///      11: Truncated
 /// \returns A 128-bit vector of [4 x float] containing the rounded values.
 #define _mm_round_ps(X, M) \
-  (__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M))
+  ((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)))
 
 /// Copies three upper elements of the first 128-bit vector operand to
 ///    the corresponding three upper elements of the 128-bit result vector of
@@ -272,8 +272,8 @@
 /// \returns A 128-bit vector of [4 x float] containing the copied and rounded
 ///    values.
 #define _mm_round_ss(X, Y, M) \
-  (__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
-                                 (__v4sf)(__m128)(Y), (M))
+  ((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
+                                  (__v4sf)(__m128)(Y), (M)))
 
 /// Rounds each element of the 128-bit vector of [2 x double] to an
 ///    integer value according to the rounding control specified by the second
@@ -306,7 +306,7 @@
 ///      11: Truncated
 /// \returns A 128-bit vector of [2 x double] containing the rounded values.
 #define _mm_round_pd(X, M) \
-  (__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M))
+  ((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)))
 
 /// Copies the upper element of the first 128-bit vector operand to the
 ///    corresponding upper element of the 128-bit result vector of [2 x double].
@@ -347,8 +347,8 @@
 /// \returns A 128-bit vector of [2 x double] containing the copied and rounded
 ///    values.
 #define _mm_round_sd(X, Y, M) \
-  (__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
-                                  (__v2df)(__m128d)(Y), (M))
+  ((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
+                                   (__v2df)(__m128d)(Y), (M)))
 
 /* SSE4 Packed Blending Intrinsics.  */
 /// Returns a 128-bit vector of [2 x double] where the values are
@@ -376,8 +376,8 @@
 ///    is copied to the same position in the result.
 /// \returns A 128-bit vector of [2 x double] containing the copied values.
 #define _mm_blend_pd(V1, V2, M) \
-  (__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(V1), \
-                                    (__v2df)(__m128d)(V2), (int)(M))
+  ((__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(V1), \
+                                     (__v2df)(__m128d)(V2), (int)(M)))
 
 /// Returns a 128-bit vector of [4 x float] where the values are selected
 ///    from either the first or second operand as specified by the third
@@ -404,8 +404,8 @@
 ///    is copied to the same position in the result.
 /// \returns A 128-bit vector of [4 x float] containing the copied values.
 #define _mm_blend_ps(V1, V2, M) \
-  (__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(V1), \
-                                   (__v4sf)(__m128)(V2), (int)(M))
+  ((__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(V1), \
+                                    (__v4sf)(__m128)(V2), (int)(M)))
 
 /// Returns a 128-bit vector of [2 x double] where the values are
 ///    selected from either the first or second operand as specified by the
@@ -513,8 +513,8 @@
 ///    is copied to the same position in the result.
 /// \returns A 128-bit vector of [8 x i16] containing the copied values.
 #define _mm_blend_epi16(V1, V2, M) \
-  (__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(V1), \
-                                       (__v8hi)(__m128i)(V2), (int)(M))
+  ((__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(V1), \
+                                        (__v8hi)(__m128i)(V2), (int)(M)))
 
 /* SSE4 Dword Multiply Instructions.  */
 /// Multiples corresponding elements of two 128-bit vectors of [4 x i32]
@@ -590,8 +590,8 @@
 ///    in the corresponding element; otherwise that element is set to zero.
 /// \returns A 128-bit vector of [4 x float] containing the dot product.
 #define _mm_dp_ps(X, Y, M) \
-  (__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
-                               (__v4sf)(__m128)(Y), (M))
+  ((__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
+                                (__v4sf)(__m128)(Y), (M)))
 
 /// Computes the dot product of the two 128-bit vectors of [2 x double]
 ///    and returns it in the elements of the 128-bit result vector of
@@ -625,8 +625,8 @@
 ///    each [2 x double] vector. If a bit is set, the dot product is returned in
 ///    the corresponding element; otherwise that element is set to zero.
 #define _mm_dp_pd(X, Y, M) \
-  (__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
-                                (__v2df)(__m128d)(Y), (M))
+  ((__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
+                                 (__v2df)(__m128d)(Y), (M)))
 
 /* SSE4 Streaming Load Hint Instruction.  */
 /// Loads integer values from a 128-bit aligned memory location to a
@@ -925,8 +925,8 @@
 ///    1111: Bits [127:120] of the result are used for insertion.
 /// \returns A 128-bit integer vector containing the constructed values.
 #define _mm_insert_epi8(X, I, N) \
-  (__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), \
-                                        (int)(I), (int)(N))
+  ((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), \
+                                         (int)(I), (int)(N)))
 
 /// Constructs a 128-bit vector of [4 x i32] by first making a copy of
 ///    the 128-bit integer vector parameter, and then inserting the 32-bit
@@ -957,8 +957,8 @@
 ///    11: Bits [127:96] of the result are used for insertion.
 /// \returns A 128-bit integer vector containing the constructed values.
 #define _mm_insert_epi32(X, I, N) \
-  (__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), \
-                                       (int)(I), (int)(N))
+  ((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), \
+                                        (int)(I), (int)(N)))
 
 #ifdef __x86_64__
 /// Constructs a 128-bit vector of [2 x i64] by first making a copy of
@@ -988,8 +988,8 @@
 ///    1: Bits [127:64] of the result are used for insertion. \n
 /// \returns A 128-bit integer vector containing the constructed values.
 #define _mm_insert_epi64(X, I, N) \
-  (__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), \
-                                       (long long)(I), (int)(N))
+  ((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), \
+                                        (long long)(I), (int)(N)))
 #endif /* __x86_64__ */
 
 /* Extract int from packed integer array at index.  This returns the element
@@ -1031,8 +1031,8 @@
 ///    128-bit integer vector parameter and the remaining bits are assigned
 ///    zeros.
 #define _mm_extract_epi8(X, N) \
-  (int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \
-                                                   (int)(N))
+  ((int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \
+                                                    (int)(N)))
 
 /// Extracts a 32-bit element from the 128-bit integer vector of
 ///    [4 x i32], using the immediate value parameter \a N as a selector.
@@ -1057,7 +1057,7 @@
 /// \returns  An integer, whose lower 32 bits are selected from the 128-bit
 ///    integer vector parameter and the remaining bits are assigned zeros.
 #define _mm_extract_epi32(X, N) \
-  (int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N))
+  ((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N)))
 
 #ifdef __x86_64__
 /// Extracts a 64-bit element from the 128-bit integer vector of
@@ -1080,7 +1080,7 @@
 ///    1: Bits [127:64] are returned. \n
 /// \returns  A 64-bit integer.
 #define _mm_extract_epi64(X, N) \
-  (long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N))
+  ((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N)))
 #endif /* __x86_64 */
 
 /* SSE4 128-bit Packed Integer Comparisons.  */
@@ -1514,8 +1514,8 @@
 /// \returns A 128-bit integer vector containing the sums of the sets of
 ///    absolute differences between both operands.
 #define _mm_mpsadbw_epu8(X, Y, M) \
-  (__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
-                                      (__v16qi)(__m128i)(Y), (M))
+  ((__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
+                                       (__v16qi)(__m128i)(Y), (M)))
 
 /// Finds the minimum unsigned 16-bit element in the input 128-bit
 ///    vector of [8 x u16] and returns it and along with its index.
@@ -1624,8 +1624,8 @@
 /// \returns Returns a 128-bit integer vector representing the result mask of
 ///    the comparison.
 #define _mm_cmpistrm(A, B, M) \
-  (__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
-                                       (__v16qi)(__m128i)(B), (int)(M))
+  ((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
+                                        (__v16qi)(__m128i)(B), (int)(M)))
 
 /// Uses the immediate operand \a M to perform a comparison of string
 ///    data with implicitly defined lengths that is contained in source operands
@@ -1678,8 +1678,8 @@
 ///      1: The index of the most significant set bit. \n
 /// \returns Returns an integer representing the result index of the comparison.
 #define _mm_cmpistri(A, B, M) \
-  (int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
-                                   (__v16qi)(__m128i)(B), (int)(M))
+  ((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
+                                   (__v16qi)(__m128i)(B), (int)(M)))
 
 /// Uses the immediate operand \a M to perform a comparison of string
 ///    data with explicitly defined lengths that is contained in source operands
@@ -1738,9 +1738,9 @@
 /// \returns Returns a 128-bit integer vector representing the result mask of
 ///    the comparison.
 #define _mm_cmpestrm(A, LA, B, LB, M) \
-  (__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
-                                       (__v16qi)(__m128i)(B), (int)(LB), \
-                                       (int)(M))
+  ((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
+                                        (__v16qi)(__m128i)(B), (int)(LB), \
+                                        (int)(M)))
 
 /// Uses the immediate operand \a M to perform a comparison of string
 ///    data with explicitly defined lengths that is contained in source operands
@@ -1797,9 +1797,9 @@
 ///      1: The index of the most significant set bit. \n
 /// \returns Returns an integer representing the result index of the comparison.
 #define _mm_cmpestri(A, LA, B, LB, M) \
-  (int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
-                                   (__v16qi)(__m128i)(B), (int)(LB), \
-                                   (int)(M))
+  ((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
+                                    (__v16qi)(__m128i)(B), (int)(LB), \
+                                    (int)(M)))
 
 /* SSE4.2 Packed Comparison Intrinsics and EFlag Reading.  */
 /// Uses the immediate operand \a M to perform a comparison of string
@@ -1849,8 +1849,8 @@
 /// \returns Returns 1 if the bit mask is zero and the length of the string in
 ///    \a B is the maximum; otherwise, returns 0.
 #define _mm_cmpistra(A, B, M) \
-  (int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
-                                    (__v16qi)(__m128i)(B), (int)(M))
+  ((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
+                                     (__v16qi)(__m128i)(B), (int)(M)))
 
 /// Uses the immediate operand \a M to perform a comparison of string
 ///    data with implicitly defined lengths that is contained in source operands
@@ -1898,8 +1898,8 @@
 ///          to the size of \a A or \a B.
 /// \returns Returns 1 if the bit mask is non-zero, otherwise, returns 0.
 #define _mm_cmpistrc(A, B, M) \
-  (int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
-                                    (__v16qi)(__m128i)(B), (int)(M))
+  ((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
+                                     (__v16qi)(__m128i)(B), (int)(M)))
 
 /// Uses the immediate operand \a M to perform a comparison of string
 ///    data with implicitly defined lengths that is contained in source operands
@@ -1946,8 +1946,8 @@
 ///          to the size of \a A or \a B. \n
 /// \returns Returns bit 0 of the resulting bit mask.
 #define _mm_cmpistro(A, B, M) \
-  (int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
-                                    (__v16qi)(__m128i)(B), (int)(M))
+  ((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
+                                     (__v16qi)(__m128i)(B), (int)(M)))
 
 /// Uses the immediate operand \a M to perform a comparison of string
 ///    data with implicitly defined lengths that is contained in source operands
@@ -1996,8 +1996,8 @@
 /// \returns Returns 1 if the length of the string in \a A is less than the
 ///    maximum, otherwise, returns 0.
 #define _mm_cmpistrs(A, B, M) \
-  (int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
-                                    (__v16qi)(__m128i)(B), (int)(M))
+  ((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
+                                     (__v16qi)(__m128i)(B), (int)(M)))
 
 /// Uses the immediate operand \a M to perform a comparison of string
 ///    data with implicitly defined lengths that is contained in source operands
@@ -2046,8 +2046,8 @@
 /// \returns Returns 1 if the length of the string in \a B is less than the
 ///    maximum, otherwise, returns 0.
 #define _mm_cmpistrz(A, B, M) \
-  (int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
-                                    (__v16qi)(__m128i)(B), (int)(M))
+  ((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
+                                     (__v16qi)(__m128i)(B), (int)(M)))
 
 /// Uses the immediate operand \a M to perform a comparison of string
 ///    data with explicitly defined lengths that is contained in source operands
@@ -2100,9 +2100,9 @@
 /// \returns Returns 1 if the bit mask is zero and the length of the string in
 ///    \a B is the maximum, otherwise, returns 0.
 #define _mm_cmpestra(A, LA, B, LB, M) \
-  (int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
-                                    (__v16qi)(__m128i)(B), (int)(LB), \
-                                    (int)(M))
+  ((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
+                                     (__v16qi)(__m128i)(B), (int)(LB), \
+                                     (int)(M)))
 
 /// Uses the immediate operand \a M to perform a comparison of string
 ///    data with explicitly defined lengths that is contained in source operands
@@ -2154,9 +2154,9 @@
 ///          to the size of \a A or \a B. \n
 /// \returns Returns 1 if the resulting mask is non-zero, otherwise, returns 0.
 #define _mm_cmpestrc(A, LA, B, LB, M) \
-  (int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
-                                    (__v16qi)(__m128i)(B), (int)(LB), \
-                                    (int)(M))
+  ((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
+                                     (__v16qi)(__m128i)(B), (int)(LB), \
+                                     (int)(M)))
 
 /// Uses the immediate operand \a M to perform a comparison of string
 ///    data with explicitly defined lengths that is contained in source operands
@@ -2207,9 +2207,9 @@
 ///          to the size of \a A or \a B.
 /// \returns Returns bit 0 of the resulting bit mask.
 #define _mm_cmpestro(A, LA, B, LB, M) \
-  (int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
-                                    (__v16qi)(__m128i)(B), (int)(LB), \
-                                    (int)(M))
+  ((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
+                                     (__v16qi)(__m128i)(B), (int)(LB), \
+                                     (int)(M)))
 
 /// Uses the immediate operand \a M to perform a comparison of string
 ///    data with explicitly defined lengths that is contained in source operands
@@ -2262,9 +2262,9 @@
 /// \returns Returns 1 if the length of the string in \a A is less than the
 ///    maximum, otherwise, returns 0.
 #define _mm_cmpestrs(A, LA, B, LB, M) \
-  (int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
-                                    (__v16qi)(__m128i)(B), (int)(LB), \
-                                    (int)(M))
+  ((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
+                                     (__v16qi)(__m128i)(B), (int)(LB), \
+                                     (int)(M)))
 
 /// Uses the immediate operand \a M to perform a comparison of string
 ///    data with explicitly defined lengths that is contained in source operands
@@ -2316,9 +2316,9 @@
 /// \returns Returns 1 if the length of the string in \a B is less than the
 ///    maximum, otherwise, returns 0.
 #define _mm_cmpestrz(A, LA, B, LB, M) \
-  (int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
-                                    (__v16qi)(__m128i)(B), (int)(LB), \
-                                    (int)(M))
+  ((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
+                                     (__v16qi)(__m128i)(B), (int)(LB), \
+                                     (int)(M)))
 
 /* SSE4.2 Compare Packed Data -- Greater Than.  */
 /// Compares each of the corresponding 64-bit values of the 128-bit
diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h
--- a/clang/lib/Headers/tmmintrin.h
+++ b/clang/lib/Headers/tmmintrin.h
@@ -145,8 +145,8 @@
 /// \returns A 128-bit integer vector containing the concatenated right-shifted
 ///    value.
 #define _mm_alignr_epi8(a, b, n) \
-  (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
-                                     (__v16qi)(__m128i)(b), (n))
+  ((__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
+                                      (__v16qi)(__m128i)(b), (n)))
 
 /// Concatenates the two 64-bit integer vector operands, and right-shifts
 ///    the result by the number of bytes specified in the immediate operand.
@@ -168,7 +168,7 @@
 /// \returns A 64-bit integer vector containing the concatenated right-shifted
 ///    value.
 #define _mm_alignr_pi8(a, b, n) \
-  (__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n))
+  ((__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)))
 
 /// Horizontally adds the adjacent pairs of values contained in 2 packed
 ///    128-bit vectors of [8 x i16].
diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h
--- a/clang/lib/Headers/xmmintrin.h
+++ b/clang/lib/Headers/xmmintrin.h
@@ -2181,7 +2181,7 @@
 ///    3: Bits [63:48] are copied to the destination.
 /// \returns A 16-bit integer containing the extracted 16 bits of packed data.
 #define _mm_extract_pi16(a, n) \
-  (int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n)
+  ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n))
 
 /// Copies data from the 64-bit vector of [4 x i16] to the destination,
 ///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
@@ -2212,7 +2212,7 @@
 /// \returns A 64-bit integer vector containing the copied packed data from the
 ///    operands.
 #define _mm_insert_pi16(a, d, n) \
-  (__m64)__builtin_ia32_vec_set_v4hi((__v4hi)a, (int)d, (int)n)
+  ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)a, (int)d, (int)n))
 
 /// Compares each of the corresponding packed 16-bit integer values of
 ///    the 64-bit integer vectors, and writes the greater value to the
@@ -2359,7 +2359,7 @@
 ///    11: assigned from bits [63:48] of \a a.
 /// \returns A 64-bit integer vector containing the shuffled values.
 #define _mm_shuffle_pi16(a, n) \
-  (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n))
+  ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
 
 /// Conditionally copies the values from each 8-bit element in the first
 ///    64-bit integer vector operand to the specified memory location, as
@@ -2601,8 +2601,8 @@
 ///    11: Bits [127:96] copied from the specified operand.
 /// \returns A 128-bit vector of [4 x float] containing the shuffled values.
 #define _mm_shuffle_ps(a, b, mask) \
-  (__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
-                                (int)(mask))
+  ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
+                                 (int)(mask)))
 
 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
 ///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
diff --git a/clang/test/CodeGen/X86/sse41-builtins.c b/clang/test/CodeGen/X86/sse41-builtins.c
--- a/clang/test/CodeGen/X86/sse41-builtins.c
+++ b/clang/test/CodeGen/X86/sse41-builtins.c
@@ -393,3 +393,11 @@
   // CHECK: call i32 @llvm.x86.sse41.ptestz(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_testz_si128(x, y);
 }
+
+// Make sure brackets work after macro intrinsics.
+float pr51324(__m128 a) {
+  // CHECK-LABEL: pr51324
+  // CHECK: call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 0)
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  return _mm_round_ps(a, 0)[0];
+}