diff --git a/clang/lib/Headers/__wmmintrin_aes.h b/clang/lib/Headers/__wmmintrin_aes.h --- a/clang/lib/Headers/__wmmintrin_aes.h +++ b/clang/lib/Headers/__wmmintrin_aes.h @@ -133,7 +133,7 @@ /// An 8-bit round constant used to generate the AES encryption key. /// \returns A 128-bit round key for AES encryption. #define _mm_aeskeygenassist_si128(C, R) \ - (__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(C), (int)(R)) + ((__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(C), (int)(R))) #undef __DEFAULT_FN_ATTRS diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h --- a/clang/lib/Headers/avx2intrin.h +++ b/clang/lib/Headers/avx2intrin.h @@ -20,8 +20,8 @@ /* SSE4 Multiple Packed Sums of Absolute Difference. */ #define _mm256_mpsadbw_epu8(X, Y, M) \ - (__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \ - (__v32qi)(__m256i)(Y), (int)(M)) + ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \ + (__v32qi)(__m256i)(Y), (int)(M))) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_abs_epi8(__m256i __a) @@ -114,8 +114,8 @@ } #define _mm256_alignr_epi8(a, b, n) \ - (__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \ - (__v32qi)(__m256i)(b), (n)) + ((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \ + (__v32qi)(__m256i)(b), (n))) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_and_si256(__m256i __a, __m256i __b) @@ -149,8 +149,8 @@ } #define _mm256_blend_epi16(V1, V2, M) \ - (__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \ - (__v16hi)(__m256i)(V2), (int)(M)) + ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \ + (__v16hi)(__m256i)(V2), (int)(M))) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpeq_epi8(__m256i __a, __m256i __b) @@ -467,13 +467,13 @@ } #define _mm256_shuffle_epi32(a, imm) \ - (__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm)) + ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm))) #define _mm256_shufflehi_epi16(a, imm) \ - (__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm)) + ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm))) #define _mm256_shufflelo_epi16(a, imm) \ - (__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm)) + ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm))) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi8(__m256i __a, __m256i __b) @@ -494,10 +494,10 @@ } #define _mm256_slli_si256(a, imm) \ - (__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)) + ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))) #define _mm256_bslli_epi128(a, imm) \ - (__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)) + ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_slli_epi16(__m256i __a, int __count) @@ -560,10 +560,10 @@ } #define _mm256_srli_si256(a, imm) \ - (__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)) + ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))) #define _mm256_bsrli_epi128(a, imm) \ - (__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)) + ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srli_epi16(__m256i __a, int __count) @@ -743,12 +743,12 @@ #define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X) #define _mm_blend_epi32(V1, V2, M) \ - (__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \ - (__v4si)(__m128i)(V2), (int)(M)) + ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \ + (__v4si)(__m128i)(V2), (int)(M))) #define _mm256_blend_epi32(V1, V2, M) \ - (__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \ - (__v8si)(__m256i)(V2), (int)(M)) + ((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \ + (__v8si)(__m256i)(V2), (int)(M))) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_broadcastb_epi8(__m128i __X) @@ -806,7 +806,7 @@ } #define _mm256_permute4x64_pd(V, M) \ - (__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M)) + ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M))) static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_permutevar8x32_ps(__m256 __a, __m256i __b) @@ -815,17 +815,17 @@ } #define _mm256_permute4x64_epi64(V, M) \ - (__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M)) + ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M))) #define _mm256_permute2x128_si256(V1, V2, M) \ - (__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M)) + ((__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M))) #define _mm256_extracti128_si256(V, M) \ - (__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M)) + ((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M))) #define _mm256_inserti128_si256(V1, V2, M) \ - (__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \ - (__v2di)(__m128i)(V2), (int)(M)) + ((__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \ + (__v2di)(__m128i)(V2), (int)(M))) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskload_epi32(int const *__X, __m256i __M) @@ -936,211 +936,211 @@ } #define _mm_mask_i32gather_pd(a, m, i, mask, s) \ - (__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128i)(a), \ - (double const *)(m), \ - (__v4si)(__m128i)(i), \ - (__v2df)(__m128d)(mask), (s)) + ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128i)(a), \ + (double const *)(m), \ + (__v4si)(__m128i)(i), \ + (__v2df)(__m128d)(mask), (s))) #define _mm256_mask_i32gather_pd(a, m, i, mask, s) \ - (__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \ - (double const *)(m), \ - (__v4si)(__m128i)(i), \ - (__v4df)(__m256d)(mask), (s)) + ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \ + (double const *)(m), \ + (__v4si)(__m128i)(i), \ + (__v4df)(__m256d)(mask), (s))) #define _mm_mask_i64gather_pd(a, m, i, mask, s) \ - (__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \ - (double const *)(m), \ - (__v2di)(__m128i)(i), \ - (__v2df)(__m128d)(mask), (s)) + ((__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \ + (double const *)(m), \ + (__v2di)(__m128i)(i), \ + (__v2df)(__m128d)(mask), (s))) #define _mm256_mask_i64gather_pd(a, m, i, mask, s) \ - (__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \ - (double const *)(m), \ - (__v4di)(__m256i)(i), \ - (__v4df)(__m256d)(mask), (s)) + ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \ + (double const *)(m), \ + (__v4di)(__m256i)(i), \ + (__v4df)(__m256d)(mask), (s))) #define _mm_mask_i32gather_ps(a, m, i, mask, s) \ - (__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \ - (float const *)(m), \ - (__v4si)(__m128i)(i), \ - (__v4sf)(__m128)(mask), (s)) + ((__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \ + (float const *)(m), \ + (__v4si)(__m128i)(i), \ + (__v4sf)(__m128)(mask), (s))) #define _mm256_mask_i32gather_ps(a, m, i, mask, s) \ - (__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \ - (float const *)(m), \ - (__v8si)(__m256i)(i), \ - (__v8sf)(__m256)(mask), (s)) + ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \ + (float const *)(m), \ + (__v8si)(__m256i)(i), \ + (__v8sf)(__m256)(mask), (s))) #define _mm_mask_i64gather_ps(a, m, i, mask, s) \ - (__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \ - (float const *)(m), \ - (__v2di)(__m128i)(i), \ - (__v4sf)(__m128)(mask), (s)) + ((__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \ + (float const *)(m), \ + (__v2di)(__m128i)(i), \ + (__v4sf)(__m128)(mask), (s))) #define _mm256_mask_i64gather_ps(a, m, i, mask, s) \ - (__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \ - (float const *)(m), \ - (__v4di)(__m256i)(i), \ - (__v4sf)(__m128)(mask), (s)) + ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \ + (float const *)(m), \ + (__v4di)(__m256i)(i), \ + (__v4sf)(__m128)(mask), (s))) #define _mm_mask_i32gather_epi32(a, m, i, mask, s) \ - (__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \ - (int const *)(m), \ - (__v4si)(__m128i)(i), \ - (__v4si)(__m128i)(mask), (s)) + ((__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \ + (int const *)(m), \ + (__v4si)(__m128i)(i), \ + (__v4si)(__m128i)(mask), (s))) #define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \ - (__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \ - (int const *)(m), \ - (__v8si)(__m256i)(i), \ - (__v8si)(__m256i)(mask), (s)) + ((__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \ + (int const *)(m), \ + (__v8si)(__m256i)(i), \ + (__v8si)(__m256i)(mask), (s))) #define _mm_mask_i64gather_epi32(a, m, i, mask, s) \ - (__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \ - (int const *)(m), \ - (__v2di)(__m128i)(i), \ - (__v4si)(__m128i)(mask), (s)) + ((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \ + (int const *)(m), \ + (__v2di)(__m128i)(i), \ + (__v4si)(__m128i)(mask), (s))) #define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \ - (__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \ - (int const *)(m), \ - (__v4di)(__m256i)(i), \ - (__v4si)(__m128i)(mask), (s)) + ((__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \ + (int const *)(m), \ + (__v4di)(__m256i)(i), \ + (__v4si)(__m128i)(mask), (s))) #define _mm_mask_i32gather_epi64(a, m, i, mask, s) \ - (__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \ - (long long const *)(m), \ - (__v4si)(__m128i)(i), \ - (__v2di)(__m128i)(mask), (s)) + ((__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \ + (long long const *)(m), \ + (__v4si)(__m128i)(i), \ + (__v2di)(__m128i)(mask), (s))) #define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \ - (__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \ - (long long const *)(m), \ - (__v4si)(__m128i)(i), \ - (__v4di)(__m256i)(mask), (s)) + ((__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \ + (long long const *)(m), \ + (__v4si)(__m128i)(i), \ + (__v4di)(__m256i)(mask), (s))) #define _mm_mask_i64gather_epi64(a, m, i, mask, s) \ - (__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \ - (long long const *)(m), \ - (__v2di)(__m128i)(i), \ - (__v2di)(__m128i)(mask), (s)) + ((__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \ + (long long const *)(m), \ + (__v2di)(__m128i)(i), \ + (__v2di)(__m128i)(mask), (s))) #define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \ - (__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \ - (long long const *)(m), \ - (__v4di)(__m256i)(i), \ - (__v4di)(__m256i)(mask), (s)) + ((__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \ + (long long const *)(m), \ + (__v4di)(__m256i)(i), \ + (__v4di)(__m256i)(mask), (s))) #define _mm_i32gather_pd(m, i, s) \ - (__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \ - (double const *)(m), \ - (__v4si)(__m128i)(i), \ - (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \ - _mm_setzero_pd()), \ - (s)) + ((__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \ + (double const *)(m), \ + (__v4si)(__m128i)(i), \ + (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \ + _mm_setzero_pd()), \ + (s))) #define _mm256_i32gather_pd(m, i, s) \ - (__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \ - (double const *)(m), \ - (__v4si)(__m128i)(i), \ - (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \ - _mm256_setzero_pd(), \ - _CMP_EQ_OQ), \ - (s)) + ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \ + (double const *)(m), \ + (__v4si)(__m128i)(i), \ + (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \ + _mm256_setzero_pd(), \ + _CMP_EQ_OQ), \ + (s))) #define _mm_i64gather_pd(m, i, s) \ - (__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \ - (double const *)(m), \ - (__v2di)(__m128i)(i), \ - (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \ - _mm_setzero_pd()), \ - (s)) + ((__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \ + (double const *)(m), \ + (__v2di)(__m128i)(i), \ + (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \ + _mm_setzero_pd()), \ + (s))) #define _mm256_i64gather_pd(m, i, s) \ - (__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \ - (double const *)(m), \ - (__v4di)(__m256i)(i), \ - (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \ - _mm256_setzero_pd(), \ - _CMP_EQ_OQ), \ - (s)) + ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \ + (double const *)(m), \ + (__v4di)(__m256i)(i), \ + (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \ + _mm256_setzero_pd(), \ + _CMP_EQ_OQ), \ + (s))) #define _mm_i32gather_ps(m, i, s) \ - (__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \ - (float const *)(m), \ - (__v4si)(__m128i)(i), \ - (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \ - _mm_setzero_ps()), \ - (s)) + ((__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \ + (float const *)(m), \ + (__v4si)(__m128i)(i), \ + (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \ + _mm_setzero_ps()), \ + (s))) #define _mm256_i32gather_ps(m, i, s) \ - (__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \ - (float const *)(m), \ - (__v8si)(__m256i)(i), \ - (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \ - _mm256_setzero_ps(), \ - _CMP_EQ_OQ), \ - (s)) + ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \ + (float const *)(m), \ + (__v8si)(__m256i)(i), \ + (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \ + _mm256_setzero_ps(), \ + _CMP_EQ_OQ), \ + (s))) #define _mm_i64gather_ps(m, i, s) \ - (__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \ - (float const *)(m), \ - (__v2di)(__m128i)(i), \ - (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \ - _mm_setzero_ps()), \ - (s)) + ((__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \ + (float const *)(m), \ + (__v2di)(__m128i)(i), \ + (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \ + _mm_setzero_ps()), \ + (s))) #define _mm256_i64gather_ps(m, i, s) \ - (__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \ - (float const *)(m), \ - (__v4di)(__m256i)(i), \ - (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \ - _mm_setzero_ps()), \ - (s)) + ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \ + (float const *)(m), \ + (__v4di)(__m256i)(i), \ + (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \ + _mm_setzero_ps()), \ + (s))) #define _mm_i32gather_epi32(m, i, s) \ - (__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \ - (int const *)(m), (__v4si)(__m128i)(i), \ - (__v4si)_mm_set1_epi32(-1), (s)) + ((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \ + (int const *)(m), (__v4si)(__m128i)(i), \ + (__v4si)_mm_set1_epi32(-1), (s))) #define _mm256_i32gather_epi32(m, i, s) \ - (__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \ - (int const *)(m), (__v8si)(__m256i)(i), \ - (__v8si)_mm256_set1_epi32(-1), (s)) + ((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \ + (int const *)(m), (__v8si)(__m256i)(i), \ + (__v8si)_mm256_set1_epi32(-1), (s))) #define _mm_i64gather_epi32(m, i, s) \ - (__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \ - (int const *)(m), (__v2di)(__m128i)(i), \ - (__v4si)_mm_set1_epi32(-1), (s)) + ((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \ + (int const *)(m), (__v2di)(__m128i)(i), \ + (__v4si)_mm_set1_epi32(-1), (s))) #define _mm256_i64gather_epi32(m, i, s) \ - (__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \ - (int const *)(m), (__v4di)(__m256i)(i), \ - (__v4si)_mm_set1_epi32(-1), (s)) + ((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \ + (int const *)(m), (__v4di)(__m256i)(i), \ + (__v4si)_mm_set1_epi32(-1), (s))) #define _mm_i32gather_epi64(m, i, s) \ - (__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \ - (long long const *)(m), \ - (__v4si)(__m128i)(i), \ - (__v2di)_mm_set1_epi64x(-1), (s)) + ((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \ + (long long const *)(m), \ + (__v4si)(__m128i)(i), \ + (__v2di)_mm_set1_epi64x(-1), (s))) #define _mm256_i32gather_epi64(m, i, s) \ - (__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \ - (long long const *)(m), \ - (__v4si)(__m128i)(i), \ - (__v4di)_mm256_set1_epi64x(-1), (s)) + ((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \ + (long long const *)(m), \ + (__v4si)(__m128i)(i), \ + (__v4di)_mm256_set1_epi64x(-1), (s))) #define _mm_i64gather_epi64(m, i, s) \ - (__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \ - (long long const *)(m), \ - (__v2di)(__m128i)(i), \ - (__v2di)_mm_set1_epi64x(-1), (s)) + ((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \ + (long long const *)(m), \ + (__v2di)(__m128i)(i), \ + (__v2di)_mm_set1_epi64x(-1), (s))) #define _mm256_i64gather_epi64(m, i, s) \ - (__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \ - (long long const *)(m), \ - (__v4di)(__m256i)(i), \ - (__v4di)_mm256_set1_epi64x(-1), (s)) + ((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \ + (long long const *)(m), \ + (__v4di)(__m256i)(i), \ + (__v4di)_mm256_set1_epi64x(-1), (s))) #undef __DEFAULT_FN_ATTRS256 #undef __DEFAULT_FN_ATTRS128 diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h --- a/clang/lib/Headers/avxintrin.h +++ b/clang/lib/Headers/avxintrin.h @@ -400,7 +400,7 @@ /// 11: Truncated. /// \returns A 256-bit vector of [4 x double] containing the rounded values. #define _mm256_round_pd(V, M) \ - (__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)) + ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M))) /// Rounds the values stored in a 256-bit vector of [8 x float] as /// specified by the byte operand. The source values are rounded to integer @@ -432,7 +432,7 @@ /// 11: Truncated. /// \returns A 256-bit vector of [8 x float] containing the rounded values. #define _mm256_round_ps(V, M) \ - (__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)) + ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M))) /// Rounds up the values stored in a 256-bit vector of [4 x double]. The /// source values are rounded up to integer values and returned as 64-bit @@ -989,7 +989,7 @@ /// returned vector. /// \returns A 128-bit vector of [2 x double] containing the copied values. #define _mm_permute_pd(A, C) \ - (__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C)) + ((__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C))) /// Copies the values in a 256-bit vector of [4 x double] as specified by /// the immediate integer operand. @@ -1029,7 +1029,7 @@ /// returned vector. /// \returns A 256-bit vector of [4 x double] containing the copied values. #define _mm256_permute_pd(A, C) \ - (__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C)) + ((__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C))) /// Copies the values in a 128-bit vector of [4 x float] as specified by /// the immediate integer operand. @@ -1085,7 +1085,7 @@ /// returned vector. /// \returns A 128-bit vector of [4 x float] containing the copied values. #define _mm_permute_ps(A, C) \ - (__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C)) + ((__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C))) /// Copies the values in a 256-bit vector of [8 x float] as specified by /// the immediate integer operand. @@ -1177,7 +1177,7 @@ /// returned vector. /// \returns A 256-bit vector of [8 x float] containing the copied values. #define _mm256_permute_ps(A, C) \ - (__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C)) + ((__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C))) /// Permutes 128-bit data values stored in two 256-bit vectors of /// [4 x double], as specified by the immediate integer operand. @@ -1217,8 +1217,8 @@ /// destination. /// \returns A 256-bit vector of [4 x double] containing the copied values. #define _mm256_permute2f128_pd(V1, V2, M) \ - (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \ - (__v4df)(__m256d)(V2), (int)(M)) + ((__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \ + (__v4df)(__m256d)(V2), (int)(M))) /// Permutes 128-bit data values stored in two 256-bit vectors of /// [8 x float], as specified by the immediate integer operand. @@ -1258,8 +1258,8 @@ /// destination. /// \returns A 256-bit vector of [8 x float] containing the copied values. #define _mm256_permute2f128_ps(V1, V2, M) \ - (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \ - (__v8sf)(__m256)(V2), (int)(M)) + ((__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \ + (__v8sf)(__m256)(V2), (int)(M))) /// Permutes 128-bit data values stored in two 256-bit integer vectors, /// as specified by the immediate integer operand. @@ -1298,8 +1298,8 @@ /// destination. /// \returns A 256-bit integer vector containing the copied values. #define _mm256_permute2f128_si256(V1, V2, M) \ - (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \ - (__v8si)(__m256i)(V2), (int)(M)) + ((__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \ + (__v8si)(__m256i)(V2), (int)(M))) /* Vector Blend */ /// Merges 64-bit double-precision data values stored in either of the @@ -1327,8 +1327,8 @@ /// operand \a V2 is copied to the same position in the destination. /// \returns A 256-bit vector of [4 x double] containing the copied values. #define _mm256_blend_pd(V1, V2, M) \ - (__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \ - (__v4df)(__m256d)(V2), (int)(M)) + ((__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \ + (__v4df)(__m256d)(V2), (int)(M))) /// Merges 32-bit single-precision data values stored in either of the /// two 256-bit vectors of [8 x float], as specified by the immediate @@ -1355,8 +1355,8 @@ /// operand \a V2 is copied to the same position in the destination. /// \returns A 256-bit vector of [8 x float] containing the copied values. #define _mm256_blend_ps(V1, V2, M) \ - (__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \ - (__v8sf)(__m256)(V2), (int)(M)) + ((__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \ + (__v8sf)(__m256)(V2), (int)(M))) /// Merges 64-bit double-precision data values stored in either of the /// two 256-bit vectors of [4 x double], as specified by the 256-bit vector @@ -1453,8 +1453,8 @@ /// two parallel dot product computations. /// \returns A 256-bit vector of [8 x float] containing the two dot products. #define _mm256_dp_ps(V1, V2, M) \ - (__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \ - (__v8sf)(__m256)(V2), (M)) + ((__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \ + (__v8sf)(__m256)(V2), (M))) /* Vector shuffle */ /// Selects 8 float values from the 256-bit operands of [8 x float], as @@ -1507,8 +1507,8 @@ /// 11: Bits [127:96] and [255:224] are copied from the selected operand. /// \returns A 256-bit vector of [8 x float] containing the shuffled values. #define _mm256_shuffle_ps(a, b, mask) \ - (__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \ - (__v8sf)(__m256)(b), (int)(mask)) + ((__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \ + (__v8sf)(__m256)(b), (int)(mask))) /// Selects four double-precision values from the 256-bit operands of /// [4 x double], as specified by the immediate value operand. @@ -1553,8 +1553,8 @@ /// destination. /// \returns A 256-bit vector of [4 x double] containing the shuffled values. #define _mm256_shuffle_pd(a, b, mask) \ - (__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \ - (__v4df)(__m256d)(b), (int)(mask)) + ((__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \ + (__v4df)(__m256d)(b), (int)(mask))) /* Compare */ #define _CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */ @@ -1647,8 +1647,8 @@ /// 0x1F: True (unordered, signaling) /// \returns A 128-bit vector of [2 x double] containing the comparison results. #define _mm_cmp_pd(a, b, c) \ - (__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \ - (__v2df)(__m128d)(b), (c)) + ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \ + (__v2df)(__m128d)(b), (c))) /// Compares each of the corresponding values of two 128-bit vectors of /// [4 x float], using the operation specified by the immediate integer @@ -1707,8 +1707,8 @@ /// 0x1F: True (unordered, signaling) /// \returns A 128-bit vector of [4 x float] containing the comparison results. #define _mm_cmp_ps(a, b, c) \ - (__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \ - (__v4sf)(__m128)(b), (c)) + ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \ + (__v4sf)(__m128)(b), (c))) /// Compares each of the corresponding double-precision values of two /// 256-bit vectors of [4 x double], using the operation specified by the @@ -1767,8 +1767,8 @@ /// 0x1F: True (unordered, signaling) /// \returns A 256-bit vector of [4 x double] containing the comparison results. #define _mm256_cmp_pd(a, b, c) \ - (__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \ - (__v4df)(__m256d)(b), (c)) + ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \ + (__v4df)(__m256d)(b), (c))) /// Compares each of the corresponding values of two 256-bit vectors of /// [8 x float], using the operation specified by the immediate integer @@ -1827,8 +1827,8 @@ /// 0x1F: True (unordered, signaling) /// \returns A 256-bit vector of [8 x float] containing the comparison results. #define _mm256_cmp_ps(a, b, c) \ - (__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \ - (__v8sf)(__m256)(b), (c)) + ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \ + (__v8sf)(__m256)(b), (c))) /// Compares each of the corresponding scalar double-precision values of /// two 128-bit vectors of [2 x double], using the operation specified by the @@ -1886,8 +1886,8 @@ /// 0x1F: True (unordered, signaling) /// \returns A 128-bit vector of [2 x double] containing the comparison results. #define _mm_cmp_sd(a, b, c) \ - (__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \ - (__v2df)(__m128d)(b), (c)) + ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \ + (__v2df)(__m128d)(b), (c))) /// Compares each of the corresponding scalar values of two 128-bit /// vectors of [4 x float], using the operation specified by the immediate @@ -1945,8 +1945,8 @@ /// 0x1F: True (unordered, signaling) /// \returns A 128-bit vector of [4 x float] containing the comparison results. #define _mm_cmp_ss(a, b, c) \ - (__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \ - (__v4sf)(__m128)(b), (c)) + ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \ + (__v4sf)(__m128)(b), (c))) /// Takes a [8 x i32] vector and returns the vector element value /// indexed by the immediate constant operand. @@ -1964,7 +1964,7 @@ /// \returns A 32-bit integer containing the extracted 32 bits of extended /// packed data. #define _mm256_extract_epi32(X, N) \ - (int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)) + ((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N))) /// Takes a [16 x i16] vector and returns the vector element value /// indexed by the immediate constant operand. @@ -1982,8 +1982,8 @@ /// \returns A 32-bit integer containing the extracted 16 bits of zero extended /// packed data. #define _mm256_extract_epi16(X, N) \ - (int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \ - (int)(N)) + ((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \ + (int)(N))) /// Takes a [32 x i8] vector and returns the vector element value /// indexed by the immediate constant operand. @@ -2001,8 +2001,8 @@ /// \returns A 32-bit integer containing the extracted 8 bits of zero extended /// packed data. #define _mm256_extract_epi8(X, N) \ - (int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \ - (int)(N)) + ((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \ + (int)(N))) #ifdef __x86_64__ /// Takes a [4 x i64] vector and returns the vector element value @@ -2021,7 +2021,7 @@ /// \returns A 64-bit integer containing the extracted 64 bits of extended /// packed data. #define _mm256_extract_epi64(X, N) \ - (long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N)) + ((long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N))) #endif /// Takes a [8 x i32] vector and replaces the vector element value @@ -2043,8 +2043,8 @@ /// \returns A copy of vector \a __a, after replacing its element indexed by /// \a __imm with \a __b. #define _mm256_insert_epi32(X, I, N) \ - (__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \ - (int)(I), (int)(N)) + ((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \ + (int)(I), (int)(N))) /// Takes a [16 x i16] vector and replaces the vector element value @@ -2066,8 +2066,8 @@ /// \returns A copy of vector \a __a, after replacing its element indexed by /// \a __imm with \a __b. #define _mm256_insert_epi16(X, I, N) \ - (__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \ - (int)(I), (int)(N)) + ((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \ + (int)(I), (int)(N))) /// Takes a [32 x i8] vector and replaces the vector element value /// indexed by the immediate constant operand with a new value. Returns the @@ -2088,8 +2088,8 @@ /// \returns A copy of vector \a __a, after replacing its element indexed by /// \a __imm with \a __b. #define _mm256_insert_epi8(X, I, N) \ - (__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \ - (int)(I), (int)(N)) + ((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \ + (int)(I), (int)(N))) #ifdef __x86_64__ /// Takes a [4 x i64] vector and replaces the vector element value @@ -2111,8 +2111,8 @@ /// \returns A copy of vector \a __a, after replacing its element indexed by /// \a __imm with \a __b. #define _mm256_insert_epi64(X, I, N) \ - (__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \ - (long long)(I), (int)(N)) + ((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \ + (long long)(I), (int)(N))) #endif /* Conversion */ @@ -4592,8 +4592,8 @@ /// result. /// \returns A 256-bit vector of [8 x float] containing the interleaved values. #define _mm256_insertf128_ps(V1, V2, M) \ - (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \ - (__v4sf)(__m128)(V2), (int)(M)) + ((__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \ + (__v4sf)(__m128)(V2), (int)(M))) /// Constructs a new 256-bit vector of [4 x double] by first duplicating /// a 256-bit vector of [4 x double] given in the first parameter, and then @@ -4630,8 +4630,8 @@ /// result. /// \returns A 256-bit vector of [4 x double] containing the interleaved values. #define _mm256_insertf128_pd(V1, V2, M) \ - (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \ - (__v2df)(__m128d)(V2), (int)(M)) + ((__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \ + (__v2df)(__m128d)(V2), (int)(M))) /// Constructs a new 256-bit integer vector by first duplicating a /// 256-bit integer vector given in the first parameter, and then replacing @@ -4668,8 +4668,8 @@ /// result. /// \returns A 256-bit integer vector containing the interleaved values. #define _mm256_insertf128_si256(V1, V2, M) \ - (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \ - (__v4si)(__m128i)(V2), (int)(M)) + ((__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \ + (__v4si)(__m128i)(V2), (int)(M))) /* Vector extract. @@ -4698,7 +4698,7 @@ /// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result. /// \returns A 128-bit vector of [4 x float] containing the extracted bits. #define _mm256_extractf128_ps(V, M) \ - (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M)) + ((__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M))) /// Extracts either the upper or the lower 128 bits from a 256-bit vector /// of [4 x double], as determined by the immediate integer parameter, and @@ -4722,7 +4722,7 @@ /// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result. /// \returns A 128-bit vector of [2 x double] containing the extracted bits. #define _mm256_extractf128_pd(V, M) \ - (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M)) + ((__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M))) /// Extracts either the upper or the lower 128 bits from a 256-bit /// integer vector, as determined by the immediate integer parameter, and @@ -4746,7 +4746,7 @@ /// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result. /// \returns A 128-bit integer vector containing the extracted bits. #define _mm256_extractf128_si256(V, M) \ - (__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M)) + ((__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M))) /* SIMD load ops (unaligned) */ /// Loads two 128-bit floating-point vectors of [4 x float] from diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h --- a/clang/lib/Headers/emmintrin.h +++ b/clang/lib/Headers/emmintrin.h @@ -2818,10 +2818,10 @@ /// \a a. /// \returns A 128-bit integer vector containing the left-shifted value. #define _mm_slli_si128(a, imm) \ - (__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)) + ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))) #define _mm_bslli_si128(a, imm) \ - (__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)) + ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))) /// Left-shifts each 16-bit value in the 128-bit integer vector operand /// by the specified number of bits. Low-order bits are cleared. @@ -3035,10 +3035,10 @@ /// \a a. /// \returns A 128-bit integer vector containing the right-shifted value. #define _mm_srli_si128(a, imm) \ - (__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)) + ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))) #define _mm_bsrli_si128(a, imm) \ - (__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)) + ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))) /// Right-shifts each of 16-bit values in the 128-bit integer vector /// operand by the specified number of bits. High-order bits are cleared. @@ -4356,8 +4356,8 @@ /// \returns An integer, whose lower 16 bits are selected from the 128-bit /// integer vector parameter and the remaining bits are assigned zeros. #define _mm_extract_epi16(a, imm) \ - (int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \ - (int)(imm)) + ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \ + (int)(imm))) /// Constructs a 128-bit integer vector by first making a copy of the /// 128-bit integer vector parameter, and then inserting the lower 16 bits @@ -4380,8 +4380,8 @@ /// lower 16 bits of \a __b are written. /// \returns A 128-bit integer vector containing the constructed values. #define _mm_insert_epi16(a, b, imm) \ - (__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \ - (int)(imm)) + ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \ + (int)(imm))) /// Copies the values of the most significant bits from each 8-bit /// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask @@ -4430,7 +4430,7 @@ /// 11: assign values from bits [127:96] of \a a. /// \returns A 128-bit integer vector containing the shuffled values. #define _mm_shuffle_epi32(a, imm) \ - (__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)) + ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm))) /// Constructs a 128-bit integer vector by shuffling four lower 16-bit /// elements of a 128-bit integer vector of [8 x i16], using the immediate @@ -4460,7 +4460,7 @@ /// 11: assign values from bits [63:48] of \a a. \n /// \returns A 128-bit integer vector containing the shuffled values. #define _mm_shufflelo_epi16(a, imm) \ - (__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)) + ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm))) /// Constructs a 128-bit integer vector by shuffling four upper 16-bit /// elements of a 128-bit integer vector of [8 x i16], using the immediate @@ -4490,7 +4490,7 @@ /// 11: assign values from bits [127:112] of \a a. \n /// \returns A 128-bit integer vector containing the shuffled values. #define _mm_shufflehi_epi16(a, imm) \ - (__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)) + ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm))) /// Unpacks the high-order (index 8-15) values from two 128-bit vectors /// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8]. @@ -4844,8 +4844,8 @@ /// Bit[1] = 1: upper element of \a b copied to upper element of result. \n /// \returns A 128-bit vector of [2 x double] containing the shuffled values. #define _mm_shuffle_pd(a, b, i) \ - (__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \ - (int)(i)) + ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \ + (int)(i))) /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit /// floating-point vector of [4 x float]. diff --git a/clang/lib/Headers/smmintrin.h b/clang/lib/Headers/smmintrin.h --- a/clang/lib/Headers/smmintrin.h +++ b/clang/lib/Headers/smmintrin.h @@ -231,7 +231,7 @@ /// 11: Truncated /// \returns A 128-bit vector of [4 x float] containing the rounded values. #define _mm_round_ps(X, M) \ - (__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)) + ((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M))) /// Copies three upper elements of the first 128-bit vector operand to /// the corresponding three upper elements of the 128-bit result vector of @@ -272,8 +272,8 @@ /// \returns A 128-bit vector of [4 x float] containing the copied and rounded /// values. #define _mm_round_ss(X, Y, M) \ - (__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (M)) + ((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (M))) /// Rounds each element of the 128-bit vector of [2 x double] to an /// integer value according to the rounding control specified by the second @@ -306,7 +306,7 @@ /// 11: Truncated /// \returns A 128-bit vector of [2 x double] containing the rounded values. #define _mm_round_pd(X, M) \ - (__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)) + ((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M))) /// Copies the upper element of the first 128-bit vector operand to the /// corresponding upper element of the 128-bit result vector of [2 x double]. @@ -347,8 +347,8 @@ /// \returns A 128-bit vector of [2 x double] containing the copied and rounded /// values. #define _mm_round_sd(X, Y, M) \ - (__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), (M)) + ((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (M))) /* SSE4 Packed Blending Intrinsics. */ /// Returns a 128-bit vector of [2 x double] where the values are @@ -376,8 +376,8 @@ /// is copied to the same position in the result. /// \returns A 128-bit vector of [2 x double] containing the copied values. #define _mm_blend_pd(V1, V2, M) \ - (__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(V1), \ - (__v2df)(__m128d)(V2), (int)(M)) + ((__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(V1), \ + (__v2df)(__m128d)(V2), (int)(M))) /// Returns a 128-bit vector of [4 x float] where the values are selected /// from either the first or second operand as specified by the third @@ -404,8 +404,8 @@ /// is copied to the same position in the result. /// \returns A 128-bit vector of [4 x float] containing the copied values. #define _mm_blend_ps(V1, V2, M) \ - (__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(V1), \ - (__v4sf)(__m128)(V2), (int)(M)) + ((__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(V1), \ + (__v4sf)(__m128)(V2), (int)(M))) /// Returns a 128-bit vector of [2 x double] where the values are /// selected from either the first or second operand as specified by the @@ -513,8 +513,8 @@ /// is copied to the same position in the result. /// \returns A 128-bit vector of [8 x i16] containing the copied values. #define _mm_blend_epi16(V1, V2, M) \ - (__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(V1), \ - (__v8hi)(__m128i)(V2), (int)(M)) + ((__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(V1), \ + (__v8hi)(__m128i)(V2), (int)(M))) /* SSE4 Dword Multiply Instructions. */ /// Multiples corresponding elements of two 128-bit vectors of [4 x i32] @@ -590,8 +590,8 @@ /// in the corresponding element; otherwise that element is set to zero. /// \returns A 128-bit vector of [4 x float] containing the dot product. #define _mm_dp_ps(X, Y, M) \ - (__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (M)) + ((__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (M))) /// Computes the dot product of the two 128-bit vectors of [2 x double] /// and returns it in the elements of the 128-bit result vector of @@ -625,8 +625,8 @@ /// each [2 x double] vector. If a bit is set, the dot product is returned in /// the corresponding element; otherwise that element is set to zero. #define _mm_dp_pd(X, Y, M) \ - (__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), (M)) + ((__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (M))) /* SSE4 Streaming Load Hint Instruction. */ /// Loads integer values from a 128-bit aligned memory location to a @@ -925,8 +925,8 @@ /// 1111: Bits [127:120] of the result are used for insertion. /// \returns A 128-bit integer vector containing the constructed values. #define _mm_insert_epi8(X, I, N) \ - (__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), \ - (int)(I), (int)(N)) + ((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), \ + (int)(I), (int)(N))) /// Constructs a 128-bit vector of [4 x i32] by first making a copy of /// the 128-bit integer vector parameter, and then inserting the 32-bit @@ -957,8 +957,8 @@ /// 11: Bits [127:96] of the result are used for insertion. /// \returns A 128-bit integer vector containing the constructed values. #define _mm_insert_epi32(X, I, N) \ - (__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), \ - (int)(I), (int)(N)) + ((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), \ + (int)(I), (int)(N))) #ifdef __x86_64__ /// Constructs a 128-bit vector of [2 x i64] by first making a copy of @@ -988,8 +988,8 @@ /// 1: Bits [127:64] of the result are used for insertion. \n /// \returns A 128-bit integer vector containing the constructed values. #define _mm_insert_epi64(X, I, N) \ - (__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), \ - (long long)(I), (int)(N)) + ((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), \ + (long long)(I), (int)(N))) #endif /* __x86_64__ */ /* Extract int from packed integer array at index. This returns the element @@ -1031,8 +1031,8 @@ /// 128-bit integer vector parameter and the remaining bits are assigned /// zeros. #define _mm_extract_epi8(X, N) \ - (int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \ - (int)(N)) + ((int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \ + (int)(N))) /// Extracts a 32-bit element from the 128-bit integer vector of /// [4 x i32], using the immediate value parameter \a N as a selector. @@ -1057,7 +1057,7 @@ /// \returns An integer, whose lower 32 bits are selected from the 128-bit /// integer vector parameter and the remaining bits are assigned zeros. #define _mm_extract_epi32(X, N) \ - (int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N)) + ((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N))) #ifdef __x86_64__ /// Extracts a 64-bit element from the 128-bit integer vector of @@ -1080,7 +1080,7 @@ /// 1: Bits [127:64] are returned. \n /// \returns A 64-bit integer. #define _mm_extract_epi64(X, N) \ - (long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N)) + ((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N))) #endif /* __x86_64 */ /* SSE4 128-bit Packed Integer Comparisons. */ @@ -1514,8 +1514,8 @@ /// \returns A 128-bit integer vector containing the sums of the sets of /// absolute differences between both operands. #define _mm_mpsadbw_epu8(X, Y, M) \ - (__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \ - (__v16qi)(__m128i)(Y), (M)) + ((__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \ + (__v16qi)(__m128i)(Y), (M))) /// Finds the minimum unsigned 16-bit element in the input 128-bit /// vector of [8 x u16] and returns it and along with its index. @@ -1624,8 +1624,8 @@ /// \returns Returns a 128-bit integer vector representing the result mask of /// the comparison. #define _mm_cmpistrm(A, B, M) \ - (__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \ - (__v16qi)(__m128i)(B), (int)(M)) + ((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), (int)(M))) /// Uses the immediate operand \a M to perform a comparison of string /// data with implicitly defined lengths that is contained in source operands @@ -1678,8 +1678,8 @@ /// 1: The index of the most significant set bit. \n /// \returns Returns an integer representing the result index of the comparison. #define _mm_cmpistri(A, B, M) \ - (int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \ - (__v16qi)(__m128i)(B), (int)(M)) + ((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), (int)(M))) /// Uses the immediate operand \a M to perform a comparison of string /// data with explicitly defined lengths that is contained in source operands @@ -1738,9 +1738,9 @@ /// \returns Returns a 128-bit integer vector representing the result mask of /// the comparison. #define _mm_cmpestrm(A, LA, B, LB, M) \ - (__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \ - (__v16qi)(__m128i)(B), (int)(LB), \ - (int)(M)) + ((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \ + (__v16qi)(__m128i)(B), (int)(LB), \ + (int)(M))) /// Uses the immediate operand \a M to perform a comparison of string /// data with explicitly defined lengths that is contained in source operands @@ -1797,9 +1797,9 @@ /// 1: The index of the most significant set bit. \n /// \returns Returns an integer representing the result index of the comparison. #define _mm_cmpestri(A, LA, B, LB, M) \ - (int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \ - (__v16qi)(__m128i)(B), (int)(LB), \ - (int)(M)) + ((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \ + (__v16qi)(__m128i)(B), (int)(LB), \ + (int)(M))) /* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */ /// Uses the immediate operand \a M to perform a comparison of string @@ -1849,8 +1849,8 @@ /// \returns Returns 1 if the bit mask is zero and the length of the string in /// \a B is the maximum; otherwise, returns 0. #define _mm_cmpistra(A, B, M) \ - (int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \ - (__v16qi)(__m128i)(B), (int)(M)) + ((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), (int)(M))) /// Uses the immediate operand \a M to perform a comparison of string /// data with implicitly defined lengths that is contained in source operands @@ -1898,8 +1898,8 @@ /// to the size of \a A or \a B. /// \returns Returns 1 if the bit mask is non-zero, otherwise, returns 0. #define _mm_cmpistrc(A, B, M) \ - (int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \ - (__v16qi)(__m128i)(B), (int)(M)) + ((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), (int)(M))) /// Uses the immediate operand \a M to perform a comparison of string /// data with implicitly defined lengths that is contained in source operands @@ -1946,8 +1946,8 @@ /// to the size of \a A or \a B. \n /// \returns Returns bit 0 of the resulting bit mask. #define _mm_cmpistro(A, B, M) \ - (int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \ - (__v16qi)(__m128i)(B), (int)(M)) + ((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), (int)(M))) /// Uses the immediate operand \a M to perform a comparison of string /// data with implicitly defined lengths that is contained in source operands @@ -1996,8 +1996,8 @@ /// \returns Returns 1 if the length of the string in \a A is less than the /// maximum, otherwise, returns 0. #define _mm_cmpistrs(A, B, M) \ - (int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \ - (__v16qi)(__m128i)(B), (int)(M)) + ((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), (int)(M))) /// Uses the immediate operand \a M to perform a comparison of string /// data with implicitly defined lengths that is contained in source operands @@ -2046,8 +2046,8 @@ /// \returns Returns 1 if the length of the string in \a B is less than the /// maximum, otherwise, returns 0. #define _mm_cmpistrz(A, B, M) \ - (int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \ - (__v16qi)(__m128i)(B), (int)(M)) + ((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), (int)(M))) /// Uses the immediate operand \a M to perform a comparison of string /// data with explicitly defined lengths that is contained in source operands @@ -2100,9 +2100,9 @@ /// \returns Returns 1 if the bit mask is zero and the length of the string in /// \a B is the maximum, otherwise, returns 0. #define _mm_cmpestra(A, LA, B, LB, M) \ - (int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \ - (__v16qi)(__m128i)(B), (int)(LB), \ - (int)(M)) + ((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \ + (__v16qi)(__m128i)(B), (int)(LB), \ + (int)(M))) /// Uses the immediate operand \a M to perform a comparison of string /// data with explicitly defined lengths that is contained in source operands @@ -2154,9 +2154,9 @@ /// to the size of \a A or \a B. \n /// \returns Returns 1 if the resulting mask is non-zero, otherwise, returns 0. #define _mm_cmpestrc(A, LA, B, LB, M) \ - (int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \ - (__v16qi)(__m128i)(B), (int)(LB), \ - (int)(M)) + ((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \ + (__v16qi)(__m128i)(B), (int)(LB), \ + (int)(M))) /// Uses the immediate operand \a M to perform a comparison of string /// data with explicitly defined lengths that is contained in source operands @@ -2207,9 +2207,9 @@ /// to the size of \a A or \a B. /// \returns Returns bit 0 of the resulting bit mask. #define _mm_cmpestro(A, LA, B, LB, M) \ - (int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \ - (__v16qi)(__m128i)(B), (int)(LB), \ - (int)(M)) + ((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \ + (__v16qi)(__m128i)(B), (int)(LB), \ + (int)(M))) /// Uses the immediate operand \a M to perform a comparison of string /// data with explicitly defined lengths that is contained in source operands @@ -2262,9 +2262,9 @@ /// \returns Returns 1 if the length of the string in \a A is less than the /// maximum, otherwise, returns 0. #define _mm_cmpestrs(A, LA, B, LB, M) \ - (int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \ - (__v16qi)(__m128i)(B), (int)(LB), \ - (int)(M)) + ((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \ + (__v16qi)(__m128i)(B), (int)(LB), \ + (int)(M))) /// Uses the immediate operand \a M to perform a comparison of string /// data with explicitly defined lengths that is contained in source operands @@ -2316,9 +2316,9 @@ /// \returns Returns 1 if the length of the string in \a B is less than the /// maximum, otherwise, returns 0. #define _mm_cmpestrz(A, LA, B, LB, M) \ - (int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \ - (__v16qi)(__m128i)(B), (int)(LB), \ - (int)(M)) + ((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \ + (__v16qi)(__m128i)(B), (int)(LB), \ + (int)(M))) /* SSE4.2 Compare Packed Data -- Greater Than. */ /// Compares each of the corresponding 64-bit values of the 128-bit diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h --- a/clang/lib/Headers/tmmintrin.h +++ b/clang/lib/Headers/tmmintrin.h @@ -145,8 +145,8 @@ /// \returns A 128-bit integer vector containing the concatenated right-shifted /// value. #define _mm_alignr_epi8(a, b, n) \ - (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \ - (__v16qi)(__m128i)(b), (n)) + ((__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \ + (__v16qi)(__m128i)(b), (n))) /// Concatenates the two 64-bit integer vector operands, and right-shifts /// the result by the number of bytes specified in the immediate operand. @@ -168,7 +168,7 @@ /// \returns A 64-bit integer vector containing the concatenated right-shifted /// value. #define _mm_alignr_pi8(a, b, n) \ - (__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)) + ((__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n))) /// Horizontally adds the adjacent pairs of values contained in 2 packed /// 128-bit vectors of [8 x i16]. diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h --- a/clang/lib/Headers/xmmintrin.h +++ b/clang/lib/Headers/xmmintrin.h @@ -2181,7 +2181,7 @@ /// 3: Bits [63:48] are copied to the destination. /// \returns A 16-bit integer containing the extracted 16 bits of packed data. #define _mm_extract_pi16(a, n) \ - (int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n) + ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n)) /// Copies data from the 64-bit vector of [4 x i16] to the destination, /// and inserts the lower 16-bits of an integer operand at the 16-bit offset @@ -2212,7 +2212,7 @@ /// \returns A 64-bit integer vector containing the copied packed data from the /// operands. #define _mm_insert_pi16(a, d, n) \ - (__m64)__builtin_ia32_vec_set_v4hi((__v4hi)a, (int)d, (int)n) + ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)a, (int)d, (int)n)) /// Compares each of the corresponding packed 16-bit integer values of /// the 64-bit integer vectors, and writes the greater value to the @@ -2359,7 +2359,7 @@ /// 11: assigned from bits [63:48] of \a a. /// \returns A 64-bit integer vector containing the shuffled values. #define _mm_shuffle_pi16(a, n) \ - (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)) + ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n))) /// Conditionally copies the values from each 8-bit element in the first /// 64-bit integer vector operand to the specified memory location, as @@ -2601,8 +2601,8 @@ /// 11: Bits [127:96] copied from the specified operand. /// \returns A 128-bit vector of [4 x float] containing the shuffled values. #define _mm_shuffle_ps(a, b, mask) \ - (__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \ - (int)(mask)) + ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \ + (int)(mask))) /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of /// [4 x float] and interleaves them into a 128-bit vector of [4 x float]. diff --git a/clang/test/CodeGen/X86/sse41-builtins.c b/clang/test/CodeGen/X86/sse41-builtins.c --- a/clang/test/CodeGen/X86/sse41-builtins.c +++ b/clang/test/CodeGen/X86/sse41-builtins.c @@ -393,3 +393,11 @@ // CHECK: call i32 @llvm.x86.sse41.ptestz(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}) return _mm_testz_si128(x, y); } + +// Make sure brackets work after macro intrinsics. +float pr51324(__m128 a) { + // CHECK-LABEL: pr51324 + // CHECK: call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 0) + // CHECK: extractelement <4 x float> %{{.*}}, i32 0 + return _mm_round_ps(a, 0)[0]; +}