Replace the AVX vextractf128 builtins with generic shuffles.

The __builtin_ia32_vextractf128_{pd,ps,si}256 builtins are removed from
BuiltinsX86.def, together with their immediate-range cases in
SemaChecking.cpp and their uses in test/CodeGen/builtins-x86.c. The
_mm256_extractf128_{ps,pd,si256} macros are re-implemented on top of
__builtin_shufflevector. Each macro copies its vector argument into a typed
local so that the argument is evaluated exactly once, and the new tests pin
the expected shuffle mask for both values of the immediate. The return types
of the existing insertf128 tests are corrected as well.

Index: include/clang/Basic/BuiltinsX86.def
===================================================================
--- include/clang/Basic/BuiltinsX86.def
+++ include/clang/Basic/BuiltinsX86.def
@@ -436,9 +436,6 @@
 BUILTIN(__builtin_ia32_dpps256, "V8fV8fV8fIc", "")
 BUILTIN(__builtin_ia32_cmppd256, "V4dV4dV4dIc", "")
 BUILTIN(__builtin_ia32_cmpps256, "V8fV8fV8fIc", "")
-BUILTIN(__builtin_ia32_vextractf128_pd256, "V2dV4dIc", "")
-BUILTIN(__builtin_ia32_vextractf128_ps256, "V4fV8fIc", "")
-BUILTIN(__builtin_ia32_vextractf128_si256, "V4iV8iIc", "")
 BUILTIN(__builtin_ia32_cvtdq2pd256, "V4dV4i", "")
 BUILTIN(__builtin_ia32_cvtdq2ps256, "V8fV8i", "")
 BUILTIN(__builtin_ia32_cvtpd2ps256, "V4fV4d", "")
Index: lib/Headers/avxintrin.h
===================================================================
--- lib/Headers/avxintrin.h
+++ lib/Headers/avxintrin.h
@@ -429,19 +429,6 @@
   __m128 __b = (b); \
   (__m128)__builtin_ia32_cmpss((__v4sf)__a, (__v4sf)__b, (c)); })
 
-/* Vector extract */
-#define _mm256_extractf128_pd(A, O) __extension__ ({ \
-  __m256d __A = (A); \
-  (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)__A, (O)); })
-
-#define _mm256_extractf128_ps(A, O) __extension__ ({ \
-  __m256 __A = (A); \
-  (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)__A, (O)); })
-
-#define _mm256_extractf128_si256(A, O) __extension__ ({ \
-  __m256i __A = (A); \
-  (__m128i)__builtin_ia32_vextractf128_si256((__v8si)__A, (O)); })
-
 static __inline int __attribute__((__always_inline__, __nodebug__))
 _mm256_extract_epi32(__m256i __a, const int __imm)
 {
@@ -1186,6 +1173,39 @@
     (((M) & 1) ? 4 : 2), \
     (((M) & 1) ? 5 : 3) );})
 
+/*
+   Vector extract.
+   We use macros rather than inlines because we only want to accept
+   invocations where the immediate M is a constant expression.
+   Only bit 0 of M is used: 0 selects the low 128 bits of V, 1 the high
+   128 bits. V is copied into a local so it is evaluated exactly once.
+*/
+#define _mm256_extractf128_ps(V, M) __extension__ ({ \
+  __m256 __V = (V); \
+  (__m128)__builtin_shufflevector( \
+    (__v8sf)__V, \
+    (__v8sf)__V, \
+    (((M) & 1) ? 4 : 0), \
+    (((M) & 1) ? 5 : 1), \
+    (((M) & 1) ? 6 : 2), \
+    (((M) & 1) ? 7 : 3) );})
+
+#define _mm256_extractf128_pd(V, M) __extension__ ({ \
+  __m256d __V = (V); \
+  (__m128d)__builtin_shufflevector( \
+    (__v4df)__V, \
+    (__v4df)__V, \
+    (((M) & 1) ? 2 : 0), \
+    (((M) & 1) ? 3 : 1) );})
+
+#define _mm256_extractf128_si256(V, M) __extension__ ({ \
+  __m256i __V = (V); \
+  (__m128i)__builtin_shufflevector( \
+    (__v4di)__V, \
+    (__v4di)__V, \
+    (((M) & 1) ? 2 : 0), \
+    (((M) & 1) ? 3 : 1) );})
+
 /* SIMD load ops (unaligned) */
 static __inline __m256 __attribute__((__always_inline__, __nodebug__))
 _mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
Index: lib/Sema/SemaChecking.cpp
===================================================================
--- lib/Sema/SemaChecking.cpp
+++ lib/Sema/SemaChecking.cpp
@@ -878,9 +878,6 @@
   switch (BuiltinID) {
   default: return false;
   case X86::BI_mm_prefetch: i = 1; l = 0; u = 3; break;
-  case X86::BI__builtin_ia32_vextractf128_pd256:
-  case X86::BI__builtin_ia32_vextractf128_ps256:
-  case X86::BI__builtin_ia32_vextractf128_si256:
   case X86::BI__builtin_ia32_extract128i256: i = 1, l = 0, u = 1; break;
   case X86::BI__builtin_ia32_insert128i256: i = 2, l = 0; u = 1; break;
   case X86::BI__builtin_ia32_sha1rnds4: i = 2, l = 0; u = 3; break;
Index: test/CodeGen/avx-shuffle-builtins.c
===================================================================
--- test/CodeGen/avx-shuffle-builtins.c
+++ test/CodeGen/avx-shuffle-builtins.c
@@ -100,7 +100,7 @@
 
 // Make sure we have the correct mask for each insertf128 case.
 
-__m256d test_mm256_insertf128_ps_0(__m256 a, __m128 b) {
+__m256 test_mm256_insertf128_ps_0(__m256 a, __m128 b) {
   // CHECK-LABEL: @test_mm256_insertf128_ps_0
   // CHECK: shufflevector{{.*}}
   return _mm256_insertf128_ps(a, b, 0);
@@ -112,13 +112,13 @@
   return _mm256_insertf128_pd(a, b, 0);
 }
 
-__m256d test_mm256_insertf128_si256_0(__m256i a, __m128i b) {
+__m256i test_mm256_insertf128_si256_0(__m256i a, __m128i b) {
   // CHECK-LABEL: @test_mm256_insertf128_si256_0
   // CHECK: shufflevector{{.*}}
   return _mm256_insertf128_si256(a, b, 0);
 }
 
-__m256d test_mm256_insertf128_ps_1(__m256 a, __m128 b) {
+__m256 test_mm256_insertf128_ps_1(__m256 a, __m128 b) {
   // CHECK-LABEL: @test_mm256_insertf128_ps_1
   // CHECK: shufflevector{{.*}}
   return _mm256_insertf128_ps(a, b, 1);
@@ -130,9 +130,47 @@
   return _mm256_insertf128_pd(a, b, 1);
 }
 
-__m256d test_mm256_insertf128_si256_1(__m256i a, __m128i b) {
+__m256i test_mm256_insertf128_si256_1(__m256i a, __m128i b) {
   // CHECK-LABEL: @test_mm256_insertf128_si256_1
   // CHECK: shufflevector{{.*}}
   return _mm256_insertf128_si256(a, b, 1);
 }
 
+// Make sure we have the correct mask for each extractf128 case.
+
+__m128 test_mm256_extractf128_ps_0(__m256 a) {
+  // CHECK-LABEL: @test_mm256_extractf128_ps_0
+  // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3>
+  return _mm256_extractf128_ps(a, 0);
+}
+
+__m128d test_mm256_extractf128_pd_0(__m256d a) {
+  // CHECK-LABEL: @test_mm256_extractf128_pd_0
+  // CHECK: shufflevector{{.*}}<i32 0, i32 1>
+  return _mm256_extractf128_pd(a, 0);
+}
+
+__m128i test_mm256_extractf128_si256_0(__m256i a) {
+  // CHECK-LABEL: @test_mm256_extractf128_si256_0
+  // CHECK: shufflevector{{.*}}<i32 0, i32 1>
+  return _mm256_extractf128_si256(a, 0);
+}
+
+__m128 test_mm256_extractf128_ps_1(__m256 a) {
+  // CHECK-LABEL: @test_mm256_extractf128_ps_1
+  // CHECK: shufflevector{{.*}}<i32 4, i32 5, i32 6, i32 7>
+  return _mm256_extractf128_ps(a, 1);
+}
+
+__m128d test_mm256_extractf128_pd_1(__m256d a) {
+  // CHECK-LABEL: @test_mm256_extractf128_pd_1
+  // CHECK: shufflevector{{.*}}<i32 2, i32 3>
+  return _mm256_extractf128_pd(a, 1);
+}
+
+__m128i test_mm256_extractf128_si256_1(__m256i a) {
+  // CHECK-LABEL: @test_mm256_extractf128_si256_1
+  // CHECK: shufflevector{{.*}}<i32 2, i32 3>
+  return _mm256_extractf128_si256(a, 1);
+}
+
Index: test/CodeGen/builtins-x86.c
===================================================================
--- test/CodeGen/builtins-x86.c
+++ test/CodeGen/builtins-x86.c
@@ -405,9 +405,6 @@
   tmp_V8f = __builtin_ia32_dpps256(tmp_V8f, tmp_V8f, 0x7);
   tmp_V4d = __builtin_ia32_cmppd256(tmp_V4d, tmp_V4d, 0);
   tmp_V8f = __builtin_ia32_cmpps256(tmp_V8f, tmp_V8f, 0);
-  tmp_V2d = __builtin_ia32_vextractf128_pd256(tmp_V4d, 0x1);
-  tmp_V4f = __builtin_ia32_vextractf128_ps256(tmp_V8f, 0x1);
-  tmp_V4i = __builtin_ia32_vextractf128_si256(tmp_V8i, 0x1);
   tmp_V4d = __builtin_ia32_cvtdq2pd256(tmp_V4i);
   tmp_V8f = __builtin_ia32_cvtdq2ps256(tmp_V8i);
   tmp_V4f = __builtin_ia32_cvtpd2ps256(tmp_V4d);