Skip to content

Commit 0c351ab

Browse files
committed Mar 12, 2015
[X86, AVX] replace vextractf128 intrinsics with generic shuffles
This is very much like D8088 (checked in at r231792). Now that we've replaced the vinsertf128 intrinsics, do the same for their extract twins.

Differential Revision: http://reviews.llvm.org/D8275

llvm-svn: 232052
1 parent bfa4357 commit 0c351ab

File tree

5 files changed: +70 -26 lines changed
 

‎clang/include/clang/Basic/BuiltinsX86.def

-3
Original file line number | Diff line number | Diff line change
@@ -436,9 +436,6 @@ BUILTIN(__builtin_ia32_blendvps256, "V8fV8fV8fV8f", "")
436436
BUILTIN(__builtin_ia32_dpps256, "V8fV8fV8fIc", "")
437437
BUILTIN(__builtin_ia32_cmppd256, "V4dV4dV4dIc", "")
438438
BUILTIN(__builtin_ia32_cmpps256, "V8fV8fV8fIc", "")
439-
BUILTIN(__builtin_ia32_vextractf128_pd256, "V2dV4dIc", "")
440-
BUILTIN(__builtin_ia32_vextractf128_ps256, "V4fV8fIc", "")
441-
BUILTIN(__builtin_ia32_vextractf128_si256, "V4iV8iIc", "")
442439
BUILTIN(__builtin_ia32_cvtdq2pd256, "V4dV4i", "")
443440
BUILTIN(__builtin_ia32_cvtdq2ps256, "V8fV8i", "")
444441
BUILTIN(__builtin_ia32_cvtpd2ps256, "V4fV4d", "")

‎clang/lib/Headers/avxintrin.h

+28 -13
Original file line number | Diff line number | Diff line change
@@ -429,19 +429,6 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
429429
__m128 __b = (b); \
430430
(__m128)__builtin_ia32_cmpss((__v4sf)__a, (__v4sf)__b, (c)); })
431431

432-
/* Vector extract */
433-
#define _mm256_extractf128_pd(A, O) __extension__ ({ \
434-
__m256d __A = (A); \
435-
(__m128d)__builtin_ia32_vextractf128_pd256((__v4df)__A, (O)); })
436-
437-
#define _mm256_extractf128_ps(A, O) __extension__ ({ \
438-
__m256 __A = (A); \
439-
(__m128)__builtin_ia32_vextractf128_ps256((__v8sf)__A, (O)); })
440-
441-
#define _mm256_extractf128_si256(A, O) __extension__ ({ \
442-
__m256i __A = (A); \
443-
(__m128i)__builtin_ia32_vextractf128_si256((__v8si)__A, (O)); })
444-
445432
static __inline int __attribute__((__always_inline__, __nodebug__))
446433
_mm256_extract_epi32(__m256i __a, const int __imm)
447434
{
@@ -1186,6 +1173,34 @@ _mm256_castsi128_si256(__m128i __a)
11861173
(((M) & 1) ? 4 : 2), \
11871174
(((M) & 1) ? 5 : 3) );})
11881175

1176+
/*
1177+
Vector extract.
1178+
We use macros rather than inlines because we only want to accept
1179+
invocations where the immediate M is a constant expression.
1180+
*/
1181+
#define _mm256_extractf128_ps(V, M) __extension__ ({ \
1182+
(__m128)__builtin_shufflevector( \
1183+
(__v8sf)(V), \
1184+
(__v8sf)(V), \
1185+
(((M) & 1) ? 4 : 0), \
1186+
(((M) & 1) ? 5 : 1), \
1187+
(((M) & 1) ? 6 : 2), \
1188+
(((M) & 1) ? 7 : 3) );})
1189+
1190+
#define _mm256_extractf128_pd(V, M) __extension__ ({ \
1191+
(__m128d)__builtin_shufflevector( \
1192+
(__v4df)(V), \
1193+
(__v4df)(V), \
1194+
(((M) & 1) ? 2 : 0), \
1195+
(((M) & 1) ? 3 : 1) );})
1196+
1197+
#define _mm256_extractf128_si256(V, M) __extension__ ({ \
1198+
(__m128i)__builtin_shufflevector( \
1199+
(__v4di)(V), \
1200+
(__v4di)(V), \
1201+
(((M) & 1) ? 2 : 0), \
1202+
(((M) & 1) ? 3 : 1) );})
1203+
11891204
/* SIMD load ops (unaligned) */
11901205
static __inline __m256 __attribute__((__always_inline__, __nodebug__))
11911206
_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)

‎clang/lib/Sema/SemaChecking.cpp

-3
Original file line number | Diff line number | Diff line change
@@ -882,9 +882,6 @@ bool Sema::CheckX86BuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
882882
switch (BuiltinID) {
883883
default: return false;
884884
case X86::BI_mm_prefetch: i = 1; l = 0; u = 3; break;
885-
case X86::BI__builtin_ia32_vextractf128_pd256:
886-
case X86::BI__builtin_ia32_vextractf128_ps256:
887-
case X86::BI__builtin_ia32_vextractf128_si256:
888885
case X86::BI__builtin_ia32_extract128i256: i = 1, l = 0, u = 1; break;
889886
case X86::BI__builtin_ia32_insert128i256: i = 2, l = 0; u = 1; break;
890887
case X86::BI__builtin_ia32_sha1rnds4: i = 2, l = 0; u = 3; break;

‎clang/test/CodeGen/avx-shuffle-builtins.c

+42 -4
Original file line number | Diff line number | Diff line change
@@ -100,7 +100,7 @@ test_mm256_broadcast_ss(float const *__a) {
100100

101101
// Make sure we have the correct mask for each insertf128 case.
102102

103-
__m256d test_mm256_insertf128_ps_0(__m256 a, __m128 b) {
103+
__m256 test_mm256_insertf128_ps_0(__m256 a, __m128 b) {
104104
// CHECK-LABEL: @test_mm256_insertf128_ps_0
105105
// CHECK: shufflevector{{.*}}<i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
106106
return _mm256_insertf128_ps(a, b, 0);
@@ -112,13 +112,13 @@ __m256d test_mm256_insertf128_pd_0(__m256d a, __m128d b) {
112112
return _mm256_insertf128_pd(a, b, 0);
113113
}
114114

115-
__m256d test_mm256_insertf128_si256_0(__m256i a, __m128i b) {
115+
__m256i test_mm256_insertf128_si256_0(__m256i a, __m128i b) {
116116
// CHECK-LABEL: @test_mm256_insertf128_si256_0
117117
// CHECK: shufflevector{{.*}}<i32 4, i32 5, i32 2, i32 3>
118118
return _mm256_insertf128_si256(a, b, 0);
119119
}
120120

121-
__m256d test_mm256_insertf128_ps_1(__m256 a, __m128 b) {
121+
__m256 test_mm256_insertf128_ps_1(__m256 a, __m128 b) {
122122
// CHECK-LABEL: @test_mm256_insertf128_ps_1
123123
// CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
124124
return _mm256_insertf128_ps(a, b, 1);
@@ -130,9 +130,47 @@ __m256d test_mm256_insertf128_pd_1(__m256d a, __m128d b) {
130130
return _mm256_insertf128_pd(a, b, 1);
131131
}
132132

133-
__m256d test_mm256_insertf128_si256_1(__m256i a, __m128i b) {
133+
__m256i test_mm256_insertf128_si256_1(__m256i a, __m128i b) {
134134
// CHECK-LABEL: @test_mm256_insertf128_si256_1
135135
// CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 4, i32 5>
136136
return _mm256_insertf128_si256(a, b, 1);
137137
}
138138

139+
// Make sure we have the correct mask for each extractf128 case.
140+
141+
__m128 test_mm256_extractf128_ps_0(__m256 a) {
142+
// CHECK-LABEL: @test_mm256_extractf128_ps_0
143+
// CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 2, i32 3>
144+
return _mm256_extractf128_ps(a, 0);
145+
}
146+
147+
__m128d test_mm256_extractf128_pd_0(__m256d a) {
148+
// CHECK-LABEL: @test_mm256_extractf128_pd_0
149+
// CHECK: shufflevector{{.*}}<i32 0, i32 1>
150+
return _mm256_extractf128_pd(a, 0);
151+
}
152+
153+
__m128i test_mm256_extractf128_si256_0(__m256i a) {
154+
// CHECK-LABEL: @test_mm256_extractf128_si256_0
155+
// CHECK: shufflevector{{.*}}<i32 0, i32 1>
156+
return _mm256_extractf128_si256(a, 0);
157+
}
158+
159+
__m128 test_mm256_extractf128_ps_1(__m256 a) {
160+
// CHECK-LABEL: @test_mm256_extractf128_ps_1
161+
// CHECK: shufflevector{{.*}}<i32 4, i32 5, i32 6, i32 7>
162+
return _mm256_extractf128_ps(a, 1);
163+
}
164+
165+
__m128d test_mm256_extractf128_pd_1(__m256d a) {
166+
// CHECK-LABEL: @test_mm256_extractf128_pd_1
167+
// CHECK: shufflevector{{.*}}<i32 2, i32 3>
168+
return _mm256_extractf128_pd(a, 1);
169+
}
170+
171+
__m128i test_mm256_extractf128_si256_1(__m256i a) {
172+
// CHECK-LABEL: @test_mm256_extractf128_si256_1
173+
// CHECK: shufflevector{{.*}}<i32 2, i32 3>
174+
return _mm256_extractf128_si256(a, 1);
175+
}
176+

‎clang/test/CodeGen/builtins-x86.c

-3
Original file line number | Diff line number | Diff line change
@@ -405,9 +405,6 @@ void f0() {
405405
tmp_V8f = __builtin_ia32_dpps256(tmp_V8f, tmp_V8f, 0x7);
406406
tmp_V4d = __builtin_ia32_cmppd256(tmp_V4d, tmp_V4d, 0);
407407
tmp_V8f = __builtin_ia32_cmpps256(tmp_V8f, tmp_V8f, 0);
408-
tmp_V2d = __builtin_ia32_vextractf128_pd256(tmp_V4d, 0x1);
409-
tmp_V4f = __builtin_ia32_vextractf128_ps256(tmp_V8f, 0x1);
410-
tmp_V4i = __builtin_ia32_vextractf128_si256(tmp_V8i, 0x1);
411408
tmp_V4d = __builtin_ia32_cvtdq2pd256(tmp_V4i);
412409
tmp_V8f = __builtin_ia32_cvtdq2ps256(tmp_V8i);
413410
tmp_V4f = __builtin_ia32_cvtpd2ps256(tmp_V4d);

0 commit comments

Comments
 (0)
Please sign in to comment.