Skip to content

Commit

Permalink
[Headers][X86] Use __builtin_shufflevector in AVX2 broadcasts.
Browse files (browse the repository at this point in the history)
This lets us optimize the broadcasts better. We agreed to remove the intrinsics
rather than combine them later because, even at -O0, we still generate the
expected instructions. Plus, it's a nice cleanup.

Differential Revision: http://reviews.llvm.org/D10556

llvm-svn: 245605
  • Loading branch information
ahmedbougacha committed Aug 20, 2015
1 parent 6511eb5 commit 5e354cb
Showing 3 changed files with 44 additions and 33 deletions.
11 changes: 0 additions & 11 deletions clang/include/clang/Basic/BuiltinsX86.def
Original file line number Diff line number Diff line change
@@ -590,17 +590,6 @@ TARGET_BUILTIN(__builtin_ia32_psrld256, "V8iV8iV4i", "", "avx2")
TARGET_BUILTIN(__builtin_ia32_psrlqi256, "V4LLiV4LLii", "", "avx2")
TARGET_BUILTIN(__builtin_ia32_psrlq256, "V4LLiV4LLiV2LLi", "", "avx2")
TARGET_BUILTIN(__builtin_ia32_movntdqa256, "V4LLiV4LLi*", "", "avx2")
TARGET_BUILTIN(__builtin_ia32_vbroadcastss_ps, "V4fV4f", "", "avx2")
TARGET_BUILTIN(__builtin_ia32_vbroadcastss_ps256, "V8fV4f", "", "avx2")
TARGET_BUILTIN(__builtin_ia32_vbroadcastsd_pd256, "V4dV2d", "", "avx2")
TARGET_BUILTIN(__builtin_ia32_pbroadcastb256, "V32cV16c", "", "avx2")
TARGET_BUILTIN(__builtin_ia32_pbroadcastw256, "V16sV8s", "", "avx2")
TARGET_BUILTIN(__builtin_ia32_pbroadcastd256, "V8iV4i", "", "avx2")
TARGET_BUILTIN(__builtin_ia32_pbroadcastq256, "V4LLiV2LLi", "", "avx2")
TARGET_BUILTIN(__builtin_ia32_pbroadcastb128, "V16cV16c", "", "avx2")
TARGET_BUILTIN(__builtin_ia32_pbroadcastw128, "V8sV8s", "", "avx2")
TARGET_BUILTIN(__builtin_ia32_pbroadcastd128, "V4iV4i", "", "avx2")
TARGET_BUILTIN(__builtin_ia32_pbroadcastq128, "V2LLiV2LLi", "", "avx2")
TARGET_BUILTIN(__builtin_ia32_permvarsi256, "V8iV8iV8i", "", "avx2")
TARGET_BUILTIN(__builtin_ia32_permvarsf256, "V8fV8fV8f", "", "avx2")
TARGET_BUILTIN(__builtin_ia32_permti256, "V4LLiV4LLiV4LLiIc", "", "avx2")
22 changes: 11 additions & 11 deletions clang/lib/Headers/avx2intrin.h
Original file line number Diff line number Diff line change
@@ -760,7 +760,7 @@ _mm256_stream_load_si256(__m256i *__V)
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_broadcastss_ps(__m128 __X)
{
return (__m128)__builtin_ia32_vbroadcastss_ps((__v4sf)__X);
return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
@@ -772,13 +772,13 @@ _mm_broadcastsd_pd(__m128d __a)
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_broadcastss_ps(__m128 __X)
{
return (__m256)__builtin_ia32_vbroadcastss_ps256((__v4sf)__X);
return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS
_mm256_broadcastsd_pd(__m128d __X)
{
return (__m256d)__builtin_ia32_vbroadcastsd_pd256((__v2df)__X);
return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
@@ -812,50 +812,50 @@ _mm256_broadcastsi128_si256(__m128i __X)
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_broadcastb_epi8(__m128i __X)
{
return (__m256i)__builtin_ia32_pbroadcastb256((__v16qi)__X);
return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_broadcastw_epi16(__m128i __X)
{
return (__m256i)__builtin_ia32_pbroadcastw256((__v8hi)__X);
return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_broadcastd_epi32(__m128i __X)
{
return (__m256i)__builtin_ia32_pbroadcastd256((__v4si)__X);
return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_broadcastq_epi64(__m128i __X)
{
return (__m256i)__builtin_ia32_pbroadcastq256(__X);
return (__m256i)__builtin_shufflevector(__X, __X, 0, 0, 0, 0);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_broadcastb_epi8(__m128i __X)
{
return (__m128i)__builtin_ia32_pbroadcastb128((__v16qi)__X);
return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_broadcastw_epi16(__m128i __X)
{
return (__m128i)__builtin_ia32_pbroadcastw128((__v8hi)__X);
return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0);
}


static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_broadcastd_epi32(__m128i __X)
{
return (__m128i)__builtin_ia32_pbroadcastd128((__v4si)__X);
return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_broadcastq_epi64(__m128i __X)
{
return (__m128i)__builtin_ia32_pbroadcastq128(__X);
return (__m128i)__builtin_shufflevector(__X, __X, 0, 0);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS
44 changes: 33 additions & 11 deletions clang/test/CodeGen/avx2-builtins.c
Original file line number Diff line number Diff line change
@@ -607,7 +607,9 @@ __m256i test_mm256_stream_load_si256(__m256i *a) {
}

__m128 test_mm_broadcastss_ps(__m128 a) {
// CHECK: @llvm.x86.avx2.vbroadcast.ss.ps
// CHECK-LABEL: test_mm_broadcastss_ps
// CHECK-NOT: @llvm.x86.avx2.vbroadcast.ss.ps
// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> zeroinitializer
return _mm_broadcastss_ps(a);
}

@@ -617,12 +619,16 @@ __m128d test_mm_broadcastsd_pd(__m128d a) {
}

__m256 test_mm256_broadcastss_ps(__m128 a) {
// CHECK: @llvm.x86.avx2.vbroadcast.ss.ps.256
// CHECK-LABEL: test_mm256_broadcastss_ps
// CHECK-NOT: @llvm.x86.avx2.vbroadcast.ss.ps.256
// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> zeroinitializer
return _mm256_broadcastss_ps(a);
}

__m256d test_mm256_broadcastsd_pd(__m128d a) {
// check: @llvm.x86.avx2.vbroadcast.sd.pd.256
// CHECK-LABEL: test_mm256_broadcastsd_pd
// CHECK-NOT: @llvm.x86.avx2.vbroadcast.sd.pd.256
// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> zeroinitializer
return _mm256_broadcastsd_pd(a);
}

@@ -646,42 +652,58 @@ __m256i test_mm256_blend_epi32(__m256i a, __m256i b) {
}

__m256i test_mm256_broadcastb_epi8(__m128i a) {
// CHECK: @llvm.x86.avx2.pbroadcastb.256
// CHECK-LABEL: test_mm256_broadcastb_epi8
// CHECK-NOT: @llvm.x86.avx2.pbroadcastb.256
// CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <32 x i32> zeroinitializer
return _mm256_broadcastb_epi8(a);
}

__m256i test_mm256_broadcastw_epi16(__m128i a) {
// CHECK: @llvm.x86.avx2.pbroadcastw.256
// CHECK-LABEL: test_mm256_broadcastw_epi16
// CHECK-NOT: @llvm.x86.avx2.pbroadcastw.256
// CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <16 x i32> zeroinitializer
return _mm256_broadcastw_epi16(a);
}

__m256i test_mm256_broadcastd_epi32(__m128i a) {
// CHECK: @llvm.x86.avx2.pbroadcastd.256
// CHECK-LABEL: test_mm256_broadcastd_epi32
// CHECK-NOT: @llvm.x86.avx2.pbroadcastd.256
// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> zeroinitializer
return _mm256_broadcastd_epi32(a);
}

__m256i test_mm256_broadcastq_epi64(__m128i a) {
// CHECK: @llvm.x86.avx2.pbroadcastq.256
// CHECK-LABEL: test_mm256_broadcastq_epi64
// CHECK-NOT: @llvm.x86.avx2.pbroadcastq.256
// CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> zeroinitializer
return _mm256_broadcastq_epi64(a);
}

__m128i test_mm_broadcastb_epi8(__m128i a) {
// CHECK: @llvm.x86.avx2.pbroadcastb.128
// CHECK-LABEL: test_mm_broadcastb_epi8
// CHECK-NOT: @llvm.x86.avx2.pbroadcastb.128
// CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> zeroinitializer
return _mm_broadcastb_epi8(a);
}

__m128i test_mm_broadcastw_epi16(__m128i a) {
// CHECK: @llvm.x86.avx2.pbroadcastw.128
// CHECK-LABEL: test_mm_broadcastw_epi16
// CHECK-NOT: @llvm.x86.avx2.pbroadcastw.128
// CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> zeroinitializer
return _mm_broadcastw_epi16(a);
}

__m128i test_mm_broadcastd_epi32(__m128i a) {
// CHECK: @llvm.x86.avx2.pbroadcastd.128
// CHECK-LABEL: test_mm_broadcastd_epi32
// CHECK-NOT: @llvm.x86.avx2.pbroadcastd.128
// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> zeroinitializer
return _mm_broadcastd_epi32(a);
}

__m128i test_mm_broadcastq_epi64(__m128i a) {
// CHECK: @llvm.x86.avx2.pbroadcastq.128
// CHECK-LABEL: test_mm_broadcastq_epi64
// CHECK-NOT: @llvm.x86.avx2.pbroadcastq.128
// CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> zeroinitializer
return _mm_broadcastq_epi64(a);
}

0 comments on commit 5e354cb

Please sign in to comment.