diff --git a/clang/lib/Headers/ppc_wrappers/bmi2intrin.h b/clang/lib/Headers/ppc_wrappers/bmi2intrin.h new file mode 100644 --- /dev/null +++ b/clang/lib/Headers/ppc_wrappers/bmi2intrin.h @@ -0,0 +1,133 @@ +/*===---- bmiintrin.h - Implementation of BMI2 intrinsics on PowerPC -------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#if !defined X86GPRINTRIN_H_ +#error "Never use directly; include instead." +#endif + +#ifndef BMI2INTRIN_H_ +#define BMI2INTRIN_H_ + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _bzhi_u32(unsigned int __X, unsigned int __Y) { + return ((__X << (32 - __Y)) >> (32 - __Y)); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) { + unsigned long long __res = (unsigned long long)__X * __Y; + *__P = (unsigned int)(__res >> 32); + return (unsigned int)__res; +} + +#ifdef __PPC64__ +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _bzhi_u64(unsigned long long __X, unsigned long long __Y) { + return ((__X << (64 - __Y)) >> (64 - __Y)); +} + +/* __int128 requires base 64-bit. */ +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mulx_u64(unsigned long long __X, unsigned long long __Y, + unsigned long long *__P) { + unsigned __int128 __res = (unsigned __int128)__X * __Y; + *__P = (unsigned long long)(__res >> 64); + return (unsigned long long)__res; +} + +#ifdef _ARCH_PWR7 +/* popcount and bpermd require power7 minimum. */ +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _pdep_u64(unsigned long long __X, unsigned long long __M) { + unsigned long result = 0x0UL; + const unsigned long mask = 0x8000000000000000UL; + unsigned long m = __M; + unsigned long c, t; + unsigned long p; + + /* The pop-count of the mask gives the number of the bits from + source to process. This is also needed to shift bits from the + source into the correct position for the result. */ + p = 64 - __builtin_popcountl(__M); + + /* The loop is for the number of '1' bits in the mask and clearing + each mask bit as it is processed. */ + while (m != 0) { + c = __builtin_clzl(m); + t = __X << (p - c); + m ^= (mask >> c); + result |= (t & (mask >> c)); + p++; + } + return (result); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _pext_u64(unsigned long long __X, unsigned long long __M) { + unsigned long p = 0x4040404040404040UL; // initial bit permute control + const unsigned long mask = 0x8000000000000000UL; + unsigned long m = __M; + unsigned long c; + unsigned long result; + + /* if the mask is constant and selects 8 bits or less we can use + the Power8 Bit permute instruction. */ + if (__builtin_constant_p(__M) && (__builtin_popcountl(__M) <= 8)) { + /* Also if the pext mask is constant, then the popcount is + constant, we can evaluate the following loop at compile + time and use a constant bit permute vector. */ + for (long i = 0; i < __builtin_popcountl(__M); i++) { + c = __builtin_clzl(m); + p = (p << 8) | c; + m ^= (mask >> c); + } + result = __builtin_bpermd(p, __X); + } else { + p = 64 - __builtin_popcountl(__M); + result = 0; + /* We could a use a for loop here, but that combined with + -funroll-loops can expand to a lot of code. The while + loop avoids unrolling and the compiler commons the xor + from clearing the mask bit with the (m != 0) test. The + result is a more compact loop setup and body. */ + while (m != 0) { + unsigned long t; + c = __builtin_clzl(m); + t = (__X & (mask >> c)) >> (p - c); + m ^= (mask >> c); + result |= (t); + p++; + } + } + return (result); +} + +/* these 32-bit implementations depend on 64-bit pdep/pext + which depend on _ARCH_PWR7. */ +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _pdep_u32(unsigned int __X, unsigned int __Y) { + return _pdep_u64(__X, __Y); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _pext_u32(unsigned int __X, unsigned int __Y) { + return _pext_u64(__X, __Y); +} +#endif /* _ARCH_PWR7 */ +#endif /* __PPC64__ */ + +#endif /* BMI2INTRIN_H_ */ diff --git a/clang/lib/Headers/ppc_wrappers/bmiintrin.h b/clang/lib/Headers/ppc_wrappers/bmiintrin.h new file mode 100644 --- /dev/null +++ b/clang/lib/Headers/ppc_wrappers/bmiintrin.h @@ -0,0 +1,165 @@ +/*===---- bmiintrin.h - Implementation of BMI intrinsics on PowerPC --------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#if !defined X86GPRINTRIN_H_ +#error "Never use directly; include instead." +#endif + +#ifndef BMIINTRIN_H_ +#define BMIINTRIN_H_ + +extern __inline unsigned short + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __tzcnt_u16(unsigned short __X) { + return __builtin_ctz(__X); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __andn_u32(unsigned int __X, unsigned int __Y) { + return (~__X & __Y); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _bextr_u32(unsigned int __X, unsigned int __P, unsigned int __L) { + return ((__X << (32 - (__L + __P))) >> (32 - __L)); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __bextr_u32(unsigned int __X, unsigned int __Y) { + unsigned int __P, __L; + __P = __Y & 0xFF; + __L = (__Y >> 8) & 0xFF; + return (_bextr_u32(__X, __P, __L)); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blsi_u32(unsigned int __X) { + return (__X & -__X); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _blsi_u32(unsigned int __X) { + return __blsi_u32(__X); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blsmsk_u32(unsigned int __X) { + return (__X ^ (__X - 1)); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _blsmsk_u32(unsigned int __X) { + return __blsmsk_u32(__X); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blsr_u32(unsigned int __X) { + return (__X & (__X - 1)); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _blsr_u32(unsigned int __X) { + return __blsr_u32(__X); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __tzcnt_u32(unsigned int __X) { + return __builtin_ctz(__X); +} + +extern __inline unsigned int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _tzcnt_u32(unsigned int __X) { + return __builtin_ctz(__X); +} + +/* use the 64-bit shift, rotate, and count leading zeros instructions + for long long. */ +#ifdef __PPC64__ +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __andn_u64(unsigned long long __X, unsigned long long __Y) { + return (~__X & __Y); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _bextr_u64(unsigned long long __X, unsigned int __P, unsigned int __L) { + return ((__X << (64 - (__L + __P))) >> (64 - __L)); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __bextr_u64(unsigned long long __X, unsigned long long __Y) { + unsigned int __P, __L; + __P = __Y & 0xFF; + __L = (__Y & 0xFF00) >> 8; + return (_bextr_u64(__X, __P, __L)); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blsi_u64(unsigned long long __X) { + return __X & -__X; +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _blsi_u64(unsigned long long __X) { + return __blsi_u64(__X); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blsmsk_u64(unsigned long long __X) { + return (__X ^ (__X - 1)); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _blsmsk_u64(unsigned long long __X) { + return __blsmsk_u64(__X); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __blsr_u64(unsigned long long __X) { + return (__X & (__X - 1)); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _blsr_u64(unsigned long long __X) { + return __blsr_u64(__X); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + __tzcnt_u64(unsigned long long __X) { + return __builtin_ctzll(__X); +} + +extern __inline unsigned long long + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _tzcnt_u64(unsigned long long __X) { + return __builtin_ctzll(__X); +} +#endif /* __PPC64__ */ + +#endif /* BMIINTRIN_H_ */ diff --git a/clang/lib/Headers/ppc_wrappers/emmintrin.h b/clang/lib/Headers/ppc_wrappers/emmintrin.h --- a/clang/lib/Headers/ppc_wrappers/emmintrin.h +++ b/clang/lib/Headers/ppc_wrappers/emmintrin.h @@ -38,6 +38,7 @@ #if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) #include +#include /* We need definitions from the SSE header files. */ #include @@ -127,6 +128,7 @@ extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_load_pd (double const *__P) { + assert(((unsigned long)__P & 0xfUL) == 0UL); return ((__m128d)vec_ld(0, (__v16qu*)__P)); } @@ -169,6 +171,7 @@ extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_store_pd (double *__P, __m128d __A) { + assert(((unsigned long)__P & 0xfUL) == 0UL); vec_st((__v16qu)__A, 0, (__v16qu*)__P); } @@ -405,20 +408,10 @@ extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cmpord_pd (__m128d __A, __m128d __B) { -#if _ARCH_PWR8 __v2du c, d; /* Compare against self will return false (0's) if NAN. */ c = (__v2du)vec_cmpeq (__A, __A); d = (__v2du)vec_cmpeq (__B, __B); -#else - __v2du a, b; - __v2du c, d; - const __v2du double_exp_mask = {0x7ff0000000000000, 0x7ff0000000000000}; - a = (__v2du)vec_abs ((__v2df)__A); - b = (__v2du)vec_abs ((__v2df)__B); - c = (__v2du)vec_cmpgt (double_exp_mask, a); - d = (__v2du)vec_cmpgt (double_exp_mask, b); -#endif /* A != NAN and B != NAN. */ return ((__m128d)vec_and(c, d)); } @@ -777,6 +770,7 @@ extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_store_si128 (__m128i *__P, __m128i __B) { + assert(((unsigned long )__P & 0xfUL) == 0UL); vec_st ((__v16qu) __B, 0, (__v16qu*)__P); } @@ -861,7 +855,11 @@ : ); #ifdef _ARCH_PWR8 +#ifdef __LITTLE_ENDIAN__ temp = vec_mergeo (temp, temp); +#else + temp = vec_mergee (temp, temp); +#endif result = (__v4si) vec_vpkudum ((__vector long long) temp, (__vector long long) vzero); #else @@ -896,7 +894,11 @@ : ); #ifdef _ARCH_PWR8 +#ifdef __LITTLE_ENDIAN__ temp = vec_mergeo (temp, temp); +#else + temp = vec_mergee (temp, temp); +#endif result = (__v4sf) vec_vpkudum ((__vector long long) temp, (__vector long long) vzero); #else @@ -925,7 +927,11 @@ : ); #ifdef _ARCH_PWR8 +#ifdef __LITTLE_ENDIAN__ temp = vec_mergeo (temp, temp); +#else + temp = vec_mergee (temp, temp); +#endif result = (__v4si) vec_vpkudum ((__vector long long) temp, (__vector long long) vzero); #else @@ -1205,6 +1211,9 @@ extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_movemask_pd (__m128d __A) { +#ifdef _ARCH_PWR10 + return vec_extractm ((__v2du) __A); +#else __vector unsigned long long result; static const __vector unsigned int perm_mask = { @@ -1224,6 +1233,7 @@ #else return result[0]; #endif +#endif /* !_ARCH_PWR10 */ } #endif /* _ARCH_PWR8 */ @@ -1434,6 +1444,7 @@ return ((__m64)a * (__m64)b); } +#ifdef _ARCH_PWR8 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mul_epu32 (__m128i __A, __m128i __B) { @@ -1460,6 +1471,7 @@ return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B); #endif } +#endif extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_slli_epi16 (__m128i __A, int __B) @@ -1749,7 +1761,7 @@ lshift = vec_splat ((__v2du) __B, 0); shmask = vec_cmplt (lshift, shmax); result = vec_sl ((__v2du) __A, lshift); - result = (__v2du)vec_sel ((__v2df) shmask, (__v2df)result, shmask); + result = vec_sel ((__v2du) shmask, result, shmask); return (__m128i) result; } @@ -1843,7 +1855,7 @@ rshift = vec_splat ((__v2du) __B, 0); shmask = vec_cmplt (rshift, shmax); result = vec_sr ((__v2du) __A, rshift); - result = (__v2du)vec_sel ((__v2df) shmask, (__v2df)result, shmask); + result = vec_sel ((__v2du) shmask, result, shmask); return (__m128i) result; } @@ -1995,10 +2007,14 @@ #ifdef _ARCH_PWR8 /* Intrinsic functions that require PowerISA 2.07 minimum. */ -/* Creates a 4-bit mask from the most significant bits of the SPFP values. */ +/* Return a mask created from the most significant bit of each 8-bit + element in A. */ extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_movemask_epi8 (__m128i __A) { +#ifdef _ARCH_PWR10 + return vec_extractm ((__v16qu) __A); +#else __vector unsigned long long result; static const __vector unsigned char perm_mask = { @@ -2015,6 +2031,7 @@ #else return result[0]; #endif +#endif /* !_ARCH_PWR10 */ } #endif /* _ARCH_PWR8 */ @@ -2158,27 +2175,37 @@ _mm_sad_epu8 (__m128i __A, __m128i __B) { __v16qu a, b; - __v16qu vmin, vmax, vabsdiff; + __v16qu vabsdiff; __v4si vsum; const __v4su zero = { 0, 0, 0, 0 }; __v4si result; a = (__v16qu) __A; b = (__v16qu) __B; - vmin = vec_min (a, b); - vmax = vec_max (a, b); +#ifndef _ARCH_PWR9 + __v16qu vmin = vec_min (a, b); + __v16qu vmax = vec_max (a, b); vabsdiff = vec_sub (vmax, vmin); +#else + vabsdiff = vec_absd (a, b); +#endif /* Sum four groups of bytes into integers. */ vsum = (__vector signed int) vec_sum4s (vabsdiff, zero); +#ifdef __LITTLE_ENDIAN__ + /* Sum across four integers with two integer results. */ + asm ("vsum2sws %0,%1,%2" : "=v" (result) : "v" (vsum), "v" (zero)); + /* Note: vec_sum2s could be used here, but on little-endian, vector + shifts are added that are not needed for this use-case. + A vector shift to correctly position the 32-bit integer results + (currently at [0] and [2]) to [1] and [3] would then need to be + swapped back again since the desired results are two 64-bit + integers ([1]|[0] and [3]|[2]). Thus, no shift is performed. */ +#else /* Sum across four integers with two integer results. */ result = vec_sum2s (vsum, (__vector signed int) zero); /* Rotate the sums into the correct position. */ -#ifdef __LITTLE_ENDIAN__ - result = vec_sld (result, result, 4); -#else result = vec_sld (result, result, 6); #endif - /* Rotate the sums into the correct position. */ return (__m128i) result; } diff --git a/clang/lib/Headers/ppc_wrappers/immintrin.h b/clang/lib/Headers/ppc_wrappers/immintrin.h new file mode 100644 --- /dev/null +++ b/clang/lib/Headers/ppc_wrappers/immintrin.h @@ -0,0 +1,27 @@ +/*===---- immintrin.h - Implementation of Intel intrinsics on PowerPC ------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef IMMINTRIN_H_ +#define IMMINTRIN_H_ + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#endif /* IMMINTRIN_H_ */ diff --git a/clang/lib/Headers/ppc_wrappers/nmmintrin.h b/clang/lib/Headers/ppc_wrappers/nmmintrin.h new file mode 100644 --- /dev/null +++ b/clang/lib/Headers/ppc_wrappers/nmmintrin.h @@ -0,0 +1,26 @@ +/*===---- nmmintrin.h - Implementation of SSE4 intrinsics on PowerPC -------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef NO_WARN_X86_INTRINSICS +/* This header is distributed to simplify porting x86_64 code that + makes explicit use of Intel intrinsics to powerpc64le. + It is the user's responsibility to determine if the results are + acceptable and make additional changes as necessary. + Note that much code that uses Intel intrinsics can be rewritten in + standard C or GNU C extensions, which are more portable and better + optimized across multiple targets. */ +#endif + +#ifndef NMMINTRIN_H_ +#define NMMINTRIN_H_ + +/* We just include SSE4.1 header file. */ +#include + +#endif /* NMMINTRIN_H_ */ diff --git a/clang/lib/Headers/ppc_wrappers/pmmintrin.h b/clang/lib/Headers/ppc_wrappers/pmmintrin.h --- a/clang/lib/Headers/ppc_wrappers/pmmintrin.h +++ b/clang/lib/Headers/ppc_wrappers/pmmintrin.h @@ -111,17 +111,21 @@ vec_mergel ((__v2df) __X, (__v2df)__Y)); } +#ifdef _ARCH_PWR8 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_movehdup_ps (__m128 __X) { return (__m128)vec_mergeo ((__v4su)__X, (__v4su)__X); } +#endif +#ifdef _ARCH_PWR8 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_moveldup_ps (__m128 __X) { return (__m128)vec_mergee ((__v4su)__X, (__v4su)__X); } +#endif extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_loaddup_pd (double const *__P) diff --git a/clang/lib/Headers/ppc_wrappers/smmintrin.h b/clang/lib/Headers/ppc_wrappers/smmintrin.h --- a/clang/lib/Headers/ppc_wrappers/smmintrin.h +++ b/clang/lib/Headers/ppc_wrappers/smmintrin.h @@ -34,77 +34,683 @@ #include #include -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_extract_epi8(__m128i __X, const int __N) { - return (unsigned char)((__v16qi)__X)[__N & 15]; +/* Rounding mode macros. */ +#define _MM_FROUND_TO_NEAREST_INT 0x00 +#define _MM_FROUND_TO_ZERO 0x01 +#define _MM_FROUND_TO_POS_INF 0x02 +#define _MM_FROUND_TO_NEG_INF 0x03 +#define _MM_FROUND_CUR_DIRECTION 0x04 + +#define _MM_FROUND_NINT \ + (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_FLOOR \ + (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_CEIL \ + (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_TRUNC \ + (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_RINT \ + (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_NEARBYINT \ + (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC) + +#define _MM_FROUND_RAISE_EXC 0x00 +#define _MM_FROUND_NO_EXC 0x08 + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_round_pd (__m128d __A, int __rounding) +{ + __v2df __r; + union { + double __fr; + long long __fpscr; + } __enables_save, __fpscr_save; + + if (__rounding & _MM_FROUND_NO_EXC) + { + /* Save enabled exceptions, disable all exceptions, + and preserve the rounding mode. */ +#ifdef _ARCH_PWR9 + __asm__ ("mffsce %0" : "=f" (__fpscr_save.__fr)); + __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8; +#else + __fpscr_save.__fr = __builtin_mffs (); + __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8; + __fpscr_save.__fpscr &= ~0xf8; + __builtin_mtfsf (0b00000011, __fpscr_save.__fr); +#endif + /* Insert an artificial "read/write" reference to the variable + read below, to ensure the compiler does not schedule + a read/use of the variable before the FPSCR is modified, above. + This can be removed if and when GCC PR102783 is fixed. + */ + __asm__ ("" : "+wa" (__A)); + } + + switch (__rounding) + { + case _MM_FROUND_TO_NEAREST_INT: + __fpscr_save.__fr = __builtin_mffsl (); + __attribute__ ((fallthrough)); + case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC: + __builtin_set_fpscr_rn (0b00); + /* Insert an artificial "read/write" reference to the variable + read below, to ensure the compiler does not schedule + a read/use of the variable before the FPSCR is modified, above. + This can be removed if and when GCC PR102783 is fixed. + */ + __asm__ ("" : "+wa" (__A)); + + __r = vec_rint ((__v2df) __A); + + /* Insert an artificial "read" reference to the variable written + above, to ensure the compiler does not schedule the computation + of the value after the manipulation of the FPSCR, below. + This can be removed if and when GCC PR102783 is fixed. + */ + __asm__ ("" : : "wa" (__r)); + __builtin_set_fpscr_rn (__fpscr_save.__fpscr); + break; + case _MM_FROUND_TO_NEG_INF: + case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC: + __r = vec_floor ((__v2df) __A); + break; + case _MM_FROUND_TO_POS_INF: + case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC: + __r = vec_ceil ((__v2df) __A); + break; + case _MM_FROUND_TO_ZERO: + case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC: + __r = vec_trunc ((__v2df) __A); + break; + case _MM_FROUND_CUR_DIRECTION: + __r = vec_rint ((__v2df) __A); + break; + } + if (__rounding & _MM_FROUND_NO_EXC) + { + /* Insert an artificial "read" reference to the variable written + above, to ensure the compiler does not schedule the computation + of the value after the manipulation of the FPSCR, below. + This can be removed if and when GCC PR102783 is fixed. + */ + __asm__ ("" : : "wa" (__r)); + /* Restore enabled exceptions. */ + __fpscr_save.__fr = __builtin_mffsl (); + __fpscr_save.__fpscr |= __enables_save.__fpscr; + __builtin_mtfsf (0b00000011, __fpscr_save.__fr); + } + return (__m128d) __r; } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_extract_epi32(__m128i __X, const int __N) { +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_round_sd (__m128d __A, __m128d __B, int __rounding) +{ + __B = _mm_round_pd (__B, __rounding); + __v2df __r = { ((__v2df) __B)[0], ((__v2df) __A)[1] }; + return (__m128d) __r; +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_round_ps (__m128 __A, int __rounding) +{ + __v4sf __r; + union { + double __fr; + long long __fpscr; + } __enables_save, __fpscr_save; + + if (__rounding & _MM_FROUND_NO_EXC) + { + /* Save enabled exceptions, disable all exceptions, + and preserve the rounding mode. */ +#ifdef _ARCH_PWR9 + __asm__ ("mffsce %0" : "=f" (__fpscr_save.__fr)); + __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8; +#else + __fpscr_save.__fr = __builtin_mffs (); + __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8; + __fpscr_save.__fpscr &= ~0xf8; + __builtin_mtfsf (0b00000011, __fpscr_save.__fr); +#endif + /* Insert an artificial "read/write" reference to the variable + read below, to ensure the compiler does not schedule + a read/use of the variable before the FPSCR is modified, above. + This can be removed if and when GCC PR102783 is fixed. + */ + __asm__ ("" : "+wa" (__A)); + } + + switch (__rounding) + { + case _MM_FROUND_TO_NEAREST_INT: + __fpscr_save.__fr = __builtin_mffsl (); + __attribute__ ((fallthrough)); + case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC: + __builtin_set_fpscr_rn (0b00); + /* Insert an artificial "read/write" reference to the variable + read below, to ensure the compiler does not schedule + a read/use of the variable before the FPSCR is modified, above. + This can be removed if and when GCC PR102783 is fixed. + */ + __asm__ ("" : "+wa" (__A)); + + __r = vec_rint ((__v4sf) __A); + + /* Insert an artificial "read" reference to the variable written + above, to ensure the compiler does not schedule the computation + of the value after the manipulation of the FPSCR, below. + This can be removed if and when GCC PR102783 is fixed. + */ + __asm__ ("" : : "wa" (__r)); + __builtin_set_fpscr_rn (__fpscr_save.__fpscr); + break; + case _MM_FROUND_TO_NEG_INF: + case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC: + __r = vec_floor ((__v4sf) __A); + break; + case _MM_FROUND_TO_POS_INF: + case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC: + __r = vec_ceil ((__v4sf) __A); + break; + case _MM_FROUND_TO_ZERO: + case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC: + __r = vec_trunc ((__v4sf) __A); + break; + case _MM_FROUND_CUR_DIRECTION: + __r = vec_rint ((__v4sf) __A); + break; + } + if (__rounding & _MM_FROUND_NO_EXC) + { + /* Insert an artificial "read" reference to the variable written + above, to ensure the compiler does not schedule the computation + of the value after the manipulation of the FPSCR, below. + This can be removed if and when GCC PR102783 is fixed. + */ + __asm__ ("" : : "wa" (__r)); + /* Restore enabled exceptions. */ + __fpscr_save.__fr = __builtin_mffsl (); + __fpscr_save.__fpscr |= __enables_save.__fpscr; + __builtin_mtfsf (0b00000011, __fpscr_save.__fr); + } + return (__m128) __r; +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_round_ss (__m128 __A, __m128 __B, int __rounding) +{ + __B = _mm_round_ps (__B, __rounding); + __v4sf __r = (__v4sf) __A; + __r[0] = ((__v4sf) __B)[0]; + return (__m128) __r; +} + +#define _mm_ceil_pd(V) _mm_round_pd ((V), _MM_FROUND_CEIL) +#define _mm_ceil_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_CEIL) + +#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR) +#define _mm_floor_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_FLOOR) + +#define _mm_ceil_ps(V) _mm_round_ps ((V), _MM_FROUND_CEIL) +#define _mm_ceil_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_CEIL) + +#define _mm_floor_ps(V) _mm_round_ps ((V), _MM_FROUND_FLOOR) +#define _mm_floor_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_FLOOR) + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_epi8 (__m128i const __A, int const __D, int const __N) +{ + __v16qi result = (__v16qi)__A; + + result [__N & 0xf] = __D; + + return (__m128i) result; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_epi32 (__m128i const __A, int const __D, int const __N) +{ + __v4si result = (__v4si)__A; + + result [__N & 3] = __D; + + return (__m128i) result; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_epi64 (__m128i const __A, long long const __D, int const __N) +{ + __v2di result = (__v2di)__A; + + result [__N & 1] = __D; + + return (__m128i) result; +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_epi8 (__m128i __X, const int __N) +{ + return (unsigned char) ((__v16qi)__X)[__N & 15]; +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_epi32 (__m128i __X, const int __N) +{ return ((__v4si)__X)[__N & 3]; } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_extract_epi64(__m128i __X, const int __N) { +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_epi64 (__m128i __X, const int __N) +{ return ((__v2di)__X)[__N & 1]; } -extern __inline int - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_extract_ps(__m128 __X, const int __N) { +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_ps (__m128 __X, const int __N) +{ return ((__v4si)__X)[__N & 3]; } +#ifdef _ARCH_PWR8 +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blend_epi16 (__m128i __A, __m128i __B, const int __imm8) +{ + __v16qi __charmask = vec_splats ((signed char) __imm8); + __charmask = vec_gb (__charmask); + __v8hu __shortmask = (__v8hu) vec_unpackh (__charmask); + #ifdef __BIG_ENDIAN__ + __shortmask = vec_reve (__shortmask); + #endif + return (__m128i) vec_sel ((__v8hu) __A, (__v8hu) __B, __shortmask); +} +#endif + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blendv_epi8 (__m128i __A, __m128i __B, __m128i __mask) +{ +#ifdef _ARCH_PWR10 + return (__m128i) vec_blendv ((__v16qi) __A, (__v16qi) __B, (__v16qu) __mask); +#else + const __v16qu __seven = vec_splats ((unsigned char) 0x07); + __v16qu __lmask = vec_sra ((__v16qu) __mask, __seven); + return (__m128i) vec_sel ((__v16qi) __A, (__v16qi) __B, __lmask); +#endif +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blend_ps (__m128 __A, __m128 __B, const int __imm8) +{ + __v16qu __pcv[] = + { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15 }, + { 16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15 }, + { 0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15 }, + { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31 }, + { 16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31 }, + { 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31 }, + { 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 }, + { 16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 }, + { 0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }, + { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }, + }; + __v16qu __r = vec_perm ((__v16qu) __A, (__v16qu)__B, __pcv[__imm8]); + return (__m128) __r; +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blendv_ps (__m128 __A, __m128 __B, __m128 __mask) +{ +#ifdef _ARCH_PWR10 + return (__m128) vec_blendv ((__v4sf) __A, (__v4sf) __B, (__v4su) __mask); +#else + const __v4si __zero = {0}; + const __vector __bool int __boolmask = vec_cmplt ((__v4si) __mask, __zero); + return (__m128) vec_sel ((__v4su) __A, (__v4su) __B, (__v4su) __boolmask); +#endif +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blend_pd (__m128d __A, __m128d __B, const int __imm8) +{ + __v16qu __pcv[] = + { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 }, + { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 } + }; + __v16qu __r = vec_perm ((__v16qu) __A, (__v16qu)__B, __pcv[__imm8]); + return (__m128d) __r; +} + +#ifdef _ARCH_PWR8 +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blendv_pd (__m128d __A, __m128d __B, __m128d __mask) +{ +#ifdef _ARCH_PWR10 + return (__m128d) vec_blendv ((__v2df) __A, (__v2df) __B, (__v2du) __mask); +#else + const __v2di __zero = {0}; + const __vector __bool long long __boolmask = vec_cmplt ((__v2di) __mask, __zero); + return (__m128d) vec_sel ((__v2du) __A, (__v2du) __B, (__v2du) __boolmask); +#endif +} +#endif + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testz_si128 (__m128i __A, __m128i __B) +{ + /* Note: This implementation does NOT set "zero" or "carry" flags. */ + const __v16qu __zero = {0}; + return vec_all_eq (vec_and ((__v16qu) __A, (__v16qu) __B), __zero); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testc_si128 (__m128i __A, __m128i __B) +{ + /* Note: This implementation does NOT set "zero" or "carry" flags. */ + const __v16qu __zero = {0}; + const __v16qu __notA = vec_nor ((__v16qu) __A, (__v16qu) __A); + return vec_all_eq (vec_and ((__v16qu) __notA, (__v16qu) __B), __zero); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testnzc_si128 (__m128i __A, __m128i __B) +{ + /* Note: This implementation does NOT set "zero" or "carry" flags. */ + return _mm_testz_si128 (__A, __B) == 0 && _mm_testc_si128 (__A, __B) == 0; +} + +#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V)) + +#define _mm_test_all_ones(V) \ + _mm_testc_si128 ((V), _mm_cmpeq_epi32 ((V), (V))) + +#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128 ((M), (V)) + +#ifdef _ARCH_PWR8 extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) { - __v16qi __charmask = vec_splats((signed char)__imm8); - __charmask = vec_gb(__charmask); - __v8hu __shortmask = (__v8hu)vec_unpackh(__charmask); -#ifdef __BIG_ENDIAN__ - __shortmask = vec_reve(__shortmask); +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epi64 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_cmpeq ((__v2di) __X, (__v2di) __Y); +} #endif - return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __shortmask); + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epi8 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_min ((__v16qi)__X, (__v16qi)__Y); } extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) { - const __v16qu __seven = vec_splats((unsigned char)0x07); - __v16qu __lmask = vec_sra((__v16qu)__mask, __seven); - return (__m128i)vec_sel((__v16qu)__A, (__v16qu)__B, __lmask); +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epu16 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_min ((__v8hu)__X, (__v8hu)__Y); } extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_insert_epi8(__m128i const __A, int const __D, int const __N) { - __v16qi result = (__v16qi)__A; - result[__N & 0xf] = __D; - return (__m128i)result; +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_min ((__v4si)__X, (__v4si)__Y); } extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_insert_epi32(__m128i const __A, int const __D, int const __N) { - __v4si result = (__v4si)__A; - result[__N & 3] = __D; - return (__m128i)result; +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epu32 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_min ((__v4su)__X, (__v4su)__Y); } extern __inline __m128i - __attribute__((__gnu_inline__, __always_inline__, __artificial__)) - _mm_insert_epi64(__m128i const __A, long long const __D, int const __N) { - __v2di result = (__v2di)__A; - result[__N & 1] = __D; - return (__m128i)result; +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_epi8 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_max ((__v16qi)__X, (__v16qi)__Y); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_epu16 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_max ((__v8hu)__X, (__v8hu)__Y); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_max ((__v4si)__X, (__v4si)__Y); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_epu32 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_max ((__v4su)__X, (__v4su)__Y); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mullo_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_mul ((__v4su) __X, (__v4su) __Y); +} + +#ifdef _ARCH_PWR8 +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_mule ((__v4si) __X, (__v4si) __Y); +} +#endif + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi8_epi16 (__m128i __A) +{ + return (__m128i) vec_unpackh ((__v16qi) __A); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi8_epi32 (__m128i __A) +{ + __A = (__m128i) vec_unpackh ((__v16qi) __A); + return (__m128i) vec_unpackh ((__v8hi) __A); +} + +#ifdef _ARCH_PWR8 +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi8_epi64 (__m128i __A) +{ + __A = (__m128i) vec_unpackh ((__v16qi) __A); + __A = (__m128i) vec_unpackh ((__v8hi) __A); + return (__m128i) vec_unpackh ((__v4si) __A); +} +#endif + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi16_epi32 (__m128i __A) +{ + return (__m128i) vec_unpackh ((__v8hi) __A); +} + +#ifdef _ARCH_PWR8 +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi16_epi64 (__m128i __A) +{ + __A = (__m128i) vec_unpackh ((__v8hi) __A); + return (__m128i) vec_unpackh ((__v4si) __A); +} +#endif + +#ifdef _ARCH_PWR8 +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi32_epi64 (__m128i __A) +{ + return (__m128i) vec_unpackh ((__v4si) __A); } +#endif + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu8_epi16 (__m128i __A) +{ + const __v16qu __zero = {0}; +#ifdef __LITTLE_ENDIAN__ + __A = (__m128i) vec_mergeh ((__v16qu) __A, __zero); +#else /* __BIG_ENDIAN__. */ + __A = (__m128i) vec_mergeh (__zero, (__v16qu) __A); +#endif /* __BIG_ENDIAN__. */ + return __A; +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu8_epi32 (__m128i __A) +{ + const __v16qu __zero = {0}; +#ifdef __LITTLE_ENDIAN__ + __A = (__m128i) vec_mergeh ((__v16qu) __A, __zero); + __A = (__m128i) vec_mergeh ((__v8hu) __A, (__v8hu) __zero); +#else /* __BIG_ENDIAN__. */ + __A = (__m128i) vec_mergeh (__zero, (__v16qu) __A); + __A = (__m128i) vec_mergeh ((__v8hu) __zero, (__v8hu) __A); +#endif /* __BIG_ENDIAN__. */ + return __A; +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu8_epi64 (__m128i __A) +{ + const __v16qu __zero = {0}; +#ifdef __LITTLE_ENDIAN__ + __A = (__m128i) vec_mergeh ((__v16qu) __A, __zero); + __A = (__m128i) vec_mergeh ((__v8hu) __A, (__v8hu) __zero); + __A = (__m128i) vec_mergeh ((__v4su) __A, (__v4su) __zero); +#else /* __BIG_ENDIAN__. */ + __A = (__m128i) vec_mergeh (__zero, (__v16qu) __A); + __A = (__m128i) vec_mergeh ((__v8hu) __zero, (__v8hu) __A); + __A = (__m128i) vec_mergeh ((__v4su) __zero, (__v4su) __A); +#endif /* __BIG_ENDIAN__. */ + return __A; +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu16_epi32 (__m128i __A) +{ + const __v8hu __zero = {0}; +#ifdef __LITTLE_ENDIAN__ + __A = (__m128i) vec_mergeh ((__v8hu) __A, __zero); +#else /* __BIG_ENDIAN__. */ + __A = (__m128i) vec_mergeh (__zero, (__v8hu) __A); +#endif /* __BIG_ENDIAN__. */ + return __A; +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu16_epi64 (__m128i __A) +{ + const __v8hu __zero = {0}; +#ifdef __LITTLE_ENDIAN__ + __A = (__m128i) vec_mergeh ((__v8hu) __A, __zero); + __A = (__m128i) vec_mergeh ((__v4su) __A, (__v4su) __zero); +#else /* __BIG_ENDIAN__. */ + __A = (__m128i) vec_mergeh (__zero, (__v8hu) __A); + __A = (__m128i) vec_mergeh ((__v4su) __zero, (__v4su) __A); +#endif /* __BIG_ENDIAN__. */ + return __A; +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu32_epi64 (__m128i __A) +{ + const __v4su __zero = {0}; +#ifdef __LITTLE_ENDIAN__ + __A = (__m128i) vec_mergeh ((__v4su) __A, __zero); +#else /* __BIG_ENDIAN__. */ + __A = (__m128i) vec_mergeh (__zero, (__v4su) __A); +#endif /* __BIG_ENDIAN__. */ + return __A; +} + +/* Return horizontal packed word minimum and its index in bits [15:0] + and bits [18:16] respectively. */ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_minpos_epu16 (__m128i __A) +{ + union __u + { + __m128i __m; + __v8hu __uh; + }; + union __u __u = { .__m = __A }, __r = { .__m = {0} }; + unsigned short __ridx = 0; + unsigned short __rmin = __u.__uh[__ridx]; + for (unsigned long __i = 1; __i < 8; __i++) + { + if (__u.__uh[__i] < __rmin) + { + __rmin = __u.__uh[__i]; + __ridx = __i; + } + } + __r.__uh[0] = __rmin; + __r.__uh[1] = __ridx; + return __r.__m; +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_packus_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_packsu ((__v4si) __X, (__v4si) __Y); +} + +#ifdef _ARCH_PWR8 +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epi64 (__m128i __X, __m128i __Y) +{ + return (__m128i) vec_cmpgt ((__v2di) __X, (__v2di) __Y); +} +#endif #else #include_next #endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \ */ -#endif /* _SMMINTRIN_H_ */ +#endif /* SMMINTRIN_H_ */ diff --git a/clang/lib/Headers/ppc_wrappers/tmmintrin.h b/clang/lib/Headers/ppc_wrappers/tmmintrin.h --- a/clang/lib/Headers/ppc_wrappers/tmmintrin.h +++ b/clang/lib/Headers/ppc_wrappers/tmmintrin.h @@ -28,6 +28,7 @@ #if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) #include +#include /* We need definitions from the SSE header files. */ #include @@ -339,6 +340,7 @@ return (__m64) ((__v2du) (__C))[0]; } +#ifdef _ARCH_PWR8 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sign_epi8 (__m128i __A, __m128i __B) @@ -350,7 +352,9 @@ __v16qi __conv = vec_add (__selectneg, __selectpos); return (__m128i) vec_mul ((__v16qi) __A, (__v16qi) __conv); } +#endif +#ifdef _ARCH_PWR8 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sign_epi16 (__m128i __A, __m128i __B) @@ -362,7 +366,9 @@ __v8hi __conv = vec_add (__selectneg, __selectpos); return (__m128i) vec_mul ((__v8hi) __A, (__v8hi) __conv); } +#endif +#ifdef _ARCH_PWR8 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sign_epi32 (__m128i __A, __m128i __B) @@ -374,7 +380,9 @@ __v4si __conv = vec_add (__selectneg, __selectpos); return (__m128i) vec_mul ((__v4si) __A, (__v4si) __conv); } +#endif +#ifdef _ARCH_PWR8 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sign_pi8 (__m64 __A, __m64 __B) @@ -385,7 +393,9 @@ __C = (__v16qi) _mm_sign_epi8 ((__m128i) __C, (__m128i) __D); return (__m64) ((__v2du) (__C))[0]; } +#endif +#ifdef _ARCH_PWR8 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sign_pi16 (__m64 __A, __m64 __B) @@ -396,7 +406,9 @@ __C = (__v8hi) _mm_sign_epi16 ((__m128i) __C, (__m128i) __D); return (__m64) ((__v2du) (__C))[0]; } +#endif +#ifdef _ARCH_PWR8 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sign_pi32 (__m64 __A, __m64 __B) @@ -407,6 +419,7 @@ __C = (__v4si) _mm_sign_epi32 ((__m128i) __C, (__m128i) __D); return (__m64) ((__v2du) (__C))[0]; } +#endif extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) diff --git a/clang/lib/Headers/ppc_wrappers/x86gprintrin.h b/clang/lib/Headers/ppc_wrappers/x86gprintrin.h new file mode 100644 --- /dev/null +++ b/clang/lib/Headers/ppc_wrappers/x86gprintrin.h @@ -0,0 +1,17 @@ +/*===--- x86gprintrin.h - Implementation of X86 GPR intrinsics on PowerPC --=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef X86GPRINTRIN_H_ +#define X86GPRINTRIN_H_ + +#include + +#include + +#endif /* X86GPRINTRIN_H_ */ diff --git a/clang/lib/Headers/ppc_wrappers/x86intrin.h b/clang/lib/Headers/ppc_wrappers/x86intrin.h new file mode 100644 --- /dev/null +++ b/clang/lib/Headers/ppc_wrappers/x86intrin.h @@ -0,0 +1,28 @@ +/*===---- x86intrin.h - Implementation of X86 intrinsics on PowerPC --------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef NO_WARN_X86_INTRINSICS +/* This header is distributed to simplify porting x86_64 code that + makes explicit use of Intel intrinsics to powerpc64le. + It is the user's responsibility to determine if the results are + acceptable and make additional changes as necessary. + Note that much code that uses Intel intrinsics can be rewritten in + standard C or GNU C extensions, which are more portable and better + optimized across multiple targets. */ +#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error." +#endif + +#ifndef X86INTRIN_H_ +#define X86INTRIN_H_ + +#ifdef __ALTIVEC__ +#include +#endif /* __ALTIVEC__ */ + +#endif /* X86INTRIN_H_ */ diff --git a/clang/lib/Headers/ppc_wrappers/xmmintrin.h b/clang/lib/Headers/ppc_wrappers/xmmintrin.h --- a/clang/lib/Headers/ppc_wrappers/xmmintrin.h +++ b/clang/lib/Headers/ppc_wrappers/xmmintrin.h @@ -31,10 +31,8 @@ #error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error." #endif -#ifndef _XMMINTRIN_H_INCLUDED -#define _XMMINTRIN_H_INCLUDED - -#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) +#ifndef XMMINTRIN_H_ +#define XMMINTRIN_H_ /* Define four value permute mask */ #define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z)) @@ -52,6 +50,8 @@ #undef bool #endif +#include + /* We need type definitions from the MMX header file. */ #include @@ -62,13 +62,14 @@ /* The Intel API is flexible enough that we must allow aliasing with other vector types, and their scalar components. */ -typedef vector float __m128 __attribute__((__may_alias__)); +typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__)); /* Unaligned version of the same type. */ -typedef vector float __m128_u __attribute__((__may_alias__, __aligned__(1))); +typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__, + __aligned__ (1))); /* Internal data types for implementing the intrinsics. */ -typedef vector float __v4sf; +typedef float __v4sf __attribute__ ((__vector_size__ (16))); /* Create an undefined vector. */ extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -89,6 +90,7 @@ extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_load_ps (float const *__P) { + assert(((unsigned long)__P & 0xfUL) == 0UL); return ((__m128)vec_ld(0, (__v4sf*)__P)); } @@ -145,6 +147,7 @@ extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_store_ps (float *__P, __m128 __A) { + assert(((unsigned long)__P & 0xfUL) == 0UL); vec_st((__v4sf)__A, 0, (__v4sf*)__P); } @@ -881,7 +884,7 @@ extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtss_si32 (__m128 __A) { - __m64 res = 0; + int res; #ifdef _ARCH_PWR8 double dtmp; __asm__( @@ -914,8 +917,8 @@ extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtss_si64 (__m128 __A) { - __m64 res = 0; -#ifdef _ARCH_PWR8 + long long res; +#if defined (_ARCH_PWR8) && defined (__powerpc64__) double dtmp; __asm__( #ifdef __LITTLE_ENDIAN__ @@ -1328,6 +1331,9 @@ extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_movemask_ps (__m128 __A) { +#ifdef _ARCH_PWR10 + return vec_extractm ((__vector unsigned int) __A); +#else __vector unsigned long long result; static const __vector unsigned int perm_mask = { @@ -1347,6 +1353,7 @@ #else return result[0]; #endif +#endif /* !_ARCH_PWR10 */ } #endif /* _ARCH_PWR8 */ @@ -1553,6 +1560,7 @@ extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_movemask_pi8 (__m64 __A) { +#ifdef __powerpc64__ unsigned long long p = #ifdef __LITTLE_ENDIAN__ 0x0008101820283038UL; // permute control for sign bits @@ -1560,6 +1568,18 @@ 0x3830282018100800UL; // permute control for sign bits #endif return __builtin_bpermd (p, __A); +#else +#ifdef __LITTLE_ENDIAN__ + unsigned int mask = 0x20283038UL; + unsigned int r1 = __builtin_bpermd (mask, __A) & 0xf; + unsigned int r2 = __builtin_bpermd (mask, __A >> 32) & 0xf; +#else + unsigned int mask = 0x38302820UL; + unsigned int r1 = __builtin_bpermd (mask, __A >> 32) & 0xf; + unsigned int r2 = __builtin_bpermd (mask, __A) & 0xf; +#endif + return (r2 << 4) | r1; +#endif } extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1841,4 +1861,4 @@ #endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \ */ -#endif /* _XMMINTRIN_H_INCLUDED */ +#endif /* XMMINTRIN_H_ */