Index: CMakeLists.txt =================================================================== --- CMakeLists.txt +++ CMakeLists.txt @@ -1,3 +1,4 @@ cmake_minimum_required(VERSION 3.1) +add_subdirectory(sleef) add_subdirectory(streamexecutor) Index: sleef/CMakeLists.txt =================================================================== --- /dev/null +++ sleef/CMakeLists.txt @@ -0,0 +1,74 @@ +cmake_minimum_required(VERSION 3.1) + +option(SLEEF_UNIT_TESTS "enable unit tests" OFF) + +# First find includes relative to the SLEEF top-level source path. +include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/include) + +# If we are not building as part of LLVM, build SLEEF as a standalone +# project using LLVM as an external library: +string( + COMPARE + EQUAL + "${CMAKE_SOURCE_DIR}" + "${CMAKE_CURRENT_SOURCE_DIR}" + SLEEF_STANDALONE) + +if(SLEEF_STANDALONE) + project(SLEEF) + + find_package(LLVM REQUIRED CONFIG) + message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}") + message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}") + + include_directories(${LLVM_INCLUDE_DIRS}) + add_definitions(${LLVM_DEFINITIONS}) + + # Get the LLVM cxxflags by using llvm-config. + # + # This is necessary to get -fno-rtti if LLVM is compiled that way. + execute_process( + COMMAND + "${LLVM_BINARY_DIR}/bin/llvm-config" + --cxxflags + OUTPUT_VARIABLE + LLVM_CXXFLAGS + OUTPUT_STRIP_TRAILING_WHITESPACE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${LLVM_CXXFLAGS}") + + set(LLVM_CMAKE_PATH "${LLVM_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}/cmake/llvm") + list(APPEND CMAKE_MODULE_PATH "${LLVM_CMAKE_PATH}") + include(AddLLVM) + + if(SLEEF_UNIT_TESTS) + enable_testing() + find_package(GTest REQUIRED) + include_directories(${GTEST_INCLUDE_DIRS}) + find_package(Threads REQUIRED) + endif() +else(NOT SLEEF_STANDALONE) + if(SLEEF_UNIT_TESTS) + include_directories( + "${LLVM_MAIN_SRC_DIR}/utils/unittest/googletest/include") + endif() +endif(SLEEF_STANDALONE) + +# Find the libraries that correspond to the LLVM components +# that we wish to use +llvm_map_components_to_libnames(llvm_libs support symbolize) + +# Insist on C++ 11 features. +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +# Add warning flags. +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wno-unused-parameter") + +add_subdirectory(lib) + +if(SLEEF_UNIT_TESTS) + add_subdirectory(unittests) +endif() + +install(DIRECTORY include/ DESTINATION include) + Index: sleef/include/__sleef.def =================================================================== --- /dev/null +++ sleef/include/__sleef.def @@ -0,0 +1,119 @@ +/*===---------- __sleef.def - SLEEF functions ------------------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef _SLEEF_H_INCLUDED +#error "Never use this file directly; include <sleef.h> instead." +#endif + +#ifdef _SLEEF_SP +typedef struct { + _SLEEF_N(vfloat) __x, __y; +} _SLEEF_N(vfloat2); + +_SLEEF_N(vfloat) _SLEEF_N(xldexpf)(_SLEEF_N(vfloat) __x, _SLEEF_N(vint2) __q) __attribute__((__const__)); + +_SLEEF_N(vfloat) _SLEEF_N(xsinf)(_SLEEF_N(vfloat) __d) __attribute__((__const__)); +_SLEEF_N(vfloat) _SLEEF_N(xcosf)(_SLEEF_N(vfloat) __d) __attribute__((__const__)); +_SLEEF_N(vfloat2) _SLEEF_N(xsincosf)(_SLEEF_N(vfloat) __d) __attribute__((__const__)); +_SLEEF_N(vfloat) _SLEEF_N(xtanf)(_SLEEF_N(vfloat) __d) __attribute__((__const__)); +_SLEEF_N(vfloat) _SLEEF_N(xasinf)(_SLEEF_N(vfloat) __s) __attribute__((__const__)); +_SLEEF_N(vfloat) _SLEEF_N(xacosf)(_SLEEF_N(vfloat) __s) __attribute__((__const__)); +_SLEEF_N(vfloat) _SLEEF_N(xatanf)(_SLEEF_N(vfloat) __s) __attribute__((__const__)); +_SLEEF_N(vfloat) _SLEEF_N(xatan2f)(_SLEEF_N(vfloat) __y, _SLEEF_N(vfloat) __x) __attribute__((__const__)); +_SLEEF_N(vfloat) _SLEEF_N(xlogf)(_SLEEF_N(vfloat) __d) __attribute__((__const__)); +_SLEEF_N(vfloat) _SLEEF_N(xexpf)(_SLEEF_N(vfloat) __d) __attribute__((__const__)); +_SLEEF_N(vfloat) _SLEEF_N(xcbrtf)(_SLEEF_N(vfloat) __s) __attribute__((__const__)); +_SLEEF_N(vfloat) _SLEEF_N(xsqrtf)(_SLEEF_N(vfloat) __s) __attribute__((__const__)); + +_SLEEF_N(vfloat) _SLEEF_N(xpowf)(_SLEEF_N(vfloat) __x, _SLEEF_N(vfloat) __y) __attribute__((__const__)); +_SLEEF_N(vfloat) _SLEEF_N(xsinhf)(_SLEEF_N(vfloat) __x) __attribute__((__const__)); +_SLEEF_N(vfloat) _SLEEF_N(xcoshf)(_SLEEF_N(vfloat) __x) __attribute__((__const__)); +_SLEEF_N(vfloat) _SLEEF_N(xtanhf)(_SLEEF_N(vfloat) __x) __attribute__((__const__)); +_SLEEF_N(vfloat) _SLEEF_N(xasinhf)(_SLEEF_N(vfloat) __x) __attribute__((__const__)); +_SLEEF_N(vfloat) _SLEEF_N(xacoshf)(_SLEEF_N(vfloat) __x) __attribute__((__const__)); +_SLEEF_N(vfloat) _SLEEF_N(xatanhf)(_SLEEF_N(vfloat) __x) __attribute__((__const__)); +_SLEEF_N(vfloat) _SLEEF_N(xexp2f)(_SLEEF_N(vfloat) __a) __attribute__((__const__)); +_SLEEF_N(vfloat) _SLEEF_N(xexp10f)(_SLEEF_N(vfloat) __a) __attribute__((__const__)); +_SLEEF_N(vfloat) _SLEEF_N(xexpm1f)(_SLEEF_N(vfloat) __a) __attribute__((__const__)); +_SLEEF_N(vfloat) _SLEEF_N(xlog10f)(_SLEEF_N(vfloat) __a) __attribute__((__const__)); +_SLEEF_N(vfloat) _SLEEF_N(xlog1pf)(_SLEEF_N(vfloat) __a) __attribute__((__const__)); + +_SLEEF_N(vfloat) _SLEEF_N(xsinf_u1)(_SLEEF_N(vfloat) __d) __attribute__((__const__)); +_SLEEF_N(vfloat) _SLEEF_N(xcosf_u1)(_SLEEF_N(vfloat) __d) __attribute__((__const__)); +_SLEEF_N(vfloat2) _SLEEF_N(xsincosf_u1)(_SLEEF_N(vfloat) __d) __attribute__((__const__)); +_SLEEF_N(vfloat) _SLEEF_N(xtanf_u1)(_SLEEF_N(vfloat) __d) __attribute__((__const__)); +_SLEEF_N(vfloat) _SLEEF_N(xasinf_u1)(_SLEEF_N(vfloat) __s) __attribute__((__const__)); +_SLEEF_N(vfloat) _SLEEF_N(xacosf_u1)(_SLEEF_N(vfloat) __s) __attribute__((__const__)); +_SLEEF_N(vfloat) _SLEEF_N(xatanf_u1)(_SLEEF_N(vfloat) __s) __attribute__((__const__)); +_SLEEF_N(vfloat) _SLEEF_N(xatan2f_u1)(_SLEEF_N(vfloat) __y, _SLEEF_N(vfloat) __x) __attribute__((__const__)); +_SLEEF_N(vfloat) _SLEEF_N(xlogf_u1)(_SLEEF_N(vfloat) __d) 
__attribute__((__const__)); +_SLEEF_N(vfloat) _SLEEF_N(xcbrtf_u1)(_SLEEF_N(vfloat) __s) __attribute__((__const__)); +#endif + +#ifdef _SLEEF_DP +typedef struct { + _SLEEF_N(vdouble) __x, __y; +} _SLEEF_N(vdouble2); + +_SLEEF_N(vdouble) _SLEEF_N(xldexp)(_SLEEF_N(vdouble) __x, _SLEEF_N(vint) __q) __attribute__((__const__)); +_SLEEF_N(vint) _SLEEF_N(xilogb)(_SLEEF_N(vdouble) __d) __attribute__((__const__)); + +_SLEEF_N(vdouble) _SLEEF_N(xsin)(_SLEEF_N(vdouble) __d) __attribute__((__const__)); +_SLEEF_N(vdouble) _SLEEF_N(xcos)(_SLEEF_N(vdouble) __d) __attribute__((__const__)); +_SLEEF_N(vdouble2) _SLEEF_N(xsincos)(_SLEEF_N(vdouble) __d) __attribute__((__const__)); +_SLEEF_N(vdouble) _SLEEF_N(xtan)(_SLEEF_N(vdouble) __d) __attribute__((__const__)); +_SLEEF_N(vdouble) _SLEEF_N(xasin)(_SLEEF_N(vdouble) __s) __attribute__((__const__)); +_SLEEF_N(vdouble) _SLEEF_N(xacos)(_SLEEF_N(vdouble) __s) __attribute__((__const__)); +_SLEEF_N(vdouble) _SLEEF_N(xatan)(_SLEEF_N(vdouble) __s) __attribute__((__const__)); +_SLEEF_N(vdouble) _SLEEF_N(xatan2)(_SLEEF_N(vdouble) __y, _SLEEF_N(vdouble) __x) __attribute__((__const__)); +_SLEEF_N(vdouble) _SLEEF_N(xlog)(_SLEEF_N(vdouble) __d) __attribute__((__const__)); +_SLEEF_N(vdouble) _SLEEF_N(xexp)(_SLEEF_N(vdouble) __d) __attribute__((__const__)); +_SLEEF_N(vdouble) _SLEEF_N(xpow)(_SLEEF_N(vdouble) __x, _SLEEF_N(vdouble) __y) __attribute__((__const__)); + +_SLEEF_N(vdouble) _SLEEF_N(xsinh)(_SLEEF_N(vdouble) __d) __attribute__((__const__)); +_SLEEF_N(vdouble) _SLEEF_N(xcosh)(_SLEEF_N(vdouble) __d) __attribute__((__const__)); +_SLEEF_N(vdouble) _SLEEF_N(xtanh)(_SLEEF_N(vdouble) __d) __attribute__((__const__)); +_SLEEF_N(vdouble) _SLEEF_N(xasinh)(_SLEEF_N(vdouble) __s) __attribute__((__const__)); +_SLEEF_N(vdouble) _SLEEF_N(xacosh)(_SLEEF_N(vdouble) __s) __attribute__((__const__)); +_SLEEF_N(vdouble) _SLEEF_N(xatanh)(_SLEEF_N(vdouble) __s) __attribute__((__const__)); + +_SLEEF_N(vdouble) _SLEEF_N(xcbrt)(_SLEEF_N(vdouble) __d) __attribute__((__const__)); + +_SLEEF_N(vdouble) _SLEEF_N(xexp2)(_SLEEF_N(vdouble) __a) __attribute__((__const__)); +_SLEEF_N(vdouble) _SLEEF_N(xexp10)(_SLEEF_N(vdouble) __a) __attribute__((__const__)); +_SLEEF_N(vdouble) _SLEEF_N(xexpm1)(_SLEEF_N(vdouble) __a) __attribute__((__const__)); +_SLEEF_N(vdouble) _SLEEF_N(xlog10)(_SLEEF_N(vdouble) __a) __attribute__((__const__)); +_SLEEF_N(vdouble) _SLEEF_N(xlog1p)(_SLEEF_N(vdouble) __a) __attribute__((__const__)); + +_SLEEF_N(vdouble) _SLEEF_N(xsin_u1)(_SLEEF_N(vdouble) __d) __attribute__((__const__)); +_SLEEF_N(vdouble) _SLEEF_N(xcos_u1)(_SLEEF_N(vdouble) __d) __attribute__((__const__)); +_SLEEF_N(vdouble2) _SLEEF_N(xsincos_u1)(_SLEEF_N(vdouble) __d) __attribute__((__const__)); +_SLEEF_N(vdouble) _SLEEF_N(xtan_u1)(_SLEEF_N(vdouble) __d) __attribute__((__const__)); +_SLEEF_N(vdouble) _SLEEF_N(xasin_u1)(_SLEEF_N(vdouble) __s) __attribute__((__const__)); +_SLEEF_N(vdouble) _SLEEF_N(xacos_u1)(_SLEEF_N(vdouble) __s) __attribute__((__const__)); +_SLEEF_N(vdouble) _SLEEF_N(xatan_u1)(_SLEEF_N(vdouble) __s) __attribute__((__const__)); +_SLEEF_N(vdouble) _SLEEF_N(xatan2_u1)(_SLEEF_N(vdouble) __y, _SLEEF_N(vdouble) __x) __attribute__((__const__)); +_SLEEF_N(vdouble) _SLEEF_N(xlog_u1)(_SLEEF_N(vdouble) __d) __attribute__((__const__)); +_SLEEF_N(vdouble) _SLEEF_N(xcbrt_u1)(_SLEEF_N(vdouble) __d) __attribute__((__const__)); +#endif + Index: sleef/include/sleef.h =================================================================== --- /dev/null +++ sleef/include/sleef.h @@ -0,0 +1,148 @@ +/*===---------- sleef.h - 
SLEEF functions ----------------------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef _SLEEF_H_INCLUDED +#define _SLEEF_H_INCLUDED + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define _SLEEF_N(f) __ ## f + +typedef double _SLEEF_N(vdouble); +typedef int _SLEEF_N(vint); + +typedef float _SLEEF_N(vfloat); +typedef int _SLEEF_N(vint2); + +#define _SLEEF_SP +#define _SLEEF_DP + +#include <__sleef.def> + +#undef _SLEEF_SP +#undef _SLEEF_DP + +#undef _SLEEF_N + +#ifdef __SSE2__ +#include <emmintrin.h> + +#define _SLEEF_N(f) __ ## f ## __sse2 + +typedef __m128d _SLEEF_N(vdouble); +typedef __m128i _SLEEF_N(vint); + +typedef __m128 _SLEEF_N(vfloat); +typedef __m128i _SLEEF_N(vint2); + +#define _SLEEF_SP +#define _SLEEF_DP + +#include <__sleef.def> + +#undef _SLEEF_SP +#undef _SLEEF_DP + +#undef _SLEEF_N + +#endif /* __SSE2__ */ + +#if defined(__AVX__) || defined(__FMA4__) +#include <immintrin.h> + +#define _SLEEF_N(f) __ ## f ## __avx + +typedef __m256d _SLEEF_N(vdouble); +typedef __m128i _SLEEF_N(vint); + +typedef __m256 _SLEEF_N(vfloat); +typedef struct { + _SLEEF_N(vint) x, y; +} _SLEEF_N(vint2); + +#define _SLEEF_SP +#define _SLEEF_DP + +#include <__sleef.def> + +#undef _SLEEF_SP +#undef _SLEEF_DP + +#undef _SLEEF_N + +#endif /* __AVX__ or __FMA4__ */ + +#ifdef __AVX2__ +#include <immintrin.h> + +#define _SLEEF_N(f) __ ## f ## __avx2 + +typedef __m256d _SLEEF_N(vdouble); +typedef __m128i _SLEEF_N(vint); + +typedef __m256 _SLEEF_N(vfloat); +typedef __m256i _SLEEF_N(vint2); + +#define _SLEEF_SP +#define _SLEEF_DP + +#include <__sleef.def> + +#undef _SLEEF_SP +#undef _SLEEF_DP + +#undef _SLEEF_N + +#endif /* __AVX2__ */ + +#ifdef __ARM_NEON +#include <arm_neon.h> + +#define _SLEEF_N(f) __ ## f ## __neon + +typedef int32x4_t _SLEEF_N(vint); +typedef uint32x4_t _SLEEF_N(vmask); + +typedef float32x4_t _SLEEF_N(vfloat); +typedef int32x4_t _SLEEF_N(vint2); + +#define _SLEEF_SP + +#include <__sleef.def> + +#undef _SLEEF_SP + +#undef _SLEEF_N + +#endif /* __ARM_NEON */ + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* _SLEEF_H_INCLUDED */ + Index: sleef/lib/CMakeLists.txt =================================================================== --- /dev/null +++ sleef/lib/CMakeLists.txt @@ -0,0 +1,89 @@ +macro(add_sleef_library name) + add_llvm_library(${name} ${ARGN}) + set_target_properties(${name} PROPERTIES FOLDER "SLEEF library") +endmacro(add_sleef_library) + +set(SLEEF_SOURCES + dp-scalar.cpp + 
sp-scalar.cpp +) + +set(SLEEF_SSE2_SOURCES + dp-sse2.cpp + sp-sse2.cpp +) + +set(SLEEF_AVX_SOURCES + dp-avx.cpp + sp-avx.cpp +) + +set(SLEEF_AVX2_SOURCES + dp-avx2.cpp + sp-avx2.cpp +) + +set(SLEEF_NEON_SOURCES + sp-neon.cpp +) + +check_symbol_exists(__SSE2__ "" __SSE2) +check_symbol_exists(__AVX__ "" __AVX) +check_symbol_exists(__AVX2__ "" __AVX2) +check_symbol_exists(__FMA__ "" __FMA3) +check_symbol_exists(__ARM_NEON "" __NEON) + +# Silence the warning from llvm_check_source_file_list +list(APPEND LLVM_OPTIONAL_SOURCES + sp.cpp dp.cpp + ${SLEEF_SSE2_SOURCES} + ${SLEEF_AVX_SOURCES} + ${SLEEF_AVX2_SOURCES} + ${SLEEF_NEON_SOURCES} +) + +message(STATUS "SLEEF architecture: ${CMAKE_SYSTEM_PROCESSOR}") + +if(__SSE2) + list(APPEND SLEEF_SOURCES ${SLEEF_SSE2_SOURCES}) +endif() + +if(__AVX) + list(APPEND SLEEF_SOURCES ${SLEEF_AVX_SOURCES}) +else() + check_cxx_compiler_flag(-mavx __AVX_FLAG) + if (__AVX_FLAG) + set_source_files_properties(${SLEEF_AVX_SOURCES} PROPERTIES COMPILE_FLAGS -mavx) + list(APPEND SLEEF_SOURCES ${SLEEF_AVX_SOURCES}) + endif() +endif() + +if(__AVX2 AND __FMA3) + list(APPEND SLEEF_SOURCES ${SLEEF_AVX2_SOURCES}) +else() + check_cxx_compiler_flag("-mavx2 -mfma" __AVX2_FLAG) + if (__AVX2_FLAG) + set_source_files_properties(${SLEEF_AVX2_SOURCES} PROPERTIES COMPILE_FLAGS "-mavx2 -mfma") + list(APPEND SLEEF_SOURCES ${SLEEF_AVX2_SOURCES}) + endif() +endif() + +if(__NEON) + list(APPEND SLEEF_SOURCES ${SLEEF_NEON_SOURCES}) +else() + check_cxx_compiler_flag(-mfpu=neon __NEON_FLAG) + if (__NEON_FLAG) + set_source_files_properties(${SLEEF_NEON_SOURCES} PROPERTIES COMPILE_FLAGS -mfpu=neon) + list(APPEND SLEEF_SOURCES ${SLEEF_NEON_SOURCES}) + endif() +endif() + +add_sleef_library( + sleef + ${SLEEF_SOURCES} + + LINK_LIBS + utils + ) + +install(TARGETS sleef DESTINATION lib) Index: sleef/lib/avx.h =================================================================== --- /dev/null +++ sleef/lib/avx.h @@ -0,0 +1,305 @@ +/*===---------- avx.h - AVX functions --------------------------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __AVX__ +#error Please specify -mavx. 
+#endif + +#include <immintrin.h> + +typedef __m256d vdouble; +typedef __m128i vint; +typedef __m256i vmask; + +typedef __m256 vfloat; +typedef struct { vint x, y; } vint2; + +// + +static inline vint vrint_vi_vd(vdouble vd) { return _mm256_cvtpd_epi32(vd); } +static inline vint vtruncate_vi_vd(vdouble vd) { return _mm256_cvttpd_epi32(vd); } +static inline vdouble vcast_vd_vi(vint vi) { return _mm256_cvtepi32_pd(vi); } +static inline vdouble vcast_vd_d(double d) { return _mm256_set_pd(d, d, d, d); } +static inline vint vcast_vi_i(int i) { return _mm_set_epi32(i, i, i, i); } + +static inline vmask vreinterpret_vm_vd(vdouble vd) { return (__m256i)vd; } +static inline vdouble vreinterpret_vd_vm(vmask vm) { return (__m256d)vm; } + +static inline vmask vreinterpret_vm_vf(vfloat vf) { return (__m256i)vf; } +static inline vfloat vreinterpret_vf_vm(vmask vm) { return (__m256)vm; } + +// + +static inline vint vadd_vi_vi_vi(vint x, vint y) { return _mm_add_epi32(x, y); } +static inline vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(x, y); } +static inline vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); } + +static inline vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(x, y); } +static inline vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_si128(x, y); } +static inline vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x, y); } +static inline vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(x, y); } + +static inline vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32(x, c); } +static inline vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32(x, c); } +static inline vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32(x, c); } + +// + +static inline vmask vand_vm_vm_vm(vmask x, vmask y) { return (vmask)_mm256_and_pd((__m256d)x, (__m256d)y); } +static inline vmask vandnot_vm_vm_vm(vmask x, vmask y) { return (vmask)_mm256_andnot_pd((__m256d)x, (__m256d)y); } +static inline vmask vor_vm_vm_vm(vmask x, vmask y) { return (vmask)_mm256_or_pd((__m256d)x, (__m256d)y); } +static inline vmask vxor_vm_vm_vm(vmask x, vmask y) { return (vmask)_mm256_xor_pd((__m256d)x, (__m256d)y); } + +static inline vmask veq_vm_vd_vd(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_EQ_OQ); } +static inline vmask vneq_vm_vd_vd(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_NEQ_UQ); } +static inline vmask vlt_vm_vd_vd(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_LT_OQ); } +static inline vmask vle_vm_vd_vd(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_LE_OQ); } +static inline vmask vgt_vm_vd_vd(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_GT_OQ); } +static inline vmask vge_vm_vd_vd(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_GE_OQ); } + +static inline vmask veq_vm_vf_vf(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_EQ_OQ); } +static inline vmask vneq_vm_vf_vf(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_NEQ_UQ); } +static inline vmask vlt_vm_vf_vf(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_LT_OQ); } +static inline vmask vle_vm_vf_vf(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_LE_OQ); } +static inline vmask vgt_vm_vf_vf(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_GT_OQ); } +static inline vmask vge_vm_vf_vf(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_GE_OQ); } + +// + +static inline vfloat vcast_vf_f(float f) { return _mm256_set_ps(f, f, f, f, f, f, 
f, f); } + +static inline vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_add_ps(x, y); } +static inline vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm256_sub_ps(x, y); } +static inline vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm256_mul_ps(x, y); } +static inline vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm256_div_ps(x, y); } +static inline vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); } +static inline vfloat vsqrt_vf_vf(vfloat x) { return _mm256_sqrt_ps(x); } +static inline vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm256_max_ps(x, y); } +static inline vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm256_min_ps(x, y); } + +static inline vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); } +static inline vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); } +static inline vfloat vabs_vf_vf(vfloat f) { return (vfloat)vandnot_vm_vm_vm((vmask)vcast_vf_f(-0.0f), (vmask)f); } +static inline vfloat vneg_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)vcast_vf_f(-0.0f), (vmask)d); } + +// + +static inline vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_add_pd(x, y); } +static inline vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm256_sub_pd(x, y); } +static inline vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm256_mul_pd(x, y); } +static inline vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm256_div_pd(x, y); } +static inline vdouble vrec_vd_vd(vdouble x) { return _mm256_div_pd(_mm256_set_pd(1, 1, 1, 1), x); } +static inline vdouble vsqrt_vd_vd(vdouble x) { return _mm256_sqrt_pd(x); } +static inline vdouble vabs_vd_vd(vdouble d) { return (__m256d)_mm256_andnot_pd(_mm256_set_pd(-0.0,-0.0,-0.0,-0.0), d); } +static inline vdouble vneg_vd_vd(vdouble d) { return (__m256d)_mm256_xor_pd(_mm256_set_pd(-0.0,-0.0,-0.0,-0.0), d); } +static inline vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); } +static inline vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); } + +static inline vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm256_max_pd(x, y); } +static inline vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm256_min_pd(x, y); } + + +// + +static inline vmask veq_vm_vi_vi(vint x, vint y) { + __m256d r = _mm256_cvtepi32_pd(_mm_and_si128(_mm_cmpeq_epi32(x, y), _mm_set_epi32(1, 1, 1, 1))); + return veq_vm_vd_vd(r, _mm256_set_pd(1, 1, 1, 1)); +} + +static inline vdouble vsel_vd_vm_vd_vd(vmask mask, vdouble x, vdouble y) { + return (__m256d)vor_vm_vm_vm(vand_vm_vm_vm(mask, (__m256i)x), vandnot_vm_vm_vm(mask, (__m256i)y)); +} + +static inline vfloat vsel_vf_vm_vf_vf(vmask mask, vfloat x, vfloat y) { + return (vfloat)vor_vm_vm_vm(vand_vm_vm_vm(mask, (vmask)x), vandnot_vm_vm_vm(mask, (vmask)y)); +} + +static inline vint vsel_vi_vd_vd_vi_vi(vdouble d0, vdouble d1, vint x, vint y) { + __m128i mask = _mm256_cvtpd_epi32(_mm256_and_pd(_mm256_cmp_pd(d0, d1, _CMP_LT_OQ), _mm256_set_pd(1.0, 1.0, 1.0, 1.0))); + mask = _mm_cmpeq_epi32(mask, _mm_set_epi32(1, 1, 1, 1)); + return vor_vi_vi_vi(vand_vi_vi_vi(mask, x), vandnot_vi_vi_vi(mask, y)); +} + +// + +static inline vint2 vcast_vi2_vm(vmask vm) { + vint2 r; + r.x = _mm256_castsi256_si128(vm); + r.y = _mm256_extractf128_si256(vm, 1); + return r; +} + +static inline vmask vcast_vm_vi2(vint2 vi) { + vmask m = 
_mm256_castsi128_si256(vi.x); + m = _mm256_insertf128_si256(m, vi.y, 1); + return m; +} + +static inline vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm((vmask)_mm256_cvtps_epi32(vf)); } +static inline vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm((vmask)_mm256_cvttps_epi32(vf)); } +static inline vfloat vcast_vf_vi2(vint2 vi) { return _mm256_cvtepi32_ps((vmask)vcast_vm_vi2(vi)); } +static inline vint2 vcast_vi2_i(int i) { vint2 r; r.x = r.y = vcast_vi_i(i); return r; } + +static inline vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 r; r.x = vadd_vi_vi_vi(x.x, y.x); r.y = vadd_vi_vi_vi(x.y, y.y); return r; } +static inline vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 r; r.x = vsub_vi_vi_vi(x.x, y.x); r.y = vsub_vi_vi_vi(x.y, y.y); return r; } +static inline vint2 vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcast_vi2_i(0), e); } + +static inline vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 r; r.x = vand_vi_vi_vi(x.x, y.x); r.y = vand_vi_vi_vi(x.y, y.y); return r; } +static inline vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 r; r.x = vandnot_vi_vi_vi(x.x, y.x); r.y = vandnot_vi_vi_vi(x.y, y.y); return r; } +static inline vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 r; r.x = vor_vi_vi_vi(x.x, y.x); r.y = vor_vi_vi_vi(x.y, y.y); return r; } +static inline vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 r; r.x = vxor_vi_vi_vi(x.x, y.x); r.y = vxor_vi_vi_vi(x.y, y.y); return r; } + +static inline vint2 vsll_vi2_vi2_i(vint2 x, int c) { vint2 r; r.x = vsll_vi_vi_i(x.x, c); r.y = vsll_vi_vi_i(x.y, c); return r; } +static inline vint2 vsrl_vi2_vi2_i(vint2 x, int c) { vint2 r; r.x = vsrl_vi_vi_i(x.x, c); r.y = vsrl_vi_vi_i(x.y, c); return r; } +static inline vint2 vsra_vi2_vi2_i(vint2 x, int c) { vint2 r; r.x = vsra_vi_vi_i(x.x, c); r.y = vsra_vi_vi_i(x.y, c); return r; } + +static inline vmask veq_vm_vi2_vi2(vint2 x, vint2 y) { + vint2 r; + r.x = _mm_cmpeq_epi32(x.x, y.x); + r.y = _mm_cmpeq_epi32(x.y, y.y); + return vcast_vm_vi2(r); +} + +static inline vmask vgt_vm_vi2_vi2(vint2 x, vint2 y) { + vint2 r; + r.x = _mm_cmpgt_epi32(x.x, y.x); + r.y = _mm_cmpgt_epi32(x.y, y.y); + return vcast_vm_vi2(r); +} + +static inline vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { + vint2 r; + r.x = _mm_cmpgt_epi32(x.x, y.x); + r.y = _mm_cmpgt_epi32(x.y, y.y); + return r; +} + +static inline vint2 vsel_vi2_vm_vi2_vi2(vmask m, vint2 x, vint2 y) { + vint2 r, m2 = vcast_vi2_vm(m); + r.x = vor_vi_vi_vi(vand_vi_vi_vi(m2.x, x.x), vandnot_vi_vi_vi(m2.x, y.x)); + r.y = vor_vi_vi_vi(vand_vi_vi_vi(m2.y, x.y), vandnot_vi_vi_vi(m2.y, y.y)); + return r; +} + +// + +static inline double vcast_d_vd(vdouble v) { + double s[4]; + _mm256_storeu_pd(s, v); + return s[0]; +} + +static inline float vcast_f_vf(vfloat v) { + float s[8]; + _mm256_storeu_ps(s, v); + return s[0]; +} + +static inline vmask vsignbit_vm_vd(vdouble d) { + return (vmask)_mm256_and_pd(d, _mm256_set_pd(-0.0,-0.0,-0.0,-0.0)); +} + +static inline vdouble vsign_vd_vd(vdouble d) { + return _mm256_or_pd(_mm256_set_pd(1.0, 1.0, 1.0, 1.0), (vdouble)vsignbit_vm_vd(d)); +} + +static inline vdouble vmulsign_vd_vd_vd(vdouble x, vdouble y) { + return (__m256d)vxor_vm_vm_vm((__m256i)x, vsignbit_vm_vd(y)); +} + +static inline vmask visinf_vm_vd(vdouble d) { + return (vmask)_mm256_cmp_pd(vabs_vd_vd(d), _mm256_set_pd(INFINITY, INFINITY, INFINITY, INFINITY), _CMP_EQ_OQ); +} + +static inline vmask vispinf_vm_vd(vdouble d) { + return (vmask)_mm256_cmp_pd(d, _mm256_set_pd(INFINITY, INFINITY, INFINITY, INFINITY), _CMP_EQ_OQ); +} + +static inline vmask 
visminf_vm_vd(vdouble d) { + return (vmask)_mm256_cmp_pd(d, _mm256_set_pd(-INFINITY, -INFINITY, -INFINITY, -INFINITY), _CMP_EQ_OQ); +} + +static inline vmask visnan_vm_vd(vdouble d) { + return (vmask)_mm256_cmp_pd(d, d, _CMP_NEQ_UQ); +} + +static inline vdouble visinf(vdouble d) { + return _mm256_and_pd((vdouble)visinf_vm_vd(d), vsign_vd_vd(d)); +} + +static inline vdouble visinf2(vdouble d, vdouble m) { + return _mm256_and_pd((vdouble)visinf_vm_vd(d), _mm256_or_pd((vdouble)vsignbit_vm_vd(d), m)); +} + +static inline vdouble vpow2i_vd_vi(vint q) { + vint r; + vdouble y; + q = _mm_add_epi32(_mm_set_epi32(0x3ff, 0x3ff, 0x3ff, 0x3ff), q); + q = _mm_slli_epi32(q, 20); + r = (__m128i)_mm_shuffle_ps((__m128)q, (__m128)q, _MM_SHUFFLE(1,0,0,0)); + y = _mm256_castpd128_pd256((__m128d)r); + r = (__m128i)_mm_shuffle_ps((__m128)q, (__m128)q, _MM_SHUFFLE(3,2,2,2)); + y = _mm256_insertf128_pd(y, (__m128d)r, 1); + y = _mm256_and_pd(y, (__m256d)_mm256_set_epi32(0xfff00000, 0, 0xfff00000, 0, 0xfff00000, 0, 0xfff00000, 0)); + return y; +} + +static inline vdouble vldexp_vd_vd_vi(vdouble x, vint q) { + vint m = _mm_srai_epi32(q, 31); + m = _mm_slli_epi32(_mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(m, q), 9), m), 7); + q = _mm_sub_epi32(q, _mm_slli_epi32(m, 2)); + m = _mm_add_epi32(_mm_set_epi32(0x3ff, 0x3ff, 0x3ff, 0x3ff), m); + m = _mm_andnot_si128(_mm_cmplt_epi32(m, _mm_set_epi32(0, 0, 0, 0)), m); + vint n = _mm_cmpgt_epi32(m, _mm_set_epi32(0x7ff, 0x7ff, 0x7ff, 0x7ff)); + m = _mm_or_si128(_mm_andnot_si128(n, m), _mm_and_si128(n, _mm_set_epi32(0x7ff, 0x7ff, 0x7ff, 0x7ff))); + m = _mm_slli_epi32(m, 20); + vint r = (__m128i)_mm_shuffle_ps((__m128)m, (__m128)m, _MM_SHUFFLE(1,0,0,0)); + vdouble y = _mm256_castpd128_pd256((__m128d)r); + r = (__m128i)_mm_shuffle_ps((__m128)m, (__m128)m, _MM_SHUFFLE(3,2,2,2)); + y = _mm256_insertf128_pd(y, (__m128d)r, 1); + y = _mm256_and_pd(y, (__m256d)_mm256_set_epi32(0xfff00000, 0, 0xfff00000, 0, 0xfff00000, 0, 0xfff00000, 0)); + return vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(x, y), y), y), y), vpow2i_vd_vi(q)); +} + +static inline vint vilogbp1_vi_vd(vdouble d) { + vint q, r, c; + vmask m = vlt_vm_vd_vd(d, vcast_vd_d(4.9090934652977266E-91)); + d = vsel_vd_vm_vd_vd(m, vmul_vd_vd_vd(vcast_vd_d(2.037035976334486E90), d), d); + c = _mm256_cvtpd_epi32(vsel_vd_vm_vd_vd(m, vcast_vd_d(300+0x3fe), vcast_vd_d(0x3fe))); + q = (__m128i)_mm256_castpd256_pd128(d); + q = (__m128i)_mm_shuffle_ps((__m128)q, _mm_set_ps(0, 0, 0, 0), _MM_SHUFFLE(0,0,3,1)); + r = (__m128i)_mm256_extractf128_pd(d, 1); + r = (__m128i)_mm_shuffle_ps(_mm_set_ps(0, 0, 0, 0), (__m128)r, _MM_SHUFFLE(3,1,0,0)); + q = _mm_or_si128(q, r); + q = _mm_srli_epi32(q, 20); + q = _mm_sub_epi32(q, c); + return q; +} + +static inline vdouble vupper_vd_vd(vdouble d) { + return (__m256d)_mm256_and_pd(d, (vdouble)_mm256_set_epi32(0xffffffff, 0xf8000000, 0xffffffff, 0xf8000000, 0xffffffff, 0xf8000000, 0xffffffff, 0xf8000000)); +} + +static inline vfloat vupper_vf_vf(vfloat d) { + return (vfloat)vand_vm_vm_vm((vmask)d, _mm256_set_epi32(0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000,0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000)); +} Index: sleef/lib/avx2.h =================================================================== --- /dev/null +++ sleef/lib/avx2.h @@ -0,0 +1,276 @@ +/*===---------- avx2.h - AVX2 functions ------------------------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the 
"Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __AVX2__ +#error Please specify -mavx2. +#endif + +#include + +typedef __m256d vdouble; +typedef __m128i vint; +typedef __m256i vmask; + +typedef __m256 vfloat; +typedef __m256i vint2; + +#define ENABLE_FMA_DP +#define ENABLE_FMA_SP + +// + +static inline vint vrint_vi_vd(vdouble vd) { return _mm256_cvtpd_epi32(vd); } +static inline vint vtruncate_vi_vd(vdouble vd) { return _mm256_cvttpd_epi32(vd); } +static inline vdouble vcast_vd_vi(vint vi) { return _mm256_cvtepi32_pd(vi); } +static inline vdouble vcast_vd_d(double d) { return _mm256_set_pd(d, d, d, d); } +static inline vint vcast_vi_i(int i) { return _mm_set_epi32(i, i, i, i); } + +static inline vmask vreinterpret_vm_vd(vdouble vd) { return (__m256i)vd; } +static inline vdouble vreinterpret_vd_vm(vmask vm) { return (__m256d)vm; } + +static inline vmask vreinterpret_vm_vf(vfloat vf) { return (__m256i)vf; } +static inline vfloat vreinterpret_vf_vm(vmask vm) { return (__m256)vm; } + +// + +static inline vint vadd_vi_vi_vi(vint x, vint y) { return _mm_add_epi32(x, y); } +static inline vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(x, y); } +static inline vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); } + +static inline vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(x, y); } +static inline vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_si128(x, y); } +static inline vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x, y); } +static inline vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(x, y); } + +static inline vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32 (x, c); } +static inline vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32 (x, c); } +static inline vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32 (x, c); } + +// + +static inline vmask vand_vm_vm_vm(vmask x, vmask y) { return (vmask)_mm256_and_pd((__m256d)x, (__m256d)y); } +static inline vmask vandnot_vm_vm_vm(vmask x, vmask y) { return (vmask)_mm256_andnot_pd((__m256d)x, (__m256d)y); } +static inline vmask vor_vm_vm_vm(vmask x, vmask y) { return (vmask)_mm256_or_pd((__m256d)x, (__m256d)y); } +static inline vmask vxor_vm_vm_vm(vmask x, vmask y) { return (vmask)_mm256_xor_pd((__m256d)x, (__m256d)y); } + +static inline vmask veq_vm_vd_vd(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_EQ_OQ); } +static inline vmask vneq_vm_vd_vd(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_NEQ_UQ); } +static inline vmask vlt_vm_vd_vd(vdouble x, 
vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_LT_OQ); } +static inline vmask vle_vm_vd_vd(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_LE_OQ); } +static inline vmask vgt_vm_vd_vd(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_GT_OQ); } +static inline vmask vge_vm_vd_vd(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_GE_OQ); } + +static inline vmask veq_vm_vf_vf(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_EQ_OQ); } +static inline vmask vneq_vm_vf_vf(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_NEQ_UQ); } +static inline vmask vlt_vm_vf_vf(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_LT_OQ); } +static inline vmask vle_vm_vf_vf(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_LE_OQ); } +static inline vmask vgt_vm_vf_vf(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_GT_OQ); } +static inline vmask vge_vm_vf_vf(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_GE_OQ); } + +// + +static inline vfloat vcast_vf_f(float f) { return _mm256_set_ps(f, f, f, f, f, f, f, f); } + +static inline vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_add_ps(x, y); } +static inline vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm256_sub_ps(x, y); } +static inline vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm256_mul_ps(x, y); } +static inline vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm256_div_ps(x, y); } +static inline vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); } +static inline vfloat vsqrt_vf_vf(vfloat x) { return _mm256_sqrt_ps(x); } +static inline vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm256_max_ps(x, y); } +static inline vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm256_min_ps(x, y); } + +static inline vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmadd_ps(x, y, z); } +static inline vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmsub_ps(x, y, z); } +static inline vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fnmadd_ps(x, y, z); } +static inline vfloat vabs_vf_vf(vfloat f) { return (vfloat)vandnot_vm_vm_vm((vmask)vcast_vf_f(-0.0f), (vmask)f); } +static inline vfloat vneg_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)vcast_vf_f(-0.0f), (vmask)d); } + +static inline vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmadd_ps(x, y, z); } +static inline vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmadd_ps(x, y, z); } +static inline vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmsub_ps(x, y, z); } +static inline vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fnmadd_ps(x, y, z); } +static inline vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fnmsub_ps(x, y, z); } + +// + +static inline vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_add_pd(x, y); } +static inline vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm256_sub_pd(x, y); } +static inline vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm256_mul_pd(x, y); } +static inline vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm256_div_pd(x, y); } +static inline vdouble vrec_vd_vd(vdouble x) { return _mm256_div_pd(_mm256_set_pd(1, 1, 1, 1), x); } +static inline vdouble vsqrt_vd_vd(vdouble x) { return _mm256_sqrt_pd(x); } +static inline vdouble vabs_vd_vd(vdouble d) { return 
(__m256d)_mm256_andnot_pd(_mm256_set_pd(-0.0,-0.0,-0.0,-0.0), d); } +static inline vdouble vneg_vd_vd(vdouble d) { return (__m256d)_mm256_xor_pd(_mm256_set_pd(-0.0,-0.0,-0.0,-0.0), d); } +static inline vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmadd_pd(x, y, z); } +static inline vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmsub_pd(x, y, z); } +static inline vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fnmadd_pd(x, y, z); } + +static inline vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm256_max_pd(x, y); } +static inline vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm256_min_pd(x, y); } + +static inline vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmadd_pd(x, y, z); } +static inline vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmadd_pd(x, y, z); } +static inline vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmsub_pd(x, y, z); } +static inline vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fnmadd_pd(x, y, z); } +static inline vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fnmsub_pd(x, y, z); } + +// + +static inline vmask veq_vm_vi_vi(vint x, vint y) { + return _mm256_cvtepi32_epi64(_mm_cmpeq_epi32(x, y)); +} + +static inline vdouble vsel_vd_vm_vd_vd(vmask mask, vdouble x, vdouble y) { + return (__m256d)vor_vm_vm_vm(vand_vm_vm_vm(mask, (__m256i)x), vandnot_vm_vm_vm(mask, (__m256i)y)); +} + +static inline vfloat vsel_vf_vm_vf_vf(vmask mask, vfloat x, vfloat y) { + return (vfloat)vor_vm_vm_vm(vand_vm_vm_vm(mask, (vmask)x), vandnot_vm_vm_vm(mask, (vmask)y)); +} + +static inline vint vsel_vi_vd_vd_vi_vi(vdouble d0, vdouble d1, vint x, vint y) { + __m128i mask = _mm256_cvtpd_epi32(_mm256_and_pd(_mm256_cmp_pd(d0, d1, _CMP_LT_OQ), _mm256_set_pd(1.0, 1.0, 1.0, 1.0))); + mask = _mm_cmpeq_epi32(mask, _mm_set_epi32(1, 1, 1, 1)); + return vor_vi_vi_vi(vand_vi_vi_vi(mask, x), vandnot_vi_vi_vi(mask, y)); +} + +// + +static inline vint2 vcast_vi2_vm(vmask vm) { return vm; } +static inline vmask vcast_vm_vi2(vint2 vi) { return vi; } + +static inline vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm((vmask)_mm256_cvtps_epi32(vf)); } +static inline vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm((vmask)_mm256_cvttps_epi32(vf)); } +static inline vfloat vcast_vf_vi2(vint2 vi) { return _mm256_cvtepi32_ps((vmask)vcast_vm_vi2(vi)); } +static inline vint2 vcast_vi2_i(int i) { return _mm256_set1_epi32(i); } + +static inline vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_add_epi32(x, y); } +static inline vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_sub_epi32(x, y); } +static inline vint2 vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcast_vi2_i(0), e); } + +static inline vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_and_si256(x, y); } +static inline vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_andnot_si256(x, y); } +static inline vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_or_si256(x, y); } +static inline vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_xor_si256(x, y); } + +static inline vint2 vsll_vi2_vi2_i(vint2 x, int c) { return _mm256_slli_epi32(x, c); } +static inline vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return _mm256_srli_epi32(x, c); } +static inline vint2 vsra_vi2_vi2_i(vint2 x, int c) { return _mm256_srai_epi32(x, c); } + +static inline vmask veq_vm_vi2_vi2(vint2 x, 
vint2 y) { return _mm256_cmpeq_epi32(x, y); } +static inline vmask vgt_vm_vi2_vi2(vint2 x, vint2 y) { return _mm256_cmpgt_epi32(x, y); } +static inline vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_cmpgt_epi32(x, y); } +static inline vint2 vsel_vi2_vm_vi2_vi2(vmask m, vint2 x, vint2 y) { return vor_vm_vm_vm(vand_vm_vm_vm(m, x), vandnot_vm_vm_vm(m, y)); } + +// + +static inline double vcast_d_vd(vdouble v) { + double s[4]; + _mm256_storeu_pd(s, v); + return s[0]; +} + +static inline float vcast_f_vf(vfloat v) { + float s[8]; + _mm256_storeu_ps(s, v); + return s[0]; +} + +static inline vmask vsignbit_vm_vd(vdouble d) { + return (vmask)_mm256_and_pd(d, _mm256_set_pd(-0.0,-0.0,-0.0,-0.0)); +} + +static inline vdouble vsign_vd_vd(vdouble d) { + return _mm256_or_pd(_mm256_set_pd(1.0, 1.0, 1.0, 1.0), (vdouble)vsignbit_vm_vd(d)); +} + +static inline vdouble vmulsign_vd_vd_vd(vdouble x, vdouble y) { + return (__m256d)vxor_vm_vm_vm((__m256i)x, vsignbit_vm_vd(y)); +} + +static inline vmask visinf_vm_vd(vdouble d) { + return (vmask)_mm256_cmp_pd(vabs_vd_vd(d), _mm256_set_pd(INFINITY, INFINITY, INFINITY, INFINITY), _CMP_EQ_OQ); +} + +static inline vmask vispinf_vm_vd(vdouble d) { + return (vmask)_mm256_cmp_pd(d, _mm256_set_pd(INFINITY, INFINITY, INFINITY, INFINITY), _CMP_EQ_OQ); +} + +static inline vmask visminf_vm_vd(vdouble d) { + return (vmask)_mm256_cmp_pd(d, _mm256_set_pd(-INFINITY, -INFINITY, -INFINITY, -INFINITY), _CMP_EQ_OQ); +} + +static inline vmask visnan_vm_vd(vdouble d) { + return (vmask)_mm256_cmp_pd(d, d, _CMP_NEQ_UQ); +} + +static inline vdouble visinf(vdouble d) { + return _mm256_and_pd((vdouble)visinf_vm_vd(d), vsign_vd_vd(d)); +} + +static inline vdouble visinf2(vdouble d, vdouble m) { + return _mm256_and_pd((vdouble)visinf_vm_vd(d), _mm256_or_pd((vdouble)vsignbit_vm_vd(d), m)); +} + +static inline vdouble vpow2i_vd_vi(vint q) { + vint2 r = _mm256_slli_epi64(_mm256_cvtepi32_epi64(_mm_add_epi32(_mm_set_epi32(0x3ff, 0x3ff, 0x3ff, 0x3ff), q)), 52); + r = _mm256_and_si256(r, _mm256_set_epi32(0xfff00000, 0, 0xfff00000, 0, 0xfff00000, 0, 0xfff00000, 0)); + return (vdouble)r; +} + +static inline vdouble vldexp_vd_vd_vi(vdouble x, vint q) { + vint m = _mm_srai_epi32(q, 31); + m = _mm_slli_epi32(_mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(m, q), 9), m), 7); + q = _mm_sub_epi32(q, _mm_slli_epi32(m, 2)); + m = _mm_add_epi32(_mm_set_epi32(0x3ff, 0x3ff, 0x3ff, 0x3ff), m); + m = _mm_andnot_si128(_mm_cmplt_epi32(m, _mm_set_epi32(0, 0, 0, 0)), m); + vint n = _mm_cmpgt_epi32(m, _mm_set_epi32(0x7ff, 0x7ff, 0x7ff, 0x7ff)); + m = _mm_or_si128(_mm_andnot_si128(n, m), _mm_and_si128(n, _mm_set_epi32(0x7ff, 0x7ff, 0x7ff, 0x7ff))); + vint2 r = _mm256_slli_epi64(_mm256_cvtepi32_epi64(m), 52); + vdouble y = (vdouble)_mm256_and_si256(r, _mm256_set_epi32(0xfff00000, 0, 0xfff00000, 0, 0xfff00000, 0, 0xfff00000, 0)); + return vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(x, y), y), y), y), vpow2i_vd_vi(q)); +} + +static inline vint vilogbp1_vi_vd(vdouble d) { + vint q, r, c; + vmask m = vlt_vm_vd_vd(d, vcast_vd_d(4.9090934652977266E-91)); + d = vsel_vd_vm_vd_vd(m, vmul_vd_vd_vd(vcast_vd_d(2.037035976334486E90), d), d); + c = _mm256_cvtpd_epi32(vsel_vd_vm_vd_vd(m, vcast_vd_d(300+0x3fe), vcast_vd_d(0x3fe))); + q = (__m128i)_mm256_castpd256_pd128(d); + q = (__m128i)_mm_shuffle_ps((__m128)q, _mm_set_ps(0, 0, 0, 0), _MM_SHUFFLE(0,0,3,1)); + r = (__m128i)_mm256_extractf128_pd(d, 1); + r = (__m128i)_mm_shuffle_ps(_mm_set_ps(0, 0, 0, 0), (__m128)r, _MM_SHUFFLE(3,1,0,0)); + q = 
_mm_or_si128(q, r); + q = _mm_srli_epi32(q, 20); + q = _mm_sub_epi32(q, c); + return q; +} + +static inline vdouble vupper_vd_vd(vdouble d) { + return (__m256d)_mm256_and_pd(d, (vdouble)_mm256_set_epi32(0xffffffff, 0xf8000000, 0xffffffff, 0xf8000000, 0xffffffff, 0xf8000000, 0xffffffff, 0xf8000000)); +} + +static inline vfloat vupper_vf_vf(vfloat d) { + return (vfloat)vand_vm_vm_vm((vmask)d, _mm256_set_epi32(0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000,0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000)); +} Index: sleef/lib/dd.h =================================================================== --- /dev/null +++ sleef/lib/dd.h @@ -0,0 +1,365 @@ +/*===---------- dd.h - vdouble2 functions ----------------------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *===-----------------------------------------------------------------------=== + */ + +typedef struct { + vdouble x, y; +} vdouble2; + +static inline vdouble2 vcast_vd2_vd_vd(vdouble h, vdouble l) { + vdouble2 ret = {h, l}; + return ret; +} + +static inline vdouble2 vcast_vd2_d_d(double h, double l) { + vdouble2 ret = {vcast_vd_d(h), vcast_vd_d(l)}; + return ret; +} + +static inline vdouble2 vsel_vd2_vm_vd2_vd2(vmask m, vdouble2 x, vdouble2 y) { + vdouble2 r; + r.x = vsel_vd_vm_vd_vd(m, x.x, y.x); + r.y = vsel_vd_vm_vd_vd(m, x.y, y.y); + return r; +} + +static inline vdouble vadd_vd_3vd(vdouble v0, vdouble v1, vdouble v2) { + return vadd_vd_vd_vd(vadd_vd_vd_vd(v0, v1), v2); +} + +static inline vdouble vadd_vd_4vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3) { + return vadd_vd_3vd(vadd_vd_vd_vd(v0, v1), v2, v3); +} + +static inline vdouble vadd_vd_5vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) { + return vadd_vd_4vd(vadd_vd_vd_vd(v0, v1), v2, v3, v4); +} + +static inline vdouble vadd_vd_6vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5) { + return vadd_vd_5vd(vadd_vd_vd_vd(v0, v1), v2, v3, v4, v5); +} + +static inline vdouble vadd_vd_7vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5, vdouble v6) { + return vadd_vd_6vd(vadd_vd_vd_vd(v0, v1), v2, v3, v4, v5, v6); +} + +static inline vdouble vsub_vd_3vd(vdouble v0, vdouble v1, vdouble v2) { + return vsub_vd_vd_vd(vsub_vd_vd_vd(v0, v1), v2); +} + +static inline vdouble vsub_vd_4vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3) { + return vsub_vd_3vd(vsub_vd_vd_vd(v0, v1), v2, v3); +} + +static inline vdouble vsub_vd_5vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) { + return vsub_vd_4vd(vsub_vd_vd_vd(v0, v1), v2, v3, v4); +} + +static inline vdouble vsub_vd_6vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5) { + return vsub_vd_5vd(vsub_vd_vd_vd(v0, v1), v2, v3, v4, v5); +} + +// + +static inline vdouble2 ddneg_vd2_vd2(vdouble2 x) { + return vcast_vd2_vd_vd(vneg_vd_vd(x.x), vneg_vd_vd(x.y)); +} + +static inline vdouble2 ddnormalize_vd2_vd2(vdouble2 t) { + vdouble2 s; + + s.x = vadd_vd_vd_vd(t.x, t.y); + s.y = vadd_vd_vd_vd(vsub_vd_vd_vd(t.x, s.x), t.y); + + return s; +} + +static inline vdouble2 ddscale_vd2_vd2_vd(vdouble2 d, vdouble s) { + vdouble2 r = {vmul_vd_vd_vd(d.x, s), vmul_vd_vd_vd(d.y, s)}; + return r; +} + +static inline vdouble2 ddadd_vd2_vd_vd(vdouble x, vdouble y) { + vdouble2 r; + + r.x = vadd_vd_vd_vd(x, y); + r.y = vadd_vd_vd_vd(vsub_vd_vd_vd(x, r.x), y); + + return r; +} + +static inline vdouble2 ddadd2_vd2_vd_vd(vdouble x, vdouble y) { + vdouble2 r; + + r.x = vadd_vd_vd_vd(x, y); + vdouble v = vsub_vd_vd_vd(r.x, x); + r.y = vadd_vd_vd_vd(vsub_vd_vd_vd(x, vsub_vd_vd_vd(r.x, v)), vsub_vd_vd_vd(y, v)); + + return r; +} + +static inline vdouble2 ddadd_vd2_vd2_vd(vdouble2 x, vdouble y) { + vdouble2 r; + + r.x = vadd_vd_vd_vd(x.x, y); + r.y = vadd_vd_3vd(vsub_vd_vd_vd(x.x, r.x), y, x.y); + + return r; +} + +static inline vdouble2 ddadd2_vd2_vd2_vd(vdouble2 x, vdouble y) { + vdouble2 r; + + r.x = vadd_vd_vd_vd(x.x, y); + vdouble v = vsub_vd_vd_vd(r.x, x.x); + r.y = vadd_vd_vd_vd(vsub_vd_vd_vd(x.x, vsub_vd_vd_vd(r.x, v)), vsub_vd_vd_vd(y, v)); + r.y = vadd_vd_vd_vd(r.y, x.y); + + return r; +} + +static inline vdouble2 ddadd_vd2_vd_vd2(vdouble x, vdouble2 y) { + vdouble2 r; + + r.x = vadd_vd_vd_vd(x, y.x); + r.y = vadd_vd_3vd(vsub_vd_vd_vd(x, r.x), y.x, y.y); + + return r; +} + +static inline vdouble2 
ddadd_vd2_vd2_vd2(vdouble2 x, vdouble2 y) { + // |x| >= |y| + + vdouble2 r; + + r.x = vadd_vd_vd_vd(x.x, y.x); + r.y = vadd_vd_4vd(vsub_vd_vd_vd(x.x, r.x), y.x, x.y, y.y); + + return r; +} + +static inline vdouble2 ddadd2_vd2_vd2_vd2(vdouble2 x, vdouble2 y) { + vdouble2 r; + + r.x = vadd_vd_vd_vd(x.x, y.x); + vdouble v = vsub_vd_vd_vd(r.x, x.x); + r.y = vadd_vd_vd_vd(vsub_vd_vd_vd(x.x, vsub_vd_vd_vd(r.x, v)), vsub_vd_vd_vd(y.x, v)); + r.y = vadd_vd_vd_vd(r.y, vadd_vd_vd_vd(x.y, y.y)); + + return r; +} + +static inline vdouble2 ddsub_vd2_vd_vd(vdouble x, vdouble y) { + // |x| >= |y| + + vdouble2 r; + + r.x = vsub_vd_vd_vd(x, y); + r.y = vsub_vd_vd_vd(vsub_vd_vd_vd(x, r.x), y); + + return r; +} + +static inline vdouble2 ddsub_vd2_vd2_vd2(vdouble2 x, vdouble2 y) { + // |x| >= |y| + + vdouble2 r; + + r.x = vsub_vd_vd_vd(x.x, y.x); + r.y = vsub_vd_vd_vd(x.x, r.x); + r.y = vsub_vd_vd_vd(r.y, y.x); + r.y = vadd_vd_vd_vd(r.y, x.y); + r.y = vsub_vd_vd_vd(r.y, y.y); + + return r; +} + +#if 0 +static inline vdouble2 ddsub_vd2_vd2_vd2(vdouble2 x, vdouble2 y) { + // |x| >= |y| + + vdouble2 r; + + r.x = vsub_vd_vd_vd(x.x, y.x); + r.y = vsub_vd_vd_vd(vadd_vd_vd_vd(vsub_vd_vd_vd(vsub_vd_vd_vd(x.x, r.x), y.x), x.y), y.y); + + return r; +} +#endif + +#ifdef ENABLE_FMA_DP +static inline vdouble2 dddiv_vd2_vd2_vd2(vdouble2 n, vdouble2 d) { + vdouble2 q; + vdouble t = vrec_vd_vd(d.x), u; + + q.x = vmul_vd_vd_vd(n.x, t); + u = vfmapn_vd_vd_vd_vd(t, n.x, q.x); + q.y = vfmanp_vd_vd_vd_vd(d.y, t, vfmanp_vd_vd_vd_vd(d.x, t, vcast_vd_d(1))); + q.y = vfma_vd_vd_vd_vd(q.x, q.y, vfma_vd_vd_vd_vd(n.y, t, u)); + + return q; +} + +static inline vdouble2 ddmul_vd2_vd_vd(vdouble x, vdouble y) { + vdouble2 r; + + r.x = vmul_vd_vd_vd(x, y); + r.y = vfmapn_vd_vd_vd_vd(x, y, r.x); + + return r; +} + +static inline vdouble2 ddsqu_vd2_vd2(vdouble2 x) { + vdouble2 r; + + r.x = vmul_vd_vd_vd(x.x, x.x); + r.y = vfma_vd_vd_vd_vd(vadd_vd_vd_vd(x.x, x.x), x.y, vfmapn_vd_vd_vd_vd(x.x, x.x, r.x)); + + return r; +} + +static inline vdouble2 ddmul_vd2_vd2_vd2(vdouble2 x, vdouble2 y) { + vdouble2 r; + + r.x = vmul_vd_vd_vd(x.x, y.x); + r.y = vfma_vd_vd_vd_vd(x.x, y.y, vfma_vd_vd_vd_vd(x.y, y.x, vfmapn_vd_vd_vd_vd(x.x, y.x, r.x))); + + return r; +} + +static inline vdouble2 ddmul_vd2_vd2_vd(vdouble2 x, vdouble y) { + vdouble2 r; + + r.x = vmul_vd_vd_vd(x.x, y); + r.y = vfma_vd_vd_vd_vd(x.y, y, vfmapn_vd_vd_vd_vd(x.x, y, r.x)); + + return r; +} + +static inline vdouble2 ddrec_vd2_vd(vdouble d) { + vdouble2 q; + + q.x = vrec_vd_vd(d); + q.y = vmul_vd_vd_vd(q.x, vfmanp_vd_vd_vd_vd(d, q.x, vcast_vd_d(1))); + + return q; +} + +static inline vdouble2 ddrec_vd2_vd2(vdouble2 d) { + vdouble2 q; + + q.x = vrec_vd_vd(d.x); + q.y = vmul_vd_vd_vd(q.x, vfmanp_vd_vd_vd_vd(d.y, q.x, vfmanp_vd_vd_vd_vd(d.x, q.x, vcast_vd_d(1)))); + + return q; +} +#else +static inline vdouble2 dddiv_vd2_vd2_vd2(vdouble2 n, vdouble2 d) { + vdouble t = vrec_vd_vd(d.x); + vdouble dh = vupper_vd_vd(d.x), dl = vsub_vd_vd_vd(d.x, dh); + vdouble th = vupper_vd_vd(t ), tl = vsub_vd_vd_vd(t , th); + vdouble nhh = vupper_vd_vd(n.x), nhl = vsub_vd_vd_vd(n.x, nhh); + + vdouble2 q; + + q.x = vmul_vd_vd_vd(n.x, t); + + vdouble u = vadd_vd_5vd(vsub_vd_vd_vd(vmul_vd_vd_vd(nhh, th), q.x), vmul_vd_vd_vd(nhh, tl), vmul_vd_vd_vd(nhl, th), vmul_vd_vd_vd(nhl, tl), + vmul_vd_vd_vd(q.x, vsub_vd_5vd(vcast_vd_d(1), vmul_vd_vd_vd(dh, th), vmul_vd_vd_vd(dh, tl), vmul_vd_vd_vd(dl, th), vmul_vd_vd_vd(dl, tl)))); + + q.y = vmla_vd_vd_vd_vd(t, vsub_vd_vd_vd(n.y, vmul_vd_vd_vd(q.x, d.y)), u); + + return q; 
+} + +static inline vdouble2 ddmul_vd2_vd_vd(vdouble x, vdouble y) { + vdouble xh = vupper_vd_vd(x), xl = vsub_vd_vd_vd(x, xh); + vdouble yh = vupper_vd_vd(y), yl = vsub_vd_vd_vd(y, yh); + vdouble2 r; + + r.x = vmul_vd_vd_vd(x, y); + r.y = vadd_vd_5vd(vmul_vd_vd_vd(xh, yh), vneg_vd_vd(r.x), vmul_vd_vd_vd(xl, yh), vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yl)); + + return r; +} + +static inline vdouble2 ddmul_vd2_vd2_vd(vdouble2 x, vdouble y) { + vdouble xh = vupper_vd_vd(x.x), xl = vsub_vd_vd_vd(x.x, xh); + vdouble yh = vupper_vd_vd(y ), yl = vsub_vd_vd_vd(y , yh); + vdouble2 r; + + r.x = vmul_vd_vd_vd(x.x, y); + r.y = vadd_vd_6vd(vmul_vd_vd_vd(xh, yh), vneg_vd_vd(r.x), vmul_vd_vd_vd(xl, yh), vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yl), vmul_vd_vd_vd(x.y, y)); + + return r; +} + +static inline vdouble2 ddmul_vd2_vd2_vd2(vdouble2 x, vdouble2 y) { + vdouble xh = vupper_vd_vd(x.x), xl = vsub_vd_vd_vd(x.x, xh); + vdouble yh = vupper_vd_vd(y.x), yl = vsub_vd_vd_vd(y.x, yh); + vdouble2 r; + + r.x = vmul_vd_vd_vd(x.x, y.x); + r.y = vadd_vd_7vd(vmul_vd_vd_vd(xh, yh), vneg_vd_vd(r.x), vmul_vd_vd_vd(xl, yh), vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yl), vmul_vd_vd_vd(x.x, y.y), vmul_vd_vd_vd(x.y, y.x)); + + return r; +} + +static inline vdouble2 ddsqu_vd2_vd2(vdouble2 x) { + vdouble xh = vupper_vd_vd(x.x), xl = vsub_vd_vd_vd(x.x, xh); + vdouble2 r; + + r.x = vmul_vd_vd_vd(x.x, x.x); + r.y = vadd_vd_5vd(vmul_vd_vd_vd(xh, xh), vneg_vd_vd(r.x), vmul_vd_vd_vd(vadd_vd_vd_vd(xh, xh), xl), vmul_vd_vd_vd(xl, xl), vmul_vd_vd_vd(x.x, vadd_vd_vd_vd(x.y, x.y))); + + return r; +} + +static inline vdouble2 ddrec_vd2_vd(vdouble d) { + vdouble t = vrec_vd_vd(d); + vdouble dh = vupper_vd_vd(d), dl = vsub_vd_vd_vd(d, dh); + vdouble th = vupper_vd_vd(t), tl = vsub_vd_vd_vd(t, th); + vdouble2 q; + + q.x = t; + q.y = vmul_vd_vd_vd(t, vsub_vd_5vd(vcast_vd_d(1), vmul_vd_vd_vd(dh, th), vmul_vd_vd_vd(dh, tl), vmul_vd_vd_vd(dl, th), vmul_vd_vd_vd(dl, tl))); + + return q; +} + +static inline vdouble2 ddrec_vd2_vd2(vdouble2 d) { + vdouble t = vrec_vd_vd(d.x); + vdouble dh = vupper_vd_vd(d.x), dl = vsub_vd_vd_vd(d.x, dh); + vdouble th = vupper_vd_vd(t ), tl = vsub_vd_vd_vd(t , th); + vdouble2 q; + + q.x = t; + q.y = vmul_vd_vd_vd(t, vsub_vd_6vd(vcast_vd_d(1), vmul_vd_vd_vd(dh, th), vmul_vd_vd_vd(dh, tl), vmul_vd_vd_vd(dl, th), vmul_vd_vd_vd(dl, tl), vmul_vd_vd_vd(d.y, t))); + + return q; +} +#endif + +static inline vdouble2 ddsqrt_vd2_vd2(vdouble2 d) { + vdouble t = vsqrt_vd_vd(vadd_vd_vd_vd(d.x, d.y)); + return ddscale_vd2_vd2_vd(ddmul_vd2_vd2_vd2(ddadd2_vd2_vd2_vd2(d, ddmul_vd2_vd_vd(t, t)), ddrec_vd2_vd(t)), vcast_vd_d(0.5)); +} Index: sleef/lib/df.h =================================================================== --- /dev/null +++ sleef/lib/df.h @@ -0,0 +1,419 @@ +/*===---------- df.h - vfloat2 functions ----------------------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ + +typedef struct { + vfloat x, y; +} vfloat2; + +static inline vfloat2 vcast_vf2_vf_vf(vfloat h, vfloat l) { + vfloat2 ret = {h, l}; + return ret; +} + +static inline vfloat2 vcast_vf2_f_f(float h, float l) { + vfloat2 ret = {vcast_vf_f(h), vcast_vf_f(l)}; + return ret; +} + +static inline vfloat2 vsel_vf2_vm_vf2_vf2(vmask m, vfloat2 x, vfloat2 y) { + vfloat2 r; + r.x = vsel_vf_vm_vf_vf(m, x.x, y.x); + r.y = vsel_vf_vm_vf_vf(m, x.y, y.y); + return r; +} + +static inline vfloat2 vabs_vf2_vf2(vfloat2 x) { + return vcast_vf2_vf_vf((vfloat)vxor_vm_vm_vm(vand_vm_vm_vm((vmask)vcast_vf_f(-0.0), (vmask)x.x), (vmask)x.x), + (vfloat)vxor_vm_vm_vm(vand_vm_vm_vm((vmask)vcast_vf_f(-0.0), (vmask)x.x), (vmask)x.y)); +} + +static inline vfloat vadd_vf_3vf(vfloat v0, vfloat v1, vfloat v2) { + return vadd_vf_vf_vf(vadd_vf_vf_vf(v0, v1), v2); +} + +static inline vfloat vadd_vf_4vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3) { + return vadd_vf_3vf(vadd_vf_vf_vf(v0, v1), v2, v3); +} + +static inline vfloat vadd_vf_5vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4) { + return vadd_vf_4vf(vadd_vf_vf_vf(v0, v1), v2, v3, v4); +} + +static inline vfloat vadd_vf_6vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4, vfloat v5) { + return vadd_vf_5vf(vadd_vf_vf_vf(v0, v1), v2, v3, v4, v5); +} + +static inline vfloat vadd_vf_7vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4, vfloat v5, vfloat v6) { + return vadd_vf_6vf(vadd_vf_vf_vf(v0, v1), v2, v3, v4, v5, v6); +} + +static inline vfloat vsub_vf_3vf(vfloat v0, vfloat v1, vfloat v2) { + return vsub_vf_vf_vf(vsub_vf_vf_vf(v0, v1), v2); +} + +static inline vfloat vsub_vf_4vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3) { + return vsub_vf_3vf(vsub_vf_vf_vf(v0, v1), v2, v3); +} + +static inline vfloat vsub_vf_5vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4) { + return vsub_vf_4vf(vsub_vf_vf_vf(v0, v1), v2, v3, v4); +} + +// + +static inline vfloat2 dfneg_vf2_vf2(vfloat2 x) { + return vcast_vf2_vf_vf(vneg_vf_vf(x.x), vneg_vf_vf(x.y)); +} + +static inline vfloat2 dfnormalize_vf2_vf2(vfloat2 t) { + vfloat2 s; + + s.x = vadd_vf_vf_vf(t.x, t.y); + s.y = vadd_vf_vf_vf(vsub_vf_vf_vf(t.x, s.x), t.y); + + return s; +} + +static inline vfloat2 dfscale_vf2_vf2_vf(vfloat2 d, vfloat s) { + vfloat2 r = {vmul_vf_vf_vf(d.x, s), vmul_vf_vf_vf(d.y, s)}; + return r; +} + +static inline vfloat2 dfadd_vf2_vf_vf(vfloat x, vfloat y) { + vfloat2 r; + + r.x = vadd_vf_vf_vf(x, y); + r.y = vadd_vf_vf_vf(vsub_vf_vf_vf(x, r.x), y); + + return r; +} + +static inline vfloat2 dfadd2_vf2_vf_vf(vfloat x, vfloat y) { + vfloat2 r; + + r.x = vadd_vf_vf_vf(x, y); + vfloat v = vsub_vf_vf_vf(r.x, x); + r.y = vadd_vf_vf_vf(vsub_vf_vf_vf(x, vsub_vf_vf_vf(r.x, v)), vsub_vf_vf_vf(y, v)); + + return r; +} + +static inline vfloat2 dfadd_vf2_vf2_vf(vfloat2 x, vfloat y) { + vfloat2 r; + + r.x = vadd_vf_vf_vf(x.x, y); + r.y = vadd_vf_3vf(vsub_vf_vf_vf(x.x, r.x), y, x.y); + + return r; +} + +static inline vfloat2 
dfadd2_vf2_vf2_vf(vfloat2 x, vfloat y) { + vfloat2 r; + + r.x = vadd_vf_vf_vf(x.x, y); + vfloat v = vsub_vf_vf_vf(r.x, x.x); + r.y = vadd_vf_vf_vf(vsub_vf_vf_vf(x.x, vsub_vf_vf_vf(r.x, v)), vsub_vf_vf_vf(y, v)); + r.y = vadd_vf_vf_vf(r.y, x.y); + + return r; +} + +static inline vfloat2 dfadd_vf2_vf_vf2(vfloat x, vfloat2 y) { + vfloat2 r; + + r.x = vadd_vf_vf_vf(x, y.x); + r.y = vadd_vf_3vf(vsub_vf_vf_vf(x, r.x), y.x, y.y); + + return r; +} + +static inline vfloat2 dfadd_vf2_vf2_vf2(vfloat2 x, vfloat2 y) { + // |x| >= |y| + + vfloat2 r; + + r.x = vadd_vf_vf_vf(x.x, y.x); + r.y = vadd_vf_4vf(vsub_vf_vf_vf(x.x, r.x), y.x, x.y, y.y); + + return r; +} + +static inline vfloat2 dfadd2_vf2_vf2_vf2(vfloat2 x, vfloat2 y) { + vfloat2 r; + + r.x = vadd_vf_vf_vf(x.x, y.x); + vfloat v = vsub_vf_vf_vf(r.x, x.x); + r.y = vadd_vf_vf_vf(vsub_vf_vf_vf(x.x, vsub_vf_vf_vf(r.x, v)), vsub_vf_vf_vf(y.x, v)); + r.y = vadd_vf_vf_vf(r.y, vadd_vf_vf_vf(x.y, y.y)); + + return r; +} + +static inline vfloat2 dfsub_vf2_vf_vf(vfloat x, vfloat y) { + // |x| >= |y| + + vfloat2 r; + + r.x = vsub_vf_vf_vf(x, y); + r.y = vsub_vf_vf_vf(vsub_vf_vf_vf(x, r.x), y); + + return r; +} + +static inline vfloat2 dfsub_vf2_vf2_vf2(vfloat2 x, vfloat2 y) { + // |x| >= |y| + + vfloat2 r; + + r.x = vsub_vf_vf_vf(x.x, y.x); + r.y = vsub_vf_vf_vf(x.x, r.x); + r.y = vsub_vf_vf_vf(r.y, y.x); + r.y = vadd_vf_vf_vf(r.y, x.y); + r.y = vsub_vf_vf_vf(r.y, y.y); + + return r; +} + +#ifdef ENABLE_FMA_SP +static inline vfloat2 dfdiv_vf2_vf2_vf2(vfloat2 n, vfloat2 d) { + vfloat2 q; + vfloat t = vrec_vf_vf(d.x), u; + + q.x = vmul_vf_vf_vf(n.x, t); + u = vfmapn_vf_vf_vf_vf(t, n.x, q.x); + q.y = vfmanp_vf_vf_vf_vf(d.y, t, vfmanp_vf_vf_vf_vf(d.x, t, vcast_vf_f(1))); + q.y = vfma_vf_vf_vf_vf(q.x, q.y, vfma_vf_vf_vf_vf(n.y, t, u)); + + return q; +} + +static inline vfloat2 dfmul_vf2_vf_vf(vfloat x, vfloat y) { + vfloat2 r; + + r.x = vmul_vf_vf_vf(x, y); + r.y = vfmapn_vf_vf_vf_vf(x, y, r.x); + + return r; +} + +static inline vfloat2 dfsqu_vf2_vf2(vfloat2 x) { + vfloat2 r; + + r.x = vmul_vf_vf_vf(x.x, x.x); + r.y = vfma_vf_vf_vf_vf(vadd_vf_vf_vf(x.x, x.x), x.y, vfmapn_vf_vf_vf_vf(x.x, x.x, r.x)); + + return r; +} + +static inline vfloat2 dfmul_vf2_vf2_vf2(vfloat2 x, vfloat2 y) { + vfloat2 r; + + r.x = vmul_vf_vf_vf(x.x, y.x); + r.y = vfma_vf_vf_vf_vf(x.x, y.y, vfma_vf_vf_vf_vf(x.y, y.x, vfmapn_vf_vf_vf_vf(x.x, y.x, r.x))); + + return r; +} + +static inline vfloat2 dfmul_vf2_vf2_vf(vfloat2 x, vfloat y) { + vfloat2 r; + + r.x = vmul_vf_vf_vf(x.x, y); + r.y = vfma_vf_vf_vf_vf(x.y, y, vfmapn_vf_vf_vf_vf(x.x, y, r.x)); + + return r; +} + +static inline vfloat2 dfrec_vf2_vf(vfloat d) { + vfloat2 q; + + q.x = vrec_vf_vf(d); + q.y = vmul_vf_vf_vf(q.x, vfmanp_vf_vf_vf_vf(d, q.x, vcast_vf_f(1))); + + return q; +} + +static inline vfloat2 dfrec_vf2_vf2(vfloat2 d) { + vfloat2 q; + + q.x = vrec_vf_vf(d.x); + q.y = vmul_vf_vf_vf(q.x, vfmanp_vf_vf_vf_vf(d.y, q.x, vfmanp_vf_vf_vf_vf(d.x, q.x, vcast_vf_f(1)))); + + return q; +} +#else +static inline vfloat2 dfdiv_vf2_vf2_vf2(vfloat2 n, vfloat2 d) { + vfloat t = vrec_vf_vf(d.x); + vfloat dh = vupper_vf_vf(d.x), dl = vsub_vf_vf_vf(d.x, dh); + vfloat th = vupper_vf_vf(t ), tl = vsub_vf_vf_vf(t , th); + vfloat nhh = vupper_vf_vf(n.x), nhl = vsub_vf_vf_vf(n.x, nhh); + + vfloat2 q; + + q.x = vmul_vf_vf_vf(n.x, t); + + //vfloat u = vadd_vf_5vf(vsub_vf_vf_vf(vmul_vf_vf_vf(nhh, th), q.x), vmul_vf_vf_vf(nhh, tl), vmul_vf_vf_vf(nhl, th), vmul_vf_vf_vf(nhl, tl), + //vmul_vf_vf_vf(q.x, vsub_vf_5vf(vcast_vf_f(1), vmul_vf_vf_vf(dh, th), 
vmul_vf_vf_vf(dh, tl), vmul_vf_vf_vf(dl, th), vmul_vf_vf_vf(dl, tl)))); + + vfloat u, w; + w = vcast_vf_f(-1); + w = vmla_vf_vf_vf_vf(dh, th, w); + w = vmla_vf_vf_vf_vf(dh, tl, w); + w = vmla_vf_vf_vf_vf(dl, th, w); + w = vmla_vf_vf_vf_vf(dl, tl, w); + w = vneg_vf_vf(w); + + u = vmla_vf_vf_vf_vf(nhh, th, vneg_vf_vf(q.x)); + u = vmla_vf_vf_vf_vf(nhh, tl, u); + u = vmla_vf_vf_vf_vf(nhl, th, u); + u = vmla_vf_vf_vf_vf(nhl, tl, u); + u = vmla_vf_vf_vf_vf(q.x, w , u); + + q.y = vmla_vf_vf_vf_vf(t, vsub_vf_vf_vf(n.y, vmul_vf_vf_vf(q.x, d.y)), u); + + return q; +} + +static inline vfloat2 dfmul_vf2_vf_vf(vfloat x, vfloat y) { + vfloat xh = vupper_vf_vf(x), xl = vsub_vf_vf_vf(x, xh); + vfloat yh = vupper_vf_vf(y), yl = vsub_vf_vf_vf(y, yh); + vfloat2 r; + + r.x = vmul_vf_vf_vf(x, y); + //r.y = vadd_vf_5vf(vmul_vf_vf_vf(xh, yh), vneg_vf_vf(r.x), vmul_vf_vf_vf(xl, yh), vmul_vf_vf_vf(xh, yl), vmul_vf_vf_vf(xl, yl)); + + vfloat t; + t = vmla_vf_vf_vf_vf(xh, yh, vneg_vf_vf(r.x)); + t = vmla_vf_vf_vf_vf(xl, yh, t); + t = vmla_vf_vf_vf_vf(xh, yl, t); + t = vmla_vf_vf_vf_vf(xl, yl, t); + r.y = t; + + return r; +} + +static inline vfloat2 dfmul_vf2_vf2_vf(vfloat2 x, vfloat y) { + vfloat xh = vupper_vf_vf(x.x), xl = vsub_vf_vf_vf(x.x, xh); + vfloat yh = vupper_vf_vf(y ), yl = vsub_vf_vf_vf(y , yh); + vfloat2 r; + + r.x = vmul_vf_vf_vf(x.x, y); + //r.y = vadd_vf_6vf(vmul_vf_vf_vf(xh, yh), vneg_vf_vf(r.x), vmul_vf_vf_vf(xl, yh), vmul_vf_vf_vf(xh, yl), vmul_vf_vf_vf(xl, yl), vmul_vf_vf_vf(x.y, y)); + + vfloat t; + t = vmla_vf_vf_vf_vf(xh, yh, vneg_vf_vf(r.x)); + t = vmla_vf_vf_vf_vf(xl, yh, t); + t = vmla_vf_vf_vf_vf(xh, yl, t); + t = vmla_vf_vf_vf_vf(xl, yl, t); + t = vmla_vf_vf_vf_vf(x.y, y, t); + r.y = t; + + return r; +} + +static inline vfloat2 dfmul_vf2_vf2_vf2(vfloat2 x, vfloat2 y) { + vfloat xh = vupper_vf_vf(x.x), xl = vsub_vf_vf_vf(x.x, xh); + vfloat yh = vupper_vf_vf(y.x), yl = vsub_vf_vf_vf(y.x, yh); + vfloat2 r; + + r.x = vmul_vf_vf_vf(x.x, y.x); + //r.y = vadd_vf_7vf(vmul_vf_vf_vf(xh, yh), vneg_vf_vf(r.x), vmul_vf_vf_vf(xl, yh), vmul_vf_vf_vf(xh, yl), vmul_vf_vf_vf(xl, yl), vmul_vf_vf_vf(x.x, y.y), vmul_vf_vf_vf(x.y, y.x)); + + vfloat t; + t = vmla_vf_vf_vf_vf(xh, yh, vneg_vf_vf(r.x)); + t = vmla_vf_vf_vf_vf(xl, yh, t); + t = vmla_vf_vf_vf_vf(xh, yl, t); + t = vmla_vf_vf_vf_vf(xl, yl, t); + t = vmla_vf_vf_vf_vf(x.x, y.y, t); + t = vmla_vf_vf_vf_vf(x.y, y.x, t); + r.y = t; + + return r; +} + +static inline vfloat2 dfsqu_vf2_vf2(vfloat2 x) { + vfloat xh = vupper_vf_vf(x.x), xl = vsub_vf_vf_vf(x.x, xh); + vfloat2 r; + + r.x = vmul_vf_vf_vf(x.x, x.x); + //r.y = vadd_vf_5vf(vmul_vf_vf_vf(xh, xh), vneg_vf_vf(r.x), vmul_vf_vf_vf(vadd_vf_vf_vf(xh, xh), xl), vmul_vf_vf_vf(xl, xl), vmul_vf_vf_vf(x.x, vadd_vf_vf_vf(x.y, x.y))); + + vfloat t; + t = vmla_vf_vf_vf_vf(xh, xh, vneg_vf_vf(r.x)); + t = vmla_vf_vf_vf_vf(vadd_vf_vf_vf(xh, xh), xl, t); + t = vmla_vf_vf_vf_vf(xl, xl, t); + t = vmla_vf_vf_vf_vf(x.x, vadd_vf_vf_vf(x.y, x.y), t); + r.y = t; + + return r; +} + +static inline vfloat2 dfrec_vf2_vf(vfloat d) { + vfloat t = vrec_vf_vf(d); + vfloat dh = vupper_vf_vf(d), dl = vsub_vf_vf_vf(d, dh); + vfloat th = vupper_vf_vf(t), tl = vsub_vf_vf_vf(t, th); + vfloat2 q; + + q.x = t; + //q.y = vmul_vf_vf_vf(t, vsub_vf_5vf(vcast_vf_f(1), vmul_vf_vf_vf(dh, th), vmul_vf_vf_vf(dh, tl), vmul_vf_vf_vf(dl, th), vmul_vf_vf_vf(dl, tl))); + + vfloat u = vcast_vf_f(-1); + u = vmla_vf_vf_vf_vf(dh, th, u); + u = vmla_vf_vf_vf_vf(dh, tl, u); + u = vmla_vf_vf_vf_vf(dl, th, u); + u = vmla_vf_vf_vf_vf(dl, tl, u); + q.y = 
vmul_vf_vf_vf(vneg_vf_vf(t), u); + + return q; +} + +static inline vfloat2 dfrec_vf2_vf2(vfloat2 d) { + vfloat t = vrec_vf_vf(d.x); + vfloat dh = vupper_vf_vf(d.x), dl = vsub_vf_vf_vf(d.x, dh); + vfloat th = vupper_vf_vf(t ), tl = vsub_vf_vf_vf(t , th); + vfloat2 q; + + q.x = t; + //q.y = t * (1 - dh * th - dh * tl - dl * th - dl * tl - d.y * t); + + vfloat u = vcast_vf_f(-1); + u = vmla_vf_vf_vf_vf(dh, th, u); + u = vmla_vf_vf_vf_vf(dh, tl, u); + u = vmla_vf_vf_vf_vf(dl, th, u); + u = vmla_vf_vf_vf_vf(dl, tl, u); + u = vmla_vf_vf_vf_vf(d.y, t, u); + q.y = vmul_vf_vf_vf(vneg_vf_vf(t), u); + + return q; +} +#endif + +static inline vfloat2 dfsqrt_vf2_vf2(vfloat2 d) { +#ifdef ENABLE_RECSQRT_SP + vfloat x = vrecsqrt_vf_vf(vadd_vf_vf_vf(d.x, d.y)); + vfloat2 r = dfmul_vf2_vf2_vf(d, x); + return dfscale_vf2_vf2_vf(dfmul_vf2_vf2_vf2(r, dfadd2_vf2_vf2_vf(dfmul_vf2_vf2_vf(r, x), vcast_vf_f(-3.0))), vcast_vf_f(-0.5)); +#else + vfloat t = vsqrt_vf_vf(vadd_vf_vf_vf(d.x, d.y)); + return dfscale_vf2_vf2_vf(dfmul_vf2_vf2_vf2(dfadd2_vf2_vf2_vf2(d, dfmul_vf2_vf_vf(t, t)), dfrec_vf2_vf(t)), vcast_vf_f(0.5)); +#endif +} Index: sleef/lib/dp-avx.cpp =================================================================== --- /dev/null +++ sleef/lib/dp-avx.cpp @@ -0,0 +1,3 @@ +#define BUILD_AVX +#include "dp.cpp" + Index: sleef/lib/dp-avx2.cpp =================================================================== --- /dev/null +++ sleef/lib/dp-avx2.cpp @@ -0,0 +1,3 @@ +#define BUILD_AVX2 +#include "dp.cpp" + Index: sleef/lib/dp-scalar.cpp =================================================================== --- /dev/null +++ sleef/lib/dp-scalar.cpp @@ -0,0 +1,1265 @@ +#include +#include +#include + +#include "nonnumber.h" +#define _SLEEF_N(f) __ ## f + +#define PI4_A 0.78539816290140151978 +#define PI4_B 4.9604678871439933374e-10 +#define PI4_C 1.1258708853173288931e-18 +#define PI4_D 1.7607799325916000908e-27 + +#define M_4_PI 1.273239544735162542821171882678754627704620361328125 + +#define L2U .69314718055966295651160180568695068359375 +#define L2L .28235290563031577122588448175013436025525412068e-12 +#define R_LN2 1.442695040888963407359924681001892137426645954152985934135449406931 + +static inline int64_t doubleToRawLongBits(double d) { + union { + double f; + int64_t i; + } tmp; + tmp.f = d; + return tmp.i; +} + +static inline double longBitsToDouble(int64_t i) { + union { + double f; + int64_t i; + } tmp; + tmp.i = i; + return tmp.f; +} + +static inline double xfabs(double x) { + return longBitsToDouble(0x7fffffffffffffffLL & doubleToRawLongBits(x)); +} + +static inline double mulsign(double x, double y) { + return longBitsToDouble(doubleToRawLongBits(x) ^ (doubleToRawLongBits(y) & (1LL << 63))); +} + +static inline double sign(double d) { return mulsign(1, d); } +static inline double mla(double x, double y, double z) { return x * y + z; } +static inline double xrint(double x) { return x < 0 ? (int)(x - 0.5) : (int)(x + 0.5); } + +static inline int xisnan(double x) { return x != x; } +static inline int xisinf(double x) { return x == INFINITY || x == -INFINITY; } +static inline int xisminf(double x) { return x == -INFINITY; } +static inline int xispinf(double x) { return x == INFINITY; } + +static inline double pow2i(int q) { + return longBitsToDouble(((int64_t)(q + 0x3ff)) << 52); +} + +static inline double ldexpk(double x, int q) { + double u; + int m; + m = q >> 31; + m = (((m + q) >> 9) - m) << 7; + q = q - (m << 2); + m += 0x3ff; + m = m < 0 ? 0 : m; + m = m > 0x7ff ? 
0x7ff : m; + u = longBitsToDouble(((int64_t)m) << 52); + x = x * u * u * u * u; + u = longBitsToDouble(((int64_t)(q + 0x3ff)) << 52); + return x * u; +} + +static inline int ilogbp1(double d) { + int m = d < 4.9090934652977266E-91; + d = m ? 2.037035976334486E90 * d : d; + int q = (doubleToRawLongBits(d) >> 52) & 0x7ff; + q = m ? q - (300 + 0x03fe) : q - 0x03fe; + return q; +} + +// + +typedef struct { + double x, y; +} double2; + +extern "C" { +double _SLEEF_N(xldexp)(double x, int q); +int _SLEEF_N(xilogb)(double d); +double _SLEEF_N(xatan2)(double y, double x); +double _SLEEF_N(xasin)(double d); +double _SLEEF_N(xacos)(double d); +double _SLEEF_N(xatan)(double s); +double _SLEEF_N(xatan2_u1)(double y, double x); +double _SLEEF_N(xasin_u1)(double d); +double _SLEEF_N(xacos_u1)(double d); +double _SLEEF_N(xatan_u1)(double d); +double _SLEEF_N(xsin)(double d); +double _SLEEF_N(xsin_u1)(double d); +double _SLEEF_N(xcos)(double d); +double _SLEEF_N(xcos_u1)(double d); +double2 _SLEEF_N(xsincos)(double d); +double2 _SLEEF_N(xsincos_u1)(double d); +double _SLEEF_N(xtan)(double d); +double _SLEEF_N(xtan_u1)(double d); +double _SLEEF_N(xlog)(double d); +double _SLEEF_N(xexp)(double d); +double _SLEEF_N(xlog_u1)(double d); +double _SLEEF_N(xpow)(double x, double y); +double _SLEEF_N(xsinh)(double x); +double _SLEEF_N(xcosh)(double x); +double _SLEEF_N(xtanh)(double x); +double _SLEEF_N(xasinh)(double x); +double _SLEEF_N(xacosh)(double x); +double _SLEEF_N(xatanh)(double x); +double _SLEEF_N(xsqrt)(double d); +double _SLEEF_N(xcbrt)(double d); +double _SLEEF_N(xcbrt_u1)(double d); +double _SLEEF_N(xexp2)(double a); +double _SLEEF_N(xexp10)(double a); +double _SLEEF_N(xexpm1)(double a); +double _SLEEF_N(xlog10)(double a); +double _SLEEF_N(xlog1p)(double a); +} // extern "C" + +double _SLEEF_N(xldexp)(double x, int q) { return ldexpk(x, q); } + +int _SLEEF_N(xilogb)(double d) { + int e = ilogbp1(xfabs(d)) - 1; + e = d == 0 ? -2147483648 : e; + e = d == INFINITY || d == -INFINITY ? 
2147483647 : e; + return e; +} + +#ifndef NDEBUG +static int checkfp(double x) { + if (xisinf(x) || xisnan(x)) return 1; + return 0; +} +#endif + +static inline double upper(double d) { + return longBitsToDouble(doubleToRawLongBits(d) & 0xfffffffff8000000LL); +} + +static inline double2 dd(double h, double l) { + double2 ret; + ret.x = h; ret.y = l; + return ret; +} + +static inline double2 ddnormalize_d2_d2(double2 t) { + double2 s; + + s.x = t.x + t.y; + s.y = t.x - s.x + t.y; + + return s; +} + +static inline double2 ddscale_d2_d2_d(double2 d, double s) { + double2 r; + + r.x = d.x * s; + r.y = d.y * s; + + return r; +} + +static inline double2 ddneg_d2_d2(double2 d) { + double2 r; + + r.x = -d.x; + r.y = -d.y; + + return r; +} + +static inline double2 ddadd_d2_d_d(double x, double y) { + // |x| >= |y| + + double2 r; + + r.x = x + y; + r.y = x - r.x + y; + + return r; +} + +static inline double2 ddadd2_d2_d_d(double x, double y) { + double2 r; + + r.x = x + y; + double v = r.x - x; + r.y = (x - (r.x - v)) + (y - v); + + return r; +} + +static inline double2 ddadd_d2_d2_d(double2 x, double y) { + // |x| >= |y| + + double2 r; + + r.x = x.x + y; + r.y = x.x - r.x + y + x.y; + + return r; +} + +static inline double2 ddadd2_d2_d2_d(double2 x, double y) { + // |x| >= |y| + + double2 r; + + r.x = x.x + y; + double v = r.x - x.x; + r.y = (x.x - (r.x - v)) + (y - v); + r.y += x.y; + + return r; +} + +static inline double2 ddadd_d2_d_d2(double x, double2 y) { + // |x| >= |y| + + double2 r; + + r.x = x + y.x; + r.y = x - r.x + y.x + y.y; + + return r; +} + +static inline double2 ddadd2_d2_d_d2(double x, double2 y) { + double2 r; + + r.x = x + y.x; + double v = r.x - x; + r.y = (x - (r.x - v)) + (y.x - v) + y.y; + + return r; +} + +static inline double2 ddadd_d2_d2_d2(double2 x, double2 y) { + // |x| >= |y| + + double2 r; + + r.x = x.x + y.x; + r.y = x.x - r.x + y.x + x.y + y.y; + + return r; +} + +static inline double2 ddadd2_d2_d2_d2(double2 x, double2 y) { + double2 r; + + r.x = x.x + y.x; + double v = r.x - x.x; + r.y = (x.x - (r.x - v)) + (y.x - v); + r.y += x.y + y.y; + + return r; +} + +static inline double2 ddsub_d2_d2_d2(double2 x, double2 y) { + // |x| >= |y| + + double2 r; + + r.x = x.x - y.x; + r.y = x.x - r.x - y.x + x.y - y.y; + + return r; +} + +static inline double2 dddiv_d2_d2_d2(double2 n, double2 d) { + double t = 1.0 / d.x; + double dh = upper(d.x), dl = d.x - dh; + double th = upper(t ), tl = t - th; + double nhh = upper(n.x), nhl = n.x - nhh; + + double2 q; + + q.x = n.x * t; + + double u = -q.x + nhh * th + nhh * tl + nhl * th + nhl * tl + + q.x * (1 - dh * th - dh * tl - dl * th - dl * tl); + + q.y = t * (n.y - q.x * d.y) + u; + + return q; +} + +static inline double2 ddmul_d2_d_d(double x, double y) { + double xh = upper(x), xl = x - xh; + double yh = upper(y), yl = y - yh; + double2 r; + + r.x = x * y; + r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl; + + return r; +} + +static inline double2 ddmul_d2_d2_d(double2 x, double y) { + double xh = upper(x.x), xl = x.x - xh; + double yh = upper(y ), yl = y - yh; + double2 r; + + r.x = x.x * y; + r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl + x.y * y; + + return r; +} + +static inline double2 ddmul_d2_d2_d2(double2 x, double2 y) { + double xh = upper(x.x), xl = x.x - xh; + double yh = upper(y.x), yl = y.x - yh; + double2 r; + + r.x = x.x * y.x; + r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl + x.x * y.y + x.y * y.x; + + return r; +} + +static inline double2 ddsqu_d2_d2(double2 x) { + double xh = upper(x.x), xl = x.x - 
xh; + double2 r; + + r.x = x.x * x.x; + r.y = xh * xh - r.x + (xh + xh) * xl + xl * xl + x.x * (x.y + x.y); + + return r; +} + +static inline double2 ddrec_d2_d(double d) { + double t = 1.0 / d; + double dh = upper(d), dl = d - dh; + double th = upper(t), tl = t - th; + double2 q; + + q.x = t; + q.y = t * (1 - dh * th - dh * tl - dl * th - dl * tl); + + return q; +} + +static inline double2 ddrec_d2_d2(double2 d) { + double t = 1.0 / d.x; + double dh = upper(d.x), dl = d.x - dh; + double th = upper(t ), tl = t - th; + double2 q; + + q.x = t; + q.y = t * (1 - dh * th - dh * tl - dl * th - dl * tl - d.y * t); + + return q; +} + +static inline double2 ddsqrt_d2_d2(double2 d) { + double t = sqrt(d.x + d.y); + return ddscale_d2_d2_d(ddmul_d2_d2_d2(ddadd2_d2_d2_d2(d, ddmul_d2_d_d(t, t)), ddrec_d2_d(t)), 0.5); +} + +// + +static inline double atan2k(double y, double x) { + double s, t, u; + int q = 0; + + if (x < 0) { x = -x; q = -2; } + if (y > x) { t = x; x = y; y = -t; q += 1; } + + s = y / x; + t = s * s; + + u = -1.88796008463073496563746e-05; + u = u * t + (0.000209850076645816976906797); + u = u * t + (-0.00110611831486672482563471); + u = u * t + (0.00370026744188713119232403); + u = u * t + (-0.00889896195887655491740809); + u = u * t + (0.016599329773529201970117); + u = u * t + (-0.0254517624932312641616861); + u = u * t + (0.0337852580001353069993897); + u = u * t + (-0.0407629191276836500001934); + u = u * t + (0.0466667150077840625632675); + u = u * t + (-0.0523674852303482457616113); + u = u * t + (0.0587666392926673580854313); + u = u * t + (-0.0666573579361080525984562); + u = u * t + (0.0769219538311769618355029); + u = u * t + (-0.090908995008245008229153); + u = u * t + (0.111111105648261418443745); + u = u * t + (-0.14285714266771329383765); + u = u * t + (0.199999999996591265594148); + u = u * t + (-0.333333333333311110369124); + + t = u * t * s + s; + t = q * (M_PI/2) + t; + + return t; +} + +double _SLEEF_N(xatan2)(double y, double x) { + double r = atan2k(xfabs(y), x); + + r = mulsign(r, x); + if (xisinf(x) || x == 0) r = M_PI/2 - (xisinf(x) ? (sign(x) * (M_PI /2)) : 0); + if (xisinf(y) ) r = M_PI/2 - (xisinf(x) ? (sign(x) * (M_PI*1/4)) : 0); + if ( y == 0) r = (sign(x) == -1 ? M_PI : 0); + + return xisnan(x) || xisnan(y) ? NAN : mulsign(r, y); +} + +double _SLEEF_N(xasin)(double d) { + return mulsign(atan2k(xfabs(d), sqrt((1+d)*(1-d))), d); +} + +double _SLEEF_N(xacos)(double d) { + return mulsign(atan2k(sqrt((1+d)*(1-d)), xfabs(d)), d) + (d < 0 ? 
M_PI : 0); +} + +double _SLEEF_N(xatan)(double s) { + double t, u; + int q = 0; + + if (s < 0) { s = -s; q = 2; } + if (s > 1) { s = 1.0 / s; q |= 1; } + + t = s * s; + + u = -1.88796008463073496563746e-05; + u = u * t + (0.000209850076645816976906797); + u = u * t + (-0.00110611831486672482563471); + u = u * t + (0.00370026744188713119232403); + u = u * t + (-0.00889896195887655491740809); + u = u * t + (0.016599329773529201970117); + u = u * t + (-0.0254517624932312641616861); + u = u * t + (0.0337852580001353069993897); + u = u * t + (-0.0407629191276836500001934); + u = u * t + (0.0466667150077840625632675); + u = u * t + (-0.0523674852303482457616113); + u = u * t + (0.0587666392926673580854313); + u = u * t + (-0.0666573579361080525984562); + u = u * t + (0.0769219538311769618355029); + u = u * t + (-0.090908995008245008229153); + u = u * t + (0.111111105648261418443745); + u = u * t + (-0.14285714266771329383765); + u = u * t + (0.199999999996591265594148); + u = u * t + (-0.333333333333311110369124); + + t = s + s * (t * u); + + if ((q & 1) != 0) t = 1.570796326794896557998982 - t; + if ((q & 2) != 0) t = -t; + + return t; +} + +static double2 atan2k_u1(double2 y, double2 x) { + double u; + double2 s, t; + int q = 0; + + if (x.x < 0) { x.x = -x.x; x.y = -x.y; q = -2; } + if (y.x > x.x) { t = x; x = y; y.x = -t.x; y.y = -t.y; q += 1; } + + s = dddiv_d2_d2_d2(y, x); + t = ddsqu_d2_d2(s); + t = ddnormalize_d2_d2(t); + + u = 1.06298484191448746607415e-05; + u = mla(u, t.x, -0.000125620649967286867384336); + u = mla(u, t.x, 0.00070557664296393412389774); + u = mla(u, t.x, -0.00251865614498713360352999); + u = mla(u, t.x, 0.00646262899036991172313504); + u = mla(u, t.x, -0.0128281333663399031014274); + u = mla(u, t.x, 0.0208024799924145797902497); + u = mla(u, t.x, -0.0289002344784740315686289); + u = mla(u, t.x, 0.0359785005035104590853656); + u = mla(u, t.x, -0.041848579703592507506027); + u = mla(u, t.x, 0.0470843011653283988193763); + u = mla(u, t.x, -0.0524914210588448421068719); + u = mla(u, t.x, 0.0587946590969581003860434); + u = mla(u, t.x, -0.0666620884778795497194182); + u = mla(u, t.x, 0.0769225330296203768654095); + u = mla(u, t.x, -0.0909090442773387574781907); + u = mla(u, t.x, 0.111111108376896236538123); + u = mla(u, t.x, -0.142857142756268568062339); + u = mla(u, t.x, 0.199999999997977351284817); + u = mla(u, t.x, -0.333333333333317605173818); + + t = ddmul_d2_d2_d(t, u); + t = ddmul_d2_d2_d2(s, ddadd_d2_d_d2(1, t)); + t = ddadd2_d2_d2_d2(ddmul_d2_d2_d(dd(1.570796326794896557998982, 6.12323399573676603586882e-17), q), t); + + return t; +} + +double _SLEEF_N(xatan2_u1)(double y, double x) { + double2 d = atan2k_u1(dd(xfabs(y), 0), dd(x, 0)); + double r = d.x + d.y; + + r = mulsign(r, x); + if (xisinf(x) || x == 0) r = M_PI/2 - (xisinf(x) ? (sign(x) * (M_PI /2)) : 0); + if (xisinf(y) ) r = M_PI/2 - (xisinf(x) ? (sign(x) * (M_PI*1/4)) : 0); + if ( y == 0) r = (sign(x) == -1 ? M_PI : 0); + + return xisnan(x) || xisnan(y) ? 
NAN : mulsign(r, y); +} + +double _SLEEF_N(xasin_u1)(double d) { + double2 d2 = atan2k_u1(dd(xfabs(d), 0), ddsqrt_d2_d2(ddmul_d2_d2_d2(ddadd_d2_d_d(1, d), ddadd_d2_d_d(1,-d)))); + double r = d2.x + d2.y; + if (xfabs(d) == 1) r = 1.570796326794896557998982; + return mulsign(r, d); +} + +double _SLEEF_N(xacos_u1)(double d) { + double2 d2 = atan2k_u1(ddsqrt_d2_d2(ddmul_d2_d2_d2(ddadd_d2_d_d(1, d), ddadd_d2_d_d(1,-d))), dd(xfabs(d), 0)); + d2 = ddscale_d2_d2_d(d2, mulsign(1, d)); + if (xfabs(d) == 1) d2 = dd(0, 0); + if (d < 0) d2 = ddadd_d2_d2_d2(dd(3.141592653589793116, 1.2246467991473532072e-16), d2); + return d2.x + d2.y; +} + +double _SLEEF_N(xatan_u1)(double d) { + double2 d2 = atan2k_u1(dd(xfabs(d), 0), dd(1, 0)); + double r = d2.x + d2.y; + if (xisinf(d)) r = 1.570796326794896557998982; + return mulsign(r, d); +} + +double _SLEEF_N(xsin)(double d) { + int q; + double u, s; + + q = (int)xrint(d * M_1_PI); + + d = mla(q, -PI4_A*4, d); + d = mla(q, -PI4_B*4, d); + d = mla(q, -PI4_C*4, d); + d = mla(q, -PI4_D*4, d); + + s = d * d; + + if ((q & 1) != 0) d = -d; + + u = -7.97255955009037868891952e-18; + u = mla(u, s, 2.81009972710863200091251e-15); + u = mla(u, s, -7.64712219118158833288484e-13); + u = mla(u, s, 1.60590430605664501629054e-10); + u = mla(u, s, -2.50521083763502045810755e-08); + u = mla(u, s, 2.75573192239198747630416e-06); + u = mla(u, s, -0.000198412698412696162806809); + u = mla(u, s, 0.00833333333333332974823815); + u = mla(u, s, -0.166666666666666657414808); + + u = mla(s, u * d, d); + + return u; +} + +double _SLEEF_N(xsin_u1)(double d) { + int q; + double u; + double2 s, t, x; + + q = (int)xrint(d * M_1_PI); + + s = ddadd2_d2_d_d(d, q * (-PI4_A*4)); + s = ddadd2_d2_d2_d(s, q * (-PI4_B*4)); + s = ddadd2_d2_d2_d(s, q * (-PI4_C*4)); + s = ddadd2_d2_d2_d(s, q * (-PI4_D*4)); + + t = s; + s = ddsqu_d2_d2(s); + + u = 2.72052416138529567917983e-15; + u = mla(u, s.x, -7.6429259411395447190023e-13); + u = mla(u, s.x, 1.60589370117277896211623e-10); + u = mla(u, s.x, -2.5052106814843123359368e-08); + u = mla(u, s.x, 2.75573192104428224777379e-06); + u = mla(u, s.x, -0.000198412698412046454654947); + u = mla(u, s.x, 0.00833333333333318056201922); + + x = ddadd_d2_d_d2(1, ddmul_d2_d2_d2(ddadd_d2_d_d(-0.166666666666666657414808, u * s.x), s)); + + x = ddmul_d2_d2_d2(t, x); + u = x.x + x.y; + + if ((q & 1) != 0) u = -u; + + return u; +} + +double _SLEEF_N(xcos)(double d) { + int q; + double u, s; + + q = 1 + 2*(int)xrint(d * M_1_PI - 0.5); + + d = mla(q, -PI4_A*2, d); + d = mla(q, -PI4_B*2, d); + d = mla(q, -PI4_C*2, d); + d = mla(q, -PI4_D*2, d); + + s = d * d; + + if ((q & 2) == 0) d = -d; + + u = -7.97255955009037868891952e-18; + u = mla(u, s, 2.81009972710863200091251e-15); + u = mla(u, s, -7.64712219118158833288484e-13); + u = mla(u, s, 1.60590430605664501629054e-10); + u = mla(u, s, -2.50521083763502045810755e-08); + u = mla(u, s, 2.75573192239198747630416e-06); + u = mla(u, s, -0.000198412698412696162806809); + u = mla(u, s, 0.00833333333333332974823815); + u = mla(u, s, -0.166666666666666657414808); + + u = mla(s, u * d, d); + + return u; +} + +double _SLEEF_N(xcos_u1)(double d) { + double u, q; + double2 s, t, x; + + d = fabs(d); + + q = mla(2, xrint(d * M_1_PI - 0.5), 1); + + s = ddadd2_d2_d_d(d, q * (-PI4_A*2)); + s = ddadd2_d2_d2_d(s, q * (-PI4_B*2)); + s = ddadd2_d2_d2_d(s, q * (-PI4_C*2)); + s = ddadd2_d2_d2_d(s, q * (-PI4_D*2)); + + t = s; + s = ddsqu_d2_d2(s); + + u = 2.72052416138529567917983e-15; + u = mla(u, s.x, -7.6429259411395447190023e-13); + u = mla(u, s.x, 
1.60589370117277896211623e-10); + u = mla(u, s.x, -2.5052106814843123359368e-08); + u = mla(u, s.x, 2.75573192104428224777379e-06); + u = mla(u, s.x, -0.000198412698412046454654947); + u = mla(u, s.x, 0.00833333333333318056201922); + + x = ddadd_d2_d_d2(1, ddmul_d2_d2_d2(ddadd_d2_d_d(-0.166666666666666657414808, u * s.x), s)); + + x = ddmul_d2_d2_d2(t, x); + + u = x.x + x.y; + + if ((((int)q) & 2) == 0) u = -u; + + return u; +} + +double2 _SLEEF_N(xsincos)(double d) { + int q; + double u, s, t; + double2 r; + + q = (int)xrint(d * (2 * M_1_PI)); + + s = d; + + s = mla(-q, PI4_A*2, s); + s = mla(-q, PI4_B*2, s); + s = mla(-q, PI4_C*2, s); + s = mla(-q, PI4_D*2, s); + + t = s; + + s = s * s; + + u = 1.58938307283228937328511e-10; + u = mla(u, s, -2.50506943502539773349318e-08); + u = mla(u, s, 2.75573131776846360512547e-06); + u = mla(u, s, -0.000198412698278911770864914); + u = mla(u, s, 0.0083333333333191845961746); + u = mla(u, s, -0.166666666666666130709393); + u = u * s * t; + + r.x = t + u; + + u = -1.13615350239097429531523e-11; + u = mla(u, s, 2.08757471207040055479366e-09); + u = mla(u, s, -2.75573144028847567498567e-07); + u = mla(u, s, 2.48015872890001867311915e-05); + u = mla(u, s, -0.00138888888888714019282329); + u = mla(u, s, 0.0416666666666665519592062); + u = mla(u, s, -0.5); + + r.y = u * s + 1; + + if ((q & 1) != 0) { s = r.y; r.y = r.x; r.x = s; } + if ((q & 2) != 0) { r.x = -r.x; } + if (((q+1) & 2) != 0) { r.y = -r.y; } + + if (xisinf(d)) { r.x = r.y = NAN; } + + return r; +} + +double2 _SLEEF_N(xsincos_u1)(double d) { + int q; + double u; + double2 r, s, t, x; + + q = (int)xrint(d * (2 * M_1_PI)); + + s = ddadd2_d2_d_d(d, q * (-PI4_A*2)); + s = ddadd2_d2_d2_d(s, q * (-PI4_B*2)); + s = ddadd2_d2_d2_d(s, q * (-PI4_C*2)); + s = ddadd2_d2_d2_d(s, q * (-PI4_D*2)); + + t = s; + s = ddsqu_d2_d2(s); + s.x = s.x + s.y; + + u = 1.58938307283228937328511e-10; + u = mla(u, s.x, -2.50506943502539773349318e-08); + u = mla(u, s.x, 2.75573131776846360512547e-06); + u = mla(u, s.x, -0.000198412698278911770864914); + u = mla(u, s.x, 0.0083333333333191845961746); + u = mla(u, s.x, -0.166666666666666130709393); + + u *= s.x * t.x; + + x = ddadd_d2_d2_d(t, u); + r.x = x.x + x.y; + + u = -1.13615350239097429531523e-11; + u = mla(u, s.x, 2.08757471207040055479366e-09); + u = mla(u, s.x, -2.75573144028847567498567e-07); + u = mla(u, s.x, 2.48015872890001867311915e-05); + u = mla(u, s.x, -0.00138888888888714019282329); + u = mla(u, s.x, 0.0416666666666665519592062); + u = mla(u, s.x, -0.5); + + x = ddadd_d2_d_d2(1, ddmul_d2_d_d(s.x, u)); + r.y = x.x + x.y; + + if ((q & 1) != 0) { u = r.y; r.y = r.x; r.x = u; } + if ((q & 2) != 0) { r.x = -r.x; } + if (((q+1) & 2) != 0) { r.y = -r.y; } + + if (xisinf(d)) { r.x = r.y = NAN; } + + return r; +} + +double _SLEEF_N(xtan)(double d) { + int q; + double u, s, x; + + q = (int)xrint(d * (2 * M_1_PI)); + + x = mla(q, -PI4_A*2, d); + x = mla(q, -PI4_B*2, x); + x = mla(q, -PI4_C*2, x); + x = mla(q, -PI4_D*2, x); + + s = x * x; + + if ((q & 1) != 0) x = -x; + + u = 1.01419718511083373224408e-05; + u = mla(u, s, -2.59519791585924697698614e-05); + u = mla(u, s, 5.23388081915899855325186e-05); + u = mla(u, s, -3.05033014433946488225616e-05); + u = mla(u, s, 7.14707504084242744267497e-05); + u = mla(u, s, 8.09674518280159187045078e-05); + u = mla(u, s, 0.000244884931879331847054404); + u = mla(u, s, 0.000588505168743587154904506); + u = mla(u, s, 0.00145612788922812427978848); + u = mla(u, s, 0.00359208743836906619142924); + u = mla(u, s, 
0.00886323944362401618113356); + u = mla(u, s, 0.0218694882853846389592078); + u = mla(u, s, 0.0539682539781298417636002); + u = mla(u, s, 0.133333333333125941821962); + u = mla(u, s, 0.333333333333334980164153); + + u = mla(s, u * x, x); + + if ((q & 1) != 0) u = 1.0 / u; + + if (xisinf(d)) u = NAN; + + return u; +} + +double _SLEEF_N(xtan_u1)(double d) { + int q; + double u; + double2 s, t, x; + + q = (int)xrint(d * M_2_PI); + + s = ddadd2_d2_d_d(d, q * (-PI4_A*2)); + s = ddadd2_d2_d2_d(s, q * (-PI4_B*2)); + s = ddadd2_d2_d2_d(s, q * (-PI4_C*2)); + s = ddadd2_d2_d2_d(s, q * (-PI4_D*2)); + + if ((q & 1) != 0) s = ddneg_d2_d2(s); + + t = s; + s = ddsqu_d2_d2(s); + + u = 1.01419718511083373224408e-05; + u = mla(u, s.x, -2.59519791585924697698614e-05); + u = mla(u, s.x, 5.23388081915899855325186e-05); + u = mla(u, s.x, -3.05033014433946488225616e-05); + u = mla(u, s.x, 7.14707504084242744267497e-05); + u = mla(u, s.x, 8.09674518280159187045078e-05); + u = mla(u, s.x, 0.000244884931879331847054404); + u = mla(u, s.x, 0.000588505168743587154904506); + u = mla(u, s.x, 0.00145612788922812427978848); + u = mla(u, s.x, 0.00359208743836906619142924); + u = mla(u, s.x, 0.00886323944362401618113356); + u = mla(u, s.x, 0.0218694882853846389592078); + u = mla(u, s.x, 0.0539682539781298417636002); + u = mla(u, s.x, 0.133333333333125941821962); + + x = ddadd_d2_d_d2(1, ddmul_d2_d2_d2(ddadd_d2_d_d(0.333333333333334980164153, u * s.x), s)); + x = ddmul_d2_d2_d2(t, x); + + if ((q & 1) != 0) x = ddrec_d2_d2(x); + + u = x.x + x.y; + + return u; +} + +double _SLEEF_N(xlog)(double d) { + double x, x2, t, m; + int e; + + e = ilogbp1(d * 0.7071); + m = ldexpk(d, -e); + + x = (m-1) / (m+1); + x2 = x * x; + + t = 0.148197055177935105296783; + t = mla(t, x2, 0.153108178020442575739679); + t = mla(t, x2, 0.181837339521549679055568); + t = mla(t, x2, 0.22222194152736701733275); + t = mla(t, x2, 0.285714288030134544449368); + t = mla(t, x2, 0.399999999989941956712869); + t = mla(t, x2, 0.666666666666685503450651); + t = mla(t, x2, 2); + + x = x * t + 0.693147180559945286226764 * e; + + if (xisinf(d)) x = INFINITY; + if (d < 0) x = NAN; + if (d == 0) x = -INFINITY; + + return x; +} + +double _SLEEF_N(xexp)(double d) { + int q = (int)xrint(d * R_LN2); + double s, u; + + s = mla(q, -L2U, d); + s = mla(q, -L2L, s); + + u = 2.08860621107283687536341e-09; + u = mla(u, s, 2.51112930892876518610661e-08); + u = mla(u, s, 2.75573911234900471893338e-07); + u = mla(u, s, 2.75572362911928827629423e-06); + u = mla(u, s, 2.4801587159235472998791e-05); + u = mla(u, s, 0.000198412698960509205564975); + u = mla(u, s, 0.00138888888889774492207962); + u = mla(u, s, 0.00833333333331652721664984); + u = mla(u, s, 0.0416666666666665047591422); + u = mla(u, s, 0.166666666666666851703837); + u = mla(u, s, 0.5); + + u = s * s * u + s + 1; + u = ldexpk(u, q); + + if (xisminf(d)) u = 0; + + return u; +} + +static inline double2 logk(double d) { + double2 x, x2; + double m, t; + int e; + + e = ilogbp1(d * 0.7071); + m = ldexpk(d, -e); + + x = dddiv_d2_d2_d2(ddadd2_d2_d_d(-1, m), ddadd2_d2_d_d(1, m)); + x2 = ddsqu_d2_d2(x); + + t = 0.134601987501262130076155; + t = mla(t, x2.x, 0.132248509032032670243288); + t = mla(t, x2.x, 0.153883458318096079652524); + t = mla(t, x2.x, 0.181817427573705403298686); + t = mla(t, x2.x, 0.222222231326187414840781); + t = mla(t, x2.x, 0.285714285651261412873718); + t = mla(t, x2.x, 0.400000000000222439910458); + t = mla(t, x2.x, 0.666666666666666371239645); + + return 
ddadd2_d2_d2_d2(ddmul_d2_d2_d(dd(0.693147180559945286226764, 2.319046813846299558417771e-17), e), + ddadd2_d2_d2_d2(ddscale_d2_d2_d(x, 2), ddmul_d2_d2_d(ddmul_d2_d2_d2(x2, x), t))); +} + +double _SLEEF_N(xlog_u1)(double d) { + double2 s = logk(d); + double x = s.x + s.y; + + if (xisinf(d)) x = INFINITY; + if (d < 0) x = NAN; + if (d == 0) x = -INFINITY; + + return x; +} + +static inline double expk(double2 d) { + int q = (int)xrint((d.x + d.y) * R_LN2); + double2 s, t; + double u; + + s = ddadd2_d2_d2_d(d, q * -L2U); + s = ddadd2_d2_d2_d(s, q * -L2L); + + s = ddnormalize_d2_d2(s); + + u = 2.51069683420950419527139e-08; + u = mla(u, s.x, 2.76286166770270649116855e-07); + u = mla(u, s.x, 2.75572496725023574143864e-06); + u = mla(u, s.x, 2.48014973989819794114153e-05); + u = mla(u, s.x, 0.000198412698809069797676111); + u = mla(u, s.x, 0.0013888888939977128960529); + u = mla(u, s.x, 0.00833333333332371417601081); + u = mla(u, s.x, 0.0416666666665409524128449); + u = mla(u, s.x, 0.166666666666666740681535); + u = mla(u, s.x, 0.500000000000000999200722); + + t = ddadd_d2_d2_d2(s, ddmul_d2_d2_d(ddsqu_d2_d2(s), u)); + + t = ddadd_d2_d_d2(1, t); + return ldexpk(t.x + t.y, q); +} + +double _SLEEF_N(xpow)(double x, double y) { + int yisint = (int)y == y; + int yisodd = (1 & (int)y) != 0 && yisint; + + double result = expk(ddmul_d2_d2_d(logk(xfabs(x)), y)); + + result = xisnan(result) ? INFINITY : result; + result *= (x >= 0 ? 1 : (!yisint ? NAN : (yisodd ? -1 : 1))); + + double efx = mulsign(xfabs(x) - 1, y); + if (xisinf(y)) result = efx < 0 ? 0.0 : (efx == 0 ? 1.0 : INFINITY); + if (xisinf(x) || x == 0) result = (yisodd ? sign(x) : 1) * ((x == 0 ? -y : y) < 0 ? 0 : INFINITY); + if (xisnan(x) || xisnan(y)) result = NAN; + if (y == 0 || x == 1) result = 1; + + return result; +} + +static inline double2 expk2(double2 d) { + int q = (int)xrint((d.x + d.y) * R_LN2); + double2 s, t; + double u; + + s = ddadd2_d2_d2_d(d, q * -L2U); + s = ddadd2_d2_d2_d(s, q * -L2L); + + u = 2.51069683420950419527139e-08; + u = mla(u, s.x, 2.76286166770270649116855e-07); + u = mla(u, s.x, 2.75572496725023574143864e-06); + u = mla(u, s.x, 2.48014973989819794114153e-05); + u = mla(u, s.x, 0.000198412698809069797676111); + u = mla(u, s.x, 0.0013888888939977128960529); + u = mla(u, s.x, 0.00833333333332371417601081); + u = mla(u, s.x, 0.0416666666665409524128449); + u = mla(u, s.x, 0.166666666666666740681535); + u = mla(u, s.x, 0.500000000000000999200722); + + t = ddadd_d2_d2_d2(s, ddmul_d2_d2_d(ddsqu_d2_d2(s), u)); + + t = ddadd_d2_d_d2(1, t); + return ddscale_d2_d2_d(t, pow2i(q)); +} + +double _SLEEF_N(xsinh)(double x) { + double y = xfabs(x); + double2 d = expk2(dd(y, 0)); + d = ddsub_d2_d2_d2(d, ddrec_d2_d2(d)); + y = (d.x + d.y) * 0.5; + + y = xfabs(x) > 710 ? INFINITY : y; + y = xisnan(y) ? INFINITY : y; + y = mulsign(y, x); + y = xisnan(x) ? NAN : y; + + return y; +} + +double _SLEEF_N(xcosh)(double x) { + double y = xfabs(x); + double2 d = expk2(dd(y, 0)); + d = ddadd_d2_d2_d2(d, ddrec_d2_d2(d)); + y = (d.x + d.y) * 0.5; + + y = xfabs(x) > 710 ? INFINITY : y; + y = xisnan(y) ? INFINITY : y; + y = xisnan(x) ? NAN : y; + + return y; +} + +double _SLEEF_N(xtanh)(double x) { + double y = xfabs(x); + double2 d = expk2(dd(y, 0)); + double2 e = ddrec_d2_d2(d); + d = dddiv_d2_d2_d2(ddsub_d2_d2_d2(d, e), ddadd_d2_d2_d2(d, e)); + y = d.x + d.y; + + y = xfabs(x) > 18.714973875 ? 1.0 : y; + y = xisnan(y) ? 1.0 : y; + y = mulsign(y, x); + y = xisnan(x) ? 
NAN : y; + + return y; +} + +static inline double2 logk2(double2 d) { + double2 x, x2, m; + double t; + int e; + + e = ilogbp1(d.x * 0.7071); + m = ddscale_d2_d2_d(d, pow2i(-e)); + + x = dddiv_d2_d2_d2(ddadd2_d2_d2_d(m, -1), ddadd2_d2_d2_d(m, 1)); + x2 = ddsqu_d2_d2(x); + + t = 0.134601987501262130076155; + t = mla(t, x2.x, 0.132248509032032670243288); + t = mla(t, x2.x, 0.153883458318096079652524); + t = mla(t, x2.x, 0.181817427573705403298686); + t = mla(t, x2.x, 0.222222231326187414840781); + t = mla(t, x2.x, 0.285714285651261412873718); + t = mla(t, x2.x, 0.400000000000222439910458); + t = mla(t, x2.x, 0.666666666666666371239645); + + return ddadd2_d2_d2_d2(ddmul_d2_d2_d(dd(0.693147180559945286226764, 2.319046813846299558417771e-17), e), + ddadd2_d2_d2_d2(ddscale_d2_d2_d(x, 2), ddmul_d2_d2_d(ddmul_d2_d2_d2(x2, x), t))); +} + +double _SLEEF_N(xasinh)(double x) { + double y = xfabs(x); + double2 d = logk2(ddadd_d2_d2_d(ddsqrt_d2_d2(ddadd2_d2_d2_d(ddmul_d2_d_d(y, y), 1)), y)); + y = d.x + d.y; + + y = xisinf(x) || xisnan(y) ? INFINITY : y; + y = mulsign(y, x); + y = xisnan(x) ? NAN : y; + + return y; +} + +double _SLEEF_N(xacosh)(double x) { + double2 d = logk2(ddadd2_d2_d2_d(ddsqrt_d2_d2(ddadd2_d2_d2_d(ddmul_d2_d_d(x, x), -1)), x)); + double y = d.x + d.y; + + y = xisinf(x) || xisnan(y) ? INFINITY : y; + y = x == 1.0 ? 0.0 : y; + y = x < 1.0 ? NAN : y; + y = xisnan(x) ? NAN : y; + + return y; +} + +double _SLEEF_N(xatanh)(double x) { + double y = xfabs(x); + double2 d = logk2(dddiv_d2_d2_d2(ddadd2_d2_d_d(1, y), ddadd2_d2_d_d(1, -y))); + y = y > 1.0 ? NAN : (y == 1.0 ? INFINITY : (d.x + d.y) * 0.5); + + y = xisinf(x) || xisnan(y) ? NAN : y; + y = mulsign(y, x); + y = xisnan(x) ? NAN : y; + + return y; +} + +// + +static double xfma(double x, double y, double z) { + union { + double f; + long long int i; + } tmp; + + tmp.f = x; + tmp.i = (tmp.i + 0x4000000) & 0xfffffffff8000000LL; + double xh = tmp.f, xl = x - xh; + + tmp.f = y; + tmp.i = (tmp.i + 0x4000000) & 0xfffffffff8000000LL; + double yh = tmp.f, yl = y - yh; + + double h = x * y; + double l = xh * yh - h + xl * yh + xh * yl + xl * yl; + + double h2, l2, v; + + h2 = h + z; + v = h2 - h; + l2 = (h - (h2 - v)) + (z - v) + l; + + return h2 + l2; +} + +double _SLEEF_N(xsqrt)(double d) { // max error : 0.5 ulp + double q = 1; + + if (d < 8.636168555094445E-78) { + d *= 1.157920892373162E77; + q = 2.9387358770557188E-39; + } + + // http://en.wikipedia.org/wiki/Fast_inverse_square_root + double x = longBitsToDouble(0x5fe6ec85e7de30da - (doubleToRawLongBits(d + 1e-320) >> 1)); + + x = x * (1.5 - 0.5 * d * x * x); + x = x * (1.5 - 0.5 * d * x * x); + x = x * (1.5 - 0.5 * d * x * x); + + // You can change xfma to fma if fma is correctly implemented + x = xfma(d * x, d * x, -d) * (x * -0.5) + d * x; + + return d == INFINITY ? INFINITY : x * q; +} + +double _SLEEF_N(xcbrt)(double d) { // max error : 2 ulps + double x, y, q = 1.0; + int e, r; + + e = ilogbp1(d); + d = ldexpk(d, -e); + r = (e + 6144) % 3; + q = (r == 1) ? 1.2599210498948731647672106 : q; + q = (r == 2) ? 
1.5874010519681994747517056 : q; + q = ldexpk(q, (e + 6144) / 3 - 2048); + + q = mulsign(q, d); + d = xfabs(d); + + x = -0.640245898480692909870982; + x = x * d + 2.96155103020039511818595; + x = x * d + -5.73353060922947843636166; + x = x * d + 6.03990368989458747961407; + x = x * d + -3.85841935510444988821632; + x = x * d + 2.2307275302496609725722; + + y = x * x; y = y * y; x -= (d * y - x) * (1.0 / 3.0); + y = d * x * x; + y = (y - (2.0 / 3.0) * y * (y * x - 1)) * q; + + return y; +} + +double _SLEEF_N(xcbrt_u1)(double d) { + double x, y, z; + double2 q2 = dd(1, 0), u, v; + int e, r; + + e = ilogbp1(d); + d = ldexpk(d, -e); + r = (e + 6144) % 3; + q2 = (r == 1) ? dd(1.2599210498948731907, -2.5899333753005069177e-17) : q2; + q2 = (r == 2) ? dd(1.5874010519681995834, -1.0869008194197822986e-16) : q2; + + q2.x = mulsign(q2.x, d); q2.y = mulsign(q2.y, d); + d = xfabs(d); + + x = -0.640245898480692909870982; + x = x * d + 2.96155103020039511818595; + x = x * d + -5.73353060922947843636166; + x = x * d + 6.03990368989458747961407; + x = x * d + -3.85841935510444988821632; + x = x * d + 2.2307275302496609725722; + + y = x * x; y = y * y; x -= (d * y - x) * (1.0 / 3.0); + + z = x; + + u = ddmul_d2_d_d(x, x); + u = ddmul_d2_d2_d2(u, u); + u = ddmul_d2_d2_d(u, d); + u = ddadd2_d2_d2_d(u, -x); + y = u.x + u.y; + + y = -2.0 / 3.0 * y * z; + v = ddadd2_d2_d2_d(ddmul_d2_d_d(z, z), y); + v = ddmul_d2_d2_d(v, d); + v = ddmul_d2_d2_d2(v, q2); + z = ldexp(v.x + v.y, (e + 6144) / 3 - 2048); + + if (xisinf(d)) { z = mulsign(INFINITY, q2.x); } + if (d == 0) { z = mulsign(0, q2.x); } + + return z; +} + +double _SLEEF_N(xexp2)(double a) { + double u = expk(ddmul_d2_d2_d(dd(0.69314718055994528623, 2.3190468138462995584e-17), a)); + if (a > 1023) u = INFINITY; + if (xisminf(a)) u = 0; + return u; +} + +double _SLEEF_N(xexp10)(double a) { + double u = expk(ddmul_d2_d2_d(dd(2.3025850929940459011, -2.1707562233822493508e-16), a)); + if (a > 308) u = INFINITY; + if (xisminf(a)) u = 0; + return u; +} + +double _SLEEF_N(xexpm1)(double a) { + double2 d = ddadd2_d2_d2_d(expk2(dd(a, 0)), -1.0); + double x = d.x + d.y; + if (a > 700) x = INFINITY; + if (a < -0.36043653389117156089696070315825181539851971360337e+2) x = -1; + return x; +} + +double _SLEEF_N(xlog10)(double a) { + double2 d = ddmul_d2_d2_d2(logk(a), dd(0.43429448190325176116, 6.6494347733425473126e-17)); + double x = d.x + d.y; + + if (xisinf(a)) x = INFINITY; + if (a < 0) x = NAN; + if (a == 0) x = -INFINITY; + + return x; +} + +double _SLEEF_N(xlog1p)(double a) { + double2 d = logk2(ddadd2_d2_d_d(a, 1)); + double x = d.x + d.y; + + if (xisinf(a)) x = INFINITY; + if (a < -1) x = NAN; + if (a == -1) x = -INFINITY; + + return x; +} Index: sleef/lib/dp-sse2.cpp =================================================================== --- /dev/null +++ sleef/lib/dp-sse2.cpp @@ -0,0 +1,3 @@ +#define BUILD_SSE2 +#include "dp.cpp" + Index: sleef/lib/dp.cpp =================================================================== --- /dev/null +++ sleef/lib/dp.cpp @@ -0,0 +1,1036 @@ +#include +#include + +#include "nonnumber.h" +#include "isa.h" +#include "dd.h" + +// + +#define PI4_A 0.78539816290140151978 +#define PI4_B 4.9604678871439933374e-10 +#define PI4_C 1.1258708853173288931e-18 +#define PI4_D 1.7607799325916000908e-27 + +#define M_4_PI 1.273239544735162542821171882678754627704620361328125 + +#define L2U .69314718055966295651160180568695068359375 +#define L2L .28235290563031577122588448175013436025525412068e-12 +#define R_LN2 
1.442695040888963407359924681001892137426645954152985934135449406931 + +// + +#define PI4_Af 0.78515625f +#define PI4_Bf 0.00024187564849853515625f +#define PI4_Cf 3.7747668102383613586e-08f +#define PI4_Df 1.2816720341285448015e-12f + +#define L2Uf 0.693145751953125f +#define L2Lf 1.428606765330187045e-06f +#define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f + +// + +extern "C" { +vdouble _SLEEF_N(xldexp)(vdouble x, vint q); +vint _SLEEF_N(xilogb)(vdouble d); +vdouble _SLEEF_N(xsin)(vdouble d); +vdouble _SLEEF_N(xsin_u1)(vdouble d); +vdouble _SLEEF_N(xcos)(vdouble d); +vdouble _SLEEF_N(xcos_u1)(vdouble d); +vdouble2 _SLEEF_N(xsincos)(vdouble d); +vdouble2 _SLEEF_N(xsincos_u1)(vdouble d); +vdouble _SLEEF_N(xtan)(vdouble d); +vdouble _SLEEF_N(xtan_u1)(vdouble d); +vdouble _SLEEF_N(xatan2)(vdouble y, vdouble x); +vdouble _SLEEF_N(xatan2_u1)(vdouble y, vdouble x); +vdouble _SLEEF_N(xasin)(vdouble d); +vdouble _SLEEF_N(xasin_u1)(vdouble d); +vdouble _SLEEF_N(xacos)(vdouble d); +vdouble _SLEEF_N(xacos_u1)(vdouble d); +vdouble _SLEEF_N(xatan_u1)(vdouble d); +vdouble _SLEEF_N(xatan)(vdouble s); +vdouble _SLEEF_N(xlog)(vdouble d); +vdouble _SLEEF_N(xexp)(vdouble d); +vdouble _SLEEF_N(xlog_u1)(vdouble d); +vdouble _SLEEF_N(xpow)(vdouble x, vdouble y); +vdouble _SLEEF_N(xsinh)(vdouble x); +vdouble _SLEEF_N(xcosh)(vdouble x); +vdouble _SLEEF_N(xtanh)(vdouble x); +vdouble _SLEEF_N(xasinh)(vdouble x); +vdouble _SLEEF_N(xacosh)(vdouble x); +vdouble _SLEEF_N(xatanh)(vdouble x); +vdouble _SLEEF_N(xcbrt)(vdouble d); +vdouble _SLEEF_N(xcbrt_u1)(vdouble d); +vdouble _SLEEF_N(xexp2)(vdouble a); +vdouble _SLEEF_N(xexp10)(vdouble a); +vdouble _SLEEF_N(xexpm1)(vdouble a); +vdouble _SLEEF_N(xlog10)(vdouble a); +vdouble _SLEEF_N(xlog1p)(vdouble a); +} // extern "C" + +vdouble _SLEEF_N(xldexp)(vdouble x, vint q) { return vldexp_vd_vd_vi(x, q); } + +vint _SLEEF_N(xilogb)(vdouble d) { + vdouble e = vcast_vd_vi(vsub_vi_vi_vi(vilogbp1_vi_vd(vabs_vd_vd(d)), vcast_vi_i(1))); + e = vsel_vd_vm_vd_vd(veq_vm_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(-2147483648.0), e); + e = vsel_vd_vm_vd_vd(veq_vm_vd_vd(vabs_vd_vd(d), vcast_vd_d(INFINITY)), vcast_vd_d(2147483647), e); + return vrint_vi_vd(e); +} + +vdouble _SLEEF_N(xsin)(vdouble d) { + vint q; + vdouble u, s; + + q = vrint_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI))); + + u = vcast_vd_vi(q); + d = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_A*4), d); + d = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_B*4), d); + d = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_C*4), d); + d = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_D*4), d); + + s = vmul_vd_vd_vd(d, d); + + d = (vdouble)vxor_vm_vm_vm(vand_vm_vm_vm(veq_vm_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(1)), vcast_vi_i(1)), (vmask)vcast_vd_d(-0.0)), (vmask)d); + + u = vcast_vd_d(-7.97255955009037868891952e-18); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.81009972710863200091251e-15)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-7.64712219118158833288484e-13)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(1.60590430605664501629054e-10)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-2.50521083763502045810755e-08)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.75573192239198747630416e-06)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.000198412698412696162806809)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00833333333333332974823815)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.166666666666666657414808)); + + u = vmla_vd_vd_vd_vd(s, vmul_vd_vd_vd(u, d), d); + + return u; +} + +vdouble _SLEEF_N(xsin_u1)(vdouble d) { + vint q; + vdouble u; + vdouble2 s, t, x; + + q = 
vrint_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI))); + u = vcast_vd_vi(q); + + s = ddadd2_vd2_vd_vd (d, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_A*4))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_B*4))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_C*4))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_D*4))); + + t = s; + s = ddsqu_vd2_vd2(s); + + u = vcast_vd_d(2.72052416138529567917983e-15); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-7.6429259411395447190023e-13)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(1.60589370117277896211623e-10)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-2.5052106814843123359368e-08)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(2.75573192104428224777379e-06)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-0.000198412698412046454654947)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.00833333333333318056201922)); + + x = ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd2_vd2(ddadd_vd2_vd_vd(vcast_vd_d(-0.166666666666666657414808), vmul_vd_vd_vd(u, s.x)), s)); + + x = ddmul_vd2_vd2_vd2(t, x); + u = vadd_vd_vd_vd(x.x, x.y); + + u = (vdouble)vxor_vm_vm_vm(vand_vm_vm_vm(veq_vm_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(1)), vcast_vi_i(1)), (vmask)vcast_vd_d(-0.0)), (vmask)u); + + return u; +} + +vdouble _SLEEF_N(xcos)(vdouble d) { + vint q; + vdouble u, s; + + q = vrint_vi_vd(vmla_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI), vcast_vd_d(-0.5))); + q = vadd_vi_vi_vi(vadd_vi_vi_vi(q, q), vcast_vi_i(1)); + + u = vcast_vd_vi(q); + d = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_A*2), d); + d = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_B*2), d); + d = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_C*2), d); + d = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_D*2), d); + + s = vmul_vd_vd_vd(d, d); + + d = (vdouble)vxor_vm_vm_vm(vand_vm_vm_vm(veq_vm_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(0)), (vmask)vcast_vd_d(-0.0)), (vmask)d); + + u = vcast_vd_d(-7.97255955009037868891952e-18); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.81009972710863200091251e-15)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-7.64712219118158833288484e-13)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(1.60590430605664501629054e-10)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-2.50521083763502045810755e-08)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.75573192239198747630416e-06)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.000198412698412696162806809)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00833333333333332974823815)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.166666666666666657414808)); + + u = vmla_vd_vd_vd_vd(s, vmul_vd_vd_vd(u, d), d); + + return u; +} + +vdouble _SLEEF_N(xcos_u1)(vdouble d) { + vint q; + vdouble u; + vdouble2 s, t, x; + + q = vrint_vi_vd(vmla_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI), vcast_vd_d(-0.5))); + q = vadd_vi_vi_vi(vadd_vi_vi_vi(q, q), vcast_vi_i(1)); + u = vcast_vd_vi(q); + + s = ddadd2_vd2_vd_vd (d, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_A*2))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_B*2))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_C*2))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_D*2))); + + t = s; + s = ddsqu_vd2_vd2(s); + + u = vcast_vd_d(2.72052416138529567917983e-15); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-7.6429259411395447190023e-13)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(1.60589370117277896211623e-10)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-2.5052106814843123359368e-08)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(2.75573192104428224777379e-06)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-0.000198412698412046454654947)); + u = 
vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.00833333333333318056201922)); + + x = ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd2_vd2(ddadd_vd2_vd_vd(vcast_vd_d(-0.166666666666666657414808), vmul_vd_vd_vd(u, s.x)), s)); + + x = ddmul_vd2_vd2_vd2(t, x); + u = vadd_vd_vd_vd(x.x, x.y); + + u = (vdouble)vxor_vm_vm_vm(vand_vm_vm_vm(veq_vm_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(0)), (vmask)vcast_vd_d(-0.0)), (vmask)u); + + return u; +} + +vdouble2 _SLEEF_N(xsincos)(vdouble d) { + vint q; + vmask m; + vdouble u, s, t, rx, ry; + vdouble2 r; + + q = vrint_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_2_PI))); + + s = d; + + u = vcast_vd_vi(q); + s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_A*2), s); + s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_B*2), s); + s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_C*2), s); + s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_D*2), s); + + t = s; + + s = vmul_vd_vd_vd(s, s); + + u = vcast_vd_d(1.58938307283228937328511e-10); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-2.50506943502539773349318e-08)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.75573131776846360512547e-06)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.000198412698278911770864914)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0083333333333191845961746)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.166666666666666130709393)); + u = vmul_vd_vd_vd(vmul_vd_vd_vd(u, s), t); + + rx = vadd_vd_vd_vd(t, u); + + u = vcast_vd_d(-1.13615350239097429531523e-11); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.08757471207040055479366e-09)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-2.75573144028847567498567e-07)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.48015872890001867311915e-05)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.00138888888888714019282329)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0416666666666665519592062)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.5)); + + ry = vmla_vd_vd_vd_vd(s, u, vcast_vd_d(1)); + + m = veq_vm_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(1)), vcast_vi_i(0)); + r.x = vsel_vd_vm_vd_vd(m, rx, ry); + r.y = vsel_vd_vm_vd_vd(m, ry, rx); + + m = veq_vm_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(2)); + r.x = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vm_vm(m, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.x))); + + m = veq_vm_vi_vi(vand_vi_vi_vi(vadd_vi_vi_vi(q, vcast_vi_i(1)), vcast_vi_i(2)), vcast_vi_i(2)); + r.y = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vm_vm(m, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.y))); + + m = visinf_vm_vd(d); + r.x = (vdouble)vor_vm_vm_vm(m, (vmask)r.x); + r.y = (vdouble)vor_vm_vm_vm(m, (vmask)r.y); + + return r; +} + +vdouble2 _SLEEF_N(xsincos_u1)(vdouble d) { + vint q; + vmask m; + vdouble u, rx, ry; + vdouble2 r, s, t, x; + + q = vrint_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(2 * M_1_PI))); + u = vcast_vd_vi(q); + + s = ddadd2_vd2_vd_vd (d, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_A*2))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_B*2))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_C*2))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_D*2))); + + t = s; + + s = ddsqu_vd2_vd2(s); + s.x = vadd_vd_vd_vd(s.x, s.y); + + u = vcast_vd_d(1.58938307283228937328511e-10); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-2.50506943502539773349318e-08)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(2.75573131776846360512547e-06)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-0.000198412698278911770864914)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.0083333333333191845961746)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-0.166666666666666130709393)); + + u = 
vmul_vd_vd_vd(u, vmul_vd_vd_vd(s.x, t.x)); + + x = ddadd_vd2_vd2_vd(t, u); + rx = vadd_vd_vd_vd(x.x, x.y); + + u = vcast_vd_d(-1.13615350239097429531523e-11); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(2.08757471207040055479366e-09)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-2.75573144028847567498567e-07)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(2.48015872890001867311915e-05)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-0.00138888888888714019282329)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.0416666666666665519592062)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-0.5)); + + x = ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd_vd(s.x, u)); + ry = vadd_vd_vd_vd(x.x, x.y); + + m = veq_vm_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(1)), vcast_vi_i(0)); + r.x = vsel_vd_vm_vd_vd(m, rx, ry); + r.y = vsel_vd_vm_vd_vd(m, ry, rx); + + m = veq_vm_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(2)); + r.x = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vm_vm(m, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.x))); + + m = veq_vm_vi_vi(vand_vi_vi_vi(vadd_vi_vi_vi(q, vcast_vi_i(1)), vcast_vi_i(2)), vcast_vi_i(2)); + r.y = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vm_vm(m, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.y))); + + m = visinf_vm_vd(d); + r.x = (vdouble)vor_vm_vm_vm(m, (vmask)r.x); + r.y = (vdouble)vor_vm_vm_vm(m, (vmask)r.y); + + return r; +} + +vdouble _SLEEF_N(xtan)(vdouble d) { + vint q; + vdouble u, s, x; + vmask m; + + q = vrint_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_2_PI))); + + u = vcast_vd_vi(q); + x = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_A*2), d); + x = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_B*2), x); + x = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_C*2), x); + x = vmla_vd_vd_vd_vd(u, vcast_vd_d(-PI4_D*2), x); + + s = vmul_vd_vd_vd(x, x); + + m = veq_vm_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(1)), vcast_vi_i(1)); + x = (vdouble)vxor_vm_vm_vm(vand_vm_vm_vm(m, (vmask)vcast_vd_d(-0.0)), (vmask)x); + + u = vcast_vd_d(1.01419718511083373224408e-05); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-2.59519791585924697698614e-05)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(5.23388081915899855325186e-05)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-3.05033014433946488225616e-05)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(7.14707504084242744267497e-05)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(8.09674518280159187045078e-05)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.000244884931879331847054404)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.000588505168743587154904506)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00145612788922812427978848)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00359208743836906619142924)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00886323944362401618113356)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0218694882853846389592078)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0539682539781298417636002)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.133333333333125941821962)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.333333333333334980164153)); + + u = vmla_vd_vd_vd_vd(s, vmul_vd_vd_vd(u, x), x); + + u = vsel_vd_vm_vd_vd(m, vrec_vd_vd(u), u); + + u = (vdouble)vor_vm_vm_vm(visinf_vm_vd(d), (vmask)u); + + return u; +} + +vdouble _SLEEF_N(xtan_u1)(vdouble d) { + vint q; + vdouble u; + vdouble2 s, t, x; + vmask m; + + q = vrint_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_2_PI))); + u = vcast_vd_vi(q); + + s = ddadd2_vd2_vd_vd (d, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_A*2))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_B*2))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_C*2))); + s = 
ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(u, vcast_vd_d(-PI4_D*2))); + + m = veq_vm_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(1)), vcast_vi_i(1)); + vmask n = vand_vm_vm_vm(m, (vmask)vcast_vd_d(-0.0)); + s.x = (vdouble)vxor_vm_vm_vm((vmask)s.x, n); + s.y = (vdouble)vxor_vm_vm_vm((vmask)s.y, n); + + t = s; + s = ddsqu_vd2_vd2(s); + + u = vcast_vd_d(1.01419718511083373224408e-05); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-2.59519791585924697698614e-05)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(5.23388081915899855325186e-05)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-3.05033014433946488225616e-05)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(7.14707504084242744267497e-05)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(8.09674518280159187045078e-05)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.000244884931879331847054404)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.000588505168743587154904506)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.00145612788922812427978848)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.00359208743836906619142924)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.00886323944362401618113356)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.0218694882853846389592078)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.0539682539781298417636002)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.133333333333125941821962)); + + x = ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd2_vd2(ddadd_vd2_vd_vd(vcast_vd_d(0.333333333333334980164153), vmul_vd_vd_vd(u, s.x)), s)); + x = ddmul_vd2_vd2_vd2(t, x); + + x = vsel_vd2_vm_vd2_vd2(m, ddrec_vd2_vd2(x), x); + + u = vadd_vd_vd_vd(x.x, x.y); + + return u; +} + +static inline vdouble atan2k(vdouble y, vdouble x) { + vdouble s, t, u; + vint q; + vmask p; + + q = vsel_vi_vd_vd_vi_vi(x, vcast_vd_d(0), vcast_vi_i(-2), vcast_vi_i(0)); + x = vabs_vd_vd(x); + + q = vsel_vi_vd_vd_vi_vi(x, y, vadd_vi_vi_vi(q, vcast_vi_i(1)), q); + p = vlt_vm_vd_vd(x, y); + s = vsel_vd_vm_vd_vd(p, vneg_vd_vd(x), y); + t = vmax_vd_vd_vd(x, y); + + s = vdiv_vd_vd_vd(s, t); + t = vmul_vd_vd_vd(s, s); + + u = vcast_vd_d(-1.88796008463073496563746e-05); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.000209850076645816976906797)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.00110611831486672482563471)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.00370026744188713119232403)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.00889896195887655491740809)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.016599329773529201970117)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.0254517624932312641616861)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.0337852580001353069993897)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.0407629191276836500001934)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.0466667150077840625632675)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.0523674852303482457616113)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.0587666392926673580854313)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.0666573579361080525984562)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.0769219538311769618355029)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.090908995008245008229153)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.111111105648261418443745)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.14285714266771329383765)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.199999999996591265594148)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.333333333333311110369124)); + + t = vmla_vd_vd_vd_vd(s, vmul_vd_vd_vd(t, u), s); + t = vmla_vd_vd_vd_vd(vcast_vd_vi(q), vcast_vd_d(M_PI/2), t); + + return t; +} + +static inline vdouble2 atan2k_u1(vdouble2 y, 
vdouble2 x) { + vdouble u; + vdouble2 s, t; + vint q; + vmask p; + + q = vsel_vi_vd_vd_vi_vi(x.x, vcast_vd_d(0), vcast_vi_i(-2), vcast_vi_i(0)); + p = vlt_vm_vd_vd(x.x, vcast_vd_d(0)); + p = vand_vm_vm_vm(p, (vmask)vcast_vd_d(-0.0)); + x.x = (vdouble)vxor_vm_vm_vm((vmask)x.x, p); + x.y = (vdouble)vxor_vm_vm_vm((vmask)x.y, p); + + q = vsel_vi_vd_vd_vi_vi(x.x, y.x, vadd_vi_vi_vi(q, vcast_vi_i(1)), q); + p = vlt_vm_vd_vd(x.x, y.x); + s = vsel_vd2_vm_vd2_vd2(p, ddneg_vd2_vd2(x), y); + t = vsel_vd2_vm_vd2_vd2(p, y, x); + + s = dddiv_vd2_vd2_vd2(s, t); + t = ddsqu_vd2_vd2(s); + t = ddnormalize_vd2_vd2(t); + + u = vcast_vd_d(1.06298484191448746607415e-05); + u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.000125620649967286867384336)); + u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(0.00070557664296393412389774)); + u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.00251865614498713360352999)); + u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(0.00646262899036991172313504)); + u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.0128281333663399031014274)); + u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(0.0208024799924145797902497)); + u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.0289002344784740315686289)); + u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(0.0359785005035104590853656)); + u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.041848579703592507506027)); + u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(0.0470843011653283988193763)); + u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.0524914210588448421068719)); + u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(0.0587946590969581003860434)); + u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.0666620884778795497194182)); + u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(0.0769225330296203768654095)); + u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.0909090442773387574781907)); + u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(0.111111108376896236538123)); + u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.142857142756268568062339)); + u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(0.199999999997977351284817)); + u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.333333333333317605173818)); + + t = ddmul_vd2_vd2_vd(t, u); + t = ddmul_vd2_vd2_vd2(s, ddadd_vd2_vd_vd2(vcast_vd_d(1), t)); + t = ddadd2_vd2_vd2_vd2(ddmul_vd2_vd2_vd(vcast_vd2_d_d(1.570796326794896557998982, 6.12323399573676603586882e-17), vcast_vd_vi(q)), t); + + return t; +} + +vdouble _SLEEF_N(xatan2)(vdouble y, vdouble x) { + vdouble r = atan2k(vabs_vd_vd(y), x); + + r = vmulsign_vd_vd_vd(r, x); + r = vsel_vd_vm_vd_vd(vor_vm_vm_vm(visinf_vm_vd(x), veq_vm_vd_vd(x, vcast_vd_d(0))), vsub_vd_vd_vd(vcast_vd_d(M_PI/2), visinf2(x, vmulsign_vd_vd_vd(vcast_vd_d(M_PI/2), x))), r); + r = vsel_vd_vm_vd_vd(visinf_vm_vd(y), vsub_vd_vd_vd(vcast_vd_d(M_PI/2), visinf2(x, vmulsign_vd_vd_vd(vcast_vd_d(M_PI/4), x))), r); + r = vsel_vd_vm_vd_vd(veq_vm_vd_vd(y, vcast_vd_d(0.0)), (vdouble)vand_vm_vm_vm(veq_vm_vd_vd(vsign_vd_vd(x), vcast_vd_d(-1.0)), (vmask)vcast_vd_d(M_PI)), r); + + r = (vdouble)vor_vm_vm_vm(vor_vm_vm_vm(visnan_vm_vd(x), visnan_vm_vd(y)), (vmask)vmulsign_vd_vd_vd(r, y)); + return r; +} + +vdouble _SLEEF_N(xatan2_u1)(vdouble y, vdouble x) { + vdouble2 d = atan2k_u1(vcast_vd2_vd_vd(vabs_vd_vd(y), vcast_vd_d(0)), vcast_vd2_vd_vd(x, vcast_vd_d(0))); + vdouble r = vadd_vd_vd_vd(d.x, d.y); + + r = vmulsign_vd_vd_vd(r, x); + r = vsel_vd_vm_vd_vd(vor_vm_vm_vm(visinf_vm_vd(x), veq_vm_vd_vd(x, vcast_vd_d(0))), vsub_vd_vd_vd(vcast_vd_d(M_PI/2), visinf2(x, vmulsign_vd_vd_vd(vcast_vd_d(M_PI/2), x))), r); + r = vsel_vd_vm_vd_vd(visinf_vm_vd(y), vsub_vd_vd_vd(vcast_vd_d(M_PI/2), visinf2(x, 
vmulsign_vd_vd_vd(vcast_vd_d(M_PI/4), x))), r); + r = vsel_vd_vm_vd_vd(veq_vm_vd_vd(y, vcast_vd_d(0.0)), (vdouble)vand_vm_vm_vm(veq_vm_vd_vd(vsign_vd_vd(x), vcast_vd_d(-1.0)), (vmask)vcast_vd_d(M_PI)), r); + + r = (vdouble)vor_vm_vm_vm(vor_vm_vm_vm(visnan_vm_vd(x), visnan_vm_vd(y)), (vmask)vmulsign_vd_vd_vd(r, y)); + return r; +} + +vdouble _SLEEF_N(xasin)(vdouble d) { + vdouble x, y; + x = vadd_vd_vd_vd(vcast_vd_d(1), d); + y = vsub_vd_vd_vd(vcast_vd_d(1), d); + x = vmul_vd_vd_vd(x, y); + x = vsqrt_vd_vd(x); + x = (vdouble)vor_vm_vm_vm(visnan_vm_vd(x), (vmask)atan2k(vabs_vd_vd(d), x)); + return vmulsign_vd_vd_vd(x, d); +} + +vdouble _SLEEF_N(xasin_u1)(vdouble d) { + vdouble2 d2 = atan2k_u1(vcast_vd2_vd_vd(vabs_vd_vd(d), vcast_vd_d(0)), ddsqrt_vd2_vd2(ddmul_vd2_vd2_vd2(ddadd_vd2_vd_vd(vcast_vd_d(1), d), ddsub_vd2_vd_vd(vcast_vd_d(1), d)))); + vdouble r = vadd_vd_vd_vd(d2.x, d2.y); + r = vsel_vd_vm_vd_vd(veq_vm_vd_vd(vabs_vd_vd(d), vcast_vd_d(1)), vcast_vd_d(1.570796326794896557998982), r); + return vmulsign_vd_vd_vd(r, d); +} + +vdouble _SLEEF_N(xacos)(vdouble d) { + vdouble x, y; + x = vadd_vd_vd_vd(vcast_vd_d(1), d); + y = vsub_vd_vd_vd(vcast_vd_d(1), d); + x = vmul_vd_vd_vd(x, y); + x = vsqrt_vd_vd(x); + x = vmulsign_vd_vd_vd(atan2k(x, vabs_vd_vd(d)), d); + y = (vdouble)vand_vm_vm_vm(vlt_vm_vd_vd(d, vcast_vd_d(0)), (vmask)vcast_vd_d(M_PI)); + x = vadd_vd_vd_vd(x, y); + return x; +} + +vdouble _SLEEF_N(xacos_u1)(vdouble d) { + vdouble2 d2 = atan2k_u1(ddsqrt_vd2_vd2(ddmul_vd2_vd2_vd2(ddadd_vd2_vd_vd(vcast_vd_d(1), d), ddsub_vd2_vd_vd(vcast_vd_d(1), d))), vcast_vd2_vd_vd(vabs_vd_vd(d), vcast_vd_d(0))); + d2 = ddscale_vd2_vd2_vd(d2, vmulsign_vd_vd_vd(vcast_vd_d(1), d)); + + vmask m; + m = vneq_vm_vd_vd(vabs_vd_vd(d), vcast_vd_d(1)); + d2.x = (vdouble)vand_vm_vm_vm(m, (vmask)d2.x); + d2.y = (vdouble)vand_vm_vm_vm(m, (vmask)d2.y); + m = vlt_vm_vd_vd(d, vcast_vd_d(0)); + d2 = vsel_vd2_vm_vd2_vd2(m, ddadd_vd2_vd2_vd2(vcast_vd2_d_d(3.141592653589793116, 1.2246467991473532072e-16), d2), d2); + + return vadd_vd_vd_vd(d2.x, d2.y); +} + +vdouble _SLEEF_N(xatan_u1)(vdouble d) { + vdouble2 d2 = atan2k_u1(vcast_vd2_vd_vd(vabs_vd_vd(d), vcast_vd_d(0)), vcast_vd2_d_d(1, 0)); + vdouble r = vadd_vd_vd_vd(d2.x, d2.y); + r = vsel_vd_vm_vd_vd(visinf_vm_vd(d), vcast_vd_d(1.570796326794896557998982), r); + return vmulsign_vd_vd_vd(r, d); +} + +vdouble _SLEEF_N(xatan)(vdouble s) { + vdouble t, u; + vint q; + + q = vsel_vi_vd_vd_vi_vi(s, vcast_vd_d(0), vcast_vi_i(2), vcast_vi_i(0)); + s = vabs_vd_vd(s); + + q = vsel_vi_vd_vd_vi_vi(vcast_vd_d(1), s, vadd_vi_vi_vi(q, vcast_vi_i(1)), q); + s = vsel_vd_vm_vd_vd(vlt_vm_vd_vd(vcast_vd_d(1), s), vrec_vd_vd(s), s); + + t = vmul_vd_vd_vd(s, s); + + u = vcast_vd_d(-1.88796008463073496563746e-05); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.000209850076645816976906797)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.00110611831486672482563471)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.00370026744188713119232403)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.00889896195887655491740809)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.016599329773529201970117)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.0254517624932312641616861)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.0337852580001353069993897)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.0407629191276836500001934)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.0466667150077840625632675)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.0523674852303482457616113)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.0587666392926673580854313)); + 
u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.0666573579361080525984562)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.0769219538311769618355029)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.090908995008245008229153)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.111111105648261418443745)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.14285714266771329383765)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.199999999996591265594148)); + u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.333333333333311110369124)); + + t = vmla_vd_vd_vd_vd(s, vmul_vd_vd_vd(t, u), s); + + t = vsel_vd_vm_vd_vd(veq_vm_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(1)), vcast_vi_i(1)), vsub_vd_vd_vd(vcast_vd_d(M_PI/2), t), t); + t = (vdouble)vxor_vm_vm_vm(vand_vm_vm_vm(veq_vm_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(2)), (vmask)vcast_vd_d(-0.0)), (vmask)t); + + return t; +} + +vdouble _SLEEF_N(xlog)(vdouble d) { + vdouble x, x2; + vdouble t, m; + vint e; + + e = vilogbp1_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(0.7071))); + m = vldexp_vd_vd_vi(d, vneg_vi_vi(e)); + + x = vdiv_vd_vd_vd(vadd_vd_vd_vd(vcast_vd_d(-1), m), vadd_vd_vd_vd(vcast_vd_d(1), m)); + x2 = vmul_vd_vd_vd(x, x); + + t = vcast_vd_d(0.148197055177935105296783); + t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.153108178020442575739679)); + t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.181837339521549679055568)); + t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.22222194152736701733275)); + t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.285714288030134544449368)); + t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.399999999989941956712869)); + t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.666666666666685503450651)); + t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(2)); + + x = vmla_vd_vd_vd_vd(x, t, vmul_vd_vd_vd(vcast_vd_d(0.693147180559945286226764), vcast_vd_vi(e))); + + x = vsel_vd_vm_vd_vd(vispinf_vm_vd(d), vcast_vd_d(INFINITY), x); + x = (vdouble)vor_vm_vm_vm(vgt_vm_vd_vd(vcast_vd_d(0), d), (vmask)x); + x = vsel_vd_vm_vd_vd(veq_vm_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(-INFINITY), x); + + return x; +} + +vdouble _SLEEF_N(xexp)(vdouble d) { + vint q = vrint_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(R_LN2))); + vdouble s, u; + + s = vmla_vd_vd_vd_vd(vcast_vd_vi(q), vcast_vd_d(-L2U), d); + s = vmla_vd_vd_vd_vd(vcast_vd_vi(q), vcast_vd_d(-L2L), s); + + u = vcast_vd_d(2.08860621107283687536341e-09); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.51112930892876518610661e-08)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.75573911234900471893338e-07)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.75572362911928827629423e-06)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.4801587159235472998791e-05)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.000198412698960509205564975)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00138888888889774492207962)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00833333333331652721664984)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0416666666666665047591422)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.166666666666666851703837)); + u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.5)); + + u = vadd_vd_vd_vd(vcast_vd_d(1), vmla_vd_vd_vd_vd(vmul_vd_vd_vd(s, s), u, s)); + + u = vldexp_vd_vd_vi(u, q); + + u = (vdouble)vandnot_vm_vm_vm(visminf_vm_vd(d), (vmask)u); + + return u; +} + +static inline vdouble2 logk(vdouble d) { + vdouble2 x, x2; + vdouble t, m; + vint e; + + e = vilogbp1_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(0.7071))); + m = vldexp_vd_vd_vi(d, vneg_vi_vi(e)); + + x = dddiv_vd2_vd2_vd2(ddadd2_vd2_vd_vd(vcast_vd_d(-1), m), ddadd2_vd2_vd_vd(vcast_vd_d(1), m)); + x2 = ddsqu_vd2_vd2(x); + + t = vcast_vd_d(0.134601987501262130076155); + t = 
vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.132248509032032670243288)); + t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.153883458318096079652524)); + t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.181817427573705403298686)); + t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.222222231326187414840781)); + t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.285714285651261412873718)); + t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.400000000000222439910458)); + t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.666666666666666371239645)); + + return ddadd2_vd2_vd2_vd2(ddmul_vd2_vd2_vd(vcast_vd2_vd_vd(vcast_vd_d(0.693147180559945286226764), vcast_vd_d(2.319046813846299558417771e-17)), + vcast_vd_vi(e)), + ddadd2_vd2_vd2_vd2(ddscale_vd2_vd2_vd(x, vcast_vd_d(2)), ddmul_vd2_vd2_vd(ddmul_vd2_vd2_vd2(x2, x), t))); +} + +vdouble _SLEEF_N(xlog_u1)(vdouble d) { + vdouble2 s = logk(d); + vdouble x = vadd_vd_vd_vd(s.x, s.y); + + x = vsel_vd_vm_vd_vd(vispinf_vm_vd(d), vcast_vd_d(INFINITY), x); + x = (vdouble)vor_vm_vm_vm(vgt_vm_vd_vd(vcast_vd_d(0), d), (vmask)x); + x = vsel_vd_vm_vd_vd(veq_vm_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(-INFINITY), x); + + return x; +} + +static inline vdouble expk(vdouble2 d) { + vdouble u = vmul_vd_vd_vd(vadd_vd_vd_vd(d.x, d.y), vcast_vd_d(R_LN2)); + vint q = vrint_vi_vd(u); + vdouble2 s, t; + + s = ddadd2_vd2_vd2_vd(d, vmul_vd_vd_vd(vcast_vd_vi(q), vcast_vd_d(-L2U))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(vcast_vd_vi(q), vcast_vd_d(-L2L))); + + s = ddnormalize_vd2_vd2(s); + + u = vcast_vd_d(2.51069683420950419527139e-08); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(2.76286166770270649116855e-07)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(2.75572496725023574143864e-06)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(2.48014973989819794114153e-05)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.000198412698809069797676111)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.0013888888939977128960529)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.00833333333332371417601081)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.0416666666665409524128449)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.166666666666666740681535)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.500000000000000999200722)); + + t = ddadd_vd2_vd2_vd2(s, ddmul_vd2_vd2_vd(ddsqu_vd2_vd2(s), u)); + + t = ddadd_vd2_vd_vd2(vcast_vd_d(1), t); + u = vadd_vd_vd_vd(t.x, t.y); + u = vldexp_vd_vd_vi(u, q); + + return u; +} + +vdouble _SLEEF_N(xpow)(vdouble x, vdouble y) { +#if 1 + vmask yisnint = vneq_vm_vd_vd(vcast_vd_vi(vrint_vi_vd(y)), y); + vmask yisodd = vandnot_vm_vm_vm(yisnint, veq_vm_vi_vi(vand_vi_vi_vi(vrint_vi_vd(y), vcast_vi_i(1)), vcast_vi_i(1))); + + vdouble result = expk(ddmul_vd2_vd2_vd(logk(vabs_vd_vd(x)), y)); + + result = vmul_vd_vd_vd(result, + vsel_vd_vm_vd_vd(vgt_vm_vd_vd(x, vcast_vd_d(0)), + vcast_vd_d(1), + (vdouble)vor_vm_vm_vm(yisnint, (vmask)vsel_vd_vm_vd_vd(yisodd, vcast_vd_d(-1.0), vcast_vd_d(1))))); + + vdouble efx = (vdouble)vxor_vm_vm_vm((vmask)vsub_vd_vd_vd(vabs_vd_vd(x), vcast_vd_d(1)), vsignbit_vm_vd(y)); + + result = vsel_vd_vm_vd_vd(visinf_vm_vd(y), + (vdouble)vandnot_vm_vm_vm(vlt_vm_vd_vd(efx, vcast_vd_d(0.0)), + (vmask)vsel_vd_vm_vd_vd(veq_vm_vd_vd(efx, vcast_vd_d(0.0)), + vcast_vd_d(1.0), + vcast_vd_d(INFINITY))), + result); + + result = vsel_vd_vm_vd_vd(vor_vm_vm_vm(visinf_vm_vd(x), veq_vm_vd_vd(x, vcast_vd_d(0.0))), + vmul_vd_vd_vd(vsel_vd_vm_vd_vd(yisodd, vsign_vd_vd(x), vcast_vd_d(1.0)), + (vdouble)vandnot_vm_vm_vm(vlt_vm_vd_vd(vsel_vd_vm_vd_vd(veq_vm_vd_vd(x, vcast_vd_d(0.0)), vneg_vd_vd(y), y), vcast_vd_d(0.0)), + 
(vmask)vcast_vd_d(INFINITY))), + result); + + result = (vdouble)vor_vm_vm_vm(vor_vm_vm_vm(visnan_vm_vd(x), visnan_vm_vd(y)), (vmask)result); + + result = vsel_vd_vm_vd_vd(vor_vm_vm_vm(veq_vm_vd_vd(y, vcast_vd_d(0)), veq_vm_vd_vd(x, vcast_vd_d(1))), vcast_vd_d(1), result); + + return result; +#else + return expk(ddmul_vd2_vd2_vd(logk(x), y)); +#endif +} + +static inline vdouble2 expk2(vdouble2 d) { + vdouble u = vmul_vd_vd_vd(vadd_vd_vd_vd(d.x, d.y), vcast_vd_d(R_LN2)); + vint q = vrint_vi_vd(u); + vdouble2 s, t; + + s = ddadd2_vd2_vd2_vd(d, vmul_vd_vd_vd(vcast_vd_vi(q), vcast_vd_d(-L2U))); + s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(vcast_vd_vi(q), vcast_vd_d(-L2L))); + + u = vcast_vd_d(2.51069683420950419527139e-08); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(2.76286166770270649116855e-07)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(2.75572496725023574143864e-06)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(2.48014973989819794114153e-05)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.000198412698809069797676111)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.0013888888939977128960529)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.00833333333332371417601081)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.0416666666665409524128449)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.166666666666666740681535)); + u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.500000000000000999200722)); + + t = ddadd_vd2_vd2_vd2(s, ddmul_vd2_vd2_vd(ddsqu_vd2_vd2(s), u)); + + t = ddadd_vd2_vd_vd2(vcast_vd_d(1), t); + + return ddscale_vd2_vd2_vd(t, vpow2i_vd_vi(q)); +} + +vdouble _SLEEF_N(xsinh)(vdouble x) { + vdouble y = vabs_vd_vd(x); + vdouble2 d = expk2(vcast_vd2_vd_vd(y, vcast_vd_d(0))); + d = ddsub_vd2_vd2_vd2(d, ddrec_vd2_vd2(d)); + y = vmul_vd_vd_vd(vadd_vd_vd_vd(d.x, d.y), vcast_vd_d(0.5)); + + y = vsel_vd_vm_vd_vd(vor_vm_vm_vm(vgt_vm_vd_vd(vabs_vd_vd(x), vcast_vd_d(710)), visnan_vm_vd(y)), vcast_vd_d(INFINITY), y); + y = vmulsign_vd_vd_vd(y, x); + y = (vdouble)vor_vm_vm_vm(visnan_vm_vd(x), (vmask)y); + + return y; +} + +vdouble _SLEEF_N(xcosh)(vdouble x) { + vdouble y = vabs_vd_vd(x); + vdouble2 d = expk2(vcast_vd2_vd_vd(y, vcast_vd_d(0))); + d = ddadd_vd2_vd2_vd2(d, ddrec_vd2_vd2(d)); + y = vmul_vd_vd_vd(vadd_vd_vd_vd(d.x, d.y), vcast_vd_d(0.5)); + + y = vsel_vd_vm_vd_vd(vor_vm_vm_vm(vgt_vm_vd_vd(vabs_vd_vd(x), vcast_vd_d(710)), visnan_vm_vd(y)), vcast_vd_d(INFINITY), y); + y = (vdouble)vor_vm_vm_vm(visnan_vm_vd(x), (vmask)y); + + return y; +} + +vdouble _SLEEF_N(xtanh)(vdouble x) { + vdouble y = vabs_vd_vd(x); + vdouble2 d = expk2(vcast_vd2_vd_vd(y, vcast_vd_d(0))); + vdouble2 e = ddrec_vd2_vd2(d); + d = dddiv_vd2_vd2_vd2(ddadd2_vd2_vd2_vd2(d, ddneg_vd2_vd2(e)), ddadd2_vd2_vd2_vd2(d, e)); + y = vadd_vd_vd_vd(d.x, d.y); + + y = vsel_vd_vm_vd_vd(vor_vm_vm_vm(vgt_vm_vd_vd(vabs_vd_vd(x), vcast_vd_d(18.714973875)), visnan_vm_vd(y)), vcast_vd_d(1.0), y); + y = vmulsign_vd_vd_vd(y, x); + y = (vdouble)vor_vm_vm_vm(visnan_vm_vd(x), (vmask)y); + + return y; +} + +static inline vdouble2 logk2(vdouble2 d) { + vdouble2 x, x2, m; + vdouble t; + vint e; + + e = vilogbp1_vi_vd(vmul_vd_vd_vd(d.x, vcast_vd_d(0.7071))); + m = ddscale_vd2_vd2_vd(d, vpow2i_vd_vi(vneg_vi_vi(e))); + + x = dddiv_vd2_vd2_vd2(ddadd2_vd2_vd2_vd(m, vcast_vd_d(-1)), ddadd2_vd2_vd2_vd(m, vcast_vd_d(1))); + x2 = ddsqu_vd2_vd2(x); + + t = vcast_vd_d(0.134601987501262130076155); + t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.132248509032032670243288)); + t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.153883458318096079652524)); + t = vmla_vd_vd_vd_vd(t, x2.x, 
vcast_vd_d(0.181817427573705403298686)); + t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.222222231326187414840781)); + t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.285714285651261412873718)); + t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.400000000000222439910458)); + t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.666666666666666371239645)); + + return ddadd2_vd2_vd2_vd2(ddmul_vd2_vd2_vd(vcast_vd2_vd_vd(vcast_vd_d(0.693147180559945286226764), vcast_vd_d(2.319046813846299558417771e-17)), + vcast_vd_vi(e)), + ddadd2_vd2_vd2_vd2(ddscale_vd2_vd2_vd(x, vcast_vd_d(2)), ddmul_vd2_vd2_vd(ddmul_vd2_vd2_vd2(x2, x), t))); +} + +vdouble _SLEEF_N(xasinh)(vdouble x) { + vdouble y = vabs_vd_vd(x); + vdouble2 d = logk2(ddadd2_vd2_vd2_vd(ddsqrt_vd2_vd2(ddadd2_vd2_vd2_vd(ddmul_vd2_vd_vd(y, y), vcast_vd_d(1))), y)); + y = vadd_vd_vd_vd(d.x, d.y); + + y = vsel_vd_vm_vd_vd(vor_vm_vm_vm(visinf_vm_vd(x), visnan_vm_vd(y)), vcast_vd_d(INFINITY), y); + y = vmulsign_vd_vd_vd(y, x); + y = (vdouble)vor_vm_vm_vm(visnan_vm_vd(x), (vmask)y); + + return y; +} + +vdouble _SLEEF_N(xacosh)(vdouble x) { + vdouble2 d = logk2(ddadd2_vd2_vd2_vd(ddsqrt_vd2_vd2(ddadd2_vd2_vd2_vd(ddmul_vd2_vd_vd(x, x), vcast_vd_d(-1))), x)); + vdouble y = vadd_vd_vd_vd(d.x, d.y); + + y = vsel_vd_vm_vd_vd(vor_vm_vm_vm(visinf_vm_vd(x), visnan_vm_vd(y)), vcast_vd_d(INFINITY), y); + y = (vdouble)vandnot_vm_vm_vm(veq_vm_vd_vd(x, vcast_vd_d(1.0)), (vmask)y); + + y = (vdouble)vor_vm_vm_vm(vlt_vm_vd_vd(x, vcast_vd_d(1.0)), (vmask)y); + y = (vdouble)vor_vm_vm_vm(visnan_vm_vd(x), (vmask)y); + + return y; +} + +vdouble _SLEEF_N(xatanh)(vdouble x) { + vdouble y = vabs_vd_vd(x); + vdouble2 d = logk2(dddiv_vd2_vd2_vd2(ddadd2_vd2_vd_vd(vcast_vd_d(1), y), ddadd2_vd2_vd_vd(vcast_vd_d(1), vneg_vd_vd(y)))); + y = (vdouble)vor_vm_vm_vm(vgt_vm_vd_vd(y, vcast_vd_d(1.0)), (vmask)vsel_vd_vm_vd_vd(veq_vm_vd_vd(y, vcast_vd_d(1.0)), vcast_vd_d(INFINITY), vmul_vd_vd_vd(vadd_vd_vd_vd(d.x, d.y), vcast_vd_d(0.5)))); + + y = (vdouble)vor_vm_vm_vm(vor_vm_vm_vm(visinf_vm_vd(x), visnan_vm_vd(y)), (vmask)y); + + y = vmulsign_vd_vd_vd(y, x); + y = (vdouble)vor_vm_vm_vm(visnan_vm_vd(x), (vmask)y); + + return y; +} + +vdouble _SLEEF_N(xcbrt)(vdouble d) { + vdouble x, y, q = vcast_vd_d(1.0); + vint e, qu, re; + vdouble t; + + e = vilogbp1_vi_vd(vabs_vd_vd(d)); + d = vldexp_vd_vd_vi(d, vneg_vi_vi(e)); + + t = vadd_vd_vd_vd(vcast_vd_vi(e), vcast_vd_d(6144)); + qu = vtruncate_vi_vd(vmul_vd_vd_vd(t, vcast_vd_d(1.0/3.0))); + re = vtruncate_vi_vd(vsub_vd_vd_vd(t, vmul_vd_vd_vd(vcast_vd_vi(qu), vcast_vd_d(3)))); + + q = vsel_vd_vm_vd_vd(veq_vm_vi_vi(re, vcast_vi_i(1)), vcast_vd_d(1.2599210498948731647672106), q); + q = vsel_vd_vm_vd_vd(veq_vm_vi_vi(re, vcast_vi_i(2)), vcast_vd_d(1.5874010519681994747517056), q); + q = vldexp_vd_vd_vi(q, vsub_vi_vi_vi(qu, vcast_vi_i(2048))); + + q = vmulsign_vd_vd_vd(q, d); + + d = vabs_vd_vd(d); + + x = vcast_vd_d(-0.640245898480692909870982); + x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(2.96155103020039511818595)); + x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(-5.73353060922947843636166)); + x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(6.03990368989458747961407)); + x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(-3.85841935510444988821632)); + x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(2.2307275302496609725722)); + + y = vmul_vd_vd_vd(x, x); y = vmul_vd_vd_vd(y, y); x = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vmlapn_vd_vd_vd_vd(d, y, x), vcast_vd_d(1.0 / 3.0))); + y = vmul_vd_vd_vd(vmul_vd_vd_vd(d, x), x); + y = vmul_vd_vd_vd(vsub_vd_vd_vd(y, vmul_vd_vd_vd(vmul_vd_vd_vd(vcast_vd_d(2.0 / 3.0), y), 
vmla_vd_vd_vd_vd(y, x, vcast_vd_d(-1.0)))), q); + + return y; +} + +vdouble _SLEEF_N(xcbrt_u1)(vdouble d) { + vdouble x, y, z, t; + vdouble2 q2 = vcast_vd2_d_d(1, 0), u, v; + vint e, qu, re; + + e = vilogbp1_vi_vd(vabs_vd_vd(d)); + d = vldexp_vd_vd_vi(d, vneg_vi_vi(e)); + + t = vadd_vd_vd_vd(vcast_vd_vi(e), vcast_vd_d(6144)); + qu = vtruncate_vi_vd(vmul_vd_vd_vd(t, vcast_vd_d(1.0/3.0))); + re = vtruncate_vi_vd(vsub_vd_vd_vd(t, vmul_vd_vd_vd(vcast_vd_vi(qu), vcast_vd_d(3)))); + + q2 = vsel_vd2_vm_vd2_vd2(veq_vm_vi_vi(re, vcast_vi_i(1)), vcast_vd2_d_d(1.2599210498948731907, -2.5899333753005069177e-17), q2); + q2 = vsel_vd2_vm_vd2_vd2(veq_vm_vi_vi(re, vcast_vi_i(2)), vcast_vd2_d_d(1.5874010519681995834, -1.0869008194197822986e-16), q2); + + q2.x = vmulsign_vd_vd_vd(q2.x, d); q2.y = vmulsign_vd_vd_vd(q2.y, d); + d = vabs_vd_vd(d); + + x = vcast_vd_d(-0.640245898480692909870982); + x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(2.96155103020039511818595)); + x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(-5.73353060922947843636166)); + x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(6.03990368989458747961407)); + x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(-3.85841935510444988821632)); + x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(2.2307275302496609725722)); + + y = vmul_vd_vd_vd(x, x); y = vmul_vd_vd_vd(y, y); x = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vmlapn_vd_vd_vd_vd(d, y, x), vcast_vd_d(1.0 / 3.0))); + + z = x; + + u = ddmul_vd2_vd_vd(x, x); + u = ddmul_vd2_vd2_vd2(u, u); + u = ddmul_vd2_vd2_vd(u, d); + u = ddadd2_vd2_vd2_vd(u, vneg_vd_vd(x)); + y = vadd_vd_vd_vd(u.x, u.y); + + y = vmul_vd_vd_vd(vmul_vd_vd_vd(vcast_vd_d(-2.0 / 3.0), y), z); + v = ddadd2_vd2_vd2_vd(ddmul_vd2_vd_vd(z, z), y); + v = ddmul_vd2_vd2_vd(v, d); + v = ddmul_vd2_vd2_vd2(v, q2); + z = vldexp_vd_vd_vi(vadd_vd_vd_vd(v.x, v.y), vsub_vi_vi_vi(qu, vcast_vi_i(2048))); + + z = vsel_vd_vm_vd_vd(visinf_vm_vd(d), vmulsign_vd_vd_vd(vcast_vd_d(INFINITY), q2.x), z); + z = vsel_vd_vm_vd_vd(veq_vm_vd_vd(d, vcast_vd_d(0)), (vdouble)vsignbit_vm_vd(q2.x), z); + + return z; +} + +vdouble _SLEEF_N(xexp2)(vdouble a) { + vdouble u = expk(ddmul_vd2_vd2_vd(vcast_vd2_vd_vd(vcast_vd_d(0.69314718055994528623), vcast_vd_d(2.3190468138462995584e-17)), a)); + u = vsel_vd_vm_vd_vd(vgt_vm_vd_vd(a, vcast_vd_d(1023)), vcast_vd_d(INFINITY), u); + u = (vdouble)vandnot_vm_vm_vm(visminf_vm_vd(a), (vmask)u); + return u; +} + +vdouble _SLEEF_N(xexp10)(vdouble a) { + vdouble u = expk(ddmul_vd2_vd2_vd(vcast_vd2_vd_vd(vcast_vd_d(2.3025850929940459011), vcast_vd_d(-2.1707562233822493508e-16)), a)); + u = vsel_vd_vm_vd_vd(vgt_vm_vd_vd(a, vcast_vd_d(308)), vcast_vd_d(INFINITY), u); + u = (vdouble)vandnot_vm_vm_vm(visminf_vm_vd(a), (vmask)u); + return u; +} + +vdouble _SLEEF_N(xexpm1)(vdouble a) { + vdouble2 d = ddadd2_vd2_vd2_vd(expk2(vcast_vd2_vd_vd(a, vcast_vd_d(0))), vcast_vd_d(-1.0)); + vdouble x = vadd_vd_vd_vd(d.x, d.y); + x = vsel_vd_vm_vd_vd(vgt_vm_vd_vd(a, vcast_vd_d(700)), vcast_vd_d(INFINITY), x); + x = vsel_vd_vm_vd_vd(vlt_vm_vd_vd(a, vcast_vd_d(-0.36043653389117156089696070315825181539851971360337e+2)), vcast_vd_d(-1), x); + return x; +} + +vdouble _SLEEF_N(xlog10)(vdouble a) { + vdouble2 d = ddmul_vd2_vd2_vd2(logk(a), vcast_vd2_vd_vd(vcast_vd_d(0.43429448190325176116), vcast_vd_d(6.6494347733425473126e-17))); + vdouble x = vadd_vd_vd_vd(d.x, d.y); + + x = vsel_vd_vm_vd_vd(vispinf_vm_vd(a), vcast_vd_d(INFINITY), x); + x = (vdouble)vor_vm_vm_vm(vgt_vm_vd_vd(vcast_vd_d(0), a), (vmask)x); + x = vsel_vd_vm_vd_vd(veq_vm_vd_vd(a, vcast_vd_d(0)), vcast_vd_d(-INFINITY), x); + + return x; +} + +vdouble 
_SLEEF_N(xlog1p)(vdouble a) { + vdouble2 d = logk2(ddadd2_vd2_vd_vd(a, vcast_vd_d(1))); + vdouble x = vadd_vd_vd_vd(d.x, d.y); + + x = vsel_vd_vm_vd_vd(vispinf_vm_vd(a), vcast_vd_d(INFINITY), x); + x = (vdouble)vor_vm_vm_vm(vgt_vm_vd_vd(vcast_vd_d(-1.0), a), (vmask)x); + x = vsel_vd_vm_vd_vd(veq_vm_vd_vd(a, vcast_vd_d(-1)), vcast_vd_d(-INFINITY), x); + + return x; +} Index: sleef/lib/fma4.h =================================================================== --- /dev/null +++ sleef/lib/fma4.h @@ -0,0 +1,320 @@ +/*===---------- fma4.h - FMA4 functions ------------------------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __FMA4__ +#error Please specify -mfma4. 
+#endif + +#include <x86intrin.h> + +typedef __m256d vdouble; +typedef __m128i vint; +typedef __m256i vmask; + +typedef __m256 vfloat; +typedef struct { vint x, y; } vint2; + +#define ENABLE_FMA_DP +#define ENABLE_FMA_SP + +// + +static inline vint vrint_vi_vd(vdouble vd) { return _mm256_cvtpd_epi32(vd); } +static inline vint vtruncate_vi_vd(vdouble vd) { return _mm256_cvttpd_epi32(vd); } +static inline vdouble vcast_vd_vi(vint vi) { return _mm256_cvtepi32_pd(vi); } +static inline vdouble vcast_vd_d(double d) { return _mm256_set_pd(d, d, d, d); } +static inline vint vcast_vi_i(int i) { return _mm_set_epi32(i, i, i, i); } + +static inline vmask vreinterpret_vm_vd(vdouble vd) { return (__m256i)vd; } +static inline vdouble vreinterpret_vd_vm(vmask vm) { return (__m256d)vm; } + +static inline vmask vreinterpret_vm_vf(vfloat vf) { return (__m256i)vf; } +static inline vfloat vreinterpret_vf_vm(vmask vm) { return (__m256)vm; } + +// + +static inline vint vadd_vi_vi_vi(vint x, vint y) { return _mm_add_epi32(x, y); } +static inline vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(x, y); } +static inline vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); } + +static inline vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(x, y); } +static inline vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_si128(x, y); } +static inline vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x, y); } +static inline vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(x, y); } + +static inline vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32(x, c); } +static inline vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32(x, c); } +static inline vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32(x, c); } + +// + +static inline vmask vand_vm_vm_vm(vmask x, vmask y) { return (vmask)_mm256_and_pd((__m256d)x, (__m256d)y); } +static inline vmask vandnot_vm_vm_vm(vmask x, vmask y) { return (vmask)_mm256_andnot_pd((__m256d)x, (__m256d)y); } +static inline vmask vor_vm_vm_vm(vmask x, vmask y) { return (vmask)_mm256_or_pd((__m256d)x, (__m256d)y); } +static inline vmask vxor_vm_vm_vm(vmask x, vmask y) { return (vmask)_mm256_xor_pd((__m256d)x, (__m256d)y); } + +static inline vmask veq_vm_vd_vd(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_EQ_OQ); } +static inline vmask vneq_vm_vd_vd(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_NEQ_UQ); } +static inline vmask vlt_vm_vd_vd(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_LT_OQ); } +static inline vmask vle_vm_vd_vd(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_LE_OQ); } +static inline vmask vgt_vm_vd_vd(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_GT_OQ); } +static inline vmask vge_vm_vd_vd(vdouble x, vdouble y) { return (__m256i)_mm256_cmp_pd(x, y, _CMP_GE_OQ); } + +static inline vmask veq_vm_vf_vf(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_EQ_OQ); } +static inline vmask vneq_vm_vf_vf(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_NEQ_UQ); } +static inline vmask vlt_vm_vf_vf(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_LT_OQ); } +static inline vmask vle_vm_vf_vf(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_LE_OQ); } +static inline vmask vgt_vm_vf_vf(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_GT_OQ); } +static inline vmask vge_vm_vf_vf(vfloat x, vfloat y) { return (__m256i)_mm256_cmp_ps(x, y, _CMP_GE_OQ); } + +// + +static inline vfloat 
vcast_vf_f(float f) { return _mm256_set_ps(f, f, f, f, f, f, f, f); } + +static inline vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_add_ps(x, y); } +static inline vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm256_sub_ps(x, y); } +static inline vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm256_mul_ps(x, y); } +static inline vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm256_div_ps(x, y); } +static inline vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); } +static inline vfloat vsqrt_vf_vf(vfloat x) { return _mm256_sqrt_ps(x); } +static inline vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm256_max_ps(x, y); } +static inline vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm256_min_ps(x, y); } + +static inline vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_macc_ps(x, y, z); } +static inline vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_msub_ps(x, y, z); } +static inline vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_nmacc_ps(x, y, z); } +static inline vfloat vabs_vf_vf(vfloat f) { return (vfloat)vandnot_vm_vm_vm((vmask)vcast_vf_f(-0.0f), (vmask)f); } +static inline vfloat vneg_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)vcast_vf_f(-0.0f), (vmask)d); } + +static inline vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_macc_ps(x, y, z); } +static inline vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_macc_ps(x, y, z); } +static inline vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_msub_ps(x, y, z); } +static inline vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_nmacc_ps(x, y, z); } +static inline vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_nmsub_ps(x, y, z); } + +// + +static inline vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_add_pd(x, y); } +static inline vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm256_sub_pd(x, y); } +static inline vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm256_mul_pd(x, y); } +static inline vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm256_div_pd(x, y); } +static inline vdouble vrec_vd_vd(vdouble x) { return _mm256_div_pd(_mm256_set_pd(1, 1, 1, 1), x); } +static inline vdouble vsqrt_vd_vd(vdouble x) { return _mm256_sqrt_pd(x); } +static inline vdouble vabs_vd_vd(vdouble d) { return (__m256d)_mm256_andnot_pd(_mm256_set_pd(-0.0,-0.0,-0.0,-0.0), d); } +static inline vdouble vneg_vd_vd(vdouble d) { return (__m256d)_mm256_xor_pd(_mm256_set_pd(-0.0,-0.0,-0.0,-0.0), d); } +static inline vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_macc_pd(x, y, z); } +static inline vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_msub_pd(x, y, z); } + +static inline vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm256_max_pd(x, y); } +static inline vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm256_min_pd(x, y); } + +static inline vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_macc_pd(x, y, z); } +static inline vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_macc_pd(x, y, z); } +static inline vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_msub_pd(x, y, z); } +static inline vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_nmacc_pd(x, y, z); } +static inline vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, 
vdouble z) { return _mm256_nmsub_pd(x, y, z); } + +// + +static inline vmask veq_vm_vi_vi(vint x, vint y) { + __m256d r = _mm256_cvtepi32_pd(_mm_and_si128(_mm_cmpeq_epi32(x, y), _mm_set_epi32(1, 1, 1, 1))); + return veq_vm_vd_vd(r, _mm256_set_pd(1, 1, 1, 1)); +} + +static inline vdouble vsel_vd_vm_vd_vd(vmask mask, vdouble x, vdouble y) { + return (__m256d)vor_vm_vm_vm(vand_vm_vm_vm(mask, (__m256i)x), vandnot_vm_vm_vm(mask, (__m256i)y)); +} + +static inline vfloat vsel_vf_vm_vf_vf(vmask mask, vfloat x, vfloat y) { + return (vfloat)vor_vm_vm_vm(vand_vm_vm_vm(mask, (vmask)x), vandnot_vm_vm_vm(mask, (vmask)y)); +} + +static inline vint vsel_vi_vd_vd_vi_vi(vdouble d0, vdouble d1, vint x, vint y) { + __m128i mask = _mm256_cvtpd_epi32(_mm256_and_pd(_mm256_cmp_pd(d0, d1, _CMP_LT_OQ), _mm256_set_pd(1.0, 1.0, 1.0, 1.0))); + mask = _mm_cmpeq_epi32(mask, _mm_set_epi32(1, 1, 1, 1)); + return vor_vi_vi_vi(vand_vi_vi_vi(mask, x), vandnot_vi_vi_vi(mask, y)); +} + +// + +static inline vint2 vcast_vi2_vm(vmask vm) { + vint2 r; + r.x = _mm256_castsi256_si128(vm); + r.y = _mm256_extractf128_si256(vm, 1); + return r; +} + +static inline vmask vcast_vm_vi2(vint2 vi) { + vmask m = _mm256_castsi128_si256(vi.x); + m = _mm256_insertf128_si256(m, vi.y, 1); + return m; +} + +static inline vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm((vmask)_mm256_cvtps_epi32(vf)); } +static inline vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm((vmask)_mm256_cvttps_epi32(vf)); } +static inline vfloat vcast_vf_vi2(vint2 vi) { return _mm256_cvtepi32_ps((vmask)vcast_vm_vi2(vi)); } +static inline vint2 vcast_vi2_i(int i) { vint2 r; r.x = r.y = vcast_vi_i(i); return r; } + +static inline vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 r; r.x = vadd_vi_vi_vi(x.x, y.x); r.y = vadd_vi_vi_vi(x.y, y.y); return r; } +static inline vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 r; r.x = vsub_vi_vi_vi(x.x, y.x); r.y = vsub_vi_vi_vi(x.y, y.y); return r; } +static inline vint2 vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcast_vi2_i(0), e); } + +static inline vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 r; r.x = vand_vi_vi_vi(x.x, y.x); r.y = vand_vi_vi_vi(x.y, y.y); return r; } +static inline vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 r; r.x = vandnot_vi_vi_vi(x.x, y.x); r.y = vandnot_vi_vi_vi(x.y, y.y); return r; } +static inline vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 r; r.x = vor_vi_vi_vi(x.x, y.x); r.y = vor_vi_vi_vi(x.y, y.y); return r; } +static inline vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { vint2 r; r.x = vxor_vi_vi_vi(x.x, y.x); r.y = vxor_vi_vi_vi(x.y, y.y); return r; } + +static inline vint2 vsll_vi2_vi2_i(vint2 x, int c) { vint2 r; r.x = vsll_vi_vi_i(x.x, c); r.y = vsll_vi_vi_i(x.y, c); return r; } +static inline vint2 vsrl_vi2_vi2_i(vint2 x, int c) { vint2 r; r.x = vsrl_vi_vi_i(x.x, c); r.y = vsrl_vi_vi_i(x.y, c); return r; } +static inline vint2 vsra_vi2_vi2_i(vint2 x, int c) { vint2 r; r.x = vsra_vi_vi_i(x.x, c); r.y = vsra_vi_vi_i(x.y, c); return r; } + +static inline vmask veq_vm_vi2_vi2(vint2 x, vint2 y) { + vint2 r; + r.x = _mm_cmpeq_epi32(x.x, y.x); + r.y = _mm_cmpeq_epi32(x.y, y.y); + return vcast_vm_vi2(r); +} + +static inline vmask vgt_vm_vi2_vi2(vint2 x, vint2 y) { + vint2 r; + r.x = _mm_cmpgt_epi32(x.x, y.x); + r.y = _mm_cmpgt_epi32(x.y, y.y); + return vcast_vm_vi2(r); +} + +static inline vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { + vint2 r; + r.x = _mm_cmpgt_epi32(x.x, y.x); + r.y = _mm_cmpgt_epi32(x.y, y.y); + return r; +} + +static inline vint2 vsel_vi2_vm_vi2_vi2(vmask m, 
vint2 x, vint2 y) { + vint2 r, m2 = vcast_vi2_vm(m); + r.x = vor_vi_vi_vi(vand_vi_vi_vi(m2.x, x.x), vandnot_vi_vi_vi(m2.x, y.x)); + r.y = vor_vi_vi_vi(vand_vi_vi_vi(m2.y, x.y), vandnot_vi_vi_vi(m2.y, y.y)); + return r; +} + +// + +static inline double vcast_d_vd(vdouble v) { + double s[4]; + _mm256_storeu_pd(s, v); + return s[0]; +} + +static inline float vcast_f_vf(vfloat v) { + float s[8]; + _mm256_storeu_ps(s, v); + return s[0]; +} + +static inline vmask vsignbit_vm_vd(vdouble d) { + return (vmask)_mm256_and_pd(d, _mm256_set_pd(-0.0,-0.0,-0.0,-0.0)); +} + +static inline vdouble vsign_vd_vd(vdouble d) { + return _mm256_or_pd(_mm256_set_pd(1.0, 1.0, 1.0, 1.0), (vdouble)vsignbit_vm_vd(d)); +} + +static inline vdouble vmulsign_vd_vd_vd(vdouble x, vdouble y) { + return (__m256d)vxor_vm_vm_vm((__m256i)x, vsignbit_vm_vd(y)); +} + +static inline vmask visinf_vm_vd(vdouble d) { + return (vmask)_mm256_cmp_pd(vabs_vd_vd(d), _mm256_set_pd(INFINITY, INFINITY, INFINITY, INFINITY), _CMP_EQ_OQ); +} + +static inline vmask vispinf_vm_vd(vdouble d) { + return (vmask)_mm256_cmp_pd(d, _mm256_set_pd(INFINITY, INFINITY, INFINITY, INFINITY), _CMP_EQ_OQ); +} + +static inline vmask visminf_vm_vd(vdouble d) { + return (vmask)_mm256_cmp_pd(d, _mm256_set_pd(-INFINITY, -INFINITY, -INFINITY, -INFINITY), _CMP_EQ_OQ); +} + +static inline vmask visnan_vm_vd(vdouble d) { + return (vmask)_mm256_cmp_pd(d, d, _CMP_NEQ_UQ); +} + +static inline vdouble visinf(vdouble d) { + return _mm256_and_pd((vdouble)visinf_vm_vd(d), vsign_vd_vd(d)); +} + +static inline vdouble visinf2(vdouble d, vdouble m) { + return _mm256_and_pd((vdouble)visinf_vm_vd(d), _mm256_or_pd((vdouble)vsignbit_vm_vd(d), m)); +} + +static inline vdouble vpow2i_vd_vi(vint q) { + vint r; + vdouble y; + q = _mm_add_epi32(_mm_set_epi32(0x3ff, 0x3ff, 0x3ff, 0x3ff), q); + q = _mm_slli_epi32(q, 20); + r = (__m128i)_mm_shuffle_ps((__m128)q, (__m128)q, _MM_SHUFFLE(1,0,0,0)); + y = _mm256_castpd128_pd256((__m128d)r); + r = (__m128i)_mm_shuffle_ps((__m128)q, (__m128)q, _MM_SHUFFLE(3,2,2,2)); + y = _mm256_insertf128_pd(y, (__m128d)r, 1); + y = _mm256_and_pd(y, (__m256d)_mm256_set_epi32(0xfff00000, 0, 0xfff00000, 0, 0xfff00000, 0, 0xfff00000, 0)); + return y; +} + +static inline vdouble vldexp_vd_vd_vi(vdouble x, vint q) { + vint m = _mm_srai_epi32(q, 31); + m = _mm_slli_epi32(_mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(m, q), 9), m), 7); + q = _mm_sub_epi32(q, _mm_slli_epi32(m, 2)); + m = _mm_add_epi32(_mm_set_epi32(0x3ff, 0x3ff, 0x3ff, 0x3ff), m); + m = _mm_andnot_si128(_mm_cmplt_epi32(m, _mm_set_epi32(0, 0, 0, 0)), m); + vint n = _mm_cmpgt_epi32(m, _mm_set_epi32(0x7ff, 0x7ff, 0x7ff, 0x7ff)); + m = _mm_or_si128(_mm_andnot_si128(n, m), _mm_and_si128(n, _mm_set_epi32(0x7ff, 0x7ff, 0x7ff, 0x7ff))); + m = _mm_slli_epi32(m, 20); + vint r = (__m128i)_mm_shuffle_ps((__m128)m, (__m128)m, _MM_SHUFFLE(1,0,0,0)); + vdouble y = _mm256_castpd128_pd256((__m128d)r); + r = (__m128i)_mm_shuffle_ps((__m128)m, (__m128)m, _MM_SHUFFLE(3,2,2,2)); + y = _mm256_insertf128_pd(y, (__m128d)r, 1); + y = _mm256_and_pd(y, (__m256d)_mm256_set_epi32(0xfff00000, 0, 0xfff00000, 0, 0xfff00000, 0, 0xfff00000, 0)); + return vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(x, y), y), y), y), vpow2i_vd_vi(q)); +} + +static inline vint vilogbp1_vi_vd(vdouble d) { + vint q, r, c; + vmask m = vlt_vm_vd_vd(d, vcast_vd_d(4.9090934652977266E-91)); + d = vsel_vd_vm_vd_vd(m, vmul_vd_vd_vd(vcast_vd_d(2.037035976334486E90), d), d); + c = _mm256_cvtpd_epi32(vsel_vd_vm_vd_vd(m, vcast_vd_d(300+0x3fe), 
vcast_vd_d(0x3fe))); + q = (__m128i)_mm256_castpd256_pd128(d); + q = (__m128i)_mm_shuffle_ps((__m128)q, _mm_set_ps(0, 0, 0, 0), _MM_SHUFFLE(0,0,3,1)); + r = (__m128i)_mm256_extractf128_pd(d, 1); + r = (__m128i)_mm_shuffle_ps(_mm_set_ps(0, 0, 0, 0), (__m128)r, _MM_SHUFFLE(3,1,0,0)); + q = _mm_or_si128(q, r); + q = _mm_srli_epi32(q, 20); + q = _mm_sub_epi32(q, c); + return q; +} + +static inline vdouble vupper_vd_vd(vdouble d) { + return (__m256d)_mm256_and_pd(d, (vdouble)_mm256_set_epi32(0xffffffff, 0xf8000000, 0xffffffff, 0xf8000000, 0xffffffff, 0xf8000000, 0xffffffff, 0xf8000000)); +} + +static inline vfloat vupper_vf_vf(vfloat d) { + return (vfloat)vand_vm_vm_vm((vmask)d, _mm256_set_epi32(0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000,0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000)); +} Index: sleef/lib/isa.h =================================================================== --- /dev/null +++ sleef/lib/isa.h @@ -0,0 +1,20 @@ +#if defined(BUILD_SSE2) +#include "sse2.h" +#define _SLEEF_N(f) __ ## f ## __sse2 +#elif defined(BUILD_AVX) +#ifdef __FMA4__ +#include "fma4.h" +#else +#include "avx.h" +#endif +#define _SLEEF_N(f) __ ## f ## __avx +#elif defined(BUILD_AVX2) +#include "avx2.h" +#define _SLEEF_N(f) __ ## f ## __avx2 +#elif defined(BUILD_NEON) +#include "neon.h" +#define _SLEEF_N(f) __ ## f ## __neon +#else +#error "No BUILD_ defined" +#endif + Index: sleef/lib/neon.h =================================================================== --- /dev/null +++ sleef/lib/neon.h @@ -0,0 +1,167 @@ +/*===---------- neon.h - NEON functions ------------------------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __ARM_NEON__ +#error Please specify -mfpu=neon. 
+#endif + +#include <arm_neon.h> + +typedef int32x4_t vint; +typedef uint32x4_t vmask; + +typedef float32x4_t vfloat; +typedef int32x4_t vint2; + +// + +static inline vint vcast_vi_i(int i) { return vdupq_n_s32(i); } + +static inline vmask vreinterpret_vm_vf(vfloat vf) { return (vmask)vf; } +static inline vfloat vreinterpret_vf_vm(vmask vm) { return (vfloat)vm; } + +// + +static inline vint vadd_vi_vi_vi(vint x, vint y) { return vaddq_s32(x, y); } +static inline vint vsub_vi_vi_vi(vint x, vint y) { return vsubq_s32(x, y); } +static inline vint vneg_vi_vi(vint e) { return vnegq_s32(e); } + +static inline vint vand_vi_vi_vi(vint x, vint y) { return vandq_s32(x, y); } +static inline vint vandnot_vi_vi_vi(vint x, vint y) { return vbicq_s32(y, x); } +static inline vint vor_vi_vi_vi(vint x, vint y) { return vorrq_s32(x, y); } +static inline vint vxor_vi_vi_vi(vint x, vint y) { return veorq_s32(x, y); } + +static inline vint vsll_vi_vi_i(vint x, int c) { return (int32x4_t) vshlq_n_u32((uint32x4_t)x, c); } +static inline vint vsrl_vi_vi_i(vint x, int c) { return (int32x4_t) vshrq_n_u32((uint32x4_t)x, c); } +static inline vint vsra_vi_vi_i(vint x, int c) { return vshrq_n_s32(x, c); } + +// + +static inline vmask vand_vm_vm_vm(vmask x, vmask y) { return vandq_u32(x, y); } +static inline vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vbicq_u32(y, x); } +static inline vmask vor_vm_vm_vm(vmask x, vmask y) { return vorrq_u32(x, y); } +static inline vmask vxor_vm_vm_vm(vmask x, vmask y) { return veorq_u32(x, y); } + +static inline vmask veq_vm_vf_vf(vfloat x, vfloat y) { return vceqq_f32(x, y); } +static inline vmask vneq_vm_vf_vf(vfloat x, vfloat y) { return vmvnq_u32(vceqq_f32(x, y)); } +static inline vmask vlt_vm_vf_vf(vfloat x, vfloat y) { return vcltq_f32(x, y); } +static inline vmask vle_vm_vf_vf(vfloat x, vfloat y) { return vcleq_f32(x, y); } +static inline vmask vgt_vm_vf_vf(vfloat x, vfloat y) { return vcgtq_f32(x, y); } +static inline vmask vge_vm_vf_vf(vfloat x, vfloat y) { return vcgeq_f32(x, y); } + +// + +static inline vfloat vcast_vf_f(float f) { return vdupq_n_f32(f); } + +static inline vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return vaddq_f32(x, y); } +static inline vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return vsubq_f32(x, y); } +static inline vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return vmulq_f32(x, y); } +static inline vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmlaq_f32(z, x, y); } +static inline vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmlsq_f32(z, x, y); } + +static inline vfloat vabs_vf_vf(vfloat f) { return vabsq_f32(f); } +static inline vfloat vneg_vf_vf(vfloat f) { return vnegq_f32(f); } +static inline vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return vmaxq_f32(x, y); } +static inline vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return vminq_f32(x, y); } + +static inline vfloat vsel_vf_vm_vf_vf(vmask mask, vfloat x, vfloat y) { + return (vfloat)vbslq_u32(mask, (vmask)x, (vmask)y); +} + +static inline vfloat vrec_vf_vf(vfloat d) { + float32x4_t x = vrecpeq_f32(d); + x = vmulq_f32(x, vrecpsq_f32(d, x)); + return vmlsq_f32(vaddq_f32(x, x), vmulq_f32(x, x), d); +} + +static inline vfloat vdiv_vf_vf_vf(vfloat n, vfloat d) { + float32x4_t x = vrecpeq_f32(d); + x = vmulq_f32(x, vrecpsq_f32(d, x)); + float32x4_t t = vmulq_f32(n, x); + return vmlsq_f32(vaddq_f32(t, t), vmulq_f32(t, x), d); +} + +static inline vfloat vsqrt_vf_vf(vfloat d) { + float32x4_t x = vrsqrteq_f32(d); + x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x))); + float32x4_t u = 
vmulq_f32(x, d); + u = vmlaq_f32(u, vmlsq_f32(d, u, u), vmulq_f32(x, vdupq_n_f32(0.5))); + return (float32x4_t)vbicq_u32((uint32x4_t)u, vceqq_f32(d, vdupq_n_f32(0.0f))); +} + +static inline vfloat vrecsqrt_vf_vf(vfloat d) { + float32x4_t x = vrsqrteq_f32(d); + x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x))); + return vmlaq_f32(x, vmlsq_f32(vdupq_n_f32(1), x, vmulq_f32(x, d)), vmulq_f32(x, vdupq_n_f32(0.5))); +} + +#define ENABLE_RECSQRT_SP + +// + +static inline vmask veq_vm_vi_vi(vint x, vint y) { return vceqq_s32(x, y); } + +// + +static inline vint2 vcast_vi2_vm(vmask vm) { return (vint2)vm; } +static inline vmask vcast_vm_vi2(vint2 vi) { return (vmask)vi; } + +static inline vint2 vtruncate_vi2_vf(vfloat vf) { return vcvtq_s32_f32(vf); } + +static inline vint2 vrint_vi2_vf(vfloat d) { + //return vcvtq_s32_f32(vrndqn_f32(d)); + return vcvtq_s32_f32(vaddq_f32(d, (float32x4_t)vorrq_u32(vandq_u32((uint32x4_t)d, (uint32x4_t)vdupq_n_f32(-0.0f)), (uint32x4_t)vdupq_n_f32(0.5f)))); +} + +static inline vfloat vcast_vf_vi2(vint2 vi) { return vcvtq_f32_s32(vi); } +static inline vint2 vcast_vi2_i(int i) { return vdupq_n_s32(i); } + +static inline vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return vadd_vi_vi_vi(x, y); } +static inline vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return vsub_vi_vi_vi(x, y); } +static inline vint vneg_vi2_vi2(vint2 e) { return vneg_vi_vi(e); } + +static inline vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return vand_vi_vi_vi(x, y); } +static inline vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return vandnot_vi_vi_vi(x, y); } +static inline vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return vor_vi_vi_vi(x, y); } +static inline vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return vxor_vi_vi_vi(x, y); } + +static inline vint2 vsll_vi2_vi2_i(vint2 x, int c) { return vsll_vi_vi_i(x, c); } +static inline vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return vsrl_vi_vi_i(x, c); } +static inline vint2 vsra_vi2_vi2_i(vint2 x, int c) { return vsra_vi_vi_i(x, c); } + +static inline vmask veq_vm_vi2_vi2(vint2 x, vint2 y) { return vceqq_s32(x, y); } +static inline vmask vgt_vm_vi2_vi2(vint2 x, vint2 y) { return vcgeq_s32(x, y); } +static inline vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return (vint2)vcgeq_s32(x, y); } +static inline vint2 vsel_vi2_vm_vi2_vi2(vmask m, vint2 x, vint2 y) { return (vint2)vbslq_u32(m, (vmask)x, (vmask)y); } + +// + +static inline float vcast_f_vf(vfloat v) { + float p[4]; + vst1q_f32 (p, v); + return p[0]; +} + +static inline vfloat vupper_vf_vf(vfloat d) { + return (vfloat)vandq_s32((vint)d, vdupq_n_s32(0xfffff000)); +} Index: sleef/lib/nonnumber.h =================================================================== --- /dev/null +++ sleef/lib/nonnumber.h @@ -0,0 +1,42 @@ +/*===---------- nonnumber.h - inf/nan constants -----------------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ + +#if defined (__GNUC__) || defined (__INTEL_COMPILER) || defined (__clang__) +#ifdef INFINITY +#undef INFINITY +#endif + +#ifdef NAN +#undef NAN +#endif + +#define NAN __builtin_nan("") +#define NANf __builtin_nanf("") +#define INFINITY __builtin_inf() +#define INFINITYf __builtin_inff() +#else + +#include +#include + +#endif Index: sleef/lib/sp-avx.cpp =================================================================== --- /dev/null +++ sleef/lib/sp-avx.cpp @@ -0,0 +1,3 @@ +#define BUILD_AVX +#include "sp.cpp" + Index: sleef/lib/sp-avx2.cpp =================================================================== --- /dev/null +++ sleef/lib/sp-avx2.cpp @@ -0,0 +1,3 @@ +#define BUILD_AVX2 +#include "sp.cpp" + Index: sleef/lib/sp-neon.cpp =================================================================== --- /dev/null +++ sleef/lib/sp-neon.cpp @@ -0,0 +1,3 @@ +#define BUILD_NEON +#include "sp.cpp" + Index: sleef/lib/sp-scalar.cpp =================================================================== --- /dev/null +++ sleef/lib/sp-scalar.cpp @@ -0,0 +1,1111 @@ +#include +#include +#include +#include + +#include "nonnumber.h" +#define _SLEEF_N(f) __ ## f + +#define PI4_Af 0.78515625f +#define PI4_Bf 0.00024187564849853515625f +#define PI4_Cf 3.7747668102383613586e-08f +#define PI4_Df 1.2816720341285448015e-12f + +#define L2Uf 0.693145751953125f +#define L2Lf 1.428606765330187045e-06f + +#define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f +#define M_PIf ((float)M_PI) + +static inline int32_t floatToRawIntBits(float d) { + union { + float f; + int32_t i; + } tmp; + tmp.f = d; + return tmp.i; +} + +static inline float intBitsToFloat(int32_t i) { + union { + float f; + int32_t i; + } tmp; + tmp.i = i; + return tmp.f; +} + +static inline float xfabsf(float x) { + return intBitsToFloat(0x7fffffffL & floatToRawIntBits(x)); +} + +static inline float mulsignf(float x, float y) { + return intBitsToFloat(floatToRawIntBits(x) ^ (floatToRawIntBits(y) & (1 << 31))); +} + +static inline float signf(float d) { return mulsignf(1, d); } +static inline float mlaf(float x, float y, float z) { return x * y + z; } +static inline float xrintf(float x) { return x < 0 ? (int)(x - 0.5f) : (int)(x + 0.5f); } + +static inline int xisnanf(float x) { return x != x; } +static inline int xisinff(float x) { return x == INFINITYf || x == -INFINITYf; } +static inline int xisminff(float x) { return x == -INFINITYf; } +static inline int xispinff(float x) { return x == INFINITYf; } + +static inline int ilogbp1f(float d) { + int m = d < 5.421010862427522E-20f; + d = m ? 1.8446744073709552E19f * d : d; + int q = (floatToRawIntBits(d) >> 23) & 0xff; + q = m ? 
q - (64 + 0x7e) : q - 0x7e; + return q; +} + +static inline float pow2if(int q) { + return intBitsToFloat(((int32_t)(q + 0x7f)) << 23); +} + +static inline float ldexpkf(float x, int q) { + float u; + int m; + m = q >> 31; + m = (((m + q) >> 6) - m) << 4; + q = q - (m << 2); + m += 127; + m = m < 0 ? 0 : m; + m = m > 255 ? 255 : m; + u = intBitsToFloat(((int32_t)m) << 23); + x = x * u * u * u * u; + u = intBitsToFloat(((int32_t)(q + 0x7f)) << 23); + return x * u; +} + +// + +typedef struct { + float x, y; +} float2; + +extern "C" { +float _SLEEF_N(xldexpf)(float x, int q); +float _SLEEF_N(xsinf)(float d); +float _SLEEF_N(xsinf_u1)(float d); +float _SLEEF_N(xcosf)(float d); +float _SLEEF_N(xcosf_u1)(float d); +float2 _SLEEF_N(xsincosf)(float d); +float2 _SLEEF_N(xsincosf_u1)(float d); +float _SLEEF_N(xtanf)(float d); +float _SLEEF_N(xtanf_u1)(float d); +float _SLEEF_N(xatanf)(float s); +float _SLEEF_N(xatan2f)(float y, float x); +float _SLEEF_N(xasinf)(float d); +float _SLEEF_N(xacosf)(float d); +float _SLEEF_N(xatan2f_u1)(float y, float x); +float _SLEEF_N(xasinf_u1)(float d); +float _SLEEF_N(xacosf_u1)(float d); +float _SLEEF_N(xatanf_u1)(float d); +float _SLEEF_N(xlogf)(float d); +float _SLEEF_N(xexpf)(float d); +float _SLEEF_N(xlogf_u1)(float d); +float _SLEEF_N(xpowf)(float x, float y); +float _SLEEF_N(xsinhf)(float x); +float _SLEEF_N(xcoshf)(float x); +float _SLEEF_N(xtanhf)(float x); +float _SLEEF_N(xasinhf)(float x); +float _SLEEF_N(xacoshf)(float x); +float _SLEEF_N(xatanhf)(float x); +float _SLEEF_N(xexp2f)(float a); +float _SLEEF_N(xexp10f)(float a); +float _SLEEF_N(xexpm1f)(float a); +float _SLEEF_N(xlog10f)(float a); +float _SLEEF_N(xlog1pf)(float a); +float _SLEEF_N(xsqrtf)(float f); +float _SLEEF_N(xcbrtf)(float d); +float _SLEEF_N(xcbrtf_u1)(float d); +} // extern "C" + +float _SLEEF_N(xldexpf)(float x, int q) { return ldexpkf(x, q); } + +#ifndef NDEBUG +static int checkfp(float x) { + if (xisinff(x) || xisnanf(x)) return 1; + return 0; +} +#endif + +static inline float upperf(float d) { + return intBitsToFloat(floatToRawIntBits(d) & 0xfffff000); +} + +static inline float2 df(float h, float l) { + float2 ret; + ret.x = h; ret.y = l; + return ret; +} + +static inline float2 dfnormalize_f2_f2(float2 t) { + float2 s; + + s.x = t.x + t.y; + s.y = t.x - s.x + t.y; + + return s; +} + +static inline float2 dfscale_f2_f2_f(float2 d, float s) { + float2 r; + + r.x = d.x * s; + r.y = d.y * s; + + return r; +} + +static inline float2 dfneg_f2_f2(float2 d) { + float2 r; + + r.x = -d.x; + r.y = -d.y; + + return r; +} + +static inline float2 dfadd_f2_f_f(float x, float y) { + // |x| >= |y| + + float2 r; + + r.x = x + y; + r.y = x - r.x + y; + + return r; +} + +static inline float2 dfadd2_f2_f_f(float x, float y) { + float2 r; + + r.x = x + y; + float v = r.x - x; + r.y = (x - (r.x - v)) + (y - v); + + return r; +} + +static inline float2 dfadd_f2_f2_f(float2 x, float y) { + // |x| >= |y| + + float2 r; + + r.x = x.x + y; + r.y = x.x - r.x + y + x.y; + + return r; +} + +static inline float2 dfadd2_f2_f2_f(float2 x, float y) { + // |x| >= |y| + + float2 r; + + r.x = x.x + y; + float v = r.x - x.x; + r.y = (x.x - (r.x - v)) + (y - v); + r.y += x.y; + + return r; +} + +static inline float2 dfadd_f2_f_f2(float x, float2 y) { + // |x| >= |y| + + float2 r; + + r.x = x + y.x; + r.y = x - r.x + y.x + y.y; + + return r; +} + +static inline float2 dfadd_f2_f2_f2(float2 x, float2 y) { + // |x| >= |y| + + float2 r; + + r.x = x.x + y.x; + r.y = x.x - r.x + y.x + x.y + y.y; + + return r; +} + +static 
inline float2 dfadd2_f2_f2_f2(float2 x, float2 y) { + float2 r; + + r.x = x.x + y.x; + float v = r.x - x.x; + r.y = (x.x - (r.x - v)) + (y.x - v); + r.y += x.y + y.y; + + return r; +} + +static inline float2 dfsub_f2_f2_f2(float2 x, float2 y) { + // |x| >= |y| + + float2 r; + + r.x = x.x - y.x; + r.y = x.x - r.x - y.x + x.y - y.y; + + return r; +} + +static inline float2 dfdiv_f2_f2_f2(float2 n, float2 d) { + float t = 1.0f / d.x; + float dh = upperf(d.x), dl = d.x - dh; + float th = upperf(t ), tl = t - th; + float nhh = upperf(n.x), nhl = n.x - nhh; + + float2 q; + + q.x = n.x * t; + + float u = -q.x + nhh * th + nhh * tl + nhl * th + nhl * tl + + q.x * (1 - dh * th - dh * tl - dl * th - dl * tl); + + q.y = t * (n.y - q.x * d.y) + u; + + return q; +} + +static inline float2 dfmul_f2_f_f(float x, float y) { + float xh = upperf(x), xl = x - xh; + float yh = upperf(y), yl = y - yh; + float2 r; + + r.x = x * y; + r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl; + + return r; +} + +static inline float2 dfmul_f2_f2_f(float2 x, float y) { + float xh = upperf(x.x), xl = x.x - xh; + float yh = upperf(y ), yl = y - yh; + float2 r; + + r.x = x.x * y; + r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl + x.y * y; + + return r; +} + +static inline float2 dfmul_f2_f2_f2(float2 x, float2 y) { + float xh = upperf(x.x), xl = x.x - xh; + float yh = upperf(y.x), yl = y.x - yh; + float2 r; + + r.x = x.x * y.x; + r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl + x.x * y.y + x.y * y.x; + + return r; +} + +static inline float2 dfsqu_f2_f2(float2 x) { + float xh = upperf(x.x), xl = x.x - xh; + float2 r; + + r.x = x.x * x.x; + r.y = xh * xh - r.x + (xh + xh) * xl + xl * xl + x.x * (x.y + x.y); + + return r; +} + +static inline float2 dfrec_f2_f(float d) { + float t = 1.0f / d; + float dh = upperf(d), dl = d - dh; + float th = upperf(t), tl = t - th; + float2 q; + + q.x = t; + q.y = t * (1 - dh * th - dh * tl - dl * th - dl * tl); + + return q; +} + +static inline float2 dfrec_f2_f2(float2 d) { + float t = 1.0f / d.x; + float dh = upperf(d.x), dl = d.x - dh; + float th = upperf(t ), tl = t - th; + float2 q; + + q.x = t; + q.y = t * (1 - dh * th - dh * tl - dl * th - dl * tl - d.y * t); + + return q; +} + +static inline float2 dfsqrt_f2_f2(float2 d) { + float t = sqrtf(d.x + d.y); + return dfscale_f2_f2_f(dfmul_f2_f2_f2(dfadd2_f2_f2_f2(d, dfmul_f2_f_f(t, t)), dfrec_f2_f(t)), 0.5f); +} + +// + +float _SLEEF_N(xsinf)(float d) { + int q; + float u, s; + + q = (int)xrintf(d * (float)M_1_PI); + + d = mlaf(q, -PI4_Af*4, d); + d = mlaf(q, -PI4_Bf*4, d); + d = mlaf(q, -PI4_Cf*4, d); + d = mlaf(q, -PI4_Df*4, d); + + s = d * d; + + if ((q & 1) != 0) d = -d; + + u = 2.6083159809786593541503e-06f; + u = mlaf(u, s, -0.0001981069071916863322258f); + u = mlaf(u, s, 0.00833307858556509017944336f); + u = mlaf(u, s, -0.166666597127914428710938f); + + u = mlaf(s, u * d, d); + + if (xisinff(d)) u = NANf; + + return u; +} + +float _SLEEF_N(xsinf_u1)(float d) { + int q; + float u; + float2 s, t, x; + + q = (int)xrintf(d * (float)M_1_PI); + + s = dfadd2_f2_f_f(d, q * (-PI4_Af*4)); + s = dfadd2_f2_f2_f(s, q * (-PI4_Bf*4)); + s = dfadd2_f2_f2_f(s, q * (-PI4_Cf*4)); + s = dfadd2_f2_f2_f(s, q * (-PI4_Df*4)); + + t = s; + s = dfsqu_f2_f2(s); + + u = 2.6083159809786593541503e-06f; + u = mlaf(u, s.x, -0.0001981069071916863322258f); + u = mlaf(u, s.x, 0.00833307858556509017944336f); + + x = dfadd_f2_f_f2(1, dfmul_f2_f2_f2(dfadd_f2_f_f(-0.166666597127914428710938f, u * s.x), s)); + + x = dfmul_f2_f2_f2(t, x); + u = x.x + x.y; + + if ((q & 1) 
!= 0) u = -u; + + return u; +} + +float _SLEEF_N(xcosf)(float d) { + int q; + float u, s; + + q = 1 + 2*(int)xrintf(d * (float)M_1_PI - 0.5f); + + d = mlaf(q, -PI4_Af*2, d); + d = mlaf(q, -PI4_Bf*2, d); + d = mlaf(q, -PI4_Cf*2, d); + d = mlaf(q, -PI4_Df*2, d); + + s = d * d; + + if ((q & 2) == 0) d = -d; + + u = 2.6083159809786593541503e-06f; + u = mlaf(u, s, -0.0001981069071916863322258f); + u = mlaf(u, s, 0.00833307858556509017944336f); + u = mlaf(u, s, -0.166666597127914428710938f); + + u = mlaf(s, u * d, d); + + if (xisinff(d)) u = NANf; + + return u; +} + +float _SLEEF_N(xcosf_u1)(float d) { + float u, q; + float2 s, t, x; + + d = fabsf(d); + + q = 1 + 2*(int)xrintf(d * (float)M_1_PI - 0.5f); + + s = dfadd2_f2_f_f(d, q * (-PI4_Af*2)); + s = dfadd2_f2_f2_f(s, q * (-PI4_Bf*2)); + s = dfadd2_f2_f2_f(s, q * (-PI4_Cf*2)); + s = dfadd2_f2_f2_f(s, q * (-PI4_Df*2)); + + t = s; + s = dfsqu_f2_f2(s); + + u = 2.6083159809786593541503e-06f; + u = mlaf(u, s.x, -0.0001981069071916863322258f); + u = mlaf(u, s.x, 0.00833307858556509017944336f); + + x = dfadd_f2_f_f2(1, dfmul_f2_f2_f2(dfadd_f2_f_f(-0.166666597127914428710938f, u * s.x), s)); + + x = dfmul_f2_f2_f2(t, x); + u = x.x + x.y; + + if ((((int)q) & 2) == 0) u = -u; + + return u; +} + +float2 _SLEEF_N(xsincosf)(float d) { + int q; + float u, s, t; + float2 r; + + q = (int)xrintf(d * ((float)(2 * M_1_PI))); + + s = d; + + s = mlaf(q, -PI4_Af*2, s); + s = mlaf(q, -PI4_Bf*2, s); + s = mlaf(q, -PI4_Cf*2, s); + s = mlaf(q, -PI4_Df*2, s); + + t = s; + + s = s * s; + + u = -0.000195169282960705459117889f; + u = mlaf(u, s, 0.00833215750753879547119141f); + u = mlaf(u, s, -0.166666537523269653320312f); + u = u * s * t; + + r.x = t + u; + + u = -2.71811842367242206819355e-07f; + u = mlaf(u, s, 2.47990446951007470488548e-05f); + u = mlaf(u, s, -0.00138888787478208541870117f); + u = mlaf(u, s, 0.0416666641831398010253906f); + u = mlaf(u, s, -0.5f); + + r.y = u * s + 1; + + if ((q & 1) != 0) { s = r.y; r.y = r.x; r.x = s; } + if ((q & 2) != 0) { r.x = -r.x; } + if (((q+1) & 2) != 0) { r.y = -r.y; } + + if (xisinff(d)) { r.x = r.y = NANf; } + + return r; +} + +float2 _SLEEF_N(xsincosf_u1)(float d) { + int q; + float u; + float2 r, s, t, x; + + q = (int)xrintf(d * (float)(2 * M_1_PI)); + + s = dfadd2_f2_f_f(d, q * (-PI4_Af*2)); + s = dfadd2_f2_f2_f(s, q * (-PI4_Bf*2)); + s = dfadd2_f2_f2_f(s, q * (-PI4_Cf*2)); + s = dfadd2_f2_f2_f(s, q * (-PI4_Df*2)); + + t = s; + s = dfsqu_f2_f2(s); + s.x = s.x + s.y; + + u = -0.000195169282960705459117889f; + u = mlaf(u, s.x, 0.00833215750753879547119141f); + u = mlaf(u, s.x, -0.166666537523269653320312f); + + u *= s.x * t.x; + + x = dfadd_f2_f2_f(t, u); + r.x = x.x + x.y; + + u = -2.71811842367242206819355e-07f; + u = mlaf(u, s.x, 2.47990446951007470488548e-05f); + u = mlaf(u, s.x, -0.00138888787478208541870117f); + u = mlaf(u, s.x, 0.0416666641831398010253906f); + u = mlaf(u, s.x, -0.5f); + + x = dfadd_f2_f_f2(1, dfmul_f2_f_f(s.x, u)); + r.y = x.x + x.y; + + if ((q & 1) != 0) { u = r.y; r.y = r.x; r.x = u; } + if ((q & 2) != 0) { r.x = -r.x; } + if (((q+1) & 2) != 0) { r.y = -r.y; } + + if (xisinff(d)) { r.x = r.y = NAN; } + + return r; +} + +float _SLEEF_N(xtanf)(float d) { + int q; + float u, s, x; + + q = (int)xrintf(d * (float)(2 * M_1_PI)); + + x = d; + + x = mlaf(q, -PI4_Af*2, x); + x = mlaf(q, -PI4_Bf*2, x); + x = mlaf(q, -PI4_Cf*2, x); + x = mlaf(q, -PI4_Df*2, x); + + s = x * x; + + if ((q & 1) != 0) x = -x; + + u = 0.00927245803177356719970703f; + u = mlaf(u, s, 0.00331984995864331722259521f); + u = mlaf(u, s, 
0.0242998078465461730957031f); + u = mlaf(u, s, 0.0534495301544666290283203f); + u = mlaf(u, s, 0.133383005857467651367188f); + u = mlaf(u, s, 0.333331853151321411132812f); + + u = mlaf(s, u * x, x); + + if ((q & 1) != 0) u = 1.0f / u; + + if (xisinff(d)) u = NANf; + + return u; +} + +float _SLEEF_N(xtanf_u1)(float d) { + int q; + float u; + float2 s, t, x; + + q = (int)xrintf(d * (float)(2 * M_1_PI)); + + s = dfadd2_f2_f_f(d, q * (-PI4_Af*2)); + s = dfadd2_f2_f2_f(s, q * (-PI4_Bf*2)); + s = dfadd2_f2_f2_f(s, q * (-PI4_Cf*2)); + s = dfadd2_f2_f2_f(s, q * (-PI4_Df*2)); + + if ((q & 1) != 0) s = dfneg_f2_f2(s); + + t = s; + s = dfsqu_f2_f2(s); + s = dfnormalize_f2_f2(s); + + u = 0.00446636462584137916564941f; + u = mlaf(u, s.x, -8.3920182078145444393158e-05f); + u = mlaf(u, s.x, 0.0109639242291450500488281f); + u = mlaf(u, s.x, 0.0212360303848981857299805f); + u = mlaf(u, s.x, 0.0540687143802642822265625f); + + x = dfadd_f2_f_f(0.133325666189193725585938f, u * s.x); + x = dfadd_f2_f_f2(1, dfmul_f2_f2_f2(dfadd_f2_f_f2(0.33333361148834228515625f, dfmul_f2_f2_f2(s, x)), s)); + x = dfmul_f2_f2_f2(t, x); + + if ((q & 1) != 0) x = dfrec_f2_f2(x); + + u = x.x + x.y; + + return u; +} + +float _SLEEF_N(xatanf)(float s) { + float t, u; + int q = 0; + + if (s < 0) { s = -s; q = 2; } + if (s > 1) { s = 1.0f / s; q |= 1; } + + t = s * s; + + u = 0.00282363896258175373077393f; + u = mlaf(u, t, -0.0159569028764963150024414f); + u = mlaf(u, t, 0.0425049886107444763183594f); + u = mlaf(u, t, -0.0748900920152664184570312f); + u = mlaf(u, t, 0.106347933411598205566406f); + u = mlaf(u, t, -0.142027363181114196777344f); + u = mlaf(u, t, 0.199926957488059997558594f); + u = mlaf(u, t, -0.333331018686294555664062f); + + t = s + s * (t * u); + + if ((q & 1) != 0) t = 1.570796326794896557998982f - t; + if ((q & 2) != 0) t = -t; + + return t; +} + +static inline float atan2kf(float y, float x) { + float s, t, u; + int q = 0; + + if (x < 0) { x = -x; q = -2; } + if (y > x) { t = x; x = y; y = -t; q += 1; } + + s = y / x; + t = s * s; + + u = 0.00282363896258175373077393f; + u = mlaf(u, t, -0.0159569028764963150024414f); + u = mlaf(u, t, 0.0425049886107444763183594f); + u = mlaf(u, t, -0.0748900920152664184570312f); + u = mlaf(u, t, 0.106347933411598205566406f); + u = mlaf(u, t, -0.142027363181114196777344f); + u = mlaf(u, t, 0.199926957488059997558594f); + u = mlaf(u, t, -0.333331018686294555664062f); + + t = u * t * s + s; + t = q * (float)(M_PI/2) + t; + + return t; +} + +float _SLEEF_N(xatan2f)(float y, float x) { + float r = atan2kf(xfabsf(y), x); + + r = mulsignf(r, x); + if (xisinff(x) || x == 0) r = M_PIf/2 - (xisinff(x) ? (signf(x) * (float)(M_PI /2)) : 0); + if (xisinff(y) ) r = M_PIf/2 - (xisinff(x) ? (signf(x) * (float)(M_PI*1/4)) : 0); + if ( y == 0) r = (signf(x) == -1 ? M_PIf : 0); + + return xisnanf(x) || xisnanf(y) ? NANf : mulsignf(r, y); +} + +float _SLEEF_N(xasinf)(float d) { + return mulsignf(atan2kf(fabsf(d), sqrtf((1.0f+d)*(1.0f-d))), d); +} + +float _SLEEF_N(xacosf)(float d) { + return mulsignf(atan2kf(sqrtf((1.0f+d)*(1.0f-d)), fabsf(d)), d) + (d < 0 ? 
(float)M_PI : 0.0f); +} + +static float2 atan2kf_u1(float2 y, float2 x) { + float u; + float2 s, t; + int q = 0; + + if (x.x < 0) { x.x = -x.x; x.y = -x.y; q = -2; } + if (y.x > x.x) { t = x; x = y; y.x = -t.x; y.y = -t.y; q += 1; } + + s = dfdiv_f2_f2_f2(y, x); + t = dfsqu_f2_f2(s); + t = dfnormalize_f2_f2(t); + + u = -0.00176397908944636583328247f; + u = mlaf(u, t.x, 0.0107900900766253471374512f); + u = mlaf(u, t.x, -0.0309564601629972457885742f); + u = mlaf(u, t.x, 0.0577365085482597351074219f); + u = mlaf(u, t.x, -0.0838950723409652709960938f); + u = mlaf(u, t.x, 0.109463557600975036621094f); + u = mlaf(u, t.x, -0.142626821994781494140625f); + u = mlaf(u, t.x, 0.199983194470405578613281f); + + //u = mlaf(u, t.x, -0.333332866430282592773438f); + //t = dfmul_f2_f2_f(t, u); + + t = dfmul_f2_f2_f2(t, dfadd_f2_f_f(-0.333332866430282592773438f, u * t.x)); + t = dfmul_f2_f2_f2(s, dfadd_f2_f_f2(1, t)); + t = dfadd2_f2_f2_f2(dfmul_f2_f2_f(df(1.5707963705062866211f, -4.3711388286737928865e-08f), q), t); + + return t; +} + +float _SLEEF_N(xatan2f_u1)(float y, float x) { + float2 d = atan2kf_u1(df(xfabsf(y), 0), df(x, 0)); + float r = d.x + d.y; + + r = mulsignf(r, x); + if (xisinff(x) || x == 0) r = (float)M_PI/2 - (xisinff(x) ? (signf(x) * (float)(M_PI /2)) : 0.0f); + if (xisinff(y) ) r = (float)M_PI/2 - (xisinff(x) ? (signf(x) * (float)(M_PI*1/4)) : 0.0f); + if ( y == 0) r = (signf(x) == -1 ? (float)M_PI : 0.0f); + + return xisnanf(x) || xisnanf(y) ? NANf : mulsignf(r, y); +} + +float _SLEEF_N(xasinf_u1)(float d) { + float2 d2 = atan2kf_u1(df(xfabsf(d), 0), dfsqrt_f2_f2(dfmul_f2_f2_f2(dfadd_f2_f_f(1, d), dfadd_f2_f_f(1,-d)))); + float r = d2.x + d2.y; + if (xfabsf(d) == 1) r = 1.570796326794896557998982f; + return mulsignf(r, d); +} + +float _SLEEF_N(xacosf_u1)(float d) { + float2 d2 = atan2kf_u1(dfsqrt_f2_f2(dfmul_f2_f2_f2(dfadd_f2_f_f(1, d), dfadd_f2_f_f(1,-d))), df(xfabsf(d), 0)); + d2 = dfscale_f2_f2_f(d2, mulsignf(1.0f, d)); + if (xfabsf(d) == 1) d2 = df(0.0f, 0.0f); + if (d < 0) d2 = dfadd_f2_f2_f2(df(3.1415927410125732422f,-8.7422776573475857731e-08f), d2); + return d2.x + d2.y; +} + +float _SLEEF_N(xatanf_u1)(float d) { + float2 d2 = atan2kf_u1(df(xfabsf(d), 0.0f), df(1.0f, 0.0f)); + float r = d2.x + d2.y; + if (xisinff(d)) r = 1.570796326794896557998982f; + return mulsignf(r, d); +} + +float _SLEEF_N(xlogf)(float d) { + float x, x2, t, m; + int e; + + e = ilogbp1f(d * 0.7071f); + m = ldexpkf(d, -e); + + x = (m-1.0f) / (m+1.0f); + x2 = x * x; + + t = 0.2371599674224853515625f; + t = mlaf(t, x2, 0.285279005765914916992188f); + t = mlaf(t, x2, 0.400005519390106201171875f); + t = mlaf(t, x2, 0.666666567325592041015625f); + t = mlaf(t, x2, 2.0f); + + x = x * t + 0.693147180559945286226764f * e; + + if (xisinff(d)) x = INFINITYf; + if (d < 0) x = NANf; + if (d == 0) x = -INFINITYf; + + return x; +} + +float _SLEEF_N(xexpf)(float d) { + int q = (int)xrintf(d * R_LN2f); + float s, u; + + s = mlaf(q, -L2Uf, d); + s = mlaf(q, -L2Lf, s); + + u = 0.00136324646882712841033936f; + u = mlaf(u, s, 0.00836596917361021041870117f); + u = mlaf(u, s, 0.0416710823774337768554688f); + u = mlaf(u, s, 0.166665524244308471679688f); + u = mlaf(u, s, 0.499999850988388061523438f); + + u = s * s * u + s + 1.0f; + u = ldexpkf(u, q); + + if (xisminff(d)) u = 0; + + return u; +} + +//#define L2Af 0.693145751953125 +//#define L2Bf 1.4285906217992305756e-06 +//#define L2Cf 1.619850954759360917e-11 + +static inline float expkf(float2 d) { + int q = (int)xrintf((d.x + d.y) * R_LN2f); + float2 s, t; + float u; + + s = 
dfadd2_f2_f2_f(d, q * -L2Uf); + s = dfadd2_f2_f2_f(s, q * -L2Lf); + + //s = dfadd2_f2_f2_f(d, q * -L2Af); + //s = dfadd2_f2_f2_f(s, q * -L2Bf); + //s = dfadd2_f2_f2_f(s, q * -L2Cf); + + s = dfnormalize_f2_f2(s); + + u = 0.00136324646882712841033936f; + u = mlaf(u, s.x, 0.00836596917361021041870117f); + u = mlaf(u, s.x, 0.0416710823774337768554688f); + u = mlaf(u, s.x, 0.166665524244308471679688f); + u = mlaf(u, s.x, 0.499999850988388061523438f); + + t = dfadd_f2_f2_f2(s, dfmul_f2_f2_f(dfsqu_f2_f2(s), u)); + + t = dfadd_f2_f_f2(1, t); + return ldexpkf(t.x + t.y, q); +} + +static inline float2 logkf(float d) { + float2 x, x2; + float m, t; + int e; + + e = ilogbp1f(d * 0.7071f); + m = ldexpkf(d, -e); + + x = dfdiv_f2_f2_f2(dfadd2_f2_f_f(-1, m), dfadd2_f2_f_f(1, m)); + x2 = dfsqu_f2_f2(x); + + t = 0.2371599674224853515625f; + t = mlaf(t, x2.x, 0.285279005765914916992188f); + t = mlaf(t, x2.x, 0.400005519390106201171875f); + t = mlaf(t, x2.x, 0.666666567325592041015625f); + + return dfadd2_f2_f2_f2(dfmul_f2_f2_f(df(0.69314718246459960938f, -1.904654323148236017e-09f), e), + dfadd2_f2_f2_f2(dfscale_f2_f2_f(x, 2), dfmul_f2_f2_f(dfmul_f2_f2_f2(x2, x), t))); +} + +float _SLEEF_N(xlogf_u1)(float d) { + float2 s = logkf(d); + float x = s.x + s.y; + + if (xisinff(d)) x = INFINITYf; + if (d < 0) x = NANf; + if (d == 0) x = -INFINITYf; + + return x; +} + +static inline float2 expk2f(float2 d) { + int q = (int)xrintf((d.x + d.y) * R_LN2f); + float2 s, t; + float u; + + s = dfadd2_f2_f2_f(d, q * -L2Uf); + s = dfadd2_f2_f2_f(s, q * -L2Lf); + + u = 0.00136324646882712841033936f; + u = mlaf(u, s.x, 0.00836596917361021041870117f); + u = mlaf(u, s.x, 0.0416710823774337768554688f); + u = mlaf(u, s.x, 0.166665524244308471679688f); + u = mlaf(u, s.x, 0.499999850988388061523438f); + + t = dfadd_f2_f2_f2(s, dfmul_f2_f2_f(dfsqu_f2_f2(s), u)); + + t = dfadd_f2_f_f2(1, t); + return dfscale_f2_f2_f(t, pow2if(q)); +} + +float _SLEEF_N(xpowf)(float x, float y) { + int yisint = (int)y == y; + int yisodd = (1 & (int)y) != 0 && yisint; + + float result = expkf(dfmul_f2_f2_f(logkf(xfabsf(x)), y)); + + result = xisnanf(result) ? INFINITYf : result; + result *= (x >= 0 ? 1 : (!yisint ? NANf : (yisodd ? -1 : 1))); + + float efx = mulsignf(xfabsf(x) - 1, y); + if (xisinff(y)) result = efx < 0 ? 0.0f : (efx == 0 ? 1.0f : INFINITYf); + if (xisinff(x) || x == 0) result = (yisodd ? signf(x) : 1) * ((x == 0 ? -y : y) < 0 ? 0 : INFINITYf); + if (xisnanf(x) || xisnanf(y)) result = NANf; + if (y == 0 || x == 1) result = 1; + + return result; +} + +float _SLEEF_N(xsinhf)(float x) { + float y = xfabsf(x); + float2 d = expk2f(df(y, 0)); + d = dfsub_f2_f2_f2(d, dfrec_f2_f2(d)); + y = (d.x + d.y) * 0.5f; + + y = xfabsf(x) > 89 ? INFINITY : y; + y = xisnanf(y) ? INFINITYf : y; + y = mulsignf(y, x); + y = xisnanf(x) ? NANf : y; + + return y; +} + +float _SLEEF_N(xcoshf)(float x) { + float y = xfabsf(x); + float2 d = expk2f(df(y, 0)); + d = dfadd_f2_f2_f2(d, dfrec_f2_f2(d)); + y = (d.x + d.y) * 0.5f; + + y = xfabsf(x) > 89 ? INFINITY : y; + y = xisnanf(y) ? INFINITYf : y; + y = xisnanf(x) ? NANf : y; + + return y; +} + +float _SLEEF_N(xtanhf)(float x) { + float y = xfabsf(x); + float2 d = expk2f(df(y, 0)); + float2 e = dfrec_f2_f2(d); + d = dfdiv_f2_f2_f2(dfsub_f2_f2_f2(d, e), dfadd_f2_f2_f2(d, e)); + y = d.x + d.y; + + y = xfabsf(x) > 8.664339742f ? 1.0f : y; + y = xisnanf(y) ? 1.0f : y; + y = mulsignf(y, x); + y = xisnanf(x) ? 
NANf : y; + + return y; +} + +static inline float2 logk2f(float2 d) { + float2 x, x2, m; + float t; + int e; + + e = ilogbp1f(d.x * 0.7071f); + m = dfscale_f2_f2_f(d, pow2if(-e)); + + x = dfdiv_f2_f2_f2(dfadd2_f2_f2_f(m, -1), dfadd2_f2_f2_f(m, 1)); + x2 = dfsqu_f2_f2(x); + + t = 0.2371599674224853515625f; + t = mlaf(t, x2.x, 0.285279005765914916992188f); + t = mlaf(t, x2.x, 0.400005519390106201171875f); + t = mlaf(t, x2.x, 0.666666567325592041015625f); + + return dfadd2_f2_f2_f2(dfmul_f2_f2_f(df(0.69314718246459960938f, -1.904654323148236017e-09f), e), + dfadd2_f2_f2_f2(dfscale_f2_f2_f(x, 2), dfmul_f2_f2_f(dfmul_f2_f2_f2(x2, x), t))); +} + +float _SLEEF_N(xasinhf)(float x) { + float y = xfabsf(x); + float2 d = logk2f(dfadd2_f2_f2_f(dfsqrt_f2_f2(dfadd2_f2_f2_f(dfmul_f2_f_f(y, y), 1)), y)); + y = d.x + d.y; + + y = xisinff(x) || xisnanf(y) ? INFINITYf : y; + y = mulsignf(y, x); + y = xisnanf(x) ? NANf : y; + + return y; +} + +float _SLEEF_N(xacoshf)(float x) { + float2 d = logk2f(dfadd2_f2_f2_f(dfsqrt_f2_f2(dfadd2_f2_f2_f(dfmul_f2_f_f(x, x), -1)), x)); + float y = d.x + d.y; + + y = xisinff(x) || xisnanf(y) ? INFINITYf : y; + y = x == 1.0f ? 0.0f : y; + y = x < 1.0f ? NANf : y; + y = xisnanf(x) ? NANf : y; + + return y; +} + +float _SLEEF_N(xatanhf)(float x) { + float y = xfabsf(x); + float2 d = logk2f(dfdiv_f2_f2_f2(dfadd2_f2_f_f(1, y), dfadd2_f2_f_f(1, -y))); + y = y > 1.0 ? NANf : (y == 1.0 ? INFINITYf : (d.x + d.y) * 0.5f); + + y = xisinff(x) || xisnanf(y) ? NANf : y; + y = mulsignf(y, x); + y = xisnanf(x) ? NANf : y; + + return y; +} + +float _SLEEF_N(xexp2f)(float a) { + float u = expkf(dfmul_f2_f2_f(df(0.69314718246459960938f, -1.904654323148236017e-09f), a)); + if (xispinff(a)) u = INFINITYf; + if (xisminff(a)) u = 0; + return u; +} + +float _SLEEF_N(xexp10f)(float a) { + float u = expkf(dfmul_f2_f2_f(df(2.3025851249694824219f, -3.1975436520781386207e-08f), a)); + if (xispinff(a)) u = INFINITYf; + if (xisminff(a)) u = 0; + return u; +} + +float _SLEEF_N(xexpm1f)(float a) { + float2 d = dfadd2_f2_f2_f(expk2f(df(a, 0)), -1.0f); + float x = d.x + d.y; + if (a > 88.0f) x = INFINITYf; + if (a < -0.15942385152878742116596338793538061065739925620174e+2f) x = -1; + return x; +} + +float _SLEEF_N(xlog10f)(float a) { + float2 d = dfmul_f2_f2_f2(logkf(a), df(0.43429449200630187988f, -1.0103050118726031315e-08f)); + float x = d.x + d.y; + + if (xisinff(a)) x = INFINITYf; + if (a < 0) x = NANf; + if (a == 0) x = -INFINITYf; + + return x; +} + +float _SLEEF_N(xlog1pf)(float a) { + float2 d = logk2f(dfadd2_f2_f_f(a, 1)); + float x = d.x + d.y; + + if (xisinff(a)) x = INFINITYf; + if (a < -1) x = NANf; + if (a == -1) x = -INFINITYf; + + return x; +} + +float _SLEEF_N(xsqrtf)(float f) { return sqrtf(f); } + +float _SLEEF_N(xcbrtf)(float d) { + float x, y, q = 1.0f; + int e, r; + + e = ilogbp1f(d); + d = ldexpkf(d, -e); + r = (e + 6144) % 3; + q = (r == 1) ? 1.2599210498948731647672106f : q; + q = (r == 2) ? 
1.5874010519681994747517056f : q; + q = ldexpkf(q, (e + 6144) / 3 - 2048); + + q = mulsignf(q, d); + d = xfabsf(d); + + x = -0.601564466953277587890625f; + x = mlaf(x, d, 2.8208892345428466796875f); + x = mlaf(x, d, -5.532182216644287109375f); + x = mlaf(x, d, 5.898262500762939453125f); + x = mlaf(x, d, -3.8095417022705078125f); + x = mlaf(x, d, 2.2241256237030029296875f); + + y = d * x * x; + y = (y - (2.0f / 3.0f) * y * (y * x - 1.0f)) * q; + + return y; +} + +float _SLEEF_N(xcbrtf_u1)(float d) { + float x, y, z; + float2 q2 = df(1, 0), u, v; + int e, r; + + e = ilogbp1f(d); + d = ldexpkf(d, -e); + r = (e + 6144) % 3; + q2 = (r == 1) ? df(1.2599210739135742188, -2.4018701694217270415e-08) : q2; + q2 = (r == 2) ? df(1.5874010324478149414, 1.9520385308169352356e-08) : q2; + + q2.x = mulsignf(q2.x, d); q2.y = mulsignf(q2.y, d); + d = xfabsf(d); + + x = -0.601564466953277587890625f; + x = mlaf(x, d, 2.8208892345428466796875f); + x = mlaf(x, d, -5.532182216644287109375f); + x = mlaf(x, d, 5.898262500762939453125f); + x = mlaf(x, d, -3.8095417022705078125f); + x = mlaf(x, d, 2.2241256237030029296875f); + + y = x * x; y = y * y; x -= (d * y - x) * (1.0 / 3.0f); + + z = x; + + u = dfmul_f2_f_f(x, x); + u = dfmul_f2_f2_f2(u, u); + u = dfmul_f2_f2_f(u, d); + u = dfadd2_f2_f2_f(u, -x); + y = u.x + u.y; + + y = -2.0 / 3.0 * y * z; + v = dfadd2_f2_f2_f(dfmul_f2_f_f(z, z), y); + v = dfmul_f2_f2_f(v, d); + v = dfmul_f2_f2_f2(v, q2); + z = ldexpf(v.x + v.y, (e + 6144) / 3 - 2048); + + if (xisinff(d)) { z = mulsignf(INFINITYf, q2.x); } + if (d == 0) { z = mulsignf(0, q2.x); } + + return z; +} Index: sleef/lib/sp-sse2.cpp =================================================================== --- /dev/null +++ sleef/lib/sp-sse2.cpp @@ -0,0 +1,3 @@ +#define BUILD_SSE2 +#include "sp.cpp" + Index: sleef/lib/sp.cpp =================================================================== --- /dev/null +++ sleef/lib/sp.cpp @@ -0,0 +1,1016 @@ +#include +#include + +#include "nonnumber.h" +#include "isa.h" +#include "df.h" + +// + +#define PI4_Af 0.78515625f +#define PI4_Bf 0.00024187564849853515625f +#define PI4_Cf 3.7747668102383613586e-08f +#define PI4_Df 1.2816720341285448015e-12f + +#define L2Uf 0.693145751953125f +#define L2Lf 1.428606765330187045e-06f +#define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f + +// + +extern "C" { +vfloat _SLEEF_N(xldexpf)(vfloat x, vint2 q); +vfloat _SLEEF_N(xsinf)(vfloat d); +vfloat _SLEEF_N(xcosf)(vfloat d); +vfloat2 _SLEEF_N(xsincosf)(vfloat d); +vfloat _SLEEF_N(xtanf)(vfloat d); +vfloat _SLEEF_N(xsinf_u1)(vfloat d); +vfloat _SLEEF_N(xcosf_u1)(vfloat d); +vfloat2 _SLEEF_N(xsincosf_u1)(vfloat d); +vfloat _SLEEF_N(xtanf_u1)(vfloat d); +vfloat _SLEEF_N(xatanf)(vfloat d); +vfloat _SLEEF_N(xatan2f)(vfloat y, vfloat x); +vfloat _SLEEF_N(xasinf)(vfloat d); +vfloat _SLEEF_N(xacosf)(vfloat d); +vfloat _SLEEF_N(xatan2f_u1)(vfloat y, vfloat x); +vfloat _SLEEF_N(xasinf_u1)(vfloat d); +vfloat _SLEEF_N(xacosf_u1)(vfloat d); +vfloat _SLEEF_N(xatanf_u1)(vfloat d); +vfloat _SLEEF_N(xlogf)(vfloat d); +vfloat _SLEEF_N(xexpf)(vfloat d); +vfloat _SLEEF_N(xsqrtf)(vfloat d); +vfloat _SLEEF_N(xsqrtf)(vfloat d); +vfloat _SLEEF_N(xcbrtf)(vfloat d); +vfloat _SLEEF_N(xcbrtf_u1)(vfloat d); +vfloat _SLEEF_N(xlogf_u1)(vfloat d); +vfloat _SLEEF_N(xpowf)(vfloat x, vfloat y); +vfloat _SLEEF_N(xsinhf)(vfloat x); +vfloat _SLEEF_N(xcoshf)(vfloat x); +vfloat _SLEEF_N(xtanhf)(vfloat x); +vfloat _SLEEF_N(xasinhf)(vfloat x); +vfloat _SLEEF_N(xacoshf)(vfloat x); +vfloat 
_SLEEF_N(xatanhf)(vfloat x); +vfloat _SLEEF_N(xexp2f)(vfloat a); +vfloat _SLEEF_N(xexp10f)(vfloat a); +vfloat _SLEEF_N(xexpm1f)(vfloat a); +vfloat _SLEEF_N(xlog10f)(vfloat a); +vfloat _SLEEF_N(xlog1pf)(vfloat a); +} // extern "C" + +static inline vint2 vsel_vi2_vf_vf_vi2_vi2(vfloat f0, vfloat f1, vint2 x, vint2 y) { + vint2 m2 = vcast_vi2_vm(vlt_vm_vf_vf(f0, f1)); + return vor_vi2_vi2_vi2(vand_vi2_vi2_vi2(m2, x), vandnot_vi2_vi2_vi2(m2, y)); +} + +static inline vmask vsignbit_vm_vf(vfloat f) { + return vand_vm_vm_vm((vmask)f, (vmask)vcast_vf_f(-0.0f)); +} + +static inline vfloat vmulsign_vf_vf_vf(vfloat x, vfloat y) { + return (vfloat)vxor_vm_vm_vm((vmask)x, vsignbit_vm_vf(y)); +} + +static inline vfloat vsign_vf_vf(vfloat f) { + return (vfloat)vor_vm_vm_vm((vmask)vcast_vf_f(1.0f), vand_vm_vm_vm((vmask)vcast_vf_f(-0.0f), (vmask)f)); +} + +static inline vmask visinf_vm_vf(vfloat d) { return veq_vm_vf_vf(vabs_vf_vf(d), vcast_vf_f(INFINITYf)); } +static inline vmask vispinf_vm_vf(vfloat d) { return veq_vm_vf_vf(d, vcast_vf_f(INFINITYf)); } +static inline vmask visminf_vm_vf(vfloat d) { return veq_vm_vf_vf(d, vcast_vf_f(-INFINITYf)); } +static inline vmask visnan_vm_vf(vfloat d) { return vneq_vm_vf_vf(d, d); } +static inline vfloat visinf2_vf_vf_vm(vfloat d, vfloat m) { return (vfloat)vand_vm_vm_vm(visinf_vm_vf(d), vor_vm_vm_vm(vsignbit_vm_vf(d), (vmask)m)); } +static inline vfloat visinff(vfloat d) { return visinf2_vf_vf_vm(d, vcast_vf_f(1.0f)); } + +static inline vint2 vilogbp1_vi2_vf(vfloat d) { + vmask m = vlt_vm_vf_vf(d, vcast_vf_f(5.421010862427522E-20f)); + d = vsel_vf_vm_vf_vf(m, vmul_vf_vf_vf(vcast_vf_f(1.8446744073709552E19f), d), d); + vint2 q = vand_vi2_vi2_vi2(vsrl_vi2_vi2_i(vcast_vi2_vm(vreinterpret_vm_vf(d)), 23), vcast_vi2_i(0xff)); + q = vsub_vi2_vi2_vi2(q, vsel_vi2_vm_vi2_vi2(m, vcast_vi2_i(64 + 0x7e), vcast_vi2_i(0x7e))); + return q; +} + +static inline vfloat vpow2i_vf_vi2(vint2 q) { + return (vfloat)vcast_vm_vi2(vsll_vi2_vi2_i(vadd_vi2_vi2_vi2(q, vcast_vi2_i(0x7f)), 23)); +} + +static inline vfloat vldexp_vf_vf_vi2(vfloat x, vint2 q) { + vfloat u; + vint2 m = vsra_vi2_vi2_i(q, 31); + m = vsll_vi2_vi2_i(vsub_vi2_vi2_vi2(vsra_vi2_vi2_i(vadd_vi2_vi2_vi2(m, q), 6), m), 4); + q = vsub_vi2_vi2_vi2(q, vsll_vi2_vi2_i(m, 2)); + m = vadd_vi2_vi2_vi2(m, vcast_vi2_i(0x7f)); + m = vand_vi2_vi2_vi2(vgt_vi2_vi2_vi2(m, vcast_vi2_i(0)), m); + vint2 n = vgt_vi2_vi2_vi2(m, vcast_vi2_i(0xff)); + m = vor_vi2_vi2_vi2(vandnot_vi2_vi2_vi2(n, m), vand_vi2_vi2_vi2(n, vcast_vi2_i(0xff))); + u = vreinterpret_vf_vm(vcast_vm_vi2(vsll_vi2_vi2_i(m, 23))); + x = vmul_vf_vf_vf(vmul_vf_vf_vf(vmul_vf_vf_vf(vmul_vf_vf_vf(x, u), u), u), u); + u = vreinterpret_vf_vm(vcast_vm_vi2(vsll_vi2_vi2_i(vadd_vi2_vi2_vi2(q, vcast_vi2_i(0x7f)), 23))); + return vmul_vf_vf_vf(x, u); +} + +vfloat _SLEEF_N(xldexpf)(vfloat x, vint2 q) { return vldexp_vf_vf_vi2(x, q); } + +vfloat _SLEEF_N(xsinf)(vfloat d) { + vint2 q; + vfloat u, s; + + q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)M_1_PI))); + u = vcast_vf_vi2(q); + + d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Af*4), d); + d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Bf*4), d); + d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Cf*4), d); + d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Df*4), d); + + s = vmul_vf_vf_vf(d, d); + + d = (vfloat)vxor_vm_vm_vm(vand_vm_vm_vm(veq_vm_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), (vmask)vcast_vf_f(-0.0f)), (vmask)d); + + u = vcast_vf_f(2.6083159809786593541503e-06f); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.0001981069071916863322258f)); 
+ u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833307858556509017944336f)); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.166666597127914428710938f)); + + u = vmla_vf_vf_vf_vf(s, vmul_vf_vf_vf(u, d), d); + + u = (vfloat)vor_vm_vm_vm(visinf_vm_vf(d), (vmask)u); + + return u; +} + +vfloat _SLEEF_N(xcosf)(vfloat d) { + vint2 q; + vfloat u, s; + + q = vrint_vi2_vf(vsub_vf_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)M_1_PI)), vcast_vf_f(0.5f))); + q = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, q), vcast_vi2_i(1)); + + u = vcast_vf_vi2(q); + d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Af*2), d); + d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Bf*2), d); + d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Cf*2), d); + d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Df*2), d); + + s = vmul_vf_vf_vf(d, d); + + d = (vfloat)vxor_vm_vm_vm(vand_vm_vm_vm(veq_vm_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(0)), (vmask)vcast_vf_f(-0.0f)), (vmask)d); + + u = vcast_vf_f(2.6083159809786593541503e-06f); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.0001981069071916863322258f)); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833307858556509017944336f)); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.166666597127914428710938f)); + + u = vmla_vf_vf_vf_vf(s, vmul_vf_vf_vf(u, d), d); + + u = (vfloat)vor_vm_vm_vm(visinf_vm_vf(d), (vmask)u); + + return u; +} + +vfloat2 _SLEEF_N(xsincosf)(vfloat d) { + vint2 q; + vmask m; + vfloat u, s, t, rx, ry; + vfloat2 r; + + q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)M_2_PI))); + + s = d; + + u = vcast_vf_vi2(q); + s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Af*2), s); + s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Bf*2), s); + s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Cf*2), s); + s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Df*2), s); + + t = s; + + s = vmul_vf_vf_vf(s, s); + + u = vcast_vf_f(-0.000195169282960705459117889f); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833215750753879547119141f)); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.166666537523269653320312f)); + u = vmul_vf_vf_vf(vmul_vf_vf_vf(u, s), t); + + rx = vadd_vf_vf_vf(t, u); + + u = vcast_vf_f(-2.71811842367242206819355e-07f); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(2.47990446951007470488548e-05f)); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.00138888787478208541870117f)); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0416666641831398010253906f)); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.5)); + + ry = vmla_vf_vf_vf_vf(s, u, vcast_vf_f(1)); + + m = veq_vm_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(0)); + r.x = vsel_vf_vm_vf_vf(m, rx, ry); + r.y = vsel_vf_vm_vf_vf(m, ry, rx); + + m = veq_vm_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(2)); + r.x = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vm_vm(m, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.x))); + + m = veq_vm_vi2_vi2(vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(2)), vcast_vi2_i(2)); + r.y = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vm_vm(m, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.y))); + + m = visinf_vm_vf(d); + + r.x = (vfloat)vor_vm_vm_vm(m, (vmask)r.x); + r.y = (vfloat)vor_vm_vm_vm(m, (vmask)r.y); + + return r; +} + +vfloat _SLEEF_N(xtanf)(vfloat d) { + vint2 q; + vmask m; + vfloat u, s, x; + + q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)(2 * M_1_PI)))); + + x = d; + + u = vcast_vf_vi2(q); + x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Af*2), x); + x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Bf*2), x); + x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Cf*2), x); + x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI4_Df*2), x); + + s = 
vmul_vf_vf_vf(x, x); + + m = veq_vm_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)); + x = (vfloat)vxor_vm_vm_vm(vand_vm_vm_vm(m, (vmask)vcast_vf_f(-0.0f)), (vmask)x); + + u = vcast_vf_f(0.00927245803177356719970703f); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00331984995864331722259521f)); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0242998078465461730957031f)); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0534495301544666290283203f)); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.133383005857467651367188f)); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.333331853151321411132812f)); + + u = vmla_vf_vf_vf_vf(s, vmul_vf_vf_vf(u, x), x); + + u = vsel_vf_vm_vf_vf(m, vrec_vf_vf(u), u); + + u = (vfloat)vor_vm_vm_vm(visinf_vm_vf(d), (vmask)u); + + return u; +} + +vfloat _SLEEF_N(xsinf_u1)(vfloat d) { + vint2 q; + vfloat u; + vfloat2 s, t, x; + + q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(M_1_PI))); + u = vcast_vf_vi2(q); + + s = dfadd2_vf2_vf_vf (d, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Af*4))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Bf*4))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Cf*4))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Df*4))); + + t = s; + s = dfsqu_vf2_vf2(s); + + u = vcast_vf_f(2.6083159809786593541503e-06f); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(-0.0001981069071916863322258f)); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.00833307858556509017944336f)); + + x = dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf2_vf2(dfadd_vf2_vf_vf(vcast_vf_f(-0.166666597127914428710938f), vmul_vf_vf_vf(u, s.x)), s)); + + x = dfmul_vf2_vf2_vf2(t, x); + u = vadd_vf_vf_vf(x.x, x.y); + + u = (vfloat)vxor_vm_vm_vm(vand_vm_vm_vm(veq_vm_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), (vmask)vcast_vf_f(-0.0)), (vmask)u); + + return u; +} + +vfloat _SLEEF_N(xcosf_u1)(vfloat d) { + vint2 q; + vfloat u; + vfloat2 s, t, x; + + q = vrint_vi2_vf(vmla_vf_vf_vf_vf(d, vcast_vf_f(M_1_PI), vcast_vf_f(-0.5))); + q = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, q), vcast_vi2_i(1)); + u = vcast_vf_vi2(q); + + s = dfadd2_vf2_vf_vf (d, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Af*2))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Bf*2))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Cf*2))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Df*2))); + + t = s; + s = dfsqu_vf2_vf2(s); + + u = vcast_vf_f(2.6083159809786593541503e-06f); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(-0.0001981069071916863322258f)); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.00833307858556509017944336f)); + + x = dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf2_vf2(dfadd_vf2_vf_vf(vcast_vf_f(-0.166666597127914428710938f), vmul_vf_vf_vf(u, s.x)), s)); + + x = dfmul_vf2_vf2_vf2(t, x); + u = vadd_vf_vf_vf(x.x, x.y); + + u = (vfloat)vxor_vm_vm_vm(vand_vm_vm_vm(veq_vm_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(0)), (vmask)vcast_vf_f(-0.0)), (vmask)u); + + return u; +} + +vfloat2 _SLEEF_N(xsincosf_u1)(vfloat d) { + vint2 q; + vmask m; + vfloat u, rx, ry; + vfloat2 r, s, t, x; + + q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(2 * M_1_PI))); + u = vcast_vf_vi2(q); + + s = dfadd2_vf2_vf_vf (d, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Af*2))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Bf*2))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Cf*2))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Df*2))); + + t = s; + + s = dfsqu_vf2_vf2(s); + s.x = vadd_vf_vf_vf(s.x, s.y); + + u = 
vcast_vf_f(-0.000195169282960705459117889f); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.00833215750753879547119141f)); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(-0.166666537523269653320312f)); + + u = vmul_vf_vf_vf(u, vmul_vf_vf_vf(s.x, t.x)); + + x = dfadd_vf2_vf2_vf(t, u); + rx = vadd_vf_vf_vf(x.x, x.y); + + u = vcast_vf_f(-2.71811842367242206819355e-07f); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(2.47990446951007470488548e-05f)); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(-0.00138888787478208541870117f)); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.0416666641831398010253906f)); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(-0.5)); + + x = dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf_vf(s.x, u)); + ry = vadd_vf_vf_vf(x.x, x.y); + + m = veq_vm_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(0)); + r.x = vsel_vf_vm_vf_vf(m, rx, ry); + r.y = vsel_vf_vm_vf_vf(m, ry, rx); + + m = veq_vm_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(2)); + r.x = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vm_vm(m, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.x))); + + m = veq_vm_vi2_vi2(vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(2)), vcast_vi2_i(2)); + r.y = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vm_vm(m, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.y))); + + m = visinf_vm_vf(d); + r.x = (vfloat)vor_vm_vm_vm(m, (vmask)r.x); + r.y = (vfloat)vor_vm_vm_vm(m, (vmask)r.y); + + return r; +} + +vfloat _SLEEF_N(xtanf_u1)(vfloat d) { + vint2 q; + vfloat u; + vfloat2 s, t, x; + vmask m; + + q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(M_2_PI))); + u = vcast_vf_vi2(q); + + s = dfadd2_vf2_vf_vf (d, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Af*2))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Bf*2))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Cf*2))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI4_Df*2))); + + m = veq_vm_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)); + vmask n = vand_vm_vm_vm(m, (vmask)vcast_vf_f(-0.0)); + s.x = (vfloat)vxor_vm_vm_vm((vmask)s.x, n); + s.y = (vfloat)vxor_vm_vm_vm((vmask)s.y, n); + + t = s; + s = dfsqu_vf2_vf2(s); + s = dfnormalize_vf2_vf2(s); + + u = vcast_vf_f(0.00446636462584137916564941f); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(-8.3920182078145444393158e-05f)); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.0109639242291450500488281f)); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.0212360303848981857299805f)); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.0540687143802642822265625f)); + + x = dfadd_vf2_vf_vf(vcast_vf_f(0.133325666189193725585938f), vmul_vf_vf_vf(u, s.x)); + x = dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf2_vf2(dfadd_vf2_vf_vf2(vcast_vf_f(0.33333361148834228515625f), dfmul_vf2_vf2_vf2(s, x)), s)); + x = dfmul_vf2_vf2_vf2(t, x); + + x = vsel_vf2_vm_vf2_vf2(m, dfrec_vf2_vf2(x), x); + + u = vadd_vf_vf_vf(x.x, x.y); + + return u; +} + +vfloat _SLEEF_N(xatanf)(vfloat d) { + vfloat s, t, u; + vint2 q; + + q = vsel_vi2_vf_vf_vi2_vi2(d, vcast_vf_f(0.0f), vcast_vi2_i(2), vcast_vi2_i(0)); + s = vabs_vf_vf(d); + + q = vsel_vi2_vf_vf_vi2_vi2(vcast_vf_f(1.0f), s, vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), q); + s = vsel_vf_vm_vf_vf(vlt_vm_vf_vf(vcast_vf_f(1.0f), s), vrec_vf_vf(s), s); + + t = vmul_vf_vf_vf(s, s); + + u = vcast_vf_f(0.00282363896258175373077393f); + u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(-0.0159569028764963150024414f)); + u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(0.0425049886107444763183594f)); + u = vmla_vf_vf_vf_vf(u, t, 
vcast_vf_f(-0.0748900920152664184570312f)); + u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(0.106347933411598205566406f)); + u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(-0.142027363181114196777344f)); + u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(0.199926957488059997558594f)); + u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(-0.333331018686294555664062f)); + + t = vmla_vf_vf_vf_vf(s, vmul_vf_vf_vf(t, u), s); + + t = vsel_vf_vm_vf_vf(veq_vm_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vsub_vf_vf_vf(vcast_vf_f((float)(M_PI/2)), t), t); + + t = (vfloat)vxor_vm_vm_vm(vand_vm_vm_vm(veq_vm_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(2)), (vmask)vcast_vf_f(-0.0f)), (vmask)t); + +#ifdef __ARM_NEON__ + t = vsel_vf_vm_vf_vf(visinf_vm_vf(d), vmulsign_vf_vf_vf(vcast_vf_f(1.5874010519681994747517056f), d), t); +#endif + + return t; +} + +static inline vfloat atan2kf(vfloat y, vfloat x) { + vfloat s, t, u; + vint2 q; + vmask p; + + q = vsel_vi2_vf_vf_vi2_vi2(x, vcast_vf_f(0.0f), vcast_vi2_i(-2), vcast_vi2_i(0)); + x = vabs_vf_vf(x); + + q = vsel_vi2_vf_vf_vi2_vi2(x, y, vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), q); + p = vlt_vm_vf_vf(x, y); + s = vsel_vf_vm_vf_vf(p, vneg_vf_vf(x), y); + t = vmax_vf_vf_vf(x, y); + + s = vdiv_vf_vf_vf(s, t); + t = vmul_vf_vf_vf(s, s); + + u = vcast_vf_f(0.00282363896258175373077393f); + u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(-0.0159569028764963150024414f)); + u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(0.0425049886107444763183594f)); + u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(-0.0748900920152664184570312f)); + u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(0.106347933411598205566406f)); + u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(-0.142027363181114196777344f)); + u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(0.199926957488059997558594f)); + u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(-0.333331018686294555664062f)); + + t = vmla_vf_vf_vf_vf(s, vmul_vf_vf_vf(t, u), s); + t = vmla_vf_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f((float)(M_PI/2)), t); + + return t; +} + +vfloat _SLEEF_N(xatan2f)(vfloat y, vfloat x) { + vfloat r = atan2kf(vabs_vf_vf(y), x); + + r = vmulsign_vf_vf_vf(r, x); + r = vsel_vf_vm_vf_vf(vor_vm_vm_vm(visinf_vm_vf(x), veq_vm_vf_vf(x, vcast_vf_f(0.0f))), vsub_vf_vf_vf(vcast_vf_f((float)(M_PI/2)), visinf2_vf_vf_vm(x, vmulsign_vf_vf_vf(vcast_vf_f((float)(M_PI/2)), x))), r); + r = vsel_vf_vm_vf_vf(visinf_vm_vf(y), vsub_vf_vf_vf(vcast_vf_f((float)(M_PI/2)), visinf2_vf_vf_vm(x, vmulsign_vf_vf_vf(vcast_vf_f((float)(M_PI/4)), x))), r); + + r = vsel_vf_vm_vf_vf(veq_vm_vf_vf(y, vcast_vf_f(0.0f)), (vfloat)vand_vm_vm_vm(veq_vm_vf_vf(vsign_vf_vf(x), vcast_vf_f(-1.0f)), (vmask)vcast_vf_f((float)M_PI)), r); + + r = (vfloat)vor_vm_vm_vm(vor_vm_vm_vm(visnan_vm_vf(x), visnan_vm_vf(y)), (vmask)vmulsign_vf_vf_vf(r, y)); + return r; +} + +vfloat _SLEEF_N(xasinf)(vfloat d) { + vfloat x, y; + x = vadd_vf_vf_vf(vcast_vf_f(1.0f), d); + y = vsub_vf_vf_vf(vcast_vf_f(1.0f), d); + x = vmul_vf_vf_vf(x, y); + x = vsqrt_vf_vf(x); + x = (vfloat)vor_vm_vm_vm(visnan_vm_vf(x), (vmask)atan2kf(vabs_vf_vf(d), x)); + return vmulsign_vf_vf_vf(x, d); +} + +vfloat _SLEEF_N(xacosf)(vfloat d) { + vfloat x, y; + x = vadd_vf_vf_vf(vcast_vf_f(1.0f), d); + y = vsub_vf_vf_vf(vcast_vf_f(1.0f), d); + x = vmul_vf_vf_vf(x, y); + x = vsqrt_vf_vf(x); + x = vmulsign_vf_vf_vf(atan2kf(x, vabs_vf_vf(d)), d); + y = (vfloat)vand_vm_vm_vm(vlt_vm_vf_vf(d, vcast_vf_f(0.0f)), (vmask)vcast_vf_f((float)M_PI)); + x = vadd_vf_vf_vf(x, y); + return x; +} + +// + +static inline vfloat2 atan2kf_u1(vfloat2 y, vfloat2 x) { + vfloat u; + vfloat2 s, t; + vint2 q; + vmask p; + + 
q = vsel_vi2_vf_vf_vi2_vi2(x.x, vcast_vf_f(0), vcast_vi2_i(-2), vcast_vi2_i(0)); + p = vlt_vm_vf_vf(x.x, vcast_vf_f(0)); + p = vand_vm_vm_vm(p, (vmask)vcast_vf_f(-0.0)); + x.x = (vfloat)vxor_vm_vm_vm((vmask)x.x, p); + x.y = (vfloat)vxor_vm_vm_vm((vmask)x.y, p); + + q = vsel_vi2_vf_vf_vi2_vi2(x.x, y.x, vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), q); + p = vlt_vm_vf_vf(x.x, y.x); + s = vsel_vf2_vm_vf2_vf2(p, dfneg_vf2_vf2(x), y); + t = vsel_vf2_vm_vf2_vf2(p, y, x); + + s = dfdiv_vf2_vf2_vf2(s, t); + t = dfsqu_vf2_vf2(s); + t = dfnormalize_vf2_vf2(t); + + u = vcast_vf_f(-0.00176397908944636583328247f); + u = vmla_vf_vf_vf_vf(u, t.x, vcast_vf_f(0.0107900900766253471374512f)); + u = vmla_vf_vf_vf_vf(u, t.x, vcast_vf_f(-0.0309564601629972457885742f)); + u = vmla_vf_vf_vf_vf(u, t.x, vcast_vf_f(0.0577365085482597351074219f)); + u = vmla_vf_vf_vf_vf(u, t.x, vcast_vf_f(-0.0838950723409652709960938f)); + u = vmla_vf_vf_vf_vf(u, t.x, vcast_vf_f(0.109463557600975036621094f)); + u = vmla_vf_vf_vf_vf(u, t.x, vcast_vf_f(-0.142626821994781494140625f)); + u = vmla_vf_vf_vf_vf(u, t.x, vcast_vf_f(0.199983194470405578613281f)); + + //u = vmla_vf_vf_vf_vf(u, t.x, vcast_vf_f(-0.333332866430282592773438f)); + //t = dfmul_vf2_vf2_vf(t, u); + + t = dfmul_vf2_vf2_vf2(t, dfadd_vf2_vf_vf(vcast_vf_f(-0.333332866430282592773438f), vmul_vf_vf_vf(u, t.x))); + t = dfmul_vf2_vf2_vf2(s, dfadd_vf2_vf_vf2(vcast_vf_f(1), t)); + t = dfadd2_vf2_vf2_vf2(dfmul_vf2_vf2_vf(vcast_vf2_f_f(1.5707963705062866211f, -4.3711388286737928865e-08f), vcast_vf_vi2(q)), t); + + return t; +} + +vfloat _SLEEF_N(xatan2f_u1)(vfloat y, vfloat x) { + vfloat2 d = atan2kf_u1(vcast_vf2_vf_vf(vabs_vf_vf(y), vcast_vf_f(0)), vcast_vf2_vf_vf(x, vcast_vf_f(0))); + vfloat r = vadd_vf_vf_vf(d.x, d.y); + + r = vmulsign_vf_vf_vf(r, x); + r = vsel_vf_vm_vf_vf(vor_vm_vm_vm(visinf_vm_vf(x), veq_vm_vf_vf(x, vcast_vf_f(0))), vsub_vf_vf_vf(vcast_vf_f(M_PI/2), visinf2_vf_vf_vm(x, vmulsign_vf_vf_vf(vcast_vf_f(M_PI/2), x))), r); + r = vsel_vf_vm_vf_vf(visinf_vm_vf(y), vsub_vf_vf_vf(vcast_vf_f(M_PI/2), visinf2_vf_vf_vm(x, vmulsign_vf_vf_vf(vcast_vf_f(M_PI/4), x))), r); + r = vsel_vf_vm_vf_vf(veq_vm_vf_vf(y, vcast_vf_f(0.0)), (vfloat)vand_vm_vm_vm(veq_vm_vf_vf(vsign_vf_vf(x), vcast_vf_f(-1.0)), (vmask)vcast_vf_f(M_PI)), r); + + r = (vfloat)vor_vm_vm_vm(vor_vm_vm_vm(visnan_vm_vf(x), visnan_vm_vf(y)), (vmask)vmulsign_vf_vf_vf(r, y)); + return r; +} + +vfloat _SLEEF_N(xasinf_u1)(vfloat d) { + vfloat2 d2 = atan2kf_u1(vcast_vf2_vf_vf(vabs_vf_vf(d), vcast_vf_f(0)), dfsqrt_vf2_vf2(dfmul_vf2_vf2_vf2(dfadd_vf2_vf_vf(vcast_vf_f(1), d), dfsub_vf2_vf_vf(vcast_vf_f(1), d)))); + vfloat r = vadd_vf_vf_vf(d2.x, d2.y); + r = vsel_vf_vm_vf_vf(veq_vm_vf_vf(vabs_vf_vf(d), vcast_vf_f(1)), vcast_vf_f(1.570796326794896557998982), r); + return vmulsign_vf_vf_vf(r, d); +} + +vfloat _SLEEF_N(xacosf_u1)(vfloat d) { + vfloat2 d2 = atan2kf_u1(dfsqrt_vf2_vf2(dfmul_vf2_vf2_vf2(dfadd_vf2_vf_vf(vcast_vf_f(1), d), dfsub_vf2_vf_vf(vcast_vf_f(1), d))), vcast_vf2_vf_vf(vabs_vf_vf(d), vcast_vf_f(0))); + d2 = dfscale_vf2_vf2_vf(d2, vmulsign_vf_vf_vf(vcast_vf_f(1), d)); + + vmask m; + m = vneq_vm_vf_vf(vabs_vf_vf(d), vcast_vf_f(1)); + d2.x = (vfloat)vand_vm_vm_vm(m, (vmask)d2.x); + d2.y = (vfloat)vand_vm_vm_vm(m, (vmask)d2.y); + m = vlt_vm_vf_vf(d, vcast_vf_f(0)); + d2 = vsel_vf2_vm_vf2_vf2(m, dfadd_vf2_vf2_vf2(vcast_vf2_f_f(3.1415927410125732422f,-8.7422776573475857731e-08f), d2), d2); + + return vadd_vf_vf_vf(d2.x, d2.y); +} + +vfloat _SLEEF_N(xatanf_u1)(vfloat d) { + vfloat2 d2 = 
atan2kf_u1(vcast_vf2_vf_vf(vabs_vf_vf(d), vcast_vf_f(0)), vcast_vf2_f_f(1, 0)); + vfloat r = vadd_vf_vf_vf(d2.x, d2.y); + r = vsel_vf_vm_vf_vf(visinf_vm_vf(d), vcast_vf_f(1.570796326794896557998982), r); + return vmulsign_vf_vf_vf(r, d); +} + +// + +vfloat _SLEEF_N(xlogf)(vfloat d) { + vfloat x, x2, t, m; + vint2 e; + + e = vilogbp1_vi2_vf(x = vmul_vf_vf_vf(d, vcast_vf_f(0.7071f))); + m = vldexp_vf_vf_vi2(d, vneg_vi2_vi2(e)); + d = x; + + x = vdiv_vf_vf_vf(vadd_vf_vf_vf(vcast_vf_f(-1.0f), m), vadd_vf_vf_vf(vcast_vf_f(1.0f), m)); + x2 = vmul_vf_vf_vf(x, x); + + t = vcast_vf_f(0.2371599674224853515625f); + t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(0.285279005765914916992188f)); + t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(0.400005519390106201171875f)); + t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(0.666666567325592041015625f)); + t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(2.0f)); + + x = vmla_vf_vf_vf_vf(x, t, vmul_vf_vf_vf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e))); + + x = vsel_vf_vm_vf_vf(vispinf_vm_vf(d), vcast_vf_f(INFINITYf), x); + x = (vfloat)vor_vm_vm_vm(vgt_vm_vf_vf(vcast_vf_f(0), d), (vmask)x); + x = vsel_vf_vm_vf_vf(veq_vm_vf_vf(d, vcast_vf_f(0)), vcast_vf_f(-INFINITYf), x); + + return x; +} + +vfloat _SLEEF_N(xexpf)(vfloat d) { + vint2 q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(R_LN2f))); + vfloat s, u; + + s = vmla_vf_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf), d); + s = vmla_vf_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf), s); + + u = vcast_vf_f(0.00136324646882712841033936f); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00836596917361021041870117f)); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0416710823774337768554688f)); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.166665524244308471679688f)); + u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.499999850988388061523438f)); + + u = vadd_vf_vf_vf(vcast_vf_f(1.0f), vmla_vf_vf_vf_vf(vmul_vf_vf_vf(s, s), u, s)); + + u = vldexp_vf_vf_vi2(u, q); + + u = (vfloat)vandnot_vm_vm_vm(visminf_vm_vf(d), (vmask)u); + + return u; +} + +#ifdef __ARM_NEON__ +vfloat _SLEEF_N(xsqrtf)(vfloat d) { + vfloat e = (vfloat)vadd_vi2_vi2_vi2(vcast_vi2_i(0x20000000), vand_vi2_vi2_vi2(vcast_vi2_i(0x7f000000), vsrl_vi2_vi2_i((vint2)d, 1))); + vfloat m = (vfloat)vadd_vi2_vi2_vi2(vcast_vi2_i(0x3f000000), vand_vi2_vi2_vi2(vcast_vi2_i(0x01ffffff), (vint2)d)); + float32x4_t x = vrsqrteq_f32(m); + x = vmulq_f32(x, vrsqrtsq_f32(m, vmulq_f32(x, x))); + float32x4_t u = vmulq_f32(x, m); + u = vmlaq_f32(u, vmlsq_f32(m, u, u), vmulq_f32(x, vdupq_n_f32(0.5))); + e = (vfloat)vandnot_vm_vm_vm(veq_vm_vf_vf(d, vcast_vf_f(0)), (vmask)e); + u = vmul_vf_vf_vf(e, u); + + u = vsel_vf_vm_vf_vf(visinf_vm_vf(d), vcast_vf_f(INFINITYf), u); + u = (vfloat)vor_vm_vm_vm(vor_vm_vm_vm(visnan_vm_vf(d), vlt_vm_vf_vf(d, vcast_vf_f(0))), (vmask)u); + u = vmulsign_vf_vf_vf(u, d); + + return u; +} +#else +vfloat _SLEEF_N(xsqrtf)(vfloat d) { return vsqrt_vf_vf(d); } +#endif + +vfloat _SLEEF_N(xcbrtf)(vfloat d) { + vfloat x, y, q = vcast_vf_f(1.0), t; + vint2 e, qu, re; + + e = vilogbp1_vi2_vf(vabs_vf_vf(d)); + d = vldexp_vf_vf_vi2(d, vneg_vi2_vi2(e)); + + t = vadd_vf_vf_vf(vcast_vf_vi2(e), vcast_vf_f(6144)); + qu = vtruncate_vi2_vf(vmul_vf_vf_vf(t, vcast_vf_f(1.0f/3.0f))); + re = vtruncate_vi2_vf(vsub_vf_vf_vf(t, vmul_vf_vf_vf(vcast_vf_vi2(qu), vcast_vf_f(3)))); + + q = vsel_vf_vm_vf_vf(veq_vm_vi2_vi2(re, vcast_vi2_i(1)), vcast_vf_f(1.2599210498948731647672106f), q); + q = vsel_vf_vm_vf_vf(veq_vm_vi2_vi2(re, vcast_vi2_i(2)), vcast_vf_f(1.5874010519681994747517056f), q); + q = vldexp_vf_vf_vi2(q, 
vsub_vi2_vi2_vi2(qu, vcast_vi2_i(2048))); + + q = vmulsign_vf_vf_vf(q, d); + d = vabs_vf_vf(d); + + x = vcast_vf_f(-0.601564466953277587890625f); + x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(2.8208892345428466796875f)); + x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(-5.532182216644287109375f)); + x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(5.898262500762939453125f)); + x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(-3.8095417022705078125f)); + x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(2.2241256237030029296875f)); + + y = vmul_vf_vf_vf(vmul_vf_vf_vf(d, x), x); + y = vmul_vf_vf_vf(vsub_vf_vf_vf(y, vmul_vf_vf_vf(vmul_vf_vf_vf(vcast_vf_f(2.0f / 3.0f), y), vmla_vf_vf_vf_vf(y, x, vcast_vf_f(-1.0f)))), q); + + return y; +} + +vfloat _SLEEF_N(xcbrtf_u1)(vfloat d) { + vfloat x, y, z, t; + vfloat2 q2 = vcast_vf2_f_f(1, 0), u, v; + vint2 e, qu, re; + + e = vilogbp1_vi2_vf(vabs_vf_vf(d)); + d = vldexp_vf_vf_vi2(d, vneg_vi2_vi2(e)); + + t = vadd_vf_vf_vf(vcast_vf_vi2(e), vcast_vf_f(6144)); + qu = vtruncate_vi2_vf(vmul_vf_vf_vf(t, vcast_vf_f(1.0/3.0))); + re = vtruncate_vi2_vf(vsub_vf_vf_vf(t, vmul_vf_vf_vf(vcast_vf_vi2(qu), vcast_vf_f(3)))); + + q2 = vsel_vf2_vm_vf2_vf2(veq_vm_vi2_vi2(re, vcast_vi2_i(1)), vcast_vf2_f_f(1.2599210739135742188f, -2.4018701694217270415e-08), q2); + q2 = vsel_vf2_vm_vf2_vf2(veq_vm_vi2_vi2(re, vcast_vi2_i(2)), vcast_vf2_f_f(1.5874010324478149414f, 1.9520385308169352356e-08), q2); + + q2.x = vmulsign_vf_vf_vf(q2.x, d); q2.y = vmulsign_vf_vf_vf(q2.y, d); + d = vabs_vf_vf(d); + + x = vcast_vf_f(-0.601564466953277587890625f); + x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(2.8208892345428466796875f)); + x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(-5.532182216644287109375f)); + x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(5.898262500762939453125f)); + x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(-3.8095417022705078125f)); + x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(2.2241256237030029296875f)); + + y = vmul_vf_vf_vf(x, x); y = vmul_vf_vf_vf(y, y); x = vsub_vf_vf_vf(x, vmul_vf_vf_vf(vmlanp_vf_vf_vf_vf(d, y, x), vcast_vf_f(-1.0 / 3.0))); + + z = x; + + u = dfmul_vf2_vf_vf(x, x); + u = dfmul_vf2_vf2_vf2(u, u); + u = dfmul_vf2_vf2_vf(u, d); + u = dfadd2_vf2_vf2_vf(u, vneg_vf_vf(x)); + y = vadd_vf_vf_vf(u.x, u.y); + + y = vmul_vf_vf_vf(vmul_vf_vf_vf(vcast_vf_f(-2.0 / 3.0), y), z); + v = dfadd2_vf2_vf2_vf(dfmul_vf2_vf_vf(z, z), y); + v = dfmul_vf2_vf2_vf(v, d); + v = dfmul_vf2_vf2_vf2(v, q2); + z = vldexp_vf_vf_vi2(vadd_vf_vf_vf(v.x, v.y), vsub_vi2_vi2_vi2(qu, vcast_vi2_i(2048))); + + z = vsel_vf_vm_vf_vf(visinf_vm_vf(d), vmulsign_vf_vf_vf(vcast_vf_f(INFINITY), q2.x), z); + z = vsel_vf_vm_vf_vf(veq_vm_vf_vf(d, vcast_vf_f(0)), (vfloat)vsignbit_vm_vf(q2.x), z); + + return z; +} + +static inline vfloat2 logkf(vfloat d) { + vfloat2 x, x2; + vfloat t, m; + vint2 e; + + e = vilogbp1_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(0.7071f))); + m = vldexp_vf_vf_vi2(d, vneg_vi2_vi2(e)); + + x = dfdiv_vf2_vf2_vf2(dfadd2_vf2_vf_vf(vcast_vf_f(-1), m), dfadd2_vf2_vf_vf(vcast_vf_f(1), m)); + x2 = dfsqu_vf2_vf2(x); + + t = vcast_vf_f(0.2371599674224853515625f); + t = vmla_vf_vf_vf_vf(t, x2.x, vcast_vf_f(0.285279005765914916992188f)); + t = vmla_vf_vf_vf_vf(t, x2.x, vcast_vf_f(0.400005519390106201171875f)); + t = vmla_vf_vf_vf_vf(t, x2.x, vcast_vf_f(0.666666567325592041015625f)); + + return dfadd2_vf2_vf2_vf2(dfmul_vf2_vf2_vf(vcast_vf2_vf_vf(vcast_vf_f(0.69314718246459960938f), vcast_vf_f(-1.904654323148236017e-09f)), + vcast_vf_vi2(e)), + dfadd2_vf2_vf2_vf2(dfscale_vf2_vf2_vf(x, vcast_vf_f(2)), dfmul_vf2_vf2_vf(dfmul_vf2_vf2_vf2(x2, x), t))); +} + +vfloat 
_SLEEF_N(xlogf_u1)(vfloat d) { + vfloat2 s = logkf(d); + vfloat x = vadd_vf_vf_vf(s.x, s.y); + + x = vsel_vf_vm_vf_vf(vispinf_vm_vf(d), vcast_vf_f(INFINITY), x); +#ifdef __ARM_NEON__ + x = vsel_vf_vm_vf_vf(vlt_vm_vf_vf(d, vcast_vf_f(1e-37f)), vcast_vf_f(-INFINITY), x); +#else + x = vsel_vf_vm_vf_vf(veq_vm_vf_vf(d, vcast_vf_f(0)), vcast_vf_f(-INFINITY), x); +#endif + x = (vfloat)vor_vm_vm_vm(vgt_vm_vf_vf(vcast_vf_f(0), d), (vmask)x); + + return x; +} + +static inline vfloat expkf(vfloat2 d) { + vfloat u = vmul_vf_vf_vf(vadd_vf_vf_vf(d.x, d.y), vcast_vf_f(R_LN2f)); + vint2 q = vrint_vi2_vf(u); + vfloat2 s, t; + + s = dfadd2_vf2_vf2_vf(d, vmul_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf))); + + s = dfnormalize_vf2_vf2(s); + + u = vcast_vf_f(0.00136324646882712841033936f); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.00836596917361021041870117f)); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.0416710823774337768554688f)); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.166665524244308471679688f)); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.499999850988388061523438f)); + + t = dfadd_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf(dfsqu_vf2_vf2(s), u)); + + t = dfadd_vf2_vf_vf2(vcast_vf_f(1), t); + u = vadd_vf_vf_vf(t.x, t.y); + u = vldexp_vf_vf_vi2(u, q); + + return u; +} + +vfloat _SLEEF_N(xpowf)(vfloat x, vfloat y) { +#if 1 + vmask yisnint = vneq_vm_vf_vf(vcast_vf_vi2(vrint_vi2_vf(y)), y); + vmask yisodd = vandnot_vm_vm_vm(yisnint, veq_vm_vi2_vi2(vand_vi2_vi2_vi2(vrint_vi2_vf(y), vcast_vi2_i(1)), vcast_vi2_i(1))); + + vfloat result = expkf(dfmul_vf2_vf2_vf(logkf(vabs_vf_vf(x)), y)); + + result = vmul_vf_vf_vf(result, + vsel_vf_vm_vf_vf(vgt_vm_vf_vf(x, vcast_vf_f(0)), + vcast_vf_f(1), + (vfloat)vor_vm_vm_vm(yisnint, (vmask)vsel_vf_vm_vf_vf(yisodd, vcast_vf_f(-1), vcast_vf_f(1))))); + + vfloat efx = (vfloat)vxor_vm_vm_vm((vmask)vsub_vf_vf_vf(vabs_vf_vf(x), vcast_vf_f(1)), vsignbit_vm_vf(y)); + + result = vsel_vf_vm_vf_vf(visinf_vm_vf(y), + (vfloat)vandnot_vm_vm_vm(vlt_vm_vf_vf(efx, vcast_vf_f(0.0f)), + (vmask)vsel_vf_vm_vf_vf(veq_vm_vf_vf(efx, vcast_vf_f(0.0f)), + vcast_vf_f(1.0f), + vcast_vf_f(INFINITYf))), + result); + + result = vsel_vf_vm_vf_vf(vor_vm_vm_vm(visinf_vm_vf(x), veq_vm_vf_vf(x, vcast_vf_f(0))), + vmul_vf_vf_vf(vsel_vf_vm_vf_vf(yisodd, vsign_vf_vf(x), vcast_vf_f(1)), + (vfloat)vandnot_vm_vm_vm(vlt_vm_vf_vf(vsel_vf_vm_vf_vf(veq_vm_vf_vf(x, vcast_vf_f(0)), vneg_vf_vf(y), y), vcast_vf_f(0)), + (vmask)vcast_vf_f(INFINITYf))), + result); + + result = (vfloat)vor_vm_vm_vm(vor_vm_vm_vm(visnan_vm_vf(x), visnan_vm_vf(y)), (vmask)result); + + result = vsel_vf_vm_vf_vf(vor_vm_vm_vm(veq_vm_vf_vf(y, vcast_vf_f(0)), veq_vm_vf_vf(x, vcast_vf_f(1))), vcast_vf_f(1), result); + + return result; +#else + return expkf(dfmul_vf2_vf2_vf(logkf(x), y)); +#endif +} + +static inline vfloat2 expk2f(vfloat2 d) { + vfloat u = vmul_vf_vf_vf(vadd_vf_vf_vf(d.x, d.y), vcast_vf_f(R_LN2f)); + vint2 q = vrint_vi2_vf(u); + vfloat2 s, t; + + s = dfadd2_vf2_vf2_vf(d, vmul_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf))); + s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf))); + + u = vcast_vf_f(0.00136324646882712841033936f); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.00836596917361021041870117f)); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.0416710823774337768554688f)); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.166665524244308471679688f)); + u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.499999850988388061523438f)); + + t = dfadd_vf2_vf2_vf2(s, 
dfmul_vf2_vf2_vf(dfsqu_vf2_vf2(s), u)); + + t = dfadd_vf2_vf_vf2(vcast_vf_f(1), t); + + return dfscale_vf2_vf2_vf(t, vpow2i_vf_vi2(q)); +} + +vfloat _SLEEF_N(xsinhf)(vfloat x) { + vfloat y = vabs_vf_vf(x); + vfloat2 d = expk2f(vcast_vf2_vf_vf(y, vcast_vf_f(0))); + d = dfsub_vf2_vf2_vf2(d, dfrec_vf2_vf2(d)); + y = vmul_vf_vf_vf(vadd_vf_vf_vf(d.x, d.y), vcast_vf_f(0.5)); + + y = vsel_vf_vm_vf_vf(vor_vm_vm_vm(vgt_vm_vf_vf(vabs_vf_vf(x), vcast_vf_f(89)), + visnan_vm_vf(y)), vcast_vf_f(INFINITYf), y); + y = vmulsign_vf_vf_vf(y, x); + y = (vfloat)vor_vm_vm_vm(visnan_vm_vf(x), (vmask)y); + + return y; +} + +vfloat _SLEEF_N(xcoshf)(vfloat x) { + vfloat y = vabs_vf_vf(x); + vfloat2 d = expk2f(vcast_vf2_vf_vf(y, vcast_vf_f(0))); + d = dfadd_vf2_vf2_vf2(d, dfrec_vf2_vf2(d)); + y = vmul_vf_vf_vf(vadd_vf_vf_vf(d.x, d.y), vcast_vf_f(0.5)); + + y = vsel_vf_vm_vf_vf(vor_vm_vm_vm(vgt_vm_vf_vf(vabs_vf_vf(x), vcast_vf_f(89)), + visnan_vm_vf(y)), vcast_vf_f(INFINITYf), y); + y = (vfloat)vor_vm_vm_vm(visnan_vm_vf(x), (vmask)y); + + return y; +} + +vfloat _SLEEF_N(xtanhf)(vfloat x) { + vfloat y = vabs_vf_vf(x); + vfloat2 d = expk2f(vcast_vf2_vf_vf(y, vcast_vf_f(0))); + vfloat2 e = dfrec_vf2_vf2(d); + d = dfdiv_vf2_vf2_vf2(dfadd_vf2_vf2_vf2(d, dfneg_vf2_vf2(e)), dfadd_vf2_vf2_vf2(d, e)); + y = vadd_vf_vf_vf(d.x, d.y); + + y = vsel_vf_vm_vf_vf(vor_vm_vm_vm(vgt_vm_vf_vf(vabs_vf_vf(x), vcast_vf_f(8.664339742f)), + visnan_vm_vf(y)), vcast_vf_f(1.0f), y); + y = vmulsign_vf_vf_vf(y, x); + y = (vfloat)vor_vm_vm_vm(visnan_vm_vf(x), (vmask)y); + + return y; +} + +static inline vfloat2 logk2f(vfloat2 d) { + vfloat2 x, x2, m; + vfloat t; + vint2 e; + + e = vilogbp1_vi2_vf(vmul_vf_vf_vf(d.x, vcast_vf_f(0.7071))); + m = dfscale_vf2_vf2_vf(d, vpow2i_vf_vi2(vneg_vi2_vi2(e))); + + x = dfdiv_vf2_vf2_vf2(dfadd2_vf2_vf2_vf(m, vcast_vf_f(-1)), dfadd2_vf2_vf2_vf(m, vcast_vf_f(1))); + x2 = dfsqu_vf2_vf2(x); + + t = vcast_vf_f(0.2371599674224853515625f); + t = vmla_vf_vf_vf_vf(t, x2.x, vcast_vf_f(0.285279005765914916992188f)); + t = vmla_vf_vf_vf_vf(t, x2.x, vcast_vf_f(0.400005519390106201171875f)); + t = vmla_vf_vf_vf_vf(t, x2.x, vcast_vf_f(0.666666567325592041015625f)); + + return dfadd2_vf2_vf2_vf2(dfmul_vf2_vf2_vf(vcast_vf2_vf_vf(vcast_vf_f(0.69314718246459960938f), vcast_vf_f(-1.904654323148236017e-09f)), + vcast_vf_vi2(e)), + dfadd2_vf2_vf2_vf2(dfscale_vf2_vf2_vf(x, vcast_vf_f(2)), dfmul_vf2_vf2_vf(dfmul_vf2_vf2_vf2(x2, x), t))); +} + +vfloat _SLEEF_N(xasinhf)(vfloat x) { + vfloat y = vabs_vf_vf(x); + vfloat2 d = logk2f(dfadd_vf2_vf2_vf(dfsqrt_vf2_vf2(dfadd2_vf2_vf2_vf(dfmul_vf2_vf_vf(y, y), vcast_vf_f(1))), y)); + y = vadd_vf_vf_vf(d.x, d.y); + + y = vsel_vf_vm_vf_vf(vor_vm_vm_vm(visinf_vm_vf(x), visnan_vm_vf(y)), vcast_vf_f(INFINITYf), y); + y = vmulsign_vf_vf_vf(y, x); + y = (vfloat)vor_vm_vm_vm(visnan_vm_vf(x), (vmask)y); + + return y; +} + +vfloat _SLEEF_N(xacoshf)(vfloat x) { + vfloat2 d = logk2f(dfadd2_vf2_vf2_vf(dfsqrt_vf2_vf2(dfadd2_vf2_vf2_vf(dfmul_vf2_vf_vf(x, x), vcast_vf_f(-1))), x)); + vfloat y = vadd_vf_vf_vf(d.x, d.y); + + y = vsel_vf_vm_vf_vf(vor_vm_vm_vm(visinf_vm_vf(x), visnan_vm_vf(y)), vcast_vf_f(INFINITYf), y); + + y = (vfloat)vandnot_vm_vm_vm(veq_vm_vf_vf(x, vcast_vf_f(1.0f)), (vmask)y); + + y = (vfloat)vor_vm_vm_vm(vlt_vm_vf_vf(x, vcast_vf_f(1.0f)), (vmask)y); + y = (vfloat)vor_vm_vm_vm(visnan_vm_vf(x), (vmask)y); + + return y; +} + +vfloat _SLEEF_N(xatanhf)(vfloat x) { + vfloat y = vabs_vf_vf(x); + vfloat2 d = logk2f(dfdiv_vf2_vf2_vf2(dfadd2_vf2_vf_vf(vcast_vf_f(1), y), dfadd2_vf2_vf_vf(vcast_vf_f(1), 
vneg_vf_vf(y)))); + y = (vfloat)vor_vm_vm_vm(vgt_vm_vf_vf(y, vcast_vf_f(1.0)), (vmask)vsel_vf_vm_vf_vf(veq_vm_vf_vf(y, vcast_vf_f(1.0)), vcast_vf_f(INFINITYf), vmul_vf_vf_vf(vadd_vf_vf_vf(d.x, d.y), vcast_vf_f(0.5)))); + + y = (vfloat)vor_vm_vm_vm(vor_vm_vm_vm(visinf_vm_vf(x), visnan_vm_vf(y)), (vmask)y); + y = vmulsign_vf_vf_vf(y, x); + y = (vfloat)vor_vm_vm_vm(visnan_vm_vf(x), (vmask)y); + + return y; +} + +vfloat _SLEEF_N(xexp2f)(vfloat a) { + vfloat u = expkf(dfmul_vf2_vf2_vf(vcast_vf2_vf_vf(vcast_vf_f(0.69314718246459960938f), vcast_vf_f(-1.904654323148236017e-09f)), a)); +#ifdef __ARM_NEON__ + u = vsel_vf_vm_vf_vf(vgt_vm_vf_vf(a, vcast_vf_f(127.0f)), vcast_vf_f(INFINITYf), u); +#else + u = vsel_vf_vm_vf_vf(vispinf_vm_vf(a), vcast_vf_f(INFINITYf), u); +#endif + u = (vfloat)vandnot_vm_vm_vm(visminf_vm_vf(a), (vmask)u); + return u; +} + +vfloat _SLEEF_N(xexp10f)(vfloat a) { + vfloat u = expkf(dfmul_vf2_vf2_vf(vcast_vf2_vf_vf(vcast_vf_f(2.3025851249694824219f), vcast_vf_f(-3.1975436520781386207e-08f)), a)); +#ifdef __ARM_NEON__ + u = vsel_vf_vm_vf_vf(vgt_vm_vf_vf(a, vcast_vf_f(38.0f)), vcast_vf_f(INFINITYf), u); +#else + u = vsel_vf_vm_vf_vf(vispinf_vm_vf(a), vcast_vf_f(INFINITYf), u); +#endif + u = (vfloat)vandnot_vm_vm_vm(visminf_vm_vf(a), (vmask)u); + return u; +} + +vfloat _SLEEF_N(xexpm1f)(vfloat a) { + vfloat2 d = dfadd2_vf2_vf2_vf(expk2f(vcast_vf2_vf_vf(a, vcast_vf_f(0))), vcast_vf_f(-1.0)); + vfloat x = vadd_vf_vf_vf(d.x, d.y); + x = vsel_vf_vm_vf_vf(vgt_vm_vf_vf(a, vcast_vf_f(88.0f)), vcast_vf_f(INFINITYf), x); + x = vsel_vf_vm_vf_vf(vlt_vm_vf_vf(a, vcast_vf_f(-0.15942385152878742116596338793538061065739925620174e+2f)), vcast_vf_f(-1), x); + return x; +} + +vfloat _SLEEF_N(xlog10f)(vfloat a) { + vfloat2 d = dfmul_vf2_vf2_vf2(logkf(a), vcast_vf2_vf_vf(vcast_vf_f(0.43429449200630187988f), vcast_vf_f(-1.0103050118726031315e-08f))); + vfloat x = vadd_vf_vf_vf(d.x, d.y); + + x = vsel_vf_vm_vf_vf(vispinf_vm_vf(a), vcast_vf_f(INFINITYf), x); + x = (vfloat)vor_vm_vm_vm(vgt_vm_vf_vf(vcast_vf_f(0), a), (vmask)x); + x = vsel_vf_vm_vf_vf(veq_vm_vf_vf(a, vcast_vf_f(0)), vcast_vf_f(-INFINITYf), x); + + return x; +} + +vfloat _SLEEF_N(xlog1pf)(vfloat a) { + vfloat2 d = logk2f(dfadd2_vf2_vf_vf(a, vcast_vf_f(1))); + vfloat x = vadd_vf_vf_vf(d.x, d.y); + + x = vsel_vf_vm_vf_vf(vispinf_vm_vf(a), vcast_vf_f(INFINITYf), x); + x = (vfloat)vor_vm_vm_vm(vgt_vm_vf_vf(vcast_vf_f(-1), a), (vmask)x); + x = vsel_vf_vm_vf_vf(veq_vm_vf_vf(a, vcast_vf_f(-1)), vcast_vf_f(-INFINITYf), x); + + return x; +} Index: sleef/lib/sse2.h =================================================================== --- /dev/null +++ sleef/lib/sse2.h @@ -0,0 +1,257 @@ +/*===---------- sse2.h - SSE2 functions ------------------------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __SSE2__ +#error Please specify -msse2. +#endif + +#include + +typedef __m128d vdouble; +typedef __m128i vint; +typedef __m128i vmask; + +typedef __m128 vfloat; +typedef __m128i vint2; + +// + +static inline vint vrint_vi_vd(vdouble vd) { return _mm_cvtpd_epi32(vd); } +static inline vint vtruncate_vi_vd(vdouble vd) { return _mm_cvttpd_epi32(vd); } +static inline vdouble vcast_vd_vi(vint vi) { return _mm_cvtepi32_pd(vi); } +static inline vdouble vcast_vd_d(double d) { return _mm_set_pd(d, d); } +static inline vint vcast_vi_i(int i) { return _mm_set_epi32(0, 0, i, i); } + +static inline vmask vreinterpret_vm_vd(vdouble vd) { return (__m128i)vd; } +static inline vdouble vreinterpret_vd_vm(vint vm) { return (__m128d)vm; } + +static inline vmask vreinterpret_vm_vf(vfloat vf) { return (__m128i)vf; } +static inline vfloat vreinterpret_vf_vm(vmask vm) { return (__m128)vm; } + +// + +static inline vint vadd_vi_vi_vi(vint x, vint y) { return _mm_add_epi32(x, y); } +static inline vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(x, y); } +static inline vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); } + +static inline vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(x, y); } +static inline vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_si128(x, y); } +static inline vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x, y); } +static inline vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(x, y); } + +static inline vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32(x, c); } +static inline vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32(x, c); } +static inline vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32(x, c); } + +// + +static inline vmask vand_vm_vm_vm(vmask x, vmask y) { return _mm_and_si128(x, y); } +static inline vmask vandnot_vm_vm_vm(vmask x, vmask y) { return _mm_andnot_si128(x, y); } +static inline vmask vor_vm_vm_vm(vmask x, vmask y) { return _mm_or_si128(x, y); } +static inline vmask vxor_vm_vm_vm(vmask x, vmask y) { return _mm_xor_si128(x, y); } + +static inline vmask veq_vm_vd_vd(vdouble x, vdouble y) { return (__m128i)_mm_cmpeq_pd(x, y); } +static inline vmask vneq_vm_vd_vd(vdouble x, vdouble y) { return (__m128i)_mm_cmpneq_pd(x, y); } +static inline vmask vlt_vm_vd_vd(vdouble x, vdouble y) { return (__m128i)_mm_cmplt_pd(x, y); } +static inline vmask vle_vm_vd_vd(vdouble x, vdouble y) { return (__m128i)_mm_cmple_pd(x, y); } +static inline vmask vgt_vm_vd_vd(vdouble x, vdouble y) { return (__m128i)_mm_cmpgt_pd(x, y); } +static inline vmask vge_vm_vd_vd(vdouble x, vdouble y) { return (__m128i)_mm_cmpge_pd(x, y); } + +static inline vmask veq_vm_vf_vf(vfloat x, vfloat y) { return (__m128i)_mm_cmpeq_ps(x, y); } +static inline vmask vneq_vm_vf_vf(vfloat x, vfloat y) { return (__m128i)_mm_cmpneq_ps(x, y); } +static inline vmask vlt_vm_vf_vf(vfloat x, vfloat y) { return (__m128i)_mm_cmplt_ps(x, y); } +static inline vmask 
vle_vm_vf_vf(vfloat x, vfloat y) { return (__m128i)_mm_cmple_ps(x, y); } +static inline vmask vgt_vm_vf_vf(vfloat x, vfloat y) { return (__m128i)_mm_cmpgt_ps(x, y); } +static inline vmask vge_vm_vf_vf(vfloat x, vfloat y) { return (__m128i)_mm_cmpge_ps(x, y); } + +// + +static inline vfloat vcast_vf_f(float f) { return _mm_set_ps(f, f, f, f); } + +static inline vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm_add_ps(x, y); } +static inline vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm_sub_ps(x, y); } +static inline vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm_mul_ps(x, y); } +static inline vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm_div_ps(x, y); } +static inline vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); } +static inline vfloat vsqrt_vf_vf(vfloat x) { return _mm_sqrt_ps(x); } +static inline vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm_max_ps(x, y); } +static inline vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm_min_ps(x, y); } + +static inline vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); } +static inline vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); } +static inline vfloat vabs_vf_vf(vfloat f) { return (vfloat)vandnot_vm_vm_vm((vmask)vcast_vf_f(-0.0f), (vmask)f); } +static inline vfloat vneg_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)vcast_vf_f(-0.0f), (vmask)d); } + +// + +static inline vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm_add_pd(x, y); } +static inline vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm_sub_pd(x, y); } +static inline vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm_mul_pd(x, y); } +static inline vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm_div_pd(x, y); } +static inline vdouble vrec_vd_vd(vdouble x) { return _mm_div_pd(_mm_set_pd(1, 1), x); } +static inline vdouble vsqrt_vd_vd(vdouble x) { return _mm_sqrt_pd(x); } +static inline vdouble vabs_vd_vd(vdouble d) { return (__m128d)_mm_andnot_pd(_mm_set_pd(-0.0,-0.0), d); } +static inline vdouble vneg_vd_vd(vdouble d) { return (__m128d)_mm_xor_pd(_mm_set_pd(-0.0,-0.0), d); } +static inline vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); } +static inline vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); } + +static inline vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm_max_pd(x, y); } +static inline vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm_min_pd(x, y); } + +// + +static inline vmask veq_vm_vi_vi(vint x, vint y) { + __m128 s = (__m128)_mm_cmpeq_epi32(x, y); + return (__m128i)_mm_shuffle_ps(s, s, _MM_SHUFFLE(1, 1, 0, 0)); +} + +static inline vdouble vsel_vd_vm_vd_vd(vmask mask, vdouble x, vdouble y) { + return (__m128d)vor_vm_vm_vm(vand_vm_vm_vm(mask, (__m128i)x), vandnot_vm_vm_vm(mask, (__m128i)y)); +} + +static inline vfloat vsel_vf_vm_vf_vf(vmask mask, vfloat x, vfloat y) { + return (vfloat)vor_vm_vm_vm(vand_vm_vm_vm(mask, (vmask)x), vandnot_vm_vm_vm(mask, (vmask)y)); +} + +static inline vint vsel_vi_vd_vd_vi_vi(vdouble d0, vdouble d1, vint x, vint y) { + vmask mask = (vmask)_mm_cmpeq_ps(_mm_cvtpd_ps((vdouble)vlt_vm_vd_vd(d0, d1)), _mm_set_ps(0, 0, 0, 0)); + return vor_vi_vi_vi(vandnot_vi_vi_vi(mask, x), vand_vi_vi_vi(mask, y)); +} + +// + +static inline vint2 vcast_vi2_vm(vmask vm) { return (vint2)vm; } +static inline vmask vcast_vm_vi2(vint2 
vi) { return (vmask)vi; } + +static inline vint2 vrint_vi2_vf(vfloat vf) { return _mm_cvtps_epi32(vf); } +static inline vint2 vtruncate_vi2_vf(vfloat vf) { return _mm_cvttps_epi32(vf); } +static inline vfloat vcast_vf_vi2(vint2 vi) { return _mm_cvtepi32_ps(vcast_vm_vi2(vi)); } +static inline vint2 vcast_vi2_i(int i) { return _mm_set_epi32(i, i, i, i); } + +static inline vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return vadd_vi_vi_vi(x, y); } +static inline vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return vsub_vi_vi_vi(x, y); } +static inline vint vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcast_vi2_i(0), e); } + +static inline vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return vand_vi_vi_vi(x, y); } +static inline vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return vandnot_vi_vi_vi(x, y); } +static inline vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return vor_vi_vi_vi(x, y); } +static inline vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return vxor_vi_vi_vi(x, y); } + +static inline vint2 vsll_vi2_vi2_i(vint2 x, int c) { return vsll_vi_vi_i(x, c); } +static inline vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return vsrl_vi_vi_i(x, c); } +static inline vint2 vsra_vi2_vi2_i(vint2 x, int c) { return vsra_vi_vi_i(x, c); } + +static inline vmask veq_vm_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpeq_epi32(x, y); } +static inline vmask vgt_vm_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpgt_epi32(x, y); } +static inline vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpgt_epi32(x, y); } +static inline vint2 vsel_vi2_vm_vi2_vi2(vmask m, vint2 x, vint2 y) { return vor_vm_vm_vm(vand_vm_vm_vm(m, x), vandnot_vm_vm_vm(m, y)); } + +// + +static inline double vcast_d_vd(vdouble v) { + double s[2]; + _mm_storeu_pd(s, v); + return s[0]; +} + +static inline float vcast_f_vf(vfloat v) { + float s[4]; + _mm_storeu_ps(s, v); + return s[0]; +} + +static inline vmask vsignbit_vm_vd(vdouble d) { + return _mm_and_si128((__m128i)d, _mm_set_epi32(0x80000000, 0x0, 0x80000000, 0x0)); +} + +static inline vdouble vsign_vd_vd(vdouble d) { + return (__m128d)_mm_or_si128((__m128i)_mm_set_pd(1, 1), _mm_and_si128((__m128i)d, _mm_set_epi32(0x80000000, 0x0, 0x80000000, 0x0))); +} + +static inline vdouble vmulsign_vd_vd_vd(vdouble x, vdouble y) { + return (__m128d)vxor_vi_vi_vi((__m128i)x, vsignbit_vm_vd(y)); +} + +static inline vmask visinf_vm_vd(vdouble d) { + return (vmask)_mm_cmpeq_pd(vabs_vd_vd(d), _mm_set_pd(INFINITY, INFINITY)); +} + +static inline vmask vispinf_vm_vd(vdouble d) { + return (vmask)_mm_cmpeq_pd(d, _mm_set_pd(INFINITY, INFINITY)); +} + +static inline vmask visminf_vm_vd(vdouble d) { + return (vmask)_mm_cmpeq_pd(d, _mm_set_pd(-INFINITY, -INFINITY)); +} + +static inline vmask visnan_vm_vd(vdouble d) { + return (vmask)_mm_cmpneq_pd(d, d); +} + +static inline vdouble visinf(vdouble d) { + return (__m128d)_mm_and_si128(visinf_vm_vd(d), _mm_or_si128(vsignbit_vm_vd(d), (__m128i)_mm_set_pd(1, 1))); +} + +static inline vdouble visinf2(vdouble d, vdouble m) { + return (__m128d)_mm_and_si128(visinf_vm_vd(d), _mm_or_si128(vsignbit_vm_vd(d), (__m128i)m)); +} + +// + +static inline vdouble vpow2i_vd_vi(vint q) { + q = _mm_add_epi32(_mm_set_epi32(0x0, 0x0, 0x3ff, 0x3ff), q); + q = (__m128i)_mm_shuffle_ps((__m128)q, (__m128)q, _MM_SHUFFLE(1,3,0,3)); + return (__m128d)_mm_slli_epi32(q, 20); +} + +static inline vdouble vldexp_vd_vd_vi(vdouble x, vint q) { + vint m = _mm_srai_epi32(q, 31); + m = _mm_slli_epi32(_mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(m, q), 9), m), 7); + q = _mm_sub_epi32(q, _mm_slli_epi32(m, 2)); + m = 
_mm_add_epi32(_mm_set_epi32(0x0, 0x0, 0x3ff, 0x3ff), m); + m = _mm_andnot_si128(_mm_cmplt_epi32(m, _mm_set_epi32(0, 0, 0, 0)), m); + vint n = _mm_cmpgt_epi32(m, _mm_set_epi32(0x0, 0x0, 0x7ff, 0x7ff)); + m = _mm_or_si128(_mm_andnot_si128(n, m), _mm_and_si128(n, _mm_set_epi32(0x0, 0x0, 0x7ff, 0x7ff))); + m = (__m128i)_mm_shuffle_ps((__m128)m, (__m128)m, _MM_SHUFFLE(1,3,0,3)); + vdouble y = (__m128d)_mm_slli_epi32(m, 20); + return vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(x, y), y), y), y), vpow2i_vd_vi(q)); +} + +static inline vint vilogbp1_vi_vd(vdouble d) { + vint m = vlt_vm_vd_vd(d, vcast_vd_d(4.9090934652977266E-91)); + d = vsel_vd_vm_vd_vd(m, vmul_vd_vd_vd(vcast_vd_d(2.037035976334486E90), d), d); + __m128i q = _mm_and_si128((__m128i)d, _mm_set_epi32(((1 << 12)-1) << 20, 0, ((1 << 12)-1) << 20, 0)); + q = _mm_srli_epi32(q, 20); + q = vor_vm_vm_vm(vand_vm_vm_vm (m, _mm_sub_epi32(q, _mm_set_epi32(300 + 0x3fe, 0, 300 + 0x3fe, 0))), + vandnot_vm_vm_vm(m, _mm_sub_epi32(q, _mm_set_epi32( 0x3fe, 0, 0x3fe, 0)))); + q = (__m128i)_mm_shuffle_ps((__m128)q, (__m128)q, _MM_SHUFFLE(0,0,3,1)); + return q; +} + +static inline vdouble vupper_vd_vd(vdouble d) { + return (__m128d)_mm_and_si128((__m128i)d, _mm_set_epi32(0xffffffff, 0xf8000000, 0xffffffff, 0xf8000000)); +} + +static inline vfloat vupper_vf_vf(vfloat d) { + return (__m128)_mm_and_si128((__m128i)d, _mm_set_epi32(0xfffff000, 0xfffff000, 0xfffff000, 0xfffff000)); +}
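Note (commentary, not part of the patch): the vpow2i_vd_vi/vldexp_vd_vd_vi helpers that close this header rely on the IEEE-754 binary64 layout. For q in the normal exponent range, 2^q has a zero sign bit, a zero mantissa, and a biased exponent of q + 1023; the vector code builds exactly that by shuffling q + 0x3ff into the high 32-bit lane of each double and shifting it left by 20. A minimal scalar sketch of the same trick (sketch_pow2i/sketch_ldexp are illustrative names, not in the patch):

#include <stdint.h>
#include <string.h>

/* 2^q for q in roughly [-1022, 1023]: only the biased exponent field is set. */
static double sketch_pow2i(int q) {
  uint64_t bits = (uint64_t)(q + 1023) << 52;  /* sign = 0, mantissa = 0 */
  double d;
  memcpy(&d, &bits, sizeof d);                 /* reinterpret the bit pattern */
  return d;
}

/* ldexp(x, q) then reduces to a multiply by that constant. */
static double sketch_ldexp(double x, int q) { return x * sketch_pow2i(q); }

The vector version above additionally clamps the exponent field and splits q across several multiplications, so out-of-range q values lose precision gracefully instead of producing a malformed exponent.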
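For context on the exp-family kernels earlier in the patch (expkf, expk2f, and the functions built on them): they reduce the argument as d ≈ q·ln2 + s with q an integer, evaluate a short polynomial on the small remainder s using double-float ("two-float") arithmetic for extra precision, and finally rescale by 2^q through the vldexp helpers shown in this header. A rough scalar equivalent, assuming plain float precision and a simple Taylor polynomial rather than the tuned coefficients used above (sketch_expf is an illustrative name, not in the patch):

#include <math.h>

static const float kLn2 = 0.693147180559945f;  /* ln 2, cf. R_LN2f / L2Uf+L2Lf */

static float sketch_expf(float d) {
  int q = (int)lrintf(d / kLn2);    /* nearest integer multiple of ln 2 */
  float s = d - q * kLn2;           /* reduced argument, |s| <= ln2/2 */
  /* exp(s) via a short Taylor series on the reduced interval. */
  float u = 1.0f + s*(1.0f + s*(1.0f/2 + s*(1.0f/6 + s*(1.0f/24 + s*(1.0f/120)))));
  return ldexpf(u, q);              /* cf. vldexp_vf_vf_vi2 / dfscale_vf2_vf2_vf */
}

The real kernels do the reduction with a split constant (L2Uf + L2Lf) and keep the intermediate result as a vfloat2 head/tail pair, which is what lets xexpm1f, xsinhf, and the other functions built on expk2f stay accurate for arguments near zero.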