I have added the 2 missing intrinsics _cvtsh_ss and _cvtss_sh to the intrinsics header f16cintrin.h.
GCC also has these intrinsics in f16cintrin.h. Here are its definitions:
extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_cvtsh_ss (unsigned short __S)
{
__v8hi __H = __extension__ (__v8hi){ __S, 0, 0, 0, 0, 0, 0, 0 };
__v4sf __A = __builtin_ia32_vcvtph2ps (__H);
return __builtin_ia32_vec_ext_v4sf (__A, 0);
}
#ifdef __OPTIMIZE__
extern __inline unsigned short __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_cvtss_sh (float __F, const int __I)
{
__v4sf __A = __extension__ (__v4sf){ __F, 0, 0, 0 };
__v8hi __H = __builtin_ia32_vcvtps2ph (__A, __I);
return (unsigned short) __builtin_ia32_vec_ext_v8hi (__H, 0);
}
#else
#define _cvtss_sh(__F, __I) \
(__extension__ \
({ \
__v4sf __A = __extension__ (__v4sf){ __F, 0, 0, 0 }; \
__v8hi __H = __builtin_ia32_vcvtps2ph (__A, __I); \
(unsigned short) __builtin_ia32_vec_ext_v8hi (__H, 0); \
}))
#endif /* __OPTIMIZE__ */

Intel's documentation expects _cvtsh_ss to have 2 parameters (instead of one)
https://software.intel.com/en-us/node/524287
but most likely the documentation is wrong, because Intel's headers contain these intrinsic prototypes in emmintrin.h:
extern float __ICL_INTRINCC _cvtsh_ss(unsigned short);
extern unsigned short __ICL_INTRINCC _cvtss_sh(float, int);
BTW, emmintrin.h includes f16cintrin.h, so it should be OK to place these 2 intrinsics in f16cintrin.h. This should satisfy both Intel's and GCC's expectations.
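For reference, here is a minimal usage sketch of the two intrinsics, assuming an F16C-capable target (compile with -mf16c); the round-trip program itself is only an illustration, not part of the patch:

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
  float f = 1.5f;
  /* Scalar float -> half conversion; the second argument selects the
     rounding mode (here: round to nearest, suppress exceptions). */
  unsigned short h = _cvtss_sh(f, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
  /* Scalar half -> float conversion; takes only the half value,
     matching the one-parameter prototype from Intel's emmintrin.h. */
  float g = _cvtsh_ss(h);
  printf("%f -> 0x%04x -> %f\n", f, h, g);
  return 0;
}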
Clang generates the following IR for _cvtsh_ss (see below). In the test I simply check that the builtin @llvm.x86.vcvtph2ps.128 is generated for _cvtsh_ss. I was afraid that checks for the initialization of the vector 'v' would be too lengthy and that this part of the IR is prone to change frequently, so I didn't add them. However, if you think these checks are important and won't create too much maintenance headache as the IR changes over time, I can certainly add them.
The same goes for test_cvtss_sh.
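To make the test strategy concrete, here is a rough sketch of what the codegen checks could look like; the RUN line and the exact CHECK patterns are my assumptions about the test setup, not the committed test:

// RUN: %clang_cc1 -ffreestanding -triple x86_64-unknown-unknown -target-feature +f16c -emit-llvm -o - %s | FileCheck %s

#include <immintrin.h>

float test_cvtsh_ss(unsigned short a) {
  // CHECK-LABEL: test_cvtsh_ss
  // CHECK: call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %{{.*}})
  return _cvtsh_ss(a);
}

unsigned short test_cvtss_sh(float a) {
  // CHECK-LABEL: test_cvtss_sh
  // CHECK: call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %{{.*}}, i32 0)
  return _cvtss_sh(a, 0);
}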
static __inline float __DEFAULT_FN_ATTRS
_cvtsh_ss(unsigned short a)
{
__v8hi v = {(short)a, 0, 0, 0, 0, 0, 0, 0};
__v4sf r = __builtin_ia32_vcvtph2ps(v);
return r[0];
}

define float @test_cvtsh_ss(i16 zeroext %a) #0 {
entry:
%a.addr.i = alloca i16, align 2
%v.i = alloca <8 x i16>, align 16
%r.i = alloca <4 x float>, align 16
%a.addr = alloca i16, align 2
store i16 %a, i16* %a.addr, align 2
%0 = load i16, i16* %a.addr, align 2
store i16 %0, i16* %a.addr.i, align 2
%1 = load i16, i16* %a.addr.i, align 2
%vecinit.i = insertelement <8 x i16> undef, i16 %1, i32 0
%vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 0, i32 1
%vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 0, i32 2
%vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 0, i32 3
%vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 0, i32 4
%vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 0, i32 5
%vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 0, i32 6
%vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 0, i32 7
store <8 x i16> %vecinit7.i, <8 x i16>* %v.i, align 16
%2 = load <8 x i16>, <8 x i16>* %v.i, align 16
%3 = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %2) #2
store <4 x float> %3, <4 x float>* %r.i, align 16
%4 = load <4 x float>, <4 x float>* %r.i, align 16
%vecext.i = extractelement <4 x float> %4, i32 0
ret float %vecext.i
}