I have added 2 missing intrinsics _cvtsh_ss and _cvtss_sh to the intrinsics header f16cintrin.h.
GCC has these intrinsics in f16cintrin.h. Here is the definition:
/* Convert the half-precision (16-bit) float __S to a single-precision
   float: widen __S into lane 0 of an 8 x i16 vector, convert with
   VCVTPH2PS, and extract lane 0 of the resulting 4 x float vector.  */
extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_cvtsh_ss (unsigned short __S)
{
  __v8hi __H = __extension__ (__v8hi){ __S, 0, 0, 0, 0, 0, 0, 0 };
  __v4sf __A = __builtin_ia32_vcvtph2ps (__H);
  return __builtin_ia32_vec_ext_v4sf (__A, 0);
}
#ifdef __OPTIMIZE__
/* Convert the single-precision float __F to a half-precision (16-bit)
   float using rounding-control __I.  __I must be a compile-time
   constant for VCVTPS2PH, hence the macro fallback below when inlining
   is not guaranteed.  */
extern __inline unsigned short __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_cvtss_sh (float __F, const int __I)
{
  __v4sf __A = __extension__ (__v4sf){ __F, 0, 0, 0 };
  __v8hi __H = __builtin_ia32_vcvtps2ph (__A, __I);
  return (unsigned short) __builtin_ia32_vec_ext_v8hi (__H, 0);
}
#else
/* Non-optimized build: a statement-expression macro keeps __I a literal
   constant at the builtin call site.  */
#define _cvtss_sh(__F, __I)                                             \
  (__extension__                                                        \
   ({                                                                   \
      __v4sf __A = __extension__ (__v4sf){ __F, 0, 0, 0 };              \
      __v8hi __H = __builtin_ia32_vcvtps2ph (__A, __I);                 \
      (unsigned short) __builtin_ia32_vec_ext_v8hi (__H, 0);            \
    }))
#endif /* __OPTIMIZE__ */

Intel's documentation expects _cvtsh_ss to have 2 parameters (instead of one):
https://software.intel.com/en-us/node/524287
but most likely the documentation is wrong, because Intel's headers contain these intrinsic prototypes in emmintrin.h:
extern float __ICL_INTRINCC _cvtsh_ss(unsigned short); extern unsigned short __ICL_INTRINCC _cvtss_sh(float, int);
BTW, emmintrin.h includes f16cintrin.h, so it should be OK to place these 2 intrinsics in f16cintrin.h. This should satisfy both Intel's and GCC's expectations.
Clang generates the following IR for _cvtsh_ss (see below). In the test I simply checked that the builtin @llvm.x86.vcvtph2ps.128 is generated for _cvtsh_ss. I was afraid that checks for initialization of the vector 'v' might be too lengthy and the IR is prone to change frequently, so I didn't add these checks. However, if you think that this is important and it won't create too much headache because IR will keep changing over time, I could certainly add them.
The same goes for test_cvtss_sh.
/* Clang's implementation: widen the 16-bit input into lane 0 of an
   8 x i16 vector (remaining lanes zero), convert with VCVTPH2PS, and
   return lane 0 of the 4 x float result.
   NOTE(review): spelled "cvtsh_ss" here without the leading underscore,
   while the surrounding text uses "_cvtsh_ss" — presumably the same
   function; confirm against the actual header.  */
static __inline float __DEFAULT_FN_ATTRS
cvtsh_ss(unsigned short a)
{
  __v8hi v = {(short)a, 0, 0, 0, 0, 0, 0, 0};
  __v4sf r = __builtin_ia32_vcvtph2ps(v);
  return r[0];
}

define float @test_cvtsh_ss(i16 zeroext %a) #0 {
entry:
  %a.addr.i = alloca i16, align 2
  %v.i = alloca <8 x i16>, align 16
  %r.i = alloca <4 x float>, align 16
  %a.addr = alloca i16, align 2
  store i16 %a, i16* %a.addr, align 2
  %0 = load i16, i16* %a.addr, align 2
  store i16 %0, i16* %a.addr.i, align 2
  %1 = load i16, i16* %a.addr.i, align 2
  %vecinit.i = insertelement <8 x i16> undef, i16 %1, i32 0
  %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 0, i32 1
  %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 0, i32 2
  %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 0, i32 3
  %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 0, i32 4
  %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 0, i32 5
  %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 0, i32 6
  %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 0, i32 7
  store <8 x i16> %vecinit7.i, <8 x i16>* %v.i, align 16
  %2 = load <8 x i16>, <8 x i16>* %v.i, align 16
  %3 = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %2) #2
  store <4 x float> %3, <4 x float>* %r.i, align 16
  %4 = load <4 x float>, <4 x float>* %r.i, align 16
  %vecext.i = extractelement <4 x float> %4, i32 0
  ret float %vecext.i
}