Index: clang/include/clang/Basic/BuiltinsNEON.def =================================================================== --- clang/include/clang/Basic/BuiltinsNEON.def +++ clang/include/clang/Basic/BuiltinsNEON.def @@ -16,6 +16,7 @@ #define GET_NEON_BUILTINS #include "clang/Basic/arm_neon.inc" +#include "clang/Basic/arm_fp16.inc" #undef GET_NEON_BUILTINS #undef BUILTIN Index: clang/include/clang/Basic/CMakeLists.txt =================================================================== --- clang/include/clang/Basic/CMakeLists.txt +++ clang/include/clang/Basic/CMakeLists.txt @@ -46,3 +46,7 @@ -I ${CMAKE_CURRENT_SOURCE_DIR}/../../ SOURCE arm_neon.td TARGET ClangARMNeon) +clang_tablegen(arm_fp16.inc -gen-arm-neon-sema + -I ${CMAKE_CURRENT_SOURCE_DIR}/../../ + SOURCE arm_fp16.td + TARGET ClangARMFP16) Index: clang/include/clang/Basic/arm_fp16.td =================================================================== --- /dev/null +++ clang/include/clang/Basic/arm_fp16.td @@ -0,0 +1,131 @@ +//===--- arm_fp16.td - ARM FP16 compiler interface ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the TableGen definitions from which the ARM FP16 header +// file will be generated. +// +//===----------------------------------------------------------------------===// + +include "arm_neon_incl.td" + +// ARMv8.2-A FP16 intrinsics. +let ArchGuard = "defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) && defined(__aarch64__)" in { + + // Negate + def VNEGSH : SInst<"vneg", "ss", "Sh">; + + // Reciprocal/Sqrt + def SCALAR_FRECPSH : IInst<"vrecps", "sss", "Sh">; + def FSQRTSH : SInst<"vsqrt", "ss", "Sh">; + def SCALAR_FRSQRTSH : IInst<"vrsqrts", "sss", "Sh">; + + // Reciprocal Estimate + def SCALAR_FRECPEH : IInst<"vrecpe", "ss", "Sh">; + + // Reciprocal Exponent + def SCALAR_FRECPXH : IInst<"vrecpx", "ss", "Sh">; + + // Reciprocal Square Root Estimate + def SCALAR_FRSQRTEH : IInst<"vrsqrte", "ss", "Sh">; + + // Rounding + def FRINTZ_S64H : SInst<"vrnd", "ss", "Sh">; + def FRINTA_S64H : SInst<"vrnda", "ss", "Sh">; + def FRINTI_S64H : SInst<"vrndi", "ss", "Sh">; + def FRINTM_S64H : SInst<"vrndm", "ss", "Sh">; + def FRINTN_S64H : SInst<"vrndn", "ss", "Sh">; + def FRINTP_S64H : SInst<"vrndp", "ss", "Sh">; + def FRINTX_S64H : SInst<"vrndx", "ss", "Sh">; + + // Conversion + def SCALAR_SCVTFSH : SInst<"vcvth_f16", "Ys", "silUsUiUl">; + def SCALAR_FCVTZSH : SInst<"vcvt_s16", "$s", "Sh">; + def SCALAR_FCVTZSH1 : SInst<"vcvt_s32", "Is", "Sh">; + def SCALAR_FCVTZSH2 : SInst<"vcvt_s64", "Ls", "Sh">; + def SCALAR_FCVTZUH : SInst<"vcvt_u16", "bs", "Sh">; + def SCALAR_FCVTZUH1 : SInst<"vcvt_u32", "Us", "Sh">; + def SCALAR_FCVTZUH2 : SInst<"vcvt_u64", "Os", "Sh">; + def SCALAR_FCVTASH : SInst<"vcvta_s16", "$s", "Sh">; + def SCALAR_FCVTASH1 : SInst<"vcvta_s32", "Is", "Sh">; + def SCALAR_FCVTASH2 : SInst<"vcvta_s64", "Ls", "Sh">; + def SCALAR_FCVTAUH : SInst<"vcvta_u16", "bs", "Sh">; + def SCALAR_FCVTAUH1 : SInst<"vcvta_u32", "Us", "Sh">; + def SCALAR_FCVTAUH2 : SInst<"vcvta_u64", "Os", "Sh">; + def SCALAR_FCVTMSH : SInst<"vcvtm_s16", "$s", "Sh">; + def SCALAR_FCVTMSH1 : SInst<"vcvtm_s32", "Is", "Sh">; + def SCALAR_FCVTMSH2 : SInst<"vcvtm_s64", "Ls", "Sh">; + def SCALAR_FCVTMUH : SInst<"vcvtm_u16", "bs", "Sh">; + def SCALAR_FCVTMUH1 : SInst<"vcvtm_u32", "Us", "Sh">; + def SCALAR_FCVTMUH2 : 
SInst<"vcvtm_u64", "Os", "Sh">; + def SCALAR_FCVTNSH : SInst<"vcvtn_s16", "$s", "Sh">; + def SCALAR_FCVTNSH1 : SInst<"vcvtn_s32", "Is", "Sh">; + def SCALAR_FCVTNSH2 : SInst<"vcvtn_s64", "Ls", "Sh">; + def SCALAR_FCVTNUH : SInst<"vcvtn_u16", "bs", "Sh">; + def SCALAR_FCVTNUH1 : SInst<"vcvtn_u32", "Us", "Sh">; + def SCALAR_FCVTNUH2 : SInst<"vcvtn_u64", "Os", "Sh">; + def SCALAR_FCVTPSH : SInst<"vcvtp_s16", "$s", "Sh">; + def SCALAR_FCVTPSH1 : SInst<"vcvtp_s32", "Is", "Sh">; + def SCALAR_FCVTPSH2 : SInst<"vcvtp_s64", "Ls", "Sh">; + def SCALAR_FCVTPUH : SInst<"vcvtp_u16", "bs", "Sh">; + def SCALAR_FCVTPUH1 : SInst<"vcvtp_u32", "Us", "Sh">; + def SCALAR_FCVTPUH2 : SInst<"vcvtp_u64", "Os", "Sh">; + + def SCALAR_SCVTFSHO : SInst<"vcvth_n_f16", "Ysi", "silUsUiUl">; + def SCALAR_FCVTZSHO : SInst<"vcvt_n_s16", "$si", "Sh">; + def SCALAR_FCVTZSH1O: SInst<"vcvt_n_s32", "Isi", "Sh">; + def SCALAR_FCVTZSH2O: SInst<"vcvt_n_s64", "Lsi", "Sh">; + def SCALAR_FCVTZUHO : SInst<"vcvt_n_u16", "bsi", "Sh">; + def SCALAR_FCVTZUH1O: SInst<"vcvt_n_u32", "Usi", "Sh">; + def SCALAR_FCVTZUH2O: SInst<"vcvt_n_u64", "Osi", "Sh">; + + // Comparison + def SCALAR_CMEQRH : SInst<"vceq", "bss", "Sh">; + def SCALAR_CMEQZH : SInst<"vceqz", "bs", "Sh">; + def SCALAR_CMGERH : SInst<"vcge", "bss", "Sh">; + def SCALAR_CMGEZH : SInst<"vcgez", "bs", "Sh">; + def SCALAR_CMGTRH : SInst<"vcgt", "bss", "Sh">; + def SCALAR_CMGTZH : SInst<"vcgtz", "bs", "Sh">; + def SCALAR_CMLERH : SInst<"vcle", "bss", "Sh">; + def SCALAR_CMLEZH : SInst<"vclez", "bs", "Sh">; + def SCALAR_CMLTH : SInst<"vclt", "bss", "Sh">; + def SCALAR_CMLTZH : SInst<"vcltz", "bs", "Sh">; + + // Absolute Compare Mask Greater Than Or Equal + def SCALAR_FACGEH : IInst<"vcage", "bss", "Sh">; + def SCALAR_FACLEH : IInst<"vcale", "bss", "Sh">; + + // Absolute Compare Mask Greater Than + def SCALAR_FACGT : IInst<"vcagt", "bss", "Sh">; + def SCALAR_FACLT : IInst<"vcalt", "bss", "Sh">; + + // Scalar Absolute Value + def SCALAR_ABSH : SInst<"vabs", "ss", "Sh">; + + // Scalar Absolute Difference + def SCALAR_ABDH: IInst<"vabd", "sss", "Sh">; + + // Add/Sub + def VADDSH : SInst<"vadd", "sss", "Sh">; + def VSUBHS : SInst<"vsub", "sss", "Sh">; + + // Max/Min + def VMAXHS : SInst<"vmax", "sss", "Sh">; + def VMINHS : SInst<"vmin", "sss", "Sh">; + def FMAXNMHS : SInst<"vmaxnm", "sss", "Sh">; + def FMINNMHS : SInst<"vminnm", "sss", "Sh">; + + // Multiplication/Division + def VMULHS : SInst<"vmul", "sss", "Sh">; + def MULXHS : SInst<"vmulx", "sss", "Sh">; + def FDIVHS : SInst<"vdiv", "sss", "Sh">; + + // Vector fused multiply-add operations + def VFMAHS : SInst<"vfma", "ssss", "Sh">; + def VFMSHS : SInst<"vfms", "ssss", "Sh">; +} Index: clang/include/clang/Basic/arm_neon.td =================================================================== --- clang/include/clang/Basic/arm_neon.td +++ clang/include/clang/Basic/arm_neon.td @@ -11,309 +11,8 @@ // file will be generated. See ARM document DUI0348B. // //===----------------------------------------------------------------------===// -// -// Each intrinsic is a subclass of the Inst class. An intrinsic can either -// generate a __builtin_* call or it can expand to a set of generic operations. -// -// The operations are subclasses of Operation providing a list of DAGs, the -// last of which is the return value. The available DAG nodes are documented -// below. -// -//===----------------------------------------------------------------------===// - -// The base Operation class. All operations must subclass this. 
-class Operation ops=[]> { - list Ops = ops; - bit Unavailable = 0; -} -// An operation that only contains a single DAG. -class Op : Operation<[op]>; -// A shorter version of Operation - takes a list of DAGs. The last of these will -// be the return value. -class LOp ops> : Operation; - -// These defs and classes are used internally to implement the SetTheory -// expansion and should be ignored. -foreach Index = 0-63 in - def sv##Index; -class MaskExpand; - -//===----------------------------------------------------------------------===// -// Available operations -//===----------------------------------------------------------------------===// - -// DAG arguments can either be operations (documented below) or variables. -// Variables are prefixed with '$'. There are variables for each input argument, -// with the name $pN, where N starts at zero. So the zero'th argument will be -// $p0, the first $p1 etc. - -// op - Binary or unary operator, depending on the number of arguments. The -// operator itself is just treated as a raw string and is not checked. -// example: (op "+", $p0, $p1) -> "__p0 + __p1". -// (op "-", $p0) -> "-__p0" -def op; -// call - Invoke another intrinsic. The input types are type checked and -// disambiguated. If there is no intrinsic defined that takes -// the given types (or if there is a type ambiguity) an error is -// generated at tblgen time. The name of the intrinsic is the raw -// name as given to the Inst class (not mangled). -// example: (call "vget_high", $p0) -> "vgetq_high_s16(__p0)" -// (assuming $p0 has type int16x8_t). -def call; -// cast - Perform a cast to a different type. This gets emitted as a static -// C-style cast. For a pure reinterpret cast (T x = *(T*)&y), use -// "bitcast". -// -// The syntax is (cast MOD* VAL). The last argument is the value to -// cast, preceded by a sequence of type modifiers. The target type -// starts off as the type of VAL, and is modified by MOD in sequence. -// The available modifiers are: -// - $X - Take the type of parameter/variable X. For example: -// (cast $p0, $p1) would cast $p1 to the type of $p0. -// - "R" - The type of the return type. -// - A typedef string - A NEON or stdint.h type that is then parsed. -// for example: (cast "uint32x4_t", $p0). -// - "U" - Make the type unsigned. -// - "S" - Make the type signed. -// - "H" - Halve the number of lanes in the type. -// - "D" - Double the number of lanes in the type. -// - "8" - Convert type to an equivalent vector of 8-bit signed -// integers. -// example: (cast "R", "U", $p0) -> "(uint32x4_t)__p0" (assuming the return -// value is of type "int32x4_t". -// (cast $p0, "D", "8", $p1) -> "(int8x16_t)__p1" (assuming __p0 -// has type float64x1_t or any other vector type of 64 bits). -// (cast "int32_t", $p2) -> "(int32_t)__p2" -def cast; -// bitcast - Same as "cast", except a reinterpret-cast is produced: -// (bitcast "T", $p0) -> "*(T*)&__p0". -// The VAL argument is saved to a temporary so it can be used -// as an l-value. -def bitcast; -// dup - Take a scalar argument and create a vector by duplicating it into -// all lanes. The type of the vector is the base type of the intrinsic. -// example: (dup $p1) -> "(uint32x2_t) {__p1, __p1}" (assuming the base type -// is uint32x2_t). -def dup; -// splat - Take a vector and a lane index, and return a vector of the same type -// containing repeated instances of the source vector at the lane index. 
-// example: (splat $p0, $p1) -> -// "__builtin_shufflevector(__p0, __p0, __p1, __p1, __p1, __p1)" -// (assuming __p0 has four elements). -def splat; -// save_temp - Create a temporary (local) variable. The variable takes a name -// based on the zero'th parameter and can be referenced using -// using that name in subsequent DAGs in the same -// operation. The scope of a temp is the operation. If a variable -// with the given name already exists, an error will be given at -// tblgen time. -// example: [(save_temp $var, (call "foo", $p0)), -// (op "+", $var, $p1)] -> -// "int32x2_t __var = foo(__p0); return __var + __p1;" -def save_temp; -// name_replace - Return the name of the current intrinsic with the first -// argument replaced by the second argument. Raises an error if -// the first argument does not exist in the intrinsic name. -// example: (call (name_replace "_high_", "_"), $p0) (to call the non-high -// version of this intrinsic). -def name_replace; -// literal - Create a literal piece of code. The code is treated as a raw -// string, and must be given a type. The type is a stdint.h or -// NEON intrinsic type as given to (cast). -// example: (literal "int32_t", "0") -def literal; -// shuffle - Create a vector shuffle. The syntax is (shuffle ARG0, ARG1, MASK). -// The MASK argument is a set of elements. The elements are generated -// from the two special defs "mask0" and "mask1". "mask0" expands to -// the lane indices in sequence for ARG0, and "mask1" expands to -// the lane indices in sequence for ARG1. They can be used as-is, e.g. -// -// (shuffle $p0, $p1, mask0) -> $p0 -// (shuffle $p0, $p1, mask1) -> $p1 -// -// or, more usefully, they can be manipulated using the SetTheory -// operators plus some extra operators defined in the NEON emitter. -// The operators are described below. -// example: (shuffle $p0, $p1, (add (highhalf mask0), (highhalf mask1))) -> -// A concatenation of the high halves of the input vectors. -def shuffle; - -// add, interleave, decimate: These set operators are vanilla SetTheory -// operators and take their normal definition. -def add; -def interleave; -def decimate; -// rotl - Rotate set left by a number of elements. -// example: (rotl mask0, 3) -> [3, 4, 5, 6, 0, 1, 2] -def rotl; -// rotl - Rotate set right by a number of elements. -// example: (rotr mask0, 3) -> [4, 5, 6, 0, 1, 2, 3] -def rotr; -// highhalf - Take only the high half of the input. -// example: (highhalf mask0) -> [4, 5, 6, 7] (assuming mask0 had 8 elements) -def highhalf; -// highhalf - Take only the low half of the input. -// example: (lowhalf mask0) -> [0, 1, 2, 3] (assuming mask0 had 8 elements) -def lowhalf; -// rev - Perform a variable-width reversal of the elements. The zero'th argument -// is a width in bits to reverse. The lanes this maps to is determined -// based on the element width of the underlying type. -// example: (rev 32, mask0) -> [3, 2, 1, 0, 7, 6, 5, 4] (if 8-bit elements) -// example: (rev 32, mask0) -> [1, 0, 3, 2] (if 16-bit elements) -def rev; -// mask0 - The initial sequence of lanes for shuffle ARG0 -def mask0 : MaskExpand; -// mask0 - The initial sequence of lanes for shuffle ARG1 -def mask1 : MaskExpand; - -def OP_NONE : Operation; -def OP_UNAVAILABLE : Operation { - let Unavailable = 1; -} - -//===----------------------------------------------------------------------===// -// Instruction definitions -//===----------------------------------------------------------------------===// -// Every intrinsic subclasses "Inst". 
An intrinsic has a name, a prototype and -// a sequence of typespecs. -// -// The name is the base name of the intrinsic, for example "vget_lane". This is -// then mangled by the tblgen backend to add type information ("vget_lane_s16"). -// -// A typespec is a sequence of uppercase characters (modifiers) followed by one -// lowercase character. A typespec encodes a particular "base type" of the -// intrinsic. -// -// An example typespec is "Qs" - quad-size short - uint16x8_t. The available -// typespec codes are given below. -// -// The string given to an Inst class is a sequence of typespecs. The intrinsic -// is instantiated for every typespec in the sequence. For example "sdQsQd". -// -// The prototype is a string that defines the return type of the intrinsic -// and the type of each argument. The return type and every argument gets a -// "modifier" that can change in some way the "base type" of the intrinsic. -// -// The modifier 'd' means "default" and does not modify the base type in any -// way. The available modifiers are given below. -// -// Typespecs -// --------- -// c: char -// s: short -// i: int -// l: long -// k: 128-bit long -// f: float -// h: half-float -// d: double -// -// Typespec modifiers -// ------------------ -// S: scalar, only used for function mangling. -// U: unsigned -// Q: 128b -// H: 128b without mangling 'q' -// P: polynomial -// -// Prototype modifiers -// ------------------- -// prototype: return (arg, arg, ...) -// -// v: void -// t: best-fit integer (int/poly args) -// x: signed integer (int/float args) -// u: unsigned integer (int/float args) -// f: float (int args) -// F: double (int args) -// H: half (int args) -// d: default -// g: default, ignore 'Q' size modifier. -// j: default, force 'Q' size modifier. -// w: double width elements, same num elts -// n: double width elements, half num elts -// h: half width elements, double num elts -// q: half width elements, quad num elts -// e: half width elements, double num elts, unsigned -// m: half width elements, same num elts -// i: constant int -// l: constant uint64 -// s: scalar of element type -// z: scalar of half width element type, signed -// r: scalar of double width element type, signed -// a: scalar of element type (splat to vector type) -// b: scalar of unsigned integer/long type (int/float args) -// $: scalar of signed integer/long type (int/float args) -// y: scalar of float -// o: scalar of double -// k: default elt width, double num elts -// 2,3,4: array of default vectors -// B,C,D: array of default elts, force 'Q' size modifier. -// p: pointer type -// c: const pointer type - -// Every intrinsic subclasses Inst. -class Inst { - string Name = n; - string Prototype = p; - string Types = t; - string ArchGuard = ""; - - Operation Operation = o; - bit CartesianProductOfTypes = 0; - bit BigEndianSafe = 0; - bit isShift = 0; - bit isScalarShift = 0; - bit isScalarNarrowShift = 0; - bit isVCVT_N = 0; - // For immediate checks: the immediate will be assumed to specify the lane of - // a Q register. Only used for intrinsics which end up calling polymorphic - // builtins. - bit isLaneQ = 0; - - // Certain intrinsics have different names than their representative - // instructions. This field allows us to handle this correctly when we - // are generating tests. - string InstName = ""; - - // Certain intrinsics even though they are not a WOpInst or LOpInst, - // generate a WOpInst/LOpInst instruction (see below for definition - // of a WOpInst/LOpInst). For testing purposes we need to know - // this. 
Ex: vset_lane which outputs vmov instructions.
-  bit isHiddenWInst = 0;
-  bit isHiddenLInst = 0;
-}
-
-// The following instruction classes are implemented via builtins.
-// These declarations are used to generate Builtins.def:
-//
-// SInst: Instruction with signed/unsigned suffix (e.g., "s8", "u8", "p8")
-// IInst: Instruction with generic integer suffix (e.g., "i8")
-// WInst: Instruction with only bit size suffix (e.g., "8")
-class SInst<string n, string p, string t> : Inst<n, p, t, OP_NONE> {}
-class IInst<string n, string p, string t> : Inst<n, p, t, OP_NONE> {}
-class WInst<string n, string p, string t> : Inst<n, p, t, OP_NONE> {}
-
-// The following instruction classes are implemented via operators
-// instead of builtins. As such these declarations are only used for
-// the purpose of generating tests.
-//
-// SOpInst: Instruction with signed/unsigned suffix (e.g., "s8",
-// "u8", "p8").
-// IOpInst: Instruction with generic integer suffix (e.g., "i8").
-// WOpInst: Instruction with bit size only suffix (e.g., "8").
-// LOpInst: Logical instruction with no bit size suffix.
-// NoTestOpInst: Intrinsic that has no corresponding instruction.
-class SOpInst<string n, string p, string t, Operation o> : Inst<n, p, t, o> {}
-class IOpInst<string n, string p, string t, Operation o> : Inst<n, p, t, o> {}
-class WOpInst<string n, string p, string t, Operation o> : Inst<n, p, t, o> {}
-class LOpInst<string n, string p, string t, Operation o> : Inst<n, p, t, o> {}
-class NoTestOpInst<string n, string p, string t, Operation o> : Inst<n, p, t, o> {}
-
-//===----------------------------------------------------------------------===//
-// Operations
-//===----------------------------------------------------------------------===//
+include "arm_neon_incl.td"
 
 def OP_ADD : Op<(op "+", $p0, $p1)>;
 def OP_ADDL : Op<(op "+", (call "vmovl", $p0), (call "vmovl", $p1))>;
Index: clang/include/clang/Basic/arm_neon_incl.td
===================================================================
--- /dev/null
+++ clang/include/clang/Basic/arm_neon_incl.td
@@ -0,0 +1,313 @@
+//===--- arm_neon_incl.td - ARM NEON compiler interface ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines data structures shared by arm_neon.td and arm_fp16.td.
+// It contains base operation classes, operations, instructions, instruction
+// modifiers, etc.
+//
+//===----------------------------------------------------------------------===//
+//
+// Each intrinsic is a subclass of the Inst class. An intrinsic can either
+// generate a __builtin_* call or it can expand to a set of generic operations.
+//
+// The operations are subclasses of Operation providing a list of DAGs, the
+// last of which is the return value. The available DAG nodes are documented
+// below.
+//
+//===----------------------------------------------------------------------===//
+
+// The base Operation class. All operations must subclass this.
+class Operation<list<dag> ops=[]> {
+  list<dag> Ops = ops;
+  bit Unavailable = 0;
+}
+// An operation that only contains a single DAG.
+class Op<dag op> : Operation<[op]>;
+// A shorter version of Operation - takes a list of DAGs. The last of these will
+// be the return value.
+class LOp<list<dag> ops> : Operation<ops>;
+
+// These defs and classes are used internally to implement the SetTheory
+// expansion and should be ignored.
+foreach Index = 0-63 in
+  def sv##Index;
+class MaskExpand;
+
+//===----------------------------------------------------------------------===//
+// Available operations
+//===----------------------------------------------------------------------===//
+
+// DAG arguments can either be operations (documented below) or variables.
+// Variables are prefixed with '$'.
There are variables for each input argument, +// with the name $pN, where N starts at zero. So the zero'th argument will be +// $p0, the first $p1 etc. + +// op - Binary or unary operator, depending on the number of arguments. The +// operator itself is just treated as a raw string and is not checked. +// example: (op "+", $p0, $p1) -> "__p0 + __p1". +// (op "-", $p0) -> "-__p0" +def op; +// call - Invoke another intrinsic. The input types are type checked and +// disambiguated. If there is no intrinsic defined that takes +// the given types (or if there is a type ambiguity) an error is +// generated at tblgen time. The name of the intrinsic is the raw +// name as given to the Inst class (not mangled). +// example: (call "vget_high", $p0) -> "vgetq_high_s16(__p0)" +// (assuming $p0 has type int16x8_t). +def call; +// cast - Perform a cast to a different type. This gets emitted as a static +// C-style cast. For a pure reinterpret cast (T x = *(T*)&y), use +// "bitcast". +// +// The syntax is (cast MOD* VAL). The last argument is the value to +// cast, preceded by a sequence of type modifiers. The target type +// starts off as the type of VAL, and is modified by MOD in sequence. +// The available modifiers are: +// - $X - Take the type of parameter/variable X. For example: +// (cast $p0, $p1) would cast $p1 to the type of $p0. +// - "R" - The type of the return type. +// - A typedef string - A NEON or stdint.h type that is then parsed. +// for example: (cast "uint32x4_t", $p0). +// - "U" - Make the type unsigned. +// - "S" - Make the type signed. +// - "H" - Halve the number of lanes in the type. +// - "D" - Double the number of lanes in the type. +// - "8" - Convert type to an equivalent vector of 8-bit signed +// integers. +// example: (cast "R", "U", $p0) -> "(uint32x4_t)__p0" (assuming the return +// value is of type "int32x4_t". +// (cast $p0, "D", "8", $p1) -> "(int8x16_t)__p1" (assuming __p0 +// has type float64x1_t or any other vector type of 64 bits). +// (cast "int32_t", $p2) -> "(int32_t)__p2" +def cast; +// bitcast - Same as "cast", except a reinterpret-cast is produced: +// (bitcast "T", $p0) -> "*(T*)&__p0". +// The VAL argument is saved to a temporary so it can be used +// as an l-value. +def bitcast; +// dup - Take a scalar argument and create a vector by duplicating it into +// all lanes. The type of the vector is the base type of the intrinsic. +// example: (dup $p1) -> "(uint32x2_t) {__p1, __p1}" (assuming the base type +// is uint32x2_t). +def dup; +// splat - Take a vector and a lane index, and return a vector of the same type +// containing repeated instances of the source vector at the lane index. +// example: (splat $p0, $p1) -> +// "__builtin_shufflevector(__p0, __p0, __p1, __p1, __p1, __p1)" +// (assuming __p0 has four elements). +def splat; +// save_temp - Create a temporary (local) variable. The variable takes a name +// based on the zero'th parameter and can be referenced using +// using that name in subsequent DAGs in the same +// operation. The scope of a temp is the operation. If a variable +// with the given name already exists, an error will be given at +// tblgen time. +// example: [(save_temp $var, (call "foo", $p0)), +// (op "+", $var, $p1)] -> +// "int32x2_t __var = foo(__p0); return __var + __p1;" +def save_temp; +// name_replace - Return the name of the current intrinsic with the first +// argument replaced by the second argument. Raises an error if +// the first argument does not exist in the intrinsic name. 
+// example: (call (name_replace "_high_", "_"), $p0) (to call the non-high +// version of this intrinsic). +def name_replace; +// literal - Create a literal piece of code. The code is treated as a raw +// string, and must be given a type. The type is a stdint.h or +// NEON intrinsic type as given to (cast). +// example: (literal "int32_t", "0") +def literal; +// shuffle - Create a vector shuffle. The syntax is (shuffle ARG0, ARG1, MASK). +// The MASK argument is a set of elements. The elements are generated +// from the two special defs "mask0" and "mask1". "mask0" expands to +// the lane indices in sequence for ARG0, and "mask1" expands to +// the lane indices in sequence for ARG1. They can be used as-is, e.g. +// +// (shuffle $p0, $p1, mask0) -> $p0 +// (shuffle $p0, $p1, mask1) -> $p1 +// +// or, more usefully, they can be manipulated using the SetTheory +// operators plus some extra operators defined in the NEON emitter. +// The operators are described below. +// example: (shuffle $p0, $p1, (add (highhalf mask0), (highhalf mask1))) -> +// A concatenation of the high halves of the input vectors. +def shuffle; + +// add, interleave, decimate: These set operators are vanilla SetTheory +// operators and take their normal definition. +def add; +def interleave; +def decimate; +// rotl - Rotate set left by a number of elements. +// example: (rotl mask0, 3) -> [3, 4, 5, 6, 0, 1, 2] +def rotl; +// rotl - Rotate set right by a number of elements. +// example: (rotr mask0, 3) -> [4, 5, 6, 0, 1, 2, 3] +def rotr; +// highhalf - Take only the high half of the input. +// example: (highhalf mask0) -> [4, 5, 6, 7] (assuming mask0 had 8 elements) +def highhalf; +// highhalf - Take only the low half of the input. +// example: (lowhalf mask0) -> [0, 1, 2, 3] (assuming mask0 had 8 elements) +def lowhalf; +// rev - Perform a variable-width reversal of the elements. The zero'th argument +// is a width in bits to reverse. The lanes this maps to is determined +// based on the element width of the underlying type. +// example: (rev 32, mask0) -> [3, 2, 1, 0, 7, 6, 5, 4] (if 8-bit elements) +// example: (rev 32, mask0) -> [1, 0, 3, 2] (if 16-bit elements) +def rev; +// mask0 - The initial sequence of lanes for shuffle ARG0 +def mask0 : MaskExpand; +// mask0 - The initial sequence of lanes for shuffle ARG1 +def mask1 : MaskExpand; + +def OP_NONE : Operation; +def OP_UNAVAILABLE : Operation { + let Unavailable = 1; +} + +//===----------------------------------------------------------------------===// +// Instruction definitions +//===----------------------------------------------------------------------===// + +// Every intrinsic subclasses "Inst". An intrinsic has a name, a prototype and +// a sequence of typespecs. +// +// The name is the base name of the intrinsic, for example "vget_lane". This is +// then mangled by the tblgen backend to add type information ("vget_lane_s16"). +// +// A typespec is a sequence of uppercase characters (modifiers) followed by one +// lowercase character. A typespec encodes a particular "base type" of the +// intrinsic. +// +// An example typespec is "Qs" - quad-size short - uint16x8_t. The available +// typespec codes are given below. +// +// The string given to an Inst class is a sequence of typespecs. The intrinsic +// is instantiated for every typespec in the sequence. For example "sdQsQd". +// +// The prototype is a string that defines the return type of the intrinsic +// and the type of each argument. 
The return type and every argument gets a +// "modifier" that can change in some way the "base type" of the intrinsic. +// +// The modifier 'd' means "default" and does not modify the base type in any +// way. The available modifiers are given below. +// +// Typespecs +// --------- +// c: char +// s: short +// i: int +// l: long +// k: 128-bit long +// f: float +// h: half-float +// d: double +// +// Typespec modifiers +// ------------------ +// S: scalar, only used for function mangling. +// U: unsigned +// Q: 128b +// H: 128b without mangling 'q' +// P: polynomial +// +// Prototype modifiers +// ------------------- +// prototype: return (arg, arg, ...) +// +// v: void +// t: best-fit integer (int/poly args) +// x: signed integer (int/float args) +// u: unsigned integer (int/float args) +// f: float (int args) +// F: double (int args) +// H: half (int args) +// d: default +// g: default, ignore 'Q' size modifier. +// j: default, force 'Q' size modifier. +// w: double width elements, same num elts +// n: double width elements, half num elts +// h: half width elements, double num elts +// q: half width elements, quad num elts +// e: half width elements, double num elts, unsigned +// m: half width elements, same num elts +// i: constant int +// l: constant uint64 +// s: scalar of element type +// z: scalar of half width element type, signed +// r: scalar of double width element type, signed +// a: scalar of element type (splat to vector type) +// b: scalar of unsigned integer/long type (int/float args) +// $: scalar of signed integer/long type (int/float args) +// y: scalar of float +// o: scalar of double +// k: default elt width, double num elts +// 2,3,4: array of default vectors +// B,C,D: array of default elts, force 'Q' size modifier. +// p: pointer type +// c: const pointer type + +// Every intrinsic subclasses Inst. +class Inst { + string Name = n; + string Prototype = p; + string Types = t; + string ArchGuard = ""; + + Operation Operation = o; + bit CartesianProductOfTypes = 0; + bit BigEndianSafe = 0; + bit isShift = 0; + bit isScalarShift = 0; + bit isScalarNarrowShift = 0; + bit isVCVT_N = 0; + // For immediate checks: the immediate will be assumed to specify the lane of + // a Q register. Only used for intrinsics which end up calling polymorphic + // builtins. + bit isLaneQ = 0; + + // Certain intrinsics have different names than their representative + // instructions. This field allows us to handle this correctly when we + // are generating tests. + string InstName = ""; + + // Certain intrinsics even though they are not a WOpInst or LOpInst, + // generate a WOpInst/LOpInst instruction (see below for definition + // of a WOpInst/LOpInst). For testing purposes we need to know + // this. Ex: vset_lane which outputs vmov instructions. + bit isHiddenWInst = 0; + bit isHiddenLInst = 0; +} + +// The following instruction classes are implemented via builtins. +// These declarations are used to generate Builtins.def: +// +// SInst: Instruction with signed/unsigned suffix (e.g., "s8", "u8", "p8") +// IInst: Instruction with generic integer suffix (e.g., "i8") +// WInst: Instruction with only bit size suffix (e.g., "8") +class SInst : Inst {} +class IInst : Inst {} +class WInst : Inst {} + +// The following instruction classes are implemented via operators +// instead of builtins. As such these declarations are only used for +// the purpose of generating tests. +// +// SOpInst: Instruction with signed/unsigned suffix (e.g., "s8", +// "u8", "p8"). 
+// IOpInst: Instruction with generic integer suffix (e.g., "i8"). +// WOpInst: Instruction with bit size only suffix (e.g., "8"). +// LOpInst: Logical instruction with no bit size suffix. +// NoTestOpInst: Intrinsic that has no corresponding instruction. +class SOpInst : Inst {} +class IOpInst : Inst {} +class WOpInst : Inst {} +class LOpInst : Inst {} +class NoTestOpInst : Inst {} Index: clang/lib/Basic/Targets/AArch64.cpp =================================================================== --- clang/lib/Basic/Targets/AArch64.cpp +++ clang/lib/Basic/Targets/AArch64.cpp @@ -183,6 +183,8 @@ if ((FPU & NeonMode) && HasFullFP16) Builder.defineMacro("__ARM_FEATURE_FP16_VECTOR_ARITHMETIC", "1"); + if (HasFullFP16) + Builder.defineMacro("__ARM_FEATURE_FP16_SCALAR_ARITHMETIC", "1"); switch (ArchKind) { default: Index: clang/lib/CodeGen/CGBuiltin.cpp =================================================================== --- clang/lib/CodeGen/CGBuiltin.cpp +++ clang/lib/CodeGen/CGBuiltin.cpp @@ -4101,6 +4101,54 @@ NEONMAP1(vuqaddd_s64, aarch64_neon_suqadd, Add1ArgType), NEONMAP1(vuqaddh_s16, aarch64_neon_suqadd, Vectorize1ArgType | Use64BitVectors), NEONMAP1(vuqadds_s32, aarch64_neon_suqadd, Add1ArgType), + // FP16 scalar intrinisics go here. + NEONMAP1(vabdh_f16, aarch64_sisd_fabd, Add1ArgType), + NEONMAP1(vabsh_f16, aarch64_neon_abs, Add1ArgType), + NEONMAP1(vcageh_f16, aarch64_neon_facge, AddRetType | Add1ArgType), + NEONMAP1(vcagth_f16, aarch64_neon_facgt, AddRetType | Add1ArgType), + NEONMAP1(vcaleh_f16, aarch64_neon_facge, AddRetType | Add1ArgType), + NEONMAP1(vcalth_f16, aarch64_neon_facgt, AddRetType | Add1ArgType), + NEONMAP1(vcvtah_s16_f16, aarch64_neon_fcvtas, AddRetType | Add1ArgType), + NEONMAP1(vcvtah_s32_f16, aarch64_neon_fcvtas, AddRetType | Add1ArgType), + NEONMAP1(vcvtah_s64_f16, aarch64_neon_fcvtas, AddRetType | Add1ArgType), + NEONMAP1(vcvtah_u16_f16, aarch64_neon_fcvtau, AddRetType | Add1ArgType), + NEONMAP1(vcvtah_u32_f16, aarch64_neon_fcvtau, AddRetType | Add1ArgType), + NEONMAP1(vcvtah_u64_f16, aarch64_neon_fcvtau, AddRetType | Add1ArgType), + NEONMAP1(vcvth_n_f16_s16, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType), + NEONMAP1(vcvth_n_f16_s32, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType), + NEONMAP1(vcvth_n_f16_s64, aarch64_neon_vcvtfxs2fp, AddRetType | Add1ArgType), + NEONMAP1(vcvth_n_f16_u16, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType), + NEONMAP1(vcvth_n_f16_u32, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType), + NEONMAP1(vcvth_n_f16_u64, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType), + NEONMAP1(vcvth_n_s16_f16, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType), + NEONMAP1(vcvth_n_s32_f16, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType), + NEONMAP1(vcvth_n_s64_f16, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType), + NEONMAP1(vcvth_n_u16_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType), + NEONMAP1(vcvth_n_u32_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType), + NEONMAP1(vcvth_n_u64_f16, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType), + NEONMAP1(vcvtmh_s16_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType), + NEONMAP1(vcvtmh_s32_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType), + NEONMAP1(vcvtmh_s64_f16, aarch64_neon_fcvtms, AddRetType | Add1ArgType), + NEONMAP1(vcvtmh_u16_f16, aarch64_neon_fcvtmu, AddRetType | Add1ArgType), + NEONMAP1(vcvtmh_u32_f16, aarch64_neon_fcvtmu, AddRetType | Add1ArgType), + NEONMAP1(vcvtmh_u64_f16, aarch64_neon_fcvtmu, AddRetType | Add1ArgType), + NEONMAP1(vcvtnh_s16_f16, aarch64_neon_fcvtns, 
AddRetType | Add1ArgType), + NEONMAP1(vcvtnh_s32_f16, aarch64_neon_fcvtns, AddRetType | Add1ArgType), + NEONMAP1(vcvtnh_s64_f16, aarch64_neon_fcvtns, AddRetType | Add1ArgType), + NEONMAP1(vcvtnh_u16_f16, aarch64_neon_fcvtnu, AddRetType | Add1ArgType), + NEONMAP1(vcvtnh_u32_f16, aarch64_neon_fcvtnu, AddRetType | Add1ArgType), + NEONMAP1(vcvtnh_u64_f16, aarch64_neon_fcvtnu, AddRetType | Add1ArgType), + NEONMAP1(vcvtph_s16_f16, aarch64_neon_fcvtps, AddRetType | Add1ArgType), + NEONMAP1(vcvtph_s32_f16, aarch64_neon_fcvtps, AddRetType | Add1ArgType), + NEONMAP1(vcvtph_s64_f16, aarch64_neon_fcvtps, AddRetType | Add1ArgType), + NEONMAP1(vcvtph_u16_f16, aarch64_neon_fcvtpu, AddRetType | Add1ArgType), + NEONMAP1(vcvtph_u32_f16, aarch64_neon_fcvtpu, AddRetType | Add1ArgType), + NEONMAP1(vcvtph_u64_f16, aarch64_neon_fcvtpu, AddRetType | Add1ArgType), + NEONMAP1(vmulxh_f16, aarch64_neon_fmulx, Add1ArgType), + NEONMAP1(vrecpeh_f16, aarch64_neon_frecpe, Add1ArgType), + NEONMAP1(vrecpxh_f16, aarch64_neon_frecpx, Add1ArgType), + NEONMAP1(vrsqrteh_f16, aarch64_neon_frsqrte, Add1ArgType), + NEONMAP1(vrsqrtsh_f16, aarch64_neon_frsqrts, Add1ArgType), }; #undef NEONMAP0 @@ -6125,6 +6173,58 @@ return Builder.CreateUIToFP(Ops[0], FTy); return Builder.CreateSIToFP(Ops[0], FTy); } + case NEON::BI__builtin_neon_vcvth_f16_u16: + case NEON::BI__builtin_neon_vcvth_f16_u32: + case NEON::BI__builtin_neon_vcvth_f16_u64: + usgn = true; + // FALL THROUGH + case NEON::BI__builtin_neon_vcvth_f16_s16: + case NEON::BI__builtin_neon_vcvth_f16_s32: + case NEON::BI__builtin_neon_vcvth_f16_s64: { + Ops.push_back(EmitScalarExpr(E->getArg(0))); + llvm::Type *FTy = HalfTy; + llvm::Type *InTy; + if (Ops[0]->getType()->getPrimitiveSizeInBits() == 64) + InTy = Int64Ty; + else if (Ops[0]->getType()->getPrimitiveSizeInBits() == 32) + InTy = Int32Ty; + else + InTy = Int16Ty; + Ops[0] = Builder.CreateBitCast(Ops[0], InTy); + if (usgn) + return Builder.CreateUIToFP(Ops[0], FTy); + return Builder.CreateSIToFP(Ops[0], FTy); + } + case NEON::BI__builtin_neon_vcvth_u16_f16: + usgn = true; + // FALL THROUGH + case NEON::BI__builtin_neon_vcvth_s16_f16: { + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Ops[0] = Builder.CreateBitCast(Ops[0], HalfTy); + if (usgn) + return Builder.CreateFPToUI(Ops[0], Int16Ty); + return Builder.CreateFPToSI(Ops[0], Int16Ty); + } + case NEON::BI__builtin_neon_vcvth_u32_f16: + usgn = true; + // FALL THROUGH + case NEON::BI__builtin_neon_vcvth_s32_f16: { + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Ops[0] = Builder.CreateBitCast(Ops[0], HalfTy); + if (usgn) + return Builder.CreateFPToUI(Ops[0], Int32Ty); + return Builder.CreateFPToSI(Ops[0], Int32Ty); + } + case NEON::BI__builtin_neon_vcvth_u64_f16: + usgn = true; + // FALL THROUGH + case NEON::BI__builtin_neon_vcvth_s64_f16: { + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Ops[0] = Builder.CreateBitCast(Ops[0], HalfTy); + if (usgn) + return Builder.CreateFPToUI(Ops[0], Int64Ty); + return Builder.CreateFPToSI(Ops[0], Int64Ty); + } case NEON::BI__builtin_neon_vpaddd_s64: { llvm::Type *Ty = llvm::VectorType::get(Int64Ty, 2); Value *Vec = EmitScalarExpr(E->getArg(0)); @@ -6166,6 +6266,7 @@ case NEON::BI__builtin_neon_vceqzd_s64: case NEON::BI__builtin_neon_vceqzd_f64: case NEON::BI__builtin_neon_vceqzs_f32: + case NEON::BI__builtin_neon_vceqzh_f16: Ops.push_back(EmitScalarExpr(E->getArg(0))); return EmitAArch64CompareBuiltinExpr( Ops[0], ConvertType(E->getCallReturnType(getContext())), @@ -6173,6 +6274,7 @@ case NEON::BI__builtin_neon_vcgezd_s64: case 
NEON::BI__builtin_neon_vcgezd_f64: case NEON::BI__builtin_neon_vcgezs_f32: + case NEON::BI__builtin_neon_vcgezh_f16: Ops.push_back(EmitScalarExpr(E->getArg(0))); return EmitAArch64CompareBuiltinExpr( Ops[0], ConvertType(E->getCallReturnType(getContext())), @@ -6180,6 +6282,7 @@ case NEON::BI__builtin_neon_vclezd_s64: case NEON::BI__builtin_neon_vclezd_f64: case NEON::BI__builtin_neon_vclezs_f32: + case NEON::BI__builtin_neon_vclezh_f16: Ops.push_back(EmitScalarExpr(E->getArg(0))); return EmitAArch64CompareBuiltinExpr( Ops[0], ConvertType(E->getCallReturnType(getContext())), @@ -6187,6 +6290,7 @@ case NEON::BI__builtin_neon_vcgtzd_s64: case NEON::BI__builtin_neon_vcgtzd_f64: case NEON::BI__builtin_neon_vcgtzs_f32: + case NEON::BI__builtin_neon_vcgtzh_f16: Ops.push_back(EmitScalarExpr(E->getArg(0))); return EmitAArch64CompareBuiltinExpr( Ops[0], ConvertType(E->getCallReturnType(getContext())), @@ -6194,6 +6298,7 @@ case NEON::BI__builtin_neon_vcltzd_s64: case NEON::BI__builtin_neon_vcltzd_f64: case NEON::BI__builtin_neon_vcltzs_f32: + case NEON::BI__builtin_neon_vcltzh_f16: Ops.push_back(EmitScalarExpr(E->getArg(0))); return EmitAArch64CompareBuiltinExpr( Ops[0], ConvertType(E->getCallReturnType(getContext())), @@ -6246,6 +6351,26 @@ Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]); return Builder.CreateSExt(Ops[0], Int32Ty, "vcmpd"); } + case NEON::BI__builtin_neon_vceqh_f16: + case NEON::BI__builtin_neon_vcleh_f16: + case NEON::BI__builtin_neon_vclth_f16: + case NEON::BI__builtin_neon_vcgeh_f16: + case NEON::BI__builtin_neon_vcgth_f16: { + llvm::CmpInst::Predicate P; + switch (BuiltinID) { + default: llvm_unreachable("missing builtin ID in switch!"); + case NEON::BI__builtin_neon_vceqh_f16: P = llvm::FCmpInst::FCMP_OEQ; break; + case NEON::BI__builtin_neon_vcleh_f16: P = llvm::FCmpInst::FCMP_OLE; break; + case NEON::BI__builtin_neon_vclth_f16: P = llvm::FCmpInst::FCMP_OLT; break; + case NEON::BI__builtin_neon_vcgeh_f16: P = llvm::FCmpInst::FCMP_OGE; break; + case NEON::BI__builtin_neon_vcgth_f16: P = llvm::FCmpInst::FCMP_OGT; break; + } + Ops.push_back(EmitScalarExpr(E->getArg(1))); + Ops[0] = Builder.CreateBitCast(Ops[0], HalfTy); + Ops[1] = Builder.CreateBitCast(Ops[1], HalfTy); + Ops[0] = Builder.CreateFCmp(P, Ops[0], Ops[1]); + return Builder.CreateSExt(Ops[0], Int16Ty, "vcmpd"); + } case NEON::BI__builtin_neon_vceqd_s64: case NEON::BI__builtin_neon_vceqd_u64: case NEON::BI__builtin_neon_vcgtd_s64: @@ -6383,6 +6508,31 @@ llvm::VectorType::get(DoubleTy, 2)); return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), "vgetq_lane"); + case NEON::BI__builtin_neon_vaddh_f16: + Ops.push_back(EmitScalarExpr(E->getArg(1))); + return Builder.CreateFAdd(Ops[0], Ops[1], "vaddh"); + case NEON::BI__builtin_neon_vsubh_f16: + Ops.push_back(EmitScalarExpr(E->getArg(1))); + return Builder.CreateFSub(Ops[0], Ops[1], "vsubh"); + case NEON::BI__builtin_neon_vmulh_f16: + Ops.push_back(EmitScalarExpr(E->getArg(1))); + return Builder.CreateFMul(Ops[0], Ops[1], "vmulh"); + case NEON::BI__builtin_neon_vdivh_f16: + Ops.push_back(EmitScalarExpr(E->getArg(1))); + return Builder.CreateFDiv(Ops[0], Ops[1], "vdivh"); + case NEON::BI__builtin_neon_vfmah_f16: { + Value *F = CGM.getIntrinsic(Intrinsic::fma, HalfTy); + // NEON intrinsic puts accumulator first, unlike the LLVM fma. 
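+ // That is, vfmah_f16(a, b, c) is emitted as fma(b, c, a), i.e. b * c + a with
+ // a single rounding; Ops[0] already holds the accumulator operand a.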
+ return Builder.CreateCall(F, + {EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2)), Ops[0]}); + } + case NEON::BI__builtin_neon_vfmsh_f16: { + Value *F = CGM.getIntrinsic(Intrinsic::fma, HalfTy); + Value *Zero = llvm::ConstantFP::getZeroValueForNegation(HalfTy); + Value* Sub = Builder.CreateFSub(Zero, EmitScalarExpr(E->getArg(1)), "vsubh"); + // NEON intrinsic puts accumulator first, unlike the LLVM fma. + return Builder.CreateCall(F, {Sub, EmitScalarExpr(E->getArg(2)), Ops[0]}); + } case NEON::BI__builtin_neon_vaddd_s64: case NEON::BI__builtin_neon_vaddd_u64: return Builder.CreateAdd(Ops[0], EmitScalarExpr(E->getArg(1)), "vaddd"); @@ -6657,12 +6807,22 @@ Int = usgn ? Intrinsic::aarch64_neon_umax : Intrinsic::aarch64_neon_smax; if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmax; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmax"); + case NEON::BI__builtin_neon_vmaxh_f16: { + Ops.push_back(EmitScalarExpr(E->getArg(1))); + Int = Intrinsic::aarch64_neon_fmax; + return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmax"); + } case NEON::BI__builtin_neon_vmin_v: case NEON::BI__builtin_neon_vminq_v: // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics. Int = usgn ? Intrinsic::aarch64_neon_umin : Intrinsic::aarch64_neon_smin; if (Ty->isFPOrFPVectorTy()) Int = Intrinsic::aarch64_neon_fmin; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmin"); + case NEON::BI__builtin_neon_vminh_f16: { + Ops.push_back(EmitScalarExpr(E->getArg(1))); + Int = Intrinsic::aarch64_neon_fmin; + return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmin"); + } case NEON::BI__builtin_neon_vabd_v: case NEON::BI__builtin_neon_vabdq_v: // FIXME: improve sharing scheme to cope with 3 alternative LLVM intrinsics. @@ -6701,20 +6861,31 @@ case NEON::BI__builtin_neon_vminnmq_v: Int = Intrinsic::aarch64_neon_fminnm; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vminnm"); + case NEON::BI__builtin_neon_vminnmh_f16: + Ops.push_back(EmitScalarExpr(E->getArg(1))); + Int = Intrinsic::aarch64_neon_fminnm; + return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vminnm"); case NEON::BI__builtin_neon_vmaxnm_v: case NEON::BI__builtin_neon_vmaxnmq_v: Int = Intrinsic::aarch64_neon_fmaxnm; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmaxnm"); + case NEON::BI__builtin_neon_vmaxnmh_f16: + Ops.push_back(EmitScalarExpr(E->getArg(1))); + Int = Intrinsic::aarch64_neon_fmaxnm; + return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmaxnm"); case NEON::BI__builtin_neon_vrecpss_f32: { Ops.push_back(EmitScalarExpr(E->getArg(1))); return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, FloatTy), Ops, "vrecps"); } - case NEON::BI__builtin_neon_vrecpsd_f64: { + case NEON::BI__builtin_neon_vrecpsd_f64: Ops.push_back(EmitScalarExpr(E->getArg(1))); return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, DoubleTy), Ops, "vrecps"); - } + case NEON::BI__builtin_neon_vrecpsh_f16: + Ops.push_back(EmitScalarExpr(E->getArg(1))); + return EmitNeonCall(CGM.getIntrinsic(Intrinsic::aarch64_neon_frecps, HalfTy), + Ops, "vrecps"); case NEON::BI__builtin_neon_vqshrun_n_v: Int = Intrinsic::aarch64_neon_sqshrun; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqshrun_n"); @@ -6730,36 +6901,71 @@ case NEON::BI__builtin_neon_vqrshrn_n_v: Int = usgn ? 
Intrinsic::aarch64_neon_uqrshrn : Intrinsic::aarch64_neon_sqrshrn; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vqrshrn_n"); + case NEON::BI__builtin_neon_vrndah_f16: { + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Int = Intrinsic::round; + return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrnda"); + } case NEON::BI__builtin_neon_vrnda_v: case NEON::BI__builtin_neon_vrndaq_v: { Int = Intrinsic::round; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrnda"); } + case NEON::BI__builtin_neon_vrndih_f16: { + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Int = Intrinsic::nearbyint; + return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndi"); + } case NEON::BI__builtin_neon_vrndi_v: case NEON::BI__builtin_neon_vrndiq_v: { Int = Intrinsic::nearbyint; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndi"); } + case NEON::BI__builtin_neon_vrndmh_f16: { + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Int = Intrinsic::floor; + return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndm"); + } case NEON::BI__builtin_neon_vrndm_v: case NEON::BI__builtin_neon_vrndmq_v: { Int = Intrinsic::floor; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndm"); } + case NEON::BI__builtin_neon_vrndnh_f16: { + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Int = Intrinsic::aarch64_neon_frintn; + return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndn"); + } case NEON::BI__builtin_neon_vrndn_v: case NEON::BI__builtin_neon_vrndnq_v: { Int = Intrinsic::aarch64_neon_frintn; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndn"); } + case NEON::BI__builtin_neon_vrndph_f16: { + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Int = Intrinsic::ceil; + return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndp"); + } case NEON::BI__builtin_neon_vrndp_v: case NEON::BI__builtin_neon_vrndpq_v: { Int = Intrinsic::ceil; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndp"); } + case NEON::BI__builtin_neon_vrndxh_f16: { + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Int = Intrinsic::rint; + return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndx"); + } case NEON::BI__builtin_neon_vrndx_v: case NEON::BI__builtin_neon_vrndxq_v: { Int = Intrinsic::rint; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vrndx"); } + case NEON::BI__builtin_neon_vrndh_f16: { + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Int = Intrinsic::trunc; + return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vrndz"); + } case NEON::BI__builtin_neon_vrnd_v: case NEON::BI__builtin_neon_vrndq_v: { Int = Intrinsic::trunc; @@ -6908,6 +7114,8 @@ } case NEON::BI__builtin_neon_vnegd_s64: return Builder.CreateNeg(EmitScalarExpr(E->getArg(0)), "vnegd"); + case NEON::BI__builtin_neon_vnegh_f16: + return Builder.CreateFNeg(EmitScalarExpr(E->getArg(0)), "vnegh"); case NEON::BI__builtin_neon_vpmaxnm_v: case NEON::BI__builtin_neon_vpmaxnmq_v: { Int = Intrinsic::aarch64_neon_fmaxnmp; @@ -6918,6 +7126,11 @@ Int = Intrinsic::aarch64_neon_fminnmp; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vpminnm"); } + case NEON::BI__builtin_neon_vsqrth_f16: { + Ops.push_back(EmitScalarExpr(E->getArg(0))); + Int = Intrinsic::sqrt; + return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vsqrt"); + } case NEON::BI__builtin_neon_vsqrt_v: case NEON::BI__builtin_neon_vsqrtq_v: { Int = Intrinsic::sqrt; Index: clang/lib/Headers/CMakeLists.txt =================================================================== --- clang/lib/Headers/CMakeLists.txt +++ clang/lib/Headers/CMakeLists.txt @@ -116,7 +116,12 @@ # 
Generate arm_neon.h
 clang_tablegen(arm_neon.h -gen-arm-neon
+  -I ${CLANG_SOURCE_DIR}/include/clang/Basic/
   SOURCE ${CLANG_SOURCE_DIR}/include/clang/Basic/arm_neon.td)
+# Generate arm_fp16.h
+clang_tablegen(arm_fp16.h -gen-arm-fp16
+  -I ${CLANG_SOURCE_DIR}/include/clang/Basic/
+  SOURCE ${CLANG_SOURCE_DIR}/include/clang/Basic/arm_fp16.td)
 
 set(out_files)
 foreach( f ${files} ${cuda_wrapper_files} )
@@ -134,6 +139,11 @@
   COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_BINARY_DIR}/arm_neon.h ${output_dir}/arm_neon.h
   COMMENT "Copying clang's arm_neon.h...")
 list(APPEND out_files ${output_dir}/arm_neon.h)
+add_custom_command(OUTPUT ${output_dir}/arm_fp16.h
+  DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/arm_fp16.h
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_BINARY_DIR}/arm_fp16.h ${output_dir}/arm_fp16.h
+  COMMENT "Copying clang's arm_fp16.h...")
+list(APPEND out_files ${output_dir}/arm_fp16.h)
 
 add_custom_target(clang-headers ALL DEPENDS ${out_files})
 set_target_properties(clang-headers PROPERTIES FOLDER "Misc")
@@ -145,6 +155,12 @@
   DESTINATION lib${LLVM_LIBDIR_SUFFIX}/clang/${CLANG_VERSION}/include)
 
 install(
+  FILES ${files} ${CMAKE_CURRENT_BINARY_DIR}/arm_fp16.h
+  COMPONENT clang-headers
+  PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ
+  DESTINATION lib${LLVM_LIBDIR_SUFFIX}/clang/${CLANG_VERSION}/include)
+
+install(
   FILES ${cuda_wrapper_files}
   COMPONENT clang-headers
   PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ
Index: clang/lib/Headers/module.modulemap
===================================================================
--- clang/lib/Headers/module.modulemap
+++ clang/lib/Headers/module.modulemap
@@ -38,6 +38,7 @@
   explicit module neon {
     requires neon
     header "arm_neon.h"
+    header "arm_fp16.h"
     export *
   }
 }
Index: clang/lib/Sema/SemaChecking.cpp
===================================================================
--- clang/lib/Sema/SemaChecking.cpp
+++ clang/lib/Sema/SemaChecking.cpp
@@ -1353,6 +1353,7 @@
   switch (BuiltinID) {
 #define GET_NEON_OVERLOAD_CHECK
 #include "clang/Basic/arm_neon.inc"
+#include "clang/Basic/arm_fp16.inc"
 #undef GET_NEON_OVERLOAD_CHECK
   }
@@ -1404,6 +1405,7 @@
     return false;
 #define GET_NEON_IMMEDIATE_CHECK
 #include "clang/Basic/arm_neon.inc"
+#include "clang/Basic/arm_fp16.inc"
 #undef GET_NEON_IMMEDIATE_CHECK
   }
Index: clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/aarch64-v8.2a-fp16-intrinsics.c
@@ -0,0 +1,643 @@
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +fullfp16\
+// RUN: -fallow-half-arguments-and-returns -S -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: | opt -S -mem2reg \
+// RUN: | FileCheck %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_fp16.h>
+
+// CHECK-LABEL: test_vabsh_f16
+// CHECK: [[ABS:%.*]] = call half @llvm.aarch64.neon.abs.f16(half %a)
+// CHECK: ret half [[ABS]]
+float16_t test_vabsh_f16(float16_t a) {
+  return vabsh_f16(a);
+}
+
+// CHECK-LABEL: test_vceqzh_f16
+// CHECK: [[TMP1:%.*]] = fcmp oeq half %a, 0xH0000
+// CHECK: [[TMP2:%.*]] = sext i1 [[TMP1]] to i16
+// CHECK: ret i16 [[TMP2]]
+uint16_t test_vceqzh_f16(float16_t a) {
+  return vceqzh_f16(a);
+}
+
+// CHECK-LABEL: test_vcgezh_f16
+// CHECK: [[TMP1:%.*]] = fcmp oge half %a, 0xH0000
+// CHECK: [[TMP2:%.*]] = sext i1 [[TMP1]] to i16
+// CHECK: ret i16 [[TMP2]]
+uint16_t test_vcgezh_f16(float16_t a) {
+  return vcgezh_f16(a);
+}
+
+// CHECK-LABEL: test_vcgtzh_f16
+// CHECK: [[TMP1:%.*]] = fcmp ogt half %a, 0xH0000
+//
CHECK: [[TMP2:%.*]] = sext i1 [[TMP1]] to i16 +// CHECK: ret i16 [[TMP2]] +uint16_t test_vcgtzh_f16(float16_t a) { + return vcgtzh_f16(a); +} + +// CHECK-LABEL: test_vclezh_f16 +// CHECK: [[TMP1:%.*]] = fcmp ole half %a, 0xH0000 +// CHECK: [[TMP2:%.*]] = sext i1 [[TMP1]] to i16 +// CHECK: ret i16 [[TMP2]] +uint16_t test_vclezh_f16(float16_t a) { + return vclezh_f16(a); +} + +// CHECK-LABEL: test_vcltzh_f16 +// CHECK: [[TMP1:%.*]] = fcmp olt half %a, 0xH0000 +// CHECK: [[TMP2:%.*]] = sext i1 [[TMP1]] to i16 +// CHECK: ret i16 [[TMP2]] +uint16_t test_vcltzh_f16(float16_t a) { + return vcltzh_f16(a); +} + +// CHECK-LABEL: test_vcvth_f16_s16 +// CHECK: [[VCVT:%.*]] = sitofp i16 %a to half +// CHECK: ret half [[VCVT]] +float16_t test_vcvth_f16_s16 (int16_t a) { + return vcvth_f16_s16(a); +} + +// CHECK-LABEL: test_vcvth_f16_s32 +// CHECK: [[VCVT:%.*]] = sitofp i32 %a to half +// CHECK: ret half [[VCVT]] +float16_t test_vcvth_f16_s32 (int32_t a) { + return vcvth_f16_s32(a); +} + +// CHECK-LABEL: test_vcvth_f16_s64 +// CHECK: [[VCVT:%.*]] = sitofp i64 %a to half +// CHECK: ret half [[VCVT]] +float16_t test_vcvth_f16_s64 (int64_t a) { + return vcvth_f16_s64(a); +} + +// CHECK-LABEL: test_vcvth_f16_u16 +// CHECK: [[VCVT:%.*]] = uitofp i16 %a to half +// CHECK: ret half [[VCVT]] +float16_t test_vcvth_f16_u16 (uint16_t a) { + return vcvth_f16_u16(a); +} + +// CHECK-LABEL: test_vcvth_f16_u32 +// CHECK: [[VCVT:%.*]] = uitofp i32 %a to half +// CHECK: ret half [[VCVT]] +float16_t test_vcvth_f16_u32 (uint32_t a) { + return vcvth_f16_u32(a); +} + +// CHECK-LABEL: test_vcvth_f16_u64 +// CHECK: [[VCVT:%.*]] = uitofp i64 %a to half +// CHECK: ret half [[VCVT]] +float16_t test_vcvth_f16_u64 (uint64_t a) { + return vcvth_f16_u64(a); +} + +// CHECK-LABEL: test_vcvth_s16_f16 +// CHECK: [[VCVT:%.*]] = fptosi half %a to i16 +// CHECK: ret i16 [[VCVT]] +int16_t test_vcvth_s16_f16 (float16_t a) { + return vcvth_s16_f16(a); +} + +// CHECK-LABEL: test_vcvth_s32_f16 +// CHECK: [[VCVT:%.*]] = fptosi half %a to i32 +// CHECK: ret i32 [[VCVT]] +int32_t test_vcvth_s32_f16 (float16_t a) { + return vcvth_s32_f16(a); +} + +// CHECK-LABEL: test_vcvth_s64_f16 +// CHECK: [[VCVT:%.*]] = fptosi half %a to i64 +// CHECK: ret i64 [[VCVT]] +int64_t test_vcvth_s64_f16 (float16_t a) { + return vcvth_s64_f16(a); +} + +// CHECK-LABEL: test_vcvth_u16_f16 +// CHECK: [[VCVT:%.*]] = fptoui half %a to i16 +// CHECK: ret i16 [[VCVT]] +uint16_t test_vcvth_u16_f16 (float16_t a) { + return vcvth_u16_f16(a); +} + +// CHECK-LABEL: test_vcvth_u32_f16 +// CHECK: [[VCVT:%.*]] = fptoui half %a to i32 +// CHECK: ret i32 [[VCVT]] +uint32_t test_vcvth_u32_f16 (float16_t a) { + return vcvth_u32_f16(a); +} + +// CHECK-LABEL: test_vcvth_u64_f16 +// CHECK: [[VCVT:%.*]] = fptoui half %a to i64 +// CHECK: ret i64 [[VCVT]] +uint64_t test_vcvth_u64_f16 (float16_t a) { + return vcvth_u64_f16(a); +} + +// CHECK-LABEL: test_vcvtah_s16_f16 +// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtas.i16.f16(half %a) +// CHECK: ret i16 [[VCVT]] +int16_t test_vcvtah_s16_f16 (float16_t a) { + return vcvtah_s16_f16(a); +} + +// CHECK-LABEL: test_vcvtah_s32_f16 +// CHECK: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtas.i32.f16(half %a) +// CHECK: ret i32 [[VCVT]] +int32_t test_vcvtah_s32_f16 (float16_t a) { + return vcvtah_s32_f16(a); +} + +// CHECK-LABEL: test_vcvtah_s64_f16 +// CHECK: [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtas.i64.f16(half %a) +// CHECK: ret i64 [[VCVT]] +int64_t test_vcvtah_s64_f16 (float16_t a) { + return vcvtah_s64_f16(a); +} + +// 
CHECK-LABEL: test_vcvtah_u16_f16 +// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtau.i16.f16(half %a) +// CHECK: ret i16 [[VCVT]] +uint16_t test_vcvtah_u16_f16 (float16_t a) { + return vcvtah_u16_f16(a); +} + +// CHECK-LABEL: test_vcvtah_u32_f16 +// CHECK: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtau.i32.f16(half %a) +// CHECK: ret i32 [[VCVT]] +uint32_t test_vcvtah_u32_f16 (float16_t a) { + return vcvtah_u32_f16(a); +} + +// CHECK-LABEL: test_vcvtah_u64_f16 +// CHECK: [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtau.i64.f16(half %a) +// CHECK: ret i64 [[VCVT]] +uint64_t test_vcvtah_u64_f16 (float16_t a) { + return vcvtah_u64_f16(a); +} + +// CHECK-LABEL: test_vcvtmh_s16_f16 +// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtms.i16.f16(half %a) +// CHECK: ret i16 [[VCVT]] +int16_t test_vcvtmh_s16_f16 (float16_t a) { + return vcvtmh_s16_f16(a); +} + +// CHECK-LABEL: test_vcvtmh_s32_f16 +// CHECK: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtms.i32.f16(half %a) +// CHECK: ret i32 [[VCVT]] +int32_t test_vcvtmh_s32_f16 (float16_t a) { + return vcvtmh_s32_f16(a); +} + +// CHECK-LABEL: test_vcvtmh_s64_f16 +// CHECK: [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtms.i64.f16(half %a) +// CHECK: ret i64 [[VCVT]] +int64_t test_vcvtmh_s64_f16 (float16_t a) { + return vcvtmh_s64_f16(a); +} + +// CHECK-LABEL: test_vcvtmh_u16_f16 +// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtmu.i16.f16(half %a) +// CHECK: ret i16 [[VCVT]] +uint16_t test_vcvtmh_u16_f16 (float16_t a) { + return vcvtmh_u16_f16(a); +} + +// CHECK-LABEL: test_vcvtmh_u32_f16 +// CHECK: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtmu.i32.f16(half %a) +// CHECK: ret i32 [[VCVT]] +uint32_t test_vcvtmh_u32_f16 (float16_t a) { + return vcvtmh_u32_f16(a); +} + +// CHECK-LABEL: test_vcvtmh_u64_f16 +// CHECK: [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtmu.i64.f16(half %a) +// CHECK: ret i64 [[VCVT]] +uint64_t test_vcvtmh_u64_f16 (float16_t a) { + return vcvtmh_u64_f16(a); +} + +// CHECK-LABEL: test_vcvtnh_s16_f16 +// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtns.i16.f16(half %a) +// CHECK: ret i16 [[VCVT]] +int16_t test_vcvtnh_s16_f16 (float16_t a) { + return vcvtnh_s16_f16(a); +} + +// CHECK-LABEL: test_vcvtnh_s32_f16 +// CHECK: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtns.i32.f16(half %a) +// CHECK: ret i32 [[VCVT]] +int32_t test_vcvtnh_s32_f16 (float16_t a) { + return vcvtnh_s32_f16(a); +} + +// CHECK-LABEL: test_vcvtnh_s64_f16 +// CHECK: [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtns.i64.f16(half %a) +// CHECK: ret i64 [[VCVT]] +int64_t test_vcvtnh_s64_f16 (float16_t a) { + return vcvtnh_s64_f16(a); +} + +// CHECK-LABEL: test_vcvtnh_u16_f16 +// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtnu.i16.f16(half %a) +// CHECK: ret i16 [[VCVT]] +uint16_t test_vcvtnh_u16_f16 (float16_t a) { + return vcvtnh_u16_f16(a); +} + +// CHECK-LABEL: test_vcvtnh_u32_f16 +// CHECK: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtnu.i32.f16(half %a) +// CHECK: ret i32 [[VCVT]] +uint32_t test_vcvtnh_u32_f16 (float16_t a) { + return vcvtnh_u32_f16(a); +} + +// CHECK-LABEL: test_vcvtnh_u64_f16 +// CHECK: [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtnu.i64.f16(half %a) +// CHECK: ret i64 [[VCVT]] +uint64_t test_vcvtnh_u64_f16 (float16_t a) { + return vcvtnh_u64_f16(a); +} + +// CHECK-LABEL: test_vcvtph_s16_f16 +// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtps.i16.f16(half %a) +// CHECK: ret i16 [[VCVT]] +int16_t test_vcvtph_s16_f16 (float16_t a) { + return vcvtph_s16_f16(a); +} + +// 
CHECK-LABEL: test_vcvtph_s32_f16 +// CHECK: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtps.i32.f16(half %a) +// CHECK: ret i32 [[VCVT]] +int32_t test_vcvtph_s32_f16 (float16_t a) { + return vcvtph_s32_f16(a); +} + +// CHECK-LABEL: test_vcvtph_s64_f16 +// CHECK: [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtps.i64.f16(half %a) +// CHECK: ret i64 [[VCVT]] +int64_t test_vcvtph_s64_f16 (float16_t a) { + return vcvtph_s64_f16(a); +} + +// CHECK-LABEL: test_vcvtph_u16_f16 +// CHECK: [[VCVT:%.*]] = call i16 @llvm.aarch64.neon.fcvtpu.i16.f16(half %a) +// CHECK: ret i16 [[VCVT]] +uint16_t test_vcvtph_u16_f16 (float16_t a) { + return vcvtph_u16_f16(a); +} + +// CHECK-LABEL: test_vcvtph_u32_f16 +// CHECK: [[VCVT:%.*]] = call i32 @llvm.aarch64.neon.fcvtpu.i32.f16(half %a) +// CHECK: ret i32 [[VCVT]] +uint32_t test_vcvtph_u32_f16 (float16_t a) { + return vcvtph_u32_f16(a); +} + +// CHECK-LABEL: test_vcvtph_u64_f16 +// CHECK: [[VCVT:%.*]] = call i64 @llvm.aarch64.neon.fcvtpu.i64.f16(half %a) +// CHECK: ret i64 [[VCVT]] +uint64_t test_vcvtph_u64_f16 (float16_t a) { + return vcvtph_u64_f16(a); +} + +// CHECK-LABEL: test_vnegh_f16 +// CHECK: [[NEG:%.*]] = fsub half 0xH8000, %a +// CHECK: ret half [[NEG]] +float16_t test_vnegh_f16(float16_t a) { + return vnegh_f16(a); +} + +// CHECK-LABEL: test_vrecpeh_f16 +// CHECK: [[VREC:%.*]] = call half @llvm.aarch64.neon.frecpe.f16(half %a) +// CHECK: ret half [[VREC]] +float16_t test_vrecpeh_f16(float16_t a) { + return vrecpeh_f16(a); +} + +// CHECK-LABEL: test_vrecpxh_f16 +// CHECK: [[VREC:%.*]] = call half @llvm.aarch64.neon.frecpx.f16(half %a) +// CHECK: ret half [[VREC]] +float16_t test_vrecpxh_f16(float16_t a) { + return vrecpxh_f16(a); +} + +// CHECK-LABEL: test_vrndh_f16 +// CHECK: [[RND:%.*]] = call half @llvm.trunc.f16(half %a) +// CHECK: ret half [[RND]] +float16_t test_vrndh_f16(float16_t a) { + return vrndh_f16(a); +} + +// CHECK-LABEL: test_vrndah_f16 +// CHECK: [[RND:%.*]] = call half @llvm.round.f16(half %a) +// CHECK: ret half [[RND]] +float16_t test_vrndah_f16(float16_t a) { + return vrndah_f16(a); +} + +// CHECK-LABEL: test_vrndih_f16 +// CHECK: [[RND:%.*]] = call half @llvm.nearbyint.f16(half %a) +// CHECK: ret half [[RND]] +float16_t test_vrndih_f16(float16_t a) { + return vrndih_f16(a); +} + +// CHECK-LABEL: test_vrndmh_f16 +// CHECK: [[RND:%.*]] = call half @llvm.floor.f16(half %a) +// CHECK: ret half [[RND]] +float16_t test_vrndmh_f16(float16_t a) { + return vrndmh_f16(a); +} + +// CHECK-LABEL: test_vrndnh_f16 +// CHECK: [[RND:%.*]] = call half @llvm.aarch64.neon.frintn.f16(half %a) +// CHECK: ret half [[RND]] +float16_t test_vrndnh_f16(float16_t a) { + return vrndnh_f16(a); +} + +// CHECK-LABEL: test_vrndph_f16 +// CHECK: [[RND:%.*]] = call half @llvm.ceil.f16(half %a) +// CHECK: ret half [[RND]] +float16_t test_vrndph_f16(float16_t a) { + return vrndph_f16(a); +} + +// CHECK-LABEL: test_vrndxh_f16 +// CHECK: [[RND:%.*]] = call half @llvm.rint.f16(half %a) +// CHECK: ret half [[RND]] +float16_t test_vrndxh_f16(float16_t a) { + return vrndxh_f16(a); +} + +// CHECK-LABEL: test_vrsqrteh_f16 +// CHECK: [[RND:%.*]] = call half @llvm.aarch64.neon.frsqrte.f16(half %a) +// CHECK: ret half [[RND]] +float16_t test_vrsqrteh_f16(float16_t a) { + return vrsqrteh_f16(a); +} + +// CHECK-LABEL: test_vsqrth_f16 +// CHECK: [[SQR:%.*]] = call half @llvm.sqrt.f16(half %a) +// CHECK: ret half [[SQR]] +float16_t test_vsqrth_f16(float16_t a) { + return vsqrth_f16(a); +} + +// CHECK-LABEL: test_vaddh_f16 +// CHECK: [[ADD:%.*]] = fadd half %a, %b +// CHECK: ret 
half [[ADD]] +float16_t test_vaddh_f16(float16_t a, float16_t b) { + return vaddh_f16(a, b); +} + +// CHECK-LABEL: test_vabdh_f16 +// CHECK: [[ABD:%.*]] = call half @llvm.aarch64.sisd.fabd.f16(half %a, half %b) +// CHECK: ret half [[ABD]] +float16_t test_vabdh_f16(float16_t a, float16_t b) { + return vabdh_f16(a, b); +} + +// CHECK-LABEL: test_vcageh_f16 +// CHECK: [[ABS:%.*]] = call i16 @llvm.aarch64.neon.facge.i16.f16(half %a, half %b) +// CHECK: ret i16 [[ABS]] +uint16_t test_vcageh_f16(float16_t a, float16_t b) { + return vcageh_f16(a, b); +} + +// CHECK-LABEL: test_vcagth_f16 +// CHECK: [[ABS:%.*]] = call i16 @llvm.aarch64.neon.facgt.i16.f16(half %a, half %b) +// CHECK: ret i16 [[ABS]] +uint16_t test_vcagth_f16(float16_t a, float16_t b) { + return vcagth_f16(a, b); +} + +// CHECK-LABEL: test_vcaleh_f16 +// CHECK: [[ABS:%.*]] = call i16 @llvm.aarch64.neon.facge.i16.f16(half %a, half %b) +// CHECK: ret i16 [[ABS]] +uint16_t test_vcaleh_f16(float16_t a, float16_t b) { + return vcaleh_f16(a, b); +} + +// CHECK-LABEL: test_vcalth_f16 +// CHECK: [[ABS:%.*]] = call i16 @llvm.aarch64.neon.facgt.i16.f16(half %a, half %b) +// CHECK: ret i16 [[ABS]] +uint16_t test_vcalth_f16(float16_t a, float16_t b) { + return vcalth_f16(a, b); +} + +// CHECK-LABEL: test_vceqh_f16 +// CHECK: [[TMP1:%.*]] = fcmp oeq half %a, %b +// CHECK: [[TMP2:%.*]] = sext i1 [[TMP1]] to i16 +// CHECK: ret i16 [[TMP2]] +uint16_t test_vceqh_f16(float16_t a, float16_t b) { + return vceqh_f16(a, b); +} + +// CHECK-LABEL: test_vcgeh_f16 +// CHECK: [[TMP1:%.*]] = fcmp oge half %a, %b +// CHECK: [[TMP2:%.*]] = sext i1 [[TMP1]] to i16 +// CHECK: ret i16 [[TMP2]] +uint16_t test_vcgeh_f16(float16_t a, float16_t b) { + return vcgeh_f16(a, b); +} + +// CHECK-LABEL: test_vcgth_f16 +// CHECK: [[TMP1:%.*]] = fcmp ogt half %a, %b +// CHECK: [[TMP2:%.*]] = sext i1 [[TMP1]] to i16 +// CHECK: ret i16 [[TMP2]] +uint16_t test_vcgth_f16(float16_t a, float16_t b) { + return vcgth_f16(a, b); +} + +// CHECK-LABEL: test_vcleh_f16 +// CHECK: [[TMP1:%.*]] = fcmp ole half %a, %b +// CHECK: [[TMP2:%.*]] = sext i1 [[TMP1]] to i16 +// CHECK: ret i16 [[TMP2]] +uint16_t test_vcleh_f16(float16_t a, float16_t b) { + return vcleh_f16(a, b); +} + +// CHECK-LABEL: test_vclth_f16 +// CHECK: [[TMP1:%.*]] = fcmp olt half %a, %b +// CHECK: [[TMP2:%.*]] = sext i1 [[TMP1]] to i16 +// CHECK: ret i16 [[TMP2]] +uint16_t test_vclth_f16(float16_t a, float16_t b) { + return vclth_f16(a, b); +} + +// CHECK-LABEL: test_vcvth_n_f16_s16 +// CHECK: [[CVT:%.*]] = call half @llvm.aarch64.neon.vcvtfxs2fp.f16.i16(i16 %a, i32 0) +// CHECK: ret half [[CVT]] +float16_t test_vcvth_n_f16_s16(int16_t a) { + return vcvth_n_f16_s16(a, 0); +} + +// CHECK-LABEL: test_vcvth_n_f16_s32 +// CHECK: [[CVT:%.*]] = call half @llvm.aarch64.neon.vcvtfxs2fp.f16.i32(i32 %a, i32 0) +// CHECK: ret half [[CVT]] +float16_t test_vcvth_n_f16_s32(int32_t a) { + return vcvth_n_f16_s32(a, 0); +} + +// CHECK-LABEL: test_vcvth_n_f16_s64 +// CHECK: [[CVT:%.*]] = call half @llvm.aarch64.neon.vcvtfxs2fp.f16.i64(i64 %a, i32 0) +// CHECK: ret half [[CVT]] +float16_t test_vcvth_n_f16_s64(int64_t a) { + return vcvth_n_f16_s64(a, 0); +} + +// CHECK-LABEL: test_vcvth_n_s16_f16 +// CHECK: [[CVT:%.*]] = call i16 @llvm.aarch64.neon.vcvtfp2fxs.i16.f16(half %a, i32 0) +// CHECK: ret i16 [[CVT]] +int16_t test_vcvth_n_s16_f16(float16_t a) { + return vcvth_n_s16_f16(a, 0); +} + +// CHECK-LABEL: test_vcvth_n_s32_f16 +// CHECK: [[CVT:%.*]] = call i32 @llvm.aarch64.neon.vcvtfp2fxs.i32.f16(half %a, i32 0) +// CHECK: ret i32 [[CVT]] 
+int32_t test_vcvth_n_s32_f16(float16_t a) { + return vcvth_n_s32_f16(a, 0); +} + +// CHECK-LABEL: test_vcvth_n_s64_f16 +// CHECK: [[CVT:%.*]] = call i64 @llvm.aarch64.neon.vcvtfp2fxs.i64.f16(half %a, i32 0) +// CHECK: ret i64 [[CVT]] +int64_t test_vcvth_n_s64_f16(float16_t a) { + return vcvth_n_s64_f16(a, 0); +} + +// CHECK-LABEL: test_vcvth_n_f16_u16 +// CHECK: [[CVT:%.*]] = call half @llvm.aarch64.neon.vcvtfxu2fp.f16.i16(i16 %a, i32 0) +// CHECK: ret half [[CVT]] +float16_t test_vcvth_n_f16_u16(uint16_t a) { + return vcvth_n_f16_u16(a, 0); +} + +// CHECK-LABEL: test_vcvth_n_f16_u32 +// CHECK: [[CVT:%.*]] = call half @llvm.aarch64.neon.vcvtfxu2fp.f16.i32(i32 %a, i32 0) +// CHECK: ret half [[CVT]] +float16_t test_vcvth_n_f16_u32(uint32_t a) { + return vcvth_n_f16_u32(a, 0); +} + +// CHECK-LABEL: test_vcvth_n_f16_u64 +// CHECK: [[CVT:%.*]] = call half @llvm.aarch64.neon.vcvtfxu2fp.f16.i64(i64 %a, i32 0) +// CHECK: ret half [[CVT]] +float16_t test_vcvth_n_f16_u64(uint64_t a) { + return vcvth_n_f16_u64(a, 0); +} + +// CHECK-LABEL: test_vcvth_n_u16_f16 +// CHECK: [[CVT:%.*]] = call i16 @llvm.aarch64.neon.vcvtfp2fxu.i16.f16(half %a, i32 0) +// CHECK: ret i16 [[CVT]] +uint16_t test_vcvth_n_u16_f16(float16_t a) { + return vcvth_n_u16_f16(a, 0); +} + +// CHECK-LABEL: test_vcvth_n_u32_f16 +// CHECK: [[CVT:%.*]] = call i32 @llvm.aarch64.neon.vcvtfp2fxu.i32.f16(half %a, i32 0) +// CHECK: ret i32 [[CVT]] +uint32_t test_vcvth_n_u32_f16(float16_t a) { + return vcvth_n_u32_f16(a, 0); +} + +// CHECK-LABEL: test_vcvth_n_u64_f16 +// CHECK: [[CVT:%.*]] = call i64 @llvm.aarch64.neon.vcvtfp2fxu.i64.f16(half %a, i32 0) +// CHECK: ret i64 [[CVT]] +uint64_t test_vcvth_n_u64_f16(float16_t a) { + return vcvth_n_u64_f16(a, 0); +} + +// CHECK-LABEL: test_vdivh_f16 +// CHECK: [[DIV:%.*]] = fdiv half %a, %b +// CHECK: ret half [[DIV]] +float16_t test_vdivh_f16(float16_t a, float16_t b) { + return vdivh_f16(a, b); +} + +// CHECK-LABEL: test_vmaxh_f16 +// CHECK: [[MAX:%.*]] = call half @llvm.aarch64.neon.fmax.f16(half %a, half %b) +// CHECK: ret half [[MAX]] +float16_t test_vmaxh_f16(float16_t a, float16_t b) { + return vmaxh_f16(a, b); +} + +// CHECK-LABEL: test_vmaxnmh_f16 +// CHECK: [[MAX:%.*]] = call half @llvm.aarch64.neon.fmaxnm.f16(half %a, half %b) +// CHECK: ret half [[MAX]] +float16_t test_vmaxnmh_f16(float16_t a, float16_t b) { + return vmaxnmh_f16(a, b); +} + +// CHECK-LABEL: test_vminh_f16 +// CHECK: [[MIN:%.*]] = call half @llvm.aarch64.neon.fmin.f16(half %a, half %b) +// CHECK: ret half [[MIN]] +float16_t test_vminh_f16(float16_t a, float16_t b) { + return vminh_f16(a, b); +} + +// CHECK-LABEL: test_vminnmh_f16 +// CHECK: [[MIN:%.*]] = call half @llvm.aarch64.neon.fminnm.f16(half %a, half %b) +// CHECK: ret half [[MIN]] +float16_t test_vminnmh_f16(float16_t a, float16_t b) { + return vminnmh_f16(a, b); +} + +// CHECK-LABEL: test_vmulh_f16 +// CHECK: [[MUL:%.*]] = fmul half %a, %b +// CHECK: ret half [[MUL]] +float16_t test_vmulh_f16(float16_t a, float16_t b) { + return vmulh_f16(a, b); +} + +// CHECK-LABEL: test_vmulxh_f16 +// CHECK: [[MUL:%.*]] = call half @llvm.aarch64.neon.fmulx.f16(half %a, half %b) +// CHECK: ret half [[MUL]] +float16_t test_vmulxh_f16(float16_t a, float16_t b) { + return vmulxh_f16(a, b); +} + +// CHECK-LABEL: test_vrecpsh_f16 +// CHECK: [[RECPS:%.*]] = call half @llvm.aarch64.neon.frecps.f16(half %a, half %b) +// CHECK: ret half [[RECPS]] +float16_t test_vrecpsh_f16(float16_t a, float16_t b) { + return vrecpsh_f16(a, b); +} + +// CHECK-LABEL: test_vrsqrtsh_f16 +// CHECK: [[RSQRTS:%.*]] = 
call half @llvm.aarch64.neon.frsqrts.f16(half %a, half %b) +// CHECK: ret half [[RSQRTS]] +float16_t test_vrsqrtsh_f16(float16_t a, float16_t b) { + return vrsqrtsh_f16(a, b); +} + +// CHECK-LABEL: test_vsubh_f16 +// CHECK: [[SUB:%.*]] = fsub half %a, %b +// CHECK: ret half [[SUB]] +float16_t test_vsubh_f16(float16_t a, float16_t b) { + return vsubh_f16(a, b); +} + +// CHECK-LABEL: test_vfmah_f16 +// CHECK: [[FMA:%.*]] = call half @llvm.fma.f16(half %b, half %c, half %a) +// CHECK: ret half [[FMA]] +float16_t test_vfmah_f16(float16_t a, float16_t b, float16_t c) { + return vfmah_f16(a, b, c); +} + +// CHECK-LABEL: test_vfmsh_f16 +// CHECK: [[SUB:%.*]] = fsub half 0xH8000, %b +// CHECK: [[ADD:%.*]] = call half @llvm.fma.f16(half [[SUB]], half %c, half %a) +// CHECK: ret half [[ADD]] +float16_t test_vfmsh_f16(float16_t a, float16_t b, float16_t c) { + return vfmsh_f16(a, b, c); +} + Index: clang/utils/TableGen/NeonEmitter.cpp =================================================================== --- clang/utils/TableGen/NeonEmitter.cpp +++ clang/utils/TableGen/NeonEmitter.cpp @@ -552,7 +552,11 @@ // run - Emit arm_neon.h.inc void run(raw_ostream &o); + // runFP16 - Emit arm_fp16.h.inc + void runFP16(raw_ostream &o); + // runHeader - Emit all the __builtin prototypes used in arm_neon.h + // and arm_fp16.h void runHeader(raw_ostream &o); // runTests - Emit tests for all the Neon intrinsics. @@ -852,6 +856,35 @@ NumVectors = 0; Float = true; break; + case 'Y': + Bitwidth = ElementBitwidth = 16; + NumVectors = 0; + Float = true; + break; + case 'I': + Bitwidth = ElementBitwidth = 32; + NumVectors = 0; + Float = false; + Signed = true; + break; + case 'L': + Bitwidth = ElementBitwidth = 64; + NumVectors = 0; + Float = false; + Signed = true; + break; + case 'U': + Bitwidth = ElementBitwidth = 32; + NumVectors = 0; + Float = false; + Signed = false; + break; + case 'O': + Bitwidth = ElementBitwidth = 64; + NumVectors = 0; + Float = false; + Signed = false; + break; case 'f': Float = true; ElementBitwidth = 32; @@ -1010,7 +1043,7 @@ } static bool isFloatingPointProtoModifier(char Mod) { - return Mod == 'F' || Mod == 'f' || Mod == 'H'; + return Mod == 'F' || Mod == 'f' || Mod == 'H' || Mod == 'Y' || Mod == 'I'; } std::string Intrinsic::getBuiltinTypeStr() { @@ -2420,12 +2453,125 @@ OS << "#endif /* __ARM_NEON_H */\n"; } +/// run - Read the records in arm_fp16.td and output arm_fp16.h. arm_fp16.h +/// is comprised of type definitions and function declarations. 
+void NeonEmitter::runFP16(raw_ostream &OS) { + OS << "/*===---- arm_fp16.h - ARM FP16 intrinsics " + "------------------------------" + "---===\n" + " *\n" + " * Permission is hereby granted, free of charge, to any person " + "obtaining a copy\n" + " * of this software and associated documentation files (the " + "\"Software\"), to deal\n" + " * in the Software without restriction, including without limitation " + "the rights\n" + " * to use, copy, modify, merge, publish, distribute, sublicense, " + "and/or sell\n" + " * copies of the Software, and to permit persons to whom the Software " + "is\n" + " * furnished to do so, subject to the following conditions:\n" + " *\n" + " * The above copyright notice and this permission notice shall be " + "included in\n" + " * all copies or substantial portions of the Software.\n" + " *\n" + " * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, " + "EXPRESS OR\n" + " * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF " + "MERCHANTABILITY,\n" + " * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT " + "SHALL THE\n" + " * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR " + "OTHER\n" + " * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, " + "ARISING FROM,\n" + " * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER " + "DEALINGS IN\n" + " * THE SOFTWARE.\n" + " *\n" + " *===-----------------------------------------------------------------" + "---" + "---===\n" + " */\n\n"; + + OS << "#ifndef __ARM_FP16_H\n"; + OS << "#define __ARM_FP16_H\n\n"; + + OS << "#include <stdint.h>\n\n"; + + OS << "typedef __fp16 float16_t;\n"; + + OS << "#define __ai static inline __attribute__((__always_inline__, " + "__nodebug__))\n\n"; + + SmallVector<Intrinsic *, 128> Defs; + std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst"); + for (auto *R : RV) + createIntrinsic(R, Defs); + + for (auto *I : Defs) + I->indexBody(); + + std::stable_sort( + Defs.begin(), Defs.end(), + [](const Intrinsic *A, const Intrinsic *B) { return *A < *B; }); + + // Only emit a def when its requirements have been met. + // FIXME: This loop could be made faster, but it's fast enough for now. + bool MadeProgress = true; + std::string InGuard; + while (!Defs.empty() && MadeProgress) { + MadeProgress = false; + + for (SmallVector<Intrinsic *, 128>::iterator I = Defs.begin(); + I != Defs.end(); /*No step*/) { + bool DependenciesSatisfied = true; + for (auto *II : (*I)->getDependencies()) { + if (std::find(Defs.begin(), Defs.end(), II) != Defs.end()) + DependenciesSatisfied = false; + } + if (!DependenciesSatisfied) { + // Try the next one. + ++I; + continue; + } + + // Emit #endif/#if pair if needed. + if ((*I)->getGuard() != InGuard) { + if (!InGuard.empty()) + OS << "#endif\n"; + InGuard = (*I)->getGuard(); + if (!InGuard.empty()) + OS << "#if " << InGuard << "\n"; + } + + // Actually generate the intrinsic code. 
+ OS << (*I)->generate(); + + MadeProgress = true; + I = Defs.erase(I); + } + } + assert(Defs.empty() && "Some requirements were not satisfied!"); + if (!InGuard.empty()) + OS << "#endif\n"; + + OS << "\n"; + OS << "#undef __ai\n\n"; + OS << "#endif /* __ARM_FP16_H */\n"; +} + namespace clang { void EmitNeon(RecordKeeper &Records, raw_ostream &OS) { NeonEmitter(Records).run(OS); } +void EmitFP16(RecordKeeper &Records, raw_ostream &OS) { + NeonEmitter(Records).runFP16(OS); +} + void EmitNeonSema(RecordKeeper &Records, raw_ostream &OS) { NeonEmitter(Records).runHeader(OS); } Index: clang/utils/TableGen/TableGen.cpp =================================================================== --- clang/utils/TableGen/TableGen.cpp +++ clang/utils/TableGen/TableGen.cpp @@ -52,6 +52,7 @@ GenClangCommentCommandInfo, GenClangCommentCommandList, GenArmNeon, + GenArmFP16, GenArmNeonSema, GenArmNeonTest, GenAttrDocs, @@ -139,6 +140,7 @@ "Generate list of commands that are used in " "documentation comments"), clEnumValN(GenArmNeon, "gen-arm-neon", "Generate arm_neon.h for clang"), + clEnumValN(GenArmFP16, "gen-arm-fp16", "Generate arm_fp16.h for clang"), clEnumValN(GenArmNeonSema, "gen-arm-neon-sema", "Generate ARM NEON sema support for clang"), clEnumValN(GenArmNeonTest, "gen-arm-neon-test", @@ -250,6 +252,9 @@ case GenArmNeon: EmitNeon(Records, OS); break; + case GenArmFP16: + EmitFP16(Records, OS); + break; case GenArmNeonSema: EmitNeonSema(Records, OS); break; Index: clang/utils/TableGen/TableGenBackends.h =================================================================== --- clang/utils/TableGen/TableGenBackends.h +++ clang/utils/TableGen/TableGenBackends.h @@ -65,6 +65,7 @@ void EmitClangCommentCommandList(RecordKeeper &Records, raw_ostream &OS); void EmitNeon(RecordKeeper &Records, raw_ostream &OS); +void EmitFP16(RecordKeeper &Records, raw_ostream &OS); void EmitNeonSema(RecordKeeper &Records, raw_ostream &OS); void EmitNeonTest(RecordKeeper &Records, raw_ostream &OS); void EmitNeon2(RecordKeeper &Records, raw_ostream &OS); Index: llvm/include/llvm/IR/IntrinsicsAArch64.td =================================================================== --- llvm/include/llvm/IR/IntrinsicsAArch64.td +++ llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -146,6 +146,9 @@ class AdvSIMD_CvtFPToFx_Intrinsic : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, llvm_i32_ty], [IntrNoMem]>; + + class AdvSIMD_1Arg_Intrinsic + : Intrinsic<[llvm_any_ty], [LLVMMatchType<0>], [IntrNoMem]>; } // Arithmetic ops @@ -244,7 +247,7 @@ // Vector Max def int_aarch64_neon_smax : AdvSIMD_2VectorArg_Intrinsic; def int_aarch64_neon_umax : AdvSIMD_2VectorArg_Intrinsic; - def int_aarch64_neon_fmax : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_fmax : AdvSIMD_2FloatArg_Intrinsic; def int_aarch64_neon_fmaxnmp : AdvSIMD_2VectorArg_Intrinsic; // Vector Max Across Lanes @@ -256,7 +259,7 @@ // Vector Min def int_aarch64_neon_smin : AdvSIMD_2VectorArg_Intrinsic; def int_aarch64_neon_umin : AdvSIMD_2VectorArg_Intrinsic; - def int_aarch64_neon_fmin : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_fmin : AdvSIMD_2FloatArg_Intrinsic; def int_aarch64_neon_fminnmp : AdvSIMD_2VectorArg_Intrinsic; // Vector Min/Max Number @@ -354,7 +357,7 @@ def int_aarch64_neon_sqxtun : AdvSIMD_1VectorArg_Narrow_Intrinsic; // Vector Absolute Value - def int_aarch64_neon_abs : AdvSIMD_1IntArg_Intrinsic; + def int_aarch64_neon_abs : AdvSIMD_1Arg_Intrinsic; // Vector Saturating Absolute Value def int_aarch64_neon_sqabs : AdvSIMD_1IntArg_Intrinsic;
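
For reference, a minimal user-level sketch of what this patch exposes: code built for an AArch64 target with the FP16 scalar extension enabled (for example with -march=armv8.2-a+fp16; the exact driver flags are outside the scope of this patch) can include the generated arm_fp16.h and call the scalar intrinsics directly. The helper names below (fused_scale, to_int) are illustrative only.

#include <arm_fp16.h>

// a + b * c on scalar half-precision values; expands to a call to
// @llvm.fma.f16, as checked by test_vfmah_f16 above.
float16_t fused_scale(float16_t acc, float16_t x, float16_t y) {
  return vfmah_f16(acc, x, y);
}

// Half-precision to signed 32-bit conversion (fptosi, truncation toward
// zero), as checked by test_vcvth_s32_f16 above.
int32_t to_int(float16_t v) {
  return vcvth_s32_f16(v);
}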