diff --git a/clang/include/clang/Basic/BuiltinsARM.def b/clang/include/clang/Basic/BuiltinsARM.def
--- a/clang/include/clang/Basic/BuiltinsARM.def
+++ b/clang/include/clang/Basic/BuiltinsARM.def
@@ -189,6 +189,11 @@
 BUILTIN(__builtin_arm_wsr64, "vcC*LLUi", "nc")
 BUILTIN(__builtin_arm_wsrp, "vcC*vC*", "nc")
 
+// Builtins for implementing ACLE MVE intrinsics. (Unlike NEON, these
+// don't need to live in a separate BuiltinsMVE.def, because they
+// aren't included from both here and BuiltinsAArch64.def.)
+#include "clang/Basic/arm_mve_builtins.inc"
+
 // MSVC
 LANGBUILTIN(__emit, "vIUiC", "", ALL_MS_LANGUAGES)
diff --git a/clang/include/clang/Basic/CMakeLists.txt b/clang/include/clang/Basic/CMakeLists.txt
--- a/clang/include/clang/Basic/CMakeLists.txt
+++ b/clang/include/clang/Basic/CMakeLists.txt
@@ -41,10 +41,22 @@
   TARGET ClangAttrHasAttributeImpl
   )
 
-# ARM NEON
+# ARM NEON and MVE
 clang_tablegen(arm_neon.inc -gen-arm-neon-sema
   SOURCE arm_neon.td
   TARGET ClangARMNeon)
 clang_tablegen(arm_fp16.inc -gen-arm-neon-sema
   SOURCE arm_fp16.td
   TARGET ClangARMFP16)
+clang_tablegen(arm_mve_builtins.inc -gen-arm-mve-builtin-def
+  SOURCE arm_mve.td
+  TARGET ClangARMMveBuiltinsDef)
+clang_tablegen(arm_mve_builtin_cg.inc -gen-arm-mve-builtin-codegen
+  SOURCE arm_mve.td
+  TARGET ClangARMMveBuiltinCG)
+clang_tablegen(arm_mve_builtin_sema.inc -gen-arm-mve-builtin-sema
+  SOURCE arm_mve.td
+  TARGET ClangARMMveBuiltinSema)
+clang_tablegen(arm_mve_builtin_aliases.inc -gen-arm-mve-builtin-aliases
+  SOURCE arm_mve.td
+  TARGET ClangARMMveBuiltinAliases)
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -8526,6 +8526,12 @@
   InGroup<DiagGroup<"argument-outside-range">>;
 def err_argument_not_multiple : Error<
   "argument should be a multiple of %0">;
+def err_argument_not_power_of_2 : Error<
+  "argument should be a power of 2">;
+def err_argument_not_shifted_byte : Error<
+  "argument should be an 8-bit value shifted by a multiple of 8 bits">;
+def err_argument_not_shifted_byte_or_xxff : Error<
+  "argument should be an 8-bit value shifted by a multiple of 8 bits, or in the form 0x??FF">;
 def warn_neon_vector_initializer_non_portable : Warning<
   "vector initializers are not compatible with NEON intrinsics in big endian "
   "mode">, InGroup<DiagGroup<"nonportable-vector-initialization">>;
diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td
new file
--- /dev/null
+++ b/clang/include/clang/Basic/arm_mve.td
@@ -0,0 +1,121 @@
+//===- arm_mve.td - ACLE intrinsic functions for MVE architecture ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the set of ACLE-specified source-level intrinsic
+// functions wrapping the MVE vector instruction set and scalar shift
+// operations.
+//
+// Refer to comments in arm_mve_defs.td for the infrastructure used in
+// here, and to MveEmitter.cpp for how those are used in turn to
+// generate code.
+//
+//===----------------------------------------------------------------------===//
+
+include "arm_mve_defs.td"
+
+let params = T.All in
+foreach n = [ 2, 4 ] in {
+  def "vst"#n#"q": Intrinsic<Void, (args Ptr<Scalar>, MultiVector<n>),
+                             (CustomCodegen<"VST24"> n:$NumVectors,
+                              "Intrinsic::arm_mve_vst"#n#"q":$IRIntr)>;
+  def "vld"#n#"q": Intrinsic<MultiVector<n>, (args CPtr<Scalar>),
+                             (CustomCodegen<"VLD24"> n:$NumVectors,
+                              "Intrinsic::arm_mve_vld"#n#"q":$IRIntr)>;
+}
+
+let params = T.Int in {
+def vaddq: Intrinsic<Vector, (args Vector:$a, Vector:$b), (add $a, $b)>;
+def vsubq: Intrinsic<Vector, (args Vector:$a, Vector:$b), (sub $a, $b)>;
+}
+
+let params = T.Float in {
+def vaddqf: Intrinsic<Vector, (args Vector:$a, Vector:$b), (fadd $a, $b)>,
+            NameOverride<"vaddq">;
+def vsubqf: Intrinsic<Vector, (args Vector:$a, Vector:$b), (fsub $a, $b)>,
+            NameOverride<"vsubq">;
+}
+
+let params = T.Usual in {
+def vaddq_m: Intrinsic<
+    Vector, (args Vector:$inactive, Vector:$a, Vector:$b, Predicate:$pred),
+    (IRInt<"add_predicated", [Vector, Predicate]> $a, $b, $pred, $inactive)>;
+def vsubq_m: Intrinsic<
+    Vector, (args Vector:$inactive, Vector:$a, Vector:$b, Predicate:$pred),
+    (IRInt<"sub_predicated", [Vector, Predicate]> $a, $b, $pred, $inactive)>;
+}
+
+let params = T.Int in {
+def vminvq: Intrinsic<Scalar, (args Scalar:$prev, Vector:$vec),
+    (Scalar (IRInt<"minv", [Vector], 1> $prev, $vec))>;
+def vmaxvq: Intrinsic<Scalar, (args Scalar:$prev, Vector:$vec),
+    (Scalar (IRInt<"maxv", [Vector], 1> $prev, $vec))>;
+}
+
+foreach half = [ "b", "t" ] in
+foreach halfconst = [ !if(!eq(half, "b"), 0, 1) ] in {
+
+let params = [f32], pnt = PNT_None in {
+
+def vcvt#half#q_f16: Intrinsic<
+    VecOf<f16>, (args VecOf<f16>:$inactive, Vector:$a),
+    (IRInt<"fltnarrow"> $inactive, $a, halfconst)>;
+def vcvt#half#q_m_f16: Intrinsic<
+    VecOf<f16>, (args VecOf<f16>:$inactive, Vector:$a, PredOf<f32>:$pred),
+    (IRInt<"fltnarrow_predicated"> $inactive, $a, halfconst, $pred)>;
+
+} // params = [f32], pnt = PNT_None
+
+} // loop over half = "b", "t"
+
+let params = T.All32, pnt = PNT_None in
+def vldrwq_gather_base_wb: Intrinsic<
+  Vector, (args Ptr<VecOf<Unsigned<Scalar>>>:$addr, imm_mem7bit<4>:$offset),
+  (seq (IRInt<"vldr_gather_base_wb", [Vector, VecOf<Unsigned<Scalar>>]>
+           (load $addr), $offset):$pair,
+       (store (xval $pair, 1), $addr),
+       (xval $pair, 0))>;
+
+let params = T.All64, pnt = PNT_None in
+def vldrdq_gather_base_wb_z: Intrinsic<
+  Vector, (args Ptr<VecOf<Unsigned<Scalar>>>:$addr, imm_mem7bit<8>:$offset,
+          Predicate:$pred),
+  (seq (IRInt<"vldr_gather_base_wb_predicated",
+              [Vector, VecOf<Unsigned<Scalar>>, Predicate]>
+           (load $addr), $offset, $pred):$pair,
+       (store (xval $pair, 1), $addr),
+       (xval $pair, 0))>;
+
+let params = [Void], pnt = PNT_None in
+def urshrl: Intrinsic<u64, (args u64:$value, imm_1to32:$shift),
+    (seq (u32 (lshr $value, (u64 32))):$hi, (u32 $value):$lo,
+         (IRInt<"urshrl"> $lo, $hi, $shift):$pair,
+         (or (shl (u64 (xval $pair, 1)), (u64 32)),
+             (u64 (xval $pair, 0))))>;
+
+let params = T.Int32 in {
+def vadcq: Intrinsic<Vector, (args Vector:$a, Vector:$b, Ptr<u32>:$carry),
+    (seq (IRInt<"vadc", [Vector]> $a, $b, (shl (load $carry), 29)):$pair,
+         (store (and 1, (lshr (xval $pair, 1), 29)), $carry),
+         (xval $pair, 0))>;
+def vadciq: Intrinsic<Vector, (args Vector:$a, Vector:$b, Ptr<u32>:$carry),
+    (seq (IRInt<"vadc", [Vector]> $a, $b, 0):$pair,
+         (store (and 1, (lshr (xval $pair, 1), 29)), $carry),
+         (xval $pair, 0))>;
+def vadcq_m: Intrinsic<
+    Vector, (args Vector:$inactive, Vector:$a, Vector:$b, Ptr<u32>:$carry,
+            Predicate:$pred),
+    (seq (IRInt<"vadc_predicated", [Vector, Predicate]> $inactive, $a, $b,
+             (shl (load $carry), 29), $pred):$pair,
+         (store (and 1, (lshr (xval $pair, 1), 29)), $carry),
+         (xval $pair, 0))>;
+def vadciq_m: Intrinsic<
+    Vector, (args Vector:$inactive, Vector:$a, Vector:$b, Ptr<u32>:$carry,
+            Predicate:$pred),
+    (seq (IRInt<"vadc_predicated", [Vector, Predicate]> $inactive, $a, $b,
+             0, $pred):$pair,
+         (store (and 1, (lshr (xval $pair, 1), 29)), $carry),
+         (xval $pair, 0))>;
+}
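A usage sketch of the vadcq/vadciq definitions above: the carry travels through bit 29 of the status word the IR intrinsic returns (mirroring FPSCR.C), and the intrinsics expose it to C through a pointer. The fragment below is illustrative only, not part of the patch; it assumes just the vadciq_u32/vadcq_u32 intrinsics defined here, and the helper name add256 is invented.

    #include <arm_mve.h>

    /* Add two 256-bit integers held as two uint32x4_t chunks each,
     * least-significant chunk first. VADC chains the carry across the
     * four 32-bit lanes, so each vector behaves as one 128-bit limb;
     * the carry word then chains the limbs together. */
    void add256(const uint32x4_t a[2], const uint32x4_t b[2], uint32x4_t sum[2])
    {
        unsigned carry;
        sum[0] = vadciq_u32(a[0], b[0], &carry); /* carry-in fixed at 0 */
        sum[1] = vadcq_u32(a[1], b[1], &carry);  /* consumes carry from limb 0 */
    }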
diff --git a/clang/include/clang/Basic/arm_mve_defs.td b/clang/include/clang/Basic/arm_mve_defs.td
new file
--- /dev/null
+++ b/clang/include/clang/Basic/arm_mve_defs.td
@@ -0,0 +1,325 @@
+//===- arm_mve_defs.td - definitions and infrastructure for arm_mve.td ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The definitions in this file are designed to work in close conjunction with
+// clang/utils/TableGen/MveEmitter.cpp. Comments in there will probably be
+// useful as well.
+//
+//===----------------------------------------------------------------------===//
+
+// -----------------------------------------------------------------------------
+// Forward declarations.
+class Type;
+
+// -----------------------------------------------------------------------------
+// Dummy record used as the dag operator for the argument list of an intrinsic.
+//
+// We store arguments as a dag rather than a list so that we can give
+// each one a name, to be used in codegen. For example, (args Vector:$a,
+// Scalar:$b) defines the names $a and $b which the specification of the code
+// for that intrinsic can refer to.
+
+def args;
+
+// -----------------------------------------------------------------------------
+// Family of nodes for use in the codegen dag for an intrinsic, corresponding
+// roughly to operations in LLVM IR. More precisely, they correspond to calls
+// to methods of llvm::IRBuilder.
+class IRBuilder<string func_> {
+  string func = func_;                // the method name
+  list<int> address_params = [];      // indices of parameters with type Address
+  list<int> int_constant_params = []; // indices of plain integer parameters
+}
+def add: IRBuilder<"CreateAdd">;
+def or: IRBuilder<"CreateOr">;
+def and: IRBuilder<"CreateAnd">;
+def sub: IRBuilder<"CreateSub">;
+def shl: IRBuilder<"CreateShl">;
+def lshr: IRBuilder<"CreateLShr">;
+def fadd: IRBuilder<"CreateFAdd">;
+def fsub: IRBuilder<"CreateFSub">;
+def load: IRBuilder<"CreateLoad"> { let address_params = [0]; }
+def store: IRBuilder<"CreateStore"> { let address_params = [1]; }
+def xval: IRBuilder<"CreateExtractValue"> { let int_constant_params = [1]; }
+
+// Another node class you can use in the codegen dag. This one corresponds to
+// an IR intrinsic function, which has to be specialized to a particular list
+// of types.
+class IRInt<string name_, list<Type> params_ = [], bit appendKind_ = 0> {
+  string intname = name_;      // base name of the intrinsic, minus "arm_mve_"
+  list<Type> params = params_; // list of parameter types
+
+  // If this flag is set, then the IR intrinsic name will get a suffix _s, _u
+  // or _f depending on whether the main parameter type of the ACLE intrinsic
+  // being generated is a signed integer, unsigned integer, or float. Mostly
+  // this is useful for signed vs unsigned integers, because the ACLE
+  // intrinsics and the source-level integer types distinguish them, but at IR
+  // level the distinction has moved from the type system into the operations
+  // and you just have i32 or i16 etc. So when an IR intrinsic has to vary with
+  // signedness, you set this bit, and then you can still put the signed and
+  // unsigned versions in the same subclass of Intrinsic, and the Tablegen
+  // backend will take care of adding _s or _u as appropriate in each instance.
+  bit appendKind = appendKind_;
+}
+
+// The 'seq' node in a codegen dag specifies a set of IR operations to be
+// performed in order. It has the special ability to define extra variable
+// names, on top of the ones that refer to the intrinsic's parameters.
+// For example:
+//
+//    (seq (foo this, that):$a,
+//         (bar this, $a):$b,
+//         (add $a, $b))
+//
+// defines the name $a to refer to the return value of the 'foo' operation;
+// then the 'bar' operation uses $a as one of its arguments, and the return
+// value of that is assigned the name $b; finally, $a and $b are added to give
+// the return value of the seq construction as a whole.
+def seq;
+
+// If you put CustomCodegen<"foo"> in an intrinsic's codegen field, it
+// indicates that the IR generation for that intrinsic is done by handwritten
+// C++ and not autogenerated at all. The effect in the MVE builtin codegen
+// function is to break out of the main switch and fall through to the
+// manual-codegen cases below it, having set the CustomCodeGenType enumerated
+// variable to the value given by the 'type' string here.
+class CustomCodegen<string type_> { string type = type_; }
+
+// -----------------------------------------------------------------------------
+// System for building up complex instances of Type from simple ones.
+
+// ComplexType is used to represent any more complicated type: vectors,
+// multivectors, pointers etc. Its dag argument specifies how the type should
+// be constructed from simpler types. The operator of the dag will always be an
+// instance of ComplexTypeOp, defined below.
+class ComplexType<dag spec_>: Type { dag spec = spec_; }
+
+// Operators you can use in the ComplexType spec dag. These are an intermediate
+// layer, interpreted by MveEmitter::getType() in the Tablegen backend, and
+// only used in the definitions below. Actual intrinsic definitions in
+// arm_mve.td will use the defs defined below here.
+class ComplexTypeOp;
+def CTO_Parameter: ComplexTypeOp;
+def CTO_Vec: ComplexTypeOp;
+def CTO_Pred: ComplexTypeOp;
+class CTO_Tuple<int n_>: ComplexTypeOp { int n = n_; }
+class CTO_Pointer<bit const_>: ComplexTypeOp { bit const = const_; }
+class CTO_Sign<bit signed_>: ComplexTypeOp { bit signed = signed_; }
+
+// -----------------------------------------------------------------------------
+// Instances of Type intended to be used directly in the specification of an
+// intrinsic in arm_mve.td.
+
+// The type Void can be used for the return type of an intrinsic, and as the
+// parameter type for intrinsics that aren't actually parametrised by any kind
+// of _s32 / _f16 / _u8 suffix.
+def Void : Type;
+
+// Primitive types: base class, and an instance for the set of scalar integer
+// and floating types that MVE uses.
+class PrimitiveType<string kind_, int size_>: Type {
+  string kind = kind_;
+  int size = size_;
+}
+// The type records defined by these foreaches have names like s32, f16, u8.
+foreach size = [8, 16, 32, 64] in
+  foreach kind = ["u", "s"] in
+    def kind # size: PrimitiveType<kind, size>;
+foreach size = [16, 32] in
+  foreach kind = ["f"] in
+    def kind # size: PrimitiveType<kind, size>;
+
+// VecOf<t> expects t to be a scalar, and gives a 128-bit vector of whatever it
+// is.
+class VecOf<Type t>: ComplexType<(CTO_Vec t)>;
+
+// PredOf<t> expects t to be a scalar, and expands to a predicate vector which
+// (logically speaking) has the same number of lanes as VecOf<t> would.
+class PredOf<Type t>: ComplexType<(CTO_Pred t)>;
+
+// Scalar expands to whatever is the main parameter type of the current
+// intrinsic. Vector and Predicate expand to the vector and predicate types
+// corresponding to that.
+def Scalar: ComplexType<(CTO_Parameter)>;
+def Vector: VecOf<Scalar>;
+def Predicate: PredOf<Scalar>;
+
+// MultiVector<n> expands to a type containing n instances of Vector.
+// (There's no need to define this for a general underlying vector type, since
+// it's only used by vld2q and friends, which don't need that generality.)
+class MultiVector<int n>: ComplexType<(CTO_Tuple<n> Vector)>;
+
+// Ptr<t> and CPtr<t> expand to a pointer to t, or a pointer to const t,
+// respectively.
+class Ptr<Type t>: ComplexType<(CTO_Pointer<0> t)>;
+class CPtr<Type t>: ComplexType<(CTO_Pointer<1> t)>;
+
+// Unsigned<t> expects t to be a scalar, and expands to the unsigned integer
+// scalar of the same size. So it returns u16 if you give it s16 or f16 (or
+// u16 itself).
+class Unsigned<Type t>: ComplexType<(CTO_Sign<0> t)>;
+
+// -----------------------------------------------------------------------------
+// Internal definitions for specifying immediate arguments for an intrinsic.
+
+class ImmediateBounds;
+class Immediate<Type type_, ImmediateBounds bounds_>: Type {
+  Type type = type_;
+  ImmediateBounds bounds = bounds_;
+  string extra;
+  string extraarg;
+}
+class IB_ConstRange<int lo_, int hi_> : ImmediateBounds {
+  int lo = lo_;
+  int hi = hi_;
+}
+def IB_UEltValue : ImmediateBounds;
+def IB_LaneIndex : ImmediateBounds;
+class IB_EltBit<int base_> : ImmediateBounds { int base = base_; }
+
+// -----------------------------------------------------------------------------
+// End-user definitions for immediate arguments.
+
+// imm_simd_restrictive and imm_simd_vmvn are used for the immediate operands
+// to intrinsics like vmvnq or vorrq. imm_simd_restrictive has to be an 8-bit
+// value shifted left by a whole number of bytes; imm_simd_vmvn can also be of
+// the form 0xXXFF for some byte value XX.
+def imm_simd_restrictive : Immediate<u32, IB_UEltValue> {
+  let extra = "ShiftedByte";
+}
+def imm_simd_vmvn : Immediate<u32, IB_UEltValue> {
+  let extra = "ShiftedByteOrXXFF";
+}
+
+// imm_1toN can take any value from 1 to N inclusive, where N is the number of
+// bits in the main parameter type. (E.g. an immediate shift count, in an
+// intrinsic that shifts every lane of a vector by the same amount.)
+//
+// imm_0toNm1 is the same but with the range offset by 1, i.e. 0 to N-1
+// inclusive.
+def imm_1toN : Immediate<u32, IB_EltBit<1>>;
+def imm_0toNm1 : Immediate<u32, IB_EltBit<0>>;
+
+// imm_lane has to be the index of a vector lane in the main vector type, i.e.
+// it can range from 0 to (128 / size of scalar)-1 inclusive. (e.g. vgetq_lane)
+def imm_lane : Immediate<u32, IB_LaneIndex>;
+
+// imm_1to32 can be in the range 1 to 32, unconditionally. (e.g. scalar shift
+// intrinsics)
+def imm_1to32 : Immediate<u32, IB_ConstRange<1, 32>>;
+
+// imm_1248 can be 1, 2, 4 or 8. (e.g. vidupq)
+def imm_1248 : Immediate<u32, IB_ConstRange<1, 8>> {
+  let extra = "Power2";
+}
+
+// imm_mem7bit<n> is a valid immediate offset for a load/store intrinsic whose
+// memory access size is n bytes (e.g. 1 for vldrb_[whatever], 2 for vldrh,
+// ...). The set of valid immediates for these is {0*n, 1*n, ..., 127*n}.
+class imm_mem7bit<int membytes>
+  : Immediate<u32, IB_ConstRange<0, !mul(membytes, 127)>> {
+  let extra = !if(!eq(membytes, 1), ?, "Multiple");
+  let extraarg = !cast<string>(membytes);
+}
+
+// -----------------------------------------------------------------------------
+// Specification of ways that the full name of an intrinsic can be mapped to
+// its shorter polymorphic name.
+
+class PolymorphicNameType<int nt_, string x_> {
+  int NumTypeSuffixesToDiscard = nt_;
+  string ExtraSuffixToDiscard = x_;
+}
+
+// PNT_None: the intrinsic is not polymorphic at all, so its short name is the
+// same as its long name. (E.g. scalar shift intrinsics such as uqshl.)
+def PNT_None: PolymorphicNameType<0, ?>;
+
+// PNT_Type: the usual case, in which the polymorphic name is made by dropping
+// the type suffix, so it ends up the same as the Tablegen record name. E.g.
+// vaddq_u16 -> vaddq.
+def PNT_Type: PolymorphicNameType<1, ?>;
+
+// PNT_2Type: the polymorphic name is made by dropping _two_ type suffixes.
+// E.g. vcvtq_f16_u16 -> vcvtq.
+def PNT_2Type: PolymorphicNameType<2, ?>;
+
+// PNT_NType: the polymorphic name is made by dropping an "_n" suffix and a
+// type. E.g. vaddq_n_u16 -> vaddq.
+def PNT_NType: PolymorphicNameType<1, "n">;
+
+// PNT_N: the polymorphic name is made by just dropping an "_n" suffix
+// (even if it isn't at the end of the name). E.g. vidupq_n_u16 -> vidupq_u16.
+def PNT_N: PolymorphicNameType<0, "n">;
+
+// PNT_WBType: the polymorphic name is made by dropping an "_wb" suffix and a
+// type. E.g. vidupq_m_wb_u16 -> vidupq_m.
+def PNT_WBType: PolymorphicNameType<1, "wb">;
+
+// PNT_WB: the polymorphic name is made by just dropping "_wb". E.g.
+// vidupq_wb_u16 -> vidupq_u16.
+def PNT_WB: PolymorphicNameType<0, "wb">;
+
+// -----------------------------------------------------------------------------
+// The main class Intrinsic. Define one of these for each family of ACLE
+// intrinsics which are the same apart from some final type suffix (e.g.
+// vaddq_{s8,u8,f16,...}).
+//
+// The record's name plus that type suffix is taken to be the full unambiguous
+// name of the function. Its shorter polymorphic name is constructed from that
+// in turn, in a way specified by the PolymorphicNameType system above.
+
+class Intrinsic<Type ret_, dag args_, dag codegen_> {
+  // List of parameter types to suffix to this intrinsic's name. A separate
+  // actual ACLE intrinsic will be generated for each of these. Set it to
+  // [Void] if the intrinsic is not polymorphic at all.
+  list<Type> params;
+
+  // Return type and arguments for the intrinsic.
+  Type ret = ret_;
+  dag args = args_;
+
+  // Specification of how to generate its IR.
+  dag codegen = codegen_;
+
+  // Default to PNT_Type, which is by far the most common case.
+  PolymorphicNameType pnt = PNT_Type;
+}
+
+// Sometimes you have to use two separate Intrinsic declarations to
+// declare intrinsics that are logically the same family (e.g. vaddq,
+// because it needs to expand to an Add or FAdd IR node depending on
+// type). For that purpose, you can derive from NameOverride to
+// specify the intrinsic's base name independently of the Tablegen
+// record name.
+
+class NameOverride<string basename_> {
+  string basename = basename_;
+}
+
+// -----------------------------------------------------------------------------
+// Convenience lists of parameter types. 'T' is just a container record, so you
+// can define a typical intrinsic with 'let params = T.Usual', or similar,
+// instead of having to repeat a long list every time.
+
+def T {
+  list<Type> Signed = [s8, s16, s32];
+  list<Type> Unsigned = [u8, u16, u32];
+  list<Type> Int = Signed # Unsigned;
+  list<Type> Float = [f16, f32];
+  list<Type> Usual = Int # Float;
+  list<Type> Int8 = [s8, u8];
+  list<Type> Int16 = [s16, u16];
+  list<Type> Int32 = [s32, u32];
+  list<Type> Int64 = [s64, u64];
+  list<Type> All8 = Int8;
+  list<Type> All16 = Int16 # [f16];
+  list<Type> All32 = Int32 # [f32];
+  list<Type> All64 = Int64;
+  list<Type> All = Usual # All64;
+}
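For orientation, here is a sketch of the declarations the -gen-arm-mve-header backend is expected to emit from an Intrinsic record such as vaddq. This is illustrative, not the literal generated text; in particular the attribute spelling __clang_arm_mve_alias is an assumption based on the ArmMveAliasValid hook added to Decl.cpp below.

    /* Hypothetical excerpt of the generated arm_mve.h. Each full name is a
     * direct alias of one clang builtin; the short polymorphic name adds
     * __attribute__((overloadable)) so overload resolution picks the suffix. */
    static __inline__ __attribute__((
        __clang_arm_mve_alias(__builtin_arm_mve_vaddq_u32)))
    uint32x4_t vaddq_u32(uint32x4_t, uint32x4_t);

    static __inline__ __attribute__((overloadable,
        __clang_arm_mve_alias(__builtin_arm_mve_vaddq_u32)))
    uint32x4_t vaddq(uint32x4_t, uint32x4_t);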
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -10991,6 +10991,7 @@
   bool CheckARMBuiltinExclusiveCall(unsigned BuiltinID, CallExpr *TheCall,
                                     unsigned MaxWidth);
   bool CheckNeonBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
+  bool CheckMVEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
   bool CheckARMBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
   bool CheckAArch64BuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall);
@@ -11037,6 +11038,9 @@
                                   int High, bool RangeIsError = true);
   bool SemaBuiltinConstantArgMultiple(CallExpr *TheCall, int ArgNum,
                                       unsigned Multiple);
+  bool SemaBuiltinConstantArgPower2(CallExpr *TheCall, int ArgNum);
+  bool SemaBuiltinConstantArgShiftedByte(CallExpr *TheCall, int ArgNum);
+  bool SemaBuiltinConstantArgShiftedByteOrXXFF(CallExpr *TheCall, int ArgNum);
   bool SemaBuiltinARMSpecialReg(unsigned BuiltinID, CallExpr *TheCall,
                                 int ArgNum, unsigned ExpectedFieldNum,
                                 bool AllowName);
diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp
--- a/clang/lib/AST/Decl.cpp
+++ b/clang/lib/AST/Decl.cpp
@@ -3079,8 +3079,13 @@
 FunctionDecl *FunctionDecl::getCanonicalDecl() { return getFirstDecl(); }
 
 static bool ArmMveAliasValid(unsigned BuiltinID, StringRef AliasName) {
-  // This will be filled in by Tablegen which isn't written yet
-  return false;
+  if (AliasName.startswith("__arm_"))
+    AliasName = AliasName.substr(6);
+  switch (BuiltinID) {
+#include "clang/Basic/arm_mve_builtin_aliases.inc"
+  default:
+    return false;
+  }
 }
 
 /// Returns a value indicating whether this function corresponds to a builtin
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -4221,7 +4221,7 @@
   }
 
   // See if we have a target specific builtin that needs to be lowered.
-  if (Value *V = EmitTargetBuiltinExpr(BuiltinID, E))
+  if (Value *V = EmitTargetBuiltinExpr(BuiltinID, E, ReturnValue))
     return RValue::get(V);
 
   ErrorUnsupported(E, "builtin function");
@@ -4232,13 +4233,14 @@
 static Value *EmitTargetArchBuiltinExpr(CodeGenFunction *CGF,
                                         unsigned BuiltinID, const CallExpr *E,
+                                        ReturnValueSlot ReturnValue,
                                         llvm::Triple::ArchType Arch) {
   switch (Arch) {
   case llvm::Triple::arm:
   case llvm::Triple::armeb:
   case llvm::Triple::thumb:
   case llvm::Triple::thumbeb:
-    return CGF->EmitARMBuiltinExpr(BuiltinID, E, Arch);
+    return CGF->EmitARMBuiltinExpr(BuiltinID, E, ReturnValue, Arch);
   case llvm::Triple::aarch64:
   case llvm::Triple::aarch64_be:
     return CGF->EmitAArch64BuiltinExpr(BuiltinID, E, Arch);
@@ -4268,15 +4269,16 @@
 }
 
 Value *CodeGenFunction::EmitTargetBuiltinExpr(unsigned BuiltinID,
-                                              const CallExpr *E) {
+                                              const CallExpr *E,
+                                              ReturnValueSlot ReturnValue) {
   if (getContext().BuiltinInfo.isAuxBuiltinID(BuiltinID)) {
     assert(getContext().getAuxTargetInfo() && "Missing aux target info");
     return EmitTargetArchBuiltinExpr(
         this, getContext().BuiltinInfo.getAuxBuiltinID(BuiltinID), E,
-        getContext().getAuxTargetInfo()->getTriple().getArch());
+        ReturnValue, getContext().getAuxTargetInfo()->getTriple().getArch());
   }
 
-  return EmitTargetArchBuiltinExpr(this, BuiltinID, E,
+  return EmitTargetArchBuiltinExpr(this, BuiltinID, E, ReturnValue,
                                    getTarget().getTriple().getArch());
 }
 
@@ -6004,6 +6006,7 @@
 Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID,
                                            const CallExpr *E,
+                                           ReturnValueSlot ReturnValue,
                                            llvm::Triple::ArchType Arch) {
   if (auto Hint = GetValueForARMHint(BuiltinID))
     return Hint;
@@ -6320,6 +6323,10 @@
     return EmitSpecialRegisterBuiltin(*this, E, RegisterType, ValueType, IsRead);
   }
 
+  // Deal with MVE builtins
+  if (Value *Result = EmitARMMVEBuiltinExpr(BuiltinID, E, ReturnValue, Arch))
+    return Result;
+
   // Find out if any arguments are required to be integer constant
   // expressions.
   unsigned ICEArguments = 0;
@@ -6769,6 +6776,109 @@
   }
 }
 
+Value *CodeGenFunction::EmitARMMVEBuiltinExpr(unsigned BuiltinID,
+                                              const CallExpr *E,
+                                              ReturnValueSlot ReturnValue,
+                                              llvm::Triple::ArchType Arch) {
+  enum class CustomCodeGen { VLD24, VST24 } CustomCodeGenType;
+  Intrinsic::ID IRIntr;
+  unsigned NumVectors;
+
+  // Code autogenerated by Tablegen will handle all the simple builtins.
+  switch (BuiltinID) {
+  #include "clang/Basic/arm_mve_builtin_cg.inc"
+
+  // If we didn't match an MVE builtin id at all, go back to the
+  // main EmitARMBuiltinExpr.
+  default:
+    return nullptr;
+  }
+
+  // Anything that breaks from that switch is an MVE builtin that
+  // needs handwritten code to generate.
+
+  switch (CustomCodeGenType) {
+
+  case CustomCodeGen::VLD24: {
+    llvm::SmallVector<Value *, 4> Ops;
+    llvm::SmallVector<llvm::Type *, 4> Tys;
+
+    auto MvecCType = E->getType();
+    auto MvecLType = ConvertType(MvecCType);
+    assert(MvecLType->isStructTy() &&
+           "Return type for vld[24]q should be a struct");
+    assert(MvecLType->getStructNumElements() == 1 &&
+           "Return-type struct for vld[24]q should have one element");
+    auto MvecLTypeInner = MvecLType->getStructElementType(0);
+    assert(MvecLTypeInner->isArrayTy() &&
+           "Return-type struct for vld[24]q should contain an array");
+    assert(MvecLTypeInner->getArrayNumElements() == NumVectors &&
+           "Array member of return-type struct vld[24]q has wrong length");
+    auto VecLType = MvecLTypeInner->getArrayElementType();
+
+    Tys.push_back(VecLType);
+
+    auto Addr = E->getArg(0);
+    Ops.push_back(EmitScalarExpr(Addr));
+    Tys.push_back(ConvertType(Addr->getType()));
+
+    Function *F = CGM.getIntrinsic(IRIntr, makeArrayRef(Tys));
+    Value *LoadResult = Builder.CreateCall(F, Ops);
+    Value *MvecOut = UndefValue::get(MvecLType);
+    for (unsigned i = 0; i < NumVectors; ++i) {
+      Value *Vec = Builder.CreateExtractValue(LoadResult, i);
+      MvecOut = Builder.CreateInsertValue(MvecOut, Vec, {0, i});
+    }
+
+    if (ReturnValue.isNull())
+      return MvecOut;
+    else
+      return Builder.CreateStore(MvecOut, ReturnValue.getValue());
+  }
+
+  case CustomCodeGen::VST24: {
+    llvm::SmallVector<Value *, 4> Ops;
+    llvm::SmallVector<llvm::Type *, 4> Tys;
+
+    auto Addr = E->getArg(0);
+    Ops.push_back(EmitScalarExpr(Addr));
+    Tys.push_back(ConvertType(Addr->getType()));
+
+    auto MvecCType = E->getArg(1)->getType();
+    auto MvecLType = ConvertType(MvecCType);
+    assert(MvecLType->isStructTy() && "Data type for vst2q should be a struct");
+    assert(MvecLType->getStructNumElements() == 1 &&
+           "Data-type struct for vst2q should have one element");
+    auto MvecLTypeInner = MvecLType->getStructElementType(0);
+    assert(MvecLTypeInner->isArrayTy() &&
+           "Data-type struct for vst2q should contain an array");
+    assert(MvecLTypeInner->getArrayNumElements() == NumVectors &&
+           "Array member of data-type struct for vst[24]q has wrong length");
+    auto VecLType = MvecLTypeInner->getArrayElementType();
+
+    Tys.push_back(VecLType);
+
+    AggValueSlot MvecSlot = CreateAggTemp(MvecCType);
+    EmitAggExpr(E->getArg(1), MvecSlot);
+    auto Mvec = Builder.CreateLoad(MvecSlot.getAddress());
+    for (unsigned i = 0; i < NumVectors; i++)
+      Ops.push_back(Builder.CreateExtractValue(Mvec, {0, i}));
+
+    Function *F = CGM.getIntrinsic(IRIntr, makeArrayRef(Tys));
+    Value *ToReturn = nullptr;
+    for (unsigned i = 0; i < NumVectors; i++) {
+      Ops.push_back(llvm::ConstantInt::get(Int32Ty, i));
+      ToReturn = Builder.CreateCall(F, Ops);
+      Ops.pop_back();
+    }
+    return ToReturn;
+  }
+
+  default:
+    llvm_unreachable("bad CustomCodegen enum value");
+  }
+}
+
 static Value *EmitAArch64TblBuiltinExpr(CodeGenFunction &CGF, unsigned BuiltinID,
                                         const CallExpr *E,
                                         SmallVectorImpl<Value *> &Ops,
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -3726,14 +3726,19 @@
   /// EmitTargetBuiltinExpr - Emit the given builtin call. Returns 0 if the call
   /// is unhandled by the current target.
-  llvm::Value *EmitTargetBuiltinExpr(unsigned BuiltinID, const CallExpr *E);
+  llvm::Value *EmitTargetBuiltinExpr(unsigned BuiltinID, const CallExpr *E,
+                                     ReturnValueSlot ReturnValue);
 
   llvm::Value *EmitAArch64CompareBuiltinExpr(llvm::Value *Op, llvm::Type *Ty,
                                              const llvm::CmpInst::Predicate Fp,
                                              const llvm::CmpInst::Predicate Ip,
                                              const llvm::Twine &Name = "");
   llvm::Value *EmitARMBuiltinExpr(unsigned BuiltinID, const CallExpr *E,
+                                  ReturnValueSlot ReturnValue,
                                   llvm::Triple::ArchType Arch);
+  llvm::Value *EmitARMMVEBuiltinExpr(unsigned BuiltinID, const CallExpr *E,
+                                     ReturnValueSlot ReturnValue,
+                                     llvm::Triple::ArchType Arch);
 
   llvm::Value *EmitCommonNeonBuiltinExpr(unsigned BuiltinID,
                                          unsigned LLVMIntrinsic,
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -182,6 +182,8 @@
 clang_generate_header(-gen-arm-neon arm_neon.td arm_neon.h)
 # Generate arm_fp16.h
 clang_generate_header(-gen-arm-fp16 arm_fp16.td arm_fp16.h)
+# Generate arm_mve.h
+clang_generate_header(-gen-arm-mve-header arm_mve.td arm_mve.h)
 
 add_custom_target(clang-resource-headers ALL DEPENDS ${out_files})
 set_target_properties(clang-resource-headers PROPERTIES
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -1712,6 +1712,14 @@
   return SemaBuiltinConstantArgRange(TheCall, i, l, u + l);
 }
 
+bool Sema::CheckMVEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
+  switch (BuiltinID) {
+  default:
+    return false;
+  #include "clang/Basic/arm_mve_builtin_sema.inc"
+  }
+}
+
 bool Sema::CheckARMBuiltinExclusiveCall(unsigned BuiltinID, CallExpr *TheCall,
                                         unsigned MaxWidth) {
   assert((BuiltinID == ARM::BI__builtin_arm_ldrex ||
@@ -1852,6 +1860,8 @@
   if (CheckNeonBuiltinFunctionCall(BuiltinID, TheCall))
     return true;
+  if (CheckMVEBuiltinFunctionCall(BuiltinID, TheCall))
+    return true;
 
   // For intrinsics which take an immediate value as part of the instruction,
   // range check them here.
@@ -6143,6 +6153,101 @@
   return false;
 }
 
+/// SemaBuiltinConstantArgPower2 - Check if argument ArgNum of TheCall is a
+/// constant expression representing a power of 2.
+bool Sema::SemaBuiltinConstantArgPower2(CallExpr *TheCall, int ArgNum) {
+  llvm::APSInt Result;
+
+  // We can't check the value of a dependent argument.
+  Expr *Arg = TheCall->getArg(ArgNum);
+  if (Arg->isTypeDependent() || Arg->isValueDependent())
+    return false;
+
+  // Check constant-ness first.
+  if (SemaBuiltinConstantArg(TheCall, ArgNum, Result))
+    return true;
+
+  // Bit-twiddling to test for a power of 2: for x > 0, x & (x-1) is zero if
+  // and only if x is a power of 2.
+  if (Result.isStrictlyPositive() && (Result & (Result - 1)) == 0)
+    return false;
+
+  return Diag(TheCall->getBeginLoc(), diag::err_argument_not_power_of_2)
+         << Arg->getSourceRange();
+}
+
+static bool IsShiftedByte(llvm::APSInt Value) {
+  if (Value.isNegative())
+    return false;
+
+  // Check if it's a shifted byte, by shifting it down
+  while (true) {
+    // If the value fits in the bottom byte, the check passes.
+    if (Value < 0x100)
+      return true;
+
+    // Otherwise, if the value has _any_ bits in the bottom byte, the check
+    // fails.
+    if ((Value & 0xFF) != 0)
+      return false;
+
+    // If the bottom 8 bits are all 0, but something above that is nonzero,
+    // then shifting the value right by 8 bits won't affect whether it's a
+    // shifted byte or not. So do that, and go round again.
+    Value >>= 8;
+  }
+}
+
+/// SemaBuiltinConstantArgShiftedByte - Check if argument ArgNum of TheCall is
+/// a constant expression representing an arbitrary byte value shifted left by
+/// a multiple of 8 bits.
+bool Sema::SemaBuiltinConstantArgShiftedByte(CallExpr *TheCall, int ArgNum) {
+  llvm::APSInt Result;
+
+  // We can't check the value of a dependent argument.
+  Expr *Arg = TheCall->getArg(ArgNum);
+  if (Arg->isTypeDependent() || Arg->isValueDependent())
+    return false;
+
+  // Check constant-ness first.
+  if (SemaBuiltinConstantArg(TheCall, ArgNum, Result))
+    return true;
+
+  if (IsShiftedByte(Result))
+    return false;
+
+  return Diag(TheCall->getBeginLoc(), diag::err_argument_not_shifted_byte)
+         << Arg->getSourceRange();
+}
+
+/// SemaBuiltinConstantArgShiftedByteOrXXFF - Check if argument ArgNum of
+/// TheCall is a constant expression representing either a shifted byte value,
+/// or a value of the form 0x??FF (i.e. a member of the arithmetic progression
+/// 0x00FF, 0x01FF, ..., 0xFFFF). This strange range check is needed for some
+/// Arm MVE intrinsics.
+bool Sema::SemaBuiltinConstantArgShiftedByteOrXXFF(CallExpr *TheCall,
+                                                   int ArgNum) {
+  llvm::APSInt Result;
+
+  // We can't check the value of a dependent argument.
+  Expr *Arg = TheCall->getArg(ArgNum);
+  if (Arg->isTypeDependent() || Arg->isValueDependent())
+    return false;
+
+  // Check constant-ness first.
+  if (SemaBuiltinConstantArg(TheCall, ArgNum, Result))
+    return true;
+
+  // Check to see if it's in either of the required forms.
+  if (IsShiftedByte(Result) ||
+      (Result > 0 && Result < 0x10000 && (Result & 0xFF) == 0xFF))
+    return false;
+
+  return Diag(TheCall->getBeginLoc(),
+              diag::err_argument_not_shifted_byte_or_xxff)
+         << Arg->getSourceRange();
+}
+
 /// SemaBuiltinARMMemoryTaggingCall - Handle calls of memory tagging extensions
 bool Sema::SemaBuiltinARMMemoryTaggingCall(unsigned BuiltinID, CallExpr *TheCall) {
   if (BuiltinID == AArch64::BI__builtin_arm_irg) {
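A compact reference model of the two immediate checks above, with sample values; plain C for illustration, not part of the patch.

    #include <stdbool.h>
    #include <stdint.h>

    /* Mirrors Sema's IsShiftedByte: true iff v == byte << (8*k). */
    static bool is_shifted_byte(uint64_t v)
    {
        for (; v >= 0x100; v >>= 8)
            if (v & 0xFF)   /* nonzero low byte underneath nonzero high bits */
                return false;
        return true;
    }

    /* The OrXXFF variant additionally accepts 0x00FF, 0x01FF, ..., 0xFFFF. */
    static bool is_shifted_byte_or_xxff(uint64_t v)
    {
        return is_shifted_byte(v) ||
               (v > 0 && v < 0x10000 && (v & 0xFF) == 0xFF);
    }

    /* Examples: 0xAB, 0xAB00 and 0xAB0000 pass both checks; 0x12FF passes
     * only the second; 0x1234 fails both and draws the new diagnostics. */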
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -7233,8 +7233,10 @@
 /// match one of the standard Neon vector types.
 static void HandleNeonVectorTypeAttr(QualType &CurType, const ParsedAttr &Attr,
                                      Sema &S, VectorType::VectorKind VecKind) {
-  // Target must have NEON
-  if (!S.Context.getTargetInfo().hasFeature("neon")) {
+  // Target must have NEON (or MVE, whose vectors are similar enough
+  // not to need a separate attribute)
+  if (!S.Context.getTargetInfo().hasFeature("neon") &&
+      !S.Context.getTargetInfo().hasFeature("mve")) {
     S.Diag(Attr.getLoc(), diag::err_attribute_unsupported) << Attr;
     Attr.setInvalid();
     return;
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/scalar-shifts.c b/clang/test/CodeGen/arm-mve-intrinsics/scalar-shifts.c
new file
--- /dev/null
+++ b/clang/test/CodeGen/arm-mve-intrinsics/scalar-shifts.c
@@ -0,0 +1,24 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang --target=arm-arm-none-eabi -march=armv8.1m.main+mve.fp -mfloat-abi=hard -O3 -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang --target=arm-arm-none-eabi -march=armv8.1m.main+mve.fp -mfloat-abi=hard -DPOLYMORPHIC -O3 -S -emit-llvm -o - %s | FileCheck %s
+
+#include <arm_mve.h>
+
+// CHECK-LABEL: @test_urshrl(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[VALUE:%.*]], 32
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[VALUE]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { i32, i32 } @llvm.arm.mve.urshrl(i32 [[TMP2]], i32 [[TMP1]], i32 6)
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1
+// CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+// CHECK-NEXT:    [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 32
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0
+// CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+// CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret i64 [[TMP9]]
+//
+uint64_t test_urshrl(uint64_t value)
+{
+    return urshrl(value, 6);
+}
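The checks above pin down only the IR plumbing: split the 64-bit input into two 32-bit halves, call llvm.arm.mve.urshrl, and reassemble the {i32, i32} result. For the value computed, here is a reference model, on the assumption that urshrl is ACLE's unsigned rounding right shift of a 64-bit operand; the shift count is restricted to 1..32 by imm_1to32, which this sketch relies on.

    #include <stdint.h>

    static uint64_t urshrl_ref(uint64_t value, unsigned shift) /* 1..32 */
    {
        uint64_t round = 1ULL << (shift - 1); /* half the final bit weight */
        uint64_t sum = value + round;
        uint64_t carry = sum < value;         /* overflow into bit 64 */
        return (sum >> shift) | (carry << (64 - shift));
    }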
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vadc.c b/clang/test/CodeGen/arm-mve-intrinsics/vadc.c
new file
--- /dev/null
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vadc.c
@@ -0,0 +1,89 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang --target=arm-arm-none-eabi -march=armv8.1m.main+mve.fp -mfloat-abi=hard -O3 -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang --target=arm-arm-none-eabi -march=armv8.1m.main+mve.fp -mfloat-abi=hard -DPOLYMORPHIC -O3 -S -emit-llvm -o - %s | FileCheck %s
+
+#include <arm_mve.h>
+
+// CHECK-LABEL: @test_vadciq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0)
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP1]], 29
+// CHECK-NEXT:    [[TMP3:%.*]] = and i32 [[TMP2]], 1
+// CHECK-NEXT:    store i32 [[TMP3]], i32* [[CARRY_OUT:%.*]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP0]], 0
+// CHECK-NEXT:    ret <4 x i32> [[TMP4]]
+//
+int32x4_t test_vadciq_s32(int32x4_t a, int32x4_t b, unsigned *carry_out)
+{
+#ifdef POLYMORPHIC
+    return vadciq(a, b, carry_out);
+#else /* POLYMORPHIC */
+    return vadciq_s32(a, b, carry_out);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vadcq_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[CARRY:%.*]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 29
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <4 x i32>, i32 } @llvm.arm.mve.vadc.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP2]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 29
+// CHECK-NEXT:    [[TMP5:%.*]] = and i32 [[TMP4]], 1
+// CHECK-NEXT:    store i32 [[TMP5]], i32* [[CARRY]], align 4
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP2]], 0
+// CHECK-NEXT:    ret <4 x i32> [[TMP6]]
+//
+uint32x4_t test_vadcq_u32(uint32x4_t a, uint32x4_t b, unsigned *carry)
+{
+#ifdef POLYMORPHIC
+    return vadcq(a, b, carry);
+#else /* POLYMORPHIC */
+    return vadcq_u32(a, b, carry);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vadciq_m_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call { <4 x i32>, i32 } @llvm.arm.mve.vadc.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP2]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 29
+// CHECK-NEXT:    [[TMP5:%.*]] = and i32 [[TMP4]], 1
+// CHECK-NEXT:    store i32 [[TMP5]], i32* [[CARRY_OUT:%.*]], align 4
+// CHECK-NEXT:    [[TMP6:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP2]], 0
+// CHECK-NEXT:    ret <4 x i32> [[TMP6]]
+//
+uint32x4_t test_vadciq_m_u32(uint32x4_t inactive, uint32x4_t a, uint32x4_t b, unsigned *carry_out, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vadciq_m(inactive, a, b, carry_out, p);
+#else /* POLYMORPHIC */
+    return vadciq_m_u32(inactive, a, b, carry_out, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vadcq_m_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[CARRY:%.*]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 29
+// CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call { <4 x i32>, i32 } @llvm.arm.mve.vadc.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 [[TMP1]], <4 x i1> [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP4]], 1
+// CHECK-NEXT:    [[TMP6:%.*]] = lshr i32 [[TMP5]], 29
+// CHECK-NEXT:    [[TMP7:%.*]] = and i32 [[TMP6]], 1
+// CHECK-NEXT:    store i32 [[TMP7]], i32* [[CARRY]], align 4
+// CHECK-NEXT:    [[TMP8:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP4]], 0
+// CHECK-NEXT:    ret <4 x i32> [[TMP8]]
+//
+int32x4_t test_vadcq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, unsigned *carry, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vadcq_m(inactive, a, b, carry, p);
+#else /* POLYMORPHIC */
+    return vadcq_m_s32(inactive, a, b, carry, p);
+#endif /* POLYMORPHIC */
+}
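The vaddq.c tests below exercise the predicated _m forms. Their lane rule, per the add_predicated/sub_predicated calls in arm_mve.td, is that predicated-true lanes take the operation's result and the rest are copied from the 'inactive' argument. A reference model in plain C, assuming the usual MVE convention of one mve_pred16_t bit per byte lane (so one bit per lane for 8-bit elements); illustrative only:

    #include <stdint.h>

    static void vaddq_m_s8_ref(const int8_t inactive[16], const int8_t a[16],
                               const int8_t b[16], uint16_t p, int8_t out[16])
    {
        for (int lane = 0; lane < 16; lane++)
            out[lane] = ((p >> lane) & 1) ? (int8_t)(a[lane] + b[lane])
                                          : inactive[lane];
    }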
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vaddq.c b/clang/test/CodeGen/arm-mve-intrinsics/vaddq.c
new file
--- /dev/null
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vaddq.c
@@ -0,0 +1,65 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang --target=arm-arm-none-eabi -march=armv8.1m.main+mve.fp -mfloat-abi=hard -O3 -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang --target=arm-arm-none-eabi -march=armv8.1m.main+mve.fp -mfloat-abi=hard -DPOLYMORPHIC -O3 -S -emit-llvm -o - %s | FileCheck %s
+
+#include <arm_mve.h>
+
+// CHECK-LABEL: @test_vaddq_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = add <4 x i32> [[B:%.*]], [[A:%.*]]
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+uint32x4_t test_vaddq_u32(uint32x4_t a, uint32x4_t b)
+{
+#ifdef POLYMORPHIC
+    return vaddq(a, b);
+#else /* POLYMORPHIC */
+    return vaddq_u32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vsubq_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = fsub <8 x half> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT:    ret <8 x half> [[TMP0]]
+//
+float16x8_t test_vsubq_f16(float16x8_t a, float16x8_t b)
+{
+#ifdef POLYMORPHIC
+    return vsubq(a, b);
+#else /* POLYMORPHIC */
+    return vsubq_f16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vaddq_m_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <16 x i8> @llvm.arm.mve.add.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+//
+int8x16_t test_vaddq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vaddq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+    return vaddq_m_s8(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vsubq_m_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]])
+// CHECK-NEXT:    ret <4 x float> [[TMP2]]
+//
+float32x4_t test_vsubq_m_f32(float32x4_t inactive, float32x4_t a, float32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vsubq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+    return vsubq_m_f32(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vcvt.c b/clang/test/CodeGen/arm-mve-intrinsics/vcvt.c
new file
--- /dev/null
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vcvt.c
@@ -0,0 +1,27 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang --target=arm-arm-none-eabi -march=armv8.1m.main+mve.fp -mfloat-abi=hard -O3 -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang --target=arm-arm-none-eabi -march=armv8.1m.main+mve.fp -mfloat-abi=hard -DPOLYMORPHIC -O3 -S -emit-llvm -o - %s | FileCheck %s
+
+#include <arm_mve.h>
+
+// CHECK-LABEL: @test_vcvttq_f16_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <8 x half> @llvm.arm.mve.fltnarrow(<8 x half> [[A:%.*]], <4 x float> [[B:%.*]], i32 1)
+// CHECK-NEXT:    ret <8 x half> [[TMP0]]
+//
+float16x8_t test_vcvttq_f16_f32(float16x8_t a, float32x4_t b)
+{
+    return vcvttq_f16_f32(a, b);
+}
+
+// CHECK-LABEL: @test_vcvttq_m_f16_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x half> @llvm.arm.mve.fltnarrow.predicated(<8 x half> [[A:%.*]], <4 x float> [[B:%.*]], i32 1, <4 x i1> [[TMP1]])
+// CHECK-NEXT:    ret <8 x half> [[TMP2]]
+//
+float16x8_t test_vcvttq_m_f16_f32(float16x8_t a, float32x4_t b, mve_pred16_t p)
+{
+    return vcvttq_m_f16_f32(a, b, p);
+}
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vld24.c b/clang/test/CodeGen/arm-mve-intrinsics/vld24.c
new file
--- /dev/null
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vld24.c
@@ -0,0 +1,83 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang --target=arm-arm-none-eabi -march=armv8.1m.main+mve.fp -mfloat-abi=hard -O3 -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang --target=arm-arm-none-eabi -march=armv8.1m.main+mve.fp -mfloat-abi=hard -DPOLYMORPHIC -O3 -S -emit-llvm -o - %s | FileCheck %s
+
+#include <arm_mve.h>
+
+// CHECK-LABEL: @test_vld2q_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <8 x half>, <8 x half> } @llvm.arm.mve.vld2q.v8f16.p0f16(half* [[ADDR:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <8 x half>, <8 x half> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = insertvalue [[STRUCT_FLOAT16X8X2_T:%.*]] undef, <8 x half> [[TMP1]], 0, 0
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <8 x half>, <8 x half> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = insertvalue [[STRUCT_FLOAT16X8X2_T]] [[TMP2]], <8 x half> [[TMP3]], 0, 1
+// CHECK-NEXT:    ret [[STRUCT_FLOAT16X8X2_T]] [[TMP4]]
+//
+float16x8x2_t test_vld2q_f16(const float16_t *addr)
+{
+#ifdef POLYMORPHIC
+    return vld2q(addr);
+#else /* POLYMORPHIC */
+    return vld2q_f16(addr);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vld4q_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.mve.vld4q.v16i8.p0i8(i8* [[ADDR:%.*]])
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP0]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = insertvalue [[STRUCT_UINT8X16X4_T:%.*]] undef, <16 x i8> [[TMP1]], 0, 0
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP0]], 1
+// CHECK-NEXT:    [[TMP4:%.*]] = insertvalue [[STRUCT_UINT8X16X4_T]] [[TMP2]], <16 x i8> [[TMP3]], 0, 1
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP0]], 2
+// CHECK-NEXT:    [[TMP6:%.*]] = insertvalue [[STRUCT_UINT8X16X4_T]] [[TMP4]], <16 x i8> [[TMP5]], 0, 2
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP0]], 3
+// CHECK-NEXT:    [[TMP8:%.*]] = insertvalue [[STRUCT_UINT8X16X4_T]] [[TMP6]], <16 x i8> [[TMP7]], 0, 3
+// CHECK-NEXT:    ret [[STRUCT_UINT8X16X4_T]] [[TMP8]]
+//
+uint8x16x4_t test_vld4q_u8(const uint8_t *addr)
+{
+#ifdef POLYMORPHIC
+    return vld4q(addr);
+#else /* POLYMORPHIC */
+    return vld4q_u8(addr);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vst2q_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VALUE_COERCE_FCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_UINT32X4X2_T:%.*]] %value.coerce, 0, 0
+// CHECK-NEXT:    [[VALUE_COERCE_FCA_0_1_EXTRACT:%.*]] = extractvalue [[STRUCT_UINT32X4X2_T]] %value.coerce, 0, 1
+// CHECK-NEXT:    tail call void @llvm.arm.mve.vst2q.p0i32.v4i32(i32* [[ADDR:%.*]], <4 x i32> [[VALUE_COERCE_FCA_0_0_EXTRACT]], <4 x i32> [[VALUE_COERCE_FCA_0_1_EXTRACT]], i32 0)
+// CHECK-NEXT:    tail call void @llvm.arm.mve.vst2q.p0i32.v4i32(i32* [[ADDR]], <4 x i32> [[VALUE_COERCE_FCA_0_0_EXTRACT]], <4 x i32> [[VALUE_COERCE_FCA_0_1_EXTRACT]], i32 1)
+// CHECK-NEXT:    ret void
+//
+void test_vst2q_u32(uint32_t *addr, uint32x4x2_t value)
+{
+#ifdef POLYMORPHIC
+    vst2q(addr, value);
+#else /* POLYMORPHIC */
+    vst2q_u32(addr, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vst4q_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VALUE_COERCE_FCA_0_0_EXTRACT:%.*]] = extractvalue [[STRUCT_INT8X16X4_T:%.*]] %value.coerce, 0, 0
+// CHECK-NEXT:    [[VALUE_COERCE_FCA_0_1_EXTRACT:%.*]] = extractvalue [[STRUCT_INT8X16X4_T]] %value.coerce, 0, 1
+// CHECK-NEXT:    [[VALUE_COERCE_FCA_0_2_EXTRACT:%.*]] = extractvalue [[STRUCT_INT8X16X4_T]] %value.coerce, 0, 2
+// CHECK-NEXT:    [[VALUE_COERCE_FCA_0_3_EXTRACT:%.*]] = extractvalue [[STRUCT_INT8X16X4_T]] %value.coerce, 0, 3
+// CHECK-NEXT:    tail call void @llvm.arm.mve.vst4q.p0i8.v16i8(i8* [[ADDR:%.*]], <16 x i8> [[VALUE_COERCE_FCA_0_0_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_1_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_2_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_3_EXTRACT]], i32 0)
+// CHECK-NEXT:    tail call void @llvm.arm.mve.vst4q.p0i8.v16i8(i8* [[ADDR]], <16 x i8> [[VALUE_COERCE_FCA_0_0_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_1_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_2_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_3_EXTRACT]], i32 1)
+// CHECK-NEXT:    tail call void @llvm.arm.mve.vst4q.p0i8.v16i8(i8* [[ADDR]], <16 x i8> [[VALUE_COERCE_FCA_0_0_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_1_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_2_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_3_EXTRACT]], i32 2)
+// CHECK-NEXT:    tail call void @llvm.arm.mve.vst4q.p0i8.v16i8(i8* [[ADDR]], <16 x i8> [[VALUE_COERCE_FCA_0_0_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_1_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_2_EXTRACT]], <16 x i8> [[VALUE_COERCE_FCA_0_3_EXTRACT]], i32 3)
+// CHECK-NEXT:    ret void
+//
+void test_vst4q_s8(int8_t *addr, int8x16x4_t value)
+{
+#ifdef POLYMORPHIC
+    vst4q(addr, value);
+#else /* POLYMORPHIC */
+    vst4q_s8(addr, value);
+#endif /* POLYMORPHIC */
+}
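As the repeated IR intrinsic calls with a trailing stage index suggest, vld2q/vst2q access memory in interleaved order. A reference model of the u32 case in plain C (the val[2][4] layout mirrors uint32x4x2_t's single array member); illustrative only:

    #include <stdint.h>

    static void vst2q_u32_ref(uint32_t *addr, const uint32_t val[2][4])
    {
        for (int i = 0; i < 4; i++) {
            addr[2 * i]     = val[0][i];  /* even elements from vector 0 */
            addr[2 * i + 1] = val[1][i];  /* odd elements from vector 1 */
        }
    }

    static void vld2q_u32_ref(const uint32_t *addr, uint32_t val[2][4])
    {
        for (int i = 0; i < 4; i++) {
            val[0][i] = addr[2 * i];
            val[1][i] = addr[2 * i + 1];
        }
    }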
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vldr.c b/clang/test/CodeGen/arm-mve-intrinsics/vldr.c
new file
--- /dev/null
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vldr.c
@@ -0,0 +1,49 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang --target=arm-arm-none-eabi -march=armv8.1m.main+mve.fp -mfloat-abi=hard -O3 -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang --target=arm-arm-none-eabi -march=armv8.1m.main+mve.fp -mfloat-abi=hard -DPOLYMORPHIC -O3 -S -emit-llvm -o - %s | FileCheck %s
+
+#include <arm_mve.h>
+
+// CHECK-LABEL: @test_vldrwq_gather_base_wb_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[ADDR:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.v4i32.v4i32(<4 x i32> [[TMP0]], i32 80)
+// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[TMP1]], 1
+// CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[ADDR]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[TMP1]], 0
+// CHECK-NEXT:    ret <4 x i32> [[TMP3]]
+//
+int32x4_t test_vldrwq_gather_base_wb_s32(uint32x4_t *addr)
+{
+    return vldrwq_gather_base_wb_s32(addr, 0x50);
+}
+
+// CHECK-LABEL: @test_vldrwq_gather_base_wb_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[ADDR:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.v4f32.v4i32(<4 x i32> [[TMP0]], i32 64)
+// CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP1]], 1
+// CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[ADDR]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP1]], 0
+// CHECK-NEXT:    ret <4 x float> [[TMP3]]
+//
+float32x4_t test_vldrwq_gather_base_wb_f32(uint32x4_t *addr)
+{
+    return vldrwq_gather_base_wb_f32(addr, 0x40);
+}
+
+// CHECK-LABEL: @test_vldrdq_gather_base_wb_z_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i64>, <2 x i64>* [[ADDR:%.*]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64> [[TMP0]], i32 656, <4 x i1> [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP3]], 1
+// CHECK-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[ADDR]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP3]], 0
+// CHECK-NEXT:    ret <2 x i64> [[TMP5]]
+//
+uint64x2_t test_vldrdq_gather_base_wb_z_u64(uint64x2_t *addr, mve_pred16_t p)
+{
+    return vldrdq_gather_base_wb_z_u64(addr, 0x290, p);
+}
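A reference model of the gather-load-with-writeback intrinsics tested above, following my reading of the (load $addr) ... (store (xval $pair, 1), $addr) sequence in arm_mve.td: each lane loads from its base address plus the immediate offset, and the incremented bases are stored back. Sketch only; the pointer arithmetic is schematic, and offsets must be multiples of 4 in 0..508 (imm_mem7bit<4>).

    #include <stdint.h>

    static void vldrwq_gather_base_wb_s32_ref(uint32_t base[4], int offset,
                                              int32_t result[4])
    {
        for (int i = 0; i < 4; i++) {
            result[i] = *(const int32_t *)(uintptr_t)(base[i] + offset);
            base[i] += offset;  /* write-back of the updated base vector */
        }
    }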
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vminvq.c b/clang/test/CodeGen/arm-mve-intrinsics/vminvq.c
new file
--- /dev/null
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vminvq.c
@@ -0,0 +1,19 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang --target=arm-arm-none-eabi -march=armv8.1m.main+mve.fp -mfloat-abi=hard -O3 -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang --target=arm-arm-none-eabi -march=armv8.1m.main+mve.fp -mfloat-abi=hard -DPOLYMORPHIC -O3 -S -emit-llvm -o - %s | FileCheck %s
+
+#include <arm_mve.h>
+
+// CHECK-LABEL: @test_vminvq_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call i32 @llvm.arm.mve.minv.u.v4i32(i32 [[A:%.*]], <4 x i32> [[B:%.*]])
+// CHECK-NEXT:    ret i32 [[TMP0]]
+//
+uint32_t test_vminvq_u32(uint32_t a, uint32x4_t b)
+{
+#ifdef POLYMORPHIC
+    return vminvq(a, b);
+#else /* POLYMORPHIC */
+    return vminvq_u32(a, b);
+#endif /* POLYMORPHIC */
+}
diff --git a/clang/utils/TableGen/CMakeLists.txt b/clang/utils/TableGen/CMakeLists.txt
--- a/clang/utils/TableGen/CMakeLists.txt
+++ b/clang/utils/TableGen/CMakeLists.txt
@@ -12,6 +12,7 @@
   ClangOptionDocEmitter.cpp
   ClangSACheckersEmitter.cpp
+  MveEmitter.cpp
   NeonEmitter.cpp
   TableGen.cpp
   )
diff --git a/clang/utils/TableGen/MveEmitter.cpp b/clang/utils/TableGen/MveEmitter.cpp
new file
--- /dev/null
+++ b/clang/utils/TableGen/MveEmitter.cpp
@@ -0,0 +1,1737 @@
+//===- MveEmitter.cpp - Generate arm_mve.h for use with clang -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This set of linked tablegen backends is responsible for emitting the bits
+// and pieces that implement <arm_mve.h>, which is defined by the ACLE standard
+// and provides a set of types and functions for (more or less) direct access
+// to the MVE instruction set, including the scalar shifts as well as the
+// vector instructions.
+//
+// MVE's standard intrinsic functions are unusual in that they have a system of
+// polymorphism. For example, the function vaddq() can behave like vaddq_u16(),
+// vaddq_f32(), vaddq_s8(), etc., depending on the types of the vector
+// arguments you give it.
+//
+// This constrains the implementation strategies. The usual approach to making
+// the user-facing functions polymorphic would be to either use
+// __attribute__((overloadable)) to make a set of vaddq() functions that are
+// all inline wrappers on the underlying clang builtins, or to define a single
+// vaddq() macro which expands to an instance of _Generic.
+//
+// The inline-wrappers approach would work fine for most intrinsics, except for
+// the ones that take an argument required to be a compile-time constant,
+// because if you wrap an inline function around a call to a builtin, the
+// constant nature of the argument is not passed through.
+//
+// The _Generic approach can be made to work with enough effort, but it takes a
+// lot of machinery, because of the design feature of _Generic that even the
+// untaken branches are required to pass all front-end validity checks such as
+// type-correctness. You can work around that by nesting further _Generics all
+// over the place to coerce things to the right type in untaken branches, but
+// what you get out is complicated, its correctness is hard to guarantee, and
+// worst of all, it gives _completely unreadable_ error messages if the user
+// gets the types wrong for an intrinsic call.
+//
+// Therefore, my strategy is to introduce a new __attribute__ that allows a
+// function to be mapped to a clang builtin even though it doesn't have the
+// same name, and then declare all the user-facing MVE function names with that
+// attribute, mapping each one directly to the clang builtin. And the
+// polymorphic ones have __attribute__((overloadable)) as well. So once the
+// compiler has resolved the overload, it knows the internal builtin ID of the
+// selected function, and can check the immediate arguments against that; and
+// if the user gets the types wrong in a call to a polymorphic intrinsic, they
+// get a completely clear error message showing all the declarations of that
+// function in the header file and explaining why each one doesn't fit their
+// call.
+//
+// The downside of this is that if every clang builtin has to correspond
+// exactly to a user-facing ACLE intrinsic, then you can't save work in the
+// frontend by doing it in the header file: CGBuiltin.cpp has to do the entire
+// job of converting an ACLE intrinsic call into LLVM IR. So the Tablegen
+// description for an MVE intrinsic has to contain a full description of the
+// sequence of IRBuilder calls that clang will need to make.
+//
+//===----------------------------------------------------------------------===//
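For comparison with the _Generic strategy that the comment above rejects, here is the shape such a header would take for a vaddq restricted to two types. Builtin names are hypothetical, and a real header would need further nested _Generics so that even untaken branches type-check; the sketch exists only to make the trade-off concrete.

    /* Rejected alternative, sketched for comparison. Vector types are
     * assumed to come from arm_mve.h. */
    #define vaddq(a, b)                                             \
        _Generic((a),                                               \
                 uint32x4_t: __builtin_arm_mve_vaddq_u32,           \
                 float32x4_t: __builtin_arm_mve_vaddq_f32)((a), (b))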
+
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/TableGen/Error.h"
+#include "llvm/TableGen/Record.h"
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <list>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+using namespace llvm;
+
+namespace {
+
+#if 0
+} // stop emacs from wanting to auto-indent everything to 2 spaces inside here
+#endif
+
+class MveEmitter;
+class Value;
+
+// -----------------------------------------------------------------------------
+// A system of classes to represent all the types we'll need to deal with in
+// the prototypes of intrinsics.
+//
+// Query methods include finding out the C name of a type; the "LLVM name" in
+// the sense of a C++ code snippet that can be used in the codegen function;
+// the suffix that represents the type in the ACLE intrinsic naming scheme
+// (e.g. 's32' represents int32_t in intrinsics such as vaddq_s32); whether the
+// type is floating-point related (hence should be under #ifdef in the MVE
+// header so that it isn't included in integer-only MVE mode); and the type's
+// size in bits. Not all subtypes support all these queries.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/TableGen/Error.h"
+#include "llvm/TableGen/Record.h"
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <list>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+using namespace llvm;
+
+namespace {
+
+#if 0
+} // stop emacs from wanting to auto-indent everything to 2 spaces inside here
+#endif
+
+class MveEmitter;
+class Value;
+
+// -----------------------------------------------------------------------------
+// A system of classes to represent all the types we'll need to deal with in
+// the prototypes of intrinsics.
+//
+// Query methods include finding out the C name of a type; the "LLVM name" in
+// the sense of a C++ code snippet that can be used in the codegen function;
+// the suffix that represents the type in the ACLE intrinsic naming scheme
+// (e.g. 's32' represents int32_t in intrinsics such as vaddq_s32); whether the
+// type is floating-point related (hence should be under #ifdef in the MVE
+// header so that it isn't included in integer-only MVE mode); and the type's
+// size in bits. Not all subtypes support all these queries.
+
+class Type {
+public:
+  enum class TypeKind {
+    // Void appears as a return type (for store intrinsics, which are pure
+    // side-effect). It's also used as the parameter type in the Tablegen
+    // when an intrinsic doesn't need to come in various suffixed forms like
+    // vfooq_s8,vfooq_u16,vfooq_f32.
+    Void,
+
+    // Scalar is used for ordinary int and float types of all sizes.
+    Scalar,
+
+    // Vector is used for anything that occupies exactly one MVE vector
+    // register, i.e. {uint,int,float}NxM_t.
+    Vector,
+
+    // MultiVector is used for the {uint,int,float}NxMxK_t types used by the
+    // interleaving load/store intrinsics v{ld,st}{2,4}q.
+    MultiVector,
+
+    // Predicate is used by all the predicated intrinsics. Its C
+    // representation is mve_pred16_t (which is just an alias for uint16_t).
+    // But we give more detail here, by indicating that a given predicate
+    // instruction is logically regarded as a vector of i1 containing the
+    // same number of lanes as the input vector type. So our Predicate type
+    // comes with a lane count, which we use to decide which kind of <n x i1>
+    // we'll invoke the pred_i2v IR intrinsic to translate it into.
+    Predicate,
+
+    // Pointer is used for pointer types (obviously), and comes with a flag
+    // indicating whether it's a pointer to a const or mutable instance of
+    // the pointee type.
+    Pointer,
+  };
+
+private:
+  const TypeKind TKind;
+
+protected:
+  Type(TypeKind K) : TKind(K) {}
+
+public:
+  TypeKind typeKind() const { return TKind; }
+  virtual ~Type() = default;
+  virtual bool requires_float() const = 0;
+  virtual unsigned sizeInBits() const = 0;
+  virtual std::string c_name() const = 0;
+  virtual std::string llvm_name() const {
+    PrintFatalError("no LLVM type name available for type " + c_name());
+  }
+  virtual std::string acle_suffix() const {
+    PrintFatalError("no ACLE suffix available for this type");
+  }
+};
+
+enum class ScalarTypeKind { SignedInt, UnsignedInt, Float };
+inline std::string to_letter(ScalarTypeKind kind) {
+  switch (kind) {
+  case ScalarTypeKind::SignedInt:
+    return "s";
+  case ScalarTypeKind::UnsignedInt:
+    return "u";
+  case ScalarTypeKind::Float:
+    return "f";
+  default:
+    llvm_unreachable("bad scalar type kind");
+  }
+}
+inline std::string to_c_prefix(ScalarTypeKind kind) {
+  switch (kind) {
+  case ScalarTypeKind::SignedInt:
+    return "int";
+  case ScalarTypeKind::UnsignedInt:
+    return "uint";
+  case ScalarTypeKind::Float:
+    return "float";
+  default:
+    llvm_unreachable("bad scalar type kind");
+  }
+}
+
+class VoidType : public Type {
+public:
+  VoidType() : Type(TypeKind::Void) {}
+  virtual unsigned sizeInBits() const override { return 0; }
+  virtual bool requires_float() const override { return false; }
+  virtual std::string c_name() const override { return "void"; }
+
+  static bool classof(const Type *T) { return T->typeKind() == TypeKind::Void; }
+  virtual std::string acle_suffix() const override { return ""; }
+};
+
+class PointerType : public Type {
+  const Type *Pointee;
+  bool Const;
+
+public:
+  PointerType(const Type *Pointee, bool Const)
+      : Type(TypeKind::Pointer), Pointee(Pointee), Const(Const) {}
+  virtual unsigned sizeInBits() const override { return 32; }
+  virtual bool requires_float() const override {
+    return Pointee->requires_float();
+  }
+  virtual std::string c_name() const override {
+    std::string Name = Pointee->c_name();
+    // Syntax for a pointer in C is different when the pointee is itself a
+    // pointer, especially if the pointee is also const.
+    if (isa<PointerType>(Pointee)) {
+      if (Const)
+        Name += "const ";
+      return Name + "*";
+    } else {
+      if (Const)
+        Name = "const " + Name;
+      return Name + " *";
+    }
+  }
+
+  static bool classof(const Type *T) {
+    return T->typeKind() == TypeKind::Pointer;
+  }
+};
+
+// Base class for all the types that have a name of the form
+// [prefix][numbers]_t, like int32_t, uint16x8_t, float32x4x2_t.
+//
+// For this sub-hierarchy we invent a c_name_base() method which returns the
+// whole name except for the trailing "_t", so that Vector and MultiVector can
+// append an extra "x2" or whatever to their element type's c_name_base(). Then
+// the main c_name() query method puts "_t" on the end for the final type name.
+
+class CRegularNamedType : public Type {
+  using Type::Type;
+  virtual std::string c_name_base() const = 0;
+
+public:
+  virtual std::string c_name() const override { return c_name_base() + "_t"; }
+};
+
+class ScalarType : public CRegularNamedType {
+  ScalarTypeKind Kind;
+  unsigned Bits;
+
+public:
+  ScalarType(const Record *Record) : CRegularNamedType(TypeKind::Scalar) {
+    Kind = StringSwitch<ScalarTypeKind>(Record->getValueAsString("kind"))
+               .Case("s", ScalarTypeKind::SignedInt)
+               .Case("u", ScalarTypeKind::UnsignedInt)
+               .Case("f", ScalarTypeKind::Float);
+    Bits = Record->getValueAsInt("size");
+  }
+  virtual unsigned sizeInBits() const override { return Bits; }
+  ScalarTypeKind kind() const { return Kind; }
+  std::string suffix() const { return to_letter(Kind) + utostr(Bits); }
+  virtual std::string c_name_base() const override {
+    return to_c_prefix(Kind) + utostr(Bits);
+  }
+  virtual std::string llvm_name() const override {
+    if (Kind == ScalarTypeKind::Float) {
+      if (Bits == 16)
+        return "HalfTy";
+      if (Bits == 32)
+        return "FloatTy";
+      if (Bits == 64)
+        return "DoubleTy";
+      PrintFatalError("bad size for floating type");
+    }
+    return "Int" + utostr(Bits) + "Ty";
+  }
+  virtual std::string acle_suffix() const override {
+    return "_" + to_letter(Kind) + utostr(Bits);
+  }
+  bool is_integer() const { return Kind != ScalarTypeKind::Float; }
+  virtual bool requires_float() const override { return !is_integer(); }
+
+  static bool classof(const Type *T) {
+    return T->typeKind() == TypeKind::Scalar;
+  }
+};
+
+class VectorType : public CRegularNamedType {
+  const ScalarType *Element;
+  unsigned Lanes;
+
+public:
+  VectorType(const ScalarType *Element)
+      : CRegularNamedType(TypeKind::Vector), Element(Element) {
+    // MVE has a fixed 128-bit vector size
+    Lanes = 128 / Element->sizeInBits();
+  }
+  virtual unsigned sizeInBits() const override { return 128; }
+  unsigned lanes() const { return Lanes; }
+  virtual bool requires_float() const override {
+    return Element->requires_float();
+  }
+  virtual std::string c_name_base() const override {
+    return Element->c_name_base() + "x" + utostr(Lanes);
+  }
+  virtual std::string llvm_name() const override {
+    return "llvm::VectorType::get(" + Element->llvm_name() + ", " +
+           utostr(Lanes) + ")";
+  }
+
+  static bool classof(const Type *T) {
+    return T->typeKind() == TypeKind::Vector;
+  }
+};
+
+class MultiVectorType : public CRegularNamedType {
+  const VectorType *Element;
+  unsigned Registers;
+
+public:
+  MultiVectorType(unsigned Registers, const VectorType *Element)
+      : CRegularNamedType(TypeKind::MultiVector), Element(Element),
Registers(Registers) {} + virtual unsigned sizeInBits() const override { + return Registers * Element->sizeInBits(); + } + unsigned registers() const { return Registers; } + virtual bool requires_float() const override { + return Element->requires_float(); + } + virtual std::string c_name_base() const override { + return Element->c_name_base() + "x" + utostr(Registers); + } + + static bool classof(const Type *T) { + return T->typeKind() == TypeKind::MultiVector; + } +}; + +class PredicateType : public CRegularNamedType { + unsigned Lanes; + +public: + PredicateType(unsigned Lanes) + : CRegularNamedType(TypeKind::Predicate), Lanes(Lanes) {} + virtual unsigned sizeInBits() const override { return 16; } + virtual std::string c_name_base() const override { return "mve_pred16"; } + virtual bool requires_float() const override { return false; }; + virtual std::string llvm_name() const override { + // Use <4 x i1> instead of <2 x i1> for two-lane vector types. See + // the comment in llvm/lib/Target/ARM/ARMInstrMVE.td for further + // explanation. + unsigned ModifiedLanes = (Lanes == 2 ? 4 : Lanes); + + return "llvm::VectorType::get(Builder.getInt1Ty(), " + + utostr(ModifiedLanes) + ")"; + } + + static bool classof(const Type *T) { + return T->typeKind() == TypeKind::Predicate; + } +}; + +// ----------------------------------------------------------------------------- +// Class to facilitate merging together the code generation for many intrinsics +// by means of varying a few constant or type parameters. +// +// Most obviously, the intrinsics in a single parametrised family will have +// code generation sequences that only differ in a type or two, e.g. vaddq_s8 +// and vaddq_u16 will look the same apart from putting a different vector type +// in the call to CGM.getIntrinsic(). But also, completely different intrinsics +// will often code-generate in the same way, with only a different choice of +// _which_ IR intrinsic they lower to (e.g. vaddq_m_s8 and vmulq_m_s8), but +// marshalling the arguments and return values of the IR intrinsic in exactly +// the same way. And others might differ only in some other kind of constant, +// such as a lane index. +// +// So, when we generate the IR-building code for all these intrinsics, we keep +// track of every value that could possibly be pulled out of the code and +// stored ahead of time in a local variable. Then we group together intrinsics +// by textual equivalence of the code that would result if _all_ those +// parameters were stored in local variables. That gives us maximal sets that +// can be implemented by a single piece of IR-building code by changing +// parameter values ahead of time. +// +// After we've done that, we do a second pass in which we only allocate _some_ +// of the parameters into local variables, by tracking which ones have the same +// values as each other (so that a single variable can be reused) and which +// ones are the same across the whole set (so that no variable is needed at +// all). +// +// Hence the class below. Its alloc_param method is invoked during code +// generation by every method of a Value subclass (see below) that wants to +// give it the opportunity to pull something out into a switchable parameter. +// It returns a variable name for the parameter, or (if it's being used in the +// second pass once we've decided that some parameters don't need to be stored +// in variables after all) it might just return the input expression unchanged. 
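+//
+// (A hypothetical example of the effect: if vaddq_m_s8 and vmulq_m_s8 share
+// their code generation apart from which IR intrinsic is called, pass 2
+// leaves exactly one variable behind, so the merged code looks roughly like
+//   Intrinsic::ID Param0;   // set per-builtin by an inner switch statement
+//   ... Builder.CreateCall(CGM.getIntrinsic(Param0, ...), ...);
+// rather than two textually identical copies of the whole sequence. This is
+// a sketch of the intent, not verbatim emitted code.)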
+
+struct CodeGenParamAllocator {
+  // Accumulated during code generation
+  std::vector<std::string> *ParamTypes = nullptr;
+  std::vector<std::string> *ParamValues = nullptr;
+
+  // Provided ahead of time in pass 2, to indicate which parameters are being
+  // assigned to what. This vector contains an entry for each call to
+  // alloc_param expected during code gen (which we counted up in pass 1), and
+  // indicates the number of the parameter variable that should be returned, or
+  // -1 if this call shouldn't allocate a parameter variable at all.
+  //
+  // We rely on the recursive code generation working identically in passes 1
+  // and 2, so that the same list of calls to alloc_param happen in the same
+  // order. That guarantees that the parameter numbers recorded in pass 1 will
+  // match the entries in this vector that store what MveEmitter::EmitBuiltinCG
+  // decided to do about each one in pass 2.
+  std::vector<int> *ParamNumberMap = nullptr;
+
+  // Internally track how many things we've allocated
+  unsigned nparams = 0;
+
+  std::string alloc_param(std::string Type, std::string Value) {
+    unsigned param_number;
+
+    if (!ParamNumberMap) {
+      // In pass 1, unconditionally assign a new parameter variable to every
+      // value we're asked to process.
+      param_number = nparams++;
+    } else {
+      // In pass 2, consult the map provided by the caller to find out which
+      // variable we should be keeping things in.
+      int map_value = (*ParamNumberMap)[nparams++];
+      if (map_value < 0)
+        return Value;
+      param_number = map_value;
+    }
+
+    // If we've allocated a new parameter variable for the first time, store
+    // its type and value to be retrieved after codegen.
+    if (ParamTypes && ParamTypes->size() == param_number)
+      ParamTypes->push_back(Type);
+    if (ParamValues && ParamValues->size() == param_number)
+      ParamValues->push_back(Value);
+
+    // Unimaginative naming scheme for parameter variables.
+    return "Param" + utostr(param_number);
+  }
+};
+
+// -----------------------------------------------------------------------------
+// System of classes that represent all the intermediate values used during
+// code-generation for an intrinsic.
+//
+// The base class 'Value' can represent a value of the LLVM type 'Value', or
+// sometimes 'Address' (for loads/stores, including an alignment requirement).
+//
+// In the case where the Tablegen provides a value in the codegen dag as a
+// plain integer literal, the Value object we construct here will be one that
+// returns true from hasIntegerConstantValue(). This allows the generated C++
+// code to use the constant directly in contexts which can take a literal
+// integer, such as Builder.CreateExtractValue(thing, 1), without going to the
+// effort of calling llvm::ConstantInt::get() and then pulling the constant
+// back out of the resulting llvm::Value later.
+
+class Value {
+public:
+  // Convenient shorthand for the pointer type we'll be using everywhere.
+  using Ptr = std::shared_ptr<Value>;
+
+private:
+  Ptr Predecessor;
+  std::string VarName;
+  bool VarNameUsed = false;
+  unsigned Visited = 0;
+
+public:
+  virtual ~Value() = default;
+  using Scope = std::map<std::string, Ptr>;
+  virtual void gen_code(raw_ostream &OS, CodeGenParamAllocator &) const = 0;
+  virtual bool hasIntegerConstantValue() const { return false; }
+  virtual uint32_t integerConstantValue() const { return 0; }
+  virtual std::string typeName() const { return "Value *"; }
+
+  // Mostly, when a code-generation operation has a dependency on prior
+  // operations, it's because it uses the output values of those operations as
+  // inputs. But there's one exception, which is the use of 'seq' in Tablegen
+  // to indicate that operations have to be performed in sequence regardless of
+  // whether they use each others' output values.
+  //
+  // So, the actual generation of code is done by depth-first search, using the
+  // prerequisites() method to get a list of all the other Values that have to
+  // be computed before this one. That method divides into the 'predecessor',
+  // set by setPredecessor() while processing a 'seq' dag node, and the list
+  // returned by 'morePrerequisites', which each subclass implements to return
+  // a list of the Values it uses as input to whatever its own computation is
+  // doing.
+
+  virtual void morePrerequisites(std::vector<Ptr> &output) const {}
+  std::vector<Ptr> prerequisites() const {
+    std::vector<Ptr> ToRet;
+    if (Predecessor)
+      ToRet.push_back(Predecessor);
+    morePrerequisites(ToRet);
+    return ToRet;
+  }
+
+  void setPredecessor(Ptr p) {
+    assert(!Predecessor);
+    Predecessor = p;
+  }
+
+  // Each Value will be assigned a variable name in the output code, but not
+  // all those variable names will actually be used (e.g. the return value of
+  // Builder.CreateStore has void type, so nobody will want to refer to it). To
+  // prevent annoying compiler warnings, we track whether each Value's variable
+  // name was ever actually mentioned in subsequent statements, so that it can
+  // be left out of the final generated code.
+  std::string varname() {
+    VarNameUsed = true;
+    return VarName;
+  }
+  void setVarname(const std::string &s) { VarName = s; }
+  bool varnameUsed() const { return VarNameUsed; }
+
+  // Code generation happens in multiple passes. This method tracks whether a
+  // Value has yet been visited in a given pass, without the need for a tedious
+  // loop in between passes that goes through and resets a 'visited' flag back
+  // to false: you just set Pass=1 the first time round, and Pass=2 the second
+  // time.
+  bool needs_visiting(unsigned Pass) {
+    bool ToRet = Visited < Pass;
+    Visited = Pass;
+    return ToRet;
+  }
+};
+
+// Value subclass that retrieves one of the arguments to the clang builtin
+// function. In cases where the argument has pointer type, we call
+// EmitPointerWithAlignment and store the result in a variable of type Address,
+// so that load and store IR nodes can know the right alignment. Otherwise, we
+// call EmitScalarExpr.
+//
+// There are aggregate parameters in the MVE intrinsics API, but we don't deal
+// with them in this Tablegen back end: they only arise in the vld2q/vld4q and
+// vst2q/vst4q family, which is few enough that we just write the code by hand
+// for those in CGBuiltin.cpp.
+class BuiltinArgValue : public Value {
+public:
+  unsigned ArgNum;
+  bool AddressType;
+  BuiltinArgValue(unsigned ArgNum, bool AddressType)
+      : ArgNum(ArgNum), AddressType(AddressType) {}
+  virtual void gen_code(raw_ostream &OS,
+                        CodeGenParamAllocator &) const override {
+    OS << (AddressType ? "EmitPointerWithAlignment" : "EmitScalarExpr")
+       << "(E->getArg(" << ArgNum << "))";
+  }
+  virtual std::string typeName() const override {
+    return AddressType ? "Address" : Value::typeName();
+  }
+};
+
+// Value subclass for an integer literal appearing in Tablegen. This may need
+// to be turned into an llvm::Value by means of llvm::ConstantInt::get(), or it
+// may be used directly as an integer, depending on which IRBuilder method it's
+// being passed to.
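+//
+// (For example, a literal 1 destined for the index operand of
+// Builder.CreateExtractValue(Val0, 1) is printed as a bare integer, whereas
+// the same literal used as an llvm::Value operand would be printed as
+// llvm::ConstantInt::get(Int32Ty, 1) -- an illustrative sketch of the two
+// output forms, with invented variable names.)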
+class IntLiteralValue : public Value {
+public:
+  const ScalarType *IntegerType;
+  uint32_t IntegerValue;
+  IntLiteralValue(const ScalarType *IntegerType, uint32_t IntegerValue)
+      : IntegerType(IntegerType), IntegerValue(IntegerValue) {}
+  virtual void gen_code(raw_ostream &OS,
+                        CodeGenParamAllocator &ParamAlloc) const override {
+    OS << "llvm::ConstantInt::get("
+       << ParamAlloc.alloc_param("llvm::Type *", IntegerType->llvm_name())
+       << ", ";
+    OS << ParamAlloc.alloc_param(IntegerType->c_name(), utostr(IntegerValue))
+       << ")";
+  }
+  virtual bool hasIntegerConstantValue() const override { return true; }
+  virtual uint32_t integerConstantValue() const override {
+    return IntegerValue;
+  }
+};
+
+// Value subclass representing a cast between different integer types. We use
+// our own ScalarType abstraction as the representation of the target type,
+// which gives both size and signedness.
+class IntCastValue : public Value {
+public:
+  const ScalarType *IntegerType;
+  Ptr V;
+  IntCastValue(const ScalarType *IntegerType, Ptr V)
+      : IntegerType(IntegerType), V(V) {}
+  virtual void gen_code(raw_ostream &OS,
+                        CodeGenParamAllocator &ParamAlloc) const override {
+    OS << "Builder.CreateIntCast(" << V->varname() << ", "
+       << ParamAlloc.alloc_param("llvm::Type *", IntegerType->llvm_name())
+       << ", "
+       << ParamAlloc.alloc_param("bool", IntegerType->kind() ==
+                                                 ScalarTypeKind::SignedInt
+                                             ? "true"
+                                             : "false")
+       << ")";
+  }
+  virtual void morePrerequisites(std::vector<Ptr> &output) const override {
+    output.push_back(V);
+  }
+};
+
+// Value subclass representing a call to an IRBuilder method. Each IRBuilder
+// method we want to use will have a Tablegen record giving the method name and
+// describing any important details of how to call it, such as whether a
+// particular argument should be an integer constant instead of an llvm::Value.
+class IRBuilderValue : public Value {
+public:
+  std::string BuilderMethod;
+  std::vector<Ptr> Args;
+  std::set<unsigned> AddressArgs;
+  std::set<unsigned> IntConstantArgs;
+  IRBuilderValue(std::string BuilderMethod, std::vector<Ptr> Args,
+                 std::set<unsigned> AddressArgs,
+                 std::set<unsigned> IntConstantArgs)
+      : BuilderMethod(BuilderMethod), Args(Args), AddressArgs(AddressArgs),
+        IntConstantArgs(IntConstantArgs) {}
+  virtual void gen_code(raw_ostream &OS,
+                        CodeGenParamAllocator &ParamAlloc) const override {
+    OS << "Builder." << BuilderMethod << "(";
+    const char *Sep = "";
+    for (unsigned i = 0, e = Args.size(); i < e; ++i) {
+      Ptr Arg = Args[i];
+      if (IntConstantArgs.find(i) != IntConstantArgs.end()) {
+        assert(Arg->hasIntegerConstantValue());
+        OS << Sep
+           << ParamAlloc.alloc_param("unsigned",
+                                     utostr(Arg->integerConstantValue()));
+      } else {
+        OS << Sep << Arg->varname();
+      }
+      Sep = ", ";
+    }
+    OS << ")";
+  }
+  virtual void morePrerequisites(std::vector<Ptr> &output) const override {
+    for (unsigned i = 0, e = Args.size(); i < e; ++i) {
+      Ptr Arg = Args[i];
+      if (IntConstantArgs.find(i) != IntConstantArgs.end())
+        continue;
+      output.push_back(Arg);
+    }
+  }
+};
+
+// Value subclass representing a call to an IR intrinsic, which we first have
+// to look up using an Intrinsic::ID constant and an array of types.
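+//
+// (Sketch of the shape of the generated call, with invented variable names:
+//   Builder.CreateCall(CGM.getIntrinsic(Intrinsic::arm_mve_minv_u,
+//       llvm::SmallVector<llvm::Type *, 4> {llvm::VectorType::get(Int32Ty, 4)}),
+//       llvm::SmallVector<Value *, 4> {Val0, Val1})
+// where the intrinsic ID and the type list both come from the Tablegen
+// record, and may themselves be hoisted into parameter variables.)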
+class IRIntrinsicValue : public Value {
+public:
+  std::string IntrinsicID;
+  std::vector<const Type *> ParamTypes;
+  std::vector<Ptr> Args;
+  IRIntrinsicValue(std::string IntrinsicID,
+                   std::vector<const Type *> ParamTypes, std::vector<Ptr> Args)
+      : IntrinsicID(IntrinsicID), ParamTypes(ParamTypes), Args(Args) {}
+  virtual void gen_code(raw_ostream &OS,
+                        CodeGenParamAllocator &ParamAlloc) const override {
+    std::string IntNo = ParamAlloc.alloc_param(
+        "Intrinsic::ID", "Intrinsic::arm_mve_" + IntrinsicID);
+    OS << "Builder.CreateCall(CGM.getIntrinsic(" << IntNo;
+    if (!ParamTypes.empty()) {
+      OS << ", llvm::SmallVector<llvm::Type *, 4> {";
+      const char *Sep = "";
+      for (auto T : ParamTypes) {
+        OS << Sep << ParamAlloc.alloc_param("llvm::Type *", T->llvm_name());
+        Sep = ", ";
+      }
+      OS << "}";
+    }
+    OS << "), llvm::SmallVector<Value *, 4> {";
+    const char *Sep = "";
+    for (auto Arg : Args) {
+      OS << Sep << Arg->varname();
+      Sep = ", ";
+    }
+    OS << "})";
+  }
+  virtual void morePrerequisites(std::vector<Ptr> &output) const override {
+    output.insert(output.end(), Args.begin(), Args.end());
+  }
+};
+
+// -----------------------------------------------------------------------------
+// Class that describes a single ACLE intrinsic.
+//
+// A Tablegen record will typically describe more than one ACLE intrinsic, by
+// means of setting the 'list<Type> Params' field to a list of multiple
+// parameter types, so as to define vaddq_{s8,u8,...,f16,f32} all in one go.
+// We'll end up with one instance of ACLEIntrinsic for *each* parameter type,
+// rather than a single one for all of them. Hence, the constructor takes both
+// a Tablegen record and the current value of the parameter type.
+
+class ACLEIntrinsic {
+  // Structure documenting that one of the intrinsic's arguments is required to
+  // be a compile-time constant integer, and what constraints there are on its
+  // value. Used when generating Sema checking code.
+  struct ImmediateArg {
+    enum class BoundsType { ExplicitRange, UInt };
+    BoundsType boundsType;
+    int64_t i1, i2;
+    std::string ExtraCheckType, ExtraCheckArgs;
+    const Type *ArgType;
+  };
+
+  // For polymorphic intrinsics, FullName is the explicit name that uniquely
+  // identifies this variant of the intrinsic, and ShortName is the name it
+  // shares with at least one other intrinsic.
+  std::string ShortName, FullName;
+
+  const Type *ReturnType;
+  std::vector<const Type *> ArgTypes;
+  std::map<unsigned, ImmediateArg> ImmediateArgs;
+  Value::Ptr Code;
+
+  std::map<std::string, std::string> CustomCodeGenArgs;
+
+  // Recursive function that does the internals of code generation.
+  void gen_code_dfs(Value::Ptr V, std::list<Value::Ptr> &used,
+                    unsigned Pass) const {
+    if (!V->needs_visiting(Pass))
+      return;
+
+    CodeGenParamAllocator DummyParamAlloc;
+    V->gen_code(nulls(), DummyParamAlloc);
+
+    for (Value::Ptr W : V->prerequisites())
+      gen_code_dfs(W, used, Pass);
+
+    used.push_back(V);
+  }
+
+public:
+  const std::string &shortName() const { return ShortName; }
+  const std::string &fullName() const { return FullName; }
+  const Type *returnType() const { return ReturnType; }
+  const std::vector<const Type *> &argTypes() const { return ArgTypes; }
+  bool requires_float() const {
+    if (ReturnType->requires_float())
+      return true;
+    for (const Type *T : ArgTypes)
+      if (T->requires_float())
+        return true;
+    return false;
+  }
+  bool polymorphic() const { return ShortName != FullName; }
+
+  // External entry point for code generation, called from MveEmitter.
+  void gen_code(raw_ostream &OS, CodeGenParamAllocator &ParamAlloc,
+                unsigned Pass) const {
+    if (!has_code()) {
+      for (auto kv : CustomCodeGenArgs)
+        OS << "  " << kv.first << " = " << kv.second << ";\n";
+      OS << "  break; // custom code gen\n";
+      return;
+    }
+    std::list<Value::Ptr> used;
+    gen_code_dfs(Code, used, Pass);
+
+    unsigned varindex = 0;
+    for (Value::Ptr V : used)
+      if (V->varnameUsed())
+        V->setVarname("Val" + utostr(varindex++));
+
+    for (Value::Ptr V : used) {
+      OS << "  ";
+      if (V == used.back()) {
+        assert(!V->varnameUsed());
+        OS << "return "; // FIXME: what if the top-level thing is void?
+      } else if (V->varnameUsed()) {
+        std::string Type = V->typeName();
+        OS << V->typeName();
+        if (!StringRef(Type).endswith("*"))
+          OS << " ";
+        OS << V->varname() << " = ";
+      }
+      V->gen_code(OS, ParamAlloc);
+      OS << ";\n";
+    }
+  }
+  bool has_code() const { return Code != nullptr; }
+
+  std::string gen_sema() const {
+    std::vector<std::string> SemaChecks;
+
+    for (const auto &kv : ImmediateArgs) {
+      const ImmediateArg &IA = kv.second;
+
+      llvm::APInt lo(128, 0), hi(128, 0);
+      switch (IA.boundsType) {
+      case ImmediateArg::BoundsType::ExplicitRange:
+        lo = IA.i1;
+        hi = IA.i2;
+        break;
+      case ImmediateArg::BoundsType::UInt:
+        lo = 0;
+        hi = IA.i1;
+        break;
+      }
+
+      // Compute the full representable range of the argument's C type, so
+      // that we can skip the range check when the permitted range covers the
+      // whole type anyway. Signed types extend with sext, unsigned with zext.
+      llvm::APInt typelo, typehi;
+      if (cast<ScalarType>(IA.ArgType)->kind() == ScalarTypeKind::SignedInt) {
+        typelo = llvm::APInt::getSignedMinValue(IA.ArgType->sizeInBits());
+        typehi = llvm::APInt::getSignedMaxValue(IA.ArgType->sizeInBits());
+        typelo = typelo.sext(128);
+        typehi = typehi.sext(128);
+      } else {
+        typelo = llvm::APInt::getMinValue(IA.ArgType->sizeInBits());
+        typehi = llvm::APInt::getMaxValue(IA.ArgType->sizeInBits());
+        typelo = typelo.zext(128);
+        typehi = typehi.zext(128);
+      }
+
+      std::string Index = utostr(kv.first);
+
+      if (lo.sle(typelo) && hi.sge(typehi))
+        SemaChecks.push_back("SemaBuiltinConstantArg(TheCall, " + Index + ")");
+      else
+        SemaChecks.push_back("SemaBuiltinConstantArgRange(TheCall, " + Index +
+                             ", 0x" + lo.toString(16, true) + ", 0x" +
+                             hi.toString(16, true) + ")");
+
+      if (!IA.ExtraCheckType.empty()) {
+        std::string Suffix;
+        if (!IA.ExtraCheckArgs.empty())
+          Suffix = ", " + IA.ExtraCheckArgs;
+        SemaChecks.push_back("SemaBuiltinConstantArg" + IA.ExtraCheckType +
+                             "(TheCall, " + Index + Suffix + ")");
+      }
+    }
+    if (SemaChecks.empty())
+      return "";
+    return std::string("  return ") +
+           join(std::begin(SemaChecks), std::end(SemaChecks),
+                " ||\n         ") +
+           ";\n";
+  }
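+
+  // (For illustration, a hypothetical intrinsic whose argument 2 must be a
+  // lane index in [0,7] would make gen_sema() return
+  //   "  return SemaBuiltinConstantArgRange(TheCall, 2, 0x0, 0x7);\n"
+  // while an immediate that may take any value of its type only gets the
+  // bare constancy check SemaBuiltinConstantArg(TheCall, 2).)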
+
+  ACLEIntrinsic(MveEmitter &ME, Record *R, const Type *Param);
+};
+
+// -----------------------------------------------------------------------------
+// The top-level class that holds all the state from analyzing the entire
+// Tablegen input.
+
+class MveEmitter {
+  // MveEmitter holds a collection of all the types we've instantiated.
+  VoidType Void;
+  std::map<std::string, std::unique_ptr<ScalarType>> ScalarTypes;
+  std::map<std::pair<ScalarTypeKind, unsigned>, std::unique_ptr<VectorType>>
+      VectorTypes;
+  std::map<std::pair<std::string, unsigned>, std::unique_ptr<MultiVectorType>>
+      MultiVectorTypes;
+  std::map<unsigned, std::unique_ptr<PredicateType>> PredicateTypes;
+  std::map<std::string, std::unique_ptr<PointerType>> PointerTypes;
+
+  // And all the ACLEIntrinsic instances we've created.
+  std::map<std::string, std::unique_ptr<ACLEIntrinsic>> ACLEIntrinsics;
+
+public:
+  // Methods to create a Type object, or return the right existing one from the
+  // maps stored in this object.
+  const VoidType *getVoidType() { return &Void; }
+  const ScalarType *getScalarType(StringRef Name) {
+    return ScalarTypes[Name].get();
+  }
+  const ScalarType *getScalarType(Record *R) {
+    return getScalarType(R->getName());
+  }
+  const VectorType *getVectorType(const ScalarType *ST) {
+    std::pair<ScalarTypeKind, unsigned> key(ST->kind(), ST->sizeInBits());
+    if (VectorTypes.find(key) == VectorTypes.end())
+      VectorTypes[key] = std::make_unique<VectorType>(ST);
+    return VectorTypes[key].get();
+  }
+  const MultiVectorType *getMultiVectorType(unsigned Registers,
+                                            const VectorType *VT) {
+    std::pair<std::string, unsigned> key(VT->c_name_base(), Registers);
+    if (MultiVectorTypes.find(key) == MultiVectorTypes.end())
+      MultiVectorTypes[key] = std::make_unique<MultiVectorType>(Registers, VT);
+    return MultiVectorTypes[key].get();
+  }
+  const PredicateType *getPredicateType(unsigned Lanes) {
+    unsigned key = Lanes;
+    if (PredicateTypes.find(key) == PredicateTypes.end())
+      PredicateTypes[key] = std::make_unique<PredicateType>(Lanes);
+    return PredicateTypes[key].get();
+  }
+  const PointerType *getPointerType(const Type *T, bool Const) {
+    PointerType PT(T, Const);
+    std::string key = PT.c_name();
+    if (PointerTypes.find(key) == PointerTypes.end())
+      PointerTypes[key] = std::make_unique<PointerType>(PT);
+    return PointerTypes[key].get();
+  }
+
+  // Methods to construct a type from various pieces of Tablegen. These are
+  // always called in the context of setting up a particular ACLEIntrinsic, so
+  // there's always an ambient parameter type (because we're iterating through
+  // the Params list in the Tablegen record for the intrinsic), which is used
+  // to expand Tablegen classes like 'Vector' which mean something different in
+  // each member of a parametric family.
+  const Type *getType(Record *R, const Type *Param);
+  const Type *getType(DagInit *D, const Type *Param);
+  const Type *getType(Init *I, const Type *Param);
+
+  // Functions that translate the Tablegen representation of an intrinsic's
+  // code generation into a collection of Value objects (which will then be
+  // reprocessed to read out the actual C++ code included by CGBuiltin.cpp).
+  Value::Ptr getCodeForDag(DagInit *D, const Value::Scope &Scope,
+                           const Type *Param);
+  Value::Ptr getCodeForDagArg(DagInit *D, unsigned ArgNum,
+                              const Value::Scope &Scope, const Type *Param);
+  Value::Ptr getCodeForArg(unsigned ArgNum, const Type *ArgType);
+
+  // Constructor and top-level functions.
+
+  MveEmitter(RecordKeeper &Records);
+
+  void EmitHeader(raw_ostream &OS);
+  void EmitBuiltinDef(raw_ostream &OS);
+  void EmitBuiltinSema(raw_ostream &OS);
+  void EmitBuiltinCG(raw_ostream &OS);
+  void EmitBuiltinAliases(raw_ostream &OS);
+};
+
+const Type *MveEmitter::getType(Init *I, const Type *Param) {
+  if (auto Dag = dyn_cast<DagInit>(I))
+    return getType(Dag, Param);
+  if (auto Def = dyn_cast<DefInit>(I))
+    return getType(Def->getDef(), Param);
+
+  PrintFatalError("Could not convert this value into a type");
+}
+
+const Type *MveEmitter::getType(Record *R, const Type *Param) {
+  if (R->isSubClassOf("Immediate"))
+    R = R->getValueAsDef("type"); // pass to subfield
+
+  if (R->getName() == "Void")
+    return getVoidType();
+  if (R->isSubClassOf("PrimitiveType"))
+    return getScalarType(R);
+  if (R->isSubClassOf("ComplexType"))
+    return getType(R->getValueAsDag("spec"), Param);
+
+  PrintFatalError(R->getLoc(), "Could not convert this record into a type");
+}
+
+const Type *MveEmitter::getType(DagInit *D, const Type *Param) {
+  // The meat of the getType system: types in the Tablegen are represented by a
+  // dag whose operators select sub-cases of this function.
+
+  Record *Op = cast<DefInit>(D->getOperator())->getDef();
+  if (!Op->isSubClassOf("ComplexTypeOp"))
+    PrintFatalError(
+        "Expected ComplexTypeOp as dag operator in type expression");
+
+  if (Op->getName() == "CTO_Parameter") {
+    if (isa<VoidType>(Param))
+      PrintFatalError("Parametric type in unparametrised context");
+    return Param;
+  }
+
+  if (Op->getName() == "CTO_Vec") {
+    const Type *Element = getType(D->getArg(0), Param);
+    return getVectorType(cast<ScalarType>(Element));
+  }
+
+  if (Op->getName() == "CTO_Pred") {
+    const Type *Element = getType(D->getArg(0), Param);
+    return getPredicateType(128 / Element->sizeInBits());
+  }
+
+  if (Op->isSubClassOf("CTO_Tuple")) {
+    unsigned Registers = Op->getValueAsInt("n");
+    const Type *Element = getType(D->getArg(0), Param);
+    return getMultiVectorType(Registers, cast<VectorType>(Element));
+  }
+
+  if (Op->isSubClassOf("CTO_Pointer")) {
+    const Type *Pointee = getType(D->getArg(0), Param);
+    return getPointerType(Pointee, Op->getValueAsBit("const"));
+  }
+
+  if (Op->isSubClassOf("CTO_Sign")) {
+    const ScalarType *ST = cast<ScalarType>(getType(D->getArg(0), Param));
+    ScalarTypeKind NewKind = Op->getValueAsBit("signed")
+                                 ? ScalarTypeKind::SignedInt
+                                 : ScalarTypeKind::UnsignedInt;
+    for (const auto &kv : ScalarTypes) {
+      const ScalarType *RT = kv.second.get();
+      if (RT->kind() == NewKind && RT->sizeInBits() == ST->sizeInBits())
+        return RT;
+    }
+    PrintFatalError("Cannot change sign of this type");
+  }
+
+  PrintFatalError("Bad operator in type dag expression");
+}
+
+Value::Ptr MveEmitter::getCodeForDag(DagInit *D, const Value::Scope &Scope,
+                                     const Type *Param) {
+  Record *Op = cast<DefInit>(D->getOperator())->getDef();
+
+  if (Op->getName() == "seq") {
+    Value::Scope SubScope = Scope;
+    Value::Ptr PrevV = nullptr;
+    for (unsigned i = 0, e = D->getNumArgs(); i < e; ++i) {
+      // We don't use getCodeForDagArg here, because the argument name
+      // has different semantics in a seq
+      Value::Ptr V =
+          getCodeForDag(cast<DagInit>(D->getArg(i)), SubScope, Param);
+      StringRef ArgName = D->getArgNameStr(i);
+      if (!ArgName.empty())
+        SubScope[ArgName] = V;
+      if (PrevV)
+        V->setPredecessor(PrevV);
+      PrevV = V;
+    }
+    return PrevV;
+  } else if (Op->isSubClassOf("Type")) {
+    if (D->getNumArgs() != 1)
+      PrintFatalError("Type casts should have exactly one argument");
+    const Type *CastType = getType(Op, Param);
+    Value::Ptr Arg = getCodeForDagArg(D, 0, Scope, Param);
+    if (const auto *ST = dyn_cast<ScalarType>(CastType)) {
+      if (!ST->requires_float()) {
+        if (Arg->hasIntegerConstantValue())
+          return std::make_shared<IntLiteralValue>(
+              ST, Arg->integerConstantValue());
+        else
+          return std::make_shared<IntCastValue>(ST, Arg);
+      }
+    }
+    PrintFatalError("Unsupported type cast");
+  } else {
+    std::vector<Value::Ptr> Args;
+    for (unsigned i = 0, e = D->getNumArgs(); i < e; ++i)
+      Args.push_back(getCodeForDagArg(D, i, Scope, Param));
+    if (Op->isSubClassOf("IRBuilder")) {
+      std::set<unsigned> AddressArgs;
+      for (unsigned i : Op->getValueAsListOfInts("address_params"))
+        AddressArgs.insert(i);
+      std::set<unsigned> IntConstantArgs;
+      for (unsigned i : Op->getValueAsListOfInts("int_constant_params"))
+        IntConstantArgs.insert(i);
+      return std::make_shared<IRBuilderValue>(
+          Op->getValueAsString("func"), Args, AddressArgs, IntConstantArgs);
+    } else if (Op->isSubClassOf("IRInt")) {
+      std::vector<const Type *> ParamTypes;
+      for (Record *RParam : Op->getValueAsListOfDefs("params"))
+        ParamTypes.push_back(getType(RParam, Param));
+      std::string IntName = Op->getValueAsString("intname");
+      if (Op->getValueAsBit("appendKind"))
+        IntName += "_" + to_letter(cast<ScalarType>(Param)->kind());
+      return std::make_shared<IRIntrinsicValue>(IntName, ParamTypes, Args);
+    } else {
PrintFatalError("Unsupported dag node " + Op->getName()); + } + } +} + +Value::Ptr MveEmitter::getCodeForDagArg(DagInit *D, unsigned ArgNum, + const Value::Scope &Scope, + const Type *Param) { + Init *Arg = D->getArg(ArgNum); + StringRef Name = D->getArgNameStr(ArgNum); + + if (!Name.empty()) { + if (!isa(Arg)) + PrintFatalError( + "dag operator argument should not have both a value and a name"); + auto it = Scope.find(Name); + if (it == Scope.end()) + PrintFatalError("unrecognized variable name '" + Name + "'"); + return it->second; + } + + if (auto *II = dyn_cast(Arg)) + return std::make_shared(getScalarType("u32"), + II->getValue()); + + if (auto *DI = dyn_cast(Arg)) + return getCodeForDag(DI, Scope, Param); + + PrintFatalError("bad dag argument type for code generation"); +} + +Value::Ptr MveEmitter::getCodeForArg(unsigned ArgNum, const Type *ArgType) { + Value::Ptr V = + std::make_shared(ArgNum, isa(ArgType)); + + if (const auto *ST = dyn_cast(ArgType)) { + if (ST->is_integer() && ST->sizeInBits() < 32) + V = std::make_shared(getScalarType("u32"), V); + } else if (const auto *PT = dyn_cast(ArgType)) { + V = std::make_shared(getScalarType("u32"), V); + V = std::make_shared( + "pred_i2v", std::vector{PT}, std::vector{V}); + } + + return V; +} + +ACLEIntrinsic::ACLEIntrinsic(MveEmitter &ME, Record *R, const Type *Param) + : ReturnType(ME.getType(R->getValueAsDef("ret"), Param)) { + // Derive the intrinsic's full name, by taking the name of the + // Tablegen record (or override) and appending the suffix from its + // parameter type. (If the intrinsic is unparametrised, its + // parameter type will be given as Void, which returns the empty + // string for acle_suffix.) + std::string BaseName = (R->isSubClassOf("NameOverride") ? + R->getValueAsString("basename") : R->getName()); + FullName = BaseName + Param->acle_suffix(); + + // Derive the intrinsic's polymorphic name, by removing components from the + // full name as specified by its 'pnt' member ('polymorphic name type'), + // which indicates how many type suffixes to remove, and any other piece of + // the name that should be removed. + Record *PolymorphicNameType = R->getValueAsDef("pnt"); + SmallVector NameParts; + StringRef(FullName).split(NameParts, '_'); + for (unsigned i = 0, e = PolymorphicNameType->getValueAsInt( + "NumTypeSuffixesToDiscard"); + i < e; ++i) + NameParts.pop_back(); + if (!PolymorphicNameType->isValueUnset("ExtraSuffixToDiscard")) { + std::string ExtraSuffix = + PolymorphicNameType->getValueAsString("ExtraSuffixToDiscard"); + auto it = NameParts.end(); + while (it != NameParts.begin()) { + --it; + if (*it == ExtraSuffix) { + NameParts.erase(it); + break; + } + } + } + ShortName = join(std::begin(NameParts), std::end(NameParts), "_"); + + // Process the intrinsic's argument list. + DagInit *ArgsDag = R->getValueAsDag("args"); + Value::Scope Scope; + for (unsigned i = 0, e = ArgsDag->getNumArgs(); i < e; ++i) { + Init *TypeInit = ArgsDag->getArg(i); + + // Work out the type of the argument, for use in the function prototype in + // the header file. + const Type *ArgType = ME.getType(TypeInit, Param); + ArgTypes.push_back(ArgType); + + // The argument will usually have a name in the arguments dag, which goes + // into the variable-name scope that the code gen will refer to. + StringRef ArgName = ArgsDag->getArgNameStr(i); + if (!ArgName.empty()) + Scope[ArgName] = ME.getCodeForArg(i, ArgType); + + // If the argument is a subclass of Immediate, record the details about + // what values it can take, for Sema checking. 
+    if (auto TypeDI = dyn_cast<DefInit>(TypeInit)) {
+      Record *TypeRec = TypeDI->getDef();
+      if (TypeRec->isSubClassOf("Immediate")) {
+        Record *Bounds = TypeRec->getValueAsDef("bounds");
+        ImmediateArg &IA = ImmediateArgs[i];
+        if (Bounds->isSubClassOf("IB_ConstRange")) {
+          IA.boundsType = ImmediateArg::BoundsType::ExplicitRange;
+          IA.i1 = Bounds->getValueAsInt("lo");
+          IA.i2 = Bounds->getValueAsInt("hi");
+        } else if (Bounds->getName() == "IB_UEltValue") {
+          IA.boundsType = ImmediateArg::BoundsType::UInt;
+          IA.i1 = Param->sizeInBits();
+        } else if (Bounds->getName() == "IB_LaneIndex") {
+          IA.boundsType = ImmediateArg::BoundsType::ExplicitRange;
+          IA.i1 = 0;
+          IA.i2 = 128 / Param->sizeInBits();
+        } else if (Bounds->getName() == "IB_EltBit") {
+          IA.boundsType = ImmediateArg::BoundsType::ExplicitRange;
+          IA.i1 = Bounds->getValueAsInt("base");
+          IA.i2 = IA.i1 + Param->sizeInBits() - 1;
+        } else {
+          PrintFatalError("unrecognised ImmediateBounds subclass");
+        }
+
+        IA.ArgType = ArgType;
+
+        if (!TypeRec->isValueUnset("extra")) {
+          IA.ExtraCheckType = TypeRec->getValueAsString("extra");
+          if (!TypeRec->isValueUnset("extraarg"))
+            IA.ExtraCheckArgs =
+                std::string(TypeRec->getValueAsString("extraarg"));
+        }
+      }
+    }
+  }
+
+  // Finally, go through the codegen dag and translate it into a Value object
+  // (with an arbitrary DAG of depended-on Values hanging off it).
+  DagInit *CodeDag = R->getValueAsDag("codegen");
+  Record *MainOp = cast<DefInit>(CodeDag->getOperator())->getDef();
+  if (MainOp->isSubClassOf("CustomCodegen")) {
+    // Or, if it's the special case of CustomCodegen, just accumulate
+    // a list of parameters we're going to assign to variables before
+    // breaking from the loop.
+    CustomCodeGenArgs["CustomCodeGenType"] =
+        "CustomCodeGen::" + std::string(MainOp->getValueAsString("type"));
+    for (unsigned i = 0, e = CodeDag->getNumArgs(); i < e; ++i) {
+      StringRef Name = CodeDag->getArgNameStr(i);
+      if (Name.empty()) {
+        PrintFatalError("Operands to CustomCodegen should have names");
+      } else if (auto *II = dyn_cast<IntInit>(CodeDag->getArg(i))) {
+        CustomCodeGenArgs[Name] = itostr(II->getValue());
+      } else if (auto *SI = dyn_cast<StringInit>(CodeDag->getArg(i))) {
+        CustomCodeGenArgs[Name] = SI->getValue();
+      } else {
+        PrintFatalError("Operands to CustomCodegen should be integers or "
+                        "strings");
+      }
+    }
+  } else {
+    Code = ME.getCodeForDag(CodeDag, Scope, Param);
+  }
+}
+
+MveEmitter::MveEmitter(RecordKeeper &Records) {
+  // Construct the whole MveEmitter.
+
+  // First, look up all the instances of PrimitiveType. This gives us the list
+  // of vector typedefs we have to put in arm_mve.h, and also allows us to
+  // collect all the useful ScalarType instances into a big list so that we can
+  // use it for operations such as 'find the unsigned version of this signed
+  // integer type'.
+  for (Record *R : Records.getAllDerivedDefinitions("PrimitiveType"))
+    ScalarTypes[R->getName()] = std::make_unique<ScalarType>(R);
+
+  // Now go through the instances of Intrinsic, and for each one, iterate
+  // through its list of type parameters making an ACLEIntrinsic for each one.
+  for (Record *R : Records.getAllDerivedDefinitions("Intrinsic")) {
+    for (Record *RParam : R->getValueAsListOfDefs("params")) {
+      const Type *Param = getType(RParam, getVoidType());
+      auto Intrinsic = std::make_unique<ACLEIntrinsic>(*this, R, Param);
+      ACLEIntrinsics[Intrinsic->fullName()] = std::move(Intrinsic);
+    }
+  }
+}
+
+/// A wrapper on raw_string_ostream that contains its own buffer rather than
+/// having to point it at one elsewhere.
(In other words, it works just like +/// std::ostringstream; also, this makes it convenient to declare a whole array +/// of them at once.) +/// +/// We have to set this up using multiple inheritance, to ensure that the +/// string member has been constructed before raw_string_ostream's constructor +/// is given a pointer to it. +class string_holder { +protected: + std::string S; +}; +class raw_self_contained_string_ostream : private string_holder, + public raw_string_ostream { +public: + raw_self_contained_string_ostream() + : string_holder(), raw_string_ostream(S) {} +}; + +void MveEmitter::EmitHeader(raw_ostream &OS) { + // Accumulate pieces of the header file that will be enabled under various + // different combinations of #ifdef. The index into parts[] is made up of + // the following bit flags. + constexpr unsigned Float = 1; + constexpr unsigned UseUserNamespace = 2; + + constexpr unsigned NumParts = 4; + raw_self_contained_string_ostream parts[NumParts]; + + // Write typedefs for all the required vector types, and a few scalar + // types that don't already have the name we want them to have. + + parts[0] << "typedef uint16_t mve_pred16_t;\n"; + parts[Float] << "typedef __fp16 float16_t;\n" + "typedef float float32_t;\n"; + for (const auto &kv : ScalarTypes) { + const ScalarType *ST = kv.second.get(); + raw_ostream &OS = parts[ST->requires_float() ? Float : 0]; + const VectorType *VT = getVectorType(ST); + + OS << "typedef __attribute__((neon_vector_type(" << VT->lanes() << "))) " + << ST->c_name() << " " << VT->c_name() << ";\n"; + + // Every vector type also comes with a pair of multi-vector types for + // the VLD2 and VLD4 instructions. + for (unsigned n = 2; n <= 4; n += 2) { + const MultiVectorType *MT = getMultiVectorType(n, VT); + OS << "typedef struct { " << VT->c_name() << " val[" << n << "]; } " + << MT->c_name() << ";\n"; + } + } + parts[0] << "\n"; + parts[Float] << "\n"; + + // Write declarations for all the intrinsics. + + for (const auto &kv : ACLEIntrinsics) { + const ACLEIntrinsic &Int = *kv.second; + + // We generate each intrinsic twice, under its full unambiguous + // name and its shorter polymorphic name (if the latter exists). + for (bool Polymorphic : {false, true}) { + if (Polymorphic && !Int.polymorphic()) + continue; + + // We also generate each intrinsic under a name like __arm_vfooq + // (which is in C language implementation namespace, so it's + // safe to define in any conforming user program) and a shorter + // one like vfooq (which is in user namespace, so a user might + // reasonably have used it for something already). If so, they + // can #define __ARM_MVE_PRESERVE_USER_NAMESPACE before + // including the header, which will suppress the shorter names + // and leave only the implementation-namespace ones. Then they + // have to write __arm_vfooq everywhere, of course. + + for (bool UserNamespace : {false, true}) { + raw_ostream &OS = parts[(Int.requires_float() ? Float : 0) | + (UserNamespace ? UseUserNamespace : 0)]; + + // Make the name of the function in this declaration. + + std::string FunctionName = + Polymorphic ? Int.shortName() : Int.fullName(); + if (!UserNamespace) + FunctionName = "__arm_" + FunctionName; + + // Make strings for the types involved in the function's + // prototype. 
+
+        std::string RetTypeName = Int.returnType()->c_name();
+        if (!StringRef(RetTypeName).endswith("*"))
+          RetTypeName += " ";
+
+        std::vector<std::string> ArgTypeNames;
+        for (const Type *ArgTypePtr : Int.argTypes())
+          ArgTypeNames.push_back(ArgTypePtr->c_name());
+        std::string ArgTypesString =
+            join(std::begin(ArgTypeNames), std::end(ArgTypeNames), ", ");
+
+        // Emit the actual declaration. All these functions are
+        // declared 'static inline' without a body, which is fine
+        // provided clang recognizes them as builtins, and has the
+        // effect that this type signature is used in place of the one
+        // that Builtins.def didn't provide. That's how we can get
+        // structure types that weren't defined until this header was
+        // included to be part of the type signature of a builtin that
+        // was known to clang already.
+        //
+        // The declarations use __attribute__((__clang_arm_mve_alias)),
+        // so that each function declared will be recognized as the
+        // appropriate MVE builtin in spite of its user-facing name.
+        //
+        // (That's better than making them all wrapper functions,
+        // partly because it avoids any compiler error message citing
+        // the wrapper function definition instead of the user's code,
+        // and mostly because some MVE intrinsics have arguments
+        // required to be compile-time constants, and that property
+        // can't be propagated through a wrapper function. It can be
+        // propagated through a macro, but macros can't be overloaded
+        // on argument types very easily - you have to use _Generic,
+        // which makes error messages very confusing when the user
+        // gets it wrong.)
+        //
+        // Finally, the polymorphic versions of the intrinsics are
+        // also defined with __attribute__((overloadable)), so that
+        // when the same name is defined with several type signatures,
+        // the right thing happens. Each one of the overloaded
+        // declarations is given a different builtin id, which
+        // has exactly the effect we want: first clang resolves the
+        // overload to the right function, then it knows which builtin
+        // it's referring to, and then the Sema checking for that
+        // builtin can check further things like the constant
+        // arguments.
+        //
+        // One more subtlety is the newline just before the return
+        // type name. That's a cosmetic tweak to make the error
+        // messages legible if the user gets the types wrong in a call
+        // to a polymorphic function: this way, clang will print just
+        // the _final_ line of each declaration in the header, to show
+        // the type signatures that would have been legal. So all the
+        // confusing machinery with __attribute__ is left out of the
+        // error message, and the user sees something that's more or
+        // less self-documenting: "here's a list of actually readable
+        // type signatures for vfooq(), and here's why each one didn't
+        // match your call".
+
+        OS << "static __inline__ __attribute__(("
+           << (Polymorphic ? "overloadable, " : "")
+           << "__clang_arm_mve_alias(__builtin_arm_mve_" << Int.fullName()
+           << ")))\n"
+           << RetTypeName << FunctionName << "(" << ArgTypesString << ");\n";
+      }
+    }
+  }
+  for (auto &part : parts)
+    part << "\n";
+
+  // Now we've finished accumulating bits and pieces into the parts[] array.
+  // Put it all together to write the final output file.
+ + OS << "/*===---- arm_mve.h - ARM MVE intrinsics " + "-----------------------------------===\n" + " *\n" + " * Permission is hereby granted, free of charge, to any person " + "obtaining a copy\n" + " * of this software and associated documentation files (the " + "\"Software\"), to deal\n" + " * in the Software without restriction, including without " + "limitation " + "the rights\n" + " * to use, copy, modify, merge, publish, distribute, sublicense, " + "and/or sell\n" + " * copies of the Software, and to permit persons to whom the " + "Software " + "is\n" + " * furnished to do so, subject to the following conditions:\n" + " *\n" + " * The above copyright notice and this permission notice shall be " + "included in\n" + " * all copies or substantial portions of the Software.\n" + " *\n" + " * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY " + "KIND, " + "EXPRESS OR\n" + " * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF " + "MERCHANTABILITY,\n" + " * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO " + "EVENT " + "SHALL THE\n" + " * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES " + "OR " + "OTHER\n" + " * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR " + "OTHERWISE, " + "ARISING FROM,\n" + " * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER " + "DEALINGS IN\n" + " * THE SOFTWARE.\n" + " *\n" + " *===-------------------------------------------------------------" + "----" + "------===\n" + " */\n" + "\n" + "#ifndef __ARM_MVE_H\n" + "#define __ARM_MVE_H\n" + "\n" + "#if !__ARM_FEATURE_MVE\n" + "#error \"MVE support not enabled\"\n" + "#endif\n" + "\n" + "#include \n" + "\n"; + + for (size_t i = 0; i < NumParts; ++i) { + std::vector conditions; + if (i & Float) + conditions.push_back("(__ARM_FEATURE_MVE & 2)"); + if (i & UseUserNamespace) + conditions.push_back("(!defined __ARM_MVE_PRESERVE_USER_NAMESPACE)"); + + std::string condition = + join(std::begin(conditions), std::end(conditions), " && "); + if (!condition.empty()) + OS << "#if " << condition << "\n\n"; + OS << parts[i].str(); + if (!condition.empty()) + OS << "#endif /* " << condition << " */\n\n"; + } + + OS << "#endif /* __ARM_MVE_H */\n"; +} + +void MveEmitter::EmitBuiltinDef(raw_ostream &OS) { + for (const auto &kv : ACLEIntrinsics) { + const ACLEIntrinsic &Int = *kv.second; + OS << "TARGET_HEADER_BUILTIN(__builtin_arm_mve_" << Int.fullName() + << ", \"\", \"n\", \"arm_mve.h\", ALL_LANGUAGES, \"\")\n"; + } + + std::set ShortNamesSeen; + + for (const auto &kv : ACLEIntrinsics) { + const ACLEIntrinsic &Int = *kv.second; + if (Int.polymorphic()) { + std::string Name = Int.shortName(); + if (ShortNamesSeen.find(Name) == ShortNamesSeen.end()) { + OS << "BUILTIN(__builtin_arm_mve_" << Name << ", \"vi.\", \"nt\")\n"; + ShortNamesSeen.insert(Name); + } + } + } +} + +void MveEmitter::EmitBuiltinSema(raw_ostream &OS) { + std::map> Checks; + + for (const auto &kv : ACLEIntrinsics) { + const ACLEIntrinsic &Int = *kv.second; + std::string Check = Int.gen_sema(); + if (!Check.empty()) + Checks[Check].insert(Int.fullName()); + } + + for (const auto &kv : Checks) { + for (std::string Name : kv.second) + OS << "case ARM::BI__builtin_arm_mve_" << Name << ":\n"; + OS << kv.first; + } +} + +// Machinery for the grouping of intrinsics by similar codegen. +// +// The general setup is that 'MergeableGroup' stores the things that a set of +// similarly shaped intrinsics have in common: the text of their code +// generation, and the number and type of their parameter variables. 
+
+// Machinery for the grouping of intrinsics by similar codegen.
+//
+// The general setup is that 'MergeableGroup' stores the things that a set of
+// similarly shaped intrinsics have in common: the text of their code
+// generation, and the number and type of their parameter variables.
+// MergeableGroup is the key in a std::map whose value is a set of
+// OutputIntrinsic, which stores the ways in which a particular intrinsic
+// specializes the MergeableGroup's generic description: the function name and
+// the _values_ of the parameter variables.
+
+struct ComparableStringVector : std::vector<std::string> {
+  // Infrastructure: a derived class of std::vector<std::string> which comes
+  // with an ordering, so that it can be used as a key in maps and an element
+  // in sets. There's no requirement on the ordering beyond being
+  // deterministic.
+  bool operator<(const ComparableStringVector &rhs) const {
+    if (size() != rhs.size())
+      return size() < rhs.size();
+    for (size_t i = 0, e = size(); i < e; ++i)
+      if ((*this)[i] != rhs[i])
+        return (*this)[i] < rhs[i];
+    return false;
+  }
+};
+
+struct OutputIntrinsic {
+  const ACLEIntrinsic *Int;
+  std::string Name;
+  ComparableStringVector ParamValues;
+  bool operator<(const OutputIntrinsic &rhs) const {
+    if (Name != rhs.Name)
+      return Name < rhs.Name;
+    return ParamValues < rhs.ParamValues;
+  }
+};
+struct MergeableGroup {
+  std::string Code;
+  ComparableStringVector ParamTypes;
+  bool operator<(const MergeableGroup &rhs) const {
+    if (Code != rhs.Code)
+      return Code < rhs.Code;
+    return ParamTypes < rhs.ParamTypes;
+  }
+};
+
+void MveEmitter::EmitBuiltinCG(raw_ostream &OS) {
+  // Pass 1: generate code for all the intrinsics as if every type or constant
+  // that can possibly be abstracted out into a parameter variable will be.
+  // This identifies the sets of intrinsics we'll group together into a single
+  // piece of code generation.
+
+  std::map<MergeableGroup, std::set<OutputIntrinsic>> MergeableGroupsPrelim;
+
+  for (const auto &kv : ACLEIntrinsics) {
+    const ACLEIntrinsic &Int = *kv.second;
+
+    MergeableGroup MG;
+    OutputIntrinsic OI;
+
+    OI.Int = &Int;
+    OI.Name = Int.fullName();
+    CodeGenParamAllocator ParamAllocPrelim{&MG.ParamTypes, &OI.ParamValues};
+    raw_string_ostream OS(MG.Code);
+    Int.gen_code(OS, ParamAllocPrelim, 1);
+    OS.flush();
+
+    MergeableGroupsPrelim[MG].insert(OI);
+  }
+
+  // Pass 2: for each of those groups, optimize the parameter variable set by
+  // eliminating 'parameters' that are the same for all intrinsics in the
+  // group, and merging together pairs of parameter variables that take the
+  // same values as each other for all intrinsics in the group.
+
+  std::map<MergeableGroup, std::set<OutputIntrinsic>> MergeableGroups;
+
+  for (const auto &kv : MergeableGroupsPrelim) {
+    const MergeableGroup &MG = kv.first;
+    std::vector<int> ParamNumbers;
+    std::map<ComparableStringVector, int> ParamNumberMap;
+
+    // Loop over the parameters for this group.
+    for (size_t i = 0, e = MG.ParamTypes.size(); i < e; ++i) {
+      // Is this parameter the same for all intrinsics in the group?
+      bool Constant = true;
+      auto it = kv.second.begin();
+      const auto &OI_first = *it;
+      for (++it; it != kv.second.end(); ++it) {
+        const auto &OI = *it;
+        if (OI.ParamValues[i] != OI_first.ParamValues[i]) {
+          Constant = false;
+          break;
+        }
+      }
+
+      // If so, record it as -1, meaning 'no parameter variable needed'. Then
+      // the corresponding call to alloc_param in pass 2 will not generate a
+      // variable at all, and just use the value inline.
+      if (Constant) {
+        ParamNumbers.push_back(-1);
+        continue;
+      }
+
+      // Otherwise, make a list of the values this parameter takes for each
+      // intrinsic, and see if that value vector matches anything we already
+      // have. We also record the parameter type, so that we don't accidentally
+      // match up two parameter variables with different types. (Not that
+      // there's much chance of them having textually equivalent values, but
+      // in _principle_ it could happen.)
+      ComparableStringVector key;
+      key.push_back(MG.ParamTypes[i]);
+      for (const auto &OI : kv.second)
+        key.push_back(OI.ParamValues[i]);
+
+      auto Found = ParamNumberMap.find(key);
+      if (Found != ParamNumberMap.end()) {
+        // Yes, an existing parameter variable can be reused for this.
+        ParamNumbers.push_back(Found->second);
+        continue;
+      }
+
+      // No, we need a new parameter variable.
+      int new_index = ParamNumberMap.size();
+      ParamNumberMap[key] = new_index;
+      ParamNumbers.push_back(new_index);
+    }
+
+    // Now we're ready to do the pass 2 code generation, which will emit the
+    // reduced set of parameter variables we've just worked out.
+
+    for (const auto &OI_prelim : kv.second) {
+      const ACLEIntrinsic *Int = OI_prelim.Int;
+
+      MergeableGroup MG;
+      OutputIntrinsic OI;
+
+      OI.Int = OI_prelim.Int;
+      OI.Name = OI_prelim.Name;
+      CodeGenParamAllocator ParamAlloc{&MG.ParamTypes, &OI.ParamValues,
+                                       &ParamNumbers};
+      raw_string_ostream OS(MG.Code);
+      Int->gen_code(OS, ParamAlloc, 2);
+      OS.flush();
+
+      MergeableGroups[MG].insert(OI);
+    }
+  }
+
+  // Output the actual C++ code.
+
+  for (const auto &kv : MergeableGroups) {
+    const MergeableGroup &MG = kv.first;
+
+    // List of case statements in the main switch on BuiltinID, and an open
+    // brace.
+    const char *prefix = "";
+    for (const auto &OI : kv.second) {
+      OS << prefix << "case ARM::BI__builtin_arm_mve_" << OI.Name << ":";
+      prefix = "\n";
+    }
+    OS << " {\n";
+
+    if (!MG.ParamTypes.empty()) {
+      // If we've got some parameter variables, then emit their declarations...
+      for (size_t i = 0, e = MG.ParamTypes.size(); i < e; ++i) {
+        StringRef Type = MG.ParamTypes[i];
+        OS << "  " << Type;
+        if (!Type.endswith("*"))
+          OS << " ";
+        OS << "Param" << utostr(i) << ";\n";
+      }
+
+      // ... and an inner switch on BuiltinID that will fill them in with each
+      // individual intrinsic's values.
+      OS << "  switch (BuiltinID) {\n";
+      for (const auto &OI : kv.second) {
+        OS << "  case ARM::BI__builtin_arm_mve_" << OI.Name << ":\n";
+        for (size_t i = 0, e = MG.ParamTypes.size(); i < e; ++i)
+          OS << "    Param" << utostr(i) << " = " << OI.ParamValues[i]
+             << ";\n";
+        OS << "    break;\n";
+      }
+      OS << "  }\n";
+    }
+
+    // And finally, output the code, and close the outer pair of braces. (The
+    // code will always end with a 'return' statement, so we need not insert a
+    // 'break' here.)
+
+void MveEmitter::EmitBuiltinAliases(raw_ostream &OS) {
+  for (const auto &kv : ACLEIntrinsics) {
+    const ACLEIntrinsic &Int = *kv.second;
+    OS << "case ARM::BI__builtin_arm_mve_" << Int.fullName() << ":\n"
+       << "  return AliasName == \"" << Int.fullName() << "\"";
+    if (Int.polymorphic())
+      OS << " || AliasName == \"" << Int.shortName() << "\"";
+    OS << ";\n";
+  }
+}
+
+} // namespace
+
+namespace clang {
+
+void EmitMveHeader(RecordKeeper &Records, raw_ostream &OS) {
+  MveEmitter(Records).EmitHeader(OS);
+}
+
+void EmitMveBuiltinDef(RecordKeeper &Records, raw_ostream &OS) {
+  MveEmitter(Records).EmitBuiltinDef(OS);
+}
+
+void EmitMveBuiltinSema(RecordKeeper &Records, raw_ostream &OS) {
+  MveEmitter(Records).EmitBuiltinSema(OS);
+}
+
+void EmitMveBuiltinCG(RecordKeeper &Records, raw_ostream &OS) {
+  MveEmitter(Records).EmitBuiltinCG(OS);
+}
+
+void EmitMveBuiltinAliases(RecordKeeper &Records, raw_ostream &OS) {
+  MveEmitter(Records).EmitBuiltinAliases(OS);
+}
+
+} // end namespace clang
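[Editorial note: the fragment EmitBuiltinAliases generates is a flat run of case statements of the following shape, one per intrinsic; for a polymorphic intrinsic both the full and the short name are accepted. The particular intrinsic shown is just an example of the pattern, derived directly from the emission code above.]

```cpp
case ARM::BI__builtin_arm_mve_vaddq_s8:
  return AliasName == "vaddq_s8" || AliasName == "vaddq";
```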
diff --git a/clang/utils/TableGen/TableGen.cpp b/clang/utils/TableGen/TableGen.cpp
--- a/clang/utils/TableGen/TableGen.cpp
+++ b/clang/utils/TableGen/TableGen.cpp
@@ -58,6 +58,11 @@
   GenArmFP16,
   GenArmNeonSema,
   GenArmNeonTest,
+  GenArmMveHeader,
+  GenArmMveBuiltinDef,
+  GenArmMveBuiltinSema,
+  GenArmMveBuiltinCG,
+  GenArmMveBuiltinAliases,
   GenAttrDocs,
   GenDiagDocs,
   GenOptDocs,
@@ -156,6 +161,16 @@
                    "Generate ARM NEON sema support for clang"),
         clEnumValN(GenArmNeonTest, "gen-arm-neon-test",
                    "Generate ARM NEON tests for clang"),
+        clEnumValN(GenArmMveHeader, "gen-arm-mve-header",
+                   "Generate arm_mve.h for clang"),
+        clEnumValN(GenArmMveBuiltinDef, "gen-arm-mve-builtin-def",
+                   "Generate ARM MVE builtin definitions for clang"),
+        clEnumValN(GenArmMveBuiltinSema, "gen-arm-mve-builtin-sema",
+                   "Generate ARM MVE builtin sema checks for clang"),
+        clEnumValN(GenArmMveBuiltinCG, "gen-arm-mve-builtin-codegen",
+                   "Generate ARM MVE builtin code-generator for clang"),
+        clEnumValN(GenArmMveBuiltinAliases, "gen-arm-mve-builtin-aliases",
+                   "Generate list of valid ARM MVE builtin aliases for clang"),
         clEnumValN(GenAttrDocs, "gen-attr-docs",
                    "Generate attribute documentation"),
         clEnumValN(GenDiagDocs, "gen-diag-docs",
@@ -284,6 +299,21 @@
   case GenArmNeonTest:
     EmitNeonTest(Records, OS);
     break;
+  case GenArmMveHeader:
+    EmitMveHeader(Records, OS);
+    break;
+  case GenArmMveBuiltinDef:
+    EmitMveBuiltinDef(Records, OS);
+    break;
+  case GenArmMveBuiltinSema:
+    EmitMveBuiltinSema(Records, OS);
+    break;
+  case GenArmMveBuiltinCG:
+    EmitMveBuiltinCG(Records, OS);
+    break;
+  case GenArmMveBuiltinAliases:
+    EmitMveBuiltinAliases(Records, OS);
+    break;
   case GenAttrDocs:
     EmitClangAttrDocs(Records, OS);
     break;
diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h
--- a/clang/utils/TableGen/TableGenBackends.h
+++ b/clang/utils/TableGen/TableGenBackends.h
@@ -86,6 +86,12 @@
 void EmitNeonSema2(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
 void EmitNeonTest2(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
 
+void EmitMveHeader(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
+void EmitMveBuiltinDef(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
+void EmitMveBuiltinSema(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
+void EmitMveBuiltinCG(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
+void EmitMveBuiltinAliases(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
+
 void EmitClangAttrDocs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
 void EmitClangDiagDocs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
 void EmitClangOptDocs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
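[Editorial note: for completeness, the -gen-arm-mve-builtin-def backend registered above emits one BUILTIN line per intrinsic into arm_mve_builtins.inc, which the BuiltinsARM.def include then picks up. The result is shaped roughly like the following, though the exact type-encoding strings and attribute letters here are illustrative guesses rather than real output.]

```cpp
BUILTIN(__builtin_arm_mve_vaddq_s8, "V16ScV16ScV16Sc", "n")
BUILTIN(__builtin_arm_mve_vaddq_u8, "V16UcV16UcV16Uc", "n")
```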