Index: include/clang/Basic/LangOptions.def
===================================================================
--- include/clang/Basic/LangOptions.def
+++ include/clang/Basic/LangOptions.def
@@ -126,6 +126,7 @@
 LANGOPT(OpenCL            , 1, 0, "OpenCL")
 LANGOPT(OpenCLVersion     , 32, 0, "OpenCL version")
 LANGOPT(NativeHalfType    , 1, 0, "Native half type support")
+LANGOPT(HalfArgsAndReturns, 1, 0, "Allow function arguments and returns of type half")
 LANGOPT(CUDA              , 1, 0, "CUDA")
 LANGOPT(OpenMP            , 1, 0, "OpenMP support")
Index: include/clang/Driver/CC1Options.td
===================================================================
--- include/clang/Driver/CC1Options.td
+++ include/clang/Driver/CC1Options.td
@@ -507,6 +507,8 @@
   HelpText<"Control vtordisp placement on win32 targets">;
 def fno_rtti_data : Flag<["-"], "fno-rtti-data">,
   HelpText<"Control emission of RTTI data">;
+def fallow_half_arguments_and_returns : Flag<["-"], "fallow-half-arguments-and-returns">,
+  HelpText<"Allow function arguments and returns of type half">;
 
 //===----------------------------------------------------------------------===//
 // Header Search Options
Index: lib/CodeGen/CGCall.cpp
===================================================================
--- lib/CodeGen/CGCall.cpp
+++ lib/CodeGen/CGCall.cpp
@@ -512,11 +512,12 @@
   // default now.
   ABIArgInfo &retInfo = FI->getReturnInfo();
   if (retInfo.canHaveCoerceToType() && retInfo.getCoerceToType() == nullptr)
-    retInfo.setCoerceToType(ConvertType(FI->getReturnType()));
+    retInfo.setCoerceToType(ConvertType(FI->getReturnType(), /* isFunctionArgOrReturn */ true));
 
   for (auto &I : FI->arguments())
     if (I.info.canHaveCoerceToType() && I.info.getCoerceToType() == nullptr)
-      I.info.setCoerceToType(ConvertType(I.type));
+      I.info.setCoerceToType(
+          ConvertType(I.type, /* isFunctionArgOrReturn */ true));
 
   bool erased = FunctionsBeingProcessed.erase(FI); (void)erased;
   assert(erased && "Not in set?");
@@ -587,11 +588,11 @@
       }
     }
   } else if (const ComplexType *CT = type->getAs<ComplexType>()) {
-    llvm::Type *EltTy = ConvertType(CT->getElementType());
+    llvm::Type *EltTy = ConvertType(CT->getElementType(), true);
     expandedTypes.push_back(EltTy);
     expandedTypes.push_back(EltTy);
   } else
-    expandedTypes.push_back(ConvertType(type));
+    expandedTypes.push_back(ConvertType(type, true));
 }
 
 llvm::Function::arg_iterator
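An illustrative sketch of what the CGCall.cpp change is aiming for (not part of the patch; assumes an AArch64 target with -fallow-half-arguments-and-returns in effect):

    /* C declaration: */
    __fp16 scale(__fp16 x);
    /* expected IR signature, using 'half' for the argument and return
       rather than the i16 in-memory representation of __fp16:
         declare half @scale(half)                                        */

Only function argument and return types are converted this way; loads and stores of __fp16 values still use i16 on targets without NativeHalfType.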
Index: lib/CodeGen/CGExpr.cpp
===================================================================
--- lib/CodeGen/CGExpr.cpp
+++ lib/CodeGen/CGExpr.cpp
@@ -1473,6 +1473,14 @@
     return;
   }
 
+  // If we are storing a 'half' but do not have native 'half' operations, we
+  // need to bitcast it to i16.
+  if (Src.getScalarVal()->getType()->isHalfTy() &&
+      !getContext().getLangOpts().NativeHalfType) {
+    Src = RValue::get(Builder.CreateBitCast(
+        Src.getScalarVal(), llvm::Type::getInt16Ty(getLLVMContext())));
+  }
+
   assert(Src.isScalar() && "Can't emit an agg store with this method");
   EmitStoreOfScalar(Src.getScalarVal(), Dst, isInit);
 }
Index: lib/CodeGen/CGExprScalar.cpp
===================================================================
--- lib/CodeGen/CGExprScalar.cpp
+++ lib/CodeGen/CGExprScalar.cpp
@@ -800,10 +800,19 @@
   }
 
   if (DstTy != ResTy) {
-    assert(ResTy->isIntegerTy(16) && "Only half FP requires extra conversion");
+    if (ResTy->isIntegerTy(16)) {
     Res = Builder.CreateCall(
         CGF.CGM.getIntrinsic(llvm::Intrinsic::convert_to_fp16, CGF.CGM.FloatTy),
         Res);
+    } else if (ResTy->isHalfTy()) {
+      // This is required for ARM, where half is a storage and interchange
+      // format only, but may appear in the IR in order to get the calling
+      // convention correct.
+      Res = Builder.CreateCall(CGF.CGM.getIntrinsic(llvm::Intrinsic::convert_to_fp16, CGF.CGM.FloatTy), Res);
+      Res = Builder.CreateBitCast(Res, ResTy);
+    } else {
+      llvm_unreachable("Only half FP requires extra conversion");
+    }
   }
 
   return Res;
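A rough sketch of the IR the new isHalfTy() branch produces when truncating float to __fp16 on a target where half is storage-only (value names illustrative):

    __fp16 truncate(float f) { return f; }
    /* approximately:
         %t = call i16 @llvm.convert.to.fp16.f32(float %f)
         %h = bitcast i16 %t to half
       The conversion is still performed by the i16-returning intrinsic;
       the bitcast only gives the result IR type 'half' so that the
       returned value matches the calling convention.                     */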
Index: lib/CodeGen/CodeGenTypes.h
===================================================================
--- lib/CodeGen/CodeGenTypes.h
+++ lib/CodeGen/CodeGenTypes.h
@@ -106,6 +106,10 @@
   /// TypeCache - This map keeps cache of llvm::Types
   /// and maps clang::Type to corresponding llvm::Type.
   llvm::DenseMap<const Type *, llvm::Type *> TypeCache;
+  /// ArgTypeCache - caches the mapping from clang::Type to llvm::Type for types
+  /// which must be converted differently when being used as a function argument
+  /// or return.
+  llvm::DenseMap<const Type *, llvm::Type *> ArgTypeCache;
 
 public:
   CodeGenTypes(CodeGenModule &cgm);
@@ -118,8 +122,8 @@
   CGCXXABI &getCXXABI() const { return TheCXXABI; }
   llvm::LLVMContext &getLLVMContext() { return TheModule.getContext(); }
 
-  /// ConvertType - Convert type T into a llvm::Type.
-  llvm::Type *ConvertType(QualType T);
+  /// ConvertType - Convert type T into an llvm::Type.
+  llvm::Type *ConvertType(QualType T, bool isFunctionArgOrReturn = false);
 
   /// ConvertTypeForMem - Convert type T into a llvm::Type. This differs from
   /// ConvertType in that it is used to convert to the memory representation for
Index: lib/CodeGen/CodeGenTypes.cpp
===================================================================
--- lib/CodeGen/CodeGenTypes.cpp
+++ lib/CodeGen/CodeGenTypes.cpp
@@ -235,12 +235,15 @@
   // from the enum to be recomputed.
   if (const EnumDecl *ED = dyn_cast<EnumDecl>(TD)) {
     // Only flush the cache if we've actually already converted this type.
-    if (TypeCache.count(ED->getTypeForDecl())) {
+    if (TypeCache.count(ED->getTypeForDecl()) ||
+        ArgTypeCache.count(ED->getTypeForDecl())) {
       // Okay, we formed some types based on this. We speculated that the enum
       // would be lowered to i32, so we only need to flush the cache if this
       // didn't happen.
-      if (!ConvertType(ED->getIntegerType())->isIntegerTy(32))
+      if (!ConvertType(ED->getIntegerType())->isIntegerTy(32)) {
         TypeCache.clear();
+        ArgTypeCache.clear();
+      }
     }
     // If necessary, provide the full definition of a type only used with a
     // declaration so far.
@@ -288,7 +291,7 @@
 }
 
 /// ConvertType - Convert the specified type to its LLVM form.
-llvm::Type *CodeGenTypes::ConvertType(QualType T) {
+llvm::Type *CodeGenTypes::ConvertType(QualType T, bool isFunctionArgOrReturn) {
   T = Context.getCanonicalType(T);
 
   const Type *Ty = T.getTypePtr();
@@ -296,7 +299,48 @@
   // RecordTypes are cached and processed specially.
   if (const RecordType *RT = dyn_cast<RecordType>(Ty))
     return ConvertRecordDeclType(RT->getDecl());
-
+
+  llvm::Type *ResultType = nullptr;
+  // Some types require converting differently when they are used as a function
+  // argument or return. We handle them first, with their own cache.
+  if (isFunctionArgOrReturn) {
+    // See if type is already cached.
+    llvm::DenseMap<const Type *, llvm::Type *>::iterator TCI =
+        ArgTypeCache.find(Ty);
+    // If type is found in map then use it.
+    if (TCI != ArgTypeCache.end())
+      return TCI->second;
+
+    switch (Ty->getTypeClass()) {
+    case Type::Builtin:
+      switch (cast<BuiltinType>(Ty)->getKind()) {
+      case BuiltinType::Half: {
+        // Half FP can either be storage-only (lowered to i16) or native. However,
+        // the ARM C Language Extensions specify that __fp16 can be used as a
+        // function argument or return, so we need to emit IR which uses the half
+        // type so that the backend can get the calling convention correct.
+        if (Context.getLangOpts().HalfArgsAndReturns)
+          ResultType = getTypeForFormat(getLLVMContext(),
+                                        Context.getFloatTypeSemantics(T), true);
+        break;
+      }
+      default: {
+        // This type needs no special conversion, convert it as normal.
+        break;
+      }
+      }
+    default: {
+      // This type needs no special conversion, convert it as normal.
+      break;
+    }
+    }
+
+    if (ResultType) {
+      ArgTypeCache[Ty] = ResultType;
+      return ResultType;
+    }
+  }
+
   // See if type is already cached.
   llvm::DenseMap<const Type *, llvm::Type *>::iterator TCI = TypeCache.find(Ty);
   // If type is found in map then use it. Otherwise, convert type T.
@@ -304,7 +348,6 @@
     return TCI->second;
 
   // If we don't have it in the cache, convert it now.
-  llvm::Type *ResultType = nullptr;
   switch (Ty->getTypeClass()) {
   case Type::Record: // Handled above.
 #define TYPE(Class, Base)
@@ -398,7 +441,8 @@
   case Type::Auto:
     llvm_unreachable("Unexpected undeduced auto type!");
   case Type::Complex: {
-    llvm::Type *EltTy = ConvertType(cast<ComplexType>(Ty)->getElementType());
+    llvm::Type *EltTy = ConvertType(cast<ComplexType>(Ty)->getElementType(),
+                                    isFunctionArgOrReturn);
     ResultType = llvm::StructType::get(EltTy, EltTy, NULL);
     break;
   }
@@ -462,8 +506,9 @@
   case Type::ExtVector:
   case Type::Vector: {
     const VectorType *VT = cast<VectorType>(Ty);
-    ResultType = llvm::VectorType::get(ConvertType(VT->getElementType()),
-                                       VT->getNumElements());
+    ResultType = llvm::VectorType::get(
+        ConvertType(VT->getElementType(), false),
+        VT->getNumElements());
     break;
   }
   case Type::FunctionNoProto:
@@ -527,8 +572,10 @@
     RecordsBeingLaidOut.erase(Ty);
 
-    if (SkippedLayout)
+    if (SkippedLayout) {
       TypeCache.clear();
+      ArgTypeCache.clear();
+    }
 
     if (RecordsBeingLaidOut.empty())
      while (!DeferredRecords.empty())
        ConvertRecordDeclType(DeferredRecords.pop_back_val());
@@ -536,7 +583,8 @@
   }
 
   case Type::ObjCObject:
-    ResultType = ConvertType(cast<ObjCObjectType>(Ty)->getBaseType());
+    ResultType = ConvertType(cast<ObjCObjectType>(Ty)->getBaseType(),
+                             isFunctionArgOrReturn);
     break;
 
   case Type::ObjCInterface: {
@@ -564,7 +612,7 @@
   case Type::Enum: {
     const EnumDecl *ED = cast<EnumType>(Ty)->getDecl();
     if (ED->isCompleteDefinition() || ED->isFixed())
-      return ConvertType(ED->getIntegerType());
+      return ConvertType(ED->getIntegerType(), isFunctionArgOrReturn);
     // Return a placeholder 'i32' type. This can be changed later when the
     // type is defined (see UpdateCompletedType), but is likely to be the
     // "right" answer.
@@ -609,6 +657,7 @@
   assert(ResultType && "Didn't convert a type?");
 
   TypeCache[Ty] = ResultType;
+
   return ResultType;
 }
 
@@ -671,8 +720,10 @@
   // If this struct blocked a FunctionType conversion, then recompute whatever
   // was derived from that.
   // FIXME: This is hugely overconservative.
-  if (SkippedLayout)
+  if (SkippedLayout) {
     TypeCache.clear();
+    ArgTypeCache.clear();
+  }
 
   // If we're done converting the outer-most record, then convert any deferred
   // structs as well.
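To make the two-cache scheme concrete (illustrative; FP16Ty stands for the clang __fp16 type, assuming HalfArgsAndReturns is set and NativeHalfType is not): the same clang type can now map to two different LLVM types, which is why every path above that clears TypeCache must clear ArgTypeCache as well:

    ConvertType(FP16Ty)        /* i16:  in-memory type,  cached in TypeCache    */
    ConvertType(FP16Ty, true)  /* half: arg/return type, cached in ArgTypeCache */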
Index: lib/CodeGen/TargetInfo.cpp
===================================================================
--- lib/CodeGen/TargetInfo.cpp
+++ lib/CodeGen/TargetInfo.cpp
@@ -3544,6 +3544,7 @@
 
 static bool isHomogeneousAggregate(QualType Ty, const Type *&Base,
                                    ASTContext &Context,
+                                   bool isAArch64,
                                    uint64_t *HAMembers = nullptr);
 
 ABIArgInfo AArch64ABIInfo::classifyArgumentType(QualType Ty,
@@ -3625,7 +3626,7 @@
   // Homogeneous Floating-point Aggregates (HFAs) need to be expanded.
   const Type *Base = nullptr;
   uint64_t Members = 0;
-  if (isHomogeneousAggregate(Ty, Base, getContext(), &Members)) {
+  if (isHomogeneousAggregate(Ty, Base, getContext(), true, &Members)) {
     IsHA = true;
     if (!IsNamedArg && isDarwinPCS()) {
       // With the Darwin ABI, variadic arguments are always passed on the stack
@@ -3683,7 +3684,7 @@
     return ABIArgInfo::getIgnore();
 
   const Type *Base = nullptr;
-  if (isHomogeneousAggregate(RetTy, Base, getContext()))
+  if (isHomogeneousAggregate(RetTy, Base, getContext(), true))
     // Homogeneous Floating-point Aggregates (HFAs) are returned directly.
     return ABIArgInfo::getDirect();
 
@@ -3820,7 +3821,7 @@
   const Type *Base = nullptr;
   uint64_t NumMembers;
-  bool IsHFA = isHomogeneousAggregate(Ty, Base, Ctx, &NumMembers);
+  bool IsHFA = isHomogeneousAggregate(Ty, Base, Ctx, true, &NumMembers);
 
   if (IsHFA && NumMembers > 1) {
     // Homogeneous aggregates passed in registers will have their elements split
     // and stored 16-bytes apart regardless of size (they're notionally in qN,
@@ -3963,7 +3964,7 @@
   uint64_t Align = CGF.getContext().getTypeAlign(Ty) / 8;
 
   const Type *Base = nullptr;
-  bool isHA = isHomogeneousAggregate(Ty, Base, getContext());
+  bool isHA = isHomogeneousAggregate(Ty, Base, getContext(), true);
 
   bool isIndirect = false;
   // Arguments bigger than 16 bytes which aren't homogeneous aggregates should
@@ -4251,10 +4252,11 @@
 /// contained in the type is returned through it; this is used for the
 /// recursive calls that check aggregate component types.
 static bool isHomogeneousAggregate(QualType Ty, const Type *&Base,
-                                   ASTContext &Context, uint64_t *HAMembers) {
+                                   ASTContext &Context, bool isAArch64,
+                                   uint64_t *HAMembers) {
   uint64_t Members = 0;
   if (const ConstantArrayType *AT = Context.getAsConstantArrayType(Ty)) {
-    if (!isHomogeneousAggregate(AT->getElementType(), Base, Context, &Members))
+    if (!isHomogeneousAggregate(AT->getElementType(), Base, Context, isAArch64, &Members))
       return false;
     Members *= AT->getSize().getZExtValue();
   } else if (const RecordType *RT = Ty->getAs<RecordType>()) {
@@ -4265,7 +4267,7 @@
     Members = 0;
     for (const auto *FD : RD->fields()) {
       uint64_t FldMembers;
-      if (!isHomogeneousAggregate(FD->getType(), Base, Context, &FldMembers))
+      if (!isHomogeneousAggregate(FD->getType(), Base, Context, isAArch64, &FldMembers))
         return false;
 
       Members = (RD->isUnion() ?
@@ -4279,12 +4281,22 @@
   }
   // Homogeneous aggregates for AAPCS-VFP must have base types of float,
-  // double, or 64-bit or 128-bit vectors.
+  // double, or 64-bit or 128-bit vectors. "long double" has the same machine
+  // type as double, so it is also allowed as a base type.
+  // Homogeneous aggregates for AAPCS64 must have base types of a floating
+  // point type or a short-vector type. This is the same as the 32-bit ABI,
+  // but with the difference that any floating-point type is allowed,
+  // including __fp16.
   if (const BuiltinType *BT = Ty->getAs<BuiltinType>()) {
-    if (BT->getKind() != BuiltinType::Float &&
-        BT->getKind() != BuiltinType::Double &&
-        BT->getKind() != BuiltinType::LongDouble)
-      return false;
+    if (isAArch64) {
+      if (!BT->isFloatingPoint())
+        return false;
+    } else {
+      if (BT->getKind() != BuiltinType::Float &&
+          BT->getKind() != BuiltinType::Double &&
+          BT->getKind() != BuiltinType::LongDouble)
+        return false;
+    }
   } else if (const VectorType *VT = Ty->getAs<VectorType>()) {
     unsigned VecSize = Context.getTypeSize(VT);
     if (VecSize != 64 && VecSize != 128)
       return false;
@@ -4482,7 +4494,7 @@
   // into VFP registers.
   const Type *Base = nullptr;
   uint64_t Members = 0;
-  if (isHomogeneousAggregate(Ty, Base, getContext(), &Members)) {
+  if (isHomogeneousAggregate(Ty, Base, getContext(), false, &Members)) {
     assert(Base && "Base class should be set for homogeneous aggregate");
     // Base can be a floating-point or a vector.
     if (Base->isVectorType()) {
@@ -4683,7 +4695,7 @@
   // Check for homogeneous aggregates with AAPCS-VFP.
   if (getABIKind() == AAPCS_VFP && !isVariadic) {
     const Type *Base = nullptr;
-    if (isHomogeneousAggregate(RetTy, Base, getContext())) {
+    if (isHomogeneousAggregate(RetTy, Base, getContext(), false)) {
       assert(Base && "Base class should be set for homogeneous aggregate");
       // Homogeneous Aggregates are returned directly.
       return ABIArgInfo::getDirect();
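The base-type difference above in one line (this mirrors the new test at the end of the patch): with isAArch64 set, any floating-point builtin type, including __fp16, may be the base type of a homogeneous aggregate, while the 32-bit AAPCS-VFP check is unchanged:

    struct HFA_half { __fp16 a[4]; };  /* HFA on AArch64; not an HFA under 32-bit AAPCS-VFP */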
"long double" has the same machine + // type as double, so it is also allowed as a base type. + // Homogeneous aggregates for AAPCS64 must have base types of a floating + // point type or a short-vector type. This is the same as the 32-bit ABI, + // but with the difference that any floating-point type is allowed, + // including __fp16. if (const BuiltinType *BT = Ty->getAs()) { - if (BT->getKind() != BuiltinType::Float && - BT->getKind() != BuiltinType::Double && - BT->getKind() != BuiltinType::LongDouble) - return false; + if (isAArch64) { + if (!BT->isFloatingPoint()) + return false; + } else { + if (BT->getKind() != BuiltinType::Float && + BT->getKind() != BuiltinType::Double && + BT->getKind() != BuiltinType::LongDouble) + return false; + } } else if (const VectorType *VT = Ty->getAs()) { unsigned VecSize = Context.getTypeSize(VT); if (VecSize != 64 && VecSize != 128) @@ -4482,7 +4494,7 @@ // into VFP registers. const Type *Base = nullptr; uint64_t Members = 0; - if (isHomogeneousAggregate(Ty, Base, getContext(), &Members)) { + if (isHomogeneousAggregate(Ty, Base, getContext(), false, &Members)) { assert(Base && "Base class should be set for homogeneous aggregate"); // Base can be a floating-point or a vector. if (Base->isVectorType()) { @@ -4683,7 +4695,7 @@ // Check for homogeneous aggregates with AAPCS-VFP. if (getABIKind() == AAPCS_VFP && !isVariadic) { const Type *Base = nullptr; - if (isHomogeneousAggregate(RetTy, Base, getContext())) { + if (isHomogeneousAggregate(RetTy, Base, getContext(), false)) { assert(Base && "Base class should be set for homogeneous aggregate"); // Homogeneous Aggregates are returned directly. return ABIArgInfo::getDirect(); Index: lib/Driver/Tools.cpp =================================================================== --- lib/Driver/Tools.cpp +++ lib/Driver/Tools.cpp @@ -3690,6 +3690,10 @@ CmdArgs.push_back(Args.MakeArgString("-mstack-alignment=" + alignment)); } + if (getToolChain().getTriple().getArch() == llvm::Triple::aarch64 || + getToolChain().getTriple().getArch() == llvm::Triple::aarch64_be) + CmdArgs.push_back("-fallow-half-arguments-and-returns"); + if (Arg *A = Args.getLastArg(options::OPT_mrestrict_it, options::OPT_mno_restrict_it)) { if (A->getOption().matches(options::OPT_mrestrict_it)) { Index: lib/Frontend/CompilerInvocation.cpp =================================================================== --- lib/Frontend/CompilerInvocation.cpp +++ lib/Frontend/CompilerInvocation.cpp @@ -1497,6 +1497,7 @@ Opts.CurrentModule = Args.getLastArgValue(OPT_fmodule_name); Opts.ImplementationOfModule = Args.getLastArgValue(OPT_fmodule_implementation_of); + Opts.HalfArgsAndReturns = Args.hasArg(OPT_fallow_half_arguments_and_returns); if (!Opts.CurrentModule.empty() && !Opts.ImplementationOfModule.empty() && Opts.CurrentModule != Opts.ImplementationOfModule) { Index: lib/Sema/SemaType.cpp =================================================================== --- lib/Sema/SemaType.cpp +++ lib/Sema/SemaType.cpp @@ -1746,7 +1746,7 @@ } // Functions cannot return half FP. - if (T->isHalfType()) { + if (T->isHalfType() && !getLangOpts().HalfArgsAndReturns) { Diag(Loc, diag::err_parameters_retval_cannot_have_fp16_type) << 1 << FixItHint::CreateInsertion(Loc, "*"); return true; @@ -1776,7 +1776,7 @@ if (ParamType->isVoidType()) { Diag(Loc, diag::err_param_with_void_type); Invalid = true; - } else if (ParamType->isHalfType()) { + } else if (ParamType->isHalfType() && !getLangOpts().HalfArgsAndReturns) { // Disallow half FP arguments. 
    Diag(Loc, diag::err_parameters_retval_cannot_have_fp16_type) << 0 <<
      FixItHint::CreateInsertion(Loc, "*");
@@ -2751,7 +2751,7 @@
         S.Diag(D.getIdentifierLoc(), diag::err_opencl_half_return) << T;
         D.setInvalidType(true);
       }
-    } else {
+    } else if (!S.getLangOpts().HalfArgsAndReturns) {
       S.Diag(D.getIdentifierLoc(),
         diag::err_parameters_retval_cannot_have_fp16_type) << 1;
       D.setInvalidType(true);
@@ -2941,7 +2941,7 @@
           D.setInvalidType();
           Param->setInvalidDecl();
         }
-      } else {
+      } else if (!S.getLangOpts().HalfArgsAndReturns) {
         S.Diag(Param->getLocation(),
           diag::err_parameters_retval_cannot_have_fp16_type) << 0;
         D.setInvalidType();
Index: test/CodeGen/arm64-aapcs-arguments.c
===================================================================
--- test/CodeGen/arm64-aapcs-arguments.c
+++ test/CodeGen/arm64-aapcs-arguments.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple arm64-linux-gnu -target-feature +neon -target-abi aapcs -ffreestanding -emit-llvm -w -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon -target-abi aapcs -ffreestanding -fallow-half-arguments-and-returns -emit-llvm -w -o - %s | FileCheck %s
 
 // AAPCS clause C.8 says: If the argument has an alignment of 16 then the NGRN
 // is rounded up to the next even number.
@@ -40,3 +40,12 @@
 // CHECK: define i8 @test5(i8 %a, i16 %b)
 unsigned char test5(unsigned char a, signed short b) {
 }
+
+// __fp16 can be used as a function argument or return type (ACLE 2.0)
+// CHECK: define half @test_half(half %{{.*}})
+__fp16 test_half(__fp16 A) { }
+
+// __fp16 is a base type for homogeneous floating-point aggregates for AArch64 (but not 32-bit ARM).
+// CHECK: define %struct.HFA_half @test_half_hfa(half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}})
+struct HFA_half { __fp16 a[4]; };
+struct HFA_half test_half_hfa(struct HFA_half A) { }