Index: include/clang/Basic/LangOptions.def
===================================================================
--- include/clang/Basic/LangOptions.def
+++ include/clang/Basic/LangOptions.def
@@ -126,6 +126,7 @@
 LANGOPT(OpenCL            , 1, 0, "OpenCL")
 LANGOPT(OpenCLVersion     , 32, 0, "OpenCL version")
 LANGOPT(NativeHalfType    , 1, 0, "Native half type support")
+LANGOPT(HalfArgsAndReturns, 1, 0, "Allow function arguments and returns of type half")
 LANGOPT(CUDA              , 1, 0, "CUDA")
 LANGOPT(OpenMP            , 1, 0, "OpenMP support")
Index: include/clang/Driver/CC1Options.td
===================================================================
--- include/clang/Driver/CC1Options.td
+++ include/clang/Driver/CC1Options.td
@@ -507,6 +507,8 @@
   HelpText<"Control vtordisp placement on win32 targets">;
 def fno_rtti_data : Flag<["-"], "fno-rtti-data">,
   HelpText<"Control emission of RTTI data">;
+def fallow_half_arguments_and_returns : Flag<["-"], "fallow-half-arguments-and-returns">,
+  HelpText<"Allow function arguments and returns of type half">;
 
 //===----------------------------------------------------------------------===//
 // Header Search Options
Index: lib/CodeGen/CGCall.cpp
===================================================================
--- lib/CodeGen/CGCall.cpp
+++ lib/CodeGen/CGCall.cpp
@@ -512,11 +512,12 @@
   // default now.
   ABIArgInfo &retInfo = FI->getReturnInfo();
   if (retInfo.canHaveCoerceToType() && retInfo.getCoerceToType() == nullptr)
-    retInfo.setCoerceToType(ConvertType(FI->getReturnType()));
+    retInfo.setCoerceToType(ConvertType(FI->getReturnType(), /* isFunctionArgOrReturn */ true));
 
   for (auto &I : FI->arguments())
     if (I.info.canHaveCoerceToType() && I.info.getCoerceToType() == nullptr)
-      I.info.setCoerceToType(ConvertType(I.type));
+      I.info.setCoerceToType(
+          ConvertType(I.type, /* isFunctionArgOrReturn */ true));
 
   bool erased = FunctionsBeingProcessed.erase(FI); (void)erased;
   assert(erased && "Not in set?");
@@ -587,11 +588,11 @@
       }
     }
   } else if (const ComplexType *CT = type->getAs<ComplexType>()) {
-    llvm::Type *EltTy = ConvertType(CT->getElementType());
+    llvm::Type *EltTy = ConvertType(CT->getElementType(), true);
     expandedTypes.push_back(EltTy);
     expandedTypes.push_back(EltTy);
   } else
-    expandedTypes.push_back(ConvertType(type));
+    expandedTypes.push_back(ConvertType(type, true));
 }
 
 llvm::Function::arg_iterator
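An illustrative sketch of what the CGCall.cpp change is aiming for (not part of the patch; assumes an AArch64 target with -fallow-half-arguments-and-returns in effect):

    /* C declaration: */
    __fp16 scale(__fp16 x);
    /* expected IR signature, using 'half' for the argument and return
       rather than the i16 in-memory representation of __fp16:
         declare half @scale(half)                                        */

Only function argument and return types are converted this way; loads and stores of __fp16 values still use i16 on targets without NativeHalfType.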
Index: lib/CodeGen/CGExpr.cpp
===================================================================
--- lib/CodeGen/CGExpr.cpp
+++ lib/CodeGen/CGExpr.cpp
@@ -1473,6 +1473,14 @@
     return;
   }
 
+  // If we are storing a 'half' but do not have native 'half' operations, we
+  // need to bitcast it to i16.
+  if (Src.getScalarVal()->getType()->isHalfTy() &&
+      !getContext().getLangOpts().NativeHalfType) {
+    Src = RValue::get(Builder.CreateBitCast(
+        Src.getScalarVal(), llvm::Type::getInt16Ty(getLLVMContext())));
+  }
+
   assert(Src.isScalar() && "Can't emit an agg store with this method");
   EmitStoreOfScalar(Src.getScalarVal(), Dst, isInit);
 }
Index: lib/CodeGen/CGExprScalar.cpp
===================================================================
--- lib/CodeGen/CGExprScalar.cpp
+++ lib/CodeGen/CGExprScalar.cpp
@@ -800,10 +800,19 @@
   }
 
   if (DstTy != ResTy) {
-    assert(ResTy->isIntegerTy(16) && "Only half FP requires extra conversion");
+    if (ResTy->isIntegerTy(16)) {
     Res = Builder.CreateCall(
         CGF.CGM.getIntrinsic(llvm::Intrinsic::convert_to_fp16, CGF.CGM.FloatTy),
         Res);
+    } else if (ResTy->isHalfTy()) {
+      // This is required for ARM, where half is a storage and interchange
+      // format only, but may appear in the IR in order to get the calling
+      // convention correct.
+      Res = Builder.CreateCall(CGF.CGM.getIntrinsic(llvm::Intrinsic::convert_to_fp16, CGF.CGM.FloatTy), Res);
+      Res = Builder.CreateBitCast(Res, ResTy);
+    } else {
+      llvm_unreachable("Only half FP requires extra conversion");
+    }
   }
 
   return Res;
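A rough sketch of the IR the new isHalfTy() branch produces when truncating float to __fp16 on a target where half is storage-only (value names illustrative):

    __fp16 truncate(float f) { return f; }
    /* approximately:
         %t = call i16 @llvm.convert.to.fp16.f32(float %f)
         %h = bitcast i16 %t to half
       The conversion is still performed by the i16-returning intrinsic;
       the bitcast only gives the result IR type 'half' so that the
       returned value matches the calling convention.                     */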
Index: lib/CodeGen/CodeGenTypes.h
===================================================================
--- lib/CodeGen/CodeGenTypes.h
+++ lib/CodeGen/CodeGenTypes.h
@@ -106,6 +106,10 @@
   /// TypeCache - This map keeps cache of llvm::Types
   /// and maps clang::Type to corresponding llvm::Type.
   llvm::DenseMap<const Type *, llvm::Type *> TypeCache;
+  /// ArgTypeCache - caches the mapping from clang::Type to llvm::Type for types
+  /// which must be converted differently when being used as a function argument
+  /// or return.
+  llvm::DenseMap<const Type *, llvm::Type *> ArgTypeCache;
 
 public:
   CodeGenTypes(CodeGenModule &cgm);
@@ -118,8 +122,8 @@
   CGCXXABI &getCXXABI() const { return TheCXXABI; }
   llvm::LLVMContext &getLLVMContext() { return TheModule.getContext(); }
 
-  /// ConvertType - Convert type T into a llvm::Type.
-  llvm::Type *ConvertType(QualType T);
+  /// ConvertType - Convert type T into an llvm::Type.
+  llvm::Type *ConvertType(QualType T, bool isFunctionArgOrReturn = false);
 
   /// ConvertTypeForMem - Convert type T into a llvm::Type. This differs from
   /// ConvertType in that it is used to convert to the memory representation for
Index: lib/CodeGen/CodeGenTypes.cpp
===================================================================
--- lib/CodeGen/CodeGenTypes.cpp
+++ lib/CodeGen/CodeGenTypes.cpp
@@ -235,12 +235,15 @@
   // from the enum to be recomputed.
   if (const EnumDecl *ED = dyn_cast<EnumDecl>(TD)) {
     // Only flush the cache if we've actually already converted this type.
-    if (TypeCache.count(ED->getTypeForDecl())) {
+    if (TypeCache.count(ED->getTypeForDecl()) ||
+        ArgTypeCache.count(ED->getTypeForDecl())) {
       // Okay, we formed some types based on this. We speculated that the enum
       // would be lowered to i32, so we only need to flush the cache if this
       // didn't happen.
-      if (!ConvertType(ED->getIntegerType())->isIntegerTy(32))
+      if (!ConvertType(ED->getIntegerType())->isIntegerTy(32)) {
         TypeCache.clear();
+        ArgTypeCache.clear();
+      }
     }
     // If necessary, provide the full definition of a type only used with a
     // declaration so far.
@@ -288,7 +291,7 @@
 }
 
 /// ConvertType - Convert the specified type to its LLVM form.
-llvm::Type *CodeGenTypes::ConvertType(QualType T) {
+llvm::Type *CodeGenTypes::ConvertType(QualType T, bool isFunctionArgOrReturn) {
   T = Context.getCanonicalType(T);
 
   const Type *Ty = T.getTypePtr();
@@ -296,7 +299,48 @@
   // RecordTypes are cached and processed specially.
   if (const RecordType *RT = dyn_cast<RecordType>(Ty))
     return ConvertRecordDeclType(RT->getDecl());
-
+
+  llvm::Type *ResultType = nullptr;
+  // Some types require converting differently when they are used as a function
+  // argument or return. We handle them first, with their own cache.
+  if (isFunctionArgOrReturn) {
+    // See if type is already cached.
+    llvm::DenseMap<const Type *, llvm::Type *>::iterator TCI =
+        ArgTypeCache.find(Ty);
+    // If type is found in map then use it.
+    if (TCI != ArgTypeCache.end())
+      return TCI->second;
+
+    switch (Ty->getTypeClass()) {
+    case Type::Builtin:
+      switch (cast<BuiltinType>(Ty)->getKind()) {
+      case BuiltinType::Half: {
+        // Half FP can either be storage-only (lowered to i16) or native. However,
+        // the ARM C Language Extensions specify that __fp16 can be used as a
+        // function argument or return, so we need to emit IR which uses the half
+        // type so that the backend can get the calling convention correct.
+        if (Context.getLangOpts().HalfArgsAndReturns)
+          ResultType = getTypeForFormat(getLLVMContext(),
+                                        Context.getFloatTypeSemantics(T), true);
+        break;
+      }
+      default: {
+        // This type needs no special conversion, convert it as normal.
+        break;
+      }
+      }
+    default: {
+      // This type needs no special conversion, convert it as normal.
+      break;
+    }
+    }
+
+    if (ResultType) {
+      ArgTypeCache[Ty] = ResultType;
+      return ResultType;
+    }
+  }
+
   // See if type is already cached.
   llvm::DenseMap<const Type *, llvm::Type *>::iterator TCI = TypeCache.find(Ty);
   // If type is found in map then use it. Otherwise, convert type T.
@@ -304,7 +348,6 @@
     return TCI->second;
 
   // If we don't have it in the cache, convert it now.
-  llvm::Type *ResultType = nullptr;
   switch (Ty->getTypeClass()) {
   case Type::Record: // Handled above.
 #define TYPE(Class, Base)
@@ -398,7 +441,8 @@
   case Type::Auto:
     llvm_unreachable("Unexpected undeduced auto type!");
   case Type::Complex: {
-    llvm::Type *EltTy = ConvertType(cast<ComplexType>(Ty)->getElementType());
+    llvm::Type *EltTy = ConvertType(cast<ComplexType>(Ty)->getElementType(),
+                                    isFunctionArgOrReturn);
     ResultType = llvm::StructType::get(EltTy, EltTy, NULL);
     break;
   }
@@ -462,8 +506,9 @@
   case Type::ExtVector:
   case Type::Vector: {
     const VectorType *VT = cast<VectorType>(Ty);
-    ResultType = llvm::VectorType::get(ConvertType(VT->getElementType()),
-                                       VT->getNumElements());
+    ResultType = llvm::VectorType::get(
+        ConvertType(VT->getElementType(), false),
+        VT->getNumElements());
     break;
   }
   case Type::FunctionNoProto:
@@ -527,8 +572,10 @@
     RecordsBeingLaidOut.erase(Ty);
 
-    if (SkippedLayout)
+    if (SkippedLayout) {
       TypeCache.clear();
+      ArgTypeCache.clear();
+    }
 
     if (RecordsBeingLaidOut.empty())
      while (!DeferredRecords.empty())
        ConvertRecordDeclType(DeferredRecords.pop_back_val());
@@ -536,7 +583,8 @@
   }
 
   case Type::ObjCObject:
-    ResultType = ConvertType(cast<ObjCObjectType>(Ty)->getBaseType());
+    ResultType = ConvertType(cast<ObjCObjectType>(Ty)->getBaseType(),
+                             isFunctionArgOrReturn);
     break;
 
   case Type::ObjCInterface: {
@@ -564,7 +612,7 @@
   case Type::Enum: {
     const EnumDecl *ED = cast<EnumType>(Ty)->getDecl();
     if (ED->isCompleteDefinition() || ED->isFixed())
-      return ConvertType(ED->getIntegerType());
+      return ConvertType(ED->getIntegerType(), isFunctionArgOrReturn);
     // Return a placeholder 'i32' type. This can be changed later when the
     // type is defined (see UpdateCompletedType), but is likely to be the
     // "right" answer.
@@ -609,6 +657,7 @@
   assert(ResultType && "Didn't convert a type?");
 
   TypeCache[Ty] = ResultType;
+
   return ResultType;
 }
 
@@ -671,8 +720,10 @@
   // If this struct blocked a FunctionType conversion, then recompute whatever
   // was derived from that.
   // FIXME: This is hugely overconservative.
-  if (SkippedLayout)
+  if (SkippedLayout) {
     TypeCache.clear();
+    ArgTypeCache.clear();
+  }
 
   // If we're done converting the outer-most record, then convert any deferred
   // structs as well.
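To make the two-cache scheme concrete (illustrative; FP16Ty stands for the clang __fp16 type, assuming HalfArgsAndReturns is set and NativeHalfType is not): the same clang type can now map to two different LLVM types, which is why every path above that clears TypeCache must clear ArgTypeCache as well:

    ConvertType(FP16Ty)        /* i16:  in-memory type,  cached in TypeCache    */
    ConvertType(FP16Ty, true)  /* half: arg/return type, cached in ArgTypeCache */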
Index: lib/CodeGen/TargetInfo.cpp
===================================================================
--- lib/CodeGen/TargetInfo.cpp
+++ lib/CodeGen/TargetInfo.cpp
@@ -3544,6 +3544,7 @@
 
 static bool isHomogeneousAggregate(QualType Ty, const Type *&Base,
                                    ASTContext &Context,
+                                   bool isAArch64,
                                    uint64_t *HAMembers = nullptr);
 
 ABIArgInfo AArch64ABIInfo::classifyArgumentType(QualType Ty,
@@ -3625,7 +3626,7 @@
   // Homogeneous Floating-point Aggregates (HFAs) need to be expanded.
   const Type *Base = nullptr;
   uint64_t Members = 0;
-  if (isHomogeneousAggregate(Ty, Base, getContext(), &Members)) {
+  if (isHomogeneousAggregate(Ty, Base, getContext(), true, &Members)) {
     IsHA = true;
     if (!IsNamedArg && isDarwinPCS()) {
       // With the Darwin ABI, variadic arguments are always passed on the stack
@@ -3683,7 +3684,7 @@
     return ABIArgInfo::getIgnore();
 
   const Type *Base = nullptr;
-  if (isHomogeneousAggregate(RetTy, Base, getContext()))
+  if (isHomogeneousAggregate(RetTy, Base, getContext(), true))
     // Homogeneous Floating-point Aggregates (HFAs) are returned directly.
     return ABIArgInfo::getDirect();
 
@@ -3820,7 +3821,7 @@
   const Type *Base = nullptr;
   uint64_t NumMembers;
-  bool IsHFA = isHomogeneousAggregate(Ty, Base, Ctx, &NumMembers);
+  bool IsHFA = isHomogeneousAggregate(Ty, Base, Ctx, true, &NumMembers);
 
   if (IsHFA && NumMembers > 1) {
     // Homogeneous aggregates passed in registers will have their elements split
     // and stored 16-bytes apart regardless of size (they're notionally in qN,
@@ -3963,7 +3964,7 @@
   uint64_t Align = CGF.getContext().getTypeAlign(Ty) / 8;
 
   const Type *Base = nullptr;
-  bool isHA = isHomogeneousAggregate(Ty, Base, getContext());
+  bool isHA = isHomogeneousAggregate(Ty, Base, getContext(), true);
 
   bool isIndirect = false;
   // Arguments bigger than 16 bytes which aren't homogeneous aggregates should
@@ -4251,10 +4252,11 @@
 /// contained in the type is returned through it; this is used for the
 /// recursive calls that check aggregate component types.
 static bool isHomogeneousAggregate(QualType Ty, const Type *&Base,
-                                   ASTContext &Context, uint64_t *HAMembers) {
+                                   ASTContext &Context, bool isAArch64,
+                                   uint64_t *HAMembers) {
   uint64_t Members = 0;
   if (const ConstantArrayType *AT = Context.getAsConstantArrayType(Ty)) {
-    if (!isHomogeneousAggregate(AT->getElementType(), Base, Context, &Members))
+    if (!isHomogeneousAggregate(AT->getElementType(), Base, Context, isAArch64, &Members))
       return false;
     Members *= AT->getSize().getZExtValue();
   } else if (const RecordType *RT = Ty->getAs<RecordType>()) {
@@ -4265,7 +4267,7 @@
     Members = 0;
     for (const auto *FD : RD->fields()) {
       uint64_t FldMembers;
-      if (!isHomogeneousAggregate(FD->getType(), Base, Context, &FldMembers))
+      if (!isHomogeneousAggregate(FD->getType(), Base, Context, isAArch64, &FldMembers))
         return false;
 
       Members = (RD->isUnion() ?
@@ -4279,12 +4281,22 @@
   }
   // Homogeneous aggregates for AAPCS-VFP must have base types of float,
-  // double, or 64-bit or 128-bit vectors.
+  // double, or 64-bit or 128-bit vectors. "long double" has the same machine
+  // type as double, so it is also allowed as a base type.
+  // Homogeneous aggregates for AAPCS64 must have base types of a floating
+  // point type or a short-vector type. This is the same as the 32-bit ABI,
+  // but with the difference that any floating-point type is allowed,
+  // including __fp16.
   if (const BuiltinType *BT = Ty->getAs<BuiltinType>()) {
-    if (BT->getKind() != BuiltinType::Float &&
-        BT->getKind() != BuiltinType::Double &&
-        BT->getKind() != BuiltinType::LongDouble)
-      return false;
+    if (isAArch64) {
+      if (!BT->isFloatingPoint())
+        return false;
+    } else {
+      if (BT->getKind() != BuiltinType::Float &&
+          BT->getKind() != BuiltinType::Double &&
+          BT->getKind() != BuiltinType::LongDouble)
+        return false;
+    }
   } else if (const VectorType *VT = Ty->getAs<VectorType>()) {
     unsigned VecSize = Context.getTypeSize(VT);
     if (VecSize != 64 && VecSize != 128)
       return false;
@@ -4482,7 +4494,7 @@
   // into VFP registers.
   const Type *Base = nullptr;
   uint64_t Members = 0;
-  if (isHomogeneousAggregate(Ty, Base, getContext(), &Members)) {
+  if (isHomogeneousAggregate(Ty, Base, getContext(), false, &Members)) {
     assert(Base && "Base class should be set for homogeneous aggregate");
     // Base can be a floating-point or a vector.
     if (Base->isVectorType()) {
@@ -4683,7 +4695,7 @@
   // Check for homogeneous aggregates with AAPCS-VFP.
   if (getABIKind() == AAPCS_VFP && !isVariadic) {
     const Type *Base = nullptr;
-    if (isHomogeneousAggregate(RetTy, Base, getContext())) {
+    if (isHomogeneousAggregate(RetTy, Base, getContext(), false)) {
       assert(Base && "Base class should be set for homogeneous aggregate");
       // Homogeneous Aggregates are returned directly.
       return ABIArgInfo::getDirect();
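The base-type difference above in one line (this mirrors the new test at the end of the patch): with isAArch64 set, any floating-point builtin type, including __fp16, may be the base type of a homogeneous aggregate, while the 32-bit AAPCS-VFP check is unchanged:

    struct HFA_half { __fp16 a[4]; };  /* HFA on AArch64; not an HFA under 32-bit AAPCS-VFP */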
"long double" has the same machine + // type as double, so it is also allowed as a base type. + // Homogeneous aggregates for AAPCS64 must have base types of a floating + // point type or a short-vector type. This is the same as the 32-bit ABI, + // but with the difference that any floating-point type is allowed, + // including __fp16. if (const BuiltinType *BT = Ty->getAs()) { - if (BT->getKind() != BuiltinType::Float && - BT->getKind() != BuiltinType::Double && - BT->getKind() != BuiltinType::LongDouble) - return false; + if (isAArch64) { + if (!BT->isFloatingPoint()) + return false; + } else { + if (BT->getKind() != BuiltinType::Float && + BT->getKind() != BuiltinType::Double && + BT->getKind() != BuiltinType::LongDouble) + return false; + } } else if (const VectorType *VT = Ty->getAs()) { unsigned VecSize = Context.getTypeSize(VT); if (VecSize != 64 && VecSize != 128) @@ -4482,7 +4494,7 @@ // into VFP registers. const Type *Base = nullptr; uint64_t Members = 0; - if (isHomogeneousAggregate(Ty, Base, getContext(), &Members)) { + if (isHomogeneousAggregate(Ty, Base, getContext(), false, &Members)) { assert(Base && "Base class should be set for homogeneous aggregate"); // Base can be a floating-point or a vector. if (Base->isVectorType()) { @@ -4683,7 +4695,7 @@ // Check for homogeneous aggregates with AAPCS-VFP. if (getABIKind() == AAPCS_VFP && !isVariadic) { const Type *Base = nullptr; - if (isHomogeneousAggregate(RetTy, Base, getContext())) { + if (isHomogeneousAggregate(RetTy, Base, getContext(), false)) { assert(Base && "Base class should be set for homogeneous aggregate"); // Homogeneous Aggregates are returned directly. return ABIArgInfo::getDirect(); Index: lib/Driver/Tools.cpp =================================================================== --- lib/Driver/Tools.cpp +++ lib/Driver/Tools.cpp @@ -3690,6 +3690,10 @@ CmdArgs.push_back(Args.MakeArgString("-mstack-alignment=" + alignment)); } + if (getToolChain().getTriple().getArch() == llvm::Triple::aarch64 || + getToolChain().getTriple().getArch() == llvm::Triple::aarch64_be) + CmdArgs.push_back("-fallow-half-arguments-and-returns"); + if (Arg *A = Args.getLastArg(options::OPT_mrestrict_it, options::OPT_mno_restrict_it)) { if (A->getOption().matches(options::OPT_mrestrict_it)) { Index: lib/Frontend/CompilerInvocation.cpp =================================================================== --- lib/Frontend/CompilerInvocation.cpp +++ lib/Frontend/CompilerInvocation.cpp @@ -1497,6 +1497,7 @@ Opts.CurrentModule = Args.getLastArgValue(OPT_fmodule_name); Opts.ImplementationOfModule = Args.getLastArgValue(OPT_fmodule_implementation_of); + Opts.HalfArgsAndReturns = Args.hasArg(OPT_fallow_half_arguments_and_returns); if (!Opts.CurrentModule.empty() && !Opts.ImplementationOfModule.empty() && Opts.CurrentModule != Opts.ImplementationOfModule) { Index: lib/Sema/SemaType.cpp =================================================================== --- lib/Sema/SemaType.cpp +++ lib/Sema/SemaType.cpp @@ -1746,7 +1746,7 @@ } // Functions cannot return half FP. - if (T->isHalfType()) { + if (T->isHalfType() && !getLangOpts().HalfArgsAndReturns) { Diag(Loc, diag::err_parameters_retval_cannot_have_fp16_type) << 1 << FixItHint::CreateInsertion(Loc, "*"); return true; @@ -1776,7 +1776,7 @@ if (ParamType->isVoidType()) { Diag(Loc, diag::err_param_with_void_type); Invalid = true; - } else if (ParamType->isHalfType()) { + } else if (ParamType->isHalfType() && !getLangOpts().HalfArgsAndReturns) { // Disallow half FP arguments. 
    Diag(Loc, diag::err_parameters_retval_cannot_have_fp16_type) << 0 <<
      FixItHint::CreateInsertion(Loc, "*");
@@ -2751,7 +2751,7 @@
         S.Diag(D.getIdentifierLoc(), diag::err_opencl_half_return) << T;
         D.setInvalidType(true);
       }
-    } else {
+    } else if (!S.getLangOpts().HalfArgsAndReturns) {
       S.Diag(D.getIdentifierLoc(),
         diag::err_parameters_retval_cannot_have_fp16_type) << 1;
       D.setInvalidType(true);
@@ -2941,7 +2941,7 @@
           D.setInvalidType();
           Param->setInvalidDecl();
         }
-      } else {
+      } else if (!S.getLangOpts().HalfArgsAndReturns) {
         S.Diag(Param->getLocation(),
           diag::err_parameters_retval_cannot_have_fp16_type) << 0;
         D.setInvalidType();
Index: test/CodeGen/arm64-aapcs-arguments.c
===================================================================
--- test/CodeGen/arm64-aapcs-arguments.c
+++ test/CodeGen/arm64-aapcs-arguments.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple arm64-linux-gnu -target-feature +neon -target-abi aapcs -ffreestanding -emit-llvm -w -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon -target-abi aapcs -ffreestanding -fallow-half-arguments-and-returns -emit-llvm -w -o - %s | FileCheck %s
 
 // AAPCS clause C.8 says: If the argument has an alignment of 16 then the NGRN
 // is rounded up to the next even number.
@@ -40,3 +40,12 @@
 // CHECK: define i8 @test5(i8 %a, i16 %b)
 unsigned char test5(unsigned char a, signed short b) {
 }
+
+// __fp16 can be used as a function argument or return type (ACLE 2.0)
+// CHECK: define half @test_half(half %{{.*}})
+__fp16 test_half(__fp16 A) { }
+
+// __fp16 is a base type for homogeneous floating-point aggregates for AArch64 (but not 32-bit ARM).
+// CHECK: define %struct.HFA_half @test_half_hfa(half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}})
+struct HFA_half { __fp16 a[4]; };
+struct HFA_half test_half_hfa(struct HFA_half A) { }