Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -8739,7 +8739,8 @@ for (unsigned i = 0, e = Ins.size(); i != e; ++i) { assert(InVals[i].getNode() && "LowerFormalArguments emitted a null value!"); - assert(EVT(Ins[i].VT) == InVals[i].getValueType() && + assert((InVals[i].getValueType() == MVT::f16 || + EVT(Ins[i].VT) == InVals[i].getValueType()) && "LowerFormalArguments emitted a value with the wrong type!"); } }); Index: lib/Target/ARM/ARMCallingConv.td =================================================================== --- lib/Target/ARM/ARMCallingConv.td +++ lib/Target/ARM/ARMCallingConv.td @@ -156,6 +156,8 @@ // Handles byval parameters. CCIfByVal>, + CCIfType<[f16], CCBitConvertToType>, + // The 'nest' parameter, if any, is passed in R12. CCIfNest>, @@ -187,6 +189,9 @@ CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>, CCIfType<[f32], CCBitConvertToType>, + + CCIfType<[f16], CCBitConvertToType>, + CCDelegateTo ]>; @@ -214,8 +219,8 @@ CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>, CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, - CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8, - S9, S10, S11, S12, S13, S14, S15]>>, + CCIfType<[f16, f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8, + S9, S10, S11, S12, S13, S14, S15]>>, CCDelegateTo ]>; @@ -232,8 +237,8 @@ CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>, CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>, - CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8, - S9, S10, S11, S12, S13, S14, S15]>>, + CCIfType<[f16, f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8, + S9, S10, S11, S12, S13, S14, S15]>>, CCDelegateTo ]>; Index: lib/Target/ARM/ARMISelLowering.h =================================================================== --- lib/Target/ARM/ARMISelLowering.h +++ lib/Target/ARM/ARMISelLowering.h @@ -662,7 +662,8 @@ SDValue &Chain) const; SDValue LowerREM(SDNode *N, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) const; SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; Index: lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- lib/Target/ARM/ARMISelLowering.cpp +++ lib/Target/ARM/ARMISelLowering.cpp @@ -522,6 +522,112 @@ addRegisterClass(MVT::f64, &ARM::DPRRegClass); } + // The HPR registerclass and f16 type are added as a legal type when: + // - FullFP16 is enabled, which means support for the Armv8.2-A FP16 instructions, + // - FP16 is enabled, which means support for the f16 <-> f32 conversion + // instructions, which are a VFP3 extension and part of VFP4. + // + // It's obvious why f16 is legal for the former case, but the latter is perhaps the + // more interesting one. Making fp16 legal for FP16, results in + // f16 LOADs/STOREs while we don't have instructions for them. So the approach is + // to custom lower f16 LOAD/STORE nodes. 
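+  // To illustrate the approach (the node numbers and exact DAG shapes below
+  // are just an example of the lowering implemented further down in
+  // LowerLOAD), an f16 load feeding f32 arithmetic:
+  //
+  //   t5: f16,ch = load t0, t2, undef:i32
+  //   t6: f32 = fp_extend t5
+  //
+  // is expected to become a zero-extending half-word integer load plus an
+  // FP16_TO_FP node:
+  //
+  //   t8: i32,ch = load t0, t2, undef:i32
+  //   t9: f32 = fp16_to_fp t8
+  //
+  // which should then select to an LDRH followed by a VMOV and a
+  // VCVTB.F32.F16.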
+  //
+  // The reason to make f16 legal when FP16 is supported is:
+  // - It avoids very early legalization/combining of f16 arguments to f32
+  //   types, which would wrongly reinterpret the upper 16 bits of the 32-bit
+  //   registers as part of the value. Making f16 legal is easier and cleaner
+  //   than trying to undo that early legalization/combining.
+  // - As a consequence, the isel DAGs are in a more 'normal form', i.e. they
+  //   rely less on the special nodes FP_TO_FP16 and FP16_TO_FP, which perform
+  //   float up/down conversions but produce/consume i32 values by moving
+  //   between integer and float registers. Instead, FP_EXTEND and FP_ROUND
+  //   nodes are introduced, so this is more of a clean-up than e.g. a
+  //   correctness fix. Unfortunately, the FP16_TO_FP nodes cannot be removed
+  //   completely; see e.g. the BITCAST and f16 LOAD lowering below.
+  // - When these FP_EXTEND and FP_ROUND nodes are introduced by the legalizer
+  //   and the FP16 conversion instructions are not available, they are custom
+  //   lowered to the EABI calls __aeabi_h2f and __aeabi_f2h.
+  //
+  if (Subtarget->hasFP16() || Subtarget->hasFullFP16()) {
+    addRegisterClass(MVT::f16, &ARM::HPRRegClass);
+  }
+
+  if (!Subtarget->hasFullFP16()) {
+    setOperationAction(ISD::SELECT, MVT::f16, Promote);
+    setOperationAction(ISD::SELECT_CC, MVT::f16, Promote);
+    setOperationAction(ISD::SETCC, MVT::f16, Promote);
+    setOperationAction(ISD::BR_CC, MVT::f16, Custom);
+    setOperationAction(ISD::FADD, MVT::f16, Promote);
+    setOperationAction(ISD::FSUB, MVT::f16, Promote);
+    setOperationAction(ISD::FMUL, MVT::f16, Promote);
+    setOperationAction(ISD::FDIV, MVT::f16, Promote);
+    setOperationAction(ISD::FREM, MVT::f16, Promote);
+    setOperationAction(ISD::FMA, MVT::f16, Promote);
+    setOperationAction(ISD::FNEG, MVT::f16, Promote);
+    setOperationAction(ISD::FABS, MVT::f16, Promote);
+    setOperationAction(ISD::FCEIL, MVT::f16, Promote);
+    setOperationAction(ISD::FSQRT, MVT::f16, Promote);
+    setOperationAction(ISD::FFLOOR, MVT::f16, Promote);
+    setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote);
+    setOperationAction(ISD::FRINT, MVT::f16, Promote);
+    setOperationAction(ISD::FROUND, MVT::f16, Promote);
+    setOperationAction(ISD::FTRUNC, MVT::f16, Promote);
+    setOperationAction(ISD::FPOWI, MVT::f16, Promote);
+    setOperationAction(ISD::FPOW, MVT::f16, Promote);
+    setOperationAction(ISD::FEXP, MVT::f16, Promote);
+    setOperationAction(ISD::FEXP2, MVT::f16, Promote);
+    setOperationAction(ISD::FSIN, MVT::f16, Promote);
+    setOperationAction(ISD::FCOS, MVT::f16, Promote);
+    setOperationAction(ISD::FLOG, MVT::f16, Promote);
+    setOperationAction(ISD::FLOG2, MVT::f16, Promote);
+    setOperationAction(ISD::FLOG10, MVT::f16, Promote);
+    setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
+    setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom);
+
+    // When we don't have FullFP16 support, and thus don't have f16 load/store
+    // instructions, we create half-word integer load/stores.
+    //
+    // Input IR like this, for example:
+    //
+    //   %1 = load i16, i16 * ...
+    //   %2 = tail call float @llvm.convert.from.fp16.f32(i16 ...)
+    //   .. = fadd %2 ..
+    //
+    // gets combined very early into f16 loads when the f16 type is legal. So
+    // we custom lower these f16 loads and stores, using integer loads and
+    // stores instead. This matches the storage-only semantics of __fp16,
+    // where arithmetic is done in single precision but results are written
+    // back in half precision. IR like the example above can be generated from
+    // the use of __fp16.
+    setOperationAction(ISD::LOAD, MVT::f16, Custom);
+    setOperationAction(ISD::STORE, MVT::f16, Custom);
+
+    // This is a cleanup. We unfortunately need an FP_TO_FP16 node to create
+    // a truncating i32 -> i16 integer store.
+    setOperationAction(ISD::FP_TO_FP16, MVT::i32, Custom);
+
+    // Another case arises from the use of __fp16 and passing halves as i16:
+    // when function arguments are passed as i16 but converted to f32 or f64
+    // in the function body, an i16 truncate, an f16 bitcast, and an FP_EXTEND
+    // are generated. When f16 is not a legal type, the f16 bitcast is
+    // legalized to FP16_TO_FP. But when f16 is a legal type this does not
+    // happen, and the truncate results in generated code with stack
+    // loads/stores. We want to avoid this, and custom lower the
+    // truncate/bitcast to FP16_TO_FP.
+    setOperationAction(ISD::BITCAST, MVT::i16, Custom);
+
+    // Create a libcall for the f64 -> f16 conversion if necessary.
+    setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
+
+    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
+    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
+  }
+
   for (MVT VT : MVT::vector_valuetypes()) {
     for (MVT InnerVT : MVT::vector_valuetypes()) {
       setTruncStoreAction(VT, InnerVT, Expand);
@@ -707,6 +813,7 @@
   setTargetDAGCombine(ISD::FP_TO_UINT);
   setTargetDAGCombine(ISD::FDIV);
   setTargetDAGCombine(ISD::LOAD);
+  setTargetDAGCombine(ISD::BITCAST);
   // It is legal to extload from v4i8 to v4i16 or v4i32.
for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16, @@ -1502,6 +1609,7 @@ case ISD::SETUGE: return ARMCC::HS; case ISD::SETULT: return ARMCC::LO; case ISD::SETULE: return ARMCC::LS; + case ISD::SETOLT: return ARMCC::MI; } } @@ -3683,7 +3791,9 @@ } else { const TargetRegisterClass *RC; - if (RegVT == MVT::f32) + if (RegVT == MVT::f16) { + RC = &ARM::HPRRegClass; + } else if (RegVT == MVT::f32) RC = &ARM::SPRRegClass; else if (RegVT == MVT::f64) RC = &ARM::DPRRegClass; @@ -3706,7 +3816,21 @@ switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; + case CCValAssign::AExt: + if (VA.getValVT() == MVT::f16) { + ArgValue = DAG.getNode(ISD::BITCAST, dl, MVT::f32, ArgValue); + ArgValue = DAG.getFPExtendOrRound(ArgValue, dl, MVT::f16); + break; + } + assert(0 && "Unknown loc info!"); + break; case CCValAssign::BCvt: + if (Ins[VA.getValNo()].ArgVT == MVT::f16 && + !Subtarget->isTargetHardFloat()) { + ArgValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, ArgValue); + ArgValue = DAG.getNode(ISD::BITCAST, dl, MVT::f16, ArgValue); + break; + } ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); break; case CCValAssign::SExt: @@ -4534,6 +4658,12 @@ Chain, Dest, ARMcc, CCR, Cmp); } + if (LHS.getValueType() == MVT::f16) { + LHS = DAG.getNode(ISD::FP16_TO_FP, SDLoc(Op), MVT::f32, LHS); + assert(RHS.getValueType() == MVT::f16); + RHS = DAG.getNode(ISD::FP16_TO_FP, SDLoc(Op), MVT::f32, RHS); + } + assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); if (getTargetMachine().Options.UnsafeFPMath && @@ -4620,15 +4750,49 @@ EVT VT = Op.getValueType(); if (VT.isVector()) return LowerVectorFP_TO_INT(Op, DAG); - if (Subtarget->isFPOnlySP() && Op.getOperand(0).getValueType() == MVT::f64) { + + const EVT OpType = Op.getOperand(0).getValueType(); + + if (VT == MVT::i64 && OpType == MVT::f16) { + SDValue Cvt; + if (Op.getOperand(0).getOpcode() == ISD::LOAD) { + LoadSDNode *LD = cast(Op.getOperand(0).getNode()); + SDValue NewLD = DAG.getLoad(ISD::UNINDEXED, ISD::ZEXTLOAD, MVT::i32, + SDLoc(Op), LD->getOperand(0), LD->getBasePtr(), + LD->getOffset(), MVT::i16, LD->getMemOperand()); + Cvt = DAG.getNode(ISD::FP16_TO_FP, SDLoc(Op), MVT::f32, NewLD); + } else + Cvt = DAG.getNode(ISD::FP16_TO_FP, SDLoc(Op), MVT::f32, Op.getOperand(0)); + + SDValue Fp2int = DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, Cvt); + + const EVT OpType = Fp2int.getOperand(0).getValueType(); + RTLIB::Libcall LC; if (Op.getOpcode() == ISD::FP_TO_SINT) - LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), - Op.getValueType()); + LC = RTLIB::getFPTOSINT(OpType, VT); else - LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), - Op.getValueType()); - return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), + LC = RTLIB::getFPTOUINT(OpType, VT); + + return makeLibCall(DAG, LC, VT, Fp2int.getOperand(0), + /*isSigned*/ false, SDLoc(Op)).first; + } + + if (OpType == MVT::f16) { + SDValue Cvt = DAG.getNode(ISD::FP16_TO_FP, SDLoc(Op), + MVT::f32, Op.getOperand(0)); + return DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, Cvt); + } + + if ((Subtarget->isFPOnlySP() && OpType == MVT::f64) || + Op.getValueType() == MVT::i64) { + RTLIB::Libcall LC; + if (Op.getOpcode() == ISD::FP_TO_SINT) + LC = RTLIB::getFPTOSINT(OpType, VT); + else + LC = RTLIB::getFPTOUINT(OpType, VT); + + return makeLibCall(DAG, LC, VT, Op.getOperand(0), /*isSigned*/ false, SDLoc(Op)).first; } @@ -4669,10 +4833,30 @@ } SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG 
&DAG) const { + if (!Subtarget->hasFullFP16() && Op.getValueType() == MVT::f16 && + Op.getOperand(0).getValueType() == MVT::i32) { + // Legalize: + // t2: f16 = [su]int_to_fp t1 + // to: + // t2: f32 = [su]int_to_fp t1 + // t3: f16 = fp_round t2 + SDValue I2F = DAG.getNode(Op.getOpcode(), SDLoc(Op), MVT::f32, + Op.getOperand(0)); + return DAG.getFPExtendOrRound(I2F, SDLoc(Op), MVT::f16); + } + EVT VT = Op.getValueType(); if (VT.isVector()) return LowerVectorINT_TO_FP(Op, DAG); - if (Subtarget->isFPOnlySP() && Op.getValueType() == MVT::f64) { + + const bool IsF16Write = Op.getValueType() == MVT::f16; + const bool IsI64Read = Op.getOperand(0).getValueType() == MVT::i64; + + if (IsF16Write) + Op = DAG.getNode(Op.getOpcode(), SDLoc(Op), MVT::f32, Op.getOperand(0)); + + if ((Subtarget->isFPOnlySP() && Op.getValueType() == MVT::f64) || + IsF16Write || IsI64Read) { RTLIB::Libcall LC; if (Op.getOpcode() == ISD::SINT_TO_FP) LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), @@ -4692,6 +4876,13 @@ SDValue Tmp0 = Op.getOperand(0); SDValue Tmp1 = Op.getOperand(1); SDLoc dl(Op); + + if (Tmp0.getValueType() == MVT::f16 && Tmp1.getValueType() == MVT::f16) { + Tmp0 = DAG.getNode(ISD::FP16_TO_FP, SDLoc(Op), MVT::f32, Tmp0); + Tmp1 = DAG.getNode(ISD::FP16_TO_FP, SDLoc(Op), MVT::f32, Tmp1); + Op = DAG.getNode(ISD::FCOPYSIGN, SDLoc(Op), MVT::f32, Tmp0, Tmp1); + } + EVT VT = Op.getValueType(); EVT SrcVT = Tmp1.getValueType(); bool InGPR = Tmp0.getOpcode() == ISD::BITCAST || @@ -4908,8 +5099,96 @@ // source or destination of the bit convert. EVT SrcVT = Op.getValueType(); EVT DstVT = N->getValueType(0); - assert((SrcVT == MVT::i64 || DstVT == MVT::i64) && - "ExpandBITCAST called for non-i64 type"); + + if (SrcVT == MVT::i16 && DstVT == MVT::f16) { + // Handle @llvm.convert.from.fp16.f64(i16 %in), which generates IR like: + // + // t2: i32,ch = CopyFromReg t0, ... + // t3: i16 = truncate t2 + // t4: f16 = bitcast t3 + // t5: f64 = fp_extend t4 + // + // We want to custom lower the truncate->bitcast->fp_extend pattern to + // just a fp16_to_fp node: + // + // t2: i32,ch = CopyFromReg t0, Register:i32 %vreg0 + // tx: f64 = fp16_to_fp t2 + // + // This avoids stack loads/stores code generation for the bitcast node, + // and thus just generates a mov and convert. + // + if (Op.getOpcode() != ISD::TRUNCATE) + return SDValue(); + + auto BitcastUse = N->use_begin(); + + if (N->use_size() == 1 && BitcastUse->getOpcode() == ISD::FP_EXTEND) { + SDValue Cvt = DAG.getNode(ISD::FP16_TO_FP, SDLoc(Op), + BitcastUse->getValueType(0), Op.getOperand(0)); + DAG.ReplaceAllUsesWith(*BitcastUse, Cvt.getNode()); + return Cvt; + } + + // If the use of the bitcast is not an extend, it's a data processing + // instructions, and we want to convert its operand to f32: + // + // t9: i16 = truncate t5 + // t10: f16 = bitcast t9 + // t11: f16 = fadd t8, t10 + // + SDValue Cvt = DAG.getNode(ISD::FP16_TO_FP, SDLoc(Op), + MVT::f32, Op.getOperand(0)); + return Cvt; + } + + if (SrcVT == MVT::f16 && DstVT == MVT::i16) { + // Very similarly for e.g. 
f64, we want to transform:
+    //
+    //   t2: f64,ch = CopyFromReg t0, Register:f64 %vreg0
+    //   t4: f16 = fp_round t2, TargetConstant:i32<0>
+    //   t5: i16 = bitcast t4
+    //   t6: i32 = any_extend t5
+    //
+    // into:
+    //
+    //   t2: f64,ch = CopyFromReg t0, Register:f64 %vreg0
+    //   t13: i32 = fp_to_fp16 t2
+    //   t15: i32 = and t13, Constant:i32<65535>
+    //
+    if (Op.getOpcode() == ISD::FP_ROUND) {
+      auto FPAnyExtend = N->use_begin();
+      if (N->use_size() != 1 || FPAnyExtend->getOpcode() != ISD::ANY_EXTEND)
+        return SDValue();
+      SDValue Cvt = DAG.getNode(ISD::FP_TO_FP16, SDLoc(Op),
+                                MVT::i32, Op.getOperand(0));
+      SDValue And = DAG.getNode(ISD::AND, SDLoc(Op), MVT::i32, Cvt,
+                                DAG.getConstant(65535, SDLoc(Op), MVT::i32));
+      DAG.ReplaceAllUsesWith(*FPAnyExtend, And.getNode());
+      return And;
+    }
+
+    // Custom lower the f16 -> i16 -> i32 -> f32 conversion pattern:
+    //
+    //   t12: i16 = bitcast t11
+    //   t13: i32 = zero_extend t12
+    //   t14: f32 = bitcast t13
+    //
+    auto ZeroExtend = N->use_begin();
+    if (N->use_size() != 1 || ZeroExtend->getOpcode() != ISD::ZERO_EXTEND)
+      return SDValue();
+    auto BitCast = ZeroExtend->use_begin();
+    if (BitCast->use_size() != 1 || BitCast->getOpcode() != ISD::BITCAST)
+      return SDValue();
+    if (BitCast->getValueType(0) != MVT::f32)
+      return SDValue();
+
+    SDValue V = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, Op);
+    DAG.ReplaceAllUsesWith(*BitCast, V.getNode());
+    return V;
+  }
+
+  if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
+    return SDValue();
 
   // Turn i64->f64 into VMOVDRR.
   if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
@@ -7771,10 +8050,214 @@
   return !CI.second.getNode() ? DAG.getRoot() : CI.first;
 }
 
+// This is a cleanup for the (corner) case where a load instruction directly
+// feeds a store. For a load -> store chain, when the f16 store is legalized
+// first, we unfortunately need to introduce a helper FP_TO_FP16 node in order
+// to create a truncating i32 -> i16 integer store; this node models a
+// conversion from a float to an integer type, which allows us to create an
+// integer store. This FP_TO_FP16 node needs to be cleaned up though, as it
+// should not lead to any code generation. When it is not a load/store chain,
+// there will be f16 data-processing instructions between the loads/stores;
+// their f16 operands will have been legalized, and FP_EXTEND and FP_ROUND
+// nodes will have been introduced.
+// We want to transform this:
+//
+//   t12: i32,ch = load t0, t2, undef:i32
+//   t10: i32 = fp_to_fp16 t12
+//   t11: ch = store t12:1, t10, t4, undef:i32
+//
+// into:
+//
+//   t12: i32,ch = load t0, t2, undef:i32
+//   t11: ch = store t12:1, t12, t4, undef:i32
+//
+// so that we just generate LDRH and STRH half-word integer loads/stores.
+//
+static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG,
+                               const ARMSubtarget *Subtarget) {
+  if (!Op.hasOneUse())
+    return SDValue();
+
+  auto Use = Op.getNode()->use_begin();
+  if (Use->getOpcode() != ISD::STORE) {
+    DEBUG(dbgs() << "LowerFP_TO_FP16: use is not a store, not cleaning it up\n");
+    return SDValue();
+  }
+
+  // Return the operand so that the uses get properly updated/replaced.
+  return Op.getOperand(0);
+}
+
+static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG,
+                          const ARMSubtarget *Subtarget) {
+  assert(!Subtarget->hasFullFP16());
+  SDValue F16Op = Op.getOperand(1);
+  assert(F16Op.getValueType() == MVT::f16);
+  SDNode *N = Op.getNode();
+  StoreSDNode *ST = cast<StoreSDNode>(N);
+
+  DEBUG(dbgs() << "Creating truncating i16 store for: "; F16Op.dump());
+  SDValue Fp2fp16 = DAG.getNode(ISD::FP_TO_FP16, SDLoc(Op), MVT::i32, F16Op);
+  SDValue NewST = DAG.getTruncStore(Op.getOperand(0), SDLoc(Op), Fp2fp16,
+                                    ST->getBasePtr(), MVT::i16,
+                                    ST->getMemOperand());
+  DEBUG(dbgs() << "New i16 store: "; NewST.dump());
+  DAG.ReplaceAllUsesOfValueWith(Op, NewST);
+  return NewST;
+}
+
+static SDNode *IsF16LoadStoreChain(SDNode *N) {
+  assert(N->getOpcode() == ISD::LOAD);
+
+  if (N->getNumValues() != 2) {
+    DEBUG(dbgs() << "expecting 2 values\n");
+    return nullptr;
+  }
+
+  if (N->use_size() != 2)
+    return nullptr;
+
+  // We expect the LD node of an LD->ST chain to have 2 uses:
+  //
+  // 1) the bitcast node, which feeds an extend to i32
+  // 2) the ST node
+
+  bool UseIsAStore = false;
+  bool UseIsABitCastAndExtend = false;
+  SDNode *ZEXT = nullptr;
+
+  for (auto U : N->uses()) {
+    switch (U->getOpcode()) {
+    default: return nullptr;
+    case ISD::STORE:
+      DEBUG(dbgs() << "Found a ST as a use: "; U->dump());
+      UseIsAStore = true;
+      continue;
+    case ISD::BITCAST:
+      DEBUG(dbgs() << "Found a BITCAST as a use: "; U->dump());
+      // Bail out if the bitcast has more uses, because then it is
+      // not a simple LD-ST chain.
+      if (!U->hasOneUse())
+        return nullptr;
+      ZEXT = *U->use_begin();
+      if (ZEXT->getOpcode() != ISD::ZERO_EXTEND)
+        return nullptr;
+      UseIsABitCastAndExtend = true;
+      break;
+    }
+  }
+
+  if (!UseIsAStore || !UseIsABitCastAndExtend || !ZEXT->hasOneUse())
+    return nullptr;
+
+  return ZEXT;
+}
+
+static SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG,
+                         const ARMSubtarget *Subtarget) {
+  assert(!Subtarget->hasFullFP16() && Op.getValueType() == MVT::f16);
+
+  SDNode *N = Op.getNode();
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  SDValue NewLD = DAG.getLoad(ISD::UNINDEXED, ISD::ZEXTLOAD, MVT::i32,
+                              SDLoc(Op), Op.getOperand(0), LD->getBasePtr(),
+                              LD->getOffset(), MVT::i16, LD->getMemOperand());
+
+  DEBUG(dbgs() << "Custom lowering f16 load: "; Op.dump();
+        dbgs() << "Creating new i32 load: "; NewLD.dump());
+
+  // Now the trickier part: fixing up the DAG, i.e. replacing the uses of the
+  // old load node. There are a few corner cases. Take this example:
+  //
+  //   t1: f16,ch = LD2
+  //   t2: ch = ST2 t1
+  //
+  // The ST2 could have been legalized first, introducing a bitcast, an
+  // extend, and a truncating integer store, so that the DAG looks like this:
+  //
+  //   t1: f16 = LD2
+  //   t2: i16 = bitcast t1
+  //   t3: i32 = zero_extend t2
+  //   t4: ch = ST2 t3
+  //
+  // We want to avoid code generation for these bitcast and extend nodes by
+  // making the load a direct producer of the store. To achieve this, we need
+  // to replace uses, but we can't use t1 because that expects f16 values.
+  // Instead we look for the extend, and replace its uses with the new i32
+  // load node.
+  //
+  // Case I: Load -> Store
+  //
+  SDNode *From = IsF16LoadStoreChain(N);
+  if (From) {
+    // Replace the chain.
+    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLD.getValue(1));
+    // Replace the uses of the i32 zero_extend.
+    DAG.ReplaceAllUsesWith(From, NewLD.getNode());
+    return NewLD;
+  }
+
+  // Case II: Load -> FP_EXTEND
+  //
+  // The other special case is an f16 load producing a value for an fpextend
+  // node.
+ // We cannot simply replace the uses because of type mismatches, so we work + // around that by creating a FP16_TO_FP node to model the extend, and then + // replace the uses of the extend. + // + SDNode * FPExtend = nullptr; + for (auto U : N->uses()) { + if (U->getOpcode() == ISD::FP_EXTEND) + FPExtend = U; + } + + if (FPExtend != nullptr) { + DEBUG(dbgs() << "Creating i32 -> f32 node\n"); + SDValue F16ToF = DAG.getNode(ISD::FP16_TO_FP, SDLoc(Op), + FPExtend->getValueType(0), NewLD); + DAG.ReplaceAllUsesWith(FPExtend, F16ToF.getNode()); + return NewLD; + } + + // Case III: Load -> CopyToReg + // + // When a load produces a value for a copytoreg, we need to make sure to upconvert + // the f16 to f32. I.e., we want to transform this: + // + // t5: f16,ch = load t0, t2, undef:i32 + // t7: ch = CopyToReg t0, Register:f16 %0, t5 + // + // into: + // t8: i32,ch = load t0, t2, undef:i32 + // t9: f32 = fp16_to_fp t8 + // t11: ch,glue = CopyToReg t0, Register:f32 %4, t9 + // + auto Copy2Reg = *N->use_begin(); + if (N->hasOneUse() && Copy2Reg->getOpcode() == ISD::CopyToReg) { + MachineFunction &MF = DAG.getMachineFunction(); + MachineRegisterInfo &RI = MF.getRegInfo(); + unsigned VReg = RI.createVirtualRegister(&ARM::SPRRegClass); + + SDValue F16ToF = DAG.getNode(ISD::FP16_TO_FP, SDLoc(Op), + MVT::f32, NewLD); + SDValue C2R = DAG.getCopyToReg(Copy2Reg->getOperand(0), SDLoc(Copy2Reg), + VReg, F16ToF); + + DEBUG(dbgs() << "Old CopyToReg: "; Copy2Reg->dump(); + dbgs() << "New CopyToReg: "; C2R->dump()); + + DAG.ReplaceAllUsesWith(Copy2Reg, C2R.getNode()); + return NewLD; + } + return NewLD; +} + SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { DEBUG(dbgs() << "Lowering node: "; Op.dump()); switch (Op.getOpcode()) { default: llvm_unreachable("Don't know how to custom lower this!"); + case ISD::LOAD: return LowerLOAD(Op, DAG, Subtarget); + case ISD::STORE: return LowerSTORE(Op, DAG, Subtarget); + case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG, Subtarget); case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); @@ -7850,7 +8333,7 @@ if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment()) return LowerDYNAMIC_STACKALLOC(Op, DAG); llvm_unreachable("Don't know how to custom lower this!"); - case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); + case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG, Subtarget); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); case ISD::FPOWI: return LowerFPOWI(Op, *Subtarget, DAG); case ARMISD::WIN__DBZCHK: return SDValue(); @@ -7897,6 +8380,14 @@ switch (N->getOpcode()) { default: llvm_unreachable("Don't know how to custom expand this!"); + case ISD::UINT_TO_FP: + case ISD::SINT_TO_FP: + Res = LowerINT_TO_FP(SDValue(N, 0), DAG); + break; + case ISD::FP_TO_UINT: + case ISD::FP_TO_SINT: + Res = LowerFP_TO_INT(SDValue(N, 0), DAG); + break; case ISD::READ_REGISTER: ExpandREAD_REGISTER(N, Results, DAG); break; @@ -13531,8 +14022,8 @@ } SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { - assert(Op.getValueType() == MVT::f64 && Subtarget->isFPOnlySP() && - "Unexpected type for custom-lowering FP_EXTEND"); + if (Op.getOperand(0).getValueType() == MVT::i32) + return SDValue(); RTLIB::Libcall LC; LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); @@ -13542,17 +14033,34 @@ SDLoc(Op)).first; } -SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) 
const {
-  assert(Op.getOperand(0).getValueType() == MVT::f64 &&
-         Subtarget->isFPOnlySP() &&
-         "Unexpected type for custom-lowering FP_ROUND");
+SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG,
+                                         const ARMSubtarget *ST) const {
+  EVT DstType = Op.getValueType();
+  const EVT SrcType = Op.getOperand(0).getValueType();
+  const bool F32ToF16 = (SrcType == MVT::f32 && DstType == MVT::f16);
+  const bool F64ToF16 = (SrcType == MVT::f64 && DstType == MVT::f16);
+
+  // 1) fptrunc float to half
+  //    Supported: FP16
+  //
+  // 2) fptrunc double to half
+  //    Supported: V8
+  //
+  if ((F32ToF16 && ST->hasFP16()) ||                        // case 1)
+      (F64ToF16 && ST->hasFPARMv8() && !ST->isFPOnlySP()))  // case 2)
+    return Op;
 
   RTLIB::Libcall LC;
   LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
 
-  SDValue SrcVal = Op.getOperand(0);
-  return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
-                     SDLoc(Op)).first;
+
+  SDValue SrcVal = Op.getOperand(0);
+  if (DstType == MVT::f16)
+    DstType = MVT::i32;
+
+  SDValue NewNode = makeLibCall(DAG, LC, DstType, SrcVal,
+                                /*isSigned*/ false, SDLoc(Op)).first;
+  DEBUG(dbgs() << "New node: "; NewNode.dump());
+  return NewNode;
 }
 
 bool
Index: lib/Target/ARM/ARMInstrVFP.td
===================================================================
--- lib/Target/ARM/ARMInstrVFP.td
+++ lib/Target/ARM/ARMInstrVFP.td
@@ -69,10 +69,19 @@
   let ParserMatchClass = FPImmOperand;
 }
 
+def alignedload16 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() >= 2;
+}]>;
+
 def alignedload32 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
   return cast<LoadSDNode>(N)->getAlignment() >= 4;
 }]>;
 
+def alignedstore16 : PatFrag<(ops node:$val, node:$ptr),
+                             (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() >= 2;
+}]>;
+
 def alignedstore32 : PatFrag<(ops node:$val, node:$ptr),
                              (store node:$val, node:$ptr), [{
   return cast<StoreSDNode>(N)->getAlignment() >= 4;
@@ -113,9 +122,9 @@
   let D = VFPNeonDomain;
 }
 
-def VLDRH : AHI5<0b1101, 0b01, (outs SPR:$Sd), (ins addrmode5fp16:$addr),
+def VLDRH : AHI5<0b1101, 0b01, (outs HPR:$Sd), (ins addrmode5fp16:$addr),
                  IIC_fpLoad16, "vldr", ".16\t$Sd, $addr",
-                 []>,
+                 [(set HPR:$Sd, (alignedload16 addrmode5:$addr))]>,
             Requires<[HasFullFP16]>;
 
 } // End of 'let canFoldAsLoad = 1, isReMaterializable = 1 in'
@@ -132,9 +141,9 @@
   let D = VFPNeonDomain;
 }
 
-def VSTRH : AHI5<0b1101, 0b00, (outs), (ins SPR:$Sd, addrmode5fp16:$addr),
+def VSTRH : AHI5<0b1101, 0b00, (outs), (ins HPR:$Sd, addrmode5fp16:$addr),
                  IIC_fpStore16, "vstr", ".16\t$Sd, $addr",
-                 []>,
+                 [(alignedstore16 HPR:$Sd, addrmode5:$addr)]>,
             Requires<[HasFullFP16]>;
 
 //===----------------------------------------------------------------------===//
@@ -335,9 +344,9 @@
 
 let TwoOperandAliasConstraint = "$Sn = $Sd" in
 def VADDH : AHbI<0b11100, 0b11, 0, 0,
-                 (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                 (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
                  IIC_fpALU16, "vadd", ".f16\t$Sd, $Sn, $Sm",
-                 []>,
+                 [(set HPR:$Sd, (fadd HPR:$Sn, HPR:$Sm))]>,
             Sched<[WriteFPALU32]>;
 
 let TwoOperandAliasConstraint = "$Dn = $Dd" in
@@ -360,9 +369,9 @@
 
 let TwoOperandAliasConstraint = "$Sn = $Sd" in
 def VSUBH : AHbI<0b11100, 0b11, 1, 0,
-                 (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                 (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
                  IIC_fpALU16, "vsub", ".f16\t$Sd, $Sn, $Sm",
-                 []>,
+                 [(set HPR:$Sd, (fsub HPR:$Sn, HPR:$Sm))]>,
             Sched<[WriteFPALU32]>;
 
 let TwoOperandAliasConstraint = "$Dn = $Dd" in
@@ -659,16 +668,15 @@
 }
 
 // Between half, single and double-precision. For disassembly only.
- -def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm), +def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins HPR:$Sm), /* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$Sd, $Sm", - [/* For disassembly only; pattern left blank */]>, + [(set SPR:$Sd, (fpextend HPR:$Sm))]>, Requires<[HasFP16]>, Sched<[WriteFPCVT]>; -def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm), +def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs HPR:$Sd), (ins SPR:$Sm), /* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$Sd, $Sm", - [/* For disassembly only; pattern left blank */]>, + [(set HPR:$Sd, (fpround SPR:$Sm))]>, Requires<[HasFP16]>, Sched<[WriteFPCVT]>; @@ -698,9 +706,10 @@ } def VCVTBDH : ADuI<0b11101, 0b11, 0b0011, 0b01, 0, - (outs SPR:$Sd), (ins DPR:$Dm), + (outs HPR:$Sd), (ins DPR:$Dm), NoItinerary, "vcvtb", ".f16.f64\t$Sd, $Dm", - []>, Requires<[HasFPARMv8, HasDPVFP]> { + [(set HPR:$Sd, (fpround DPR:$Dm))]>, + Requires<[HasFPARMv8, HasDPVFP]> { // Instruction operands. bits<5> Sd; bits<5> Dm; @@ -739,15 +748,19 @@ let Inst{5} = Dm{4}; } +// f16 -> f32 conversions +def : Pat<(fp_to_f16 HPR:$a), + (i32 (COPY_TO_REGCLASS HPR:$a, GPR))>; def : Pat<(fp_to_f16 SPR:$a), (i32 (COPY_TO_REGCLASS (VCVTBSH SPR:$a), GPR))>; -def : Pat<(fp_to_f16 (f64 DPR:$a)), - (i32 (COPY_TO_REGCLASS (VCVTBDH DPR:$a), GPR))>; - +// f32 -> f16 conversions def : Pat<(f16_to_fp GPR:$a), - (VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>; + (VCVTBHS (COPY_TO_REGCLASS GPR:$a, HPR))>; +// f16 <-> f64 conversions +def : Pat<(fp_to_f16 (f64 DPR:$a)), + (i32 (COPY_TO_REGCLASS (VCVTBDH DPR:$a), GPR))>; def : Pat<(f64 (f16_to_fp GPR:$a)), (VCVTBHD (COPY_TO_REGCLASS GPR:$a, SPR))>; @@ -1290,6 +1303,9 @@ let D = VFPNeonA8Domain; } +def : VFPNoNEONPat<(f16 (sint_to_fp GPR:$a)), + (VCVTBSH (VSITOS (COPY_TO_REGCLASS GPR:$a, SPR)))>; + def : VFPNoNEONPat<(f32 (sint_to_fp GPR:$a)), (VSITOS (COPY_TO_REGCLASS GPR:$a, SPR))>; Index: lib/Target/ARM/ARMRegisterInfo.td =================================================================== --- lib/Target/ARM/ARMRegisterInfo.td +++ lib/Target/ARM/ARMRegisterInfo.td @@ -307,6 +307,23 @@ let DiagnosticString = "operand must be a register in range [s0, s31]"; } +// Half-precision (FullFP16) register class. It's exactly the same as the +// single-precision class, using the same S-registers. Each instruction that generates a +// FP16 result writes that to the bottom 16 bits of the associated 32-bit Floating-point +// register and the top 16 bits of the 32-bit floating-point register are written to 0. +// A different register class is added, as opposed to adding f16 to SPR, to avoid +// modifying and adding type information to the rules. 
+def HPR : RegisterClass<"ARM", [f16], 32, (sequence "S%u", 0, 31)> { + let AltOrders = [(add (decimate HPR, 2), SPR), + (add (decimate HPR, 4), + (decimate HPR, 2), + (decimate (rotl HPR, 1), 4), + (decimate (rotl HPR, 1), 2))]; + let AltOrderSelect = [{ + return 1 + MF.getSubtarget().useStride4VFPs(MF); + }]; +} + // Subset of SPR which can be used as a source of NEON scalars for 16-bit // operations def SPR_8 : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 15)> { Index: lib/Target/ARM/Disassembler/ARMDisassembler.cpp =================================================================== --- lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -158,6 +158,8 @@ uint64_t Address, const void *Decoder); static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeHPRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder); static DecodeStatus DecodeSPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo, @@ -182,6 +184,8 @@ uint64_t Address, const void *Decoder); static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeHPRRegListOperand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); static DecodeStatus DecodeSPRRegListOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); static DecodeStatus DecodeDPRRegListOperand(MCInst &Inst, unsigned Val, @@ -996,6 +1000,11 @@ return MCDisassembler::Success; } +static DecodeStatus DecodeHPRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + return DecodeSPRRegisterClass(Inst, RegNo, Address, Decoder); +} + static const uint16_t DPRDecoderTable[] = { ARM::D0, ARM::D1, ARM::D2, ARM::D3, ARM::D4, ARM::D5, ARM::D6, ARM::D7, @@ -1253,6 +1262,11 @@ return S; } +static DecodeStatus DecodeHPRRegListOperand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + return DecodeSPRRegListOperand(Inst, Val, Address, Decoder); +} + static DecodeStatus DecodeDPRRegListOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; Index: test/CodeGen/ARM/GlobalISel/arm-unsupported.ll =================================================================== --- test/CodeGen/ARM/GlobalISel/arm-unsupported.ll +++ test/CodeGen/ARM/GlobalISel/arm-unsupported.ll @@ -43,7 +43,7 @@ } define half @test_half(half %a, half %b) { -; CHECK: remark: {{.*}} unable to lower arguments: half (half, half)* +; CHECK: remark: {{.*}} unable to legalize instruction: %{{.}}:_(s16) = G_FADD %{{.}}, %{{.}} ; CHECK-LABEL: warning: Instruction selection used fallback path for test_half %res = fadd half %a, %b ret half %res Index: test/CodeGen/ARM/fp16-args.ll =================================================================== --- test/CodeGen/ARM/fp16-args.ll +++ test/CodeGen/ARM/fp16-args.ll @@ -33,9 +33,7 @@ ; HARD: vcvtb.f32.f16 {{s[0-9]+}}, s0 ; HARD: vadd.f32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} ; HARD: vcvtb.f16.f32 [[SREG:s[0-9]+]], {{s[0-9]+}} -; HARD-NEXT: vmov [[REG0:r[0-9]+]], [[SREG]] -; HARD-NEXT: uxth [[REG1:r[0-9]+]], [[REG0]] -; HARD-NEXT: vmov s0, [[REG1]] +; HARD-NEXT: vcvtb.f32.f16 [[SREG]], [[SREG]] ; CHECK: bx lr } Index: test/CodeGen/ARM/fp16-instructions.ll 
=================================================================== --- /dev/null +++ test/CodeGen/ARM/fp16-instructions.ll @@ -0,0 +1,51 @@ +; RUN: llc < %s -mtriple=arm-none-eabi -float-abi=soft | FileCheck %s --check-prefix=CHECK-SOFT +; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+vfp4 -float-abi=hard | FileCheck %s --check-prefix=CHECK-FP16 +; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+neon,+fullfp16 -float-abi=hard | FileCheck %s --check-prefix=CHECK-FULLFP16 + +define half @Sub(half %a, half %b) local_unnamed_addr { +entry: +;CHECK-SOFT-LABEL: Sub: +;CHECK-SOFT: bl __aeabi_h2f +;CHECK-SOFT: bl __aeabi_h2f +;CHECK-SOFT: bl __aeabi_fsub +;CHECK-SOFT: bl __aeabi_f2h + +;CHECK-FP16-LABEL: Sub: +;CHECK-FP16: vcvtb.f32.f16 s2, s1 +;CHECK-FP16-NEXT: vcvtb.f32.f16 s0, s0 +;CHECK-FP16-NEXT: vsub.f32 s0, s0, s2 +;CHECK-FP16-NEXT: vcvtb.f16.f32 s0, s0 +;CHECK-FP16-NEXT: mov pc, lr + +;CHECK-FULLFP16-LABEL: Sub: +;CHECK-FULLFP16: vsub.f16 s0, s0, s1 +;CHECK-FULLFP16-NEXT: mov pc, lr + + %sub = fsub half %a, %b + ret half %sub +} + +define half @Add(half %a, half %b) local_unnamed_addr { +entry: +;CHECK-SOFT-LABEL: Add: +;CHECK-SOFT: bl __aeabi_h2f +;CHECK-SOFT: bl __aeabi_h2f +;CHECK-SOFT: bl __aeabi_fadd +;CHECK-SOFT: bl __aeabi_f2h + +;CHECK-FP16-LABEL: Add: +;CHECK-FP16: vcvtb.f32.f16 s2, s1 +;CHECK-FP16-NEXT: vcvtb.f32.f16 s0, s0 +;CHECK-FP16-NEXT: vadd.f32 s0, s0, s2 +;CHECK-FP16-NEXT: vcvtb.f16.f32 s0, s0 +;CHECK-FP16-NEXT: mov pc, lr + +;CHECK-FULLFP16-LABEL: Add: +;CHECK-FULLFP16: vadd.f16 s0, s0, s1 +;CHECK-FULLFP16-NEXT: mov pc, lr + + %add = fadd half %a, %b + ret half %add +} + + Index: test/CodeGen/ARM/fp16-v3.ll =================================================================== --- test/CodeGen/ARM/fp16-v3.ll +++ test/CodeGen/ARM/fp16-v3.ll @@ -11,12 +11,15 @@ ; CHECK: vadd.f32 [[SREG5:s[0-9]+]], [[SREG4]], [[SREG1]] ; CHECK-NEXT: vcvtb.f16.f32 [[SREG6:s[0-9]+]], [[SREG5]] ; CHECK-NEXT: vmov [[RREG1:r[0-9]+]], [[SREG6]] -; CHECK-DAG: uxth [[RREG2:r[0-9]+]], [[RREG1]] -; CHECK-DAG: pkhbt [[RREG3:r[0-9]+]], [[RREG1]], [[RREG1]], lsl #16 +; The next store/load pair, is a result of code generation for a bitcast, +; and is a minor performance regression that needs looking into. +; CHECK-NEXT: strh +; CHECK-NEXT: ldrh [[RREG2:r[0-9]+]], [sp, #2] ; CHECK-DAG: strh [[RREG1]], [r0, #4] +; CHECK-DAG: orr [[RREG3:r[0-9]+]], [[RREG2]], [[RREG2]], lsl #16 ; CHECK-DAG: vmov [[DREG:d[0-9]+]], [[RREG3]], [[RREG2]] ; CHECK-DAG: vst1.32 {[[DREG]][0]}, [r0:32] -; CHECK-NEXT: bx lr +; CHECK: bx lr define void @test_vec3(<3 x half>* %arr, i32 %i) #0 { %H = sitofp i32 %i to half %S = fadd half %H, 0xH4A00