Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -8723,7 +8723,8 @@
     for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
       assert(InVals[i].getNode() &&
              "LowerFormalArguments emitted a null value!");
-      assert(EVT(Ins[i].VT) == InVals[i].getValueType() &&
+      assert((InVals[i].getValueType() == MVT::f16 ||
+              EVT(Ins[i].VT) == InVals[i].getValueType()) &&
              "LowerFormalArguments emitted a value with the wrong type!");
     }
   });
Index: lib/Target/ARM/ARMCallingConv.td
===================================================================
--- lib/Target/ARM/ARMCallingConv.td
+++ lib/Target/ARM/ARMCallingConv.td
@@ -156,6 +156,8 @@
   // Handles byval parameters.
   CCIfByVal>,
 
+  CCIfType<[f16], CCBitConvertToType>,
+
   // The 'nest' parameter, if any, is passed in R12.
   CCIfNest>,

@@ -187,6 +189,9 @@
   CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>,
   CCIfType<[f32], CCBitConvertToType>,
+
+  CCIfType<[f16], CCBitConvertToType>,
+
   CCDelegateTo
 ]>;

@@ -214,8 +219,8 @@
   CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
   CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
-  CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
-                                 S9, S10, S11, S12, S13, S14, S15]>>,
+  CCIfType<[f16, f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
+                                      S9, S10, S11, S12, S13, S14, S15]>>,
   CCDelegateTo
 ]>;

@@ -232,8 +237,8 @@
   CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
   CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
-  CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
-                                 S9, S10, S11, S12, S13, S14, S15]>>,
+  CCIfType<[f16, f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
+                                      S9, S10, S11, S12, S13, S14, S15]>>,
   CCDelegateTo
 ]>;

Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -530,6 +530,83 @@
     addRegisterClass(MVT::f64, &ARM::DPRRegClass);
   }

+  // The hard float ABI defaults to VFP4, which supports the storage-only
+  // half-precision conversion instructions (they are an extension for VFP3).
+  // We make f16 a legal type for this case, and not only when FullFP16 is
+  // supported (Armv8.2-A), for a few reasons. The AAPCS specifies that half
+  // floats sit in the lower 16 bits of the single-precision registers, while
+  // the upper 16 bits contain unspecified values. By making f16 a legal type,
+  // we avoid early legalization of arguments to f32 types, which would result
+  // in incorrectly interpreting the upper 16 bits. Another reason is that this
+  // avoids the obscure FP16_TO_FP and FP_TO_FP16 nodes. Instead, if
+  // instruction operands need promotion to f32 types, the 'normal' FP_EXTEND
+  // and FP_ROUND nodes will be introduced.
+  if (Subtarget->hasFP16() || Subtarget->hasFullFP16()) {
+    addRegisterClass(MVT::f16, &ARM::HPRRegClass);
+  }
+
+  if (!Subtarget->hasFullFP16()) {
+    setOperationAction(ISD::SELECT, MVT::f16, Promote);
+    setOperationAction(ISD::SELECT_CC, MVT::f16, Promote);
+    setOperationAction(ISD::SETCC, MVT::f16, Promote);
+    setOperationAction(ISD::BR_CC, MVT::f16, Promote);
+    setOperationAction(ISD::FADD, MVT::f16, Promote);
+    setOperationAction(ISD::FSUB, MVT::f16, Promote);
+    setOperationAction(ISD::FMUL, MVT::f16, Promote);
+    setOperationAction(ISD::FDIV, MVT::f16, Promote);
+    setOperationAction(ISD::FREM, MVT::f16, Promote);
+    setOperationAction(ISD::FMA, MVT::f16, Promote);
+    setOperationAction(ISD::FNEG, MVT::f16, Promote);
+    setOperationAction(ISD::FABS, MVT::f16, Promote);
+    setOperationAction(ISD::FCEIL, MVT::f16, Promote);
+    setOperationAction(ISD::FSQRT, MVT::f16, Promote);
+    setOperationAction(ISD::FFLOOR, MVT::f16, Promote);
+    setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote);
+    setOperationAction(ISD::FRINT, MVT::f16, Promote);
+    setOperationAction(ISD::FROUND, MVT::f16, Promote);
+    setOperationAction(ISD::FTRUNC, MVT::f16, Promote);
+    setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
+
+    // When we don't have FullFP16 support, and thus don't have FP16 load/store
+    // instructions, we create half-word integer loads/stores instead.
+    //
+    // Input IR like this, for example:
+    //
+    //   %1 = load i16, i16 * ...
+    //   %2 = tail call float @llvm.convert.from.fp16.f32(i16 ...)
+    //   .. = fadd %2 ..
+    //
+    // gets combined very early to f16 loads when f16 types are legal. So we
+    // custom lower these f16 loads and stores, using integer loads and
+    // stores. This matches the storage-only semantics of __fp16, where
+    // arithmetic is done in single-precision, but results are written back to
+    // half-precision. IR like the example above can be generated by use of
+    // __fp16.
+    setOperationAction(ISD::LOAD, MVT::f16, Custom);
+    setOperationAction(ISD::STORE, MVT::f16, Custom);
+
+    // We unfortunately need an FP_TO_FP16 helper node to create a truncating
+    // i32 -> i16 integer store; this node is cleaned up again later, see
+    // LowerFP_TO_FP16.
+    setOperationAction(ISD::FP_TO_FP16, MVT::i32, Custom);
+
+    // Another case arising from the use of __fp16 is passing halves as i16,
+    // i.e. when function arguments are passed as i16, but converted to f32 or
+    // f64 in the function body; an i16 truncate, f16 bitcast, and an FP_EXTEND
+    // are generated. When f16 is not a legal type, the f16 bitcast is
+    // legalized to FP16_TO_FP. But when f16 is a legal type, this does not
+    // happen, and the truncate results in code generation and stack
+    // loads/stores. We want to avoid this, and custom lower the
+    // truncate/bitcast to FP16_TO_FP.
+    if (Subtarget->isTargetHardFloat())
+      setOperationAction(ISD::BITCAST, MVT::i16, Custom);
+  }
+
+  // Create f2h and h2f conversion EABI libcalls.
+  if (!Subtarget->hasFP16()) {
+    setOperationAction(ISD::FP_EXTEND, MVT::f16, Custom);
+    setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
+    setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
+  }
+
   for (MVT VT : MVT::vector_valuetypes()) {
     for (MVT InnerVT : MVT::vector_valuetypes()) {
       setTruncStoreAction(VT, InnerVT, Expand);
@@ -715,6 +792,7 @@
   setTargetDAGCombine(ISD::FP_TO_UINT);
   setTargetDAGCombine(ISD::FDIV);
   setTargetDAGCombine(ISD::LOAD);
+  setTargetDAGCombine(ISD::BITCAST);
 
   // It is legal to extload from v4i8 to v4i16 or v4i32.
   for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
@@ -3699,7 +3777,9 @@
       } else {
         const TargetRegisterClass *RC;
-        if (RegVT == MVT::f32)
+        if (RegVT == MVT::f16) {
+          RC = &ARM::HPRRegClass;
+        } else if (RegVT == MVT::f32)
           RC = &ARM::SPRRegClass;
         else if (RegVT == MVT::f64)
           RC = &ARM::DPRRegClass;
@@ -3723,6 +3803,12 @@
         default: llvm_unreachable("Unknown loc info!");
         case CCValAssign::Full: break;
         case CCValAssign::BCvt:
+          if (Ins[VA.getValNo()].ArgVT == MVT::f16 &&
+              !Subtarget->isTargetHardFloat()) {
+            ArgValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, ArgValue);
+            ArgValue = DAG.getNode(ISD::BITCAST, dl, MVT::f16, ArgValue);
+            break;
+          }
           ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
           break;
         case CCValAssign::SExt:
@@ -4917,8 +5003,68 @@
   // source or destination of the bit convert.
   EVT SrcVT = Op.getValueType();
   EVT DstVT = N->getValueType(0);
-  assert((SrcVT == MVT::i64 || DstVT == MVT::i64) &&
-         "ExpandBITCAST called for non-i64 type");
+
+  if (SrcVT == MVT::i16 && DstVT == MVT::f16) {
+    // Handle @llvm.convert.from.fp16.f64(i16 %in), which generates a DAG like:
+    //
+    //   t2: i32,ch = CopyFromReg t0, ...
+    //   t3: i16 = truncate t2
+    //   t4: f16 = bitcast t3
+    //   t5: f64 = fp_extend t4
+    //
+    // We want to custom lower the truncate->bitcast->fp_extend pattern to
+    // just an fp16_to_fp node:
+    //
+    //   t2: i32,ch = CopyFromReg t0, Register:i32 %vreg0
+    //   tx: f64 = fp16_to_fp t2
+    //
+    // This avoids generating stack loads/stores for the bitcast node, and
+    // thus just generates a mov and a convert.
+    if (Op.getOpcode() != ISD::TRUNCATE)
+      return SDValue();
+
+    auto FPExtend = N->use_begin();
+    if (N->use_size() != 1 || FPExtend->getOpcode() != ISD::FP_EXTEND)
+      return SDValue();
+
+    SDValue Cvt = DAG.getNode(ISD::FP16_TO_FP, SDLoc(Op),
+                              FPExtend->getValueType(0), Op.getOperand(0));
+    DAG.ReplaceAllUsesWith(*FPExtend, Cvt.getNode());
+    return Cvt;
+  }
+
+  if (SrcVT == MVT::f16 && DstVT == MVT::i16) {
+    // Very similarly, for f64 we want to transform:
+    //
+    //   t2: f64,ch = CopyFromReg t0, Register:f64 %vreg0
+    //   t4: f16 = fp_round t2, TargetConstant:i32<0>
+    //   t5: i16 = bitcast t4
+    //   t6: i32 = any_extend t5
+    //
+    // into:
+    //
+    //   t2: f64,ch = CopyFromReg t0, Register:f64 %vreg0
+    //   t13: i32 = fp_to_fp16 t2
+    //   t15: i32 = and t13, Constant:i32<65535>
+
+    if (Op.getOpcode() != ISD::FP_ROUND)
+      return SDValue();
+
+    auto FPAnyExtend = N->use_begin();
+    if (N->use_size() != 1 || FPAnyExtend->getOpcode() != ISD::ANY_EXTEND)
+      return SDValue();
+
+    SDValue Cvt = DAG.getNode(ISD::FP_TO_FP16, SDLoc(Op),
+                              MVT::i32, Op.getOperand(0));
+    SDValue And = DAG.getNode(ISD::AND, SDLoc(Op), MVT::i32, Cvt,
+                              DAG.getConstant(65535, SDLoc(Op), MVT::i32));
+
+    DAG.ReplaceAllUsesWith(*FPAnyExtend, And.getNode());
+    return And;
+  }
+
+  if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
+    return SDValue();
 
   // Turn i64->f64 into VMOVDRR.
   if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
@@ -7780,10 +7926,198 @@
     return !CI.second.getNode() ? DAG.getRoot() : CI.first;
 }
 
+// This is a cleanup for the (corner) case when a load instruction directly
+// feeds a store. For a load -> store chain, when the f16 store is legalized
+// first, we unfortunately need to introduce a helper FP_TO_FP16 node in order
+// to create a truncating i32 -> i16 integer store; this node models a
+// conversion from a float to an integer type, which allows us to create an
+// integer store. This FP_TO_FP16 needs to be cleaned up though, as it should
+// not lead to any code generation.
+// When it is not a load/store chain, there will be f16 data processing
+// instructions between the loads/stores; the f16 operands of those
+// instructions will have been legalized, and FP_EXTEND and FP_ROUND nodes
+// will have been introduced.
+//
+// We want to transform this:
+//
+//   t12: i32,ch = load t0, t2, undef:i32
+//   t10: i32 = fp_to_fp16 t12
+//   t11: ch = store t12:1, t10, t4, undef:i32
+//
+// into:
+//
+//   t12: i32,ch = load t0, t2, undef:i32
+//   t11: ch = store t12:1, t12, t4, undef:i32
+//
+// so that we just generate LDRH and STRH half-word integer loads/stores.
+//
+static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG,
+                               const ARMSubtarget *Subtarget) {
+  if (!Op.hasOneUse())
+    return SDValue();
+
+  auto Use = Op.getNode()->use_begin();
+  if (Use->getOpcode() != ISD::STORE) {
+    DEBUG(dbgs() << "LowerFP_TO_FP16: use not a store, not cleaning it up\n");
+    return SDValue();
+  }
+
+  SDValue Load = Op.getOperand(0);
+  if (Load.getOpcode() != ISD::LOAD) {
+    DEBUG(dbgs() << "LowerFP_TO_FP16: operand not a load, not cleaning it up\n");
+    return SDValue();
+  }
+
+  DAG.ReplaceAllUsesOfValueWith(Op, Load);
+  return Load;
+}
+
+static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG,
+                          const ARMSubtarget *Subtarget) {
+  assert(!Subtarget->hasFullFP16());
+  SDValue F16Op = Op.getOperand(1);
+  assert(F16Op.getValueType() == MVT::f16);
+  SDNode *N = Op.getNode();
+  StoreSDNode *ST = cast<StoreSDNode>(N);
+
+  DEBUG(dbgs() << "Creating truncating i16 store for: "; F16Op.dump());
+  SDValue Fp2fp16 = DAG.getNode(ISD::FP_TO_FP16, SDLoc(Op), MVT::i32, F16Op);
+  SDValue NewST = DAG.getTruncStore(Op.getOperand(0), SDLoc(Op), Fp2fp16,
+                                    ST->getBasePtr(), MVT::i16,
+                                    ST->getMemOperand());
+  DEBUG(dbgs() << "New i16 store: "; NewST.dump());
+  DAG.ReplaceAllUsesOfValueWith(Op, NewST);
+  return NewST;
+}
+
+static SDNode *IsF16LoadStoreChain(SDNode *N) {
+  assert(N->getOpcode() == ISD::LOAD);
+
+  if (N->getNumValues() != 2) {
+    DEBUG(dbgs() << "expecting 2 values\n");
+    return nullptr;
+  }
+
+  if (N->use_size() != 2)
+    return nullptr;
+
+  // We expect the LD node of a LD->ST chain to have 2 uses:
+  //
+  // 1) the bitcast node, which feeds an extend to i32
+  // 2) the ST node
+
+  bool UseIsAStore = false;
+  bool UseIsABitCastAndExtend = false;
+  SDNode *ZEXT = nullptr;
+
+  for (auto U : N->uses()) {
+    switch (U->getOpcode()) {
+    default: return nullptr;
+    case ISD::STORE:
+      DEBUG(dbgs() << "Found a ST as a use: "; U->dump());
+      UseIsAStore = true;
+      continue;
+    case ISD::BITCAST:
+      DEBUG(dbgs() << "Found a BITCAST as a use: "; U->dump());
+      // Bail out if the bitcast has more uses, because then it is
+      // not a simple LD->ST chain.
+      if (!U->hasOneUse())
+        return nullptr;
+      ZEXT = *U->use_begin();
+      if (ZEXT->getOpcode() != ISD::ZERO_EXTEND)
+        return nullptr;
+      UseIsABitCastAndExtend = true;
+      break;
+    }
+  }
+
+  if (!UseIsAStore || !UseIsABitCastAndExtend)
+    return nullptr;
+
+  if (!ZEXT->hasOneUse())
+    return nullptr;
+
+  return ZEXT;
+}
+
+static SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG,
+                         const ARMSubtarget *Subtarget) {
+  assert(!Subtarget->hasFullFP16() && Op.getValueType() == MVT::f16);
+  DEBUG(dbgs() << "Lowering f16 load, creating an i32 load for: "; Op.dump());
+
+  // Input DAG:
+  //
+  //   tx: f16,ch = LD2
+  //   t.: ch = ST2 tx
+  //
+  // If ST2 is legalized first, a bitcast and extend are introduced to create a
+  // truncating integer store:
+  //
+  //   tx: f16 = LD2
+  //   ty: i16 = bitcast tx
+  //   tz: i32 = zero_extend ty
+  //   t.: ch = ST2 tz
+  //
+  // Now we pick up the LD for legalization, and want to create:
+  //
+  //   tx: i32 = LD2
+  //   t.: ch = ST2 tx, ...
+  //
+  // To achieve this, we need to:
+  //
+  // 1) Create the widening i32 LD,
+  // 2) Be careful how we replace nodes:
+  //    uses of tx expect f16 values, so we can't replace uses of 'tx' with
+  //    the new i32 node: there is an f16 <-> i32 type mismatch. What we need
+  //    to do is replace uses of 'tz' with this new node.
+
+  // 1) Create the new i32 load:
+  SDNode *N = Op.getNode();
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  SDValue NewLD = DAG.getLoad(ISD::UNINDEXED, ISD::ZEXTLOAD, MVT::i32,
+                              SDLoc(Op), Op.getOperand(0), LD->getBasePtr(),
+                              LD->getOffset(), MVT::i16, LD->getMemOperand());
+  DEBUG(dbgs() << "New i32 load: "; NewLD.dump());
+
+  // 2) Fix up the DAG.
+  //
+  // Case I: Load -> FP_EXTEND
+  SDNode *FPExtend = nullptr;
+  for (auto U : N->uses()) {
+    if (U->getOpcode() == ISD::FP_EXTEND)
+      FPExtend = U;
+  }
+
+  if (FPExtend != nullptr && !Subtarget->hasFullFP16()) {
+    DEBUG(dbgs() << "Creating i32 -> f32 bitcast\n");
+    SDValue NewBitcast = DAG.getNode(ISD::FP16_TO_FP, SDLoc(Op),
+                                     FPExtend->getValueType(0), NewLD);
+    DAG.ReplaceAllUsesWith(FPExtend, NewBitcast.getNode());
+    return NewLD;
+  }
+
+  // Case II: Load -> Store
+  SDNode *From = IsF16LoadStoreChain(N);
+  if (From) {
+    // Replace the chain.
+    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLD.getValue(1));
+    // Replace the uses of the i32 zero extend.
+    DAG.ReplaceAllUsesWith(From, NewLD.getNode());
+    return NewLD;
+  }
+
+  // Case III: Load -> ret (copytoreg)
+  //
+  // The load feeds a return node, and we don't need special casing
+  // to fix up the uses of the old load node.
+  return NewLD;
+}
+
 SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   DEBUG(dbgs() << "Lowering node: "; Op.dump());
   switch (Op.getOpcode()) {
   default: llvm_unreachable("Don't know how to custom lower this!");
+  case ISD::LOAD:          return LowerLOAD(Op, DAG, Subtarget);
+  case ISD::STORE:         return LowerSTORE(Op, DAG, Subtarget);
+  case ISD::FP_TO_FP16:    return LowerFP_TO_FP16(Op, DAG, Subtarget);
   case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
   case ISD::ConstantPool:  return LowerConstantPool(Op, DAG);
   case ISD::BlockAddress:  return LowerBlockAddress(Op, DAG);
@@ -13407,8 +13741,8 @@
 }
 
 SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
-  assert(Op.getValueType() == MVT::f64 && Subtarget->isFPOnlySP() &&
-         "Unexpected type for custom-lowering FP_EXTEND");
+  if (Op.getOperand(0).getValueType() == MVT::i32)
+    return SDValue();
 
   RTLIB::Libcall LC;
   LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
@@ -13419,16 +13753,14 @@
 }
 
 SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
-  assert(Op.getOperand(0).getValueType() == MVT::f64 &&
-         Subtarget->isFPOnlySP() &&
-         "Unexpected type for custom-lowering FP_ROUND");
-
   RTLIB::Libcall LC;
   LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
 
   SDValue SrcVal = Op.getOperand(0);
-  return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
-                     SDLoc(Op)).first;
+  SDValue NewNode = makeLibCall(DAG, LC, Op.getValueType(), SrcVal,
+                                /*isSigned*/ false, SDLoc(Op)).first;
+  DEBUG(dbgs() << "New node: "; NewNode.dump());
+  return NewNode;
 }
 
 bool
Index: lib/Target/ARM/ARMInstrVFP.td
===================================================================
--- lib/Target/ARM/ARMInstrVFP.td
+++ lib/Target/ARM/ARMInstrVFP.td
@@ -69,10 +69,19 @@
   let ParserMatchClass = FPImmOperand;
 }
 
+def alignedload16 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() >= 2;
+}]>;
+
 def alignedload32 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
   return cast<LoadSDNode>(N)->getAlignment() >= 4;
 }]>;
 
+def alignedstore16 : PatFrag<(ops node:$val, node:$ptr),
+                             (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() >= 2;
+}]>;
+
 def alignedstore32 : PatFrag<(ops node:$val, node:$ptr),
                              (store node:$val, node:$ptr), [{
   return cast<StoreSDNode>(N)->getAlignment() >= 4;
@@ -113,9 +122,9 @@
   let D = VFPNeonDomain;
 }
 
-def VLDRH : AHI5<0b1101, 0b01, (outs SPR:$Sd), (ins addrmode5fp16:$addr),
+def VLDRH : AHI5<0b1101, 0b01, (outs HPR:$Sd), (ins addrmode5fp16:$addr),
                  IIC_fpLoad16, "vldr", ".16\t$Sd, $addr",
-                 []>,
+                 [(set HPR:$Sd, (alignedload16 addrmode5:$addr))]>,
             Requires<[HasFullFP16]>;
 
 } // End of 'let canFoldAsLoad = 1, isReMaterializable = 1 in'
@@ -132,9 +141,9 @@
   let D = VFPNeonDomain;
 }
 
-def VSTRH : AHI5<0b1101, 0b00, (outs), (ins SPR:$Sd, addrmode5fp16:$addr),
+def VSTRH : AHI5<0b1101, 0b00, (outs), (ins HPR:$Sd, addrmode5fp16:$addr),
                  IIC_fpStore16, "vstr", ".16\t$Sd, $addr",
-                 []>,
+                 [(alignedstore16 HPR:$Sd, addrmode5:$addr)]>,
             Requires<[HasFullFP16]>;
 
 //===----------------------------------------------------------------------===//
@@ -355,9 +364,9 @@
 let TwoOperandAliasConstraint = "$Sn = $Sd" in
 def VADDH  : AHbI<0b11100, 0b11, 0, 0,
-                  (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                  (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
                   IIC_fpALU16, "vadd", ".f16\t$Sd, $Sn, $Sm",
-                  []>,
+                  [(set HPR:$Sd, (fadd HPR:$Sn, HPR:$Sm))]>,
              Sched<[WriteFPALU32]>;
 
 let TwoOperandAliasConstraint = "$Dn = $Dd" in
@@ -380,9 +389,9 @@
 let TwoOperandAliasConstraint = "$Sn = $Sd" in
 def VSUBH  : AHbI<0b11100, 0b11, 1, 0,
-                  (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                  (outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
                   IIC_fpALU16, "vsub", ".f16\t$Sd, $Sn, $Sm",
-                  []>,
+                  [(set HPR:$Sd, (fsub HPR:$Sn, HPR:$Sm))]>,
             Sched<[WriteFPALU32]>;
 
 let TwoOperandAliasConstraint = "$Dn = $Dd" in
@@ -679,16 +688,15 @@
 }
 
 // Between half, single and double-precision. For disassembly only.
-
-def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
+def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins HPR:$Sm),
                  /* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$Sd, $Sm",
-                 [/* For disassembly only; pattern left blank */]>,
+                 [(set SPR:$Sd, (fpextend HPR:$Sm))]>,
                  Requires<[HasFP16]>,
              Sched<[WriteFPCVT]>;
 
-def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
+def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs HPR:$Sd), (ins SPR:$Sm),
                  /* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$Sd, $Sm",
-                 [/* For disassembly only; pattern left blank */]>,
+                 [(set HPR:$Sd, (fpround SPR:$Sm))]>,
                  Requires<[HasFP16]>,
             Sched<[WriteFPCVT]>;
 
@@ -718,9 +726,10 @@
 }
 
 def VCVTBDH : ADuI<0b11101, 0b11, 0b0011, 0b01, 0,
-                   (outs SPR:$Sd), (ins DPR:$Dm),
+                   (outs HPR:$Sd), (ins DPR:$Dm),
                    NoItinerary, "vcvtb", ".f16.f64\t$Sd, $Dm",
-                   []>, Requires<[HasFPARMv8, HasDPVFP]> {
+                   [(set HPR:$Sd, (fpround DPR:$Dm))]>,
+              Requires<[HasFPARMv8, HasDPVFP]> {
   // Instruction operands.
   bits<5> Sd;
   bits<5> Dm;
@@ -759,15 +768,20 @@
     let Inst{5} = Dm{4};
 }
 
+
+// f32 -> f16 conversions
+def : Pat<(fp_to_f16 HPR:$a),
+          (i32 (COPY_TO_REGCLASS HPR:$a, GPR))>;
 def : Pat<(fp_to_f16 SPR:$a),
           (i32 (COPY_TO_REGCLASS (VCVTBSH SPR:$a), GPR))>;
 
-def : Pat<(fp_to_f16 (f64 DPR:$a)),
-          (i32 (COPY_TO_REGCLASS (VCVTBDH DPR:$a), GPR))>;
-
+// f16 -> f32 conversions
 def : Pat<(f16_to_fp GPR:$a),
-          (VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>;
+          (VCVTBHS (COPY_TO_REGCLASS GPR:$a, HPR))>;
 
+// f16 <-> f64 conversions
+def : Pat<(fp_to_f16 (f64 DPR:$a)),
+          (i32 (COPY_TO_REGCLASS (VCVTBDH DPR:$a), GPR))>;
 def : Pat<(f64 (f16_to_fp GPR:$a)),
           (VCVTBHD (COPY_TO_REGCLASS GPR:$a, SPR))>;
 
Index: lib/Target/ARM/ARMRegisterInfo.td
===================================================================
--- lib/Target/ARM/ARMRegisterInfo.td
+++ lib/Target/ARM/ARMRegisterInfo.td
@@ -307,6 +307,23 @@
   let DiagnosticString = "operand must be a register in range [s0, s31]";
 }
 
+// Half-precision (FullFP16) register class. It is exactly the same as the
+// single-precision class, using the same S-registers. Each instruction that
+// generates an FP16 result writes it to the bottom 16 bits of the associated
+// 32-bit floating-point register, and the top 16 bits of that register are
+// written to 0. A different register class is added, as opposed to adding f16
+// to SPR, to avoid modifying and adding type information to the existing
+// selection rules.
+def HPR : RegisterClass<"ARM", [f16], 32, (sequence "S%u", 0, 31)> {
+  let AltOrders = [(add (decimate HPR, 2), SPR),
+                   (add (decimate HPR, 4),
+                        (decimate HPR, 2),
+                        (decimate (rotl HPR, 1), 4),
+                        (decimate (rotl HPR, 1), 2))];
+  let AltOrderSelect = [{
+    return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs(MF);
+  }];
+}
+
 // Subset of SPR which can be used as a source of NEON scalars for 16-bit
 // operations
 def SPR_8 : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 15)> {
Index: lib/Target/ARM/Disassembler/ARMDisassembler.cpp
===================================================================
--- lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -158,6 +158,8 @@
                                            uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo,
                                            uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeHPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                           uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeSPRRegisterClass(MCInst &Inst, unsigned RegNo,
                                            uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo,
                                            uint64_t Address, const void *Decoder);
@@ -182,6 +184,8 @@
                                            uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Val,
                                          uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeHPRRegListOperand(MCInst &Inst, unsigned Val,
+                                            uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeSPRRegListOperand(MCInst &Inst, unsigned Val,
                                             uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeDPRRegListOperand(MCInst &Inst, unsigned Val,
                                             uint64_t Address, const void *Decoder);
@@ -996,6 +1000,11 @@
   return MCDisassembler::Success;
 }
 
+static DecodeStatus DecodeHPRRegisterClass(MCInst &Inst, unsigned RegNo,
+                                           uint64_t Address, const void *Decoder) {
+  return DecodeSPRRegisterClass(Inst, RegNo, Address, Decoder);
+}
+
 static const uint16_t DPRDecoderTable[] = {
     ARM::D0, ARM::D1, ARM::D2, ARM::D3,
     ARM::D4, ARM::D5, ARM::D6, ARM::D7,
@@ -1253,6 +1262,11 @@
   return S;
 }
 
+static DecodeStatus DecodeHPRRegListOperand(MCInst &Inst, unsigned Val,
+                                            uint64_t Address, const void *Decoder) {
+  return DecodeSPRRegListOperand(Inst, Val, Address, Decoder);
+}
+
 static DecodeStatus DecodeDPRRegListOperand(MCInst &Inst, unsigned Val,
                                             uint64_t Address, const void *Decoder) {
   DecodeStatus S = MCDisassembler::Success;
Index: test/CodeGen/ARM/fp16-instructions.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/fp16-instructions.ll
@@ -0,0 +1,51 @@
+; RUN: llc < %s -mtriple=arm-none-eabi -float-abi=soft | FileCheck %s --check-prefix=CHECK-SOFT
+; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+vfp4 -float-abi=hard | FileCheck %s --check-prefix=CHECK-FP16
+; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+neon,+fullfp16 -float-abi=hard | FileCheck %s --check-prefix=CHECK-FULLFP16
+
+define half @Sub(half %a, half %b) local_unnamed_addr {
+entry:
+;CHECK-SOFT-LABEL: Sub:
+;CHECK-SOFT: bl __aeabi_h2f
+;CHECK-SOFT: bl __aeabi_h2f
+;CHECK-SOFT: bl __aeabi_fsub
+;CHECK-SOFT: bl __aeabi_f2h
+
+;CHECK-FP16-LABEL: Sub:
+;CHECK-FP16:      vcvtb.f32.f16 s2, s1
+;CHECK-FP16-NEXT: vcvtb.f32.f16 s0, s0
+;CHECK-FP16-NEXT: vsub.f32 s0, s0, s2
+;CHECK-FP16-NEXT: vcvtb.f16.f32 s0, s0
+;CHECK-FP16-NEXT: mov pc, lr
+
+;CHECK-FULLFP16-LABEL: Sub:
+;CHECK-FULLFP16:      vsub.f16 s0, s0, s1
+;CHECK-FULLFP16-NEXT: mov pc, lr
+
+  %sub = fsub half %a, %b
+  ret half %sub
+}
+
+define half @Add(half %a, half %b) local_unnamed_addr {
+entry:
+;CHECK-SOFT-LABEL: Add:
+;CHECK-SOFT: bl __aeabi_h2f
+;CHECK-SOFT: bl __aeabi_h2f
+;CHECK-SOFT: bl __aeabi_fadd
+;CHECK-SOFT: bl __aeabi_f2h
+
+;CHECK-FP16-LABEL: Add:
+;CHECK-FP16:      vcvtb.f32.f16 s2, s1
+;CHECK-FP16-NEXT: vcvtb.f32.f16 s0, s0
+;CHECK-FP16-NEXT: vadd.f32 s0, s0, s2
+;CHECK-FP16-NEXT: vcvtb.f16.f32 s0, s0
+;CHECK-FP16-NEXT: mov pc, lr
+
+;CHECK-FULLFP16-LABEL: Add:
+;CHECK-FULLFP16:      vadd.f16 s0, s0, s1
+;CHECK-FULLFP16-NEXT: mov pc, lr
+
+  %add = fadd half %a, %b
+  ret half %add
+}
+
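Not part of the patch, just for illustration: two minimal IR sketches of the cases the new custom lowering targets. The first is the load -> store chain handled by LowerLOAD/LowerSTORE/LowerFP_TO_FP16 (per the comments above, without FullFP16 this should come out as plain half-word integer loads/stores rather than h2f/f2h conversions); the second is the __fp16-passed-as-i16 pattern that the new i16 BITCAST custom lowering rewrites to fp16_to_fp. The function names and the RUN line below are illustrative assumptions, not taken from the patch or its tests.

; Illustrative sketch only -- assumed RUN line mirroring the CHECK-FP16
; configuration above.
; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+vfp4 -float-abi=hard | FileCheck %s

; Case 1: a half load that directly feeds a half store (the LD->ST chain
; recognised by IsF16LoadStoreChain); per the comments in LowerLOAD/LowerSTORE,
; this is expected to become an ldrh/strh pair.
define void @CopyHalf(half* %src, half* %dst) {
entry:
  %val = load half, half* %src
  store half %val, half* %dst
  ret void
}

; Case 2: __fp16 passed as i16 and widened in the function body, which
; produces the truncate/bitcast/fp_extend pattern discussed in ExpandBITCAST.
declare double @llvm.convert.from.fp16.f64(i16)

define double @HalfArgToDouble(i16 %h) {
entry:
  %d = call double @llvm.convert.from.fp16.f64(i16 %h)
  ret double %d
}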