Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -524,9 +524,8 @@
 
   if (Subtarget->hasFullFP16()) {
     addRegisterClass(MVT::f16, &ARM::HPRRegClass);
-    // Clean up bitcast of incoming arguments if hard float abi is enabled.
-    if (Subtarget->isTargetHardFloat())
-      setOperationAction(ISD::BITCAST, MVT::i16, Custom);
+    setOperationAction(ISD::BITCAST, MVT::i16, Custom);
+    setOperationAction(ISD::BITCAST, MVT::f16, Custom);
   }
 
   for (MVT VT : MVT::vector_valuetypes()) {
@@ -5061,38 +5060,78 @@
   EVT SrcVT = Op.getValueType();
   EVT DstVT = N->getValueType(0);
 
-  // Half-precision arguments can be passed in like this:
-  //
-  //   t4: f32,ch = CopyFromReg t0, Register:f32 %1
-  //     t8: i32 = bitcast t4
-  //   t9: i16 = truncate t8
-  //   t10: f16 = bitcast t9    <~~~~ SDNode N
-  //
-  // but we want to avoid code generation for the bitcast, so transform this
-  // into:
-  //
-  //   t18: f16 = CopyFromReg t0, Register:f32 %0
-  //
+
+  // Half-precision arguments: avoid stack stores/loads
   if (SrcVT == MVT::i16 && DstVT == MVT::f16) {
     if (Op.getOpcode() != ISD::TRUNCATE)
       return SDValue();
 
+    // Transform this:
+    //
+    //   t4: f32,ch = CopyFromReg t0, Register:f32 %1
+    //     t8: i32 = bitcast t4
+    //   t9: i16 = truncate t8    <~~~~ Op
+    //   t10: f16 = bitcast t9    <~~~~ SDNode N
+    //
+    // into an f16 copy from reg:
+    //
+    //   t18: f16 = CopyFromReg t0, Register:f32 %0
+    //
     SDValue Bitcast = Op.getOperand(0);
-    if (Bitcast.getOpcode() != ISD::BITCAST ||
-        Bitcast.getValueType() != MVT::i32)
-      return SDValue();
+    if (Bitcast.getOpcode() == ISD::BITCAST &&
+        Bitcast.getValueType() == MVT::i32) {
+
+      SDValue Copy = Bitcast.getOperand(0);
+      if (Copy.getOpcode() != ISD::CopyFromReg ||
+          Copy.getValueType() != MVT::f32)
        return SDValue();
 
-    SDValue Copy = Bitcast.getOperand(0);
-    if (Copy.getOpcode() != ISD::CopyFromReg ||
-        Copy.getValueType() != MVT::f32)
+      SDValue Ops[] = { Copy->getOperand(0), Copy->getOperand(1) };
+      return DAG.getNode(ISD::CopyFromReg, SDLoc(Copy), MVT::f16, Ops);
+    }
+
+    // And for FullFP16 we can have this:
+    //
+    //   t5: i32,ch = CopyFromReg t0, Register:i32 %1
+    //   t9: i16 = truncate t5    <~~~~ Op
+    //   t10: f16 = bitcast t9    <~~~~ SDNode N
+    //   t11: f16 = fadd t8, t10
+    //
+    SDValue Copy = Op.getOperand(0);
+    if (Copy.getOpcode() == ISD::CopyFromReg &&
+        Copy.getValueType() == MVT::i32) {
+      // We use FP16_TO_FP just to model a GPR -> HPR move
+      return DAG.getNode(ISD::FP16_TO_FP, SDLoc(Op),
+                         MVT::f32, Op.getOperand(0));
+    }
+    return SDValue();
+  }
+
+  // Half-precision return values: avoid stack stores/loads
+  if (SrcVT == MVT::f16 && DstVT == MVT::i16) {
+    //
+    //   t11: f16 = fadd t8, t10
+    //   t12: i16 = bitcast t11      <~~~ SDNode N
+    //   t13: i32 = zero_extend t12
+    //   t16: ch,glue = CopyToReg t0, Register:i32 %r0, t13
+    //
+    auto ZeroExtend = N->use_begin();
+    if (N->use_size() != 1 || ZeroExtend->getOpcode() != ISD::ZERO_EXTEND ||
+        ZeroExtend->getValueType(0) != MVT::i32)
       return SDValue();
 
-    SDValue Ops[] = { Copy->getOperand(0), Copy->getOperand(1) };
-    return DAG.getNode(ISD::CopyFromReg, SDLoc(Copy), MVT::f16, Ops);
+    auto Copy = ZeroExtend->use_begin();
+    if (Copy->getOpcode() == ISD::CopyToReg) {
+      // We use FP_TO_FP16 just to model a HPR -> GPR move
+      SDValue Cvt = DAG.getNode(ISD::FP_TO_FP16, SDLoc(Op), MVT::i32, Op);
+      DAG.ReplaceAllUsesWith(*ZeroExtend, &Cvt);
+      return Cvt;
+    }
+    return SDValue();
   }
 
-  assert((SrcVT == MVT::i64 || DstVT == MVT::i64) &&
-         "ExpandBITCAST called for non-i64 type");
+  if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
+    return SDValue();
 
   // Turn i64->f64 into VMOVDRR.
   if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
Index: lib/Target/ARM/ARMInstrVFP.td
===================================================================
--- lib/Target/ARM/ARMInstrVFP.td
+++ lib/Target/ARM/ARMInstrVFP.td
@@ -750,6 +750,13 @@
   let Inst{5} = Dm{4};
 }
 
+let Predicates = [HasFullFP16] in {
+  def : Pat<(f16_to_fp GPR:$a),
+            (f32 (COPY_TO_REGCLASS GPR:$a, HPR))>;
+  def : Pat<(fp_to_f16 HPR:$a),
+            (i32 (COPY_TO_REGCLASS HPR:$a, GPR))>;
+}
+
 def : Pat<(fp_to_f16 SPR:$a),
           (i32 (COPY_TO_REGCLASS (VCVTBSH SPR:$a), GPR))>;
 
Index: test/CodeGen/ARM/fp16-instructions.ll
===================================================================
--- test/CodeGen/ARM/fp16-instructions.ll
+++ test/CodeGen/ARM/fp16-instructions.ll
@@ -43,14 +43,11 @@
 ; CHECK-SOFTFP-FP16:  vcvtb.f16.f32 [[S0]], [[S0]]
 ; CHECK-SOFTFP-FP16:  vmov r0, s0
 
-; CHECK-SOFTFP-FULLFP16:       strh r1, {{.*}}
-; CHECK-SOFTFP-FULLFP16:       strh r0, {{.*}}
-; CHECK-SOFTFP-FULLFP16:       vldr.16 [[S0:s[0-9]]], {{.*}}
-; CHECK-SOFTFP-FULLFP16:       vldr.16 [[S2:s[0-9]]], {{.*}}
-; CHECK-SOFTFP-FULLFP16:       vadd.f16 [[S0]], [[S2]], [[S0]]
-; CHECK-SOFTFP-FULLFP16:       vstr.16 [[S2:s[0-9]]], {{.*}}
-; CHECK-SOFTFP-FULLFP16:       ldrh r0, {{.*}}
-; CHECK-SOFTFP-FULLFP16:       mov pc, lr
+; CHECK-SOFTFP-FULLFP16:       vmov [[S0:s[0-9]]], r1
+; CHECK-SOFTFP-FULLFP16:       vmov [[S2:s[0-9]]], r0
+; CHECK-SOFTFP-FULLFP16:       vadd.f16 [[S0]], [[S2]], [[S0]]
+; CHECK-SOFTFP-FULLFP16-NEXT:  vmov r0, s0
+; CHECK-SOFTFP-FULLFP16-NEXT:  mov pc, lr
 
 ; CHECK-HARDFP-VFP3: vmov r{{.}}, s0
 ; CHECK-HARDFP-VFP3: vmov{{.*}}, s1
@@ -69,4 +66,3 @@
 ; CHECK-HARDFP-FULLFP16-NEXT:  mov pc, lr
 }
 
-
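
Note: as a reference for the CHECK lines above, IR along the lines of the
function below exercises both new paths in ExpandBITCAST (a sketch only;
the actual function bodies in fp16-instructions.ll are not shown in this
diff). With a soft-float ABI and -mattr=+fullfp16, the two half arguments
arrive in r0/r1 as integer values and the result is returned through r0,
which is what produces the truncate/bitcast chain on the arguments and the
bitcast/zero_extend chain on the return value:

  ; Hypothetical test body; the real function names in the test may differ.
  define half @Add(half %a, half %b) {
  entry:
    %add = fadd half %a, %b
    ret half %add
  }

With this patch those chains are modelled as plain GPR<->HPR moves (via
FP16_TO_FP/FP_TO_FP16 and the new COPY_TO_REGCLASS patterns), so the
updated CHECK-SOFTFP-FULLFP16 lines expect vmov instead of the old
strh/vldr/vstr/ldrh stack traffic.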