Index: clang/include/clang/Basic/arm_neon.td
===================================================================
--- clang/include/clang/Basic/arm_neon.td
+++ clang/include/clang/Basic/arm_neon.td
@@ -1499,11 +1499,9 @@
   def VMULX_LANEH : IOpInst<"vmulx_lane", "ddgi", "hQh", OP_MULX_LN>;
   def VMULX_LANEQH : IOpInst<"vmulx_laneq", "ddji", "hQh", OP_MULX_LN>;
   def VMULX_NH : IOpInst<"vmulx_n", "dds", "hQh", OP_MULX_N>;
-  // TODO: Scalar floating point multiply extended (scalar, by element)
-  // Below ones are commented out because they need vmulx_f16(float16_t, float16_t)
-  // which will be implemented later with fp16 scalar intrinsic (arm_fp16.h)
-  //def SCALAR_FMULX_LANEH : IOpInst<"vmulx_lane", "ssdi", "Sh", OP_SCALAR_MUL_LN>;
-  //def SCALAR_FMULX_LANEQH : IOpInst<"vmulx_laneq", "ssji", "Sh", OP_SCALAR_MUL_LN>;
+  // Scalar floating point mulx (scalar, by element)
+  def SCALAR_FMULX_LANEH : IInst<"vmulx_lane", "ssdi", "Sh">;
+  def SCALAR_FMULX_LANEQH : IInst<"vmulx_laneq", "ssji", "Sh">;
 
   // ARMv8.2-A FP16 reduction vector intrinsics.
   def VMAXVH : SInst<"vmaxv", "sd", "hQh">;
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -7238,6 +7238,16 @@
     Int = Intrinsic::aarch64_neon_fmulx;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vmulx");
   }
+  case NEON::BI__builtin_neon_vmulxh_lane_f16:
+  case NEON::BI__builtin_neon_vmulxh_laneq_f16: {
+    // vmulx_lane should be mapped to Neon scalar mulx after
+    // extracting the scalar element
+    Ops.push_back(EmitScalarExpr(E->getArg(2)));
+    Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2], "extract");
+    Ops.pop_back();
+    Int = Intrinsic::aarch64_neon_fmulx;
+    return EmitNeonCall(CGM.getIntrinsic(Int, HalfTy), Ops, "vmulx");
+  }
   case NEON::BI__builtin_neon_vmul_lane_v:
   case NEON::BI__builtin_neon_vmul_laneq_v: {
     // v1f64 vmul_lane should be mapped to Neon scalar mul lane
Index: clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
===================================================================
--- clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
+++ clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
@@ -1223,27 +1223,25 @@
   return vmulxq_n_f16(a, b);
 }
 
-/* TODO: Not implemented yet (needs scalar intrinsic from arm_fp16.h)
-// CCHECK-LABEL: test_vmulxh_lane_f16
-// CCHECK: [[CONV0:%.*]] = fpext half %a to float
-// CCHECK: [[CONV1:%.*]] = fpext half %{{.*}} to float
-// CCHECK: [[MUL:%.*]] = fmul float [[CONV0:%.*]], [[CONV0:%.*]]
-// CCHECK: [[CONV3:%.*]] = fptrunc float %mul to half
-// CCHECK: ret half [[CONV3:%.*]]
+// CHECK-LABEL: test_vmulxh_lane_f16
+// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %b to <8 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK: [[EXTR:%.*]] = extractelement <4 x half> [[TMP1]], i32 3
+// CHECK: [[MULX:%.*]] = call half @llvm.aarch64.neon.fmulx.f16(half %a, half [[EXTR]]
+// CHECK: ret half [[MULX]]
 float16_t test_vmulxh_lane_f16(float16_t a, float16x4_t b) {
   return vmulxh_lane_f16(a, b, 3);
 }
 
-// CCHECK-LABEL: test_vmulxh_laneq_f16
-// CCHECK: [[CONV0:%.*]] = fpext half %a to float
-// CCHECK: [[CONV1:%.*]] = fpext half %{{.*}} to float
-// CCHECK: [[MUL:%.*]] = fmul float [[CONV0:%.*]], [[CONV0:%.*]]
-// CCHECK: [[CONV3:%.*]] = fptrunc float %mul to half
-// CCHECK: ret half [[CONV3:%.*]]
+// CHECK-LABEL: test_vmulxh_laneq_f16
+// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %b to <16 x i8>
+// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK: [[EXTR:%.*]] = extractelement <8 x half> [[TMP1]], i32 7
+// CHECK: [[MULX:%.*]] = call half @llvm.aarch64.neon.fmulx.f16(half %a, half [[EXTR]])
+// CHECK: ret half [[MULX]]
 float16_t test_vmulxh_laneq_f16(float16_t a, float16x8_t b) {
   return vmulxh_laneq_f16(a, b, 7);
 }
-*/
 
 // CHECK-LABEL: test_vmaxv_f16
 // CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
Index: llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
===================================================================
--- llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
+++ llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
@@ -1,5 +1,6 @@
 ; RUN: llc < %s -mtriple=aarch64-eabi -mattr=+v8.2a,+fullfp16 | FileCheck %s
 
+declare half @llvm.aarch64.neon.fmulx.f16(half, half)
 declare <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half>, <4 x half>)
 declare <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half>, <8 x half>)
 declare <4 x half> @llvm.fma.v4f16(<4 x half>, <4 x half>, <4 x half>)
@@ -236,6 +237,25 @@
   ret half %1
 }
 
+define dso_local half @t_vmulx_f16(half %a, half %b) {
+; CHECK-LABEL: t_vmulx_f16:
+; CHECK: fmulx h0, h0, h1
+; CHECK-NEXT: ret
+entry:
+  %fmulx.i = tail call half @llvm.aarch64.neon.fmulx.f16(half %a, half %b)
+  ret half %fmulx.i
+}
+
+define dso_local half @t_vmulxh_lane_f16(half %a, <4 x half> %b, i32 %lane) {
+; CHECK-LABEL: t_vmulxh_lane_f16:
+; CHECK: fmulx h0, h0, v1.h[3]
+; CHECK-NEXT: ret
+entry:
+  %extract = extractelement <4 x half> %b, i32 3
+  %fmulx.i = tail call half @llvm.aarch64.neon.fmulx.f16(half %a, half %extract)
+  ret half %fmulx.i
+}
+
 define dso_local <4 x half> @t_vmulx_lane_f16(<4 x half> %a, <4 x half> %b, i32 %lane) {
 ; CHECK-LABEL: t_vmulx_lane_f16:
 ; CHECK: fmulx v0.4h, v0.4h, v1.h[0]
@@ -276,6 +296,16 @@
   ret <8 x half> %vmulx2.i
 }
 
+define dso_local half @t_vmulxh_laneq_f16(half %a, <8 x half> %b, i32 %lane) {
+; CHECK-LABEL: t_vmulxh_laneq_f16:
+; CHECK: fmulx h0, h0, v1.h[7]
+; CHECK-NEXT: ret
+entry:
+  %extract = extractelement <8 x half> %b, i32 7
+  %fmulx.i = tail call half @llvm.aarch64.neon.fmulx.f16(half %a, half %extract)
+  ret half %fmulx.i
+}
+
 define dso_local <4 x half> @t_vmulx_n_f16(<4 x half> %a, half %c) {
 ; CHECK-LABEL: t_vmulx_n_f16:
 ; CHECK: dup v1.4h, v1.h[0]