Index: include/clang/Basic/arm_neon.td
===================================================================
--- include/clang/Basic/arm_neon.td
+++ include/clang/Basic/arm_neon.td
@@ -206,6 +206,15 @@
     : Op<(call "vdot", $p0, $p1,
           (bitcast $p1, (splat(bitcast "uint32x4_t", $p2), $p3)))>;
 
+def OP_FMLAL_LN     : Op<(call "vfmlal_low", $p0, $p1,
+                           (dup_typed $p1, (call "vget_lane", $p2, $p3)))>;
+def OP_FMLSL_LN     : Op<(call "vfmlsl_low", $p0, $p1,
+                           (dup_typed $p1, (call "vget_lane", $p2, $p3)))>;
+def OP_FMLAL_LN_Hi  : Op<(call "vfmlal_high", $p0, $p1,
+                           (dup_typed $p1, (call "vget_lane", $p2, $p3)))>;
+def OP_FMLSL_LN_Hi  : Op<(call "vfmlsl_high", $p0, $p1,
+                           (dup_typed $p1, (call "vget_lane", $p2, $p3)))>;
+
 //===----------------------------------------------------------------------===//
 // Instructions
 //===----------------------------------------------------------------------===//
@@ -1640,3 +1649,21 @@
   // Variants indexing into a 128-bit vector are A64 only.
   def UDOT_LANEQ : SOpInst<"vdot_laneq", "dd89i", "iUiQiQUi", OP_DOT_LNQ>;
 }
+
+// v8.2-A FP16 fused multiply-add long instructions.
+let ArchGuard = "defined(__ARM_FEATURE_FP16FML) && defined(__aarch64__)" in {
+  def VFMLAL_LOW  : SInst<"vfmlal_low", "ffHH", "UiQUi">;
+  def VFMLSL_LOW  : SInst<"vfmlsl_low", "ffHH", "UiQUi">;
+  def VFMLAL_HIGH : SInst<"vfmlal_high", "ffHH", "UiQUi">;
+  def VFMLSL_HIGH : SInst<"vfmlsl_high", "ffHH", "UiQUi">;
+
+  def VFMLAL_LANE_LOW  : SOpInst<"vfmlal_lane_low", "ffH0i", "UiQUi", OP_FMLAL_LN>;
+  def VFMLSL_LANE_LOW  : SOpInst<"vfmlsl_lane_low", "ffH0i", "UiQUi", OP_FMLSL_LN>;
+  def VFMLAL_LANE_HIGH : SOpInst<"vfmlal_lane_high", "ffH0i", "UiQUi", OP_FMLAL_LN_Hi>;
+  def VFMLSL_LANE_HIGH : SOpInst<"vfmlsl_lane_high", "ffH0i", "UiQUi", OP_FMLSL_LN_Hi>;
+
+  def VFMLAL_LANEQ_LOW  : SOpInst<"vfmlal_laneq_low", "ffH1i", "UiQUi", OP_FMLAL_LN>;
+  def VFMLSL_LANEQ_LOW  : SOpInst<"vfmlsl_laneq_low", "ffH1i", "UiQUi", OP_FMLSL_LN>;
+  def VFMLAL_LANEQ_HIGH : SOpInst<"vfmlal_laneq_high", "ffH1i", "UiQUi", OP_FMLAL_LN_Hi>;
+  def VFMLSL_LANEQ_HIGH : SOpInst<"vfmlsl_laneq_high", "ffH1i", "UiQUi", OP_FMLSL_LN_Hi>;
+}
Index: include/clang/Basic/arm_neon_incl.td
===================================================================
--- include/clang/Basic/arm_neon_incl.td
+++ include/clang/Basic/arm_neon_incl.td
@@ -96,6 +96,11 @@
 // example: (dup $p1) -> "(uint32x2_t) {__p1, __p1}" (assuming the base type
 //          is uint32x2_t).
 def dup;
+// dup_typed - Take a vector and a scalar argument, and create a new vector of
+//             the same type by duplicating the scalar value into all lanes.
+// example: (dup_typed $p1, $p2) -> "(float16x4_t) {__p2, __p2, __p2, __p2}"
+//          (assuming __p1 is float16x4_t, and __p2 is a compatible scalar).
+def dup_typed;
 // splat - Take a vector and a lane index, and return a vector of the same type
 //         containing repeated instances of the source vector at the lane index.
 // example: (splat $p0, $p1) ->
@@ -229,6 +234,8 @@
 // f: float (int args)
 // F: double (int args)
 // H: half (int args)
+// 0: half (int args), ignore 'Q' size modifier.
+// 1: half (int args), force 'Q' size modifier.
 // d: default
 // g: default, ignore 'Q' size modifier.
 // j: default, force 'Q' size modifier.
Index: lib/Basic/Targets/AArch64.h
===================================================================
--- lib/Basic/Targets/AArch64.h
+++ lib/Basic/Targets/AArch64.h
@@ -34,6 +34,7 @@
   unsigned Unaligned;
   unsigned HasFullFP16;
   unsigned HasDotProd;
+  unsigned HasFP16FML;
   llvm::AArch64::ArchKind ArchKind;
 
   static const Builtin::Info BuiltinInfo[];
Index: lib/Basic/Targets/AArch64.cpp
===================================================================
--- lib/Basic/Targets/AArch64.cpp
+++ lib/Basic/Targets/AArch64.cpp
@@ -194,6 +194,9 @@
   if (HasDotProd)
     Builder.defineMacro("__ARM_FEATURE_DOTPROD", "1");
 
+  if ((FPU & NeonMode) && HasFP16FML)
+    Builder.defineMacro("__ARM_FEATURE_FP16FML", "1");
+
   switch (ArchKind) {
   default:
     break;
@@ -231,6 +234,7 @@
   Unaligned = 1;
   HasFullFP16 = 0;
   HasDotProd = 0;
+  HasFP16FML = 0;
   ArchKind = llvm::AArch64::ArchKind::ARMV8A;
 
   for (const auto &Feature : Features) {
@@ -252,6 +256,8 @@
       HasFullFP16 = 1;
     if (Feature == "+dotprod")
       HasDotProd = 1;
+    if (Feature == "+fp16fml")
+      HasFP16FML = 1;
   }
 
   setDataLayout();
Index: lib/CodeGen/CGBuiltin.cpp
===================================================================
--- lib/CodeGen/CGBuiltin.cpp
+++ lib/CodeGen/CGBuiltin.cpp
@@ -4368,6 +4368,14 @@
   NEONMAP0(vextq_v),
   NEONMAP0(vfma_v),
   NEONMAP0(vfmaq_v),
+  NEONMAP1(vfmlal_high_v, aarch64_neon_fmlal2, 0),
+  NEONMAP1(vfmlal_low_v, aarch64_neon_fmlal, 0),
+  NEONMAP1(vfmlalq_high_v, aarch64_neon_fmlal2, 0),
+  NEONMAP1(vfmlalq_low_v, aarch64_neon_fmlal, 0),
+  NEONMAP1(vfmlsl_high_v, aarch64_neon_fmlsl2, 0),
+  NEONMAP1(vfmlsl_low_v, aarch64_neon_fmlsl, 0),
+  NEONMAP1(vfmlslq_high_v, aarch64_neon_fmlsl2, 0),
+  NEONMAP1(vfmlslq_low_v, aarch64_neon_fmlsl, 0),
   NEONMAP2(vhadd_v, aarch64_neon_uhadd, aarch64_neon_shadd, Add1ArgType | UnsignedAlts),
   NEONMAP2(vhaddq_v, aarch64_neon_uhadd, aarch64_neon_shadd, Add1ArgType | UnsignedAlts),
   NEONMAP2(vhsub_v, aarch64_neon_uhsub, aarch64_neon_shsub, Add1ArgType | UnsignedAlts),
@@ -5341,6 +5349,34 @@
     Int = Usgn ? LLVMIntrinsic : AltLLVMIntrinsic;
     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vdot");
   }
+  case NEON::BI__builtin_neon_vfmlal_low_v:
+  case NEON::BI__builtin_neon_vfmlalq_low_v: {
+    llvm::Type *InputTy =
+        llvm::VectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
+    llvm::Type *Tys[2] = { Ty, InputTy };
+    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlal_low");
+  }
+  case NEON::BI__builtin_neon_vfmlsl_low_v:
+  case NEON::BI__builtin_neon_vfmlslq_low_v: {
+    llvm::Type *InputTy =
+        llvm::VectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
+    llvm::Type *Tys[2] = { Ty, InputTy };
+    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlsl_low");
+  }
+  case NEON::BI__builtin_neon_vfmlal_high_v:
+  case NEON::BI__builtin_neon_vfmlalq_high_v: {
+    llvm::Type *InputTy =
+           llvm::VectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
+    llvm::Type *Tys[2] = { Ty, InputTy };
+    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlal_high");
+  }
+  case NEON::BI__builtin_neon_vfmlsl_high_v:
+  case NEON::BI__builtin_neon_vfmlslq_high_v: {
+    llvm::Type *InputTy =
+           llvm::VectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
+    llvm::Type *Tys[2] = { Ty, InputTy };
+    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlsl_high");
+  }
   }
 
   assert(Int && "Expected valid intrinsic number");
Index: test/CodeGen/aarch64-neon-fp16fml.c
===================================================================
--- /dev/null
+++ test/CodeGen/aarch64-neon-fp16fml.c
@@ -0,0 +1,196 @@
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +v8.2a -target-feature +neon -target-feature +fp16fml \
+// RUN: -fallow-half-arguments-and-returns -disable-O0-optnone -emit-llvm -o - %s | opt -S -instcombine | FileCheck %s
+
+// REQUIRES: aarch64-registered-target
+
+// Test AArch64 Armv8.2-A FP16 Fused Multiply-Add Long intrinsics
+
+#include <arm_neon.h>
+
+// Vector form
+
+float32x2_t test_vfmlal_low_u32(float32x2_t a, float16x4_t b, float16x4_t c) {
+// CHECK-LABEL: define <2 x float> @test_vfmlal_low_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c)
+// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> %c)
+// CHECK: ret <2 x float> [[RESULT]]
+  return vfmlal_low_u32(a, b, c);
+}
+
+float32x2_t test_vfmlsl_low_u32(float32x2_t a, float16x4_t b, float16x4_t c) {
+// CHECK-LABEL: define <2 x float> @test_vfmlsl_low_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c)
+// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> %c)
+// CHECK: ret <2 x float> [[RESULT]]
+  return vfmlsl_low_u32(a, b, c);
+}
+
+float32x2_t test_vfmlal_high_u32(float32x2_t a, float16x4_t b, float16x4_t c) {
+// CHECK-LABEL: define <2 x float> @test_vfmlal_high_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c)
+// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> %c)
+// CHECK: ret <2 x float> [[RESULT]]
+  return vfmlal_high_u32(a, b, c);
+}
+
+float32x2_t test_vfmlsl_high_u32(float32x2_t a, float16x4_t b, float16x4_t c) {
+// CHECK-LABEL: define <2 x float> @test_vfmlsl_high_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c)
+// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> %c)
+// CHECK: ret <2 x float> [[RESULT]]
+  return vfmlsl_high_u32(a, b, c);
+}
+
+float32x4_t test_vfmlalq_low_u32(float32x4_t a, float16x8_t b, float16x8_t c) {
+// CHECK-LABEL: define <4 x float> @test_vfmlalq_low_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c)
+// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> %c)
+// CHECK: ret <4 x float> [[RESULT]]
+  return vfmlalq_low_u32(a, b, c);
+}
+
+float32x4_t test_vfmlslq_low_u32(float32x4_t a, float16x8_t b, float16x8_t c) {
+// CHECK-LABEL: define <4 x float> @test_vfmlslq_low_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c)
+// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> %c)
+// CHECK: ret <4 x float> [[RESULT]]
+  return vfmlslq_low_u32(a, b, c);
+}
+
+float32x4_t test_vfmlalq_high_u32(float32x4_t a, float16x8_t b, float16x8_t c) {
+// CHECK-LABEL: define <4 x float> @test_vfmlalq_high_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c)
+// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> %c)
+// CHECK: ret <4 x float> [[RESULT]]
+  return vfmlalq_high_u32(a, b, c);
+}
+
+float32x4_t test_vfmlslq_high_u32(float32x4_t a, float16x8_t b, float16x8_t c) {
+// CHECK-LABEL: define <4 x float> @test_vfmlslq_high_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c)
+// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> %c)
+// CHECK: ret <4 x float> [[RESULT]]
+  return vfmlslq_high_u32(a, b, c);
+}
+
+// Indexed form
+
+float32x2_t test_vfmlal_lane_low_u32(float32x2_t a, float16x4_t b, float16x4_t c) {
+// CHECK-LABEL: define <2 x float> @test_vfmlal_lane_low_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <4 x i32> zeroinitializer
+// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])
+// CHECK: ret <2 x float> [[RESULT]]
+  return vfmlal_lane_low_u32(a, b, c, 0);
+}
+
+float32x2_t test_vfmlal_lane_high_u32(float32x2_t a, float16x4_t b, float16x4_t c) {
+// CHECK-LABEL: define <2 x float> @test_vfmlal_lane_high_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])
+// CHECK: ret <2 x float> [[RESULT]]
+  return vfmlal_lane_high_u32(a, b, c, 1);
+}
+
+float32x4_t test_vfmlalq_lane_low_u32(float32x4_t a, float16x8_t b, float16x4_t c) {
+// CHECK-LABEL: define <4 x float> @test_vfmlalq_lane_low_u32(<4 x float> %a, <8 x half> %b, <4 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])
+// CHECK: ret <4 x float> [[RESULT]]
+  return vfmlalq_lane_low_u32(a, b, c, 2);
+}
+
+float32x4_t test_vfmlalq_lane_high_u32(float32x4_t a, float16x8_t b, float16x4_t c) {
+// CHECK-LABEL: define <4 x float> @test_vfmlalq_lane_high_u32(<4 x float> %a, <8 x half> %b, <4 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])
+// CHECK: ret <4 x float> [[RESULT]]
+  return vfmlalq_lane_high_u32(a, b, c, 3);
+}
+
+float32x2_t test_vfmlal_laneq_low_u32(float32x2_t a, float16x4_t b, float16x8_t c) {
+// CHECK-LABEL: define <2 x float> @test_vfmlal_laneq_low_u32(<2 x float> %a, <4 x half> %b, <8 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
+// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])
+// CHECK: ret <2 x float> [[RESULT]]
+  return vfmlal_laneq_low_u32(a, b, c, 4);
+}
+
+float32x2_t test_vfmlal_laneq_high_u32(float32x2_t a, float16x4_t b, float16x8_t c) {
+// CHECK-LABEL: define <2 x float> @test_vfmlal_laneq_high_u32(<2 x float> %a, <4 x half> %b, <8 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
+// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])
+// CHECK: ret <2 x float> [[RESULT]]
+  return vfmlal_laneq_high_u32(a, b, c, 5);
+}
+
+float32x4_t test_vfmlalq_laneq_low_u32(float32x4_t a, float16x8_t b, float16x8_t c) {
+// CHECK-LABEL: define <4 x float> @test_vfmlalq_laneq_low_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
+// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])
+// CHECK: ret <4 x float> [[RESULT]]
+  return vfmlalq_laneq_low_u32(a, b, c, 6);
+}
+
+float32x4_t test_vfmlalq_laneq_high_u32(float32x4_t a, float16x8_t b, float16x8_t c) {
+// CHECK-LABEL: define <4 x float> @test_vfmlalq_laneq_high_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])
+// CHECK: ret <4 x float> [[RESULT]]
+  return vfmlalq_laneq_high_u32(a, b, c, 7);
+}
+
+float32x2_t test_vfmlsl_lane_low_u32(float32x2_t a, float16x4_t b, float16x4_t c) {
+// CHECK-LABEL: define <2 x float> @test_vfmlsl_lane_low_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <4 x i32> zeroinitializer
+// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])
+// CHECK: ret <2 x float> [[RESULT]]
+  return vfmlsl_lane_low_u32(a, b, c, 0);
+}
+
+float32x2_t test_vfmlsl_lane_high_u32(float32x2_t a, float16x4_t b, float16x4_t c) {
+// CHECK-LABEL: define <2 x float> @test_vfmlsl_lane_high_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])
+// CHECK: ret <2 x float> [[RESULT]]
+  return vfmlsl_lane_high_u32(a, b, c, 1);
+}
+
+float32x4_t test_vfmlslq_lane_low_u32(float32x4_t a, float16x8_t b, float16x4_t c) {
+// CHECK-LABEL: define <4 x float> @test_vfmlslq_lane_low_u32(<4 x float> %a, <8 x half> %b, <4 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])
+// CHECK: ret <4 x float> [[RESULT]]
+  return vfmlslq_lane_low_u32(a, b, c, 2);
+}
+
+float32x4_t test_vfmlslq_lane_high_u32(float32x4_t a, float16x8_t b, float16x4_t c) {
+// CHECK-LABEL: define <4 x float> @test_vfmlslq_lane_high_u32(<4 x float> %a, <8 x half> %b, <4 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])
+// CHECK: ret <4 x float> [[RESULT]]
+  return vfmlslq_lane_high_u32(a, b, c, 3);
+}
+
+float32x2_t test_vfmlsl_laneq_low_u32(float32x2_t a, float16x4_t b, float16x8_t c) {
+// CHECK-LABEL: define <2 x float> @test_vfmlsl_laneq_low_u32(<2 x float> %a, <4 x half> %b, <8 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
+// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])
+// CHECK: ret <2 x float> [[RESULT]]
+  return vfmlsl_laneq_low_u32(a, b, c, 4);
+}
+
+float32x2_t test_vfmlsl_laneq_high_u32(float32x2_t a, float16x4_t b, float16x8_t c) {
+// CHECK-LABEL: define <2 x float> @test_vfmlsl_laneq_high_u32(<2 x float> %a, <4 x half> %b, <8 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
+// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])
+// CHECK: ret <2 x float> [[RESULT]]
+  return vfmlsl_laneq_high_u32(a, b, c, 5);
+}
+
+float32x4_t test_vfmlslq_laneq_low_u32(float32x4_t a, float16x8_t b, float16x8_t c) {
+// CHECK-LABEL: define <4 x float> @test_vfmlslq_laneq_low_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
+// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])
+// CHECK: ret <4 x float> [[RESULT]]
+  return vfmlslq_laneq_low_u32(a, b, c, 6);
+}
+
+float32x4_t test_vfmlslq_laneq_high_u32(float32x4_t a, float16x8_t b, float16x8_t c) {
+// CHECK-LABEL: define <4 x float> @test_vfmlslq_laneq_high_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])
+// CHECK: ret <4 x float> [[RESULT]]
+  return vfmlslq_laneq_high_u32(a, b, c, 7);
+}
Index: test/Preprocessor/aarch64-target-features.c
===================================================================
--- test/Preprocessor/aarch64-target-features.c
+++ test/Preprocessor/aarch64-target-features.c
@@ -93,16 +93,20 @@
 // RUN: %clang -target aarch64-none-linux-gnu -march=armv8.2a+dotprod -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-DOTPROD %s
 // CHECK-DOTPROD: __ARM_FEATURE_DOTPROD 1
 
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.2-a+nofp16fml+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.2-a+nofp16+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.2-a+fp16+nofp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8-a+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8-a+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+nofp16fml+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+nofp16+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16+nofp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// On ARMv8.2-A and above, +fp16fml implies +fp16.
+// On ARMv8.4-A and above, +fp16 implies +fp16fml.
+// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.2-a+nofp16fml+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.2-a+nofp16+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-FML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.2-a+fp16+nofp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8-a+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-FML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8-a+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+nofp16fml+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-FML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+nofp16+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-FML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16+nofp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-FML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-FML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// CHECK-FULLFP16-FML:           #define __ARM_FEATURE_FP16FML 1
+// CHECK-FULLFP16-NOFML-NOT:     #define __ARM_FEATURE_FP16FML 1
 // CHECK-FULLFP16-VECTOR-SCALAR: #define __ARM_FEATURE_FP16_SCALAR_ARITHMETIC 1
 // CHECK-FULLFP16-VECTOR-SCALAR: #define __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1
 // CHECK-FULLFP16-VECTOR-SCALAR: #define __ARM_FP 0xE
@@ -114,6 +118,7 @@
 // RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8-a+fp16+nosimd -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-SCALAR %s
 // RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16fml+nosimd -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-SCALAR %s
 // RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16+nosimd -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-SCALAR %s
+// CHECK-FULLFP16-SCALAR-NOT:   #define __ARM_FEATURE_FP16FML 1
 // CHECK-FULLFP16-SCALAR:       #define __ARM_FEATURE_FP16_SCALAR_ARITHMETIC 1
 // CHECK-FULLFP16-SCALAR-NOT:   #define __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1
 // CHECK-FULLFP16-SCALAR:       #define __ARM_FP 0xE
@@ -127,10 +132,11 @@
 // RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+nofp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML-VECTOR-SCALAR %s
 // RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+nofp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML-VECTOR-SCALAR %s
 // RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16fml+nofp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML-VECTOR-SCALAR %s
+// CHECK-FULLFP16-NOFML-VECTOR-SCALAR-NOT: #define __ARM_FEATURE_FP16FML 1
 // CHECK-FULLFP16-NOFML-VECTOR-SCALAR-NOT: #define __ARM_FEATURE_FP16_SCALAR_ARITHMETIC 1
 // CHECK-FULLFP16-NOFML-VECTOR-SCALAR-NOT: #define __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1
-// CHECK-FULLFP16-NOFML-VECTOR-SCALAR: #define __ARM_FP 0xE
-// CHECK-FULLFP16-NOFML-VECTOR-SCALAR: #define __ARM_FP16_FORMAT_IEEE 1
+// CHECK-FULLFP16-NOFML-VECTOR-SCALAR:     #define __ARM_FP 0xE
+// CHECK-FULLFP16-NOFML-VECTOR-SCALAR:     #define __ARM_FP16_FORMAT_IEEE 1
 
 // ================== Check whether -mtune accepts mixed-case features.
 // RUN: %clang -target aarch64 -mtune=CYCLONE -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MTUNE-CYCLONE %s
Index: utils/TableGen/NeonEmitter.cpp
===================================================================
--- utils/TableGen/NeonEmitter.cpp
+++ utils/TableGen/NeonEmitter.cpp
@@ -494,6 +494,7 @@
     std::pair<Type, std::string> emitDagSaveTemp(DagInit *DI);
     std::pair<Type, std::string> emitDagSplat(DagInit *DI);
     std::pair<Type, std::string> emitDagDup(DagInit *DI);
+    std::pair<Type, std::string> emitDagDupTyped(DagInit *DI);
     std::pair<Type, std::string> emitDagShuffle(DagInit *DI);
     std::pair<Type, std::string> emitDagCast(DagInit *DI, bool IsBitCast);
     std::pair<Type, std::string> emitDagCall(DagInit *DI);
@@ -897,6 +898,18 @@
     Float = true;
     ElementBitwidth = 16;
     break;
+  case '0':
+    Float = true;
+    if (AppliedQuad)
+      Bitwidth /= 2;
+    ElementBitwidth = 16;
+    break;
+  case '1':
+    Float = true;
+    if (!AppliedQuad)
+      Bitwidth *= 2;
+    ElementBitwidth = 16;
+    break;
   case 'g':
     if (AppliedQuad)
       Bitwidth /= 2;
@@ -1507,6 +1520,8 @@
     return emitDagShuffle(DI);
   if (Op == "dup")
     return emitDagDup(DI);
+  if (Op == "dup_typed")
+    return emitDagDupTyped(DI);
   if (Op == "splat")
     return emitDagSplat(DI);
   if (Op == "save_temp")
@@ -1771,6 +1786,28 @@
   return std::make_pair(T, S);
 }
 
+std::pair<Type, std::string> Intrinsic::DagEmitter::emitDagDupTyped(DagInit *DI) {
+  assert_with_loc(DI->getNumArgs() == 2, "dup_typed() expects two arguments");
+  std::pair<Type, std::string> A = emitDagArg(DI->getArg(0),
+                                              DI->getArgNameStr(0));
+  std::pair<Type, std::string> B = emitDagArg(DI->getArg(1),
+                                              DI->getArgNameStr(1));
+  assert_with_loc(B.first.isScalar(),
+                  "dup_typed() requires a scalar as the second argument");
+
+  Type T = A.first;
+  assert_with_loc(T.isVector(), "dup_typed() used but target type is scalar!");
+  std::string S = "(" + T.str() + ") {";
+  for (unsigned I = 0; I < T.getNumElements(); ++I) {
+    if (I != 0)
+      S += ", ";
+    S += B.second;
+  }
+  S += "}";
+
+  return std::make_pair(T, S);
+}
+
 std::pair<Type, std::string> Intrinsic::DagEmitter::emitDagSplat(DagInit *DI) {
   assert_with_loc(DI->getNumArgs() == 2, "splat() expects two arguments");
   std::pair<Type, std::string> A = emitDagArg(DI->getArg(0),