diff --git a/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics-constrained.c b/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics-constrained.c
--- a/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics-constrained.c
+++ b/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics-constrained.c
@@ -105,7 +105,7 @@
 // COMMONIR: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
 // UNCONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[TMP4]], <8 x half> [[LANE]], <8 x half> [[TMP5]])
 // CONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[TMP4]], <8 x half> [[LANE]], <8 x half> [[TMP5]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+// CHECK-ASM: fmla v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.h[{{[0-9]+}}]
 // COMMONIR: ret <8 x half> [[FMLA]]
 float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) {
   return vfmaq_lane_f16(a, b, c, 3);
@@ -213,7 +213,6 @@
 
 // COMMON-LABEL: test_vfmsq_lane_f16
 // COMMONIR: [[SUB:%.*]] = fneg <8 x half> %b
-// CHECK-ASM: fneg v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 // COMMONIR: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
 // COMMONIR: [[TMP1:%.*]] = bitcast <8 x half> [[SUB]] to <16 x i8>
 // COMMONIR: [[TMP2:%.*]] = bitcast <4 x half> %c to <8 x i8>
@@ -223,7 +222,7 @@
 // COMMONIR: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
 // UNCONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[TMP4]], <8 x half> [[LANE]], <8 x half> [[TMP5]])
 // CONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[TMP4]], <8 x half> [[LANE]], <8 x half> [[TMP5]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+// CHECK-ASM: fmls v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.h[{{[0-9]+}}]
 // COMMONIR: ret <8 x half> [[FMLA]]
 float16x8_t test_vfmsq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) {
   return vfmsq_lane_f16(a, b, c, 3);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -8052,6 +8052,15 @@
 }
 
 multiclass SIMDFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> {
+  let Predicates = [HasNEON, HasFullFP16] in {
+  // 1 variant for the .8h version: DUPLANE from 128-bit.
+  def : Pat<(v8f16 (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn),
+                           (v8f16 (AArch64duplane16 (v8f16 V128_lo:$Rm),
+                                                    VectorIndexH:$idx)))),
+            (!cast<Instruction>(INST # "v8i16_indexed")
+                V128:$Rd, V128:$Rn, V128_lo:$Rm, VectorIndexH:$idx)>;
+  } // Predicates = [HasNEON, HasFullFP16]
+
   // 2 variants for the .2s version: DUPLANE from 128-bit and DUP scalar.
  def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
                           (AArch64duplane32 (v4f32 V128:$Rm),
diff --git a/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll b/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
--- a/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
+++ b/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
@@ -29,8 +29,7 @@
 ; CHECK-NEXT: .cfi_startproc
 ; CHECK-NEXT: // %bb.0: // %entry
 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT: dup v2.8h, v2.h[0]
-; CHECK-NEXT: fmla v0.8h, v2.8h, v1.8h
+; CHECK-NEXT: fmla v0.8h, v1.8h, v2.h[0]
 ; CHECK-NEXT: ret
 entry:
   %lane1 = shufflevector <4 x half> %c, <4 x half> undef, <8 x i32> zeroinitializer
@@ -57,8 +56,7 @@
 ; CHECK: .Lt_vfmaq_laneq_f16$local:
 ; CHECK-NEXT: .cfi_startproc
 ; CHECK-NEXT: // %bb.0: // %entry
-; CHECK-NEXT: dup v2.8h, v2.h[0]
-; CHECK-NEXT: fmla v0.8h, v1.8h, v2.8h
+; CHECK-NEXT: fmla v0.8h, v1.8h, v2.h[0]
 ; CHECK-NEXT: ret
 entry:
   %lane1 = shufflevector <8 x half> %c, <8 x half> undef, <8 x i32> zeroinitializer
@@ -148,9 +146,7 @@
 ; CHECK-NEXT: .cfi_startproc
 ; CHECK-NEXT: // %bb.0: // %entry
 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT: fneg v1.8h, v1.8h
-; CHECK-NEXT: dup v2.8h, v2.h[0]
-; CHECK-NEXT: fmla v0.8h, v2.8h, v1.8h
+; CHECK-NEXT: fmls v0.8h, v1.8h, v2.h[0]
 ; CHECK-NEXT: ret
 entry:
   %sub = fsub <8 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
@@ -179,8 +175,7 @@
 ; CHECK: .Lt_vfmsq_laneq_f16$local:
 ; CHECK-NEXT: .cfi_startproc
 ; CHECK-NEXT: // %bb.0: // %entry
-; CHECK-NEXT: dup v2.8h, v2.h[0]
-; CHECK-NEXT: fmls v0.8h, v2.8h, v1.8h
+; CHECK-NEXT: fmls v0.8h, v1.8h, v2.h[0]
 ; CHECK-NEXT: ret
 entry:
   %sub = fsub <8 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, %b
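
For reviewers who want to reproduce the codegen change outside the lit tests, the following minimal standalone C sketch exercises the same intrinsics as the updated clang test. It is not part of the patch; the file name and exact compile command are illustrative, but the intrinsics, the lane index, and the expected instructions match the CHECK lines above. With the new .8h indexed pattern, the splatted lane operand should be selected as "fmla/fmls vD.8h, vN.8h, vM.h[3]" instead of a separate dup (and fneg) followed by the vector-by-vector form.

// fmla_lane_demo.c -- illustrative only; build with something like:
//   clang --target=aarch64-linux-gnu -march=armv8.2-a+fp16 -O2 -S fmla_lane_demo.c
#include <arm_neon.h>

// Per lane: d + n * c[3]. With this patch it should lower to a single
// "fmla v0.8h, v1.8h, v2.h[3]".
float16x8_t fma_by_lane(float16x8_t d, float16x8_t n, float16x4_t c) {
  return vfmaq_lane_f16(d, n, c, 3);
}

// Per lane: d - n * c[3]. The negation folds into
// "fmls v0.8h, v1.8h, v2.h[3]" rather than emitting fneg + dup + fmla.
float16x8_t fms_by_lane(float16x8_t d, float16x8_t n, float16x4_t c) {
  return vfmsq_lane_f16(d, n, c, 3);
}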