Index: llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -205,6 +205,7 @@
   bool tryBitfieldInsertOp(SDNode *N);
   bool tryBitfieldInsertInZeroOp(SDNode *N);
   bool tryShiftAmountMod(SDNode *N);
+  bool tryHighFPExt(SDNode *N);
 
   bool tryReadRegister(SDNode *N);
   bool tryWriteRegister(SDNode *N);
@@ -1803,6 +1804,35 @@
   return true;
 }
 
+/// Try to form fcvtl2 instructions from a floating-point extend of the high
+/// half of a 128-bit vector. Returns true if \p N was selected, else false.
+bool AArch64DAGToDAGISel::tryHighFPExt(SDNode *N) {
+  assert(N->getOpcode() == ISD::FP_EXTEND && "Expected an FP_EXTEND node");
+  // There are 2 forms of fcvtl2 - extend to double or extend to float.
+  SDValue Extract = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+  EVT NarrowVT = Extract.getValueType();
+  if ((VT != MVT::v2f64 || NarrowVT != MVT::v2f32) &&
+      (VT != MVT::v4f32 || NarrowVT != MVT::v4f16))
+    return false;
+
+  // Optionally look past a bitcast.
+  Extract = peekThroughBitcasts(Extract);
+  if (Extract.getOpcode() != ISD::EXTRACT_SUBVECTOR)
+    return false;
+
+  // fcvtl2 reads a full 128-bit register: require a 128-bit source and an
+  // extract that begins at its high half (v8i16 -> v4i16 begins at index 4).
+  uint64_t ExtractIndex = Extract.getConstantOperandVal(1);
+  if (!Extract.getOperand(0).getValueType().is128BitVector() ||
+      ExtractIndex != Extract.getValueType().getVectorNumElements())
+    return false;
+
+  unsigned Opc = VT == MVT::v2f64 ? AArch64::FCVTLv4i32 : AArch64::FCVTLv8i16;
+  CurDAG->SelectNodeTo(N, Opc, VT, Extract.getOperand(0));
+  return true;
+}
+
 static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc,
                                 SDValue &Opd0, unsigned &Immr, unsigned &Imms,
                                 unsigned NumberOfIgnoredLowBits = 0,
@@ -3010,6 +3040,11 @@
       return;
     break;
 
+  case ISD::FP_EXTEND:
+    if (tryHighFPExt(Node))
+      return;
+    break;
+
   case ISD::OR:
     if (tryBitfieldInsertOp(Node))
       return;
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -3634,14 +3634,8 @@
                                                               (i64 4)))),
           (FCVTLv8i16 V128:$Rn)>;
 def : Pat<(v2f64 (fpextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>;
-def : Pat<(v2f64 (fpextend (v2f32 (extract_subvector (v4f32 V128:$Rn),
-                                                    (i64 2))))),
-          (FCVTLv4i32 V128:$Rn)>;
 
 def : Pat<(v4f32 (fpextend (v4f16 V64:$Rn))), (FCVTLv4i16 V64:$Rn)>;
-def : Pat<(v4f32 (fpextend (v4f16 (extract_subvector (v8f16 V128:$Rn),
-                                                    (i64 4))))),
-          (FCVTLv8i16 V128:$Rn)>;
 
 defm FCVTMS : SIMDTwoVectorFPToInt<0,0,0b11011, "fcvtms",int_aarch64_neon_fcvtms>;
 defm FCVTMU : SIMDTwoVectorFPToInt<1,0,0b11011, "fcvtmu",int_aarch64_neon_fcvtmu>;
Index: llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll
+++ llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll
@@ -39,21 +39,14 @@
 }
 
 define <2 x double> @test_vcvt_high_v1f64_f32_bitcast(<4 x float> %x) nounwind readnone ssp {
-; GENERIC-LABEL: test_vcvt_high_v1f64_f32_bitcast:
-; GENERIC:       // %bb.0:
-; GENERIC-NEXT:    ext.16b v0, v0, v0, #8
-; GENERIC-NEXT:    fcvtl v0.2d, v0.2s
-; GENERIC-NEXT:    ret
-;
-; FAST-LABEL: test_vcvt_high_v1f64_f32_bitcast:
-; FAST:       // %bb.0:
-; FAST-NEXT:    fcvtl2 v0.2d, v0.4s
-; FAST-NEXT:    ret
+; CHECK-LABEL: test_vcvt_high_v1f64_f32_bitcast:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtl2 v0.2d, v0.4s
+; CHECK-NEXT:    ret
 ;
 ; GISEL-LABEL: test_vcvt_high_v1f64_f32_bitcast:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    ext.16b v0, v0, v0, #8
-; GISEL-NEXT:    fcvtl v0.2d, v0.2s
+; GISEL-NEXT:    fcvtl2 v0.2d, v0.4s
 ; GISEL-NEXT:    ret
   %bc1 = bitcast <4 x float> %x to <2 x double>
   %ext = shufflevector <2 x double> %bc1, <2 x double> undef, <1 x i32> <i32 1>
@@ -63,23 +56,14 @@
 }
 
 define <2 x double> @test_vcvt_high_v1i64_f32_bitcast(<2 x i64> %x) nounwind readnone ssp {
-; GENERIC-LABEL: test_vcvt_high_v1i64_f32_bitcast:
-; GENERIC:       // %bb.0:
-; GENERIC-NEXT:    ext.16b v0, v0, v0, #8
-; GENERIC-NEXT:    fcvtl v0.2d, v0.2s
-; GENERIC-NEXT:    ret
-;
-; FAST-LABEL: test_vcvt_high_v1i64_f32_bitcast:
-; FAST:       // %bb.0:
-; FAST-NEXT:    ext.16b v0, v0, v0, #8
-; FAST-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; FAST-NEXT:    fcvtl v0.2d, v0.2s
-; FAST-NEXT:    ret
+; CHECK-LABEL: test_vcvt_high_v1i64_f32_bitcast:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtl2 v0.2d, v0.4s
+; CHECK-NEXT:    ret
 ;
 ; GISEL-LABEL: test_vcvt_high_v1i64_f32_bitcast:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    ext.16b v0, v0, v0, #8
-; GISEL-NEXT:    fcvtl v0.2d, v0.2s
+; GISEL-NEXT:    fcvtl2 v0.2d, v0.4s
 ; GISEL-NEXT:    ret
   %ext = shufflevector <2 x i64> %x, <2 x i64> undef, <1 x i32> <i32 1>
   %bc2 = bitcast <1 x i64> %ext to <2 x float>
@@ -88,23 +72,14 @@
 }
 
 define <2 x double> @test_vcvt_high_v2i32_f32_bitcast(<4 x i32> %x) nounwind readnone ssp {
-; GENERIC-LABEL: test_vcvt_high_v2i32_f32_bitcast:
-; GENERIC:       // %bb.0:
-; GENERIC-NEXT:    ext.16b v0, v0, v0, #8
-; GENERIC-NEXT:    fcvtl v0.2d, v0.2s
-; GENERIC-NEXT:    ret
-;
-; FAST-LABEL: test_vcvt_high_v2i32_f32_bitcast:
-; FAST:       // %bb.0:
-; FAST-NEXT:    ext.16b v0, v0, v0, #8
-; FAST-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; FAST-NEXT:    fcvtl v0.2d, v0.2s
-; FAST-NEXT:    ret
+; CHECK-LABEL: test_vcvt_high_v2i32_f32_bitcast:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtl2 v0.2d, v0.4s
+; CHECK-NEXT:    ret
 ;
 ; GISEL-LABEL: test_vcvt_high_v2i32_f32_bitcast:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    ext.16b v0, v0, v0, #8
-; GISEL-NEXT:    fcvtl v0.2d, v0.2s
+; GISEL-NEXT:    fcvtl2 v0.2d, v0.4s
 ; GISEL-NEXT:    ret
   %ext = shufflevector <4 x i32> %x, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %bc2 = bitcast <2 x i32> %ext to <2 x float>
@@ -113,23 +88,14 @@
 }
 
 define <2 x double> @test_vcvt_high_v4i16_f32_bitcast(<8 x i16> %x) nounwind readnone ssp {
-; GENERIC-LABEL: test_vcvt_high_v4i16_f32_bitcast:
-; GENERIC:       // %bb.0:
-; GENERIC-NEXT:    ext.16b v0, v0, v0, #8
-; GENERIC-NEXT:    fcvtl v0.2d, v0.2s
-; GENERIC-NEXT:    ret
-;
-; FAST-LABEL: test_vcvt_high_v4i16_f32_bitcast:
-; FAST:       // %bb.0:
-; FAST-NEXT:    ext.16b v0, v0, v0, #8
-; FAST-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; FAST-NEXT:    fcvtl v0.2d, v0.2s
-; FAST-NEXT:    ret
+; CHECK-LABEL: test_vcvt_high_v4i16_f32_bitcast:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtl2 v0.2d, v0.4s
+; CHECK-NEXT:    ret
 ;
 ; GISEL-LABEL: test_vcvt_high_v4i16_f32_bitcast:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    ext.16b v0, v0, v0, #8
-; GISEL-NEXT:    fcvtl v0.2d, v0.2s
+; GISEL-NEXT:    fcvtl2 v0.2d, v0.4s
 ; GISEL-NEXT:    ret
   %ext = shufflevector <8 x i16> %x, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %bc2 = bitcast <4 x i16> %ext to <2 x float>
@@ -138,23 +104,14 @@
 }
 
 define <2 x double> @test_vcvt_high_v8i8_f32_bitcast(<16 x i8> %x) nounwind readnone ssp {
-; GENERIC-LABEL: test_vcvt_high_v8i8_f32_bitcast:
-; GENERIC:       // %bb.0:
-; GENERIC-NEXT:    ext.16b v0, v0, v0, #8
-; GENERIC-NEXT:    fcvtl v0.2d, v0.2s
-; GENERIC-NEXT:    ret
-;
-; FAST-LABEL: test_vcvt_high_v8i8_f32_bitcast:
-; FAST:       // %bb.0:
-; FAST-NEXT:    ext.16b v0, v0, v0, #8
-; FAST-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; FAST-NEXT:    fcvtl v0.2d, v0.2s
-; FAST-NEXT:    ret
+; CHECK-LABEL: test_vcvt_high_v8i8_f32_bitcast:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtl2 v0.2d, v0.4s
+; CHECK-NEXT:    ret
 ;
 ; GISEL-LABEL: test_vcvt_high_v8i8_f32_bitcast:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    ext.16b v0, v0, v0, #8
-; GISEL-NEXT:    fcvtl v0.2d, v0.2s
+; GISEL-NEXT:    fcvtl2 v0.2d, v0.4s
 ; GISEL-NEXT:    ret
   %ext = shufflevector <16 x i8> %x, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %bc2 = bitcast <8 x i8> %ext to <2 x float>
@@ -163,23 +120,14 @@
 }
 
 define <4 x float> @test_vcvt_high_v1i64_f16_bitcast(<2 x i64> %x) nounwind readnone ssp {
-; GENERIC-LABEL: test_vcvt_high_v1i64_f16_bitcast:
-; GENERIC:       // %bb.0:
-; GENERIC-NEXT:    ext.16b v0, v0, v0, #8
-; GENERIC-NEXT:    fcvtl v0.4s, v0.4h
-; GENERIC-NEXT:    ret
-;
-; FAST-LABEL: test_vcvt_high_v1i64_f16_bitcast:
-; FAST:       // %bb.0:
-; FAST-NEXT:    ext.16b v0, v0, v0, #8
-; FAST-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; FAST-NEXT:    fcvtl v0.4s, v0.4h
-; FAST-NEXT:    ret
+; CHECK-LABEL: test_vcvt_high_v1i64_f16_bitcast:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-NEXT:    ret
 ;
 ; GISEL-LABEL: test_vcvt_high_v1i64_f16_bitcast:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    ext.16b v0, v0, v0, #8
-; GISEL-NEXT:    fcvtl v0.4s, v0.4h
+; GISEL-NEXT:    fcvtl2 v0.4s, v0.8h
 ; GISEL-NEXT:    ret
   %ext = shufflevector <2 x i64> %x, <2 x i64> undef, <1 x i32> <i32 1>
   %bc2 = bitcast <1 x i64> %ext to <4 x half>
@@ -188,23 +136,14 @@
 }
 
 define <4 x float> @test_vcvt_high_v2i32_f16_bitcast(<4 x i32> %x) nounwind readnone ssp {
-; GENERIC-LABEL: test_vcvt_high_v2i32_f16_bitcast:
-; GENERIC:       // %bb.0:
-; GENERIC-NEXT:    ext.16b v0, v0, v0, #8
-; GENERIC-NEXT:    fcvtl v0.4s, v0.4h
-; GENERIC-NEXT:    ret
-;
-; FAST-LABEL: test_vcvt_high_v2i32_f16_bitcast:
-; FAST:       // %bb.0:
-; FAST-NEXT:    ext.16b v0, v0, v0, #8
-; FAST-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; FAST-NEXT:    fcvtl v0.4s, v0.4h
-; FAST-NEXT:    ret
+; CHECK-LABEL: test_vcvt_high_v2i32_f16_bitcast:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-NEXT:    ret
 ;
 ; GISEL-LABEL: test_vcvt_high_v2i32_f16_bitcast:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    ext.16b v0, v0, v0, #8
-; GISEL-NEXT:    fcvtl v0.4s, v0.4h
+; GISEL-NEXT:    fcvtl2 v0.4s, v0.8h
 ; GISEL-NEXT:    ret
   %ext = shufflevector <4 x i32> %x, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %bc2 = bitcast <2 x i32> %ext to <4 x half>
@@ -213,23 +152,14 @@
 }
 
 define <4 x float> @test_vcvt_high_v4i16_f16_bitcast(<8 x i16> %x) nounwind readnone ssp {
-; GENERIC-LABEL: test_vcvt_high_v4i16_f16_bitcast:
-; GENERIC:       // %bb.0:
-; GENERIC-NEXT:    ext.16b v0, v0, v0, #8
-; GENERIC-NEXT:    fcvtl v0.4s, v0.4h
-; GENERIC-NEXT:    ret
-;
-; FAST-LABEL: test_vcvt_high_v4i16_f16_bitcast:
-; FAST:       // %bb.0:
-; FAST-NEXT:    ext.16b v0, v0, v0, #8
-; FAST-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; FAST-NEXT:    fcvtl v0.4s, v0.4h
-; FAST-NEXT:    ret
+; CHECK-LABEL: test_vcvt_high_v4i16_f16_bitcast:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-NEXT:    ret
 ;
 ; GISEL-LABEL: test_vcvt_high_v4i16_f16_bitcast:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    ext.16b v0, v0, v0, #8
-; GISEL-NEXT:    fcvtl v0.4s, v0.4h
+; GISEL-NEXT:    fcvtl2 v0.4s, v0.8h
 ; GISEL-NEXT:    ret
   %ext = shufflevector <8 x i16> %x, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %bc2 = bitcast <4 x i16> %ext to <4 x half>
@@ -238,23 +168,14 @@
 }
 
 define <4 x float> @test_vcvt_high_v8i8_f16_bitcast(<16 x i8> %x) nounwind readnone ssp {
-; GENERIC-LABEL: test_vcvt_high_v8i8_f16_bitcast:
-; GENERIC:       // %bb.0:
-; GENERIC-NEXT:    ext.16b v0, v0, v0, #8
-; GENERIC-NEXT:    fcvtl v0.4s, v0.4h
-; GENERIC-NEXT:    ret
-;
-; FAST-LABEL: test_vcvt_high_v8i8_f16_bitcast:
-; FAST:       // %bb.0:
-; FAST-NEXT:    ext.16b v0, v0, v0, #8
-; FAST-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; FAST-NEXT:    fcvtl v0.4s, v0.4h
-; FAST-NEXT:    ret
+; CHECK-LABEL: test_vcvt_high_v8i8_f16_bitcast:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtl2 v0.4s, v0.8h
+; CHECK-NEXT:    ret
 ;
 ; GISEL-LABEL: test_vcvt_high_v8i8_f16_bitcast:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    ext.16b v0, v0, v0, #8
-; GISEL-NEXT:    fcvtl v0.4s, v0.4h
+; GISEL-NEXT:    fcvtl2 v0.4s, v0.8h
 ; GISEL-NEXT:    ret
   %ext = shufflevector <16 x i8> %x, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %bc2 = bitcast <8 x i8> %ext to <4 x half>