Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -874,6 +874,12 @@
     }
     setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
     setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
+
+    for (MVT VT : MVT::fp_scalable_vector_valuetypes()) {
+      if (isTypeLegal(VT)) {
+        setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
+      }
+    }
   }
 
   PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
@@ -7443,14 +7449,6 @@
   // Extend input splat value where needed to fit into a GPR (32b or 64b only)
   // FPRs don't have this restriction.
   switch (ElemVT.getSimpleVT().SimpleTy) {
-  case MVT::i8:
-  case MVT::i16:
-  case MVT::i32:
-    SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i32);
-    return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal);
-  case MVT::i64:
-    SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64);
-    return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal);
   case MVT::i1: {
     // The general case of i1.  There isn't any natural way to do this,
     // so we use some trickery with whilelo.
@@ -7463,13 +7461,24 @@
     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID,
                        DAG.getConstant(0, dl, MVT::i64), SplatVal);
   }
-  // TODO: we can support float types, but haven't added patterns yet.
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+    SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i32);
+    break;
+  case MVT::i64:
+    SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64);
+    break;
   case MVT::f16:
   case MVT::f32:
   case MVT::f64:
+    // Fine as is
+    break;
   default:
     report_fatal_error("Unsupported SPLAT_VECTOR input operand type");
   }
+
+  return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal);
 }
 
 static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
Index: llvm/lib/Target/AArch64/AArch64InstrFormats.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -66,6 +66,7 @@
   dag InOperandList  = iops;
   let Pattern        = pattern;
   let isCodeGenOnly  = 1;
+  let isPseudo       = 1;
 }
 
 // Real instructions (have encoding information)
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -1474,14 +1474,48 @@
 }
 
 bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+  unsigned Opcode = MI.getOpcode();
+
+  if ((Opcode == AArch64::DUP_ZV_H) ||
+      (Opcode == AArch64::DUP_ZV_S) ||
+      (Opcode == AArch64::DUP_ZV_D)) {
+    auto RC = &AArch64::ZPRRegClass;
+    unsigned Dst = MI.getOperand(0).getReg();
+    unsigned Src = MI.getOperand(1).getReg();
+    bool SrcIsKill = MI.getOperand(1).isKill();
+
+    unsigned NewOpcode;
+    unsigned NewSrc;
+    switch (Opcode) {
+    case AArch64::DUP_ZV_H:
+      NewOpcode = AArch64::DUP_ZZI_H;
+      NewSrc = TRI->getMatchingSuperReg(Src, AArch64::hsub, RC);
+      break;
+    case AArch64::DUP_ZV_S:
+      NewOpcode = AArch64::DUP_ZZI_S;
+      NewSrc = TRI->getMatchingSuperReg(Src, AArch64::ssub, RC);
+      break;
+    case AArch64::DUP_ZV_D:
+      NewOpcode = AArch64::DUP_ZZI_D;
+      NewSrc = TRI->getMatchingSuperReg(Src, AArch64::dsub, RC);
+      break;
+    }
+
+    BuildMI(MBB, MI, DL, get(NewOpcode), Dst)
+        .addReg(NewSrc, getKillRegState(SrcIsKill))
+        .addImm(0);
+    MBB.erase(MI);
+    return true;
+  }
+
   if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD &&
       MI.getOpcode() != AArch64::CATCHRET)
     return false;
 
-  MachineBasicBlock &MBB = *MI.getParent();
   auto &Subtarget = MBB.getParent()->getSubtarget<AArch64Subtarget>();
-  auto TRI = Subtarget.getRegisterInfo();
-  DebugLoc DL = MI.getDebugLoc();
 
   if (MI.getOpcode() == AArch64::CATCHRET) {
     // Skip to the first instruction before the epilog.
Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -293,6 +293,27 @@
   defm CPY_ZPmR : sve_int_perm_cpy_r<"cpy", AArch64dup_pred>;
   defm CPY_ZPmV : sve_int_perm_cpy_v<"cpy", AArch64dup_pred>;
 
+  // Create variants of DUP_ZZI (I=0) that can be used without INSERT_SUBREG.
+  def DUP_ZV_H : Pseudo<(outs ZPR16:$Zd), (ins FPR16:$Vn), []>, Sched<[]>;
+  def DUP_ZV_S : Pseudo<(outs ZPR32:$Zd), (ins FPR32:$Vn), []>, Sched<[]>;
+  def DUP_ZV_D : Pseudo<(outs ZPR64:$Zd), (ins FPR64:$Vn), []>, Sched<[]>;
+
+  // Duplicate FP scalar into all vector elements
+  def : Pat<(nxv8f16 (AArch64dup (f16 FPR16:$src))), (DUP_ZV_H $src)>;
+  def : Pat<(nxv4f16 (AArch64dup (f16 FPR16:$src))), (DUP_ZV_H $src)>;
+  def : Pat<(nxv2f16 (AArch64dup (f16 FPR16:$src))), (DUP_ZV_H $src)>;
+  def : Pat<(nxv4f32 (AArch64dup (f32 FPR32:$src))), (DUP_ZV_S $src)>;
+  def : Pat<(nxv2f32 (AArch64dup (f32 FPR32:$src))), (DUP_ZV_S $src)>;
+  def : Pat<(nxv2f64 (AArch64dup (f64 FPR64:$src))), (DUP_ZV_D $src)>;
+
+  // Duplicate +0.0 into all vector elements
+  def : Pat<(nxv8f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>;
+  def : Pat<(nxv4f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>;
+  def : Pat<(nxv2f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>;
+  def : Pat<(nxv4f32 (AArch64dup (f32 fpimm0))), (DUP_ZI_S 0, 0)>;
+  def : Pat<(nxv2f32 (AArch64dup (f32 fpimm0))), (DUP_ZI_S 0, 0)>;
+  def : Pat<(nxv2f64 (AArch64dup (f64 fpimm0))), (DUP_ZI_D 0, 0)>;
+
   // Select elements from either vector (predicated)
   defm SEL_ZPZZ    : sve_int_sel_vvv<"sel", vselect>;
 
Index: llvm/test/CodeGen/AArch64/vector-splat-sve.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/vector-splat-sve.ll
@@ -0,0 +1,100 @@
+; RUN: llc < %s -mtriple=aarch64--linux-gnu -mattr=+sve | FileCheck %s
+
+define <vscale x 8 x half> @splat_nxv8f16(half %val) {
+; CHECK-LABEL: splat_nxv8f16:
+; CHECK: mov z0.h, h0
+; CHECK-NEXT: ret
+  %1 = insertelement <vscale x 8 x half> undef, half %val, i32 0
+  %2 = shufflevector <vscale x 8 x half> %1, <vscale x 8 x half> undef, <vscale x 8 x i32> zeroinitializer
+  ret <vscale x 8 x half> %2
+}
+
+define <vscale x 4 x half> @splat_nxv4f16(half %val) {
+; CHECK-LABEL: splat_nxv4f16:
+; CHECK: mov z0.h, h0
+; CHECK-NEXT: ret
+  %1 = insertelement <vscale x 4 x half> undef, half %val, i32 0
+  %2 = shufflevector <vscale x 4 x half> %1, <vscale x 4 x half> undef, <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x half> %2
+}
+
+define <vscale x 2 x half> @splat_nxv2f16(half %val) {
+; CHECK-LABEL: splat_nxv2f16:
+; CHECK: mov z0.h, h0
+; CHECK-NEXT: ret
+  %1 = insertelement <vscale x 2 x half> undef, half %val, i32 0
+  %2 = shufflevector <vscale x 2 x half> %1, <vscale x 2 x half> undef, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x half> %2
+}
+
+define <vscale x 4 x float> @splat_nxv4f32(float %val) {
+; CHECK-LABEL: splat_nxv4f32:
+; CHECK: mov z0.s, s0
+; CHECK-NEXT: ret
+  %1 = insertelement <vscale x 4 x float> undef, float %val, i32 0
+  %2 = shufflevector <vscale x 4 x float> %1, <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x float> %2
+}
+
+define <vscale x 2 x float> @splat_nxv2f32(float %val) {
+; CHECK-LABEL: splat_nxv2f32:
+; CHECK: mov z0.s, s0
+; CHECK-NEXT: ret
+  %1 = insertelement <vscale x 2 x float> undef, float %val, i32 0
+  %2 = shufflevector <vscale x 2 x float> %1, <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x float> %2
+}
+
+define <vscale x 2 x double> @splat_nxv2f64(double %val) {
+; CHECK-LABEL: splat_nxv2f64:
+; CHECK: mov z0.d, d0
+; CHECK-NEXT: ret
+  %1 = insertelement <vscale x 2 x double> undef, double %val, i32 0
+  %2 = shufflevector <vscale x 2 x double> %1, <vscale x 2 x double> undef, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x double> %2
+}
+
+; TODO: The f16 constant load isn't folded correctly.
+define <vscale x 8 x half> @splat_nxv8f16_zero() {
+; CHECK-LABEL: splat_nxv8f16_zero:
+; CHECK: mov z0.h, h0
+; CHECK-NEXT: ret
+  ret <vscale x 8 x half> zeroinitializer
+}
+
+; TODO: The f16 constant load isn't folded correctly.
+define <vscale x 4 x half> @splat_nxv4f16_zero() {
+; CHECK-LABEL: splat_nxv4f16_zero:
+; CHECK: mov z0.h, h0
+; CHECK-NEXT: ret
+  ret <vscale x 4 x half> zeroinitializer
+}
+
+; TODO: The f16 constant load isn't folded correctly.
+define <vscale x 2 x half> @splat_nxv2f16_zero() {
+; CHECK-LABEL: splat_nxv2f16_zero:
+; CHECK: mov z0.h, h0
+; CHECK-NEXT: ret
+  ret <vscale x 2 x half> zeroinitializer
+}
+
+define <vscale x 4 x float> @splat_nxv4f32_zero() {
+; CHECK-LABEL: splat_nxv4f32_zero:
+; CHECK: mov z0.s, #0
+; CHECK-NEXT: ret
+  ret <vscale x 4 x float> zeroinitializer
+}
+
+define <vscale x 2 x float> @splat_nxv2f32_zero() {
+; CHECK-LABEL: splat_nxv2f32_zero:
+; CHECK: mov z0.s, #0
+; CHECK-NEXT: ret
+  ret <vscale x 2 x float> zeroinitializer
+}
+
+define <vscale x 2 x double> @splat_nxv2f64_zero() {
+; CHECK-LABEL: splat_nxv2f64_zero:
+; CHECK: mov z0.d, #0
+; CHECK-NEXT: ret
+  ret <vscale x 2 x double> zeroinitializer
+}