Index: llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -437,8 +437,6 @@
     DOPRegIsUnique = true;
     break;
   }
-
-  assert (DOPRegIsUnique && "The destructive operand should be unique");
 #endif
 
   // Resolve the reverse opcode
@@ -451,28 +449,32 @@
 
   // Get the right MOVPRFX
   uint64_t ElementSize = TII->getElementSizeForOpcode(Opcode);
-  unsigned MovPrfx, MovPrfxZero, MovPrfxMerge;
+  unsigned MovPrfx, MovPrfxZero, MovPrfxMerge, Sel;
   switch (ElementSize) {
   case AArch64::ElementSizeNone:
   case AArch64::ElementSizeB:
     MovPrfx = AArch64::MOVPRFX_ZZ;
     MovPrfxZero = AArch64::MOVPRFX_ZPzZ_B;
     MovPrfxMerge = AArch64::MOVPRFX_ZPmZ_B;
+    Sel = AArch64::SEL_ZPZZ_B;
     break;
   case AArch64::ElementSizeH:
     MovPrfx = AArch64::MOVPRFX_ZZ;
     MovPrfxZero = AArch64::MOVPRFX_ZPzZ_H;
     MovPrfxMerge = AArch64::MOVPRFX_ZPmZ_H;
+    Sel = AArch64::SEL_ZPZZ_H;
     break;
   case AArch64::ElementSizeS:
     MovPrfx = AArch64::MOVPRFX_ZZ;
     MovPrfxZero = AArch64::MOVPRFX_ZPzZ_S;
     MovPrfxMerge = AArch64::MOVPRFX_ZPmZ_S;
+    Sel = AArch64::SEL_ZPZZ_S;
     break;
   case AArch64::ElementSizeD:
     MovPrfx = AArch64::MOVPRFX_ZZ;
     MovPrfxZero = AArch64::MOVPRFX_ZPzZ_D;
     MovPrfxMerge = AArch64::MOVPRFX_ZPmZ_D;
+    Sel = AArch64::SEL_ZPZZ_D;
     break;
   default:
     llvm_unreachable("Unsupported ElementSize");
@@ -482,7 +484,14 @@
   //
   // Create the destructive operation (if required)
   //
   MachineInstrBuilder PRFX, DOP;
-  if (FalseLanes == AArch64::FalseLanesZero) {
+
+  if (FalseLanes == AArch64::FalseLanesZero && !DOPRegIsUnique) {
+    PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Sel))
+               .addReg(DstReg, RegState::Define)
+               .addReg(MI.getOperand(PredIdx).getReg())
+               .addReg(MI.getOperand(DOPIdx).getReg())
+               .addReg(MI.getOperand(PassIdx).getReg());
+  } else if (FalseLanes == AArch64::FalseLanesZero) {
     assert(ElementSize != AArch64::ElementSizeNone &&
            "This instruction is unpredicated");
Index: llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -162,10 +162,11 @@
     return false;
   }
 
-  bool SelectDupZero(SDValue N) {
+  bool SelectDupZero(SDValue N, SDValue &Res) {
     switch(N->getOpcode()) {
     case AArch64ISD::DUP:
     case ISD::SPLAT_VECTOR: {
+      Res = N;
       auto Opnd0 = N->getOperand(0);
       if (auto CN = dyn_cast<ConstantSDNode>(Opnd0))
         if (CN->isNullValue())
Index: llvm/lib/Target/AArch64/SVEInstrFormats.td
===================================================================
--- llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -379,7 +379,7 @@
 : Pat<(vtd (op vt1:$Op1, vt2:$Op2, vt3:$Op3, (vt4 ImmTy:$Op4))),
       (inst $Op1, $Op2, $Op3, ImmTy:$Op4)>;
 
-def SVEDup0 : ComplexPattern<vAny, 0, "SelectDupZero", []>;
+def SVEDup0 : ComplexPattern<vAny, 1, "SelectDupZero", []>;
 def SVEDup0Undef : ComplexPattern<vAny, 0, "SelectDupZeroOrUndef", []>;
 
 let AddedComplexity = 1 in {
@@ -393,6 +393,11 @@
 : Pat<(vtd (vtd (op vt1:$Op1, (vselect vt1:$Op1, vt2:$Op2, (SVEDup0)), vt3:$Op3))),
       (inst $Op1, $Op2, $Op3)>;
 
+class SVE_3_Op_Pat_SelZero_Passthru<ValueType vtd, SDPatternOperator op, ValueType vt1,
+                   ValueType vt2, ValueType vt3, Instruction inst>
+: Pat<(vtd (vtd (op vt1:$Op1, (vselect vt1:$Op1, vt2:$Op2, (vt2 SVEDup0:$Dup)), vt3:$Op3))),
+      (inst $Op1, $Op2, $Op3, $Dup)>;
+
 class SVE_3_Op_Pat_Shift_Imm_SelZero<ValueType vtd, SDPatternOperator op,
                                      ValueType vt1, ValueType vt2,
                                      Operand vt3, Instruction inst>
@@ -476,6 +481,12 @@
   let FalseLanes = flags;
   let Constraints = "$Zd = $Zpt";
 }
+
+class PredTwoOpMergeZero<string name, ZPRRegOp zprty>
+: SVEPseudo2Instr<name, 0>,
+  Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, zprty:$Zs2, zprty:$Zpt), []> {
+  let FalseLanes = FalseLanesZero;
+}
 }
 
 //===----------------------------------------------------------------------===//
@@ -1616,13 +1627,13 @@
 }
 
 multiclass sve_fp_2op_p_zds_zx<SDPatternOperator op> {
-  def _ZERO_H : PredTwoOpPseudo<NAME # _H, ZPR16, FalseLanesZero>;
-  def _ZERO_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesZero>;
-  def _ZERO_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesZero>;
+  def _ZERO_H : PredTwoOpMergeZero<NAME # _H, ZPR16>;
+  def _ZERO_S : PredTwoOpMergeZero<NAME # _S, ZPR32>;
+  def _ZERO_D : PredTwoOpMergeZero<NAME # _D, ZPR64>;
 
-  def : SVE_3_Op_Pat_SelZero<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Pseudo>(NAME # _ZERO_H)>;
-  def : SVE_3_Op_Pat_SelZero<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Pseudo>(NAME # _ZERO_S)>;
-  def : SVE_3_Op_Pat_SelZero<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Pseudo>(NAME # _ZERO_D)>;
+  def : SVE_3_Op_Pat_SelZero_Passthru<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Pseudo>(NAME # _ZERO_H)>;
+  def : SVE_3_Op_Pat_SelZero_Passthru<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Pseudo>(NAME # _ZERO_S)>;
+  def : SVE_3_Op_Pat_SelZero_Passthru<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Pseudo>(NAME # _ZERO_D)>;
 
   def _H : PredTwoOpMergePseudo<NAME # _H, ZPR16>;
   def _S : PredTwoOpMergePseudo<NAME # _S, ZPR32>;
Index: llvm/test/CodeGen/AArch64/sve-intrinsics-fp-arith-merging.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-intrinsics-fp-arith-merging.ll
+++ llvm/test/CodeGen/AArch64/sve-intrinsics-fp-arith-merging.ll
@@ -280,6 +280,26 @@
   ret <vscale x 2 x double> %out
 }
 
+; This test currently breaks on master, because register allocation ends up with:
+;   Dst = FSUB_ZERO_S P0, Z0, Z0
+; The expand pass cannot zero the false lanes of Z0 using MOVPRFX, because
+; MOVPRFX requires that its destination register is not used in any operand
+; position of the prefixed instruction other than the destructive operand, so:
+;   Z0 = MOVPRFX P0/z, Z0
+;   Z0 = FSUB_S Z0, P0/m, Z0
+; would not be valid. Hence the need to use a SELECT of Z0 and DUP(0).
+define <vscale x 4 x float> @fsub_zero_z0_z0(<vscale x 4 x i1> %p, <vscale x 4 x float> %z0) {
+; CHECK-LABEL: fsub_zero_z0_z0
+; CHECK: mov z1.s, #0
+; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s
+; CHECK-NEXT: fsubr z0.s, p0/m, z0.s, z0.s
+; CHECK-NEXT: ret
+  %z0_in = select <vscale x 4 x i1> %p, <vscale x 4 x float> %z0, <vscale x 4 x float> zeroinitializer
+  %add = call <vscale x 4 x float> @llvm.aarch64.sve.fsub.nxv4f32(<vscale x 4 x i1> %p, <vscale x 4 x float> %z0_in, <vscale x 4 x float> %z0)
+  ret <vscale x 4 x float> %add
+}
+
+
 ;
 ; FSUBR
 ;
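
For contrast with the fsub_zero_z0_z0 test above, here is a minimal sketch of the case that stays on the existing path: when the destructive operand register is unique, DOPRegIsUnique remains true and the expand pass can still zero the false lanes with a constructive MOVPRFX instead of falling back to MOV+SEL. This example is not part of the patch; the function name and CHECK lines are assumptions modelled on the existing fsub tests in sve-intrinsics-fp-arith-merging.ll.

; Hypothetical test (not in the patch): %z0 and %z1 are allocated to distinct
; registers, so zeroing the false lanes with MOVPRFX remains legal and no SEL
; is needed.
declare <vscale x 4 x float> @llvm.aarch64.sve.fsub.nxv4f32(<vscale x 4 x i1>, <vscale x 4 x float>, <vscale x 4 x float>)

define <vscale x 4 x float> @fsub_zero_z0_z1(<vscale x 4 x i1> %p, <vscale x 4 x float> %z0, <vscale x 4 x float> %z1) {
; CHECK-LABEL: fsub_zero_z0_z1
; CHECK: movprfx z0.s, p0/z, z0.s
; CHECK-NEXT: fsub z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: ret
  %z0_in = select <vscale x 4 x i1> %p, <vscale x 4 x float> %z0, <vscale x 4 x float> zeroinitializer
  %sub = call <vscale x 4 x float> @llvm.aarch64.sve.fsub.nxv4f32(<vscale x 4 x i1> %p, <vscale x 4 x float> %z0_in, <vscale x 4 x float> %z1)
  ret <vscale x 4 x float> %sub
}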