Index: llvm/include/llvm/CodeGen/TargetPassConfig.h =================================================================== --- llvm/include/llvm/CodeGen/TargetPassConfig.h +++ llvm/include/llvm/CodeGen/TargetPassConfig.h @@ -402,6 +402,13 @@ return false; } + /// addPostCoalesce - Add passes to the optimized register allocation pipeline + /// after coalescing is complete, but before further scheduling or register + /// allocation. + virtual bool addPostCoalesce() { + return false; + } + /// Add passes to be run immediately after virtual registers are rewritten /// to physical registers. virtual void addPostRewrite() { } Index: llvm/lib/CodeGen/TargetPassConfig.cpp =================================================================== --- llvm/lib/CodeGen/TargetPassConfig.cpp +++ llvm/lib/CodeGen/TargetPassConfig.cpp @@ -1216,6 +1216,9 @@ addPass(&TwoAddressInstructionPassID, false); addPass(&RegisterCoalescerID); + // Allow targets to change the live ranges after coalescing + addPostCoalesce(); + // The machine scheduler may accidentally create disconnected components // when moving subregister definitions around, avoid this by splitting them to // separate vregs before. Splitting can also improve reg. allocation quality. Index: llvm/lib/Target/AArch64/AArch64.h =================================================================== --- llvm/lib/Target/AArch64/AArch64.h +++ llvm/lib/Target/AArch64/AArch64.h @@ -52,6 +52,7 @@ FunctionPass *createAArch64CleanupLocalDynamicTLSPass(); FunctionPass *createAArch64CollectLOHPass(); +FunctionPass *createSVEConditionalEarlyClobberPass(); ModulePass *createSVEIntrinsicOptsPass(); InstructionSelector * createAArch64InstructionSelector(const AArch64TargetMachine &, Index: llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -348,13 +348,25 @@ /// \brief Expand Pseudos to Instructions with destructive operands. /// -/// This mechanism uses MOVPRFX instructions for zeroing the false lanes -/// or for fixing relaxed register allocation conditions to comply with +/// This mechanism uses MOVPRFX instructions for merging/zeroing the false +/// lanes or for fixing relaxed register allocation conditions to comply with /// the instructions register constraints. The latter case may be cheaper /// than setting the register constraints in the register allocator, /// since that will insert regular MOV instructions rather than MOVPRFX. /// -/// Example (after register allocation): +/// Merging example (after register allocation): +/// +/// FADD_ZPZZ_B Z0, Pg, Z0, Z1, Z2 +/// +/// * The Pseudo FADD_ZPZZ_B maps to FADD_ZPmZ_B, where Z2 is the +/// Passthru register. +/// * We cannot map directly to FADD_ZPmZ_B because we need to +/// carry the explicit passthru register. +/// * FIXME: Register constraints when they're determined. +/// * For performance, it's prefered to use the zero/undef merging +/// variants. 
+/// +/// Zeroing example (after register allocation): /// /// FSUB_ZPZZ_ZERO_B Z0, Pg, Z1, Z0 /// @@ -379,9 +391,8 @@ /// MOVPRFX_ZPzZ_B Z0, Pg/z, Z0 /// FSUBR_ZPmZ_B Z0, Pg/m, Z0, Z1 /// -/// Note that this can only be done for _ZERO or _UNDEF variants where -/// we can guarantee the false lanes to be zeroed (by implementing this) -/// or that they are undef (don't care / not used), otherwise the +/// Note that this can only be done for merging variants where +/// we can guarantee the false lanes are specified, otherwise the /// swapping of operands is illegal because the operation is not /// (or cannot be emulated to be) fully commutative. bool AArch64ExpandPseudo::expand_DestructiveOp( @@ -391,7 +402,6 @@ unsigned Opcode = AArch64::getSVEPseudoMap(MI.getOpcode()); uint64_t DType = TII->get(Opcode).TSFlags & AArch64::DestructiveInstTypeMask; uint64_t FalseLanes = MI.getDesc().TSFlags & AArch64::FalseLanesMask; - bool FalseZero = FalseLanes == AArch64::FalseLanesZero; unsigned DstReg = MI.getOperand(0).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); @@ -400,21 +410,21 @@ assert(DstReg != MI.getOperand(3).getReg()); bool UseRev = false; - unsigned PredIdx, DOPIdx, SrcIdx; + unsigned PredIdx, DOPIdx, SrcIdx, PassthruIdx; switch (DType) { case AArch64::DestructiveBinaryComm: case AArch64::DestructiveBinaryCommWithRev: if (DstReg == MI.getOperand(3).getReg()) { // FSUB Zd, Pg, Zs1, Zd ==> FSUBR Zd, Pg/m, Zd, Zs1 - std::tie(PredIdx, DOPIdx, SrcIdx) = std::make_tuple(1, 3, 2); + std::tie(PredIdx, DOPIdx, SrcIdx, PassthruIdx) = std::make_tuple(1, 3, 2, 4); UseRev = true; break; } LLVM_FALLTHROUGH; case AArch64::DestructiveBinary: case AArch64::DestructiveBinaryImm: - std::tie(PredIdx, DOPIdx, SrcIdx) = std::make_tuple(1, 2, 3); - break; + std::tie(PredIdx, DOPIdx, SrcIdx, PassthruIdx) = std::make_tuple(1, 2, 3, 4); + break; default: llvm_unreachable("Unsupported Destructive Operand type"); } @@ -449,24 +459,28 @@ // Get the right MOVPRFX uint64_t ElementSize = TII->getElementSizeForOpcode(Opcode); - unsigned MovPrfx, MovPrfxZero; + unsigned MovPrfx, MovPrfxZero, MovPrfxMerge; switch (ElementSize) { case AArch64::ElementSizeNone: case AArch64::ElementSizeB: MovPrfx = AArch64::MOVPRFX_ZZ; MovPrfxZero = AArch64::MOVPRFX_ZPzZ_B; + MovPrfxMerge = AArch64::MOVPRFX_ZPmZ_B; break; case AArch64::ElementSizeH: MovPrfx = AArch64::MOVPRFX_ZZ; MovPrfxZero = AArch64::MOVPRFX_ZPzZ_H; + MovPrfxMerge = AArch64::MOVPRFX_ZPmZ_H; break; case AArch64::ElementSizeS: MovPrfx = AArch64::MOVPRFX_ZZ; MovPrfxZero = AArch64::MOVPRFX_ZPzZ_S; + MovPrfxMerge = AArch64::MOVPRFX_ZPmZ_S; break; case AArch64::ElementSizeD: MovPrfx = AArch64::MOVPRFX_ZZ; MovPrfxZero = AArch64::MOVPRFX_ZPzZ_D; + MovPrfxMerge = AArch64::MOVPRFX_ZPmZ_D; break; default: llvm_unreachable("Unsupported ElementSize"); @@ -476,22 +490,56 @@ // Create the destructive operation (if required) // MachineInstrBuilder PRFX, DOP; - if (FalseZero) { + if (FalseLanes == AArch64::FalseLanesZero) { assert(ElementSize != AArch64::ElementSizeNone && "This instruction is unpredicated"); + // If we're replacing the (DUP #0) with a zeroing MOVPRFX, walk + // backwards through the MachineInstrs to see if the DUP can be + // removed. + unsigned PassthruReg = MI.getOperand(PassthruIdx).getReg(); + MachineBasicBlock::reverse_iterator RIt = MI.getReverseIterator(); + for (MachineInstr &PredI : make_range(std::next(RIt), MBB.rend())) { + // If there are any uses of the DUP, don't remove it. 
+ if (PredI.readsRegister(PassthruReg)) + break; + + // If we found the DUP with no other uses, remove it. + if (PredI.definesRegister(PassthruReg)) { + PredI.eraseFromParent(); + break; + } + } + // Merge source operand into destination register PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MovPrfxZero)) .addReg(DstReg, RegState::Define) .addReg(MI.getOperand(PredIdx).getReg()) .addReg(MI.getOperand(DOPIdx).getReg()); + // After the movprfx, the destructive operand is same as Dst + DOPIdx = 0; + } else if (FalseLanes == AArch64::FalseLanesMerge) { + unsigned PassthruReg = MI.getOperand(PassthruIdx).getReg(); + unsigned DOPReg = MI.getOperand(DOPIdx).getReg(); + + // Generate a MOVPRFX to merge the false lanes. If the src and + // dst regs are the same, there's nothing to be done. + if (PassthruReg != DOPReg) + PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MovPrfxMerge)) + .addReg(PassthruReg, RegState::Define) + .addReg(PassthruReg) + .addReg(MI.getOperand(PredIdx).getReg()) + .addReg(DOPReg); + // After the movprfx, the destructive operand is same as Dst DOPIdx = 0; } else if (DstReg != MI.getOperand(DOPIdx).getReg()) { PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MovPrfx)) .addReg(DstReg, RegState::Define) .addReg(MI.getOperand(DOPIdx).getReg()); + + // After the movprfx, the destructive operand is same as Dst DOPIdx = 0; } Index: llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -162,10 +162,11 @@ return false; } - bool SelectDupZero(SDValue N) { + bool SelectDupZero(SDValue N, SDValue &Res) { switch(N->getOpcode()) { case AArch64ISD::DUP: case ISD::SPLAT_VECTOR: { + Res = N; auto Opnd0 = N->getOperand(0); if (auto CN = dyn_cast(Opnd0)) if (CN->isNullValue()) Index: llvm/lib/Target/AArch64/AArch64InstrFormats.td =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -37,12 +37,13 @@ def DestructiveBinaryCommWithRev : DestructiveInstTypeEnum<7>; def DestructiveTernaryCommWithRev : DestructiveInstTypeEnum<8>; -class FalseLanesEnum val> { - bits<2> Value = val; +class FalseLanesEnum val> { + bits<3> Value = val; } def FalseLanesNone : FalseLanesEnum<0>; def FalseLanesZero : FalseLanesEnum<1>; def FalseLanesUndef : FalseLanesEnum<2>; +def FalseLanesMerge : FalseLanesEnum<4>; // AArch64 Instruction Format class AArch64Inst : Instruction { @@ -64,7 +65,7 @@ DestructiveInstTypeEnum DestructiveInstType = NotDestructive; ElementSizeEnum ElementSize = ElementSizeNone; - let TSFlags{8-7} = FalseLanes.Value; + let TSFlags{9-7} = FalseLanes.Value; let TSFlags{6-3} = DestructiveInstType.Value; let TSFlags{2-0} = ElementSize.Value; Index: llvm/lib/Target/AArch64/AArch64InstrInfo.h =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -420,9 +420,10 @@ }; enum FalseLaneType { - FalseLanesMask = TSFLAG_FALSE_LANE_TYPE(0x3), - FalseLanesZero = TSFLAG_FALSE_LANE_TYPE(0x1), + FalseLanesMask = TSFLAG_FALSE_LANE_TYPE(0x7), + FalseLanesZero = TSFLAG_FALSE_LANE_TYPE(0x1), FalseLanesUndef = TSFLAG_FALSE_LANE_TYPE(0x2), + FalseLanesMerge = TSFLAG_FALSE_LANE_TYPE(0x4), }; #undef TSFLAG_ELEMENT_SIZE_TYPE Index: llvm/lib/Target/AArch64/AArch64TargetMachine.cpp 
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -405,6 +405,7 @@
 }
 
 void addIRPasses() override;
+  bool addPostCoalesce() override;
 bool addPreISel() override;
 bool addInstSelector() override;
 bool addIRTranslator() override;
@@ -493,6 +494,14 @@
 }
 
 // Pass Pipeline Configuration
+bool AArch64PassConfig::addPostCoalesce() {
+  // Add a pass that transforms SVE MOVPRFXable Pseudo instructions
+  // to add an 'earlyclobber' under certain conditions
+  addPass(createSVEConditionalEarlyClobberPass());
+
+  return false;
+}
+
 bool AArch64PassConfig::addPreISel() {
   // Run promote constant before global merge, so that the promoted constants
   // get a chance to be merged
Index: llvm/lib/Target/AArch64/CMakeLists.txt
===================================================================
--- llvm/lib/Target/AArch64/CMakeLists.txt
+++ llvm/lib/Target/AArch64/CMakeLists.txt
@@ -65,6 +65,7 @@
   AArch64TargetMachine.cpp
   AArch64TargetObjectFile.cpp
   AArch64TargetTransformInfo.cpp
+  SVEConditionalEarlyClobberPass.cpp
   SVEIntrinsicOpts.cpp
   AArch64SIMDInstrOpt.cpp
Index: llvm/lib/Target/AArch64/SVEConditionalEarlyClobberPass.cpp
===================================================================
--- /dev/null
+++ llvm/lib/Target/AArch64/SVEConditionalEarlyClobberPass.cpp
@@ -0,0 +1,187 @@
+//==-- SVEConditionalEarlyClobberPass.cpp - Conditionally add early clobber ==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass solves an issue with MOVPRFXable instructions that
+// have the restriction that the destination register of a MOVPRFX
+// cannot be used in any operand of the next instruction, except for
+// the destructive operand.
+//
+// We chose to create Pseudo instructions to implement false-lane zeroing,
+// where we specifically tried not to use the '$Zd = $Zs1' restriction
+// so that the register allocator doesn't insert normal
+// MOV instructions. The downside of doing that is that the register
+// allocation of:
+//   vreg1 = OP_ZEROING vreg0, vreg0
+// may result in:
+//   Z8 = OP_ZEROING Z8, Z8
+//
+// At expand time, the OP_ZEROING will either need a scratch register to
+// implement an actual 'MOV(DUP(0))', or will need to use a MOVPRFX Pg/z
+// with a dummy ('nop'-like) MOVPRFXable instruction, like LSL #0.
+//
+// This is better handled by the register allocator creating an allocation
+// that takes the above restriction into account, e.g.
+//   Z3 = OP_ZEROING Z8, Z8
+// which can be correctly expanded into:
+//   Z3 = MOVPRFX Pg/z, Z8
+//   Z3 = OP Z3, Z8
+//
+// After coalescing of virtual registers, we know whether the input operands
+// to the instruction will be in the same register or not.
+// For our example:
+//   vreg1 = OP_ZEROING vreg0, vreg0
+// we know that the two input operands will be equal, but we don't know the
+// register allocation of vreg1. We want to force vreg1 to be different
+// from vreg0, which can be done using an 'earlyclobber'.
+//
+// This pass adds the earlyclobber to the machine operand, and also updates
+// the cache of live ranges so that subsequent passes don't need to
+// recalculate those for the newly added earlyclobber.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+using namespace llvm;
+
+#define PASS_SHORT_NAME "Conditional Early Clobber"
+
+namespace llvm {
+  void initializeSVEConditionalEarlyClobberPassPass(PassRegistry &);
+}
+
+namespace {
+class SVEConditionalEarlyClobberPass : public MachineFunctionPass {
+public:
+  static char ID;
+  SVEConditionalEarlyClobberPass() : MachineFunctionPass(ID) {
+    initializeSVEConditionalEarlyClobberPassPass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &Fn) override;
+
+  StringRef getPassName() const override { return PASS_SHORT_NAME; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<LiveIntervals>();
+    AU.addPreserved<LiveIntervals>();
+    AU.addRequired<SlotIndexes>();
+    AU.addPreserved<SlotIndexes>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+private:
+  const TargetInstrInfo *TII;
+  LiveIntervals *LIS;
+
+  bool addConditionalEC(MachineInstr &MI);
+  bool hasConditionalClobber(const MachineInstr &MI);
+};
+char SVEConditionalEarlyClobberPass::ID = 0;
+}
+
+INITIALIZE_PASS(SVEConditionalEarlyClobberPass,
+                "aarch64-conditional-early-clobber",
+                PASS_SHORT_NAME, false, false)
+
+FunctionPass *llvm::createSVEConditionalEarlyClobberPass() {
+  return new SVEConditionalEarlyClobberPass();
+}
+
+// We could also choose to do this with a new instruction annotation
+// like 'earlyclobberif($Zd=$Zs1)', but because this is so specific to SVE
+// it should be fine to explicitly check the type of SVE operation where
+// we know what the conditions are.
+bool SVEConditionalEarlyClobberPass::hasConditionalClobber(
+    const MachineInstr &MI) {
+  int Instr = AArch64::getSVEPseudoMap(MI.getOpcode());
+  if (Instr == -1)
+    return false;
+
+  uint64_t FalseLanesZero = MI.getDesc().TSFlags & AArch64::FalseLanesZero;
+  if (!FalseLanesZero)
+    return false;
+
+  uint64_t DType =
+      TII->get(Instr).TSFlags & AArch64::DestructiveInstTypeMask;
+  auto mo_equals = [&](const MachineOperand &MO1, const MachineOperand &MO2) {
+    if (MO1.getReg() == MO2.getReg() && MO1.getSubReg() == MO2.getSubReg()) {
+      // This is needed to deal with cases where subreg assignment means that
+      // the earlyclobber isn't necessary.
+      return MI.getOperand(0).getSubReg() == MO1.getSubReg() ||
+             ((MO1.getSubReg() == 0) ^ (MI.getOperand(0).getSubReg() == 0));
+    }
+    return false;
+  };
+  switch (DType) {
+  case AArch64::DestructiveBinary:
+  case AArch64::DestructiveBinaryComm:
+  case AArch64::DestructiveBinaryCommWithRev:
+    return mo_equals(MI.getOperand(2), MI.getOperand(3));
+  case AArch64::DestructiveTernaryCommWithRev:
+    return mo_equals(MI.getOperand(2), MI.getOperand(3)) ||
+           mo_equals(MI.getOperand(2), MI.getOperand(4)) ||
+           mo_equals(MI.getOperand(3), MI.getOperand(4));
+  case AArch64::NotDestructive:
+  case AArch64::DestructiveBinaryImm:
+  case AArch64::DestructiveBinaryShImmUnpred:
+    return false;
+  default:
+    break;
+  }
+
+  llvm_unreachable("Not a known destructive operand type");
+}
+
+bool SVEConditionalEarlyClobberPass::addConditionalEC(MachineInstr &MI) {
+  // If the operand is already 'earlyclobber' or it doesn't require
+  // adding a conditional one (based on instruction), then don't bother.
+ if (!hasConditionalClobber(MI)) + return false; + + if (MI.getOperand(0).isEarlyClobber()) + return false; + + assert(MI.getOperand(0).isDef()); + + // Set the 'EarlyClobber' attribute for when the live ranges need + // to be recalculated. + MI.getOperand(0).setIsEarlyClobber(true); + + SlotIndex Index = LIS->getInstructionIndex(MI); + SlotIndex DefSlot = Index.getRegSlot(0); + + // Update the LiveRange cache by extending the liferange of the + // 'Def' register to be live earlier, so it overlaps with the + // live ranges of the input operands. + unsigned Reg = MI.getOperand(0).getReg(); + auto *Seg = LIS->getInterval(Reg).getSegmentContaining(DefSlot); + assert(Seg && "Expected Def operand to be live with instruction"); + Seg->start = Index.getRegSlot(true); + Seg->valno->def = Seg->start; + + return true; +} + +bool SVEConditionalEarlyClobberPass::runOnMachineFunction(MachineFunction &MF) { + LIS = &getAnalysis(); + TII = MF.getSubtarget().getInstrInfo(); + + bool Modified = false; + for (auto &MBB : MF) + for (auto &MI : MBB) + Modified |= addConditionalEC(MI); + + return Modified; +} Index: llvm/lib/Target/AArch64/SVEInstrFormats.td =================================================================== --- llvm/lib/Target/AArch64/SVEInstrFormats.td +++ llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -373,7 +373,7 @@ : Pat<(vtd (op vt1:$Op1, vt2:$Op2, vt3:$Op3, (vt4 ImmTy:$Op4))), (inst $Op1, $Op2, $Op3, ImmTy:$Op4)>; -def SVEDup0 : ComplexPattern; +def SVEDup0 : ComplexPattern; def SVEDup0Undef : ComplexPattern; let AddedComplexity = 1 in { @@ -382,11 +382,27 @@ : Pat<(vtd (vtd (op vt1:$Op1, (vselect vt1:$Op1, vt2:$Op2, (SVEDup0)), vt3:$Op3))), (inst $Op1, $Op2, $Op3)>; +class SVE_3_Op_Pat_Sel_Passthru +: Pat<(vtd (op vt1:$Op1, (vselect vt1:$Op1, vt2:$Op2, vt2:$Passthru), vt3:$Op3)), + (inst $Op1, $Op2, $Op3, $Passthru)>; + +class SVE_3_Op_Pat_SelZero_Passthru +: Pat<(vtd (vtd (op vt1:$Op1, (vselect vt1:$Op1, vt2:$Op2, (vt2 SVEDup0:$Dup)), vt3:$Op3))), + (inst $Op1, $Op2, $Op3, $Dup)>; + class SVE_3_Op_Pat_Shift_Imm_SelZero : Pat<(vtd (op vt1:$Op1, (vselect vt1:$Op1, vt2:$Op2, (SVEDup0)), (i32 (vt3:$Op3)))), (inst $Op1, $Op2, vt3:$Op3)>; + +class SVE_3_Op_Pat_Shift_Imm_SelZero_Passthru +: Pat<(vtd (op vt1:$Op1, (vselect vt1:$Op1, vt2:$Op2, (vt2 SVEDup0:$Dup)), (i32 (vt3:$Op3)))), + (inst $Op1, $Op2, vt3:$Op3, $Dup)>; } // @@ -457,6 +473,25 @@ Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, immty:$imm), []> { let FalseLanes = flags; } + + class PredTwoOpMergePseudo + : SVEPseudo2Instr, + Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, zprty:$Zs2, zprty:$Zpt), []> { + let FalseLanes = FalseLanesMerge; + let Constraints = "$Zd = $Zpt"; + } + + class PredTwoOpMergeZeroPseudo + : SVEPseudo2Instr, + Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, zprty:$Zs2, zprty:$Zpt), []> { + let FalseLanes = FalseLanesZero; + } + + class PredTwoOpImmMergeZeroPseudo + : SVEPseudo2Instr, + Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, immty:$imm, zprty:$Zpt), []> { + let FalseLanes = FalseLanesZero; + } } //===----------------------------------------------------------------------===// @@ -1597,13 +1632,21 @@ } multiclass sve_fp_2op_p_zds_zx { - def _ZERO_H : PredTwoOpPseudo; - def _ZERO_S : PredTwoOpPseudo; - def _ZERO_D : PredTwoOpPseudo; + def _ZERO_H : PredTwoOpMergeZeroPseudo; + def _ZERO_S : PredTwoOpMergeZeroPseudo; + def _ZERO_D : PredTwoOpMergeZeroPseudo; + + def : SVE_3_Op_Pat_SelZero_Passthru(NAME # _ZERO_H)>; + def : SVE_3_Op_Pat_SelZero_Passthru(NAME # _ZERO_S)>; 
+ def : SVE_3_Op_Pat_SelZero_Passthru(NAME # _ZERO_D)>; + + def _MERGE_H : PredTwoOpMergePseudo; + def _MERGE_S : PredTwoOpMergePseudo; + def _MERGE_D : PredTwoOpMergePseudo; - def : SVE_3_Op_Pat_SelZero(NAME # _ZERO_H)>; - def : SVE_3_Op_Pat_SelZero(NAME # _ZERO_S)>; - def : SVE_3_Op_Pat_SelZero(NAME # _ZERO_D)>; + def : SVE_3_Op_Pat_Sel_Passthru(NAME # _MERGE_H)>; + def : SVE_3_Op_Pat_Sel_Passthru(NAME # _MERGE_S)>; + def : SVE_3_Op_Pat_Sel_Passthru(NAME # _MERGE_D)>; } class sve_fp_ftmad sz, string asm, ZPRRegOp zprty> @@ -4762,15 +4805,15 @@ def _S_Z_UNDEF : PredTwoOpImmPseudo; def _D_Z_UNDEF : PredTwoOpImmPseudo; - def _B_Z_ZERO : PredTwoOpImmPseudo; - def _H_Z_ZERO : PredTwoOpImmPseudo; - def _S_Z_ZERO : PredTwoOpImmPseudo; - def _D_Z_ZERO : PredTwoOpImmPseudo; + def _B_Z_ZERO : PredTwoOpImmMergeZeroPseudo; + def _H_Z_ZERO : PredTwoOpImmMergeZeroPseudo; + def _S_Z_ZERO : PredTwoOpImmMergeZeroPseudo; + def _D_Z_ZERO : PredTwoOpImmMergeZeroPseudo; - def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _B_Z_ZERO)>; - def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _H_Z_ZERO)>; - def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _S_Z_ZERO)>; - def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _D_Z_ZERO)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero_Passthru(NAME # _B_Z_ZERO)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero_Passthru(NAME # _H_Z_ZERO)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero_Passthru(NAME # _S_Z_ZERO)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero_Passthru(NAME # _D_Z_ZERO)>; def : SVE_3_Op_Imm_Pat(NAME # _B)>; def : SVE_3_Op_Imm_Pat(NAME # _H)>; @@ -4803,15 +4846,15 @@ } multiclass sve_int_bin_pred_shift_0_right_zx { - def _ZERO_B : PredTwoOpImmPseudo; - def _ZERO_H : PredTwoOpImmPseudo; - def _ZERO_S : PredTwoOpImmPseudo; - def _ZERO_D : PredTwoOpImmPseudo; + def _ZERO_B : PredTwoOpImmMergeZeroPseudo; + def _ZERO_H : PredTwoOpImmMergeZeroPseudo; + def _ZERO_S : PredTwoOpImmMergeZeroPseudo; + def _ZERO_D : PredTwoOpImmMergeZeroPseudo; - def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _ZERO_B)>; - def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _ZERO_H)>; - def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _ZERO_S)>; - def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _ZERO_D)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero_Passthru(NAME # _ZERO_B)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero_Passthru(NAME # _ZERO_H)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero_Passthru(NAME # _ZERO_S)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero_Passthru(NAME # _ZERO_D)>; } class sve_int_bin_pred_shift sz8_64, bit wide, bits<3> opc, @@ -4857,15 +4900,25 @@ } multiclass sve_int_bin_pred_zx { - def _ZERO_B : PredTwoOpPseudo; - def _ZERO_H : PredTwoOpPseudo; - def _ZERO_S : PredTwoOpPseudo; - def _ZERO_D : PredTwoOpPseudo; - - def : SVE_3_Op_Pat_SelZero(NAME # _ZERO_B)>; - def : SVE_3_Op_Pat_SelZero(NAME # _ZERO_H)>; - def : SVE_3_Op_Pat_SelZero(NAME # _ZERO_S)>; - def : SVE_3_Op_Pat_SelZero(NAME # _ZERO_D)>; + def _ZERO_B : PredTwoOpMergeZeroPseudo; + def _ZERO_H : PredTwoOpMergeZeroPseudo; + def _ZERO_S : PredTwoOpMergeZeroPseudo; + def _ZERO_D : PredTwoOpMergeZeroPseudo; + + def : SVE_3_Op_Pat_SelZero_Passthru(NAME # _ZERO_B)>; + def : SVE_3_Op_Pat_SelZero_Passthru(NAME # _ZERO_H)>; + def : SVE_3_Op_Pat_SelZero_Passthru(NAME # _ZERO_S)>; + def : SVE_3_Op_Pat_SelZero_Passthru(NAME # _ZERO_D)>; + + def _MERGE_B : PredTwoOpMergePseudo; + def _MERGE_H : PredTwoOpMergePseudo; + def _MERGE_S : PredTwoOpMergePseudo; + def _MERGE_D : PredTwoOpMergePseudo; + + def : SVE_3_Op_Pat_Sel_Passthru(NAME # _MERGE_B)>; + def : SVE_3_Op_Pat_Sel_Passthru(NAME # _MERGE_H)>; + def : 
SVE_3_Op_Pat_Sel_Passthru(NAME # _MERGE_S)>; + def : SVE_3_Op_Pat_Sel_Passthru(NAME # _MERGE_D)>; } multiclass sve_int_bin_pred_shift_wide opc, string asm, Index: llvm/test/CodeGen/AArch64/O3-pipeline.ll =================================================================== --- llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -135,6 +135,7 @@ ; CHECK-NEXT: Slot index numbering ; CHECK-NEXT: Live Interval Analysis ; CHECK-NEXT: Simple Register Coalescing +; CHECK-NEXT: Conditional Early Clobber ; CHECK-NEXT: Rename Disconnected Subregister Components ; CHECK-NEXT: Machine Instruction Scheduler ; CHECK-NEXT: Machine Block Frequency Analysis Index: llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.ll +++ llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.ll @@ -52,6 +52,114 @@ ret %out } +define @add_i8( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: add_i8: +; CHECK: movprfx z2.b, p0/m, z0.b +; CHECK-NEXT: add z2.b, p0/m, z2.b, z1.b +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.add.nxv16i8( %pg, + %a_m, + %b) + ret %out +} + +define @add_i16( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: add_i16 +; CHECK: movprfx z2.h, p0/m, z0.h +; CHECK-NEXT: add z2.h, p0/m, z2.h, z1.h +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.add.nxv8i16( %pg, + %a_m, + %b) + ret %out +} + +define @add_i32( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: add_i32: +; CHECK: movprfx z2.s, p0/m, z0.s +; CHECK-NEXT: add z2.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.add.nxv4i32( %pg, + %a_m, + %b) + ret %out +} + +define @add_i64( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: add_i64: +; CHECK: movprfx z2.d, p0/m, z0.d +; CHECK-NEXT: add z2.d, p0/m, z2.d, z1.d +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.add.nxv2i64( %pg, + %a_m, + %b) + ret %out +} + +define @add_i8_comm( %pg, %a, + %b) { +; CHECK-LABEL: add_i8_comm: +; CHECK: add z1.b, p0/m, z1.b, z0.b +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %b + %out = call @llvm.aarch64.sve.add.nxv16i8( %pg, + %a_m, + %b) + ret %out +} + +define @add_i16_comm( %pg, %a, + %b) { +; CHECK-LABEL: add_i16 +; CHECK: add z1.h, p0/m, z1.h, z0.h +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %b + %out = call @llvm.aarch64.sve.add.nxv8i16( %pg, + %a_m, + %b) + ret %out +} + +define @add_i32_comm( %pg, %a, + %b) { +; CHECK-LABEL: add_i32_comm: +; CHECK: add z1.s, p0/m, z1.s, z0.s +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %b + %out = call @llvm.aarch64.sve.add.nxv4i32( %pg, + %a_m, + %b) + ret %out +} + +define @add_i64_comm( %pg, %a, + %b) { +; CHECK-LABEL: add_i64_comm: +; CHECK: add z1.d, p0/m, z1.d, z0.d +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %b + %out = call @llvm.aarch64.sve.add.nxv2i64( %pg, + %a_m, + %b) + ret %out +} + ; ; SUB ; @@ -104,6 +212,62 @@ ret %out } +define @sub_i8( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: sub_i8: +; CHECK: movprfx z2.b, p0/m, z0.b +; CHECK-NEXT: sub z2.b, p0/m, z2.b, z1.b +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + 
%out = call @llvm.aarch64.sve.sub.nxv16i8( %pg, + %a_m, + %b) + ret %out +} + +define @sub_i16( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: sub_i16 +; CHECK: movprfx z2.h, p0/m, z0.h +; CHECK-NEXT: sub z2.h, p0/m, z2.h, z1.h +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.sub.nxv8i16( %pg, + %a_m, + %b) + ret %out +} + +define @sub_i32( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: sub_i32: +; CHECK: movprfx z2.s, p0/m, z0.s +; CHECK-NEXT: sub z2.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.sub.nxv4i32( %pg, + %a_m, + %b) + ret %out +} + +define @sub_i64( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: sub_i64: +; CHECK: movprfx z2.d, p0/m, z0.d +; CHECK-NEXT: sub z2.d, p0/m, z2.d, z1.d +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.sub.nxv2i64( %pg, + %a_m, + %b) + ret %out +} + ; ; SUBR ; @@ -156,6 +320,118 @@ ret %out } +define @subr_i8( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: subr_i8: +; CHECK: movprfx z2.b, p0/m, z0.b +; CHECK-NEXT: subr z2.b, p0/m, z2.b, z1.b +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.subr.nxv16i8( %pg, + %a_m, + %b) + ret %out +} + +define @subr_i16( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: subr_i16 +; CHECK: movprfx z2.h, p0/m, z0.h +; CHECK-NEXT: subr z2.h, p0/m, z2.h, z1.h +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.subr.nxv8i16( %pg, + %a_m, + %b) + ret %out +} + +define @subr_i32( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: subr_i32: +; CHECK: movprfx z2.s, p0/m, z0.s +; CHECK-NEXT: subr z2.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.subr.nxv4i32( %pg, + %a_m, + %b) + ret %out +} + +define @subr_i64( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: subr_i64: +; CHECK: movprfx z2.d, p0/m, z0.d +; CHECK-NEXT: subr z2.d, p0/m, z2.d, z1.d +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.subr.nxv2i64( %pg, + %a_m, + %b) + ret %out +} + +define @subr_i8_rev( %pg, %a, + %b) { +; CHECK-LABEL: subr_i8_rev: +; CHECK-NOT: movprfx +; CHECK: subr z1.b, p0/m, z1.b, z0.b +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %b + %out = call @llvm.aarch64.sve.sub.nxv16i8( %pg, + %a_m, + %b) + ret %out +} + +define @subr_i16_rev( %pg, %a, + %b) { +; CHECK-LABEL: subr_i16_rev: +; CHECK-NOT: movprfx +; CHECK: subr z1.h, p0/m, z1.h, z0.h +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %b + %out = call @llvm.aarch64.sve.sub.nxv8i16( %pg, + %a_m, + %b) + ret %out +} + +define @subr_i32_rev( %pg, %a, + %b) { +; CHECK-LABEL: subr_i32_rev: +; CHECK-NOT: movprfx +; CHECK: subr z1.s, p0/m, z1.s, z0.s +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %b + %out = call @llvm.aarch64.sve.sub.nxv4i32( %pg, + %a_m, + %b) + ret %out +} + +define @subr_i64_rev( %pg, %a, + %b) { +; CHECK-LABEL: subr_i64_rev: +; CHECK-NOT: movprfx +; CHECK: subr z1.d, p0/m, z1.d, z0.d +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %b + %out = call @llvm.aarch64.sve.sub.nxv2i64( %pg, + %a_m, + %b) + ret %out +} + declare @llvm.aarch64.sve.add.nxv16i8(, , ) declare @llvm.aarch64.sve.add.nxv8i16(, , ) 
declare @llvm.aarch64.sve.add.nxv4i32(, , ) Index: llvm/test/CodeGen/AArch64/sve-intrinsics-shifts-merging.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-intrinsics-shifts-merging.ll +++ llvm/test/CodeGen/AArch64/sve-intrinsics-shifts-merging.ll @@ -85,6 +85,62 @@ ret %out } +define @asr_i8( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: asr_i8: +; CHECK: movprfx z2.b, p0/m, z0.b +; CHECK-NEXT: asr z2.b, p0/m, z2.b, z1.b +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.asr.nxv16i8( %pg, + %a_m, + %b) + ret %out +} + +define @asr_i16( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: asr_i16: +; CHECK: movprfx z2.h, p0/m, z0.h +; CHECK-NEXT: asr z2.h, p0/m, z2.h, z1.h +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.asr.nxv8i16( %pg, + %a_m, + %b) + ret %out +} + +define @asr_i32( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: asr_i32: +; CHECK: movprfx z2.s, p0/m, z0.s +; CHECK-NEXT: asr z2.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.asr.nxv4i32( %pg, + %a_m, + %b) + ret %out +} + +define @asr_i64( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: asr_i64: +; CHECK: movprfx z2.d, p0/m, z0.d +; CHECK-NEXT: asr z2.d, p0/m, z2.d, z1.d +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.asr.nxv2i64( %pg, + %a_m, + %b) + ret %out +} + ; ; ASRD ; @@ -222,6 +278,62 @@ ret %out } +define @lsl_i8( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: lsl_i8: +; CHECK: movprfx z2.b, p0/m, z0.b +; CHECK-NEXT: lsl z2.b, p0/m, z2.b, z1.b +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.lsl.nxv16i8( %pg, + %a_m, + %b) + ret %out +} + +define @lsl_i16( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: lsl_i16: +; CHECK: movprfx z2.h, p0/m, z0.h +; CHECK-NEXT: lsl z2.h, p0/m, z2.h, z1.h +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.lsl.nxv8i16( %pg, + %a_m, + %b) + ret %out +} + +define @lsl_i32( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: lsl_i32: +; CHECK: movprfx z2.s, p0/m, z0.s +; CHECK-NEXT: lsl z2.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.lsl.nxv4i32( %pg, + %a_m, + %b) + ret %out +} + +define @lsl_i64( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: lsl_i64: +; CHECK: movprfx z2.d, p0/m, z0.d +; CHECK-NEXT: lsl z2.d, p0/m, z2.d, z1.d +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.lsl.nxv2i64( %pg, + %a_m, + %b) + ret %out +} + ; ; LSR ; @@ -307,6 +419,62 @@ ret %out } +define @lsr_i8( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: lsr_i8: +; CHECK: movprfx z2.b, p0/m, z0.b +; CHECK-NEXT: lsr z2.b, p0/m, z2.b, z1.b +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.lsr.nxv16i8( %pg, + %a_m, + %b) + ret %out +} + +define @lsr_i16( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: lsr_i16: +; CHECK: movprfx z2.h, p0/m, z0.h +; CHECK-NEXT: lsr z2.h, p0/m, z2.h, z1.h +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.lsr.nxv8i16( %pg, + %a_m, + %b) + ret %out +} + +define @lsr_i32( 
%pg, %a, + %b, %passthru) { +; CHECK-LABEL: lsr_i32: +; CHECK: movprfx z2.s, p0/m, z0.s +; CHECK-NEXT: lsr z2.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.lsr.nxv4i32( %pg, + %a_m, + %b) + ret %out +} + +define @lsr_i64( %pg, %a, + %b, %passthru) { +; CHECK-LABEL: lsr_i64: +; CHECK: movprfx z2.d, p0/m, z0.d +; CHECK-NEXT: lsr z2.d, p0/m, z2.d, z1.d +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %a_m = select %pg, %a, %passthru + %out = call @llvm.aarch64.sve.lsr.nxv2i64( %pg, + %a_m, + %b) + ret %out +} + declare @llvm.aarch64.sve.asr.nxv16i8(, , ) declare @llvm.aarch64.sve.asr.nxv8i16(, , ) declare @llvm.aarch64.sve.asr.nxv4i32(, , ) Index: llvm/test/CodeGen/AArch64/sve-movprfx-merging.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/sve-movprfx-merging.ll @@ -0,0 +1,88 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + + +define @fsub_merge_z0_z0_z0( %p, %z0) { +; CHECK-LABEL: fsub_merge_z0_z0_z0 +; CHECK: fsub z0.s, p0/m, z0.s, z0.s +; CHECK-NEXT: ret + %z0_in = select %p, %z0, %z0 + %sub = call @llvm.aarch64.sve.fsub.nxv4f32( %p, + %z0_in, + %z0) + ret %sub +} + +define @fsub_merge_z0_z1( %p, %z0, + %z1, %pt) { +; CHECK-LABEL: fsub_merge_z0_z1 +; CHECK: movprfx z2.s, p0/m, z0.s +; CHECK-NEXT: fsub z2.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %z0_in = select %p, %z0, %pt + %sub = call @llvm.aarch64.sve.fsub.nxv4f32( %p, + %z0_in, + %z1) + ret %sub +} + +define @fsub_merge_reuse_z0_z1_pt( %p, %z0, + %z1, %pt) { +; CHECK-LABEL: fsub_merge_reuse_z0_z1_pt +; CHECK: mov z3.d, z2.d +; CHECK: movprfx z3.s, p0/m, z0.s +; CHECK-NEXT: fsub z3.s, p0/m, z3.s, z1.s +; CHECK: movprfx z2.s, p0/m, z0.s +; CHECK-NEXT: fsub z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %z0_in = select %p, %z0, %pt + %sub = call @llvm.aarch64.sve.fsub.nxv4f32( %p, + %z0_in, + %z1) + %sub2 = call @llvm.aarch64.sve.fsub.nxv4f32( %p, + %z0_in, + %sub) + ret %sub2 +} + +define @fsub_merge_reuse2_z0_z1_pt( %p, %z0, + %z1, %pt) { +; CHECK-LABEL: fsub_merge_reuse2_z0_z1_pt +; CHECK: sel z3.s, p0, z0.s, z2.s +; CHECK: movprfx z2.s, p0/m, z0.s +; CHECK-NEXT: fsub z2.s, p0/m, z2.s, z1.s +; CHECK-NEXT: fsub z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %z0_in = select %p, %z0, %pt + %sub = call @llvm.aarch64.sve.fsub.nxv4f32( %p, + %z0_in, + %z1) + %sub2 = call @llvm.aarch64.sve.fsub.nxv4f32( %p, + %sub, + %z0_in) + ret %sub2 +} + +define @fsub_merge_z0_z1_pt_reuse( %p, %z0, + %z1, %pt) { +; CHECK-LABEL: fsub_merge_z0_z1_pt +; CHECK: mov z3.d, z2.d +; CHECK: movprfx z3.s, p0/m, z0.s +; CHECK-NEXT: fsub z3.s, p0/m, z3.s, z1.s +; CHECK-NEXT: fsub z3.s, p0/m, z3.s, z2.s +; CHECK-NEXT: mov z0.d, z3.d +; CHECK-NEXT: ret + %z0_in = select %p, %z0, %pt + %sub = call @llvm.aarch64.sve.fsub.nxv4f32( %p, + %z0_in, + %z1) + %sub2 = call @llvm.aarch64.sve.fsub.nxv4f32( %p, + %sub, + %pt) + ret %sub2 +} + +declare @llvm.aarch64.sve.fsub.nxv4f32(, , ) + Index: llvm/test/CodeGen/AArch64/sve-movprfx-zeroing.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/sve-movprfx-zeroing.ll @@ -0,0 +1,120 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +define @fsub_zero_z0_z0( %p, %z0) { +; CHECK-LABEL: fsub_zero_z0_z0 +; CHECK: movprfx z1.s, p0/z, z0.s +; CHECK-NEXT: fsub z1.s, p0/m, z1.s, z0.s +; 
CHECK-NEXT: z0.d, z1.d +; CHECK-NEXT: ret + %z0_in = select %p, %z0, zeroinitializer + %sub = call @llvm.aarch64.sve.fsub.nxv4f32( %p, + %z0_in, + %z0) + ret %sub +} + +define @fsub_zero_z0_z1( %p, %z0, + %z1) { +; CHECK-LABEL: fsub_zero_z0_z1 +; CHECK: movprfx z0.s, p0/z, z0.s +; CHECK-NEXT: fsub z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %z0_in = select %p, %z0, zeroinitializer + %sub = call @llvm.aarch64.sve.fsub.nxv4f32( %p, + %z0_in, + %z1) + ret %sub +} + +define @fsub_zero_z0_reuse_z01( %p, %z0, %z1) { +; CHECK-LABEL: fsub_zero_z0_reuse_z01 +; CHECK: movprfx z1.s, p0/z, z1.s +; CHECK-NEXT: fsubr z1.s, p0/m, z1.s, z0.s +; CHECK-NEXT: fsub z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %z0_in = select %p, %z0, zeroinitializer + %sub = call @llvm.aarch64.sve.fsub.nxv4f32( %p, + %z0_in, + %z1) + %sub2 = call @llvm.aarch64.sve.fsub.nxv4f32( %p, + %z0, + %sub) + ret %sub2 +} + +define @fsub_zero_z0_z0_fsub_zero_z0_z0( %p, %z0) { +; CHECK-LABEL: fsub_zero_z0_z0_fsub_zero_z0_z0 +; CHECK: movprfx z1.s, p0/z, z0.s +; CHECK-NEXT: fsub z1.s, p0/m, z1.s, z0.s +; CHECK-NEXT: fsub z1.s, p0/m, z1.s, z1.s +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %z0_in = select %p, %z0, zeroinitializer + %sub = call @llvm.aarch64.sve.fsub.nxv4f32( %p, + %z0_in, + %z0) + %sub2 = call @llvm.aarch64.sve.fsub.nxv4f32( %p, + %z0_in, + %z0) + %sub3 = call @llvm.aarch64.sve.fsub.nxv4f32( %p, + %sub, + %sub2) + ret %sub3 +} + +define @fsub_zero_z0_z1_fsub_zero_z0_z2( %p, %z0, + %z1, %z2) { +; CHECK-LABEL: fsub_zero_z0_z1_fsub_zero_z0_z2 +; CHECK: movprfx z1.s, p0/z, z1.s +; CHECK-NEXT: fsubr z1.s, p0/m, z1.s, z0.s +; CHECK-NEXT: movprfx z0.s, p0/z, z0.s +; CHECK-NEXT: fsub z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: fsub z1.s, p0/m, z1.s, z0.s +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %z0_in = select %p, %z0, zeroinitializer + %sub = call @llvm.aarch64.sve.fsub.nxv4f32( %p, %z0_in, %z1) + %sub2 = call @llvm.aarch64.sve.fsub.nxv4f32( %p, %z0_in, %z2) + %sub3 = call @llvm.aarch64.sve.fsub.nxv4f32( %p, %sub, %sub2) + ret %sub3 +} + +define @fsub_zero_z0_z0_reuse( %p, %z0) { +; CHECK-LABEL: fsub_zero_z0_z0_reuse +; CHECK: movprfx z1.s, p0/z, z0.s +; CHECK-NEXT: fsub z1.s, p0/m, z1.s, z0.s +; CHECK-NEXT: fsub z1.s, p0/m, z1.s, z0.s +; CHECK-NEXT: z0.d, z1.d +; CHECK-NEXT: ret + %z0_in = select %p, %z0, zeroinitializer + %sub = call @llvm.aarch64.sve.fsub.nxv4f32( %p, + %z0_in, + %z0) + %sub2 = call @llvm.aarch64.sve.fsub.nxv4f32( %p, + %sub, + %z0) + ret %sub2 +} + +define @fsub_zero_reuse_z0_z0( %p, %z0) { +; CHECK-LABEL: fsub_zero_reuse_z0_z0 +; CHECK: mov z2.s, #0 +; CHECK-NEXT: sel z3.s, p0, z0.s, z2.s +; CHECK-NEXT: movprfx z1.s, p0/z, z0.s +; CHECK-NEXT: fsub z1.s, p0/m, z1.s, z0.s +; CHECK-NEXT: fsub z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %z0_in = select %p, %z0, zeroinitializer + %sub = call @llvm.aarch64.sve.fsub.nxv4f32( %p, + %z0_in, + %z0) + %sub2 = call @llvm.aarch64.sve.fsub.nxv4f32( %p, + %sub, + %z0_in) + ret %sub2 +} + +declare @llvm.aarch64.sve.fsub.nxv8f16(, , ) +declare @llvm.aarch64.sve.fsub.nxv4f32(, , ) +declare @llvm.aarch64.sve.fsub.nxv2f64(, , )
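
The new TargetPassConfig::addPostCoalesce() hook is target-independent, so backends other than AArch64 can wire it up the same way AArch64PassConfig does above. The following is a minimal sketch, assuming a hypothetical MyTargetPassConfig and createMyTargetPostCoalescePass(); neither exists in this patch, only the hook itself and the AArch64 override do:

#include "llvm/CodeGen/TargetPassConfig.h"
using namespace llvm;

// Hypothetical factory for a target-specific pass that adjusts live ranges
// after coalescing; declared here only to keep the sketch self-contained.
FunctionPass *createMyTargetPostCoalescePass();

class MyTargetPassConfig : public TargetPassConfig {
public:
  MyTargetPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
      : TargetPassConfig(TM, PM) {}

  // Called right after Simple Register Coalescing and before the machine
  // scheduler and register allocation (see the TargetPassConfig.cpp change
  // above), so the added pass may still modify live ranges.
  bool addPostCoalesce() override {
    addPass(createMyTargetPostCoalescePass());
    return false; // return value mirrors the AArch64 override above
  }
};

As with the AArch64 pass, anything added from this hook should preserve or update LiveIntervals and SlotIndexes, since the hook runs in the middle of the optimized register-allocation pipeline and later passes reuse those analyses.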