diff --git a/llvm/include/llvm/CodeGen/CallingConvLower.h b/llvm/include/llvm/CodeGen/CallingConvLower.h
--- a/llvm/include/llvm/CodeGen/CallingConvLower.h
+++ b/llvm/include/llvm/CodeGen/CallingConvLower.h
@@ -340,6 +340,11 @@
     return Regs.size();
   }
 
+  void DeallocateReg(MCPhysReg Reg) {
+    assert(isAllocated(Reg) && "Trying to deallocate an unallocated register");
+    MarkUnallocated(Reg);
+  }
+
   /// AllocateReg - Attempt to allocate one register. If it is not available,
   /// return zero. Otherwise, return the register, marking it and any aliases
   /// as allocated.
@@ -570,6 +575,8 @@
 private:
   /// MarkAllocated - Mark a register and all of its aliases as allocated.
   void MarkAllocated(MCPhysReg Reg);
+
+  void MarkUnallocated(MCPhysReg Reg);
 };
 
 } // end namespace llvm
diff --git a/llvm/include/llvm/CodeGen/TargetCallingConv.h b/llvm/include/llvm/CodeGen/TargetCallingConv.h
--- a/llvm/include/llvm/CodeGen/TargetCallingConv.h
+++ b/llvm/include/llvm/CodeGen/TargetCallingConv.h
@@ -122,10 +122,12 @@
   void setReturned() { IsReturned = 1; }
 
   bool isInConsecutiveRegs() const { return IsInConsecutiveRegs; }
-  void setInConsecutiveRegs() { IsInConsecutiveRegs = 1; }
+  void setInConsecutiveRegs(bool Flag = true) { IsInConsecutiveRegs = Flag; }
 
   bool isInConsecutiveRegsLast() const { return IsInConsecutiveRegsLast; }
-  void setInConsecutiveRegsLast() { IsInConsecutiveRegsLast = 1; }
+  void setInConsecutiveRegsLast(bool Flag = true) {
+    IsInConsecutiveRegsLast = Flag;
+  }
 
   bool isSplit() const { return IsSplit; }
   void setSplit() { IsSplit = 1; }
diff --git a/llvm/lib/CodeGen/CallingConvLower.cpp b/llvm/lib/CodeGen/CallingConvLower.cpp
--- a/llvm/lib/CodeGen/CallingConvLower.cpp
+++ b/llvm/lib/CodeGen/CallingConvLower.cpp
@@ -63,6 +63,11 @@
     UsedRegs[*AI / 32] |= 1 << (*AI & 31);
 }
 
+void CCState::MarkUnallocated(MCPhysReg Reg) {
+  for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI)
+    UsedRegs[*AI / 32] &= ~(1 << (*AI & 31));
+}
+
 bool CCState::IsShadowAllocatedReg(MCRegister Reg) const {
   if (!isAllocated(Reg))
     return false;
diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
@@ -42,6 +42,51 @@
 static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
                              MVT LocVT, ISD::ArgFlagsTy &ArgFlags,
                              CCState &State, Align SlotAlign) {
+  if (LocVT.isScalableVector()) {
+    const AArch64Subtarget &Subtarget = static_cast<const AArch64Subtarget &>(
+        State.getMachineFunction().getSubtarget());
+    const AArch64TargetLowering *TLI = Subtarget.getTargetLowering();
+
+    // We are about to reinvoke the CCAssignFn auto-generated handler. If we
+    // don't unset these flags we will get stuck in an infinite loop forever
+    // invoking the custom handler.
+    ArgFlags.setInConsecutiveRegs(false);
+    ArgFlags.setInConsecutiveRegsLast(false);
+
+    // The calling convention for passing SVE tuples states that in the event
+    // we cannot allocate enough registers for the tuple we should still leave
+    // any remaining registers unallocated. However, when we call the
+    // CCAssignFn again we want it to behave as if all remaining registers are
+    // allocated. This will force the code to pass the tuple indirectly in
+    // accordance with the PCS.
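+    //
+    // For example, an <vscale x 8 x double> tuple needs four consecutive Z
+    // registers; if fewer than four of the registers in ZRegList are still
+    // free, the tuple cannot be passed in registers at all. Marking every
+    // remaining register as allocated makes the re-run assignment below take
+    // the indirect path, and the original allocation state is restored
+    // afterwards so that later, smaller arguments can still use those
+    // registers.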
+    bool RegsAllocated[8];
+    for (int I = 0; I < 8; I++) {
+      RegsAllocated[I] = State.isAllocated(ZRegList[I]);
+      State.AllocateReg(ZRegList[I]);
+    }
+
+    auto &It = PendingMembers[0];
+    CCAssignFn *AssignFn =
+        TLI->CCAssignFnForCall(State.getCallingConv(), /*IsVarArg=*/false);
+    if (AssignFn(It.getValNo(), It.getValVT(), It.getValVT(), CCValAssign::Full,
+                 ArgFlags, State))
+      llvm_unreachable("Call operand has unhandled type");
+
+    // Return the flags to how they were before.
+    ArgFlags.setInConsecutiveRegs(true);
+    ArgFlags.setInConsecutiveRegsLast(true);
+
+    // Return the register state back to how it was before, leaving any
+    // unallocated registers available for other smaller types.
+    for (int I = 0; I < 8; I++)
+      if (!RegsAllocated[I])
+        State.DeallocateReg(ZRegList[I]);
+
+    // All pending members have now been allocated
+    PendingMembers.clear();
+    return true;
+  }
+
   unsigned Size = LocVT.getSizeInBits() / 8;
   const Align StackAlign =
       State.getMachineFunction().getDataLayout().getStackAlignment();
@@ -146,13 +191,11 @@
     return true;
   }
 
-  if (LocVT.isScalableVector())
-    report_fatal_error(
-        "Passing consecutive scalable vector registers unsupported");
-
-  // Mark all regs in the class as unavailable
-  for (auto Reg : RegList)
-    State.AllocateReg(Reg);
+  if (!LocVT.isScalableVector()) {
+    // Mark all regs in the class as unavailable
+    for (auto Reg : RegList)
+      State.AllocateReg(Reg);
+  }
 
   const Align SlotAlign = Subtarget.isTargetDarwin() ? Align(1) : Align(8);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4277,10 +4277,10 @@
     assert(!Res && "Call operand has unhandled type");
     (void)Res;
   }
-  assert(ArgLocs.size() == Ins.size());
   SmallVector<SDValue, 16> ArgValues;
-  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
-    CCValAssign &VA = ArgLocs[i];
+  unsigned ExtraArgLocs = 0;
+  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
 
     if (Ins[i].Flags.isByVal()) {
       // Byval is used for HFAs in the PCS, but the system should work in a
@@ -4408,16 +4408,44 @@
     if (VA.getLocInfo() == CCValAssign::Indirect) {
       assert(VA.getValVT().isScalableVector() &&
              "Only scalable vectors can be passed indirectly");
-      // If value is passed via pointer - do a load.
-      ArgValue =
-          DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue, MachinePointerInfo());
-    }
-
-    if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
-      ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
-                             ArgValue, DAG.getValueType(MVT::i32));
-    InVals.push_back(ArgValue);
+
+      uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinSize();
+      unsigned NumParts = 1;
+      if (Ins[i].Flags.isInConsecutiveRegs()) {
+        assert(!Ins[i].Flags.isInConsecutiveRegsLast());
+        while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
+          ++NumParts;
+      }
+
+      MVT PartLoad = VA.getValVT();
+      SDValue Ptr = ArgValue;
+
+      // Ensure we generate all loads for each tuple part, whilst updating the
+      // pointer after each load correctly using vscale.
+      while (NumParts > 0) {
+        ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
+        InVals.push_back(ArgValue);
+        NumParts--;
+        if (NumParts > 0) {
+          SDValue BytesIncrement = DAG.getVScale(
+              DL, Ptr.getValueType(),
+              APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize));
+          SDNodeFlags Flags;
+          Flags.setNoUnsignedWrap(true);
+          Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+                            BytesIncrement, Flags);
+          ExtraArgLocs++;
+          i++;
+        }
+      }
+    } else {
+      if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
+        ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
+                               ArgValue, DAG.getValueType(MVT::i32));
+      InVals.push_back(ArgValue);
+    }
   }
+  assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
 
   // varargs
   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
@@ -5015,8 +5043,9 @@
   }
 
   // Walk the register/memloc assignments, inserting copies/loads.
-  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
-    CCValAssign &VA = ArgLocs[i];
+  unsigned ExtraArgLocs = 0;
+  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
     SDValue Arg = OutVals[i];
     ISD::ArgFlagsTy Flags = Outs[i].Flags;
@@ -5058,18 +5087,49 @@
     case CCValAssign::Indirect:
       assert(VA.getValVT().isScalableVector() &&
              "Only scalable vectors can be passed indirectly");
+
+      uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinSize();
+      uint64_t PartSize = StoreSize;
+      unsigned NumParts = 1;
+      if (Outs[i].Flags.isInConsecutiveRegs()) {
+        assert(!Outs[i].Flags.isInConsecutiveRegsLast());
+        while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
+          ++NumParts;
+        StoreSize *= NumParts;
+      }
+
       MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
       Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
       Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
-      int FI = MFI.CreateStackObject(
-          VA.getValVT().getStoreSize().getKnownMinSize(), Alignment, false);
+      int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
       MFI.setStackID(FI, TargetStackID::SVEVector);
 
-      SDValue SpillSlot = DAG.getFrameIndex(
+      MachinePointerInfo MPI =
+          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
+      SDValue Ptr = DAG.getFrameIndex(
           FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
-      Chain = DAG.getStore(
-          Chain, DL, Arg, SpillSlot,
-          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+      SDValue SpillSlot = Ptr;
+
+      // Ensure we generate all stores for each tuple part, whilst updating the
+      // pointer after each store correctly using vscale.
+      while (NumParts) {
+        Chain = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
+        NumParts--;
+        if (NumParts > 0) {
+          SDValue BytesIncrement = DAG.getVScale(
+              DL, Ptr.getValueType(),
+              APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize));
+          SDNodeFlags Flags;
+          Flags.setNoUnsignedWrap(true);
+
+          MPI = MachinePointerInfo(MPI.getAddrSpace());
+          Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+                            BytesIncrement, Flags);
+          ExtraArgLocs++;
+          i++;
+        }
+      }
+
+      Arg = SpillSlot;
       break;
     }
@@ -5121,7 +5181,7 @@
     uint32_t BEAlign = 0;
     unsigned OpSize;
     if (VA.getLocInfo() == CCValAssign::Indirect)
-      OpSize = VA.getLocVT().getSizeInBits();
+      OpSize = VA.getLocVT().getFixedSizeInBits();
     else
       OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
                                : VA.getValVT().getSizeInBits();
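; Illustrative sketch (not part of the patch): the caller-side store loop added
; above lays the parts of a tuple out contiguously in one SVE stack object, one
; <vscale x 2 x double> part every 16 * vscale bytes, and then passes a pointer
; to that object. The IR below only illustrates that layout; the function and
; value names (@spill_tuple_parts, %slot, %p0-%p3) are invented for the
; example, and indexing a scalable-vector pointer by one element stands in for
; the DAG.getVScale-based pointer increments built in LowerCall.
define void @spill_tuple_parts(<vscale x 2 x double> %p0, <vscale x 2 x double> %p1,
                               <vscale x 2 x double> %p2, <vscale x 2 x double> %p3,
                               <vscale x 2 x double>* %slot) {
entry:
  ; Part 0 goes at the start of the slot.
  store <vscale x 2 x double> %p0, <vscale x 2 x double>* %slot
  ; Each further part sits one <vscale x 2 x double> (16 * vscale bytes) along.
  %a1 = getelementptr <vscale x 2 x double>, <vscale x 2 x double>* %slot, i64 1
  store <vscale x 2 x double> %p1, <vscale x 2 x double>* %a1
  %a2 = getelementptr <vscale x 2 x double>, <vscale x 2 x double>* %slot, i64 2
  store <vscale x 2 x double> %p2, <vscale x 2 x double>* %a2
  %a3 = getelementptr <vscale x 2 x double>, <vscale x 2 x double>* %slot, i64 3
  store <vscale x 2 x double> %p3, <vscale x 2 x double>* %a3
  ret void
}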
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
@@ -0,0 +1,201 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+sve | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+; Make sure callers set up the arguments correctly - tests AArch64ISelLowering::LowerCALL
+
+define float @foo1(double* %x0, double* %x1, double* %x2) nounwind {
+; CHECK-LABEL: foo1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-4
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld4d { z1.d, z2.d, z3.d, z4.d }, p0/z, [x0]
+; CHECK-NEXT:    ld4d { z16.d, z17.d, z18.d, z19.d }, p0/z, [x1]
+; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x2]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    fmov s0, #1.00000000
+; CHECK-NEXT:    mov x0, sp
+; CHECK-NEXT:    st1d { z16.d }, p0, [sp]
+; CHECK-NEXT:    st1d { z17.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1d { z18.d }, p0, [x8, #2, mul vl]
+; CHECK-NEXT:    st1d { z19.d }, p0, [x8, #3, mul vl]
+; CHECK-NEXT:    bl callee1
+; CHECK-NEXT:    addvl sp, sp, #4
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %0 = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %1 = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %0)
+  %2 = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1> %1, double* %x0)
+  %3 = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1> %1, double* %x1)
+  %4 = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.nxv2f64(<vscale x 2 x i1> %1, double* %x2)
+  %call = call float @callee1(float 1.000000e+00, <vscale x 8 x double> %2, <vscale x 8 x double> %3, <vscale x 2 x double> %4)
+  ret float %call
+}
+
+define float @foo2(double* %x0, double* %x1) nounwind {
+; CHECK-LABEL: foo2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-4
+; CHECK-NEXT:    sub sp, sp, #16 // =16
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld4d { z1.d, z2.d, z3.d, z4.d }, p0/z, [x0]
+; CHECK-NEXT:    ld4d { z16.d, z17.d, z18.d, z19.d }, p0/z, [x1]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    add x8, sp, #16 // =16
+; CHECK-NEXT:    add x9, sp, #16 // =16
+; CHECK-NEXT:    fmov s0, #1.00000000
+; CHECK-NEXT:    mov w1, #1
+; CHECK-NEXT:    mov w2, #2
+; CHECK-NEXT:    mov w3, #3
+; CHECK-NEXT:    mov w4, #4
+; CHECK-NEXT:    mov w5, #5
+; CHECK-NEXT:    mov w6, #6
+; CHECK-NEXT:    mov w7, #7
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    st1d { z16.d }, p0, [x9]
+; CHECK-NEXT:    st1d { z17.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1d { z18.d }, p0, [x8, #2, mul vl]
+; CHECK-NEXT:    st1d { z19.d }, p0, [x8, #3, mul vl]
+; CHECK-NEXT:    str x8, [sp]
+; CHECK-NEXT:    bl callee2
+; CHECK-NEXT:    addvl sp, sp, #4
+; CHECK-NEXT:    add sp, sp, #16 // =16
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %0 = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %1 = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %0)
+  %2 = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1> %1, double* %x0)
+  %3 = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1> %1, double* %x1)
+  %call = call float @callee2(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, float 1.000000e+00, <vscale x 8 x double> %2, <vscale x 8 x double> %3)
+  ret float %call
+}
+
+define float @foo3(double* %x0, double* %x1, double* %x2) nounwind {
+; CHECK-LABEL: foo3:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld4d { z2.d, z3.d, z4.d, z5.d }, p0/z, [x0]
+; CHECK-NEXT:    ld3d { z16.d, z17.d, z18.d }, p0/z, [x1]
+; CHECK-NEXT:    ld1d { z6.d }, p0/z, [x2]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    fmov s0, #1.00000000
+; CHECK-NEXT:    fmov s1, #2.00000000
+; CHECK-NEXT:    mov x0, sp
+; CHECK-NEXT:    st1d { z16.d }, p0, [sp]
+; CHECK-NEXT:    st1d { z17.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1d { z18.d }, p0, [x8, #2, mul vl]
+; CHECK-NEXT:    bl callee3
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %0 = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %1 = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %0)
+  %2 = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1> %1, double* %x0)
+  %3 = call <vscale x 6 x double> @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1(<vscale x 2 x i1> %1, double* %x1)
+  %4 = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.nxv2f64(<vscale x 2 x i1> %1, double* %x2)
+  %call = call float @callee3(float 1.000000e+00, float 2.000000e+00, <vscale x 8 x double> %2, <vscale x 6 x double> %3, <vscale x 2 x double> %4)
+  ret float %call
+}
+
+; Make sure callees read the arguments correctly - tests AArch64ISelLowering::LowerFormalArguments
+
+define double @foo4(double %x0, double * %ptr1, double * %ptr2, double * %ptr3, <vscale x 8 x double> %x1, <vscale x 8 x double> %x2, <vscale x 2 x double> %x3) nounwind {
+; CHECK-LABEL: foo4:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1d { z6.d }, p0/z, [x3, #1, mul vl]
+; CHECK-NEXT:    ld1d { z7.d }, p0/z, [x3]
+; CHECK-NEXT:    ld1d { z24.d }, p0/z, [x3, #3, mul vl]
+; CHECK-NEXT:    ld1d { z25.d }, p0/z, [x3, #2, mul vl]
+; CHECK-NEXT:    st1d { z4.d }, p0, [x0, #3, mul vl]
+; CHECK-NEXT:    st1d { z3.d }, p0, [x0, #2, mul vl]
+; CHECK-NEXT:    st1d { z2.d }, p0, [x0, #1, mul vl]
+; CHECK-NEXT:    st1d { z1.d }, p0, [x0]
+; CHECK-NEXT:    st1d { z25.d }, p0, [x1, #2, mul vl]
+; CHECK-NEXT:    st1d { z24.d }, p0, [x1, #3, mul vl]
+; CHECK-NEXT:    st1d { z7.d }, p0, [x1]
+; CHECK-NEXT:    st1d { z6.d }, p0, [x1, #1, mul vl]
+; CHECK-NEXT:    st1d { z5.d }, p0, [x2]
+; CHECK-NEXT:    ret
+entry:
+  %ptr1.bc = bitcast double * %ptr1 to <vscale x 8 x double> *
+  store volatile <vscale x 8 x double> %x1, <vscale x 8 x double>* %ptr1.bc
+  %ptr2.bc = bitcast double * %ptr2 to <vscale x 8 x double> *
+  store volatile <vscale x 8 x double> %x2, <vscale x 8 x double>* %ptr2.bc
+  %ptr3.bc = bitcast double * %ptr3 to <vscale x 2 x double> *
+  store volatile <vscale x 2 x double> %x3, <vscale x 2 x double>* %ptr3.bc
+  ret double %x0
+}
+
+define double @foo5(i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, double * %ptr1, double * %ptr2, double %x0, <vscale x 8 x double> %x1, <vscale x 8 x double> %x2) nounwind {
+; CHECK-LABEL: foo5:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr x8, [sp]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x8, #1, mul vl]
+; CHECK-NEXT:    ld1d { z6.d }, p0/z, [x8]
+; CHECK-NEXT:    ld1d { z7.d }, p0/z, [x8, #3, mul vl]
+; CHECK-NEXT:    ld1d { z24.d }, p0/z, [x8, #2, mul vl]
+; CHECK-NEXT:    st1d { z4.d }, p0, [x6, #3, mul vl]
+; CHECK-NEXT:    st1d { z3.d }, p0, [x6, #2, mul vl]
+; CHECK-NEXT:    st1d { z2.d }, p0, [x6, #1, mul vl]
+; CHECK-NEXT:    st1d { z1.d }, p0, [x6]
+; CHECK-NEXT:    st1d { z24.d }, p0, [x7, #2, mul vl]
+; CHECK-NEXT:    st1d { z7.d }, p0, [x7, #3, mul vl]
+; CHECK-NEXT:    st1d { z6.d }, p0, [x7]
+; CHECK-NEXT:    st1d { z5.d }, p0, [x7, #1, mul vl]
+; CHECK-NEXT:    ret
+entry:
+  %ptr1.bc = bitcast double * %ptr1 to <vscale x 8 x double> *
+  store volatile <vscale x 8 x double> %x1, <vscale x 8 x double>* %ptr1.bc
+  %ptr2.bc = bitcast double * %ptr2 to <vscale x 8 x double> *
+  store volatile <vscale x 8 x double> %x2, <vscale x 8 x double>* %ptr2.bc
+  ret double %x0
+}
+
+define double @foo6(double %x0, double %x1, double * %ptr1, double * %ptr2, <vscale x 8 x double> %x2, <vscale x 6 x double> %x3) nounwind {
+; CHECK-LABEL: foo6:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x2]
+; CHECK-NEXT:    ld1d { z6.d }, p0/z, [x2, #2, mul vl]
+; CHECK-NEXT:    ld1d { z7.d }, p0/z, [x2, #1, mul vl]
+; CHECK-NEXT:    st1d { z5.d }, p0, [x0, #3, mul vl]
+; CHECK-NEXT:    st1d { z4.d }, p0, [x0, #2, mul vl]
+; CHECK-NEXT:    st1d { z3.d }, p0, [x0, #1, mul vl]
+; CHECK-NEXT:    st1d { z2.d }, p0, [x0]
+; CHECK-NEXT:    st1d { z7.d }, p0, [x1, #1, mul vl]
+; CHECK-NEXT:    st1d { z6.d }, p0, [x1, #2, mul vl]
+; CHECK-NEXT:    st1d { z1.d }, p0, [x1]
+; CHECK-NEXT:    ret
+entry:
+  %ptr1.bc = bitcast double * %ptr1 to <vscale x 8 x double> *
+  store volatile <vscale x 8 x double> %x2, <vscale x 8 x double>* %ptr1.bc
+  %ptr2.bc = bitcast double * %ptr2 to <vscale x 6 x double> *
+  store volatile <vscale x 6 x double> %x3, <vscale x 6 x double>* %ptr2.bc
+  ret double %x0
+}
+
+declare float @callee1(float, <vscale x 8 x double>, <vscale x 8 x double>, <vscale x 2 x double>)
+declare float @callee2(i32, i32, i32, i32, i32, i32, i32, i32, float, <vscale x 8 x double>, <vscale x 8 x double>)
+declare float @callee3(float, float, <vscale x 8 x double>, <vscale x 6 x double>, <vscale x 2 x double>)
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 immarg)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1>)
+declare <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1>, double*)
+declare <vscale x 6 x double> @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1(<vscale x 2 x i1>, double*)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.nxv2f64(<vscale x 2 x i1>, double*)
+declare double @llvm.aarch64.sve.faddv.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.tuple.get.nxv2f64.nxv8f64(<vscale x 8 x double>, i32 immarg)
+declare <vscale x 2 x double> @llvm.aarch64.sve.tuple.get.nxv2f64.nxv6f64(<vscale x 6 x double>, i32 immarg)
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-tuples-broken.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-tuples-broken.ll
deleted file mode 100644
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention-tuples-broken.ll
+++ /dev/null
@@ -1,23 +0,0 @@
-; RUN: not --crash llc < %s -mtriple aarch64-linux-gnu -mattr=+sve >/dev/null 2>%t
-; RUN: FileCheck %s < %t
-
-; CHECK: Passing consecutive scalable vector registers unsupported
-
-target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64-unknown-linux-gnu"
-
-define float @foo(double* %x0, double* %x1) {
-entry:
-  %0 = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
-  %1 = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %0)
-  %2 = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1> %1, double* %x0)
-  %3 = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1> %1, double* %x1)
-  %call = call float @callee(float 1.000000e+00, <vscale x 8 x double> %2, <vscale x 8 x double> %3)
-  ret float %call
-}
-
-declare float @callee(float, <vscale x 8 x double>, <vscale x 8 x double>)
-
-declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 immarg)
-declare <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1>)
-declare <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1>, double*)
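; Illustrative sketch (not part of the patch): the callee-side mirror of the
; LowerFormalArguments change earlier in this patch -- each part of an
; indirectly passed tuple is loaded back from the incoming pointer at a
; vscale-scaled offset. The function and value names (@load_second_part,
; %indirect) are invented for the example.
define <vscale x 2 x double> @load_second_part(<vscale x 2 x double>* %indirect) {
entry:
  ; The second part lives one <vscale x 2 x double> (16 * vscale bytes) past
  ; the start of the indirect argument.
  %addr = getelementptr <vscale x 2 x double>, <vscale x 2 x double>* %indirect, i64 1
  %part = load <vscale x 2 x double>, <vscale x 2 x double>* %addr
  ret <vscale x 2 x double> %part
}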