Index: llvm/include/llvm/CodeGen/CallingConvLower.h
===================================================================
--- llvm/include/llvm/CodeGen/CallingConvLower.h
+++ llvm/include/llvm/CodeGen/CallingConvLower.h
@@ -340,6 +340,11 @@
     return Regs.size();
   }
 
+  void DeallocateReg(MCPhysReg Reg) {
+    assert(isAllocated(Reg) && "Trying to deallocate an unallocated register");
+    MarkUnallocated(Reg);
+  }
+
   /// AllocateReg - Attempt to allocate one register. If it is not available,
   /// return zero. Otherwise, return the register, marking it and any aliases
   /// as allocated.
@@ -570,6 +575,8 @@
 private:
   /// MarkAllocated - Mark a register and all of its aliases as allocated.
   void MarkAllocated(MCPhysReg Reg);
+
+  void MarkUnallocated(MCPhysReg Reg);
 };
 
 } // end namespace llvm
Index: llvm/include/llvm/CodeGen/TargetCallingConv.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetCallingConv.h
+++ llvm/include/llvm/CodeGen/TargetCallingConv.h
@@ -122,10 +122,12 @@
   void setReturned() { IsReturned = 1; }
 
   bool isInConsecutiveRegs() const { return IsInConsecutiveRegs; }
-  void setInConsecutiveRegs() { IsInConsecutiveRegs = 1; }
+  void setInConsecutiveRegs(bool Flag = true) { IsInConsecutiveRegs = Flag; }
 
   bool isInConsecutiveRegsLast() const { return IsInConsecutiveRegsLast; }
-  void setInConsecutiveRegsLast() { IsInConsecutiveRegsLast = 1; }
+  void setInConsecutiveRegsLast(bool Flag = true) {
+    IsInConsecutiveRegsLast = Flag;
+  }
 
   bool isSplit() const { return IsSplit; }
   void setSplit() { IsSplit = 1; }
Index: llvm/lib/CodeGen/CallingConvLower.cpp
===================================================================
--- llvm/lib/CodeGen/CallingConvLower.cpp
+++ llvm/lib/CodeGen/CallingConvLower.cpp
@@ -63,6 +63,11 @@
     UsedRegs[*AI / 32] |= 1 << (*AI & 31);
 }
 
+void CCState::MarkUnallocated(MCPhysReg Reg) {
+  for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI)
+    UsedRegs[*AI / 32] &= ~(1 << (*AI & 31));
+}
+
 bool CCState::IsShadowAllocatedReg(MCRegister Reg) const {
   if (!isAllocated(Reg))
     return false;
Index: llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
+++ llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
@@ -42,6 +42,51 @@
 static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
                              MVT LocVT, ISD::ArgFlagsTy &ArgFlags,
                              CCState &State, Align SlotAlign) {
+  if (LocVT.isScalableVector()) {
+    const AArch64Subtarget &Subtarget = static_cast<const AArch64Subtarget &>(
+        State.getMachineFunction().getSubtarget());
+    const AArch64TargetLowering *TLI = Subtarget.getTargetLowering();
+
+    // We are about to reinvoke the CCAssignFn auto-generated handler. If we
+    // don't unset these flags we will get stuck in an infinite loop forever
+    // invoking the custom handler.
+    ArgFlags.setInConsecutiveRegs(false);
+    ArgFlags.setInConsecutiveRegsLast(false);
+
+    // The calling convention for passing SVE tuples states that in the event
+    // we cannot allocate enough registers for the tuple we should still leave
+    // any remaining registers unallocated. However, when we call the
+    // CCAssignFn again we want it to behave as if all remaining registers are
+    // allocated. This will force the code to pass the tuple indirectly in
+    // accordance with the PCS.
+    bool RegsAllocated[8];
+    for (int I = 0; I < 8; I++) {
+      RegsAllocated[I] = State.isAllocated(ZRegList[I]);
+      State.AllocateReg(ZRegList[I]);
+    }
+
+    auto &It = PendingMembers[0];
+    CCAssignFn *AssignFn =
+        TLI->CCAssignFnForCall(State.getCallingConv(), /*IsVarArg=*/false);
+    if (AssignFn(It.getValNo(), It.getValVT(), It.getValVT(), CCValAssign::Full,
+                 ArgFlags, State))
+      llvm_unreachable("Call operand has unhandled type");
+
+    // Return the flags to how they were before.
+    ArgFlags.setInConsecutiveRegs(true);
+    ArgFlags.setInConsecutiveRegsLast(true);
+
+    // Return the register state back to how it was before, leaving any
+    // unallocated registers available for other smaller types.
+    for (int I = 0; I < 8; I++)
+      if (!RegsAllocated[I])
+        State.DeallocateReg(ZRegList[I]);
+
+    // All pending members have now been allocated
+    PendingMembers.clear();
+    return true;
+  }
+
   unsigned Size = LocVT.getSizeInBits() / 8;
   const Align StackAlign =
       State.getMachineFunction().getDataLayout().getStackAlignment();
@@ -146,13 +191,11 @@
     return true;
   }
 
-  if (LocVT.isScalableVector())
-    report_fatal_error(
-        "Passing consecutive scalable vector registers unsupported");
-
-  // Mark all regs in the class as unavailable
-  for (auto Reg : RegList)
-    State.AllocateReg(Reg);
+  if (!LocVT.isScalableVector()) {
+    // Mark all regs in the class as unavailable
+    for (auto Reg : RegList)
+      State.AllocateReg(Reg);
+  }
 
   const Align SlotAlign = Subtarget.isTargetDarwin() ? Align(1) : Align(8);
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4167,16 +4167,16 @@
     assert(!Res && "Call operand has unhandled type");
     (void)Res;
   }
-  assert(ArgLocs.size() == Ins.size());
   SmallVector<SDValue, 16> ArgValues;
-  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+  unsigned ExtraArgLocs = 0;
+  for (unsigned i = 0, j = 0, e = ArgLocs.size(); i != e; ++i, ++j) {
     CCValAssign &VA = ArgLocs[i];
 
-    if (Ins[i].Flags.isByVal()) {
+    if (Ins[j].Flags.isByVal()) {
       // Byval is used for HFAs in the PCS, but the system should work in a
      // non-compliant manner for larger structs.
       EVT PtrVT = getPointerTy(DAG.getDataLayout());
-      int Size = Ins[i].Flags.getByValSize();
+      int Size = Ins[j].Flags.getByValSize();
       unsigned NumRegs = (Size + 7) / 8;
 
       // FIXME: This works on big-endian for composite byvals, which are the common
@@ -4253,7 +4253,7 @@
       uint32_t BEAlign = 0;
       if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
-          !Ins[i].Flags.isInConsecutiveRegs())
+          !Ins[j].Flags.isInConsecutiveRegs())
         BEAlign = 8 - ArgSize;
 
       int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
@@ -4298,16 +4298,42 @@
     if (VA.getLocInfo() == CCValAssign::Indirect) {
       assert(VA.getValVT().isScalableVector() &&
              "Only scalable vectors can be passed indirectly");
-      // If value is passed via pointer - do a load.
-      ArgValue =
-          DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue, MachinePointerInfo());
-    }
-    if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
-      ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
-                             ArgValue, DAG.getValueType(MVT::i32));
-    InVals.push_back(ArgValue);
+      uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinSize();
+      unsigned NumParts = 1;
+      if (Ins[j].Flags.isInConsecutiveRegs()) {
+        assert(!Ins[j].Flags.isInConsecutiveRegsLast());
+        while (!Ins[j + NumParts - 1].Flags.isInConsecutiveRegsLast())
+          ++NumParts;
+      }
+
+      MVT PartLoad = VA.getValVT();
+      SDValue Ptr = ArgValue;
+
+      while (NumParts > 0) {
+        ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
+        InVals.push_back(ArgValue);
+        NumParts--;
+        if (NumParts > 0) {
+          SDValue BytesIncrement = DAG.getVScale(
+              DL, Ptr.getValueType(),
+              APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize));
+          SDNodeFlags Flags;
+          Flags.setNoUnsignedWrap(true);
+          Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+                            BytesIncrement, Flags);
+          ExtraArgLocs++;
+          j++;
+        }
+      }
+    } else {
+      if (Subtarget->isTargetILP32() && Ins[j].Flags.isPointer())
+        ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
+                               ArgValue, DAG.getValueType(MVT::i32));
+      InVals.push_back(ArgValue);
+    }
   }
+  assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
 
   // varargs
   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
@@ -4905,10 +4931,10 @@
   }
 
   // Walk the register/memloc assignments, inserting copies/loads.
-  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+  for (unsigned i = 0, j = 0, e = ArgLocs.size(); i != e; ++i, ++j) {
     CCValAssign &VA = ArgLocs[i];
-    SDValue Arg = OutVals[i];
-    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+    SDValue Arg = OutVals[j];
+    ISD::ArgFlagsTy Flags = Outs[j].Flags;
 
     // Promote the value if needed.
     switch (VA.getLocInfo()) {
@@ -4948,18 +4974,46 @@
     case CCValAssign::Indirect:
       assert(VA.getValVT().isScalableVector() &&
              "Only scalable vectors can be passed indirectly");
+
+      uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinSize();
+      uint64_t PartSize = StoreSize;
+      unsigned NumParts = 1;
+      if (Outs[j].Flags.isInConsecutiveRegs()) {
+        assert(!Outs[j].Flags.isInConsecutiveRegsLast());
+        while (!Outs[j + NumParts - 1].Flags.isInConsecutiveRegsLast())
+          ++NumParts;
+        StoreSize *= NumParts;
+      }
+
       MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
       Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
       Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
-      int FI = MFI.CreateStackObject(
-          VA.getValVT().getStoreSize().getKnownMinSize(), Alignment, false);
+      int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
       MFI.setStackID(FI, TargetStackID::SVEVector);
 
-      SDValue SpillSlot = DAG.getFrameIndex(
+      MachinePointerInfo MPI =
+          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
+      SDValue Ptr = DAG.getFrameIndex(
           FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
-      Chain = DAG.getStore(
-          Chain, DL, Arg, SpillSlot,
-          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+      SDValue SpillSlot = Ptr;
+
+      while (NumParts) {
+        Chain = DAG.getStore(Chain, DL, OutVals[j], Ptr, MPI);
+        NumParts--;
+        if (NumParts > 0) {
+          SDValue BytesIncrement = DAG.getVScale(
+              DL, Ptr.getValueType(),
+              APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize));
+          SDNodeFlags Flags;
+          Flags.setNoUnsignedWrap(true);
+
+          MPI = MachinePointerInfo(MPI.getAddrSpace());
+          Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+                            BytesIncrement, Flags);
+          j++;
+        }
+      }
+      Arg = SpillSlot;
       break;
     }
@@ -5011,7 +5065,7 @@
       uint32_t BEAlign = 0;
       unsigned OpSize;
       if (VA.getLocInfo() == CCValAssign::Indirect)
-        OpSize = VA.getLocVT().getSizeInBits();
+        OpSize = VA.getLocVT().getFixedSizeInBits();
       else
         OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
                                  : VA.getValVT().getSizeInBits();
Index: llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
@@ -0,0 +1,210 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+sve | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+; Make sure callers set up the arguments correctly - tests AArch64ISelLowering::LowerCALL
+
+define float @foo1(double* %x0, double* %x1, double* %x2) {
+; CHECK-LABEL: foo1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-4
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld4d { z1.d, z2.d, z3.d, z4.d }, p0/z, [x0]
+; CHECK-NEXT:    ld4d { z16.d, z17.d, z18.d, z19.d }, p0/z, [x1]
+; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x2]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    fmov s0, #1.00000000
+; CHECK-NEXT:    mov x0, sp
+; CHECK-NEXT:    st1d { z16.d }, p0, [sp]
+; CHECK-NEXT:    st1d { z17.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1d { z18.d }, p0, [x8, #2, mul vl]
+; CHECK-NEXT:    st1d { z19.d }, p0, [x8, #3, mul vl]
+; CHECK-NEXT:    bl callee1
+; CHECK-NEXT:    addvl sp, sp, #4
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %0 = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %1 = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %0)
+  %2 = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1> %1, double* %x0)
+  %3 = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1> %1, double* %x1)
+  %4 = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.nxv2f64(<vscale x 2 x i1> %1, double* %x2)
+  %call = call float @callee1(float 1.000000e+00, <vscale x 8 x double> %2, <vscale x 8 x double> %3, <vscale x 2 x double> %4)
+  ret float %call
+}
+
+define float @foo2(double* %x0, double* %x1) {
+; CHECK-LABEL: foo2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-4
+; CHECK-NEXT:    sub sp, sp, #16 // =16
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 32 * VG
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld4d { z1.d, z2.d, z3.d, z4.d }, p0/z, [x0]
+; CHECK-NEXT:    ld4d { z16.d, z17.d, z18.d, z19.d }, p0/z, [x1]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    add x8, sp, #16 // =16
+; CHECK-NEXT:    add x9, sp, #16 // =16
+; CHECK-NEXT:    fmov s0, #1.00000000
+; CHECK-NEXT:    mov w1, #1
+; CHECK-NEXT:    mov w2, #2
+; CHECK-NEXT:    mov w3, #3
+; CHECK-NEXT:    mov w4, #4
+; CHECK-NEXT:    mov w5, #5
+; CHECK-NEXT:    mov w6, #6
+; CHECK-NEXT:    mov w7, #7
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    st1d { z16.d }, p0, [x9]
+; CHECK-NEXT:    st1d { z17.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1d { z18.d }, p0, [x8, #2, mul vl]
+; CHECK-NEXT:    st1d { z19.d }, p0, [x8, #3, mul vl]
+; CHECK-NEXT:    str x8, [sp]
+; CHECK-NEXT:    bl callee2
+; CHECK-NEXT:    addvl sp, sp, #4
+; CHECK-NEXT:    add sp, sp, #16 // =16
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %0 = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %1 = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %0)
+  %2 = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1> %1, double* %x0)
+  %3 = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1> %1, double* %x1)
+  %call = call float @callee2(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, float 1.000000e+00, <vscale x 8 x double> %2, <vscale x 8 x double> %3)
+  ret float %call
+}
+
+define float @foo3(double* %x0, double* %x1, double* %x2) {
+; CHECK-LABEL: foo3:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld4d { z2.d, z3.d, z4.d, z5.d }, p0/z, [x0]
+; CHECK-NEXT:    ld3d { z16.d, z17.d, z18.d }, p0/z, [x1]
+; CHECK-NEXT:    ld1d { z6.d }, p0/z, [x2]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    fmov s0, #1.00000000
+; CHECK-NEXT:    fmov s1, #2.00000000
+; CHECK-NEXT:    mov x0, sp
+; CHECK-NEXT:    st1d { z16.d }, p0, [sp]
+; CHECK-NEXT:    st1d { z17.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1d { z18.d }, p0, [x8, #2, mul vl]
+; CHECK-NEXT:    bl callee3
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %0 = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %1 = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %0)
+  %2 = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1> %1, double* %x0)
+  %3 = call <vscale x 6 x double> @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1(<vscale x 2 x i1> %1, double* %x1)
+  %4 = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.nxv2f64(<vscale x 2 x i1> %1, double* %x2)
+  %call = call float @callee3(float 1.000000e+00, float 2.000000e+00, <vscale x 8 x double> %2, <vscale x 6 x double> %3, <vscale x 2 x double> %4)
+  ret float %call
+}
+
+; Make sure callees read the arguments correctly - tests AArch64ISelLowering::LowerFormalArguments
+
+define double @foo4(double %x0, double * %ptr1, double * %ptr2, double * %ptr3, <vscale x 8 x double> %x1, <vscale x 8 x double> %x2, <vscale x 2 x double> %x3) {
+; CHECK-LABEL: foo4:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1d { z6.d }, p0/z, [x3, #1, mul vl]
+; CHECK-NEXT:    ld1d { z7.d }, p0/z, [x3]
+; CHECK-NEXT:    ld1d { z24.d }, p0/z, [x3, #3, mul vl]
+; CHECK-NEXT:    ld1d { z25.d }, p0/z, [x3, #2, mul vl]
+; CHECK-NEXT:    st1d { z4.d }, p0, [x0, #3, mul vl]
+; CHECK-NEXT:    st1d { z3.d }, p0, [x0, #2, mul vl]
+; CHECK-NEXT:    st1d { z2.d }, p0, [x0, #1, mul vl]
+; CHECK-NEXT:    st1d { z1.d }, p0, [x0]
+; CHECK-NEXT:    st1d { z25.d }, p0, [x1, #2, mul vl]
+; CHECK-NEXT:    st1d { z24.d }, p0, [x1, #3, mul vl]
+; CHECK-NEXT:    st1d { z7.d }, p0, [x1]
+; CHECK-NEXT:    st1d { z6.d }, p0, [x1, #1, mul vl]
+; CHECK-NEXT:    st1d { z5.d }, p0, [x2]
+; CHECK-NEXT:    ret
+entry:
+  %ptr1.bc = bitcast double * %ptr1 to <vscale x 8 x double> *
+  store volatile <vscale x 8 x double> %x1, <vscale x 8 x double>* %ptr1.bc
+  %ptr2.bc = bitcast double * %ptr2 to <vscale x 8 x double> *
+  store volatile <vscale x 8 x double> %x2, <vscale x 8 x double>* %ptr2.bc
+  %ptr3.bc = bitcast double * %ptr3 to <vscale x 2 x double> *
+  store volatile <vscale x 2 x double> %x3, <vscale x 2 x double>* %ptr3.bc
+  ret double %x0
+}
+
+define double @foo5(i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, double * %ptr1, double * %ptr2, double %x0, <vscale x 8 x double> %x1, <vscale x 8 x double> %x2) {
+; CHECK-LABEL: foo5:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr x8, [sp]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x8, #1, mul vl]
+; CHECK-NEXT:    ld1d { z6.d }, p0/z, [x8]
+; CHECK-NEXT:    ld1d { z7.d }, p0/z, [x8, #3, mul vl]
+; CHECK-NEXT:    ld1d { z24.d }, p0/z, [x8, #2, mul vl]
+; CHECK-NEXT:    st1d { z4.d }, p0, [x6, #3, mul vl]
+; CHECK-NEXT:    st1d { z3.d }, p0, [x6, #2, mul vl]
+; CHECK-NEXT:    st1d { z2.d }, p0, [x6, #1, mul vl]
+; CHECK-NEXT:    st1d { z1.d }, p0, [x6]
+; CHECK-NEXT:    st1d { z24.d }, p0, [x7, #2, mul vl]
+; CHECK-NEXT:    st1d { z7.d }, p0, [x7, #3, mul vl]
+; CHECK-NEXT:    st1d { z6.d }, p0, [x7]
+; CHECK-NEXT:    st1d { z5.d }, p0, [x7, #1, mul vl]
+; CHECK-NEXT:    ret
+entry:
+  %ptr1.bc = bitcast double * %ptr1 to <vscale x 8 x double> *
+  store volatile <vscale x 8 x double> %x1, <vscale x 8 x double>* %ptr1.bc
+  %ptr2.bc = bitcast double * %ptr2 to <vscale x 8 x double> *
+  store volatile <vscale x 8 x double> %x2, <vscale x 8 x double>* %ptr2.bc
+  ret double %x0
+}
+
+define double @foo6(double %x0, double %x1, double * %ptr1,
+                    double * %ptr2, <vscale x 8 x double> %x2, <vscale x 6 x double> %x3) {
+; CHECK-LABEL: foo6:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x2]
+; CHECK-NEXT:    ld1d { z6.d }, p0/z, [x2, #2, mul vl]
+; CHECK-NEXT:    ld1d { z7.d }, p0/z, [x2, #1, mul vl]
+; CHECK-NEXT:    st1d { z5.d }, p0, [x0, #3, mul vl]
+; CHECK-NEXT:    st1d { z4.d }, p0, [x0, #2, mul vl]
+; CHECK-NEXT:    st1d { z3.d }, p0, [x0, #1, mul vl]
+; CHECK-NEXT:    st1d { z2.d }, p0, [x0]
+; CHECK-NEXT:    st1d { z7.d }, p0, [x1, #1, mul vl]
+; CHECK-NEXT:    st1d { z6.d }, p0, [x1, #2, mul vl]
+; CHECK-NEXT:    st1d { z1.d }, p0, [x1]
+; CHECK-NEXT:    ret
+entry:
+  %ptr1.bc = bitcast double * %ptr1 to <vscale x 8 x double> *
+  store volatile <vscale x 8 x double> %x2, <vscale x 8 x double>* %ptr1.bc
+  %ptr2.bc = bitcast double * %ptr2 to <vscale x 6 x double> *
+  store volatile <vscale x 6 x double> %x3, <vscale x 6 x double>* %ptr2.bc
+  ret double %x0
+}
+
+declare float @callee1(float, <vscale x 8 x double>, <vscale x 8 x double>, <vscale x 2 x double>)
+declare float @callee2(i32, i32, i32, i32, i32, i32, i32, i32, float, <vscale x 8 x double>, <vscale x 8 x double>)
+declare float @callee3(float, float, <vscale x 8 x double>, <vscale x 6 x double>, <vscale x 2 x double>)
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 immarg)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1>)
+declare <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1>, double*)
+declare <vscale x 6 x double> @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1(<vscale x 2 x i1>, double*)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.nxv2f64(<vscale x 2 x i1>, double*)
+declare double @llvm.aarch64.sve.faddv.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.tuple.get.nxv2f64.nxv8f64(<vscale x 8 x double>, i32 immarg)
+declare <vscale x 2 x double> @llvm.aarch64.sve.tuple.get.nxv2f64.nxv6f64(<vscale x 6 x double>, i32 immarg)
Index: llvm/test/CodeGen/AArch64/sve-calling-convention-tuples-broken.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-calling-convention-tuples-broken.ll
+++ /dev/null
@@ -1,23 +0,0 @@
-; RUN: not --crash llc < %s -mtriple aarch64-linux-gnu -mattr=+sve >/dev/null 2>%t
-; RUN: FileCheck %s < %t
-
-; CHECK: Passing consecutive scalable vector registers unsupported
-
-target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64-unknown-linux-gnu"
-
-define float @foo(double* %x0, double* %x1) {
-entry:
-  %0 = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
-  %1 = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %0)
-  %2 = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1> %1, double* %x0)
-  %3 = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1> %1, double* %x1)
-  %call = call float @callee(float 1.000000e+00, <vscale x 8 x double> %2, <vscale x 8 x double> %3)
-  ret float %call
-}
-
-declare float @callee(float, <vscale x 8 x double>, <vscale x 8 x double>)
-
-declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 immarg)
-declare <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1>)
-declare <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1>, double*)