Index: llvm/include/llvm/CodeGen/CallingConvLower.h
===================================================================
--- llvm/include/llvm/CodeGen/CallingConvLower.h
+++ llvm/include/llvm/CodeGen/CallingConvLower.h
@@ -340,6 +340,11 @@
     return Regs.size();
   }
 
+  void DeallocateReg(MCPhysReg Reg) {
+    assert(isAllocated(Reg) && "Trying to deallocate an unallocated register");
+    MarkUnallocated(Reg);
+  }
+
   /// AllocateReg - Attempt to allocate one register. If it is not available,
   /// return zero. Otherwise, return the register, marking it and any aliases
   /// as allocated.
@@ -570,6 +575,8 @@
 private:
   /// MarkAllocated - Mark a register and all of its aliases as allocated.
   void MarkAllocated(MCPhysReg Reg);
+
+  void MarkUnallocated(MCPhysReg Reg);
 };
 
 } // end namespace llvm
Index: llvm/include/llvm/CodeGen/TargetCallingConv.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetCallingConv.h
+++ llvm/include/llvm/CodeGen/TargetCallingConv.h
@@ -122,10 +122,12 @@
   void setReturned() { IsReturned = 1; }
 
   bool isInConsecutiveRegs() const { return IsInConsecutiveRegs; }
-  void setInConsecutiveRegs() { IsInConsecutiveRegs = 1; }
+  void setInConsecutiveRegs(bool Flag = true) { IsInConsecutiveRegs = Flag; }
 
   bool isInConsecutiveRegsLast() const { return IsInConsecutiveRegsLast; }
-  void setInConsecutiveRegsLast() { IsInConsecutiveRegsLast = 1; }
+  void setInConsecutiveRegsLast(bool Flag = true) {
+    IsInConsecutiveRegsLast = Flag;
+  }
 
   bool isSplit() const { return IsSplit; }
   void setSplit() { IsSplit = 1; }
Index: llvm/lib/CodeGen/CallingConvLower.cpp
===================================================================
--- llvm/lib/CodeGen/CallingConvLower.cpp
+++ llvm/lib/CodeGen/CallingConvLower.cpp
@@ -63,6 +63,11 @@
     UsedRegs[*AI / 32] |= 1 << (*AI & 31);
 }
 
+void CCState::MarkUnallocated(MCPhysReg Reg) {
+  for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI)
+    UsedRegs[*AI / 32] &= ~(1 << (*AI & 31));
+}
+
 bool CCState::IsShadowAllocatedReg(MCRegister Reg) const {
   if (!isAllocated(Reg))
     return false;
Index: llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
+++ llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
@@ -42,6 +42,40 @@
 static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
                              MVT LocVT, ISD::ArgFlagsTy &ArgFlags,
                              CCState &State, Align SlotAlign) {
+  if (LocVT.isScalableVector()) {
+    const AArch64Subtarget &Subtarget = static_cast<const AArch64Subtarget &>(
+        State.getMachineFunction().getSubtarget());
+    const AArch64TargetLowering *TLI = Subtarget.getTargetLowering();
+
+    ArgFlags.setInConsecutiveRegs(false);
+    ArgFlags.setInConsecutiveRegsLast(false);
+
+    bool RegsAllocated[8];
+    for (int i = 0; i < 8; i++) {
+      RegsAllocated[i] = State.isAllocated(ZRegList[i]);
+      State.AllocateReg(ZRegList[i]);
+    }
+
+    auto &It = PendingMembers[0];
+    CCAssignFn *AssignFn =
+        TLI->CCAssignFnForCall(State.getCallingConv(), /*IsVarArg=*/false);
+    bool Res = AssignFn(It.getValNo(), It.getValVT(), It.getValVT(),
+                        CCValAssign::Full, ArgFlags, State);
+    assert(!Res && "Call operand has unhandled type");
+    (void)Res;
+
+    ArgFlags.setInConsecutiveRegs(true);
+    ArgFlags.setInConsecutiveRegsLast(true);
+
+    for (int i = 0; i < 8; i++)
+      if (!RegsAllocated[i])
+        State.DeallocateReg(ZRegList[i]);
+
+    // All pending members have now been allocated
+    PendingMembers.clear();
+    return true;
+  }
+
   unsigned Size = LocVT.getSizeInBits() / 8;
   const Align StackAlign =
       State.getMachineFunction().getDataLayout().getStackAlignment();
@@ -146,13 +180,11 @@
     return true;
   }
 
-  if (LocVT.isScalableVector())
-    report_fatal_error(
-        "Passing consecutive scalable vector registers unsupported");
-
-  // Mark all regs in the class as unavailable
-  for (auto Reg : RegList)
-    State.AllocateReg(Reg);
+  if (!LocVT.isScalableVector()) {
+    // Mark all regs in the class as unavailable
+    for (auto Reg : RegList)
+      State.AllocateReg(Reg);
+  }
 
   const Align SlotAlign = Subtarget.isTargetDarwin() ? Align(1) : Align(8);
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4158,16 +4158,16 @@
     assert(!Res && "Call operand has unhandled type");
     (void)Res;
   }
-  assert(ArgLocs.size() == Ins.size());
   SmallVector<SDValue, 16> ArgValues;
-  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+  unsigned ExtraArgLocs = 0;
+  for (unsigned i = 0, j = 0, e = ArgLocs.size(); i != e; ++i, ++j) {
     CCValAssign &VA = ArgLocs[i];
 
-    if (Ins[i].Flags.isByVal()) {
+    if (Ins[j].Flags.isByVal()) {
       // Byval is used for HFAs in the PCS, but the system should work in a
       // non-compliant manner for larger structs.
       EVT PtrVT = getPointerTy(DAG.getDataLayout());
-      int Size = Ins[i].Flags.getByValSize();
+      int Size = Ins[j].Flags.getByValSize();
       unsigned NumRegs = (Size + 7) / 8;
 
      // FIXME: This works on big-endian for composite byvals, which are the common
@@ -4244,7 +4244,7 @@
 
       uint32_t BEAlign = 0;
       if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
-          !Ins[i].Flags.isInConsecutiveRegs())
+          !Ins[j].Flags.isInConsecutiveRegs())
         BEAlign = 8 - ArgSize;
 
       int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
@@ -4289,16 +4289,53 @@
     if (VA.getLocInfo() == CCValAssign::Indirect) {
       assert(VA.getValVT().isScalableVector() &&
             "Only scalable vectors can be passed indirectly");
-      // If value is passed via pointer - do a load.
-      ArgValue =
-          DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue, MachinePointerInfo());
-    }
-
-    if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
-      ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
-                             ArgValue, DAG.getValueType(MVT::i32));
-    InVals.push_back(ArgValue);
+
+      uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinSize();
+      unsigned NumParts = 1;
+      if (Ins[j].Flags.isInConsecutiveRegs()) {
+        unsigned k = j;
+        assert(!Ins[k].Flags.isInConsecutiveRegsLast());
+        while (!Ins[k].Flags.isInConsecutiveRegsLast()) {
+          k++;
+          NumParts++;
+        }
+      }
+
+      MVT PartLoad = VA.getValVT();
+      SDValue Ptr = ArgValue;
+
+      ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
+      InVals.push_back(ArgValue);
+
+      if (NumParts > 1) {
+        SDValue BytesIncrement = DAG.getVScale(
+            DL, Ptr.getValueType(),
+            APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize));
+
+        NumParts--;
+        while (NumParts) {
+          SDNodeFlags Flags;
+          Flags.setNoUnsignedWrap(true);
+          Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+                            BytesIncrement, Flags);
+
+          ArgValue =
+              DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
+
+          InVals.push_back(ArgValue);
+          NumParts--;
+          ExtraArgLocs++;
+          j++;
+        }
+      }
+    } else {
+      if (Subtarget->isTargetILP32() && Ins[j].Flags.isPointer())
+        ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
+                               ArgValue, DAG.getValueType(MVT::i32));
+      InVals.push_back(ArgValue);
+    }
   }
+  assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
 
   // varargs
   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
@@ -4892,10 +4929,10 @@
   }
 
   // Walk the register/memloc assignments, inserting copies/loads.
-  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+  for (unsigned i = 0, j = 0, e = ArgLocs.size(); i != e; ++i, ++j) {
     CCValAssign &VA = ArgLocs[i];
-    SDValue Arg = OutVals[i];
-    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+    SDValue Arg = OutVals[j];
+    ISD::ArgFlagsTy Flags = Outs[j].Flags;
 
     // Promote the value if needed.
     switch (VA.getLocInfo()) {
@@ -4935,18 +4972,53 @@
     case CCValAssign::Indirect:
       assert(VA.getValVT().isScalableVector() &&
             "Only scalable vectors can be passed indirectly");
+
+      uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinSize();
+      uint64_t PartSize = StoreSize;
+      unsigned NumParts = 1;
+      if (Outs[j].Flags.isInConsecutiveRegs()) {
+        unsigned k = j;
+        assert(!Outs[k].Flags.isInConsecutiveRegsLast());
+        while (!Outs[k].Flags.isInConsecutiveRegsLast()) {
+          k++;
+          NumParts++;
+        }
+        StoreSize *= NumParts;
+      }
+
       MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
       Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
       Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
-      int FI = MFI.CreateStackObject(
-          VA.getValVT().getStoreSize().getKnownMinSize(), Alignment, false);
+      int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
       MFI.setStackID(FI, TargetStackID::SVEVector);
 
-      SDValue SpillSlot = DAG.getFrameIndex(
+      MachinePointerInfo MPI =
+          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
+      SDValue Ptr = DAG.getFrameIndex(
           FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
-      Chain = DAG.getStore(
-          Chain, DL, Arg, SpillSlot,
-          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+      SDValue SpillSlot = Ptr;
+      Chain = DAG.getStore(Chain, DL, OutVals[j], Ptr, MPI);
+
+      if (NumParts > 1) {
+        SDValue BytesIncrement = DAG.getVScale(
+            DL, Ptr.getValueType(),
+            APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize));
+
+        MPI = MachinePointerInfo(MPI.getAddrSpace());
+        NumParts--;
+        while (NumParts) {
+          SDNodeFlags Flags;
+          Flags.setNoUnsignedWrap(true);
+
+          Arg = OutVals[++j];
+          Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+                            BytesIncrement, Flags);
+          Chain = DAG.getStore(Chain, DL, Arg, Ptr, MPI);
+
+          NumParts--;
+        }
+      }
+
       Arg = SpillSlot;
       break;
     }
@@ -4998,7 +5070,7 @@
       uint32_t BEAlign = 0;
       unsigned OpSize;
       if (VA.getLocInfo() == CCValAssign::Indirect)
-        OpSize = VA.getLocVT().getSizeInBits();
+        OpSize = VA.getLocVT().getFixedSizeInBits();
       else
         OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
                                  : VA.getValVT().getSizeInBits();
Index: llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
@@ -0,0 +1,207 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple aarch64-linux-gnu -mattr=+sve | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+; Make sure callers set up the arguments correctly - tests AArch64ISelLowering::LowerCALL
+
+define float @foo1(double* %x0, double* %x1, double* %x2) {
+; CHECK-LABEL: foo1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-4
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld4d { z1.d, z2.d, z3.d, z4.d }, p0/z, [x0]
+; CHECK-NEXT:    ld4d { z16.d, z17.d, z18.d, z19.d }, p0/z, [x1]
+; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x2]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    fmov s0, #1.00000000
+; CHECK-NEXT:    mov x0, sp
+; CHECK-NEXT:    st1d { z16.d }, p0, [sp]
+; CHECK-NEXT:    st1d { z17.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1d { z18.d }, p0, [x8, #2, mul vl]
+; CHECK-NEXT:    st1d { z19.d }, p0, [x8, #3, mul vl]
+; CHECK-NEXT:    bl callee1
+; CHECK-NEXT:    addvl sp, sp, #4
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %0 = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %1 = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %0)
+  %2 = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1> %1, double* %x0)
+  %3 = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1> %1, double* %x1)
+  %4 = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.nxv2f64(<vscale x 2 x i1> %1, double* %x2)
+  %call = call float @callee1(float 1.000000e+00, <vscale x 8 x double> %2, <vscale x 8 x double> %3, <vscale x 2 x double> %4)
+  ret float %call
+}
+
+define float @foo2(double* %x0, double* %x1) {
+; CHECK-LABEL: foo2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-4
+; CHECK-NEXT:    sub sp, sp, #16 // =16
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 32 * VG
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld4d { z1.d, z2.d, z3.d, z4.d }, p0/z, [x0]
+; CHECK-NEXT:    ld4d { z16.d, z17.d, z18.d, z19.d }, p0/z, [x1]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    add x8, sp, #16 // =16
+; CHECK-NEXT:    add x9, sp, #16 // =16
+; CHECK-NEXT:    fmov s0, #1.00000000
+; CHECK-NEXT:    mov w1, #1
+; CHECK-NEXT:    mov w2, #2
+; CHECK-NEXT:    mov w3, #3
+; CHECK-NEXT:    mov w4, #4
+; CHECK-NEXT:    mov w5, #5
+; CHECK-NEXT:    mov w6, #6
+; CHECK-NEXT:    mov w7, #7
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    st1d { z16.d }, p0, [x9]
+; CHECK-NEXT:    st1d { z17.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1d { z18.d }, p0, [x8, #2, mul vl]
+; CHECK-NEXT:    st1d { z19.d }, p0, [x8, #3, mul vl]
+; CHECK-NEXT:    str x8, [sp]
+; CHECK-NEXT:    bl callee2
+; CHECK-NEXT:    addvl sp, sp, #4
+; CHECK-NEXT:    add sp, sp, #16 // =16
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %0 = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %1 = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %0)
+  %2 = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1> %1, double* %x0)
+  %3 = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1> %1, double* %x1)
+  %call = call float @callee2(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, float 1.000000e+00, <vscale x 8 x double> %2, <vscale x 8 x double> %3)
+  ret float %call
+}
+
+define float @foo3(double* %x0, double* %x1, double* %x2) {
+; CHECK-LABEL: foo3:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-3
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld4d { z2.d, z3.d, z4.d, z5.d }, p0/z, [x0]
+; CHECK-NEXT:    ld3d { z16.d, z17.d, z18.d }, p0/z, [x1]
+; CHECK-NEXT:    ld1d { z6.d }, p0/z, [x2]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    fmov s0, #1.00000000
+; CHECK-NEXT:    fmov s1, #2.00000000
+; CHECK-NEXT:    mov x0, sp
+; CHECK-NEXT:    st1d { z16.d }, p0, [sp]
+; CHECK-NEXT:    st1d { z17.d }, p0, [x8, #1, mul vl]
+; CHECK-NEXT:    st1d { z18.d }, p0, [x8, #2, mul vl]
+; CHECK-NEXT:    bl callee3
+; CHECK-NEXT:    addvl sp, sp, #3
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  %0 = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %1 = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %0)
+  %2 = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1> %1, double* %x0)
+  %3 = call <vscale x 6 x double> @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1(<vscale x 2 x i1> %1, double* %x1)
+  %4 = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.nxv2f64(<vscale x 2 x i1> %1, double* %x2)
+  %call = call float @callee3(float 1.000000e+00, float 2.000000e+00, <vscale x 8 x double> %2, <vscale x 6 x double> %3, <vscale x 2 x double> %4)
+  ret float %call
+}
+
+; Make sure callees read the arguments correctly - tests AArch64ISelLowering::LowerFormalArguments
+
+define double @foo4(double %x0, <vscale x 8 x double> %x1, <vscale x 8 x double> %x2, <vscale x 2 x double> %x3) {
+; CHECK-LABEL: foo4:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    faddv d2, p0, z3.d
+; CHECK-NEXT:    fadd d0, d2, d0
+; CHECK-NEXT:    faddv d1, p0, z1.d
+; CHECK-NEXT:    fadd d0, d0, d1
+; CHECK-NEXT:    faddv d1, p0, z5.d
+; CHECK-NEXT:    fadd d0, d0, d1
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.tuple.get.nxv2f64.nxv8f64(<vscale x 8 x double> %x1, i32 2)
+  %1 = tail call <vscale x 2 x double> @llvm.aarch64.sve.tuple.get.nxv2f64.nxv8f64(<vscale x 8 x double> %x2, i32 1)
+  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %3 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %2)
+  %4 = tail call double @llvm.aarch64.sve.faddv.nxv2f64(<vscale x 2 x i1> %3, <vscale x 2 x double> %0)
+  %add = fadd double %4, %x0
+  %5 = tail call double @llvm.aarch64.sve.faddv.nxv2f64(<vscale x 2 x i1> %3, <vscale x 2 x double> %1)
+  %add1 = fadd double %add, %5
+  %6 = tail call double @llvm.aarch64.sve.faddv.nxv2f64(<vscale x 2 x i1> %3, <vscale x 2 x double> %x3)
+  %add2 = fadd double %add1, %6
+  ret double %add2
+}
+
+define double @foo5(i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, double %x0, <vscale x 8 x double> %x1, <vscale x 8 x double> %x2) {
+; CHECK-LABEL: foo5:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr x8, [sp]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x8, #1, mul vl]
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    faddv d2, p0, z3.d
+; CHECK-NEXT:    fadd d0, d2, d0
+; CHECK-NEXT:    faddv d1, p0, z1.d
+; CHECK-NEXT:    fadd d0, d0, d1
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.tuple.get.nxv2f64.nxv8f64(<vscale x 8 x double> %x1, i32 2)
+  %1 = tail call <vscale x 2 x double> @llvm.aarch64.sve.tuple.get.nxv2f64.nxv8f64(<vscale x 8 x double> %x2, i32 1)
+  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %3 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %2)
+  %4 = tail call double @llvm.aarch64.sve.faddv.nxv2f64(<vscale x 2 x i1> %3, <vscale x 2 x double> %0)
+  %add = fadd double %4, %x0
+  %5 = tail call double @llvm.aarch64.sve.faddv.nxv2f64(<vscale x 2 x i1> %3, <vscale x 2 x double> %1)
+  %add1 = fadd double %add, %5
+  ret double %add1
+}
+
+define double @foo6(double %x0, double %x1, <vscale x 8 x double> %x2, <vscale x 6 x double> %x3) {
+; CHECK-LABEL: foo6:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    faddv d2, p0, z4.d
+; CHECK-NEXT:    fadd d0, d2, d0
+; CHECK-NEXT:    faddv d1, p0, z1.d
+; CHECK-NEXT:    fadd d0, d0, d1
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.tuple.get.nxv2f64.nxv8f64(<vscale x 8 x double> %x2, i32 2)
+  %1 = tail call <vscale x 2 x double> @llvm.aarch64.sve.tuple.get.nxv2f64.nxv6f64(<vscale x 6 x double> %x3, i32 1)
+  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
+  %3 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %2)
+  %4 = tail call double @llvm.aarch64.sve.faddv.nxv2f64(<vscale x 2 x i1> %3, <vscale x 2 x double> %0)
+  %add = fadd double %4, %x0
+  %5 = tail call double @llvm.aarch64.sve.faddv.nxv2f64(<vscale x 2 x i1> %3, <vscale x 2 x double> %1)
+  %add1 = fadd double %add, %5
+  ret double %add1
+}
+
+declare float @callee1(float, <vscale x 8 x double>, <vscale x 8 x double>, <vscale x 2 x double>)
+declare float @callee2(i32, i32, i32, i32, i32, i32, i32, i32, float, <vscale x 8 x double>, <vscale x 8 x double>)
+declare float @callee3(float, float, <vscale x 8 x double>, <vscale x 6 x double>, <vscale x 2 x double>)
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 immarg)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1>)
+declare <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1>, double*)
+declare <vscale x 6 x double> @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1(<vscale x 2 x i1>, double*)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.nxv2f64(<vscale x 2 x i1>, double*)
+declare double @llvm.aarch64.sve.faddv.nxv2f64(<vscale x 2 x i1>, <vscale x 2 x double>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.tuple.get.nxv2f64.nxv8f64(<vscale x 8 x double>, i32 immarg)
+declare <vscale x 2 x double> @llvm.aarch64.sve.tuple.get.nxv2f64.nxv6f64(<vscale x 6 x double>, i32 immarg)
Index: llvm/test/CodeGen/AArch64/sve-calling-convention-tuples-broken.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-calling-convention-tuples-broken.ll
+++ /dev/null
@@ -1,23 +0,0 @@
-; RUN: not --crash llc < %s -mtriple aarch64-linux-gnu -mattr=+sve >/dev/null 2>%t
-; RUN: FileCheck %s < %t
-
-; CHECK: Passing consecutive scalable vector registers unsupported
-
-target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64-unknown-linux-gnu"
-
-define float @foo(double* %x0, double* %x1) {
-entry:
-  %0 = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
-  %1 = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %0)
-  %2 = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1> %1, double* %x0)
-  %3 = call <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1> %1, double* %x1)
-  %call = call float @callee(float 1.000000e+00, <vscale x 8 x double> %2, <vscale x 8 x double> %3)
-  ret float %call
-}
-
-declare float @callee(float, <vscale x 8 x double>, <vscale x 8 x double>)
-
-declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 immarg)
-declare <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1>)
-declare <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1(<vscale x 2 x i1>, double*)