Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -14891,6 +14891,12 @@
       !LD->getValueType(0).isInteger())
     return false;
 
+  // The algorithm to split up a load of a scalable vector into individual
+  // elements currently requires knowing the length of the loaded type,
+  // so it will need adjusting to work on scalable vectors.
+  if (LD->getValueType(0).isScalableVector())
+    return false;
+
   // Keep track of already used bits to detect overlapping values.
   // In that case, we will just abort the transformation.
   APInt UsedBits(LD->getValueSizeInBits(0), 0);
@@ -16527,7 +16533,10 @@
       }
 
       if (OptLevel != CodeGenOpt::None && ST1->hasOneUse() &&
-          !ST1->getBasePtr().isUndef()) {
+          !ST1->getBasePtr().isUndef() &&
+          // BaseIndexOffset and the code below require knowing the size
+          // of a vector, so bail out if MemoryVT is scalable.
+          !ST1->getMemoryVT().isScalableVector()) {
         const BaseIndexOffset STBase = BaseIndexOffset::match(ST, DAG);
         const BaseIndexOffset ChainBase = BaseIndexOffset::match(ST1, DAG);
         unsigned STBitSize = ST->getMemoryVT().getSizeInBits();
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3330,9 +3330,6 @@
   switch (CC) {
   default:
     report_fatal_error("Unsupported calling convention.");
-  case CallingConv::AArch64_SVE_VectorCall:
-    // Calling SVE functions is currently not yet supported.
-    report_fatal_error("Unsupported calling convention.");
   case CallingConv::WebKit_JS:
     return CC_AArch64_WebKit_JS;
   case CallingConv::GHC:
@@ -3355,6 +3352,7 @@
   case CallingConv::CFGuard_Check:
     return CC_AArch64_Win64_CFGuard_Check;
   case CallingConv::AArch64_VectorCall:
+  case CallingConv::AArch64_SVE_VectorCall:
     return CC_AArch64_AAPCS;
   }
 }
@@ -3473,7 +3471,7 @@
       case CCValAssign::Indirect:
         assert(VA.getValVT().isScalableVector() &&
                "Only scalable vectors can be passed indirectly");
-        llvm_unreachable("Spilling of SVE vectors not yet implemented");
+        break;
       case CCValAssign::BCvt:
         ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
         break;
@@ -3490,7 +3488,9 @@
     } else { // VA.isMemLoc()
       assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
       unsigned ArgOffset = VA.getLocMemOffset();
-      unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;
+      unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
+                              ? VA.getLocVT().getSizeInBits()
+                              : VA.getValVT().getSizeInBits()) / 8;
 
       uint32_t BEAlign = 0;
       if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
@@ -3516,7 +3516,8 @@
       case CCValAssign::Indirect:
         assert(VA.getValVT().isScalableVector() &&
                "Only scalable vectors can be passed indirectly");
-        llvm_unreachable("Spilling of SVE vectors not yet implemented");
+        MemVT = VA.getLocVT();
+        break;
       case CCValAssign::SExt:
         ExtType = ISD::SEXTLOAD;
         break;
@@ -3534,6 +3535,15 @@
           MemVT);
     }
+
+    if (VA.getLocInfo() == CCValAssign::Indirect) {
+      assert(VA.getValVT().isScalableVector() &&
+             "Only scalable vectors can be passed indirectly");
+      // If value is passed via pointer - do a load.
+      ArgValue =
+          DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue, MachinePointerInfo());
+    }
+
     if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
       ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
                              ArgValue, DAG.getValueType(MVT::i32));
 
@@ -3894,6 +3904,18 @@
 
   const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
 
+  // If any of the arguments is passed indirectly, it must be SVE, so
+  // 'getBytesInStackArgArea' is not sufficient to determine whether we need
+  // to allocate space on the stack. That is why we check this explicitly here
+  // and simply decide that the call cannot be a tail call.
+  if (llvm::any_of(ArgLocs, [](CCValAssign &A) {
+        assert((A.getLocInfo() != CCValAssign::Indirect ||
+                A.getValVT().isScalableVector()) &&
+               "Expected value to be scalable");
+        return A.getLocInfo() == CCValAssign::Indirect;
+      }))
+    return false;
+
   // If the stack arguments for this call do not fit into our own save area then
   // the call cannot be made tail.
   if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
@@ -4134,7 +4156,20 @@
     case CCValAssign::Indirect:
       assert(VA.getValVT().isScalableVector() &&
             "Only scalable vectors can be passed indirectly");
-      llvm_unreachable("Spilling of SVE vectors not yet implemented");
+      MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+      Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
+      unsigned Align = DAG.getDataLayout().getPrefTypeAlignment(Ty);
+      int FI = MFI.CreateStackObject(
+          VA.getValVT().getStoreSize().getKnownMinSize(), Align, false);
+      MFI.setStackID(FI, TargetStackID::SVEVector);
+
+      SDValue SpillSlot = DAG.getFrameIndex(
+          FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
+      Chain = DAG.getStore(
+          Chain, DL, Arg, SpillSlot,
+          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+      Arg = SpillSlot;
+      break;
     }
 
     if (VA.isRegLoc()) {
@@ -4182,8 +4217,12 @@
       // FIXME: This works on big-endian for composite byvals, which are the
       // common case. It should also work for fundamental types too.
       uint32_t BEAlign = 0;
-      unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
-                                        : VA.getValVT().getSizeInBits();
+      unsigned OpSize;
+      if (VA.getLocInfo() == CCValAssign::Indirect)
+        OpSize = VA.getLocVT().getSizeInBits();
+      else
+        OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
+                                 : VA.getValVT().getSizeInBits();
       OpSize = (OpSize + 7) / 8;
       if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
           !Flags.isInConsecutiveRegs()) {
Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1285,6 +1285,9 @@
   multiclass unpred_store<ValueType Ty, Instruction RegImmInst, Instruction PTrue> {
     def _fi : Pat<(store (Ty ZPR:$val), (am_sve_fi GPR64sp:$base, simm4s1:$offset)),
                   (RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
+
+    def _default : Pat<(store (Ty ZPR:$val), GPR64:$base),
+                       (RegImmInst ZPR:$val, (PTrue 31), GPR64:$base, (i64 0))>;
   }
 
   defm Pat_ST1B : unpred_store<nxv16i8, ST1B_IMM, PTRUE_B>;
@@ -1298,6 +1301,9 @@
   multiclass unpred_load<ValueType Ty, Instruction RegImmInst, Instruction PTrue> {
     def _fi : Pat<(Ty (load (am_sve_fi GPR64sp:$base, simm4s1:$offset))),
                   (RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
+
+    def _default : Pat<(Ty (load GPR64:$base)),
+                       (RegImmInst (PTrue 31), GPR64:$base, (i64 0))>;
   }
 
   defm Pat_LD1B : unpred_load<nxv16i8, LD1B_IMM, PTRUE_B>;
@@ -1311,6 +1317,9 @@
   multiclass unpred_store_predicate<ValueType Ty, Instruction Store> {
     def _fi : Pat<(store (Ty PPR:$val), (am_sve_fi GPR64sp:$base, simm9:$offset)),
                   (Store PPR:$val, GPR64sp:$base, simm9:$offset)>;
+
+    def _default : Pat<(store (Ty PPR:$Val), GPR64:$base),
+                       (Store PPR:$Val, GPR64:$base, (i64 0))>;
   }
 
   defm Pat_Store_P16 : unpred_store_predicate<nxv16i1, STR_PXI>;
@@ -1321,6 +1330,9 @@
   multiclass unpred_load_predicate<ValueType Ty, Instruction Load> {
     def _fi : Pat<(Ty (load (am_sve_fi GPR64sp:$base, simm9:$offset))),
                   (Load GPR64sp:$base, simm9:$offset)>;
+
+    def _default : Pat<(Ty (load GPR64:$base)),
+                       (Load GPR64:$base, (i64 0))>;
   }
 
   defm Pat_Load_P16 : unpred_load_predicate<nxv16i1, LDR_PXI>;
Index: llvm/test/CodeGen/AArch64/sve-callbyref-notailcall.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-callbyref-notailcall.ll
@@ -0,0 +1,29 @@
+; Because some arguments are passed by reference (through the stack),
+; the compiler should not do tail-call optimization.
+; RUN: llc -mtriple=aarch64 -mattr=+sve < %s | FileCheck %s
+
+; CHECK-LABEL: caller:
+; CHECK:       addvl sp, sp, #-[[STACKSIZE:[0-9]+]]
+; CHECK-NOT:   addvl sp
+; CHECK:       bl callee
+; CHECK:       addvl sp, sp, #[[STACKSIZE]]
+; CHECK:       ret
+define <vscale x 16 x i8> @caller(<vscale x 16 x i8> %v) {
+  %1 = tail call <vscale x 16 x i8> @callee(<vscale x 16 x i8> %v, <vscale x 16 x i8> %v, <vscale x 16 x i8> %v, <vscale x 16 x i8> %v, <vscale x 16 x i8> %v, <vscale x 16 x i8> %v, <vscale x 16 x i8> %v, <vscale x 16 x i8> %v, <vscale x 16 x i8> %v)
+  ret <vscale x 16 x i8> %1
+}
+
+declare <vscale x 16 x i8> @callee(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+
+; CHECK-LABEL: caller_pred:
+; CHECK:       addvl sp, sp, #-[[STACKSIZE:[0-9]+]]
+; CHECK-NOT:   addvl sp
+; CHECK:       bl callee_pred
+; CHECK:       addvl sp, sp, #[[STACKSIZE]]
+; CHECK:       ret
+define <vscale x 16 x i1> @caller_pred(<vscale x 16 x i1> %v) {
+  %1 = tail call <vscale x 16 x i1> @callee_pred(<vscale x 16 x i1> %v, <vscale x 16 x i1> %v, <vscale x 16 x i1> %v, <vscale x 16 x i1> %v, <vscale x 16 x i1> %v)
+  ret <vscale x 16 x i1> %1
+}
+
+declare <vscale x 16 x i1> @callee_pred(<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>)
Index: llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
@@ -0,0 +1,118 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -stop-after=finalize-isel < %s | FileCheck %s
+
+; Test that z8 and z9, passed in by reference, are correctly loaded from x0 and x1.
+; i.e. z0 =  %z0
+;         :
+;      z7 =  %z7
+;      x0 = &%z8
+;      x1 = &%z9
+define aarch64_sve_vector_pcs <vscale x 4 x i32> @callee_with_many_sve_arg(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1, <vscale x 4 x i32> %z2, <vscale x 4 x i32> %z3, <vscale x 4 x i32> %z4, <vscale x 4 x i32> %z5, <vscale x 4 x i32> %z6, <vscale x 4 x i32> %z7, <vscale x 4 x i32> %z8, <vscale x 4 x i32> %z9) {
+; CHECK: name: callee_with_many_sve_arg
+; CHECK-DAG: [[BASE:%[0-9]+]]:gpr64common = COPY $x1
+; CHECK-DAG: [[PTRUE:%[0-9]+]]:ppr_3b = PTRUE_S 31
+; CHECK-DAG: [[RES:%[0-9]+]]:zpr = LD1W_IMM killed [[PTRUE]], [[BASE]]
+; CHECK-DAG: $z0 = COPY [[RES]]
+; CHECK: RET_ReallyLR implicit $z0
+  ret <vscale x 4 x i32> %z9
+}
+
+; Test that z8 and z9 are passed by reference.
+define aarch64_sve_vector_pcs <vscale x 4 x i32> @caller_with_many_sve_arg(<vscale x 4 x i32> %z) {
+; CHECK: name: caller_with_many_sve_arg
+; CHECK: stack:
+; CHECK: - { id: 0, name: '', type: default, offset: 0, size: 16, alignment: 16,
+; CHECK-NEXT: stack-id: sve-vec
+; CHECK: - { id: 1, name: '', type: default, offset: 0, size: 16, alignment: 16,
+; CHECK-NEXT: stack-id: sve-vec
+; CHECK-DAG: [[PTRUE:%[0-9]+]]:ppr_3b = PTRUE_S 31
+; CHECK-DAG: ST1W_IMM %{{[0-9]+}}, [[PTRUE]], %stack.1, 0
+; CHECK-DAG: ST1W_IMM %{{[0-9]+}}, [[PTRUE]], %stack.0, 0
+; CHECK-DAG: [[BASE2:%[0-9]+]]:gpr64sp = ADDXri %stack.1, 0
+; CHECK-DAG: [[BASE1:%[0-9]+]]:gpr64sp = ADDXri %stack.0, 0
+; CHECK-DAG: $x0 = COPY [[BASE1]]
+; CHECK-DAG: $x1 = COPY [[BASE2]]
+; CHECK-NEXT: BL @callee_with_many_sve_arg
+; CHECK: RET_ReallyLR implicit $z0
+  %ret = call aarch64_sve_vector_pcs <vscale x 4 x i32> @callee_with_many_sve_arg(<vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z)
+  ret <vscale x 4 x i32> %ret
+}
+
+; Test that p4 and p5, passed in by reference, are correctly loaded from registers x0 and x1.
+; i.e. p0 =  %p0
+;         :
+;      p3 =  %p3
+;      x0 = &%p4
+;      x1 = &%p5
+define aarch64_sve_vector_pcs <vscale x 16 x i1> @callee_with_many_svepred_arg(<vscale x 16 x i1> %p0, <vscale x 16 x i1> %p1, <vscale x 16 x i1> %p2, <vscale x 16 x i1> %p3, <vscale x 16 x i1> %p4, <vscale x 16 x i1> %p5) {
+; CHECK: name: callee_with_many_svepred_arg
+; CHECK-DAG: [[BASE:%[0-9]+]]:gpr64common = COPY $x1
+; CHECK-DAG: [[RES:%[0-9]+]]:ppr = LDR_PXI [[BASE]], 0
+; CHECK-DAG: $p0 = COPY [[RES]]
+; CHECK: RET_ReallyLR implicit $p0
+  ret <vscale x 16 x i1> %p5
+}
+
+; Test that p4 and p5 are passed by reference.
+define aarch64_sve_vector_pcs <vscale x 16 x i1> @caller_with_many_svepred_arg(<vscale x 16 x i1> %p) {
+; CHECK: name: caller_with_many_svepred_arg
+; CHECK: stack:
+; CHECK: - { id: 0, name: '', type: default, offset: 0, size: 1, alignment: 4,
+; CHECK-NEXT: stack-id: sve-vec
+; CHECK: - { id: 1, name: '', type: default, offset: 0, size: 1, alignment: 4,
+; CHECK-NEXT: stack-id: sve-vec
+; CHECK-DAG: STR_PXI %{{[0-9]+}}, %stack.0, 0
+; CHECK-DAG: STR_PXI %{{[0-9]+}}, %stack.1, 0
+; CHECK-DAG: [[BASE1:%[0-9]+]]:gpr64sp = ADDXri %stack.0, 0
+; CHECK-DAG: [[BASE2:%[0-9]+]]:gpr64sp = ADDXri %stack.1, 0
+; CHECK-DAG: $x0 = COPY [[BASE1]]
+; CHECK-DAG: $x1 = COPY [[BASE2]]
+; CHECK-NEXT: BL @callee_with_many_svepred_arg
+; CHECK: RET_ReallyLR implicit $p0
+  %ret = call aarch64_sve_vector_pcs <vscale x 16 x i1> @callee_with_many_svepred_arg(<vscale x 16 x i1> %p, <vscale x 16 x i1> %p, <vscale x 16 x i1> %p, <vscale x 16 x i1> %p, <vscale x 16 x i1> %p, <vscale x 16 x i1> %p)
+  ret <vscale x 16 x i1> %ret
+}
+
+; Test that z8 and z9, passed by reference, are loaded from a location whose address is passed on the stack.
+; i.e. x0 =  %x0
+;         :
+;      x7 =  %x7
+;      z0 =  %z0
+;         :
+;      z7 =  %z7
+;      [sp]   = &%z8
+;      [sp+8] = &%z9
+;
+define aarch64_sve_vector_pcs <vscale x 4 x i32> @callee_with_many_gpr_sve_arg(i64 %x0, i64 %x1, i64 %x2, i64 %x3, i64 %x4, i64 %x5, i64 %x6, i64 %x7, <vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1, <vscale x 4 x i32> %z2, <vscale x 4 x i32> %z3, <vscale x 4 x i32> %z4, <vscale x 4 x i32> %z5, <vscale x 4 x i32> %z6, <vscale x 4 x i32> %z7, <vscale x 2 x double> %z8, <vscale x 4 x i32> %z9) {
+; CHECK: name: callee_with_many_gpr_sve_arg
+; CHECK: fixedStack:
+; CHECK: - { id: 0, type: default, offset: 8, size: 8, alignment: 8, stack-id: default,
+; CHECK-DAG: [[BASE:%[0-9]+]]:gpr64common = LDRXui %fixed-stack.0, 0
+; CHECK-DAG: [[PTRUE:%[0-9]+]]:ppr_3b = PTRUE_S 31
+; CHECK-DAG: [[RES:%[0-9]+]]:zpr = LD1W_IMM killed [[PTRUE]], killed [[BASE]]
+; CHECK-DAG: $z0 = COPY [[RES]]
+; CHECK: RET_ReallyLR implicit $z0
+  ret <vscale x 4 x i32> %z9
+}
+
+; Test that z8 and z9 are passed by reference, where the reference is passed on the stack.
+define aarch64_sve_vector_pcs <vscale x 4 x i32> @caller_with_many_gpr_sve_arg(i64 %x, <vscale x 4 x i32> %z, <vscale x 2 x double> %z2) {
+; CHECK: name: caller_with_many_gpr_sve_arg
+; CHECK: stack:
+; CHECK: - { id: 0, name: '', type: default, offset: 0, size: 16, alignment: 16,
+; CHECK-NEXT: stack-id: sve-vec
+; CHECK: - { id: 1, name: '', type: default, offset: 0, size: 16, alignment: 16,
+; CHECK-NEXT: stack-id: sve-vec
+; CHECK-DAG: [[PTRUE_S:%[0-9]+]]:ppr_3b = PTRUE_S 31
+; CHECK-DAG: [[PTRUE_D:%[0-9]+]]:ppr_3b = PTRUE_D 31
+; CHECK-DAG: ST1D_IMM %{{[0-9]+}}, killed [[PTRUE_D]], %stack.0, 0
+; CHECK-DAG: ST1W_IMM %{{[0-9]+}}, killed [[PTRUE_S]], %stack.1, 0
+; CHECK-DAG: [[BASE1:%[0-9]+]]:gpr64common = ADDXri %stack.0, 0
+; CHECK-DAG: [[BASE2:%[0-9]+]]:gpr64common = ADDXri %stack.1, 0
+; CHECK-DAG: [[SP:%[0-9]+]]:gpr64sp = COPY $sp
+; CHECK-DAG: STRXui killed [[BASE1]], [[SP]], 0
+; CHECK-DAG: STRXui killed [[BASE2]], [[SP]], 1
+; CHECK: BL @callee_with_many_gpr_sve_arg
+; CHECK: RET_ReallyLR implicit $z0
+  %ret = call aarch64_sve_vector_pcs <vscale x 4 x i32> @callee_with_many_gpr_sve_arg(i64 %x, i64 %x, i64 %x, i64 %x, i64 %x, i64 %x, i64 %x, i64 %x, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 2 x double> %z2, <vscale x 4 x i32> %z)
  ret <vscale x 4 x i32> %ret
}
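
Illustration (not part of the diff; the names @callee9, @pass_extra, %v and %extra are made up for this sketch): under the AAPCS, the first eight scalable data vector arguments travel in z0-z7, so a ninth one no longer fits in a register. With this patch, LowerCall spills such an argument to a TargetStackID::SVEVector frame object and passes its address in a general-purpose register (x0 below, since no earlier argument consumes an integer register), LowerFormalArguments loads the value back through that pointer, and the new bail-out in the tail-call eligibility check keeps such calls from being tail-called. Roughly, IR of this shape now takes that path:

; Illustrative IR only -- a sketch, not taken from the tests above.
declare <vscale x 4 x i32> @callee9(<vscale x 4 x i32>, <vscale x 4 x i32>,
                                    <vscale x 4 x i32>, <vscale x 4 x i32>,
                                    <vscale x 4 x i32>, <vscale x 4 x i32>,
                                    <vscale x 4 x i32>, <vscale x 4 x i32>,
                                    <vscale x 4 x i32>)

define <vscale x 4 x i32> @pass_extra(<vscale x 4 x i32> %v, <vscale x 4 x i32> %extra) {
  ; %extra is the ninth scalable vector argument: the caller stores it into an
  ; SVE stack slot and passes the slot's address instead of the value itself.
  %r = call <vscale x 4 x i32> @callee9(
      <vscale x 4 x i32> %v, <vscale x 4 x i32> %v, <vscale x 4 x i32> %v,
      <vscale x 4 x i32> %v, <vscale x 4 x i32> %v, <vscale x 4 x i32> %v,
      <vscale x 4 x i32> %v, <vscale x 4 x i32> %v, <vscale x 4 x i32> %extra)
  ret <vscale x 4 x i32> %r
}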