diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -906,7 +906,7 @@ SDValue LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Ins, + const SmallVectorImpl &RVLocs, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl &InVals, bool isThisReturn, SDValue ThisVal) const; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -5953,7 +5953,7 @@ assert(!Res && "Call operand has unhandled type"); (void)Res; } - SmallVector ArgValues; + unsigned ExtraArgLocs = 0; for (unsigned i = 0, e = Ins.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i - ExtraArgLocs]; @@ -6346,17 +6346,10 @@ /// appropriate copies out of appropriate physical registers. SDValue AArch64TargetLowering::LowerCallResult( SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Ins, const SDLoc &DL, + const SmallVectorImpl &RVLocs, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl &InVals, bool isThisReturn, SDValue ThisVal) const { - CCAssignFn *RetCC = CCAssignFnForReturn(CallConv); - // Assign locations to each value returned by this call. - SmallVector RVLocs; DenseMap CopiedRegs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, - *DAG.getContext()); - CCInfo.AnalyzeCallResult(Ins, RetCC); - // Copy all of the result registers out of their specified physreg. for (unsigned i = 0; i != RVLocs.size(); ++i) { CCValAssign VA = RVLocs[i]; @@ -6726,17 +6719,39 @@ GuardWithBTI = FuncInfo->branchTargetEnforcement(); } + // Analyze operands of the call, assigning locations to each operand. + SmallVector ArgLocs; + CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); + + if (IsVarArg) { + unsigned NumArgs = Outs.size(); + + for (unsigned i = 0; i != NumArgs; ++i) { + if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector()) + report_fatal_error("Passing SVE types to variadic functions is " + "currently not supported"); + } + } + + analyzeCallOperands(*this, Subtarget, CLI, CCInfo); + + CCAssignFn *RetCC = CCAssignFnForReturn(CallConv); + // Assign locations to each value returned by this call. + SmallVector RVLocs; + CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs, + *DAG.getContext()); + RetCCInfo.AnalyzeCallResult(Ins, RetCC); + // Check callee args/returns for SVE registers and set calling convention // accordingly. if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) { - bool CalleeOutSVE = any_of(Outs, [](ISD::OutputArg &Out){ - return Out.VT.isScalableVector(); - }); - bool CalleeInSVE = any_of(Ins, [](ISD::InputArg &In){ - return In.VT.isScalableVector(); - }); - - if (CalleeInSVE || CalleeOutSVE) + auto HasSVERegLoc = [](CCValAssign &Loc) { + if (!Loc.isRegLoc()) + return false; + return AArch64::ZPRRegClass.contains(Loc.getLocReg()) || + AArch64::PPRRegClass.contains(Loc.getLocReg()); + }; + if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc)) CallConv = CallingConv::AArch64_SVE_VectorCall; } @@ -6758,22 +6773,6 @@ report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); - // Analyze operands of the call, assigning locations to each operand. - SmallVector ArgLocs; - CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); - - if (IsVarArg) { - unsigned NumArgs = Outs.size(); - - for (unsigned i = 0; i != NumArgs; ++i) { - if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector()) - report_fatal_error("Passing SVE types to variadic functions is " - "currently not supported"); - } - } - - analyzeCallOperands(*this, Subtarget, CLI, CCInfo); - // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getNextStackOffset(); @@ -7221,9 +7220,9 @@ // Handle result values, copying them out of physregs into vregs that we // return. - SDValue Result = - LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG, InVals, - IsThisReturn, IsThisReturn ? OutVals[0] : SDValue()); + SDValue Result = LowerCallResult(Chain, InFlag, CallConv, IsVarArg, RVLocs, + DL, DAG, InVals, IsThisReturn, + IsThisReturn ? OutVals[0] : SDValue()); if (!Ins.empty()) InFlag = Result.getValue(Result->getNumValues() - 1); diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll --- a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll +++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll @@ -334,7 +334,46 @@ ret void } -; Use AAPCS, no SVE register in z0-7 used (floats occupy z0-z7) +; Use AAVPCS, SVE register used in return + +define @aavpcs5(float %s0, float %s1, float %s2, float %s3, float %s4, float %s5, float %s6, float %s7, %s8, %s9, %s10, %s11, %s12, %s13, %s14, %s15, %s16, %s17, float * %ptr) nounwind { +; CHECK-LABEL: aavpcs5: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr x8, [sp] +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x8] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x7] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x6] +; CHECK-NEXT: ld1w { z4.s }, p0/z, [x5] +; CHECK-NEXT: ld1w { z5.s }, p0/z, [x4] +; CHECK-NEXT: ld1w { z6.s }, p0/z, [x3] +; CHECK-NEXT: ld1w { z7.s }, p0/z, [x2] +; CHECK-NEXT: ld1w { z24.s }, p0/z, [x1] +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: ldr x8, [sp, #16] +; CHECK-NEXT: st1w { z0.s }, p0, [x8] +; CHECK-NEXT: st1w { z24.s }, p0, [x8] +; CHECK-NEXT: st1w { z7.s }, p0, [x8] +; CHECK-NEXT: st1w { z6.s }, p0, [x8] +; CHECK-NEXT: st1w { z5.s }, p0, [x8] +; CHECK-NEXT: st1w { z4.s }, p0, [x8] +; CHECK-NEXT: st1w { z3.s }, p0, [x8] +; CHECK-NEXT: st1w { z2.s }, p0, [x8] +; CHECK-NEXT: st1w { z1.s }, p0, [x8] +; CHECK-NEXT: ret +entry: + %ptr1.bc = bitcast float * %ptr to * + store volatile %s8, * %ptr1.bc + store volatile %s9, * %ptr1.bc + store volatile %s10, * %ptr1.bc + store volatile %s11, * %ptr1.bc + store volatile %s12, * %ptr1.bc + store volatile %s13, * %ptr1.bc + store volatile %s14, * %ptr1.bc + store volatile %s15, * %ptr1.bc + store volatile %s16, * %ptr1.bc + ret %s8 +} define void @aapcs1(float %s0, float %s1, float %s2, float %s3, float %s4, float %s5, float %s6, float %s7, %s8, %s9, %s10, %s11, %s12, %s13, %s14, %s15, %s16, %s17, float * %ptr) nounwind { ; CHECK-LABEL: aapcs1: @@ -375,18 +414,64 @@ ret void } -define void @non_sve_callee_high_range(float %f0, float %f1, float %f2, float %f3, float %f4, float %f5, float %f6, float %f7, %v0, %v1) { -; CHECK-LABEL: non_sve_callee_high_range: +declare void @non_sve_callee_high_range(float %f0, float %f1, float %f2, float %f3, float %f4, float %f5, float %f6, float %f7, %v0, %v1) + +define void @non_sve_caller_non_sve_callee_high_range() { +; CHECK-LABEL: non_sve_caller_non_sve_callee_high_range: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: movi d0, #0000000000000000 +; CHECK-NEXT: fmov s1, #1.00000000 +; CHECK-NEXT: fmov s2, #2.00000000 +; CHECK-NEXT: fmov s3, #3.00000000 +; CHECK-NEXT: fmov s4, #4.00000000 +; CHECK-NEXT: fmov s5, #5.00000000 +; CHECK-NEXT: fmov s6, #6.00000000 +; CHECK-NEXT: fmov s7, #7.00000000 +; CHECK-NEXT: mov x1, sp +; CHECK-NEXT: addvl x0, sp, #1 +; CHECK-NEXT: bl non_sve_callee_high_range +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret + call void @non_sve_callee_high_range(float 0.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, undef, undef) + ret void +} + +define void @non_sve_caller_high_range_non_sve_callee_high_range(float %f0, float %f1, float %f2, float %f3, float %f4, float %f5, float %f6, float %f7, %v0, %v1) { +; CHECK-LABEL: non_sve_caller_high_range_non_sve_callee_high_range: ; CHECK: // %bb.0: -; CHECK-NEXT: str d8, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset b8, -16 -; CHECK-NEXT: //APP -; CHECK-NEXT: mov z8.d, #42 // =0x2a -; CHECK-NEXT: //NO_APP -; CHECK-NEXT: ldr d8, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movi d0, #0000000000000000 +; CHECK-NEXT: ld1w { z16.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z17.s }, p0/z, [x1] +; CHECK-NEXT: fmov s1, #1.00000000 +; CHECK-NEXT: fmov s2, #2.00000000 +; CHECK-NEXT: fmov s3, #3.00000000 +; CHECK-NEXT: fmov s4, #4.00000000 +; CHECK-NEXT: fmov s5, #5.00000000 +; CHECK-NEXT: fmov s6, #6.00000000 +; CHECK-NEXT: fmov s7, #7.00000000 +; CHECK-NEXT: mov x1, sp +; CHECK-NEXT: addvl x0, sp, #1 +; CHECK-NEXT: st1w { z17.s }, p0, [sp] +; CHECK-NEXT: st1w { z16.s }, p0, [sp, #1, mul vl] +; CHECK-NEXT: bl non_sve_callee_high_range +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret - tail call void asm sideeffect "mov z8.d, #42", "~{z8}"() + call void @non_sve_callee_high_range(float 0.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, %v0, %v1) ret void } @@ -397,13 +482,48 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w30, -8 ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG -; CHECK-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-18 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG +; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG -; CHECK-NEXT: addvl sp, sp, #-2 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG -; CHECK-NEXT: mov z8.d, z0.d +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa8, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 168 * VG +; CHECK-NEXT: mov z25.d, z0.d +; CHECK-NEXT: str z0, [sp] // 16-byte Folded Spill ; CHECK-NEXT: movi d0, #0000000000000000 ; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: fmov s1, #1.00000000 @@ -413,22 +533,143 @@ ; CHECK-NEXT: fmov s5, #5.00000000 ; CHECK-NEXT: fmov s6, #6.00000000 ; CHECK-NEXT: fmov s7, #7.00000000 -; CHECK-NEXT: mov x1, sp -; CHECK-NEXT: addvl x0, sp, #1 +; CHECK-NEXT: addvl x0, sp, #2 +; CHECK-NEXT: addvl x1, sp, #1 ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: st1w { z24.s }, p0, [sp] -; CHECK-NEXT: st1w { z8.s }, p0, [sp, #1, mul vl] +; CHECK-NEXT: st1w { z24.s }, p0, [sp, #1, mul vl] +; CHECK-NEXT: st1w { z25.s }, p0, [sp, #2, mul vl] ; CHECK-NEXT: bl non_sve_callee_high_range -; CHECK-NEXT: mov z0.d, z8.d -; CHECK-NEXT: addvl sp, sp, #2 -; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #18 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret call void @non_sve_callee_high_range(float 0.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, %v0, %v1) ret %v0 } +define @sve_ret_caller_non_sve_callee_high_range() { +; CHECK-LABEL: sve_ret_caller_non_sve_callee_high_range: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-18 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG +; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG +; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 160 * VG +; CHECK-NEXT: movi d0, #0000000000000000 +; CHECK-NEXT: fmov s1, #1.00000000 +; CHECK-NEXT: fmov s2, #2.00000000 +; CHECK-NEXT: fmov s3, #3.00000000 +; CHECK-NEXT: fmov s4, #4.00000000 +; CHECK-NEXT: fmov s5, #5.00000000 +; CHECK-NEXT: fmov s6, #6.00000000 +; CHECK-NEXT: fmov s7, #7.00000000 +; CHECK-NEXT: mov x1, sp +; CHECK-NEXT: addvl x0, sp, #1 +; CHECK-NEXT: bl non_sve_callee_high_range +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #18 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret + call void @non_sve_callee_high_range(float 0.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, undef, undef) + ret undef +} + declare float @callee1(float, , , ) declare float @callee2(i32, i32, i32, i32, i32, i32, i32, i32, float, , ) declare float @callee3(float, float, , , )