Index: llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
@@ -59,18 +59,43 @@
 } // end anonymous namespace
 
 Function *AArch64Arm64ECCallLowering::buildExitThunk(CallBase *CB) {
-  Type *RetTy = CB->getFunctionType()->getReturnType();
+  FunctionType *FT = CB->getFunctionType();
+  Type *RetTy = FT->getReturnType();
+  bool IsVarArg = FT->isVarArg();
+  Type *PtrTy = PointerType::get(M->getContext(), 0);
+  Type *I64Ty = Type::getInt64Ty(M->getContext());
+
   SmallVector<Type *> DefArgTypes;
   // The first argument to a thunk is the called function, stored in x9.
   // (Normally, we won't explicitly refer to this in the assembly; it just
   // gets passed on by the call.)
-  DefArgTypes.push_back(Type::getInt8PtrTy(M->getContext()));
-  for (unsigned i = 0; i < CB->arg_size(); ++i) {
-    DefArgTypes.push_back(CB->getArgOperand(i)->getType());
+  DefArgTypes.push_back(PtrTy);
+
+  if (IsVarArg) {
+    // We treat the variadic function's exit thunk as a normal function
+    // with the type:
+    //   rettype exitthunk(
+    //       ptr x9, ptr x0, i64 x1, i64 x2, i64 x3, ptr x4, i64 x5)
+    // which can cover all types of variadic functions.
+    // x9, as in a normal exit thunk, holds the called function.
+    // x0-x3 are the arguments passed in registers.
+    // x4 is the address of the arguments on the stack.
+    // x5 is the size of the arguments on the stack.
+    DefArgTypes.push_back(PtrTy);
+    for (int i = 0; i < 3; i++)
+      DefArgTypes.push_back(I64Ty);
+
+    DefArgTypes.push_back(PtrTy);
+    DefArgTypes.push_back(I64Ty);
+  } else {
+    for (unsigned i = 0; i < CB->arg_size(); ++i) {
+      DefArgTypes.push_back(CB->getArgOperand(i)->getType());
+    }
   }
+
   FunctionType *Ty = FunctionType::get(RetTy, DefArgTypes, false);
   Function *F =
-      Function::Create(Ty, GlobalValue::InternalLinkage, 0, "thunk", M);
+      Function::Create(Ty, GlobalValue::InternalLinkage, 0, "exit_thunk", M);
   F->setCallingConv(CallingConv::ARM64EC_Thunk_Native);
   // Copy MSVC, and always set up a frame pointer. (Maybe this isn't necessary.)
   F->addFnAttr("frame-pointer", "all");
@@ -138,12 +163,13 @@
     } else {
       Args.push_back(&Arg);
     }
-    ArgTypes.push_back(Args.back()->getType());
+    if (!IsVarArg)
+      ArgTypes.push_back(Args.back()->getType());
   }
   // FIXME: Transfer necessary attributes? sret? anything else?
 
   // FIXME: Try to share thunks. This probably involves simplifying the
   // argument types (translating all integers/pointers to i64, etc.)
-  auto *CallTy = FunctionType::get(X64RetType, ArgTypes, false);
+  auto *CallTy = FunctionType::get(X64RetType, ArgTypes, IsVarArg);
   Callee = IRB.CreateBitCast(Callee, CallTy->getPointerTo(0));
   CallInst *Call = IRB.CreateCall(CallTy, Callee, Args);
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1452,6 +1452,13 @@
   PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
 
   IsStrictFPEnabled = true;
+
+  if (Subtarget->isWindowsArm64EC()) {
+    // FIXME: are there other intrinsics we need to add here?
+    setLibcallName(RTLIB::MEMCPY, "#memcpy");
+    setLibcallName(RTLIB::MEMSET, "#memset");
+    setLibcallName(RTLIB::MEMMOVE, "#memmove");
+  }
 }
 
 void AArch64TargetLowering::addTypeForNEON(MVT VT) {
@@ -6372,7 +6379,8 @@
   // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
   // for the shadow store.
-  if (CalleeCC == CallingConv::ARM64EC_Thunk_X64)
+  // Variadic functions allocate the 32 extra bytes in the dynamic allocation.
+  if (CalleeCC == CallingConv::ARM64EC_Thunk_X64 && !IsVarArg)
     CCInfo.AllocateStack(32, Align(16));
 
   unsigned NumArgs = Outs.size();
@@ -6605,6 +6613,60 @@
   return ZExtBool;
 }
 
+// The exit thunk for a variadic function needs to make an allocation at
+// the bottom of the current stack for the callee's stack arguments, and
+// then copy the caller's stack arguments into that allocation.
+static SDValue varArgCopyForExitThunk(SelectionDAG &DAG, SDLoc &DL,
+                                      SDValue Chain,
+                                      SmallVector<SDValue, 32> &OutVals,
+                                      bool RetStack) {
+  // Memory address of the arguments on the stack.
+  SDValue X4Stack = OutVals[OutVals.size() - 2];
+  // Size of the arguments on the stack.
+  SDValue X5Length = OutVals[OutVals.size() - 1];
+
+  // 32 extra bytes for the shadow store,
+  // plus 8 extra bytes to store x3.
+  int64_t ExtraAlloc = 32 + (RetStack ? 8 : 0);
+  SDValue AlignC = DAG.getConstant(0, DL, MVT::i64);
+  SDValue AddC = DAG.getConstant(15 + ExtraAlloc, DL, MVT::i64);
+  SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, X5Length, AddC);
+
+  // The dynamic stack allocation will align the size to 16 bytes.
+  // It looks like Microsoft not only aligns the size to 16 bytes,
+  // but also aligns (-1,-15) to -16. We don't know why, so for
+  // now we don't add this part.
+  SDValue Ops[] = {Chain, Add, AlignC};
+  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other);
+  SDValue ArgsCopyOnStack = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, VTs, Ops);
+  Chain = ArgsCopyOnStack.getValue(1);
+  MachinePointerInfo PtrInfo = MachinePointerInfo();
+
+  // When a varargs function returns the value in a register on AArch64,
+  // but requires an "sret" return on x64, we need to shuffle around
+  // the argument registers and store x3 to the stack.
+  if (RetStack) {
+    SDValue ShadowC = DAG.getConstant(32, DL, MVT::i64);
+    SDValue StorePtr =
+        DAG.getNode(ISD::ADD, DL, MVT::i64, ArgsCopyOnStack, ShadowC);
+    Chain = DAG.getStore(Chain, DL, OutVals[5], StorePtr, PtrInfo);
+  }
+
+  SDValue ExtraAllocC = DAG.getConstant(ExtraAlloc, DL, MVT::i64);
+  SDValue Dst =
+      DAG.getNode(ISD::ADD, DL, MVT::i64, ArgsCopyOnStack, ExtraAllocC);
+  SDValue Cpy = DAG.getMemcpy(Chain, DL, Dst, X4Stack, X5Length, Align(8),
+                              /*isVol = */ false, /*AlwaysInline = */ false,
+                              /*isTailCall = */ false, PtrInfo, PtrInfo);
+  Chain = Cpy.getValue(1);
+
+  // FIXME: this is a hack to make sure HasVarSizedObjects is true;
+  // do we have a better way here?
+  MachineFunction &MF = DAG.getMachineFunction();
+  MF.getFrameInfo().CreateVariableSizedObject(Align(1), nullptr);
+  return Chain;
+}
+
 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
 /// and add input and output parameter nodes.
 SDValue
@@ -6620,6 +6682,7 @@
   bool &IsTailCall = CLI.IsTailCall;
   CallingConv::ID &CallConv = CLI.CallConv;
   bool IsVarArg = CLI.IsVarArg;
+  bool IsArm64EcThunk = CallConv == CallingConv::ARM64EC_Thunk_X64;
 
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFunction::CallSiteInfo CSInfo;
@@ -6679,6 +6742,11 @@
         report_fatal_error("Passing SVE types to variadic functions is "
                            "currently not supported");
     }
+
+    // The variadic exit thunk only needs the first 5 parameters to lower
+    // the call itself.
+    // The last 2 arguments are the stack address and size.
+    if (IsArm64EcThunk)
+      Outs.resize(5);
   }
 
   analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
@@ -6737,6 +6805,9 @@
   SmallVector<SDValue, 8> MemOpChains;
   auto PtrVT = getPointerTy(DAG.getDataLayout());
 
+  if (IsVarArg && IsArm64EcThunk)
+    Chain = varArgCopyForExitThunk(DAG, DL, Chain, OutVals, Outs[1].IsFixed);
+
   if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
     const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
     for (const auto &F : Forwards) {
@@ -6893,6 +6964,28 @@
         if (Options.EmitCallSiteInfo)
           CSInfo.emplace_back(VA.getLocReg(), i);
       }
+
+      if (IsVarArg && IsArm64EcThunk) {
+        // Float parameters are passed in both int and float registers.
+        Register ShadowReg;
+        switch (VA.getLocReg()) {
+        case AArch64::X0:
+          ShadowReg = AArch64::D0;
+          break;
+        case AArch64::X1:
+          ShadowReg = AArch64::D1;
+          break;
+        case AArch64::X2:
+          ShadowReg = AArch64::D2;
+          break;
+        case AArch64::X3:
+          ShadowReg = AArch64::D3;
+          break;
+        }
+        if (ShadowReg)
+          RegsToPass.push_back(std::make_pair(
+              ShadowReg, DAG.getRegister(VA.getLocReg(), MVT::i64)));
+      }
     } else {
       assert(VA.isMemLoc());
@@ -6962,7 +7055,7 @@
     }
   }
 
-  if (IsVarArg && Subtarget->isWindowsArm64EC()) {
+  if (IsVarArg && Subtarget->isWindowsArm64EC() && !IsArm64EcThunk) {
     // For vararg calls, the Arm64EC ABI requires values in x4 and x5
     // describing the argument list. x4 contains the address of the
     // first stack parameter. x5 contains the size in bytes of all parameters
Index: llvm/test/CodeGen/AArch64/arm64ec-cfg.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64ec-cfg.ll
+++ llvm/test/CodeGen/AArch64/arm64ec-cfg.ll
@@ -43,6 +43,13 @@
   ret [2 x i64] %r
 }
 
+%struct.s17 = type { [17 x i8] }
+define void @fvar4(ptr sret(%struct.s17) align 1 %agg.result, ptr %g) {
+entry:
+  call void (ptr, i32, ...)
%g(ptr sret(%struct.s17) align 1 %agg.result, i32 4, i32 5, i32 6, i32 8, i32 7, i32 9) + ret void +} + ; CHECK-LABEL: f: ; CHECK: .seh_proc f ; CHECK-NEXT: // %bb.0: // %entry @@ -50,8 +57,8 @@ ; CHECK-NEXT: .seh_save_reg_x x30, 16 ; CHECK-NEXT: .seh_endprologue ; CHECK-NEXT: adrp x8, __os_arm64x_check_icall -; CHECK-NEXT: adrp x10, thunk -; CHECK-NEXT: add x10, x10, :lo12:thunk +; CHECK-NEXT: adrp x10, exit_thunk +; CHECK-NEXT: add x10, x10, :lo12:exit_thunk ; CHECK-NEXT: mov x11, x0 ; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall] ; CHECK-NEXT: blr x8 @@ -71,8 +78,8 @@ ; CHECK-NEXT: .seh_save_reg_x x30, 16 ; CHECK-NEXT: .seh_endprologue ; CHECK-NEXT: adrp x8, __os_arm64x_check_icall -; CHECK-NEXT: adrp x10, thunk.1 -; CHECK-NEXT: add x10, x10, :lo12:thunk.1 +; CHECK-NEXT: adrp x10, exit_thunk.1 +; CHECK-NEXT: add x10, x10, :lo12:exit_thunk.1 ; CHECK-NEXT: mov x11, x0 ; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall] ; CHECK-NEXT: blr x8 @@ -97,8 +104,8 @@ ; CHECK-NEXT: .seh_save_reg_x x30, 16 ; CHECK-NEXT: .seh_endprologue ; CHECK-NEXT: adrp x8, __os_arm64x_check_icall -; CHECK-NEXT: adrp x10, thunk.2 -; CHECK-NEXT: add x10, x10, :lo12:thunk.2 +; CHECK-NEXT: adrp x10, exit_thunk.2 +; CHECK-NEXT: add x10, x10, :lo12:exit_thunk.2 ; CHECK-NEXT: mov x11, x0 ; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall] ; CHECK-NEXT: blr x8 @@ -122,8 +129,8 @@ ; CHECK-NEXT: .seh_save_reg_x x30, 16 ; CHECK-NEXT: .seh_endprologue ; CHECK-NEXT: adrp x8, __os_arm64x_check_icall -; CHECK-NEXT: adrp x10, thunk.3 -; CHECK-NEXT: add x10, x10, :lo12:thunk.3 +; CHECK-NEXT: adrp x10, exit_thunk.3 +; CHECK-NEXT: add x10, x10, :lo12:exit_thunk.3 ; CHECK-NEXT: mov x11, x0 ; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall] ; CHECK-NEXT: blr x8 @@ -147,8 +154,8 @@ ; CHECK-NEXT: .seh_save_reg_x x30, 16 ; CHECK-NEXT: .seh_endprologue ; CHECK-NEXT: adrp x8, __os_arm64x_check_icall -; CHECK-NEXT: adrp x10, thunk.4 -; CHECK-NEXT: add x10, x10, :lo12:thunk.4 +; CHECK-NEXT: adrp x10, exit_thunk.4 +; CHECK-NEXT: add x10, x10, :lo12:exit_thunk.4 ; CHECK-NEXT: mov x11, x0 ; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall] ; CHECK-NEXT: blr x8 @@ -176,8 +183,8 @@ ; CHECK-NEXT: .seh_save_reg x30, 16 ; CHECK-NEXT: .seh_endprologue ; CHECK-NEXT: adrp x8, __os_arm64x_check_icall -; CHECK-NEXT: adrp x10, thunk.5 -; CHECK-NEXT: add x10, x10, :lo12:thunk.5 +; CHECK-NEXT: adrp x10, exit_thunk.5 +; CHECK-NEXT: add x10, x10, :lo12:exit_thunk.5 ; CHECK-NEXT: mov x11, x0 ; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall] ; CHECK-NEXT: blr x8 @@ -211,8 +218,8 @@ ; CHECK-NEXT: .seh_save_reg x30, 16 ; CHECK-NEXT: .seh_endprologue ; CHECK-NEXT: adrp x8, __os_arm64x_check_icall -; CHECK-NEXT: adrp x10, thunk.6 -; CHECK-NEXT: add x10, x10, :lo12:thunk.6 +; CHECK-NEXT: adrp x10, exit_thunk.6 +; CHECK-NEXT: add x10, x10, :lo12:exit_thunk.6 ; CHECK-NEXT: mov x11, x0 ; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall] ; CHECK-NEXT: blr x8 @@ -237,8 +244,43 @@ ; CHECK-NEXT: .seh_endfunclet ; CHECK-NEXT: .seh_endproc ; -; CHECK-LABEL: thunk: -; CHECK: .seh_proc thunk +; CHECK-LABEL: fvar4: +; CHECK: .seh_proc fvar4 +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: .seh_stackalloc 32 +; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: .seh_save_reg x30, 16 +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: adrp x9, __os_arm64x_check_icall +; CHECK-NEXT: adrp x10, exit_thunk.7 +; CHECK-NEXT: add x10, x10, :lo12:exit_thunk.7 +; CHECK-NEXT: mov x11, x0 +; 
CHECK-NEXT: ldr x9, [x9, :lo12:__os_arm64x_check_icall] +; CHECK-NEXT: blr x9 +; CHECK-NEXT: mov x4, sp +; CHECK-NEXT: mov w9, #9 +; CHECK-NEXT: mov w10, #7 +; CHECK-NEXT: mov w0, #4 +; CHECK-NEXT: mov w1, #5 +; CHECK-NEXT: mov w2, #6 +; CHECK-NEXT: mov w3, #8 +; CHECK-NEXT: mov w5, #16 +; CHECK-NEXT: str w9, [sp, #8] +; CHECK-NEXT: str w10, [sp] +; CHECK-NEXT: blr x11 +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg x30, 16 +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: .seh_stackalloc 32 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: ret +; CHECK-NEXT: .seh_endfunclet +; CHECK-NEXT: .seh_endproc +; +; CHECK-LABEL: exit_thunk: +; CHECK: .seh_proc exit_thunk ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: sub sp, sp, #48 ; CHECK-NEXT: .seh_stackalloc 48 @@ -260,8 +302,8 @@ ; CHECK-NEXT: .seh_endfunclet ; CHECK-NEXT: .seh_endproc ; -; CHECK-LABEL: thunk.1: -; CHECK: .seh_proc thunk.1 +; CHECK-LABEL: exit_thunk.1: +; CHECK: .seh_proc exit_thunk.1 ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: sub sp, sp, #64 ; CHECK-NEXT: .seh_stackalloc 64 @@ -284,8 +326,8 @@ ; CHECK-NEXT: .seh_endfunclet ; CHECK-NEXT: .seh_endproc ; -; CHECK-LABEL: thunk.2: -; CHECK: .seh_proc thunk.2 +; CHECK-LABEL: exit_thunk.2: +; CHECK: .seh_proc exit_thunk.2 ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: sub sp, sp, #64 ; CHECK-NEXT: .seh_stackalloc 64 @@ -311,8 +353,8 @@ ; CHECK-NEXT: .seh_endfunclet ; CHECK-NEXT: .seh_endproc ; -; CHECK-LABEL: thunk.3: -; CHECK: .seh_proc thunk.3 +; CHECK-LABEL: exit_thunk.3: +; CHECK: .seh_proc exit_thunk.3 ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: sub sp, sp, #80 ; CHECK-NEXT: .seh_stackalloc 80 @@ -340,82 +382,229 @@ ; CHECK-NEXT: .seh_endfunclet ; CHECK-NEXT: .seh_endproc ; -; CHECK-LABEL: thunk.4: -; CHECK: .seh_proc thunk.4 +; CHECK-LABEL: exit_thunk.4: +; CHECK: .seh_proc exit_thunk.4 ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: sub sp, sp, #48 -; CHECK-NEXT: .seh_stackalloc 48 -; CHECK-NEXT: stp x29, x30, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: .seh_save_fplr 32 -; CHECK-NEXT: add x29, sp, #32 -; CHECK-NEXT: .seh_add_fp 32 +; CHECK-NEXT: stp x19, x20, [sp, #-64]! 
// 16-byte Folded Spill +; CHECK-NEXT: .seh_save_regp_x x19, 64 +; CHECK-NEXT: stp x21, x22, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: .seh_save_regp x21, 16 +; CHECK-NEXT: stp x25, x26, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: .seh_save_regp x25, 32 +; CHECK-NEXT: stp x29, x30, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: .seh_save_fplr 48 +; CHECK-NEXT: add x29, sp, #48 +; CHECK-NEXT: .seh_add_fp 48 ; CHECK-NEXT: .seh_endprologue ; CHECK-NEXT: adrp x8, __os_arm64x_dispatch_call_no_redirect -; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_dispatch_call_no_redirect] -; CHECK-NEXT: blr x8 +; CHECK-NEXT: mov x19, x3 +; CHECK-NEXT: mov x20, x2 +; CHECK-NEXT: mov x21, x1 +; CHECK-NEXT: mov x22, x0 +; CHECK-NEXT: mov x25, x9 +; CHECK-NEXT: ldr x26, [x8, :lo12:__os_arm64x_dispatch_call_no_redirect] +; CHECK-NEXT: add x8, x5, #47 +; CHECK-NEXT: lsr x15, x8, #4 +; CHECK-NEXT: bl __chkstk_arm64ec +; CHECK-NEXT: sub x8, sp, x15, lsl #4 +; CHECK-NEXT: add x0, x8, #32 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: mov x1, x4 +; CHECK-NEXT: mov x2, x5 +; CHECK-NEXT: bl "#memcpy" +; CHECK-NEXT: mov x0, x22 +; CHECK-NEXT: mov x1, x21 +; CHECK-NEXT: mov x2, x20 +; CHECK-NEXT: mov x3, x19 +; CHECK-NEXT: mov x9, x25 +; CHECK-NEXT: fmov d0, x0 +; CHECK-NEXT: fmov d1, x1 +; CHECK-NEXT: fmov d2, x2 +; CHECK-NEXT: fmov d3, x3 +; CHECK-NEXT: blr x26 ; CHECK-NEXT: .seh_startepilogue -; CHECK-NEXT: ldp x29, x30, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: .seh_save_fplr 32 -; CHECK-NEXT: add sp, sp, #48 -; CHECK-NEXT: .seh_stackalloc 48 +; CHECK-NEXT: sub sp, x29, #48 +; CHECK-NEXT: .seh_add_fp 48 +; CHECK-NEXT: ldp x29, x30, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_fplr 48 +; CHECK-NEXT: ldp x25, x26, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_regp x25, 32 +; CHECK-NEXT: ldp x21, x22, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_regp x21, 16 +; CHECK-NEXT: ldp x19, x20, [sp], #64 // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_regp_x x19, 64 ; CHECK-NEXT: .seh_endepilogue ; CHECK-NEXT: ret ; CHECK-NEXT: .seh_endfunclet ; CHECK-NEXT: .seh_endproc ; -; CHECK-LABEL: thunk.5: -; CHECK: .seh_proc thunk.5 +; CHECK-LABEL: exit_thunk.5: +; CHECK: .seh_proc exit_thunk.5 ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: sub sp, sp, #64 -; CHECK-NEXT: .seh_stackalloc 64 +; CHECK-NEXT: stp x19, x20, [sp, #-64]! 
// 16-byte Folded Spill +; CHECK-NEXT: .seh_save_regp_x x19, 64 +; CHECK-NEXT: stp x21, x22, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: .seh_save_regp x21, 16 +; CHECK-NEXT: stp x25, x26, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: .seh_save_regp x25, 32 ; CHECK-NEXT: stp x29, x30, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: .seh_save_fplr 48 ; CHECK-NEXT: add x29, sp, #48 ; CHECK-NEXT: .seh_add_fp 48 ; CHECK-NEXT: .seh_endprologue ; CHECK-NEXT: adrp x8, __os_arm64x_dispatch_call_no_redirect -; CHECK-NEXT: str w5, [sp, #40] -; CHECK-NEXT: str w4, [sp, #32] -; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_dispatch_call_no_redirect] -; CHECK-NEXT: blr x8 +; CHECK-NEXT: mov x19, x3 +; CHECK-NEXT: mov x20, x2 +; CHECK-NEXT: mov x21, x1 +; CHECK-NEXT: mov x22, x0 +; CHECK-NEXT: mov x25, x9 +; CHECK-NEXT: ldr x26, [x8, :lo12:__os_arm64x_dispatch_call_no_redirect] +; CHECK-NEXT: add x8, x5, #47 +; CHECK-NEXT: lsr x15, x8, #4 +; CHECK-NEXT: bl __chkstk_arm64ec +; CHECK-NEXT: sub x8, sp, x15, lsl #4 +; CHECK-NEXT: add x0, x8, #32 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: mov x1, x4 +; CHECK-NEXT: mov x2, x5 +; CHECK-NEXT: bl "#memcpy" +; CHECK-NEXT: mov x0, x22 +; CHECK-NEXT: mov x1, x21 +; CHECK-NEXT: mov x2, x20 +; CHECK-NEXT: mov x3, x19 +; CHECK-NEXT: mov x9, x25 +; CHECK-NEXT: fmov d0, x0 +; CHECK-NEXT: fmov d1, x1 +; CHECK-NEXT: fmov d2, x2 +; CHECK-NEXT: fmov d3, x3 +; CHECK-NEXT: blr x26 ; CHECK-NEXT: mov w0, w8 ; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: sub sp, x29, #48 +; CHECK-NEXT: .seh_add_fp 48 ; CHECK-NEXT: ldp x29, x30, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: .seh_save_fplr 48 -; CHECK-NEXT: add sp, sp, #64 -; CHECK-NEXT: .seh_stackalloc 64 +; CHECK-NEXT: ldp x25, x26, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_regp x25, 32 +; CHECK-NEXT: ldp x21, x22, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_regp x21, 16 +; CHECK-NEXT: ldp x19, x20, [sp], #64 // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_regp_x x19, 64 ; CHECK-NEXT: .seh_endepilogue ; CHECK-NEXT: ret ; CHECK-NEXT: .seh_endfunclet ; CHECK-NEXT: .seh_endproc ; -; CHECK-LABEL: thunk.6: -; CHECK: .seh_proc thunk.6 +; CHECK-LABEL: exit_thunk.6: +; CHECK: .seh_proc exit_thunk.6 ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: sub sp, sp, #96 -; CHECK-NEXT: .seh_stackalloc 96 -; CHECK-NEXT: stp x29, x30, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: .seh_save_fplr 80 -; CHECK-NEXT: add x29, sp, #80 -; CHECK-NEXT: .seh_add_fp 80 +; CHECK-NEXT: stp x19, x20, [sp, #-64]! 
// 16-byte Folded Spill +; CHECK-NEXT: .seh_save_regp_x x19, 64 +; CHECK-NEXT: stp x21, x22, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: .seh_save_regp x21, 16 +; CHECK-NEXT: str x25, [sp, #32] // 8-byte Folded Spill +; CHECK-NEXT: .seh_save_reg x25, 32 +; CHECK-NEXT: stp x29, x30, [sp, #40] // 16-byte Folded Spill +; CHECK-NEXT: .seh_save_fplr 40 +; CHECK-NEXT: add x29, sp, #40 +; CHECK-NEXT: .seh_add_fp 40 +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .seh_stackalloc 16 ; CHECK-NEXT: .seh_endprologue -; CHECK-NEXT: adrp x10, __os_arm64x_dispatch_call_no_redirect -; CHECK-NEXT: mov w8, w3 -; CHECK-NEXT: mov w3, w2 -; CHECK-NEXT: mov w2, w1 -; CHECK-NEXT: mov w1, w0 -; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: ldr x10, [x10, :lo12:__os_arm64x_dispatch_call_no_redirect] -; CHECK-NEXT: str w5, [sp, #48] -; CHECK-NEXT: str w4, [sp, #40] -; CHECK-NEXT: str w8, [sp, #32] -; CHECK-NEXT: blr x10 -; CHECK-NEXT: ldp x0, x1, [x29, #-16] +; CHECK-NEXT: adrp x8, __os_arm64x_dispatch_call_no_redirect +; CHECK-NEXT: mov x19, x2 +; CHECK-NEXT: mov x20, x1 +; CHECK-NEXT: mov x21, x0 +; CHECK-NEXT: mov x22, x9 +; CHECK-NEXT: ldr x25, [x8, :lo12:__os_arm64x_dispatch_call_no_redirect] +; CHECK-NEXT: add x8, x5, #55 +; CHECK-NEXT: lsr x15, x8, #4 +; CHECK-NEXT: bl __chkstk_arm64ec +; CHECK-NEXT: sub x8, sp, x15, lsl #4 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: add x0, x8, #40 +; CHECK-NEXT: mov x1, x4 +; CHECK-NEXT: mov x2, x5 +; CHECK-NEXT: str x3, [x8, #32] +; CHECK-NEXT: bl "#memcpy" +; CHECK-NEXT: sub x0, x29, #56 +; CHECK-NEXT: mov x1, x21 +; CHECK-NEXT: mov x2, x20 +; CHECK-NEXT: mov x3, x19 +; CHECK-NEXT: mov x9, x22 +; CHECK-NEXT: fmov d0, x0 +; CHECK-NEXT: fmov d1, x1 +; CHECK-NEXT: fmov d2, x2 +; CHECK-NEXT: fmov d3, x3 +; CHECK-NEXT: blr x25 +; CHECK-NEXT: ldp x0, x1, [x29, #-56] +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: sub sp, x29, #40 +; CHECK-NEXT: .seh_add_fp 40 +; CHECK-NEXT: ldp x29, x30, [sp, #40] // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_fplr 40 +; CHECK-NEXT: ldr x25, [sp, #32] // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg x25, 32 +; CHECK-NEXT: ldp x21, x22, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_regp x21, 16 +; CHECK-NEXT: ldp x19, x20, [sp], #64 // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_regp_x x19, 64 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: ret +; CHECK-NEXT: .seh_endfunclet +; CHECK-NEXT: .seh_endproc +; +; CHECK-LABEL: exit_thunk.7: +; CHECK: .seh_proc exit_thunk.7 +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: stp x19, x20, [sp, #-64]! 
// 16-byte Folded Spill +; CHECK-NEXT: .seh_save_regp_x x19, 64 +; CHECK-NEXT: stp x21, x22, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: .seh_save_regp x21, 16 +; CHECK-NEXT: stp x25, x26, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: .seh_save_regp x25, 32 +; CHECK-NEXT: stp x29, x30, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: .seh_save_fplr 48 +; CHECK-NEXT: add x29, sp, #48 +; CHECK-NEXT: .seh_add_fp 48 +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: mov x22, x8 +; CHECK-NEXT: adrp x8, __os_arm64x_dispatch_call_no_redirect +; CHECK-NEXT: mov x19, x2 +; CHECK-NEXT: mov x20, x1 +; CHECK-NEXT: mov x21, x0 +; CHECK-NEXT: mov x25, x9 +; CHECK-NEXT: ldr x26, [x8, :lo12:__os_arm64x_dispatch_call_no_redirect] +; CHECK-NEXT: add x8, x4, #47 +; CHECK-NEXT: lsr x15, x8, #4 +; CHECK-NEXT: bl __chkstk_arm64ec +; CHECK-NEXT: sub x8, sp, x15, lsl #4 +; CHECK-NEXT: add x0, x8, #32 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: mov x1, x3 +; CHECK-NEXT: mov x2, x4 +; CHECK-NEXT: bl "#memcpy" +; CHECK-NEXT: mov x0, x22 +; CHECK-NEXT: mov x1, x21 +; CHECK-NEXT: mov x2, x20 +; CHECK-NEXT: mov x3, x19 +; CHECK-NEXT: mov x9, x25 +; CHECK-NEXT: fmov d0, x0 +; CHECK-NEXT: fmov d1, x1 +; CHECK-NEXT: fmov d2, x2 +; CHECK-NEXT: fmov d3, x3 +; CHECK-NEXT: blr x26 ; CHECK-NEXT: .seh_startepilogue -; CHECK-NEXT: ldp x29, x30, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: .seh_save_fplr 80 -; CHECK-NEXT: add sp, sp, #96 -; CHECK-NEXT: .seh_stackalloc 96 +; CHECK-NEXT: sub sp, x29, #48 +; CHECK-NEXT: .seh_add_fp 48 +; CHECK-NEXT: ldp x29, x30, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_fplr 48 +; CHECK-NEXT: ldp x25, x26, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_regp x25, 32 +; CHECK-NEXT: ldp x21, x22, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_regp x21, 16 +; CHECK-NEXT: ldp x19, x20, [sp], #64 // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_regp_x x19, 64 ; CHECK-NEXT: .seh_endepilogue ; CHECK-NEXT: ret ; CHECK-NEXT: .seh_endfunclet