diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.h b/llvm/lib/Target/AArch64/AArch64CallingConvention.h
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.h
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.h
@@ -19,6 +19,9 @@
 bool CC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT,
                       CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
                       CCState &State);
+bool CC_AArch64_Arm64EC_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT,
+                               CCValAssign::LocInfo LocInfo,
+                               ISD::ArgFlagsTy ArgFlags, CCState &State);
 bool CC_AArch64_DarwinPCS_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT,
                                  CCValAssign::LocInfo LocInfo,
                                  ISD::ArgFlagsTy ArgFlags, CCState &State);
diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
@@ -162,6 +162,46 @@
   CCDelegateTo<CC_AArch64_AAPCS>
 ]>;
 
+// Vararg functions on Arm64EC ABI use a different convention, using
+// a stack layout compatible with the x64 calling convention.
+let Entry = 1 in
+def CC_AArch64_Arm64EC_VarArg : CallingConv<[
+  // Convert small floating-point values to integer.
+  CCIfType<[f16, bf16], CCBitConvertToType<i16>>,
+  CCIfType<[f32], CCBitConvertToType<i32>>,
+  CCIfType<[f64, v1f64, v1i64, v2f32, v2i32, v4i16, v4f16, v4bf16, v8i8, iPTR],
+           CCBitConvertToType<i64>>,
+
+  // Larger floating-point/vector values are passed indirectly.
+  CCIfType<[f128, v2f64, v2i64, v4i32, v4f32, v8i16, v8f16, v8bf16, v16i8],
+           CCPassIndirect<i64>>,
+  CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16,
+            nxv2bf16, nxv4bf16, nxv8bf16, nxv2f32, nxv4f32, nxv2f64],
+           CCPassIndirect<i64>>,
+  CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1],
+           CCPassIndirect<i64>>,
+
+  // Handle SRet. See comment in CC_AArch64_AAPCS.
+  CCIfInReg<CCIfType<[i64],
+            CCIfSRet<CCIfType<[i64], CCAssignToReg<[X0, X1]>>>>>,
+  CCIfSRet<CCIfType<[i64], CCAssignToReg<[X8]>>>,
+
+  // Put ByVal arguments directly on the stack. Minimum size and alignment of a
+  // slot is 64-bit. (Shouldn't normally come up; the Microsoft ABI doesn't
+  // use byval.)
+  CCIfByVal<CCPassByVal<8, 8>>,
+
+  // Promote small integers to i32
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+  // Pass first four arguments in x0-x3.
+  CCIfType<[i32], CCAssignToReg<[W0, W1, W2, W3]>>,
+  CCIfType<[i64], CCAssignToReg<[X0, X1, X2, X3]>>,
+
+  // Put remaining arguments on stack.
+  CCIfType<[i32, i64], CCAssignToStack<8, 8>>,
+]>;
+
 // Windows Control Flow Guard checks take a single argument (the target function
 // address) and have no return value.
 let Entry = 1 in
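(Illustration, not part of the patch: a rough LLVM IR sketch of how the rules above are expected to assign a sample Arm64EC variadic call site. The function names are invented for this example, and the per-argument assignments are inferred from the CC entries rather than taken from verified llc output; the generated test at the end of the patch is the authoritative reference.)

; Sketch only: expected assignments under CC_AArch64_Arm64EC_VarArg.
declare void @sketch_vararg_callee(double, ...)

define void @sketch_caller(<2 x double> %v) nounwind {
  ; double 1.0      - bitcast to i64, assigned to x0
  ; i32 2           - assigned to w1
  ; <2 x double> %v - too wide for a GPR: spilled to a temporary slot and
  ;                   passed indirectly, pointer assigned to x2
  ; i64 4           - assigned to x3
  ; i64 5           - fifth argument, placed in an 8-byte stack slot
  ; In addition, x4/x5 describe the stack argument area (see the
  ; AArch64ISelLowering.cpp changes below).
  call void (double, ...) @sketch_vararg_callee(double 1.0, i32 2,
                                                <2 x double> %v, i64 4, i64 5)
  ret void
}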
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5515,8 +5515,11 @@
   case CallingConv::Swift:
   case CallingConv::SwiftTail:
   case CallingConv::Tail:
-    if (Subtarget->isTargetWindows() && IsVarArg)
+    if (Subtarget->isTargetWindows() && IsVarArg) {
+      if (Subtarget->isWindowsArm64EC())
+        return CC_AArch64_Arm64EC_VarArg;
       return CC_AArch64_Win64_VarArg;
+    }
     if (!Subtarget->isTargetDarwin())
       return CC_AArch64_AAPCS;
     if (!IsVarArg)
@@ -5524,7 +5527,12 @@
     return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
                                       : CC_AArch64_DarwinPCS_VarArg;
   case CallingConv::Win64:
-    return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS;
+    if (IsVarArg) {
+      if (Subtarget->isWindowsArm64EC())
+        return CC_AArch64_Arm64EC_VarArg;
+      return CC_AArch64_Win64_VarArg;
+    }
+    return CC_AArch64_AAPCS;
   case CallingConv::CFGuard_Check:
     return CC_AArch64_Win64_CFGuard_Check;
   case CallingConv::AArch64_VectorCall:
@@ -5650,8 +5658,9 @@
     case CCValAssign::Full:
       break;
     case CCValAssign::Indirect:
-      assert(VA.getValVT().isScalableVector() &&
-             "Only scalable vectors can be passed indirectly");
+      assert((VA.getValVT().isScalableVector() ||
+              Subtarget->isWindowsArm64EC()) &&
+             "Indirect arguments should be scalable on most subtargets");
       break;
     case CCValAssign::BCvt:
       ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
@@ -5678,10 +5687,24 @@
         !Ins[i].Flags.isInConsecutiveRegs())
       BEAlign = 8 - ArgSize;
 
-    int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
+    SDValue FIN;
+    MachinePointerInfo PtrInfo;
+    if (isVarArg && Subtarget->isWindowsArm64EC()) {
+      // In the ARM64EC varargs convention, fixed arguments on the stack are
+      // accessed relative to x4, not sp.
+      unsigned ObjOffset = ArgOffset + BEAlign;
+      Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
+      SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
+      FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
+                        DAG.getConstant(ObjOffset, DL, MVT::i64));
+      PtrInfo = MachinePointerInfo::getUnknownStack(MF);
+    } else {
+      int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
 
-    // Create load nodes to retrieve arguments from the stack.
-    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+      // Create load nodes to retrieve arguments from the stack.
+      FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+      PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
+    }
 
     // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
     ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
@@ -5695,8 +5718,9 @@
       MemVT = VA.getLocVT();
       break;
     case CCValAssign::Indirect:
-      assert(VA.getValVT().isScalableVector() &&
-             "Only scalable vectors can be passed indirectly");
+      assert((VA.getValVT().isScalableVector() ||
+              Subtarget->isWindowsArm64EC()) &&
+             "Indirect arguments should be scalable on most subtargets");
       MemVT = VA.getLocVT();
       break;
     case CCValAssign::SExt:
@@ -5710,14 +5734,14 @@
       break;
     }
 
-    ArgValue =
-        DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN,
-                       MachinePointerInfo::getFixedStack(MF, FI), MemVT);
+    ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo,
+                              MemVT);
   }
   if (VA.getLocInfo() == CCValAssign::Indirect) {
-    assert(VA.getValVT().isScalableVector() &&
-           "Only scalable vectors can be passed indirectly");
+    assert(
+        (VA.getValVT().isScalableVector() || Subtarget->isWindowsArm64EC()) &&
+        "Indirect arguments should be scalable on most subtargets");
 
     uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinSize();
     unsigned NumParts = 1;
@@ -5737,9 +5761,16 @@
       InVals.push_back(ArgValue);
       NumParts--;
       if (NumParts > 0) {
-        SDValue BytesIncrement = DAG.getVScale(
-            DL, Ptr.getValueType(),
-            APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize));
+        SDValue BytesIncrement;
+        if (PartLoad.isScalableVector()) {
+          BytesIncrement = DAG.getVScale(
+              DL, Ptr.getValueType(),
+              APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize));
+        } else {
+          BytesIncrement = DAG.getConstant(
+              APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize), DL,
+              Ptr.getValueType());
+        }
         SDNodeFlags Flags;
         Flags.setNoUnsignedWrap(true);
 
         Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
@@ -5786,6 +5817,7 @@
   unsigned StackOffset = CCInfo.getNextStackOffset();
   // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
   StackOffset = alignTo(StackOffset, Subtarget->isTargetILP32() ? 4 : 8);
+  FuncInfo->setVarArgsStackOffset(StackOffset);
   FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
 
   if (MFI.hasMustTailInVarArgFunc()) {
@@ -5867,7 +5899,12 @@
   static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
                                           AArch64::X3, AArch64::X4, AArch64::X5,
                                           AArch64::X6, AArch64::X7 };
-  static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
+  unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
+  if (Subtarget->isWindowsArm64EC()) {
+    // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
+    // functions.
+    NumGPRArgRegs = 4;
+  }
   unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
 
   unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
@@ -5881,7 +5918,19 @@
     } else
       GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
 
-    SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
+    SDValue FIN;
+    if (Subtarget->isWindowsArm64EC()) {
+      // With the Arm64EC ABI, we reserve the save area as usual, but we
+      // compute its address relative to x4. For a normal AArch64->AArch64
+      // call, x4 == sp on entry, but calls from an entry thunk can pass in a
+      // different address.
+      Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
+      SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
+      FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
+                        DAG.getConstant(GPRSaveSize, DL, MVT::i64));
+    } else {
+      FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
+    }
 
     for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
       Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
@@ -6189,9 +6238,10 @@
   // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
   // allocate space on the stack. That is why we determine this explicitly here
   // the call cannot be a tailcall.
-  if (llvm::any_of(ArgLocs, [](CCValAssign &A) {
+  if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
         assert((A.getLocInfo() != CCValAssign::Indirect ||
-                A.getValVT().isScalableVector()) &&
+                A.getValVT().isScalableVector() ||
+                Subtarget->isWindowsArm64EC()) &&
                "Expected value to be scalable");
         return A.getLocInfo() == CCValAssign::Indirect;
       }))
@@ -6456,8 +6506,9 @@
       Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
       break;
     case CCValAssign::Indirect:
-      assert(VA.getValVT().isScalableVector() &&
-             "Only scalable vectors can be passed indirectly");
+      bool isScalable = VA.getValVT().isScalableVector();
+      assert((isScalable || Subtarget->isWindowsArm64EC()) &&
+             "Indirect arguments should be scalable on most subtargets");
 
       uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinSize();
       uint64_t PartSize = StoreSize;
@@ -6473,7 +6524,8 @@
       Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
       Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
       int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
-      MFI.setStackID(FI, TargetStackID::ScalableVector);
+      if (isScalable)
+        MFI.setStackID(FI, TargetStackID::ScalableVector);
 
       MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
       SDValue Ptr = DAG.getFrameIndex(
@@ -6486,9 +6538,16 @@
       Chain = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
       NumParts--;
       if (NumParts > 0) {
-        SDValue BytesIncrement = DAG.getVScale(
-            DL, Ptr.getValueType(),
-            APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize));
+        SDValue BytesIncrement;
+        if (isScalable) {
+          BytesIncrement = DAG.getVScale(
+              DL, Ptr.getValueType(),
+              APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize));
+        } else {
+          BytesIncrement = DAG.getConstant(
+              APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize), DL,
+              Ptr.getValueType());
+        }
         SDNodeFlags Flags;
         Flags.setNoUnsignedWrap(true);
@@ -6607,6 +6666,16 @@
     }
   }
 
+  if (IsVarArg && Subtarget->isWindowsArm64EC()) {
+    // For vararg calls, the Arm64EC ABI requires values in x4 and x5
+    // describing the argument list. x4 contains the address of the
+    // first stack parameter. x5 contains the size in bytes of all parameters
+    // passed on the stack.
+    RegsToPass.emplace_back(AArch64::X4, StackPtr);
+    RegsToPass.emplace_back(AArch64::X5,
+                            DAG.getConstant(NumBytes, DL, MVT::i64));
+  }
+
   if (!MemOpChains.empty())
     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
 
@@ -8239,14 +8308,30 @@
 SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
                                                   SelectionDAG &DAG) const {
-  AArch64FunctionInfo *FuncInfo =
-      DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
+  MachineFunction &MF = DAG.getMachineFunction();
+  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
 
   SDLoc DL(Op);
-  SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
-                                     ? FuncInfo->getVarArgsGPRIndex()
-                                     : FuncInfo->getVarArgsStackIndex(),
-                                 getPointerTy(DAG.getDataLayout()));
+  SDValue FR;
+  if (Subtarget->isWindowsArm64EC()) {
+    // With the Arm64EC ABI, we compute the address of the varargs save area
+    // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
+    // but calls from an entry thunk can pass in a different address.
+    Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
+    SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
+    uint64_t StackOffset;
+    if (FuncInfo->getVarArgsGPRSize() > 0)
+      StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
+    else
+      StackOffset = FuncInfo->getVarArgsStackOffset();
+    FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
+                     DAG.getConstant(StackOffset, DL, MVT::i64));
+  } else {
+    FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
+                               ? FuncInfo->getVarArgsGPRIndex()
+                               : FuncInfo->getVarArgsStackIndex(),
+                           getPointerTy(DAG.getDataLayout()));
+  }
   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
   return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
                       MachinePointerInfo(SV));
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -85,6 +85,9 @@
   /// stack.
   int VarArgsStackIndex = 0;
 
+  /// Offset of start of varargs area for arguments passed on the stack.
+  unsigned VarArgsStackOffset = 0;
+
   /// FrameIndex for start of varargs area for arguments passed in
   /// general purpose registers.
   int VarArgsGPRIndex = 0;
@@ -315,6 +318,9 @@
   int getVarArgsStackIndex() const { return VarArgsStackIndex; }
   void setVarArgsStackIndex(int Index) { VarArgsStackIndex = Index; }
 
+  unsigned getVarArgsStackOffset() const { return VarArgsStackOffset; }
+  void setVarArgsStackOffset(unsigned Offset) { VarArgsStackOffset = Offset; }
+
   int getVarArgsGPRIndex() const { return VarArgsGPRIndex; }
   void setVarArgsGPRIndex(int Index) { VarArgsGPRIndex = Index; }
 
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
--- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
@@ -1076,6 +1076,11 @@
   MachineRegisterInfo &MRI = MF.getRegInfo();
   auto &DL = F.getParent()->getDataLayout();
   const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
+  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+
+  // Arm64EC has extra requirements for varargs calls; bail out for now.
+  if (Info.IsVarArg && Subtarget.isWindowsArm64EC())
+    return false;
 
   SmallVector<ArgInfo, 8> OutArgs;
   for (auto &OrigArg : Info.OrigArgs) {
@@ -1129,7 +1134,6 @@
 
   // Create a temporarily-floating call instruction so we can add the implicit
   // uses of arg registers.
-  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
   unsigned Opc = 0;
   // Calls with operand bundle "clang.arc.attachedcall" are special. They should
   // be expanded to the call, directly followed by a special marker sequence and
diff --git a/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll b/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll
@@ -0,0 +1,98 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-pc-windows-msvc_arm64ec < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-pc-windows-msvc_arm64ec < %s -global-isel=1 -global-isel-abort=0 | FileCheck %s
+
+define void @varargs_callee(double %x, ...) nounwind {
+; CHECK-LABEL: varargs_callee:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #48
+; CHECK-NEXT:    stp x1, x2, [x4, #-24]!
+; CHECK-NEXT:    str x3, [x4, #16]
+; CHECK-NEXT:    str x4, [sp, #8]
+; CHECK-NEXT:    add sp, sp, #48
+; CHECK-NEXT:    ret
+  %list = alloca i8*, align 8
+  %listx = bitcast i8** %list to i8*
+  call void @llvm.va_start(i8* nonnull %listx)
+  ret void
+}
+
+define void @varargs_callee_manyargs(i64, i64, i64, i64, i64, ...) nounwind {
+; CHECK-LABEL: varargs_callee_manyargs:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    add x8, x4, #8
+; CHECK-NEXT:    str x8, [sp, #8]
+; CHECK-NEXT:    add sp, sp, #16
+; CHECK-NEXT:    ret
+  %list = alloca i8*, align 8
+  %listx = bitcast i8** %list to i8*
+  call void @llvm.va_start(i8* nonnull %listx)
+  ret void
+}
+
+define void @varargs_caller() nounwind {
+; CHECK-LABEL: varargs_caller:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #48
+; CHECK-NEXT:    mov x4, sp
+; CHECK-NEXT:    add x8, sp, #16
+; CHECK-NEXT:    mov x9, #4617315517961601024
+; CHECK-NEXT:    mov x0, #4607182418800017408
+; CHECK-NEXT:    mov w1, #2
+; CHECK-NEXT:    mov x2, #4613937818241073152
+; CHECK-NEXT:    mov w3, #4
+; CHECK-NEXT:    mov w5, #16
+; CHECK-NEXT:    stp xzr, x30, [sp, #24] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x8, xzr, [sp, #8]
+; CHECK-NEXT:    str x9, [sp]
+; CHECK-NEXT:    bl varargs_callee
+; CHECK-NEXT:    ldr x30, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #48
+; CHECK-NEXT:    ret
+  call void (double, ...) @varargs_callee(double 1.0, i32 2, double 3.0, i32 4, double 5.0, <2 x double> <double 0.0, double 0.0>)
+  ret void
+}
+
+define <2 x double> @varargs_many_argscallee(double %a, double %b, double %c,
+; CHECK-LABEL: varargs_many_argscallee:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr x8, [x4]
+; CHECK-NEXT:    ldr q0, [x3]
+; CHECK-NEXT:    ldr q1, [x8]
+; CHECK-NEXT:    fadd v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    ret
+                                             <2 x double> %d, <2 x double> %e, ...) nounwind {
+  %rval = fadd <2 x double> %d, %e
+  ret <2 x double> %rval
+}
+
+define void @varargs_many_argscalleer() nounwind {
+; CHECK-LABEL: varargs_many_argscalleer:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #64
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    mov x4, sp
+; CHECK-NEXT:    mov x8, #4618441417868443648
+; CHECK-NEXT:    add x9, sp, #16
+; CHECK-NEXT:    add x3, sp, #32
+; CHECK-NEXT:    mov x0, #4607182418800017408
+; CHECK-NEXT:    mov x1, #4611686018427387904
+; CHECK-NEXT:    mov x2, #4613937818241073152
+; CHECK-NEXT:    mov w5, #16
+; CHECK-NEXT:    str x30, [sp, #48] // 8-byte Folded Spill
+; CHECK-NEXT:    stp q0, q0, [sp, #16]
+; CHECK-NEXT:    stp x9, x8, [sp]
+; CHECK-NEXT:    bl varargs_many_argscallee
+; CHECK-NEXT:    ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #64
+; CHECK-NEXT:    ret
+  call <2 x double> (double, double, double, <2 x double>, <2 x double>, ...)
+      @varargs_many_argscallee(double 1., double 2., double 3.,
+                               <2 x double> zeroinitializer,
+                               <2 x double> zeroinitializer, double 6.)
+  ret void
+}
+
+
+declare void @llvm.va_start(i8*)
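(Illustration, not part of the patch: a minimal IR sketch of the callee-side protocol that the lowering above implements. Under the x4/x5 convention described in the comments, x4 points at the first stack-passed argument on entry to an Arm64EC variadic function and x5 holds the size in bytes of the stack-passed arguments, so va_start captures an x4-relative address rather than an sp-relative frame index. The function name below is invented for this example.)

declare void @llvm.va_start(i8*)
declare void @llvm.va_end(i8*)

define i64 @sketch_va_first(i64 %count, ...) nounwind {
  %ap = alloca i8*, align 8
  %ap.cast = bitcast i8** %ap to i8*
  ; On Arm64EC this stores a pointer derived from x4 (offset to the start of
  ; the spilled register save area or the first anonymous stack argument),
  ; rather than an sp-relative frame address.
  call void @llvm.va_start(i8* %ap.cast)
  ; Read the first variadic argument through that pointer.
  %first = va_arg i8** %ap, i64
  call void @llvm.va_end(i8* %ap.cast)
  ret i64 %first
}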