diff --git a/llvm/lib/Target/VE/VECallingConv.td b/llvm/lib/Target/VE/VECallingConv.td
--- a/llvm/lib/Target/VE/VECallingConv.td
+++ b/llvm/lib/Target/VE/VECallingConv.td
@@ -21,7 +21,11 @@
   CCAssignToStack<0, 8>
 ]>;
 
-def CC_VE : CallingConv<[
+///// C Calling Convention (VE ABI v2.1) /////
+//
+// Reference: https://www.nec.com/en/global/prod/hpc/aurora/document/VE-ABI_v2.1.pdf
+//
+def CC_VE_C : CallingConv<[
   // All arguments get passed in generic registers if there is space.
 
   // Promote i1/i8/i16/i32 arguments to i64.
@@ -51,6 +55,7 @@
   CCDelegateTo<CC_VE_C_Stack>
 ]>;
 
+///// Standard vararg C Calling Convention (VE ABI v2.1) /////
 // All arguments get passed in stack for varargs function or non-prototyped
 // function.
 def CC_VE2 : CallingConv<[
@@ -70,7 +75,7 @@
   CCAssignToStack<0, 8>
 ]>;
 
-def RetCC_VE : CallingConv<[
+def RetCC_VE_C : CallingConv<[
   // Promote i1/i8/i16/i32 return values to i64.
   CCIfType<[i1, i8, i16, i32], CCPromoteToType<i64>>,
 
@@ -92,9 +97,71 @@
                                    [SX0, SX1, SX3, SX5]>>,
 ]>;
 
+///// Custom fastcc /////
+//
+// This passes vector arguments and return values in vector registers. Scalar
+// values are handled according to the standard C calling convention.
+def CC_VE_Fast : CallingConv<[
+  // vector --> generic vector registers
+  CCIfType<[v2i32, v2i64, v2f32, v2f64,
+            v4i32, v4i64, v4f32, v4f64,
+            v8i32, v8i64, v8f32, v8f64,
+            v16i32, v16i64, v16f32, v16f64,
+            v32i32, v32i64, v32f32, v32f64,
+            v64i32, v64i64, v64f32, v64f64,
+            v128i32, v128i64, v128f32, v128f64,
+            v256i32, v256f32, v256i64, v256f64],
+           CCAssignToReg<[V0, V1, V2, V3, V4, V5, V6, V7]>>,
+  // TODO: make this conditional on packed mode
+  CCIfType<[v512i32, v512f32],
+           CCAssignToReg<[V0, V1, V2, V3, V4, V5, V6, V7]>>,
+
+  // vector mask --> generic vector mask registers
+  CCIfType<[v256i1],
+           CCAssignToReg<[VM1, VM2, VM3, VM4, VM5, VM6, VM7]>>,
+
+  // pair of vector masks --> generic vector mask registers
+  CCIfType<[v512i1],
+           CCAssignToRegWithShadow<[VMP1, VMP2, VMP3],
+                                   [VM1, VM1, VM3]>>,
+
+  // Follow the standard C CC for scalars.
+  CCDelegateTo<CC_VE_C>
+]>;
+
+def RetCC_VE_Fast : CallingConv<[
+  // vector --> generic vector registers
+  CCIfType<[v2i32, v2i64, v2f32, v2f64,
+            v4i32, v4i64, v4f32, v4f64,
+            v8i32, v8i64, v8f32, v8f64,
+            v16i32, v16i64, v16f32, v16f64,
+            v32i32, v32i64, v32f32, v32f64,
+            v64i32, v64i64, v64f32, v64f64,
+            v128i32, v128i64, v128f32, v128f64,
+            v256i32, v256f32, v256i64, v256f64],
+           CCAssignToReg<[V0, V1, V2, V3, V4, V5, V6, V7]>>,
+  // TODO: make this conditional on packed mode
+  CCIfType<[v512i32, v512f32],
+           CCAssignToReg<[V0, V1, V2, V3, V4, V5, V6, V7]>>,
+
+  // vector mask --> generic vector mask registers
+  CCIfType<[v256i1],
+           CCAssignToReg<[VM1, VM2, VM3, VM4, VM5, VM6, VM7]>>,
+
+  // pair of vector masks --> generic vector mask registers
+  CCIfType<[v512i1],
+           CCAssignToRegWithShadow<[VMP1, VMP2, VMP3],
+                                   [VM1, VM1, VM3]>>,
+
+  // Follow the standard C CC for scalars.
+  CCDelegateTo<RetCC_VE_C>
+]>;
+
 // Callee-saved registers
 def CSR : CalleeSavedRegs<(add (sequence "SX%u", 18, 33))>;
 def CSR_NoRegs : CalleeSavedRegs<(add)>;
 
 // PreserveAll (clobbers s62,s63) - used for ve_grow_stack
-def CSR_preserve_all : CalleeSavedRegs<(add (sequence "SX%u", 0, 61))>;
+def CSR_preserve_all : CalleeSavedRegs<(add (sequence "SX%u", 0, 61),
+                                            (sequence "V%u", 0, 63),
+                                            (sequence "VM%u", 1, 15))>;
diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp
--- a/llvm/lib/Target/VE/VEISelLowering.cpp
+++ b/llvm/lib/Target/VE/VEISelLowering.cpp
@@ -40,10 +40,30 @@
 #include "VEGenCallingConv.inc"
 
+CCAssignFn *getReturnCC(CallingConv::ID CallConv) {
+  switch (CallConv) {
+  default:
+    return RetCC_VE_C;
+  case CallingConv::Fast:
+    return RetCC_VE_Fast;
+  }
+}
+
+CCAssignFn *getParamCC(CallingConv::ID CallConv, bool IsVarArg) {
+  if (IsVarArg)
+    return CC_VE2;
+  switch (CallConv) {
+  default:
+    return CC_VE_C;
+  case CallingConv::Fast:
+    return CC_VE_Fast;
+  }
+}
+
 bool VETargetLowering::CanLowerReturn(
     CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
-  CCAssignFn *RetCC = RetCC_VE;
+  CCAssignFn *RetCC = getReturnCC(CallConv);
   SmallVector<CCValAssign, 16> RVLocs;
   CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
   return CCInfo.CheckReturn(Outs, RetCC);
 }
@@ -276,7 +296,7 @@
                  *DAG.getContext());
 
   // Analyze return values.
-  CCInfo.AnalyzeReturn(Outs, RetCC_VE);
+  CCInfo.AnalyzeReturn(Outs, getReturnCC(CallConv));
 
   SDValue Flag;
   SmallVector<SDValue, 4> RetOps(1, Chain);
@@ -357,7 +377,7 @@
   CCInfo.AllocateStack(ArgsPreserved, Align(8));
   // We already allocated the preserved area, so the stack offset computed
   // by CC_VE would be correct now.
-  CCInfo.AnalyzeFormalArguments(Ins, CC_VE);
+  CCInfo.AnalyzeFormalArguments(Ins, getParamCC(CallConv, false));
 
   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
     CCValAssign &VA = ArgLocs[i];
@@ -505,7 +525,7 @@
   CCInfo.AllocateStack(ArgsPreserved, Align(8));
   // We already allocated the preserved area, so the stack offset computed
   // by CC_VE would be correct now.
-  CCInfo.AnalyzeCallOperands(CLI.Outs, CC_VE);
+  CCInfo.AnalyzeCallOperands(CLI.Outs, getParamCC(CLI.CallConv, false));
 
   // VE requires to use both register and stack for varargs or no-prototyped
   // functions.
@@ -516,7 +536,8 @@
   CCState CCInfo2(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(),
                   ArgLocs2, *DAG.getContext());
   if (UseBoth)
-    CCInfo2.AnalyzeCallOperands(CLI.Outs, CC_VE2);
+    CCInfo2.AnalyzeCallOperands(CLI.Outs,
+                                getParamCC(CLI.CallConv, CLI.IsVarArg));
 
   // Get the size of the outgoing arguments stack space requirement.
   unsigned ArgsSize = CCInfo.getNextStackOffset();
@@ -701,7 +722,7 @@
   if (CLI.Ins.size() == 1 && CLI.Ins[0].VT == MVT::f32 && !CLI.CB)
     CLI.Ins[0].Flags.setInReg();
 
-  RVInfo.AnalyzeCallResult(CLI.Ins, RetCC_VE);
+  RVInfo.AnalyzeCallResult(CLI.Ins, getReturnCC(CLI.CallConv));
 
   // Copy all of the result registers out of their specified physreg.
   for (unsigned i = 0; i != RVLocs.size(); ++i) {
diff --git a/llvm/lib/Target/VE/VERegisterInfo.cpp b/llvm/lib/Target/VE/VERegisterInfo.cpp
--- a/llvm/lib/Target/VE/VERegisterInfo.cpp
+++ b/llvm/lib/Target/VE/VERegisterInfo.cpp
@@ -35,6 +35,8 @@
 const MCPhysReg *
 VERegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
   switch (MF->getFunction().getCallingConv()) {
+  case CallingConv::Fast:
+    // Being explicit (same as the standard CC).
   default:
     return CSR_SaveList;
   case CallingConv::PreserveAll:
@@ -45,6 +47,8 @@
 const uint32_t *VERegisterInfo::getCallPreservedMask(const MachineFunction &MF,
                                                      CallingConv::ID CC) const {
   switch (CC) {
+  case CallingConv::Fast:
+    // Being explicit (same as the standard CC).
   default:
     return CSR_RegMask;
   case CallingConv::PreserveAll:
diff --git a/llvm/test/CodeGen/VE/Vector/fastcc.ll b/llvm/test/CodeGen/VE/Vector/fastcc.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/fastcc.ll
@@ -0,0 +1,87 @@
+; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s
+
+; Scalar argument passing must not change (same tests as in VE/Scalar/callee.ll, this time with +vpu).
+
+define fastcc i32 @stack_stack_arg_i32_r9(i1 %0, i8 %1, i16 %2, i32 %3, i64 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9) {
+; CHECK-LABEL: stack_stack_arg_i32_r9:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    ldl.sx %s0, 424(, %s11)
+; CHECK-NEXT:    or %s11, 0, %s9
+  ret i32 %9
+}
+
+define fastcc i64 @stack_stack_arg_i64_r9(i1 %0, i8 %1, i16 %2, i32 %3, i64 %4, i64 %5, i64 %6, i64 %7, i64 %8, i64 %9) {
+; CHECK-LABEL: stack_stack_arg_i64_r9:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    ld %s0, 424(, %s11)
+; CHECK-NEXT:    or %s11, 0, %s9
+  ret i64 %9
+}
+
+define fastcc float @stack_stack_arg_f32_r9(float %p0, float %p1, float %p2, float %p3, float %p4, float %p5, float %p6, float %p7, float %s0, float %s1) {
+; CHECK-LABEL: stack_stack_arg_f32_r9:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    ldu %s0, 428(, %s11)
+; CHECK-NEXT:    or %s11, 0, %s9
+  ret float %s1
+}
+
+define fastcc i32 @stack_stack_arg_i32f32_r8(i32 %p0, float %p1, i32 %p2, float %p3, i32 %p4, float %p5, i32 %p6, float %p7, i32 %s0, float %s1) {
+; CHECK-LABEL: stack_stack_arg_i32f32_r8:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    ldl.sx %s0, 416(, %s11)
+; CHECK-NEXT:    or %s11, 0, %s9
+  ret i32 %s0
+}
+
+define fastcc float @stack_stack_arg_i32f32_r9(i32 %p0, float %p1, i32 %p2, float %p3, i32 %p4, float %p5, i32 %p6, float %p7, i32 %s0, float %s1) {
+; CHECK-LABEL: stack_stack_arg_i32f32_r9:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    ldu %s0, 428(, %s11)
+; CHECK-NEXT:    or %s11, 0, %s9
+  ret float %s1
+}
+
+; Vector argument passing (fastcc feature)
+
+; v0-to-v0 passthrough case without vreg copy.
+define fastcc <256 x i32> @vreg_arg_v256i32_r0(<256 x i32> %p0) {
+; CHECK-LABEL: vreg_arg_v256i32_r0:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    or %s11, 0, %s9
+  ret <256 x i32> %p0
+}
+
+; TODO: Uncomment tests when vreg-to-vreg copy is upstream.
+; define fastcc <256 x i32> @vreg_arg_v256i32_r1(<256 x i32> %p0, <256 x i32> %p1) {
+;   ret <256 x i32> %p1
+; }
+;
+; define fastcc <256 x i32> @vreg_arg_v256i32_r2(<256 x i32> %p0, <256 x i32> %p1, <256 x i32> %p2) {
+;   ret <256 x i32> %p2
+; }
+;
+; define fastcc <256 x i32> @vreg_arg_v256i32_r3(<256 x i32> %p0, <256 x i32> %p1, <256 x i32> %p2, <256 x i32> %p3) {
+;   ret <256 x i32> %p3
+; }
+;
+; define fastcc <256 x i32> @vreg_arg_v256i32_r4(<256 x i32> %p0, <256 x i32> %p1, <256 x i32> %p2, <256 x i32> %p3, <256 x i32> %p4) {
+;   ret <256 x i32> %p4
+; }
+;
+; define fastcc <256 x i32> @vreg_arg_v256i32_r5(<256 x i32> %p0, <256 x i32> %p1, <256 x i32> %p2, <256 x i32> %p3, <256 x i32> %p4, <256 x i32> %p5) {
+;   ret <256 x i32> %p5
+; }
+;
+; define fastcc <256 x i32> @vreg_arg_v256i32_r6(<256 x i32> %p0, <256 x i32> %p1, <256 x i32> %p2, <256 x i32> %p3, <256 x i32> %p4, <256 x i32> %p5, <256 x i32> %p6) {
+;   ret <256 x i32> %p6
+; }
+;
+; define fastcc <256 x i32> @vreg_arg_v256i32_r7(<256 x i32> %p0, <256 x i32> %p1, <256 x i32> %p2, <256 x i32> %p3, <256 x i32> %p4, <256 x i32> %p5, <256 x i32> %p6, <256 x i32> %p7) {
+;   ret <256 x i32> %p7
+; }
+
+; TODO: Uncomment test when vector loads are upstream (vreg stack passing).
+; define fastcc <256 x i32> @vreg_arg_v256i32_r8(<256 x i32> %p0, <256 x i32> %p1, <256 x i32> %p2, <256 x i32> %p3, <256 x i32> %p4, <256 x i32> %p5, <256 x i32> %p6, <256 x i32> %p7, <256 x i32> %p8) {
+;   ret <256 x i32> %p8
+; }
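+
+; Caller-side sketch of the vector fastcc (kept commented out, like the TODO
+; tests above, until vector copies and calls are supported; no CHECK lines
+; since the exact vreg copies depend on register allocation). Per CC_VE_Fast
+; and RetCC_VE_Fast, the <256 x i32> argument and the returned vector are both
+; expected to travel in %v0. Function names here are illustrative only.
+; declare fastcc <256 x i32> @callee_v256i32(<256 x i32>)
+;
+; define <256 x i32> @caller_v256i32(<256 x i32> %p) {
+;   %r = call fastcc <256 x i32> @callee_v256i32(<256 x i32> %p)
+;   ret <256 x i32> %r
+; }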