Index: include/llvm/IR/CallingConv.h
===================================================================
--- include/llvm/IR/CallingConv.h
+++ include/llvm/IR/CallingConv.h
@@ -193,6 +193,9 @@
     /// Calling convention for AMDGPU code object kernels.
     AMDGPU_KERNEL = 91,
 
+    /// Register calling convention used for parameter transfer optimization
+    X86_RegCall = 92,
+
     /// The highest possible calling convention ID. Must be some 2^k - 1.
     MaxID = 1023
   };
Index: lib/AsmParser/LLLexer.cpp
===================================================================
--- lib/AsmParser/LLLexer.cpp
+++ lib/AsmParser/LLLexer.cpp
@@ -591,6 +591,7 @@
   KEYWORD(intel_ocl_bicc);
   KEYWORD(x86_64_sysvcc);
   KEYWORD(x86_64_win64cc);
+  KEYWORD(x86_regcallcc);
   KEYWORD(webkit_jscc);
   KEYWORD(swiftcc);
   KEYWORD(anyregcc);
Index: lib/AsmParser/LLParser.cpp
===================================================================
--- lib/AsmParser/LLParser.cpp
+++ lib/AsmParser/LLParser.cpp
@@ -1676,6 +1676,7 @@
   case lltok::kw_coldcc:         CC = CallingConv::Cold; break;
   case lltok::kw_x86_stdcallcc:  CC = CallingConv::X86_StdCall; break;
   case lltok::kw_x86_fastcallcc: CC = CallingConv::X86_FastCall; break;
+  case lltok::kw_x86_regcallcc:  CC = CallingConv::X86_RegCall; break;
   case lltok::kw_x86_thiscallcc: CC = CallingConv::X86_ThisCall; break;
   case lltok::kw_x86_vectorcallcc:CC = CallingConv::X86_VectorCall; break;
   case lltok::kw_arm_apcscc:     CC = CallingConv::ARM_APCS; break;
Index: lib/AsmParser/LLToken.h
===================================================================
--- lib/AsmParser/LLToken.h
+++ lib/AsmParser/LLToken.h
@@ -127,6 +127,7 @@
   kw_x86_fastcallcc,
   kw_x86_thiscallcc,
   kw_x86_vectorcallcc,
+  kw_x86_regcallcc,
   kw_arm_apcscc,
   kw_arm_aapcscc,
   kw_arm_aapcs_vfpcc,
Index: lib/IR/AsmWriter.cpp
===================================================================
--- lib/IR/AsmWriter.cpp
+++ lib/IR/AsmWriter.cpp
@@ -310,6 +310,7 @@
   case CallingConv::X86_StdCall:   Out << "x86_stdcallcc"; break;
   case CallingConv::X86_FastCall:  Out << "x86_fastcallcc"; break;
   case CallingConv::X86_ThisCall:  Out << "x86_thiscallcc"; break;
+  case CallingConv::X86_RegCall:   Out << "x86_regcallcc"; break;
   case CallingConv::X86_VectorCall:Out << "x86_vectorcallcc"; break;
   case CallingConv::Intel_OCL_BI:  Out << "intel_ocl_bicc"; break;
   case CallingConv::ARM_APCS:      Out << "arm_apcscc"; break;
Index: lib/Target/X86/X86CallingConv.td
===================================================================
--- lib/Target/X86/X86CallingConv.td
+++ lib/Target/X86/X86CallingConv.td
@@ -18,6 +18,168 @@
                        "(State.getMachineFunction().getSubtarget()).",
                       F),
            A>;
+// Register classes for RegCall
+class RC_X86_RegCall {
+  list<Register> GPR_8 = [];
+  list<Register> GPR_16 = [];
+  list<Register> GPR_32 = [];
+  list<Register> GPR_64 = [];
+  list<Register> XMM = [];
+  list<Register> YMM = [];
+  list<Register> ZMM = [];
+}
+
+// RegCall register classes for 32 bits
+def RC_X86_32_RegCall : RC_X86_RegCall {
+  let GPR_8 = [AL, CL, DL, DIL, SIL];
+  let GPR_16 = [AX, CX, DX, DI, SI];
+  let GPR_32 = [EAX, ECX, EDX, EDI, ESI];
+  let GPR_64 = [RAX]; /* not actually used, but AssignToReg can't handle [] */
+  let XMM = [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7];
+  let YMM = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7];
+  let ZMM = [ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5, ZMM6, ZMM7];
+}
+
+class RC_X86_64_RegCall : RC_X86_RegCall {
+  let XMM = [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+             XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15];
+  let YMM = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
+             YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15];
+  let ZMM = [ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5, ZMM6, ZMM7,
+             ZMM8, ZMM9, ZMM10, ZMM11, ZMM12, ZMM13, ZMM14, ZMM15];
+}
+
+def RC_X86_64_RegCall_Win : RC_X86_64_RegCall {
+  let GPR_8 = [AL, CL, DL, DIL, SIL, R8B, R9B, R10B, R11B, R12B, R14B, R15B];
+  let GPR_16 = [AX, CX, DX, DI, SI, R8W, R9W, R10W, R11W, R12W, R14W, R15W];
+  let GPR_32 = [EAX, ECX, EDX, EDI, ESI, R8D, R9D, R10D, R11D, R12D, R14D, R15D];
+  let GPR_64 = [RAX, RCX, RDX, RDI, RSI, R8, R9, R10, R11, R12, R14, R15];
+}
+
+def RC_X86_64_RegCall_Lin : RC_X86_64_RegCall {
+  let GPR_8 = [AL, CL, DL, DIL, SIL, R8B, R9B, R12B, R13B, R14B, R15B];
+  let GPR_16 = [AX, CX, DX, DI, SI, R8W, R9W, R12W, R13W, R14W, R15W];
+  let GPR_32 = [EAX, ECX, EDX, EDI, ESI, R8D, R9D, R12D, R13D, R14D, R15D];
+  let GPR_64 = [RAX, RCX, RDX, RDI, RSI, R8, R9, R12, R13, R14, R15];
+}
+
+// Match if the current subtarget is either Linux or OSX
+class CCIfSubtargetLinuxOSX<CCAction A> : CCIf<
+  "static_cast<const X86Subtarget&>(State.getMachineFunction().getSubtarget())."
+  "isTargetLinux() ||"
+  "static_cast<const X86Subtarget&>(State.getMachineFunction().getSubtarget())."
+  "getTargetTriple().isMacOSX()",
+  A
+>;
+
+// X86-64 Intel regcall calling convention.
+multiclass X86_RegCall_base<RC_X86_RegCall RC> {
+def CC_#NAME : CallingConv<[
+  // Handles byval parameters.
+  CCIfSubtarget<"is64Bit()", CCIfByVal<CCPassByVal<8, 8>>>,
+  CCIfByVal<CCPassByVal<4, 4>>,
+
+  // Promote i1/i8/i16 arguments to i32.
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+  // bool, char, int, enum, long, pointer --> GPR
+  CCIfType<[i32], CCAssignToReg<RC.GPR_32>>,
+
+  // TODO: Handle the case of mask types (v*i1)
+  // TODO: Handle the case of 32 bit machine with v64i1 argument
+  //       (split to 2 registers)
+
+  // long long, __int64 --> GPR
+  CCIfType<[i64], CCAssignToReg<RC.GPR_64>>,
+
+  // TODO: Handle the case of long double (f80)
+
+  // float, double, float128 --> XMM
+  CCIfType<[f32, f64, f128], CCAssignToReg<RC.XMM>>,
+
+  // __m128, __m128i, __m128d --> XMM
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+           CCAssignToReg<RC.XMM>>,
+
+  // __m256, __m256i, __m256d --> YMM
+  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+           CCAssignToReg<RC.YMM>>,
+
+  // __m512, __m512i, __m512d --> ZMM
+  CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+           CCAssignToReg<RC.ZMM>>,
+
+  // If no register was found -> assign to stack
+
+  // In 64 bit, assign 64/32 bit values to 8 byte stack
+  CCIfSubtarget<"is64Bit()", CCIfType<[i32, i64, f32, f64],
+                                      CCAssignToStack<8, 8>>>,
+
+  // In 32 bit, assign 64/32 bit values to 8/4 byte stack
+  CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+  CCIfType<[f64], CCAssignToStack<8, 4>>,
+
+  // Long doubles get stack slots whose size and alignment depend on the
+  // subtarget.
+  CCIfSubtarget<"is64Bit()", CCIfType<[f80], CCAssignToStack<0, 0>>>,
+  CCIfType<[f80], CCAssignToStack<0, 4>>,
+
+  // MMX types get an 8 byte stack slot, while alignment depends on the target
+  CCIfSubtarget<"is64Bit()", CCIfType<[x86mmx], CCAssignToStack<8, 8>>>,
+  CCIfType<[x86mmx], CCAssignToStack<8, 4>>,
+
+  // float 128 gets stack slots whose size and alignment depend
+  // on the subtarget.
+  CCIfType<[f128], CCAssignToStack<0, 0>>,
+
+  // Vectors get 16-byte stack slots that are 16-byte aligned.
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+           CCAssignToStack<16, 16>>,
+
+  // 256-bit vectors get 32-byte stack slots that are 32-byte aligned.
+  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+           CCAssignToStack<32, 32>>,
+
+  // 512-bit vectors get 64-byte stack slots that are 64-byte aligned.
+  CCIfType<[v16i32, v8i64, v16f32, v8f64], CCAssignToStack<64, 64>>
+]>;
+
+def RetCC_#NAME : CallingConv<[
+  // Promote i1 arguments to i8.
+  CCIfType<[i1], CCPromoteToType<i8>>,
+
+  // bool, char, int, enum, long, pointer --> GPR
+  CCIfType<[i8], CCAssignToReg<RC.GPR_8>>,
+  CCIfType<[i16], CCAssignToReg<RC.GPR_16>>,
+  CCIfType<[i32], CCAssignToReg<RC.GPR_32>>,
+
+  // TODO: Handle the case of mask types (v*i1)
+  // TODO: Handle the case of 32 bit machine with v64i1 argument
+  //       (split to 2 registers)
+
+  // long long, __int64 --> GPR
+  CCIfType<[i64], CCAssignToReg<RC.GPR_64>>,
+
+  // long double --> FP
+  CCIfType<[f80], CCAssignToReg<[FP0]>>,
+
+  // float, double, float128 --> XMM
+  CCIfType<[f32, f64, f128], CCAssignToReg<RC.XMM>>,
+
+  // __m128, __m128i, __m128d --> XMM
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+           CCAssignToReg<RC.XMM>>,
+
+  // __m256, __m256i, __m256d --> YMM
+  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+           CCAssignToReg<RC.YMM>>,
+
+  // __m512, __m512i, __m512d --> ZMM
+  CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
+           CCAssignToReg<RC.ZMM>>
+]>;
+}
+
 //===----------------------------------------------------------------------===//
 // Return Value Calling Conventions
 //===----------------------------------------------------------------------===//
@@ -234,6 +396,14 @@
                                  RAX, R10, R11, R13, R14, R15]>>
 ]>;
 
+
+defm X86_32_RegCall :
+     X86_RegCall_base<RC_X86_32_RegCall>;
+defm X86_Win64_RegCall :
+     X86_RegCall_base<RC_X86_64_RegCall_Win>;
+defm X86_Lin64_RegCall :
+     X86_RegCall_base<RC_X86_64_RegCall_Lin>;
+
 // This is the root return-value convention for the X86-32 backend.
 def RetCC_X86_32 : CallingConv<[
   // If FastCC, use RetCC_X86_32_Fast.
@@ -241,6 +411,7 @@
   // If HiPE, use RetCC_X86_32_HiPE.
   CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_32_HiPE>>,
   CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<RetCC_X86_32_VectorCall>>,
+  CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<RetCC_X86_32_RegCall>>,
 
   // Otherwise, use RetCC_X86_32_C.
   CCDelegateTo<RetCC_X86_32_C>
@@ -265,6 +436,12 @@
   // Handle HHVM calls.
   CCIfCC<"CallingConv::HHVM", CCDelegateTo<RetCC_X86_64_HHVM>>,
+  CCIfCC<"CallingConv::X86_RegCall",
+         CCIfSubtarget<"isTargetWin64()",
+                       CCDelegateTo<RetCC_X86_Win64_RegCall>>>,
+  CCIfCC<"CallingConv::X86_RegCall",
+         CCIfSubtargetLinuxOSX<CCDelegateTo<RetCC_X86_Lin64_RegCall>>>,
+
   // Mingw64 and native Win64 use Win64 CC
   CCIfSubtarget<"isTargetWin64()", CCDelegateTo<RetCC_X86_Win64_C>>,
@@ -814,6 +991,7 @@
   CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>,
   CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>,
   CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_32_HiPE>>,
+  CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<CC_X86_32_RegCall>>,
 
   // Otherwise, drop to normal X86-32 CC
   CCDelegateTo<CC_X86_32_C>
@@ -830,6 +1008,10 @@
   CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win64_VectorCall>>,
   CCIfCC<"CallingConv::HHVM", CCDelegateTo<CC_X86_64_HHVM>>,
   CCIfCC<"CallingConv::HHVM_C", CCDelegateTo<CC_X86_64_HHVM_C>>,
+  CCIfCC<"CallingConv::X86_RegCall",
+         CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_RegCall>>>,
+  CCIfCC<"CallingConv::X86_RegCall",
+         CCIfSubtargetLinuxOSX<CCDelegateTo<CC_X86_Lin64_RegCall>>>,
+
   CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_64_Intr>>,
 
   // Mingw64 and native Win64 use Win64 CC
@@ -931,3 +1113,14 @@
 // Only R12 is preserved for PHP calls in HHVM.
def CSR_64_HHVM : CalleeSavedRegs<(add R12)>; + +// Register calling convention preserves few GPR and XMM8-15 +// TODO: Register that passed as arguments doesn't need to be preserved +def CSR_32_RegCall : CalleeSavedRegs<(add ESI, EDI, EBX, EBP, ESP, + (sequence "XMM%u", 4, 7))>; +def CSR_Win64_RegCall : CalleeSavedRegs<(add RBX, RBP, RSP, + (sequence "R%u", 10, 15), + (sequence "XMM%u", 8, 15))>; +def CSR_Lin64_RegCall : CalleeSavedRegs<(add RBX, RBP, RSP, + (sequence "R%u", 12, 15), + (sequence "XMM%u", 8, 15))>; Index: lib/Target/X86/X86RegisterInfo.cpp =================================================================== --- lib/Target/X86/X86RegisterInfo.cpp +++ lib/Target/X86/X86RegisterInfo.cpp @@ -305,6 +305,10 @@ } case CallingConv::HHVM: return CSR_64_HHVM_SaveList; + case CallingConv::X86_RegCall: + return Is64Bit ? (IsWin64 ? CSR_Win64_RegCall_SaveList + : CSR_Lin64_RegCall_SaveList) + : CSR_32_RegCall_SaveList; case CallingConv::Cold: if (Is64Bit) return CSR_64_MostRegs_SaveList; @@ -401,6 +405,10 @@ } case CallingConv::HHVM: return CSR_64_HHVM_RegMask; + case CallingConv::X86_RegCall: + return Is64Bit ? (IsWin64 ? CSR_Win64_RegCall_RegMask + : CSR_Lin64_RegCall_RegMask) + : CSR_32_RegCall_RegMask; case CallingConv::Cold: if (Is64Bit) return CSR_64_MostRegs_RegMask; Index: test/CodeGen/X86/avx512-regcall-NoMask.ll =================================================================== --- test/CodeGen/X86/avx512-regcall-NoMask.ll +++ test/CodeGen/X86/avx512-regcall-NoMask.ll @@ -0,0 +1,540 @@ +; RUN: llc < %s -mtriple=i386-pc-win32 -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq | FileCheck --check-prefix=X32 %s +; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq | FileCheck --check-prefix=WIN64 %s +; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq | FileCheck --check-prefix=LINUXOSX64 %s + +; X32-LABEL: test_argReti1: +; X32: kmov{{.*}} %eax, %k{{[0-7]}} +; X32: kmov{{.*}} %k{{[0-7]}}, %eax +; X32: ret{{.*}} + +; WIN64-LABEL: test_argReti1: +; WIN64: kmov{{.*}} %eax, %k{{[0-7]}} +; WIN64: kmov{{.*}} %k{{[0-7]}}, %eax +; WIN64: ret{{.*}} + +; Test regcall when receiving/returning i1 +define x86_regcallcc i1 @test_argReti1(i1 %a) { + %add = add i1 %a, 1 + ret i1 %add +} + +; X32-LABEL: test_CallargReti1: +; X32: kmov{{.*}} %k{{[0-7]}}, %eax +; X32: call{{.*}} {{.*}}test_argReti1 +; X32: kmov{{.*}} %eax, %k{{[0-7]}} +; X32: ret{{.*}} + +; WIN64-LABEL: test_CallargReti1: +; WIN64: kmov{{.*}} %k{{[0-7]}}, %eax +; WIN64: call{{.*}} {{.*}}test_argReti1 +; WIN64: kmov{{.*}} %eax, %k{{[0-7]}} +; WIN64: ret{{.*}} + +; Test regcall when passing/retrieving i1 +define x86_regcallcc i1 @test_CallargReti1(i1 %a) { + %b = add i1 %a, 1 + %c = call x86_regcallcc i1 @test_argReti1(i1 %b) + %d = add i1 %c, 1 + ret i1 %d +} + +; X32-LABEL: test_argReti8: +; X32: incb %al +; X32: ret{{.*}} + +; WIN64-LABEL: test_argReti8: +; WIN64: incb %al +; WIN64: ret{{.*}} + +; Test regcall when receiving/returning i8 +define x86_regcallcc i8 @test_argReti8(i8 %a) { + %add = add i8 %a, 1 + ret i8 %add +} + +; X32-LABEL: test_CallargReti8: +; X32: incb %al +; X32: call{{.*}} {{.*}}test_argReti8 +; X32: incb %al +; X32: ret{{.*}} + +; WIN64-LABEL: test_CallargReti8: +; WIN64: incb %al +; WIN64: call{{.*}} {{.*}}test_argReti8 +; WIN64: incb %al +; WIN64: ret{{.*}} + +; Test regcall when passing/retrieving i8 +define x86_regcallcc i8 @test_CallargReti8(i8 %a) { + %b = add i8 %a, 1 + %c = 
call x86_regcallcc i8 @test_argReti8(i8 %b) + %d = add i8 %c, 1 + ret i8 %d +} + +; X32-LABEL: test_argReti16: +; X32: incl %eax +; X32: ret{{.*}} + +; WIN64-LABEL: test_argReti16: +; WIN64: incl %eax +; WIN64: ret{{.*}} + +; Test regcall when receiving/returning i16 +define x86_regcallcc i16 @test_argReti16(i16 %a) { + %add = add i16 %a, 1 + ret i16 %add +} + +; X32-LABEL: test_CallargReti16: +; X32: incl %eax +; X32: call{{.*}} {{.*}}test_argReti16 +; X32: incl %eax +; X32: ret{{.*}} + +; WIN64-LABEL: test_CallargReti16: +; WIN64: incl %eax +; WIN64: call{{.*}} {{.*}}test_argReti16 +; WIN64: incl %eax +; WIN64: ret{{.*}} + +; Test regcall when passing/retrieving i16 +define x86_regcallcc i16 @test_CallargReti16(i16 %a) { + %b = add i16 %a, 1 + %c = call x86_regcallcc i16 @test_argReti16(i16 %b) + %d = add i16 %c, 1 + ret i16 %d +} + +; X32-LABEL: test_argReti32: +; X32: incl %eax +; X32: ret{{.*}} + +; WIN64-LABEL: test_argReti32: +; WIN64: incl %eax +; WIN64: ret{{.*}} + +; Test regcall when receiving/returning i32 +define x86_regcallcc i32 @test_argReti32(i32 %a) { + %add = add i32 %a, 1 + ret i32 %add +} + +; X32-LABEL: test_CallargReti32: +; X32: incl %eax +; X32: call{{.*}} {{.*}}test_argReti32 +; X32: incl %eax +; X32: ret{{.*}} + +; WIN64-LABEL: test_CallargReti32: +; WIN64: incl %eax +; WIN64: call{{.*}} {{.*}}test_argReti32 +; WIN64: incl %eax +; WIN64: ret{{.*}} + +; Test regcall when passing/retrieving i32 +define x86_regcallcc i32 @test_CallargReti32(i32 %a) { + %b = add i32 %a, 1 + %c = call x86_regcallcc i32 @test_argReti32(i32 %b) + %d = add i32 %c, 1 + ret i32 %d +} + +; X32-LABEL: test_argReti64: +; X32: addl $1, %eax +; X32: ret{{.*}} + +; WIN64-LABEL: test_argReti64: +; WIN64: incq %rax +; WIN64: ret{{.*}} + +; Test regcall when receiving/returning i64 +define x86_regcallcc i64 @test_argReti64(i64 %a) { + %add = add i64 %a, 1 + ret i64 %add +} + +; X32-LABEL: test_CallargReti64: +; X32: add{{.*}} $1, %eax +; X32: adcl $0, {{%e(cx|dx|si|di|bx|bp)}} +; X32: call{{.*}} {{.*}}test_argReti64 +; X32: add{{.*}} $1, %eax +; X32: adcl $0, {{%e(cx|dx|si|di|bx|bp)}} +; X32: ret{{.*}} + +; WIN64-LABEL: test_CallargReti64: +; WIN64: incq %rax +; WIN64: call{{.*}} {{.*}}test_argReti64 +; WIN64: incq %rax +; WIN64: ret{{.*}} + +; Test regcall when passing/retrieving i64 +define x86_regcallcc i64 @test_CallargReti64(i64 %a) { + %b = add i64 %a, 1 + %c = call x86_regcallcc i64 @test_argReti64(i64 %b) + %d = add i64 %c, 1 + ret i64 %d +} + +; X32-LABEL: test_argRetFloat: +; X32: vadd{{.*}} {{.*}}, {{%xmm([0-7])}} +; X32: ret{{.*}} + +; WIN64-LABEL: test_argRetFloat: +; WIN64: vadd{{.*}} {{.*}}, {{%xmm([0-7])}} +; WIN64: ret{{.*}} + +; Test regcall when receiving/returning float +define x86_regcallcc float @test_argRetFloat(float %a) { + %add = fadd float 1.0, %a + ret float %add +} + +; X32-LABEL: test_CallargRetFloat: +; X32: vadd{{.*}} {{%xmm([0-7])}}, {{%xmm([0-7])}}, {{%xmm([0-7])}} +; X32: call{{.*}} {{.*}}test_argRetFloat +; X32: vadd{{.*}} {{%xmm([0-7])}}, {{%xmm([0-7])}}, {{%xmm([0-7])}} +; X32: ret{{.*}} + +; WIN64-LABEL: test_CallargRetFloat: +; WIN64: vadd{{.*}} {{%xmm([0-9]+)}}, {{%xmm([0-9]+)}}, {{%xmm([0-9]+)}} +; WIN64: call{{.*}} {{.*}}test_argRetFloat +; WIN64: vadd{{.*}} {{%xmm([0-9]+)}}, {{%xmm([0-9]+)}}, {{%xmm([0-9]+)}} +; WIN64: ret{{.*}} + +; Test regcall when passing/retrieving float +define x86_regcallcc float @test_CallargRetFloat(float %a) { + %b = fadd float 1.0, %a + %c = call x86_regcallcc float @test_argRetFloat(float %b) + %d = fadd float 1.0, %c + ret 
float %d +} + +; X32-LABEL: test_argRetDouble: +; X32: vadd{{.*}} {{.*}}, {{%xmm([0-7])}} +; X32: ret{{.*}} + +; WIN64-LABEL: test_argRetDouble: +; WIN64: vadd{{.*}} {{.*}}, {{%xmm([0-7])}} +; WIN64: ret{{.*}} + +; Test regcall when receiving/returning double +define x86_regcallcc double @test_argRetDouble(double %a) { + %add = fadd double %a, 1.0 + ret double %add +} + +; X32-LABEL: test_CallargRetDouble: +; X32: vadd{{.*}} {{%xmm([0-7])}}, {{%xmm([0-7])}}, {{%xmm([0-7])}} +; X32: call{{.*}} {{.*}}test_argRetDouble +; X32: vadd{{.*}} {{%xmm([0-7])}}, {{%xmm([0-7])}}, {{%xmm([0-7])}} +; X32: ret{{.*}} + +; WIN64-LABEL: test_CallargRetDouble: +; WIN64: vadd{{.*}} {{%xmm([0-9]+)}}, {{%xmm([0-9]+)}}, {{%xmm([0-9]+)}} +; WIN64: call{{.*}} {{.*}}test_argRetDouble +; WIN64: vadd{{.*}} {{%xmm([0-9]+)}}, {{%xmm([0-9]+)}}, {{%xmm([0-9]+)}} +; WIN64: ret{{.*}} + +; Test regcall when passing/retrieving double +define x86_regcallcc double @test_CallargRetDouble(double %a) { + %b = fadd double 1.0, %a + %c = call x86_regcallcc double @test_argRetDouble(double %b) + %d = fadd double 1.0, %c + ret double %d +} + +; X32-LABEL: test_argRetPointer: +; X32: incl %eax +; X32: ret{{.*}} + +; WIN64-LABEL: test_argRetPointer: +; WIN64: incl %eax +; WIN64: ret{{.*}} + +; Test regcall when receiving/returning pointer +define x86_regcallcc [4 x i32]* @test_argRetPointer([4 x i32]* %a) { + %b = ptrtoint [4 x i32]* %a to i32 + %c = add i32 %b, 1 + %d = inttoptr i32 %c to [4 x i32]* + ret [4 x i32]* %d +} + +; X32-LABEL: test_CallargRetPointer: +; X32: incl %eax +; X32: call{{.*}} {{.*}}test_argRetPointer +; X32: incl %eax +; X32: ret{{.*}} + +; WIN64-LABEL: test_CallargRetPointer: +; WIN64: incl %eax +; WIN64: call{{.*}} {{.*}}test_argRetPointer +; WIN64: incl %eax +; WIN64: ret{{.*}} + +; Test regcall when passing/retrieving pointer +define x86_regcallcc [4 x i32]* @test_CallargRetPointer([4 x i32]* %a) { + %b = ptrtoint [4 x i32]* %a to i32 + %c = add i32 %b, 1 + %d = inttoptr i32 %c to [4 x i32]* + %e = call x86_regcallcc [4 x i32]* @test_argRetPointer([4 x i32]* %d) + %f = ptrtoint [4 x i32]* %e to i32 + %g = add i32 %f, 1 + %h = inttoptr i32 %g to [4 x i32]* + ret [4 x i32]* %h +} + +; X32-LABEL: test_argRet128Vector: +; X32: vpblend{{.*}} {{%xmm([0-7])}}, {{%xmm([0-7])}}, {{%xmm([0-7])}} +; X32: ret{{.*}} + +; WIN64-LABEL: test_argRet128Vector: +; WIN64: vpblend{{.*}} {{%xmm([0-9]+)}}, {{%xmm([0-9]+)}}, {{%xmm([0-9]+)}} +; WIN64: ret{{.*}} + +; Test regcall when receiving/returning 128 bit vector +define x86_regcallcc <4 x i32> @test_argRet128Vector(<4 x i32> %a, <4 x i32> %b) { + %d = select <4 x i1> undef , <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %d +} + +; X32-LABEL: test_CallargRet128Vector: +; X32: vmov{{.*}} {{%xmm([0-7])}}, {{%xmm([0-7])}} +; X32: call{{.*}} {{.*}}test_argRet128Vector +; X32: vpblend{{.*}} {{%xmm([0-7])}}, {{%xmm([0-7])}}, {{%xmm([0-7])}} +; X32: ret{{.*}} + +; WIN64-LABEL: test_CallargRet128Vector: +; WIN64: vmov{{.*}} {{%xmm([0-9]+)}}, {{%xmm([0-9]+)}} +; WIN64: call{{.*}} {{.*}}test_argRet128Vector +; WIN64: vpblend{{.*}} {{%xmm([0-9]+)}}, {{%xmm([0-9]+)}}, {{%xmm([0-9]+)}} +; WIN64: ret{{.*}} + +; Test regcall when passing/retrieving 128 bit vector +define x86_regcallcc <4 x i32> @test_CallargRet128Vector(<4 x i32> %a) { + %b = call x86_regcallcc <4 x i32> @test_argRet128Vector(<4 x i32> %a, <4 x i32> %a) + %c = select <4 x i1> undef , <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %c +} + +; X32-LABEL: test_argRet256Vector: +; X32: vpblend{{.*}} {{%ymm([0-7])}}, {{%ymm([0-7])}}, 
{{%ymm([0-7])}} +; X32: ret{{.*}} + +; WIN64-LABEL: test_argRet256Vector: +; WIN64: vpblend{{.*}} {{%ymm([0-9]+)}}, {{%ymm([0-9]+)}}, {{%ymm([0-9]+)}} +; WIN64: ret{{.*}} + +; Test regcall when receiving/returning 256 bit vector +define x86_regcallcc <8 x i32> @test_argRet256Vector(<8 x i32> %a, <8 x i32> %b) { + %d = select <8 x i1> undef , <8 x i32> %a, <8 x i32> %b + ret <8 x i32> %d +} + +; X32-LABEL: test_CallargRet256Vector: +; X32: vmov{{.*}} {{%ymm([0-7])}}, {{%ymm([0-7])}} +; X32: call{{.*}} {{.*}}test_argRet256Vector +; X32: vpblend{{.*}} {{%ymm([0-7])}}, {{%ymm([0-7])}}, {{%ymm([0-7])}} +; X32: ret{{.*}} + +; WIN64-LABEL: test_CallargRet256Vector: +; WIN64: vmov{{.*}} {{%ymm([0-9]+)}}, {{%ymm([0-9]+)}} +; WIN64: call{{.*}} {{.*}}test_argRet256Vector +; WIN64: vpblend{{.*}} {{%ymm([0-9]+)}}, {{%ymm([0-9]+)}}, {{%ymm([0-9]+)}} +; WIN64: ret{{.*}} + +; Test regcall when passing/retrieving 256 bit vector +define x86_regcallcc <8 x i32> @test_CallargRet256Vector(<8 x i32> %a) { + %b = call x86_regcallcc <8 x i32> @test_argRet256Vector(<8 x i32> %a, <8 x i32> %a) + %c = select <8 x i1> undef , <8 x i32> %a, <8 x i32> %b + ret <8 x i32> %c +} + +; X32-LABEL: test_argRet512Vector: +; X32: vpblend{{.*}} {{%zmm([0-7])}}, {{%zmm([0-7])}}, {{%zmm([0-7])}} +; X32: ret{{.*}} + +; WIN64-LABEL: test_argRet512Vector: +; WIN64: vpblend{{.*}} {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}} +; WIN64: ret{{.*}} + +; Test regcall when receiving/returning 512 bit vector +define x86_regcallcc <16 x i32> @test_argRet512Vector(<16 x i32> %a, <16 x i32> %b) { + %d = select <16 x i1> undef , <16 x i32> %a, <16 x i32> %b + ret <16 x i32> %d +} + +; X32-LABEL: test_CallargRet512Vector: +; X32: vmov{{.*}} {{%zmm([0-7])}}, {{%zmm([0-7])}} +; X32: call{{.*}} {{.*}}test_argRet512Vector +; X32: vpblend{{.*}} {{%zmm([0-7])}}, {{%zmm([0-7])}}, {{%zmm([0-7])}} +; X32: ret{{.*}} + +; WIN64-LABEL: test_CallargRet512Vector: +; WIN64: vmov{{.*}} {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}} +; WIN64: call{{.*}} {{.*}}test_argRet512Vector +; WIN64: vpblend{{.*}} {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}} +; WIN64: ret{{.*}} + +; Test regcall when passing/retrieving 512 bit vector +define x86_regcallcc <16 x i32> @test_CallargRet512Vector(<16 x i32> %a) { + %b = call x86_regcallcc <16 x i32> @test_argRet512Vector(<16 x i32> %a, <16 x i32> %a) + %c = select <16 x i1> undef , <16 x i32> %a, <16 x i32> %b + ret <16 x i32> %c +} + +; WIN64: testf32_inp +; WIN64: {{.*}} {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}} +; WIN64: {{.*}} {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}} +; WIN64: {{.*}} {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}} +; WIN64: {{.*}} {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}} +; WIN64: {{.*}} {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}} +; WIN64: {{.*}} {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}} +; WIN64: {{.*}} {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}} +; WIN64: {{.*}} {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}} +; WIN64: retq + +; X32: testf32_inp +; X32: vmovups {{%xmm([0-7])}}, {{.*(%esp).*}} {{#+}} 16-byte Spill +; X32: vmovups {{%xmm([0-7])}}, {{.*(%esp).*}} {{#+}} 16-byte Spill +; X32: {{.*}} {{%zmm[0-7]}}, {{%zmm[0-7]}}, {{%zmm[0-7]}} +; X32: {{.*}} {{%zmm[0-7]}}, {{%zmm[0-7]}}, {{%zmm[0-7]}} +; X32: {{.*}} {{%zmm[0-7]}}, {{%zmm[0-7]}}, {{%zmm[0-7]}} +; X32: {{.*}} {{%zmm[0-7]}}, 
{{%zmm[0-7]}}, {{%zmm[0-7]}} +; X32: {{.*}} {{%zmm[0-7]}}, {{%zmm[0-7]}}, {{%zmm[0-7]}} +; X32: {{.*}} {{%zmm[0-7]}}, {{%zmm[0-7]}}, {{%zmm[0-7]}} +; X32: {{.*}} {{%zmm[0-7]}}, {{%zmm[0-7]}}, {{%zmm[0-7]}} +; X32: {{.*}} {{%zmm[0-7]}}, {{%zmm[0-7]}}, {{%zmm[0-7]}} +; X32: vmovups {{.*(%esp).*}}, {{%xmm([0-7])}} {{#+}} 16-byte Reload +; X32: vmovups {{.*(%esp).*}}, {{%xmm([0-7])}} {{#+}} 16-byte Reload +; X32: retl + +; LINUXOSX64: testf32_inp +; LINUXOSX64: {{.*}} {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}} +; LINUXOSX64: {{.*}} {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}} +; LINUXOSX64: {{.*}} {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}} +; LINUXOSX64: {{.*}} {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}} +; LINUXOSX64: {{.*}} {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}} +; LINUXOSX64: {{.*}} {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}} +; LINUXOSX64: {{.*}} {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}} +; LINUXOSX64: {{.*}} {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}}, {{%zmm([0-9]|1[0-1])}} +; LINUXOSX64: retq + +; Test regcall when running multiple input parameters - callee saved XMMs +define x86_regcallcc <32 x float> @testf32_inp(<32 x float> %a, <32 x float> %b, <32 x float> %c) nounwind { + %x1 = fadd <32 x float> %a, %b + %x2 = fmul <32 x float> %a, %b + %x3 = fsub <32 x float> %x1, %x2 + %x4 = fadd <32 x float> %x3, %c + ret <32 x float> %x4 +} + +; X32: pushl {{%e(si|di|bx|bp)}} +; X32: pushl {{%e(si|di|bx|bp)}} +; X32: pushl {{%e(si|di|bx|bp)}} +; X32: pushl {{%e(si|di|bx|bp)}} +; X32: popl {{%e(si|di|bx|bp)}} +; X32: popl {{%e(si|di|bx|bp)}} +; X32: popl {{%e(si|di|bx|bp)}} +; X32: popl {{%e(si|di|bx|bp)}} +; X32: retl + +; WIN64: pushq {{%r(bp|bx|1[0-5])}} +; WIN64: pushq {{%r(bp|bx|1[0-5])}} +; WIN64: pushq {{%r(bp|bx|1[0-5])}} +; WIN64: pushq {{%r(bp|bx|1[0-5])}} +; WIN64: popq {{%r(bp|bx|1[0-5])}} +; WIN64: popq {{%r(bp|bx|1[0-5])}} +; WIN64: popq {{%r(bp|bx|1[0-5])}} +; WIN64: popq {{%r(bp|bx|1[0-5])}} +; WIN64: retq + +; LINUXOSX64: pushq {{%r(bp|bx|1[2-5])}} +; LINUXOSX64: pushq {{%r(bp|bx|1[2-5])}} +; LINUXOSX64: pushq {{%r(bp|bx|1[2-5])}} +; LINUXOSX64: popq {{%r(bp|bx|1[2-5])}} +; LINUXOSX64: popq {{%r(bp|bx|1[2-5])}} +; LINUXOSX64: popq {{%r(bp|bx|1[2-5])}} +; LINUXOSX64: retq + +; Test regcall when running multiple input parameters - callee saved GPRs +define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, + i32 %b1, i32 %b2, i32 %b3, i32 %b4, i32 %b5, i32 %b6) nounwind { + %x1 = sub i32 %a1, %a2 + %x2 = sub i32 %a3, %a4 + %x3 = sub i32 %a5, %a6 + %y1 = sub i32 %b1, %b2 + %y2 = sub i32 %b3, %b4 + %y3 = sub i32 %b5, %b6 + %v1 = add i32 %a1, %a2 + %v2 = add i32 %a3, %a4 + %v3 = add i32 %a5, %a6 + %w1 = add i32 %b1, %b2 + %w2 = add i32 %b3, %b4 + %w3 = add i32 %b5, %b6 + %s1 = mul i32 %x1, %y1 + %s2 = mul i32 %x2, %y2 + %s3 = mul i32 %x3, %y3 + %t1 = mul i32 %v1, %w1 + %t2 = mul i32 %v2, %w2 + %t3 = mul i32 %v3, %w3 + %m1 = add i32 %s1, %s2 + %m2 = add i32 %m1, %s3 + %n1 = add i32 %t1, %t2 + %n2 = add i32 %n1, %t3 + %r1 = add i32 %m2, %n2 + ret i32 %r1 +} + +; X32: testf32_stack +; X32: vaddps {{%zmm([0-7])}}, {{%zmm([0-7])}}, {{%zmm([0-7])}} +; X32: vaddps {{%zmm([0-7])}}, {{%zmm([0-7])}}, {{%zmm([0-7])}} +; X32: vaddps {{%zmm([0-7])}}, {{%zmm([0-7])}}, {{%zmm([0-7])}} +; X32: vaddps {{%zmm([0-7])}}, {{%zmm([0-7])}}, {{%zmm([0-7])}} +; X32: vaddps 
{{%zmm([0-7])}}, {{%zmm([0-7])}}, {{%zmm([0-7])}} +; X32: vaddps {{%zmm([0-7])}}, {{%zmm([0-7])}}, {{%zmm([0-7])}} +; X32: vaddps {{([0-9])+}}(%ebp), {{%zmm([0-7])}}, {{%zmm([0-7])}} +; X32: vaddps {{([0-9])+}}(%ebp), {{%zmm([0-7])}}, {{%zmm([0-7])}} +; X32: vaddps {{([0-9])+}}(%ebp), {{%zmm([0-7])}}, {{%zmm([0-7])}} +; X32: vaddps {{([0-9])+}}(%ebp), {{%zmm([0-7])}}, {{%zmm([0-7])}} +; X32: vaddps {{([0-9])+}}(%ebp), {{%zmm([0-7])}}, {{%zmm([0-7])}} +; X32: vaddps {{([0-9])+}}(%ebp), {{%zmm([0-7])}}, {{%zmm([0-7])}} +; X32: vaddps {{([0-9])+}}(%ebp), {{%zmm([0-7])}}, {{%zmm([0-7])}} +; X32: vaddps {{([0-9])+}}(%ebp), {{%zmm([0-7])}}, {{%zmm([0-7])}} +; X32: vaddps {{([0-9])+}}(%ebp), {{%zmm([0-7])}}, {{%zmm([0-1])}} +; X32: vaddps {{([0-9])+}}(%ebp), {{%zmm([0-7])}}, {{%zmm([0-1])}} +; X32: retl + +; LINUXOSX64: testf32_stack +; LINUXOSX64: vaddps {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}} +; LINUXOSX64: vaddps {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}} +; LINUXOSX64: vaddps {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}} +; LINUXOSX64: vaddps {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}} +; LINUXOSX64: vaddps {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}} +; LINUXOSX64: vaddps {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}} +; LINUXOSX64: vaddps {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}} +; LINUXOSX64: vaddps {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}} +; LINUXOSX64: vaddps {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}} +; LINUXOSX64: vaddps {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}} +; LINUXOSX64: vaddps {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}} +; LINUXOSX64: vaddps {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}} +; LINUXOSX64: vaddps {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}} +; LINUXOSX64: vaddps {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}}, {{%zmm([0-9]+)}} +; LINUXOSX64: vaddps {{([0-9])+}}(%rbp), {{%zmm([0-9]+)}}, {{%zmm([0-1])}} +; LINUXOSX64: vaddps {{([0-9])+}}(%rbp), {{%zmm([0-9]+)}}, {{%zmm([0-1])}} +; LINUXOSX64: retq + +; Test that parameters, overflowing register capacity, are passed through the stack +define x86_regcallcc <32 x float> @testf32_stack(<32 x float> %a0, <32 x float> %b0, <32 x float> %c0, + <32 x float> %a1, <32 x float> %b1, <32 x float> %c1, + <32 x float> %a2, <32 x float> %b2, <32 x float> %c2) nounwind { + %x1 = fadd <32 x float> %a0, %b0 + %x2 = fadd <32 x float> %c0, %x1 + %x3 = fadd <32 x float> %a1, %x2 + %x4 = fadd <32 x float> %b1, %x3 + %x5 = fadd <32 x float> %c1, %x4 + %x6 = fadd <32 x float> %a2, %x5 + %x7 = fadd <32 x float> %b2, %x6 + %x8 = fadd <32 x float> %c2, %x7 + ret <32 x float> %x8 +} Index: test/CodeGen/X86/sse-regcall.ll =================================================================== --- test/CodeGen/X86/sse-regcall.ll +++ test/CodeGen/X86/sse-regcall.ll @@ -0,0 +1,207 @@ +; RUN: llc < %s -mtriple=i386-pc-win32 -mattr=+sse | FileCheck --check-prefix=WIN32 %s +; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse | FileCheck --check-prefix=WIN64 %s +; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+sse | FileCheck --check-prefix=LINUXOSX %s + +; WIN32-LABEL: test_argReti1: +; WIN32: incb %al +; WIN32: ret{{.*}} + +; WIN64-LABEL: test_argReti1: +; WIN64: incb %al +; WIN64: ret{{.*}} + +; Test regcall when receiving/returning i1 +define x86_regcallcc i1 @test_argReti1(i1 %a) { + %add = add i1 %a, 1 + ret i1 %add +} + +; WIN32-LABEL: test_CallargReti1: +; WIN32: movzbl %al, %eax +; WIN32: call{{.*}} {{.*}}test_argReti1 +; WIN32: incb %al +; 
WIN32: ret{{.*}} + +; WIN64-LABEL: test_CallargReti1: +; WIN64: movzbl %al, %eax +; WIN64: call{{.*}} {{.*}}test_argReti1 +; WIN64: incb %al +; WIN64: ret{{.*}} + +; Test regcall when passing/retrieving i1 +define x86_regcallcc i1 @test_CallargReti1(i1 %a) { + %b = add i1 %a, 1 + %c = call x86_regcallcc i1 @test_argReti1(i1 %b) + %d = add i1 %c, 1 + ret i1 %d +} + +; WIN64-LABEL: testf32_inp +; WIN64: movaps {{%xmm(1[2-5])}}, {{.*(%rsp).*}} {{#+}} 16-byte Spill +; WIN64: movaps {{%xmm(1[2-5])}}, {{.*(%rsp).*}} {{#+}} 16-byte Spill +; WIN64: movaps {{%xmm(1[2-5])}}, {{.*(%rsp).*}} {{#+}} 16-byte Spill +; WIN64: movaps {{%xmm(1[2-5])}}, {{.*(%rsp).*}} {{#+}} 16-byte Spill +; WIN64: {{.*}} {{%xmm([0-9]|1[0-1])}}, {{%xmm(1[2-5])}} +; WIN64: {{.*}} {{%xmm([0-9]|1[0-1])}}, {{%xmm(1[2-5])}} +; WIN64: {{.*}} {{%xmm([0-9]|1[0-1])}}, {{%xmm(1[2-5])}} +; WIN64: {{.*}} {{%xmm([0-9]|1[0-1])}}, {{%xmm(1[2-5])}} +; WIN64: movaps {{.*(%rsp).*}}, {{%xmm(1[2-5])}} {{#+}} 16-byte Reload +; WIN64: movaps {{.*(%rsp).*}}, {{%xmm(1[2-5])}} {{#+}} 16-byte Reload +; WIN64: movaps {{.*(%rsp).*}}, {{%xmm(1[2-5])}} {{#+}} 16-byte Reload +; WIN64: movaps {{.*(%rsp).*}}, {{%xmm(1[2-5])}} {{#+}} 16-byte Reload +; WIN64: retq + +; WIN32-LABEL: testf32_inp +; WIN32: movaps {{%xmm([4-7])}}, {{.*(%ebp).*}} {{#+}} 16-byte Spill +; WIN32: movaps {{%xmm([4-7])}}, {{.*(%ebp).*}} {{#+}} 16-byte Spill +; WIN32: movaps {{%xmm([4-7])}}, {{.*(%ebp).*}} {{#+}} 16-byte Spill +; WIN32: movaps {{%xmm([4-7])}}, {{.*(%ebp).*}} {{#+}} 16-byte Spill +; WIN32: {{.*}} {{%xmm[0-7]}}, {{%xmm[4-7]}} +; WIN32: {{.*}} {{%xmm[0-7]}}, {{%xmm[4-7]}} +; WIN32: {{.*}} {{%xmm[0-7]}}, {{%xmm[4-7]}} +; WIN32: {{.*}} {{%xmm[0-7]}}, {{%xmm[4-7]}} +; WIN32: movaps {{.*(%ebp).*}}, {{%xmm([4-7])}} {{#+}} 16-byte Reload +; WIN32: movaps {{.*(%ebp).*}}, {{%xmm([4-7])}} {{#+}} 16-byte Reload +; WIN32: movaps {{.*(%ebp).*}}, {{%xmm([4-7])}} {{#+}} 16-byte Reload +; WIN32: movaps {{.*(%ebp).*}}, {{%xmm([4-7])}} {{#+}} 16-byte Reload +; WIN32: retl + +; LINUXOSX-LABEL: testf32_inp +; LINUXOSX: movaps {{%xmm(1[2-5])}}, {{.*(%rsp).*}} {{#+}} 16-byte Spill +; LINUXOSX: movaps {{%xmm(1[2-5])}}, {{.*(%rsp).*}} {{#+}} 16-byte Spill +; LINUXOSX: movaps {{%xmm(1[2-5])}}, {{.*(%rsp).*}} {{#+}} 16-byte Spill +; LINUXOSX: movaps {{%xmm(1[2-5])}}, {{.*(%rsp).*}} {{#+}} 16-byte Spill +; LINUXOSX: {{.*}} {{%xmm([0-9]|1[0-1])}}, {{%xmm(1[2-5])}} +; LINUXOSX: {{.*}} {{%xmm([0-9]|1[0-1])}}, {{%xmm(1[2-5])}} +; LINUXOSX: {{.*}} {{%xmm([0-9]|1[0-1])}}, {{%xmm(1[2-5])}} +; LINUXOSX: {{.*}} {{%xmm([0-9]|1[0-1])}}, {{%xmm(1[2-5])}} +; LINUXOSX: movaps {{.*(%rsp).*}}, {{%xmm(1[2-5])}} {{#+}} 16-byte Reload +; LINUXOSX: movaps {{.*(%rsp).*}}, {{%xmm(1[2-5])}} {{#+}} 16-byte Reload +; LINUXOSX: movaps {{.*(%rsp).*}}, {{%xmm(1[2-5])}} {{#+}} 16-byte Reload +; LINUXOSX: movaps {{.*(%rsp).*}}, {{%xmm(1[2-5])}} {{#+}} 16-byte Reload +; LINUXOSX: retq + +;test calling conventions - input parameters, callee saved XMMs +define x86_regcallcc <16 x float> @testf32_inp(<16 x float> %a, <16 x float> %b, <16 x float> %c) nounwind { + %x1 = fadd <16 x float> %a, %b + %x2 = fmul <16 x float> %a, %b + %x3 = fsub <16 x float> %x1, %x2 + %x4 = fadd <16 x float> %x3, %c + ret <16 x float> %x4 +} + +; WIN32-LABEL: testi32_inp +; WIN32: pushl {{%e(si|di|bx|bp)}} +; WIN32: pushl {{%e(si|di|bx|bp)}} +; WIN32: pushl {{%e(si|di|bx|bp)}} +; WIN32: pushl {{%e(si|di|bx|bp)}} +; WIN32: popl {{%e(si|di|bx|bp)}} +; WIN32: popl {{%e(si|di|bx|bp)}} +; WIN32: popl {{%e(si|di|bx|bp)}} +; WIN32: popl {{%e(si|di|bx|bp)}} +; 
WIN32: retl + +; WIN64-LABEL: testi32_inp +; WIN64: pushq {{%r(bp|bx|1[0-5])}} +; WIN64: pushq {{%r(bp|bx|1[0-5])}} +; WIN64: pushq {{%r(bp|bx|1[0-5])}} +; WIN64: pushq {{%r(bp|bx|1[0-5])}} +; WIN64: pushq {{%r(bp|bx|1[0-5])}} +; WIN64: popq {{%r(bp|bx|1[0-5])}} +; WIN64: popq {{%r(bp|bx|1[0-5])}} +; WIN64: popq {{%r(bp|bx|1[0-5])}} +; WIN64: popq {{%r(bp|bx|1[0-5])}} +; WIN64: popq {{%r(bp|bx|1[0-5])}} +; WIN64: retq + +; LINUXOSX-LABEL: testi32_inp +; LINUXOSX: pushq {{%r(bp|bx|1[2-5])}} +; LINUXOSX: pushq {{%r(bp|bx|1[2-5])}} +; LINUXOSX: pushq {{%r(bp|bx|1[2-5])}} +; LINUXOSX: pushq {{%r(bp|bx|1[2-5])}} +; LINUXOSX: popq {{%r(bp|bx|1[2-5])}} +; LINUXOSX: popq {{%r(bp|bx|1[2-5])}} +; LINUXOSX: popq {{%r(bp|bx|1[2-5])}} +; LINUXOSX: popq {{%r(bp|bx|1[2-5])}} +; LINUXOSX: retq + +;test calling conventions - input parameters, callee saved GPRs +define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, + i32 %b1, i32 %b2, i32 %b3, i32 %b4, i32 %b5, i32 %b6) nounwind { + %x1 = sub i32 %a1, %a2 + %x2 = sub i32 %a3, %a4 + %x3 = sub i32 %a5, %a6 + %y1 = sub i32 %b1, %b2 + %y2 = sub i32 %b3, %b4 + %y3 = sub i32 %b5, %b6 + %v1 = add i32 %a1, %a2 + %v2 = add i32 %a3, %a4 + %v3 = add i32 %a5, %a6 + %w1 = add i32 %b1, %b2 + %w2 = add i32 %b3, %b4 + %w3 = add i32 %b5, %b6 + %s1 = mul i32 %x1, %y1 + %s2 = mul i32 %x2, %y2 + %s3 = mul i32 %x3, %y3 + %t1 = mul i32 %v1, %w1 + %t2 = mul i32 %v2, %w2 + %t3 = mul i32 %v3, %w3 + %m1 = add i32 %s1, %s2 + %m2 = add i32 %m1, %s3 + %n1 = add i32 %t1, %t2 + %n2 = add i32 %n1, %t3 + %r1 = add i32 %m2, %n2 + ret i32 %r1 +} + +; X32: testf32_stack +; X32: movaps {{%xmm([0-7])}}, {{(-*[0-9])+}}(%ebp) +; X32: movaps {{%xmm([0-7])}}, {{(-*[0-9])+}}(%ebp) +; X32: movaps {{%xmm([0-7])}}, {{(-*[0-9])+}}(%ebp) +; X32: movaps {{%xmm([0-7])}}, {{(-*[0-9])+}}(%ebp) +; X32: addps {{([0-9])+}}(%ebp), {{%xmm([0-7])}} +; X32: addps {{([0-9])+}}(%ebp), {{%xmm([0-7])}} +; X32: addps {{([0-9])+}}(%ebp), {{%xmm([0-7])}} +; X32: addps {{([0-9])+}}(%ebp), {{%xmm([0-7])}} +; X32: addps {{([0-9])+}}(%ebp), {{%xmm([0-7])}} +; X32: addps {{([0-9])+}}(%ebp), {{%xmm([0-7])}} +; X32: addps {{([0-9])+}}(%ebp), {{%xmm([0-7])}} +; X32: addps {{([0-9])+}}(%ebp), {{%xmm([0-7])}} +; X32: addps {{([0-9])+}}(%ebp), {{%xmm([0-7])}} +; X32: addps {{([0-9])+}}(%ebp), {{%xmm([0-7])}} +; X32: addps {{([0-9])+}}(%ebp), {{%xmm([0-7])}} +; X32: addps {{([0-9])+}}(%ebp), {{%xmm([0-7])}} +; X32: addps {{([0-9])+}}(%ebp), {{%xmm([0-7])}} +; X32: addps {{([0-9])+}}(%ebp), {{%xmm([0-7])}} +; X32: addps {{([0-9])+}}(%ebp), {{%xmm([0-7])}} +; X32: addps {{([0-9])+}}(%ebp), {{%xmm([0-7])}} +; X32: movaps {{(-*[0-9])+}}(%ebp), {{%xmm([0-7])}} +; X32: movaps {{(-*[0-9])+}}(%ebp), {{%xmm([0-7])}} +; X32: movaps {{(-*[0-9])+}}(%ebp), {{%xmm([0-7])}} +; X32: movaps {{(-*[0-9])+}}(%ebp), {{%xmm([0-7])}} + +; LINUXOSX: testf32_stack +; LINUXOSX: addps {{%xmm([0-9]+)}}, {{%xmm([0-9]+)}} +; LINUXOSX: addps {{%xmm([0-9]+)}}, {{%xmm([0-9]+)}} +; LINUXOSX: addps {{%xmm([0-9]+)}}, {{%xmm([0-9]+)}} +; LINUXOSX: addps {{%xmm([0-9]+)}}, {{%xmm([0-9]+)}} +; LINUXOSX: addps {{%xmm([0-9]+)}}, {{%xmm([0-9]+)}} +; LINUXOSX: addps {{%xmm([0-9]+)}}, {{%xmm([0-9]+)}} +; LINUXOSX: addps {{%xmm([0-9]+)}}, {{%xmm([0-9]+)}} +; LINUXOSX: addps {{%xmm([0-9]+)}}, {{%xmm([0-9]+)}} +; LINUXOSX: addps {{([0-9])+}}(%rsp), {{%xmm([0-7])}} +; LINUXOSX: addps {{([0-9])+}}(%rsp), {{%xmm([0-7])}} +; LINUXOSX: addps {{([0-9])+}}(%rsp), {{%xmm([0-7])}} +; LINUXOSX: addps {{([0-9])+}}(%rsp), {{%xmm([0-7])}} +; LINUXOSX: 
addps {{([0-9])+}}(%rsp), {{%xmm([0-7])}} +; LINUXOSX: addps {{([0-9])+}}(%rsp), {{%xmm([0-7])}} +; LINUXOSX: addps {{([0-9])+}}(%rsp), {{%xmm([0-7])}} +; LINUXOSX: addps {{([0-9])+}}(%rsp), {{%xmm([0-7])}} +; LINUXOSX: retq + +; Test that parameters, overflowing register capacity, are passed through the stack +define x86_regcallcc <32 x float> @testf32_stack(<32 x float> %a, <32 x float> %b, <32 x float> %c) nounwind { + %x1 = fadd <32 x float> %a, %b + %x2 = fadd <32 x float> %x1, %c + ret <32 x float> %x2 +} \ No newline at end of file
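
For reference only (not part of the patch): a minimal IR sketch of how a front end is expected to tag functions and call sites with the new convention. The function names are illustrative; the register assignments noted in the comments follow the RC_X86_32_RegCall and RetCC rules above for an i386 target.

; Minimal usage sketch (illustrative). On i386, the three i32 arguments are
; expected in EAX, ECX and EDX (the first GPR_32 entries of
; RC_X86_32_RegCall), and the i32 result comes back in EAX per
; RetCC_X86_32_RegCall.
define x86_regcallcc i32 @sketch_sum3(i32 %a, i32 %b, i32 %c) {
entry:
  %t = add i32 %a, %b
  %r = add i32 %t, %c
  ret i32 %r
}

define i32 @sketch_caller(i32 %x) {
entry:
  ; The call site must carry the same calling convention as the callee.
  %r = call x86_regcallcc i32 @sketch_sum3(i32 %x, i32 %x, i32 %x)
  ret i32 %r
}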