diff --git a/llvm/lib/Target/VE/VECallingConv.td b/llvm/lib/Target/VE/VECallingConv.td
--- a/llvm/lib/Target/VE/VECallingConv.td
+++ b/llvm/lib/Target/VE/VECallingConv.td
@@ -97,6 +97,66 @@
                                    [SX0, SX1, SX3, SX5]>>,
 ]>;
 
+///// Custom fastcc /////
+//
+// This passes vector params and return values in registers.  Scalar values are
+// handled conforming to the standard cc.
+def CC_VE_Fast : CallingConv<[
+  // vector --> generic vector registers
+  CCIfType<[v2i32,   v2i64,   v2f32,   v2f64,
+            v4i32,   v4i64,   v4f32,   v4f64,
+            v8i32,   v8i64,   v8f32,   v8f64,
+            v16i32,  v16i64,  v16f32,  v16f64,
+            v32i32,  v32i64,  v32f32,  v32f64,
+            v64i32,  v64i64,  v64f32,  v64f64,
+            v128i32, v128i64, v128f32, v128f64,
+            v256i32, v256f32, v256i64, v256f64],
+    CCAssignToReg<[V0, V1, V2, V3, V4, V5, V6, V7]>>,
+  // TODO: make this conditional on packed mode
+  CCIfType<[v512i32, v512f32],
+    CCAssignToReg<[V0, V1, V2, V3, V4, V5, V6, V7]>>,
+
+  // vector mask --> generic vector mask registers
+  CCIfType<[v256i1],
+    CCAssignToReg<[VM1, VM2, VM3, VM4, VM5, VM6, VM7]>>,
+
+  // pair of vector mask --> generic vector mask registers
+  CCIfType<[v512i1],
+    CCAssignToRegWithShadow<[VMP1, VMP2, VMP3],
+                            [VM1, VM1, VM3]>>,
+
+  // Follow the standard C CC for scalars.
+  CCDelegateTo<CC_VE_C>
+]>;
+
+def RetCC_VE_Fast : CallingConv<[
+  // vector --> generic vector registers
+  CCIfType<[v2i32,   v2i64,   v2f32,   v2f64,
+            v4i32,   v4i64,   v4f32,   v4f64,
+            v8i32,   v8i64,   v8f32,   v8f64,
+            v16i32,  v16i64,  v16f32,  v16f64,
+            v32i32,  v32i64,  v32f32,  v32f64,
+            v64i32,  v64i64,  v64f32,  v64f64,
+            v128i32, v128i64, v128f32, v128f64,
+            v256i32, v256f32, v256i64, v256f64],
+    CCAssignToReg<[V0, V1, V2, V3, V4, V5, V6, V7]>>,
+  // TODO: make this conditional on packed mode
+  CCIfType<[v512i32, v512f32],
+    CCAssignToReg<[V0, V1, V2, V3, V4, V5, V6, V7]>>,
+
+  // vector mask --> generic vector mask registers
+  CCIfType<[v256i1],
+    CCAssignToReg<[VM1, VM2, VM3, VM4, VM5, VM6, VM7]>>,
+
+  // pair of vector mask --> generic vector mask registers
+  CCIfType<[v512i1],
+    CCAssignToRegWithShadow<[VMP1, VMP2, VMP3],
+                            [VM1, VM1, VM3]>>,
+
+  // Follow the standard C CC for scalars.
+  CCDelegateTo<RetCC_VE_C>
+]>;
+
 // Callee-saved registers
 def CSR : CalleeSavedRegs<(add (sequence "SX%u", 18, 33))>;
 def CSR_NoRegs : CalleeSavedRegs<(add)>;
diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp
--- a/llvm/lib/Target/VE/VEISelLowering.cpp
+++ b/llvm/lib/Target/VE/VEISelLowering.cpp
@@ -44,6 +44,8 @@
   switch (CallConv) {
   default:
     return RetCC_VE_C;
+  case CallingConv::Fast:
+    return RetCC_VE_Fast;
   }
 }
 
@@ -53,6 +55,8 @@
   switch (CallConv) {
   default:
     return CC_VE_C;
+  case CallingConv::Fast:
+    return CC_VE_Fast;
   }
 }
 
diff --git a/llvm/lib/Target/VE/VEInstrInfo.cpp b/llvm/lib/Target/VE/VEInstrInfo.cpp
--- a/llvm/lib/Target/VE/VEInstrInfo.cpp
+++ b/llvm/lib/Target/VE/VEInstrInfo.cpp
@@ -352,6 +352,25 @@
     BuildMI(MBB, I, DL, get(VE::ORri), DestReg)
         .addReg(SrcReg, getKillRegState(KillSrc))
         .addImm(0);
+  } else if (VE::V64RegClass.contains(DestReg, SrcReg)) {
+    // Generate following instructions
+    //   %sx16 = LEAzii 0, 0, 256
+    //   VORmvl %dest, (0)1, %src, %sw16
+    // TODO: reuse a register if vl is already assigned to a register
+    // FIXME: it would be better to scavenge a register here instead of
+    // reserving SX16 all of the time.
+    const TargetRegisterInfo *TRI = &getRegisterInfo();
+    Register TmpReg = VE::SX16;
+    Register SubTmp = TRI->getSubReg(TmpReg, VE::sub_i32);
+    BuildMI(MBB, I, DL, get(VE::LEAzii), TmpReg)
+        .addImm(0)
+        .addImm(0)
+        .addImm(256);
+    MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(VE::VORmvl), DestReg)
+                                  .addImm(M1(0)) // Represent (0)1.
+                                  .addReg(SrcReg, getKillRegState(KillSrc))
+                                  .addReg(SubTmp, getKillRegState(true));
+    MIB.getInstr()->addRegisterKilled(TmpReg, TRI, true);
   } else if (VE::F128RegClass.contains(DestReg, SrcReg)) {
     // Use two instructions.
     const unsigned SubRegIdx[] = {VE::sub_even, VE::sub_odd};
diff --git a/llvm/lib/Target/VE/VERegisterInfo.cpp b/llvm/lib/Target/VE/VERegisterInfo.cpp
--- a/llvm/lib/Target/VE/VERegisterInfo.cpp
+++ b/llvm/lib/Target/VE/VERegisterInfo.cpp
@@ -35,6 +35,8 @@
 const MCPhysReg *
 VERegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
   switch (MF->getFunction().getCallingConv()) {
+  case CallingConv::Fast:
+    // Being explicit: fastcc falls through to the default (standard CC) list.
   default:
     return CSR_SaveList;
   case CallingConv::PreserveAll:
@@ -45,6 +47,8 @@
 const uint32_t *VERegisterInfo::getCallPreservedMask(const MachineFunction &MF,
                                                      CallingConv::ID CC) const {
   switch (CC) {
+  case CallingConv::Fast:
+    // Being explicit: fastcc falls through to the default (standard CC) mask.
   default:
     return CSR_RegMask;
   case CallingConv::PreserveAll:
diff --git a/llvm/test/CodeGen/VE/Vector/fastcc_callee.ll b/llvm/test/CodeGen/VE/Vector/fastcc_callee.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/fastcc_callee.ll
@@ -0,0 +1,122 @@
+; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s
+
+; Scalar argument passing must not change (same tests as in VE/Scalar/callee.ll below - this time with +vpu)
+
+define fastcc i32 @stack_stack_arg_i32_r9(i1 %0, i8 %1, i16 %2, i32 %3, i64 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9) {
+; CHECK-LABEL: stack_stack_arg_i32_r9:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    ldl.sx %s0, 424(, %s11)
+; CHECK-NEXT:    or %s11, 0, %s9
+  ret i32 %9
+}
+
+define fastcc i64 @stack_stack_arg_i64_r9(i1 %0, i8 %1, i16 %2, i32 %3, i64 %4, i64 %5, i64 %6, i64 %7, i64 %8, i64 %9) {
+; CHECK-LABEL: stack_stack_arg_i64_r9:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    ld %s0, 424(, %s11)
+; CHECK-NEXT:    or %s11, 0, %s9
+  ret i64 %9
+}
+
+define fastcc float @stack_stack_arg_f32_r9(float %p0, float %p1, float %p2, float %p3, float %p4, float %p5, float %p6, float %p7, float %s0, float %s1) {
+; CHECK-LABEL: stack_stack_arg_f32_r9:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    ldu %s0, 428(, %s11)
+; CHECK-NEXT:    or %s11, 0, %s9
+  ret float %s1
+}
+
+define fastcc i32 @stack_stack_arg_i32f32_r8(i32 %p0, float %p1, i32 %p2, float %p3, i32 %p4, float %p5, i32 %p6, float %p7, i32 %s0, float %s1) {
+; CHECK-LABEL: stack_stack_arg_i32f32_r8:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    ldl.sx %s0, 416(, %s11)
+; CHECK-NEXT:    or %s11, 0, %s9
+  ret i32 %s0
+}
+
+define fastcc float @stack_stack_arg_i32f32_r9(i32 %p0, float %p1, i32 %p2, float %p3, i32 %p4, float %p5, i32 %p6, float %p7, i32 %s0, float %s1) {
+; CHECK-LABEL: stack_stack_arg_i32f32_r9:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    ldu %s0, 428(, %s11)
+; CHECK-NEXT:    or %s11, 0, %s9
+  ret float %s1
+}
+
+; Vector argument passing (fastcc feature)
+
+; v0-to-v0 passthrough case without vreg copy.
+define fastcc <256 x i32> @vreg_arg_v256i32_r0(<256 x i32> %p0) {
+; CHECK-LABEL: vreg_arg_v256i32_r0:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    or %s11, 0, %s9
+  ret <256 x i32> %p0
+}
+
+define fastcc <256 x i32> @vreg_arg_v256i32_r1(<256 x i32> %p0, <256 x i32> %p1) {
+; CHECK-LABEL: vreg_arg_v256i32_r1:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v1
+; CHECK-NEXT:    or %s11, 0, %s9
+  ret <256 x i32> %p1
+}
+
+define fastcc <256 x i32> @vreg_arg_v256i32_r2(<256 x i32> %p0, <256 x i32> %p1, <256 x i32> %p2) {
+; CHECK-LABEL: vreg_arg_v256i32_r2:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v2
+; CHECK-NEXT:    or %s11, 0, %s9
+  ret <256 x i32> %p2
+}
+
+define fastcc <256 x i32> @vreg_arg_v256i32_r3(<256 x i32> %p0, <256 x i32> %p1, <256 x i32> %p2, <256 x i32> %p3) {
+; CHECK-LABEL: vreg_arg_v256i32_r3:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v3
+; CHECK-NEXT:    or %s11, 0, %s9
+  ret <256 x i32> %p3
+}
+
+define fastcc <256 x i32> @vreg_arg_v256i32_r4(<256 x i32> %p0, <256 x i32> %p1, <256 x i32> %p2, <256 x i32> %p3, <256 x i32> %p4) {
+; CHECK-LABEL: vreg_arg_v256i32_r4:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v4
+; CHECK-NEXT:    or %s11, 0, %s9
+  ret <256 x i32> %p4
+}
+
+define fastcc <256 x i32> @vreg_arg_v256i32_r5(<256 x i32> %p0, <256 x i32> %p1, <256 x i32> %p2, <256 x i32> %p3, <256 x i32> %p4, <256 x i32> %p5) {
+; CHECK-LABEL: vreg_arg_v256i32_r5:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v5
+; CHECK-NEXT:    or %s11, 0, %s9
+  ret <256 x i32> %p5
+}
+
+define fastcc <256 x i32> @vreg_arg_v256i32_r6(<256 x i32> %p0, <256 x i32> %p1, <256 x i32> %p2, <256 x i32> %p3, <256 x i32> %p4, <256 x i32> %p5, <256 x i32> %p6) {
+; CHECK-LABEL: vreg_arg_v256i32_r6:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v6
+; CHECK-NEXT:    or %s11, 0, %s9
+  ret <256 x i32> %p6
+}
+
+; TODO: Uncomment test when vector loads are upstream (vreg stack passing).
+; define fastcc <256 x i32> @vreg_arg_v256i32_r7(<256 x i32> %p0, <256 x i32> %p1, <256 x i32> %p2, <256 x i32> %p3, <256 x i32> %p4, <256 x i32> %p5, <256 x i32> %p6, <256 x i32> %p7) {
+;   ret <256 x i32> %p7
+; }
+
+; define fastcc <256 x i32> @vreg_arg_v256i32_r8(<256 x i32> %p0, <256 x i32> %p1, <256 x i32> %p2, <256 x i32> %p3, <256 x i32> %p4, <256 x i32> %p5, <256 x i32> %p6, <256 x i32> %p7, <256 x i32> %p8) {
+;   ret <256 x i32> %p8
+; }
diff --git a/llvm/test/CodeGen/VE/Vector/fastcc_caller.ll b/llvm/test/CodeGen/VE/Vector/fastcc_caller.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/fastcc_caller.ll
@@ -0,0 +1,256 @@
+; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s
+
+
+declare i32 @sample_add(i32, i32)
+declare i32 @stack_callee_int(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+declare i32 @stack_callee_int_szext(i1 signext, i8 zeroext, i32, i32, i32, i32, i32, i32, i16 zeroext, i8 signext)
+declare float @stack_callee_float(float, float, float, float, float, float, float, float, float, float)
+declare void @test(i64)
+
+; Scalar argument passing must not change (same tests as in VE/Scalar/call.ll below - this time with +vpu)
+
+define fastcc i32 @sample_call() {
+; CHECK-LABEL: sample_call:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s0, sample_add@lo
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lea.sl %s12, sample_add@hi(, %s0)
+; CHECK-NEXT:    or %s0, 1, (0)1
+; CHECK-NEXT:    or %s1, 2, (0)1
+; CHECK-NEXT:    bsic %s10, (, %s12)
+; CHECK-NEXT:    or %s11, 0, %s9
+  %r = tail call fastcc i32 @sample_add(i32 1, i32 2)
+  ret i32 %r
+}
+
+define fastcc i32 @stack_call_int() {
+; CHECK-LABEL: stack_call_int:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    or %s0, 10, (0)1
+; CHECK-NEXT:    st %s0, 248(, %s11)
+; CHECK-NEXT:    or %s34, 9, (0)1
+; CHECK-NEXT:    lea %s0, stack_callee_int@lo
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lea.sl %s12, stack_callee_int@hi(, %s0)
+; CHECK-NEXT:    or %s0, 1, (0)1
+; CHECK-NEXT:    or %s1, 2, (0)1
+; CHECK-NEXT:    or %s2, 3, (0)1
+; CHECK-NEXT:    or %s3, 4, (0)1
+; CHECK-NEXT:    or %s4, 5, (0)1
+; CHECK-NEXT:    or %s5, 6, (0)1
+; CHECK-NEXT:    or %s6, 7, (0)1
+; CHECK-NEXT:    or %s7, 8, (0)1
+; CHECK-NEXT:    st %s34, 240(, %s11)
+; CHECK-NEXT:    bsic %s10, (, %s12)
+; CHECK-NEXT:    or %s11, 0, %s9
+  %r = tail call fastcc i32 @stack_callee_int(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10)
+  ret i32 %r
+}
+
+define fastcc i32 @stack_call_int_szext() {
+; CHECK-LABEL: stack_call_int_szext:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    or %s0, -1, (0)1
+; CHECK-NEXT:    st %s0, 248(, %s11)
+; CHECK-NEXT:    lea %s34, 65535
+; CHECK-NEXT:    lea %s0, stack_callee_int_szext@lo
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lea.sl %s12, stack_callee_int_szext@hi(, %s0)
+; CHECK-NEXT:    or %s0, -1, (0)1
+; CHECK-NEXT:    lea %s1, 255
+; CHECK-NEXT:    or %s2, 3, (0)1
+; CHECK-NEXT:    or %s3, 4, (0)1
+; CHECK-NEXT:    or %s4, 5, (0)1
+; CHECK-NEXT:    or %s5, 6, (0)1
+; CHECK-NEXT:    or %s6, 7, (0)1
+; CHECK-NEXT:    or %s7, 8, (0)1
+; CHECK-NEXT:    st %s34, 240(, %s11)
+; CHECK-NEXT:    bsic %s10, (, %s12)
+; CHECK-NEXT:    or %s11, 0, %s9
+  %r = tail call fastcc i32 @stack_callee_int_szext(i1 -1, i8 -1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i16 -1, i8 -1)
+  ret i32 %r
+}
+
+define fastcc float @stack_call_float() {
+; CHECK-LABEL: stack_call_float:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea.sl %s0, 1092616192
+; CHECK-NEXT:    st %s0, 248(, %s11)
+; CHECK-NEXT:    lea.sl %s34, 1091567616
+; CHECK-NEXT:    lea %s0, stack_callee_float@lo
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lea.sl %s12, stack_callee_float@hi(, %s0)
+; CHECK-NEXT:    lea.sl %s0, 1065353216
+; CHECK-NEXT:    lea.sl %s1, 1073741824
+; CHECK-NEXT:    lea.sl %s2, 1077936128
+; CHECK-NEXT:    lea.sl %s3, 1082130432
+; CHECK-NEXT:    lea.sl %s4, 1084227584
+; CHECK-NEXT:    lea.sl %s5, 1086324736
+; CHECK-NEXT:    lea.sl %s6, 1088421888
+; CHECK-NEXT:    lea.sl %s7, 1090519040
+; CHECK-NEXT:    st %s34, 240(, %s11)
+; CHECK-NEXT:    bsic %s10, (, %s12)
+; CHECK-NEXT:    or %s11, 0, %s9
+  %r = tail call fastcc float @stack_callee_float(float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0)
+  ret float %r
+}
+
+define fastcc float @stack_call_float2(float %p0) {
+; CHECK-LABEL: stack_call_float2:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    st %s0, 248(, %s11)
+; CHECK-NEXT:    lea %s1, stack_callee_float@lo
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lea.sl %s12, stack_callee_float@hi(, %s1)
+; CHECK-NEXT:    st %s0, 240(, %s11)
+; CHECK-NEXT:    or %s1, 0, %s0
+; CHECK-NEXT:    or %s2, 0, %s0
+; CHECK-NEXT:    or %s3, 0, %s0
+; CHECK-NEXT:    or %s4, 0, %s0
+; CHECK-NEXT:    or %s5, 0, %s0
+; CHECK-NEXT:    or %s6, 0, %s0
+; CHECK-NEXT:    or %s7, 0, %s0
+; CHECK-NEXT:    bsic %s10, (, %s12)
+; CHECK-NEXT:    or %s11, 0, %s9
+  %r = tail call fastcc float @stack_callee_float(float %p0, float %p0, float %p0, float %p0, float %p0, float %p0, float %p0, float %p0, float %p0, float %p0)
+  ret float %r
+}
+
+; Vector argument passing (fastcc feature)
+;
+declare fastcc <256 x i32> @get_v256i32()
+declare fastcc void @vsample_v(<256 x i32>)
+declare fastcc void @vsample_iv(i32, <256 x i32>)
+
+define void @caller_vret() {
+; CHECK-LABEL: caller_vret:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s0, get_v256i32@lo
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lea.sl %s12, get_v256i32@hi(, %s0)
+; CHECK-NEXT:    bsic %s10, (, %s12)
+; CHECK-NEXT:    or %s11, 0, %s9
+  %r = tail call fastcc <256 x i32> @get_v256i32()
+  ret void
+}
+
+define void @caller_vret_pass_p0() {
+; CHECK-LABEL: caller_vret_pass_p0:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK:         lea %s0, get_v256i32@lo
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lea.sl %s12, get_v256i32@hi(, %s0)
+; CHECK-NEXT:    bsic %s10, (, %s12)
+; CHECK-NEXT:    lea %s0, vsample_v@lo
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lea.sl %s12, vsample_v@hi(, %s0)
+; CHECK-NEXT:    bsic %s10, (, %s12)
+; CHECK-NEXT:    or %s11, 0, %s9
+  %p = tail call fastcc <256 x i32> @get_v256i32()
+  call fastcc void @vsample_v(<256 x i32> %p)
+  ret void
+}
+
+define void @caller_vret_pass_p1(i32 %s) {
+; CHECK-LABEL: caller_vret_pass_p1:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK:         or %s18, 0, %s0
+; CHECK-NEXT:    lea %s0, get_v256i32@lo
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lea.sl %s12, get_v256i32@hi(, %s0)
+; CHECK-NEXT:    bsic %s10, (, %s12)
+; CHECK-NEXT:    lea %s0, vsample_iv@lo
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lea.sl %s12, vsample_iv@hi(, %s0)
+; CHECK-NEXT:    or %s0, 0, %s18
+; CHECK-NEXT:    bsic %s10, (, %s12)
+  %p = tail call fastcc <256 x i32> @get_v256i32()
+  call fastcc void @vsample_iv(i32 %s, <256 x i32> %p)
+  ret void
+}
+
+declare fastcc void @vsample_vv(<256 x i32>, <256 x i32>)
+declare fastcc void @vsample_vvv(<256 x i32>, <256 x i32>, <256 x i32>)
+
+define void @caller_vret_pass_p01() {
+; CHECK-LABEL: caller_vret_pass_p01:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s0, get_v256i32@lo
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lea.sl %s12, get_v256i32@hi(, %s0)
+; CHECK-NEXT:    bsic %s10, (, %s12)
+; CHECK-NEXT:    lea %s0, vsample_vv@lo
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lea.sl %s12, vsample_vv@hi(, %s0)
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v1, (0)1, %v0
+; CHECK-NEXT:    bsic %s10, (, %s12)
+; CHECK-NEXT:    or %s11, 0, %s9
+  %p = tail call fastcc <256 x i32> @get_v256i32()
+  call fastcc void @vsample_vv(<256 x i32> %p, <256 x i32> %p)
+  ret void
+}
+
+define void @caller_vret_pass_p012() {
+; CHECK-LABEL: caller_vret_pass_p012:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s0, get_v256i32@lo
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lea.sl %s12, get_v256i32@hi(, %s0)
+; CHECK-NEXT:    bsic %s10, (, %s12)
+; CHECK-NEXT:    lea %s0, vsample_vvv@lo
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lea.sl %s12, vsample_vvv@hi(, %s0)
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v1, (0)1, %v0
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v2, (0)1, %v0
+; CHECK-NEXT:    bsic %s10, (, %s12)
+; CHECK-NEXT:    or %s11, 0, %s9
+  %p = tail call fastcc <256 x i32> @get_v256i32()
+  call fastcc void @vsample_vvv(<256 x i32> %p, <256 x i32> %p, <256 x i32> %p)
+  ret void
+}
+
+; Expose register parameter mapping by forcing an explicit vreg move for all parameter positions
+declare fastcc void @vsample_vvvvvvv(<256 x i32>, <256 x i32>, <256 x i32>, <256 x i32>, <256 x i32>, <256 x i32>, <256 x i32>)
+
+; TODO improve vreg copy (redundant lea+lvl emitted)
+define fastcc void @roundtrip_caller_callee(<256 x i32> %p0, <256 x i32> %p1, <256 x i32> %p2, <256 x i32> %p3, <256 x i32> %p4, <256 x i32> %p5, <256 x i32> %p6) {
+; CHECK-LABEL: roundtrip_caller_callee:
+; CHECK:       .LBB{{[0-9]+}}_2:
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v7, (0)1, %v0
+; CHECK-NEXT:    lea %s0, vsample_vvvvvvv@lo
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lea.sl %s12, vsample_vvvvvvv@hi(, %s0)
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v0, (0)1, %v1
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v1, (0)1, %v2
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v2, (0)1, %v3
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v3, (0)1, %v4
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v4, (0)1, %v5
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v5, (0)1, %v6
+; CHECK-NEXT:    lea %s16, 256
+; CHECK-NEXT:    lvl %s16
+; CHECK-NEXT:    vor %v6, (0)1, %v7
+; CHECK-NEXT:    bsic %s10, (, %s12)
+; CHECK-NEXT:    or %s11, 0, %s9
+  call fastcc void @vsample_vvvvvvv(<256 x i32> %p1, <256 x i32> %p2, <256 x i32> %p3, <256 x i32> %p4, <256 x i32> %p5, <256 x i32> %p6, <256 x i32> %p0)
+  ret void
+}