diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
--- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td
+++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
@@ -162,12 +162,14 @@
 //===----------------------------------------------------------------------===//
 // z/OS XPLINK64 callee-saved registers
 //===----------------------------------------------------------------------===//
-def CSR_SystemZ_XPLINK64 : CalleeSavedRegs<(add (sequence "R%dD", 8, 15),
-                                                (sequence "F%dD", 8, 15))>;
-
-def CSR_SystemZ_XPLINK64_Vector : CalleeSavedRegs<(add (sequence "R%dD", 8, 15),
-                                                       (sequence "F%dD", 15, 8),
-                                                       (sequence "V%d", 23, 16))>;
+// %R7D is volatile by the spec, but it must be saved in the prologue by
+// any non-leaf function and restored in the epilogue for use by the
+// return instruction, so it functions exactly like a callee-saved register.
+def CSR_SystemZ_XPLINK64 : CalleeSavedRegs<(add (sequence "R%dD", 7, 15),
+                                                (sequence "F%dD", 15, 8))>;
+
+def CSR_SystemZ_XPLINK64_Vector : CalleeSavedRegs<(add CSR_SystemZ_XPLINK64,
+                                                       (sequence "V%d", 23, 16))>;
 
 //===----------------------------------------------------------------------===//
 // z/OS XPLINK64 return value calling convention
diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
--- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
@@ -10,6 +10,8 @@
 #define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZFRAMELOWERING_H
 
 #include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "SystemZInstrBuilder.h"
+#include "SystemZMachineFunctionInfo.h"
 #include "llvm/ADT/IndexedMap.h"
 #include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/Support/TypeSize.h"
@@ -19,7 +21,6 @@
 class SystemZSubtarget;
 
 class SystemZFrameLowering : public TargetFrameLowering {
-
 public:
   SystemZFrameLowering(StackDirection D, Align StackAl, int LAO, Align TransAl,
                        bool StackReal);
@@ -86,9 +87,24 @@
 };
 
 class SystemZXPLINKFrameLowering : public SystemZFrameLowering {
+  IndexedMap<unsigned> RegSpillOffsets;
+
 public:
   SystemZXPLINKFrameLowering();
 
+  bool
+  assignCalleeSavedSpillSlots(MachineFunction &MF,
+                              const TargetRegisterInfo *TRI,
+                              std::vector<CalleeSavedInfo> &CSI) const override;
+
+  void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+                            RegScavenger *RS) const override;
+
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MBBI,
+                                 ArrayRef<CalleeSavedInfo> CSI,
+                                 const TargetRegisterInfo *TRI) const override;
+
   void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
 
   void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
--- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -24,7 +24,7 @@
 namespace {
 // The ABI-defined register save slots, relative to the CFA (i.e.
 // incoming stack pointer + SystemZMC::ELFCallFrameSize).
-static const TargetFrameLowering::SpillSlot SpillOffsetTable[] = {
+static const TargetFrameLowering::SpillSlot ELFSpillOffsetTable[] = {
   { SystemZ::R2D, 0x10 },
   { SystemZ::R3D, 0x18 },
   { SystemZ::R4D, 0x20 },
@@ -44,6 +44,12 @@
   { SystemZ::F4D, 0x90 },
   { SystemZ::F6D, 0x98 }
 };
+
+static const TargetFrameLowering::SpillSlot XPLINKSpillOffsetTable[] = {
+    {SystemZ::R4D, 0x00},  {SystemZ::R5D, 0x08},  {SystemZ::R6D, 0x10},
+    {SystemZ::R7D, 0x18},  {SystemZ::R8D, 0x20},  {SystemZ::R9D, 0x28},
+    {SystemZ::R10D, 0x30}, {SystemZ::R11D, 0x38}, {SystemZ::R12D, 0x40},
+    {SystemZ::R13D, 0x48}, {SystemZ::R14D, 0x50}, {SystemZ::R15D, 0x58}};
 } // end anonymous namespace
 
 SystemZFrameLowering::SystemZFrameLowering(StackDirection D, Align StackAl,
@@ -201,8 +207,9 @@
 
 SystemZELFFrameLowering::SystemZELFFrameLowering()
     : SystemZFrameLowering(TargetFrameLowering::StackGrowsDown, Align(8), 0,
-                           Align(8), false /* StackRealignable */),
+                           Align(8), /* StackRealignable */ false),
       RegSpillOffsets(0) {
+
   // Due to the SystemZ ABI, the DWARF CFA (Canonical Frame Address) is not
   // equal to the incoming stack pointer, but to incoming stack pointer plus
   // 160. Instead of using a Local Area Offset, the Register save area will
@@ -212,8 +219,8 @@
   // Create a mapping from register number to save slot offset.
   // These offsets are relative to the start of the register save area.
   RegSpillOffsets.grow(SystemZ::NUM_TARGET_REGS);
-  for (unsigned I = 0, E = array_lengthof(SpillOffsetTable); I != E; ++I)
-    RegSpillOffsets[SpillOffsetTable[I].Reg] = SpillOffsetTable[I].Offset;
+  for (unsigned I = 0, E = array_lengthof(ELFSpillOffsetTable); I != E; ++I)
+    RegSpillOffsets[ELFSpillOffsetTable[I].Reg] = ELFSpillOffsetTable[I].Offset;
 }
 
 // Add GPR64 to the save instruction being built by MIB, which is in basic
@@ -812,7 +819,176 @@
 
 SystemZXPLINKFrameLowering::SystemZXPLINKFrameLowering()
     : SystemZFrameLowering(TargetFrameLowering::StackGrowsUp, Align(32), 128,
-                           Align(32), false /* StackRealignable */) {}
+                           Align(32), /* StackRealignable */ false),
+      RegSpillOffsets(-1) {
+
+  // Create a mapping from register number to save slot offset.
+  // These offsets are relative to the start of the local area.
+  RegSpillOffsets.grow(SystemZ::NUM_TARGET_REGS);
+  for (unsigned I = 0, E = array_lengthof(XPLINKSpillOffsetTable); I != E; ++I)
+    RegSpillOffsets[XPLINKSpillOffsetTable[I].Reg] =
+        XPLINKSpillOffsetTable[I].Offset;
+}
+
+bool SystemZXPLINKFrameLowering::assignCalleeSavedSpillSlots(
+    MachineFunction &MF, const TargetRegisterInfo *TRI,
+    std::vector<CalleeSavedInfo> &CSI) const {
+  MachineFrameInfo &MFFrame = MF.getFrameInfo();
+  SystemZMachineFunctionInfo *MFI = MF.getInfo<SystemZMachineFunctionInfo>();
+  const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>();
+  auto &Regs = Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>();
+
+  // Scan the call-saved GPRs and find the bounds of the register spill area.
+  unsigned LowGPR = 0;
+  int LowOffset = INT32_MAX;
+  unsigned HighGPR = LowGPR;
+  int HighOffset = -1;
+
+  unsigned RegSP = Regs.getStackPointerRegister();
+  auto &GRRegClass = SystemZ::GR64BitRegClass;
+  const unsigned RegSize = 8;
+
+  auto ProcessCSI = [&](std::vector<CalleeSavedInfo> &CSIList) {
+    for (auto &CS : CSIList) {
+      unsigned Reg = CS.getReg();
+      int Offset = RegSpillOffsets[Reg];
+      if (Offset >= 0) {
+        if (GRRegClass.contains(Reg)) {
+          if (LowOffset > Offset) {
+            LowOffset = Offset;
+            LowGPR = Reg;
+          }
+
+          if (Offset > HighOffset) {
+            HighOffset = Offset;
+            HighGPR = Reg;
+          }
+        }
+        int FrameIdx = MFFrame.CreateFixedSpillStackObject(RegSize, Offset);
+        CS.setFrameIdx(FrameIdx);
+      } else
+        CS.setFrameIdx(INT32_MAX);
+    }
+  };
+
+  std::vector<CalleeSavedInfo> Spills;
+
+  // For non-leaf functions:
+  // - the address of callee (entry point) register R6 must be saved
+  Spills.push_back(CalleeSavedInfo(Regs.getAddressOfCalleeRegister()));
+
+  // If the function needs a frame pointer, or if the backchain pointer should
+  // be stored, then save the stack pointer register R4.
+  if (hasFP(MF) || MF.getFunction().hasFnAttribute("backchain"))
+    Spills.push_back(CalleeSavedInfo(RegSP));
+
+  // Save the range of call-saved registers, for use by the
+  // prologue/epilogue inserters.
+  ProcessCSI(CSI);
+  MFI->setRestoreGPRRegs(LowGPR, HighGPR, LowOffset);
+
+  // Save the range of call-saved registers, for use by the epilogue inserter.
+  ProcessCSI(Spills);
+  MFI->setSpillGPRRegs(LowGPR, HighGPR, LowOffset);
+
+  // Create spill slots for the remaining registers.
+  for (auto &CS : CSI) {
+    if (CS.getFrameIdx() != INT32_MAX)
+      continue;
+    unsigned Reg = CS.getReg();
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+    Align Alignment = TRI->getSpillAlign(*RC);
+    unsigned Size = TRI->getSpillSize(*RC);
+    Alignment = std::min(Alignment, getStackAlign());
+    int FrameIdx = MFFrame.CreateStackObject(Size, Alignment, true);
+    CS.setFrameIdx(FrameIdx);
+  }
+
+  return true;
+}
+
+void SystemZXPLINKFrameLowering::determineCalleeSaves(MachineFunction &MF,
+                                                      BitVector &SavedRegs,
+                                                      RegScavenger *RS) const {
+  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+
+  bool HasFP = hasFP(MF);
+  const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>();
+  auto &Regs = Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>();
+
+  // If the function requires a frame pointer, record that the hard
+  // frame pointer will be clobbered.
+  if (HasFP)
+    SavedRegs.set(Regs.getFramePointerRegister());
+
+  // If the function is not an XPLeaf function, we need to save the
+  // return address register. We also always use that register for
+  // the return instruction, so it needs to be restored in the
+  // epilogue even though that register is considered to be volatile.
+  // TODO: Implement leaf detection.
+  SavedRegs.set(Regs.getReturnFunctionAddressRegister());
+}
+
+bool SystemZXPLINKFrameLowering::spillCalleeSavedRegisters(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return true;
+
+  MachineFunction &MF = *MBB.getParent();
+  SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
+  const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  auto &Regs = Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>();
+  SystemZ::GPRRegs SpillGPRs = ZFI->getSpillGPRRegs();
+  DebugLoc DL;
+
+  // Save GPRs
+  if (SpillGPRs.LowGPR) {
+    assert(SpillGPRs.LowGPR != SpillGPRs.HighGPR &&
+           "Should be saving multiple registers");
+
+    // Build an STM/STMG instruction.
+    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(SystemZ::STMG));
+
+    // Add the explicit register operands.
+    addSavedGPR(MBB, MIB, SpillGPRs.LowGPR, false);
+    addSavedGPR(MBB, MIB, SpillGPRs.HighGPR, false);
+
+    // Add the address r4.
+    MIB.addReg(Regs.getStackPointerRegister());
+
+    // Add the partial offset; we cannot add the actual offset as, at this
+    // point, the stack is not finalized.
+    MIB.addImm(SpillGPRs.GPROffset);
+
+    // Make sure all call-saved GPRs are included as operands and are
+    // marked as live on entry.
+    auto &GRRegClass = SystemZ::GR64BitRegClass;
+    for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+      unsigned Reg = CSI[I].getReg();
+      if (GRRegClass.contains(Reg))
+        addSavedGPR(MBB, MIB, Reg, true);
+    }
+  }
+
+  // Spill FPRs to the stack in the normal TargetInstrInfo way.
+  for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+    unsigned Reg = CSI[I].getReg();
+    if (SystemZ::FP64BitRegClass.contains(Reg)) {
+      MBB.addLiveIn(Reg);
+      TII->storeRegToStackSlot(MBB, MBBI, Reg, true, CSI[I].getFrameIdx(),
+                               &SystemZ::FP64BitRegClass, TRI);
+    }
+    if (SystemZ::VR128BitRegClass.contains(Reg)) {
+      MBB.addLiveIn(Reg);
+      TII->storeRegToStackSlot(MBB, MBBI, Reg, true, CSI[I].getFrameIdx(),
+                               &SystemZ::VR128BitRegClass, TRI);
+    }
+  }
+
+  return true;
+}
 
 void SystemZXPLINKFrameLowering::emitPrologue(MachineFunction &MF,
                                               MachineBasicBlock &MBB) const {}
diff --git a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
--- a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
@@ -10,6 +10,7 @@
 #define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZREGISTERINFO_H
 
 #include "SystemZ.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 
 #define GET_REGINFO_HEADER
@@ -44,9 +45,9 @@
 /// It is abstract, all calling conventions must override and
 /// define the pure virtual member function defined in this class.
 class SystemZCallingConventionRegisters {
+
 public:
-  /// \returns the register that keeps the
-  /// return function address.
+  /// \returns the register that keeps the return function address.
   virtual int getReturnFunctionAddressRegister() = 0;
 
   /// \returns the register that keeps the
@@ -82,6 +83,8 @@
 
   int getFramePointerRegister() override final { return SystemZ::R8D; };
 
+  int getAddressOfCalleeRegister() { return SystemZ::R6D; };
+
   const MCPhysReg *
   getCalleeSavedRegs(const MachineFunction *MF) const override final;
 
diff --git a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
--- a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -190,7 +190,9 @@
 
 const MCPhysReg *
 SystemZXPLINK64Registers::getCalleeSavedRegs(const MachineFunction *MF) const {
-  return CSR_SystemZ_XPLINK64_SaveList;
+  const SystemZSubtarget &Subtarget = MF->getSubtarget<SystemZSubtarget>();
+  return Subtarget.hasVector() ? CSR_SystemZ_XPLINK64_Vector_SaveList
+                               : CSR_SystemZ_XPLINK64_SaveList;
 }
 
 const MCPhysReg *
@@ -211,7 +213,9 @@
 const uint32_t *
 SystemZXPLINK64Registers::getCallPreservedMask(const MachineFunction &MF,
                                                CallingConv::ID CC) const {
-  return CSR_SystemZ_XPLINK64_RegMask;
+  const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>();
+  return Subtarget.hasVector() ? CSR_SystemZ_XPLINK64_Vector_RegMask
+                               : CSR_SystemZ_XPLINK64_RegMask;
 }
 
 const uint32_t *
diff --git a/llvm/lib/Target/SystemZ/SystemZSubtarget.h b/llvm/lib/Target/SystemZ/SystemZSubtarget.h
--- a/llvm/lib/Target/SystemZ/SystemZSubtarget.h
+++ b/llvm/lib/Target/SystemZ/SystemZSubtarget.h
@@ -96,6 +96,10 @@
     return SpecialRegisters.get();
   }
 
+  template <class SR> SR &getSpecialRegisters() const {
+    return *static_cast<SR *>(getSpecialRegisters());
+  }
+
   const TargetFrameLowering *getFrameLowering() const override {
     return FrameLowering.get();
   }
diff --git a/llvm/test/CodeGen/SystemZ/zos-prologue-epilog.ll b/llvm/test/CodeGen/SystemZ/zos-prologue-epilog.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/zos-prologue-epilog.ll
@@ -0,0 +1,236 @@
+; Test the generated function prologs/epilogs under XPLINK64 on z/OS
+;
+; RUN: llc < %s -mtriple=s390x-ibm-zos -mcpu=z13 | FileCheck --check-prefixes=CHECK64,CHECK %s
+
+; Test prolog/epilog for non-XPLEAF.
+
+; Small stack frame.
+; CHECK-LABEL: func0
+; CHECK64: stmg 6, 7
+define void @func0() {
+  call i64 (i64) @fun(i64 10)
+  ret void
+}
+
+; Spill all GPR CSRs
+; CHECK-LABEL: func1
+; CHECK64: stmg 6, 15
+define void @func1(i64 *%ptr) {
+  %l01 = load volatile i64, i64 *%ptr
+  %l02 = load volatile i64, i64 *%ptr
+  %l03 = load volatile i64, i64 *%ptr
+  %l04 = load volatile i64, i64 *%ptr
+  %l05 = load volatile i64, i64 *%ptr
+  %l06 = load volatile i64, i64 *%ptr
+  %l07 = load volatile i64, i64 *%ptr
+  %l08 = load volatile i64, i64 *%ptr
+  %l09 = load volatile i64, i64 *%ptr
+  %l10 = load volatile i64, i64 *%ptr
+  %l11 = load volatile i64, i64 *%ptr
+  %l12 = load volatile i64, i64 *%ptr
+  %l13 = load volatile i64, i64 *%ptr
+  %l14 = load volatile i64, i64 *%ptr
+  %l15 = load volatile i64, i64 *%ptr
+  %add01 = add i64 %l01, %l01
+  %add02 = add i64 %l02, %add01
+  %add03 = add i64 %l03, %add02
+  %add04 = add i64 %l04, %add03
+  %add05 = add i64 %l05, %add04
+  %add06 = add i64 %l06, %add05
+  %add07 = add i64 %l07, %add06
+  %add08 = add i64 %l08, %add07
+  %add09 = add i64 %l09, %add08
+  %add10 = add i64 %l10, %add09
+  %add11 = add i64 %l11, %add10
+  %add12 = add i64 %l12, %add11
+  %add13 = add i64 %l13, %add12
+  %add14 = add i64 %l14, %add13
+  %add15 = add i64 %l15, %add14
+  store volatile i64 %add01, i64 *%ptr
+  store volatile i64 %add02, i64 *%ptr
+  store volatile i64 %add03, i64 *%ptr
+  store volatile i64 %add04, i64 *%ptr
+  store volatile i64 %add05, i64 *%ptr
+  store volatile i64 %add06, i64 *%ptr
+  store volatile i64 %add07, i64 *%ptr
+  store volatile i64 %add08, i64 *%ptr
+  store volatile i64 %add09, i64 *%ptr
+  store volatile i64 %add10, i64 *%ptr
+  store volatile i64 %add11, i64 *%ptr
+  store volatile i64 %add12, i64 *%ptr
+  store volatile i64 %add13, i64 *%ptr
+  store volatile i64 %add14, i64 *%ptr
+  store volatile i64 %add15, i64 *%ptr
+  ret void
+}
+
+
+; Spill all FPRs and VRs
+; CHECK-LABEL: func2
+; CHECK64: std 15, {{[0-9]+}}(4) * 8-byte Folded Spill
+; CHECK64: std 14, {{[0-9]+}}(4) * 8-byte Folded Spill
+; CHECK64: std 13, {{[0-9]+}}(4) * 8-byte Folded Spill
+; CHECK64: std 12, {{[0-9]+}}(4) * 8-byte Folded Spill
+; CHECK64: std 11, {{[0-9]+}}(4) * 8-byte Folded Spill
+; CHECK64: std 10, {{[0-9]+}}(4) * 8-byte Folded Spill
+; CHECK64: std 9, {{[0-9]+}}(4) * 8-byte Folded Spill
+; CHECK64: std 8, {{[0-9]+}}(4) * 8-byte Folded Spill
+; CHECK64: vst 23, {{[0-9]+}}(4), 4 * 16-byte Folded Spill
+; CHECK64: vst 22, {{[0-9]+}}(4), 4 * 16-byte Folded Spill
+; CHECK64: vst 21, {{[0-9]+}}(4), 4 * 16-byte Folded Spill
+; CHECK64: vst 20, {{[0-9]+}}(4), 4 * 16-byte Folded Spill
+; CHECK64: vst 19, {{[0-9]+}}(4), 4 * 16-byte Folded Spill
+; CHECK64: vst 18, {{[0-9]+}}(4), 4 * 16-byte Folded Spill
+; CHECK64: vst 17, {{[0-9]+}}(4), 4 * 16-byte Folded Spill
+; CHECK64: vst 16, {{[0-9]+}}(4), 4 * 16-byte Folded Spill
+define void @func2(double *%ptr, <2 x i64> *%vec_ptr) {
+  %l00 = load volatile double, double *%ptr
+  %l01 = load volatile double, double *%ptr
+  %l02 = load volatile double, double *%ptr
+  %l03 = load volatile double, double *%ptr
+  %l04 = load volatile double, double *%ptr
+  %l05 = load volatile double, double *%ptr
+  %l06 = load volatile double, double *%ptr
+  %l07 = load volatile double, double *%ptr
+  %l08 = load volatile double, double *%ptr
+  %l09 = load volatile double, double *%ptr
+  %l10 = load volatile double, double *%ptr
+  %l11 = load volatile double, double *%ptr
+  %l12 = load volatile double, double *%ptr
+  %l13 = load volatile double, double *%ptr
+  %l14 = load volatile double, double *%ptr
+  %l15 = load volatile double, double *%ptr
+  %add00 = fadd double %l01, %l00
+  %add01 = fadd double %l01, %add00
+  %add02 = fadd double %l02, %add01
+  %add03 = fadd double %l03, %add02
+  %add04 = fadd double %l04, %add03
+  %add05 = fadd double %l05, %add04
+  %add06 = fadd double %l06, %add05
+  %add07 = fadd double %l07, %add06
+  %add08 = fadd double %l08, %add07
+  %add09 = fadd double %l09, %add08
+  %add10 = fadd double %l10, %add09
+  %add11 = fadd double %l11, %add10
+  %add12 = fadd double %l12, %add11
+  %add13 = fadd double %l13, %add12
+  %add14 = fadd double %l14, %add13
+  %add15 = fadd double %l15, %add14
+  store volatile double %add00, double *%ptr
+  store volatile double %add01, double *%ptr
+  store volatile double %add02, double *%ptr
+  store volatile double %add03, double *%ptr
+  store volatile double %add04, double *%ptr
+  store volatile double %add05, double *%ptr
+  store volatile double %add06, double *%ptr
+  store volatile double %add07, double *%ptr
+  store volatile double %add08, double *%ptr
+  store volatile double %add09, double *%ptr
+  store volatile double %add10, double *%ptr
+  store volatile double %add11, double *%ptr
+  store volatile double %add12, double *%ptr
+  store volatile double %add13, double *%ptr
+  store volatile double %add14, double *%ptr
+  store volatile double %add15, double *%ptr
+
+  %v00 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v01 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v02 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v03 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v04 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v05 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v06 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v07 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v08 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v09 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v10 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v11 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v12 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v13 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v14 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v15 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v16 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v17 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v18 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v19 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v20 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v21 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v22 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v23 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v24 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v25 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v26 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v27 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v28 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v29 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v30 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v31 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %vadd00 = add <2 x i64> %v00, %v00
+  %vadd01 = add <2 x i64> %v01, %vadd00
+  %vadd02 = add <2 x i64> %v02, %vadd01
+  %vadd03 = add <2 x i64> %v03, %vadd02
+  %vadd04 = add <2 x i64> %v04, %vadd03
+  %vadd05 = add <2 x i64> %v05, %vadd04
+  %vadd06 = add <2 x i64> %v06, %vadd05
+  %vadd07 = add <2 x i64> %v07, %vadd06
+  %vadd08 = add <2 x i64> %v08, %vadd07
+  %vadd09 = add <2 x i64> %v09, %vadd08
+  %vadd10 = add <2 x i64> %v10, %vadd09
+  %vadd11 = add <2 x i64> %v11, %vadd10
+  %vadd12 = add <2 x i64> %v12, %vadd11
+  %vadd13 = add <2 x i64> %v13, %vadd12
+  %vadd14 = add <2 x i64> %v14, %vadd13
+  %vadd15 = add <2 x i64> %v15, %vadd14
+  %vadd16 = add <2 x i64> %v16, %vadd15
+  %vadd17 = add <2 x i64> %v17, %vadd16
+  %vadd18 = add <2 x i64> %v18, %vadd17
+  %vadd19 = add <2 x i64> %v19, %vadd18
+  %vadd20 = add <2 x i64> %v20, %vadd19
+  %vadd21 = add <2 x i64> %v21, %vadd20
+  %vadd22 = add <2 x i64> %v22, %vadd21
+  %vadd23 = add <2 x i64> %v23, %vadd22
+  %vadd24 = add <2 x i64> %v24, %vadd23
+  %vadd25 = add <2 x i64> %v25, %vadd24
+  %vadd26 = add <2 x i64> %v26, %vadd25
+  %vadd27 = add <2 x i64> %v27, %vadd26
+  %vadd28 = add <2 x i64> %v28, %vadd27
+  %vadd29 = add <2 x i64> %v29, %vadd28
+  %vadd30 = add <2 x i64> %v30, %vadd29
+  %vadd31 = add <2 x i64> %v31, %vadd30
+  store volatile <2 x i64> %vadd00, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd01, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd02, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd03, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd04, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd05, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd06, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd07, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd08, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd09, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd10, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd11, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd12, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd13, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd14, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd15, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd16, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd17, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd18, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd19, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd20, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd21, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd22, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd23, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd24, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd25, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd26, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd27, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd28, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd29, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd30, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd31, <2 x i64> *%vec_ptr
+  ret void
+}
+
+declare i64 @fun(i64 %arg0)
+