Index: llvm/lib/Target/SystemZ/SystemZCallingConv.td
===================================================================
--- llvm/lib/Target/SystemZ/SystemZCallingConv.td
+++ llvm/lib/Target/SystemZ/SystemZCallingConv.td
@@ -166,6 +166,7 @@
 // any non-leaf function and restored in the epilogue for use by the
 // return instruction so it functions exactly like a callee-saved register.
 def CSR_SystemZ_XPLINK64 : CalleeSavedRegs<(add (sequence "R%dD", 7, 15),
+                                                (sequence "R%dD", 4, 4),
                                                 (sequence "F%dD", 15, 8))>;
 
 def CSR_SystemZ_XPLINK64_Vector : CalleeSavedRegs<(add CSR_SystemZ_XPLINK64,
Index: llvm/lib/Target/SystemZ/SystemZFrameLowering.h
===================================================================
--- llvm/lib/Target/SystemZ/SystemZFrameLowering.h
+++ llvm/lib/Target/SystemZ/SystemZFrameLowering.h
@@ -105,11 +105,20 @@
                               ArrayRef<CalleeSavedInfo> CSI,
                               const TargetRegisterInfo *TRI) const override;
+  bool
+  restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator MBBII,
+                              MutableArrayRef<CalleeSavedInfo> CSI,
+                              const TargetRegisterInfo *TRI) const override;
+
   void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
 
   void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
 
   bool hasFP(const MachineFunction &MF) const override;
+
+  void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                           RegScavenger *RS) const override;
 };
 
 } // end namespace llvm
Index: llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -818,7 +818,7 @@
 }
 
 SystemZXPLINKFrameLowering::SystemZXPLINKFrameLowering()
-    : SystemZFrameLowering(TargetFrameLowering::StackGrowsUp, Align(32), 128,
+    : SystemZFrameLowering(TargetFrameLowering::StackGrowsDown, Align(32), 0,
                            Align(32), /* StackRealignable */ false),
       RegSpillOffsets(-1) {
 
@@ -990,12 +990,183 @@
   return true;
 }
 
+bool SystemZXPLINKFrameLowering::restoreCalleeSavedRegisters(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
+
+  if (CSI.empty())
+    return false;
+
+  MachineFunction &MF = *MBB.getParent();
+  SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
+  const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  auto &Regs = Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>();
+
+  DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+
+  // Restore FPRs in the normal TargetInstrInfo way.
+  for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+    unsigned Reg = CSI[I].getReg();
+    if (SystemZ::FP64BitRegClass.contains(Reg))
+      TII->loadRegFromStackSlot(MBB, MBBI, Reg, CSI[I].getFrameIdx(),
+                                &SystemZ::FP64BitRegClass, TRI);
+    if (SystemZ::VR128BitRegClass.contains(Reg))
+      TII->loadRegFromStackSlot(MBB, MBBI, Reg, CSI[I].getFrameIdx(),
+                                &SystemZ::VR128BitRegClass, TRI);
+  }
+
+  // Restore call-saved GPRs (but not call-clobbered varargs, which at
+  // this point might hold return values).
+  SystemZ::GPRRegs RestoreGPRs = ZFI->getRestoreGPRRegs();
+  if (RestoreGPRs.LowGPR) {
+    assert(isInt<20>(Regs.getStackPointerBias() + RestoreGPRs.GPROffset));
+    if (RestoreGPRs.LowGPR == RestoreGPRs.HighGPR)
+      // Build an LG/L instruction.
+      BuildMI(MBB, MBBI, DL, TII->get(SystemZ::LG), RestoreGPRs.LowGPR)
+          .addReg(Regs.getStackPointerRegister())
+          .addImm(Regs.getStackPointerBias() + RestoreGPRs.GPROffset)
+          .addReg(0);
+    else {
+      // Build an LMG/LM instruction.
+      MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(SystemZ::LMG));
+
+      // Add the explicit register operands.
+      MIB.addReg(RestoreGPRs.LowGPR, RegState::Define);
+      MIB.addReg(RestoreGPRs.HighGPR, RegState::Define);
+
+      // Add the address.
+      MIB.addReg(Regs.getStackPointerRegister());
+      MIB.addImm(Regs.getStackPointerBias() + RestoreGPRs.GPROffset);
+
+      // Do a second scan, adding regs as being defined by the instruction.
+      for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+        unsigned Reg = CSI[I].getReg();
+        if (Reg > RestoreGPRs.LowGPR && Reg < RestoreGPRs.HighGPR)
+          MIB.addReg(Reg, RegState::ImplicitDefine);
+      }
+    }
+  }
+
+  return true;
+}
+
 void SystemZXPLINKFrameLowering::emitPrologue(MachineFunction &MF,
-                                              MachineBasicBlock &MBB) const {}
+                                              MachineBasicBlock &MBB) const {
+  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
+  const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>();
+  SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  auto *ZII = static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
+  auto &Regs = Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>();
+  MachineFrameInfo &MFFrame = MF.getFrameInfo();
+  MachineInstr *StoreInstr = nullptr;
+  bool HasFP = hasFP(MF);
+  // Debug location must be unknown since the first debug location is used
+  // to determine the end of the prologue.
+  DebugLoc DL;
+  uint64_t Offset = 0;
+
+  // TODO: Support leaf functions; only add size of save+reserved area when
+  // function is non-leaf.
+  MFFrame.setStackSize(MFFrame.getStackSize() + Regs.getCallFrameSize());
+  uint64_t StackSize = MFFrame.getStackSize();
+
+  // FIXME: Implement support for large stack sizes, when the stack extension
+  // routine needs to be called.
+  if (StackSize > 1024 * 1024) {
+    llvm_unreachable("Huge Stack Frame not yet supported on z/OS");
+  }
+
+  if (ZFI->getSpillGPRRegs().LowGPR) {
+    // Skip over the GPR saves.
+    if ((MBBI != MBB.end()) && ((MBBI->getOpcode() == SystemZ::STMG))) {
+      const int Operand = 3;
+      // Now we can set the offset for the operation, since the stack
+      // has been finalized.
+      Offset = Regs.getStackPointerBias() + MBBI->getOperand(Operand).getImm();
+      // Maximum displacement for STMG instruction.
+      if (isInt<20>(Offset - StackSize))
+        Offset -= StackSize;
+      else
+        StoreInstr = &*MBBI;
+      MBBI->getOperand(Operand).setImm(Offset);
+      ++MBBI;
+    } else
+      llvm_unreachable("Couldn't skip over GPR saves");
+  }
+
+  if (StackSize) {
+    MachineBasicBlock::iterator InsertPt = StoreInstr ? StoreInstr : MBBI;
+    // Allocate StackSize bytes.
+    int64_t Delta = -int64_t(StackSize);
+
+    // In case the STM(G) instruction also stores SP (R4), but the displacement
+    // is too large, the SP register is manipulated first before storing, which
+    // results in the wrong value being stored and retrieved later. In this
+    // case, temporarily save the value of SP and store it to memory afterward.
+    if (StoreInstr && HasFP) {
+      // Insert LGR r0,r4 before the STMG instruction.
+      BuildMI(MBB, InsertPt, DL, ZII->get(SystemZ::LGR))
+          .addReg(SystemZ::R0D, RegState::Define)
+          .addReg(SystemZ::R4D);
+      // Insert STG r0,xxx(,r4) after the STMG instruction.
+      BuildMI(MBB, MBBI, DL, ZII->get(SystemZ::STG), SystemZ::R0D)
+          .addReg(SystemZ::R4D)
+          .addImm(Offset)
+          .addReg(0);
+    }
+
+    emitIncrement(MBB, InsertPt, DL, Regs.getStackPointerRegister(), Delta,
+                  ZII);
+  }
+
+  if (HasFP) {
+    // Copy the base of the frame to the Frame Pointer Register.
+    BuildMI(MBB, MBBI, DL, ZII->get(SystemZ::LGR),
+            Regs.getFramePointerRegister())
+        .addReg(Regs.getStackPointerRegister());
+
+    // Mark the FramePtr as live at the beginning of every block except
+    // the entry block. (We'll have marked R8 as live on entry when
+    // saving the GPRs.)
+    for (auto I = std::next(MF.begin()), E = MF.end(); I != E; ++I)
+      I->addLiveIn(Regs.getFramePointerRegister());
+  }
+}
 
 void SystemZXPLINKFrameLowering::emitEpilogue(MachineFunction &MF,
-                                              MachineBasicBlock &MBB) const {}
+                                              MachineBasicBlock &MBB) const {
+  const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>();
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
+  MachineFrameInfo &MFFrame = MF.getFrameInfo();
+  auto *ZII = static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
+  auto &Regs = Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>();
+
+  // Skip the return instruction.
+  assert(MBBI->isReturn() && "Can only insert epilogue into returning blocks");
+
+  uint64_t StackSize = MFFrame.getStackSize();
+  if (StackSize) {
+    unsigned SPReg = Regs.getStackPointerRegister();
+    if (ZFI->getRestoreGPRRegs().LowGPR != SPReg) {
+      DebugLoc DL = MBBI->getDebugLoc();
+      emitIncrement(MBB, MBBI, DL, SPReg, StackSize, ZII);
+    }
+  }
+}
 
 bool SystemZXPLINKFrameLowering::hasFP(const MachineFunction &MF) const {
-  return false;
+  return (MF.getFrameInfo().hasVarSizedObjects());
+}
+
+void SystemZXPLINKFrameLowering::processFunctionBeforeFrameFinalized(
+    MachineFunction &MF, RegScavenger *RS) const {
+  MachineFrameInfo &MFFrame = MF.getFrameInfo();
+  const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>();
+  auto &Regs = Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>();
+
+  // Setup stack frame offset
+  MFFrame.setOffsetAdjustment(Regs.getStackPointerBias());
 }
Index: llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -1500,8 +1500,16 @@
       assert(VA.isMemLoc() && "Argument not register or memory");
 
       // Create the frame index object for this incoming parameter.
-      int FI = MFI.CreateFixedObject(LocVT.getSizeInBits() / 8,
-                                     VA.getLocMemOffset(), true);
+      // FIXME: Pre-include call frame size in the offset, should not
+      // need to manually add it here.
+      int64_t ArgSPOffset = VA.getLocMemOffset();
+      if (Subtarget.isTargetXPLINK64()) {
+        auto &XPRegs =
+            Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>();
+        ArgSPOffset += XPRegs.getCallFrameSize();
+      }
+      int FI =
+          MFI.CreateFixedObject(LocVT.getSizeInBits() / 8, ArgSPOffset, true);
 
       // Create the SelectionDAG nodes corresponding to a load
       // from this parameter.  Unpromoted ints and floats are
Index: llvm/test/CodeGen/SystemZ/call-zos-01.ll
===================================================================
--- llvm/test/CodeGen/SystemZ/call-zos-01.ll
+++ llvm/test/CodeGen/SystemZ/call-zos-01.ll
@@ -86,7 +86,7 @@
 }
 
 ; CHECK-LABEL: pass_integrals0:
-; CHECK: ag 2, -{{[0-9]+}}(4)
+; CHECK: ag 2, 2328(4)
 ; CHECK-NEXT: lgr 3, 2
 define signext i64 @pass_integrals0(i64 signext %arg0, i32 signext %arg1, i16 signext %arg2, i64 signext %arg3) {
 entry:
Index: llvm/test/CodeGen/SystemZ/call-zos-vec.ll
===================================================================
--- llvm/test/CodeGen/SystemZ/call-zos-vec.ll
+++ llvm/test/CodeGen/SystemZ/call-zos-vec.ll
@@ -14,7 +14,7 @@
 ; CHECK: vaf 1, 1, 27
 ; CHECK: vaf 1, 1, 28
 ; CHECK: vaf 1, 1, 29
-; CHECK: vl 0, 32(4), 4
+; CHECK: vl 0, 2432(4), 4
 ; CHECK: vaf 1, 1, 30
 ; CHECK: vaf 1, 1, 31
 ; CHECK: vaf 24, 1, 0
Index: llvm/test/CodeGen/SystemZ/zos-prologue-epilog.ll
===================================================================
--- llvm/test/CodeGen/SystemZ/zos-prologue-epilog.ll
+++ llvm/test/CodeGen/SystemZ/zos-prologue-epilog.ll
@@ -6,15 +6,27 @@
 
 ; Small stack frame.
 ; CHECK-LABEL: func0
-; CHECK64: stmg 6, 7
+; CHECK64: stmg 6, 7, 1872(4)
+; The stmg instruction's displacement field must be 2064-dsa_size,
+; as per the ABI.
+; CHECK64: aghi 4, -192
+
+; CHECK64: lg 7, 2072(4)
+; CHECK64: aghi 4, 192
+; CHECK64: b 2(7)
 define void @func0() {
-  call i64 (i64) @fun(i64 10)
+  call i64 (i64) @fun(i64 10)
   ret void
 }
 
 ; Spill all GPR CSRs
 ; CHECK-LABEL: func1
-; CHECK64: stmg 6, 15
+; CHECK64: stmg 6, 15, 1904(4)
+; CHECK64: aghi 4, -160
+
+; CHECK64: lmg 7, 15, 2072(4)
+; CHECK64: aghi 4, 160
+; CHECK64: b 2(7)
 define void @func1(i64 *%ptr) {
   %l01 = load volatile i64, i64 *%ptr
   %l02 = load volatile i64, i64 *%ptr
@@ -67,6 +79,8 @@
 
 ; Spill all FPRs and VRs
 ; CHECK-LABEL: func2
+; CHECK64: stmg 6, 7, 1744(4)
+; CHECK64: aghi 4, -320
 ; CHECK64: std 15, {{[0-9]+}}(4) * 8-byte Folded Spill
 ; CHECK64: std 14, {{[0-9]+}}(4) * 8-byte Folded Spill
 ; CHECK64: std 13, {{[0-9]+}}(4) * 8-byte Folded Spill
@@ -83,6 +97,27 @@
 ; CHECK64: vst 18, {{[0-9]+}}(4), 4 * 16-byte Folded Spill
 ; CHECK64: vst 17, {{[0-9]+}}(4), 4 * 16-byte Folded Spill
 ; CHECK64: vst 16, {{[0-9]+}}(4), 4 * 16-byte Folded Spill
+
+; CHECK64: ld 15, {{[0-9]+}}(4) * 8-byte Folded Reload
+; CHECK64: ld 14, {{[0-9]+}}(4) * 8-byte Folded Reload
+; CHECK64: ld 13, {{[0-9]+}}(4) * 8-byte Folded Reload
+; CHECK64: ld 12, {{[0-9]+}}(4) * 8-byte Folded Reload
+; CHECK64: ld 11, {{[0-9]+}}(4) * 8-byte Folded Reload
+; CHECK64: ld 10, {{[0-9]+}}(4) * 8-byte Folded Reload
+; CHECK64: ld 9, {{[0-9]+}}(4) * 8-byte Folded Reload
+; CHECK64: ld 8, {{[0-9]+}}(4) * 8-byte Folded Reload
+; CHECK64: vl 23, {{[0-9]+}}(4), 4 * 16-byte Folded Reload
+; CHECK64: vl 22, {{[0-9]+}}(4), 4 * 16-byte Folded Reload
+; CHECK64: vl 21, {{[0-9]+}}(4), 4 * 16-byte Folded Reload
+; CHECK64: vl 20, {{[0-9]+}}(4), 4 * 16-byte Folded Reload
+; CHECK64: vl 19, {{[0-9]+}}(4), 4 * 16-byte Folded Reload
+; CHECK64: vl 18, {{[0-9]+}}(4), 4 * 16-byte Folded Reload
+; CHECK64: vl 17, {{[0-9]+}}(4), 4 * 16-byte Folded Reload
+; CHECK64: vl 16, {{[0-9]+}}(4), 4 * 16-byte Folded Reload
+; CHECK64: lg 7, 2072(4)
+; CHECK64: aghi 4, 320
+; CHECK64: b 2(7)
+
 define void @func2(double *%ptr, <2 x i64> *%vec_ptr) {
   %l00 = load volatile double, double *%ptr
   %l01 = load volatile double, double *%ptr
@@ -232,5 +267,43 @@
   ret void
 }
 
-declare i64 @fun(i64 %arg0)
+; Big stack frame; force the use of agfi before stmg
+; despite not requiring the stack extension routine.
+; CHECK64: agfi 4, -1040768
+; CHECK64: stmg 6, 7, 2064(4)
+; CHECK64: agfi 4, 1040768
+define void @func3() {
+  %arr = alloca [130070 x i64], align 8
+  %ptr = bitcast [130070 x i64]* %arr to i8*
+  call i64 (i8*) @fun1(i8* %ptr)
+  ret void
+}
+
+; Requires saving r4 due to a variable-sized object
+; in the stack frame (e.g. a VLA).
+; CHECK64: stmg 4, 8, 1856(4)
+; CHECK64: aghi 4, -192
+; CHECK64: lmg 4, 8, 2048(4)
+define i64 @func4(i64 %n) {
+  %vla = alloca i64, i64 %n, align 8
+  %call = call i64 @fun2(i64 %n, i64* nonnull %vla, i64* nonnull %vla)
+  ret i64 %call
+}
+; Requires saving r4 and, in addition, a displacement large enough
+; to force the use of agfi before stmg.
+; CHECK64: lgr 0, 4
+; CHECK64: agfi 4, -1040192
+; CHECK64: stmg 4, 8, 2048(4)
+; CHECK64: lmg 4, 8, 2048(4)
+define i64 @func5(i64 %n) {
+  %vla = alloca i64, i64 %n, align 8
+  %arr = alloca [130000 x i64], align 8
+  %ptr = bitcast [130000 x i64]* %arr to i64*
+  %call = call i64 @fun2(i64 %n, i64* nonnull %vla, i64* %ptr)
+  ret i64 %call
+}
+
+declare i64 @fun(i64 %arg0)
+declare i64 @fun1(i8* %ptr)
+declare i64 @fun2(i64 %n, i64* %arr0, i64* %arr1)