Index: llvm/lib/Target/SystemZ/SystemZAsmPrinter.h
===================================================================
--- llvm/lib/Target/SystemZ/SystemZAsmPrinter.h
+++ llvm/lib/Target/SystemZ/SystemZAsmPrinter.h
@@ -9,10 +9,11 @@
 #ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZASMPRINTER_H
 #define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZASMPRINTER_H
 
-#include "SystemZTargetMachine.h"
 #include "SystemZMCInstLower.h"
+#include "SystemZTargetMachine.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/StackMaps.h"
+#include "llvm/MC/MCInstBuilder.h"
 #include "llvm/Support/Compiler.h"
 
 namespace llvm {
@@ -25,6 +26,10 @@
 class LLVM_LIBRARY_VISIBILITY SystemZAsmPrinter : public AsmPrinter {
 private:
   StackMaps SM;
+  // EXRL target instructions, each paired with the label its EXRL refers to,
+  // queued until emitFunctionBodyEnd() emits them after the function body.
+  using SymbInstPair = std::pair<MCSymbol *, MCInstBuilder>;
+  std::vector<SymbInstPair> EXRL_Targets;
 
 public:
   SystemZAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
@@ -35,6 +40,7 @@
   void emitInstruction(const MachineInstr *MI) override;
   void emitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) override;
   void emitEndOfAsmFile(Module &M) override;
+  void emitFunctionBodyEnd() override;
   bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
                        const char *ExtraCode, raw_ostream &OS) override;
   bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
@@ -49,6 +55,7 @@
   void LowerFENTRY_CALL(const MachineInstr &MI, SystemZMCInstLower &MCIL);
   void LowerSTACKMAP(const MachineInstr &MI);
   void LowerPATCHPOINT(const MachineInstr &MI, SystemZMCInstLower &Lower);
+  void emitEXRLTargetInstructions();
 };
 } // end namespace llvm
 
Index: llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -541,6 +541,26 @@
     LowerPATCHPOINT(*MI, Lower);
     return;
 
+  case SystemZ::EXRL_Pseudo: {
+    // Emit the EXRL here and queue its target instruction, which is emitted
+    // out of line by emitEXRLTargetInstructions() at the function body end.
+    unsigned TargetInsOpc = MI->getOperand(0).getImm();
+    Register LenMinus1Reg = MI->getOperand(1).getReg();
+    Register DestReg = MI->getOperand(2).getReg();
+    uint64_t DestDisp = MI->getOperand(3).getImm();
+    Register SrcReg = MI->getOperand(4).getReg();
+    uint64_t SrcDisp = MI->getOperand(5).getImm();
+
+    MCSymbol *DotSym = OutContext.createTempSymbol();
+    const MCSymbolRefExpr *Dot = MCSymbolRefExpr::create(DotSym, OutContext);
+    EmitToStreamer(*OutStreamer, MCInstBuilder(SystemZ::EXRL)
+      .addReg(LenMinus1Reg).addExpr(Dot));
+    EXRL_Targets.push_back(SymbInstPair(DotSym,
+      MCInstBuilder(TargetInsOpc).addReg(DestReg).addImm(DestDisp).addImm(1)
+      .addReg(SrcReg).addImm(SrcDisp)));
+    return;
+  }
+
   default:
     Lower.lower(MI, LoweredMI);
     break;
@@ -698,6 +718,18 @@
                                getSubtargetInfo());
 }
 
+// Emit each queued EXRL target instruction, preceded by the label that the
+// corresponding EXRL refers to, and then empty the queue.
+void SystemZAsmPrinter::emitEXRLTargetInstructions() {
+  for (auto &I : EXRL_Targets) {
+    MCSymbol *DotSym = I.first;
+    MCInstBuilder &MCI = I.second;
+    OutStreamer->emitLabel(DotSym);
+    EmitToStreamer(*OutStreamer, MCI);
+  }
+  EXRL_Targets.clear();
+}
+
 // Convert a SystemZ-specific constant pool modifier into the associated
 // MCSymbolRefExpr variant kind.
 static MCSymbolRefExpr::VariantKind
@@ -749,6 +781,10 @@
   emitStackMaps(SM);
 }
 
+void SystemZAsmPrinter::emitFunctionBodyEnd() {
+  emitEXRLTargetInstructions();
+}
+
 // Force static initialization.
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZAsmPrinter() {
   RegisterAsmPrinter<SystemZAsmPrinter> X(getTheSystemZTarget());
Index: llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -7068,24 +7068,6 @@
 // Custom insertion
 //===----------------------------------------------------------------------===//
 
-// Force base value Base into a register before MI.  Return the register.
-static Register forceReg(MachineInstr &MI, MachineOperand &Base,
-                         const SystemZInstrInfo *TII) {
-  if (Base.isReg())
-    return Base.getReg();
-
-  MachineBasicBlock *MBB = MI.getParent();
-  MachineFunction &MF = *MBB->getParent();
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-
-  Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
-  BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LA), Reg)
-      .add(Base)
-      .addImm(0)
-      .addReg(0);
-  return Reg;
-}
-
 // The CC operand of MI might be missing a kill marker because there
 // were multiple uses of CC, and ISel didn't know which to mark.
 // Figure out whether MI should have had a kill marker.
@@ -7783,55 +7765,56 @@
   return MBB;
 }
 
-MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
-    MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const {
-  MachineFunction &MF = *MBB->getParent();
-  const SystemZInstrInfo *TII =
-      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-  DebugLoc DL = MI.getDebugLoc();
-
-  MachineOperand DestBase = earlyUseOperand(MI.getOperand(0));
-  uint64_t DestDisp = MI.getOperand(1).getImm();
-  MachineOperand SrcBase = earlyUseOperand(MI.getOperand(2));
-  uint64_t SrcDisp = MI.getOperand(3).getImm();
-  uint64_t Length = MI.getOperand(4).getImm();
-
-  // When generating more than one CLC, all but the last will need to
-  // branch to the end when a difference is found.
-  MachineBasicBlock *EndMBB = (Length > 256 && Opcode == SystemZ::CLC ?
-                               SystemZ::splitBlockAfter(MI, MBB) : nullptr);
-
-  // Check for the loop form, in which operand 5 is the trip count.
-  if (MI.getNumExplicitOperands() > 5) {
-    bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase);
+namespace {
+// Helper that builds the loop of 256-byte operations shared by the
+// constant-length and variable-length forms of emitMemMemWrapper(), and
+// records the virtual registers it creates so the caller can use them.
+struct MemMemBuilder {
+  MachineOperand &DestBase;
+  uint64_t &DestDisp;
+  MachineOperand &SrcBase;
+  uint64_t &SrcDisp;
+
+  bool HaveSingleBase = false;
+  Register StartCountReg, ThisCountReg, NextCountReg;
+  Register StartSrcReg, ThisSrcReg, NextSrcReg;
+  Register StartDestReg, ThisDestReg, NextDestReg;
+  MemMemBuilder(MachineOperand &DestBase, uint64_t &DestDisp,
+                MachineOperand &SrcBase, uint64_t &SrcDisp) :
+    DestBase(DestBase), DestDisp(DestDisp), SrcBase(SrcBase), SrcDisp(SrcDisp) {}
+
+  void buildMemMemLoop(Register TripCountReg, unsigned Opcode, MachineInstr &MI,
+                       MachineBasicBlock *StartMBB, MachineBasicBlock *LoopMBB,
+                       MachineBasicBlock *NextMBB, MachineBasicBlock *EndMBB,
+                       MachineBasicBlock *DoneMBB, const SystemZInstrInfo *TII) {
+    MachineRegisterInfo &MRI = StartMBB->getParent()->getRegInfo();
+    DebugLoc DL = MI.getDebugLoc();
+
+    // Force Base into a register before StartMBB's terminators; return it.
+    auto forceReg = [&](MachineOperand &Base) -> Register {
+      if (Base.isReg())
+        return Base.getReg();
+      Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+      BuildMI(*StartMBB, StartMBB->getFirstTerminator(), DL,
+              TII->get(SystemZ::LA), Reg)
+        .add(Base).addImm(0).addReg(0);
+      return Reg;
+    };
 
-    Register StartCountReg = MI.getOperand(5).getReg();
-    Register StartSrcReg   = forceReg(MI, SrcBase, TII);
-    Register StartDestReg  = (HaveSingleBase ? StartSrcReg :
-                              forceReg(MI, DestBase, TII));
+    StartCountReg = TripCountReg;
+    HaveSingleBase = DestBase.isIdenticalTo(SrcBase);
+    StartSrcReg = forceReg(SrcBase);
+    StartDestReg = (HaveSingleBase ? StartSrcReg : forceReg(DestBase));
 
     const TargetRegisterClass *RC = &SystemZ::ADDR64BitRegClass;
-    Register ThisSrcReg  = MRI.createVirtualRegister(RC);
-    Register ThisDestReg = (HaveSingleBase ? ThisSrcReg :
-                            MRI.createVirtualRegister(RC));
-    Register NextSrcReg  = MRI.createVirtualRegister(RC);
-    Register NextDestReg = (HaveSingleBase ? NextSrcReg :
-                            MRI.createVirtualRegister(RC));
+    ThisSrcReg = MRI.createVirtualRegister(RC);
+    ThisDestReg = (HaveSingleBase ? ThisSrcReg : MRI.createVirtualRegister(RC));
+    NextSrcReg = MRI.createVirtualRegister(RC);
+    NextDestReg = (HaveSingleBase ? NextSrcReg : MRI.createVirtualRegister(RC));
 
     RC = &SystemZ::GR64BitRegClass;
-    Register ThisCountReg = MRI.createVirtualRegister(RC);
-    Register NextCountReg = MRI.createVirtualRegister(RC);
-
-    MachineBasicBlock *StartMBB = MBB;
-    MachineBasicBlock *DoneMBB = SystemZ::splitBlockBefore(MI, MBB);
-    MachineBasicBlock *LoopMBB = SystemZ::emitBlockAfter(StartMBB);
-    MachineBasicBlock *NextMBB =
-        (EndMBB ? SystemZ::emitBlockAfter(LoopMBB) : LoopMBB);
-
-    // StartMBB:
-    //   # fall through to LoopMMB
-    MBB->addSuccessor(LoopMBB);
+    ThisCountReg = MRI.createVirtualRegister(RC);
+    NextCountReg = MRI.createVirtualRegister(RC);
 
     // LoopMBB:
     //   %ThisDestReg = phi [ %StartDestReg, StartMBB ],
@@ -7845,7 +7828,7 @@
     //  ( JLH EndMBB )
     //
     // The prefetch is used only for MVC.  The JLH is used only for CLC.
-    MBB = LoopMBB;
+    MachineBasicBlock *MBB = LoopMBB;
 
     BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisDestReg)
       .addReg(StartDestReg).addMBB(StartMBB)
@@ -7897,9 +7880,121 @@
       .addMBB(LoopMBB);
     MBB->addSuccessor(LoopMBB);
     MBB->addSuccessor(DoneMBB);
+  }
+};
+} // end anonymous namespace
+
+MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
+    MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const {
+  MachineFunction &MF = *MBB->getParent();
+  const SystemZInstrInfo *TII =
+      static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  DebugLoc DL = MI.getDebugLoc();
+
+  MachineOperand DestBase = earlyUseOperand(MI.getOperand(0));
+  uint64_t DestDisp = MI.getOperand(1).getImm();
+  MachineOperand SrcBase = earlyUseOperand(MI.getOperand(2));
+  uint64_t SrcDisp = MI.getOperand(3).getImm();
+  MemMemBuilder MemMB(DestBase, DestDisp, SrcBase, SrcDisp);
+  MachineOperand &LengthMO = MI.getOperand(4);
+
+  if (LengthMO.isReg()) {
+    Register Length = LengthMO.getReg();
+
+    MachineBasicBlock *DoneMBB = SystemZ::splitBlockBefore(MI, MBB);
+    MachineBasicBlock *PreheaderMBB = SystemZ::emitBlockAfter(MBB);
+    MachineBasicBlock *LoopMBB = SystemZ::emitBlockAfter(PreheaderMBB);
+    MachineBasicBlock *ExeMBB = SystemZ::emitBlockAfter(LoopMBB);
+
+    // MBB:
+    //   # Jump to DoneMBB if Length is zero, or fall through to PreheaderMBB.
+    BuildMI(MBB, DL, TII->get(SystemZ::CGHI))
+      .addReg(Length).addImm(0);
+    BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+      .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_EQ)
+      .addMBB(DoneMBB);
+    MBB->addSuccessor(DoneMBB);
+    MBB->addSuccessor(PreheaderMBB);
+
+    // PreheaderMBB:
+    //   %LenMinus1 = Length - 1 (see below)
+    //   %TripC = %LenMinus1 / 256
+    //   # Jump to ExeMBB if %TripC is zero, or fall through to LoopMBB.
+    Register LenMinus1 = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+    Register TripC = MRI.createVirtualRegister(&SystemZ::GR64BitRegClass);
+    MBB = PreheaderMBB;
+    BuildMI(MBB, DL, TII->get(SystemZ::AGHI), LenMinus1)
+      .addReg(Length).addImm(-1);
+    BuildMI(MBB, DL, TII->get(SystemZ::SRLG), TripC)
+      .addReg(LenMinus1).addReg(0).addImm(8);
+    BuildMI(MBB, DL, TII->get(SystemZ::CGHI))
+      .addReg(TripC).addImm(0);
+    BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+      .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_EQ)
+      .addMBB(ExeMBB);
+    MBB->addSuccessor(ExeMBB);
+    MBB->addSuccessor(LoopMBB);
+
+    // LoopMBB:
+    //   Do %TripC iterations of 256-byte chunks.
+    MemMB.buildMemMemLoop(TripC, Opcode, MI, PreheaderMBB, LoopMBB, LoopMBB,
+                          nullptr, ExeMBB, TII);
+
+    // ExeMBB:
+    //   # Make PHIs for RemDestReg/RemSrcReg since the loop may or may not run.
+    //   # Use EXecute Relative Long for the remainder of the bytes. The target
+    //     instruction of the EXRL will have a length field of 1 since 0 is an
+    //     illegal value. The number of bytes processed becomes (%LenMinus1 &
+    //     0xff) + 1.
+    //   # Fall through to DoneMBB.
+    Register RemSrcReg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+    Register RemDestReg = MemMB.HaveSingleBase ? RemSrcReg
+      : MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+    MBB = ExeMBB;
+    BuildMI(MBB, DL, TII->get(SystemZ::PHI), RemDestReg)
+      .addReg(MemMB.StartDestReg).addMBB(PreheaderMBB)
+      .addReg(MemMB.NextDestReg).addMBB(LoopMBB);
+    if (!MemMB.HaveSingleBase)
+      BuildMI(MBB, DL, TII->get(SystemZ::PHI), RemSrcReg)
+        .addReg(MemMB.StartSrcReg).addMBB(PreheaderMBB)
+        .addReg(MemMB.NextSrcReg).addMBB(LoopMBB);
+    BuildMI(MBB, DL, TII->get(SystemZ::EXRL_Pseudo))
+      .addImm(Opcode)
+      .addReg(LenMinus1)
+      .addReg(RemDestReg).addImm(DestDisp)
+      .addReg(RemSrcReg).addImm(SrcDisp);
+    MBB->addSuccessor(DoneMBB);
+
+    MI.eraseFromParent();
+    return DoneMBB;
+  }
+
+  uint64_t Length = LengthMO.getImm();
+
+  // When generating more than one CLC, all but the last will need to
+  // branch to the end when a difference is found.
+  MachineBasicBlock *EndMBB = (Length > 256 && Opcode == SystemZ::CLC ?
+                               SystemZ::splitBlockAfter(MI, MBB) : nullptr);
+
+  // Check for the loop form, in which operand 5 is the trip count.
+  if (MI.getNumExplicitOperands() > 5) {
+    MachineBasicBlock *StartMBB = MBB;
+    MachineBasicBlock *DoneMBB = SystemZ::splitBlockBefore(MI, MBB);
+    MachineBasicBlock *LoopMBB = SystemZ::emitBlockAfter(StartMBB);
+    MachineBasicBlock *NextMBB =
+        (EndMBB ? SystemZ::emitBlockAfter(LoopMBB) : LoopMBB);
+
+    // StartMBB:
+    //   # fall through to LoopMBB
+    MBB->addSuccessor(LoopMBB);
+
+    Register StartCountReg = MI.getOperand(5).getReg();
+    MemMB.buildMemMemLoop(StartCountReg, Opcode, MI, StartMBB, LoopMBB, NextMBB,
+                          EndMBB, DoneMBB, TII);
 
-    DestBase = MachineOperand::CreateReg(NextDestReg, false);
-    SrcBase = MachineOperand::CreateReg(NextSrcReg, false);
+    DestBase = MachineOperand::CreateReg(MemMB.NextDestReg, false);
+    SrcBase = MachineOperand::CreateReg(MemMB.NextSrcReg, false);
     Length &= 255;
     if (EndMBB && !Length)
       // If the loop handled the whole CLC range, DoneMBB will be empty with
@@ -8433,6 +8528,7 @@
     return emitMemMemWrapper(MI, MBB, SystemZ::OC);
   case SystemZ::XCSequence:
   case SystemZ::XCLoop:
+  case SystemZ::XCLoopVarLen:
     return emitMemMemWrapper(MI, MBB, SystemZ::XC);
   case SystemZ::CLCSequence:
   case SystemZ::CLCLoop:
Index: llvm/lib/Target/SystemZ/SystemZInstrFormats.td
===================================================================
--- llvm/lib/Target/SystemZ/SystemZInstrFormats.td
+++ llvm/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -5253,6 +5253,7 @@
 // The Sequence form uses a straight-line sequence of instructions and
 // the Loop form uses a loop of length-256 instructions followed by
 // another instruction to handle the excess.
+// The LoopVarLen form is for a loop with a non-constant length parameter.
 multiclass MemorySS<string mnemonic, bits<8> opcode,
                     SDPatternOperator sequence, SDPatternOperator loop> {
   def "" : SideEffectBinarySSa<mnemonic, opcode>;
@@ -5265,6 +5266,10 @@
                                    imm64:$length, GR64:$count256),
                       [(loop bdaddr12only:$dest, bdaddr12only:$src,
                              imm64:$length, GR64:$count256)]>;
+    def LoopVarLen : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
+                                         GR64:$length),
+                            [(loop bdaddr12only:$dest, bdaddr12only:$src,
+                                   GR64:$length, 0x0)]>;
   }
 }
 
Index: llvm/lib/Target/SystemZ/SystemZInstrInfo.td
===================================================================
--- llvm/lib/Target/SystemZ/SystemZInstrInfo.td
+++ llvm/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -2165,8 +2165,12 @@
 
 // Execute.
 let hasSideEffects = 1 in {
-  def EX   : SideEffectBinaryRX<"ex", 0x44, GR64>;
-  def EXRL : SideEffectBinaryRILPC<"exrl", 0xC60, GR64>;
+  def EX   : SideEffectBinaryRX<"ex", 0x44, ADDR64>;
+  def EXRL : SideEffectBinaryRILPC<"exrl", 0xC60, ADDR64>;
+  let hasNoSchedulingInfo = 1 in
+  def EXRL_Pseudo : Pseudo<(outs), (ins i64imm:$TargetOpc, ADDR64:$lenMinus1,
+                                        bdaddr12only:$bdl1, bdaddr12only:$bd2),
+                                        []>;
 }
 
 //===----------------------------------------------------------------------===//
Index: llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
+++ llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
@@ -81,11 +81,12 @@
   if (IsVolatile)
     return SDValue();
 
+  auto *CByte = dyn_cast<ConstantSDNode>(Byte);
   if (auto *CSize = dyn_cast<ConstantSDNode>(Size)) {
     uint64_t Bytes = CSize->getZExtValue();
     if (Bytes == 0)
       return SDValue();
-    if (auto *CByte = dyn_cast<ConstantSDNode>(Byte)) {
+    if (CByte) {
       // Handle cases that can be done using at most two of
       // MVI, MVHI, MVHHI and MVGHI.  The latter two can only be
       // used if ByteVal is all zeros or all ones; in other casees,
@@ -125,7 +126,6 @@
     assert(Bytes >= 2 && "Should have dealt with 0- and 1-byte cases already");
 
     // Handle the special case of a memset of 0, which can use XC.
-    auto *CByte = dyn_cast<ConstantSDNode>(Byte);
     if (CByte && CByte->getZExtValue() == 0)
       return emitMemMem(DAG, DL, SystemZISD::XC, SystemZISD::XC_LOOP,
                         Chain, Dst, Dst, Bytes);
@@ -138,6 +138,12 @@
     return emitMemMem(DAG, DL, SystemZISD::MVC, SystemZISD::MVC_LOOP,
                       Chain, DstPlus1, Dst, Bytes - 1);
-  }
+  } else { // Variable length
+    if (CByte && CByte->getZExtValue() == 0)
+      // Handle the special case of a variable length memset of 0 with XC.
+      return DAG.getNode(SystemZISD::XC_LOOP, DL, MVT::Other, Chain, Dst, Dst,
+                         DAG.getZExtOrTrunc(Size, DL, MVT::i64),
+                         DAG.getConstant(0, DL, PtrVT));
+  }
   return SDValue();
 }
 
Index: llvm/test/CodeGen/SystemZ/memset-05.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/SystemZ/memset-05.ll
@@ -0,0 +1,50 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; Test memset 0 with variable length
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+define void @fun0(i8* %Addr, i64 %Len) {
+; CHECK-LABEL: fun0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    cgibe %r3, 0, 0(%r14)
+; CHECK-NEXT:  .LBB0_1:
+; CHECK-NEXT:    aghi %r3, -1
+; CHECK-NEXT:    srlg %r0, %r3, 8
+; CHECK-NEXT:    cgije %r0, 0, .LBB0_3
+; CHECK-NEXT:  .LBB0_2: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    xc 0(256,%r2), 0(%r2)
+; CHECK-NEXT:    la %r2, 256(%r2)
+; CHECK-NEXT:    brctg %r0, .LBB0_2
+; CHECK-NEXT:  .LBB0_3:
+; CHECK-NEXT:    exrl %r3, .Ltmp0
+; CHECK-NEXT:    br %r14
+; CHECK-NEXT:  .Ltmp0:
+; CHECK-NEXT:    xc 0(1,%r2), 0(%r2)
+  tail call void @llvm.memset.p0i8.i64(i8* %Addr, i8 0, i64 %Len, i1 false)
+  ret void
+}
+
+define void @fun1(i8* %Addr, i32 %Len) {
+; CHECK-LABEL: fun1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    llgfr %r1, %r3
+; CHECK-NEXT:    cgibe %r1, 0, 0(%r14)
+; CHECK-NEXT:  .LBB1_1:
+; CHECK-NEXT:    aghi %r1, -1
+; CHECK-NEXT:    srlg %r0, %r1, 8
+; CHECK-NEXT:    cgije %r0, 0, .LBB1_3
+; CHECK-NEXT:  .LBB1_2: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    xc 0(256,%r2), 0(%r2)
+; CHECK-NEXT:    la %r2, 256(%r2)
+; CHECK-NEXT:    brctg %r0, .LBB1_2
+; CHECK-NEXT:  .LBB1_3:
+; CHECK-NEXT:    exrl %r1, .Ltmp1
+; CHECK-NEXT:    br %r14
+; CHECK-NEXT:  .Ltmp1:
+; CHECK-NEXT:    xc 0(1,%r2), 0(%r2)
+  tail call void @llvm.memset.p0i8.i32(i8* %Addr, i8 0, i32 %Len, i1 false)
+  ret void
+}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg)
+declare void @llvm.memset.p0i8.i32(i8* nocapture writeonly, i8, i32, i1 immarg)