Index: llvm/lib/Target/SystemZ/SystemZISelLowering.h =================================================================== --- llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -126,6 +126,9 @@ // as for MVC. CLC, + // Use MVC to set a block of memory after storing the first byte. + MEMSET_MVC, + // Use an MVST-based sequence to implement stpcpy(). STPCPY, @@ -709,7 +712,8 @@ MachineBasicBlock *emitAtomicCmpSwapW(MachineInstr &MI, MachineBasicBlock *BB) const; MachineBasicBlock *emitMemMemWrapper(MachineInstr &MI, MachineBasicBlock *BB, - unsigned Opcode) const; + unsigned Opcode, + bool IsMemset = false) const; MachineBasicBlock *emitStringWrapper(MachineInstr &MI, MachineBasicBlock *BB, unsigned Opcode) const; MachineBasicBlock *emitTransactionBegin(MachineInstr &MI, Index: llvm/lib/Target/SystemZ/SystemZISelLowering.cpp =================================================================== --- llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -5717,6 +5717,7 @@ OPCODE(OC); OPCODE(XC); OPCODE(CLC); + OPCODE(MEMSET_MVC); OPCODE(STPCPY); OPCODE(STRCMP); OPCODE(SEARCH_STRING); @@ -7863,8 +7864,10 @@ return MBB; } -MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper( - MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const { +MachineBasicBlock * +SystemZTargetLowering::emitMemMemWrapper(MachineInstr &MI, + MachineBasicBlock *MBB, + unsigned Opcode, bool IsMemset) const { MachineFunction &MF = *MBB->getParent(); const SystemZInstrInfo *TII = static_cast(Subtarget.getInstrInfo()); @@ -7873,18 +7876,59 @@ MachineOperand DestBase = earlyUseOperand(MI.getOperand(0)); uint64_t DestDisp = MI.getOperand(1).getImm(); - MachineOperand SrcBase = earlyUseOperand(MI.getOperand(2)); - uint64_t SrcDisp = MI.getOperand(3).getImm(); - MachineOperand &LengthMO = MI.getOperand(4); + // Memset carries only one address and Src becomes the same as Dest here. + MachineOperand SrcBase = earlyUseOperand(MI.getOperand(IsMemset ? 0 : 2)); + uint64_t SrcDisp = MI.getOperand(IsMemset ? 1 : 3).getImm(); + MachineOperand &LengthMO = MI.getOperand(IsMemset ? 2 : 4); bool IsImmForm = LengthMO.isImm(); bool IsRegForm = !IsImmForm; + // Fold the displacement Disp if it is out of range. + auto foldDisplIfNeeded = [&](MachineOperand &Base, uint64_t &Disp) -> bool { + unsigned MaxDisp = IsMemset ? 0xfff - 1 : 0xfff; + if (Disp <= MaxDisp) + return false; + Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); + unsigned Opcode = TII->getOpcodeForOffset(SystemZ::LA, Disp); + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(Opcode), Reg) + .add(Base).addImm(Disp).addReg(0); + Base = MachineOperand::CreateReg(Reg, false); + Disp = 0; + return true; + }; + + // Build and insert one Opcode of Length, with special treatment for memset. + auto insertMemMemOp = [&](MachineBasicBlock *InsMBB, + MachineBasicBlock::iterator InsPos, + MachineOperand DestBase, uint64_t DestDisp, + MachineOperand SrcBase, uint64_t SrcDisp, + unsigned Length) -> void { + assert(Length > 0 && "Building mem-mem op with zero length."); + if (IsMemset) { + assert(SrcDisp == DestDisp && isUInt<12>(DestDisp + 1) && + "Unexpected memset displacements."); + MachineOperand &ByteMO = MI.getOperand(3); + if (ByteMO.isImm()) + BuildMI(*InsMBB, InsPos, DL, TII->get(SystemZ::MVI)) + .add(DestBase).addImm(DestDisp).add(ByteMO); + else + BuildMI(*InsMBB, InsPos, DL, TII->get(SystemZ::STC)) + .addReg(ByteMO.getReg()).add(DestBase).addImm(DestDisp).addReg(0); + if (--Length == 0) + return; + } + BuildMI(*MBB, InsPos, DL, TII->get(Opcode)) + .add(DestBase).addImm(DestDisp + IsMemset).addImm(Length) + .add(SrcBase).addImm(SrcDisp) + .setMemRefs(MI.memoperands()); + }; + bool NeedsLoop = false; uint64_t ImmLength = 0; - Register LenMinus1Reg = SystemZ::NoRegister; + Register LenAdjReg = SystemZ::NoRegister; if (IsImmForm) { ImmLength = LengthMO.getImm(); - ImmLength++; // Add back the '1' subtracted originally. + ImmLength += IsMemset ? 2 : 1; // Add back the subtracted adjustment. if (ImmLength == 0) { MI.eraseFromParent(); return MBB; @@ -7908,7 +7952,13 @@ NeedsLoop = true; } else { NeedsLoop = true; - LenMinus1Reg = LengthMO.getReg(); + LenAdjReg = LengthMO.getReg(); + } + + // Prepare to use MVC with a +1 displacement for the dest address. + if (IsMemset && foldDisplIfNeeded(DestBase, DestDisp)){ + SrcBase = DestBase; + SrcDisp = DestDisp; } // When generating more than one CLC, all but the last will need to @@ -7926,17 +7976,17 @@ ImmLength &= 255; } else { BuildMI(*MBB, MI, DL, TII->get(SystemZ::SRLG), StartCountReg) - .addReg(LenMinus1Reg) + .addReg(LenAdjReg) .addReg(0) .addImm(8); } + bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase); auto loadZeroAddress = [&]() -> MachineOperand { Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); BuildMI(*MBB, MI, DL, TII->get(SystemZ::LGHI), Reg).addImm(0); return MachineOperand::CreateReg(Reg, false); }; - bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase); if (DestBase.isReg() && DestBase.getReg() == SystemZ::NoRegister) DestBase = loadZeroAddress(); if (SrcBase.isReg() && SrcBase.getReg() == SystemZ::NoRegister) @@ -7971,14 +8021,41 @@ DoneMBB = SystemZ::emitBlockAfter(NextMBB); // MBB: - // # Jump to AllDoneMBB if LenMinus1Reg is -1, or fall thru to StartMBB. + // # Jump to AllDoneMBB if LenAdjReg means 0, or fall thru to StartMBB. BuildMI(MBB, DL, TII->get(SystemZ::CGHI)) - .addReg(LenMinus1Reg).addImm(-1); + .addReg(LenAdjReg).addImm(IsMemset ? -2 : -1); BuildMI(MBB, DL, TII->get(SystemZ::BRC)) .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_EQ) .addMBB(AllDoneMBB); MBB->addSuccessor(AllDoneMBB); - MBB->addSuccessor(StartMBB); + if (!IsMemset) + MBB->addSuccessor(StartMBB); + else { + // MemsetOneCheckMBB: + // # Jump to MemsetOneMBB for a memset of length 1, or + // # fall thru to StartMBB. + MachineBasicBlock *MemsetOneMBB = SystemZ::emitBlockAfter(&*MF.rbegin()); + MachineBasicBlock *MemsetOneCheckMBB = SystemZ::emitBlockAfter(MBB); + MBB->addSuccessor(MemsetOneCheckMBB); + MBB = MemsetOneCheckMBB; + BuildMI(MBB, DL, TII->get(SystemZ::CGHI)) + .addReg(LenAdjReg).addImm(-1); + BuildMI(MBB, DL, TII->get(SystemZ::BRC)) + .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_EQ) + .addMBB(MemsetOneMBB); + MBB->addSuccessor(MemsetOneMBB, {10, 100}); + MBB->addSuccessor(StartMBB, {90, 100}); + + // MemsetOneMBB: + // # Jump back to AllDoneMBB after a single MVI or STC. + MBB = MemsetOneMBB; + insertMemMemOp(MBB, MBB->end(), + MachineOperand::CreateReg(StartDestReg, false), DestDisp, + MachineOperand::CreateReg(StartSrcReg, false), SrcDisp, + 1); + BuildMI(MBB, DL, TII->get(SystemZ::J)).addMBB(AllDoneMBB); + MBB->addSuccessor(AllDoneMBB); + } // StartMBB: // # Jump to DoneMBB if %StartCountReg is zero, or fall through to LoopMBB. @@ -8036,9 +8113,10 @@ BuildMI(MBB, DL, TII->get(SystemZ::PFD)) .addImm(SystemZ::PFD_WRITE) .addReg(ThisDestReg).addImm(DestDisp + 768).addReg(0); - BuildMI(MBB, DL, TII->get(Opcode)) - .addReg(ThisDestReg).addImm(DestDisp).addImm(256) - .addReg(ThisSrcReg).addImm(SrcDisp); + insertMemMemOp(MBB, MBB->end(), + MachineOperand::CreateReg(ThisDestReg, false), DestDisp, + MachineOperand::CreateReg(ThisSrcReg, false), SrcDisp, + 256); if (EndMBB) { BuildMI(MBB, DL, TII->get(SystemZ::BRC)) .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE) @@ -8078,7 +8156,7 @@ // # Make PHIs for RemDestReg/RemSrcReg as the loop may or may not run. // # Use EXecute Relative Long for the remainder of the bytes. The target // instruction of the EXRL will have a length field of 1 since 0 is an - // illegal value. The number of bytes processed becomes (%LenMinus1Reg & + // illegal value. The number of bytes processed becomes (%LenAdjReg & // 0xff) + 1. // # Fall through to AllDoneMBB. Register RemSrcReg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); @@ -8091,11 +8169,16 @@ BuildMI(MBB, DL, TII->get(SystemZ::PHI), RemSrcReg) .addReg(StartSrcReg).addMBB(StartMBB) .addReg(NextSrcReg).addMBB(NextMBB); + if (IsMemset) + insertMemMemOp(MBB, MBB->end(), + MachineOperand::CreateReg(RemDestReg, false), DestDisp, + MachineOperand::CreateReg(RemSrcReg, false), SrcDisp, + 1); MachineInstrBuilder EXRL_MIB = BuildMI(MBB, DL, TII->get(SystemZ::EXRL_Pseudo)) .addImm(Opcode) - .addReg(LenMinus1Reg) - .addReg(RemDestReg).addImm(DestDisp) + .addReg(LenAdjReg) + .addReg(RemDestReg).addImm(DestDisp + IsMemset) .addReg(RemSrcReg).addImm(SrcDisp); MBB->addSuccessor(AllDoneMBB); MBB = AllDoneMBB; @@ -8110,32 +8193,10 @@ while (ImmLength > 0) { uint64_t ThisLength = std::min(ImmLength, uint64_t(256)); // The previous iteration might have created out-of-range displacements. - // Apply them using LAY if so. - if (!isUInt<12>(DestDisp)) { - Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); - BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg) - .add(DestBase) - .addImm(DestDisp) - .addReg(0); - DestBase = MachineOperand::CreateReg(Reg, false); - DestDisp = 0; - } - if (!isUInt<12>(SrcDisp)) { - Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); - BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg) - .add(SrcBase) - .addImm(SrcDisp) - .addReg(0); - SrcBase = MachineOperand::CreateReg(Reg, false); - SrcDisp = 0; - } - BuildMI(*MBB, MI, DL, TII->get(Opcode)) - .add(DestBase) - .addImm(DestDisp) - .addImm(ThisLength) - .add(SrcBase) - .addImm(SrcDisp) - .setMemRefs(MI.memoperands()); + // Apply them using LA/LAY if so. + foldDisplIfNeeded(DestBase, DestDisp); + foldDisplIfNeeded(SrcBase, SrcDisp); + insertMemMemOp(MBB, MI, DestBase, DestDisp, SrcBase, SrcDisp, ThisLength); DestDisp += ThisLength; SrcDisp += ThisLength; ImmLength -= ThisLength; @@ -8633,6 +8694,11 @@ case SystemZ::CLCImm: case SystemZ::CLCReg: return emitMemMemWrapper(MI, MBB, SystemZ::CLC); + case SystemZ::MemsetImmImm: + case SystemZ::MemsetImmReg: + case SystemZ::MemsetRegImm: + case SystemZ::MemsetRegReg: + return emitMemMemWrapper(MI, MBB, SystemZ::MVC, true/*IsMemset*/); case SystemZ::CLSTLoop: return emitStringWrapper(MI, MBB, SystemZ::CLST); case SystemZ::MVSTLoop: Index: llvm/lib/Target/SystemZ/SystemZInstrFormats.td =================================================================== --- llvm/lib/Target/SystemZ/SystemZInstrFormats.td +++ llvm/lib/Target/SystemZ/SystemZInstrFormats.td @@ -5256,6 +5256,16 @@ let Constraints = "$R1 = $R1src"; } +class MemsetPseudo + : Pseudo<(outs), (ins bdaddr12only:$dest, lenop:$length, byteop:$B), + [(z_memset_mvc bdaddr12only:$dest, lenop:$length, byteop:$B)]> { + let Defs = [CC]; + let mayLoad = 1; + let mayStore = 1; + let usesCustomInserter = 1; + let hasNoSchedulingInfo = 1; +} + //===----------------------------------------------------------------------===// // Multiclasses that emit both real and pseudo instructions //===----------------------------------------------------------------------===// Index: llvm/lib/Target/SystemZ/SystemZInstrInfo.td =================================================================== --- llvm/lib/Target/SystemZ/SystemZInstrInfo.td +++ llvm/lib/Target/SystemZ/SystemZInstrInfo.td @@ -510,6 +510,12 @@ def MVCLU : SideEffectTernaryMemMemRSY<"mvclu", 0xEB8E, GR128, GR128>; } +// Memset[Length][Byte] pseudos. +def MemsetImmImm : MemsetPseudo; +def MemsetImmReg : MemsetPseudo; +def MemsetRegImm : MemsetPseudo; +def MemsetRegReg : MemsetPseudo; + // Move right. let Predicates = [FeatureMiscellaneousExtensions3], mayLoad = 1, mayStore = 1, Uses = [R0L] in Index: llvm/lib/Target/SystemZ/SystemZOperators.td =================================================================== --- llvm/lib/Target/SystemZ/SystemZOperators.td +++ llvm/lib/Target/SystemZ/SystemZOperators.td @@ -102,6 +102,10 @@ SDTCisPtrTy<1>, SDTCisPtrTy<2>, SDTCisVT<3, i64>]>; +def SDT_ZMemsetMVC : SDTypeProfile<0, 3, + [SDTCisPtrTy<0>, + SDTCisVT<1, i64>, + SDTCisVT<2, i32>]>; def SDT_ZString : SDTypeProfile<1, 3, [SDTCisPtrTy<0>, SDTCisPtrTy<1>, @@ -413,6 +417,8 @@ [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; def z_clc : SDNode<"SystemZISD::CLC", SDT_ZMemMemLengthCC, [SDNPHasChain, SDNPMayLoad]>; +def z_memset_mvc : SDNode<"SystemZISD::MEMSET_MVC", SDT_ZMemsetMVC, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; def z_strcmp : SDNode<"SystemZISD::STRCMP", SDT_ZStringCC, [SDNPHasChain, SDNPMayLoad]>; def z_stpcpy : SDNode<"SystemZISD::STPCPY", SDT_ZString, Index: llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp =================================================================== --- llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp +++ llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp @@ -17,29 +17,44 @@ #define DEBUG_TYPE "systemz-selectiondag-info" -static SDVTList getMemMemVTs(unsigned Op, SelectionDAG &DAG) { - return Op == SystemZISD::CLC ? DAG.getVTList(MVT::i32, MVT::Other) - : DAG.getVTList(MVT::Other); +static unsigned getMemMemLenAdj(unsigned Op) { + return Op == SystemZISD::MEMSET_MVC ? 2 : 1; } -// Emit a mem-mem operation after subtracting one from size, which will be -// added back during pseudo expansion. As the Reg case emitted here may be -// converted by DAGCombiner into having an Imm length, they are both emitted -// the same way. +static SDValue createMemMemNode(SelectionDAG &DAG, const SDLoc &DL, unsigned Op, + SDValue Chain, SDValue Dst, SDValue Src, + SDValue LenAdj, SDValue Byte) { + SDVTList VTs = Op == SystemZISD::CLC ? DAG.getVTList(MVT::i32, MVT::Other) + : DAG.getVTList(MVT::Other); + SmallVector Ops; + if (Op == SystemZISD::MEMSET_MVC) + Ops = { Chain, Dst, LenAdj, Byte }; + else + Ops = { Chain, Dst, Src, LenAdj }; + return DAG.getNode(Op, DL, VTs, Ops); +} + +// Emit a mem-mem operation after subtracting one (or two for memset) from +// size, which will be added back during pseudo expansion. As the Reg case +// emitted here may be converted by DAGCombiner into having an Imm length, +// they are both emitted the same way. static SDValue emitMemMemImm(SelectionDAG &DAG, const SDLoc &DL, unsigned Op, SDValue Chain, SDValue Dst, SDValue Src, - uint64_t Size) { - return DAG.getNode(Op, DL, getMemMemVTs(Op, DAG), Chain, Dst, Src, - DAG.getConstant(Size - 1, DL, Src.getValueType())); + uint64_t Size, SDValue Byte = SDValue()) { + unsigned Adj = getMemMemLenAdj(Op); + assert(Size >= Adj && "Adjusted length overflow."); + SDValue LenAdj = DAG.getConstant(Size - Adj, DL, Dst.getValueType()); + return createMemMemNode(DAG, DL, Op, Chain, Dst, Src, LenAdj, Byte); } static SDValue emitMemMemReg(SelectionDAG &DAG, const SDLoc &DL, unsigned Op, SDValue Chain, SDValue Dst, SDValue Src, - SDValue Size) { - SDValue LenMinus1 = DAG.getNode(ISD::ADD, DL, MVT::i64, - DAG.getZExtOrTrunc(Size, DL, MVT::i64), - DAG.getConstant(-1, DL, MVT::i64)); - return DAG.getNode(Op, DL, getMemMemVTs(Op, DAG), Chain, Dst, Src, LenMinus1); + SDValue Size, SDValue Byte = SDValue()) { + int64_t Adj = getMemMemLenAdj(Op); + SDValue LenAdj = DAG.getNode(ISD::ADD, DL, MVT::i64, + DAG.getZExtOrTrunc(Size, DL, MVT::i64), + DAG.getConstant(0 - Adj, DL, MVT::i64)); + return createMemMemNode(DAG, DL, Op, Chain, Dst, Src, LenAdj, Byte); } SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemcpy( @@ -127,13 +142,8 @@ if (CByte && CByte->getZExtValue() == 0) return emitMemMemImm(DAG, DL, SystemZISD::XC, Chain, Dst, Dst, Bytes); - // Copy the byte to the first location and then use MVC to copy - // it to the rest. - Chain = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo, Alignment); - SDValue DstPlus1 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst, - DAG.getConstant(1, DL, PtrVT)); - return emitMemMemImm(DAG, DL, SystemZISD::MVC, Chain, DstPlus1, Dst, - Bytes - 1); + return emitMemMemImm(DAG, DL, SystemZISD::MEMSET_MVC, Chain, Dst, SDValue(), + Bytes, DAG.getAnyExtOrTrunc(Byte, DL, MVT::i32)); } // Variable length @@ -141,7 +151,8 @@ // Handle the special case of a variable length memset of 0 with XC. return emitMemMemReg(DAG, DL, SystemZISD::XC, Chain, Dst, Dst, Size); - return SDValue(); + return emitMemMemReg(DAG, DL, SystemZISD::MEMSET_MVC, Chain, Dst, SDValue(), + Size, DAG.getAnyExtOrTrunc(Byte, DL, MVT::i32)); } // Convert the current CC value into an integer that is 0 if CC == 0, Index: llvm/test/CodeGen/SystemZ/memset-01.ll =================================================================== --- llvm/test/CodeGen/SystemZ/memset-01.ll +++ llvm/test/CodeGen/SystemZ/memset-01.ll @@ -87,7 +87,8 @@ define void @f9(i8* %dest, i8 %val) { ; CHECK-LABEL: f9: ; CHECK: stc %r3, 0(%r2) -; CHECK: mvc 1(256,%r2), 0(%r2) +; CHECK: mvc 1(255,%r2), 0(%r2) +; CHECK: stc %r3, 256(%r2) ; CHECK: br %r14 call void @llvm.memset.p0i8.i32(i8* %dest, i8 %val, i32 257, i1 false) ret void @@ -97,7 +98,8 @@ define void @f10(i8* %dest, i8 %val) { ; CHECK-LABEL: f10: ; CHECK: stc %r3, 0(%r2) -; CHECK: mvc 1(256,%r2), 0(%r2) +; CHECK: mvc 1(255,%r2), 0(%r2) +; CHECK: stc %r3, 256(%r2) ; CHECK: br %r14 call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 257, i1 false) ret void @@ -107,7 +109,8 @@ define void @f11(i8* %dest, i8 %val) { ; CHECK-LABEL: f11: ; CHECK: stc %r3, 0(%r2) -; CHECK: mvc 1(256,%r2), 0(%r2) +; CHECK: mvc 1(255,%r2), 0(%r2) +; CHECK: stc %r3, 256(%r2) ; CHECK: mvc 257(1,%r2), 256(%r2) ; CHECK: br %r14 call void @llvm.memset.p0i8.i32(i8* %dest, i8 %val, i32 258, i1 false) @@ -118,7 +121,8 @@ define void @f12(i8* %dest, i8 %val) { ; CHECK-LABEL: f12: ; CHECK: stc %r3, 0(%r2) -; CHECK: mvc 1(256,%r2), 0(%r2) +; CHECK: mvc 1(255,%r2), 0(%r2) +; CHECK: stc %r3, 256(%r2) ; CHECK: mvc 257(1,%r2), 256(%r2) ; CHECK: br %r14 call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 258, i1 false) @@ -129,30 +133,87 @@ define void @f13(i8* %dest, i8 %val) { ; CHECK-LABEL: f13: ; CHECK: stc %r3, 0(%r2) -; CHECK: mvc 1(256,%r2), 0(%r2) -; CHECK: mvc 257(256,%r2), 256(%r2) -; CHECK: mvc 513(256,%r2), 512(%r2) -; CHECK: mvc 769(256,%r2), 768(%r2) -; CHECK: mvc 1025(256,%r2), 1024(%r2) -; CHECK: mvc 1281(256,%r2), 1280(%r2) +; CHECK: mvc 1(255,%r2), 0(%r2) +; CHECK: stc %r3, 256(%r2) +; CHECK: mvc 257(255,%r2), 256(%r2) +; CHECK: stc %r3, 512(%r2) +; CHECK: mvc 513(255,%r2), 512(%r2) +; CHECK: stc %r3, 768(%r2) +; CHECK: mvc 769(255,%r2), 768(%r2) +; CHECK: stc %r3, 1024(%r2) +; CHECK: mvc 1025(255,%r2), 1024(%r2) +; CHECK: stc %r3, 1280(%r2) +; CHECK: mvc 1281(255,%r2), 1280(%r2) ; CHECK: br %r14 - call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 1537, i1 false) + call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 1536, i1 false) ret void } ; Test the next size up, which uses a loop. We leave the other corner -; cases to memcpy-01.ll. +; cases to memcpy-01.ll and memset-07.ll. define void @f14(i8* %dest, i8 %val) { ; CHECK-LABEL: f14: -; CHECK: stc %r3, 0(%r2) ; CHECK: lghi [[COUNT:%r[0-5]]], 6 ; CHECK: [[LABEL:\.L[^:]*]]: -; CHECK: pfd 2, 769(%r2) -; CHECK: mvc 1(256,%r2), 0(%r2) +; CHECK: pfd 2, 768(%r2) +; CHECK: stc %r3, 0(%r2) +; CHECK: mvc 1(255,%r2), 0(%r2) ; CHECK: la %r2, 256(%r2) ; CHECK: brctg [[COUNT]], [[LABEL]] -; CHECK: mvc 1(1,%r2), 0(%r2) -; CHECK: br %r14 - call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 1538, i1 false) +; CHECK: stc %r3, 0(%r2) +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 1537, i1 false) ret void } + +; Test (no) folding of displacement: Begins with max(uint12) - 1. +define void @f15(i8* %dest, i8 %val) { +; CHECK-LABEL: f15: +; CHECK-NOT: la {{.*}}%r2 + %addr = getelementptr i8, i8* %dest, i64 4094 + call void @llvm.memset.p0i8.i64(i8* %addr, i8 %val, i64 256, i1 false) + ret void +} + +; Test folding of displacement: Begins with max(uint12). +define void @f16(i8* %dest, i8 %val) { +; CHECK-LABEL: f16: +; CHECK: la %r1, 4095(%r2) +; CHECK: stc %r3, 0(%r1) + %addr = getelementptr i8, i8* %dest, i64 4095 + call void @llvm.memset.p0i8.i64(i8* %addr, i8 %val, i64 256, i1 false) + ret void +} + +; Test folding of displacement with LA: First two ops are in range. +define void @f17(i8* %dest, i8 %val) { +; CHECK-LABEL: f17: +; CHECK: stc %r3, 3583(%r2) +; CHECK-NEXT: mvc 3584(255,%r2), 3583(%r2) +; CHECK-NEXT: stc %r3, 3839(%r2) +; CHECK-NEXT: mvc 3840(255,%r2), 3839(%r2) +; CHECK-NEXT: la %r1, 4095(%r2) +; CHECK-NEXT: stc %r3, 0(%r1) +; CHECK-NEXT: mvc 1(1,%r1), 0(%r1) +; CHECK-NEXT: br %r14 + %addr = getelementptr i8, i8* %dest, i64 3583 + call void @llvm.memset.p0i8.i64(i8* %addr, i8 %val, i64 514, i1 false) + ret void +} + +; Test folding of displacement with LAY: First two ops are in range. +define void @f18(i8* %dest, i8 %val) { +; CHECK-LABEL: f18: +; CHECK: stc %r3, 3584(%r2) +; CHECK-NEXT: mvc 3585(255,%r2), 3584(%r2) +; CHECK-NEXT: stc %r3, 3840(%r2) +; CHECK-NEXT: mvc 3841(255,%r2), 3840(%r2) +; CHECK-NEXT: lay %r1, 4096(%r2) +; CHECK-NEXT: stc %r3, 0(%r1) +; CHECK-NEXT: mvc 1(1,%r1), 0(%r1) +; CHECK-NEXT: br %r14 + %addr = getelementptr i8, i8* %dest, i64 3584 + call void @llvm.memset.p0i8.i64(i8* %addr, i8 %val, i64 514, i1 false) + ret void +} + Index: llvm/test/CodeGen/SystemZ/memset-02.ll =================================================================== --- llvm/test/CodeGen/SystemZ/memset-02.ll +++ llvm/test/CodeGen/SystemZ/memset-02.ll @@ -123,7 +123,8 @@ define void @f13(i8* %dest) { ; CHECK-LABEL: f13: ; CHECK: mvi 0(%r2), 128 -; CHECK: mvc 1(256,%r2), 0(%r2) +; CHECK: mvc 1(255,%r2), 0(%r2) +; CHECK: mvi 256(%r2), 128 ; CHECK: br %r14 call void @llvm.memset.p0i8.i32(i8* %dest, i8 128, i32 257, i1 false) ret void @@ -133,7 +134,8 @@ define void @f14(i8* %dest) { ; CHECK-LABEL: f14: ; CHECK: mvi 0(%r2), 128 -; CHECK: mvc 1(256,%r2), 0(%r2) +; CHECK: mvc 1(255,%r2), 0(%r2) +; CHECK: mvi 256(%r2), 128 ; CHECK: br %r14 call void @llvm.memset.p0i8.i64(i8* %dest, i8 128, i64 257, i1 false) ret void @@ -143,7 +145,8 @@ define void @f15(i8* %dest) { ; CHECK-LABEL: f15: ; CHECK: mvi 0(%r2), 128 -; CHECK: mvc 1(256,%r2), 0(%r2) +; CHECK: mvc 1(255,%r2), 0(%r2) +; CHECK: mvi 256(%r2), 128 ; CHECK: mvc 257(1,%r2), 256(%r2) ; CHECK: br %r14 call void @llvm.memset.p0i8.i32(i8* %dest, i8 128, i32 258, i1 false) @@ -154,7 +157,8 @@ define void @f16(i8* %dest) { ; CHECK-LABEL: f16: ; CHECK: mvi 0(%r2), 128 -; CHECK: mvc 1(256,%r2), 0(%r2) +; CHECK: mvc 1(255,%r2), 0(%r2) +; CHECK: mvi 256(%r2), 128 ; CHECK: mvc 257(1,%r2), 256(%r2) ; CHECK: br %r14 call void @llvm.memset.p0i8.i64(i8* %dest, i8 128, i64 258, i1 false) Index: llvm/test/CodeGen/SystemZ/memset-04.ll =================================================================== --- llvm/test/CodeGen/SystemZ/memset-04.ll +++ llvm/test/CodeGen/SystemZ/memset-04.ll @@ -359,7 +359,8 @@ define void @f37(i8* %dest) { ; CHECK-LABEL: f37: ; CHECK: mvi 0(%r2), 255 -; CHECK: mvc 1(256,%r2), 0(%r2) +; CHECK: mvc 1(255,%r2), 0(%r2) +; CHECK: mvi 256(%r2), 255 ; CHECK: br %r14 call void @llvm.memset.p0i8.i32(i8* %dest, i8 -1, i32 257, i1 false) ret void @@ -369,7 +370,8 @@ define void @f38(i8* %dest) { ; CHECK-LABEL: f38: ; CHECK: mvi 0(%r2), 255 -; CHECK: mvc 1(256,%r2), 0(%r2) +; CHECK: mvc 1(255,%r2), 0(%r2) +; CHECK: mvi 256(%r2), 255 ; CHECK: br %r14 call void @llvm.memset.p0i8.i64(i8* %dest, i8 -1, i64 257, i1 false) ret void @@ -379,7 +381,8 @@ define void @f39(i8* %dest) { ; CHECK-LABEL: f39: ; CHECK: mvi 0(%r2), 255 -; CHECK: mvc 1(256,%r2), 0(%r2) +; CHECK: mvc 1(255,%r2), 0(%r2) +; CHECK: mvi 256(%r2), 255 ; CHECK: mvc 257(1,%r2), 256(%r2) ; CHECK: br %r14 call void @llvm.memset.p0i8.i32(i8* %dest, i8 -1, i32 258, i1 false) @@ -390,7 +393,8 @@ define void @f40(i8* %dest) { ; CHECK-LABEL: f40: ; CHECK: mvi 0(%r2), 255 -; CHECK: mvc 1(256,%r2), 0(%r2) +; CHECK: mvc 1(255,%r2), 0(%r2) +; CHECK: mvi 256(%r2), 255 ; CHECK: mvc 257(1,%r2), 256(%r2) ; CHECK: br %r14 call void @llvm.memset.p0i8.i64(i8* %dest, i8 -1, i64 258, i1 false) Index: llvm/test/CodeGen/SystemZ/memset-07.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/SystemZ/memset-07.ll @@ -0,0 +1,100 @@ +; Test memset in cases where a loop is used. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s + +declare void @llvm.memset.p0i8.i32(i8 *nocapture, i8, i32, i1) nounwind +declare void @llvm.memset.p0i8.i64(i8 *nocapture, i8, i64, i1) nounwind + +; Constant length: 6 iterations and 2 bytes remainder. +define void @f1(i8* %dest, i8 %val) { +; CHECK-LABEL: f1: +; CHECK: lghi [[COUNT:%r[0-5]]], 6 +; CHECK: [[LABEL:\.L[^:]*]]: +; CHECK: pfd 2, 768(%r2) +; CHECK: stc %r3, 0(%r2) +; CHECK: mvc 1(255,%r2), 0(%r2) +; CHECK: la %r2, 256(%r2) +; CHECK: brctg [[COUNT]], [[LABEL]] +; CHECK: stc %r3, 0(%r2) +; CHECK-NEXT: mvc 1(1,%r2), 0(%r2) +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 1538, i1 false) + ret void +} + +; Constant length: 6 iterations and 255 bytes remainder. +define void @f2(i8* %dest) { +; CHECK-LABEL: f2: +; CHECK: lghi [[COUNT:%r[0-5]]], 6 +; CHECK: [[LABEL:\.L[^:]*]]: +; CHECK: pfd 2, 768(%r2) +; CHECK: mvi 0(%r2), 1 +; CHECK: mvc 1(255,%r2), 0(%r2) +; CHECK: la %r2, 256(%r2) +; CHECK: brctg [[COUNT]], [[LABEL]] +; CHECK: mvi 0(%r2), 1 +; CHECK-NEXT: mvc 1(254,%r2), 0(%r2) +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i32(i8* %dest, i8 1, i32 1791, i1 false) + ret void +} + +; Variable length, byte in register. +define void @f3(i8* %dest, i8 %val, i64 %Len) { +; CHECK-LABEL: f3: +; CHECK: # %bb.0: +; CHECK-NEXT: aghi %r4, -2 +; CHECK-NEXT: cgibe %r4, -2, 0(%r14) +; CHECK-NEXT: .LBB2_1: +; CHECK-NEXT: cgije %r4, -1, .LBB2_5 +; CHECK-NEXT:# %bb.2: +; CHECK-NEXT: srlg %r0, %r4, 8 +; CHECK-NEXT: cgije %r0, 0, .LBB2_4 +; CHECK-NEXT:.LBB2_3: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: pfd 2, 768(%r2) +; CHECK-NEXT: stc %r3, 0(%r2) +; CHECK-NEXT: mvc 1(255,%r2), 0(%r2) +; CHECK-NEXT: la %r2, 256(%r2) +; CHECK-NEXT: brctg %r0, .LBB2_3 +; CHECK-NEXT:.LBB2_4: +; CHECK-NEXT: stc %r3, 0(%r2) +; CHECK-NEXT: exrl %r4, .Ltmp0 +; CHECK-NEXT: br %r14 +; CHECK-NEXT:.LBB2_5: +; CHECK-NEXT: stc %r3, 0(%r2) +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 %Len, i1 false) + ret void +} + +; Variable length, immediate byte. +define void @f4(i8* %dest, i32 %Len) { +; CHECK-LABEL: f4: +; CHECK: # %bb.0: +; CHECK-NEXT: llgfr %r1, %r3 +; CHECK-NEXT: aghi %r1, -2 +; CHECK-NEXT: cgibe %r1, -2, 0(%r14) +; CHECK-NEXT:.LBB3_1: +; CHECK-NEXT: cgije %r1, -1, .LBB3_5 +; CHECK-NEXT:# %bb.2: +; CHECK-NEXT: srlg %r0, %r1, 8 +; CHECK-NEXT: cgije %r0, 0, .LBB3_4 +; CHECK-NEXT:.LBB3_3: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: pfd 2, 768(%r2) +; CHECK-NEXT: mvi 0(%r2), 1 +; CHECK-NEXT: mvc 1(255,%r2), 0(%r2) +; CHECK-NEXT: la %r2, 256(%r2) +; CHECK-NEXT: brctg %r0, .LBB3_3 +; CHECK-NEXT:.LBB3_4: +; CHECK-NEXT: mvi 0(%r2), 1 +; CHECK-NEXT: exrl %r1, .Ltmp0 +; CHECK-NEXT: br %r14 +; CHECK-NEXT:.LBB3_5: +; CHECK-NEXT: mvi 0(%r2), 1 +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i32(i8* %dest, i8 1, i32 %Len, i1 false) + ret void +} + +; CHECK: .Ltmp0: +; CHECK-NEXT: mvc 1(1,%r2), 0(%r2) Index: llvm/test/CodeGen/SystemZ/tail-call-mem-intrinsics.ll =================================================================== --- llvm/test/CodeGen/SystemZ/tail-call-mem-intrinsics.ll +++ llvm/test/CodeGen/SystemZ/tail-call-mem-intrinsics.ll @@ -8,16 +8,6 @@ ret void } -; CHECK-LABEL: tail_memset: -; CHECK: jg memset -define void @tail_memset(i8* nocapture %p, i8 %c, i32 %n) #0 { -entry: - tail call void @llvm.memset.p0i8.i32(i8* %p, i8 %c, i32 %n, i1 false) - ret void -} - -declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i1) #0 declare void @llvm.memmove.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i1) #0 -declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i1) #0 attributes #0 = { nounwind }