diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -126,6 +126,9 @@ // as for MVC. CLC, + // Use MVC to set a block of memory after storing the first byte. + MEMSET_MVC, + // Use an MVST-based sequence to implement stpcpy(). STPCPY, @@ -709,7 +712,8 @@ MachineBasicBlock *emitAtomicCmpSwapW(MachineInstr &MI, MachineBasicBlock *BB) const; MachineBasicBlock *emitMemMemWrapper(MachineInstr &MI, MachineBasicBlock *BB, - unsigned Opcode) const; + unsigned Opcode, + bool IsMemset = false) const; MachineBasicBlock *emitStringWrapper(MachineInstr &MI, MachineBasicBlock *BB, unsigned Opcode) const; MachineBasicBlock *emitTransactionBegin(MachineInstr &MI, diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -5714,6 +5714,7 @@ OPCODE(OC); OPCODE(XC); OPCODE(CLC); + OPCODE(MEMSET_MVC); OPCODE(STPCPY); OPCODE(STRCMP); OPCODE(SEARCH_STRING); @@ -7860,8 +7861,10 @@ return MBB; } -MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper( - MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const { +MachineBasicBlock * +SystemZTargetLowering::emitMemMemWrapper(MachineInstr &MI, + MachineBasicBlock *MBB, + unsigned Opcode, bool IsMemset) const { MachineFunction &MF = *MBB->getParent(); const SystemZInstrInfo *TII = static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo()); @@ -7870,18 +7873,64 @@ MachineOperand DestBase = earlyUseOperand(MI.getOperand(0)); uint64_t DestDisp = MI.getOperand(1).getImm(); - MachineOperand SrcBase = earlyUseOperand(MI.getOperand(2)); - uint64_t SrcDisp = MI.getOperand(3).getImm(); - MachineOperand &LengthMO = MI.getOperand(4); + MachineOperand SrcBase = MachineOperand::CreateReg(0U, false); + uint64_t SrcDisp; + + // Fold the 
displacement Disp if it is out of range. + auto foldDisplIfNeeded = [&](MachineOperand &Base, uint64_t &Disp) -> void { + if (!isUInt<12>(Disp)) { + Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); + unsigned Opcode = TII->getOpcodeForOffset(SystemZ::LA, Disp); + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(Opcode), Reg) + .add(Base).addImm(Disp).addReg(0); + Base = MachineOperand::CreateReg(Reg, false); + Disp = 0; + } + }; + + if (!IsMemset) { + SrcBase = earlyUseOperand(MI.getOperand(2)); + SrcDisp = MI.getOperand(3).getImm(); + } else { + SrcBase = DestBase; + SrcDisp = DestDisp++; + foldDisplIfNeeded(DestBase, DestDisp); + } + + MachineOperand &LengthMO = MI.getOperand(IsMemset ? 2 : 4); bool IsImmForm = LengthMO.isImm(); bool IsRegForm = !IsImmForm; + // Build and insert one Opcode of Length, with special treatment for memset. + auto insertMemMemOp = [&](MachineBasicBlock *InsMBB, + MachineBasicBlock::iterator InsPos, + MachineOperand DBase, uint64_t DDisp, + MachineOperand SBase, uint64_t SDisp, + unsigned Length) -> void { + assert(Length > 0 && Length <= 256 && "Building memory op with bad length."); + if (IsMemset) { + MachineOperand ByteMO = earlyUseOperand(MI.getOperand(3)); + if (ByteMO.isImm()) + BuildMI(*InsMBB, InsPos, DL, TII->get(SystemZ::MVI)) + .add(SBase).addImm(SDisp).add(ByteMO); + else + BuildMI(*InsMBB, InsPos, DL, TII->get(SystemZ::STC)) + .add(ByteMO).add(SBase).addImm(SDisp).addReg(0); + if (--Length == 0) + return; + } + BuildMI(*InsMBB, InsPos, DL, TII->get(Opcode)) + .add(DBase).addImm(DDisp).addImm(Length) + .add(SBase).addImm(SDisp) + .setMemRefs(MI.memoperands()); + }; + bool NeedsLoop = false; uint64_t ImmLength = 0; - Register LenMinus1Reg = SystemZ::NoRegister; + Register LenAdjReg = SystemZ::NoRegister; if (IsImmForm) { ImmLength = LengthMO.getImm(); - ImmLength++; // Add back the '1' subtracted originally. + ImmLength += IsMemset ? 2 : 1; // Add back the subtracted adjustment. 
if (ImmLength == 0) { MI.eraseFromParent(); return MBB; @@ -7905,7 +7954,7 @@ NeedsLoop = true; } else { NeedsLoop = true; - LenMinus1Reg = LengthMO.getReg(); + LenAdjReg = LengthMO.getReg(); } // When generating more than one CLC, all but the last will need to @@ -7923,17 +7972,17 @@ ImmLength &= 255; } else { BuildMI(*MBB, MI, DL, TII->get(SystemZ::SRLG), StartCountReg) - .addReg(LenMinus1Reg) + .addReg(LenAdjReg) .addReg(0) .addImm(8); } + bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase); auto loadZeroAddress = [&]() -> MachineOperand { Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); BuildMI(*MBB, MI, DL, TII->get(SystemZ::LGHI), Reg).addImm(0); return MachineOperand::CreateReg(Reg, false); }; - bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase); if (DestBase.isReg() && DestBase.getReg() == SystemZ::NoRegister) DestBase = loadZeroAddress(); if (SrcBase.isReg() && SrcBase.getReg() == SystemZ::NoRegister) @@ -7968,14 +8017,41 @@ DoneMBB = SystemZ::emitBlockAfter(NextMBB); // MBB: - // # Jump to AllDoneMBB if LenMinus1Reg is -1, or fall thru to StartMBB. + // # Jump to AllDoneMBB if LenAdjReg means 0, or fall thru to StartMBB. BuildMI(MBB, DL, TII->get(SystemZ::CGHI)) - .addReg(LenMinus1Reg).addImm(-1); + .addReg(LenAdjReg).addImm(IsMemset ? -2 : -1); BuildMI(MBB, DL, TII->get(SystemZ::BRC)) .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_EQ) .addMBB(AllDoneMBB); MBB->addSuccessor(AllDoneMBB); - MBB->addSuccessor(StartMBB); + if (!IsMemset) + MBB->addSuccessor(StartMBB); + else { + // MemsetOneCheckMBB: + // # Jump to MemsetOneMBB for a memset of length 1, or + // # fall thru to StartMBB. 
+ MachineBasicBlock *MemsetOneCheckMBB = SystemZ::emitBlockAfter(MBB); + MachineBasicBlock *MemsetOneMBB = SystemZ::emitBlockAfter(&*MF.rbegin()); + MBB->addSuccessor(MemsetOneCheckMBB); + MBB = MemsetOneCheckMBB; + BuildMI(MBB, DL, TII->get(SystemZ::CGHI)) + .addReg(LenAdjReg).addImm(-1); + BuildMI(MBB, DL, TII->get(SystemZ::BRC)) + .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_EQ) + .addMBB(MemsetOneMBB); + MBB->addSuccessor(MemsetOneMBB, {10, 100}); + MBB->addSuccessor(StartMBB, {90, 100}); + + // MemsetOneMBB: + // # Jump back to AllDoneMBB after a single MVI or STC. + MBB = MemsetOneMBB; + insertMemMemOp(MBB, MBB->end(), + MachineOperand::CreateReg(StartDestReg, false), DestDisp, + MachineOperand::CreateReg(StartSrcReg, false), SrcDisp, + 1); + BuildMI(MBB, DL, TII->get(SystemZ::J)).addMBB(AllDoneMBB); + MBB->addSuccessor(AllDoneMBB); + } // StartMBB: // # Jump to DoneMBB if %StartCountReg is zero, or fall through to LoopMBB. @@ -8032,10 +8108,10 @@ if (Opcode == SystemZ::MVC) BuildMI(MBB, DL, TII->get(SystemZ::PFD)) .addImm(SystemZ::PFD_WRITE) - .addReg(ThisDestReg).addImm(DestDisp + 768).addReg(0); - BuildMI(MBB, DL, TII->get(Opcode)) - .addReg(ThisDestReg).addImm(DestDisp).addImm(256) - .addReg(ThisSrcReg).addImm(SrcDisp); + .addReg(ThisDestReg).addImm(DestDisp - IsMemset + 768).addReg(0); + insertMemMemOp(MBB, MBB->end(), + MachineOperand::CreateReg(ThisDestReg, false), DestDisp, + MachineOperand::CreateReg(ThisSrcReg, false), SrcDisp, 256); if (EndMBB) { BuildMI(MBB, DL, TII->get(SystemZ::BRC)) .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE) @@ -8075,7 +8151,7 @@ // # Make PHIs for RemDestReg/RemSrcReg as the loop may or may not run. // # Use EXecute Relative Long for the remainder of the bytes. The target // instruction of the EXRL will have a length field of 1 since 0 is an - // illegal value. The number of bytes processed becomes (%LenMinus1Reg & + // illegal value. 
The number of bytes processed becomes (%LenAdjReg & // 0xff) + 1. // # Fall through to AllDoneMBB. Register RemSrcReg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); @@ -8088,10 +8164,14 @@ BuildMI(MBB, DL, TII->get(SystemZ::PHI), RemSrcReg) .addReg(StartSrcReg).addMBB(StartMBB) .addReg(NextSrcReg).addMBB(NextMBB); + if (IsMemset) + insertMemMemOp(MBB, MBB->end(), + MachineOperand::CreateReg(RemDestReg, false), DestDisp, + MachineOperand::CreateReg(RemSrcReg, false), SrcDisp, 1); MachineInstrBuilder EXRL_MIB = BuildMI(MBB, DL, TII->get(SystemZ::EXRL_Pseudo)) .addImm(Opcode) - .addReg(LenMinus1Reg) + .addReg(LenAdjReg) .addReg(RemDestReg).addImm(DestDisp) .addReg(RemSrcReg).addImm(SrcDisp); MBB->addSuccessor(AllDoneMBB); @@ -8107,32 +8187,10 @@ while (ImmLength > 0) { uint64_t ThisLength = std::min(ImmLength, uint64_t(256)); // The previous iteration might have created out-of-range displacements. - // Apply them using LAY if so. - if (!isUInt<12>(DestDisp)) { - Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); - BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg) - .add(DestBase) - .addImm(DestDisp) - .addReg(0); - DestBase = MachineOperand::CreateReg(Reg, false); - DestDisp = 0; - } - if (!isUInt<12>(SrcDisp)) { - Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); - BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg) - .add(SrcBase) - .addImm(SrcDisp) - .addReg(0); - SrcBase = MachineOperand::CreateReg(Reg, false); - SrcDisp = 0; - } - BuildMI(*MBB, MI, DL, TII->get(Opcode)) - .add(DestBase) - .addImm(DestDisp) - .addImm(ThisLength) - .add(SrcBase) - .addImm(SrcDisp) - .setMemRefs(MI.memoperands()); + // Apply them using LA/LAY if so. 
+ foldDisplIfNeeded(DestBase, DestDisp); + foldDisplIfNeeded(SrcBase, SrcDisp); + insertMemMemOp(MBB, MI, DestBase, DestDisp, SrcBase, SrcDisp, ThisLength); DestDisp += ThisLength; SrcDisp += ThisLength; ImmLength -= ThisLength; @@ -8630,6 +8688,11 @@ case SystemZ::CLCImm: case SystemZ::CLCReg: return emitMemMemWrapper(MI, MBB, SystemZ::CLC); + case SystemZ::MemsetImmImm: + case SystemZ::MemsetImmReg: + case SystemZ::MemsetRegImm: + case SystemZ::MemsetRegReg: + return emitMemMemWrapper(MI, MBB, SystemZ::MVC, true/*IsMemset*/); case SystemZ::CLSTLoop: return emitStringWrapper(MI, MBB, SystemZ::CLST); case SystemZ::MVSTLoop: diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td --- a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td +++ b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td @@ -5256,6 +5256,16 @@ let Constraints = "$R1 = $R1src"; } +class MemsetPseudo<DAGOperand lenop, DAGOperand byteop> + : Pseudo<(outs), (ins bdaddr12only:$dest, lenop:$length, byteop:$B), + [(z_memset_mvc bdaddr12only:$dest, lenop:$length, byteop:$B)]> { + let Defs = [CC]; + let mayLoad = 1; + let mayStore = 1; + let usesCustomInserter = 1; + let hasNoSchedulingInfo = 1; +} + //===----------------------------------------------------------------------===// // Multiclasses that emit both real and pseudo instructions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td @@ -510,6 +510,12 @@ def MVCLU : SideEffectTernaryMemMemRSY<"mvclu", 0xEB8E, GR128, GR128>; } +// Memset[Length][Byte] pseudos. +def MemsetImmImm : MemsetPseudo<imm64, imm32zx8trunc>; +def MemsetImmReg : MemsetPseudo<imm64, GR32>; +def MemsetRegImm : MemsetPseudo<ADDR64, imm32zx8trunc>; +def MemsetRegReg : MemsetPseudo<ADDR64, GR32>; + // Move right. 
let Predicates = [FeatureMiscellaneousExtensions3], mayLoad = 1, mayStore = 1, Uses = [R0L] in diff --git a/llvm/lib/Target/SystemZ/SystemZOperators.td b/llvm/lib/Target/SystemZ/SystemZOperators.td --- a/llvm/lib/Target/SystemZ/SystemZOperators.td +++ b/llvm/lib/Target/SystemZ/SystemZOperators.td @@ -102,6 +102,10 @@ SDTCisPtrTy<1>, SDTCisPtrTy<2>, SDTCisVT<3, i64>]>; +def SDT_ZMemsetMVC : SDTypeProfile<0, 3, + [SDTCisPtrTy<0>, + SDTCisVT<1, i64>, + SDTCisVT<2, i32>]>; def SDT_ZString : SDTypeProfile<1, 3, [SDTCisPtrTy<0>, SDTCisPtrTy<1>, @@ -413,6 +417,8 @@ [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; def z_clc : SDNode<"SystemZISD::CLC", SDT_ZMemMemLengthCC, [SDNPHasChain, SDNPMayLoad]>; +def z_memset_mvc : SDNode<"SystemZISD::MEMSET_MVC", SDT_ZMemsetMVC, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; def z_strcmp : SDNode<"SystemZISD::STRCMP", SDT_ZStringCC, [SDNPHasChain, SDNPMayLoad]>; def z_stpcpy : SDNode<"SystemZISD::STPCPY", SDT_ZString, diff --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp --- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp @@ -17,29 +17,44 @@ #define DEBUG_TYPE "systemz-selectiondag-info" -static SDVTList getMemMemVTs(unsigned Op, SelectionDAG &DAG) { - return Op == SystemZISD::CLC ? DAG.getVTList(MVT::i32, MVT::Other) - : DAG.getVTList(MVT::Other); +static unsigned getMemMemLenAdj(unsigned Op) { + return Op == SystemZISD::MEMSET_MVC ? 2 : 1; } -// Emit a mem-mem operation after subtracting one from size, which will be -// added back during pseudo expansion. As the Reg case emitted here may be -// converted by DAGCombiner into having an Imm length, they are both emitted -// the same way. +static SDValue createMemMemNode(SelectionDAG &DAG, const SDLoc &DL, unsigned Op, + SDValue Chain, SDValue Dst, SDValue Src, + SDValue LenAdj, SDValue Byte) { + SDVTList VTs = Op == SystemZISD::CLC ? 
DAG.getVTList(MVT::i32, MVT::Other) + : DAG.getVTList(MVT::Other); + SmallVector<SDValue, 6> Ops; + if (Op == SystemZISD::MEMSET_MVC) + Ops = { Chain, Dst, LenAdj, Byte }; + else + Ops = { Chain, Dst, Src, LenAdj }; + return DAG.getNode(Op, DL, VTs, Ops); +} + +// Emit a mem-mem operation after subtracting one (or two for memset) from +// size, which will be added back during pseudo expansion. As the Reg case +// emitted here may be converted by DAGCombiner into having an Imm length, +// they are both emitted the same way. static SDValue emitMemMemImm(SelectionDAG &DAG, const SDLoc &DL, unsigned Op, SDValue Chain, SDValue Dst, SDValue Src, - uint64_t Size) { - return DAG.getNode(Op, DL, getMemMemVTs(Op, DAG), Chain, Dst, Src, - DAG.getConstant(Size - 1, DL, Src.getValueType())); + uint64_t Size, SDValue Byte = SDValue()) { + unsigned Adj = getMemMemLenAdj(Op); + assert(Size >= Adj && "Adjusted length overflow."); + SDValue LenAdj = DAG.getConstant(Size - Adj, DL, Dst.getValueType()); + return createMemMemNode(DAG, DL, Op, Chain, Dst, Src, LenAdj, Byte); } static SDValue emitMemMemReg(SelectionDAG &DAG, const SDLoc &DL, unsigned Op, SDValue Chain, SDValue Dst, SDValue Src, - SDValue Size) { - SDValue LenMinus1 = DAG.getNode(ISD::ADD, DL, MVT::i64, - DAG.getZExtOrTrunc(Size, DL, MVT::i64), - DAG.getConstant(-1, DL, MVT::i64)); - return DAG.getNode(Op, DL, getMemMemVTs(Op, DAG), Chain, Dst, Src, LenMinus1); + SDValue Size, SDValue Byte = SDValue()) { + int64_t Adj = getMemMemLenAdj(Op); + SDValue LenAdj = DAG.getNode(ISD::ADD, DL, MVT::i64, + DAG.getZExtOrTrunc(Size, DL, MVT::i64), + DAG.getConstant(0 - Adj, DL, MVT::i64)); + return createMemMemNode(DAG, DL, Op, Chain, Dst, Src, LenAdj, Byte); } SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemcpy( @@ -127,13 +142,8 @@ if (CByte && CByte->getZExtValue() == 0) return emitMemMemImm(DAG, DL, SystemZISD::XC, Chain, Dst, Dst, Bytes); - // Copy the byte to the first location and then use MVC to copy - // it to the rest. 
- Chain = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo, Alignment); - SDValue DstPlus1 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst, - DAG.getConstant(1, DL, PtrVT)); - return emitMemMemImm(DAG, DL, SystemZISD::MVC, Chain, DstPlus1, Dst, - Bytes - 1); + return emitMemMemImm(DAG, DL, SystemZISD::MEMSET_MVC, Chain, Dst, SDValue(), + Bytes, DAG.getAnyExtOrTrunc(Byte, DL, MVT::i32)); } // Variable length @@ -141,7 +151,8 @@ // Handle the special case of a variable length memset of 0 with XC. return emitMemMemReg(DAG, DL, SystemZISD::XC, Chain, Dst, Dst, Size); - return SDValue(); + return emitMemMemReg(DAG, DL, SystemZISD::MEMSET_MVC, Chain, Dst, SDValue(), + Size, DAG.getAnyExtOrTrunc(Byte, DL, MVT::i32)); } // Convert the current CC value into an integer that is 0 if CC == 0, diff --git a/llvm/test/CodeGen/SystemZ/memset-01.ll b/llvm/test/CodeGen/SystemZ/memset-01.ll --- a/llvm/test/CodeGen/SystemZ/memset-01.ll +++ b/llvm/test/CodeGen/SystemZ/memset-01.ll @@ -87,7 +87,8 @@ define void @f9(i8* %dest, i8 %val) { ; CHECK-LABEL: f9: ; CHECK: stc %r3, 0(%r2) -; CHECK: mvc 1(256,%r2), 0(%r2) +; CHECK: mvc 1(255,%r2), 0(%r2) +; CHECK: stc %r3, 256(%r2) ; CHECK: br %r14 call void @llvm.memset.p0i8.i32(i8* %dest, i8 %val, i32 257, i1 false) ret void @@ -97,7 +98,8 @@ define void @f10(i8* %dest, i8 %val) { ; CHECK-LABEL: f10: ; CHECK: stc %r3, 0(%r2) -; CHECK: mvc 1(256,%r2), 0(%r2) +; CHECK: mvc 1(255,%r2), 0(%r2) +; CHECK: stc %r3, 256(%r2) ; CHECK: br %r14 call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 257, i1 false) ret void @@ -107,7 +109,8 @@ define void @f11(i8* %dest, i8 %val) { ; CHECK-LABEL: f11: ; CHECK: stc %r3, 0(%r2) -; CHECK: mvc 1(256,%r2), 0(%r2) +; CHECK: mvc 1(255,%r2), 0(%r2) +; CHECK: stc %r3, 256(%r2) ; CHECK: mvc 257(1,%r2), 256(%r2) ; CHECK: br %r14 call void @llvm.memset.p0i8.i32(i8* %dest, i8 %val, i32 258, i1 false) @@ -118,7 +121,8 @@ define void @f12(i8* %dest, i8 %val) { ; CHECK-LABEL: f12: ; CHECK: stc %r3, 0(%r2) -; CHECK: mvc 1(256,%r2), 
0(%r2) +; CHECK: mvc 1(255,%r2), 0(%r2) +; CHECK: stc %r3, 256(%r2) ; CHECK: mvc 257(1,%r2), 256(%r2) ; CHECK: br %r14 call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 258, i1 false) @@ -129,30 +133,88 @@ define void @f13(i8* %dest, i8 %val) { ; CHECK-LABEL: f13: ; CHECK: stc %r3, 0(%r2) -; CHECK: mvc 1(256,%r2), 0(%r2) -; CHECK: mvc 257(256,%r2), 256(%r2) -; CHECK: mvc 513(256,%r2), 512(%r2) -; CHECK: mvc 769(256,%r2), 768(%r2) -; CHECK: mvc 1025(256,%r2), 1024(%r2) -; CHECK: mvc 1281(256,%r2), 1280(%r2) +; CHECK: mvc 1(255,%r2), 0(%r2) +; CHECK: stc %r3, 256(%r2) +; CHECK: mvc 257(255,%r2), 256(%r2) +; CHECK: stc %r3, 512(%r2) +; CHECK: mvc 513(255,%r2), 512(%r2) +; CHECK: stc %r3, 768(%r2) +; CHECK: mvc 769(255,%r2), 768(%r2) +; CHECK: stc %r3, 1024(%r2) +; CHECK: mvc 1025(255,%r2), 1024(%r2) +; CHECK: stc %r3, 1280(%r2) +; CHECK: mvc 1281(255,%r2), 1280(%r2) ; CHECK: br %r14 - call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 1537, i1 false) + call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 1536, i1 false) ret void } ; Test the next size up, which uses a loop. We leave the other corner -; cases to memcpy-01.ll. +; cases to memcpy-01.ll and memset-07.ll. define void @f14(i8* %dest, i8 %val) { ; CHECK-LABEL: f14: -; CHECK: stc %r3, 0(%r2) ; CHECK: lghi [[COUNT:%r[0-5]]], 6 ; CHECK: [[LABEL:\.L[^:]*]]: -; CHECK: pfd 2, 769(%r2) -; CHECK: mvc 1(256,%r2), 0(%r2) +; CHECK: pfd 2, 768(%r2) +; CHECK: stc %r3, 0(%r2) +; CHECK: mvc 1(255,%r2), 0(%r2) ; CHECK: la %r2, 256(%r2) ; CHECK: brctg [[COUNT]], [[LABEL]] -; CHECK: mvc 1(1,%r2), 0(%r2) -; CHECK: br %r14 - call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 1538, i1 false) +; CHECK: stc %r3, 0(%r2) +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 1537, i1 false) ret void } + +; Test (no) folding of displacement: Begins with max(uint12) - 1. 
+define void @f15(i8* %dest, i8 %val) { +; CHECK-LABEL: f15: +; CHECK-NOT: la {{.*}}%r2 + %addr = getelementptr i8, i8* %dest, i64 4094 + call void @llvm.memset.p0i8.i64(i8* %addr, i8 %val, i64 256, i1 false) + ret void +} + +; Test folding of displacement: Begins with max(uint12). +define void @f16(i8* %dest, i8 %val) { +; CHECK-LABEL: f16: +; CHECK-DAG: lay %r1, 4096(%r2) +; CHECK-DAG: stc %r3, 4095(%r2) + %addr = getelementptr i8, i8* %dest, i64 4095 + call void @llvm.memset.p0i8.i64(i8* %addr, i8 %val, i64 256, i1 false) + ret void +} + +; Test folding of displacement with LA: First two ops are in range. +define void @f17(i8* %dest, i8 %val) { +; CHECK-LABEL: f17: +; CHECK: stc %r3, 3583(%r2) +; CHECK-NEXT: mvc 3584(255,%r2), 3583(%r2) +; CHECK-NEXT: stc %r3, 3839(%r2) +; CHECK-NEXT: mvc 3840(255,%r2), 3839(%r2) +; CHECK-NEXT: lay %r1, 4096(%r2) +; CHECK-NEXT: stc %r3, 4095(%r2) +; CHECK-NEXT: mvc 0(1,%r1), 4095(%r2) +; CHECK-NEXT: br %r14 + %addr = getelementptr i8, i8* %dest, i64 3583 + call void @llvm.memset.p0i8.i64(i8* %addr, i8 %val, i64 514, i1 false) + ret void +} + +; Test folding of displacement with LAY: First two ops are in range. 
+define void @f18(i8* %dest, i8 %val) { +; CHECK-LABEL: f18: +; CHECK: stc %r3, 3584(%r2) +; CHECK-NEXT: mvc 3585(255,%r2), 3584(%r2) +; CHECK-NEXT: stc %r3, 3840(%r2) +; CHECK-NEXT: mvc 3841(255,%r2), 3840(%r2) +; CHECK-NEXT: lay %r1, 4097(%r2) +; CHECK-NEXT: lay %r2, 4096(%r2) +; CHECK-NEXT: stc %r3, 0(%r2) +; CHECK-NEXT: mvc 0(1,%r1), 0(%r2) +; CHECK-NEXT: br %r14 + %addr = getelementptr i8, i8* %dest, i64 3584 + call void @llvm.memset.p0i8.i64(i8* %addr, i8 %val, i64 514, i1 false) + ret void +} + diff --git a/llvm/test/CodeGen/SystemZ/memset-02.ll b/llvm/test/CodeGen/SystemZ/memset-02.ll --- a/llvm/test/CodeGen/SystemZ/memset-02.ll +++ b/llvm/test/CodeGen/SystemZ/memset-02.ll @@ -123,7 +123,8 @@ define void @f13(i8* %dest) { ; CHECK-LABEL: f13: ; CHECK: mvi 0(%r2), 128 -; CHECK: mvc 1(256,%r2), 0(%r2) +; CHECK: mvc 1(255,%r2), 0(%r2) +; CHECK: mvi 256(%r2), 128 ; CHECK: br %r14 call void @llvm.memset.p0i8.i32(i8* %dest, i8 128, i32 257, i1 false) ret void @@ -133,7 +134,8 @@ define void @f14(i8* %dest) { ; CHECK-LABEL: f14: ; CHECK: mvi 0(%r2), 128 -; CHECK: mvc 1(256,%r2), 0(%r2) +; CHECK: mvc 1(255,%r2), 0(%r2) +; CHECK: mvi 256(%r2), 128 ; CHECK: br %r14 call void @llvm.memset.p0i8.i64(i8* %dest, i8 128, i64 257, i1 false) ret void @@ -143,7 +145,8 @@ define void @f15(i8* %dest) { ; CHECK-LABEL: f15: ; CHECK: mvi 0(%r2), 128 -; CHECK: mvc 1(256,%r2), 0(%r2) +; CHECK: mvc 1(255,%r2), 0(%r2) +; CHECK: mvi 256(%r2), 128 ; CHECK: mvc 257(1,%r2), 256(%r2) ; CHECK: br %r14 call void @llvm.memset.p0i8.i32(i8* %dest, i8 128, i32 258, i1 false) @@ -154,7 +157,8 @@ define void @f16(i8* %dest) { ; CHECK-LABEL: f16: ; CHECK: mvi 0(%r2), 128 -; CHECK: mvc 1(256,%r2), 0(%r2) +; CHECK: mvc 1(255,%r2), 0(%r2) +; CHECK: mvi 256(%r2), 128 ; CHECK: mvc 257(1,%r2), 256(%r2) ; CHECK: br %r14 call void @llvm.memset.p0i8.i64(i8* %dest, i8 128, i64 258, i1 false) diff --git a/llvm/test/CodeGen/SystemZ/memset-04.ll b/llvm/test/CodeGen/SystemZ/memset-04.ll --- 
a/llvm/test/CodeGen/SystemZ/memset-04.ll +++ b/llvm/test/CodeGen/SystemZ/memset-04.ll @@ -359,7 +359,8 @@ define void @f37(i8* %dest) { ; CHECK-LABEL: f37: ; CHECK: mvi 0(%r2), 255 -; CHECK: mvc 1(256,%r2), 0(%r2) +; CHECK: mvc 1(255,%r2), 0(%r2) +; CHECK: mvi 256(%r2), 255 ; CHECK: br %r14 call void @llvm.memset.p0i8.i32(i8* %dest, i8 -1, i32 257, i1 false) ret void @@ -369,7 +370,8 @@ define void @f38(i8* %dest) { ; CHECK-LABEL: f38: ; CHECK: mvi 0(%r2), 255 -; CHECK: mvc 1(256,%r2), 0(%r2) +; CHECK: mvc 1(255,%r2), 0(%r2) +; CHECK: mvi 256(%r2), 255 ; CHECK: br %r14 call void @llvm.memset.p0i8.i64(i8* %dest, i8 -1, i64 257, i1 false) ret void @@ -379,7 +381,8 @@ define void @f39(i8* %dest) { ; CHECK-LABEL: f39: ; CHECK: mvi 0(%r2), 255 -; CHECK: mvc 1(256,%r2), 0(%r2) +; CHECK: mvc 1(255,%r2), 0(%r2) +; CHECK: mvi 256(%r2), 255 ; CHECK: mvc 257(1,%r2), 256(%r2) ; CHECK: br %r14 call void @llvm.memset.p0i8.i32(i8* %dest, i8 -1, i32 258, i1 false) @@ -390,7 +393,8 @@ define void @f40(i8* %dest) { ; CHECK-LABEL: f40: ; CHECK: mvi 0(%r2), 255 -; CHECK: mvc 1(256,%r2), 0(%r2) +; CHECK: mvc 1(255,%r2), 0(%r2) +; CHECK: mvi 256(%r2), 255 ; CHECK: mvc 257(1,%r2), 256(%r2) ; CHECK: br %r14 call void @llvm.memset.p0i8.i64(i8* %dest, i8 -1, i64 258, i1 false) diff --git a/llvm/test/CodeGen/SystemZ/memset-07.ll b/llvm/test/CodeGen/SystemZ/memset-07.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/memset-07.ll @@ -0,0 +1,100 @@ +; Test memset in cases where a loop is used. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s + +declare void @llvm.memset.p0i8.i32(i8 *nocapture, i8, i32, i1) nounwind +declare void @llvm.memset.p0i8.i64(i8 *nocapture, i8, i64, i1) nounwind + +; Constant length: 6 iterations and 2 bytes remainder. 
+define void @f1(i8* %dest, i8 %val) { +; CHECK-LABEL: f1: +; CHECK: lghi [[COUNT:%r[0-5]]], 6 +; CHECK: [[LABEL:\.L[^:]*]]: +; CHECK: pfd 2, 768(%r2) +; CHECK: stc %r3, 0(%r2) +; CHECK: mvc 1(255,%r2), 0(%r2) +; CHECK: la %r2, 256(%r2) +; CHECK: brctg [[COUNT]], [[LABEL]] +; CHECK: stc %r3, 0(%r2) +; CHECK-NEXT: mvc 1(1,%r2), 0(%r2) +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 1538, i1 false) + ret void +} + +; Constant length: 6 iterations and 255 bytes remainder. +define void @f2(i8* %dest) { +; CHECK-LABEL: f2: +; CHECK: lghi [[COUNT:%r[0-5]]], 6 +; CHECK: [[LABEL:\.L[^:]*]]: +; CHECK: pfd 2, 768(%r2) +; CHECK: mvi 0(%r2), 1 +; CHECK: mvc 1(255,%r2), 0(%r2) +; CHECK: la %r2, 256(%r2) +; CHECK: brctg [[COUNT]], [[LABEL]] +; CHECK: mvi 0(%r2), 1 +; CHECK-NEXT: mvc 1(254,%r2), 0(%r2) +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i32(i8* %dest, i8 1, i32 1791, i1 false) + ret void +} + +; Variable length, byte in register. +define void @f3(i8* %dest, i8 %val, i64 %Len) { +; CHECK-LABEL: f3: +; CHECK: # %bb.0: +; CHECK-NEXT: aghi %r4, -2 +; CHECK-NEXT: cgibe %r4, -2, 0(%r14) +; CHECK-NEXT: .LBB2_1: +; CHECK-NEXT: cgije %r4, -1, .LBB2_5 +; CHECK-NEXT:# %bb.2: +; CHECK-NEXT: srlg %r0, %r4, 8 +; CHECK-NEXT: cgije %r0, 0, .LBB2_4 +; CHECK-NEXT:.LBB2_3: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: pfd 2, 768(%r2) +; CHECK-NEXT: stc %r3, 0(%r2) +; CHECK-NEXT: mvc 1(255,%r2), 0(%r2) +; CHECK-NEXT: la %r2, 256(%r2) +; CHECK-NEXT: brctg %r0, .LBB2_3 +; CHECK-NEXT:.LBB2_4: +; CHECK-NEXT: stc %r3, 0(%r2) +; CHECK-NEXT: exrl %r4, .Ltmp0 +; CHECK-NEXT: br %r14 +; CHECK-NEXT:.LBB2_5: +; CHECK-NEXT: stc %r3, 0(%r2) +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 %Len, i1 false) + ret void +} + +; Variable length, immediate byte. 
+define void @f4(i8* %dest, i32 %Len) { +; CHECK-LABEL: f4: +; CHECK: # %bb.0: +; CHECK-NEXT: llgfr %r1, %r3 +; CHECK-NEXT: aghi %r1, -2 +; CHECK-NEXT: cgibe %r1, -2, 0(%r14) +; CHECK-NEXT:.LBB3_1: +; CHECK-NEXT: cgije %r1, -1, .LBB3_5 +; CHECK-NEXT:# %bb.2: +; CHECK-NEXT: srlg %r0, %r1, 8 +; CHECK-NEXT: cgije %r0, 0, .LBB3_4 +; CHECK-NEXT:.LBB3_3: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: pfd 2, 768(%r2) +; CHECK-NEXT: mvi 0(%r2), 1 +; CHECK-NEXT: mvc 1(255,%r2), 0(%r2) +; CHECK-NEXT: la %r2, 256(%r2) +; CHECK-NEXT: brctg %r0, .LBB3_3 +; CHECK-NEXT:.LBB3_4: +; CHECK-NEXT: mvi 0(%r2), 1 +; CHECK-NEXT: exrl %r1, .Ltmp0 +; CHECK-NEXT: br %r14 +; CHECK-NEXT:.LBB3_5: +; CHECK-NEXT: mvi 0(%r2), 1 +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i32(i8* %dest, i8 1, i32 %Len, i1 false) + ret void +} + +; CHECK: .Ltmp0: +; CHECK-NEXT: mvc 1(1,%r2), 0(%r2) diff --git a/llvm/test/CodeGen/SystemZ/tail-call-mem-intrinsics.ll b/llvm/test/CodeGen/SystemZ/tail-call-mem-intrinsics.ll --- a/llvm/test/CodeGen/SystemZ/tail-call-mem-intrinsics.ll +++ b/llvm/test/CodeGen/SystemZ/tail-call-mem-intrinsics.ll @@ -12,7 +12,7 @@ ; CHECK: jg memset define void @tail_memset(i8* nocapture %p, i8 %c, i32 %n) #0 { entry: - tail call void @llvm.memset.p0i8.i32(i8* %p, i8 %c, i32 %n, i1 false) + tail call void @llvm.memset.p0i8.i32(i8* %p, i8 %c, i32 %n, i1 true) ret void }