diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -7830,9 +7830,40 @@ MachineOperand SrcBase = earlyUseOperand(MI.getOperand(2)); uint64_t SrcDisp = MI.getOperand(3).getImm(); MachineOperand &LengthMO = MI.getOperand(4); - uint64_t ImmLength = LengthMO.isImm() ? LengthMO.getImm() : 0; - Register LenMinus1Reg = - LengthMO.isReg() ? LengthMO.getReg() : SystemZ::NoRegister; + bool IsImmForm = LengthMO.isImm(); + bool IsRegForm = !IsImmForm; + + bool NeedsLoop = false; + uint64_t ImmLength = 0; + Register LenMinus1Reg = SystemZ::NoRegister; + if (IsImmForm) { + ImmLength = LengthMO.getImm(); + ImmLength++; // Add back the '1' subtracted originally. + if (ImmLength == 0) { + MI.eraseFromParent(); + return MBB; + } + if (Opcode == SystemZ::CLC) { + if (ImmLength > 3 * 256) + // A two-CLC sequence is a clear win over a loop, not least because + // it needs only one branch. A three-CLC sequence needs the same + // number of branches as a loop (i.e. 2), but is shorter. That + // brings us to lengths greater than 768 bytes. It seems relatively + // likely that a difference will be found within the first 768 bytes, + // so we just optimize for the smallest number of branch + // instructions, in order to avoid polluting the prediction buffer + // too much. + NeedsLoop = true; + } else if (ImmLength > 6 * 256) + // The heuristic we use is to prefer loops for anything that would + // require 7 or more MVCs. With these kinds of sizes there isn't much + // to choose between straight-line code and looping code, since the + // time will be dominated by the MVCs themselves. + NeedsLoop = true; + } else { + NeedsLoop = true; + LenMinus1Reg = LengthMO.getReg(); + } // When generating more than one CLC, all but the last will need to // branch to the end when a difference is found. @@ -7840,16 +7871,25 @@ ? 
SystemZ::splitBlockAfter(MI, MBB) : nullptr); - // Check for the loop form, in which operand 5 is the trip count. - if (MI.getNumExplicitOperands() > 5) { - Register StartCountReg = MI.getOperand(5).getReg(); - bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase); + if (NeedsLoop) { + Register StartCountReg = + MRI.createVirtualRegister(&SystemZ::GR64BitRegClass); + if (IsImmForm) { + TII->loadImmediate(*MBB, MI, StartCountReg, ImmLength / 256); + ImmLength &= 255; + } else { + BuildMI(*MBB, MI, DL, TII->get(SystemZ::SRLG), StartCountReg) + .addReg(LenMinus1Reg) + .addReg(0) + .addImm(8); + } auto loadZeroAddress = [&]() -> MachineOperand { Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); BuildMI(*MBB, MI, DL, TII->get(SystemZ::LGHI), Reg).addImm(0); return MachineOperand::CreateReg(Reg, false); }; + bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase); if (DestBase.isReg() && DestBase.getReg() == SystemZ::NoRegister) DestBase = loadZeroAddress(); if (SrcBase.isReg() && SrcBase.getReg() == SystemZ::NoRegister) @@ -7876,7 +7916,7 @@ Register ThisCountReg = MRI.createVirtualRegister(RC); Register NextCountReg = MRI.createVirtualRegister(RC); - if (LengthMO.isReg()) { + if (IsRegForm) { AllDoneMBB = SystemZ::splitBlockBefore(MI, MBB); StartMBB = SystemZ::emitBlockAfter(MBB); LoopMBB = SystemZ::emitBlockAfter(StartMBB); @@ -7916,7 +7956,6 @@ DestBase = MachineOperand::CreateReg(NextDestReg, false); SrcBase = MachineOperand::CreateReg(NextSrcReg, false); - ImmLength &= 255; if (EndMBB && !ImmLength) // If the loop handled the whole CLC range, DoneMBB will be empty with // CC live-through into EndMBB, so add it as live-in. @@ -7987,7 +8026,7 @@ MBB->addSuccessor(DoneMBB); MBB = DoneMBB; - if (LengthMO.isReg()) { + if (IsRegForm) { // DoneMBB: // # Make PHIs for RemDestReg/RemSrcReg as the loop may or may not run. // # Use EXecute Relative Long for the remainder of the bytes. 
The target @@ -8005,7 +8044,6 @@ BuildMI(MBB, DL, TII->get(SystemZ::PHI), RemSrcReg) .addReg(StartSrcReg).addMBB(StartMBB) .addReg(NextSrcReg).addMBB(LoopMBB); - MRI.constrainRegClass(LenMinus1Reg, &SystemZ::ADDR64BitRegClass); BuildMI(MBB, DL, TII->get(SystemZ::EXRL_Pseudo)) .addImm(Opcode) .addReg(LenMinus1Reg) @@ -8530,21 +8568,16 @@ case SystemZ::ATOMIC_CMP_SWAPW: return emitAtomicCmpSwapW(MI, MBB); - case SystemZ::MVCSequence: - case SystemZ::MVCLoop: + case SystemZ::MVCImm: return emitMemMemWrapper(MI, MBB, SystemZ::MVC); - case SystemZ::NCSequence: - case SystemZ::NCLoop: + case SystemZ::NCImm: return emitMemMemWrapper(MI, MBB, SystemZ::NC); - case SystemZ::OCSequence: - case SystemZ::OCLoop: + case SystemZ::OCImm: return emitMemMemWrapper(MI, MBB, SystemZ::OC); - case SystemZ::XCSequence: - case SystemZ::XCLoop: - case SystemZ::XCLoopVarLen: + case SystemZ::XCImm: + case SystemZ::XCReg: return emitMemMemWrapper(MI, MBB, SystemZ::XC); - case SystemZ::CLCSequence: - case SystemZ::CLCLoop: + case SystemZ::CLCImm: return emitMemMemWrapper(MI, MBB, SystemZ::CLC); case SystemZ::CLSTLoop: return emitStringWrapper(MI, MBB, SystemZ::CLST); diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFP.td b/llvm/lib/Target/SystemZ/SystemZInstrFP.td --- a/llvm/lib/Target/SystemZ/SystemZInstrFP.td +++ b/llvm/lib/Target/SystemZ/SystemZInstrFP.td @@ -128,9 +128,10 @@ (EXTRACT_SUBREG FP128:$src2, subreg_h64))>; } -defm LoadStoreF32 : MVCLoadStore; -defm LoadStoreF64 : MVCLoadStore; -defm LoadStoreF128 : MVCLoadStore; +// The length is given as one less for MVCImm. 
+defm LoadStoreF32 : MVCLoadStore; +defm LoadStoreF64 : MVCLoadStore; +defm LoadStoreF128 : MVCLoadStore; //===----------------------------------------------------------------------===// // Load instructions diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td --- a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td +++ b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td @@ -5329,42 +5329,33 @@ // Define an instruction that operates on two fixed-length blocks of memory, // and associated pseudo instructions for operating on blocks of any size. -// The Sequence form uses a straight-line sequence of instructions and -// the Loop form uses a loop of length-256 instructions followed by -// another instruction to handle the excess. -// The LoopVarLen form is for a loop with a non-constant length parameter. -multiclass MemorySS opcode, - SDPatternOperator sequence, SDPatternOperator loop> { +// There are two pseudos for the different cases of when the length is +// constant or variable. The length operand of a pseudo is actually one less +// than the intended number of bytes, since the register case needs to use an +// EXRL with a target instruction that adds one to the length always. 
+multiclass MemorySS opcode, SDPatternOperator memop> { def "" : SideEffectBinarySSa; let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Defs = [CC] in { - def Sequence : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src, - imm64:$length), - [(sequence bdaddr12only:$dest, bdaddr12only:$src, - imm64:$length)]>; - def Loop : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src, - imm64:$length, GR64:$count256), - [(loop bdaddr12only:$dest, bdaddr12only:$src, - imm64:$length, GR64:$count256)]>; - def LoopVarLen : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src, - GR64:$length, GR64:$count256), - [(loop bdaddr12only:$dest, bdaddr12only:$src, - GR64:$length, GR64:$count256)]>; + def Imm : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src, + imm64:$length), + [(memop bdaddr12only:$dest, bdaddr12only:$src, + imm64:$length)]>; + def Reg : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src, + ADDR64:$length), + [(memop bdaddr12only:$dest, bdaddr12only:$src, + ADDR64:$length)]>; } } // The same, but setting a CC result as comparison operator. 
multiclass CompareMemorySS opcode, - SDPatternOperator sequence, SDPatternOperator loop> { + SDPatternOperator memop> { def "" : SideEffectBinarySSa; let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in { - def Sequence : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src, - imm64:$length), - [(set CC, (sequence bdaddr12only:$dest, bdaddr12only:$src, - imm64:$length))]>; - def Loop : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src, - imm64:$length, GR64:$count256), - [(set CC, (loop bdaddr12only:$dest, bdaddr12only:$src, - imm64:$length, GR64:$count256))]>; + def Imm : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src, + imm64:$length), + [(set CC, (memop bdaddr12only:$dest, bdaddr12only:$src, + imm64:$length))]>; } } diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td @@ -503,7 +503,7 @@ // Memory-to-memory moves. let mayLoad = 1, mayStore = 1 in - defm MVC : MemorySS<"mvc", 0xD2, z_mvc, z_mvc_loop>; + defm MVC : MemorySS<"mvc", 0xD2, z_mvc>; let mayLoad = 1, mayStore = 1, Defs = [CC] in { def MVCL : SideEffectBinaryMemMemRR<"mvcl", 0x0E, GR128, GR128>; def MVCLE : SideEffectTernaryMemMemRS<"mvcle", 0xA8, GR128, GR128>; @@ -1200,7 +1200,7 @@ // Block AND. let mayLoad = 1, mayStore = 1 in - defm NC : MemorySS<"nc", 0xD4, z_nc, z_nc_loop>; + defm NC : MemorySS<"nc", 0xD4, z_nc>; } defm : RMWIByte; defm : RMWIByte; @@ -1257,7 +1257,7 @@ // Block OR. let mayLoad = 1, mayStore = 1 in - defm OC : MemorySS<"oc", 0xD6, z_oc, z_oc_loop>; + defm OC : MemorySS<"oc", 0xD6, z_oc>; } defm : RMWIByte; defm : RMWIByte; @@ -1297,7 +1297,7 @@ // Block XOR. let mayLoad = 1, mayStore = 1 in - defm XC : MemorySS<"xc", 0xD7, z_xc, z_xc_loop>; + defm XC : MemorySS<"xc", 0xD7, z_xc>; } defm : RMWIByte; defm : RMWIByte; @@ -1624,7 +1624,7 @@ // Memory-to-memory comparison. 
let mayLoad = 1, Defs = [CC] in { - defm CLC : CompareMemorySS<"clc", 0xD5, z_clc, z_clc_loop>; + defm CLC : CompareMemorySS<"clc", 0xD5, z_clc>; def CLCL : SideEffectBinaryMemMemRR<"clcl", 0x0F, GR128, GR128>; def CLCLE : SideEffectTernaryMemMemRS<"clcle", 0xA9, GR128, GR128>; def CLCLU : SideEffectTernaryMemMemRSY<"clclu", 0xEB8F, GR128, GR128>; @@ -2355,21 +2355,15 @@ (RLLG GR64:$val, (LCR GR32:$shift), 0)>; } -// Peepholes for turning scalar operations into block operations. -defm : BlockLoadStore; -defm : BlockLoadStore; -defm : BlockLoadStore; -defm : BlockLoadStore; -defm : BlockLoadStore; -defm : BlockLoadStore; -defm : BlockLoadStore; +// Peepholes for turning scalar operations into block operations. The length +// is given as one less for these pseudos. +defm : BlockLoadStore; +defm : BlockLoadStore; +defm : BlockLoadStore; +defm : BlockLoadStore; +defm : BlockLoadStore; +defm : BlockLoadStore; +defm : BlockLoadStore; //===----------------------------------------------------------------------===// // Mnemonic Aliases diff --git a/llvm/lib/Target/SystemZ/SystemZOperators.td b/llvm/lib/Target/SystemZ/SystemZOperators.td --- a/llvm/lib/Target/SystemZ/SystemZOperators.td +++ b/llvm/lib/Target/SystemZ/SystemZOperators.td @@ -102,17 +102,6 @@ SDTCisPtrTy<1>, SDTCisPtrTy<2>, SDTCisVT<3, i64>]>; -def SDT_ZMemMemLoop : SDTypeProfile<0, 4, - [SDTCisPtrTy<0>, - SDTCisPtrTy<1>, - SDTCisVT<2, i64>, - SDTCisVT<3, i64>]>; -def SDT_ZMemMemLoopCC : SDTypeProfile<1, 4, - [SDTCisVT<0, i32>, - SDTCisPtrTy<1>, - SDTCisPtrTy<2>, - SDTCisVT<3, i64>, - SDTCisVT<4, i64>]>; def SDT_ZString : SDTypeProfile<1, 3, [SDTCisPtrTy<0>, SDTCisPtrTy<1>, @@ -416,24 +405,14 @@ def z_mvc : SDNode<"SystemZISD::MVC", SDT_ZMemMemLength, [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; -def z_mvc_loop : SDNode<"SystemZISD::MVC_LOOP", SDT_ZMemMemLoop, - [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; def z_nc : SDNode<"SystemZISD::NC", SDT_ZMemMemLength, [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; -def 
z_nc_loop : SDNode<"SystemZISD::NC_LOOP", SDT_ZMemMemLoop, - [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; def z_oc : SDNode<"SystemZISD::OC", SDT_ZMemMemLength, [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; -def z_oc_loop : SDNode<"SystemZISD::OC_LOOP", SDT_ZMemMemLoop, - [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; def z_xc : SDNode<"SystemZISD::XC", SDT_ZMemMemLength, [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; -def z_xc_loop : SDNode<"SystemZISD::XC_LOOP", SDT_ZMemMemLoop, - [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; def z_clc : SDNode<"SystemZISD::CLC", SDT_ZMemMemLengthCC, [SDNPHasChain, SDNPMayLoad]>; -def z_clc_loop : SDNode<"SystemZISD::CLC_LOOP", SDT_ZMemMemLoopCC, - [SDNPHasChain, SDNPMayLoad]>; def z_strcmp : SDNode<"SystemZISD::STRCMP", SDT_ZStringCC, [SDNPHasChain, SDNPMayLoad]>; def z_stpcpy : SDNode<"SystemZISD::STPCPY", SDT_ZString, diff --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp --- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp @@ -17,32 +17,34 @@ #define DEBUG_TYPE "systemz-selectiondag-info" -// Decide whether it is best to use a loop or straight-line code for -// a block operation of Size bytes with source address Src and destination -// address Dest. Sequence is the opcode to use for straight-line code -// (such as MVC) and Loop is the opcode to use for loops (such as MVC_LOOP). -// Return the chain for the completed operation. -static SDValue emitMemMem(SelectionDAG &DAG, const SDLoc &DL, unsigned Sequence, - unsigned Loop, SDValue Chain, SDValue Dst, - SDValue Src, uint64_t Size) { - EVT PtrVT = Src.getValueType(); - // The heuristic we use is to prefer loops for anything that would - // require 7 or more MVCs. With these kinds of sizes there isn't - // much to choose between straight-line code and looping code, - // since the time will be dominated by the MVCs themselves. 
- // However, the loop has 4 or 5 instructions (depending on whether - // the base addresses can be proved equal), so there doesn't seem - // much point using a loop for 5 * 256 bytes or fewer. Anything in - // the range (5 * 256, 6 * 256) will need another instruction after - // the loop, so it doesn't seem worth using a loop then either. - // The next value up, 6 * 256, can be implemented in the same - // number of straight-line MVCs as 6 * 256 - 1. - if (Size > 6 * 256) - return DAG.getNode(Loop, DL, MVT::Other, Chain, Dst, Src, - DAG.getConstant(Size, DL, PtrVT), - DAG.getConstant(Size / 256, DL, PtrVT)); - return DAG.getNode(Sequence, DL, MVT::Other, Chain, Dst, Src, - DAG.getConstant(Size, DL, PtrVT)); +// Emit a mem-mem operation after subtracting one from size, which will be +// added back during pseudo expansion. As the Reg case emitted here may be +// converted by DAGCombiner into having an Imm length, they are both emitted +// the same way. +static SDValue emitMemMemImm(SelectionDAG &DAG, const SDLoc &DL, unsigned Op, + SDValue Chain, SDValue Dst, SDValue Src, + uint64_t Size) { + return DAG.getNode(Op, DL, MVT::Other, Chain, Dst, Src, + DAG.getConstant(Size - 1, DL, Src.getValueType())); +} + +static SDValue emitMemMemReg(SelectionDAG &DAG, const SDLoc &DL, unsigned Op, + SDValue Chain, SDValue Dst, SDValue Src, + SDValue Size) { + SDValue LenMinus1 = DAG.getNode(ISD::ADD, DL, MVT::i64, + DAG.getZExtOrTrunc(Size, DL, MVT::i64), + DAG.getConstant(-1, DL, MVT::i64)); + return DAG.getNode(Op, DL, MVT::Other, Chain, Dst, Src, LenMinus1); +} + +// Use CLC to compare [Src1, Src1 + Size) with [Src2, Src2 + Size). +// One is subtracted from size also here, per above. 
+static SDValue emitCLC(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, + SDValue Src1, SDValue Src2, uint64_t Size) { + SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other); + EVT PtrVT = Src1.getValueType(); + return DAG.getNode(SystemZISD::CLC, DL, VTs, Chain, Src1, Src2, + DAG.getConstant(Size - 1, DL, PtrVT)); } SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemcpy( @@ -53,8 +55,8 @@ return SDValue(); if (auto *CSize = dyn_cast(Size)) - return emitMemMem(DAG, DL, SystemZISD::MVC, SystemZISD::MVC_LOOP, - Chain, Dst, Src, CSize->getZExtValue()); + return emitMemMemImm(DAG, DL, SystemZISD::MVC, Chain, Dst, Src, + CSize->getZExtValue()); return SDValue(); } @@ -127,52 +129,23 @@ // Handle the special case of a memset of 0, which can use XC. if (CByte && CByte->getZExtValue() == 0) - return emitMemMem(DAG, DL, SystemZISD::XC, SystemZISD::XC_LOOP, - Chain, Dst, Dst, Bytes); + return emitMemMemImm(DAG, DL, SystemZISD::XC, Chain, Dst, Dst, Bytes); // Copy the byte to the first location and then use MVC to copy // it to the rest. Chain = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo, Alignment); SDValue DstPlus1 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst, DAG.getConstant(1, DL, PtrVT)); - return emitMemMem(DAG, DL, SystemZISD::MVC, SystemZISD::MVC_LOOP, - Chain, DstPlus1, Dst, Bytes - 1); + return emitMemMemImm(DAG, DL, SystemZISD::MVC, Chain, DstPlus1, Dst, + Bytes - 1); } // Variable length - if (CByte && CByte->getZExtValue() == 0) { + if (CByte && CByte->getZExtValue() == 0) // Handle the special case of a variable length memset of 0 with XC. 
- SDValue LenMinus1 = DAG.getNode(ISD::ADD, DL, MVT::i64, - DAG.getZExtOrTrunc(Size, DL, MVT::i64), - DAG.getConstant(-1, DL, MVT::i64)); - SDValue TripC = DAG.getNode(ISD::SRL, DL, MVT::i64, LenMinus1, - DAG.getConstant(8, DL, MVT::i64)); - return DAG.getNode(SystemZISD::XC_LOOP, DL, MVT::Other, Chain, Dst, Dst, - LenMinus1, TripC); - } - return SDValue(); -} + return emitMemMemReg(DAG, DL, SystemZISD::XC, Chain, Dst, Dst, Size); -// Use CLC to compare [Src1, Src1 + Size) with [Src2, Src2 + Size), -// deciding whether to use a loop or straight-line code. -static SDValue emitCLC(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, - SDValue Src1, SDValue Src2, uint64_t Size) { - SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other); - EVT PtrVT = Src1.getValueType(); - // A two-CLC sequence is a clear win over a loop, not least because it - // needs only one branch. A three-CLC sequence needs the same number - // of branches as a loop (i.e. 2), but is shorter. That brings us to - // lengths greater than 768 bytes. It seems relatively likely that - // a difference will be found within the first 768 bytes, so we just - // optimize for the smallest number of branch instructions, in order - // to avoid polluting the prediction buffer too much. A loop only ever - // needs 2 branches, whereas a straight-line sequence would need 3 or more. 
- if (Size > 3 * 256) - return DAG.getNode(SystemZISD::CLC_LOOP, DL, VTs, Chain, Src1, Src2, - DAG.getConstant(Size, DL, PtrVT), - DAG.getConstant(Size / 256, DL, PtrVT)); - return DAG.getNode(SystemZISD::CLC, DL, VTs, Chain, Src1, Src2, - DAG.getConstant(Size, DL, PtrVT)); + return SDValue(); } // Convert the current CC value into an integer that is 0 if CC == 0, diff --git a/llvm/test/CodeGen/SystemZ/memset-05.ll b/llvm/test/CodeGen/SystemZ/memset-05.ll --- a/llvm/test/CodeGen/SystemZ/memset-05.ll +++ b/llvm/test/CodeGen/SystemZ/memset-05.ll @@ -48,37 +48,37 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: llgfr %r1, %r3 ; CHECK-NEXT: aghi %r1, -1 -; CHECK-NEXT: srlg %r0, %r1, 8 ; CHECK-NEXT: cgije %r1, -1, .LBB2_5 ; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: srlg %r0, %r1, 8 ; CHECK-NEXT: lgr %r3, %r2 ; CHECK-NEXT: cgije %r0, 0, .LBB2_4 ; CHECK-NEXT: # %bb.2: ; CHECK-NEXT: lgr %r3, %r2 -; CHECK-NEXT: lgr %r4, %r0 ; CHECK-NEXT: .LBB2_3: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: xc 0(256,%r3), 0(%r3) ; CHECK-NEXT: la %r3, 256(%r3) -; CHECK-NEXT: brctg %r4, .LBB2_3 +; CHECK-NEXT: brctg %r0, .LBB2_3 ; CHECK-NEXT: .LBB2_4: ; CHECK-NEXT: exrl %r1, .Ltmp1 ; CHECK-NEXT: .LBB2_5: ; CHECK-NEXT: cgije %r1, -1, .LBB2_10 ; CHECK-NEXT: # %bb.6: +; CHECK-NEXT: srlg %r0, %r1, 8 ; CHECK-NEXT: lgr %r3, %r2 ; CHECK-NEXT: cgije %r0, 0, .LBB2_9 ; CHECK-NEXT: # %bb.7: ; CHECK-NEXT: lgr %r3, %r2 -; CHECK-NEXT: lgr %r4, %r0 ; CHECK-NEXT: .LBB2_8: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: xc 0(256,%r3), 0(%r3) ; CHECK-NEXT: la %r3, 256(%r3) -; CHECK-NEXT: brctg %r4, .LBB2_8 +; CHECK-NEXT: brctg %r0, .LBB2_8 ; CHECK-NEXT: .LBB2_9: ; CHECK-NEXT: exrl %r1, .Ltmp1 ; CHECK-NEXT: .LBB2_10: ; CHECK-NEXT: cgibe %r1, -1, 0(%r14) ; CHECK-NEXT: .LBB2_11: +; CHECK-NEXT: srlg %r0, %r1, 8 ; CHECK-NEXT: cgije %r0, 0, .LBB2_13 ; CHECK-NEXT: .LBB2_12: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: xc 0(256,%r2), 0(%r2) @@ -114,6 +114,135 @@ ret void } +; Test that a memset with a length argument that 
DAGCombiner will convert +; into a constant gets the correct number of bytes set. +@Data = external hidden constant [1024 x i8], align 2 +define void @fun4() { +; CHECK-LABEL: fun4: +; CHECK: # %bb.0: +; CHECK-NEXT: larl %r1, Data +; CHECK-NEXT: xc 35(256,%r1), 35(%r1) +; CHECK-NEXT: xc 291(256,%r1), 291(%r1) +; CHECK-NEXT: xc 547(256,%r1), 547(%r1) +; CHECK-NEXT: xc 803(221,%r1), 803(%r1) +; CHECK-NEXT: mvghi 0(%r1), 989 +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64( + i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @Data, i64 0, i64 35), + i8 0, + i64 sub (i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8], + [1024 x i8]* @Data, i64 1, i64 0) to i64), i64 1), + i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8], + [1024 x i8]* @Data, i64 0, i64 35) to i64), i64 1)), + i1 false) + %i11 = getelementptr i8, i8* null, + i64 sub (i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8], + [1024 x i8]* @Data, i64 1, i64 0) to i64), i64 1), + i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8], + [1024 x i8]* @Data, i64 0, i64 35) to i64), i64 1)) + store i8* %i11, i8** undef, align 8 + ret void +} + +; The same, with a resulting constant length of 0.
+define void @fun5() { +; CHECK-LABEL: fun5: +; CHECK: # %bb.0: +; CHECK-NEXT: mvghi 0(%r1), 0 +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64( + i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @Data, i64 0, i64 35), + i8 0, + i64 sub (i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8], + [1024 x i8]* @Data, i64 1, i64 35) to i64), i64 1), + i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8], + [1024 x i8]* @Data, i64 1, i64 35) to i64), i64 1)), + i1 false) + %i11 = getelementptr i8, i8* null, + i64 sub (i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8], + [1024 x i8]* @Data, i64 1, i64 35) to i64), i64 1), + i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8], + [1024 x i8]* @Data, i64 1, i64 35) to i64), i64 1)) + store i8* %i11, i8** undef, align 8 + ret void +} + +; The same, with a resulting constant length of 1. +define void @fun6() { +; CHECK-LABEL: fun6: +; CHECK: # %bb.0: +; CHECK-NEXT: larl %r1, Data +; CHECK-NEXT: xc 35(1,%r1), 35(%r1) +; CHECK-NEXT: mvghi 0(%r1), 1 +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64( + i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @Data, i64 0, i64 35), + i8 0, + i64 sub (i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8], + [1024 x i8]* @Data, i64 1, i64 36) to i64), i64 1), + i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8], + [1024 x i8]* @Data, i64 1, i64 35) to i64), i64 1)), + i1 false) + %i11 = getelementptr i8, i8* null, + i64 sub (i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8], + [1024 x i8]* @Data, i64 1, i64 36) to i64), i64 1), + i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8], + [1024 x i8]* @Data, i64 1, i64 35) to i64), i64 1)) + store i8* %i11, i8** undef, align 8 + ret void +} + +; The same, with a resulting constant length of 256. 
+define void @fun7() { +; CHECK-LABEL: fun7: +; CHECK: # %bb.0: +; CHECK-NEXT: larl %r1, Data +; CHECK-NEXT: xc 35(256,%r1), 35(%r1) +; CHECK-NEXT: mvghi 0(%r1), 256 +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64( + i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @Data, i64 0, i64 35), + i8 0, + i64 sub (i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8], + [1024 x i8]* @Data, i64 1, i64 291) to i64), i64 1), + i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8], + [1024 x i8]* @Data, i64 1, i64 35) to i64), i64 1)), + i1 false) + %i11 = getelementptr i8, i8* null, + i64 sub (i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8], + [1024 x i8]* @Data, i64 1, i64 291) to i64), i64 1), + i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8], + [1024 x i8]* @Data, i64 1, i64 35) to i64), i64 1)) + store i8* %i11, i8** undef, align 8 + ret void +} + +; The same, with a resulting constant length of 257. +define void @fun8() { +; CHECK-LABEL: fun8: +; CHECK: # %bb.0: +; CHECK-NEXT: larl %r1, Data +; CHECK-NEXT: xc 35(256,%r1), 35(%r1) +; CHECK-NEXT: xc 291(1,%r1), 291(%r1) +; CHECK-NEXT: mvghi 0(%r1), 257 +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64( + i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @Data, i64 0, i64 35), + i8 0, + i64 sub (i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8], + [1024 x i8]* @Data, i64 1, i64 292) to i64), i64 1), + i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8], + [1024 x i8]* @Data, i64 1, i64 35) to i64), i64 1)), + i1 false) + %i11 = getelementptr i8, i8* null, + i64 sub (i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8], + [1024 x i8]* @Data, i64 1, i64 292) to i64), i64 1), + i64 add (i64 ptrtoint (i8* getelementptr inbounds ([1024 x i8], + [1024 x i8]* @Data, i64 1, i64 35) to i64), i64 1)) + store i8* %i11, i8** undef, align 8 + ret void +} + ; CHECK: .Ltmp2: ; CHECK-NEXT: xc 0(1,%r1), 0(%r1) ; CHECK-NEXT: .Ltmp0: 
diff --git a/llvm/test/CodeGen/SystemZ/mverify-optypes.mir b/llvm/test/CodeGen/SystemZ/mverify-optypes.mir --- a/llvm/test/CodeGen/SystemZ/mverify-optypes.mir +++ b/llvm/test/CodeGen/SystemZ/mverify-optypes.mir @@ -31,8 +31,8 @@ %4:addr64bit = LARL @gsrc %4:addr64bit = LARL $r2l - MVCLoop %4, 0, %3, 0, 1680, %0, implicit-def $cc - MVCLoop %4, 0, %3, 0, %1, %0, implicit-def $cc + MVCImm %4, 0, %3, 0, 1680, implicit-def $cc + MVCImm %4, 0, %3, 0, %1, implicit-def $cc BCR 0, 0, $r2d, implicit $cc BCR 0, $r2d, $r2d, implicit $cc @@ -63,7 +63,7 @@ # CHECK: - operand 1: $r2l # CHECK: *** Bad machine code: Expected a non-register operand. *** -# CHECK: - instruction: MVCLoop %4:addr64bit, 0, %3:addr64bit, 0, %1:addr64bit, %0:gr64bit, implicit-def $cc +# CHECK: - instruction: MVCImm %4:addr64bit, 0, %3:addr64bit, 0, %1:addr64bit, implicit-def $cc # CHECK: - operand 4: %1:addr64bit # CHECK: *** Bad machine code: Expected a non-register operand. ***