diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h
--- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h
+++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h
@@ -9,10 +9,11 @@
 #ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZASMPRINTER_H
 #define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZASMPRINTER_H
 
-#include "SystemZTargetMachine.h"
 #include "SystemZMCInstLower.h"
+#include "SystemZTargetMachine.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/StackMaps.h"
+#include "llvm/MC/MCInstBuilder.h"
 #include "llvm/Support/Compiler.h"
 
 namespace llvm {
@@ -26,6 +27,42 @@
 private:
   StackMaps SM;
 
+  // An EXRL target instruction paired with the subtarget used to emit it.
+  typedef std::pair<MCInst, const MCSubtargetInfo *> MCInstSTIPair;
+  // Lexicographic ordering over EXRL targets, used as the map comparator
+  // below. Keys are ordered by subtarget pointer, then opcode, then operands
+  // 0, 1, 3 and 4. Operand 2 (the length field) is asserted to be 1 on both
+  // sides, so it takes no part in the comparison.
+  // NOTE(review): ordering by pointer value means iteration order can depend
+  // on allocation addresses when several subtargets are present - confirm
+  // that the emission order of the targets does not need to be deterministic.
+  struct CmpMCInst {
+    bool operator()(const MCInstSTIPair &MCI_STI_A,
+                    const MCInstSTIPair &MCI_STI_B) const {
+      if (MCI_STI_A.second != MCI_STI_B.second)
+        return uintptr_t(MCI_STI_A.second) < uintptr_t(MCI_STI_B.second);
+      const MCInst &A = MCI_STI_A.first;
+      const MCInst &B = MCI_STI_B.first;
+      assert(A.getNumOperands() == B.getNumOperands() &&
+             A.getNumOperands() == 5 && A.getOperand(2).getImm() == 1 &&
+             B.getOperand(2).getImm() == 1 && "Unexpected EXRL target MCInst");
+      if (A.getOpcode() != B.getOpcode())
+        return A.getOpcode() < B.getOpcode();
+      if (A.getOperand(0).getReg() != B.getOperand(0).getReg())
+        return A.getOperand(0).getReg() < B.getOperand(0).getReg();
+      if (A.getOperand(1).getImm() != B.getOperand(1).getImm())
+        return A.getOperand(1).getImm() < B.getOperand(1).getImm();
+      if (A.getOperand(3).getReg() != B.getOperand(3).getReg())
+        return A.getOperand(3).getReg() < B.getOperand(3).getReg();
+      if (A.getOperand(4).getImm() != B.getOperand(4).getImm())
+        return A.getOperand(4).getImm() < B.getOperand(4).getImm();
+      return false;
+    }
+  };
+  // Maps each distinct EXRL target to the temporary symbol that labels it.
+  typedef std::map<MCInstSTIPair, MCSymbol *, CmpMCInst> EXRLT2SymMap;
+  EXRLT2SymMap EXRLTargets2Sym;
+
 public:
   SystemZAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
       : AsmPrinter(TM, std::move(Streamer)), SM(*this) {}
@@ -49,6 +86,7 @@
   void LowerFENTRY_CALL(const MachineInstr &MI, 
SystemZMCInstLower &MCIL); void LowerSTACKMAP(const MachineInstr &MI); void LowerPATCHPOINT(const MachineInstr &MI, SystemZMCInstLower &Lower); + void emitEXRLTargetInstructions(); }; } // end namespace llvm diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp --- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp +++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp @@ -541,6 +541,30 @@ LowerPATCHPOINT(*MI, Lower); return; + case SystemZ::EXRL_Pseudo: { + unsigned TargetInsOpc = MI->getOperand(0).getImm(); + Register LenMinus1Reg = MI->getOperand(1).getReg(); + Register DestReg = MI->getOperand(2).getReg(); + int64_t DestDisp = MI->getOperand(3).getImm(); + Register SrcReg = MI->getOperand(4).getReg(); + int64_t SrcDisp = MI->getOperand(5).getImm(); + + MCSymbol *DotSym = nullptr; + MCInst ET = MCInstBuilder(TargetInsOpc).addReg(DestReg) + .addImm(DestDisp).addImm(1).addReg(SrcReg).addImm(SrcDisp); + MCInstSTIPair ET_STI(ET, &MF->getSubtarget()); + EXRLT2SymMap::iterator I = EXRLTargets2Sym.find(ET_STI); + if (I != EXRLTargets2Sym.end()) + DotSym = I->second; + else + EXRLTargets2Sym[ET_STI] = DotSym = OutContext.createTempSymbol(); + const MCSymbolRefExpr *Dot = MCSymbolRefExpr::create(DotSym, OutContext); + EmitToStreamer( + *OutStreamer, + MCInstBuilder(SystemZ::EXRL).addReg(LenMinus1Reg).addExpr(Dot)); + return; + } + default: Lower.lower(MI, LoweredMI); break; @@ -698,6 +722,19 @@ getSubtargetInfo()); } +void SystemZAsmPrinter::emitEXRLTargetInstructions() { + if (EXRLTargets2Sym.empty()) + return; + // Switch to the .text section. 
+ OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); + for (auto &I : EXRLTargets2Sym) { + OutStreamer->emitLabel(I.second); + const MCInstSTIPair &MCI_STI = I.first; + OutStreamer->emitInstruction(MCI_STI.first, *MCI_STI.second); + } + EXRLTargets2Sym.clear(); +} + // Convert a SystemZ-specific constant pool modifier into the associated // MCSymbolRefExpr variant kind. static MCSymbolRefExpr::VariantKind @@ -746,6 +783,7 @@ } void SystemZAsmPrinter::emitEndOfAsmFile(Module &M) { + emitEXRLTargetInstructions(); emitStackMaps(SM); } diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -7795,43 +7795,89 @@ uint64_t DestDisp = MI.getOperand(1).getImm(); MachineOperand SrcBase = earlyUseOperand(MI.getOperand(2)); uint64_t SrcDisp = MI.getOperand(3).getImm(); - uint64_t Length = MI.getOperand(4).getImm(); + MachineOperand &LengthMO = MI.getOperand(4); + uint64_t ImmLength = LengthMO.isImm() ? LengthMO.getImm() : 0; + Register LenMinus1Reg = + LengthMO.isReg() ? LengthMO.getReg() : SystemZ::NoRegister; // When generating more than one CLC, all but the last will need to // branch to the end when a difference is found. - MachineBasicBlock *EndMBB = (Length > 256 && Opcode == SystemZ::CLC ? - SystemZ::splitBlockAfter(MI, MBB) : nullptr); + MachineBasicBlock *EndMBB = (ImmLength > 256 && Opcode == SystemZ::CLC + ? SystemZ::splitBlockAfter(MI, MBB) + : nullptr); // Check for the loop form, in which operand 5 is the trip count. if (MI.getNumExplicitOperands() > 5) { - bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase); - Register StartCountReg = MI.getOperand(5).getReg(); - Register StartSrcReg = forceReg(MI, SrcBase, TII); - Register StartDestReg = (HaveSingleBase ? 
StartSrcReg : - forceReg(MI, DestBase, TII)); + + MachineBasicBlock *StartMBB = nullptr; + MachineBasicBlock *LoopMBB = nullptr; + MachineBasicBlock *NextMBB = nullptr; + MachineBasicBlock *DoneMBB = nullptr; + MachineBasicBlock *AllDoneMBB = nullptr; + + bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase); + Register StartSrcReg = forceReg(MI, SrcBase, TII); + Register StartDestReg = + (HaveSingleBase ? StartSrcReg : forceReg(MI, DestBase, TII)); const TargetRegisterClass *RC = &SystemZ::ADDR64BitRegClass; Register ThisSrcReg = MRI.createVirtualRegister(RC); - Register ThisDestReg = (HaveSingleBase ? ThisSrcReg : - MRI.createVirtualRegister(RC)); + Register ThisDestReg = + (HaveSingleBase ? ThisSrcReg : MRI.createVirtualRegister(RC)); Register NextSrcReg = MRI.createVirtualRegister(RC); - Register NextDestReg = (HaveSingleBase ? NextSrcReg : - MRI.createVirtualRegister(RC)); - + Register NextDestReg = + (HaveSingleBase ? NextSrcReg : MRI.createVirtualRegister(RC)); RC = &SystemZ::GR64BitRegClass; Register ThisCountReg = MRI.createVirtualRegister(RC); Register NextCountReg = MRI.createVirtualRegister(RC); - MachineBasicBlock *StartMBB = MBB; - MachineBasicBlock *DoneMBB = SystemZ::splitBlockBefore(MI, MBB); - MachineBasicBlock *LoopMBB = SystemZ::emitBlockAfter(StartMBB); - MachineBasicBlock *NextMBB = - (EndMBB ? SystemZ::emitBlockAfter(LoopMBB) : LoopMBB); - - // StartMBB: - // # fall through to LoopMMB - MBB->addSuccessor(LoopMBB); + if (LengthMO.isReg()) { + AllDoneMBB = SystemZ::splitBlockBefore(MI, MBB); + StartMBB = SystemZ::emitBlockAfter(MBB); + LoopMBB = SystemZ::emitBlockAfter(StartMBB); + NextMBB = LoopMBB; + DoneMBB = SystemZ::emitBlockAfter(LoopMBB); + + // MBB: + // # Jump to AllDoneMBB if LenMinus1Reg is -1, or fall thru to StartMBB. 
+ BuildMI(MBB, DL, TII->get(SystemZ::CGHI)) + .addReg(LenMinus1Reg).addImm(-1); + BuildMI(MBB, DL, TII->get(SystemZ::BRC)) + .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_EQ) + .addMBB(AllDoneMBB); + MBB->addSuccessor(AllDoneMBB); + MBB->addSuccessor(StartMBB); + + // StartMBB: + // # Jump to DoneMBB if %StartCountReg is zero, or fall through to LoopMBB. + MBB = StartMBB; + BuildMI(MBB, DL, TII->get(SystemZ::CGHI)) + .addReg(StartCountReg).addImm(0); + BuildMI(MBB, DL, TII->get(SystemZ::BRC)) + .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_EQ) + .addMBB(DoneMBB); + MBB->addSuccessor(DoneMBB); + MBB->addSuccessor(LoopMBB); + } + else { + StartMBB = MBB; + DoneMBB = SystemZ::splitBlockBefore(MI, MBB); + LoopMBB = SystemZ::emitBlockAfter(StartMBB); + NextMBB = (EndMBB ? SystemZ::emitBlockAfter(LoopMBB) : LoopMBB); + + // StartMBB: + // # fall through to LoopMBB + MBB->addSuccessor(LoopMBB); + + DestBase = MachineOperand::CreateReg(NextDestReg, false); + SrcBase = MachineOperand::CreateReg(NextSrcReg, false); + ImmLength &= 255; + if (EndMBB && !ImmLength) + // If the loop handled the whole CLC range, DoneMBB will be empty with + // CC live-through into EndMBB, so add it as live-in. + DoneMBB->addLiveIn(SystemZ::CC); + } // LoopMBB: // %ThisDestReg = phi [ %StartDestReg, StartMBB ], @@ -7846,7 +7892,6 @@ // // The prefetch is used only for MVC. The JLH is used only for CLC. MBB = LoopMBB; - BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisDestReg) .addReg(StartDestReg).addMBB(StartMBB) .addReg(NextDestReg).addMBB(NextMBB); @@ -7882,7 +7927,6 @@ // // The AGHI, CGHI and JLH should be converted to BRCTG by later passes. 
MBB = NextMBB; - BuildMI(MBB, DL, TII->get(SystemZ::LA), NextDestReg) .addReg(ThisDestReg).addImm(256).addReg(0); if (!HaveSingleBase) @@ -7898,18 +7942,44 @@ MBB->addSuccessor(LoopMBB); MBB->addSuccessor(DoneMBB); - DestBase = MachineOperand::CreateReg(NextDestReg, false); - SrcBase = MachineOperand::CreateReg(NextSrcReg, false); - Length &= 255; - if (EndMBB && !Length) - // If the loop handled the whole CLC range, DoneMBB will be empty with - // CC live-through into EndMBB, so add it as live-in. - DoneMBB->addLiveIn(SystemZ::CC); MBB = DoneMBB; + if (LengthMO.isReg()) { + // DoneMBB: + // # Make PHIs for RemDestReg/RemSrcReg as the loop may or may not run. + // # Use EXecute Relative Long for the remainder of the bytes. The target + // instruction of the EXRL will have a length field of 1 since 0 is an + // illegal value. The number of bytes processed becomes (%LenMinus1Reg & + // 0xff) + 1. + // # Fall through to AllDoneMBB. + Register RemSrcReg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); + Register RemDestReg = HaveSingleBase ? RemSrcReg + : MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); + BuildMI(MBB, DL, TII->get(SystemZ::PHI), RemDestReg) + .addReg(StartDestReg).addMBB(StartMBB) + .addReg(NextDestReg).addMBB(LoopMBB); + if (!HaveSingleBase) + BuildMI(MBB, DL, TII->get(SystemZ::PHI), RemSrcReg) + .addReg(StartSrcReg).addMBB(StartMBB) + .addReg(NextSrcReg).addMBB(LoopMBB); + MRI.constrainRegClass(LenMinus1Reg, &SystemZ::ADDR64BitRegClass); + MachineInstrBuilder EXRL_MIB = + BuildMI(MBB, DL, TII->get(SystemZ::EXRL_Pseudo)) + .addImm(Opcode) + .addReg(LenMinus1Reg) + .addReg(RemDestReg).addImm(DestDisp) + .addReg(RemSrcReg).addImm(SrcDisp); + // Per z/Architecture the executed target sets CC unless it is an MVC, + // so record the clobber on the pseudo as an implicit def of CC. + if (Opcode != SystemZ::MVC) + EXRL_MIB.addReg(SystemZ::CC, RegState::ImplicitDefine); + MBB->addSuccessor(AllDoneMBB); + MBB = AllDoneMBB; + } } + // Handle any remaining bytes with straight-line code. 
- while (Length > 0) { - uint64_t ThisLength = std::min(Length, uint64_t(256)); + while (ImmLength > 0) { + uint64_t ThisLength = std::min(ImmLength, uint64_t(256)); // The previous iteration might have created out-of-range displacements. // Apply them using LAY if so. if (!isUInt<12>(DestDisp)) { @@ -7939,10 +8009,10 @@ .setMemRefs(MI.memoperands()); DestDisp += ThisLength; SrcDisp += ThisLength; - Length -= ThisLength; + ImmLength -= ThisLength; // If there's another CLC to go, branch to the end if a difference // was found. - if (EndMBB && Length > 0) { + if (EndMBB && ImmLength > 0) { MachineBasicBlock *NextMBB = SystemZ::splitBlockBefore(MI, MBB); BuildMI(MBB, DL, TII->get(SystemZ::BRC)) .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE) @@ -8433,6 +8503,7 @@ return emitMemMemWrapper(MI, MBB, SystemZ::OC); case SystemZ::XCSequence: case SystemZ::XCLoop: + case SystemZ::XCLoopVarLen: return emitMemMemWrapper(MI, MBB, SystemZ::XC); case SystemZ::CLCSequence: case SystemZ::CLCLoop: diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td --- a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td +++ b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td @@ -5253,6 +5253,7 @@ // The Sequence form uses a straight-line sequence of instructions and // the Loop form uses a loop of length-256 instructions followed by // another instruction to handle the excess. +// The LoopVarLen form is for a loop with a non-constant length parameter. 
multiclass MemorySS<string mnemonic, bits<8> opcode, SDPatternOperator sequence, SDPatternOperator loop> { def "" : SideEffectBinarySSa<mnemonic, opcode>; @@ -5265,6 +5266,10 @@ imm64:$length, GR64:$count256), [(loop bdaddr12only:$dest, bdaddr12only:$src, imm64:$length, GR64:$count256)]>; + def LoopVarLen : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src, + GR64:$length, GR64:$count256), + [(loop bdaddr12only:$dest, bdaddr12only:$src, + GR64:$length, GR64:$count256)]>; } } diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td @@ -2165,8 +2165,12 @@ // Execute. let hasSideEffects = 1 in { - def EX : SideEffectBinaryRX<"ex", 0x44, GR64>; - def EXRL : SideEffectBinaryRILPC<"exrl", 0xC60, GR64>; + def EX : SideEffectBinaryRX<"ex", 0x44, ADDR64>; + def EXRL : SideEffectBinaryRILPC<"exrl", 0xC60, ADDR64>; + let hasNoSchedulingInfo = 1 in + def EXRL_Pseudo : Pseudo<(outs), (ins i64imm:$TargetOpc, ADDR64:$lenMinus1, + bdaddr12only:$bdl1, bdaddr12only:$bd2), + []>; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp --- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp @@ -81,11 +81,12 @@ if (IsVolatile) return SDValue(); + auto *CByte = dyn_cast<ConstantSDNode>(Byte); if (auto *CSize = dyn_cast<ConstantSDNode>(Size)) { uint64_t Bytes = CSize->getZExtValue(); if (Bytes == 0) return SDValue(); - if (auto *CByte = dyn_cast<ConstantSDNode>(Byte)) { + if (CByte) { // Handle cases that can be done using at most two of // MVI, MVHI, MVHHI and MVGHI. The latter two can only be // used if ByteVal is all zeros or all ones; in other casees, @@ -125,7 +126,6 @@ assert(Bytes >= 2 && "Should have dealt with 0- and 1-byte cases already"); // Handle the special case of a memset of 0, which can use XC. 
- auto *CByte = dyn_cast<ConstantSDNode>(Byte); if (CByte && CByte->getZExtValue() == 0) return emitMemMem(DAG, DL, SystemZISD::XC, SystemZISD::XC_LOOP, Chain, Dst, Dst, Bytes); @@ -138,6 +138,18 @@ return emitMemMem(DAG, DL, SystemZISD::MVC, SystemZISD::MVC_LOOP, Chain, DstPlus1, Dst, Bytes - 1); } + + // Variable length + if (CByte && CByte->getZExtValue() == 0) { + // Handle the special case of a variable length memset of 0 with XC. + SDValue LenMinus1 = DAG.getNode(ISD::ADD, DL, MVT::i64, + DAG.getZExtOrTrunc(Size, DL, MVT::i64), + DAG.getConstant(-1, DL, MVT::i64)); + SDValue TripC = DAG.getNode(ISD::SRL, DL, MVT::i64, LenMinus1, + DAG.getConstant(8, DL, MVT::i64)); + return DAG.getNode(SystemZISD::XC_LOOP, DL, MVT::Other, Chain, Dst, Dst, + LenMinus1, TripC); + } return SDValue(); } diff --git a/llvm/test/CodeGen/SystemZ/memset-05.ll b/llvm/test/CodeGen/SystemZ/memset-05.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/memset-05.ll @@ -0,0 +1,101 @@ +; Test memset 0 with variable length +; +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s + +define void @fun0(i8* %Addr, i64 %Len) { +; CHECK-LABEL: fun0: +; CHECK: # %bb.0: +; CHECK-NEXT: aghi %r3, -1 +; CHECK-NEXT: cgibe %r3, -1, 0(%r14) +; CHECK-NEXT: .LBB0_1: +; CHECK-NEXT: srlg %r0, %r3, 8 +; CHECK-NEXT: cgije %r0, 0, .LBB0_3 +; CHECK-NEXT: .LBB0_2: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: xc 0(256,%r2), 0(%r2) +; CHECK-NEXT: la %r2, 256(%r2) +; CHECK-NEXT: brctg %r0, .LBB0_2 +; CHECK-NEXT: .LBB0_3: +; CHECK-NEXT: exrl %r3, .Ltmp0 +; CHECK-NEXT: br %r14 + tail call void @llvm.memset.p0i8.i64(i8* %Addr, i8 0, i64 %Len, i1 false) + ret void +} + +define void @fun1(i8* %Addr, i32 %Len) { +; CHECK-LABEL: fun1: +; CHECK: # %bb.0: +; CHECK-NEXT: llgfr %r1, %r3 +; CHECK-NEXT: aghi %r1, -1 +; CHECK-NEXT: cgibe %r1, -1, 0(%r14) +; CHECK-NEXT: .LBB1_1: +; CHECK-NEXT: srlg %r0, %r1, 8 +; CHECK-NEXT: cgije %r0, 0, .LBB1_3 +; CHECK-NEXT: .LBB1_2: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: 
xc 0(256,%r2), 0(%r2) +; CHECK-NEXT: la %r2, 256(%r2) +; CHECK-NEXT: brctg %r0, .LBB1_2 +; CHECK-NEXT: .LBB1_3: +; CHECK-NEXT: exrl %r1, .Ltmp0 +; CHECK-NEXT: br %r14 + tail call void @llvm.memset.p0i8.i32(i8* %Addr, i8 0, i32 %Len, i1 false) + ret void +} + +; Test that identical target instructions get reused. +define void @fun2(i8* %Addr, i32 %Len) { +; CHECK-LABEL: fun2: +; CHECK: # %bb.0: +; CHECK-NEXT: llgfr %r1, %r3 +; CHECK-NEXT: aghi %r1, -1 +; CHECK-NEXT: srlg %r0, %r1, 8 +; CHECK-NEXT: cgije %r1, -1, .LBB2_5 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: lgr %r3, %r2 +; CHECK-NEXT: cgije %r0, 0, .LBB2_4 +; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: lgr %r3, %r2 +; CHECK-NEXT: lgr %r4, %r0 +; CHECK-NEXT: .LBB2_3: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: xc 0(256,%r3), 0(%r3) +; CHECK-NEXT: la %r3, 256(%r3) +; CHECK-NEXT: brctg %r4, .LBB2_3 +; CHECK-NEXT: .LBB2_4: +; CHECK-NEXT: exrl %r1, .Ltmp1 +; CHECK-NEXT: .LBB2_5: +; CHECK-NEXT: cgije %r1, -1, .LBB2_10 +; CHECK-NEXT: # %bb.6: +; CHECK-NEXT: lgr %r3, %r2 +; CHECK-NEXT: cgije %r0, 0, .LBB2_9 +; CHECK-NEXT: # %bb.7: +; CHECK-NEXT: lgr %r3, %r2 +; CHECK-NEXT: lgr %r4, %r0 +; CHECK-NEXT: .LBB2_8: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: xc 0(256,%r3), 0(%r3) +; CHECK-NEXT: la %r3, 256(%r3) +; CHECK-NEXT: brctg %r4, .LBB2_8 +; CHECK-NEXT: .LBB2_9: +; CHECK-NEXT: exrl %r1, .Ltmp1 +; CHECK-NEXT: .LBB2_10: +; CHECK-NEXT: cgibe %r1, -1, 0(%r14) +; CHECK-NEXT: .LBB2_11: +; CHECK-NEXT: cgije %r0, 0, .LBB2_13 +; CHECK-NEXT: .LBB2_12: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: xc 0(256,%r2), 0(%r2) +; CHECK-NEXT: la %r2, 256(%r2) +; CHECK-NEXT: brctg %r0, .LBB2_12 +; CHECK-NEXT: .LBB2_13: +; CHECK-NEXT: exrl %r1, .Ltmp0 +; CHECK-NEXT: br %r14 + tail call void @llvm.memset.p0i8.i32(i8* %Addr, i8 0, i32 %Len, i1 false) + tail call void @llvm.memset.p0i8.i32(i8* %Addr, i8 0, i32 %Len, i1 false) + tail call void @llvm.memset.p0i8.i32(i8* %Addr, i8 0, i32 %Len, i1 false) + ret void +} + +; CHECK: .Ltmp0: +; 
CHECK-NEXT: xc 0(1,%r2), 0(%r2) +; CHECK-NEXT: .Ltmp1: +; CHECK-NEXT: xc 0(1,%r3), 0(%r3) + +declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) +declare void @llvm.memset.p0i8.i32(i8* nocapture writeonly, i8, i32, i1 immarg)