// Reviewer summary. These lines precede the first "Index:" header and are not
// part of any hunk.
//
// Purpose: lower a variable-length memset of zero on SystemZ to a loop of
// 256-byte XC instructions followed by an EXRL that executes one more XC for
// the remaining bytes.
//
// What each file in this patch contributes:
// - SystemZSelectionDAGInfo.cpp: when the size is not a constant and the byte
//   is a constant 0, builds an XC_LOOP node from LenMinus1 = Size - 1 (as i64)
//   and a trip count of LenMinus1 >> 8.
// - SystemZInstrFormats.td: adds a LoopVarLen pseudo (GR64 length operand) to
//   the MemorySS multiclass.
// - SystemZInstrInfo.td: adds EXRL_Pseudo and switches the EX/EXRL register
//   operand from GR64 to ADDR64.
// - SystemZISelLowering.cpp: routes XCLoopVarLen to emitMemMemWrapper. For a
//   register length the expansion branches past everything when LenMinus1 is
//   -1, skips the loop when the trip count is 0, and finishes with EXRL_Pseudo.
// - SystemZAsmPrinter.{h,cpp}: prints EXRL_Pseudo as an EXRL that references a
//   temporary label, queues the target instruction (length field 1) in
//   EXRL_Targets, shares one target between identical requests, and emits the
//   queued targets with their labels from emitFunctionBodyEnd().
// - memset-05.ll: checks the generated code for i64 and i32 lengths and that
//   identical target instructions are reused.
//
// NOTE(review): this copy has lost its line breaks, and template argument
// lists appear to have been stripped (e.g. "std::pair > EXRLT2Symbols",
// "dyn_cast(Byte)"). It probably cannot be applied as-is; regenerate it from
// the original revision before use.
Index: llvm/lib/Target/SystemZ/SystemZAsmPrinter.h =================================================================== --- llvm/lib/Target/SystemZ/SystemZAsmPrinter.h +++ llvm/lib/Target/SystemZ/SystemZAsmPrinter.h @@ -9,10 +9,11 @@ #ifndef LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZASMPRINTER_H #define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZASMPRINTER_H -#include "SystemZTargetMachine.h" #include "SystemZMCInstLower.h" +#include "SystemZTargetMachine.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/StackMaps.h" +#include "llvm/MC/MCInstBuilder.h" #include "llvm/Support/Compiler.h" namespace llvm { @@ -26,6 +27,9 @@ private: StackMaps SM; + typedef std::pair > EXRLT2Symbols; + std::vector EXRL_Targets; + public: SystemZAsmPrinter(TargetMachine &TM, std::unique_ptr Streamer) : AsmPrinter(TM, std::move(Streamer)), SM(*this) {} @@ -35,6 +39,7 @@ void emitInstruction(const MachineInstr *MI) override; void emitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) override; void emitEndOfAsmFile(Module &M) override; + void emitFunctionBodyEnd() override; bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &OS) override; bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, @@ -49,6 +54,7 @@ void LowerFENTRY_CALL(const MachineInstr &MI, SystemZMCInstLower &MCIL); void LowerSTACKMAP(const MachineInstr &MI); void LowerPATCHPOINT(const MachineInstr &MI, SystemZMCInstLower &Lower); + void emitEXRLTargetInstructions(); }; } // end namespace llvm Index: llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp =================================================================== --- llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp +++ llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp @@ -541,6 +541,38 @@ LowerPATCHPOINT(*MI, Lower); return; + case SystemZ::EXRL_Pseudo: { + unsigned TargetInsOpc = MI->getOperand(0).getImm(); + Register LenMinus1Reg = MI->getOperand(1).getReg(); + Register DestReg = MI->getOperand(2).getReg(); + int64_t DestDisp = 
MI->getOperand(3).getImm(); + Register SrcReg = MI->getOperand(4).getReg(); + int64_t SrcDisp = MI->getOperand(5).getImm(); + + MCSymbol *DotSym = OutContext.createTempSymbol(); + const MCSymbolRefExpr *Dot = MCSymbolRefExpr::create(DotSym, OutContext); + EmitToStreamer(*OutStreamer, MCInstBuilder(SystemZ::EXRL) + .addReg(LenMinus1Reg).addExpr(Dot)); + + auto isIdentical = [&] (const MCInst &ET) -> bool { + return ET.getOpcode() == TargetInsOpc && + ET.getOperand(0).getReg() == DestReg && + ET.getOperand(1).getImm() == DestDisp && + ET.getOperand(3).getReg() == SrcReg && + ET.getOperand(4).getImm() == SrcDisp; + }; + for (auto &I : EXRL_Targets) + if (isIdentical(I.first)) { + I.second.push_back(DotSym); + return; + } + + MCInst ET = MCInstBuilder(TargetInsOpc).addReg(DestReg) + .addImm(DestDisp).addImm(1).addReg(SrcReg).addImm(SrcDisp); + EXRL_Targets.push_back(EXRLT2Symbols(ET, {DotSym})); + return; + } + default: Lower.lower(MI, LoweredMI); break; @@ -698,6 +730,16 @@ getSubtargetInfo()); } +void SystemZAsmPrinter::emitEXRLTargetInstructions() { + for (auto &T : EXRL_Targets) { + for (auto &I : T.second) + OutStreamer->emitLabel(I); + OutStreamer->emitInstruction(T.first, getSubtargetInfo()); + } + + EXRL_Targets.clear(); +} + // Convert a SystemZ-specific constant pool modifier into the associated // MCSymbolRefExpr variant kind. static MCSymbolRefExpr::VariantKind @@ -749,6 +791,10 @@ emitStackMaps(SM); } +void SystemZAsmPrinter::emitFunctionBodyEnd() { + emitEXRLTargetInstructions(); +} + // Force static initialization. 
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZAsmPrinter() { RegisterAsmPrinter X(getTheSystemZTarget()); Index: llvm/lib/Target/SystemZ/SystemZISelLowering.cpp =================================================================== --- llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -7795,43 +7795,115 @@ uint64_t DestDisp = MI.getOperand(1).getImm(); MachineOperand SrcBase = earlyUseOperand(MI.getOperand(2)); uint64_t SrcDisp = MI.getOperand(3).getImm(); - uint64_t Length = MI.getOperand(4).getImm(); + MachineOperand &LengthMO = MI.getOperand(4); + uint64_t ImmLength = LengthMO.isImm() ? LengthMO.getImm() : 0; // When generating more than one CLC, all but the last will need to // branch to the end when a difference is found. - MachineBasicBlock *EndMBB = (Length > 256 && Opcode == SystemZ::CLC ? + MachineBasicBlock *EndMBB = (ImmLength > 256 && Opcode == SystemZ::CLC ? SystemZ::splitBlockAfter(MI, MBB) : nullptr); // Check for the loop form, in which operand 5 is the trip count. if (MI.getNumExplicitOperands() > 5) { - bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase); - Register StartCountReg = MI.getOperand(5).getReg(); - Register StartSrcReg = forceReg(MI, SrcBase, TII); - Register StartDestReg = (HaveSingleBase ? StartSrcReg : - forceReg(MI, DestBase, TII)); + + MachineBasicBlock *StartMBB = nullptr; + MachineBasicBlock *LoopMBB = nullptr; + MachineBasicBlock *NextMBB = nullptr; + MachineBasicBlock *DoneMBB = nullptr; + + bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase); + Register StartSrcReg = forceReg(MI, SrcBase, TII); + Register StartDestReg = (HaveSingleBase ? StartSrcReg + : forceReg(MI, DestBase, TII)); const TargetRegisterClass *RC = &SystemZ::ADDR64BitRegClass; Register ThisSrcReg = MRI.createVirtualRegister(RC); - Register ThisDestReg = (HaveSingleBase ? ThisSrcReg : - MRI.createVirtualRegister(RC)); + Register ThisDestReg = (HaveSingleBase ? 
ThisSrcReg + : MRI.createVirtualRegister(RC)); Register NextSrcReg = MRI.createVirtualRegister(RC); - Register NextDestReg = (HaveSingleBase ? NextSrcReg : - MRI.createVirtualRegister(RC)); - + Register NextDestReg = (HaveSingleBase ? NextSrcReg + : MRI.createVirtualRegister(RC)); RC = &SystemZ::GR64BitRegClass; Register ThisCountReg = MRI.createVirtualRegister(RC); Register NextCountReg = MRI.createVirtualRegister(RC); - MachineBasicBlock *StartMBB = MBB; - MachineBasicBlock *DoneMBB = SystemZ::splitBlockBefore(MI, MBB); - MachineBasicBlock *LoopMBB = SystemZ::emitBlockAfter(StartMBB); - MachineBasicBlock *NextMBB = - (EndMBB ? SystemZ::emitBlockAfter(LoopMBB) : LoopMBB); + if (LengthMO.isReg()) { + Register LenMinus1 = MI.getOperand(4).getReg(); + + MachineBasicBlock *AllDoneMBB = SystemZ::splitBlockBefore(MI, MBB); + StartMBB = SystemZ::emitBlockAfter(MBB); + LoopMBB = SystemZ::emitBlockAfter(StartMBB); + NextMBB = LoopMBB; + DoneMBB = SystemZ::emitBlockAfter(LoopMBB); - // StartMBB: - // # fall through to LoopMMB - MBB->addSuccessor(LoopMBB); + // MBB: + // # Jump to AllDoneMBB if LenMinus1 is -1, or fall through to StartMBB. + BuildMI(MBB, DL, TII->get(SystemZ::CGHI)) + .addReg(LenMinus1).addImm(-1); + BuildMI(MBB, DL, TII->get(SystemZ::BRC)) + .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_EQ) + .addMBB(AllDoneMBB); + MBB->addSuccessor(AllDoneMBB); + MBB->addSuccessor(StartMBB); + + // StartMBB: + // # Jump to DoneMBB if %StartCountReg is zero, or fall through to LoopMBB. + MBB = StartMBB; + BuildMI(MBB, DL, TII->get(SystemZ::CGHI)) + .addReg(StartCountReg).addImm(0); + BuildMI(MBB, DL, TII->get(SystemZ::BRC)) + .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_EQ) + .addMBB(DoneMBB); + MBB->addSuccessor(DoneMBB); + MBB->addSuccessor(LoopMBB); + + // DoneMBB: + // # Make PHIs for RemDestReg/RemSrcReg as the loop may or may not run. + // # Use EXecute Relative Long for the remainder of the bytes. 
The target + // instruction of the EXRL will have a length field of 1 since 0 is an + // illegal value. The number of bytes processed becomes (%LenMinus1 & + // 0xff) + 1. + // # Fall through to AllDoneMBB. + MBB = DoneMBB; + Register RemSrcReg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); + Register RemDestReg = HaveSingleBase ? RemSrcReg + : MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); + BuildMI(MBB, DL, TII->get(SystemZ::PHI), RemDestReg) + .addReg(StartDestReg).addMBB(StartMBB) + .addReg(NextDestReg).addMBB(LoopMBB); + if (!HaveSingleBase) + BuildMI(MBB, DL, TII->get(SystemZ::PHI), RemSrcReg) + .addReg(StartSrcReg).addMBB(StartMBB) + .addReg(NextSrcReg).addMBB(LoopMBB); + MRI.constrainRegClass(LenMinus1, &SystemZ::ADDR64BitRegClass); + BuildMI(MBB, DL, TII->get(SystemZ::EXRL_Pseudo)) + .addImm(Opcode) + .addReg(LenMinus1) + .addReg(RemDestReg).addImm(DestDisp) + .addReg(RemSrcReg).addImm(SrcDisp); + MBB->addSuccessor(AllDoneMBB); + MBB = AllDoneMBB; + } + else { + StartMBB = MBB; + DoneMBB = SystemZ::splitBlockBefore(MI, MBB); + LoopMBB = SystemZ::emitBlockAfter(StartMBB); + NextMBB = (EndMBB ? SystemZ::emitBlockAfter(LoopMBB) : LoopMBB); + + // StartMBB: + // # fall through to LoopMBB + MBB->addSuccessor(LoopMBB); + + DestBase = MachineOperand::CreateReg(NextDestReg, false); + SrcBase = MachineOperand::CreateReg(NextSrcReg, false); + ImmLength &= 255; + if (EndMBB && !ImmLength) + // If the loop handled the whole CLC range, DoneMBB will be empty with + // CC live-through into EndMBB, so add it as live-in. + DoneMBB->addLiveIn(SystemZ::CC); + MBB = DoneMBB; + } // LoopMBB: // %ThisDestReg = phi [ %StartDestReg, StartMBB ], @@ -7845,31 +7917,29 @@ // ( JLH EndMBB ) // // The prefetch is used only for MVC. The JLH is used only for CLC. 
- MBB = LoopMBB; - - BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisDestReg) + BuildMI(LoopMBB, DL, TII->get(SystemZ::PHI), ThisDestReg) .addReg(StartDestReg).addMBB(StartMBB) .addReg(NextDestReg).addMBB(NextMBB); if (!HaveSingleBase) - BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisSrcReg) + BuildMI(LoopMBB, DL, TII->get(SystemZ::PHI), ThisSrcReg) .addReg(StartSrcReg).addMBB(StartMBB) .addReg(NextSrcReg).addMBB(NextMBB); - BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisCountReg) + BuildMI(LoopMBB, DL, TII->get(SystemZ::PHI), ThisCountReg) .addReg(StartCountReg).addMBB(StartMBB) .addReg(NextCountReg).addMBB(NextMBB); if (Opcode == SystemZ::MVC) - BuildMI(MBB, DL, TII->get(SystemZ::PFD)) + BuildMI(LoopMBB, DL, TII->get(SystemZ::PFD)) .addImm(SystemZ::PFD_WRITE) .addReg(ThisDestReg).addImm(DestDisp + 768).addReg(0); - BuildMI(MBB, DL, TII->get(Opcode)) + BuildMI(LoopMBB, DL, TII->get(Opcode)) .addReg(ThisDestReg).addImm(DestDisp).addImm(256) .addReg(ThisSrcReg).addImm(SrcDisp); if (EndMBB) { - BuildMI(MBB, DL, TII->get(SystemZ::BRC)) + BuildMI(LoopMBB, DL, TII->get(SystemZ::BRC)) .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE) .addMBB(EndMBB); - MBB->addSuccessor(EndMBB); - MBB->addSuccessor(NextMBB); + LoopMBB->addSuccessor(EndMBB); + LoopMBB->addSuccessor(NextMBB); } // NextMBB: @@ -7881,35 +7951,25 @@ // # fall through to DoneMBB // // The AGHI, CGHI and JLH should be converted to BRCTG by later passes. 
- MBB = NextMBB; - - BuildMI(MBB, DL, TII->get(SystemZ::LA), NextDestReg) + BuildMI(NextMBB, DL, TII->get(SystemZ::LA), NextDestReg) .addReg(ThisDestReg).addImm(256).addReg(0); if (!HaveSingleBase) - BuildMI(MBB, DL, TII->get(SystemZ::LA), NextSrcReg) + BuildMI(NextMBB, DL, TII->get(SystemZ::LA), NextSrcReg) .addReg(ThisSrcReg).addImm(256).addReg(0); - BuildMI(MBB, DL, TII->get(SystemZ::AGHI), NextCountReg) + BuildMI(NextMBB, DL, TII->get(SystemZ::AGHI), NextCountReg) .addReg(ThisCountReg).addImm(-1); - BuildMI(MBB, DL, TII->get(SystemZ::CGHI)) + BuildMI(NextMBB, DL, TII->get(SystemZ::CGHI)) .addReg(NextCountReg).addImm(0); - BuildMI(MBB, DL, TII->get(SystemZ::BRC)) + BuildMI(NextMBB, DL, TII->get(SystemZ::BRC)) .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE) .addMBB(LoopMBB); - MBB->addSuccessor(LoopMBB); - MBB->addSuccessor(DoneMBB); - - DestBase = MachineOperand::CreateReg(NextDestReg, false); - SrcBase = MachineOperand::CreateReg(NextSrcReg, false); - Length &= 255; - if (EndMBB && !Length) - // If the loop handled the whole CLC range, DoneMBB will be empty with - // CC live-through into EndMBB, so add it as live-in. - DoneMBB->addLiveIn(SystemZ::CC); - MBB = DoneMBB; + NextMBB->addSuccessor(LoopMBB); + NextMBB->addSuccessor(DoneMBB); } + // Handle any remaining bytes with straight-line code. - while (Length > 0) { - uint64_t ThisLength = std::min(Length, uint64_t(256)); + while (ImmLength > 0) { + uint64_t ThisLength = std::min(ImmLength, uint64_t(256)); // The previous iteration might have created out-of-range displacements. // Apply them using LAY if so. if (!isUInt<12>(DestDisp)) { @@ -7939,10 +7999,10 @@ .setMemRefs(MI.memoperands()); DestDisp += ThisLength; SrcDisp += ThisLength; - Length -= ThisLength; + ImmLength -= ThisLength; // If there's another CLC to go, branch to the end if a difference // was found. 
- if (EndMBB && Length > 0) { + if (EndMBB && ImmLength > 0) { MachineBasicBlock *NextMBB = SystemZ::splitBlockBefore(MI, MBB); BuildMI(MBB, DL, TII->get(SystemZ::BRC)) .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE) @@ -8433,6 +8493,7 @@ return emitMemMemWrapper(MI, MBB, SystemZ::OC); case SystemZ::XCSequence: case SystemZ::XCLoop: + case SystemZ::XCLoopVarLen: return emitMemMemWrapper(MI, MBB, SystemZ::XC); case SystemZ::CLCSequence: case SystemZ::CLCLoop: Index: llvm/lib/Target/SystemZ/SystemZInstrFormats.td =================================================================== --- llvm/lib/Target/SystemZ/SystemZInstrFormats.td +++ llvm/lib/Target/SystemZ/SystemZInstrFormats.td @@ -5253,6 +5253,7 @@ // The Sequence form uses a straight-line sequence of instructions and // the Loop form uses a loop of length-256 instructions followed by // another instruction to handle the excess. +// The LoopVarLen form is for a loop with a non-constant length parameter. multiclass MemorySS opcode, SDPatternOperator sequence, SDPatternOperator loop> { def "" : SideEffectBinarySSa; @@ -5265,6 +5266,10 @@ imm64:$length, GR64:$count256), [(loop bdaddr12only:$dest, bdaddr12only:$src, imm64:$length, GR64:$count256)]>; + def LoopVarLen : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src, + GR64:$length, GR64:$count256), + [(loop bdaddr12only:$dest, bdaddr12only:$src, + GR64:$length, GR64:$count256)]>; } } Index: llvm/lib/Target/SystemZ/SystemZInstrInfo.td =================================================================== --- llvm/lib/Target/SystemZ/SystemZInstrInfo.td +++ llvm/lib/Target/SystemZ/SystemZInstrInfo.td @@ -2165,8 +2165,12 @@ // Execute. 
let hasSideEffects = 1 in { - def EX : SideEffectBinaryRX<"ex", 0x44, GR64>; - def EXRL : SideEffectBinaryRILPC<"exrl", 0xC60, GR64>; + def EX : SideEffectBinaryRX<"ex", 0x44, ADDR64>; + def EXRL : SideEffectBinaryRILPC<"exrl", 0xC60, ADDR64>; + let hasNoSchedulingInfo = 1 in + def EXRL_Pseudo : Pseudo<(outs), (ins i64imm:$TargetOpc, ADDR64:$lenMinus1, + bdaddr12only:$bdl1, bdaddr12only:$bd2), + []>; } //===----------------------------------------------------------------------===// Index: llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp =================================================================== --- llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp +++ llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp @@ -81,11 +81,12 @@ if (IsVolatile) return SDValue(); + auto *CByte = dyn_cast(Byte); if (auto *CSize = dyn_cast(Size)) { uint64_t Bytes = CSize->getZExtValue(); if (Bytes == 0) return SDValue(); - if (auto *CByte = dyn_cast(Byte)) { + if (CByte) { // Handle cases that can be done using at most two of // MVI, MVHI, MVHHI and MVGHI. The latter two can only be // used if ByteVal is all zeros or all ones; in other casees, @@ -125,7 +126,6 @@ assert(Bytes >= 2 && "Should have dealt with 0- and 1-byte cases already"); // Handle the special case of a memset of 0, which can use XC. - auto *CByte = dyn_cast(Byte); if (CByte && CByte->getZExtValue() == 0) return emitMemMem(DAG, DL, SystemZISD::XC, SystemZISD::XC_LOOP, Chain, Dst, Dst, Bytes); @@ -138,6 +138,18 @@ return emitMemMem(DAG, DL, SystemZISD::MVC, SystemZISD::MVC_LOOP, Chain, DstPlus1, Dst, Bytes - 1); } + else { // Variable length + if (CByte && CByte->getZExtValue() == 0) { + // Handle the special case of a variable length memset of 0 with XC. 
+ SDValue LenMinus1 = DAG.getNode(ISD::ADD, DL, MVT::i64, + DAG.getZExtOrTrunc(Size, DL, MVT::i64), + DAG.getConstant(-1, DL, MVT::i64)); + SDValue TripC = DAG.getNode(ISD::SRL, DL, MVT::i64, LenMinus1, + DAG.getConstant(8, DL, MVT::i64)); + return DAG.getNode(SystemZISD::XC_LOOP, DL, MVT::Other, Chain, Dst, Dst, + LenMinus1, TripC); + } + } return SDValue(); } Index: llvm/test/CodeGen/SystemZ/memset-05.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/SystemZ/memset-05.ll @@ -0,0 +1,106 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Test memset 0 with variable length +; +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s + +define void @fun0(i8* %Addr, i64 %Len) { +; CHECK-LABEL: fun0: +; CHECK: # %bb.0: +; CHECK-NEXT: aghi %r3, -1 +; CHECK-NEXT: cgibe %r3, -1, 0(%r14) +; CHECK-NEXT: .LBB0_1: +; CHECK-NEXT: srlg %r0, %r3, 8 +; CHECK-NEXT: cgije %r0, 0, .LBB0_3 +; CHECK-NEXT: .LBB0_2: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: xc 0(256,%r2), 0(%r2) +; CHECK-NEXT: la %r2, 256(%r2) +; CHECK-NEXT: brctg %r0, .LBB0_2 +; CHECK-NEXT: .LBB0_3: +; CHECK-NEXT: exrl %r3, .Ltmp0 +; CHECK-NEXT: br %r14 +; CHECK-NEXT: .Ltmp0: +; CHECK-NEXT: xc 0(1,%r2), 0(%r2) + tail call void @llvm.memset.p0i8.i64(i8* %Addr, i8 0, i64 %Len, i1 false) + ret void +} + +define void @fun1(i8* %Addr, i32 %Len) { +; CHECK-LABEL: fun1: +; CHECK: # %bb.0: +; CHECK-NEXT: llgfr %r1, %r3 +; CHECK-NEXT: aghi %r1, -1 +; CHECK-NEXT: cgibe %r1, -1, 0(%r14) +; CHECK-NEXT: .LBB1_1: +; CHECK-NEXT: srlg %r0, %r1, 8 +; CHECK-NEXT: cgije %r0, 0, .LBB1_3 +; CHECK-NEXT: .LBB1_2: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: xc 0(256,%r2), 0(%r2) +; CHECK-NEXT: la %r2, 256(%r2) +; CHECK-NEXT: brctg %r0, .LBB1_2 +; CHECK-NEXT: .LBB1_3: +; CHECK-NEXT: exrl %r1, .Ltmp1 +; CHECK-NEXT: br %r14 +; CHECK-NEXT: .Ltmp1: +; CHECK-NEXT: xc 0(1,%r2), 0(%r2) + tail call void @llvm.memset.p0i8.i32(i8* %Addr, i8 0, i32 
%Len, i1 false) + ret void +} + +; Test that identical target instructions get reused. +define void @fun2(i8* %Addr, i32 %Len) { +; CHECK-LABEL: fun2: +; CHECK: # %bb.0: +; CHECK-NEXT: llgfr %r1, %r3 +; CHECK-NEXT: aghi %r1, -1 +; CHECK-NEXT: srlg %r0, %r1, 8 +; CHECK-NEXT: cgije %r1, -1, .LBB2_5 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: lgr %r3, %r2 +; CHECK-NEXT: cgije %r0, 0, .LBB2_4 +; CHECK-NEXT: # %bb.2: +; CHECK-NEXT: lgr %r3, %r2 +; CHECK-NEXT: lgr %r4, %r0 +; CHECK-NEXT: .LBB2_3: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: xc 0(256,%r3), 0(%r3) +; CHECK-NEXT: la %r3, 256(%r3) +; CHECK-NEXT: brctg %r4, .LBB2_3 +; CHECK-NEXT: .LBB2_4: +; CHECK-NEXT: exrl %r1, .Ltmp2 +; CHECK-NEXT: .LBB2_5: +; CHECK-NEXT: cgije %r1, -1, .LBB2_10 +; CHECK-NEXT: # %bb.6: +; CHECK-NEXT: lgr %r3, %r2 +; CHECK-NEXT: cgije %r0, 0, .LBB2_9 +; CHECK-NEXT: # %bb.7: +; CHECK-NEXT: lgr %r3, %r2 +; CHECK-NEXT: lgr %r4, %r0 +; CHECK-NEXT: .LBB2_8: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: xc 0(256,%r3), 0(%r3) +; CHECK-NEXT: la %r3, 256(%r3) +; CHECK-NEXT: brctg %r4, .LBB2_8 +; CHECK-NEXT: .LBB2_9: +; CHECK-NEXT: exrl %r1, .Ltmp3 +; CHECK-NEXT: .LBB2_10: +; CHECK-NEXT: cgibe %r1, -1, 0(%r14) +; CHECK-NEXT: .LBB2_11: +; CHECK-NEXT: cgije %r0, 0, .LBB2_13 +; CHECK-NEXT: .LBB2_12: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: xc 0(256,%r2), 0(%r2) +; CHECK-NEXT: la %r2, 256(%r2) +; CHECK-NEXT: brctg %r0, .LBB2_12 +; CHECK-NEXT: .LBB2_13: +; CHECK-NEXT: exrl %r1, .Ltmp4 +; CHECK-NEXT: br %r14 +; CHECK-NEXT: .Ltmp2: +; CHECK-NEXT: .Ltmp3: +; CHECK-NEXT: xc 0(1,%r3), 0(%r3) +; CHECK-NEXT: .Ltmp4: +; CHECK-NEXT: xc 0(1,%r2), 0(%r2) + tail call void @llvm.memset.p0i8.i32(i8* %Addr, i8 0, i32 %Len, i1 false) + tail call void @llvm.memset.p0i8.i32(i8* %Addr, i8 0, i32 %Len, i1 false) + tail call void @llvm.memset.p0i8.i32(i8* %Addr, i8 0, i32 %Len, i1 false) + ret void +} + +declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) +declare void 
@llvm.memset.p0i8.i32(i8* nocapture writeonly, i8, i32, i1 immarg)