diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -7867,9 +7867,10 @@ // When generating more than one CLC, all but the last will need to // branch to the end when a difference is found. - MachineBasicBlock *EndMBB = (ImmLength > 256 && Opcode == SystemZ::CLC - ? SystemZ::splitBlockAfter(MI, MBB) - : nullptr); + MachineBasicBlock *EndMBB = + (Opcode == SystemZ::CLC && (ImmLength > 256 || NeedsLoop) + ? SystemZ::splitBlockAfter(MI, MBB) + : nullptr); if (NeedsLoop) { Register StartCountReg = @@ -7920,8 +7921,8 @@ AllDoneMBB = SystemZ::splitBlockBefore(MI, MBB); StartMBB = SystemZ::emitBlockAfter(MBB); LoopMBB = SystemZ::emitBlockAfter(StartMBB); - NextMBB = LoopMBB; - DoneMBB = SystemZ::emitBlockAfter(LoopMBB); + NextMBB = (EndMBB ? SystemZ::emitBlockAfter(LoopMBB) : LoopMBB); + DoneMBB = SystemZ::emitBlockAfter(NextMBB); // MBB: // # Jump to AllDoneMBB if LenMinus1Reg is -1, or fall thru to StartMBB. 
@@ -8039,18 +8040,23 @@ : MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass); BuildMI(MBB, DL, TII->get(SystemZ::PHI), RemDestReg) .addReg(StartDestReg).addMBB(StartMBB) - .addReg(NextDestReg).addMBB(LoopMBB); + .addReg(NextDestReg).addMBB(NextMBB); if (!HaveSingleBase) BuildMI(MBB, DL, TII->get(SystemZ::PHI), RemSrcReg) .addReg(StartSrcReg).addMBB(StartMBB) - .addReg(NextSrcReg).addMBB(LoopMBB); - BuildMI(MBB, DL, TII->get(SystemZ::EXRL_Pseudo)) - .addImm(Opcode) - .addReg(LenMinus1Reg) - .addReg(RemDestReg).addImm(DestDisp) - .addReg(RemSrcReg).addImm(SrcDisp); + .addReg(NextSrcReg).addMBB(NextMBB); + MachineInstrBuilder EXRL_MIB = + BuildMI(MBB, DL, TII->get(SystemZ::EXRL_Pseudo)) + .addImm(Opcode) + .addReg(LenMinus1Reg) + .addReg(RemDestReg).addImm(DestDisp) + .addReg(RemSrcReg).addImm(SrcDisp); MBB->addSuccessor(AllDoneMBB); MBB = AllDoneMBB; + if (EndMBB) { + EXRL_MIB.addReg(SystemZ::CC, RegState::ImplicitDefine); + MBB->addLiveIn(SystemZ::CC); + } } } @@ -8569,6 +8575,7 @@ case SystemZ::ATOMIC_CMP_SWAPW: return emitAtomicCmpSwapW(MI, MBB); case SystemZ::MVCImm: + case SystemZ::MVCReg: return emitMemMemWrapper(MI, MBB, SystemZ::MVC); case SystemZ::NCImm: return emitMemMemWrapper(MI, MBB, SystemZ::NC); @@ -8578,6 +8585,7 @@ case SystemZ::XCReg: return emitMemMemWrapper(MI, MBB, SystemZ::XC); case SystemZ::CLCImm: + case SystemZ::CLCReg: return emitMemMemWrapper(MI, MBB, SystemZ::CLC); case SystemZ::CLSTLoop: return emitStringWrapper(MI, MBB, SystemZ::CLST); diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td --- a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td +++ b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td @@ -5356,6 +5356,10 @@ imm64:$length), [(set CC, (memop bdaddr12only:$dest, bdaddr12only:$src, imm64:$length))]>; + def Reg : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src, + ADDR64:$length), + [(set CC, (memop bdaddr12only:$dest, bdaddr12only:$src, + ADDR64:$length))]>; } } diff 
--git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp --- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp @@ -17,6 +17,11 @@ #define DEBUG_TYPE "systemz-selectiondag-info" +static SDVTList getMemMemVTs(unsigned Op, SelectionDAG &DAG) { + return Op == SystemZISD::CLC ? DAG.getVTList(MVT::i32, MVT::Other) + : DAG.getVTList(MVT::Other); +} + // Emit a mem-mem operation after subtracting one from size, which will be // added back during pseudo expansion. As the Reg case emitted here may be // converted by DAGCombiner into having an Imm length, they are both emitted @@ -24,7 +29,7 @@ static SDValue emitMemMemImm(SelectionDAG &DAG, const SDLoc &DL, unsigned Op, SDValue Chain, SDValue Dst, SDValue Src, uint64_t Size) { - return DAG.getNode(Op, DL, MVT::Other, Chain, Dst, Src, + return DAG.getNode(Op, DL, getMemMemVTs(Op, DAG), Chain, Dst, Src, DAG.getConstant(Size - 1, DL, Src.getValueType())); } @@ -34,17 +39,7 @@ SDValue LenMinus1 = DAG.getNode(ISD::ADD, DL, MVT::i64, DAG.getZExtOrTrunc(Size, DL, MVT::i64), DAG.getConstant(-1, DL, MVT::i64)); - return DAG.getNode(Op, DL, MVT::Other, Chain, Dst, Src, LenMinus1); -} - -// Use CLC to compare [Src1, Src1 + Size) with [Src2, Src2 + Size). -// One is subtracted from size also here, per above. 
-static SDValue emitCLC(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, - SDValue Src1, SDValue Src2, uint64_t Size) { - SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other); - EVT PtrVT = Src1.getValueType(); - return DAG.getNode(SystemZISD::CLC, DL, VTs, Chain, Src1, Src2, - DAG.getConstant(Size - 1, DL, PtrVT)); + return DAG.getNode(Op, DL, getMemMemVTs(Op, DAG), Chain, Dst, Src, LenMinus1); } SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemcpy( @@ -57,7 +52,8 @@ if (auto *CSize = dyn_cast<ConstantSDNode>(Size)) return emitMemMemImm(DAG, DL, SystemZISD::MVC, Chain, Dst, Src, CSize->getZExtValue()); - return SDValue(); + + return emitMemMemReg(DAG, DL, SystemZISD::MVC, Chain, Dst, Src, Size); } // Handle a memset of 1, 2, 4 or 8 bytes with the operands given by @@ -166,15 +162,16 @@ SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Src1, SDValue Src2, SDValue Size, MachinePointerInfo Op1PtrInfo, MachinePointerInfo Op2PtrInfo) const { + SDValue CCReg; + // Swap operands to invert CC == 1 vs. CC == 2 cases. if (auto *CSize = dyn_cast<ConstantSDNode>(Size)) { uint64_t Bytes = CSize->getZExtValue(); assert(Bytes > 0 && "Caller should have handled 0-size case"); - // Swap operands to invert CC == 1 vs. CC == 2 cases. 
- SDValue CCReg = emitCLC(DAG, DL, Chain, Src2, Src1, Bytes); - Chain = CCReg.getValue(1); - return std::make_pair(addIPMSequence(DL, CCReg, DAG), Chain); - } - return std::make_pair(SDValue(), SDValue()); + CCReg = emitMemMemImm(DAG, DL, SystemZISD::CLC, Chain, Src2, Src1, Bytes); + } else + CCReg = emitMemMemReg(DAG, DL, SystemZISD::CLC, Chain, Src2, Src1, Size); + Chain = CCReg.getValue(1); + return std::make_pair(addIPMSequence(DL, CCReg, DAG), Chain); } std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::EmitTargetCodeForMemchr( diff --git a/llvm/test/CodeGen/SystemZ/loop-03.ll b/llvm/test/CodeGen/SystemZ/loop-03.ll --- a/llvm/test/CodeGen/SystemZ/loop-03.ll +++ b/llvm/test/CodeGen/SystemZ/loop-03.ll @@ -19,7 +19,7 @@ define void @fun0(%0*) { ; CHECK-LABEL: .LBB0_4 -; CHECK: => This Inner Loop Header: Depth=2 +; CHECK: => This Inner Loop Header ; CHECK-NOT: 16-byte Folded Spill ; CHECK-NOT: 16-byte Folded Reload diff --git a/llvm/test/CodeGen/SystemZ/memcmp-01.ll b/llvm/test/CodeGen/SystemZ/memcmp-01.ll --- a/llvm/test/CodeGen/SystemZ/memcmp-01.ll +++ b/llvm/test/CodeGen/SystemZ/memcmp-01.ll @@ -219,3 +219,30 @@ %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 769) ret i32 %res } + +define i32 @f14(i8 *%src1, i8 *%src2, i64 %Len) { +; CHECK-LABEL: f14: +; CHECK: # %bb.0: +; CHECK-NEXT: aghi %r4, -1 +; CHECK-NEXT: cghi %r4, -1 +; CHECK-NEXT: je .LBB13_5 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: srlg %r0, %r4, 8 +; CHECK-NEXT: cgije %r0, 0, .LBB13_4 +; CHECK-NEXT: .LBB13_2: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: clc 0(256,%r3), 0(%r2) +; CHECK-NEXT: jlh .LBB13_5 +; CHECK-NEXT: # %bb.3: # in Loop: Header=BB13_2 Depth=1 +; CHECK-NEXT: la %r3, 256(%r3) +; CHECK-NEXT: la %r2, 256(%r2) +; CHECK-NEXT: brctg %r0, .LBB13_2 +; CHECK-NEXT: .LBB13_4: +; CHECK-NEXT: exrl %r4, .Ltmp0 +; CHECK-NEXT: .LBB13_5: +; CHECK-NEXT: ipm %r2 +; CHECK-NEXT: sll %r2, 2 +; CHECK-NEXT: sra %r2, 30 +; CHECK-NEXT: br %r14 + %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 %Len) + ret i32 %res +} 
diff --git a/llvm/test/CodeGen/SystemZ/memcpy-01.ll b/llvm/test/CodeGen/SystemZ/memcpy-01.ll --- a/llvm/test/CodeGen/SystemZ/memcpy-01.ll +++ b/llvm/test/CodeGen/SystemZ/memcpy-01.ll @@ -217,3 +217,28 @@ call void @foo(i8* %dest, i8* %src) ret void } + +; Test a variable length loop. +define void @f17(i8* %dest, i8* %src, i64 %Len) { +; CHECK-LABEL: f17: +; CHECK: # %bb.0: +; CHECK-NEXT: aghi %r4, -1 +; CHECK-NEXT: cgibe %r4, -1, 0(%r14) +; CHECK-NEXT: .LBB16_1: +; CHECK-NEXT: srlg %r0, %r4, 8 +; CHECK-NEXT: cgije %r0, 0, .LBB16_3 +; CHECK-NEXT: .LBB16_2: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: pfd 2, 768(%r2) +; CHECK-NEXT: mvc 0(256,%r2), 0(%r3) +; CHECK-NEXT: la %r2, 256(%r2) +; CHECK-NEXT: la %r3, 256(%r3) +; CHECK-NEXT: brctg %r0, .LBB16_2 +; CHECK-NEXT: .LBB16_3: +; CHECK-NEXT: exrl %r4, .Ltmp0 +; CHECK-NEXT: br %r14 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dest, i8* %src, i64 %Len, i1 false) + ret void +} + +; CHECK: .Ltmp0: +; CHECK-NEXT: mvc 0(1,%r2), 0(%r3) diff --git a/llvm/test/CodeGen/SystemZ/tail-call-mem-intrinsics.ll b/llvm/test/CodeGen/SystemZ/tail-call-mem-intrinsics.ll --- a/llvm/test/CodeGen/SystemZ/tail-call-mem-intrinsics.ll +++ b/llvm/test/CodeGen/SystemZ/tail-call-mem-intrinsics.ll @@ -1,13 +1,5 @@ ; RUN: llc -mtriple=s390x-linux-gnu < %s | FileCheck %s -; CHECK-LABEL: tail_memcpy: -; CHECK: jg memcpy -define void @tail_memcpy(i8* nocapture %p, i8* nocapture readonly %q, i32 %n) #0 { -entry: - tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %p, i8* %q, i32 %n, i1 false) - ret void -} - ; CHECK-LABEL: tail_memmove: ; CHECK: jg memmove define void @tail_memmove(i8* nocapture %p, i8* nocapture readonly %q, i32 %n) #0 {