Instead of making a libcall to memcpy, emit an MVC loop along with an EXRL instruction, the same way as is already done for memset 0.
On preliminary measurements this seemed to be a slight overall improvement.
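For reference, a minimal C-level sketch of the kind of function this affects (the function name and signature are assumptions chosen to match the `f17` assembly shown further down): a variable-length memcpy that previously became a libcall and is now expanded inline.

```c
#include <string.h>

/* With this patch, a variable-length copy like this is lowered to an
   inline loop of 256-byte MVC instructions, with the remaining 0-255
   bytes handled by an EXRL that executes one final MVC, instead of a
   call to the memcpy libcall. */
void f17(char *dst, const char *src, unsigned long len) {
  memcpy(dst, src, len);
}
```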
I also tried some different prefetch settings on (quick) SPEC, for both write and read prefetching (compared to master, which has only a write prefetch at offset 768):
Overall results (by average over benchmarks):
```
z14:
  2017_B_Memcpy_pfd_w_0_pfd_r_0             99.856 %
  2017_E_Memcpy_pfd_w_2048_pfd_r_0          99.920 %
  2017_C_Memcpy_pfd_w_768_pfd_r_768         99.978 %
  2017_D_Memcpy_pfd_w_2048_pfd_r_2048       99.986 %
  2017_F_Memcpy_pfd_w_524287_pfd_r_524287  100.426 %

z15:
  2017_E_Memcpy_pfd_w_2048_pfd_r_0          99.941 %
  2017_B_Memcpy_pfd_w_0_pfd_r_0             99.941 %
  2017_D_Memcpy_pfd_w_2048_pfd_r_2048      100.043 %
  2017_C_Memcpy_pfd_w_768_pfd_r_768        100.053 %
  2017_F_Memcpy_pfd_w_524287_pfd_r_524287  100.313 %
```
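To make the pfd_w/pfd_r variants concrete, here is a rough C-level sketch (illustrative only, not the actual lowering; the function name and the use of __builtin_prefetch are my assumptions) of what a write/read prefetch distance of 768 means for the copy loop: the destination and source are prefetched that many bytes ahead of the current MVC.

```c
#include <string.h>

/* Illustrative only: roughly what e.g. pfd_w_768 / pfd_r_768 mean for the
   generated copy loop. __builtin_prefetch(addr, rw, locality) maps to the
   SystemZ pfd instruction (rw = 1 -> pfd 2, a store prefetch; rw = 0 ->
   pfd 1, a load prefetch). */
void copy_blocks(char *dst, const char *src, unsigned long blocks) {
  while (blocks--) {
    __builtin_prefetch(dst + 768, 1, 3); /* write prefetch, 768 bytes ahead */
    __builtin_prefetch(src + 768, 0, 3); /* read prefetch, 768 bytes ahead */
    memcpy(dst, src, 256);               /* one MVC-sized chunk */
    dst += 256;
    src += 256;
  }
}
```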
I also tried doing a runtime check for a big size, like:
```
f17:                                    # @f17
	.cfi_startproc
# %bb.0:
	aghi	%r4, -1
	cgibe	%r4, -1, 0(%r14)
.LBB16_1:
	srlg	%r0, %r4, 8
	cgije	%r0, 0, .LBB16_4
# %bb.2:
	lghi	%r1, 0
	cgfi	%r4, 2000000
	locghihe	%r1, 1
	sllg	%r1, %r1, 22
.LBB16_3:                               # =>This Inner Loop Header: Depth=1
	pfd	2, 0(%r1,%r2)
	mvc	0(256,%r2), 0(%r3)
	la	%r2, 256(%r2)
	la	%r3, 256(%r3)
	brctg	%r0, .LBB16_3
.LBB16_4:
	exrl	%r4, .Ltmp0
	br	%r14
```
The idea was to prefetch for the L2 cache by prefetching 4M ahead if the size was bigger than 2M, as a check to see if this could give anything. However, it did not seem to improve any benchmark, whether with W, R, or W+R prefetching per this pattern.
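In C-like terms, a hedged sketch of what that emitted sequence does (names are made up; the actual code computes the block count from len-1 so the final EXRL copies 1-256 bytes):

```c
#include <string.h>

/* Sketch of the runtime check tried above: for copies larger than ~2 MB,
   prefetch the destination 4 MiB ahead of the current MVC; for smaller
   copies the offset is 0, i.e. effectively no lookahead. This mirrors
   the cgfi / locghihe / sllg sequence in the assembly. */
void copy_with_runtime_prefetch(char *dst, const char *src,
                                unsigned long len) {
  unsigned long pfd_off = (len >= 2000000) ? (1ul << 22) : 0;
  unsigned long blocks = len / 256;
  while (blocks--) {
    __builtin_prefetch(dst + pfd_off, 1, 2); /* pfd 2: store prefetch */
    memcpy(dst, src, 256);                   /* one MVC */
    dst += 256;
    src += 256;
  }
  if (len % 256)
    memcpy(dst, src, len % 256); /* remainder: the EXRL-executed MVC */
}
```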
Keeping the prefetching as it is with this patch was slightly better overall on z15, while removing it was slightly better on z14, so there do not seem to be any major gains to be had from changing the MVC prefetching...