This is an archive of the discontinued LLVM Phabricator instance.

scudo: Use DC GZVA instruction in storeTags().
ClosedPublic

Authored by pcc on Apr 20 2021, 4:26 PM.

Download Raw Diff

Details

Reviewers

eugenis
hctim
cryptoad

Commits

rG46c59d91dc7a: scudo: Use DC GZVA instruction in storeTags().

Summary

DC GZVA can operate on multiple granules at a time (corresponding to
the CPU's cache line size) so we can generally expect it to be faster
than STZG in a loop.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

pcc requested review of this revision. Apr 20 2021, 4:26 PM

pcc created this revision.

Herald added a project: Restricted Project. · View Herald Transcript · Apr 20 2021, 4:26 PM

Herald added a subscriber: Restricted Project. · View Herald Transcript

pcc added a child revision: D100911: scudo: Make prepareTaggedChunk() and resizeTaggedChunk() generic.. Apr 20 2021, 4:27 PM

Harbormaster completed remote builds in B99842: Diff 339036. Apr 20 2021, 5:57 PM

LGTM
I wonder if doing the size check before the DCZID check could speed up small allocations, and maybe raising the threshold value could help.
But we can worry about that later.

This revision is now accepted and ready to land. Apr 21 2021, 10:42 AM

This revision was landed with ongoing or failed builds. Apr 21 2021, 1:54 PM

Closed by commit rG46c59d91dc7a: scudo: Use DC GZVA instruction in storeTags(). (authored by pcc). · Explain Why

This revision was automatically updated to reflect the committed changes.

pcc added a commit: rG46c59d91dc7a: scudo: Use DC GZVA instruction in storeTags()..

Revision Contents

Path

Size

compiler-rt/

lib/

scudo/

standalone/

memtag.h

71 lines

Diff 339376

compiler-rt/lib/scudo/standalone/memtag.h

Show First 20 Lines • Show All 146 Lines • ▼ Show 20 Lines	__asm__ __volatile__(
: [Ptr] "r"(Ptr), [ExcludeMask] "r"(ExcludeMask));		: [Ptr] "r"(Ptr), [ExcludeMask] "r"(ExcludeMask));
return TaggedPtr;		return TaggedPtr;
}		}

inline uptr addFixedTag(uptr Ptr, uptr Tag) { return Ptr | (Tag << 56); }		inline uptr addFixedTag(uptr Ptr, uptr Tag) { return Ptr | (Tag << 56); }

inline uptr storeTags(uptr Begin, uptr End) {		inline uptr storeTags(uptr Begin, uptr End) {
DCHECK(Begin % 16 == 0);		DCHECK(Begin % 16 == 0);
if (Begin != End) {		uptr LineSize, Next, Tmp;
__asm__ __volatile__(		__asm__ __volatile__(
R"(		R"(
.arch_extension memtag		.arch_extension memtag

		// Compute the cache line size in bytes (DCZID_EL0 stores it as the log2
		// of the number of 4-byte words) and bail out to the slow path if DCZID_EL0
		// indicates that the DC instructions are unavailable.
		DCZID .req %[Tmp]
		mrs DCZID, dczid_el0
		tbnz DCZID, #4, 3f
		and DCZID, DCZID, #15
		mov %[LineSize], #4
		lsl %[LineSize], %[LineSize], DCZID
		.unreq DCZID

		// Our main loop doesn't handle the case where we don't need to perform any
		// DC GZVA operations. If the size of our tagged region is less than
		// twice the cache line size, bail out to the slow path since it's not
		// guaranteed that we'll be able to do a DC GZVA.
		Size .req %[Tmp]
		sub Size, %[End], %[Cur]
		cmp Size, %[LineSize], lsl #1
		b.lt 3f
		.unreq Size

		LineMask .req %[Tmp]
		sub LineMask, %[LineSize], #1

		// STZG until the start of the next cache line.
		orr %[Next], %[Cur], LineMask
1:		1:
stzg %[Cur], [%[Cur]], #16		stzg %[Cur], [%[Cur]], #16
cmp %[Cur], %[End]		cmp %[Cur], %[Next]
b.lt 1b		b.lt 1b

		// DC GZVA cache lines until we have no more full cache lines.
		bic %[Next], %[End], LineMask
		.unreq LineMask
		2:
		dc gzva, %[Cur]
		add %[Cur], %[Cur], %[LineSize]
		cmp %[Cur], %[Next]
		b.lt 2b

		// STZG until the end of the tagged region. This loop is also used to handle
		// slow path cases.
		3:
		cmp %[Cur], %[End]
		b.ge 4f
		stzg %[Cur], [%[Cur]], #16
		b 3b

		4:
)"		)"
: [Cur] "+&r"(Begin)		: [Cur] "+&r"(Begin), [LineSize] "=&r"(LineSize), [Next] "=&r"(Next),
		[Tmp] "=&r"(Tmp)
: [End] "r"(End)		: [End] "r"(End)
: "memory");		: "memory");
}
return Begin;		return Begin;
}		}

inline void *prepareTaggedChunk(void *Ptr, uptr Size, uptr ExcludeMask,		inline void *prepareTaggedChunk(void *Ptr, uptr Size, uptr ExcludeMask,
uptr BlockEnd) {		uptr BlockEnd) {
// Prepare the granule before the chunk to store the chunk header by setting		// Prepare the granule before the chunk to store the chunk header by setting
// its tag to 0. Normally its tag will already be 0, but in the case where a		// its tag to 0. Normally its tag will already be 0, but in the case where a
// chunk holding a low alignment allocation is reused for a higher alignment		// chunk holding a low alignment allocation is reused for a higher alignment
▲ Show 20 Lines • Show All 179 Lines • Show Last 20 Lines