This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
lib/scudo/
-
scudo/
12
scudo_allocator.cpp
-
scudo_crc32.h
6
scudo_crc32.cpp
-
scudo_utils.h

Differential D32971

[scudo] CRC32 optimizations
ClosedPublic

Authored by cryptoad on May 8 2017, 10:15 AM.

Download Raw Diff

Details

Reviewers

dvyukov
alekseyshl
kcc

Commits

rGb0e96eb28e6b: [scudo] CRC32 optimizations
rCRT302538: [scudo] CRC32 optimizations
rL302538: [scudo] CRC32 optimizations

Summary

This change optimizes several aspects of the checksum used for chunk headers.

First, there is no point in checking the weak symbol computeHardwareCRC32
every time, it will either be there or not when we start, so check it once
during initialization and set the checksum type accordingly.

Then, the loading of HashAlgorithm for SSE versions (and ARM equivalent) was
not optimized out, while not necessary. So I reshuffled that part of the code,
which duplicates a tiny bit of code, but ends up in a much cleaner assembly
(and faster as we avoid an extraneous load and some calls).

The following code is the checksum at the end of scudoMalloc for x86_64 with
full SSE 4.2, before:

mov     rax, 0FFFFFFFFFFFFFFh
shl     r10, 38h
mov     edi, dword ptr cs:_ZN7__scudoL6CookieE ; __scudo::Cookie
and     r14, rax
lea     rsi, [r13-10h]
movzx   eax, cs:_ZN7__scudoL13HashAlgorithmE ; __scudo::HashAlgorithm
or      r14, r10
mov     rbx, r14
xor     bx, bx
call    _ZN7__scudo20computeHardwareCRC32Ejm ; __scudo::computeHardwareCRC32(uint,ulong)
mov     rsi, rbx
mov     edi, eax
call    _ZN7__scudo20computeHardwareCRC32Ejm ; __scudo::computeHardwareCRC32(uint,ulong)
mov     r14w, ax
mov     rax, r13
mov     [r13-10h], r14

After:

mov     rax, cs:_ZN7__scudoL6CookieE ; __scudo::Cookie
lea     rcx, [rbx-10h]
mov     rdx, 0FFFFFFFFFFFFFFh
and     r14, rdx
shl     r9, 38h
or      r14, r9
crc32   eax, rcx
mov     rdx, r14
xor     dx, dx
mov     eax, eax
crc32   eax, rdx
mov     r14w, ax
mov     rax, rbx
mov     [rbx-10h], r14

Diff Detail

Build Status

Buildable 6260
Build 6260: arc lint + arc unit

Event Timeline

cryptoad created this revision.May 8 2017, 10:15 AM

Herald added subscribers: rengolin, aemerson. · View Herald TranscriptMay 8 2017, 10:15 AM

dvyukov added inline comments.May 8 2017, 12:04 PM

lib/scudo/scudo_allocator.cpp
39	It looked nice to have a separate function that wraps all of this logic. Why do you inline it? I would just move the load of HashAlgorithm into this function so that it does not happen when not needed.

cryptoad added inline comments.May 8 2017, 12:30 PM

lib/scudo/scudo_allocator.cpp
39	Indeed, I considered this as well. The other advantage of doing it the new way was to have the `crc32` instructions inlined as opposed to being calls (which is another part of the performance gain), which required me to use the intrinsic within the loop. After duplicating the loop with intrinsic, I didn't see a need to keep that function.

dvyukov added inline comments.May 8 2017, 12:47 PM

lib/scudo/scudo_allocator.cpp
39	You can have intrinsics in the inlinable computeCRC32 function. It should lead to the same assembly. With intrinsics it becomes more complex, so there is even more reason to have a separate function for it. Consider that we may want to calculate the CRC in another place.

cryptoad added inline comments.May 8 2017, 12:51 PM

lib/scudo/scudo_allocator.cpp

So something like:

INLINE u32 computeCRC32(u32 Crc, uptr Data, u8 HashType) {
#if defined(__SSE4_2__) || defined(__ARM_FEATURE_CRC32)
  return CRC32_INTRINSIC(Crc, Data);
#else
  if (atomic_load_relaxed(&HashAlgorithm) == CRC32Hardware)
    return computeHardwareCRC32(Crc, Data);
  return computeSoftwareCRC32(Crc, Data);
#endif
}

(without the 3rd param)

dvyukov added inline comments.May 8 2017, 12:54 PM

lib/scudo/scudo_allocator.cpp
39	yup

alekseyshl added inline comments.May 8 2017, 1:11 PM

lib/scudo/scudo_allocator.cpp
89–90	Would it make sense to check algorithm once and duplicate the code here, instead of comparing on every loop iteration? More code, but since we're after the performance... if (HashType == CRC32Hardware) { Crc = computeHardwareCRC32(Cookie, reinterpret_cast<uptr>(this)); for (uptr i = 0; i < ARRAY_SIZE(HeaderHolder); i++) Crc = computeHardwareCRC32(Crc, HeaderHolder[i]); } else { Crc = computeSoftwareCRC32(Cookie, reinterpret_cast<uptr>(this)); for (uptr i = 0; i < ARRAY_SIZE(HeaderHolder); i++) Crc = computeSoftwareCRC32(Crc, HeaderHolder[i]); }
lib/scudo/scudo_crc32.cpp
23	Please remind me, does it mean that computeHardwareCRC32 != 0 only when defined(__SSE4_2__) || defined(__ARM_FEATURE_CRC32)? Or is it expected to come from other libraries too?

cryptoad added inline comments.May 8 2017, 1:11 PM

lib/scudo/scudo_allocator.cpp

Just gave it a try, for the non-full-SSE the loop is unrolled but with a load & check of the algorithm at each iteration (2 for 64-bit, 3 for 32-bit):

                mov     rax, cs:_ZN7__scudoL6CookieE ; __scudo::Cookie
                mov     dl, cs:_ZN7__scudoL13HashAlgorithmE ; unsigned __int64
                cmp     dl, 1
                jnz     short loc_884B
                mov     edi, eax        ; this
                mov     rsi, rcx        ; unsigned int
                call    _ZN7__scudo20computeHardwareCRC32Ejm ; __scudo::computeHardwareCRC32(uint,ulong)
                jmp     loc_88DB
loc_884B:                               ; CODE XREF: __scudo::ScudoChunk::computeChecksum(__scudo::UnpackedHeader *)+1Aj

... software crc32 ...

loc_88DB:                               ; CODE XREF: __scudo::ScudoChunk::computeChecksum(__scudo::UnpackedHeader *)+26j
                mov     cl, cs:_ZN7__scudoL13HashAlgorithmE ; __scudo::HashAlgorithm
                cmp     cl, 1
                jnz     short loc_88FC
                and     r14, 0FFFFFFFFFFFF0000h
                mov     edi, eax        ; this
                mov     rsi, r14        ; unsigned int
                call    _ZN7__scudo20computeHardwareCRC32Ejm ; __scudo::computeHardwareCRC32(uint,ulong)
                jmp     loc_8982
loc_88FC:                               ; CODE XREF: __scudo::ScudoChunk::computeChecksum(__scudo::UnpackedHeader *)+C4j

... software crc32 ...

It seems somewhat messier. What do you think?

cryptoad added inline comments.May 8 2017, 1:19 PM

lib/scudo/scudo_allocator.cpp
89–90	I gave this a try, with -O3, the generated code for `computeChecksum` ends up being identical with the initial version of the patch.
lib/scudo/scudo_crc32.cpp
23	computeHardwareCRC32 != 0 if `defined(__SSE4_2__) || defined(__ARM_FEATURE_CRC32)`, but it could also be defined by an external library, even though this is not the expectation. With full RELRO (`-Wl,-z,relro,-z,now`), if not defined at load, it can't be defined later.

alekseyshl added inline comments.May 8 2017, 1:34 PM

lib/scudo/scudo_crc32.cpp
23	I see. I'm asking because in the current version of the code the same condition switches the top level code to CRC32_INTRINSIC, completely ignoring this function. I was curious if it is even necessary now. It just does not add up. Even for the possible future other uses of computeCRC32, for the performance reasons we have to ifdef to INTRINSIC calls at the call site, which renders computeHardwareCRC32 useless, right?

cryptoad added inline comments.May 8 2017, 1:49 PM

lib/scudo/scudo_crc32.cpp
23	I am going to try and clarify the situation. Let me know if something still doesn't make sense after! Compilation wise, there are 3 cases: (#1) `-msse4_2` (or ARM equivalent) is enabled for the whole project (eg: google3), instructions will be emitted accordingly, and we will only use hardware `crc32` (no weak function looked up, only intrinsics); (#2) `-msse4_2` (or ARM equivalent) is enabled for scudo_crc32.cpp only (eg: current cmake config if supported): SSE instructions are only emitted in this file, `computeHardwareCRC32` is defined, and can be used at runtime; no SSE instructions will be emitted anywhere else; this allows runtime detection and to have a HW enabled version that also runs on old hardware; (#3) `-msse4_2` (or ARM equivalent) is not enabled (eg: current cmake config if not supported): `computeHardwareCRC32` is not defined. Runtime wise, if the processor supports HW CRC32, then #1 or #2 will leverage it. If it doesn't, #1 will crash on illegal instruction, #2 will still work. #3 works in any case. This patch will make #1 a lot cleaner ASM wise due to the call inline and the removal of the unnecessary load of `HashType`, #2 & #3 somewhat better due to the check removal for `computeHardwareCRC32` at each iteration. So I think your point about the top level code completely ignoring it refers to #1, but we still need it for #2.

dvyukov added inline comments.May 8 2017, 1:50 PM

lib/scudo/scudo_allocator.cpp
39	I missed that this function consumes a single word rather than an array. I basically mean a function which you give initial crc value and an array of data and it gives you new crc value using the most efficient method available whatever it is.

This new solution should fit both Dmitry's and Aleksey's suggestions.

Wrap up the checksum logic in a function that takes as parameters all the
information needed to compute the checksum, and returns the checksum using
the fastest way available.

Regarding the assembly, the full SSE one looks like (no extra HashAlgorithm
load and inlined crc32 calls):

mov     rax, 0FFFFFFFFFFFFFFh
lea     rcx, [r13-10h]
shl     r10, 38h
and     r14, rax
mov     rax, cs:_ZN7__scudoL6CookieE ; __scudo::Cookie
or      r14, r10
mov     rdx, r14
crc32   eax, rcx
xor     dx, dx
crc32   eax, rdx
mov     r14w, ax
mov     rax, r13
mov     [r13-10h], r14

And the partial SSE one (only one HashAlgorithm load):

                mov     rax, cs:_ZN7__scudoL6CookieE ; __scudo::Cookie
                mov     dl, cs:_ZN7__scudoL13HashAlgorithmE ; unsigned __int64
                cmp     dl, 1
                jnz     short loc_8859
                and     rbx, 0FFFFFFFFFFFF0000h
                mov     edi, eax        ; this
                mov     rsi, rcx        ; unsigned int
                call    _ZN7__scudo20computeHardwareCRC32Ejm ; __scudo::computeHardwareCRC32(uint,ulong)
                mov     edi, eax        ; this
                mov     rsi, rbx        ; unsigned int
                call    _ZN7__scudo20computeHardwareCRC32Ejm ; __scudo::computeHardwareCRC32(uint,ulong)
                jmp     loc_8974
; ---------------------------------------------------------------------------

loc_8859:                               ; CODE XREF: __scudo::ScudoChunk::computeChecksum(__scudo::UnpackedHeader *)+17j

... software CRC32 ...

So this seems pretty good.

alekseyshl added inline comments.May 8 2017, 2:29 PM

lib/scudo/scudo_allocator.cpp
60	Now it feels logical to move this entire function along with HashAlgorithm var into scudo_crc32.cpp, but, I guess, not inlining it affects performance, right?
lib/scudo/scudo_crc32.cpp
23	Ah, now I remember the original reasoning behind this separation. Mentioning that in comments helps a lot. Thanks!

cryptoad added inline comments.May 8 2017, 2:34 PM

lib/scudo/scudo_allocator.cpp
60	Yes. And also I would lose the distinction of whether `__SSE4_2__` is defined everywhere (eg: in scudo_allocator.cpp) or only in scudo_crc32.cpp.
lib/scudo/scudo_crc32.cpp
23	Anytime!

Getting rid of the temporary assignment to HashType which is no longer needed.

Harbormaster completed remote builds in B6260: Diff 98217.May 8 2017, 2:56 PM

alekseyshl accepted this revision.May 8 2017, 4:00 PM

This revision is now accepted and ready to land.May 8 2017, 4:00 PM

cryptoad closed this revision.May 9 2017, 8:25 AM

Revision Contents

Path

Size

lib/

scudo/

48 lines

101 lines

19 lines

59 lines

Diff 98217

lib/scudo/scudo_allocator.cpp

Show All 9 Lines
/// Scudo Hardened Allocator implementation.		/// Scudo Hardened Allocator implementation.
/// It uses the sanitizer_common allocator as a base and aims at mitigating		/// It uses the sanitizer_common allocator as a base and aims at mitigating
/// heap corruption vulnerabilities. It provides a checksum-guarded chunk		/// heap corruption vulnerabilities. It provides a checksum-guarded chunk
/// header, a delayed free list, and additional sanity checks.		/// header, a delayed free list, and additional sanity checks.
///		///
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "scudo_allocator.h"		#include "scudo_allocator.h"
		#include "scudo_crc32.h"
#include "scudo_tls.h"		#include "scudo_tls.h"
#include "scudo_utils.h"		#include "scudo_utils.h"

#include "sanitizer_common/sanitizer_allocator_interface.h"		#include "sanitizer_common/sanitizer_allocator_interface.h"
#include "sanitizer_common/sanitizer_quarantine.h"		#include "sanitizer_common/sanitizer_quarantine.h"

#include <limits.h>		#include <limits.h>
#include <pthread.h>		#include <pthread.h>
#include <string.h>		#include <string.h>

namespace __scudo {		namespace __scudo {

// Global static cookie, initialized at start-up.		// Global static cookie, initialized at start-up.
static uptr Cookie;		static uptr Cookie;

// We default to software CRC32 if the alternatives are not supported, either		// We default to software CRC32 if the alternatives are not supported, either
// at compilation or at runtime.		// at compilation or at runtime.
static atomic_uint8_t HashAlgorithm = { CRC32Software };		static atomic_uint8_t HashAlgorithm = { CRC32Software };

SANITIZER_WEAK_ATTRIBUTE u32 computeHardwareCRC32(u32 Crc, uptr Data);		INLINE u32 computeCRC32(uptr Crc, uptr Value, uptr *Array, uptr ArraySize) {
		// If the hardware CRC32 feature is defined here, it was enabled everywhere,
INLINE u32 computeCRC32(u32 Crc, uptr Data, u8 HashType) {		// as opposed to only for scudo_crc32.cpp. This means that other hardware
dvyukovUnsubmitted Not Done Reply Inline Actions It looked nice to have a separate function that wraps all of this logic. Why do you inline it? I would just move the load of HashAlgorithm into this function so that it does not happen when not needed. dvyukov: It looked nice to have a separate function that wraps all of this logic. Why do you inline it?
cryptoadAuthorUnsubmitted Not Done Reply Inline Actions Indeed, I considered this as well. The other advantage of doing it the new way was to have the `crc32` instructions inlined as opposed to be calls (which is another part of the performance gain), which required me to use the intrinsic within the loop. After duplicating the loop with intrinsic, I didn't see a need to keep that function. cryptoad: Indeed, I considered this as well. The other advantage of doing it the new way was to have the…
dvyukovUnsubmitted Not Done Reply Inline Actions Do you can have intrinsics in the inlinable computeCRC32 function. It should lead to the same assembly. With intrinsics it becomes more complex, so there is even more reason to have a separate function for it. Consider that we want to calculate crc in a another place. dvyukov: Do you can have intrinsics in the inlinable computeCRC32 function. It should lead to the same…
cryptoadAuthorUnsubmitted Not Done Reply Inline Actions So something like: INLINE u32 computeCRC32(u32 Crc, uptr Data, u8 HashType) { #if defined(__SSE4_2__) \|\| defined(__ARM_FEATURE_CRC32) return CRC32_INTRINSIC(Crc, Data); #else if (atomic_load_relaxed(&HashAlgorithm) == CRC32Hardware) return computeHardwareCRC32(Crc, Data); return computeSoftwareCRC32(Crc, Data); #endif } cryptoad: So something like: ``` INLINE u32 computeCRC32(u32 Crc, uptr Data, u8 HashType) { #if defined…
cryptoadAuthorUnsubmitted Not Done Reply Inline Actions (without the 3rd param) cryptoad: (without the 3rd param)
dvyukovUnsubmitted Not Done Reply Inline Actions yup dvyukov: yup
cryptoadAuthorUnsubmitted Not Done Reply Inline Actions Just gave it a try, for the non-full-SSE the loop is unrolled but with a load & check of the algorithm at each iteration (2 for 64-bit, 3 for 32-bit): mov rax, cs:_ZN7__scudoL6CookieE ; __scudo::Cookie mov dl, cs:_ZN7__scudoL13HashAlgorithmE ; unsigned __int64 cmp dl, 1 jnz short loc_884B mov edi, eax ; this mov rsi, rcx ; unsigned int call _ZN7__scudo20computeHardwareCRC32Ejm ; __scudo::computeHardwareCRC32(uint,ulong) jmp loc_88DB loc_884B: ; CODE XREF: __scudo::ScudoChunk::computeChecksum(__scudo::UnpackedHeader )+1Aj ... software crc32 ... loc_88DB: ; CODE XREF: __scudo::ScudoChunk::computeChecksum(__scudo::UnpackedHeader )+26j mov cl, cs:_ZN7__scudoL13HashAlgorithmE ; __scudo::HashAlgorithm cmp cl, 1 jnz short loc_88FC and r14, 0FFFFFFFFFFFF0000h mov edi, eax ; this mov rsi, r14 ; unsigned int call _ZN7__scudo20computeHardwareCRC32Ejm ; __scudo::computeHardwareCRC32(uint,ulong) jmp loc_8982 loc_88FC: ; CODE XREF: __scudo::ScudoChunk::computeChecksum(__scudo::UnpackedHeader )+C4j ... software crc32 ... It seems somewhat messier. What do you think? cryptoad:* Just gave it a try, for the non-full-SSE the loop is unrolled but with a load & check of the…
dvyukovUnsubmitted Not Done Reply Inline Actions I missed that this function consumes a single word rather than an array. I basically mean a function which you give initial crc value and an array of data and it gives you new crc value using the most efficient method available whatever it is. dvyukov: I missed that this function consumes a single word rather than an array. I basically mean a…
// If SSE4.2 is defined here, it was enabled everywhere, as opposed to only		// specific instructions were likely emitted at other places, and as a
// for scudo_crc32.cpp. This means that other SSE instructions were likely		// result there is no reason to not use it here.
// emitted at other places, and as a result there is no reason to not use
// the hardware version of the CRC32.
#if defined(__SSE4_2__) \|\| defined(__ARM_FEATURE_CRC32)		#if defined(__SSE4_2__) \|\| defined(__ARM_FEATURE_CRC32)
return computeHardwareCRC32(Crc, Data);		Crc = CRC32_INTRINSIC(Crc, Value);
		for (uptr i = 0; i < ArraySize; i++)
		Crc = CRC32_INTRINSIC(Crc, Array[i]);
		return Crc;
#else		#else
if (computeHardwareCRC32 && HashType == CRC32Hardware)		if (atomic_load_relaxed(&HashAlgorithm) == CRC32Hardware) {
return computeHardwareCRC32(Crc, Data);		Crc = computeHardwareCRC32(Crc, Value);
else		for (uptr i = 0; i < ArraySize; i++)
return computeSoftwareCRC32(Crc, Data);		Crc = computeHardwareCRC32(Crc, Array[i]);
#endif // defined(__SSE4_2__)		return Crc;
		}
		Crc = computeSoftwareCRC32(Crc, Value);
		for (uptr i = 0; i < ArraySize; i++)
		Crc = computeSoftwareCRC32(Crc, Array[i]);
		return Crc;
		#endif // defined(__SSE4_2__) \|\| defined(__ARM_FEATURE_CRC32)
}		}
		alekseyshlUnsubmitted Not Done Reply Inline Actions Now it feels logical to move this entire function along with HashAlgorithm var into scudo_crs32.cpp, but, I guess, not inlining it affects performance, right? alekseyshl: Now it feels logical to move this entire function along with HashAlgorithm var into scudo_crs32.
		cryptoadAuthorUnsubmitted Not Done Reply Inline Actions Yes. And also I would lose the distinction of whether `__SSE4_2__` is defined everywhere (eg: in scudo_allocator.cpp) or only in scudo_crc32.cpp. cryptoad: Yes. And also I would lose the distinction of whether `__SSE4_2__` is defined everywhere (eg…

static ScudoBackendAllocator &getBackendAllocator();		static ScudoBackendAllocator &getBackendAllocator();

struct ScudoChunk : UnpackedHeader {		struct ScudoChunk : UnpackedHeader {
// We can't use the offset member of the chunk itself, as we would double		// We can't use the offset member of the chunk itself, as we would double
// fetch it without any warranty that it wouldn't have been tampered. To		// fetch it without any warranty that it wouldn't have been tampered. To
// prevent this, we work with a local copy of the header.		// prevent this, we work with a local copy of the header.
void getAllocBeg(UnpackedHeader Header) {		void getAllocBeg(UnpackedHeader Header) {
Show All 12 Lines	struct ScudoChunk : UnpackedHeader {
}		}

// Compute the checksum of the Chunk pointer and its ChunkHeader.		// Compute the checksum of the Chunk pointer and its ChunkHeader.
u16 computeChecksum(UnpackedHeader *Header) const {		u16 computeChecksum(UnpackedHeader *Header) const {
UnpackedHeader ZeroChecksumHeader = *Header;		UnpackedHeader ZeroChecksumHeader = *Header;
ZeroChecksumHeader.Checksum = 0;		ZeroChecksumHeader.Checksum = 0;
uptr HeaderHolder[sizeof(UnpackedHeader) / sizeof(uptr)];		uptr HeaderHolder[sizeof(UnpackedHeader) / sizeof(uptr)];
memcpy(&HeaderHolder, &ZeroChecksumHeader, sizeof(HeaderHolder));		memcpy(&HeaderHolder, &ZeroChecksumHeader, sizeof(HeaderHolder));
u8 HashType = atomic_load_relaxed(&HashAlgorithm);		u32 Crc = computeCRC32(Cookie, reinterpret_cast<uptr>(this), HeaderHolder,
u32 Crc = computeCRC32(Cookie, reinterpret_cast<uptr>(this), HashType);		ARRAY_SIZE(HeaderHolder));
		alekseyshlUnsubmitted Not Done Reply Inline Actions Would it make sense to check algorithm once and duplicate the code here, instead of comparing on every loop iteration? More code, but since we're after the performance... if (HashType == CRC32Hardware) { Crc = computeHardwareCRC32(Cookie, reinterpret_cast<uptr>(this)); for (uptr i = 0; i < ARRAY_SIZE(HeaderHolder); i++) Crc = computeHardwareCRC32(Crc, HeaderHolder[i]); } else { Crc = computeSoftwareCRC32(Cookie, reinterpret_cast<uptr>(this)); for (uptr i = 0; i < ARRAY_SIZE(HeaderHolder); i++) Crc = computeSoftwareCRC32(Crc, HeaderHolder[i]); } alekseyshl: Would it make sense to check algorithm once and duplicate the code here, instead of comparing…
		cryptoadAuthorUnsubmitted Not Done Reply Inline Actions I gave this a try, with -O3, the generated code for `computeChecksum` ends up being identical with the initial version of the patch. cryptoad: I gave this a try, with -O3, the generated code for `computeChecksum` ends up being identical…
for (uptr i = 0; i < ARRAY_SIZE(HeaderHolder); i++)
Crc = computeCRC32(Crc, HeaderHolder[i], HashType);
return static_cast<u16>(Crc);		return static_cast<u16>(Crc);
}		}

// Checks the validity of a chunk by verifying its checksum. It doesn't		// Checks the validity of a chunk by verifying its checksum. It doesn't
// incur termination in the event of an invalid chunk.		// incur termination in the event of an invalid chunk.
bool isValid() {		bool isValid() {
UnpackedHeader NewUnpackedHeader;		UnpackedHeader NewUnpackedHeader;
const AtomicPackedHeader *AtomicHeader =		const AtomicPackedHeader *AtomicHeader =
▲ Show 20 Lines • Show All 97 Lines • ▼ Show 20 Lines

static bool ScudoInitIsRunning = false;		static bool ScudoInitIsRunning = false;

void initScudo() {		void initScudo() {
SanitizerToolName = "Scudo";		SanitizerToolName = "Scudo";
CHECK(!ScudoInitIsRunning && "Scudo init calls itself!");		CHECK(!ScudoInitIsRunning && "Scudo init calls itself!");
ScudoInitIsRunning = true;		ScudoInitIsRunning = true;

// Check is SSE4.2 is supported, if so, opt for the CRC32 hardware version.		// Check if hardware CRC32 is supported in the binary and by the platform, if
if (testCPUFeature(CRC32CPUFeature)) {		// so, opt for the CRC32 hardware version of the checksum.
		if (computeHardwareCRC32 && testCPUFeature(CRC32CPUFeature))
atomic_store_relaxed(&HashAlgorithm, CRC32Hardware);		atomic_store_relaxed(&HashAlgorithm, CRC32Hardware);
}

initFlags();		initFlags();

AllocatorOptions Options;		AllocatorOptions Options;
Options.setFrom(getFlags(), common_flags());		Options.setFrom(getFlags(), common_flags());
initScudoInternal(Options);		initScudoInternal(Options);

// TODO(kostyak): determine if MaybeStartBackgroudThread could be of some use.		// TODO(kostyak): determine if MaybeStartBackgroudThread could be of some use.
Show All 13 Lines	if (UNLIKELY(Header.State != ChunkQuarantine)) {
dieWithMessage("ERROR: invalid chunk state when recycling address %p\n",		dieWithMessage("ERROR: invalid chunk state when recycling address %p\n",
Chunk);		Chunk);
}		}
Chunk->eraseHeader();		Chunk->eraseHeader();
void *Ptr = Chunk->getAllocBeg(&Header);		void *Ptr = Chunk->getAllocBeg(&Header);
getBackendAllocator().Deallocate(Cache_, Ptr);		getBackendAllocator().Deallocate(Cache_, Ptr);
}		}

/// Internal quarantine allocation and deallocation functions.		// Internal quarantine allocation and deallocation functions.
void *Allocate(uptr Size) {		void *Allocate(uptr Size) {
// TODO(kostyak): figure out the best way to protect the batches.		// TODO(kostyak): figure out the best way to protect the batches.
return getBackendAllocator().Allocate(Cache_, Size, MinAlignment);		return getBackendAllocator().Allocate(Cache_, Size, MinAlignment);
}		}

void Deallocate(void *Ptr) {		void Deallocate(void *Ptr) {
getBackendAllocator().Deallocate(Cache_, Ptr);		getBackendAllocator().Deallocate(Cache_, Ptr);
}		}
▲ Show 20 Lines • Show All 469 Lines • Show Last 20 Lines

lib/scudo/scudo_crc32.h

This file was added.

				//===-- scudo_crc32.h -------------------------------------------- C++ --===//
				//
				// The LLVM Compiler Infrastructure
				//
				// This file is distributed under the University of Illinois Open Source
				// License. See LICENSE.TXT for details.
				//
				//===----------------------------------------------------------------------===//
				///
				/// Scudo chunk header checksum related definitions.
				///
				//===----------------------------------------------------------------------===//

				#ifndef SCUDO_CRC32_H_
				#define SCUDO_CRC32_H_

				#include "sanitizer_common/sanitizer_internal_defs.h"

				// Hardware CRC32 is supported at compilation via the following:
				// - for i386 & x86_64: -msse4.2
				// - for ARM & AArch64: -march=armv8-a+crc or -mcrc
				// An additional check must be performed at runtime as well to make sure the
				// emitted instructions are valid on the target host.

				#if defined(__SSE4_2__) \|\| defined(__ARM_FEATURE_CRC32)
				# ifdef __SSE4_2__
				# include <smmintrin.h>
				# define CRC32_INTRINSIC FIRST_32_SECOND_64(_mm_crc32_u32, _mm_crc32_u64)
				# endif
				# ifdef __ARM_FEATURE_CRC32
				# include <arm_acle.h>
				# define CRC32_INTRINSIC FIRST_32_SECOND_64(__crc32cw, __crc32cd)
				# endif
				#endif // defined(__SSE4_2__) \|\| defined(__ARM_FEATURE_CRC32)

				namespace __scudo {

				enum : u8 {
				CRC32Software = 0,
				CRC32Hardware = 1,
				};

				const static u32 CRC32Table[] = {
				0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f,
				0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988,
				0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2,
				0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
				0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9,
				0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172,
				0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c,
				0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
				0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423,
				0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
				0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, 0x01db7106,
				0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
				0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d,
				0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
				0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950,
				0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
				0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7,
				0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0,
				0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa,
				0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
				0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81,
				0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a,
				0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84,
				0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
				0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
				0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc,
				0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e,
				0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
				0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55,
				0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
				0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28,
				0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
				0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f,
				0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38,
				0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242,
				0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
				0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69,
				0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2,
				0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc,
				0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
				0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693,
				0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
				0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d
				};

				INLINE u32 computeSoftwareCRC32(u32 Crc, uptr Data) {
				for (uptr i = 0; i < sizeof(Data); i++) {
				Crc = CRC32Table[(Crc ^ Data) & 0xff] ^ (Crc >> 8);
				Data >>= 8;
				}
				return Crc;
				}

				SANITIZER_WEAK_ATTRIBUTE u32 computeHardwareCRC32(u32 Crc, uptr Data);

				} // namespace __scudo

				#endif // SCUDO_CRC32_H_

lib/scudo/scudo_crc32.cpp

	//===-- scudo_crc32.cpp ------------------------------------------ C++ --===//			//===-- scudo_crc32.cpp ------------------------------------------ C++ --===//
	//			//
	// The LLVM Compiler Infrastructure			// The LLVM Compiler Infrastructure
	//			//
	// This file is distributed under the University of Illinois Open Source			// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.			// License. See LICENSE.TXT for details.
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	///			///
	/// CRC32 function leveraging hardware specific instructions. This has to be			/// CRC32 function leveraging hardware specific instructions. This has to be
	/// kept separated to restrict the use of compiler specific flags to this file.			/// kept separated to restrict the use of compiler specific flags to this file.
	///			///
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	#include "sanitizer_common/sanitizer_internal_defs.h"			#include "scudo_crc32.h"

	// Hardware CRC32 is supported at compilation via the following:
	// - for i386 & x86_64: -msse4.2
	// - for ARM & AArch64: -march=armv8-a+crc or -mcrc
	// An additional check must be performed at runtime as well to make sure the
	// emitted instructions are valid on the target host.

	#if defined(__SSE4_2__) \|\| defined(__ARM_FEATURE_CRC32)
	# ifdef __SSE4_2__
	# include <smmintrin.h>
	# define CRC32_INTRINSIC FIRST_32_SECOND_64(_mm_crc32_u32, _mm_crc32_u64)
	# endif
	# ifdef __ARM_FEATURE_CRC32
	# include <arm_acle.h>
	# define CRC32_INTRINSIC FIRST_32_SECOND_64(__crc32cw, __crc32cd)
	# endif
	#endif // defined(__SSE4_2__) \|\| defined(__ARM_FEATURE_CRC32)

	namespace __scudo {			namespace __scudo {

	#if defined(__SSE4_2__) \|\| defined(__ARM_FEATURE_CRC32)			#if defined(__SSE4_2__) \|\| defined(__ARM_FEATURE_CRC32)
	u32 computeHardwareCRC32(u32 Crc, uptr Data) {			u32 computeHardwareCRC32(u32 Crc, uptr Data) {
	return CRC32_INTRINSIC(Crc, Data);			return CRC32_INTRINSIC(Crc, Data);
	}			}
	#endif // defined(__SSE4_2__) \|\| defined(__ARM_FEATURE_CRC32)			#endif // defined(__SSE4_2__) \|\| defined(__ARM_FEATURE_CRC32)
				alekseyshlUnsubmitted Not Done Reply Inline Actions Please remind me, does it mean that computeHardwareCRC32 != 0 only when defined(__SSE4_2__) \|\| defined(__ARM_FEATURE_CRC32)? Or is it expected to come from other libraries too? alekseyshl: Please remind me, does it mean that computeHardwareCRC32 != 0 only when defined(__SSE4_2__) \|\|…
				cryptoadAuthorUnsubmitted Not Done Reply Inline Actions computeHardwareCRC32 != 0 if `defined(__SSE4_2__) \|\| defined(__ARM_FEATURE_CRC32)`, but it could also be defined by an external library, even though this is not the expectation. With full RELRO (`-Wl,-z,relro,-z,now`), if not defined at load, it can't be defined later. cryptoad: computeHardwareCRC32 != 0 if `defined(__SSE4_2__) \|\| defined(__ARM_FEATURE_CRC32)`, but it…
				alekseyshlUnsubmitted Not Done Reply Inline Actions I see. I'm asking because in the current version of the code the same condition switches the top level code to CRC32_INTRINSIC, completely ignoring this function. I was curious if it is even necessary now. It just does not add up. Even for the possible future other uses of computeCRC32, for the performance reasons we have to ifdef to INTRINSIC calls at the call site, which renders computeHardwareCRC32 useless, right? alekseyshl: I see. I'm asking because in the current version of the code the same condition switches the…
				cryptoadAuthorUnsubmitted Not Done Reply Inline Actions I am going to try and clarify the situation. Let me know if something still doesn't make sense after! Compilation wise, there are 3 cases: #1: `-msse4.2` (or ARM equivalent) is enabled for the whole project (eg: google3), instructions will be emitted accordingly, and we will only use hardware `crc32` (no weak function lookup, only intrinsics); #2: `-msse4.2` (or ARM equivalent) is enabled for scudo_crc32.cpp only (eg: current cmake config if supported): SSE instructions are only emitted in this file, `computeHardwareCRC32` is defined, and can be used at runtime; no SSE instructions will be emitted anywhere else; this allows runtime detection and to have a HW enabled version that also runs on old hardware; #3: `-msse4.2` (or ARM equivalent) is not enabled (eg: current cmake config if not supported): `computeHardwareCRC32` is not defined. Runtime wise, if the processor supports HW CRC32, then #1 or #2 will leverage it. If it doesn't, #1 will crash on illegal instruction, #2 will still work. #3 works in any case. This patch will make #1 a lot cleaner ASM wise due to the call inline and the removal of the unnecessary load of `HashType`, #2 & #3 somewhat better due to the check removal for `computeHardwareCRC32` at each iteration. So I think your point about the top level code completely ignoring it refers to #1, but we still need it for #2. cryptoad: I am going to try and clarify the situation. Let me know if something still doesn't make sense…
				alekseyshlUnsubmitted Not Done Reply Inline Actions Ah, now I remember the original reasoning behind this separation. Mentioning that in comments helps a lot. Thanks! alekseyshl: Ah, now I remember the original reasoning behind this separation. Mentioning that in comments…
				cryptoadAuthorUnsubmitted Not Done Reply Inline Actions Anytime! cryptoad: Anytime!

	} // namespace __scudo			} // namespace __scudo

lib/scudo/scudo_utils.h

Show First 20 Lines • Show All 47 Lines • ▼ Show 20 Lines	u64 getNext() {
x ^= x << 23;		x ^= x << 23;
State[1] = x ^ y ^ (x >> 17) ^ (y >> 26);		State[1] = x ^ y ^ (x >> 17) ^ (y >> 26);
return State[1] + y;		return State[1] + y;
}		}
private:		private:
u64 State[2];		u64 State[2];
};		};

// Selectors naming the two CRC32 implementations; the underlying type is u8.
// NOTE(review): presumably this is the value kept in the allocator's
// HashAlgorithm setting and chosen once at initialization -- confirm against
// scudo_allocator.cpp.
enum : u8 {
// Presumably selects the table-driven computeSoftwareCRC32 path.
CRC32Software = 0,
// Presumably selects the computeHardwareCRC32 / CPU-instruction path.
CRC32Hardware = 1,
};

// Byte-indexed lookup table (256 entries, one per possible byte value) used by
// computeSoftwareCRC32. Entry 0x80 is 0xedb88320, the bit-reflected form of
// the standard CRC-32 polynomial 0x04c11db7.
// NOTE(review): the SSE4.2 crc32 instruction and the ARM __crc32c* intrinsics
// compute CRC-32C (a different polynomial), so software and hardware checksums
// of the same input are not expected to match -- confirm callers never compare
// values produced by the two paths.
const static u32 CRC32Table[] = {
0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f,
0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988,
0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2,
0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9,
0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172,
0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c,
0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423,
0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, 0x01db7106,
0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d,
0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950,
0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7,
0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0,
0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa,
0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81,
0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a,
0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84,
0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc,
0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e,
0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55,
0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28,
0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f,
0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38,
0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242,
0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69,
0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2,
0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc,
0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693,
0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d
};

// Table-driven CRC32 update: folds every byte of Data into Crc, starting with
// the least significant byte, and returns the updated value. Exactly
// sizeof(Data) bytes are consumed. No initial or final inversion is applied
// here; the caller supplies the starting Crc.
INLINE u32 computeSoftwareCRC32(u32 Crc, uptr Data) {
  uptr BytesLeft = sizeof(Data);
  while (BytesLeft != 0) {
    // The low byte of (Crc ^ Data) selects the table entry for this step.
    const uptr TableIndex = (Crc ^ Data) & 0xff;
    Crc = (Crc >> 8) ^ CRC32Table[TableIndex];
    Data >>= 8;
    --BytesLeft;
  }
  return Crc;
}

} // namespace __scudo		} // namespace __scudo

#endif // SCUDO_UTILS_H_		#endif // SCUDO_UTILS_H_

This is an archive of the discontinued LLVM Phabricator instance.

[scudo] CRC32 optimizationsClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 98217

lib/scudo/scudo_allocator.cpp

lib/scudo/scudo_crc32.h

lib/scudo/scudo_crc32.cpp

lib/scudo/scudo_utils.h

[scudo] CRC32 optimizations
ClosedPublic