Index: cmake/config-ix.cmake
===================================================================
--- cmake/config-ix.cmake
+++ cmake/config-ix.cmake
@@ -160,7 +160,7 @@
 set(ALL_SAFESTACK_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM64} ${MIPS32} ${MIPS64})
 set(ALL_CFI_SUPPORTED_ARCH ${X86} ${X86_64} ${MIPS64})
 set(ALL_ESAN_SUPPORTED_ARCH ${X86_64} ${MIPS64})
-set(ALL_SCUDO_SUPPORTED_ARCH ${X86_64})
+set(ALL_SCUDO_SUPPORTED_ARCH ${X86} ${X86_64})
 set(ALL_XRAY_SUPPORTED_ARCH ${X86_64} ${ARM32})
 
 if(APPLE)
@@ -383,8 +383,7 @@
     ${ALL_SAFESTACK_SUPPORTED_ARCH})
   filter_available_targets(CFI_SUPPORTED_ARCH ${ALL_CFI_SUPPORTED_ARCH})
   filter_available_targets(ESAN_SUPPORTED_ARCH ${ALL_ESAN_SUPPORTED_ARCH})
-  filter_available_targets(SCUDO_SUPPORTED_ARCH
-    ${ALL_SCUDO_SUPPORTED_ARCH})
+  filter_available_targets(SCUDO_SUPPORTED_ARCH ${ALL_SCUDO_SUPPORTED_ARCH})
   filter_available_targets(XRAY_SUPPORTED_ARCH ${ALL_XRAY_SUPPORTED_ARCH})
 endif()
 
Index: lib/scudo/CMakeLists.txt
===================================================================
--- lib/scudo/CMakeLists.txt
+++ lib/scudo/CMakeLists.txt
@@ -4,7 +4,7 @@
 
 set(SCUDO_CFLAGS ${SANITIZER_COMMON_CFLAGS})
 append_rtti_flag(OFF SCUDO_CFLAGS)
-list(APPEND SCUDO_CFLAGS -msse4.2 -mcx16)
+list(APPEND SCUDO_CFLAGS -msse4.2)
 
 set(SCUDO_SOURCES
   scudo_allocator.cpp
Index: lib/scudo/scudo_allocator.h
===================================================================
--- lib/scudo/scudo_allocator.h
+++ lib/scudo/scudo_allocator.h
@@ -14,10 +14,6 @@
 #ifndef SCUDO_ALLOCATOR_H_
 #define SCUDO_ALLOCATOR_H_
 
-#ifndef __x86_64__
-# error "The Scudo hardened allocator currently only supports x86_64."
-#endif
-
 #include "scudo_flags.h"
 
 #include "sanitizer_common/sanitizer_allocator.h"
@@ -39,57 +35,38 @@
   ChunkQuarantine = 2
 };
 
-#if SANITIZER_WORDSIZE == 64
-// Our header requires 128 bits of storage on 64-bit platforms, which fits
-// nicely with the alignment requirements. Having the offset saves us from
+// Our header requires 64 bits of storage. Having the offset saves us from
 // using functions such as GetBlockBegin, that is fairly costly. Our first
 // implementation used the MetaData as well, which offers the advantage of
 // being stored away from the chunk itself, but accessing it was costly as
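-// well. The header will be atomically loaded and stored using the 16-byte
-// primitives offered by the platform (likely requires cmpxchg16b support).
+// well. The header will be atomically loaded and stored as a single 64-bit
+// value using the 8-byte atomic primitives offered by the platform.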
-typedef unsigned __int128 PackedHeader;
-struct UnpackedHeader {
-  u16  Checksum      : 16;
-  uptr RequestedSize : 40; // Needed for reallocation purposes.
-  u8   State         : 2;  // available, allocated, or quarantined
-  u8   AllocType     : 2;  // malloc, new, new[], or memalign
-  u8   Unused_0_     : 4;
-  uptr Offset        : 12; // Offset from the beginning of the backend
-                           // allocation to the beginning of the chunk itself,
-                           // in multiples of MinAlignment. See comment about
-                           // its maximum value and test in init().
-  u64  Unused_1_     : 36;
-  u16  Salt          : 16;
-};
-#elif SANITIZER_WORDSIZE == 32
-// On 32-bit platforms, our header requires 64 bits.
 typedef u64 PackedHeader;
 struct UnpackedHeader {
-  u16  Checksum      : 12;
-  uptr RequestedSize : 32; // Needed for reallocation purposes.
-  u8   State         : 2;  // available, allocated, or quarantined
-  u8   AllocType     : 2;  // malloc, new, new[], or memalign
-  uptr Offset        : 12; // Offset from the beginning of the backend
-                           // allocation to the beginning of the chunk itself,
-                           // in multiples of MinAlignment. See comment about
-                           // its maximum value and test in Allocator::init().
-  u16  Salt          : 4;
+  u64 Checksum    : 16;
+  u64 UnusedBytes : 24; // Needed for reallocation purposes.
+  u64 State       : 2;  // available, allocated, or quarantined
+  u64 AllocType   : 2;  // malloc, new, new[], or memalign
+  u64 Offset      : 12; // Offset from the beginning of the backend
+                        // allocation to the beginning of the chunk itself,
+                        // in multiples of MinAlignment. See comment about
+                        // its maximum value and test in init().
+  u64 Salt        : 8;
 };
-#else
-# error "Unsupported SANITIZER_WORDSIZE."
-#endif  // SANITIZER_WORDSIZE
 
 typedef std::atomic<PackedHeader> AtomicPackedHeader;
 COMPILER_CHECK(sizeof(UnpackedHeader) == sizeof(PackedHeader));
 
-const uptr ChunkHeaderSize = sizeof(PackedHeader);
-
 // Minimum alignment of 8 bytes for 32-bit, 16 for 64-bit
 const uptr MinAlignmentLog = FIRST_32_SECOND_64(3, 4);
 const uptr MaxAlignmentLog = 24; // 16 MB
 const uptr MinAlignment = 1 << MinAlignmentLog;
 const uptr MaxAlignment = 1 << MaxAlignmentLog;
 
+const uptr ChunkHeaderSize = sizeof(PackedHeader);
+const uptr AlignedChunkHeaderSize =
+    (ChunkHeaderSize + MinAlignment - 1) & ~(MinAlignment - 1);
+
 struct AllocatorOptions {
   u32 QuarantineSizeMb;
   u32 ThreadLocalQuarantineSizeKb;
@@ -119,6 +96,6 @@
 
 #include "scudo_allocator_secondary.h"
 
-} // namespace __scudo
+}  // namespace __scudo
 
 #endif  // SCUDO_ALLOCATOR_H_
Index: lib/scudo/scudo_allocator.cpp
===================================================================
--- lib/scudo/scudo_allocator.cpp
+++ lib/scudo/scudo_allocator.cpp
@@ -28,17 +28,33 @@
 
 namespace __scudo {
 
+#if SANITIZER_CAN_USE_ALLOCATOR64
+const uptr AllocatorSpace = ~0ULL;
+const uptr AllocatorSize = 0x40000000000ULL;
+typedef DefaultSizeClassMap SizeClassMap;
 struct AP {
-  static const uptr kSpaceBeg = ~0ULL;
-  static const uptr kSpaceSize = 0x10000000000ULL;
+  static const uptr kSpaceBeg = AllocatorSpace;
+  static const uptr kSpaceSize = AllocatorSize;
   static const uptr kMetadataSize = 0;
-  typedef DefaultSizeClassMap SizeClassMap;
+  typedef __scudo::SizeClassMap SizeClassMap;
   typedef NoOpMapUnmapCallback MapUnmapCallback;
   static const uptr kFlags =
       SizeClassAllocator64FlagMasks::kRandomShuffleChunks;
 };
-
 typedef SizeClassAllocator64<AP> PrimaryAllocator;
+#else
+static const uptr RegionSizeLog = 20;
+static const uptr NumRegions = SANITIZER_MMAP_RANGE_SIZE >> RegionSizeLog;
+# if SANITIZER_WORDSIZE == 32
+typedef FlatByteMap<NumRegions> ByteMap;
+# elif SANITIZER_WORDSIZE == 64
+typedef TwoLevelByteMap<(NumRegions >> 12), 1 << 12> ByteMap;
+# endif  // SANITIZER_WORDSIZE
+typedef SizeClassMap<3, 4, 8, 16, 64, 14> SizeClassMap;
+typedef SizeClassAllocator32<0, SANITIZER_MMAP_RANGE_SIZE, 0, SizeClassMap,
+    RegionSizeLog, ByteMap> PrimaryAllocator;
+#endif  // SANITIZER_CAN_USE_ALLOCATOR64
+
 typedef SizeClassAllocatorLocalCache<PrimaryAllocator> AllocatorCache;
 typedef ScudoLargeMmapAllocator SecondaryAllocator;
 typedef CombinedAllocator<PrimaryAllocator, AllocatorCache, SecondaryAllocator>
@@ -48,7 +64,13 @@
 
 static thread_local Xorshift128Plus Prng;
 // Global static cookie, initialized at start-up.
-static u64 Cookie;
+static uptr Cookie;
+
+enum CRC32AlgorithmType : u8 {
+  CRC32Software = 0,
+  CRC32Hardware = 1,
+};
+static atomic_uint8_t CRC32Algorithm = { CRC32Software };
 
 struct ScudoChunk : UnpackedHeader {
   // We can't use the offset member of the chunk itself, as we would double
@@ -59,39 +81,74 @@
         reinterpret_cast<uptr>(this) - (Header->Offset << MinAlignmentLog));
   }
 
+  // Returns the usable size for a chunk, meaning the amount of bytes from the
+  // beginning of the user data to the end of the backend allocated chunk.
+  uptr getUsableSize(UnpackedHeader *Header) {
+    uptr Size = getAllocator().GetActuallyAllocatedSize(getAllocBeg(Header));
+    if (Size == 0)
+      return Size;
+    return Size - AlignedChunkHeaderSize - (Header->Offset << MinAlignmentLog);
+  }
+
   // CRC32 checksum of the Chunk pointer and its ChunkHeader.
-  // It currently uses the Intel Nehalem SSE4.2 crc32 64-bit instruction.
+  // This uses the Intel Nehalem SSE4.2 crc32 instruction if available.
   u16 computeChecksum(UnpackedHeader *Header) const {
-    u64 HeaderHolder[2];
-    memcpy(HeaderHolder, Header, sizeof(HeaderHolder));
-    u64 Crc = _mm_crc32_u64(Cookie, reinterpret_cast<uptr>(this));
-    // This is somewhat of a shortcut. The checksum is stored in the 16 least
-    // significant bits of the first 8 bytes of the header, hence zero-ing
-    // those bits out. It would be more valid to zero the checksum field of the
-    // UnpackedHeader, but would require holding an additional copy of it.
-    Crc = _mm_crc32_u64(Crc, HeaderHolder[0] & 0xffffffffffff0000ULL);
-    Crc = _mm_crc32_u64(Crc, HeaderHolder[1]);
+    UnpackedHeader ZeroChecksumHeader = *Header;
+    ZeroChecksumHeader.Checksum = 0;
+    uptr Crc;
+#if SANITIZER_WORDSIZE == 64
+    u64 HeaderHolder;
+    memcpy(&HeaderHolder, &ZeroChecksumHeader, sizeof(HeaderHolder));
+    if (atomic_load_relaxed(&CRC32Algorithm) == CRC32Hardware) {
+      Crc = _mm_crc32_u64(Cookie, reinterpret_cast<uptr>(this));
+      Crc = _mm_crc32_u64(Crc, HeaderHolder);
+    } else {
+      Crc = doCRC32u64(Cookie, reinterpret_cast<uptr>(this));
+      Crc = doCRC32u64(Crc, HeaderHolder);
+    }
+#elif SANITIZER_WORDSIZE == 32
+    u32 HeaderHolder[2];
+    memcpy(&HeaderHolder, &ZeroChecksumHeader, sizeof(HeaderHolder));
+    if (atomic_load_relaxed(&CRC32Algorithm) == CRC32Hardware) {
+      Crc = _mm_crc32_u32(Cookie, reinterpret_cast<uptr>(this));
+      Crc = _mm_crc32_u32(Crc, HeaderHolder[0]);
+      Crc = _mm_crc32_u32(Crc, HeaderHolder[1]);
+    } else {
+      Crc = doCRC32u32(Cookie, reinterpret_cast<uptr>(this));
+      Crc = doCRC32u32(Crc, HeaderHolder[0]);
+      Crc = doCRC32u32(Crc, HeaderHolder[1]);
+    }
+#endif  // SANITIZER_WORDSIZE
     return static_cast<u16>(Crc);
   }
 
+  // Checks the validity of a chunk by verifying its checksum.
+  bool isValid() {
+    UnpackedHeader NewUnpackedHeader;
+    const AtomicPackedHeader *AtomicHeader =
+        reinterpret_cast<const AtomicPackedHeader *>(this);
+    PackedHeader NewPackedHeader =
+        AtomicHeader->load(std::memory_order_relaxed);
+    NewUnpackedHeader = bit_cast<UnpackedHeader>(NewPackedHeader);
+    return (NewUnpackedHeader.Checksum == computeChecksum(&NewUnpackedHeader));
+  }
+
   // Loads and unpacks the header, verifying the checksum in the process.
-  void loadHeader(UnpackedHeader *NewUnpackedHeader) const {
+  void loadHeader(UnpackedHeader *Header) const {
     const AtomicPackedHeader *AtomicHeader =
         reinterpret_cast<const AtomicPackedHeader *>(this);
     PackedHeader NewPackedHeader =
         AtomicHeader->load(std::memory_order_relaxed);
-    *NewUnpackedHeader = bit_cast<UnpackedHeader>(NewPackedHeader);
-    if ((NewUnpackedHeader->Unused_0_ != 0) ||
-        (NewUnpackedHeader->Unused_1_ != 0) ||
-        (NewUnpackedHeader->Checksum != computeChecksum(NewUnpackedHeader))) {
+    *Header = bit_cast<UnpackedHeader>(NewPackedHeader);
+    if (Header->Checksum != computeChecksum(Header)) {
       dieWithMessage("ERROR: corrupted chunk header at address %p\n", this);
     }
   }
 
   // Packs and stores the header, computing the checksum in the process.
-  void storeHeader(UnpackedHeader *NewUnpackedHeader) {
-    NewUnpackedHeader->Checksum = computeChecksum(NewUnpackedHeader);
-    PackedHeader NewPackedHeader = bit_cast<PackedHeader>(*NewUnpackedHeader);
+  void storeHeader(UnpackedHeader *Header) {
+    Header->Checksum = computeChecksum(Header);
+    PackedHeader NewPackedHeader = bit_cast<PackedHeader>(*Header);
     AtomicPackedHeader *AtomicHeader =
         reinterpret_cast<AtomicPackedHeader *>(this);
     AtomicHeader->store(NewPackedHeader, std::memory_order_relaxed);
@@ -146,6 +203,11 @@
   CHECK(!ScudoInitIsRunning && "Scudo init calls itself!");
   ScudoInitIsRunning = true;
 
+  // Check if SSE4.2 is supported, if so, opt for the CRC32 hardware version.
+  if (testCPUFeature(CRC32CPUFeature)) {
+    atomic_store_relaxed(&CRC32Algorithm, CRC32Hardware);
+  }
+
   initFlags();
 
   AllocatorOptions Options;
@@ -251,9 +313,6 @@
       FallbackQuarantineCache(LINKER_INITIALIZED) {}
 
   void init(const AllocatorOptions &Options) {
-    // Currently SSE 4.2 support is required. This might change later.
-    CHECK(testCPUFeature(SSE4_2)); // for crc32
-
     // Verify that the header offset field can hold the maximum offset. In the
     // case of the Secondary allocator, it takes care of alignment and the
     // offset will always be 0. In the case of the Primary, the worst case
@@ -264,14 +323,25 @@
     // last size class minus the header size, in multiples of MinAlignment.
     UnpackedHeader Header = {};
     uptr MaxPrimaryAlignment = 1 << MostSignificantSetBitIndex(
-        PrimaryAllocator::SizeClassMap::kMaxSize - MinAlignment);
-    uptr MaximumOffset = (MaxPrimaryAlignment - ChunkHeaderSize) >>
+        SizeClassMap::kMaxSize - MinAlignment);
+    uptr MaxOffset = (MaxPrimaryAlignment - AlignedChunkHeaderSize) >>
         MinAlignmentLog;
-    Header.Offset = MaximumOffset;
-    if (Header.Offset != MaximumOffset) {
+    Header.Offset = MaxOffset;
+    if (Header.Offset != MaxOffset) {
       dieWithMessage("ERROR: the maximum possible offset doesn't fit in the "
                      "header\n");
     }
+    // Verify that we can fit the maximum amount of unused bytes in the header.
+    // The worst case scenario would be when allocating 1 byte on a MaxAlignment
+    // alignment. Since the combined allocator currently rounds the size up to
+    // the alignment before passing it to the secondary, we end up with
+    // MaxAlignment - 1 extra bytes.
+    uptr MaxUnusedBytes = MaxAlignment - 1;
+    Header.UnusedBytes = MaxUnusedBytes;
+    if (Header.UnusedBytes != MaxUnusedBytes) {
+      dieWithMessage("ERROR: the maximum possible unused bytes doesn't fit in "
+                     "the header\n");
+    }
 
     DeallocationTypeMismatch = Options.DeallocationTypeMismatch;
     DeleteSizeMismatch = Options.DeleteSizeMismatch;
@@ -284,6 +354,17 @@
     Cookie = Prng.Next();
   }
 
+  // Helper function that checks for a valid Scudo chunk.
+  bool isValidPointer(const void *UserPtr) {
+    uptr ChunkBeg = reinterpret_cast<uptr>(UserPtr);
+    // Reject null: the header address would wrap around and fault on load.
+    if (!UserPtr || !IsAligned(ChunkBeg, MinAlignment))
+      return false;
+    ScudoChunk *Chunk =
+        reinterpret_cast<ScudoChunk *>(ChunkBeg - AlignedChunkHeaderSize);
+    return Chunk->isValid();
+  }
+
   // Allocates a chunk.
   void *allocate(uptr Size, uptr Alignment, AllocType Type) {
     if (UNLIKELY(!ThreadInited))
@@ -300,7 +381,7 @@
     if (Size >= MaxAllowedMallocSize)
       return BackendAllocator.ReturnNullOrDieOnBadRequest();
     uptr RoundedSize = RoundUpTo(Size, MinAlignment);
-    uptr NeededSize = RoundedSize + ChunkHeaderSize;
+    uptr NeededSize = RoundedSize + AlignedChunkHeaderSize;
     if (Alignment > MinAlignment)
       NeededSize += Alignment;
     if (NeededSize >= MaxAllowedMallocSize)
@@ -319,28 +400,33 @@
     if (!Ptr)
       return BackendAllocator.ReturnNullOrDieOnOOM();
 
-    // If requested, we will zero out the entire contents of the returned chunk.
-    if (ZeroContents && BackendAllocator.FromPrimary(Ptr))
-       memset(Ptr, 0, BackendAllocator.GetActuallyAllocatedSize(Ptr));
-
     uptr AllocBeg = reinterpret_cast<uptr>(Ptr);
     // If the allocation was serviced by the secondary, the returned pointer
     // accounts for ChunkHeaderSize to pass the alignment check of the combined
     // allocator. Adjust it here.
     if (!FromPrimary)
-      AllocBeg -= ChunkHeaderSize;
-    uptr ChunkBeg = AllocBeg + ChunkHeaderSize;
+      AllocBeg -= AlignedChunkHeaderSize;
+
+    uptr ActuallyAllocatedSize = BackendAllocator.GetActuallyAllocatedSize(
+        reinterpret_cast<void *>(AllocBeg));
+    // If requested, we will zero out the entire contents of the returned chunk.
+    if (ZeroContents && FromPrimary)
+       memset(Ptr, 0, ActuallyAllocatedSize);
+
+    uptr ChunkBeg = AllocBeg + AlignedChunkHeaderSize;
     if (!IsAligned(ChunkBeg, Alignment))
       ChunkBeg = RoundUpTo(ChunkBeg, Alignment);
     CHECK_LE(ChunkBeg + Size, AllocBeg + NeededSize);
     ScudoChunk *Chunk =
-        reinterpret_cast<ScudoChunk *>(ChunkBeg - ChunkHeaderSize);
+        reinterpret_cast<ScudoChunk *>(ChunkBeg - AlignedChunkHeaderSize);
     UnpackedHeader Header = {};
     Header.State = ChunkAllocated;
-    Header.Offset = (ChunkBeg - ChunkHeaderSize - AllocBeg) >> MinAlignmentLog;
+    uptr Offset = ChunkBeg - AlignedChunkHeaderSize - AllocBeg;
+    Header.Offset = Offset >> MinAlignmentLog;
     Header.AllocType = Type;
-    Header.RequestedSize = Size;
-    Header.Salt = static_cast<u16>(Prng.Next());
+    Header.UnusedBytes = ActuallyAllocatedSize - Offset -
+        AlignedChunkHeaderSize - Size;
+    Header.Salt = static_cast<u8>(Prng.Next());
     Chunk->storeHeader(&Header);
     void *UserPtr = reinterpret_cast<void *>(ChunkBeg);
     // TODO(kostyak): hooks sound like a terrible idea security wise but might
@@ -364,13 +450,14 @@
                      "aligned at address %p\n", UserPtr);
     }
     ScudoChunk *Chunk =
-        reinterpret_cast<ScudoChunk *>(ChunkBeg - ChunkHeaderSize);
+        reinterpret_cast<ScudoChunk *>(ChunkBeg - AlignedChunkHeaderSize);
     UnpackedHeader OldHeader;
     Chunk->loadHeader(&OldHeader);
     if (OldHeader.State != ChunkAllocated) {
       dieWithMessage("ERROR: invalid chunk state when deallocating address "
-                     "%p\n", Chunk);
+                     "%p\n", UserPtr);
     }
+    uptr UsableSize = Chunk->getUsableSize(&OldHeader);
     UnpackedHeader NewHeader = OldHeader;
     NewHeader.State = ChunkQuarantine;
     Chunk->compareExchangeHeader(&NewHeader, &OldHeader);
@@ -384,69 +471,40 @@
         }
       }
     }
-    uptr Size = NewHeader.RequestedSize;
+    uptr Size = UsableSize - OldHeader.UnusedBytes;
     if (DeleteSizeMismatch) {
       if (DeleteSize && DeleteSize != Size) {
         dieWithMessage("ERROR: invalid sized delete on chunk at address %p\n",
                        Chunk);
       }
     }
+
     if (LIKELY(!ThreadTornDown)) {
       AllocatorQuarantine.Put(&ThreadQuarantineCache,
-                              QuarantineCallback(&Cache), Chunk, Size);
+                              QuarantineCallback(&Cache), Chunk, UsableSize);
     } else {
       SpinMutexLock l(&FallbackMutex);
       AllocatorQuarantine.Put(&FallbackQuarantineCache,
                               QuarantineCallback(&FallbackAllocatorCache),
-                              Chunk, Size);
+                              Chunk, UsableSize);
     }
   }
 
-  // Returns the actual usable size of a chunk. Since this requires loading the
-  // header, we will return it in the second parameter, as it can be required
-  // by the caller to perform additional processing.
-  uptr getUsableSize(const void *Ptr, UnpackedHeader *Header) {
-    if (UNLIKELY(!ThreadInited))
-      initThread();
-    if (!Ptr)
-      return 0;
-    uptr ChunkBeg = reinterpret_cast<uptr>(Ptr);
-    ScudoChunk *Chunk =
-        reinterpret_cast<ScudoChunk *>(ChunkBeg - ChunkHeaderSize);
-    Chunk->loadHeader(Header);
-    // Getting the usable size of a chunk only makes sense if it's allocated.
-    if (Header->State != ChunkAllocated) {
-      dieWithMessage("ERROR: attempted to size a non-allocated chunk at "
-                     "address %p\n", Chunk);
-    }
-    uptr Size =
-        BackendAllocator.GetActuallyAllocatedSize(Chunk->getAllocBeg(Header));
-    // UsableSize works as malloc_usable_size, which is also what (AFAIU)
-    // tcmalloc's MallocExtension::GetAllocatedSize aims at providing. This
-    // means we will return the size of the chunk from the user beginning to
-    // the end of the 'user' allocation, hence us subtracting the header size
-    // and the offset from the size.
-    if (Size == 0)
-      return Size;
-    return Size - ChunkHeaderSize - (Header->Offset << MinAlignmentLog);
-  }
-
-  // Helper function that doesn't care about the header.
-  uptr getUsableSize(const void *Ptr) {
-    UnpackedHeader Header;
-    return getUsableSize(Ptr, &Header);
-  }
-
   // Reallocates a chunk. We can save on a new allocation if the new requested
   // size still fits in the chunk.
   void *reallocate(void *OldPtr, uptr NewSize) {
     if (UNLIKELY(!ThreadInited))
       initThread();
-    UnpackedHeader OldHeader;
-    uptr Size = getUsableSize(OldPtr, &OldHeader);
     uptr ChunkBeg = reinterpret_cast<uptr>(OldPtr);
     ScudoChunk *Chunk =
-        reinterpret_cast<ScudoChunk *>(ChunkBeg - ChunkHeaderSize);
+        reinterpret_cast<ScudoChunk *>(ChunkBeg - AlignedChunkHeaderSize);
+    UnpackedHeader OldHeader;
+    Chunk->loadHeader(&OldHeader);
+    if (OldHeader.State != ChunkAllocated) {
+      dieWithMessage("ERROR: invalid chunk state when reallocating address "
+                     "%p\n", OldPtr);
+    }
+    uptr Size = Chunk->getUsableSize(&OldHeader);
     if (OldHeader.AllocType != FromMalloc) {
       dieWithMessage("ERROR: invalid chunk type when reallocating address %p\n",
                      Chunk);
@@ -454,7 +512,7 @@
     UnpackedHeader NewHeader = OldHeader;
     // The new size still fits in the current chunk.
     if (NewSize <= Size) {
-      NewHeader.RequestedSize = NewSize;
+      NewHeader.UnusedBytes = Size - NewSize;
       Chunk->compareExchangeHeader(&NewHeader, &OldHeader);
       return OldPtr;
     }
@@ -462,23 +520,42 @@
     // old one.
     void *NewPtr = allocate(NewSize, MinAlignment, FromMalloc);
     if (NewPtr) {
-      uptr OldSize = OldHeader.RequestedSize;
+      uptr OldSize = Size - OldHeader.UnusedBytes;
       memcpy(NewPtr, OldPtr, Min(NewSize, OldSize));
       NewHeader.State = ChunkQuarantine;
       Chunk->compareExchangeHeader(&NewHeader, &OldHeader);
       if (LIKELY(!ThreadTornDown)) {
         AllocatorQuarantine.Put(&ThreadQuarantineCache,
-                                QuarantineCallback(&Cache), Chunk, OldSize);
+                                QuarantineCallback(&Cache), Chunk, Size);
       } else {
         SpinMutexLock l(&FallbackMutex);
         AllocatorQuarantine.Put(&FallbackQuarantineCache,
                                 QuarantineCallback(&FallbackAllocatorCache),
-                                Chunk, OldSize);
+                                Chunk, Size);
       }
     }
     return NewPtr;
   }
 
+  // Helper function that returns the actual usable size of a chunk.
+  uptr getUsableSize(const void *Ptr) {
+    if (UNLIKELY(!ThreadInited))
+      initThread();
+    if (!Ptr)
+      return 0;
+    uptr ChunkBeg = reinterpret_cast<uptr>(Ptr);
+    ScudoChunk *Chunk =
+        reinterpret_cast<ScudoChunk *>(ChunkBeg - AlignedChunkHeaderSize);
+    UnpackedHeader Header;
+    Chunk->loadHeader(&Header);
+    // Getting the usable size of a chunk only makes sense if it's allocated.
+    if (Header.State != ChunkAllocated) {
+      dieWithMessage("ERROR: invalid chunk state when sizing address %p\n",
+                     Ptr);
+    }
+    return Chunk->getUsableSize(&Header);
+  }
+
   void *calloc(uptr NMemB, uptr Size) {
     if (UNLIKELY(!ThreadInited))
       initThread();
@@ -573,7 +650,7 @@
   return Instance.getUsableSize(Ptr);
 }
 
-} // namespace __scudo
+}  // namespace __scudo
 
 using namespace __scudo;
 
@@ -603,10 +680,10 @@
   return size;
 }
 
-int __sanitizer_get_ownership(const void *p) {
-  return Instance.getUsableSize(p) != 0;
+int __sanitizer_get_ownership(const void *Ptr) {
+  return Instance.isValidPointer(Ptr);
 }
 
-uptr __sanitizer_get_allocated_size(const void *p) {
-  return Instance.getUsableSize(p);
+uptr __sanitizer_get_allocated_size(const void *Ptr) {
+  return Instance.getUsableSize(Ptr);
 }
Index: lib/scudo/scudo_allocator_secondary.h
===================================================================
--- lib/scudo/scudo_allocator_secondary.h
+++ lib/scudo/scudo_allocator_secondary.h
@@ -32,7 +32,7 @@
   void *Allocate(AllocatorStats *Stats, uptr Size, uptr Alignment) {
     // The Scudo frontend prevents us from allocating more than
     // MaxAllowedMallocSize, so integer overflow checks would be superfluous.
-    uptr HeadersSize = sizeof(SecondaryHeader) + ChunkHeaderSize;
+    uptr HeadersSize = sizeof(SecondaryHeader) + AlignedChunkHeaderSize;
     uptr MapSize = RoundUpTo(Size + sizeof(SecondaryHeader), PageSize);
     // Account for 2 guard pages, one before and one after the chunk.
     MapSize += 2 * PageSize;
@@ -54,25 +54,34 @@
       uptr NewMapBeg = UserBeg - HeadersSize;
       NewMapBeg = (NewMapBeg & ~(PageSize - 1)) - PageSize;
       CHECK_GE(NewMapBeg, MapBeg);
-      uptr NewMapSize = MapEnd - NewMapBeg;
-      uptr Diff = NewMapBeg - MapBeg;
+      uptr NewMapSize = RoundUpTo(MapSize - Alignment, PageSize);
+      uptr NewMapEnd = NewMapBeg + NewMapSize;
+      CHECK_LE(NewMapEnd, MapEnd);
       // Unmap the extra memory if it's large enough.
+      uptr Diff = NewMapBeg - MapBeg;
       if (Diff > PageSize)
         UnmapOrDie(reinterpret_cast<void *>(MapBeg), Diff);
+      Diff = MapEnd - NewMapEnd;
+      if (Diff > PageSize)
+        UnmapOrDie(reinterpret_cast<void *>(NewMapEnd), Diff);
       MapBeg = NewMapBeg;
       MapSize = NewMapSize;
+      MapEnd = NewMapEnd;
     }
-    uptr UserEnd = UserBeg - ChunkHeaderSize + Size;
+    uptr UserEnd = UserBeg - AlignedChunkHeaderSize + Size;
     // For larger alignments, Alignment was added by the frontend to Size.
     if (Alignment > MinAlignment)
       UserEnd -= Alignment;
     CHECK_LE(UserEnd, MapEnd - PageSize);
     CHECK_EQ(MapBeg + PageSize, reinterpret_cast<uptr>(
         MmapFixedOrDie(MapBeg + PageSize, MapSize - 2 * PageSize)));
-    uptr Ptr = UserBeg - ChunkHeaderSize;
+    uptr Ptr = UserBeg - AlignedChunkHeaderSize;
     SecondaryHeader *Header = getHeader(Ptr);
     Header->MapBeg = MapBeg;
     Header->MapSize = MapSize;
+    // The primary adds the whole class size to the stats when allocating a
+    // chunk, so we will do something similar here. But we will not account for
+    // the guard pages.
     Stats->Add(AllocatorStatAllocated, MapSize - 2 * PageSize);
     Stats->Add(AllocatorStatMapped, MapSize - 2 * PageSize);
     CHECK(IsAligned(UserBeg, Alignment));
@@ -97,8 +106,8 @@
 
   void Deallocate(AllocatorStats *Stats, void *Ptr) {
     SecondaryHeader *Header = getHeader(Ptr);
-    Stats->Sub(AllocatorStatAllocated, Header->MapSize);
-    Stats->Sub(AllocatorStatMapped, Header->MapSize);
+    Stats->Sub(AllocatorStatAllocated, Header->MapSize - 2 * PageSize);
+    Stats->Sub(AllocatorStatMapped, Header->MapSize - 2 * PageSize);
     UnmapOrDie(reinterpret_cast<void *>(Header->MapBeg), Header->MapSize);
   }
 
@@ -154,8 +163,8 @@
     uptr MapBeg;
     uptr MapSize;
   };
-  // Check that sizeof(SecondaryHeader) is a multiple of 16.
-  COMPILER_CHECK((sizeof(SecondaryHeader) & 0xf) == 0);
+  // Check that sizeof(SecondaryHeader) is a multiple of MinAlignment.
+  COMPILER_CHECK((sizeof(SecondaryHeader) & (MinAlignment - 1)) == 0);
 
   SecondaryHeader *getHeader(uptr Ptr) {
     return reinterpret_cast<SecondaryHeader*>(Ptr - sizeof(SecondaryHeader));
Index: lib/scudo/scudo_flags.h
===================================================================
--- lib/scudo/scudo_flags.h
+++ lib/scudo/scudo_flags.h
@@ -28,6 +28,6 @@
 
 void initFlags();
 
-} // namespace __scudo
+}  // namespace __scudo
 
 #endif  // SCUDO_FLAGS_H_
Index: lib/scudo/scudo_flags.cpp
===================================================================
--- lib/scudo/scudo_flags.cpp
+++ lib/scudo/scudo_flags.cpp
@@ -90,4 +90,4 @@
   return &ScudoFlags;
 }
 
-}
+}  // namespace __scudo
Index: lib/scudo/scudo_interceptors.cpp
===================================================================
--- lib/scudo/scudo_interceptors.cpp
+++ lib/scudo/scudo_interceptors.cpp
@@ -72,4 +72,4 @@
   return -1;
 }
 
-#endif // SANITIZER_LINUX
+#endif  // SANITIZER_LINUX
Index: lib/scudo/scudo_new_delete.cpp
===================================================================
--- lib/scudo/scudo_new_delete.cpp
+++ lib/scudo/scudo_new_delete.cpp
@@ -24,7 +24,7 @@
 // Fake std::nothrow_t to avoid including <new>.
 namespace std {
 struct nothrow_t {};
-} // namespace std
+}  // namespace std
 
 CXX_OPERATOR_ATTRIBUTE
 void *operator new(size_t size) {
Index: lib/scudo/scudo_termination.cpp
===================================================================
--- lib/scudo/scudo_termination.cpp
+++ lib/scudo/scudo_termination.cpp
@@ -39,4 +39,4 @@
                           File, Line, Condition, Value1, Value2);
 }
 
-} // namespace __sanitizer
+}  // namespace __sanitizer
Index: lib/scudo/scudo_utils.h
===================================================================
--- lib/scudo/scudo_utils.h
+++ lib/scudo/scudo_utils.h
@@ -30,9 +30,9 @@
 
 void NORETURN dieWithMessage(const char *Format, ...);
 
-enum  CPUFeature {
-  SSE4_2 = 0,
-  ENUM_CPUFEATURE_MAX
+enum CPUFeature {
+  CRC32CPUFeature = 0,
+  MaxCPUFeature,
 };
 bool testCPUFeature(CPUFeature feature);
 
@@ -41,19 +41,41 @@
 struct Xorshift128Plus {
  public:
   Xorshift128Plus();
+#if SANITIZER_WORDSIZE == 64
   u64 Next() {
-    u64 x = State_0_;
-    const u64 y = State_1_;
-    State_0_ = y;
+    u64 x = State[0];
+    const u64 y = State[1];
+    State[0] = y;
     x ^= x << 23;
-    State_1_ = x ^ y ^ (x >> 17) ^ (y >> 26);
-    return State_1_ + y;
+    State[1] = x ^ y ^ (x >> 17) ^ (y >> 26);
+    return State[1] + y;
   }
+#else
+  u32 Next() {
+    u32 t = State[0];
+    t ^= t << 11;
+    t ^= t >> 8;
+    State[0] = State[1]; State[1] = State[2]; State[2] = State[3];
+    State[3] ^= State[3] >> 19;
+    State[3] ^= t;
+    return State[3];
+  }
+#endif  // SANITIZER_WORDSIZE == 64
  private:
-  u64 State_0_;
-  u64 State_1_;
+#if SANITIZER_WORDSIZE == 64
+  u64 State[2];
+#else
+  u32 State[4];
+#endif  // SANITIZER_WORDSIZE == 64
 };
 
-} // namespace __scudo
+// Software CRC32 functions, to be used when SSE 4.2 support is not detected.
+#if SANITIZER_WORDSIZE == 64
+u32 doCRC32u64(u32 Crc, u64 Data);
+#elif SANITIZER_WORDSIZE == 32
+u32 doCRC32u32(u32 Crc, u32 Data);
+#endif
+
+}  // namespace __scudo
 
 #endif  // SCUDO_UTILS_H_
Index: lib/scudo/scudo_utils.cpp
===================================================================
--- lib/scudo/scudo_utils.cpp
+++ lib/scudo/scudo_utils.cpp
@@ -17,6 +17,9 @@
 #include <fcntl.h>
 #include <stdarg.h>
 #include <unistd.h>
+#if defined(__x86_64__) || defined(__i386__)
+# include <cpuid.h>
+#endif  // defined(__x86_64__) || defined(__i386__)
 
 #include <cstring>
 
@@ -28,7 +31,7 @@
 extern int VSNPrintf(char *buff, int buff_length, const char *format,
                      va_list args);
 
-} // namespace __sanitizer
+}  // namespace __sanitizer
 
 namespace __scudo {
 
@@ -44,6 +47,9 @@
   Die();
 }
 
+#if defined(__x86_64__) || defined(__i386__)
+// i386 and x86_64 specific code to detect CRC32 hardware support via CPUID.
+// CRC32 requires the SSE 4.2 instruction set.
 typedef struct {
   u32 Eax;
   u32 Ebx;
@@ -51,20 +57,17 @@
   u32 Edx;
 } CPUIDInfo;
 
-static void getCPUID(CPUIDInfo *info, u32 leaf, u32 subleaf)
+static void getCPUID(CPUIDInfo *Info, u32 Leaf)
 {
-  asm volatile("cpuid"
-      : "=a" (info->Eax), "=b" (info->Ebx), "=c" (info->Ecx), "=d" (info->Edx)
-      : "a" (leaf), "c" (subleaf)
-  );
+  __get_cpuid(Leaf, &Info->Eax, &Info->Ebx, &Info->Ecx, &Info->Edx);
 }
 
 // Returns true is the CPU is a "GenuineIntel" or "AuthenticAMD"
 static bool isSupportedCPU()
 {
-  CPUIDInfo Info;
+  CPUIDInfo Info = {};
 
-  getCPUID(&Info, 0, 0);
+  getCPUID(&Info, 0);
   if (memcmp(reinterpret_cast<char *>(&Info.Ebx), "Genu", 4) == 0 &&
       memcmp(reinterpret_cast<char *>(&Info.Edx), "ineI", 4) == 0 &&
       memcmp(reinterpret_cast<char *>(&Info.Ecx), "ntel", 4) == 0) {
@@ -78,26 +81,29 @@
   return false;
 }
 
-bool testCPUFeature(CPUFeature feature)
+bool testCPUFeature(CPUFeature Feature)
 {
   static bool InfoInitialized = false;
   static CPUIDInfo CPUInfo = {};
 
   if (InfoInitialized == false) {
     if (isSupportedCPU() == true)
-      getCPUID(&CPUInfo, 1, 0);
-    else
-      UNIMPLEMENTED();
+      getCPUID(&CPUInfo, 1);
     InfoInitialized = true;
   }
-  switch (feature) {
-    case SSE4_2:
+  switch (Feature) {
+    case CRC32CPUFeature:  // SSE 4.2 is bit 20 of ECX.
       return ((CPUInfo.Ecx >> 20) & 0x1) != 0;
     default:
       break;
   }
   return false;
 }
+#else
+bool testCPUFeature(CPUFeature Feature) {
+  return false;
+}
+#endif  // defined(__x86_64__) || defined(__i386__)
 
 // readRetry will attempt to read Count bytes from the Fd specified, and if
 // interrupted will retry to read additional bytes to reach Count.
@@ -117,17 +123,88 @@
   return AmountRead;
 }
 
-// Default constructor for Xorshift128Plus seeds the state with /dev/urandom
+// Default constructor for Xorshift128Plus seeds the state with /dev/urandom.
+// TODO(kostyak): investigate using getrandom() if available.
 Xorshift128Plus::Xorshift128Plus() {
   int Fd = open("/dev/urandom", O_RDONLY);
-  bool Success = readRetry(Fd, reinterpret_cast<u8 *>(&State_0_),
-                           sizeof(State_0_)) == sizeof(State_0_);
-  Success &= readRetry(Fd, reinterpret_cast<u8 *>(&State_1_),
-                           sizeof(State_1_)) == sizeof(State_1_);
+  bool Success = readRetry(Fd, reinterpret_cast<u8 *>(State),
+                           sizeof(State)) == sizeof(State);
   close(Fd);
   if (!Success) {
     dieWithMessage("ERROR: failed to read enough data from /dev/urandom.\n");
   }
 }
 
-} // namespace __scudo
+const static u32 CRC32Table[] = {
+  0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f,
+  0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988,
+  0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2,
+  0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
+  0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9,
+  0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172,
+  0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c,
+  0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
+  0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423,
+  0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
+  0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, 0x01db7106,
+  0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
+  0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d,
+  0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
+  0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950,
+  0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
+  0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7,
+  0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0,
+  0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa,
+  0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
+  0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81,
+  0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a,
+  0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84,
+  0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
+  0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
+  0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc,
+  0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e,
+  0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
+  0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55,
+  0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
+  0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28,
+  0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
+  0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f,
+  0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38,
+  0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242,
+  0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
+  0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69,
+  0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2,
+  0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc,
+  0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
+  0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693,
+  0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
+  0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d
+};
+
+#if SANITIZER_WORDSIZE == 64
+u32 doCRC32u64(u32 Crc, u64 Data)
+{
+  Crc = CRC32Table[(Crc ^ Data) & 0xff] ^ (Crc >> 8);
+  Crc = CRC32Table[(Crc ^ (Data >> 8)) & 0xff] ^ (Crc >> 8);
+  Crc = CRC32Table[(Crc ^ (Data >> 16)) & 0xff] ^ (Crc >> 8);
+  Crc = CRC32Table[(Crc ^ (Data >> 24)) & 0xff] ^ (Crc >> 8);
+  Crc = CRC32Table[(Crc ^ (Data >> 32)) & 0xff] ^ (Crc >> 8);
+  Crc = CRC32Table[(Crc ^ (Data >> 40)) & 0xff] ^ (Crc >> 8);
+  Crc = CRC32Table[(Crc ^ (Data >> 48)) & 0xff] ^ (Crc >> 8);
+  Crc = CRC32Table[(Crc ^ (Data >> 56)) & 0xff] ^ (Crc >> 8);
+  return Crc;
+}
+#elif SANITIZER_WORDSIZE == 32
+u32 doCRC32u32(u32 Crc, u32 Data)
+{
+  Crc = CRC32Table[(Crc ^ Data) & 0xff] ^ (Crc >> 8);
+  Crc = CRC32Table[(Crc ^ (Data >> 8)) & 0xff] ^ (Crc >> 8);
+  Crc = CRC32Table[(Crc ^ (Data >> 16)) & 0xff] ^ (Crc >> 8);
+  Crc = CRC32Table[(Crc ^ (Data >> 24)) & 0xff] ^ (Crc >> 8);
+  return Crc;
+}
+#else
+# error "Unsupported SANITIZER_WORDSIZE"
+#endif  // SANITIZER_WORDSIZE
+
+}  // namespace __scudo
Index: test/scudo/CMakeLists.txt
===================================================================
--- test/scudo/CMakeLists.txt
+++ test/scudo/CMakeLists.txt
@@ -18,11 +18,34 @@
    STRING(COMPARE EQUAL "sse4_2" "${SSE_THERE}" SSE42_TRUE)
 endif(CMAKE_SYSTEM_NAME MATCHES "Linux")
 
-if (SSE42_TRUE AND CMAKE_SIZEOF_VOID_P EQUAL 8)
-  add_lit_testsuite(check-scudo
-    "Running the Scudo Hardened Allocator tests"
-    ${CMAKE_CURRENT_BINARY_DIR}
+# Reset the suite list; the loop below appends one lit config dir per arch.
+set(SCUDO_TESTSUITES)
+set(SCUDO_TEST_ARCH ${SCUDO_SUPPORTED_ARCH})
+foreach(arch ${SCUDO_TEST_ARCH})
+  set(SCUDO_TEST_TARGET_ARCH ${arch})
+  string(TOLOWER "-${arch}" SCUDO_TEST_CONFIG_SUFFIX)
+
+  if(ANDROID OR ${arch} MATCHES "arm|aarch64")
+    # This is only true if we are cross-compiling.
+    # Build all tests with host compiler and use host tools.
+    set(SCUDO_TEST_TARGET_CFLAGS ${COMPILER_RT_TEST_COMPILER_CFLAGS})
+  else()
+    get_target_flags_for_arch(${arch} SCUDO_TEST_TARGET_CFLAGS)
+    string(REPLACE ";" " " SCUDO_TEST_TARGET_CFLAGS "${SCUDO_TEST_TARGET_CFLAGS}")
+  endif()
+
+  string(TOUPPER ${arch} ARCH_UPPER_CASE)
+  set(CONFIG_NAME ${ARCH_UPPER_CASE}${OS_NAME}Config)
+
+  configure_lit_site_cfg(
+    ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.in
+    ${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_NAME}/lit.site.cfg)
+  list(APPEND SCUDO_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_NAME})
+endforeach()
+
+if (SSE42_TRUE)
+  add_lit_testsuite(check-scudo "Running the Scudo Hardened Allocator tests"
+    ${SCUDO_TESTSUITES}
     DEPENDS ${SCUDO_TEST_DEPS})
-  set_target_properties(check-scudo PROPERTIES FOLDER
-    "Compiler-RT Misc")
-endif(SSE42_TRUE AND CMAKE_SIZEOF_VOID_P EQUAL 8)
+  set_target_properties(check-scudo PROPERTIES FOLDER "Compiler-RT Misc")
+endif(SSE42_TRUE)
Index: test/scudo/alignment.cpp
===================================================================
--- test/scudo/alignment.cpp
+++ test/scudo/alignment.cpp
@@ -1,11 +1,10 @@
 // RUN: %clang_scudo %s -o %t
 // RUN: not %run %t pointers 2>&1 | FileCheck %s
 
-// Tests that a non-16-byte aligned pointer will trigger the associated error
-// on deallocation.
+// Tests that a pointer that is not MinAlignment-aligned will trigger the
+// associated error on deallocation.
 
 #include <assert.h>
-#include <malloc.h>
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
@@ -17,7 +16,7 @@
     void *p = malloc(1U << 16);
     if (!p)
       return 1;
-    free(reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(p) | 8));
+    free(reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(p) | 1));
   }
   return 0;
 }
Index: test/scudo/double-free.cpp
===================================================================
--- test/scudo/double-free.cpp
+++ test/scudo/double-free.cpp
@@ -46,4 +46,4 @@
   return 0;
 }
 
-// CHECK: ERROR: invalid chunk state when deallocating address
+// CHECK: ERROR: invalid chunk state
Index: test/scudo/interface.cpp
===================================================================
--- /dev/null
+++ test/scudo/interface.cpp
@@ -0,0 +1,30 @@
+// RUN: %clang_scudo %s -o %t
+// RUN: %run %t 2>&1
+
+// Tests that the sanitizer interface functions behave appropriately.
+
+#include <stdlib.h>
+
+#include <vector>
+
+#include <sanitizer/allocator_interface.h>
+
+int main(int argc, char **argv)
+{
+  void *p;
+  // Stored as size_t to match the loop variable below; no size here is
+  // negative, so a signed element type is not needed.
+  std::vector<size_t> sizes{1, 8, 16, 32, 1024, 32768,
+    1 << 16, 1 << 17, 1 << 20, 1 << 24};
+  for (size_t size : sizes) {
+    p = malloc(size);
+    if (!p)
+      return 1;
+    if (!__sanitizer_get_ownership(p))
+      return 1;
+    if (__sanitizer_get_allocated_size(p) < size)
+      return 1;
+    free(p);
+  }
+  return 0;
+}
Index: test/scudo/lit.cfg
===================================================================
--- test/scudo/lit.cfg
+++ test/scudo/lit.cfg
@@ -3,7 +3,7 @@
 import os
 
 # Setup config name.
-config.name = 'Scudo'
+config.name = 'Scudo' + config.name_suffix
 
 # Setup source root.
 config.test_source_root = os.path.dirname(__file__)
@@ -14,18 +14,19 @@
 whole_archive = "-Wl,-whole-archive %s -Wl,-no-whole-archive " % base_lib
 
 # Test suffixes.
-config.suffixes = ['.c', '.cc', '.cpp', '.m', '.mm', '.ll', '.test']
+config.suffixes = ['.c', '.cc', '.cpp']
 
 # C flags.
-c_flags = ["-std=c++11",
+c_flags = ([config.target_cflags] +
+           ["-std=c++11",
            "-lstdc++",
-           "-ldl",
            "-lrt",
-           "-pthread",
            "-latomic",
+           "-ldl",
+           "-pthread",
            "-fPIE",
            "-pie",
-           "-O0"]
+           "-O0"])
 
 def build_invocation(compile_flags):                                            
   return " " + " ".join([config.clang] + compile_flags) + " "                   
Index: test/scudo/lit.site.cfg.in
===================================================================
--- test/scudo/lit.site.cfg.in
+++ test/scudo/lit.site.cfg.in
@@ -1,5 +1,9 @@
 @LIT_SITE_CFG_IN_HEADER@
 
+config.name_suffix = "@SCUDO_TEST_CONFIG_SUFFIX@"
+config.target_arch = "@SCUDO_TEST_TARGET_ARCH@"
+config.target_cflags = "@SCUDO_TEST_TARGET_CFLAGS@"
+
 # Load common config for all compiler-rt lit tests.
 lit_config.load_config(config, "@COMPILER_RT_BINARY_DIR@/test/lit.common.configured")
 
Index: test/scudo/malloc.cpp
===================================================================
--- test/scudo/malloc.cpp
+++ test/scudo/malloc.cpp
@@ -2,9 +2,9 @@
 // RUN: %run %t 2>&1
 
 // Tests that a regular workflow of allocation, memory fill and free works as
-// intended. Also tests that a zero-sized allocation succeeds.
+// intended. Tests various sizes serviced by the primary and secondary
+// allocators.
 
-#include <malloc.h>
 #include <stdlib.h>
 #include <string.h>
 
@@ -13,18 +13,25 @@
 int main(int argc, char **argv)
 {
   void *p;
-  std::vector<size_t> sizes{1, 1 << 5, 1 << 10, 1 << 15, 1 << 20};
+  std::vector<ssize_t> sizes{1, 8, 16, 32, 1024, 32768,
+    1 << 16, 1 << 17, 1 << 20, 1 << 24};
+  std::vector<int> offsets{1, 0, -1, -7, -8, -15, -16, -31, -32};
 
   p = malloc(0);
   if (!p)
     return 1;
   free(p);
-  for (size_t size : sizes) {
-    p = malloc(size);
-    if (!p)
-      return 1;
-    memset(p, 'A', size);
-    free(p);
+  for (ssize_t size : sizes) {
+    for (int offset: offsets) {
+      ssize_t actual_size = size + offset;
+      if (actual_size <= 0)
+        continue;
+      p = malloc(actual_size);
+      if (!p)
+        return 1;
+      memset(p, 0xff, actual_size);
+      free(p);
+    }
   }
 
   return 0;
Index: test/scudo/memalign.cpp
===================================================================
--- test/scudo/memalign.cpp
+++ test/scudo/memalign.cpp
@@ -31,7 +31,7 @@
       return 1;
     free(p);
     // Tests various combinations of alignment and sizes
-    for (int i = 4; i < 20; i++) {
+    for (int i = (sizeof(void *) == 4) ? 3 : 4; i <= 24; i++) {
       alignment = 1U << i;
       for (int j = 1; j < 33; j++) {
         size = 0x800 * j;
Index: test/scudo/mismatch.cpp
===================================================================
--- test/scudo/mismatch.cpp
+++ test/scudo/mismatch.cpp
@@ -30,7 +30,7 @@
     free((void *)p);
   }
   if (!strcmp(argv[1], "memaligndel")) {
-    int *p = (int *)memalign(0x10, 0x10);
+    int *p = (int *)memalign(16, 16);
     if (!p)
       return 1;
     delete p;
Index: test/scudo/overflow.cpp
===================================================================
--- test/scudo/overflow.cpp
+++ test/scudo/overflow.cpp
@@ -11,12 +11,13 @@
 int main(int argc, char **argv)
 {
   assert(argc == 2);
+  ssize_t offset = sizeof(void *) == 8 ? 8 : 0;
   if (!strcmp(argv[1], "malloc")) {
     // Simulate a header corruption of an allocated chunk (1-bit)
     void *p = malloc(1U << 4);
     if (!p)
       return 1;
-    ((char *)p)[-1] ^= 1;
+    ((char *)p)[-(offset + 1)] ^= 1;
     free(p);
   }
   if (!strcmp(argv[1], "quarantine")) {
@@ -25,7 +26,7 @@
       return 1;
     free(p);
     // Simulate a header corruption of a quarantined chunk
-    ((char *)p)[-2] ^= 1;
+    ((char *)p)[-(offset + 2)] ^= 1;
     // Trigger the quarantine recycle
     for (int i = 0; i < 0x100; i++) {
       p = malloc(1U << 16);
Index: test/scudo/preinit.cpp
===================================================================
--- test/scudo/preinit.cpp
+++ test/scudo/preinit.cpp
@@ -4,7 +4,6 @@
 // Verifies that calling malloc in a preinit_array function succeeds, and that
 // the resulting pointer can be freed at program termination.
 
-#include <malloc.h>
 #include <stdlib.h>
 #include <string.h>
 
Index: test/scudo/random_shuffle.cpp
===================================================================
--- test/scudo/random_shuffle.cpp
+++ test/scudo/random_shuffle.cpp
@@ -7,6 +7,7 @@
 // RUN: %run %t 10000 > %T/random_shuffle_tmp_dir/out2
 // RUN: not diff %T/random_shuffle_tmp_dir/out?
 // RUN: rm -rf %T/random_shuffle_tmp_dir
+// UNSUPPORTED: i386-linux,i686-linux
 
 // Tests that the allocator shuffles the chunks before returning to the user.
 
Index: test/scudo/realloc.cpp
===================================================================
--- test/scudo/realloc.cpp
+++ test/scudo/realloc.cpp
@@ -20,7 +20,7 @@
 {
   void *p, *old_p;
   // Those sizes will exercise both allocators (Primary & Secondary).
-  std::vector<size_t> sizes{1, 1 << 5, 1 << 10, 1 << 15, 1 << 20};
+  std::vector<size_t> sizes{1, 16, 1024, 32768, 1 << 16, 1 << 17, 1 << 20};
 
   assert(argc == 2);
   for (size_t size : sizes) {