Index: lib/scudo/scudo_allocator.h
===================================================================
--- lib/scudo/scudo_allocator.h
+++ lib/scudo/scudo_allocator.h
@@ -22,6 +22,8 @@
 
 #include "sanitizer_common/sanitizer_allocator.h"
 
+#include <atomic>
+
 namespace __scudo {
 
 enum AllocType : u8 {
@@ -31,6 +33,63 @@
   FromMemalign = 3, // Memory block came from memalign, posix_memalign, etc.
 };
 
+enum ChunkState : u8 {
+  ChunkAvailable  = 0,
+  ChunkAllocated  = 1,
+  ChunkQuarantine = 2
+};
+
+#if SANITIZER_WORDSIZE == 64
+// Our header requires 128 bits of storage on 64-bit platforms, which fits
+// nicely with the alignment requirements. Having the offset saves us from
+// using functions such as GetBlockBegin, that is fairly costly. Our first
+// implementation used the MetaData as well, which offers the advantage of
+// being stored away from the chunk itself, but accessing it was costly as
+// well. The header will be atomically loaded and stored using the 16-byte
+// primitives offered by the platform (likely requires cmpxchg16b support).
+typedef unsigned __int128 PackedHeader;
+struct UnpackedHeader {
+  u16  Checksum      : 16;
+  uptr RequestedSize : 40; // Needed for reallocation purposes.
+  u8   State         : 2;  // available, allocated, or quarantined
+  u8   AllocType     : 2;  // malloc, new, new[], or memalign
+  u8   Unused_0_     : 4;
+  uptr Offset        : 12; // Offset from the beginning of the backend
+                           // allocation to the beginning of the chunk itself,
+                           // in multiples of MinAlignment. See comment about
+                           // its maximum value and test in init().
+  u64  Unused_1_     : 36;
+  u16  Salt          : 16;
+};
+#elif SANITIZER_WORDSIZE == 32
+// On 32-bit platforms, our header requires 64 bits.
+typedef u64 PackedHeader;
+struct UnpackedHeader {
+  u16  Checksum      : 12;
+  uptr RequestedSize : 32; // Needed for reallocation purposes.
+  u8   State         : 2;  // available, allocated, or quarantined
+  u8   AllocType     : 2;  // malloc, new, new[], or memalign
+  uptr Offset        : 12; // Offset from the beginning of the backend
+                           // allocation to the beginning of the chunk itself,
+                           // in multiples of MinAlignment. See comment about
+                           // its maximum value and test in Allocator::init().
+  u16  Salt          : 4;
+};
+#else
+# error "Unsupported SANITIZER_WORDSIZE."
+#endif  // SANITIZER_WORDSIZE
+
+typedef std::atomic<PackedHeader> AtomicPackedHeader;
+COMPILER_CHECK(sizeof(UnpackedHeader) == sizeof(PackedHeader));
+
+const uptr ChunkHeaderSize = sizeof(PackedHeader);
+
+// Minimum alignment of 8 bytes for 32-bit, 16 for 64-bit
+const uptr MinAlignmentLog = FIRST_32_SECOND_64(3, 4);
+const uptr MaxAlignmentLog = 24; // 16 MB
+const uptr MinAlignment = 1 << MinAlignmentLog;
+const uptr MaxAlignment = 1 << MaxAlignmentLog;
+
 struct AllocatorOptions {
   u32 QuarantineSizeMb;
   u32 ThreadLocalQuarantineSizeKb;
@@ -58,6 +117,8 @@
 void *scudoAlignedAlloc(uptr Alignment, uptr Size);
 uptr scudoMallocUsableSize(void *Ptr);
 
+#include "scudo_allocator_secondary.h"
+
 } // namespace __scudo
 
 #endif // SCUDO_ALLOCATOR_H_
Index: lib/scudo/scudo_allocator.cpp
===================================================================
--- lib/scudo/scudo_allocator.cpp
+++ lib/scudo/scudo_allocator.cpp
@@ -16,7 +16,6 @@
 
 #include "scudo_allocator.h"
 #include "scudo_utils.h"
-#include "scudo_allocator_secondary.h"
 
 #include "sanitizer_common/sanitizer_allocator_interface.h"
 #include "sanitizer_common/sanitizer_quarantine.h"
@@ -25,14 +24,10 @@
 #include
 #include
-#include <atomic>
 #include
 
 namespace __scudo {
 
-const uptr MinAlignmentLog = 4; // 16 bytes for x64
-const uptr MaxAlignmentLog = 24;
-
 struct AP {
   static const uptr kSpaceBeg = ~0ULL;
   static const uptr kSpaceSize = 0x10000000000ULL;
@@ -55,55 +50,18 @@
 // Global static cookie, initialized at start-up.
 static u64 Cookie;
 
-enum ChunkState : u8 {
-  ChunkAvailable  = 0,
-  ChunkAllocated  = 1,
-  ChunkQuarantine = 2
-};
-
-typedef unsigned __int128 PackedHeader;
-typedef std::atomic<PackedHeader> AtomicPackedHeader;
-
-// Our header requires 128-bit of storage on x64 (the only platform supported
-// as of now), which fits nicely with the alignment requirements.
-// Having the offset saves us from using functions such as GetBlockBegin, that
-// is fairly costly. Our first implementation used the MetaData as well, which
-// offers the advantage of being stored away from the chunk itself, but
-// accessing it was costly as well.
-// The header will be atomically loaded and stored using the 16-byte primitives
-// offered by the platform (likely requires cmpxchg16b support).
-struct UnpackedHeader {
-  // 1st 8 bytes
-  u16 Checksum      : 16;
-  u64 RequestedSize : 40; // Needed for reallocation purposes.
-  u8  State         : 2;  // available, allocated, or quarantined
-  u8  AllocType     : 2;  // malloc, new, new[], or memalign
-  u8  Unused_0_     : 4;
-  // 2nd 8 bytes
-  u64 Offset        : 20; // Offset from the beginning of the backend
-                          // allocation to the beginning of the chunk itself,
-                          // in multiples of MinAlignment. See comment about
-                          // its maximum value and test in init().
-  u64 Unused_1_     : 28;
-  u16 Salt          : 16;
-};
-
-COMPILER_CHECK(sizeof(UnpackedHeader) == sizeof(PackedHeader));
-
-const uptr ChunkHeaderSize = sizeof(PackedHeader);
-
 struct ScudoChunk : UnpackedHeader {
   // We can't use the offset member of the chunk itself, as we would double
   // fetch it without any warranty that it wouldn't have been tampered. To
   // prevent this, we work with a local copy of the header.
-  void *AllocBeg(UnpackedHeader *Header) {
+  void *getAllocBeg(UnpackedHeader *Header) {
     return reinterpret_cast<void *>(
         reinterpret_cast<uptr>(this) - (Header->Offset << MinAlignmentLog));
   }
 
   // CRC32 checksum of the Chunk pointer and its ChunkHeader.
   // It currently uses the Intel Nehalem SSE4.2 crc32 64-bit instruction.
-  u16 Checksum(UnpackedHeader *Header) const {
+  u16 computeChecksum(UnpackedHeader *Header) const {
     u64 HeaderHolder[2];
     memcpy(HeaderHolder, Header, sizeof(HeaderHolder));
     u64 Crc = _mm_crc32_u64(Cookie, reinterpret_cast<uptr>(this));
@@ -125,14 +83,14 @@
     *NewUnpackedHeader = bit_cast<UnpackedHeader>(NewPackedHeader);
     if ((NewUnpackedHeader->Unused_0_ != 0) ||
         (NewUnpackedHeader->Unused_1_ != 0) ||
-        (NewUnpackedHeader->Checksum != Checksum(NewUnpackedHeader))) {
+        (NewUnpackedHeader->Checksum != computeChecksum(NewUnpackedHeader))) {
       dieWithMessage("ERROR: corrupted chunk header at address %p\n", this);
     }
   }
 
   // Packs and stores the header, computing the checksum in the process.
   void storeHeader(UnpackedHeader *NewUnpackedHeader) {
-    NewUnpackedHeader->Checksum = Checksum(NewUnpackedHeader);
+    NewUnpackedHeader->Checksum = computeChecksum(NewUnpackedHeader);
     PackedHeader NewPackedHeader = bit_cast<PackedHeader>(*NewUnpackedHeader);
     AtomicPackedHeader *AtomicHeader =
         reinterpret_cast<AtomicPackedHeader *>(this);
@@ -144,7 +102,7 @@
   // we are not being raced by a corruption occurring in another thread.
   void compareExchangeHeader(UnpackedHeader *NewUnpackedHeader,
                              UnpackedHeader *OldUnpackedHeader) {
-    NewUnpackedHeader->Checksum = Checksum(NewUnpackedHeader);
+    NewUnpackedHeader->Checksum = computeChecksum(NewUnpackedHeader);
     PackedHeader NewPackedHeader = bit_cast<PackedHeader>(*NewUnpackedHeader);
     PackedHeader OldPackedHeader = bit_cast<PackedHeader>(*OldUnpackedHeader);
     AtomicPackedHeader *AtomicHeader =
@@ -194,6 +152,8 @@
   Options.setFrom(getFlags(), common_flags());
   initAllocator(Options);
 
+  MaybeStartBackgroudThread();
+
   ScudoInitIsRunning = false;
 }
@@ -221,7 +181,7 @@
       dieWithMessage("ERROR: invalid chunk state when recycling address %p\n",
                      Chunk);
     }
-    void *Ptr = Chunk->AllocBeg(&Header);
+    void *Ptr = Chunk->getAllocBeg(&Header);
     getAllocator().Deallocate(Cache_, Ptr);
   }
@@ -269,9 +229,8 @@
 }
 
 struct Allocator {
-  static const uptr MaxAllowedMallocSize = 1ULL << 40;
-  static const uptr MinAlignment = 1 << MinAlignmentLog;
-  static const uptr MaxAlignment = 1 << MaxAlignmentLog; // 16 MB
+  static const uptr MaxAllowedMallocSize =
+      FIRST_32_SECOND_64(2UL << 30, 1ULL << 40);
 
   ScudoAllocator BackendAllocator;
   ScudoQuarantine AllocatorQuarantine;
@@ -296,13 +255,18 @@
     CHECK(testCPUFeature(SSE4_2)); // for crc32
 
     // Verify that the header offset field can hold the maximum offset. In the
-    // worst case scenario, the backend allocation is already aligned on
-    // MaxAlignment, so in order to store the header and still be aligned, we
-    // add an extra MaxAlignment. As a result, the offset from the beginning of
-    // the backend allocation to the chunk will be MaxAlignment -
-    // ChunkHeaderSize.
+    // case of the Secondary allocator, it takes care of alignment and the
+    // offset will always be 0. In the case of the Primary, the worst case
+    // scenario happens in the last size class, when the backend allocation
+    // would already be aligned on the requested alignment, which would happen
+    // to be the maximum alignment that would fit in that size class. As a
+    // result, the maximum offset will be at most the maximum alignment for the
+    // last size class minus the header size, in multiples of MinAlignment.
     UnpackedHeader Header = {};
-    uptr MaximumOffset = (MaxAlignment - ChunkHeaderSize) >> MinAlignmentLog;
+    uptr MaxPrimaryAlignment = 1 << MostSignificantSetBitIndex(
+        PrimaryAllocator::SizeClassMap::kMaxSize - MinAlignment);
+    uptr MaximumOffset = (MaxPrimaryAlignment - ChunkHeaderSize) >>
+        MinAlignmentLog;
     Header.Offset = MaximumOffset;
     if (Header.Offset != MaximumOffset) {
       dieWithMessage("ERROR: the maximum possible offset doesn't fit in the "
@@ -313,9 +277,9 @@
     DeleteSizeMismatch = Options.DeleteSizeMismatch;
     ZeroContents = Options.ZeroContents;
     BackendAllocator.Init(Options.MayReturnNull);
-    AllocatorQuarantine.Init(static_cast<uptr>(Options.QuarantineSizeMb) << 20,
-                             static_cast<uptr>(
-                                 Options.ThreadLocalQuarantineSizeKb) << 10);
+    AllocatorQuarantine.Init(
+        static_cast<uptr>(Options.QuarantineSizeMb) << 20,
+        static_cast<uptr>(Options.ThreadLocalQuarantineSizeKb) << 10);
     BackendAllocator.InitCache(&FallbackAllocatorCache);
     Cookie = Prng.Next();
   }
@@ -325,7 +289,7 @@
     if (UNLIKELY(!ThreadInited))
       initThread();
     if (!IsPowerOfTwo(Alignment)) {
-      dieWithMessage("ERROR: malloc alignment is not a power of 2\n");
+      dieWithMessage("ERROR: alignment is not a power of 2\n");
     }
     if (Alignment > MaxAlignment)
       return BackendAllocator.ReturnNullOrDieOnBadRequest();
@@ -336,20 +300,21 @@
     if (Size >= MaxAllowedMallocSize)
       return BackendAllocator.ReturnNullOrDieOnBadRequest();
     uptr RoundedSize = RoundUpTo(Size, MinAlignment);
-    uptr ExtraBytes = ChunkHeaderSize;
+    uptr NeededSize = RoundedSize + ChunkHeaderSize;
     if (Alignment > MinAlignment)
-      ExtraBytes += Alignment;
-    uptr NeededSize = RoundedSize + ExtraBytes;
+      NeededSize += Alignment;
     if (NeededSize >= MaxAllowedMallocSize)
      return BackendAllocator.ReturnNullOrDieOnBadRequest();
 
+    bool FromPrimary = PrimaryAllocator::CanAllocate(NeededSize, MinAlignment);
     void *Ptr;
     if (LIKELY(!ThreadTornDown)) {
-      Ptr = BackendAllocator.Allocate(&Cache, NeededSize, MinAlignment);
+      Ptr = BackendAllocator.Allocate(&Cache, NeededSize,
+                                      FromPrimary ? MinAlignment : Alignment);
     } else {
       SpinMutexLock l(&FallbackMutex);
      Ptr = BackendAllocator.Allocate(&FallbackAllocatorCache, NeededSize,
-                                      MinAlignment);
+                                      FromPrimary ? MinAlignment : Alignment);
     }
     if (!Ptr)
       return BackendAllocator.ReturnNullOrDieOnOOM();
@@ -359,6 +324,11 @@
       memset(Ptr, 0, BackendAllocator.GetActuallyAllocatedSize(Ptr));
 
     uptr AllocBeg = reinterpret_cast<uptr>(Ptr);
+    // If the allocation was serviced by the secondary, the returned pointer
+    // accounts for ChunkHeaderSize to pass the alignment check of the combined
+    // allocator. Adjust it here.
+    if (!FromPrimary)
+      AllocBeg -= ChunkHeaderSize;
     uptr ChunkBeg = AllocBeg + ChunkHeaderSize;
     if (!IsAligned(ChunkBeg, Alignment))
       ChunkBeg = RoundUpTo(ChunkBeg, Alignment);
@@ -450,7 +420,7 @@
                      "address %p\n", Chunk);
     }
     uptr Size =
-        BackendAllocator.GetActuallyAllocatedSize(Chunk->AllocBeg(Header));
+        BackendAllocator.GetActuallyAllocatedSize(Chunk->getAllocBeg(Header));
     // UsableSize works as malloc_usable_size, which is also what (AFAIU)
     // tcmalloc's MallocExtension::GetAllocatedSize aims at providing. This
     // means we will return the size of the chunk from the user beginning to
@@ -543,7 +513,7 @@
 }
 
 void *scudoMalloc(uptr Size, AllocType Type) {
-  return Instance.allocate(Size, Allocator::MinAlignment, Type);
+  return Instance.allocate(Size, MinAlignment, Type);
 }
 
 void scudoFree(void *Ptr, AllocType Type) {
@@ -556,7 +526,7 @@
 
 void *scudoRealloc(void *Ptr, uptr Size) {
   if (!Ptr)
-    return Instance.allocate(Size, Allocator::MinAlignment, FromMalloc);
+    return Instance.allocate(Size, MinAlignment, FromMalloc);
   if (Size == 0) {
     Instance.deallocate(Ptr, 0, FromMalloc);
     return nullptr;
Index: lib/scudo/scudo_allocator_secondary.h
===================================================================
--- lib/scudo/scudo_allocator_secondary.h
+++ lib/scudo/scudo_allocator_secondary.h
@@ -17,7 +17,9 @@
 #ifndef SCUDO_ALLOCATOR_SECONDARY_H_
 #define SCUDO_ALLOCATOR_SECONDARY_H_
 
-namespace __scudo {
+#ifndef SCUDO_ALLOCATOR_H_
+# error "This file must be included inside scudo_allocator.h."
+#endif
 
 class ScudoLargeMmapAllocator {
  public:
@@ -30,25 +32,51 @@
   void *Allocate(AllocatorStats *Stats, uptr Size, uptr Alignment) {
     // The Scudo frontend prevents us from allocating more than
     // MaxAllowedMallocSize, so integer overflow checks would be superfluous.
+    uptr HeadersSize = sizeof(SecondaryHeader) + ChunkHeaderSize;
     uptr MapSize = RoundUpTo(Size + sizeof(SecondaryHeader), PageSize);
     // Account for 2 guard pages, one before and one after the chunk.
-    uptr MapBeg = reinterpret_cast<uptr>(MmapNoAccess(MapSize + 2 * PageSize));
-    CHECK_NE(MapBeg, ~static_cast<uptr>(0));
+    MapSize += 2 * PageSize;
+    // Adding an extra Alignment is not required, it was done by the frontend.
+    uptr MapBeg = reinterpret_cast<uptr>(MmapNoAccess(MapSize));
+    if (MapBeg == ~static_cast<uptr>(0))
+      return ReturnNullOrDieOnOOM();
     // A page-aligned pointer is assumed after that, so check it now.
     CHECK(IsAligned(MapBeg, PageSize));
-    MapBeg += PageSize;
-    CHECK_EQ(MapBeg, reinterpret_cast<uptr>(MmapFixedOrDie(MapBeg, MapSize)));
     uptr MapEnd = MapBeg + MapSize;
-    uptr Ptr = MapBeg + sizeof(SecondaryHeader);
-    // TODO(kostyak): add a random offset to Ptr.
-    CHECK_GT(Ptr + Size, MapBeg);
-    CHECK_LE(Ptr + Size, MapEnd);
+    uptr UserBeg = MapBeg + PageSize + HeadersSize;
+    // In the event of larger alignments, we will attempt to fit the mmap area
+    // better and unmap extraneous memory. This will also ensure that the
+    // offset field of the header stays small (it will always be 0).
+    if (Alignment > MinAlignment) {
+      if (UserBeg & (Alignment - 1))
+        UserBeg += Alignment - (UserBeg & (Alignment - 1));
+      CHECK_GE(UserBeg, MapBeg);
+      uptr NewMapBeg = UserBeg - HeadersSize;
+      NewMapBeg = (NewMapBeg & ~(PageSize - 1)) - PageSize;
+      CHECK_GE(NewMapBeg, MapBeg);
+      uptr NewMapSize = MapEnd - NewMapBeg;
+      uptr Diff = NewMapBeg - MapBeg;
+      // Unmap the extra memory if it's large enough.
+      if (Diff > PageSize)
+        UnmapOrDie(reinterpret_cast<void *>(MapBeg), Diff);
+      MapBeg = NewMapBeg;
+      MapSize = NewMapSize;
+    }
+    uptr UserEnd = UserBeg - ChunkHeaderSize + Size;
+    // For larger alignments, Alignment was added by the frontend to Size.
+    if (Alignment > MinAlignment)
+      UserEnd -= Alignment;
+    CHECK_LE(UserEnd, MapEnd - PageSize);
+    CHECK_EQ(MapBeg + PageSize, reinterpret_cast<uptr>(
+        MmapFixedOrDie(MapBeg + PageSize, MapSize - 2 * PageSize)));
+    uptr Ptr = UserBeg - ChunkHeaderSize;
     SecondaryHeader *Header = getHeader(Ptr);
-    Header->MapBeg = MapBeg - PageSize;
-    Header->MapSize = MapSize + 2 * PageSize;
-    Stats->Add(AllocatorStatAllocated, MapSize);
-    Stats->Add(AllocatorStatMapped, MapSize);
-    return reinterpret_cast<void *>(Ptr);
+    Header->MapBeg = MapBeg;
+    Header->MapSize = MapSize;
+    Stats->Add(AllocatorStatAllocated, MapSize - 2 * PageSize);
+    Stats->Add(AllocatorStatMapped, MapSize - 2 * PageSize);
+    CHECK(IsAligned(UserBeg, Alignment));
+    return reinterpret_cast<void *>(UserBeg);
   }
 
   void *ReturnNullOrDieOnBadRequest() {
@@ -140,6 +168,4 @@
   atomic_uint8_t MayReturnNull;
 };
 
-} // namespace __scudo
-
 #endif // SCUDO_ALLOCATOR_SECONDARY_H_
Index: test/scudo/memalign.cpp
===================================================================
--- test/scudo/memalign.cpp
+++ test/scudo/memalign.cpp
@@ -51,4 +51,4 @@
   return 0;
 }
 
-// CHECK: ERROR: malloc alignment is not a power of 2
+// CHECK: ERROR: alignment is not a power of 2
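
A note on the header handling in scudo_allocator.cpp: the sketch below (not part of the patch) is a reduced, self-contained model of the storeHeader()/loadHeader() flow. It packs a header into a single word, checksums it together with the chunk address, stores and loads it atomically, and reports a mismatch. The 64-bit word, the field widths and the mixing function are stand-ins chosen so the example builds anywhere; the real 64-bit header is 128 bits wide and its checksum uses the hardware crc32 instruction seeded with the global Cookie.

#include <atomic>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Simplified stand-in for UnpackedHeader/PackedHeader: one 64-bit word.
struct Unpacked {
  uint64_t Checksum      : 16;
  uint64_t RequestedSize : 32;
  uint64_t State         : 2;
  uint64_t Offset        : 14;
};
typedef uint64_t Packed;
static_assert(sizeof(Unpacked) == sizeof(Packed), "header must fit in one word");

// Stand-in checksum over the chunk address and the header with its checksum
// field zeroed; the real code uses _mm_crc32_u64 and a per-process cookie.
static uint16_t computeChecksum(const void *Ptr, Unpacked Header) {
  Header.Checksum = 0;
  Packed Bits;
  std::memcpy(&Bits, &Header, sizeof(Bits));
  uint64_t H = reinterpret_cast<uint64_t>(Ptr) ^ Bits ^ 0x9e3779b97f4a7c15ULL;
  H ^= H >> 29; H *= 0xbf58476d1ce4e5b9ULL; H ^= H >> 32;
  return static_cast<uint16_t>(H);
}

int main() {
  std::atomic<Packed> AtomicHeader;

  // storeHeader(): compute the checksum, pack, store atomically.
  Unpacked H = {};
  H.RequestedSize = 64;
  H.State = 1; // allocated
  H.Checksum = computeChecksum(&AtomicHeader, H);
  Packed P;
  std::memcpy(&P, &H, sizeof(P));
  AtomicHeader.store(P, std::memory_order_relaxed);

  // loadHeader(): load atomically, unpack, recompute and compare the checksum.
  Packed Loaded = AtomicHeader.load(std::memory_order_relaxed);
  Unpacked L;
  std::memcpy(&L, &Loaded, sizeof(L));
  if (L.Checksum != computeChecksum(&AtomicHeader, L))
    std::fprintf(stderr, "corrupted chunk header\n");
  return 0;
}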
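
A note on the Offset field check in Allocator::init(): since the Secondary now guarantees an offset of 0, the worst case comes from the Primary's last size class. The arithmetic below uses an assumed kMaxSize of 1 << 17 and the 16-byte 64-bit header purely for illustration; under those assumptions the maximum offset lands exactly at 2^12 - 1, the largest value the 12-bit Offset field can hold.

#include <cassert>
#include <cstdint>

int main() {
  // Assumed values, for illustration only (not read from the real SizeClassMap).
  const uint64_t MinAlignmentLog = 4;          // 16-byte MinAlignment on 64-bit
  const uint64_t MinAlignment    = 1ULL << MinAlignmentLog;
  const uint64_t ChunkHeaderSize = 16;         // sizeof(PackedHeader) on 64-bit
  const uint64_t kMaxSize        = 1ULL << 17; // assumed largest Primary class

  // 1 << MostSignificantSetBitIndex(kMaxSize - MinAlignment): the largest
  // power-of-two alignment that can still be served by the last size class.
  uint64_t MaxPrimaryAlignment = 1;
  for (uint64_t V = kMaxSize - MinAlignment; V >>= 1;)
    MaxPrimaryAlignment <<= 1;

  const uint64_t MaximumOffset =
      (MaxPrimaryAlignment - ChunkHeaderSize) >> MinAlignmentLog;
  assert(MaxPrimaryAlignment == (1ULL << 16));
  assert(MaximumOffset == 4095); // 2^12 - 1, fits in Offset : 12
  return 0;
}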
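
A note on the Secondary's alignment handling: the sketch below replays the UserBeg rounding and map-trimming arithmetic of ScudoLargeMmapAllocator::Allocate() with made-up constants (page size, header sizes, map base); nothing is actually mapped or unmapped. It checks that the user pointer ends up aligned, that the headers and the leading guard page still fit in front of it, and that the chunk stays clear of the trailing guard page.

#include <cassert>
#include <cstdint>

int main() {
  // Assumed constants, for illustration only.
  const uint64_t PageSize            = 4096;
  const uint64_t ChunkHeaderSize     = 16;  // sizeof(PackedHeader) on 64-bit
  const uint64_t SecondaryHeaderSize = 16;  // assumed sizeof(SecondaryHeader)
  const uint64_t HeadersSize = SecondaryHeaderSize + ChunkHeaderSize;

  const uint64_t Alignment = 1 << 14;               // larger than MinAlignment
  const uint64_t Size      = (1 << 20) + Alignment; // frontend already added Alignment
  const uint64_t MapSize   =
      ((Size + SecondaryHeaderSize + PageSize - 1) & ~(PageSize - 1)) +
      2 * PageSize;                                 // rounded up, plus 2 guard pages

  const uint64_t MapBeg = 0x7f0000001000ULL;        // assumed page-aligned mmap result
  const uint64_t MapEnd = MapBeg + MapSize;
  uint64_t UserBeg = MapBeg + PageSize + HeadersSize;

  // Round the user pointer up to the requested alignment, then pull the start
  // of the mapping back so the headers and the leading guard page still fit.
  if (UserBeg & (Alignment - 1))
    UserBeg += Alignment - (UserBeg & (Alignment - 1));
  const uint64_t NewMapBeg = ((UserBeg - HeadersSize) & ~(PageSize - 1)) - PageSize;
  assert(NewMapBeg >= MapBeg);
  const uint64_t Diff = NewMapBeg - MapBeg; // leading pages to unmap when > PageSize
  assert(Diff % PageSize == 0);

  const uint64_t UserEnd = UserBeg - ChunkHeaderSize + Size - Alignment;
  assert((UserBeg & (Alignment - 1)) == 0);          // IsAligned(UserBeg, Alignment)
  assert(UserBeg - HeadersSize >= NewMapBeg + PageSize);
  assert(UserEnd <= MapEnd - PageSize);              // CHECK_LE(UserEnd, MapEnd - PageSize)
  return 0;
}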