Index: CMakeLists.txt
===================================================================
--- CMakeLists.txt
+++ CMakeLists.txt
@@ -180,6 +180,11 @@
 check_cxx_compiler_flag(-Wglobal-constructors SUPPORTS_GLOBAL_CONSTRUCTORS_FLAG)
 # Not all sanitizers forbid global constructors.
 
+# 64-bit atomic operations require armv7.
+if(ANDROID)
+  list(APPEND SANITIZER_COMMON_CFLAGS -march=armv7)
+endif()
+
 if(APPLE)
   # Obtain the iOS Simulator SDK path from xcodebuild.
   execute_process(
Index: lib/asan/asan_allocator.h
===================================================================
--- lib/asan/asan_allocator.h
+++ lib/asan/asan_allocator.h
@@ -99,6 +99,8 @@
   AsanThreadLocalMallocStorage() {}
 };
 
+typedef ALIGNED(8) AsanThreadLocalMallocStorage AsanThreadLocalMallocStorage;
+
 void *asan_memalign(uptr alignment, uptr size, StackTrace *stack,
                     AllocType alloc_type);
 void asan_free(void *ptr, StackTrace *stack, AllocType alloc_type);
Index: lib/sanitizer_common/sanitizer_atomic.h
===================================================================
--- lib/sanitizer_common/sanitizer_atomic.h
+++ lib/sanitizer_common/sanitizer_atomic.h
@@ -52,6 +52,9 @@
   volatile Type val_dont_use;
 };
 
+// On 32-bit platforms u64 is not necessarily aligned on 8 bytes.
+typedef ALIGNED(8) atomic_uint64_t atomic_uint64_t;
+
 }  // namespace __sanitizer
 
 #if defined(__GNUC__)
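A note on the ALIGNED(8) typedefs above: on 32-bit x86 the psABI aligns a u64 struct member on 4 bytes, so without the attribute a 64-bit field can start at an odd 4-byte offset, and the single-instruction 8-byte moves used later in this patch are then no longer guaranteed to be atomic. A minimal standalone sketch of the difference, assuming a GCC/Clang toolchain (Plain/Aligned are illustrative names, not part of the patch; build with -m32 to see 4 vs. 8):

  // Standalone illustration, not part of the patch.
  #include <cstddef>
  #include <cstdint>
  #include <cstdio>

  struct Plain   { uint32_t pad; uint64_t v; };
  struct Aligned { uint32_t pad; uint64_t v __attribute__((aligned(8))); };

  int main() {
    // On i386 the SysV ABI gives uint64_t members only 4-byte alignment,
    // so Plain::v may start at offset 4; Aligned::v is forced to offset 8.
    std::printf("Plain:   align=%zu offset(v)=%zu\n",
                alignof(Plain), offsetof(Plain, v));
    std::printf("Aligned: align=%zu offset(v)=%zu\n",
                alignof(Aligned), offsetof(Aligned, v));
    return 0;
  }

Forcing the alignment at the type level keeps every instance of the type suitably aligned without having to touch the code that declares or allocates them.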
Index: lib/sanitizer_common/sanitizer_atomic_clang.h
===================================================================
--- lib/sanitizer_common/sanitizer_atomic_clang.h
+++ lib/sanitizer_common/sanitizer_atomic_clang.h
@@ -34,6 +34,18 @@
   __asm__ __volatile__("" ::: "memory");
 }
 
+// We would like to just use compiler builtin atomic operations
+// for loads and stores, but they are mostly broken in clang:
+// - they lead to vastly inefficient code generation
+// (http://llvm.org/bugs/show_bug.cgi?id=17281)
+// - 64-bit atomic operations are not implemented on x86_32
+// (http://llvm.org/bugs/show_bug.cgi?id=15034)
+// - they are not implemented on ARM
+// error: undefined reference to '__atomic_load_4'
+
+// See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
+// for mappings of the memory model to different processors.
+
 template<typename T>
 INLINE typename T::Type atomic_load(
     const volatile T *a, memory_order mo) {
@@ -41,23 +53,62 @@
       | memory_order_acquire | memory_order_seq_cst));
   DCHECK(!((uptr)a % sizeof(*a)));
   typename T::Type v;
-  // FIXME:
-  // 64-bit atomic operations are not atomic on 32-bit platforms.
-  // The implementation lacks necessary memory fences on ARM/PPC.
-  // We would like to use compiler builtin atomic operations,
-  // but they are mostly broken:
-  // - they lead to vastly inefficient code generation
-  // (http://llvm.org/bugs/show_bug.cgi?id=17281)
-  // - 64-bit atomic operations are not implemented on x86_32
-  // (http://llvm.org/bugs/show_bug.cgi?id=15034)
-  // - they are not implemented on ARM
-  // error: undefined reference to '__atomic_load_4'
-  if (mo == memory_order_relaxed) {
-    v = a->val_dont_use;
+
+  if (sizeof(*a) < 8 || sizeof(void*) == 8) {
+    // Assume that aligned loads are atomic.
+    if (mo == memory_order_relaxed) {
+      v = a->val_dont_use;
+    } else if (mo == memory_order_consume) {
+      // Assume that processor respects data dependencies
+      // (and that compiler won't break them).
+      __asm__ __volatile__("" ::: "memory");
+      v = a->val_dont_use;
+      __asm__ __volatile__("" ::: "memory");
+    } else if (mo == memory_order_acquire) {
+      __asm__ __volatile__("" ::: "memory");
+      v = a->val_dont_use;
+#if defined(__i386__) || defined(__x86_64__)
+      // On x86 loads are implicitly acquire.
+      __asm__ __volatile__("" ::: "memory");
+#else
+      __sync_synchronize();
+#endif
+    } else {  // seq_cst
+#if defined(__i386__) || defined(__x86_64__)
+      // On x86 a plain MOV is enough for a seq_cst load.
+      __asm__ __volatile__("" ::: "memory");
+#else
+      // E.g. on POWER we need a hw fence even before the load.
+      __sync_synchronize();
+#endif
+      v = a->val_dont_use;
+#if defined(__i386__) || defined(__x86_64__)
+      __asm__ __volatile__("" ::: "memory");
+#else
+      __sync_synchronize();
+#endif
+    }
   } else {
-    atomic_signal_fence(memory_order_seq_cst);
-    v = a->val_dont_use;
-    atomic_signal_fence(memory_order_seq_cst);
+    // 64-bit load on 32-bit platform.
+    // FIXME(dvyukov): not implemented
+#if defined(__i386__)
+    __asm__ __volatile__(
+        "movq %1, %%mm0;"  // Use mmx reg for 64-bit atomic moves
+        "movq %%mm0, %0;"  // (ptr could be read-only)
+        "emms;"            // Empty mmx state/Reset FP regs
+        : "=m" (v)
+        : "m" (a->val_dont_use)
+        : // mark the FP stack and mmx registers as clobbered
+          "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)",
+#ifdef __MMX__
+          "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7",
+#endif  // #ifdef __MMX__
+          "memory");
+#else  // #if defined(__i386__)
+    // Gross, but simple and reliable.
+    // Assume that it is not in read-only memory.
+    v = __sync_fetch_and_add((typename T::Type volatile*)&a->val_dont_use, 0);
+#endif
   }
   return v;
 }
@@ -67,15 +118,59 @@
   DCHECK(mo & (memory_order_relaxed | memory_order_release
       | memory_order_seq_cst));
   DCHECK(!((uptr)a % sizeof(*a)));
-  if (mo == memory_order_relaxed) {
-    a->val_dont_use = v;
+
+  if (sizeof(*a) < 8 || sizeof(void*) == 8) {
+    // Assume that aligned stores are atomic.
+    if (mo == memory_order_relaxed) {
+      a->val_dont_use = v;
+    } else if (mo == memory_order_release) {
+#if defined(__i386__) || defined(__x86_64__)
+      // On x86 stores are implicitly release.
+      __asm__ __volatile__("" ::: "memory");
+#else
+      __sync_synchronize();
+#endif
+      a->val_dont_use = v;
+      __asm__ __volatile__("" ::: "memory");
+    } else {  // seq_cst
+#if defined(__i386__) || defined(__x86_64__)
+      // On x86 stores are implicitly release.
+      __asm__ __volatile__("" ::: "memory");
+#else
+      __sync_synchronize();
+#endif
+      a->val_dont_use = v;
+      __sync_synchronize();
+    }
   } else {
-    atomic_signal_fence(memory_order_seq_cst);
-    a->val_dont_use = v;
-    atomic_signal_fence(memory_order_seq_cst);
+    // 64-bit store on 32-bit platform.
+#if defined(__i386__)
+    __asm__ __volatile__(
+        "movq %1, %%mm0;"  // Use mmx reg for 64-bit atomic moves
+        "movq %%mm0, %0;"
+        "emms;"            // Empty mmx state/Reset FP regs
+        : "=m" (a->val_dont_use)
+        : "m" (v)
+        : // mark the FP stack and mmx registers as clobbered
+          "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)",
+#ifdef __MMX__
+          "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7",
+#endif  // #ifdef __MMX__
+          "memory");
+    if (mo == memory_order_seq_cst)
+      __sync_synchronize();
+#else  // #if defined(__i386__)
+    // Gross, but simple and reliable.
+    typename T::Type cmp = a->val_dont_use;
+    typename T::Type cur;
+    for (;;) {
+      cur = __sync_val_compare_and_swap(&a->val_dont_use, cmp, v);
+      if (cur == cmp)
+        break;
+      cmp = cur;
+    }
+#endif
   }
-  if (mo == memory_order_seq_cst)
-    atomic_thread_fence(memory_order_seq_cst);
 }
 
 template<typename T>
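To read the generic (non-i386) 64-bit fallback above in isolation, here is a self-contained sketch built on the same GCC __sync builtins; load64/store64 are hypothetical names, not part of the patch. The load is a fetch-and-add of zero, which returns the current value atomically but requires the location to be writable; the store keeps retrying a compare-and-swap until it succeeds, at which point v has been installed:

  // Standalone illustration, not part of the patch.
  #include <stdint.h>

  // Atomic 64-bit load: __sync_fetch_and_add(p, 0) returns the old value
  // atomically and writes it back unchanged.
  static uint64_t load64(volatile uint64_t *p) {
    return __sync_fetch_and_add(p, 0);
  }

  // Atomic 64-bit store: keep CAS-ing until the value we last observed is
  // still in place and is replaced by v.
  static void store64(volatile uint64_t *p, uint64_t v) {
    uint64_t cmp = *p;
    for (;;) {
      uint64_t cur = __sync_val_compare_and_swap(p, cmp, v);
      if (cur == cmp)  // CAS succeeded, *p now holds v
        break;
      cmp = cur;       // lost a race; retry with the freshly observed value
    }
  }

  int main() {
    static volatile uint64_t x;
    store64(&x, 0x0123456789abcdefULL);
    return load64(&x) == 0x0123456789abcdefULL ? 0 : 1;
  }

This is also the requirement behind the CMakeLists.txt hunk: the 8-byte __sync builtins need doubleword exclusive load/store instructions that the default Android ARM target lacks, hence -march=armv7 for that build.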
Index: lib/sanitizer_common/tests/sanitizer_atomic_test.cc
===================================================================
--- lib/sanitizer_common/tests/sanitizer_atomic_test.cc
+++ lib/sanitizer_common/tests/sanitizer_atomic_test.cc
@@ -15,6 +15,79 @@
 
 namespace __sanitizer {
 
+template<typename T>
+struct ValAndMagic {
+  typename T::Type magic0;
+  T a;
+  typename T::Type magic1;
+
+  static ValAndMagic<T> *sink;
+};
+
+template<typename T>
+ValAndMagic<T> *ValAndMagic<T>::sink;
+
+template<typename T, memory_order load_mo, memory_order store_mo>
+void CheckStoreLoad() {
+  typedef typename T::Type Type;
+  ValAndMagic<T> val;
+  // Prevent the compiler from scalarizing the struct.
+  ValAndMagic<T>::sink = &val;
+  // Ensure that surrounding memory is not overwritten.
+  val.magic0 = val.magic1 = (Type)-3;
+  for (u64 i = 0; i < 100; i++) {
+    // Generate a value that occupies all bytes of the variable.
+    u64 v = i;
+    v |= v << 8;
+    v |= v << 16;
+    v |= v << 32;
+    val.a.val_dont_use = (Type)v;
+    EXPECT_EQ(atomic_load(&val.a, load_mo), (Type)v);
+    val.a.val_dont_use = (Type)-1;
+    atomic_store(&val.a, (Type)v, store_mo);
+    EXPECT_EQ(val.a.val_dont_use, (Type)v);
+  }
+  EXPECT_EQ(val.magic0, (Type)-3);
+  EXPECT_EQ(val.magic1, (Type)-3);
+}
+
+TEST(SanitizerCommon, AtomicStoreLoad) {
+  CheckStoreLoad<atomic_uint8_t, memory_order_relaxed, memory_order_relaxed>();
+  CheckStoreLoad<atomic_uint8_t, memory_order_consume, memory_order_relaxed>();
+  CheckStoreLoad<atomic_uint8_t, memory_order_acquire, memory_order_relaxed>();
+  CheckStoreLoad<atomic_uint8_t, memory_order_relaxed, memory_order_release>();
+  CheckStoreLoad<atomic_uint8_t, memory_order_seq_cst, memory_order_seq_cst>();
+
+  CheckStoreLoad<atomic_uint16_t, memory_order_relaxed, memory_order_relaxed>();
+  CheckStoreLoad<atomic_uint16_t, memory_order_consume, memory_order_relaxed>();
+  CheckStoreLoad<atomic_uint16_t, memory_order_acquire, memory_order_relaxed>();
+  CheckStoreLoad<atomic_uint16_t, memory_order_relaxed, memory_order_release>();
+  CheckStoreLoad<atomic_uint16_t, memory_order_seq_cst, memory_order_seq_cst>();
+
+  CheckStoreLoad<atomic_uint32_t, memory_order_relaxed, memory_order_relaxed>();
+  CheckStoreLoad<atomic_uint32_t, memory_order_consume, memory_order_relaxed>();
+  CheckStoreLoad<atomic_uint32_t, memory_order_acquire, memory_order_relaxed>();
+  CheckStoreLoad<atomic_uint32_t, memory_order_relaxed, memory_order_release>();
+  CheckStoreLoad<atomic_uint32_t, memory_order_seq_cst, memory_order_seq_cst>();
+
+  CheckStoreLoad<atomic_uint64_t, memory_order_relaxed, memory_order_relaxed>();
+  CheckStoreLoad<atomic_uint64_t, memory_order_consume, memory_order_relaxed>();
+  CheckStoreLoad<atomic_uint64_t, memory_order_acquire, memory_order_relaxed>();
+  CheckStoreLoad<atomic_uint64_t, memory_order_relaxed, memory_order_release>();
+  CheckStoreLoad<atomic_uint64_t, memory_order_seq_cst, memory_order_seq_cst>();
+
+  CheckStoreLoad<atomic_uintptr_t, memory_order_relaxed, memory_order_relaxed>
+      ();
+  CheckStoreLoad<atomic_uintptr_t, memory_order_consume, memory_order_relaxed>
+      ();
+  CheckStoreLoad<atomic_uintptr_t, memory_order_acquire, memory_order_relaxed>
+      ();
+  CheckStoreLoad<atomic_uintptr_t, memory_order_relaxed, memory_order_release>
+      ();
+  CheckStoreLoad<atomic_uintptr_t, memory_order_seq_cst, memory_order_seq_cst>
+      ();
+}
+
 // Clang crashes while compiling this test for Android:
 // http://llvm.org/bugs/show_bug.cgi?id=15587
 #if !SANITIZER_ANDROID
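For completeness, the i386 fast path exercised by the new 64-bit test cases is a single movq through an MMX register, which is atomic for 8-byte-aligned operands and, unlike a CAS-based load, also works on read-only memory. A standalone sketch of the idiom (mmx_load64 is an illustrative name; it is only meaningful when built with -m32 on x86):

  // Standalone illustration, not part of the patch.
  #include <stdint.h>

  #if defined(__i386__)
  static uint64_t mmx_load64(const volatile uint64_t *src) {
    uint64_t v;
    __asm__ __volatile__(
        "movq %1, %%mm0;"  // one 64-bit move into an MMX register
        "movq %%mm0, %0;"  // and back out to the result slot
        "emms;"            // reset MMX/x87 state for the surrounding code
        : "=m"(v)
        : "m"(*src)
        : "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)",
  #ifdef __MMX__
          "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7",
  #endif
          "memory");
    return v;
  }
  #endif

  int main() {
  #if defined(__i386__)
    static volatile uint64_t x = 0x1122334455667788ULL;
    return mmx_load64(&x) == 0x1122334455667788ULL ? 0 : 1;
  #else
    return 0;  // the idiom is i386-specific
  #endif
  }

The emms at the end matters: MMX registers alias the x87 floating-point stack, so leaving MMX state live would corrupt any surrounding FP code, which is also why the st registers appear in the clobber list.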