diff --git a/compiler-rt/lib/scudo/standalone/common.h b/compiler-rt/lib/scudo/standalone/common.h
--- a/compiler-rt/lib/scudo/standalone/common.h
+++ b/compiler-rt/lib/scudo/standalone/common.h
@@ -112,21 +112,6 @@
   *RandState = State;
 }
 
-// Hardware specific inlinable functions.
-
-inline void yieldProcessor(UNUSED u8 Count) {
-#if defined(__i386__) || defined(__x86_64__)
-  __asm__ __volatile__("" ::: "memory");
-  for (u8 I = 0; I < Count; I++)
-    __asm__ __volatile__("pause");
-#elif defined(__aarch64__) || defined(__arm__)
-  __asm__ __volatile__("" ::: "memory");
-  for (u8 I = 0; I < Count; I++)
-    __asm__ __volatile__("yield");
-#endif
-  __asm__ __volatile__("" ::: "memory");
-}
-
 // Platform specific functions.
 
 extern uptr PageSizeCached;
diff --git a/compiler-rt/lib/scudo/standalone/mutex.h b/compiler-rt/lib/scudo/standalone/mutex.h
--- a/compiler-rt/lib/scudo/standalone/mutex.h
+++ b/compiler-rt/lib/scudo/standalone/mutex.h
@@ -35,7 +35,7 @@
 #pragma nounroll
 #endif
     for (u8 I = 0U; I < NumberOfTries; I++) {
-      yieldProcessor(NumberOfYields);
+      delayLoop();
       if (tryLock())
         return;
     }
@@ -53,10 +53,21 @@
   }
 
 private:
+  // Short fixed-length spin used between tryLock() attempts. The count is
+  // based on the average time spent accessing caches (the fastest operations)
+  // so that we are unlikely to wait too long for fast operations.
+  void delayLoop() {
+    constexpr u32 SpinTimes = 16;
+    volatile u32 V = 0;
+    for (u32 I = 0; I < SpinTimes; ++I)
+      V = V + 1; // Not `++V`: deprecated on volatile operands since C++20.
+  }
+
   void assertHeldImpl();
 
-  static constexpr u8 NumberOfTries = 8U;
-  static constexpr u8 NumberOfYields = 8U;
+  // TODO(chiahungduan): Adapt this value based on scenarios. E.g., primary and
+  // secondary allocator have different allocation times.
+  static constexpr u8 NumberOfTries = 32U;
 
 #if SCUDO_LINUX
   atomic_u32 M = {};