Index: lld/trunk/ELF/InputSection.cpp
===================================================================
--- lld/trunk/ELF/InputSection.cpp
+++ lld/trunk/ELF/InputSection.cpp
@@ -575,6 +575,10 @@
     // Variant 1. The thread pointer points to a TCB with a fixed 2-word size,
     // followed by a variable amount of alignment padding, followed by the TLS
     // segment.
+    //
+    // NB: While the ARM/AArch64 ABI formally has a 2-word TCB size, lld
+    // effectively increases the TCB size to 8 words for Android compatibility
+    // in executables. It does so by raising the TLS segment's alignment.
     return alignTo(Config->Wordsize * 2, Out::TlsPhdr->p_align);
   case EM_386:
   case EM_X86_64:
Index: lld/trunk/ELF/Writer.cpp
===================================================================
--- lld/trunk/ELF/Writer.cpp
+++ lld/trunk/ELF/Writer.cpp
@@ -2181,11 +2181,23 @@
       P->p_memsz = alignTo(P->p_memsz, Target->PageSize);
     }
 
-    // The TLS pointer goes after PT_TLS for variant 2 targets. At least glibc
-    // will align it, so round up the size to make sure the offsets are
-    // correct.
-    if (P->p_type == PT_TLS && P->p_memsz)
+    if (P->p_type == PT_TLS && P->p_memsz) {
+      if (!Config->Shared &&
+          (Config->EMachine == EM_ARM || Config->EMachine == EM_AARCH64)) {
+        // On ARM/AArch64, reserve extra space (8 words) between the thread
+        // pointer and an executable's TLS segment by overaligning the segment.
+        // This reservation is needed for backwards compatibility with Android's
+        // TCB, which allocates several slots after the thread pointer (e.g.
+        // TLS_SLOT_STACK_GUARD==5). For simplicity, this overalignment is also
+        // done on other operating systems.
+        P->p_align = std::max<uint64_t>(P->p_align, Config->Wordsize * 8);
+      }
+
+      // The TLS pointer goes after PT_TLS for variant 2 targets. At least glibc
+      // will align it, so round up the size to make sure the offsets are
+      // correct.
       P->p_memsz = alignTo(P->p_memsz, P->p_align);
+    }
   }
 }
 
Index: lld/trunk/test/ELF/aarch64-cortex-a53-843419-tlsrelax.s
===================================================================
--- lld/trunk/test/ELF/aarch64-cortex-a53-843419-tlsrelax.s
+++ lld/trunk/test/ELF/aarch64-cortex-a53-843419-tlsrelax.s
@@ -26,9 +26,9 @@
 // CHECK: _start:
 // CHECK-NEXT:   210ff8:        41 d0 3b d5     mrs     x1, TPIDR_EL0
 // CHECK-NEXT:   210ffc:        00 00 a0 d2     movz    x0, #0, lsl #16
-// CHECK-NEXT:   211000:        01 02 80 f2     movk    x1, #16
+// CHECK-NEXT:   211000:        01 08 80 f2     movk    x1, #64
 // CHECK-NEXT:   211004:        00 00 a0 d2     movz    x0, #0, lsl #16
-// CHECK-NEXT:   211008:        01 02 80 f2     movk    x1, #16
+// CHECK-NEXT:   211008:        01 08 80 f2     movk    x1, #64
 // CHECK-NEXT:   21100c:        c0 03 5f d6     ret
 
  .type  v,@object
Index: lld/trunk/test/ELF/aarch64-tls-gdle.s
===================================================================
--- lld/trunk/test/ELF/aarch64-tls-gdle.s
+++ lld/trunk/test/ELF/aarch64-tls-gdle.s
@@ -5,15 +5,15 @@
 # RUN: llvm-objdump -d %tout | FileCheck %s
 # RUN: llvm-readobj -s -r %tout | FileCheck -check-prefix=RELOC %s
 
-#Local-Dynamic to Initial-Exec relax creates no
+#Global-Dynamic to Local-Exec relax creates no
 #RELOC:      Relocations [
 #RELOC-NEXT: ]
 
-# TCB size = 0x16 and foo is first element from TLS register.
+# TCB size = 64 and foo is first element from TLS register.
 # CHECK: Disassembly of section .text:
 # CHECK: _start:
 # CHECK:  210000:	00 00 a0 d2	movz	x0, #0, lsl #16
-# CHECK:  210004:	00 02 80 f2 	movk	x0, #16
+# CHECK:  210004:	00 08 80 f2 	movk	x0, #64
 # CHECK:  210008:	1f 20 03 d5 	nop
 # CHECK:  21000c:	1f 20 03 d5 	nop
 
Index: lld/trunk/test/ELF/aarch64-tls-iele.s
===================================================================
--- lld/trunk/test/ELF/aarch64-tls-iele.s
+++ lld/trunk/test/ELF/aarch64-tls-iele.s
@@ -9,13 +9,13 @@
 # RELOC:      Relocations [
 # RELOC-NEXT: ]
 
-# TCB size = 0x16 and foo is first element from TLS register.
+# TCB size = 64 and foo is first element from TLS register.
 # CHECK: Disassembly of section .text:
 # CHECK: _start:
 # CHECK-NEXT: 210000:  00 00 a0 d2   movz   x0, #0, lsl #16
-# CHECK-NEXT: 210004:  80 02 80 f2   movk   x0, #20
+# CHECK-NEXT: 210004:  80 08 80 f2   movk   x0, #68
 # CHECK-NEXT: 210008:  00 00 a0 d2   movz   x0, #0, lsl #16
-# CHECK-NEXT: 21000c:  00 02 80 f2   movk   x0, #16
+# CHECK-NEXT: 21000c:  00 08 80 f2   movk   x0, #64
 
 .section .tdata
 .align 2
Index: lld/trunk/test/ELF/aarch64-tls-le.s
===================================================================
--- lld/trunk/test/ELF/aarch64-tls-le.s
+++ lld/trunk/test/ELF/aarch64-tls-le.s
@@ -4,7 +4,7 @@
 # RUN: llvm-objdump -d %tout | FileCheck %s
 # RUN: llvm-readobj -s -r %tout | FileCheck -check-prefix=RELOC %s
 
-#Local-Dynamic to Initial-Exec relax creates no
+#Local-Exec accesses are resolved statically, so the link creates no
 #RELOC:      Relocations [
 #RELOC-NEXT: ]
 
@@ -17,12 +17,12 @@
  add x0, x0, :tprel_hi12:v2
  add x0, x0, :tprel_lo12_nc:v2
 
-# TCB size = 0x16 and foo is first element from TLS register.
+# TCB size = 64 and v1 is the first TLS element, so it is at offset 64 from TP.
 #CHECK: Disassembly of section .text:
 #CHECK: _start:
 #CHECK:  210000: 40 d0 3b d5     mrs     x0, TPIDR_EL0
 #CHECK:  210004: 00 00 40 91     add     x0, x0, #0, lsl #12
-#CHECK:  210008: 00 40 00 91     add     x0, x0, #16
+#CHECK:  210008: 00 00 01 91     add     x0, x0, #64
 #CHECK:  21000c: 40 d0 3b d5     mrs     x0, TPIDR_EL0
 #CHECK:  210010: 00 fc 7f 91     add     x0, x0, #4095, lsl #12
 #CHECK:  210014: 00 e0 3f 91     add     x0, x0, #4088
@@ -36,9 +36,9 @@
 .word  0
 .size  v1, 4
 
-# The current offset from the thread pointer is 20. Raise it to just below the
+# The current offset from the thread pointer is 68. Raise it to just below the
 # 24-bit limit.
-.space (0xfffff8 - 20)
+.space (0xfffff8 - 68)
 
 .type   v2,@object
 .globl  v2
Index: lld/trunk/test/ELF/aarch64-tlsld-ldst.s
===================================================================
--- lld/trunk/test/ELF/aarch64-tlsld-ldst.s
+++ lld/trunk/test/ELF/aarch64-tlsld-ldst.s
@@ -26,27 +26,27 @@
 
 // CHECK: _start:
 // CHECK-NEXT:    210000:       48 d0 3b d5     mrs     x8, TPIDR_EL0
-// 0x0 + c10 = 0xc10       = tcb (16-bytes) + var0
-// CHECK-NEXT:    210004:       08 01 40 91     add     x8, x8, #0, lsl #12
-// CHECK-NEXT:    210008:       14 05 c3 3d     ldr     q20, [x8, #3088]
-// 0x1000 + 0x820 = 0x1820 = tcb + var1
-// CHECK-NEXT:    21000c:       08 05 40 91     add     x8, x8, #1, lsl #12
-// CHECK-NEXT:    210010:       00 11 44 f9     ldr     x0, [x8, #2080]
-// 0x2000 + 0x428 = 0x2428 = tcb + var2
-// CHECK-NEXT:    210014:       08 09 40 91     add     x8, x8, #2, lsl #12
-// CHECK-NEXT:    210018:       00 29 44 b9     ldr     w0, [x8, #1064]
-// 0x3000 + 0x2c  = 0x302c = tcb + var3
-// CHECK-NEXT:    21001c:       08 0d 40 91     add     x8, x8, #3, lsl #12
-// CHECK-NEXT:    210020:       00 59 40 79     ldrh    w0, [x8, #44]
-// 0x3000 + 0xc2e = 0x32ce = tcb + var4
-// CHECK-NEXT:    210024:       08 0d 40 91     add     x8, x8, #3, lsl #12
-// CHECK-NEXT:    210028:       00 b9 70 39     ldrb    w0, [x8, #3118]
-
-// CHECK-SYMS:      0000000000000c00     0 TLS     GLOBAL DEFAULT    2 var0
-// CHECK-SYMS-NEXT: 0000000000001810     4 TLS     GLOBAL DEFAULT    2 var1
-// CHECK-SYMS-NEXT: 0000000000002418     2 TLS     GLOBAL DEFAULT    2 var2
-// CHECK-SYMS-NEXT: 000000000000301c     1 TLS     GLOBAL DEFAULT    2 var3
-// CHECK-SYMS-NEXT: 0000000000003c1e     0 TLS     GLOBAL DEFAULT    2 var4
+// 0x0 + 0xc40 = 0xc40     = tcb (64-bytes) + var0
+// CHECK-NEXT:    210004:       08 01 40 91     add x8, x8, #0, lsl #12
+// CHECK-NEXT:    210008:       14 11 c3 3d     ldr q20, [x8, #3136]
+// 0x1000 + 0x850 = 0x1850 = tcb + var1
+// CHECK-NEXT:    21000c:       08 05 40 91     add x8, x8, #1, lsl #12
+// CHECK-NEXT:    210010:       00 29 44 f9     ldr x0, [x8, #2128]
+// 0x2000 + 0x458 = 0x2458 = tcb + var2
+// CHECK-NEXT:    210014:       08 09 40 91     add x8, x8, #2, lsl #12
+// CHECK-NEXT:    210018:       00 59 44 b9     ldr w0, [x8, #1112]
+// 0x3000 + 0x5c  = 0x305c = tcb + var3
+// CHECK-NEXT:    21001c:       08 0d 40 91     add x8, x8, #3, lsl #12
+// CHECK-NEXT:    210020:       00 b9 40 79     ldrh  w0, [x8, #92]
+// 0x3000 + 0xc5e = 0x3c5e = tcb + var4
+// CHECK-NEXT:    210024:       08 0d 40 91     add x8, x8, #3, lsl #12
+// CHECK-NEXT:    210028:       00 79 71 39     ldrb  w0, [x8, #3166]
+
+// CHECK-SYMS:      0000000000000c00    16 TLS     GLOBAL DEFAULT    2 var0
+// CHECK-SYMS-NEXT: 0000000000001810     8 TLS     GLOBAL DEFAULT    2 var1
+// CHECK-SYMS-NEXT: 0000000000002418     4 TLS     GLOBAL DEFAULT    2 var2
+// CHECK-SYMS-NEXT: 000000000000301c     2 TLS     GLOBAL DEFAULT    2 var3
+// CHECK-SYMS-NEXT: 0000000000003c1e     1 TLS     GLOBAL DEFAULT    2 var4
 
         .globl var0
         .globl var1
@@ -59,12 +59,12 @@
         .type var3,@object
 
 .section .tbss,"awT",@nobits
-        .balign 16
+        .balign 64
         .space 1024 * 3
 var0:
         .quad 0
         .quad 0
-        .size var1, 16
+        .size var0, 16
         .space 1024 * 3
 var1:
         .quad 0
@@ -72,14 +72,14 @@
         .space 1024 * 3
 var2:
         .word 0
-        .size var1, 4
+        .size var2, 4
 
         .space 1024 * 3
 var3:
         .hword 0
-        .size var2, 2
+        .size var3, 2
         .space 1024 * 3
 var4:
         .byte 0
-        .size var3, 1
+        .size var4, 1
         .space 1024 * 3
Index: lld/trunk/test/ELF/arm-tls-le32.s
===================================================================
--- lld/trunk/test/ELF/arm-tls-le32.s
+++ lld/trunk/test/ELF/arm-tls-le32.s
@@ -69,9 +69,9 @@
 
 // CHECK: Disassembly of section .text:
 // CHECK-NEXT: _start:
-// offset of x from Thread pointer = (TcbSize + 0x0 = 0x8)
-// CHECK-NEXT:   11000:         08 00 00 00
-// offset of z from Thread pointer = (TcbSize + 0x8 = 0x10)
-// CHECK-NEXT:   11004:         10 00 00 00
-// offset of y from Thread pointer = (TcbSize + 0x4 = 0xc)
-// CHECK-NEXT:   11008:         0c 00 00 00
+// offset of x from Thread pointer = (TcbSize + 0x0 = 0x20)
+// CHECK-NEXT:   11000:         20 00 00 00
+// offset of z from Thread pointer = (TcbSize + 0x8 = 0x28)
+// CHECK-NEXT:   11004:         28 00 00 00
+// offset of y from Thread pointer = (TcbSize + 0x4 = 0x24)
+// CHECK-NEXT:   11008:         24 00 00 00
Index: lld/trunk/test/ELF/arm-tls-norelax-ie-le.s
===================================================================
--- lld/trunk/test/ELF/arm-tls-norelax-ie-le.s
+++ lld/trunk/test/ELF/arm-tls-norelax-ie-le.s
@@ -37,5 +37,5 @@
  .type x2, %object
 
 // CHECK: Contents of section .got:
-// x1 at offset 8 from TP, x2 at offset c from TP. Offsets include TCB size of 8
-// CHECK-NEXT: 13064 08000000 0c000000
+// x1 at offset 0x20 from TP, x2 at offset 0x24 from TP. Offsets include TCB size of 0x20
+// CHECK-NEXT: 13064 20000000 24000000