; RUN: llc -mtriple=thumbv7k-apple-watchos2.0 -o - %s | FileCheck %s

; Struct with a padding byte before each interesting member, used to probe
; the v7k (watchOS) ABI field alignments. The expected offsets checked below
; (8, 24, 40, 64) imply: i64/double/<2 x float> are 8-byte aligned and
; <4 x float> is 16-byte aligned.
%struct = type { i8, i64, i8, double, i8, <2 x float>, i8, <4 x float> }
; i64 struct member must land at offset 8 (8-byte alignment after the leading i8).
; Note: original test had a "CHECL" typo here, which made FileCheck skip the check.
define i32 @test_i64_align() {
; CHECK-LABEL: test_i64_align:
; CHECK: movs r0, #8
  ret i32 ptrtoint(i64* getelementptr(%struct, %struct* null, i32 0, i32 1) to i32)
}
| 10 | + |
; double struct member must land at offset 24 (8-byte alignment).
; Note: original test had a "CHECL" typo here, which made FileCheck skip the check.
define i32 @test_f64_align() {
; CHECK-LABEL: test_f64_align:
; CHECK: movs r0, #24
  ret i32 ptrtoint(double* getelementptr(%struct, %struct* null, i32 0, i32 3) to i32)
}
| 16 | + |
; <2 x float> struct member must land at offset 40 (8-byte alignment).
; Note: original test had a "CHECL" typo here, which made FileCheck skip the check.
define i32 @test_v2f32_align() {
; CHECK-LABEL: test_v2f32_align:
; CHECK: movs r0, #40
  ret i32 ptrtoint(<2 x float>* getelementptr(%struct, %struct* null, i32 0, i32 5) to i32)
}
| 22 | + |
; <4 x float> struct member must land at offset 64 (16-byte alignment).
; Note: original test had a "CHECL" typo here, which made FileCheck skip the check.
define i32 @test_v4f32_align() {
; CHECK-LABEL: test_v4f32_align:
; CHECK: movs r0, #64
  ret i32 ptrtoint(<4 x float>* getelementptr(%struct, %struct* null, i32 0, i32 7) to i32)
}
| 28 | + |
; Key point here is that an extra register has to be saved so that the DPRs end
; up in an aligned location (as prologue/epilogue inserter had calculated):
; r5 is pushed alongside r6/r7/lr purely for alignment, so no separate
; "sub sp" / "add sp" adjustment is needed around the vpush/vpop.
define void @test_dpr_unwind_align() {
; CHECK-LABEL: test_dpr_unwind_align:
; CHECK: push {r5, r6, r7, lr}
; CHECK-NOT: sub sp
; CHECK: vpush {d8, d9}
; [...]
; CHECK: bl _test_i64_align
; CHECK-NOT: add sp,
; CHECK: vpop {d8, d9}
; CHECK-NOT: add sp,
; CHECK: pop {r5, r6, r7, pc}

  call void asm sideeffect "", "~{r6},~{d8},~{d9}"()

  ; Whatever
  call i32 @test_i64_align()
  ret void
}
| 49 | + |
; This time, there's no viable way to tack CS-registers onto the list: a real SP
; adjustment needs to be performed to put d8 and d9 where they should be
; (hence the explicit "sub sp, #4" before the vpush and "add sp, #4" after
; the vpop).
define void @test_dpr_unwind_align_manually() {
; CHECK-LABEL: test_dpr_unwind_align_manually:
; CHECK: push {r4, r5, r6, r7, lr}
; CHECK-NOT: sub sp
; CHECK: push.w {r8, r11}
; CHECK: sub sp, #4
; CHECK: vpush {d8, d9}
; [...]
; CHECK: bl _test_i64_align
; CHECK-NOT: add sp,
; CHECK: vpop {d8, d9}
; CHECK: add sp, #4
; CHECK: pop.w {r8, r11}
; CHECK: pop {r4, r5, r6, r7, pc}

  call void asm sideeffect "", "~{r4},~{r5},~{r6},~{r7},~{r8},~{d8},~{d9}"()

  ; Whatever
  call i32 @test_i64_align()
  ret void
}
| 73 | + |
; If there's only a CS1 area, the sub should be in the right place: the
; alignment adjustment ("sub sp, #4") goes between the GPR push and the vpush,
; and the local-area adjustment ("sub sp, #8") goes after the vpush.
define void @test_dpr_unwind_align_just_cs1() {
; CHECK-LABEL: test_dpr_unwind_align_just_cs1:
; CHECK: push {r4, r5, r6, r7, lr}
; CHECK: sub sp, #4
; CHECK: vpush {d8, d9}
; CHECK: sub sp, #8
; [...]
; CHECK: bl _test_i64_align
; CHECK: add sp, #8
; CHECK: vpop {d8, d9}
; CHECK: add sp, #4
; CHECK: pop {r4, r5, r6, r7, pc}

  call void asm sideeffect "", "~{r4},~{r5},~{r6},~{r7},~{d8},~{d9}"()

  ; Whatever
  call i32 @test_i64_align()
  ret void
}
| 94 | + |
; If there are no DPRs, we shouldn't try to align the stack in stages anyway:
; a single combined "sub sp, #12" / "add sp, #12" suffices.
define void @test_dpr_unwind_align_no_dprs() {
; CHECK-LABEL: test_dpr_unwind_align_no_dprs:
; CHECK: push {r4, r5, r6, r7, lr}
; CHECK: sub sp, #12
; [...]
; CHECK: bl _test_i64_align
; CHECK: add sp, #12
; CHECK: pop {r4, r5, r6, r7, pc}

  call void asm sideeffect "", "~{r4},~{r5},~{r6},~{r7}"()

  ; Whatever
  call i32 @test_i64_align()
  ret void
}
| 111 | + |
; 128-bit vectors should use 128-bit (i.e. correctly aligned) slots on
; the stack: the :128 alignment annotation on the vld1 is the key check.
define <4 x float> @test_v128_stack_pass([8 x double], float, <4 x float> %in) {
; CHECK-LABEL: test_v128_stack_pass:
; CHECK: add r[[ADDR:[0-9]+]], sp, #16
; CHECK: vld1.64 {d0, d1}, [r[[ADDR]]:128]

  ret <4 x float> %in
}
| 121 | + |
declare void @varargs(i32, ...)

; When varargs are enabled, we go down a different route. Still want 128-bit
; alignment though (checked via the :128 annotation on the vst1).
define void @test_v128_stack_pass_varargs(<4 x float> %in) {
; CHECK-LABEL: test_v128_stack_pass_varargs:
; CHECK: add r[[ADDR:[0-9]+]], sp, #16
; CHECK: vst1.64 {d0, d1}, [r[[ADDR]]:128]

  call void(i32, ...) @varargs(i32 undef, [3 x i32] undef, float undef, <4 x float> %in)
  ret void
}
| 134 | + |
; To be compatible with AAPCS's va_start model (store r0-r3 at incoming SP, give
; a single pointer), 64-bit quantities must be passed in an even-odd register
; pair: here %r2_r3 occupies r2/r3 (skipping r1), and the following i32 %sp
; goes on the stack.
define i64 @test_64bit_gpr_align(i32, i64 %r2_r3, i32 %sp) {
; CHECK-LABEL: test_64bit_gpr_align:
; CHECK: ldr [[RHS:r[0-9]+]], [sp]
; CHECK: adds r0, [[RHS]], r2
; CHECK: adc r1, r3, #0

  %ext = zext i32 %sp to i64
  %sum = add i64 %ext, %r2_r3
  ret i64 %sum
}