Index: llvm/include/llvm/CodeGen/CallingConvLower.h =================================================================== --- llvm/include/llvm/CodeGen/CallingConvLower.h +++ llvm/include/llvm/CodeGen/CallingConvLower.h @@ -43,6 +43,7 @@ AExtUpper, // The value is in the upper bits of the location and should be // extended with undefined upper bits when retrieved. BCvt, // The value is bit-converted in the location. + Trunc, // The value is truncated in the location. VExt, // The value is vector-widened in the location. // FIXME: Not implemented yet. Code that uses AExt to mean // vector-widen should be fixed to use VExt instead. Index: llvm/include/llvm/CodeGen/TargetSubtargetInfo.h =================================================================== --- llvm/include/llvm/CodeGen/TargetSubtargetInfo.h +++ llvm/include/llvm/CodeGen/TargetSubtargetInfo.h @@ -274,6 +274,12 @@ /// scheduling, DAGCombine, etc.). virtual bool useAA() const; + /// \brief Sink addresses into blocks using GEP instructions rather than + /// pointer casts and arithmetic. + virtual bool addrSinkUsingGEPs() const { + return useAA(); + } + /// Enable the use of the early if conversion pass. virtual bool enableEarlyIfConversion() const { return false; } Index: llvm/include/llvm/Target/TargetCallingConv.td =================================================================== --- llvm/include/llvm/Target/TargetCallingConv.td +++ llvm/include/llvm/Target/TargetCallingConv.td @@ -152,6 +152,12 @@ ValueType DestTy = destTy; } +/// CCTruncToType - If applied, this truncates the specified current value to +/// the specified type. +class CCTruncToType : CCAction { + ValueType DestTy = destTy; +} + /// CCPassIndirect - If applied, this stores the value to stack and passes the pointer /// as normal argument. class CCPassIndirect : CCAction { Index: llvm/lib/CodeGen/CodeGenPrepare.cpp =================================================================== --- llvm/lib/CodeGen/CodeGenPrepare.cpp +++ llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -4790,8 +4790,8 @@ << " for " << *MemoryInst << "\n"); if (SunkAddr->getType() != Addr->getType()) SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType()); - } else if (AddrSinkUsingGEPs || - (!AddrSinkUsingGEPs.getNumOccurrences() && TM && TTI->useAA())) { + } else if (AddrSinkUsingGEPs || (!AddrSinkUsingGEPs.getNumOccurrences() && + TM && SubtargetInfo->addrSinkUsingGEPs())) { // By default, we use the GEP-based method when AA is used later. This // prevents new inttoptr/ptrtoint pairs from degrading AA capabilities. LLVM_DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -9799,6 +9799,10 @@ FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex()); } + // Analyses past this point are naive and don't expect an assertion. + if (Res.getOpcode() == ISD::AssertZext) + Res = Res.getOperand(0); + // Update the SwiftErrorVRegDefMap. 
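// A note on the AssertZext peeling above: under arm64_32 (ILP32) pointers are
// 32 bits in memory but are carried zero-extended in 64-bit registers, and the
// AArch64 lowering wraps incoming pointer arguments in AssertZext(i32) to
// record that bits [63:32] are zero. The generic code past this point does not
// expect that wrapper, so it is stripped here. Below is a minimal standalone
// sketch (plain C++, not LLVM API; names are illustrative only) of the
// invariant the wrapper encodes.
#include <cassert>
#include <cstdint>

// Moving a 32-bit memory pointer into a 64-bit register zero-extends it.
inline uint64_t extendPtrToReg(uint32_t MemPtr) { return uint64_t(MemPtr); }

// The property AssertZext(i32) asserts about such a register value.
inline bool upperHalfIsZero(uint64_t RegPtr) { return (RegPtr >> 32) == 0; }

inline void demoILP32PtrInvariant() {
  uint64_t P = extendPtrToReg(0x1000u);
  assert(upperHalfIsZero(P) && "ILP32 pointers live in the low 4GB");
}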
if (Res.getOpcode() == ISD::CopyFromReg && isSwiftErrorArg) { unsigned Reg = cast(Res.getOperand(1))->getReg(); Index: llvm/lib/CodeGen/TargetLoweringBase.cpp =================================================================== --- llvm/lib/CodeGen/TargetLoweringBase.cpp +++ llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -167,6 +167,7 @@ setLibcallName(RTLIB::BZERO, "__bzero"); break; case Triple::aarch64: + case Triple::aarch64_32: setLibcallName(RTLIB::BZERO, "bzero"); break; default: Index: llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp =================================================================== --- llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -155,6 +155,7 @@ break; case Triple::aarch64: case Triple::aarch64_be: + case Triple::aarch64_32: // The small model guarantees static code/data size < 4GB, but not where it // will be in memory. Most of these could end up >2GB away so even a signed // pc-relative 32-bit address is insufficient, theoretically. Index: llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp =================================================================== --- llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp +++ llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp @@ -119,7 +119,8 @@ return make_error( std::string("No callback manager available for ") + T.str(), inconvertibleErrorCode()); - case Triple::aarch64: { + case Triple::aarch64: + case Triple::aarch64_32: { typedef orc::LocalJITCompileCallbackManager CCMgrT; return CCMgrT::Create(ES, ErrorHandlerAddress); } @@ -167,6 +168,7 @@ }; case Triple::aarch64: + case Triple::aarch64_32: return [](){ return llvm::make_unique< orc::LocalIndirectStubsManager>(); Index: llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp =================================================================== --- llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp +++ llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp @@ -90,6 +90,7 @@ inconvertibleErrorCode()); case Triple::aarch64: + case Triple::aarch64_32: return LocalLazyCallThroughManager::Create(ES, ErrorHandlerAddr); Index: llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp =================================================================== --- llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp +++ llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp @@ -917,7 +917,8 @@ uint8_t *RuntimeDyldImpl::createStubFunction(uint8_t *Addr, unsigned AbiVariant) { - if (Arch == Triple::aarch64 || Arch == Triple::aarch64_be) { + if (Arch == Triple::aarch64 || Arch == Triple::aarch64_be || + Arch == Triple::aarch64_32) { // This stub has to be able to access the full address space, // since symbol lookup won't necessarily find a handy, in-range, // PLT stub for functions which could be anywhere. 
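// The JIT pieces above simply reuse the 64-bit AArch64 paths for aarch64_32:
// the instruction set is identical, and as the comment says the stub must be
// able to reach the whole address space because a direct B/BL only spans
// +/-128MiB. A standalone sketch of that range check (illustrative helper,
// not RuntimeDyld code):
#include <cstdint>

// AArch64 B/BL encodes a signed 26-bit word offset, i.e. +/-128MiB.
inline bool fitsDirectBranch(uint64_t From, uint64_t To) {
  int64_t Delta = int64_t(To) - int64_t(From);
  return (Delta & 3) == 0 && Delta >= -(int64_t(1) << 27) &&
         Delta < (int64_t(1) << 27);
}
// When this check fails, createStubFunction emits an indirect stub: load the
// full target address into a scratch register and branch through it.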
Index: llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp =================================================================== --- llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp +++ llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp @@ -353,6 +353,7 @@ case Triple::arm: return make_unique(MemMgr, Resolver); case Triple::aarch64: + case Triple::aarch64_32: return make_unique(MemMgr, Resolver); case Triple::x86: return make_unique(MemMgr, Resolver); Index: llvm/lib/LTO/LTOCodeGenerator.cpp =================================================================== --- llvm/lib/LTO/LTOCodeGenerator.cpp +++ llvm/lib/LTO/LTOCodeGenerator.cpp @@ -365,7 +365,8 @@ MCpu = "core2"; else if (Triple.getArch() == llvm::Triple::x86) MCpu = "yonah"; - else if (Triple.getArch() == llvm::Triple::aarch64) + else if (Triple.getArch() == llvm::Triple::aarch64 || + Triple.getArch() == llvm::Triple::aarch64_32) MCpu = "cyclone"; } Index: llvm/lib/LTO/LTOModule.cpp =================================================================== --- llvm/lib/LTO/LTOModule.cpp +++ llvm/lib/LTO/LTOModule.cpp @@ -220,7 +220,8 @@ CPU = "core2"; else if (Triple.getArch() == llvm::Triple::x86) CPU = "yonah"; - else if (Triple.getArch() == llvm::Triple::aarch64) + else if (Triple.getArch() == llvm::Triple::aarch64 || + Triple.getArch() == llvm::Triple::aarch64_32) CPU = "cyclone"; } Index: llvm/lib/LTO/ThinLTOCodeGenerator.cpp =================================================================== --- llvm/lib/LTO/ThinLTOCodeGenerator.cpp +++ llvm/lib/LTO/ThinLTOCodeGenerator.cpp @@ -490,7 +490,8 @@ TMBuilder.MCpu = "core2"; else if (TheTriple.getArch() == llvm::Triple::x86) TMBuilder.MCpu = "yonah"; - else if (TheTriple.getArch() == llvm::Triple::aarch64) + else if (TheTriple.getArch() == llvm::Triple::aarch64 || + TheTriple.getArch() == llvm::Triple::aarch64_32) TMBuilder.MCpu = "cyclone"; } TMBuilder.TheTriple = std::move(TheTriple); Index: llvm/lib/MC/MCObjectFileInfo.cpp =================================================================== --- llvm/lib/MC/MCObjectFileInfo.cpp +++ llvm/lib/MC/MCObjectFileInfo.cpp @@ -28,7 +28,7 @@ return false; // aarch64 always has it. - if (T.getArch() == Triple::aarch64) + if (T.getArch() == Triple::aarch64 || T.getArch() == Triple::aarch64_32) return true; // armv7k always has it. 
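// The changes above are plumbing: wherever only the instruction set matters
// (MachO JIT back-end selection, the "cyclone" default CPU for LTO/ThinLTO,
// compact unwind availability), aarch64_32 takes exactly the same path as
// aarch64 and aarch64_be. A hypothetical consolidated check, for illustration
// only (the patch itself keeps the explicit per-site comparisons):
enum class ArchKind { aarch64, aarch64_be, aarch64_32, x86, x86_64, arm, Other };

inline bool isAArch64Family(ArchKind A) {
  return A == ArchKind::aarch64 || A == ArchKind::aarch64_be ||
         A == ArchKind::aarch64_32;
}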
@@ -57,7 +57,8 @@ MachO::S_ATTR_STRIP_STATIC_SYMS | MachO::S_ATTR_LIVE_SUPPORT, SectionKind::getReadOnly()); - if (T.isOSDarwin() && T.getArch() == Triple::aarch64) + if (T.isOSDarwin() && + (T.getArch() == Triple::aarch64 || T.getArch() == Triple::aarch64_32)) SupportsCompactUnwindWithoutEHFrame = true; if (T.isWatchABI()) @@ -193,7 +194,7 @@ if (T.getArch() == Triple::x86_64 || T.getArch() == Triple::x86) CompactUnwindDwarfEHFrameOnly = 0x04000000; // UNWIND_X86_64_MODE_DWARF - else if (T.getArch() == Triple::aarch64) + else if (T.getArch() == Triple::aarch64 || T.getArch() == Triple::aarch64_32) CompactUnwindDwarfEHFrameOnly = 0x03000000; // UNWIND_ARM64_MODE_DWARF else if (T.getArch() == Triple::arm || T.getArch() == Triple::thumb) CompactUnwindDwarfEHFrameOnly = 0x04000000; // UNWIND_ARM_MODE_DWARF Index: llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -1193,4 +1193,6 @@ RegisterAsmPrinter X(getTheAArch64leTarget()); RegisterAsmPrinter Y(getTheAArch64beTarget()); RegisterAsmPrinter Z(getTheARM64Target()); + RegisterAsmPrinter W(getTheARM64_32Target()); + RegisterAsmPrinter V(getTheAArch64_32Target()); } Index: llvm/lib/Target/AArch64/AArch64CallLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64CallLowering.cpp +++ llvm/lib/Target/AArch64/AArch64CallLowering.cpp @@ -378,14 +378,16 @@ return false; if (F.isVarArg()) { - if (!MF.getSubtarget().isTargetDarwin()) { - // FIXME: we need to reimplement saveVarArgsRegisters from + auto &Subtarget = MF.getSubtarget(); + if (!Subtarget.isTargetDarwin()) { + // FIXME: we need to reimplement saveVarArgsRegisters from // AArch64ISelLowering. return false; } - // We currently pass all varargs at 8-byte alignment. - uint64_t StackOffset = alignTo(Handler.StackUsed, 8); + // We currently pass all varargs at 8-byte alignment, or 4 in ILP32. + uint64_t StackOffset = + alignTo(Handler.StackUsed, Subtarget.isTargetILP32() ? 4 : 8); auto &MFI = MIRBuilder.getMF().getFrameInfo(); AArch64FunctionInfo *FuncInfo = MF.getInfo(); Index: llvm/lib/Target/AArch64/AArch64CallingConvention.h =================================================================== --- llvm/lib/Target/AArch64/AArch64CallingConvention.h +++ llvm/lib/Target/AArch64/AArch64CallingConvention.h @@ -25,6 +25,9 @@ bool CC_AArch64_DarwinPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State); +bool CC_AArch64_DarwinPCS_ILP32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State); bool CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State); Index: llvm/lib/Target/AArch64/AArch64CallingConvention.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64CallingConvention.cpp +++ llvm/lib/Target/AArch64/AArch64CallingConvention.cpp @@ -79,10 +79,15 @@ static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, ISD::ArgFlagsTy &ArgFlags, CCState &State) { + const AArch64Subtarget &Subtarget = static_cast( + State.getMachineFunction().getSubtarget()); + // Try to allocate a contiguous block of registers, each of the correct // size to hold one member. 
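// The block-assignment hook below gains an ILP32 twist: on Darwin ILP32 a
// block of consecutive i32 members is still assigned to X registers, but two
// members per 64-bit register (EltsPerReg == 2), with the first member
// zero-extended into the low 32 bits and the second placed in the upper 32
// bits via AExtUpper. A standalone sketch of that packing (plain C++, names
// illustrative; the real lowering emits SHL/OR and SRL nodes for this):
#include <cstdint>

// Caller side: pack two 32-bit members into one 64-bit register value.
inline uint64_t packPair(uint32_t Lo, uint32_t Hi) {
  return uint64_t(Lo) | (uint64_t(Hi) << 32);
}

// Callee side: recover the two members again.
inline uint32_t unpackLo(uint64_t Reg) { return uint32_t(Reg); }
inline uint32_t unpackHi(uint64_t Reg) { return uint32_t(Reg >> 32); }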
ArrayRef RegList; - if (LocVT.SimpleTy == MVT::i64) + bool IsDarwinILP32 = Subtarget.isTargetILP32() && Subtarget.isTargetMachO(); + if (LocVT.SimpleTy == MVT::i64 || + (IsDarwinILP32 && LocVT.SimpleTy == MVT::i32)) RegList = XRegList; else if (LocVT.SimpleTy == MVT::f16) RegList = HRegList; @@ -107,8 +112,10 @@ if (!ArgFlags.isInConsecutiveRegsLast()) return true; - unsigned RegResult = State.AllocateRegBlock(RegList, PendingMembers.size()); - if (RegResult) { + unsigned EltsPerReg = (IsDarwinILP32 && LocVT.SimpleTy == MVT::i32) ? 2 : 1; + unsigned RegResult = State.AllocateRegBlock( + RegList, alignTo(PendingMembers.size(), EltsPerReg) / EltsPerReg); + if (RegResult && EltsPerReg == 1) { for (auto &It : PendingMembers) { It.convertToReg(RegResult); State.addLoc(It); @@ -116,14 +123,26 @@ } PendingMembers.clear(); return true; + } else if (RegResult) { + assert(EltsPerReg == 2 && "unexpected ABI"); + bool UseHigh = false; + CCValAssign::LocInfo Info; + for (auto &It : PendingMembers) { + Info = UseHigh ? CCValAssign::AExtUpper : CCValAssign::ZExt; + State.addLoc(CCValAssign::getReg(It.getValNo(), MVT::i32, RegResult, + MVT::i64, Info)); + UseHigh = !UseHigh; + if (!UseHigh) + ++RegResult; + } + PendingMembers.clear(); + return true; } // Mark all regs in the class as unavailable for (auto Reg : RegList) State.AllocateReg(Reg); - const AArch64Subtarget &Subtarget = static_cast( - State.getMachineFunction().getSubtarget()); unsigned SlotAlign = Subtarget.isTargetDarwin() ? 1 : 8; return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, SlotAlign); Index: llvm/lib/Target/AArch64/AArch64CallingConvention.td =================================================================== --- llvm/lib/Target/AArch64/AArch64CallingConvention.td +++ llvm/lib/Target/AArch64/AArch64CallingConvention.td @@ -17,6 +17,10 @@ class CCIfBigEndian : CCIf<"State.getMachineFunction().getDataLayout().isBigEndian()", A>; +class CCIfILP32 : + CCIf<"State.getMachineFunction().getDataLayout().getPointerSize() == 4", A>; + + //===----------------------------------------------------------------------===// // ARM AAPCS64 Calling Convention //===----------------------------------------------------------------------===// @@ -111,6 +115,7 @@ CCIfType<[v2f32], CCBitConvertToType>, CCIfType<[v2f64, v4f32], CCBitConvertToType>, + CCIfConsecutiveRegs>, CCIfSwiftError>>, // Big endian vectors must be passed as if they were 1-element vectors so that @@ -202,6 +207,12 @@ CCIf<"ValVT == MVT::i1 || ValVT == MVT::i8", CCAssignToStack<1, 1>>, CCIf<"ValVT == MVT::i16 || ValVT == MVT::f16", CCAssignToStack<2, 2>>, CCIfType<[i32, f32], CCAssignToStack<4, 4>>, + + // Re-demote pointers to 32-bits so we don't end up storing 64-bit + // values and clobbering neighbouring stack locations. Not very pretty. + CCIfPtr>>, + CCIfPtr>>, + CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16], CCAssignToStack<8, 8>>, CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], @@ -229,6 +240,29 @@ CCAssignToStack<16, 16>> ]>; +// In the ILP32 world, the minimum stack slot size is 4 bytes. Otherwise the +// same as the normal Darwin VarArgs handling. +let Entry = 1 in +def CC_AArch64_DarwinPCS_ILP32_VarArg : CallingConv<[ + CCIfType<[v2f32], CCBitConvertToType>, + CCIfType<[v2f64, v4f32, f128], CCBitConvertToType>, + + // Handle all scalar types as either i32 or f32. + CCIfType<[i8, i16], CCPromoteToType>, + CCIfType<[f16], CCPromoteToType>, + + // Everything is on the stack. 
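// Two points about the stack rules here. First, ILP32 varargs slots are a
// minimum of 4 bytes rather than 8. Second, pointer arguments reach this code
// as 64-bit DAG values, so the new CCTruncToType/Trunc action re-demotes them
// to i32 before they hit the stack; storing the full 8 bytes would clobber the
// neighbouring slot. A standalone sketch of that hazard (plain C++; the buffer
// handling is made up purely for illustration):
#include <cstdint>
#include <cstring>

// Store a pointer argument into a 4-byte ILP32 stack slot.
inline void storePtrSlot(uint8_t *StackSlot, uint64_t RegPtr) {
  uint32_t MemPtr = uint32_t(RegPtr); // truncate: the upper half is zero anyway
  std::memcpy(StackSlot, &MemPtr, sizeof(MemPtr));
  // memcpy(StackSlot, &RegPtr, 8) would overwrite the next 4-byte slot.
}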
+ // i128 is split to two i64s, and its stack alignment is 16 bytes. + CCIfPtr>>, + CCIfType<[i32, f32], CCAssignToStack<4, 4>>, + CCIfType<[i64], CCIfSplit>>, + CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16], + CCAssignToStack<8, 8>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16], + CCAssignToStack<16, 16>> +]>; + + // The WebKit_JS calling convention only passes the first argument (the callee) // in register and the remaining arguments on stack. We allow 32bit stack slots, // so that WebKit can write partial values in the stack and define the other Index: llvm/lib/Target/AArch64/AArch64CollectLOH.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64CollectLOH.cpp +++ llvm/lib/Target/AArch64/AArch64CollectLOH.cpp @@ -103,6 +103,7 @@ #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -181,6 +182,7 @@ case AArch64::ADDXri: return canAddBePartOfLOH(MI); case AArch64::LDRXui: + case AArch64::LDRWui: // Check immediate to see if the immediate is an address. switch (MI.getOperand(2).getType()) { default: @@ -312,7 +314,8 @@ Info.Type = MCLOH_AdrpAdd; Info.IsCandidate = true; Info.MI0 = &MI; - } else if (MI.getOpcode() == AArch64::LDRXui && + } else if ((MI.getOpcode() == AArch64::LDRXui || + MI.getOpcode() == AArch64::LDRWui) && MI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT) { Info.Type = MCLOH_AdrpLdrGot; Info.IsCandidate = true; @@ -357,7 +360,9 @@ return true; } } else { - assert(MI.getOpcode() == AArch64::LDRXui && "Expect LDRXui"); + assert((MI.getOpcode() == AArch64::LDRXui || + MI.getOpcode() == AArch64::LDRWui) && + "Expect LDRXui or LDRWui"); assert((MI.getOperand(2).getTargetFlags() & AArch64II::MO_GOT) && "Expected GOT relocation"); if (OpInfo.Type == MCLOH_AdrpAddStr && OpInfo.MI1 == nullptr) { @@ -474,13 +479,23 @@ handleClobber(LOHInfos[Idx]); } // Handle uses. + + SmallSet UsesSeen; for (const MachineOperand &MO : MI.uses()) { if (!MO.isReg() || !MO.readsReg()) continue; int Idx = mapRegToGPRIndex(MO.getReg()); if (Idx < 0) continue; - handleUse(MI, MO, LOHInfos[Idx]); + + // Multiple uses of the same register within a single instruction don't + // count as MultiUser or block optimization. This is especially important on + // arm64_32, where any memory operation is likely to be an explicit use of + // xN and an implicit use of wN (the base address register). + if (!UsesSeen.count(Idx)) { + handleUse(MI, MO, LOHInfos[Idx]); + UsesSeen.insert(Idx); + } } } @@ -512,6 +527,7 @@ switch (Opcode) { case AArch64::ADDXri: case AArch64::LDRXui: + case AArch64::LDRWui: if (canDefBePartOfLOH(MI)) { const MachineOperand &Def = MI.getOperand(0); const MachineOperand &Op = MI.getOperand(1); Index: llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -495,12 +495,26 @@ } } else { // Small codemodel expand into ADRP + LDR. 
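// For ILP32 the small-code-model GOT access keeps the ADRP, but the GOT entry
// itself is only pointer-sized (4 bytes), so the load becomes LDRWui into the
// W sub-register of the destination. Architecturally, writing a W register
// zeroes bits [63:32] of the corresponding X register, so the X register still
// holds a correctly zero-extended 64-bit pointer afterwards. A standalone
// model of that register-write rule (plain C++, illustrative only):
#include <cstdint>

// Effect of "ldr wN, ..." on the 64-bit xN register: the loaded 32-bit value
// lands in the low half and the upper half is cleared.
inline uint64_t writeWReg(uint64_t /*OldX*/, uint32_t Loaded) {
  return uint64_t(Loaded);
}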
+ MachineFunction &MF = *MI.getParent()->getParent(); + DebugLoc DL = MI.getDebugLoc(); MachineInstrBuilder MIB1 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg); - MachineInstrBuilder MIB2 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::LDRXui)) - .add(MI.getOperand(0)) - .addReg(DstReg); + + MachineInstrBuilder MIB2; + if (MF.getSubtarget().isTargetILP32()) { + auto TRI = MBB.getParent()->getSubtarget().getRegisterInfo(); + unsigned Reg32 = TRI->getSubReg(DstReg, AArch64::sub_32); + unsigned DstFlags = MI.getOperand(0).getTargetFlags(); + MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::LDRWui)) + .addDef(Reg32, RegState::Dead) + .addReg(DstReg, RegState::Kill) + .addReg(DstReg, DstFlags | RegState::Implicit); + } else { + unsigned DstReg = MI.getOperand(0).getReg(); + MIB2 = BuildMI(MBB, MBBI, DL, TII->get(AArch64::LDRXui)) + .add(MI.getOperand(0)) + .addUse(DstReg, RegState::Kill); + } if (MO1.isGlobal()) { MIB1.addGlobalAddress(MO1.getGlobal(), 0, Flags | AArch64II::MO_PAGE); Index: llvm/lib/Target/AArch64/AArch64FastISel.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -474,12 +474,32 @@ ADRPReg) .addGlobalAddress(GV, 0, AArch64II::MO_PAGE | OpFlags); - ResultReg = createResultReg(&AArch64::GPR64RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::LDRXui), + unsigned LdrOpc; + if (Subtarget->isTargetILP32()) { + ResultReg = createResultReg(&AArch64::GPR32RegClass); + LdrOpc = AArch64::LDRWui; + } else { + ResultReg = createResultReg(&AArch64::GPR64RegClass); + LdrOpc = AArch64::LDRXui; + } + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(LdrOpc), ResultReg) - .addReg(ADRPReg) - .addGlobalAddress(GV, 0, - AArch64II::MO_PAGEOFF | AArch64II::MO_NC | OpFlags); + .addReg(ADRPReg) + .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGEOFF | + AArch64II::MO_NC | OpFlags); + if (!Subtarget->isTargetILP32()) + return ResultReg; + + // LDRWui produces a 32-bit register, but pointers in-register are 64-bits + // so we must extend the result on ILP32. + unsigned Result64 = createResultReg(&AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::SUBREG_TO_REG)) + .addDef(Result64) + .addImm(0) + .addReg(ResultReg, RegState::Kill) + .addImm(AArch64::sub_32); + return Result64; } else { // ADRP + ADDX BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP), @@ -504,6 +524,15 @@ if (!CEVT.isSimple()) return 0; MVT VT = CEVT.getSimpleVT(); + // arm64_32 has 32-bit pointers held in 64-bit registers. Because of that, + // 'null' pointers need to have a somewhat special treatment. + if (const auto *CPN = dyn_cast(C)) { + (void)CPN; + assert(CPN->getType()->getPointerAddressSpace() == 0 && + "Unexpected address space"); + assert(VT == MVT::i64 && "Expected 64-bit pointers"); + return materializeInt(ConstantInt::get(Type::getInt64Ty(*Context), 0), VT); + } if (const auto *CI = dyn_cast(C)) return materializeInt(CI, VT); @@ -946,6 +975,9 @@ bool AArch64FastISel::isTypeLegal(Type *Ty, MVT &VT) { EVT evt = TLI.getValueType(DL, Ty, true); + if (Subtarget->isTargetILP32() && Ty->isPointerTy()) + return false; + // Only handle simple types. 
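// The FastISel changes take the conservative route: with ILP32 pointers the
// usual 64-bit addressing, call and return assumptions no longer hold, so
// pointer-typed values, address folding, calls and returns simply report
// failure and fall back to SelectionDAG, which handles arm64_32 correctly. A
// sketch of that bail-out pattern (standalone pseudo-selector, illustrative
// names, not the FastISel API):
inline bool trySelectFast(bool IsILP32, bool NeedsPointer) {
  if (IsILP32 && NeedsPointer)
    return false; // punt: the slower SelectionDAG path takes over
  // ... fast-path selection would go here ...
  return true;
}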
if (evt == MVT::Other || !evt.isSimple()) return false; @@ -988,6 +1020,9 @@ } bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) { + if (Subtarget->isTargetILP32()) + return false; + unsigned ScaleFactor = getImplicitScaleFactor(VT); if (!ScaleFactor) return false; @@ -3165,6 +3200,11 @@ if (IsTailCall) return false; + // FIXME: we could and should support this, but for now correctness at -O0 is + // more important. + if (Subtarget->isTargetILP32()) + return false; + CodeModel::Model CM = TM.getCodeModel(); // Only support the small-addressing and large code models. if (CM != CodeModel::Large && !Subtarget->useSmallAddressing()) @@ -3796,6 +3836,11 @@ if (!FuncInfo.CanLowerReturn) return false; + // FIXME: in principle it could. Mostly just a case of zero extending outgoing + // pointers. + if (Subtarget->isTargetILP32()) + return false; + if (F.isVarArg()) return false; Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -261,6 +261,14 @@ const SelectionDAG &DAG, unsigned Depth = 0) const override; + MVT getPointerTy(const DataLayout &DL, uint32_t AS = 0) const override { + // Returning i64 unconditionally here (i.e. even for ILP32) means that the + // *DAG* representation of pointers will always be 64-bits. They will be + // truncated and extended when transferred to memory, but the 64-bit DAG + // allows us to use AArch64's addressing modes much more easily. + return MVT::getIntegerVT(64); + } + bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const override; Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -23,6 +23,7 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" @@ -1025,6 +1026,14 @@ Known.One &= Known2.One; break; } + case AArch64ISD::LOADgot: + case AArch64ISD::ADDlow: { + if (!Subtarget->isTargetILP32()) + break; + // In ILP32 mode all valid pointers are in the low 4GB of the address-space. + Known.Zero = APInt::getHighBitsSet(64, 32); + break; + } case ISD::INTRINSIC_W_CHAIN: { ConstantSDNode *CN = cast(Op->getOperand(1)); Intrinsic::ID IntID = static_cast(CN->getZExtValue()); @@ -3040,8 +3049,11 @@ return CC_AArch64_Win64_VarArg; if (!Subtarget->isTargetDarwin()) return CC_AArch64_AAPCS; - return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS; - case CallingConv::Win64: + if (!IsVarArg) + return CC_AArch64_DarwinPCS; + return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg + : CC_AArch64_DarwinPCS_VarArg; + case CallingConv::Win64: return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS; case CallingConv::AArch64_VectorCall: return CC_AArch64_AAPCS; @@ -3064,6 +3076,7 @@ // Assign locations to all of the incoming arguments. SmallVector ArgLocs; + DenseMap CopiedRegs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); @@ -3120,11 +3133,10 @@ continue; } + SDValue ArgValue; if (VA.isRegLoc()) { // Arguments stored in registers. 
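// The core design decision shows up here: getPointerTy stays MVT::i64 even for
// ILP32, so pointers are 64-bit throughout SelectionDAG and only become i32
// (getPointerMemTy) when they cross memory. The LOADgot/ADDlow known-bits
// change encodes the matching fact for the optimizer: every valid ILP32
// pointer lives in the low 4GB, i.e. bits [63:32] are known zero. A standalone
// sketch of that mask (plain C++, illustrative only):
#include <cstdint>

// KnownZero mask reported for a 64-bit value that holds an ILP32 pointer.
constexpr uint64_t ILP32KnownZeroHigh = 0xFFFFFFFF00000000ULL;

inline bool consistentWithKnownBits(uint64_t PtrValue) {
  return (PtrValue & ILP32KnownZeroHigh) == 0;
}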
EVT RegVT = VA.getLocVT(); - - SDValue ArgValue; const TargetRegisterClass *RC; if (RegVT == MVT::i32) @@ -3160,14 +3172,13 @@ case CCValAssign::AExt: case CCValAssign::SExt: case CCValAssign::ZExt: - // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt - // nodes after our lowering. - assert(RegVT == Ins[i].VT && "incorrect register location selected"); + break; + case CCValAssign::AExtUpper: + ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue, + DAG.getConstant(32, DL, RegVT)); + ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT()); break; } - - InVals.push_back(ArgValue); - } else { // VA.isRegLoc() assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem"); unsigned ArgOffset = VA.getLocMemOffset(); @@ -3182,7 +3193,6 @@ // Create load nodes to retrieve arguments from the stack. SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); - SDValue ArgValue; // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT) ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; @@ -3191,6 +3201,7 @@ switch (VA.getLocInfo()) { default: break; + case CCValAssign::Trunc: case CCValAssign::BCvt: MemVT = VA.getLocVT(); break; @@ -3210,8 +3221,11 @@ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), MemVT); - InVals.push_back(ArgValue); } + if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer()) + ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(), + ArgValue, DAG.getValueType(MVT::i32)); + InVals.push_back(ArgValue); } // varargs @@ -3228,8 +3242,8 @@ // This will point to the next argument passed via stack. unsigned StackOffset = CCInfo.getNextStackOffset(); - // We currently pass all varargs at 8-byte alignment. - StackOffset = ((StackOffset + 7) & ~7); + // We currently pass all varargs at 8-byte alignment, or 4 for ILP32 + StackOffset = alignTo(StackOffset, Subtarget->isTargetILP32() ? 4 : 8); FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true)); if (MFI.hasMustTailInVarArgFunc()) { @@ -3392,6 +3406,7 @@ : RetCC_AArch64_AAPCS; // Assign locations to each value returned by this call. SmallVector RVLocs; + DenseMap CopiedRegs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC); @@ -3409,10 +3424,16 @@ continue; } - SDValue Val = - DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); - Chain = Val.getValue(1); - InFlag = Val.getValue(2); + // Avoid copying a physreg twice since RegAllocFast is incompetent and only + // allows one use of a physreg per block. 
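// When two i32 results come back packed in one X register, both must be
// recovered from a single CopyFromReg of that register (as the comment above
// notes, RegAllocFast only tolerates one copy of a physreg per block), hence
// the CopiedRegs cache used below. A standalone sketch of the lookup-or-create
// pattern (plain C++; the map and value types are stand-ins, not SDValue):
#include <cstdint>
#include <map>

using RegNo = unsigned;

inline uint64_t getOrCreateCopy(std::map<RegNo, uint64_t> &Copied, RegNo Reg,
                                uint64_t (*EmitCopy)(RegNo)) {
  auto It = Copied.find(Reg);
  if (It != Copied.end())
    return It->second;          // reuse the existing copy of this physreg
  uint64_t Val = EmitCopy(Reg); // otherwise emit exactly one copy
  Copied[Reg] = Val;
  return Val;
}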
+ SDValue Val = CopiedRegs.lookup(VA.getLocReg()); + if (!Val) { + Val = + DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); + Chain = Val.getValue(1); + InFlag = Val.getValue(2); + CopiedRegs[VA.getLocReg()] = Val; + } switch (VA.getLocInfo()) { default: @@ -3422,6 +3443,15 @@ case CCValAssign::BCvt: Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); break; + case CCValAssign::AExtUpper: + Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val, + DAG.getConstant(32, DL, VA.getLocVT())); + LLVM_FALLTHROUGH; + case CCValAssign::AExt: + LLVM_FALLTHROUGH; + case CCValAssign::ZExt: + Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT()); + break; } InVals.push_back(Val); @@ -3735,6 +3765,7 @@ getPointerTy(DAG.getDataLayout())); SmallVector, 8> RegsToPass; + SmallSet RegsUsed; SmallVector MemOpChains; auto PtrVT = getPointerTy(DAG.getDataLayout()); @@ -3742,7 +3773,7 @@ const auto &Forwards = FuncInfo->getForwardedMustTailRegParms(); for (const auto &F : Forwards) { SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT); - RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val)); + RegsToPass.emplace_back(F.PReg, Val); } } @@ -3773,8 +3804,17 @@ } Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); break; + case CCValAssign::AExtUpper: + assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits"); + Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); + Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg, + DAG.getConstant(32, DL, VA.getLocVT())); + break; case CCValAssign::BCvt: - Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); + Arg = DAG.getBitcast(VA.getLocVT(), Arg); + break; + case CCValAssign::Trunc: + Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT()); break; case CCValAssign::FPExt: Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg); @@ -3790,7 +3830,18 @@ "unexpected use of 'returned'"); IsThisReturn = true; } - RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); + if (RegsUsed.count(VA.getLocReg())) { + SDValue &Bits = + std::find_if(RegsToPass.begin(), RegsToPass.end(), + [=](const std::pair &Elt) { + return Elt.first == VA.getLocReg(); + }) + ->second; + Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg); + } else { + RegsToPass.emplace_back(VA.getLocReg(), Arg); + RegsUsed.insert(VA.getLocReg()); + } } else { assert(VA.isMemLoc()); @@ -4009,7 +4060,8 @@ // Copy the result values into the output registers. 
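// The return path below gets the same treatment as outgoing call arguments:
// values are first accumulated per physical register (OR-ing in any part that
// targets an already-used register, e.g. the upper half of a packed i32 pair),
// and only then is a single CopyToReg emitted per register. A standalone
// sketch of that accumulation (plain C++; types are stand-ins):
#include <cstdint>
#include <utility>
#include <vector>

using RegNo = unsigned;

inline void addRetPart(std::vector<std::pair<RegNo, uint64_t>> &RetVals,
                       RegNo Reg, uint64_t Part) {
  for (auto &RV : RetVals)
    if (RV.first == Reg) {
      RV.second |= Part; // merge into the register's existing bits
      return;
    }
  RetVals.emplace_back(Reg, Part); // first value headed for this register
}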
SDValue Flag; - SmallVector RetOps(1, Chain); + SmallVector, 4> RetVals; + SmallSet RegsUsed; for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size(); ++i, ++realRVLocIdx) { CCValAssign &VA = RVLocs[i]; @@ -4031,11 +4083,38 @@ case CCValAssign::BCvt: Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); break; + case CCValAssign::AExt: + case CCValAssign::ZExt: + Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT()); + break; + case CCValAssign::AExtUpper: + assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits"); + Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT()); + Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg, + DAG.getConstant(32, DL, VA.getLocVT())); + break; } - Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag); + if (RegsUsed.count(VA.getLocReg())) { + SDValue &Bits = + std::find_if(RetVals.begin(), RetVals.end(), + [=](const std::pair &Elt) { + return Elt.first == VA.getLocReg(); + }) + ->second; + Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg); + } else { + RetVals.emplace_back(VA.getLocReg(), Arg); + RegsUsed.insert(VA.getLocReg()); + } + } + + SmallVector RetOps(1, Chain); + for (auto &RetVal : RetVals) { + Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Flag); Flag = Chain.getValue(1); - RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); + RetOps.push_back( + DAG.getRegister(RetVal.first, RetVal.second.getValueType())); } // Windows AArch64 ABIs require that for returning structs by value we copy @@ -4229,6 +4308,7 @@ SDLoc DL(Op); MVT PtrVT = getPointerTy(DAG.getDataLayout()); + MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout()); const GlobalValue *GV = cast(Op)->getGlobal(); SDValue TLVPAddr = @@ -4239,13 +4319,16 @@ // to obtain the address of the variable. SDValue Chain = DAG.getEntryNode(); SDValue FuncTLVGet = DAG.getLoad( - MVT::i64, DL, Chain, DescAddr, + PtrMemVT, DL, Chain, DescAddr, MachinePointerInfo::getGOT(DAG.getMachineFunction()), - /* Alignment = */ 8, + /* Alignment = */ PtrMemVT.getSizeInBits() / 8, MachineMemOperand::MONonTemporal | MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable); Chain = FuncTLVGet.getValue(1); + // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer. + FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT); + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); MFI.setAdjustsStack(true); @@ -5121,6 +5204,7 @@ SDLoc DL(Op); SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy(DAG.getDataLayout())); + FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout())); const Value *SV = cast(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), MachinePointerInfo(SV)); @@ -5227,15 +5311,15 @@ // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single // pointer. SDLoc DL(Op); - unsigned VaListSize = - Subtarget->isTargetDarwin() || Subtarget->isTargetWindows() ? 8 : 32; + unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8; + unsigned VaListSize = (Subtarget->isTargetDarwin() || + Subtarget->isTargetWindows()) ? 
PtrSize : 32; const Value *DestSV = cast(Op.getOperand(3))->getValue(); const Value *SrcSV = cast(Op.getOperand(4))->getValue(); - return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), - Op.getOperand(2), - DAG.getConstant(VaListSize, DL, MVT::i32), - 8, false, false, false, MachinePointerInfo(DestSV), + return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2), + DAG.getConstant(VaListSize, DL, MVT::i32), PtrSize, + false, false, false, MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV)); } @@ -5249,12 +5333,15 @@ SDValue Chain = Op.getOperand(0); SDValue Addr = Op.getOperand(1); unsigned Align = Op.getConstantOperandVal(3); + unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8; auto PtrVT = getPointerTy(DAG.getDataLayout()); - - SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V)); + auto PtrMemVT = getPointerMemTy(DAG.getDataLayout()); + SDValue VAList = + DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V)); Chain = VAList.getValue(1); + VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT); - if (Align > 8) { + if (Align > MinSlotSize) { assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2"); VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(Align - 1, DL, PtrVT)); @@ -5263,14 +5350,14 @@ } Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); - uint64_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); + unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); // Scalar integer and FP values smaller than 64 bits are implicitly extended // up to 64 bits. At the very least, we have to increase the striding of the // vaargs list to match this, and for FP values we need to introduce // FP_ROUND nodes as well. if (VT.isInteger() && !VT.isVector()) - ArgSize = 8; + ArgSize = std::max(ArgSize, MinSlotSize); bool NeedFPTrunc = false; if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) { ArgSize = 8; @@ -5280,6 +5367,8 @@ // Increment the pointer, VAList, to the next vaarg SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(ArgSize, DL, PtrVT)); + VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT); + // Store the incremented VAList to the legalized pointer SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V)); @@ -5309,10 +5398,15 @@ SDLoc DL(Op); unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); SDValue FrameAddr = - DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT); + DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64); while (Depth--) FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr, MachinePointerInfo()); + + if (Subtarget->isTargetILP32()) + FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr, + DAG.getValueType(VT)); + return FrameAddr; } Index: llvm/lib/Target/AArch64/AArch64InstrInfo.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -1472,6 +1472,8 @@ return false; MachineBasicBlock &MBB = *MI.getParent(); + auto &Subtarget = MBB.getParent()->getSubtarget(); + auto TRI = Subtarget.getRegisterInfo(); DebugLoc DL = MI.getDebugLoc(); if (MI.getOpcode() == AArch64::CATCHRET) { @@ -1507,11 +1509,22 @@ if ((OpFlags & AArch64II::MO_GOT) != 0) { BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg) .addGlobalAddress(GV, 0, OpFlags); - BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) - .addReg(Reg, RegState::Kill) - .addImm(0) - 
.addMemOperand(*MI.memoperands_begin()); + if (Subtarget.isTargetILP32()) { + unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32); + BuildMI(MBB, MI, DL, get(AArch64::LDRWui)) + .addDef(Reg32, RegState::Dead) + .addUse(Reg, RegState::Kill) + .addImm(0) + .addMemOperand(*MI.memoperands_begin()) + .addDef(Reg, RegState::Implicit); + } else { + BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) + .addReg(Reg, RegState::Kill) + .addImm(0) + .addMemOperand(*MI.memoperands_begin()); + } } else if (TM.getCodeModel() == CodeModel::Large) { + assert(!Subtarget.isTargetILP32() && "how can large exist in ILP32?"); BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg) .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC) .addImm(0); @@ -1538,10 +1551,20 @@ BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg) .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE); unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC; - BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) - .addReg(Reg, RegState::Kill) - .addGlobalAddress(GV, 0, LoFlags) - .addMemOperand(*MI.memoperands_begin()); + if (Subtarget.isTargetILP32()) { + unsigned Reg32 = TRI->getSubReg(Reg, AArch64::sub_32); + BuildMI(MBB, MI, DL, get(AArch64::LDRWui)) + .addDef(Reg32, RegState::Dead) + .addUse(Reg, RegState::Kill) + .addGlobalAddress(GV, 0, LoFlags) + .addMemOperand(*MI.memoperands_begin()) + .addDef(Reg, RegState::Implicit); + } else { + BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) + .addReg(Reg, RegState::Kill) + .addGlobalAddress(GV, 0, LoFlags) + .addMemOperand(*MI.memoperands_begin()); + } } MBB.erase(MI); Index: llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -32,7 +32,7 @@ const AArch64TargetLowering &TLI = *STI.getTargetLowering(); EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout()); - Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); + Type *IntPtrTy = Type::getInt8PtrTy(*DAG.getContext()); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; Entry.Node = Dst; Index: llvm/lib/Target/AArch64/AArch64Subtarget.h =================================================================== --- llvm/lib/Target/AArch64/AArch64Subtarget.h +++ llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -409,6 +409,8 @@ bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } + bool isTargetILP32() const { return TargetTriple.isArch32Bit(); } + bool useAA() const override { return UseAA; } bool hasVH() const { return HasVH; } @@ -435,6 +437,12 @@ bool hasFMI() const { return HasFMI; } bool hasRCPC_IMMO() const { return HasRCPC_IMMO; } + bool addrSinkUsingGEPs() const override { + // Keeping GEPs inbounds is important for exploiting AArch64 + // addressing-modes in ILP32 mode. 
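// To spell that out: an "inbounds" GEP cannot wrap, so its offset can safely
// be folded into the 64-bit [base, #imm] addressing the hardware performs,
// whereas plain 32-bit pointer arithmetic may wrap around 4GB and would then
// disagree with that 64-bit computation. A standalone sketch of the
// discrepancy (plain C++; the constant mirrors the arm64_32-addrs.ll test
// added below):
#include <cstdint>

// 32-bit subtraction of 96 wraps around 4GB when Base < 96...
inline uint32_t addr32(uint32_t Base) { return Base + uint32_t(-96); }
// ...but "ldr w0, [x0, #-96]" computes this 64-bit address instead.
inline uint64_t addr64(uint64_t Base) { return Base - 96; }
// For Base == 50: addr32 -> 0xFFFFFFD2, addr64 -> 0xFFFFFFFFFFFFFFD2.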
+ return useAA() || isTargetILP32(); + } + bool useSmallAddressing() const { switch (TLInfo.getTargetMachine().getCodeModel()) { case CodeModel::Kernel: Index: llvm/lib/Target/AArch64/AArch64TargetMachine.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -157,6 +157,8 @@ RegisterTargetMachine X(getTheAArch64leTarget()); RegisterTargetMachine Y(getTheAArch64beTarget()); RegisterTargetMachine Z(getTheARM64Target()); + RegisterTargetMachine W(getTheARM64_32Target()); + RegisterTargetMachine V(getTheAArch64_32Target()); auto PR = PassRegistry::getPassRegistry(); initializeGlobalISel(*PR); initializeAArch64A53Fix835769Pass(*PR); @@ -200,8 +202,11 @@ bool LittleEndian) { if (Options.getABIName() == "ilp32") return "e-m:e-p:32:32-i8:8-i16:16-i64:64-S128"; - if (TT.isOSBinFormatMachO()) + if (TT.isOSBinFormatMachO()) { + if (TT.getArch() == Triple::aarch64_32) + return "e-m:o-p:32:32-i64:64-i128:128-n32:64-S128"; return "e-m:o-i64:64-i128:128-n32:64-S128"; + } if (TT.isOSBinFormatCOFF()) return "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128"; if (LittleEndian) @@ -278,7 +283,8 @@ } // Enable GlobalISel at or below EnableGlobalISelAt0. - if (getOptLevel() <= EnableGlobalISelAtO) { + if (getOptLevel() <= EnableGlobalISelAtO && + TT.getArch() != Triple::aarch64_32) { setGlobalISel(true); setGlobalISelAbort(GlobalISelAbortMode::Disable); } Index: llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h =================================================================== --- llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h +++ llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h @@ -23,7 +23,7 @@ class Triple; struct AArch64MCAsmInfoDarwin : public MCAsmInfoDarwin { - explicit AArch64MCAsmInfoDarwin(); + explicit AArch64MCAsmInfoDarwin(bool IsILP32); const MCExpr * getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding, MCStreamer &Streamer) const override; Index: llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp =================================================================== --- llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp +++ llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp @@ -30,7 +30,7 @@ cl::values(clEnumValN(Generic, "generic", "Emit generic NEON assembly"), clEnumValN(Apple, "apple", "Emit Apple-style NEON assembly"))); -AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() { +AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin(bool IsILP32) { // We prefer NEON instructions to be printed in the short, Apple-specific // form when targeting Darwin. AssemblerDialect = AsmWriterVariant == Default ? Apple : AsmWriterVariant; @@ -39,7 +39,8 @@ PrivateLabelPrefix = "L"; SeparatorString = "%%"; CommentString = ";"; - CodePointerSize = CalleeSaveStackSlotSize = 8; + CalleeSaveStackSlotSize = 8; + CodePointerSize = IsILP32 ? 
4 : 8; AlignmentIsInBytes = false; UsesELFSectionDirectiveForBSS = true; Index: llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp =================================================================== --- llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp +++ llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -241,7 +241,7 @@ const Triple &TheTriple) { MCAsmInfo *MAI; if (TheTriple.isOSBinFormatMachO()) - MAI = new AArch64MCAsmInfoDarwin(); + MAI = new AArch64MCAsmInfoDarwin(TheTriple.getArch() == Triple::aarch64_32); else if (TheTriple.isWindowsMSVCEnvironment()) MAI = new AArch64MCAsmInfoMicrosoftCOFF(); else if (TheTriple.isOSBinFormatCOFF()) Index: llvm/lib/Target/X86/X86FastISel.cpp =================================================================== --- llvm/lib/Target/X86/X86FastISel.cpp +++ llvm/lib/Target/X86/X86FastISel.cpp @@ -3387,6 +3387,7 @@ case CCValAssign::SExtUpper: case CCValAssign::ZExtUpper: case CCValAssign::FPExt: + case CCValAssign::Trunc: llvm_unreachable("Unexpected loc info!"); case CCValAssign::Indirect: // FIXME: Indirect doesn't need extending, but fast-isel doesn't fully Index: llvm/test/CodeGen/AArch64/arm64-aapcs.ll =================================================================== --- llvm/test/CodeGen/AArch64/arm64-aapcs.ll +++ llvm/test/CodeGen/AArch64/arm64-aapcs.ll @@ -25,7 +25,7 @@ @var64 = global i64 0, align 8 ; Check stack slots are 64-bit at all times. -define void @test_stack_slots([8 x i32], i1 %bool, i8 %char, i16 %short, +define void @test_stack_slots([8 x i64], i1 %bool, i8 %char, i16 %short, i32 %int, i64 %long) { ; CHECK-LABEL: test_stack_slots: ; CHECK-DAG: ldr w[[ext1:[0-9]+]], [sp, #24] Index: llvm/test/CodeGen/AArch64/arm64-collect-loh-garbage-crash.ll =================================================================== --- llvm/test/CodeGen/AArch64/arm64-collect-loh-garbage-crash.ll +++ llvm/test/CodeGen/AArch64/arm64-collect-loh-garbage-crash.ll @@ -1,4 +1,5 @@ ; RUN: llc -o - %s -mtriple=arm64-apple-ios -O3 -aarch64-enable-collect-loh | FileCheck %s +; RUN: llc -o - %s -mtriple=arm64_32-apple-watchos -O3 -aarch64-enable-collect-loh | FileCheck %s ; Check that the LOH analysis does not crash when the analysed chained ; contains instructions that are filtered out. ; Index: llvm/test/CodeGen/AArch64/arm64-collect-loh-str.ll =================================================================== --- llvm/test/CodeGen/AArch64/arm64-collect-loh-str.ll +++ llvm/test/CodeGen/AArch64/arm64-collect-loh-str.ll @@ -1,4 +1,5 @@ ; RUN: llc -o - %s -mtriple=arm64-apple-ios -O2 | FileCheck %s +; RUN: llc -o - %s -mtriple=arm64_32-apple-ios -O2 | FileCheck %s ; Test case for . ; AdrpAddStr cannot be used when the store uses same ; register as address and value. 
Indeed, the related Index: llvm/test/CodeGen/AArch64/arm64-collect-loh.ll =================================================================== --- llvm/test/CodeGen/AArch64/arm64-collect-loh.ll +++ llvm/test/CodeGen/AArch64/arm64-collect-loh.ll @@ -1,4 +1,5 @@ ; RUN: llc -o - %s -mtriple=arm64-apple-ios -O2 | FileCheck %s +; RUN: llc -o - %s -mtriple=arm64_32-apple-watchos -O2 | FileCheck %s ; RUN: llc -o - %s -mtriple=arm64-linux-gnu -O2 | FileCheck %s --check-prefix=CHECK-ELF ; CHECK-ELF-NOT: .loh @@ -60,9 +61,9 @@ ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _C@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _C@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr w0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ldr w0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] define i32 @getC() { @@ -76,9 +77,9 @@ ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _C@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _C@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldrsw x0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ldrsw x0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] define i64 @getSExtC() { @@ -94,10 +95,10 @@ ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _C@GOTPAGEOFF] -; CHECK-NEXT: ldr [[LOAD:w[0-9]+]], {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _C@GOTPAGEOFF] +; CHECK-NEXT: ldr [[LOAD:w[0-9]+]], [x[[LDRGOT_REG]]] ; CHECK-NEXT: add [[ADD:w[0-9]+]], [[LOAD]], w0 -; CHECK-NEXT: str [[ADD]], {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: str [[ADD]], [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGot [[ADRP_LABEL]], [[LDRGOT_LABEL]] define void @getSeveralC(i32 %t) { @@ -114,9 +115,9 @@ ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _C@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _C@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: str w0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: str w0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] define void @setC(i32 %t) { @@ -142,7 +143,7 @@ ; CHECK-NEXT: ret ; CHECK: .loh AdrpAddLdr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]] define i32 @getInternalCPlus4() { - %addr = getelementptr i32, i32* @InternalC, i32 4 + %addr = getelementptr inbounds i32, i32* @InternalC, i32 4 %res = load i32, i32* %addr, align 4 ret i32 %res } @@ -159,7 +160,7 @@ ; CHECK-NEXT: ret ; CHECK: .loh AdrpAddLdr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]] define i64 @getSExtInternalCPlus4() { - %addr = getelementptr i32, i32* @InternalC, i32 4 + %addr = getelementptr inbounds i32, i32* @InternalC, i32 4 %res = load i32, i32* %addr, align 4 %sextres = sext i32 %res to i64 ret i64 %sextres @@ -180,7 +181,7 @@ ; CHECK: .loh 
AdrpAdd [[ADRP_LABEL]], [[ADDGOT_LABEL]] define void @getSeveralInternalCPlus4(i32 %t) { entry: - %addr = getelementptr i32, i32* @InternalC, i32 4 + %addr = getelementptr inbounds i32, i32* @InternalC, i32 4 %tmp = load i32, i32* %addr, align 4 %add = add nsw i32 %tmp, %t store i32 %add, i32* %addr, align 4 @@ -200,7 +201,7 @@ ; CHECK: .loh AdrpAddStr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]] define void @setInternalCPlus4(i32 %t) { entry: - %addr = getelementptr i32, i32* @InternalC, i32 4 + %addr = getelementptr inbounds i32, i32* @InternalC, i32 4 store i32 %t, i32* %addr, align 4 ret void } @@ -276,8 +277,8 @@ ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _D@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _D@GOTPAGEOFF] -; CHECK-NEXT: ldrb w0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _D@GOTPAGEOFF] +; CHECK-NEXT: ldrb w0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGot [[ADRP_LABEL]], [[LDRGOT_LABEL]] define i8 @getD() { @@ -289,9 +290,9 @@ ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _D@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _D@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _D@GOTPAGEOFF] ; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: strb w0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: strb w0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]] define void @setD(i8 %t) { @@ -305,9 +306,9 @@ ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _D@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _D@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _D@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldrsb w0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ldrsb w0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] define i32 @getSExtD() { @@ -322,9 +323,9 @@ ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _D@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _D@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _D@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldrsb x0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ldrsb x0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] define i64 @getSExt64D() { @@ -341,8 +342,8 @@ ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _E@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _E@GOTPAGEOFF] -; CHECK-NEXT: ldrh w0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _E@GOTPAGEOFF] +; CHECK-NEXT: ldrh w0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGot [[ADRP_LABEL]], [[LDRGOT_LABEL]] define i16 @getE() { @@ -356,9 +357,9 @@ ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _E@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _E@GOTPAGEOFF] +; CHECK-NEXT: ldr 
{{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _E@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldrsh w0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ldrsh w0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] define i32 @getSExtE() { @@ -371,9 +372,9 @@ ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _E@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _E@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _E@GOTPAGEOFF] ; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: strh w0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: strh w0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]] define void @setE(i16 %t) { @@ -387,9 +388,9 @@ ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _E@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _E@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _E@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldrsh x0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ldrsh x0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] define i64 @getSExt64E() { @@ -406,9 +407,9 @@ ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _F@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _F@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _F@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr x0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ldr x0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] define i64 @getF() { @@ -420,9 +421,9 @@ ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _F@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _F@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _F@GOTPAGEOFF] ; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: str x0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: str x0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]] define void @setF(i64 %t) { @@ -438,9 +439,9 @@ ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _G@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _G@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _G@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr s0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ldr s0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] define float @getG() { @@ -452,9 +453,9 @@ ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _G@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _G@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _G@GOTPAGEOFF] ; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: str s0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: str s0, 
[x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]] define void @setG(float %t) { @@ -470,9 +471,9 @@ ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _H@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _H@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _H@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr h0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ldr h0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] define half @getH() { @@ -484,9 +485,9 @@ ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _H@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _H@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _H@GOTPAGEOFF] ; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: str h0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: str h0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]] define void @setH(half %t) { @@ -502,9 +503,9 @@ ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _I@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _I@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _I@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr d0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ldr d0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] define double @getI() { @@ -516,9 +517,9 @@ ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _I@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _I@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _I@GOTPAGEOFF] ; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: str d0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: str d0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]] define void @setI(double %t) { @@ -534,9 +535,9 @@ ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _J@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _J@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _J@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr d0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ldr d0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] define <2 x i32> @getJ() { @@ -548,9 +549,9 @@ ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _J@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _J@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _J@GOTPAGEOFF] ; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: str d0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: str d0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]] define void @setJ(<2 x i32> %t) { @@ -566,9 +567,9 @@ ; CHECK: 
[[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _K@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _K@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _K@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr q0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ldr q0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] define <4 x i32> @getK() { @@ -580,9 +581,9 @@ ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _K@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _K@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _K@GOTPAGEOFF] ; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: str q0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: str q0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]] define void @setK(<4 x i32> %t) { @@ -598,9 +599,9 @@ ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _L@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _L@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _L@GOTPAGEOFF] ; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr b0, {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: ldr b0, [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]] define <1 x i8> @getL() { @@ -612,11 +613,11 @@ ; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]: ; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _L@GOTPAGE ; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]: -; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _L@GOTPAGEOFF] +; CHECK-NEXT: ldr {{[xw]}}[[LDRGOT_REG:[0-9]+]], {{\[}}[[ADRP_REG]], _L@GOTPAGEOFF] ; CHECK-NEXT: ; kill ; Ultimately we should generate str b0, but right now, we match the vector ; variant which does not allow to fold the immediate into the store. -; CHECK-NEXT: st1.b { v0 }[0], {{\[}}[[LDRGOT_REG]]] +; CHECK-NEXT: st1.b { v0 }[0], [x[[LDRGOT_REG]]] ; CHECK-NEXT: ret ; CHECK: .loh AdrpLdrGot [[ADRP_LABEL]], [[LDRGOT_LABEL]] define void @setL(<1 x i8> %t) { Index: llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll =================================================================== --- llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll +++ llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-redzone | FileCheck %s +; RUN: llc < %s -mtriple=arm64_32-apple-ios -aarch64-redzone | FileCheck %s define i64* @store64(i64* %ptr, i64 %index, i64 %spacing) { ; CHECK-LABEL: store64: Index: llvm/test/CodeGen/AArch64/arm64-stacksave.ll =================================================================== --- llvm/test/CodeGen/AArch64/arm64-stacksave.ll +++ llvm/test/CodeGen/AArch64/arm64-stacksave.ll @@ -1,6 +1,6 @@ -; RUN: llc < %s -verify-coalescing +; RUN: llc -mtriple=arm64-apple-macosx10.8.0 < %s -verify-coalescing +; RUN: llc -mtriple=arm64_32-apple-ios9.0 < %s -verify-coalescing ; -target triple = "arm64-apple-macosx10.8.0" ; Verify that we can handle spilling the stack pointer without attempting ; spilling it directly. 
Index: llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll =================================================================== --- llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll +++ llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll @@ -18,15 +18,14 @@ define void @t1() nounwind ssp { entry: ; ALL-LABEL: t1: -; ALL-NOT: fmov ; NONEFP: ldr h0,{{.*}} -; NONEFP: fmov s1, wzr -; NONEFP: fmov d2, xzr -; NONEFP: movi{{(.16b)?}} v3{{(.2d)?}}, #0 -; NONE16: fmov h0, wzr -; NONE16: fmov s1, wzr -; NONE16: fmov d2, xzr -; NONE16: movi{{(.16b)?}} v3{{(.2d)?}}, #0 +; NONEFP-DAG: fmov s1, wzr +; NONEFP-DAG: fmov d2, xzr +; NONEFP-DAG: movi{{(.16b)?}} v3{{(.2d)?}}, #0 +; NONE16-DAG: fmov h0, wzr +; NONE16-DAG: fmov s1, wzr +; NONE16-DAG: fmov d2, xzr +; NONE16-DAG: movi{{(.16b)?}} v3{{(.2d)?}}, #0 ; ZEROFP: ldr h0,{{.*}} ; ZEROFP: movi v{{[0-3]+}}.2d, #0 ; ZEROFP: movi v{{[0-3]+}}.2d, #0 Index: llvm/test/CodeGen/AArch64/arm64_32-addrs.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/arm64_32-addrs.ll @@ -0,0 +1,44 @@ +; RUN: llc -mtriple=arm64_32-apple-ios %s -o - | FileCheck %s + +; If %base < 96 then the sum will not wrap (in an unsigned sense), but "ldr w0, +; [x0, #-96]" would. +define i32 @test_valid_wrap(i32 %base) { +; CHECK-LABEL: test_valid_wrap: +; CHECK: sub w[[ADDR:[0-9]+]], w0, #96 +; CHECK: ldr w0, [x[[ADDR]]] + + %newaddr = add nuw i32 %base, -96 + %ptr = inttoptr i32 %newaddr to i32* + %val = load i32, i32* %ptr + ret i32 %val +} + +define i8 @test_valid_wrap_optimizable(i8* %base) { +; CHECK-LABEL: test_valid_wrap_optimizable: +; CHECK: ldurb w0, [x0, #-96] + + %newaddr = getelementptr inbounds i8, i8* %base, i32 -96 + %val = load i8, i8* %newaddr + ret i8 %val +} + +define i8 @test_valid_wrap_optimizable1(i8* %base, i32 %offset) { +; CHECK-LABEL: test_valid_wrap_optimizable1: +; CHECK: ldrb w0, [x0, w1, sxtw] + + %newaddr = getelementptr inbounds i8, i8* %base, i32 %offset + %val = load i8, i8* %newaddr + ret i8 %val +} + +; +define i8 @test_valid_wrap_optimizable2(i8* %base, i32 %offset) { +; CHECK-LABEL: test_valid_wrap_optimizable2: +; CHECK: sxtw x[[OFFSET:[0-9]+]], w1 +; CHECK: mov w[[BASE:[0-9]+]], #-100 +; CHECK: ldrb w0, [x[[OFFSET]], x[[BASE]]] + + %newaddr = getelementptr inbounds i8, i8* inttoptr(i32 -100 to i8*), i32 %offset + %val = load i8, i8* %newaddr + ret i8 %val +} Index: llvm/test/CodeGen/AArch64/arm64_32-atomics.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/arm64_32-atomics.ll @@ -0,0 +1,261 @@ +; RUN: llc -mtriple=arm64_32-apple-ios7.0 -o - %s | FileCheck %s + +define i8 @test_load_8(i8* %addr) { +; CHECK-LABEL: test_load_8: +; CHECK: ldarb w0, [x0] + %val = load atomic i8, i8* %addr seq_cst, align 1 + ret i8 %val +} + +define i16 @test_load_16(i16* %addr) { +; CHECK-LABEL: test_load_16: +; CHECK: ldarh w0, [x0] + %val = load atomic i16, i16* %addr acquire, align 2 + ret i16 %val +} + +define i32 @test_load_32(i32* %addr) { +; CHECK-LABEL: test_load_32: +; CHECK: ldar w0, [x0] + %val = load atomic i32, i32* %addr seq_cst, align 4 + ret i32 %val +} + +define i64 @test_load_64(i64* %addr) { +; CHECK-LABEL: test_load_64: +; CHECK: ldar x0, [x0] + %val = load atomic i64, i64* %addr seq_cst, align 8 + ret i64 %val +} + +define i8* @test_load_ptr(i8** %addr) { +; CHECK-LABEL: test_load_ptr: +; CHECK: ldar w0, [x0] + %val = load atomic i8*, i8** %addr seq_cst, align 8 + ret i8* %val +} + +define void @test_store_8(i8* %addr) {
+; CHECK-LABEL: test_store_8: +; CHECK: stlrb wzr, [x0] + store atomic i8 0, i8* %addr seq_cst, align 1 + ret void +} + +define void @test_store_16(i16* %addr) { +; CHECK-LABEL: test_store_16: +; CHECK: stlrh wzr, [x0] + store atomic i16 0, i16* %addr seq_cst, align 2 + ret void +} + +define void @test_store_32(i32* %addr) { +; CHECK-LABEL: test_store_32: +; CHECK: stlr wzr, [x0] + store atomic i32 0, i32* %addr seq_cst, align 4 + ret void +} + +define void @test_store_64(i64* %addr) { +; CHECK-LABEL: test_store_64: +; CHECK: stlr xzr, [x0] + store atomic i64 0, i64* %addr seq_cst, align 8 + ret void +} + +define void @test_store_ptr(i8** %addr) { +; CHECK-LABEL: test_store_ptr: +; CHECK: stlr wzr, [x0] + store atomic i8* null, i8** %addr seq_cst, align 8 + ret void +} + +declare i64 @llvm.aarch64.ldxr.p0i8(i8* %addr) +declare i64 @llvm.aarch64.ldxr.p0i16(i16* %addr) +declare i64 @llvm.aarch64.ldxr.p0i32(i32* %addr) +declare i64 @llvm.aarch64.ldxr.p0i64(i64* %addr) + +define i8 @test_ldxr_8(i8* %addr) { +; CHECK-LABEL: test_ldxr_8: +; CHECK: ldxrb w0, [x0] + + %val = call i64 @llvm.aarch64.ldxr.p0i8(i8* %addr) + %val8 = trunc i64 %val to i8 + ret i8 %val8 +} + +define i16 @test_ldxr_16(i16* %addr) { +; CHECK-LABEL: test_ldxr_16: +; CHECK: ldxrh w0, [x0] + + %val = call i64 @llvm.aarch64.ldxr.p0i16(i16* %addr) + %val16 = trunc i64 %val to i16 + ret i16 %val16 +} + +define i32 @test_ldxr_32(i32* %addr) { +; CHECK-LABEL: test_ldxr_32: +; CHECK: ldxr w0, [x0] + + %val = call i64 @llvm.aarch64.ldxr.p0i32(i32* %addr) + %val32 = trunc i64 %val to i32 + ret i32 %val32 +} + +define i64 @test_ldxr_64(i64* %addr) { +; CHECK-LABEL: test_ldxr_64: +; CHECK: ldxr x0, [x0] + + %val = call i64 @llvm.aarch64.ldxr.p0i64(i64* %addr) + ret i64 %val +} + +declare i64 @llvm.aarch64.ldaxr.p0i8(i8* %addr) +declare i64 @llvm.aarch64.ldaxr.p0i16(i16* %addr) +declare i64 @llvm.aarch64.ldaxr.p0i32(i32* %addr) +declare i64 @llvm.aarch64.ldaxr.p0i64(i64* %addr) + +define i8 @test_ldaxr_8(i8* %addr) { +; CHECK-LABEL: test_ldaxr_8: +; CHECK: ldaxrb w0, [x0] + + %val = call i64 @llvm.aarch64.ldaxr.p0i8(i8* %addr) + %val8 = trunc i64 %val to i8 + ret i8 %val8 +} + +define i16 @test_ldaxr_16(i16* %addr) { +; CHECK-LABEL: test_ldaxr_16: +; CHECK: ldaxrh w0, [x0] + + %val = call i64 @llvm.aarch64.ldaxr.p0i16(i16* %addr) + %val16 = trunc i64 %val to i16 + ret i16 %val16 +} + +define i32 @test_ldaxr_32(i32* %addr) { +; CHECK-LABEL: test_ldaxr_32: +; CHECK: ldaxr w0, [x0] + + %val = call i64 @llvm.aarch64.ldaxr.p0i32(i32* %addr) + %val32 = trunc i64 %val to i32 + ret i32 %val32 +} + +define i64 @test_ldaxr_64(i64* %addr) { +; CHECK-LABEL: test_ldaxr_64: +; CHECK: ldaxr x0, [x0] + + %val = call i64 @llvm.aarch64.ldaxr.p0i64(i64* %addr) + ret i64 %val +} + +declare i32 @llvm.aarch64.stxr.p0i8(i64, i8*) +declare i32 @llvm.aarch64.stxr.p0i16(i64, i16*) +declare i32 @llvm.aarch64.stxr.p0i32(i64, i32*) +declare i32 @llvm.aarch64.stxr.p0i64(i64, i64*) + +define i32 @test_stxr_8(i8* %addr, i8 %val) { +; CHECK-LABEL: test_stxr_8: +; CHECK: stxrb [[TMP:w[0-9]+]], w1, [x0] +; CHECK: mov w0, [[TMP]] + + %extval = zext i8 %val to i64 + %success = call i32 @llvm.aarch64.stxr.p0i8(i64 %extval, i8* %addr) + ret i32 %success +} + +define i32 @test_stxr_16(i16* %addr, i16 %val) { +; CHECK-LABEL: test_stxr_16: +; CHECK: stxrh [[TMP:w[0-9]+]], w1, [x0] +; CHECK: mov w0, [[TMP]] + + %extval = zext i16 %val to i64 + %success = call i32 @llvm.aarch64.stxr.p0i16(i64 %extval, i16* %addr) + ret i32 %success +} + +define i32 @test_stxr_32(i32* %addr, i32
%val) { +; CHECK-LABEL: test_stxr_32: +; CHECK: stxr [[TMP:w[0-9]+]], w1, [x0] +; CHECK: mov w0, [[TMP]] + + %extval = zext i32 %val to i64 + %success = call i32 @llvm.aarch64.stxr.p0i32(i64 %extval, i32* %addr) + ret i32 %success +} + +define i32 @test_stxr_64(i64* %addr, i64 %val) { +; CHECK-LABEL: test_stxr_64: +; CHECK: stxr [[TMP:w[0-9]+]], x1, [x0] +; CHECK: mov w0, [[TMP]] + + %success = call i32 @llvm.aarch64.stxr.p0i64(i64 %val, i64* %addr) + ret i32 %success +} + +declare i32 @llvm.aarch64.stlxr.p0i8(i64, i8*) +declare i32 @llvm.aarch64.stlxr.p0i16(i64, i16*) +declare i32 @llvm.aarch64.stlxr.p0i32(i64, i32*) +declare i32 @llvm.aarch64.stlxr.p0i64(i64, i64*) + +define i32 @test_stlxr_8(i8* %addr, i8 %val) { +; CHECK-LABEL: test_stlxr_8: +; CHECK: stlxrb [[TMP:w[0-9]+]], w1, [x0] +; CHECK: mov w0, [[TMP]] + + %extval = zext i8 %val to i64 + %success = call i32 @llvm.aarch64.stlxr.p0i8(i64 %extval, i8* %addr) + ret i32 %success +} + +define i32 @test_stlxr_16(i16* %addr, i16 %val) { +; CHECK-LABEL: test_stlxr_16: +; CHECK: stlxrh [[TMP:w[0-9]+]], w1, [x0] +; CHECK: mov w0, [[TMP]] + + %extval = zext i16 %val to i64 + %success = call i32 @llvm.aarch64.stlxr.p0i16(i64 %extval, i16* %addr) + ret i32 %success +} + +define i32 @test_stlxr_32(i32* %addr, i32 %val) { +; CHECK-LABEL: test_stlxr_32: +; CHECK: stlxr [[TMP:w[0-9]+]], w1, [x0] +; CHECK: mov w0, [[TMP]] + + %extval = zext i32 %val to i64 + %success = call i32 @llvm.aarch64.stlxr.p0i32(i64 %extval, i32* %addr) + ret i32 %success +} + +define i32 @test_stlxr_64(i64* %addr, i64 %val) { +; CHECK-LABEL: test_stlxr_64: +; CHECK: stlxr [[TMP:w[0-9]+]], x1, [x0] +; CHECK: mov w0, [[TMP]] + + %success = call i32 @llvm.aarch64.stlxr.p0i64(i64 %val, i64* %addr) + ret i32 %success +} + +define {i8*, i1} @test_cmpxchg_ptr(i8** %addr, i8* %cmp, i8* %new) { +; CHECK-LABEL: test_cmpxchg_ptr: +; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxr [[OLD:w[0-9]+]], [x0] +; CHECK: cmp [[OLD]], w1 +; CHECK: b.ne [[DONE:LBB[0-9]+_[0-9]+]] +; CHECK: stlxr [[SUCCESS:w[0-9]+]], w2, [x0] +; CHECK: cbnz [[SUCCESS]], [[LOOP]] + +; CHECK: mov w1, #1 +; CHECK: mov w0, [[OLD]] +; CHECK: ret + +; CHECK: [[DONE]]: +; CHECK: clrex +; CHECK: mov w1, wzr +; CHECK: mov w0, [[OLD]] +; CHECK: ret + %res = cmpxchg i8** %addr, i8* %cmp, i8* %new acq_rel acquire + ret {i8*, i1} %res +} Index: llvm/test/CodeGen/AArch64/arm64_32-fastisel.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/arm64_32-fastisel.ll @@ -0,0 +1,28 @@ +; RUN: llc -mtriple=arm64_32-apple-ios -O0 -fast-isel %s -o - | FileCheck %s +@var = global i8* null + +define void @test_store_release_ptr() { +; CHECK-LABEL: test_store_release_ptr +; CHECK: mov [[ZERO:w[0-9]+]], wzr +; CHECK: stlr [[ZERO]] + store atomic i8* null, i8** @var release, align 4 + br label %next + +next: + ret void +} + +declare [2 x i32] @callee() + +define void @test_struct_return(i32* %addr) { +; CHECK-LABEL: test_struct_return: +; CHECK: bl _callee +; CHECK-DAG: lsr [[HI:x[0-9]+]], x0, #32 +; CHECK-DAG: str w0 + %res = call [2 x i32] @callee() + %res.0 = extractvalue [2 x i32] %res, 0 + store i32 %res.0, i32* %addr + %res.1 = extractvalue [2 x i32] %res, 1 + store i32 %res.1, i32* %addr + ret void +} Index: llvm/test/CodeGen/AArch64/arm64_32-frame-pointers.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/arm64_32-frame-pointers.ll @@ -0,0 +1,26 @@ +; RUN: llc -mtriple=arm64_32-apple-ios8.0 %s -o - | 
FileCheck %s + +; We're provoking LocalStackSlotAllocation to create some shared frame bases +; here: it wants multiple using instructions that can be satisfied by a +; single base, but not within the addressing-mode. +; +; When that happens it's important that we don't mix our pointer sizes +; (e.g. try to create an ldr from a w-register base). +define i8 @test_register_wrangling() { +; CHECK-LABEL: test_register_wrangling: +; CHECK: add [[TMP:x[0-9]+]], sp, +; CHECK: add x[[BASE:[0-9]+]], [[TMP]], +; CHECK: ldrb {{w[0-9]+}}, [x[[BASE]], #1] +; CHECK: ldrb {{w[0-9]+}}, [x[[BASE]]] + + %var1 = alloca i8, i32 4100 + %var3 = alloca i8 + %dummy = alloca i8, i32 4100 + + %var1p1 = getelementptr i8, i8* %var1, i32 1 + %val1 = load i8, i8* %var1 + %val2 = load i8, i8* %var3 + + %sum = add i8 %val1, %val2 + ret i8 %sum +} Index: llvm/test/CodeGen/AArch64/arm64_32-gep-sink.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/arm64_32-gep-sink.ll @@ -0,0 +1,61 @@ +; RUN: opt -codegenprepare -mtriple=arm64_32-apple-ios %s -S -o - | FileCheck %s + +define void @test_simple_sink(i1* %base, i64 %offset) { +; CHECK-LABEL: @test_simple_sink +; CHECK: next: +; CHECK: [[BASE8:%.*]] = bitcast i1* %base to i8* +; CHECK: [[ADDR8:%.*]] = getelementptr i8, i8* [[BASE8]], i64 %offset +; CHECK: [[ADDR:%.*]] = bitcast i8* [[ADDR8]] to i1* +; CHECK: load volatile i1, i1* [[ADDR]] + %addr = getelementptr i1, i1* %base, i64 %offset + %tst = load i1, i1* %addr + br i1 %tst, label %next, label %end + +next: + load volatile i1, i1* %addr + ret void + +end: + ret void +} + +define void @test_inbounds_sink(i1* %base, i64 %offset) { +; CHECK-LABEL: @test_inbounds_sink +; CHECK: next: +; CHECK: [[BASE8:%.*]] = bitcast i1* %base to i8* +; CHECK: [[ADDR8:%.*]] = getelementptr inbounds i8, i8* [[BASE8]], i64 %offset +; CHECK: [[ADDR:%.*]] = bitcast i8* [[ADDR8]] to i1* +; CHECK: load volatile i1, i1* [[ADDR]] + %addr = getelementptr inbounds i1, i1* %base, i64 %offset + %tst = load i1, i1* %addr + br i1 %tst, label %next, label %end + +next: + load volatile i1, i1* %addr + ret void + +end: + ret void +} + +; No address derived via an add can be guaranteed inbounds +define void @test_add_sink(i1* %base, i64 %offset) { +; CHECK-LABEL: @test_add_sink +; CHECK: next: +; CHECK: [[BASE8:%.*]] = bitcast i1* %base to i8* +; CHECK: [[ADDR8:%.*]] = getelementptr i8, i8* [[BASE8]], i64 %offset +; CHECK: [[ADDR:%.*]] = bitcast i8* [[ADDR8]] to i1* +; CHECK: load volatile i1, i1* [[ADDR]] + %base64 = ptrtoint i1* %base to i64 + %addr64 = add nsw nuw i64 %base64, %offset + %addr = inttoptr i64 %addr64 to i1* + %tst = load i1, i1* %addr + br i1 %tst, label %next, label %end + +next: + load volatile i1, i1* %addr + ret void + +end: + ret void +} Index: llvm/test/CodeGen/AArch64/arm64_32-memcpy.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/arm64_32-memcpy.ll @@ -0,0 +1,66 @@ +; RUN: llc -mtriple=arm64_32-apple-ios9.0 -o - %s | FileCheck %s + +define i64 @test_memcpy(i64* %addr, i8* %src, i1 %tst) minsize { +; CHECK-LABEL: test_memcpy: +; CHECK: ldr [[VAL64:x[0-9]+]], [x0] +; [...] 
+; CHECK: and x0, [[VAL64]], #0xffffffff +; CHECK: bl _memcpy + + %val64 = load i64, i64* %addr + br i1 %tst, label %true, label %false + +true: + ret i64 %val64 + +false: + %val32 = trunc i64 %val64 to i32 + %val.ptr = inttoptr i32 %val32 to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %val.ptr, i8* %src, i32 128, i32 0, i1 1) + ret i64 undef +} + +define i64 @test_memmove(i64* %addr, i8* %src, i1 %tst) minsize { +; CHECK-LABEL: test_memmove: +; CHECK: ldr [[VAL64:x[0-9]+]], [x0] +; [...] +; CHECK: and x0, [[VAL64]], #0xffffffff +; CHECK: bl _memmove + + %val64 = load i64, i64* %addr + br i1 %tst, label %true, label %false + +true: + ret i64 %val64 + +false: + %val32 = trunc i64 %val64 to i32 + %val.ptr = inttoptr i32 %val32 to i8* + call void @llvm.memmove.p0i8.p0i8.i32(i8* %val.ptr, i8* %src, i32 128, i32 0, i1 1) + ret i64 undef +} + +define i64 @test_memset(i64* %addr, i8* %src, i1 %tst) minsize { +; CHECK-LABEL: test_memset: +; CHECK: ldr [[VAL64:x[0-9]+]], [x0] +; [...] +; CHECK: and x0, [[VAL64]], #0xffffffff +; CHECK: bl _memset + + %val64 = load i64, i64* %addr + br i1 %tst, label %true, label %false + +true: + ret i64 %val64 + +false: + %val32 = trunc i64 %val64 to i32 + %val.ptr = inttoptr i32 %val32 to i8* + call void @llvm.memset.p0i8.i32(i8* %val.ptr, i8 42, i32 256, i32 0, i1 1) + ret i64 undef +} + +declare void @llvm.memcpy.p0i8.p0i8.i32(i8*, i8*, i32, i32, i1) +declare void @llvm.memmove.p0i8.p0i8.i32(i8*, i8*, i32, i32, i1) +declare void @llvm.memset.p0i8.i32(i8*, i8, i32, i32, i1) + Index: llvm/test/CodeGen/AArch64/arm64_32-neon.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/arm64_32-neon.ll @@ -0,0 +1,198 @@ +; RUN: llc -mtriple=arm64_32-apple-ios7.0 -mcpu=cyclone %s -o - | FileCheck %s + +define <2 x double> @test_insert_elt(<2 x double> %vec, double %val) { +; CHECK-LABEL: test_insert_elt: +; CHECK: mov.d v0[0], v1[0] + %res = insertelement <2 x double> %vec, double %val, i32 0 + ret <2 x double> %res +} + +define void @test_split_16B(<4 x float> %val, <4 x float>* %addr) { +; CHECK-LABEL: test_split_16B: +; CHECK: str q0, [x0] + store <4 x float> %val, <4 x float>* %addr, align 8 + ret void +} + +define void @test_split_16B_splat(<4 x i32>, <4 x i32>* %addr) { +; CHECK-LABEL: test_split_16B_splat: +; CHECK: str {{q[0-9]+}} + + %vec.tmp0 = insertelement <4 x i32> undef, i32 42, i32 0 + %vec.tmp1 = insertelement <4 x i32> %vec.tmp0, i32 42, i32 1 + %vec.tmp2 = insertelement <4 x i32> %vec.tmp1, i32 42, i32 2 + %vec = insertelement <4 x i32> %vec.tmp2, i32 42, i32 3 + + store <4 x i32> %vec, <4 x i32>* %addr, align 8 + ret void +} + + +%vec = type <2 x double> + +declare {%vec, %vec} @llvm.aarch64.neon.ld2r.v2f64.p0i8(i8*) +define {%vec, %vec} @test_neon_load(i8* %addr) { +; CHECK-LABEL: test_neon_load: +; CHECK: ld2r.2d { v0, v1 }, [x0] + %res = call {%vec, %vec} @llvm.aarch64.neon.ld2r.v2f64.p0i8(i8* %addr) + ret {%vec, %vec} %res +} + +declare {%vec, %vec} @llvm.aarch64.neon.ld2lane.v2f64.p0i8(%vec, %vec, i64, i8*) +define {%vec, %vec} @test_neon_load_lane(i8* %addr, %vec %in1, %vec %in2) { +; CHECK-LABEL: test_neon_load_lane: +; CHECK: ld2.d { v0, v1 }[0], [x0] + %res = call {%vec, %vec} @llvm.aarch64.neon.ld2lane.v2f64.p0i8(%vec %in1, %vec %in2, i64 0, i8* %addr) + ret {%vec, %vec} %res +} + +declare void @llvm.aarch64.neon.st2.v2f64.p0i8(%vec, %vec, i8*) +define void @test_neon_store(i8* %addr, %vec %in1, %vec %in2) { +; CHECK-LABEL: test_neon_store: +; CHECK: st2.2d { v0, v1 }, [x0] + call 
void @llvm.aarch64.neon.st2.v2f64.p0i8(%vec %in1, %vec %in2, i8* %addr) + ret void +} + +declare void @llvm.aarch64.neon.st2lane.v2f64.p0i8(%vec, %vec, i64, i8*) +define void @test_neon_store_lane(i8* %addr, %vec %in1, %vec %in2) { +; CHECK-LABEL: test_neon_store_lane: +; CHECK: st2.d { v0, v1 }[1], [x0] + call void @llvm.aarch64.neon.st2lane.v2f64.p0i8(%vec %in1, %vec %in2, i64 1, i8* %addr) + ret void +} + +declare {%vec, %vec} @llvm.aarch64.neon.ld2.v2f64.p0i8(i8*) +define {{%vec, %vec}, i8*} @test_neon_load_post(i8* %addr, i32 %offset) { +; CHECK-LABEL: test_neon_load_post: +; CHECK-DAG: sxtw [[OFFSET:x[0-9]+]], w1 +; CHECK: ld2.2d { v0, v1 }, [x0], [[OFFSET]] + + %vecs = call {%vec, %vec} @llvm.aarch64.neon.ld2.v2f64.p0i8(i8* %addr) + + %addr.new = getelementptr inbounds i8, i8* %addr, i32 %offset + + %res.tmp = insertvalue {{%vec, %vec}, i8*} undef, {%vec, %vec} %vecs, 0 + %res = insertvalue {{%vec, %vec}, i8*} %res.tmp, i8* %addr.new, 1 + ret {{%vec, %vec}, i8*} %res +} + +define {{%vec, %vec}, i8*} @test_neon_load_post_lane(i8* %addr, i32 %offset, %vec %in1, %vec %in2) { +; CHECK-LABEL: test_neon_load_post_lane: +; CHECK-DAG: sxtw [[OFFSET:x[0-9]+]], w1 +; CHECK: ld2.d { v0, v1 }[1], [x0], [[OFFSET]] + + %vecs = call {%vec, %vec} @llvm.aarch64.neon.ld2lane.v2f64.p0i8(%vec %in1, %vec %in2, i64 1, i8* %addr) + + %addr.new = getelementptr inbounds i8, i8* %addr, i32 %offset + + %res.tmp = insertvalue {{%vec, %vec}, i8*} undef, {%vec, %vec} %vecs, 0 + %res = insertvalue {{%vec, %vec}, i8*} %res.tmp, i8* %addr.new, 1 + ret {{%vec, %vec}, i8*} %res +} + +define i8* @test_neon_store_post(i8* %addr, i32 %offset, %vec %in1, %vec %in2) { +; CHECK-LABEL: test_neon_store_post: +; CHECK-DAG: sxtw [[OFFSET:x[0-9]+]], w1 +; CHECK: st2.2d { v0, v1 }, [x0], [[OFFSET]] + + call void @llvm.aarch64.neon.st2.v2f64.p0i8(%vec %in1, %vec %in2, i8* %addr) + + %addr.new = getelementptr inbounds i8, i8* %addr, i32 %offset + + ret i8* %addr.new +} + +define i8* @test_neon_store_post_lane(i8* %addr, i32 %offset, %vec %in1, %vec %in2) { +; CHECK-LABEL: test_neon_store_post_lane: +; CHECK: sxtw [[OFFSET:x[0-9]+]], w1 +; CHECK: st2.d { v0, v1 }[0], [x0], [[OFFSET]] + + call void @llvm.aarch64.neon.st2lane.v2f64.p0i8(%vec %in1, %vec %in2, i64 0, i8* %addr) + + %addr.new = getelementptr inbounds i8, i8* %addr, i32 %offset + + ret i8* %addr.new +} + +; ld1 is slightly different because it goes via ISelLowering of normal IR ops +; rather than an intrinsic. 
+define {%vec, double*} @test_neon_ld1_post_lane(double* %addr, i32 %offset, %vec %in) { +; CHECK-LABEL: test_neon_ld1_post_lane: +; CHECK: sbfiz [[OFFSET:x[0-9]+]], x1, #3, #32 +; CHECK: ld1.d { v0 }[0], [x0], [[OFFSET]] + + %loaded = load double, double* %addr, align 8 + %newvec = insertelement %vec %in, double %loaded, i32 0 + + %addr.new = getelementptr inbounds double, double* %addr, i32 %offset + + %res.tmp = insertvalue {%vec, double*} undef, %vec %newvec, 0 + %res = insertvalue {%vec, double*} %res.tmp, double* %addr.new, 1 + + ret {%vec, double*} %res +} + +define {{%vec, %vec}, i8*} @test_neon_load_post_exact(i8* %addr) { +; CHECK-LABEL: test_neon_load_post_exact: +; CHECK: ld2.2d { v0, v1 }, [x0], #32 + + %vecs = call {%vec, %vec} @llvm.aarch64.neon.ld2.v2f64.p0i8(i8* %addr) + + %addr.new = getelementptr inbounds i8, i8* %addr, i32 32 + + %res.tmp = insertvalue {{%vec, %vec}, i8*} undef, {%vec, %vec} %vecs, 0 + %res = insertvalue {{%vec, %vec}, i8*} %res.tmp, i8* %addr.new, 1 + ret {{%vec, %vec}, i8*} %res +} + +define {%vec, double*} @test_neon_ld1_post_lane_exact(double* %addr, %vec %in) { +; CHECK-LABEL: test_neon_ld1_post_lane_exact: +; CHECK: ld1.d { v0 }[0], [x0], #8 + + %loaded = load double, double* %addr, align 8 + %newvec = insertelement %vec %in, double %loaded, i32 0 + + %addr.new = getelementptr inbounds double, double* %addr, i32 1 + + %res.tmp = insertvalue {%vec, double*} undef, %vec %newvec, 0 + %res = insertvalue {%vec, double*} %res.tmp, double* %addr.new, 1 + + ret {%vec, double*} %res +} + +; As in the general load/store case, this GEP has defined semantics when the +; address wraps. We cannot use post-indexed addressing. +define {%vec, double*} @test_neon_ld1_notpost_lane_exact(double* %addr, %vec %in) { +; CHECK-LABEL: test_neon_ld1_notpost_lane_exact: +; CHECK-NOT: ld1.d { {{v[0-9]+}} }[0], [{{x[0-9]+|sp}}], #8 +; CHECK: add w0, w0, #8 +; CHECK: ret + + %loaded = load double, double* %addr, align 8 + %newvec = insertelement %vec %in, double %loaded, i32 0 + + %addr.new = getelementptr double, double* %addr, i32 1 + + %res.tmp = insertvalue {%vec, double*} undef, %vec %newvec, 0 + %res = insertvalue {%vec, double*} %res.tmp, double* %addr.new, 1 + + ret {%vec, double*} %res +} + +define {%vec, double*} @test_neon_ld1_notpost_lane(double* %addr, i32 %offset, %vec %in) { +; CHECK-LABEL: test_neon_ld1_notpost_lane: +; CHECK-NOT: ld1.d { {{v[0-9]+}} }[0], [{{x[0-9]+|sp}}], {{x[0-9]+|sp}} +; CHECK: add w0, w0, w1, lsl #3 +; CHECK: ret + + %loaded = load double, double* %addr, align 8 + %newvec = insertelement %vec %in, double %loaded, i32 0 + + %addr.new = getelementptr double, double* %addr, i32 %offset + + %res.tmp = insertvalue {%vec, double*} undef, %vec %newvec, 0 + %res = insertvalue {%vec, double*} %res.tmp, double* %addr.new, 1 + + ret {%vec, double*} %res +} Index: llvm/test/CodeGen/AArch64/arm64_32-null.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/arm64_32-null.ll @@ -0,0 +1,28 @@ +; RUN: llc -fast-isel=true -global-isel=false -O0 -mtriple=arm64_32-apple-ios %s -o - | FileCheck %s +; RUN: llc -fast-isel=false -global-isel=false -O0 -mtriple=arm64_32-apple-ios %s -o - | FileCheck %s + +define void @test_store(i8** %p) { +; CHECK-LABEL: test_store: +; CHECK: mov [[R1:w[0-9]+]], wzr +; CHECK: str [[R1]], [x0] + + store i8* null, i8** %p + ret void +} + +define void @test_phi(i8** %p) { +; CHECK-LABEL: test_phi: +; CHECK: mov [[R1:x[0-9]+]], xzr +; CHECK: str [[R1]], [sp] +; CHECK: b 
[[BB:LBB[0-9_]+]] +; CHECK: [[BB]]: +; CHECK: ldr x0, [sp] +; CHECK: str w0, [x{{.*}}] + +bb0: + br label %bb1 +bb1: + %tmp0 = phi i8* [ null, %bb0 ] + store i8* %tmp0, i8** %p + ret void +} Index: llvm/test/CodeGen/AArch64/arm64_32-pointer-extend.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/arm64_32-pointer-extend.ll @@ -0,0 +1,49 @@ +; RUN: llc -mtriple=arm64_32-apple-ios7.0 %s -o - | FileCheck %s + +define void @pass_pointer(i64 %in) { +; CHECK-LABEL: pass_pointer: +; CHECK: and x0, x0, #0xffffffff +; CHECK: bl _take_pointer + + %in32 = trunc i64 %in to i32 + %ptr = inttoptr i32 %in32 to i8* + call i64 @take_pointer(i8* %ptr) + ret void +} + +define i64 @take_pointer(i8* %ptr) nounwind { +; CHECK-LABEL: take_pointer: +; CHECK-NEXT: %bb.0 +; CHECK-NEXT: ret + + %val = ptrtoint i8* %ptr to i32 + %res = zext i32 %val to i64 + ret i64 %res +} + +define i32 @callee_ptr_stack_slot([8 x i64], i8*, i32 %val) { +; CHECK-LABEL: callee_ptr_stack_slot: +; CHECK: ldr w0, [sp, #4] + + ret i32 %val +} + +define void @caller_ptr_stack_slot(i8* %ptr) { +; CHECK-LABEL: caller_ptr_stack_slot: +; CHECK-DAG: mov [[VAL:w[0-9]]], #42 +; CHECK: stp w0, [[VAL]], [sp] + + call i32 @callee_ptr_stack_slot([8 x i64] undef, i8* %ptr, i32 42) + ret void +} + +define i8* @return_ptr(i64 %in, i64 %r) { +; CHECK-LABEL: return_ptr: +; CHECK: sdiv [[VAL64:x[0-9]+]], x0, x1 +; CHECK: and x0, [[VAL64]], #0xffffffff + + %sum = sdiv i64 %in, %r + %sum32 = trunc i64 %sum to i32 + %res = inttoptr i32 %sum32 to i8* + ret i8* %res +} Index: llvm/test/CodeGen/AArch64/arm64_32-stack-pointers.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/arm64_32-stack-pointers.ll @@ -0,0 +1,13 @@ +; RUN: llc -mtriple=arm64_32-apple-ios9.0 -o - %s | FileCheck %s + +declare void @callee([8 x i64], i8*, i8*) + +; Make sure we don't accidentally store X0 or XZR, which might well +; clobber other arguments or data. +define void @test_stack_ptr_32bits(i8* %in) { +; CHECK-LABEL: test_stack_ptr_32bits: +; CHECK-DAG: stp wzr, w0, [sp] + + call void @callee([8 x i64] undef, i8* null, i8* %in) + ret void +} Index: llvm/test/CodeGen/AArch64/arm64_32-tls.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/arm64_32-tls.ll @@ -0,0 +1,22 @@ +; RUN: llc -mtriple=arm64_32-apple-ios %s -o - | FileCheck %s + +define i32 @test_thread_local() { +; CHECK-LABEL: test_thread_local: +; CHECK: adrp x[[TMP:[0-9]+]], _var@TLVPPAGE +; CHECK: ldr w0, [x[[TMP]], _var@TLVPPAGEOFF] +; CHECK: ldr w[[DEST:[0-9]+]], [x0] +; CHECK: blr x[[DEST]] + + %val = load i32, i32* @var + ret i32 %val +} + +@var = thread_local global i32 zeroinitializer + +; CHECK: .tbss _var$tlv$init, 4, 2 + +; CHECK-LABEL: __DATA,__thread_vars +; CHECK: _var: +; CHECK: .long __tlv_bootstrap +; CHECK: .long 0 +; CHECK: .long _var$tlv$init Index: llvm/test/CodeGen/AArch64/arm64_32-va.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/arm64_32-va.ll @@ -0,0 +1,56 @@ +; RUN: llc -mtriple=arm64_32-apple-ios %s -o - | FileCheck %s + +define void @test_va_copy(i8* %dst, i8* %src) { +; CHECK-LABEL: test_va_copy: +; CHECK: ldr [[PTR:w[0-9]+]], [x1] +; CHECK: str [[PTR]], [x0] + + call void @llvm.va_copy(i8* %dst, i8* %src) + ret void +} + +define void @test_va_start(i32, ...) 
{ +; CHECK-LABEL: test_va_start +; CHECK: add x[[LIST:[0-9]+]], sp, #16 +; CHECK: str w[[LIST]], + %slot = alloca i8*, align 4 + %list = bitcast i8** %slot to i8* + call void @llvm.va_start(i8* %list) + ret void +} + +define void @test_va_start_odd([8 x i64], i32, ...) { +; CHECK-LABEL: test_va_start_odd: +; CHECK: add x[[LIST:[0-9]+]], sp, #20 +; CHECK: str w[[LIST]], + %slot = alloca i8*, align 4 + %list = bitcast i8** %slot to i8* + call void @llvm.va_start(i8* %list) + ret void +} + +define i8* @test_va_arg(i8** %list) { +; CHECK-LABEL: test_va_arg: +; CHECK: ldr w[[LOC:[0-9]+]], [x0] +; CHECK: add [[NEXTLOC:w[0-9]+]], w[[LOC]], #4 +; CHECK: str [[NEXTLOC]], [x0] +; CHECK: ldr w0, [x[[LOC]]] + %res = va_arg i8** %list, i8* + ret i8* %res +} + +define i8* @really_test_va_arg(i8** %list, i1 %tst) { +; CHECK-LABEL: really_test_va_arg: +; CHECK: ldr w[[LOC:[0-9]+]], [x0] +; CHECK: add [[NEXTLOC:w[0-9]+]], w[[LOC]], #4 +; CHECK: str [[NEXTLOC]], [x0] +; CHECK: ldr w[[VAARG:[0-9]+]], [x[[LOC]]] +; CHECK: csel x0, x[[VAARG]], xzr + %tmp = va_arg i8** %list, i8* + %res = select i1 %tst, i8* %tmp, i8* null + ret i8* %res +} + +declare void @llvm.va_start(i8*) + +declare void @llvm.va_copy(i8*, i8*) Index: llvm/test/CodeGen/AArch64/arm64_32.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/arm64_32.ll @@ -0,0 +1,715 @@ +; RUN: llc -mtriple=arm64_32-apple-ios7.0 %s -filetype=obj -o - -disable-post-ra -frame-pointer=all | \ +; RUN: llvm-objdump -private-headers - | \ +; RUN: FileCheck %s --check-prefix=CHECK-MACHO +; RUN: llc -mtriple=arm64_32-apple-ios7.0 %s -o - -aarch64-enable-atomic-cfg-tidy=0 -disable-post-ra -frame-pointer=all | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-OPT +; RUN: llc -mtriple=arm64_32-apple-ios7.0 %s -o - -fast-isel -aarch64-enable-atomic-cfg-tidy=0 -disable-post-ra -frame-pointer=all | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FAST + +; CHECK-MACHO: Mach header +; CHECK-MACHO: MH_MAGIC ARM64_32 V8 + +@var64 = global i64 zeroinitializer, align 8 +@var32 = global i32 zeroinitializer, align 4 + +@var_got = external global i8 + +define i32* @test_global_addr() { +; CHECK-LABEL: test_global_addr: +; CHECK: adrp [[PAGE:x[0-9]+]], _var32@PAGE +; CHECK: add x0, [[PAGE]], _var32@PAGEOFF + ret i32* @var32 +} + +; ADRP is necessarily 64-bit. The important point to check is that, however that +; gets truncated to 32-bits, it's free. No need to zero out higher bits of that +; register. +define i64 @test_global_addr_extension() { +; CHECK-LABEL: test_global_addr_extension: +; CHECK: adrp [[PAGE:x[0-9]+]], _var32@PAGE +; CHECK: add x0, [[PAGE]], _var32@PAGEOFF +; CHECK-NOT: and +; CHECK: ret + + ret i64 ptrtoint(i32* @var32 to i64) +} + +define i32 @test_global_value() { +; CHECK-LABEL: test_global_value: +; CHECK: adrp x[[PAGE:[0-9]+]], _var32@PAGE +; CHECK: ldr w0, [x[[PAGE]], _var32@PAGEOFF] + %val = load i32, i32* @var32, align 4 + ret i32 %val +} + +; Because the addition may wrap, it is not safe to use "ldr w0, [xN, #32]" here. 
+define i32 @test_unsafe_indexed_add() { +; CHECK-LABEL: test_unsafe_indexed_add: +; CHECK: add x[[VAR32:[0-9]+]], {{x[0-9]+}}, _var32@PAGEOFF +; CHECK: add w[[ADDR:[0-9]+]], w[[VAR32]], #32 +; CHECK: ldr w0, [x[[ADDR]]] + %addr_int = ptrtoint i32* @var32 to i32 + %addr_plus_32 = add i32 %addr_int, 32 + %addr = inttoptr i32 %addr_plus_32 to i32* + %val = load i32, i32* %addr, align 4 + ret i32 %val +} + +; Since we've promised there is no unsigned overflow, @var32 must be at least +; 32-bytes below 2^32, and we can use the load this time. +define i32 @test_safe_indexed_add() { +; CHECK-LABEL: test_safe_indexed_add: +; CHECK: add x[[VAR32:[0-9]+]], {{x[0-9]+}}, _var32@PAGEOFF +; CHECK: add w[[ADDR:[0-9]+]], w[[VAR32]], #32 +; CHECK: ldr w0, [x[[ADDR]]] + %addr_int = ptrtoint i32* @var32 to i64 + %addr_plus_32 = add nuw i64 %addr_int, 32 + %addr = inttoptr i64 %addr_plus_32 to i32* + %val = load i32, i32* %addr, align 4 + ret i32 %val +} + +define i32 @test_safe_indexed_or(i32 %in) { +; CHECK-LABEL: test_safe_indexed_or: +; CHECK: and [[TMP:w[0-9]+]], {{w[0-9]+}}, #0xfffffff0 +; CHECK: orr w[[ADDR:[0-9]+]], [[TMP]], #0x4 +; CHECK: ldr w0, [x[[ADDR]]] + %addr_int = and i32 %in, -16 + %addr_plus_4 = or i32 %addr_int, 4 + %addr = inttoptr i32 %addr_plus_4 to i32* + %val = load i32, i32* %addr, align 4 + ret i32 %val +} + + +; Promising nsw is not sufficient because the addressing mode basically +; calculates "zext(base) + zext(offset)" and nsw only guarantees +; "sext(base) + sext(offset) == base + offset". +define i32 @test_unsafe_nsw_indexed_add() { +; CHECK-LABEL: test_unsafe_nsw_indexed_add: +; CHECK: add x[[VAR32:[0-9]+]], {{x[0-9]+}}, _var32@PAGEOFF +; CHECK: add w[[ADDR:[0-9]+]], w[[VAR32]], #32 +; CHECK-NOT: ubfx +; CHECK: ldr w0, [x[[ADDR]]] + %addr_int = ptrtoint i32* @var32 to i32 + %addr_plus_32 = add nsw i32 %addr_int, 32 + %addr = inttoptr i32 %addr_plus_32 to i32* + %val = load i32, i32* %addr, align 4 + ret i32 %val +} + +; Because the addition may wrap, it is not safe to use "ldr w0, [xN, #32]" here. +define i32 @test_unsafe_unscaled_add() { +; CHECK-LABEL: test_unsafe_unscaled_add: +; CHECK: add x[[VAR32:[0-9]+]], {{x[0-9]+}}, _var32@PAGEOFF +; CHECK: add w[[ADDR:[0-9]+]], w[[VAR32]], #3 +; CHECK: ldr w0, [x[[ADDR]]] + %addr_int = ptrtoint i32* @var32 to i32 + %addr_plus_3 = add i32 %addr_int, 3 + %addr = inttoptr i32 %addr_plus_3 to i32* + %val = load i32, i32* %addr, align 1 + ret i32 %val +} + +; Since we've promised there is no unsigned overflow, @var32 must be at least +; 32-bytes below 2^32, and we can use the load this time. +define i32 @test_safe_unscaled_add() { +; CHECK-LABEL: test_safe_unscaled_add: +; CHECK: add x[[VAR32:[0-9]+]], {{x[0-9]+}}, _var32@PAGEOFF +; CHECK: add w[[ADDR:[0-9]+]], w[[VAR32]], #3 +; CHECK: ldr w0, [x[[ADDR]]] + %addr_int = ptrtoint i32* @var32 to i32 + %addr_plus_3 = add nuw i32 %addr_int, 3 + %addr = inttoptr i32 %addr_plus_3 to i32* + %val = load i32, i32* %addr, align 1 + ret i32 %val +} + +; Promising nsw is not sufficient because the addressing mode basically +; calculates "zext(base) + zext(offset)" and nsw only guarantees +; "sext(base) + sext(offset) == base + offset". 
+define i32 @test_unsafe_nsw_unscaled_add() { +; CHECK-LABEL: test_unsafe_nsw_unscaled_add: +; CHECK: add x[[VAR32:[0-9]+]], {{x[0-9]+}}, _var32@PAGEOFF +; CHECK: add w[[ADDR:[0-9]+]], w[[VAR32]], #3 +; CHECK-NOT: ubfx +; CHECK: ldr w0, [x[[ADDR]]] + %addr_int = ptrtoint i32* @var32 to i32 + %addr_plus_3 = add nsw i32 %addr_int, 3 + %addr = inttoptr i32 %addr_plus_3 to i32* + %val = load i32, i32* %addr, align 1 + ret i32 %val +} + +; Because the addition may wrap, it is not safe to use "ldur w0, [xN, #-3]" +; here. +define i32 @test_unsafe_negative_unscaled_add() { +; CHECK-LABEL: test_unsafe_negative_unscaled_add: +; CHECK: add x[[VAR32:[0-9]+]], {{x[0-9]+}}, _var32@PAGEOFF +; CHECK: sub w[[ADDR:[0-9]+]], w[[VAR32]], #3 +; CHECK: ldr w0, [x[[ADDR]]] + %addr_int = ptrtoint i32* @var32 to i32 + %addr_minus_3 = add i32 %addr_int, -3 + %addr = inttoptr i32 %addr_minus_3 to i32* + %val = load i32, i32* %addr, align 1 + ret i32 %val +} + +define i8* @test_got_addr() { +; CHECK-LABEL: test_got_addr: +; CHECK: adrp x[[PAGE:[0-9]+]], _var_got@GOTPAGE +; CHECK: ldr w0, [x[[PAGE]], _var_got@GOTPAGEOFF] + ret i8* @var_got +} + +define float @test_va_arg_f32(i8** %list) { +; CHECK-LABEL: test_va_arg_f32: + +; CHECK: ldr w[[START:[0-9]+]], [x0] +; CHECK: add [[AFTER:w[0-9]+]], w[[START]], #8 +; CHECK: str [[AFTER]], [x0] + + ; Floating point arguments get promoted to double as per C99. +; CHECK: ldr [[DBL:d[0-9]+]], [x[[START]]] +; CHECK: fcvt s0, [[DBL]] + %res = va_arg i8** %list, float + ret float %res +} + +; Interesting point is that the slot is 4 bytes. +define i8 @test_va_arg_i8(i8** %list) { +; CHECK-LABEL: test_va_arg_i8: + +; CHECK: ldr w[[START:[0-9]+]], [x0] +; CHECK: add [[AFTER:w[0-9]+]], w[[START]], #4 +; CHECK: str [[AFTER]], [x0] + + ; i8 gets promoted to int (again, as per C99). +; CHECK: ldr w0, [x[[START]]] + + %res = va_arg i8** %list, i8 + ret i8 %res +} + +; Interesting point is that the slot needs aligning (again, min size is 4 +; bytes). +define i64 @test_va_arg_i64(i64** %list) { +; CHECK-LABEL: test_va_arg_i64: + + ; Update the list for the next user (minimum slot size is 4, but the actual + ; argument is 8 which had better be reflected!) +; CHECK: ldr w[[UNALIGNED_START:[0-9]+]], [x0] +; CHECK: add [[ALIGN_TMP:x[0-9]+]], x[[UNALIGNED_START]], #7 +; CHECK: and x[[START:[0-9]+]], [[ALIGN_TMP]], #0x1fffffff8 +; CHECK: add w[[AFTER:[0-9]+]], w[[START]], #8 +; CHECK: str w[[AFTER]], [x0] + +; CHECK: ldr x0, [x[[START]]] + + %res = va_arg i64** %list, i64 + ret i64 %res +} + +declare void @bar(...) +define void @test_va_call(i8 %l, i8 %r, float %in, i8* %ptr) { +; CHECK-LABEL: test_va_call: +; CHECK: add [[SUM:w[0-9]+]], {{w[0-9]+}}, w1 + +; CHECK-DAG: str w2, [sp, #32] +; CHECK-DAG: str xzr, [sp, #24] +; CHECK-DAG: str s0, [sp, #16] +; CHECK-DAG: str xzr, [sp, #8] +; CHECK-DAG: str [[SUM]], [sp] + + ; Add them to ensure real promotion occurs. + %sum = add i8 %l, %r + call void(...) 
@bar(i8 %sum, i64 0, float %in, double 0.0, i8* %ptr) + ret void +} + +declare i8* @llvm.frameaddress(i32) + +define i8* @test_frameaddr() { +; CHECK-LABEL: test_frameaddr: +; CHECK: ldr {{w0|x0}}, [x29] + %val = call i8* @llvm.frameaddress(i32 1) + ret i8* %val +} + +declare i8* @llvm.returnaddress(i32) + +define i8* @test_toplevel_returnaddr() { +; CHECK-LABEL: test_toplevel_returnaddr: +; CHECK: mov x0, x30 + %val = call i8* @llvm.returnaddress(i32 0) + ret i8* %val +} + +define i8* @test_deep_returnaddr() { +; CHECK-LABEL: test_deep_returnaddr: +; CHECK: ldr x[[FRAME_REC:[0-9]+]], [x29] +; CHECK: ldr x0, [x[[FRAME_REC]], #8] + %val = call i8* @llvm.returnaddress(i32 1) + ret i8* %val +} + +define void @test_indirect_call(void()* %func) { +; CHECK-LABEL: test_indirect_call: +; CHECK: blr x0 + call void() %func() + ret void +} + +; Safe to use the unextended address here +define void @test_indirect_safe_call(i32* %weird_funcs) { +; CHECK-LABEL: test_indirect_safe_call: +; CHECK: add w[[ADDR32:[0-9]+]], w0, #4 +; CHECK-OPT-NOT: ubfx +; CHECK: blr x[[ADDR32]] + %addr = getelementptr i32, i32* %weird_funcs, i32 1 + %func = bitcast i32* %addr to void()* + call void() %func() + ret void +} + +declare void @simple() +define void @test_simple_tail_call() { +; CHECK-LABEL: test_simple_tail_call: +; CHECK: b _simple + tail call void @simple() + ret void +} + +define void @test_indirect_tail_call(void()* %func) { +; CHECK-LABEL: test_indirect_tail_call: +; CHECK: br x0 + tail call void() %func() + ret void +} + +; Safe to use the unextended address here +define void @test_indirect_safe_tail_call(i32* %weird_funcs) { +; CHECK-LABEL: test_indirect_safe_tail_call: +; CHECK: add w[[ADDR32:[0-9]+]], w0, #4 +; CHECK-OPT-NOT: ubfx +; CHECK-OPT: br x[[ADDR32]] + %addr = getelementptr i32, i32* %weird_funcs, i32 1 + %func = bitcast i32* %addr to void()* + tail call void() %func() + ret void +} + +; For the "armv7k" slice, Clang will be emitting some small structs as [N x +; i32]. For ABI compatibility with arm64_32 these need to be passed in *X* +; registers (e.g. [2 x i32] would be packed into a single register). + +define i32 @test_in_smallstruct_low([3 x i32] %in) { +; CHECK-LABEL: test_in_smallstruct_low: +; CHECK: mov x0, x1 + %val = extractvalue [3 x i32] %in, 2 + ret i32 %val +} + +define i32 @test_in_smallstruct_high([3 x i32] %in) { +; CHECK-LABEL: test_in_smallstruct_high: +; CHECK: lsr x0, x0, #32 + %val = extractvalue [3 x i32] %in, 1 + ret i32 %val +} + +; The 64-bit DarwinPCS ABI has the quirk that structs on the stack are always +; 64-bit aligned. This must not happen for arm64_32 since otherwise va_arg will +; be incompatible with the armv7k ABI.
+define i32 @test_in_smallstruct_stack([8 x i64], i32, [3 x i32] %in) { +; CHECK-LABEL: test_in_smallstruct_stack: +; CHECK: ldr w0, [sp, #4] + %val = extractvalue [3 x i32] %in, 0 + ret i32 %val +} + +define [2 x i32] @test_ret_smallstruct([3 x i32] %in) { +; CHECK-LABEL: test_ret_smallstruct: +; CHECK: mov x0, #1 +; CHECK: movk x0, #2, lsl #32 + + ret [2 x i32] [i32 1, i32 2] +} + +declare void @smallstruct_callee([4 x i32]) +define void @test_call_smallstruct() { +; CHECK-LABEL: test_call_smallstruct: +; CHECK: mov x0, #1 +; CHECK: movk x0, #2, lsl #32 +; CHECK: mov x1, #3 +; CHECK: movk x1, #4, lsl #32 +; CHECK: bl _smallstruct_callee + + call void @smallstruct_callee([4 x i32] [i32 1, i32 2, i32 3, i32 4]) + ret void +} + +declare void @smallstruct_callee_stack([8 x i64], i32, [2 x i32]) +define void @test_call_smallstruct_stack() { +; CHECK-LABEL: test_call_smallstruct_stack: +; CHECK: mov [[VAL:x[0-9]+]], #1 +; CHECK: movk [[VAL]], #2, lsl #32 +; CHECK: stur [[VAL]], [sp, #4] + + call void @smallstruct_callee_stack([8 x i64] undef, i32 undef, [2 x i32] [i32 1, i32 2]) + ret void +} + +declare [3 x i32] @returns_smallstruct() +define i32 @test_use_smallstruct_low() { +; CHECK-LABEL: test_use_smallstruct_low: +; CHECK: bl _returns_smallstruct +; CHECK: mov x0, x1 + + %struct = call [3 x i32] @returns_smallstruct() + %val = extractvalue [3 x i32] %struct, 2 + ret i32 %val +} + +define i32 @test_use_smallstruct_high() { +; CHECK-LABEL: test_use_smallstruct_high: +; CHECK: bl _returns_smallstruct +; CHECK: lsr x0, x0, #32 + + %struct = call [3 x i32] @returns_smallstruct() + %val = extractvalue [3 x i32] %struct, 1 + ret i32 %val +} + +; If a small struct can't be allocated to x0-x7, the remaining registers should +; be marked as unavailable and subsequent GPR arguments should also be on the +; stack. Obviously the struct itself should be passed entirely on the stack. 
+define i32 @test_smallstruct_padding([7 x i64], [4 x i32] %struct, i32 %in) { +; CHECK-LABEL: test_smallstruct_padding: +; CHECK-DAG: ldr [[IN:w[0-9]+]], [sp, #16] +; CHECK-DAG: ldr [[LHS:w[0-9]+]], [sp] +; CHECK: add w0, [[LHS]], [[IN]] + %lhs = extractvalue [4 x i32] %struct, 0 + %sum = add i32 %lhs, %in + ret i32 %sum +} + +declare void @take_small_smallstruct(i64, [1 x i32]) +define void @test_small_smallstruct() { +; CHECK-LABEL: test_small_smallstruct: +; CHECK-DAG: mov w0, #1 +; CHECK-DAG: mov w1, #2 +; CHECK: bl _take_small_smallstruct + call void @take_small_smallstruct(i64 1, [1 x i32] [i32 2]) + ret void +} + +define void @test_bare_frameaddr(i8** %addr) { +; CHECK-LABEL: test_bare_frameaddr: +; CHECK: add x[[LOCAL:[0-9]+]], sp, #{{[0-9]+}} +; CHECK: str w[[LOCAL]], + + %ptr = alloca i8 + store i8* %ptr, i8** %addr, align 4 + ret void +} + +define void @test_sret_use([8 x i64]* sret %out) { +; CHECK-LABEL: test_sret_use: +; CHECK: str xzr, [x8] + %addr = getelementptr [8 x i64], [8 x i64]* %out, i32 0, i32 0 + store i64 0, i64* %addr + ret void +} + +define i64 @test_sret_call() { +; CHECK-LABEL: test_sret_call: +; CHECK: mov x8, sp +; CHECK: bl _test_sret_use + %arr = alloca [8 x i64] + call void @test_sret_use([8 x i64]* sret %arr) + + %addr = getelementptr [8 x i64], [8 x i64]* %arr, i32 0, i32 0 + %val = load i64, i64* %addr + ret i64 %val +} + +define double @test_constpool() { +; CHECK-LABEL: test_constpool: +; CHECK: adrp x[[PAGE:[0-9]+]], [[POOL:lCPI[0-9]+_[0-9]+]]@PAGE +; CHECK: ldr d0, [x[[PAGE]], [[POOL]]@PAGEOFF] + ret double 1.0e-6 +} + +define i8* @test_blockaddress() { +; CHECK-LABEL: test_blockaddress: +; CHECK: [[BLOCK:Ltmp[0-9]+]]: +; CHECK: adrp [[PAGE:x[0-9]+]], [[BLOCK]]@PAGE +; CHECK: add x0, [[PAGE]], [[BLOCK]]@PAGEOFF + br label %dest +dest: + ret i8* blockaddress(@test_blockaddress, %dest) +} + +define i8* @test_indirectbr(i8* %dest) { +; CHECK-LABEL: test_indirectbr: +; CHECK: br x0 + indirectbr i8* %dest, [label %true, label %false] + +true: + ret i8* blockaddress(@test_indirectbr, %true) +false: + ret i8* blockaddress(@test_indirectbr, %false) +} + +; ISelDAGToDAG tries to fold an offset FI load (in this case var+4) into the +; actual load instruction. This needs to be done slightly carefully since we +; claim the FI in the process -- it doesn't need extending. 
+define float @test_frameindex_offset_load() { +; CHECK-LABEL: test_frameindex_offset_load: +; CHECK: ldr s0, [sp, #4] + %arr = alloca float, i32 4, align 8 + %addr = getelementptr inbounds float, float* %arr, i32 1 + + %val = load float, float* %addr, align 4 + ret float %val +} + +define void @test_unaligned_frameindex_offset_store() { +; CHECK-LABEL: test_unaligned_frameindex_offset_store: +; CHECK: mov x[[TMP:[0-9]+]], sp +; CHECK: orr w[[ADDR:[0-9]+]], w[[TMP]], #0x2 +; CHECK: mov [[VAL:w[0-9]+]], #42 +; CHECK: str [[VAL]], [x[[ADDR]]] + %arr = alloca [4 x i32] + + %addr.int = ptrtoint [4 x i32]* %arr to i32 + %addr.nextint = add nuw i32 %addr.int, 2 + %addr.next = inttoptr i32 %addr.nextint to i32* + store i32 42, i32* %addr.next + ret void +} + + +define {i64, i64*} @test_pre_idx(i64* %addr) { +; CHECK-LABEL: test_pre_idx: + +; CHECK: add w[[ADDR:[0-9]+]], w0, #8 +; CHECK: ldr x0, [x[[ADDR]]] + %addr.int = ptrtoint i64* %addr to i32 + %addr.next.int = add nuw i32 %addr.int, 8 + %addr.next = inttoptr i32 %addr.next.int to i64* + %val = load i64, i64* %addr.next + + %tmp = insertvalue {i64, i64*} undef, i64 %val, 0 + %res = insertvalue {i64, i64*} %tmp, i64* %addr.next, 1 + + ret {i64, i64*} %res +} + +; Forming a post-indexed load is invalid here since the GEP needs to work when +; %addr wraps round to 0. +define {i64, i64*} @test_invalid_pre_idx(i64* %addr) { +; CHECK-LABEL: test_invalid_pre_idx: +; CHECK: add w1, w0, #8 +; CHECK: ldr x0, [x1] + %addr.next = getelementptr i64, i64* %addr, i32 1 + %val = load i64, i64* %addr.next + + %tmp = insertvalue {i64, i64*} undef, i64 %val, 0 + %res = insertvalue {i64, i64*} %tmp, i64* %addr.next, 1 + + ret {i64, i64*} %res +} + +declare void @callee([8 x i32]*) +define void @test_stack_guard() ssp { +; CHECK-LABEL: test_stack_guard: +; CHECK: adrp x[[GUARD_GOTPAGE:[0-9]+]], ___stack_chk_guard@GOTPAGE +; CHECK: ldr w[[GUARD_ADDR:[0-9]+]], [x[[GUARD_GOTPAGE]], ___stack_chk_guard@GOTPAGEOFF] +; CHECK: ldr [[GUARD_VAL:w[0-9]+]], [x[[GUARD_ADDR]]] +; CHECK: stur [[GUARD_VAL]], [x29, #[[GUARD_OFFSET:-[0-9]+]]] + +; CHECK: add x0, sp, #{{[0-9]+}} +; CHECK: bl _callee + +; CHECK-OPT: adrp x[[GUARD_GOTPAGE:[0-9]+]], ___stack_chk_guard@GOTPAGE +; CHECK-OPT: ldr w[[GUARD_ADDR:[0-9]+]], [x[[GUARD_GOTPAGE]], ___stack_chk_guard@GOTPAGEOFF] +; CHECK-OPT: ldr [[GUARD_VAL:w[0-9]+]], [x[[GUARD_ADDR]]] +; CHECK-OPT: ldur [[NEW_VAL:w[0-9]+]], [x29, #[[GUARD_OFFSET]]] +; CHECK-OPT: cmp [[GUARD_VAL]], [[NEW_VAL]] +; CHECK-OPT: b.ne [[FAIL:LBB[0-9]+_[0-9]+]] + +; CHECK-OPT: [[FAIL]]: +; CHECK-OPT-NEXT: bl ___stack_chk_fail + %arr = alloca [8 x i32] + call void @callee([8 x i32]* %arr) + ret void +} + +declare i32 @__gxx_personality_v0(...) 
+declare void @eat_landingpad_args(i32, i8*, i32) +@_ZTI8Whatever = external global i8 +define void @test_landingpad_marshalling() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +; CHECK-LABEL: test_landingpad_marshalling: +; CHECK-OPT: mov x2, x1 +; CHECK-OPT: mov x1, x0 +; CHECK: bl _eat_landingpad_args + invoke void @callee([8 x i32]* undef) to label %done unwind label %lpad + +lpad: ; preds = %entry + %exc = landingpad { i8*, i32 } + catch i8* @_ZTI8Whatever + %pointer = extractvalue { i8*, i32 } %exc, 0 + %selector = extractvalue { i8*, i32 } %exc, 1 + call void @eat_landingpad_args(i32 undef, i8* %pointer, i32 %selector) + ret void + +done: + ret void +} + +define void @test_dynamic_stackalloc() { +; CHECK-LABEL: test_dynamic_stackalloc: +; CHECK: sub [[REG:x[0-9]+]], sp, #32 +; CHECK: mov sp, [[REG]] +; CHECK-OPT-NOT: ubfx +; CHECK: bl _callee + br label %next + +next: + %val = alloca [8 x i32] + call void @callee([8 x i32]* %val) + ret void +} + +define void @test_asm_memory(i32* %base.addr) { +; CHECK-LABEL: test_asm_memory: +; CHECK: add w[[ADDR:[0-9]+]], w0, #4 +; CHECK: str wzr, [x[[ADDR]] + %addr = getelementptr i32, i32* %base.addr, i32 1 + call void asm sideeffect "str wzr, $0", "*m"(i32* %addr) + ret void +} + +define void @test_unsafe_asm_memory(i64 %val) { +; CHECK-LABEL: test_unsafe_asm_memory: +; CHECK: and x[[ADDR:[0-9]+]], x0, #0xffffffff +; CHECK: str wzr, [x[[ADDR]]] + %addr_int = trunc i64 %val to i32 + %addr = inttoptr i32 %addr_int to i32* + call void asm sideeffect "str wzr, $0", "*m"(i32* %addr) + ret void +} + +define [9 x i8*] @test_demoted_return(i8* %in) { +; CHECK-LABEL: test_demoted_return: +; CHECK: str w0, [x8, #32] + %res = insertvalue [9 x i8*] undef, i8* %in, 8 + ret [9 x i8*] %res +} + +define i8* @test_inttoptr(i64 %in) { +; CHECK-LABEL: test_inttoptr: +; CHECK: and x0, x0, #0xffffffff + %res = inttoptr i64 %in to i8* + ret i8* %res +} + +declare i32 @llvm.get.dynamic.area.offset.i32() +define i32 @test_dynamic_area() { +; CHECK-LABEL: test_dynamic_area: +; CHECK: mov w0, wzr + %res = call i32 @llvm.get.dynamic.area.offset.i32() + ret i32 %res +} + +define void @test_pointer_vec_store(<2 x i8*>* %addr) { +; CHECK-LABEL: test_pointer_vec_store: +; CHECK: str xzr, [x0] +; CHECK-NOT: str +; CHECK-NOT: stp + + store <2 x i8*> zeroinitializer, <2 x i8*>* %addr, align 16 + ret void +} + +define <2 x i8*> @test_pointer_vec_load(<2 x i8*>* %addr) { +; CHECK-LABEL: test_pointer_vec_load: +; CHECK: ldr d[[TMP:[0-9]+]], [x0] +; CHECK: ushll.2d v0, v[[TMP]], #0 + %val = load <2 x i8*>, <2 x i8*>* %addr, align 16 + ret <2 x i8*> %val +} + +define void @test_inline_asm_mem_pointer(i32* %in) { +; CHECK-LABEL: test_inline_asm_mem_pointer: +; CHECK: str w0, + tail call void asm sideeffect "ldr x0, $0", "rm"(i32* %in) + ret void +} + + +define void @test_struct_hi(i32 %hi) nounwind { +; CHECK-LABEL: test_struct_hi: +; CHECK: mov w[[IN:[0-9]+]], w0 +; CHECK: bl _get_int +; CHECK-NEXT: bfi x0, x[[IN]], #32, #32 +; CHECK-NEXT: bl _take_pair + %val.64 = call i64 @get_int() + %val.32 = trunc i64 %val.64 to i32 + + %pair.0 = insertvalue [2 x i32] undef, i32 %val.32, 0 + %pair.1 = insertvalue [2 x i32] %pair.0, i32 %hi, 1 + call void @take_pair([2 x i32] %pair.1) + + ret void +} +declare void @take_pair([2 x i32]) +declare i64 @get_int() + +define i1 @test_icmp_ptr(i8* %in) { +; CHECK-LABEL: test_icmp_ptr +; CHECK: ubfx x0, x0, #31, #1 + %res = icmp slt i8* %in, null + ret i1 %res +} + +define void @test_multiple_icmp_ptr(i8* %l, i8* %r) { +; 
CHECK-LABEL: test_multiple_icmp_ptr: +; CHECK: tbnz w0, #31, [[FALSEBB:LBB[0-9]+_[0-9]+]] +; CHECK: tbnz w1, #31, [[FALSEBB]] + %tst1 = icmp sgt i8* %l, inttoptr (i32 -1 to i8*) + %tst2 = icmp sgt i8* %r, inttoptr (i32 -1 to i8*) + %tst = and i1 %tst1, %tst2 + br i1 %tst, label %true, label %false + +true: + call void(...) @bar() + ret void + +false: + ret void +} + +define { [18 x i8] }* @test_gep_nonpow2({ [18 x i8] }* %a0, i32 %a1) { +; CHECK-LABEL: test_gep_nonpow2: +; CHECK: mov w[[SIZE:[0-9]+]], #18 +; CHECK-NEXT: smaddl x0, w1, w[[SIZE]], x0 +; CHECK-NEXT: ret + %tmp0 = getelementptr inbounds { [18 x i8] }, { [18 x i8] }* %a0, i32 %a1 + ret { [18 x i8] }* %tmp0 +} + +define void @test_bzero(i64 %in) { +; CHECK-LABEL: test_bzero: +; CHECK-DAG: lsr x1, x0, #32 +; CHECK-DAG: and x0, x0, #0xffffffff +; CHECK: bl _bzero + + %ptr.i32 = trunc i64 %in to i32 + %size.64 = lshr i64 %in, 32 + %size = trunc i64 %size.64 to i32 + %ptr = inttoptr i32 %ptr.i32 to i8* + tail call void @llvm.memset.p0i8.i32(i8* align 4 %ptr, i8 0, i32 %size, i1 false) + ret void +} + +declare void @llvm.memset.p0i8.i32(i8* nocapture writeonly, i8, i32, i1) Index: llvm/test/CodeGen/AArch64/fastcc-reserved.ll =================================================================== --- llvm/test/CodeGen/AArch64/fastcc-reserved.ll +++ llvm/test/CodeGen/AArch64/fastcc-reserved.ll @@ -4,7 +4,7 @@ ; call-frame is not reserved (hence disable-fp-elim), but where ; callee-pop can occur (hence tailcallopt). -declare fastcc void @will_pop([8 x i32], i32 %val) +declare fastcc void @will_pop([8 x i64], i32 %val) define fastcc void @foo(i32 %in) { ; CHECK-LABEL: foo: @@ -18,7 +18,7 @@ ; Reserve space for call-frame: ; CHECK: str w{{[0-9]+}}, [sp, #-16]! - call fastcc void @will_pop([8 x i32] undef, i32 42) + call fastcc void @will_pop([8 x i64] undef, i32 42) ; CHECK: bl will_pop ; Since @will_pop is fastcc with tailcallopt, it will put the stack @@ -31,7 +31,7 @@ ret void } -declare void @wont_pop([8 x i32], i32 %val) +declare void @wont_pop([8 x i64], i32 %val) define void @foo1(i32 %in) { ; CHECK-LABEL: foo1: @@ -44,7 +44,7 @@ ; Reserve space for call-frame ; CHECK: str w{{[0-9]+}}, [sp, #-16]! - call void @wont_pop([8 x i32] undef, i32 42) + call void @wont_pop([8 x i64] undef, i32 42) ; CHECK: bl wont_pop ; This time we *do* need to unreserve the call-frame Index: llvm/test/CodeGen/AArch64/fastcc.ll =================================================================== --- llvm/test/CodeGen/AArch64/fastcc.ll +++ llvm/test/CodeGen/AArch64/fastcc.ll @@ -18,7 +18,7 @@ ; CHECK-TAIL: str w{{[0-9]+}}, [sp] - call fastcc void @func_stack8([8 x i32] undef, i32 42) + call fastcc void @func_stack8([8 x i64] undef, i32 42) ; CHECK: bl func_stack8 ; CHECK-NOT: sub sp, sp, ; CHECK-NOT: [sp, #{{[-0-9]+}}]! @@ -28,7 +28,7 @@ ; CHECK-TAIL: stp xzr, xzr, [sp, #-16]! - call fastcc void @func_stack32([8 x i32] undef, i128 0, i128 9) + call fastcc void @func_stack32([8 x i64] undef, i128 0, i128 9) ; CHECK: bl func_stack32 ; CHECK-NOT: sub sp, sp, @@ -56,7 +56,7 @@ ; CHECK-TAIL-NEXT: ret } -define fastcc void @func_stack8([8 x i32], i32 %stacked) { +define fastcc void @func_stack8([8 x i64], i32 %stacked) { ; CHECK-LABEL: func_stack8: ; CHECK: sub sp, sp, #48 ; CHECK: stp x29, x30, [sp, #32] @@ -71,7 +71,7 @@ ; CHECK-TAIL: str w{{[0-9]+}}, [sp] - call fastcc void @func_stack8([8 x i32] undef, i32 42) + call fastcc void @func_stack8([8 x i64] undef, i32 42) ; CHECK: bl func_stack8 ; CHECK-NOT: sub sp, sp, ; CHECK-NOT: [sp, #{{[-0-9]+}}]! 
@@ -82,7 +82,7 @@ ; CHECK-TAIL: stp xzr, xzr, [sp, #-16]! - call fastcc void @func_stack32([8 x i32] undef, i128 0, i128 9) + call fastcc void @func_stack32([8 x i64] undef, i128 0, i128 9) ; CHECK: bl func_stack32 ; CHECK-NOT: sub sp, sp, @@ -109,7 +109,7 @@ ; CHECK-TAIL-NEXT: ret } -define fastcc void @func_stack32([8 x i32], i128 %stacked0, i128 %stacked1) { +define fastcc void @func_stack32([8 x i64], i128 %stacked0, i128 %stacked1) { ; CHECK-LABEL: func_stack32: ; CHECK: add x29, sp, #32 @@ -117,7 +117,7 @@ ; CHECK-TAIL: add x29, sp, #32 - call fastcc void @func_stack8([8 x i32] undef, i32 42) + call fastcc void @func_stack8([8 x i64] undef, i32 42) ; CHECK: bl func_stack8 ; CHECK-NOT: sub sp, sp, ; CHECK-NOT: [sp, #{{[-0-9]+}}]! @@ -127,7 +127,7 @@ ; CHECK-TAIL: stp xzr, xzr, [sp, #-16]! - call fastcc void @func_stack32([8 x i32] undef, i128 0, i128 9) + call fastcc void @func_stack32([8 x i64] undef, i128 0, i128 9) ; CHECK: bl func_stack32 ; CHECK-NOT: sub sp, sp, @@ -155,7 +155,7 @@ } ; Check that arg stack pop is done after callee-save restore when no frame pointer is used. -define fastcc void @func_stack32_leaf([8 x i32], i128 %stacked0, i128 %stacked1) { +define fastcc void @func_stack32_leaf([8 x i64], i128 %stacked0, i128 %stacked1) { ; CHECK-LABEL: func_stack32_leaf: ; CHECK: str x20, [sp, #-16]! ; CHECK: nop @@ -186,7 +186,7 @@ } ; Check that arg stack pop is done after callee-save restore when no frame pointer is used. -define fastcc void @func_stack32_leaf_local([8 x i32], i128 %stacked0, i128 %stacked1) { +define fastcc void @func_stack32_leaf_local([8 x i64], i128 %stacked0, i128 %stacked1) { ; CHECK-LABEL: func_stack32_leaf_local: ; CHECK: sub sp, sp, #32 ; CHECK-NEXT: str x20, [sp, #16] @@ -222,7 +222,7 @@ } ; Check that arg stack pop is done after callee-save restore when no frame pointer is used. 
-define fastcc void @func_stack32_leaf_local_nocs([8 x i32], i128 %stacked0, i128 %stacked1) { +define fastcc void @func_stack32_leaf_local_nocs([8 x i64], i128 %stacked0, i128 %stacked1) { ; CHECK-LABEL: func_stack32_leaf_local_nocs: ; CHECK: sub sp, sp, #16 ; CHECK: add sp, sp, #16 Index: llvm/test/CodeGen/AArch64/jump-table-32.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/jump-table-32.ll @@ -0,0 +1,42 @@ +; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64_32-apple-ios7.0 -aarch64-enable-atomic-cfg-tidy=0 | FileCheck %s + +define i32 @test_jumptable(i32 %in) { +; CHECK: test_jumptable + + switch i32 %in, label %def [ + i32 0, label %lbl1 + i32 1, label %lbl2 + i32 2, label %lbl3 + i32 4, label %lbl4 + ] +; CHECK: adrp [[JTPAGE:x[0-9]+]], LJTI0_0@PAGE +; CHECK: mov w[[INDEX:[0-9]+]], w0 +; CHECK: add x[[JT:[0-9]+]], [[JTPAGE]], LJTI0_0@PAGEOFF +; CHECK: adr [[BASE_BLOCK:x[0-9]+]], LBB0_2 +; CHECK: ldrb w[[OFFSET:[0-9]+]], [x[[JT]], x[[INDEX]]] +; CHECK: add [[DEST:x[0-9]+]], [[BASE_BLOCK]], x[[OFFSET]], lsl #2 +; CHECK: br [[DEST]] + +def: + ret i32 0 + +lbl1: + ret i32 1 + +lbl2: + ret i32 2 + +lbl3: + ret i32 4 + +lbl4: + ret i32 8 + +} + +; CHECK: LJTI0_0: +; CHECK-NEXT: .byte +; CHECK-NEXT: .byte +; CHECK-NEXT: .byte +; CHECK-NEXT: .byte +; CHECK-NEXT: .byte Index: llvm/test/CodeGen/AArch64/sibling-call.ll =================================================================== --- llvm/test/CodeGen/AArch64/sibling-call.ll +++ llvm/test/CodeGen/AArch64/sibling-call.ll @@ -1,8 +1,8 @@ ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -aarch64-enable-ldst-opt=0 | FileCheck %s declare void @callee_stack0() -declare void @callee_stack8([8 x i32], i64) -declare void @callee_stack16([8 x i32], i64, i64) +declare void @callee_stack8([8 x i64], i64) +declare void @callee_stack16([8 x i64], i64, i64) define void @caller_to0_from0() nounwind { ; CHECK-LABEL: caller_to0_from0: @@ -12,7 +12,7 @@ ; CHECK-NEXT: b callee_stack0 } -define void @caller_to0_from8([8 x i32], i64) nounwind{ +define void @caller_to0_from8([8 x i64], i64) nounwind{ ; CHECK-LABEL: caller_to0_from8: ; CHECK-NEXT: // %bb. @@ -26,51 +26,51 @@ ; Caller isn't going to clean up any extra stack we allocate, so it ; can't be a tail call. - tail call void @callee_stack8([8 x i32] undef, i64 42) + tail call void @callee_stack8([8 x i64] undef, i64 42) ret void ; CHECK: bl callee_stack8 } -define void @caller_to8_from8([8 x i32], i64 %a) { +define void @caller_to8_from8([8 x i64], i64 %a) { ; CHECK-LABEL: caller_to8_from8: ; CHECK-NOT: sub sp, sp, ; This should reuse our stack area for the 42 - tail call void @callee_stack8([8 x i32] undef, i64 42) + tail call void @callee_stack8([8 x i64] undef, i64 42) ret void ; CHECK: str {{x[0-9]+}}, [sp] ; CHECK-NEXT: b callee_stack8 } -define void @caller_to16_from8([8 x i32], i64 %a) { +define void @caller_to16_from8([8 x i64], i64 %a) { ; CHECK-LABEL: caller_to16_from8: ; Shouldn't be a tail call: we can't use SP+8 because our caller might ; have something there. This may sound obvious but implementation does ; some funky aligning. 
- tail call void @callee_stack16([8 x i32] undef, i64 undef, i64 undef) + tail call void @callee_stack16([8 x i64] undef, i64 undef, i64 undef) ; CHECK: bl callee_stack16 ret void } -define void @caller_to8_from24([8 x i32], i64 %a, i64 %b, i64 %c) { +define void @caller_to8_from24([8 x i64], i64 %a, i64 %b, i64 %c) { ; CHECK-LABEL: caller_to8_from24: ; CHECK-NOT: sub sp, sp ; Reuse our area, putting "42" at incoming sp - tail call void @callee_stack8([8 x i32] undef, i64 42) + tail call void @callee_stack8([8 x i64] undef, i64 42) ret void ; CHECK: str {{x[0-9]+}}, [sp] ; CHECK-NEXT: b callee_stack8 } -define void @caller_to16_from16([8 x i32], i64 %a, i64 %b) { +define void @caller_to16_from16([8 x i64], i64 %a, i64 %b) { ; CHECK-LABEL: caller_to16_from16: ; CHECK-NOT: sub sp, sp, ; Here we want to make sure that both loads happen before the stores: ; otherwise either %a or %b will be wrongly clobbered. - tail call void @callee_stack16([8 x i32] undef, i64 %b, i64 %a) + tail call void @callee_stack16([8 x i64] undef, i64 %b, i64 %a) ret void ; CHECK: ldr [[VAL0:x[0-9]+]], Index: llvm/test/CodeGen/AArch64/swift-return.ll =================================================================== --- llvm/test/CodeGen/AArch64/swift-return.ll +++ llvm/test/CodeGen/AArch64/swift-return.ll @@ -1,5 +1,7 @@ ; RUN: llc -verify-machineinstrs -mtriple=aarch64-apple-ios -o - %s | FileCheck %s ; RUN: llc -O0 -fast-isel -verify-machineinstrs -mtriple=aarch64-apple-ios -o - %s | FileCheck %s --check-prefix=CHECK-O0 +; RUN: llc -verify-machineinstrs -mtriple=arm64_32-apple-ios -o - %s | FileCheck %s +; RUN: llc -O0 -fast-isel -verify-machineinstrs -mtriple=arm64_32-apple-ios -o - %s | FileCheck %s --check-prefix=CHECK-O0 ; CHECK-LABEL: test1 ; CHECK: bl _gen @@ -8,7 +10,7 @@ ; CHECK-O0-LABEL: test1 ; CHECK-O0: bl _gen ; CHECK-O0: sxth [[TMP:w.*]], w0 -; CHECK-O0: add w8, [[TMP]], w1, sxtb +; CHECK-O0: add {{w[0-9]+}}, [[TMP]], w1, sxtb define i16 @test1(i32) { entry: %call = call swiftcc { i16, i8 } @gen(i32 %0) Index: llvm/test/CodeGen/AArch64/swiftcc.ll =================================================================== --- llvm/test/CodeGen/AArch64/swiftcc.ll +++ llvm/test/CodeGen/AArch64/swiftcc.ll @@ -1,5 +1,7 @@ ; RUN: llc -verify-machineinstrs -mtriple=aarch64-apple-ios -o - %s | FileCheck %s ; RUN: llc -O0 -verify-machineinstrs -mtriple=aarch64-apple-ios -o - %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=arm64_32-apple-ios -o - %s | FileCheck %s +; RUN: llc -O0 -verify-machineinstrs -mtriple=arm64_32-apple-ios -o - %s | FileCheck %s ; CHECK: t1 ; CHECK: fadd s0, s0, s1 Index: llvm/test/CodeGen/AArch64/swifterror.ll =================================================================== --- llvm/test/CodeGen/AArch64/swifterror.ll +++ llvm/test/CodeGen/AArch64/swifterror.ll @@ -1,5 +1,7 @@ -; RUN: llc -fast-isel-sink-local-values -verify-machineinstrs -frame-pointer=all -enable-shrink-wrap=false < %s -mtriple=aarch64-apple-ios -disable-post-ra | FileCheck -allow-deprecated-dag-overlap --check-prefix=CHECK-APPLE %s -; RUN: llc -fast-isel-sink-local-values -verify-machineinstrs -frame-pointer=all -O0 -fast-isel < %s -mtriple=aarch64-apple-ios -disable-post-ra | FileCheck -allow-deprecated-dag-overlap --check-prefix=CHECK-O0 %s +; RUN: llc -fast-isel-sink-local-values -verify-machineinstrs -frame-pointer=all -enable-shrink-wrap=false < %s -mtriple=aarch64-apple-ios -disable-post-ra | FileCheck -allow-deprecated-dag-overlap --check-prefix=CHECK-APPLE --check-prefix=CHECK-APPLE-AARCH64 %s +; RUN: 
llc -fast-isel-sink-local-values -verify-machineinstrs -frame-pointer=all -O0 -fast-isel < %s -mtriple=aarch64-apple-ios -disable-post-ra | FileCheck -allow-deprecated-dag-overlap --check-prefix=CHECK-O0 --check-prefix=CHECK-O0-AARCH64 %s +; RUN: llc -fast-isel-sink-local-values -verify-machineinstrs -frame-pointer=all -enable-shrink-wrap=false < %s -mtriple=arm64_32-apple-ios -disable-post-ra | FileCheck -allow-deprecated-dag-overlap --check-prefix=CHECK-APPLE --check-prefix=CHECK-APPLE-ARM64_32 %s +; RUN: llc -fast-isel-sink-local-values -verify-machineinstrs -O0 -fast-isel < %s -mtriple=arm64_32-apple-ios -disable-post-ra | FileCheck -allow-deprecated-dag-overlap --check-prefix=CHECK-O0 --check-prefix=CHECK-O0-ARM64_32 %s declare i8* @malloc(i64) declare void @free(i8*) @@ -40,7 +42,8 @@ ; CHECK-APPLE: mov x21, xzr ; CHECK-APPLE: bl {{.*}}foo ; CHECK-APPLE: mov x0, x21 -; CHECK-APPLE: cbnz x21 +; CHECK-APPLE-AARCH64: cbnz x21 +; CHECK-APPLE-ARM64_32: cbnz w0 ; Access part of the error object and save it to error_ref ; CHECK-APPLE: ldrb [[CODE:w[0-9]+]], [x0, #8] ; CHECK-APPLE: strb [[CODE]], [{{.*}}[[ID]]] @@ -50,7 +53,8 @@ ; CHECK-O0: mov x21 ; CHECK-O0: bl {{.*}}foo ; CHECK-O0: mov [[ID:x[0-9]+]], x21 -; CHECK-O0: cbnz x21 +; CHECK-O0-AARCH64: cbnz x21 +; CHECK-O0-ARM64_32: cmp x21, #0 entry: %error_ptr_ref = alloca swifterror %swift_error* store %swift_error* null, %swift_error** %error_ptr_ref @@ -76,7 +80,8 @@ ; CHECK-APPLE: fmov [[CMP:s[0-9]+]], #1.0 ; CHECK-APPLE: mov x21, xzr ; CHECK-APPLE: bl {{.*}}foo -; CHECK-APPLE: cbnz x21 +; CHECK-APPLE-AARCH64: cbnz x21 +; CHECK-APPLE-ARM64_32: cbnz w21 ; CHECK-APPLE: fcmp s0, [[CMP]] ; CHECK-APPLE: b.le ; Access part of the error object and save it to error_ref @@ -89,7 +94,8 @@ ; CHECK-O0: mov x21 ; CHECK-O0: bl {{.*}}foo ; CHECK-O0: mov [[ID:x[0-9]+]], x21 -; CHECK-O0: cbnz x21 +; CHECK-O0-AARCH64: cbnz x21 +; CHECK-O0-ARM64_32: cmp x21, #0 entry: %error_ptr_ref = alloca swifterror %swift_error* br label %bb_loop @@ -171,29 +177,54 @@ ; CHECK-APPLE: mov x21, x0 ; CHECK-APPLE: ret -; CHECK-O0-LABEL: foo_loop: +; CHECK-O0-AARCH64-LABEL: foo_loop: ; spill x21 -; CHECK-O0: str x21, [sp, [[SLOT:#[0-9]+]]] -; CHECK-O0: b [[BB1:[A-Za-z0-9_]*]] -; CHECK-O0: [[BB1]]: -; CHECK-O0: ldr x0, [sp, [[SLOT]]] -; CHECK-O0: str x0, [sp, [[SLOT2:#[0-9]+]]] -; CHECK-O0: cbz {{.*}}, [[BB2:[A-Za-z0-9_]*]] -; CHECK-O0: mov w{{.*}}, #16 -; CHECK-O0: malloc -; CHECK-O0: mov [[ID:x[0-9]+]], x0 -; CHECK-O0: strb w{{.*}}, [{{.*}}[[ID]], #8] +; CHECK-O0-AARCH64: str x21, [sp, [[SLOT:#[0-9]+]]] +; CHECK-O0-AARCH64: b [[BB1:[A-Za-z0-9_]*]] +; CHECK-O0-AARCH64: [[BB1]]: +; CHECK-O0-AARCH64: ldr x0, [sp, [[SLOT]]] +; CHECK-O0-AARCH64: str x0, [sp, [[SLOT2:#[0-9]+]]] +; CHECK-O0-AARCH64: cbz {{.*}}, [[BB2:[A-Za-z0-9_]*]] +; CHECK-O0-AARCH64: mov w{{.*}}, #16 +; CHECK-O0-AARCH64: malloc +; CHECK-O0-AARCH64: mov [[ID:x[0-9]+]], x0 +; CHECK-O0-AARCH64: strb w{{.*}}, [{{.*}}[[ID]], #8] ; spill x0 -; CHECK-O0: str x0, [sp, [[SLOT2]]] -; CHECK-O0:[[BB2]]: -; CHECK-O0: ldr x0, [sp, [[SLOT2]]] -; CHECK-O0: fcmp -; CHECK-O0: str x0, [sp] -; CHECK-O0: b.le [[BB1]] +; CHECK-O0-AARCH64: str x0, [sp, [[SLOT2]]] +; CHECK-O0-AARCH64:[[BB2]]: +; CHECK-O0-AARCH64: ldr x0, [sp, [[SLOT2]]] +; CHECK-O0-AARCH64: fcmp +; CHECK-O0-AARCH64: str x0, [sp] +; CHECK-O0-AARCH64: b.le [[BB1]] ; reload from stack -; CHECK-O0: ldr [[ID3:x[0-9]+]], [sp] -; CHECK-O0: mov x21, [[ID3]] -; CHECK-O0: ret +; CHECK-O0-AARCH64: ldr [[ID3:x[0-9]+]], [sp] +; CHECK-O0-AARCH64: mov x21, [[ID3]] +; 
CHECK-O0-AARCH64: ret + +; CHECK-O0-ARM64_32-LABEL: foo_loop: +; spill x21 +; CHECK-O0-ARM64_32: str x21, [sp, [[SLOT:#[0-9]+]]] +; CHECK-O0-ARM64_32: b [[BB1:[A-Za-z0-9_]*]] +; CHECK-O0-ARM64_32: [[BB1]]: +; CHECK-O0-ARM64_32: ldr x0, [sp, [[SLOT]]] +; CHECK-O0-ARM64_32: str x0, [sp, [[SLOT2:#[0-9]+]]] +; CHECK-O0-ARM64_32: cbz {{.*}}, [[BB2:[A-Za-z0-9_]*]] +; CHECK-O0-ARM64_32: mov w{{.*}}, #16 +; CHECK-O0-ARM64_32: malloc +; CHECK-O0-ARM64_32: mov {{.*}}, x0 +; CHECK-O0-ARM64_32: strb w{{.*}}, +; spill x0 +; CHECK-O0-ARM64_32: str {{.*}}, [sp, [[SLOT2]]] +; CHECK-O0-ARM64_32:[[BB2]]: +; CHECK-O0-ARM64_32: ldr x0, [sp, [[SLOT2]]] +; CHECK-O0-ARM64_32: fcmp +; CHECK-O0-ARM64_32: str x0, [sp[[OFFSET:.*]]] +; CHECK-O0-ARM64_32: b.le [[BB1]] +; reload from stack +; CHECK-O0-ARM64_32: ldr [[ID3:x[0-9]+]], [sp[[OFFSET]]] +; CHECK-O0-ARM64_32: mov x21, [[ID3]] +; CHECK-O0-ARM64_32: ret + entry: br label %bb_loop @@ -261,7 +292,8 @@ ; CHECK-APPLE: mov x21, xzr ; CHECK-APPLE: bl {{.*}}foo_sret ; CHECK-APPLE: mov x0, x21 -; CHECK-APPLE: cbnz x21 +; CHECK-APPLE-AARCH64: cbnz x21 +; CHECK-APPLE-ARM64_32: cbnz w0 ; Access part of the error object and save it to error_ref ; CHECK-APPLE: ldrb [[CODE:w[0-9]+]], [x0, #8] ; CHECK-APPLE: strb [[CODE]], [{{.*}}[[ID]]] @@ -273,7 +305,8 @@ ; CHECK-O0: mov x21 ; CHECK-O0: bl {{.*}}foo_sret ; CHECK-O0: mov [[ID2:x[0-9]+]], x21 -; CHECK-O0: cbnz x21 +; CHECK-O0-AARCH64: cbnz x21 +; CHECK-O0-ARM64_32: cmp x21, #0 ; Access part of the error object and save it to error_ref ; reload from stack ; CHECK-O0: ldrb [[CODE:w[0-9]+]] @@ -306,20 +339,22 @@ ; CHECK-APPLE-LABEL: foo_vararg: ; CHECK-APPLE: mov w0, #16 ; CHECK-APPLE: malloc -; CHECK-APPLE-DAG: mov [[ID:w[0-9]+]], #1 -; CHECK-APPLE-DAG: add [[ARGS:x[0-9]+]], [[TMP:x[0-9]+]], #16 -; CHECK-APPLE-DAG: strb [[ID]], [x0, #8] ; First vararg -; CHECK-APPLE-DAG: ldr {{w[0-9]+}}, [{{.*}}[[TMP]], #16] +; CHECK-APPLE-AARCH64: ldr {{w[0-9]+}}, [{{.*}}[[TMP:x[0-9]+]], #16] +; CHECK-APPLE-AARCH64: mov [[ID:w[0-9]+]], #1 +; CHECK-APPLE-AARCH64: add [[ARGS:x[0-9]+]], [[TMP]], #16 +; CHECK-APPLE-AARCH64: strb [[ID]], [x0, #8] ; Second vararg -; CHECK-APPLE-DAG: ldr {{w[0-9]+}}, [{{.*}}[[TMP]], #24] -; CHECK-APPLE-DAG: add {{x[0-9]+}}, {{x[0-9]+}}, #16 +; CHECK-APPLE-AARCH64: ldr {{w[0-9]+}}, [{{.*}}[[TMP]], #24] ; Third vararg -; CHECK-APPLE-DAG: ldr {{w[0-9]+}}, [{{.*}}[[TMP]], #32] +; CHECK-APPLE-AARCH64: ldr {{w[0-9]+}}, [{{.*}}[[TMP]], #32] + +; CHECK-APPLE-ARM64_32: mov [[ID:w[0-9]+]], #1 +; CHECK-APPLE-ARM64_32: add [[ARGS:x[0-9]+]], [[TMP:x[0-9]+]], #16 +; CHECK-APPLE-ARM64_32: strb [[ID]], [x0, #8] + -; CHECK-APPLE: mov x21, x0 -; CHECK-APPLE-NOT: x21 entry: %call = call i8* @malloc(i64 16) %call.0 = bitcast i8* %call to %swift_error* @@ -347,18 +382,18 @@ define float @caller4(i8* %error_ref) { ; CHECK-APPLE-LABEL: caller4: -; CHECK-APPLE: mov [[ID:x[0-9]+]], x0 -; CHECK-APPLE: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #8] -; CHECK-APPLE: str {{x[0-9]+}}, [sp] +; CHECK-APPLE-AARCH64: mov [[ID:x[0-9]+]], x0 +; CHECK-APPLE-AARCH64: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #8] +; CHECK-APPLE-AARCH64: str {{x[0-9]+}}, [sp] -; CHECK-APPLE: mov x21, xzr -; CHECK-APPLE: bl {{.*}}foo_vararg -; CHECK-APPLE: mov x0, x21 -; CHECK-APPLE: cbnz x21 +; CHECK-APPLE-AARCH64: mov x21, xzr +; CHECK-APPLE-AARCH64: bl {{.*}}foo_vararg +; CHECK-APPLE-AARCH64: mov x0, x21 +; CHECK-APPLE-AARCH64: cbnz x21 ; Access part of the error object and save it to error_ref -; CHECK-APPLE: ldrb [[CODE:w[0-9]+]], [x0, #8] -; CHECK-APPLE: strb [[CODE]], [{{.*}}[[ID]]] 
-; CHECK-APPLE: bl {{.*}}free +; CHECK-APPLE-AARCH64: ldrb [[CODE:w[0-9]+]], [x0, #8] +; CHECK-APPLE-AARCH64: strb [[CODE]], [{{.*}}[[ID]]] +; CHECK-APPLE-AARCH64: bl {{.*}}free entry: %error_ptr_ref = alloca swifterror %swift_error* store %swift_error* null, %swift_error** %error_ptr_ref Index: llvm/test/CodeGen/AArch64/swiftself.ll =================================================================== --- llvm/test/CodeGen/AArch64/swiftself.ll +++ llvm/test/CodeGen/AArch64/swiftself.ll @@ -1,6 +1,7 @@ -; RUN: llc -verify-machineinstrs -mtriple=aarch64-apple-ios -o - %s | FileCheck --check-prefix=CHECK --check-prefix=OPT %s +; RUN: llc -verify-machineinstrs -mtriple=aarch64-apple-ios -o - %s | FileCheck --check-prefix=CHECK --check-prefix=OPT --check-prefix=OPTAARCH64 %s ; RUN: llc -O0 -fast-isel -verify-machineinstrs -mtriple=aarch64-apple-ios -o - %s | FileCheck %s -; RUN: llc -verify-machineinstrs -mtriple=aarch64-unknown-linux-gnu -o - %s | FileCheck --check-prefix=CHECK --check-prefix=OPT %s +; RUN: llc -verify-machineinstrs -mtriple=aarch64-unknown-linux-gnu -o - %s | FileCheck --check-prefix=CHECK --check-prefix=OPT --check-prefix=OPTAARCH64 %s +; RUN: llc -verify-machineinstrs -mtriple=arm64_32-apple-ios -o - %s | FileCheck --check-prefix=CHECK --check-prefix=OPT --check-prefix=OPTARM64_32 %s ; Parameter with swiftself should be allocated to x20. ; CHECK-LABEL: swiftself_param: @@ -48,8 +49,9 @@ ; We can use a tail call if the callee swiftself is the same as the caller one. ; This should also work with fast-isel. ; CHECK-LABEL: swiftself_tail: -; CHECK: b {{_?}}swiftself_param -; CHECK-NOT: ret +; OPTAARCH64: b {{_?}}swiftself_param +; OPTAARCH64-NOT: ret +; OPTARM64_32: bl {{_?}}swiftself_param define i8* @swiftself_tail(i8* swiftself %addr0) { call void asm sideeffect "", "~{x20}"() %res = tail call i8* @swiftself_param(i8* swiftself %addr0) @@ -71,12 +73,19 @@ ; we normally would. We marked the first parameter with swiftself which means it ; will no longer be passed in x0. 
declare swiftcc i8* @thisreturn_attribute(i8* returned swiftself) -; OPT-LABEL: swiftself_nothisreturn: -; OPT-DAG: ldr x20, [x20] -; OPT-DAG: mov [[CSREG:x[1-9].*]], x8 -; OPT: bl {{_?}}thisreturn_attribute -; OPT: str x0, {{\[}}[[CSREG]] -; OPT: ret +; OPTAARCH64-LABEL: swiftself_nothisreturn: +; OPTAARCH64-DAG: ldr x20, [x20] +; OPTAARCH64-DAG: mov [[CSREG:x[1-9].*]], x8 +; OPTAARCH64: bl {{_?}}thisreturn_attribute +; OPTAARCH64: str x0, {{\[}}[[CSREG]] +; OPTAARCH64: ret + +; OPTARM64_32-LABEL: swiftself_nothisreturn: +; OPTARM64_32-DAG: ldr w20, [x20] +; OPTARM64_32-DAG: mov [[CSREG:x[1-9].*]], x8 +; OPTARM64_32: bl {{_?}}thisreturn_attribute +; OPTARM64_32: str w0, {{\[}}[[CSREG]] +; OPTARM64_32: ret define hidden swiftcc void @swiftself_nothisreturn(i8** noalias nocapture sret, i8** noalias nocapture readonly swiftself) { entry: %2 = load i8*, i8** %1, align 8 Index: llvm/test/CodeGen/AArch64/tail-call.ll =================================================================== --- llvm/test/CodeGen/AArch64/tail-call.ll +++ llvm/test/CodeGen/AArch64/tail-call.ll @@ -1,8 +1,8 @@ ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -tailcallopt | FileCheck %s declare fastcc void @callee_stack0() -declare fastcc void @callee_stack8([8 x i32], i64) -declare fastcc void @callee_stack16([8 x i32], i64, i64) +declare fastcc void @callee_stack8([8 x i64], i64) +declare fastcc void @callee_stack16([8 x i64], i64, i64) declare extern_weak fastcc void @callee_weak() define fastcc void @caller_to0_from0() nounwind { @@ -15,7 +15,7 @@ ; CHECK-NEXT: b callee_stack0 } -define fastcc void @caller_to0_from8([8 x i32], i64) { +define fastcc void @caller_to0_from8([8 x i64], i64) { ; CHECK-LABEL: caller_to0_from8: tail call fastcc void @callee_stack0() @@ -31,33 +31,33 @@ ; Key point is that the "42" should go #16 below incoming stack ; pointer (we didn't have arg space to reuse). - tail call fastcc void @callee_stack8([8 x i32] undef, i64 42) + tail call fastcc void @callee_stack8([8 x i64] undef, i64 42) ret void ; CHECK: str {{x[0-9]+}}, [sp, #16]! ; CHECK-NEXT: b callee_stack8 } -define fastcc void @caller_to8_from8([8 x i32], i64 %a) { +define fastcc void @caller_to8_from8([8 x i64], i64 %a) { ; CHECK-LABEL: caller_to8_from8: ; CHECK: sub sp, sp, #16 ; Key point is that the "%a" should go where at SP on entry. - tail call fastcc void @callee_stack8([8 x i32] undef, i64 42) + tail call fastcc void @callee_stack8([8 x i64] undef, i64 42) ret void ; CHECK: str {{x[0-9]+}}, [sp, #16]! ; CHECK-NEXT: b callee_stack8 } -define fastcc void @caller_to16_from8([8 x i32], i64 %a) { +define fastcc void @caller_to16_from8([8 x i64], i64 %a) { ; CHECK-LABEL: caller_to16_from8: ; CHECK: sub sp, sp, #16 ; Important point is that the call reuses the "dead" argument space ; above %a on the stack. If it tries to go below incoming-SP then the ; callee will not deallocate the space, even in fastcc. - tail call fastcc void @callee_stack16([8 x i32] undef, i64 42, i64 2) + tail call fastcc void @callee_stack16([8 x i64] undef, i64 42, i64 2) ; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]! ; CHECK-NEXT: b callee_stack16 @@ -65,12 +65,12 @@ } -define fastcc void @caller_to8_from24([8 x i32], i64 %a, i64 %b, i64 %c) { +define fastcc void @caller_to8_from24([8 x i64], i64 %a, i64 %b, i64 %c) { ; CHECK-LABEL: caller_to8_from24: ; CHECK: sub sp, sp, #16 ; Key point is that the "%a" should go where at #16 above SP on entry. 
- tail call fastcc void @callee_stack8([8 x i32] undef, i64 42) + tail call fastcc void @callee_stack8([8 x i64] undef, i64 42) ret void ; CHECK: str {{x[0-9]+}}, [sp, #32]! @@ -78,13 +78,13 @@ } -define fastcc void @caller_to16_from16([8 x i32], i64 %a, i64 %b) { +define fastcc void @caller_to16_from16([8 x i64], i64 %a, i64 %b) { ; CHECK-LABEL: caller_to16_from16: ; CHECK: sub sp, sp, #16 ; Here we want to make sure that both loads happen before the stores: ; otherwise either %a or %b will be wrongly clobbered. - tail call fastcc void @callee_stack16([8 x i32] undef, i64 %b, i64 %a) + tail call fastcc void @callee_stack16([8 x i64] undef, i64 %b, i64 %a) ret void ; CHECK: ldp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16] Index: llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll =================================================================== --- llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll +++ llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll @@ -27,8 +27,8 @@ ; AARCH-NEXT: orr w10, w10, w11 ; AARCH-NEXT: orr w9, w10, w9 ; AARCH-NEXT: mul x0, x0, x2 -; AARCH-NEXT: mov x1, x8 -; AARCH-NEXT: mov w2, w9 +; AARCH-DAG: mov x1, x8 +; AARCH-DAG: mov w2, w9 ; AARCH-NEXT: ret start: %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2 Index: llvm/test/CodeGen/AArch64/win64_vararg.ll =================================================================== --- llvm/test/CodeGen/AArch64/win64_vararg.ll +++ llvm/test/CodeGen/AArch64/win64_vararg.ll @@ -261,11 +261,11 @@ ; CHECK-DAG: mov w6, w3 ; CHECK-DAG: mov [[REG1:w[0-9]+]], w2 ; CHECK: mov w2, w1 -; CHECK: str w4, [sp] ; CHECK: fmov x1, d0 ; CHECK: fmov x3, d1 ; CHECK: fmov x5, d2 ; CHECK: fmov x7, d3 +; CHECK: str w4, [sp] ; CHECK: mov w4, [[REG1]] ; CHECK: str x30, [sp, #16] ; CHECK: str d4, [sp, #8] Index: llvm/test/MC/AArch64/arm64_32-compact-unwind.s =================================================================== --- /dev/null +++ llvm/test/MC/AArch64/arm64_32-compact-unwind.s @@ -0,0 +1,15 @@ +; RUN: llvm-mc -triple=arm64_32-ios7.0 -filetype=obj %s -o %t +; RUN: llvm-objdump -s %t | FileCheck %s + +; The compact unwind format in ILP32 mode is pretty much the same, except +; references to addresses (function, personality, LSDA) are pointer-sized. + +; CHECK: Contents of section __compact_unwind: +; CHECK-NEXT: 0004 00000000 04000000 00000002 00000000 +; CHECK-NEXT: 0014 00000000 + .globl _test_compact_unwind + .align 2 +_test_compact_unwind: + .cfi_startproc + ret + .cfi_endproc Index: llvm/utils/TableGen/CallingConvEmitter.cpp =================================================================== --- llvm/utils/TableGen/CallingConvEmitter.cpp +++ llvm/utils/TableGen/CallingConvEmitter.cpp @@ -264,6 +264,10 @@ Record *DestTy = Action->getValueAsDef("DestTy"); O << IndentStr << "LocVT = " << getEnumName(getValueType(DestTy)) <<";\n"; O << IndentStr << "LocInfo = CCValAssign::BCvt;\n"; + } else if (Action->isSubClassOf("CCTruncToType")) { + Record *DestTy = Action->getValueAsDef("DestTy"); + O << IndentStr << "LocVT = " << getEnumName(getValueType(DestTy)) <<";\n"; + O << IndentStr << "LocInfo = CCValAssign::Trunc;\n"; } else if (Action->isSubClassOf("CCPassIndirect")) { Record *DestTy = Action->getValueAsDef("DestTy"); O << IndentStr << "LocVT = " << getEnumName(getValueType(DestTy)) <<";\n";
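NOTE: the CCTruncToType branch added to CallingConvEmitter above only records the new
location type and tags the assignment with CCValAssign::Trunc; the target's call lowering
is still responsible for materializing the truncation when it copies a value into its
assigned location. A minimal C++ sketch of that consumer side follows. It is illustrative
only: the variable names (VA, Arg, DAG, DL) and the surrounding switch are assumptions for
the sketch, not code taken from this patch.

  // Hypothetical excerpt from a target's LowerCall: adjust the outgoing value
  // to match the location type recorded by the calling convention.
  switch (VA.getLocInfo()) {
  case CCValAssign::Full:
    break; // Value already has the location type.
  case CCValAssign::BCvt:
    Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
    break;
  case CCValAssign::Trunc:
    // Produced by CCTruncToType: the location holds a truncated copy of the
    // value (e.g. a 64-bit quantity carried in a 32-bit slot), so narrow it
    // before the copy into the assigned register or stack slot.
    Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getLocVT(), Arg);
    break;
  default:
    llvm_unreachable("Unhandled CCValAssign::LocInfo");
  }

A calling convention description would opt in from TableGen with the new CCTruncToType
action, e.g. a rule along the lines of CCIfType<[i64], CCTruncToType<i32>> for an
ILP32-style target (again an illustrative rule, not one taken from this patch).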