Index: lib/Target/ARM/ARM.h
===================================================================
--- lib/Target/ARM/ARM.h
+++ lib/Target/ARM/ARM.h
@@ -39,6 +39,7 @@
 FunctionPass *createThumb2ITBlockPass();
 FunctionPass *createARMOptimizeBarriersPass();
 FunctionPass *createThumb2SizeReductionPass();
+FunctionPass *createARMAlignAllocaPass(const ARMBaseTargetMachine &TM);
 
 void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
                                   ARMAsmPrinter &AP);
Index: lib/Target/ARM/ARMAlignAllocaPass.cpp
===================================================================
--- /dev/null
+++ lib/Target/ARM/ARMAlignAllocaPass.cpp
@@ -0,0 +1,114 @@
+//===-- ARMAlignAllocaPass.cpp - Align Alloca instructions ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Arrays (or other objects) whose address leaks into another function may end
+// up being memcpy'd there. memcpy typically tries to use LDM/STM if the
+// source/dest is aligned and the copy size is large enough. We therefore want
+// to align such objects. This pass looks for alloca instructions that fit these
+// criteria and aligns them.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMTargetMachine.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+
+using namespace llvm;
+
+namespace {
+  /// Function pass that raises the alignment of allocas whose address is
+  /// passed (directly, or as an operand of an argument) to a call, and then
+  /// raises the alignment recorded on memory intrinsics to match.
+  class ARMAlignAlloca : public FunctionPass,
+                         public InstVisitor<ARMAlignAlloca> {
+    const DataLayout *DL;
+    /// Set whenever the IR of the function being visited has been modified.
+    bool Changed;
+
+  public:
+    static char ID;
+
+    explicit ARMAlignAlloca(const ARMBaseTargetMachine &TM)
+        : FunctionPass(ID), DL(TM.getDataLayout()), Changed(false) {}
+
+    bool runOnFunction(Function &F) override;
+    const char *getPassName() const override { return "ARM Align Alloca"; }
+
+    /// Only alignment values are rewritten, so the CFG is left untouched.
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.setPreservesCFG();
+    }
+
+    void visitCallInst(CallInst &CI);
+    void tryAlignAlloca(Value *Val);
+  };
+}
+
+char ARMAlignAlloca::ID = 0;
+
+/// Visit every call in \p F and report whether any alignment was changed.
+bool ARMAlignAlloca::runOnFunction(Function &F) {
+  Changed = false;
+  visit(F);
+  return Changed;
+}
+
+/// If \p Val is an AllocaInst, and the allocated object is large enough that
+/// we think memcpy would want to use LDM/STM, then align it to 4 bytes.
+void ARMAlignAlloca::tryAlignAlloca(Value *Val) {
+  AllocaInst *AI = dyn_cast<AllocaInst>(Val);
+  if (!AI)
+    return;
+
+  Type *Ty = AI->getAllocatedType();
+  // An alignment of 0 means none was specified and the object is aligned
+  // compatibly with its type; use the ABI alignment so it is never reduced.
+  unsigned Align = AI->getAlignment();
+  if (Align == 0)
+    Align = DL->getABITypeAlignment(Ty);
+
+  if (Align < 4 && DL->getTypeAllocSize(Ty) >= 8) {
+    AI->setAlignment(4);
+    Changed = true;
+  }
+}
+
+/// Align any allocas reachable from the arguments of \p CI and, if \p CI is a
+/// memory intrinsic, raise its alignment to what is now known.
+void ARMAlignAlloca::visitCallInst(CallInst &CI) {
+  // Align any allocas used as arguments, either directly as argument operands
+  // or as operands to those arguments.
+  for (auto &Arg : CI.arg_operands()) {
+    tryAlignAlloca(Arg.get());
+    if (User *U = dyn_cast<User>(Arg.get()))
+      for (auto *Val : U->operand_values())
+        tryAlignAlloca(Val);
+  }
+
+  // If this is a memcpy (or similar) then we may have improved the alignment.
+  MemIntrinsic *MemI = dyn_cast<MemIntrinsic>(&CI);
+  if (!MemI)
+    return;
+
+  unsigned Align = getKnownAlignment(MemI->getDest());
+  if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MemI))
+    Align = std::min(Align, getKnownAlignment(MTI->getSource()));
+  if (Align > MemI->getAlignment()) {
+    MemI->setAlignment(ConstantInt::get(MemI->getAlignmentType(), Align));
+    Changed = true;
+  }
+}
+
+FunctionPass *llvm::createARMAlignAllocaPass(const ARMBaseTargetMachine &TM) {
+  return new ARMAlignAlloca(TM);
+}
Index: lib/Target/ARM/ARMTargetMachine.cpp
===================================================================
--- lib/Target/ARM/ARMTargetMachine.cpp
+++ lib/Target/ARM/ARMTargetMachine.cpp
@@ -295,6 +295,7 @@
   }
 
   void addIRPasses() override;
+  void addCodeGenPrepare() override;
   bool addPreISel() override;
   bool addInstSelector() override;
   void addPreRegAlloc() override;
@@ -324,6 +325,13 @@
   TargetPassConfig::addIRPasses();
 }
 
+// Schedule the ARM alloca-alignment pass, then let the base TargetPassConfig
+// implementation add its own CodeGenPrepare passes.
+void ARMPassConfig::addCodeGenPrepare() {
+  addPass(createARMAlignAllocaPass(getARMTargetMachine()));
+  TargetPassConfig::addCodeGenPrepare();
+}
+
 bool ARMPassConfig::addPreISel() {
   if (TM->getOptLevel() != CodeGenOpt::None)
     // FIXME: This is using the thumb1 only constant value for
Index: lib/Target/ARM/CMakeLists.txt
===================================================================
--- lib/Target/ARM/CMakeLists.txt
+++ lib/Target/ARM/CMakeLists.txt
@@ -15,6 +15,7 @@
 
 add_llvm_target(ARMCodeGen
   A15SDOptimizer.cpp
+  ARMAlignAllocaPass.cpp
   ARMAsmPrinter.cpp
   ARMBaseInstrInfo.cpp
   ARMBaseRegisterInfo.cpp
Index: test/CodeGen/ARM/stack-object-align.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/stack-object-align.ll
@@ -0,0 +1,59 @@
+; RUN: llc -mtriple=arm-eabi < %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv7-eabi < %s -o - | FileCheck %s
+
+; Expect that small (3-byte) arrays are not aligned when passed to a function
+define void @test1() {
+entry:
+  %arr1 = alloca [3 x i8], align 1
+  %arr2 = alloca [3 x i8], align 1
+
+; CHECK: add{{(\.w)?}} r0, sp, #5
+  %arraydecay = getelementptr inbounds [3 x i8], [3 x i8]* %arr1, i32 0, i32 0
+; CHECK: bl takeptr
+  call void @takeptr(i8* %arraydecay)
+
+; CHECK: add{{(\.w)?}} r0, sp, #2
+  %arraydecay1 = getelementptr inbounds [3 x i8], [3 x i8]* %arr2, i32 0, i32 0
+; CHECK: bl takeptr
+  call void @takeptr(i8* %arraydecay1)
+
+  ret void
+}
+
+; Expect that 9-byte arrays are aligned to 4 bytes when passed to a function
+define void @test2() {
+entry:
+  %arr1 = alloca [9 x i8], align 1
+  %arr2 = alloca [9 x i8], align 1
+
+; CHECK: add{{(\.w)?}} r0, sp, #12
+  %arraydecay = getelementptr inbounds [9 x i8], [9 x i8]* %arr1, i32 0, i32 0
+; CHECK: bl takeptr
+  call void @takeptr(i8* %arraydecay)
+
+; CHECK: mov r0, sp
+  %arraydecay1 = getelementptr inbounds [9 x i8], [9 x i8]* %arr2, i32 0, i32 0
+; CHECK: bl takeptr
+  call void @takeptr(i8* %arraydecay1)
+
+  ret void
+}
+
+; Expect that 9-byte arrays only accessed by element stores are not aligned
+define void @test3() {
+entry:
+  %arr1 = alloca [9 x i8], align 1
+  %arr2 = alloca [9 x i8], align 1
+
+; CHECK: strb{{(\.w)?}} {{r[0-9]+}}, [sp, #11]
+  %arrayidx = getelementptr inbounds [9 x i8], [9 x i8]* %arr1, i32 0, i32 0
+  store i8 1, i8* %arrayidx, align 1
+
+; CHECK: strb{{(\.w)?}} {{r[0-9]+}}, [sp, #2]
+  %arrayidx1 = getelementptr inbounds [9 x i8], [9 x i8]* %arr2, i32 0, i32 0
+  store i8 1, i8* %arrayidx1, align 1
+
+  ret void
+}
+
+declare void @takeptr(i8*)