Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -976,6 +976,14 @@
     return false;
   }
 
+  /// Return true if the alloca arguments to CI should be aligned. If so then
+  /// AllocaSize is set to the minimum size the allocated object must be to be
+  /// aligned and AllocaAlign is set to the alignment the alloca is to be given.
+  virtual bool shouldAlignAllocaArgs(CallInst */*CI*/, unsigned &/*AllocaSize*/,
+                                     unsigned &/*AllocaAlign*/) const {
+    return false;
+  }
+
   //===--------------------------------------------------------------------===//
   /// \name Helpers for TargetTransformInfo implementations
   /// @{
Index: lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- lib/CodeGen/CodeGenPrepare.cpp
+++ lib/CodeGen/CodeGenPrepare.cpp
@@ -1228,6 +1228,40 @@
     return true;
   }
 
+  const DataLayout *TD = TLI ? TLI->getDataLayout() : nullptr;
+
+  // Align the alloca arguments to this call if the target thinks it's a good
+  // idea
+  unsigned AllocaSize = 0, AllocaAlign = 0;
+  if (TLI && TD && TLI->shouldAlignAllocaArgs(CI, AllocaSize, AllocaAlign)) {
+    assert(AllocaAlign != 0 && "shouldAlignAllocaArgs must set AllocaAlign");
+    for (auto &Arg : CI->arg_operands()) {
+      // We want to align both direct allocas and allocas used in casts and
+      // GEPs, though it only makes sense for GEPs if the offset is a multiple
+      // of the desired alignment and if size - offset meets the size threshold.
+      if (!Arg->getType()->isPointerTy())
+        continue;
+      APInt Offset(TD->getPointerSizeInBits(
+          cast<PointerType>(Arg->getType())->getAddressSpace()), 0);
+      Value *Val = Arg->stripAndAccumulateInBoundsConstantOffsets(*TD, Offset);
+      uint64_t Offset2 = Offset.getLimitedValue();
+      AllocaInst *AI;
+      if ((Offset2 & (AllocaAlign-1)) == 0 &&
+          (AI = dyn_cast<AllocaInst>(Val)) &&
+          AI->getAlignment() < AllocaAlign &&
+          TD->getTypeAllocSize(AI->getAllocatedType()) - Offset2 >= AllocaSize)
+        AI->setAlignment(AllocaAlign);
+    }
+    // If this is a memcpy (or similar) then we may have improved the alignment
+    if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(CI)) {
+      unsigned Align = getKnownAlignment(MI->getDest());
+      if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI))
+        Align = std::min(Align, getKnownAlignment(MTI->getSource()));
+      if (Align > MI->getAlignment())
+        MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), Align));
+    }
+  }
+
   IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
   if (II) {
     switch (II->getIntrinsicID()) {
@@ -1288,7 +1322,6 @@
   if (!CI->getCalledFunction()) return false;
 
   // We'll need DataLayout from here on out.
-  const DataLayout *TD = TLI ? TLI->getDataLayout() : nullptr;
   if (!TD) return false;
 
   // Lower all default uses of _chk calls.  This is very similar
Index: lib/Target/ARM/ARMISelLowering.h
===================================================================
--- lib/Target/ARM/ARMISelLowering.h
+++ lib/Target/ARM/ARMISelLowering.h
@@ -362,6 +362,9 @@
       return true;
     }
 
+    bool shouldAlignAllocaArgs(CallInst *CI, unsigned &AllocaSize,
+                               unsigned &AllocaAlign) const override;
+
     /// createFastISel - This method returns a target specific FastISel object,
     /// or null if the target does not support "fast" ISel.
     FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -1165,6 +1165,17 @@
   return TargetLowering::getRegClassFor(VT);
 }
 
+// Arrays (or other objects) whose address leaks into another function may end
+// up being memcpy'd there. memcpy typically tries to use LDM/STM if the
+// source/dest is aligned and the copy size is large enough. We therefore want
+// to align such objects.
+bool ARMTargetLowering::shouldAlignAllocaArgs(CallInst *, unsigned &AllocaSize,
+                                              unsigned &AllocaAlign) const {
+  AllocaSize = 8;
+  AllocaAlign = 4;
+  return true;
+}
+
 // Create a fast isel object.
 FastISel *
 ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
Index: test/CodeGen/ARM/stack-object-align.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/stack-object-align.ll
@@ -0,0 +1,99 @@
+; RUN: llc -mtriple=arm-eabi < %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv7-eabi < %s -o - | FileCheck %s
+
+; Expect that small arrays are not aligned when passed to a function
+define void @test1() {
+entry:
+  %arr1 = alloca [3 x i8], align 1
+  %arr2 = alloca [3 x i8], align 1
+
+; CHECK: add{{(\.w)?}} r0, sp, #5
+  %arraydecay = getelementptr inbounds [3 x i8], [3 x i8]* %arr1, i32 0, i32 0
+; CHECK: bl takeptr
+  call void @takeptr(i8* %arraydecay)
+
+; CHECK: add{{(\.w)?}} r0, sp, #2
+  %arraydecay1 = getelementptr inbounds [3 x i8], [3 x i8]* %arr2, i32 0, i32 0
+; CHECK: bl takeptr
+  call void @takeptr(i8* %arraydecay1)
+
+  ret void
+}
+
+; Expect that larger arrays are aligned when passed to a function
+define void @test2() {
+entry:
+  %arr1 = alloca [9 x i8], align 1
+  %arr2 = alloca [9 x i8], align 1
+
+; CHECK: add{{(\.w)?}} r0, sp, #12
+  %arraydecay = getelementptr inbounds [9 x i8], [9 x i8]* %arr1, i32 0, i32 0
+; CHECK: bl takeptr
+  call void @takeptr(i8* %arraydecay)
+
+; CHECK: mov r0, sp
+  %arraydecay1 = getelementptr inbounds [9 x i8], [9 x i8]* %arr2, i32 0, i32 0
+; CHECK: bl takeptr
+  call void @takeptr(i8* %arraydecay1)
+
+  ret void
+}
+
+; Expect that larger arrays only accessed through array access are not aligned
+define void @test3() {
+entry:
+  %arr1 = alloca [9 x i8], align 1
+  %arr2 = alloca [9 x i8], align 1
+
+; CHECK: strb{{(\.w)?}} {{r[0-9]+}}, [sp, #11]
+  %arrayidx = getelementptr inbounds [9 x i8], [9 x i8]* %arr1, i32 0, i32 0
+  store i8 1, i8* %arrayidx, align 1
+
+; CHECK: strb{{(\.w)?}} {{r[0-9]+}}, [sp, #2]
+  %arrayidx1 = getelementptr inbounds [9 x i8], [9 x i8]* %arr2, i32 0, i32 0
+  store i8 1, i8* %arrayidx1, align 1
+
+  ret void
+}
+
+; Expect that when an element of a larger array is passed to a function the array is aligned
+; if the offset is a multiple of 4
+define void @test4() {
+entry:
+  %arr1 = alloca [13 x i8], align 1
+  %arr2 = alloca [13 x i8], align 1
+
+; CHECK: add{{(\.w)?}} r0, sp, #16
+  %arrayelem = getelementptr inbounds [13 x i8], [13 x i8]* %arr1, i32 0, i32 4
+; CHECK: bl takeptr
+  call void @takeptr(i8* %arrayelem)
+
+; CHECK: mov r0, sp
+  %arrayelem1 = getelementptr inbounds [13 x i8], [13 x i8]* %arr2, i32 0, i32 4
+; CHECK: bl takeptr
+  call void @takeptr(i8* %arrayelem1)
+
+  ret void
+}
+
+; Expect that when an element of a larger array is passed to a function the array is not
+; aligned if the offset is a not multiple of 4
+define void @test5() {
+entry:
+  %arr1 = alloca [13 x i8], align 1
+  %arr2 = alloca [13 x i8], align 1
+
+; CHECK: add{{(\.w)?}} r0, sp, #19
+  %arrayelem = getelementptr inbounds [13 x i8], [13 x i8]* %arr1, i32 0, i32 3
+; CHECK: bl takeptr
+  call void @takeptr(i8* %arrayelem)
+
+; CHECK: add{{(\.w)?}} r0, sp, #6
+  %arrayelem1 = getelementptr inbounds [13 x i8], [13 x i8]* %arr2, i32 0, i32 3
+; CHECK: bl takeptr
+  call void @takeptr(i8* %arrayelem1)
+
+  ret void
+}
+
+declare void @takeptr(i8*)