Index: llvm/trunk/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ llvm/trunk/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -60,6 +60,15 @@
 STATISTIC(NumLDRD2LDR, "Number of ldrd instructions turned back into ldr's");
 STATISTIC(NumSTRD2STR, "Number of strd instructions turned back into str's");
 
+/// This switch disables formation of double/multi instructions that could
+/// potentially lead to (new) alignment traps even with CCR.UNALIGN_TRP
+/// disabled. This can be used to create libraries that are robust even when
+/// users provoke undefined behaviour by supplying misaligned pointers.
+/// \see mayCombineMisaligned()
+static cl::opt<bool>
+AssumeMisalignedLoadStores("arm-assume-misaligned-load-store", cl::Hidden,
+    cl::init(false), cl::desc("Be more conservative in ARM load/store opt"));
+
 namespace llvm {
 void initializeARMLoadStoreOptPass(PassRegistry &);
 }
@@ -916,6 +925,24 @@
   return (Value % 4) == 0 && Value < 1024;
 }
 
+/// Return true for loads/stores that can be combined to a double/multi
+/// operation without increasing the requirements for alignment.
+static bool mayCombineMisaligned(const TargetSubtargetInfo &STI,
+                                 const MachineInstr &MI) {
+  // vldr/vstr trap on misaligned pointers anyway, forming vldm makes no
+  // difference.
+  unsigned Opcode = MI.getOpcode();
+  if (!isi32Load(Opcode) && !isi32Store(Opcode))
+    return true;
+
+  // Stack pointer alignment is out of the programmer's control so we can
+  // trust SP-relative loads/stores.
+  if (getLoadStoreBaseOp(MI).getReg() == ARM::SP &&
+      STI.getFrameLowering()->getTransientStackAlignment() >= 4)
+    return true;
+  return false;
+}
+
 /// Find candidates for load/store multiple merge in list of MemOpQueueEntries.
 void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
   const MachineInstr *FirstMI = MemOps[0].MI;
@@ -954,6 +981,10 @@
     if (PReg == ARM::SP || PReg == ARM::PC)
       CanMergeToLSMulti = CanMergeToLSDouble = false;
 
+    // Should we be conservative?
+    if (AssumeMisalignedLoadStores && !mayCombineMisaligned(*STI, *MI))
+      CanMergeToLSMulti = CanMergeToLSDouble = false;
+
     // Merge following instructions where possible.
     for (unsigned I = SIndex+1; I < EIndex; ++I, ++Count) {
       int NewOffset = MemOps[I].Offset;
@@ -1926,6 +1957,9 @@
                 ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)
 
 bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
+  if (AssumeMisalignedLoadStores)
+    return false;
+
   TD = &Fn.getDataLayout();
   STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget());
   TII = STI->getInstrInfo();
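
[Note: the new switch is a hidden llc option, so it does not appear in -help. It is exercised the same way the updated RUN lines below do, e.g.:

  llc < input.ll -mtriple=thumbv7-apple-ios -arm-assume-misaligned-load-store

It should also be reachable from a compiler driver via the usual -mllvm pass-through; this invocation is illustrative and not part of the patch:

  clang --target=thumbv7-apple-ios -mllvm -arm-assume-misaligned-load-store -c input.c]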
Index: llvm/trunk/test/CodeGen/ARM/ldrd.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/ldrd.ll
+++ llvm/trunk/test/CodeGen/ARM/ldrd.ll
@@ -1,9 +1,11 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=fast -optimize-regalloc=0 -verify-machineinstrs | FileCheck %s -check-prefix=A8 -check-prefix=CHECK
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-m3 -regalloc=fast -optimize-regalloc=0 | FileCheck %s -check-prefix=M3 -check-prefix=CHECK
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=fast -optimize-regalloc=0 -verify-machineinstrs | FileCheck %s -check-prefix=A8 -check-prefix=CHECK -check-prefix=NORMAL
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-m3 -regalloc=fast -optimize-regalloc=0 | FileCheck %s -check-prefix=M3 -check-prefix=CHECK -check-prefix=NORMAL
 ; rdar://6949835
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=basic | FileCheck %s -check-prefix=BASIC -check-prefix=CHECK
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=greedy | FileCheck %s -check-prefix=GREEDY -check-prefix=CHECK
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=swift | FileCheck %s -check-prefix=SWIFT -check-prefix=CHECK
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=basic | FileCheck %s -check-prefix=BASIC -check-prefix=CHECK -check-prefix=NORMAL
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=greedy | FileCheck %s -check-prefix=GREEDY -check-prefix=CHECK -check-prefix=NORMAL
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=swift | FileCheck %s -check-prefix=SWIFT -check-prefix=CHECK -check-prefix=NORMAL
+
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -arm-assume-misaligned-load-store | FileCheck %s -check-prefix=CHECK -check-prefix=CONSERVATIVE
 
 ; Magic ARM pair hints works best with linearscan / fast.
 
@@ -15,12 +17,13 @@
 
 define void @test_ldrd(i64 %a) nounwind readonly {
 ; CHECK-LABEL: test_ldrd:
-; CHECK: bl{{x?}} _get_ptr
+; NORMAL: bl{{x?}} _get_ptr
 ; A8: ldrd r0, r1, [r0]
 ; Cortex-M3 errata 602117: LDRD with base in list may result in incorrect base
 ; register when interrupted or faulted.
 ; M3-NOT: ldrd r[[REGNUM:[0-9]+]], {{r[0-9]+}}, [r[[REGNUM]]]
-; CHECK: bl{{x?}} _use_i64
+; CONSERVATIVE-NOT: ldrd
+; NORMAL: bl{{x?}} _use_i64
   %ptr = call i64* @get_ptr()
   %v = load i64, i64* %ptr, align 8
   call void @use_i64(i64 %v)
@@ -39,11 +42,10 @@
 ; evict another live range or use callee saved regs. Sorry if the test
 ; is sensitive to Regalloc changes, but it is an interesting case.
 ;
-; BASIC: @f
+; CHECK-LABEL: f:
 ; BASIC: %bb
 ; BASIC: ldrd
 ; BASIC: str
-; GREEDY: @f
 ; GREEDY: %bb
 ; GREEDY: ldrd
 ; GREEDY: str
@@ -76,14 +78,15 @@
 @TestVar = external global %struct.Test
 
+; CHECK-LABEL: Func1:
 define void @Func1() nounwind ssp {
-; CHECK: @Func1
 entry:
 ; A8: movw [[BASE:r[0-9]+]], :lower16:{{.*}}TestVar{{.*}}
 ; A8: movt [[BASE]], :upper16:{{.*}}TestVar{{.*}}
 ; A8: ldrd [[FIELD1:r[0-9]+]], [[FIELD2:r[0-9]+]], {{\[}}[[BASE]], #4]
 ; A8-NEXT: add [[FIELD1]], [[FIELD2]]
 ; A8-NEXT: str [[FIELD1]], {{\[}}[[BASE]]{{\]}}
+; CONSERVATIVE-NOT: ldrd
   %orig_blocks = alloca [256 x i16], align 2
   %0 = bitcast [256 x i16]* %orig_blocks to i8*
   call void @llvm.lifetime.start(i64 512, i8* %0) nounwind
   %tmp1 = load i32, i32* getelementptr inbounds (%struct.Test, %struct.Test* @TestVar, i32 0, i32 1), align 4
@@ -97,8 +100,9 @@
 declare void @extfunc(i32, i32, i32, i32)
 
 ; CHECK-LABEL: Func2:
+; CONSERVATIVE-NOT: ldrd
 ; A8: ldrd
-; A8: blx
+; CHECK: bl{{x?}} _extfunc
 ; A8: pop
 define void @Func2(i32* %p) {
 entry:
@@ -116,12 +120,14 @@
 ; M3: strd r1, r0, [sp, #-8]!
 ; BASIC: strd r1, r0, [sp, #-8]!
 ; GREEDY: strd r0, r1, [sp, #-8]!
-; CHECK: @ InlineAsm Start
-; CHECK: @ InlineAsm End
+; CONSERVATIVE: strd r0, r1, [sp, #-8]!
+; NORMAL: @ InlineAsm Start
+; NORMAL: @ InlineAsm End
 ; A8: ldrd r2, r1, [sp]
 ; M3: ldrd r2, r1, [sp]
 ; BASIC: ldrd r2, r1, [sp]
 ; GREEDY: ldrd r1, r2, [sp]
+; CONSERVATIVE: ldrd r1, r2, [sp]
 ; CHECK: bl{{x?}} _extfunc
 define void @strd_spill_ldrd_reload(i32 %v0, i32 %v1) {
   ; force %v0 and %v1 to be spilled
@@ -134,8 +140,9 @@
 declare void @extfunc2(i32*, i32, i32)
 
 ; CHECK-LABEL: ldrd_postupdate_dec:
-; CHECK: ldrd r1, r2, [r0], #-8
-; CHECK-NEXT: bl{{x?}} _extfunc
+; NORMAL: ldrd r1, r2, [r0], #-8
+; CONSERVATIVE-NOT: ldrd
+; CHECK: bl{{x?}} _extfunc
 define void @ldrd_postupdate_dec(i32* %p0) {
   %p0.1 = getelementptr i32, i32* %p0, i32 1
   %v0 = load i32, i32* %p0
@@ -146,8 +153,9 @@
 }
 
 ; CHECK-LABEL: ldrd_postupdate_inc:
-; CHECK: ldrd r1, r2, [r0], #8
-; CHECK-NEXT: bl{{x?}} _extfunc
+; NORMAL: ldrd r1, r2, [r0], #8
+; CONSERVATIVE-NOT: ldrd
+; CHECK: bl{{x?}} _extfunc
 define void @ldrd_postupdate_inc(i32* %p0) {
   %p0.1 = getelementptr i32, i32* %p0, i32 1
   %v0 = load i32, i32* %p0
@@ -158,8 +166,9 @@
 }
 
 ; CHECK-LABEL: strd_postupdate_dec:
-; CHECK: strd r1, r2, [r0], #-8
-; CHECK-NEXT: bx lr
+; NORMAL: strd r1, r2, [r0], #-8
+; CONSERVATIVE-NOT: strd
+; CHECK: bx lr
 define i32* @strd_postupdate_dec(i32* %p0, i32 %v0, i32 %v1) {
   %p0.1 = getelementptr i32, i32* %p0, i32 1
   store i32 %v0, i32* %p0
@@ -169,8 +178,9 @@
 }
 
 ; CHECK-LABEL: strd_postupdate_inc:
-; CHECK: strd r1, r2, [r0], #8
-; CHECK-NEXT: bx lr
+; NORMAL: strd r1, r2, [r0], #8
+; CONSERVATIVE-NOT: strd
+; CHECK: bx lr
 define i32* @strd_postupdate_inc(i32* %p0, i32 %v0, i32 %v1) {
   %p0.1 = getelementptr i32, i32* %p0, i32 1
   store i32 %v0, i32* %p0
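
[Note: a minimal C sketch of the hazard the new CONSERVATIVE checks pin down; the function and the misaligned caller are hypothetical, not taken from the patch. With alignment trapping disabled, plain ldr/str tolerate a misaligned pointer on ARMv7, but ldrd/strd and ldm/stm always fault, so merging the two loads below into an ldrd would introduce a new trap for callers that provoke undefined behaviour:

  /* Two adjacent word loads. The load/store optimizer would normally
     merge them into a single ldrd; under -arm-assume-misaligned-load-store
     they stay two ldr instructions, which still work even if a caller
     passes something like p = (int *)((char *)buf + 2). */
  long long read_pair(const int *p) {
    return (long long)p[0] + p[1];
  }]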
Index: llvm/trunk/test/CodeGen/ARM/swift-vldm.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/swift-vldm.ll
+++ llvm/trunk/test/CodeGen/ARM/swift-vldm.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mcpu=swift -mtriple=armv7s-apple-ios | FileCheck %s
+; RUN: llc < %s -arm-assume-misaligned-load-store -mcpu=swift -mtriple=armv7s-apple-ios | FileCheck %s
 ; Check that we avoid producing vldm instructions using d registers that
 ; begin in the most-significant half of a q register. These require more
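
[Note: the extra RUN line reuses the existing CHECK lines unchanged: per the comment in mayCombineMisaligned(), vldr/vstr trap on misaligned pointers anyway, so forming vldm cannot introduce a new alignment trap and the conservative mode leaves it alone. A hypothetical C sketch of code that may therefore still be combined:

  /* Floating-point loads already require an aligned address, so these
     may still merge into a vldm even in the conservative mode. */
  double sum2d(const double *p) {
    return p[0] + p[1];
  }]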
Index: llvm/trunk/test/CodeGen/Thumb2/thumb2-ldm.ll
===================================================================
--- llvm/trunk/test/CodeGen/Thumb2/thumb2-ldm.ll
+++ llvm/trunk/test/CodeGen/Thumb2/thumb2-ldm.ll
@@ -1,12 +1,15 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mattr=+thumb2 | FileCheck %s -check-prefix=ALL -check-prefix=CHECK
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mattr=+thumb2 -arm-assume-misaligned-load-store | FileCheck %s -check-prefix=ALL -check-prefix=CONSERVATIVE
 
 @X = external global [0 x i32]          ; <[0 x i32]*> [#uses=5]
 
 define i32 @t1() {
-; CHECK-LABEL: t1:
-; CHECK: push {r7, lr}
+; ALL-LABEL: t1:
+; ALL: push {r7, lr}
 ; CHECK: ldrd
-; CHECK: pop {r7, pc}
+; CONSERVATIVE-NOT: ldrd
+; CONSERVATIVE-NOT: ldm
+; ALL: pop {r7, pc}
   %tmp = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 0)   ; <i32> [#uses=1]
   %tmp3 = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 1)  ; <i32> [#uses=1]
   %tmp4 = call i32 @f1( i32 %tmp, i32 %tmp3 )                                    ; <i32> [#uses=1]
@@ -14,10 +17,12 @@
 }
 
 define i32 @t2() {
-; CHECK-LABEL: t2:
-; CHECK: push {r7, lr}
+; ALL-LABEL: t2:
+; ALL: push {r7, lr}
 ; CHECK: ldm
-; CHECK: pop {r7, pc}
+; CONSERVATIVE-NOT: ldrd
+; CONSERVATIVE-NOT: ldm
+; ALL: pop {r7, pc}
   %tmp = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 2)   ; <i32> [#uses=1]
   %tmp3 = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 3)  ; <i32> [#uses=1]
   %tmp5 = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 4)  ; <i32> [#uses=1]
@@ -26,10 +31,12 @@
 }
 
 define i32 @t3() {
-; CHECK-LABEL: t3:
-; CHECK: push {r7, lr}
+; ALL-LABEL: t3:
+; ALL: push {r7, lr}
 ; CHECK: ldm
-; CHECK: pop {r7, pc}
+; CONSERVATIVE-NOT: ldrd
+; CONSERVATIVE-NOT: ldm
+; ALL: pop {r7, pc}
   %tmp = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 1)   ; <i32> [#uses=1]
   %tmp3 = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 2)  ; <i32> [#uses=1]
   %tmp5 = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 3)  ; <i32> [#uses=1]
@@ -37,6 +44,34 @@
   ret i32 %tmp6
 }
 
+@g = common global i32* null
+
+define void @t4(i32 %a0, i32 %a1, i32 %a2) {
+; ALL-LABEL: t4:
+; ALL: stm.w sp, {r0, r1, r2}
+; ALL: blx _ext
+; ALL: ldm.w sp, {r0, r1, r2}
+; ALL: blx _f2
+  %arr = alloca [4 x i32], align 4
+  %p0 = getelementptr inbounds [4 x i32], [4 x i32]* %arr, i64 0, i64 0
+  %p1 = getelementptr inbounds [4 x i32], [4 x i32]* %arr, i64 0, i64 1
+  %p2 = getelementptr inbounds [4 x i32], [4 x i32]* %arr, i64 0, i64 2
+  store i32* %p0, i32** @g, align 8
+
+  store i32 %a0, i32* %p0, align 4
+  store i32 %a1, i32* %p1, align 4
+  store i32 %a2, i32* %p2, align 4
+  call void @ext()
+
+  %v0 = load i32, i32* %p0, align 4
+  %v1 = load i32, i32* %p1, align 4
+  %v2 = load i32, i32* %p2, align 4
+  call i32 @f2(i32 %v0, i32 %v1, i32 %v2)
+  ret void
+}
+
 declare i32 @f1(i32, i32)
 
 declare i32 @f2(i32, i32, i32)
+
+declare void @ext()
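
[Note: the new t4 test covers the other exception in mayCombineMisaligned(): stack slot alignment is chosen by the compiler, not the user, so SP-relative accesses may still merge into stm/ldm under the flag. A rough C analogue of t4, assuming the same lowering (ext, f2 and g mirror the declarations in the test):

  extern void ext(void);
  extern int f2(int, int, int);
  int *g;

  void t4(int a0, int a1, int a2) {
    int arr[4];   /* stack slot; the compiler controls its alignment */
    g = arr;      /* escape the array so the values must live in memory */
    arr[0] = a0; arr[1] = a1; arr[2] = a2;
    ext();
    f2(arr[0], arr[1], arr[2]);
  }]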