Index: lib/Target/ARM/ARMLoadStoreOptimizer.cpp
===================================================================
--- lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -1995,9 +1995,11 @@
   unsigned Align = (*Op0->memoperands_begin())->getAlignment();
   const Function *Func = MF->getFunction();
-  unsigned ReqAlign = STI->hasV6Ops()
-    ? TD->getABITypeAlignment(Type::getInt64Ty(Func->getContext()))
-    : 8;  // Pre-v6 need 8-byte align
+  unsigned ReqAlign = STI->hasV6Ops() ? 4 : 8; // Pre-v6 need 8-byte align
+  // LDRD may actually be slower than a pair of two LDRs if the memory is not
+  // 8 byte aligned on CortexA9.
+  if (STI->isCortexA9())
+    ReqAlign = TD->getABITypeAlignment(Type::getInt64Ty(Func->getContext()));
   if (Align < ReqAlign)
     return false;
Index: test/CodeGen/Thumb2/thumb2-ldrd.ll
===================================================================
--- test/CodeGen/Thumb2/thumb2-ldrd.ll
+++ test/CodeGen/Thumb2/thumb2-ldrd.ll
@@ -2,12 +2,26 @@
 
 @b = external global i64*
 
-define i64 @t(i64 %a) nounwind readonly {
-entry:
+; CHECK-LABEL: foo
 ; CHECK: ldrd
 ; CHECK: umull
+define i64 @foo(i64 %a) nounwind readonly {
+entry:
   %0 = load i64*, i64** @b, align 4
   %1 = load i64, i64* %0, align 4
   %2 = mul i64 %1, %a
   ret i64 %2
 }
+
+; CHECK-LABEL: bar
+; CHECK: ldrd
+; CHECK-NEXT: add
+; CHECK-NEXT: bx
+define i32 @bar(i32* %x) {
+  %addr0 = getelementptr i32, i32* %x, i32 0
+  %addr1 = getelementptr i32, i32* %x, i32 1
+  %v0 = load i32, i32* %addr0
+  %v1 = load i32, i32* %addr1
+  %sum = add i32 %v0, %v1
+  ret i32 %sum
+}
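
For reference, the alignment policy after this change can be summarized in a small standalone C++ sketch. This is a hedged restatement, not code taken from ARMLoadStoreOptimizer.cpp: the free functions and parameter names below are hypothetical, and the i64 ABI alignment is assumed to be 8 bytes as on AAPCS targets.

// Hypothetical restatement of the ReqAlign logic introduced by the patch.
//  - pre-v6 cores:  LDRD/STRD need an 8-byte aligned address.
//  - v6 and later:  4-byte alignment is architecturally sufficient.
//  - Cortex-A9:     unaligned LDRD can be slower than two LDRs, so keep
//                   requiring the i64 ABI alignment (assumed 8 here).
static unsigned requiredLdrdAlign(bool HasV6Ops, bool IsCortexA9,
                                  unsigned Int64ABIAlign) {
  unsigned ReqAlign = HasV6Ops ? 4 : 8; // Pre-v6 need 8-byte align.
  if (IsCortexA9)
    ReqAlign = Int64ABIAlign;           // Avoid slow unaligned LDRD on A9.
  return ReqAlign;
}

// Mirrors the patched "if (Align < ReqAlign) return false;" guard.
static bool ldrdAlignmentOK(unsigned Align, bool HasV6Ops, bool IsCortexA9) {
  return Align >= requiredLdrdAlign(HasV6Ops, IsCortexA9, /*Int64ABIAlign=*/8);
}

int main() {
  // Cortex-A9 with only a 4-byte aligned pair: do not form LDRD.
  bool A9With4 = ldrdAlignmentOK(/*Align=*/4, /*HasV6Ops=*/true, /*IsCortexA9=*/true);
  // Other v6+ cores with a 4-byte aligned pair: LDRD is allowed.
  bool V6With4 = ldrdAlignmentOK(/*Align=*/4, /*HasV6Ops=*/true, /*IsCortexA9=*/false);
  return (!A9With4 && V6With4) ? 0 : 1;
}

The new bar test covers the relaxed case: its two adjacent i32 loads carry only the default 4-byte alignment, which before this patch was below the required i64 ABI alignment and therefore never merged. Assuming the test's RUN line (not part of this hunk) targets a non-Cortex-A9 Thumb-2 core, the pair should now be combined into a single ldrd.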