Index: lib/Target/ARM/ARMLoadStoreOptimizer.cpp
===================================================================
--- lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -60,6 +60,13 @@
 STATISTIC(NumLDRD2LDR, "Number of ldrd instructions turned back into ldr's");
 STATISTIC(NumSTRD2STR, "Number of strd instructions turned back into str's");
 
+/// This switch disables formation of double/multi instructions that could
+/// potentially lead to (new) alignment traps even with CCR.UNALIGN_TRP
+/// disabled. \see isConservativelyFine()
+static cl::opt<bool>
+ConservativeARMLoadStoreOpt("arm-conservative-load-store", cl::Hidden,
+    cl::init(false), cl::desc("Be more conservative in ARM load/store opt"));
+
 namespace llvm {
 void initializeARMLoadStoreOptPass(PassRegistry &);
 }
@@ -916,6 +923,25 @@
   return (Value % 4) == 0 && Value < 1024;
 }
 
+/// Return true for loads/stores that can be combined to a double/multi
+/// operation without increasing the chances for traps because of unaligned
+/// pointers.
+static bool isConservativelyFine(const TargetSubtargetInfo &STI,
+                                 const MachineInstr &MI) {
+  // vldr/vstr trap on misaligned pointers anyway, so we won't make things
+  // worse by forming multi/double variants.
+  unsigned Opcode = MI.getOpcode();
+  if (!isi32Load(Opcode) && !isi32Store(Opcode))
+    return true;
+
+  // Stack pointer alignment is out of the programmer's control, so we can
+  // trust SP-relative loads/stores.
+  if (getLoadStoreBaseOp(MI).getReg() == ARM::SP &&
+      STI.getFrameLowering()->getTransientStackAlignment() >= 4)
+    return true;
+  return false;
+}
+
 /// Find candidates for load/store multiple merge in list of MemOpQueueEntries.
 void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) {
   const MachineInstr *FirstMI = MemOps[0].MI;
@@ -954,6 +980,10 @@
     if (PReg == ARM::SP || PReg == ARM::PC)
       CanMergeToLSMulti = CanMergeToLSDouble = false;
 
+    // Should we be conservative?
+    if (ConservativeARMLoadStoreOpt && !isConservativelyFine(*STI, *MI))
+      CanMergeToLSMulti = CanMergeToLSDouble = false;
+
     // Merge following instructions where possible.
     for (unsigned I = SIndex+1; I < EIndex; ++I, ++Count) {
       int NewOffset = MemOps[I].Offset;
@@ -1926,6 +1956,9 @@
                 ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)
 
 bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
+  if (ConservativeARMLoadStoreOpt)
+    return false;
+
   TD = &Fn.getDataLayout();
   STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget());
   TII = STI->getInstrInfo();
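
Why this is needed: ARM's ldrd/strd and ldm/stm encodings fault on addresses
that are not word-aligned, even on cores where plain ldr/str perform unaligned
accesses without trapping (e.g. with CCR.UNALIGN_TRP clear on M-profile). So
merging two ldr instructions into an ldrd can turn code that worked, if only by
accident, into code that traps. A minimal sketch of such an input, not taken
from the patch (function and value names are invented):

  define i32 @sum_pair(i32* %p) {
    %p1 = getelementptr i32, i32* %p, i32 1
    ; Each load is a single ldr, which tolerates a misaligned %p at run time
    ; (the "align 4" promise is then broken, but the code happens to work).
    %a = load i32, i32* %p, align 4
    %b = load i32, i32* %p1, align 4
    ; An ldrd formed from the two loads would fault when %p is misaligned.
    %s = add i32 %a, %b
    ret i32 %s
  }

With -arm-conservative-load-store, FormCandidates() clears CanMergeToLSMulti
and CanMergeToLSDouble for this pair, because isConservativelyFine() sees an
i32 load whose base register is not SP.
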
Index: test/CodeGen/ARM/ldrd.ll
===================================================================
--- test/CodeGen/ARM/ldrd.ll
+++ test/CodeGen/ARM/ldrd.ll
@@ -1,9 +1,11 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=fast -optimize-regalloc=0 -verify-machineinstrs | FileCheck %s -check-prefix=A8 -check-prefix=CHECK
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-m3 -regalloc=fast -optimize-regalloc=0 | FileCheck %s -check-prefix=M3 -check-prefix=CHECK
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=fast -optimize-regalloc=0 -verify-machineinstrs | FileCheck %s -check-prefix=A8 -check-prefix=ALL -check-prefix=CHECK
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-m3 -regalloc=fast -optimize-regalloc=0 | FileCheck %s -check-prefix=M3 -check-prefix=ALL -check-prefix=CHECK
 ; rdar://6949835
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=basic | FileCheck %s -check-prefix=BASIC -check-prefix=CHECK
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=greedy | FileCheck %s -check-prefix=GREEDY -check-prefix=CHECK
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=swift | FileCheck %s -check-prefix=SWIFT -check-prefix=CHECK
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=basic | FileCheck %s -check-prefix=BASIC -check-prefix=ALL -check-prefix=CHECK
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -regalloc=greedy | FileCheck %s -check-prefix=GREEDY -check-prefix=ALL -check-prefix=CHECK
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=swift | FileCheck %s -check-prefix=SWIFT -check-prefix=ALL -check-prefix=CHECK
+
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -arm-conservative-load-store | FileCheck %s -check-prefix=ALL -check-prefix=CONSERVATIVE
 
 ; Magic ARM pair hints works best with linearscan / fast.
 
@@ -14,12 +16,13 @@
 declare void @use_i64(i64 %v)
 
 define void @test_ldrd(i64 %a) nounwind readonly {
-; CHECK-LABEL: test_ldrd:
+; ALL-LABEL: test_ldrd:
 ; CHECK: bl{{x?}} _get_ptr
 ; A8: ldrd r0, r1, [r0]
 ; Cortex-M3 errata 602117: LDRD with base in list may result in incorrect base
 ; register when interrupted or faulted.
 ; M3-NOT: ldrd r[[REGNUM:[0-9]+]], {{r[0-9]+}}, [r[[REGNUM]]]
+; CONSERVATIVE-NOT: ldrd
 ; CHECK: bl{{x?}} _use_i64
   %ptr = call i64* @get_ptr()
   %v = load i64, i64* %ptr, align 8
@@ -39,11 +42,10 @@
 ; evict another live range or use callee saved regs. Sorry if the test
 ; is sensitive to Regalloc changes, but it is an interesting case.
 ;
-; BASIC: @f
+; ALL-LABEL: f:
 ; BASIC: %bb
 ; BASIC: ldrd
 ; BASIC: str
-; GREEDY: @f
 ; GREEDY: %bb
 ; GREEDY: ldrd
 ; GREEDY: str
@@ -76,14 +78,15 @@
 
 @TestVar = external global %struct.Test
 
+; ALL-LABEL: Func1:
 define void @Func1() nounwind ssp {
-; CHECK: @Func1
 entry:
 ; A8: movw [[BASE:r[0-9]+]], :lower16:{{.*}}TestVar{{.*}}
 ; A8: movt [[BASE]], :upper16:{{.*}}TestVar{{.*}}
 ; A8: ldrd [[FIELD1:r[0-9]+]], [[FIELD2:r[0-9]+]], {{\[}}[[BASE]], #4]
 ; A8-NEXT: add [[FIELD1]], [[FIELD2]]
 ; A8-NEXT: str [[FIELD1]], {{\[}}[[BASE]]{{\]}}
+; CONSERVATIVE-NOT: ldrd
   %orig_blocks = alloca [256 x i16], align 2
   %0 = bitcast [256 x i16]* %orig_blocks to i8*
   call void @llvm.lifetime.start(i64 512, i8* %0) nounwind
   %tmp1 = load i32, i32* getelementptr inbounds (%struct.Test, %struct.Test* @TestVar, i32 0, i32 1), align 4
@@ -96,9 +99,10 @@
 
 declare void @extfunc(i32, i32, i32, i32)
 
-; CHECK-LABEL: Func2:
+; ALL-LABEL: Func2:
+; CONSERVATIVE-NOT: ldrd
 ; A8: ldrd
-; A8: blx
+; ALL: bl{{x?}} _extfunc
 ; A8: pop
 define void @Func2(i32* %p) {
 entry:
@@ -111,18 +115,20 @@
   ret void
 }
 
-; CHECK-LABEL: strd_spill_ldrd_reload:
+; ALL-LABEL: strd_spill_ldrd_reload:
 ; A8: strd r1, r0, [sp, #-8]!
 ; M3: strd r1, r0, [sp, #-8]!
 ; BASIC: strd r1, r0, [sp, #-8]!
 ; GREEDY: strd r0, r1, [sp, #-8]!
+; CONSERVATIVE: strd r0, r1, [sp, #-8]!
 ; CHECK: @ InlineAsm Start
 ; CHECK: @ InlineAsm End
 ; A8: ldrd r2, r1, [sp]
 ; M3: ldrd r2, r1, [sp]
 ; BASIC: ldrd r2, r1, [sp]
 ; GREEDY: ldrd r1, r2, [sp]
-; CHECK: bl{{x?}} _extfunc
+; CONSERVATIVE: ldrd r1, r2, [sp]
+; ALL: bl{{x?}} _extfunc
 define void @strd_spill_ldrd_reload(i32 %v0, i32 %v1) {
   ; force %v0 and %v1 to be spilled
   call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{lr}"()
@@ -133,9 +139,10 @@
 
 declare void @extfunc2(i32*, i32, i32)
 
-; CHECK-LABEL: ldrd_postupdate_dec:
+; ALL-LABEL: ldrd_postupdate_dec:
 ; CHECK: ldrd r1, r2, [r0], #-8
-; CHECK-NEXT: bl{{x?}} _extfunc
+; CONSERVATIVE-NOT: ldrd
+; ALL: bl{{x?}} _extfunc
 define void @ldrd_postupdate_dec(i32* %p0) {
   %p0.1 = getelementptr i32, i32* %p0, i32 1
   %v0 = load i32, i32* %p0
@@ -145,9 +152,10 @@
   ret void
 }
 
-; CHECK-LABEL: ldrd_postupdate_inc:
+; ALL-LABEL: ldrd_postupdate_inc:
 ; CHECK: ldrd r1, r2, [r0], #8
-; CHECK-NEXT: bl{{x?}} _extfunc
+; CONSERVATIVE-NOT: ldrd
+; ALL: bl{{x?}} _extfunc
 define void @ldrd_postupdate_inc(i32* %p0) {
   %p0.1 = getelementptr i32, i32* %p0, i32 1
   %v0 = load i32, i32* %p0
@@ -157,9 +165,10 @@
   ret void
 }
 
-; CHECK-LABEL: strd_postupdate_dec:
+; ALL-LABEL: strd_postupdate_dec:
 ; CHECK: strd r1, r2, [r0], #-8
-; CHECK-NEXT: bx lr
+; CONSERVATIVE-NOT: strd
+; ALL: bx lr
 define i32* @strd_postupdate_dec(i32* %p0, i32 %v0, i32 %v1) {
   %p0.1 = getelementptr i32, i32* %p0, i32 1
   store i32 %v0, i32* %p0
@@ -168,9 +177,10 @@
   ret i32* %p1
 }
 
-; CHECK-LABEL: strd_postupdate_inc:
+; ALL-LABEL: strd_postupdate_inc:
 ; CHECK: strd r1, r2, [r0], #8
-; CHECK-NEXT: bx lr
+; CONSERVATIVE-NOT: strd
+; ALL: bx lr
 define i32* @strd_postupdate_inc(i32* %p0, i32 %v0, i32 %v1) {
   %p0.1 = getelementptr i32, i32* %p0, i32 1
   store i32 %v0, i32* %p0
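
For manual experiments, the flag can be passed to llc exactly as in the RUN
lines above; it is declared cl::Hidden, so it is listed under -help-hidden
rather than -help. A minimal before/after comparison:

  llc -mtriple=thumbv7-apple-ios test/CodeGen/ARM/ldrd.ll -o -
  llc -mtriple=thumbv7-apple-ios -arm-conservative-load-store test/CodeGen/ARM/ldrd.ll -o -
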
Index: test/CodeGen/ARM/swift-vldm.ll
===================================================================
--- test/CodeGen/ARM/swift-vldm.ll
+++ test/CodeGen/ARM/swift-vldm.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mcpu=swift -mtriple=armv7s-apple-ios | FileCheck %s
+; RUN: llc < %s -arm-conservative-load-store -mcpu=swift -mtriple=armv7s-apple-ios | FileCheck %s
 
 ; Check that we avoid producing vldm instructions using d registers that
 ; begin in the most-significant half of a q register. These require more
Index: test/CodeGen/Thumb2/thumb2-ldm.ll
===================================================================
--- test/CodeGen/Thumb2/thumb2-ldm.ll
+++ test/CodeGen/Thumb2/thumb2-ldm.ll
@@ -1,12 +1,15 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mattr=+thumb2 | FileCheck %s -check-prefix=ALL -check-prefix=CHECK
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mattr=+thumb2 -arm-conservative-load-store | FileCheck %s -check-prefix=ALL -check-prefix=CONSERVATIVE
 
 @X = external global [0 x i32]          ; <[0 x i32]*> [#uses=5]
 
 define i32 @t1() {
-; CHECK-LABEL: t1:
-; CHECK: push {r7, lr}
+; ALL-LABEL: t1:
+; ALL: push {r7, lr}
 ; CHECK: ldrd
-; CHECK: pop {r7, pc}
+; CONSERVATIVE-NOT: ldrd
+; CONSERVATIVE-NOT: ldm
+; ALL: pop {r7, pc}
   %tmp = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 0)   ; <i32> [#uses=1]
   %tmp3 = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 1)  ; <i32> [#uses=1]
   %tmp4 = call i32 @f1( i32 %tmp, i32 %tmp3 )                                    ; <i32> [#uses=1]
@@ -14,10 +17,12 @@
 }
 
 define i32 @t2() {
-; CHECK-LABEL: t2:
-; CHECK: push {r7, lr}
+; ALL-LABEL: t2:
+; ALL: push {r7, lr}
 ; CHECK: ldm
-; CHECK: pop {r7, pc}
+; CONSERVATIVE-NOT: ldrd
+; CONSERVATIVE-NOT: ldm
+; ALL: pop {r7, pc}
   %tmp = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 2)   ; <i32> [#uses=1]
   %tmp3 = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 3)  ; <i32> [#uses=1]
   %tmp5 = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 4)  ; <i32> [#uses=1]
@@ -26,10 +31,12 @@
 }
 
 define i32 @t3() {
-; CHECK-LABEL: t3:
-; CHECK: push {r7, lr}
+; ALL-LABEL: t3:
+; ALL: push {r7, lr}
 ; CHECK: ldm
-; CHECK: pop {r7, pc}
+; CONSERVATIVE-NOT: ldrd
+; CONSERVATIVE-NOT: ldm
+; ALL: pop {r7, pc}
   %tmp = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 1)   ; <i32> [#uses=1]
   %tmp3 = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 2)  ; <i32> [#uses=1]
   %tmp5 = load i32, i32* getelementptr ([0 x i32], [0 x i32]* @X, i32 0, i32 3)  ; <i32> [#uses=1]
@@ -37,6 +44,34 @@
   ret i32 %tmp6
 }
 
+@g = common global i32* null
+
+define void @t4(i32 %a0, i32 %a1, i32 %a2) {
+; ALL-LABEL: t4:
+; ALL: stm.w sp, {r0, r1, r2}
+; ALL: blx _ext
+; ALL: ldm.w sp, {r0, r1, r2}
+; ALL: blx _f2
+  %arr = alloca [4 x i32], align 4
+  %p0 = getelementptr inbounds [4 x i32], [4 x i32]* %arr, i64 0, i64 0
+  %p1 = getelementptr inbounds [4 x i32], [4 x i32]* %arr, i64 0, i64 1
+  %p2 = getelementptr inbounds [4 x i32], [4 x i32]* %arr, i64 0, i64 2
+  store i32* %p0, i32** @g, align 8
+
+  store i32 %a0, i32* %p0, align 4
+  store i32 %a1, i32* %p1, align 4
+  store i32 %a2, i32* %p2, align 4
+  call void @ext()
+
+  %v0 = load i32, i32* %p0, align 4
+  %v1 = load i32, i32* %p1, align 4
+  %v2 = load i32, i32* %p2, align 4
+  call i32 @f2(i32 %v0, i32 %v1, i32 %v2)
+  ret void
+}
+
 declare i32 @f1(i32, i32)
 declare i32 @f2(i32, i32, i32)
+
+declare void @ext()
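
The new t4 test is the deliberate exception to the CONSERVATIVE-NOT checks
above: its loads and stores are SP-relative, and stack alignment is not under
the programmer's control, so isConservativelyFine() lets stm/ldm form even in
conservative mode. A standalone sketch of the same pattern (invented names,
modeled on t4, not part of the patch):

  @sink = common global i32* null

  define void @stack_pair(i32 %a, i32 %b) {
    %buf = alloca [2 x i32], align 4   ; stack slot, word alignment guaranteed
    %q0 = getelementptr inbounds [2 x i32], [2 x i32]* %buf, i32 0, i32 0
    %q1 = getelementptr inbounds [2 x i32], [2 x i32]* %buf, i32 0, i32 1
    store i32* %q0, i32** @sink        ; escape the address so the stores survive
    store i32 %a, i32* %q0, align 4    ; still eligible for strd/stm under the flag
    store i32 %b, i32* %q1, align 4
    ret void
  }
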