Index: lib/Target/ARM/ARMLoadStoreOptimizer.cpp =================================================================== --- lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -2196,7 +2196,7 @@ else { SmallPtrSet<MachineInstr*, 4> MemOps; SmallSet<unsigned, 4> MemRegs; - for (int i = NumMove-1; i >= 0; --i) { + for (size_t i = Ops.size() - NumMove, e = Ops.size(); i != e; ++i) { MemOps.insert(Ops[i]); MemRegs.insert(Ops[i]->getOperand(0).getReg()); } Index: test/CodeGen/ARM/prera-ldst-insertpt.mir =================================================================== --- /dev/null +++ test/CodeGen/ARM/prera-ldst-insertpt.mir @@ -0,0 +1,93 @@ +# RUN: llc -run-pass arm-prera-ldst-opt %s -o - | FileCheck %s +--- | + target triple = "thumbv7---eabi" + + define void @a(i32* nocapture %x, i32 %y, i32 %z) { + entry: + ret void + } + + define void @b(i32* nocapture %x, i32 %y, i32 %z) { + entry: + ret void + } +... +--- +name: a +alignment: 1 +tracksRegLiveness: true +liveins: + - { reg: '%r0', virtual-reg: '%0' } + - { reg: '%r1', virtual-reg: '%1' } + - { reg: '%r2', virtual-reg: '%2' } +body: | + bb.0.entry: + liveins: %r0, %r1, %r2 + + %2 : rgpr = COPY %r2 + %1 : rgpr = COPY %r1 + %0 : gpr = COPY %r0 + %3 : rgpr = t2MUL %2, %2, 14, _ + %4 : rgpr = t2MUL %1, %1, 14, _ + %5 : rgpr = t2MOVi32imm -858993459 + %6 : rgpr, %7 : rgpr = t2UMULL killed %3, %5, 14, _ + %8 : rgpr, %9 : rgpr = t2UMULL killed %4, %5, 14, _ + t2STRi12 %1, %0, 0, 14, _ :: (store 4) + %10 : rgpr = t2LSLri %2, 1, 14, _, _ + t2STRi12 killed %10, %0, 4, 14, _ :: (store 4) + %11 : rgpr = t2MOVi 55, 14, _, _ + %12 : gprnopc = t2ADDrs %11, killed %7, 19, 14, _, _ + t2STRi12 killed %12, %0, 16, 14, _ :: (store 4) + %13 : gprnopc = t2ADDrs %11, killed %9, 19, 14, _, _ + t2STRi12 killed %13, %0, 20, 14, _ :: (store 4) + tBX_RET 14, _ +--- +name: b +alignment: 1 +tracksRegLiveness: true +liveins: + - { reg: '%r0', virtual-reg: '%0' } + - { reg: '%r1', virtual-reg: '%1' } + - { reg: '%r2', virtual-reg: '%2' } 
+body: | + bb.0.entry: + liveins: %r0, %r1, %r2 + + %2 : rgpr = COPY %r2 + %1 : rgpr = COPY %r1 + %0 : gpr = COPY %r0 + t2STRi12 %1, %0, 0, 14, _ :: (store 4) + %10 : rgpr = t2LSLri %2, 1, 14, _, _ + t2STRi12 killed %10, %0, 4, 14, _ :: (store 4) + %3 : rgpr = t2MUL %2, %2, 14, _ + t2STRi12 %3, %0, 8, 14, _ :: (store 4) + %4 : rgpr = t2MUL %1, %1, 14, _ + %5 : rgpr = t2MOVi32imm -858993459 + %6 : rgpr, %7 : rgpr = t2UMULL killed %3, %5, 14, _ + %8 : rgpr, %9 : rgpr = t2UMULL killed %4, %5, 14, _ + %10 : rgpr = t2LSLri %2, 1, 14, _, _ + %11 : rgpr = t2MOVi 55, 14, _, _ + %12 : gprnopc = t2ADDrs %11, killed %7, 19, 14, _, _ + t2STRi12 killed %12, %0, 16, 14, _ :: (store 4) + %13 : gprnopc = t2ADDrs %11, killed %9, 19, 14, _, _ + t2STRi12 killed %13, %0, 20, 14, _ :: (store 4) + tBX_RET 14, _ + +... +# Make sure we move the paired stores next to each other. +# FIXME: Make sure we don't extend the live-range of a store +# when we don't need to. +# CHECK-LABEL: name: a +# CHECK: t2STRi12 %1, +# CHECK-NEXT: t2STRi12 killed %10, +# CHECK-NEXT: %13 = t2ADDrs %11 +# CHECK-NEXT: t2STRi12 killed %12, +# CHECK-NEXT: t2STRi12 killed %13, +# +# CHECK-LABEL: name: b +# CHECK: t2STRi12 {{.*}}, 0 +# CHECK-NEXT: t2STRi12 {{.*}}, 4 +# CHECK-NEXT: t2STRi12 {{.*}}, 8 +# CHECK-NEXT: t2ADDrs +# CHECK-NEXT: t2STRi12 {{.*}}, 16 +# CHECK-NEXT: t2STRi12 {{.*}}, 20 Index: test/CodeGen/ARM/vldm-liveness.ll =================================================================== --- test/CodeGen/ARM/vldm-liveness.ll +++ /dev/null @@ -1,40 +0,0 @@ -; RUN: llc -mtriple thumbv7-apple-ios -verify-machineinstrs -o - %s | FileCheck %s - -; ARM load store optimizer was dealing with a sequence like: -; s1 = VLDRS [r0, 1], Q0 -; s3 = VLDRS [r0, 2], Q0, Q0 -; s0 = VLDRS [r0, 0], Q0, Q0 -; s2 = VLDRS [r0, 4], Q0, Q0 -; -; It decided to combine the {s0, s1} loads into a single instruction in the -; third position. 
However, this leaves the instruction defining s3 with a stray -; imp-use of Q0, which is undefined. -; -; The verifier catches this, so this test just makes sure that appropriate -; liveness flags are added. -; -; I believe the change will be tested as long as the vldmia is not the first of -; the loads. Earlier optimisations may perturb the output over time, but -; fiddling the indices should be sufficient to restore the test. - -define arm_aapcs_vfpcc <4 x float> @foo(float* %ptr) { -; CHECK-LABEL: foo: -; CHECK: vldr s3, [r0, #8] -; CHECK: vldmia r0, {s0, s1} -; CHECK: vldr s2, [r0, #16] - %off0 = getelementptr float, float* %ptr, i32 0 - %val0 = load float, float* %off0 - %off1 = getelementptr float, float* %ptr, i32 1 - %val1 = load float, float* %off1 - %off4 = getelementptr float, float* %ptr, i32 4 - %val4 = load float, float* %off4 - %off2 = getelementptr float, float* %ptr, i32 2 - %val2 = load float, float* %off2 - - %vec1 = insertelement <4 x float> undef, float %val0, i32 0 - %vec2 = insertelement <4 x float> %vec1, float %val1, i32 1 - %vec3 = insertelement <4 x float> %vec2, float %val4, i32 2 - %vec4 = insertelement <4 x float> %vec3, float %val2, i32 3 - - ret <4 x float> %vec4 -} Index: test/CodeGen/ARM/vldm-liveness.mir =================================================================== --- /dev/null +++ test/CodeGen/ARM/vldm-liveness.mir @@ -0,0 +1,37 @@ +# RUN: llc -run-pass arm-ldst-opt -verify-machineinstrs %s -o - | FileCheck %s +# ARM load store optimizer was dealing with a sequence like: +# s1 = VLDRS [r0, 1], Q0 +# s3 = VLDRS [r0, 2], Q0, Q0 +# s0 = VLDRS [r0, 0], Q0, Q0 +# s2 = VLDRS [r0, 4], Q0, Q0 +# +# It decided to combine the {s0, s1} loads into a single instruction in the +# third position. However, this leaves the instruction defining s3 with a stray +# imp-use of Q0, which is undefined. +# +# The verifier catches this, so this test just makes sure that appropriate +# liveness flags are added. 
+--- | + target triple = "thumbv7-apple-ios" + define arm_aapcs_vfpcc <4 x float> @foo(float* %ptr) { + ret <4 x float> undef + } +... +--- +name: foo +alignment: 1 +liveins: + - { reg: '%r0' } +body: | + bb.0 (%ir-block.0): + liveins: %r0 + + %s1 = VLDRS %r0, 1, 14, _, implicit-def %q0 :: (load 4) + %s3 = VLDRS %r0, 2, 14, _, implicit killed %q0, implicit-def %q0 :: (load 4) + %s0 = VLDRS %r0, 0, 14, _, implicit killed %q0, implicit-def %q0 :: (load 4) + %s2 = VLDRS killed %r0, 4, 14, _, implicit killed %q0, implicit-def %q0 :: (load 4) + tBX_RET 14, _, implicit %q0 +... +# CHECK: %s3 = VLDRS %r0, 2, 14, _, implicit killed undef %q0, implicit-def %q0 :: (load 4) +# CHECK: VLDMSIA %r0, 14, _, def %s0, def %s1, implicit-def _ +# CHECK: %s2 = VLDRS killed %r0, 4, 14, _, implicit killed %q0, implicit-def %q0 :: (load 4)