Index: llvm/trunk/lib/Target/ARM/ARMTargetMachine.cpp =================================================================== --- llvm/trunk/lib/Target/ARM/ARMTargetMachine.cpp +++ llvm/trunk/lib/Target/ARM/ARMTargetMachine.cpp @@ -403,6 +403,12 @@ TargetPassConfig::addIRPasses(); + // Run the parallel DSP pass and its helpers. + if (getOptLevel() == CodeGenOpt::Aggressive) { + addPass(createEarlyCSEPass()); + addPass(createARMParallelDSPPass()); + } + // Match interleaved memory accesses to ldN/stN intrinsics. if (TM->getOptLevel() != CodeGenOpt::None) addPass(createInterleavedAccessPass()); @@ -415,9 +421,6 @@ } bool ARMPassConfig::addPreISel() { - if (getOptLevel() != CodeGenOpt::None) - addPass(createARMParallelDSPPass()); - if ((TM->getOptLevel() != CodeGenOpt::None && EnableGlobalMerge == cl::BOU_UNSET) || EnableGlobalMerge == cl::BOU_TRUE) { Index: llvm/trunk/test/CodeGen/ARM/2011-02-04-AntidepMultidef.ll =================================================================== --- llvm/trunk/test/CodeGen/ARM/2011-02-04-AntidepMultidef.ll +++ llvm/trunk/test/CodeGen/ARM/2011-02-04-AntidepMultidef.ll @@ -103,12 +103,6 @@ %35 = add i8 %33, 87 %iftmp.5.0.7 = select i1 %32, i8 %34, i8 %35 store volatile i8 %iftmp.5.0.7, i8* %p8, align 1 - ; CHECK: umull [[REGISTER:lr|r[0-9]+]], - ; CHECK-NOT: [[REGISTER]], - ; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}} - ; CHECK: umull [[REGISTER:lr|r[0-9]+]], - ; CHECK-NOT: [[REGISTER]], - ; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}} %36 = udiv i32 %2, 100000000 %37 = urem i32 %36, 10 %38 = icmp ult i32 %37, 10 Index: llvm/trunk/test/CodeGen/ARM/O3-pipeline.ll =================================================================== --- llvm/trunk/test/CodeGen/ARM/O3-pipeline.ll +++ llvm/trunk/test/CodeGen/ARM/O3-pipeline.ll @@ -0,0 +1,150 @@ +; RUN: llc -mtriple=arm -O3 -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck %s + +; REQUIRES: asserts + +; CHECK: ModulePass Manager +; CHECK: Pre-ISel Intrinsic Lowering +; CHECK: FunctionPass Manager 
+; CHECK: Expand Atomic instructions +; CHECK: Simplify the CFG +; CHECK: Dominator Tree Construction +; CHECK: Basic Alias Analysis (stateless AA impl) +; CHECK: Module Verifier +; CHECK: Natural Loop Information +; CHECK: Canonicalize natural loops +; CHECK: Scalar Evolution Analysis +; CHECK: Loop Pass Manager +; CHECK: Induction Variable Users +; CHECK: Loop Strength Reduction +; CHECK: Basic Alias Analysis (stateless AA impl) +; CHECK: Function Alias Analysis Results +; CHECK: Merge contiguous icmps into a memcmp +; CHECK: Expand memcmp() to load/stores +; CHECK: Lower Garbage Collection Instructions +; CHECK: Shadow Stack GC Lowering +; CHECK: Remove unreachable blocks from the CFG +; CHECK: Dominator Tree Construction +; CHECK: Natural Loop Information +; CHECK: Branch Probability Analysis +; CHECK: Block Frequency Analysis +; CHECK: Constant Hoisting +; CHECK: Partially inline calls to library functions +; CHECK: Instrument function entry/exit with calls to e.g. mcount() (post inlining) +; CHECK: Scalarize Masked Memory Intrinsics +; CHECK: Expand reduction intrinsics +; CHECK: Dominator Tree Construction +; CHECK: Early CSE +; CHECK: Natural Loop Information +; CHECK: Scalar Evolution Analysis +; CHECK: Basic Alias Analysis (stateless AA impl) +; CHECK: Function Alias Analysis Results +; CHECK: Loop Pass Manager +; CHECK: Transform loops to use DSP intrinsics +; CHECK: Interleaved Access Pass +; CHECK: ARM IR optimizations +; CHECK: Dominator Tree Construction +; CHECK: Natural Loop Information +; CHECK: CodeGen Prepare +; CHECK: Rewrite Symbols +; CHECK: FunctionPass Manager +; CHECK: Dominator Tree Construction +; CHECK: Exception handling preparation +; CHECK: Merge internal globals +; CHECK: Safe Stack instrumentation pass +; CHECK: Insert stack protectors +; CHECK: Module Verifier +; CHECK: Dominator Tree Construction +; CHECK: Basic Alias Analysis (stateless AA impl) +; CHECK: Function Alias Analysis Results +; CHECK: Natural Loop Information +; 
CHECK: Branch Probability Analysis +; CHECK: ARM Instruction Selection +; CHECK: Expand ISel Pseudo-instructions +; CHECK: Early Tail Duplication +; CHECK: Optimize machine instruction PHIs +; CHECK: Slot index numbering +; CHECK: Merge disjoint stack slots +; CHECK: Local Stack Slot Allocation +; CHECK: Remove dead machine instructions +; CHECK: MachineDominator Tree Construction +; CHECK: Machine Natural Loop Construction +; CHECK: Early Machine Loop Invariant Code Motion +; CHECK: Machine Common Subexpression Elimination +; CHECK: MachinePostDominator Tree Construction +; CHECK: Machine Block Frequency Analysis +; CHECK: Machine code sinking +; CHECK: Peephole Optimizations +; CHECK: Remove dead machine instructions +; CHECK: ARM MLA / MLS expansion pass +; CHECK: ARM pre- register allocation load / store optimization pass +; CHECK: ARM A15 S->D optimizer +; CHECK: Detect Dead Lanes +; CHECK: Process Implicit Definitions +; CHECK: Remove unreachable machine basic blocks +; CHECK: Live Variable Analysis +; CHECK: MachineDominator Tree Construction +; CHECK: Machine Natural Loop Construction +; CHECK: Eliminate PHI nodes for register allocation +; CHECK: Two-Address instruction pass +; CHECK: Slot index numbering +; CHECK: Live Interval Analysis +; CHECK: Simple Register Coalescing +; CHECK: Rename Disconnected Subregister Components +; CHECK: Machine Instruction Scheduler +; CHECK: Machine Block Frequency Analysis +; CHECK: Debug Variable Analysis +; CHECK: Live Stack Slot Analysis +; CHECK: Virtual Register Map +; CHECK: Live Register Matrix +; CHECK: Bundle Machine CFG Edges +; CHECK: Spill Code Placement Analysis +; CHECK: Lazy Machine Block Frequency Analysis +; CHECK: Machine Optimization Remark Emitter +; CHECK: Greedy Register Allocator +; CHECK: Virtual Register Rewriter +; CHECK: Stack Slot Coloring +; CHECK: Machine Copy Propagation Pass +; CHECK: Machine Loop Invariant Code Motion +; CHECK: PostRA Machine Sink +; CHECK: Machine Block Frequency Analysis 
+; CHECK: MachinePostDominator Tree Construction +; CHECK: Lazy Machine Block Frequency Analysis +; CHECK: Machine Optimization Remark Emitter +; CHECK: Shrink Wrapping analysis +; CHECK: Prologue/Epilogue Insertion & Frame Finalization +; CHECK: Control Flow Optimizer +; CHECK: Tail Duplication +; CHECK: Machine Copy Propagation Pass +; CHECK: Post-RA pseudo instruction expansion pass +; CHECK: ARM load / store optimization pass +; CHECK: ReachingDefAnalysis +; CHECK: ARM Execution Domain Fix +; CHECK: BreakFalseDeps +; CHECK: ARM pseudo instruction expansion pass +; CHECK: Thumb2 instruction size reduce pass +; CHECK: MachineDominator Tree Construction +; CHECK: Machine Natural Loop Construction +; CHECK: Machine Block Frequency Analysis +; CHECK: If Converter +; CHECK: Thumb IT blocks insertion pass +; CHECK: MachineDominator Tree Construction +; CHECK: Machine Natural Loop Construction +; CHECK: Post RA top-down list latency scheduler +; CHECK: Analyze Machine Code For Garbage Collection +; CHECK: Machine Block Frequency Analysis +; CHECK: MachinePostDominator Tree Construction +; CHECK: Branch Probability Basic Block Placement +; CHECK: Thumb2 instruction size reduce pass +; CHECK: Unpack machine instruction bundles +; CHECK: optimise barriers pass +; CHECK: ARM constant island placement and branch shortening pass +; CHECK: Contiguously Lay Out Funclets +; CHECK: StackMap Liveness Analysis +; CHECK: Live DEBUG_VALUE analysis +; CHECK: Insert fentry calls +; CHECK: Insert XRay ops +; CHECK: Implement the 'patchable-function' attribute +; CHECK: Lazy Machine Block Frequency Analysis +; CHECK: Machine Optimization Remark Emitter +; CHECK: ARM Assembly Printer +; CHECK: Free MachineFunction Index: llvm/trunk/test/CodeGen/ARM/loop-indexing.ll =================================================================== --- llvm/trunk/test/CodeGen/ARM/loop-indexing.ll +++ llvm/trunk/test/CodeGen/ARM/loop-indexing.ll @@ -1,9 +1,9 @@ -; RUN: llc -mtriple=thumbv7em 
-mattr=+fp-armv8 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2 -; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2 +; RUN: llc -mtriple=thumbv7em -mattr=+fp-armv8 -O3 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2 +; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -O3 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2 ; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -lsr-backedge-indexing=false %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED ; RUN: llc -mtriple=thumbv8m.base %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED ; RUN: llc -mtriple=thumbv8 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED -; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -lsr-complexity-limit=2147483647 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-COMPLEX --check-prefix=CHECK-T2 +; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -O3 -lsr-complexity-limit=2147483647 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-COMPLEX --check-prefix=CHECK-T2 ; Tests to check that post increment addressing modes are used instead of ; updating base pointers with add instructions. Index: llvm/trunk/test/CodeGen/ARM/vldm-sched-a9.ll =================================================================== --- llvm/trunk/test/CodeGen/ARM/vldm-sched-a9.ll +++ llvm/trunk/test/CodeGen/ARM/vldm-sched-a9.ll @@ -5,67 +5,133 @@ ; This test used to test vector spilling using vstmia/vldmia instructions, but ; the changes for PR:18825 prevent that spilling. +; VST1 and VLD1 are now used for spilling/restoring.
+; +; TODO: +; I think more vldm should be generated; the initial ones are used to load some +; elements and then a sequence of vldr is used: +; vldr d15, [r1, #104] +; vldr d13, [r2, #96] +; vldr d9, [r1, #120] +; vldr d11, [r2, #112] +; vldr d14, [r1, #96] +; vldr d12, [r2, #88] +; vldr d8, [r1, #112] +; vldr d10, [r2, #104] + +; Also, this pattern repeats several times, which certainly suggests that a +; vld1.64 should be used to load the data: +; vldr d16, [r1, #16] +; vldr d17, [r1, #24] +; vst1.64 {d16, d17}, [lr:128] @ 16-byte Spill + ; CHECK: test: -; CHECK: vstmia -; CHECK: vldmia -define void @test(i64* %src) #0 { +; CHECK: vldmia r{{.*}}, {d{{.*}}, d{{.*}}} +; CHECK: vldmia r{{.*}}, {d{{.*}}, d{{.*}}} +define <16 x i64> @test(i64* %src0, i64* %src1) #0 { entry: - %arrayidx39 = getelementptr inbounds i64, i64* %src, i32 13 - %vecinit285 = shufflevector <16 x i64> undef, <16 x i64> , <16 x i32> - store <16 x i64> %vecinit285, <16 x i64>* undef, align 128 - %0 = load i64, i64* undef, align 8 - %vecinit379 = insertelement <16 x i64> undef, i64 %0, i32 9 - %1 = load i64, i64* undef, align 8 - %vecinit419 = insertelement <16 x i64> undef, i64 %1, i32 15 - store <16 x i64> %vecinit419, <16 x i64>* undef, align 128 - %vecinit579 = insertelement <16 x i64> undef, i64 0, i32 4 - %vecinit582 = shufflevector <16 x i64> %vecinit579, <16 x i64> , <16 x i32> - %vecinit584 = insertelement <16 x i64> %vecinit582, i64 undef, i32 9 - %vecinit586 = insertelement <16 x i64> %vecinit584, i64 0, i32 10 - %vecinit589 = shufflevector <16 x i64> %vecinit586, <16 x i64> , <16 x i32> - %2 = load i64, i64* undef, align 8 - %vecinit591 = insertelement <16 x i64> %vecinit589, i64 %2, i32 15 - store <16 x i64> %vecinit591, <16 x i64>* undef, align 128 - %vecinit694 = shufflevector <16 x i64> undef, <16 x i64> , <16 x i32> - store <16 x i64> %vecinit694, <16 x i64>* undef, align 128 - %3 = load i64, i64* undef, align 8 - %vecinit1331 = insertelement <16 x i64> undef, i64 %3, i32 14 - %4 = load 
i64, i64* undef, align 8 - %vecinit1468 = insertelement <16 x i64> undef, i64 %4, i32 11 - %vecinit1471 = shufflevector <16 x i64> %vecinit1468, <16 x i64> , <16 x i32> - %vecinit1474 = shufflevector <16 x i64> %vecinit1471, <16 x i64> , <16 x i32> - store <16 x i64> %vecinit1474, <16 x i64>* undef, align 128 - %vecinit1552 = shufflevector <16 x i64> undef, <16 x i64> , <16 x i32> - %vecinit1555 = shufflevector <16 x i64> %vecinit1552, <16 x i64> , <16 x i32> - %vecinit1558 = shufflevector <16 x i64> %vecinit1555, <16 x i64> , <16 x i32> - store <16 x i64> %vecinit1558, <16 x i64>* undef, align 128 - %vecinit1591 = shufflevector <16 x i64> undef, <16 x i64> , <16 x i32> - %vecinit1594 = shufflevector <16 x i64> %vecinit1591, <16 x i64> , <16 x i32> - %vecinit1597 = shufflevector <16 x i64> %vecinit1594, <16 x i64> , <16 x i32> - %vecinit1599 = insertelement <16 x i64> %vecinit1597, i64 undef, i32 8 - %vecinit1601 = insertelement <16 x i64> %vecinit1599, i64 undef, i32 9 - %vecinit1603 = insertelement <16 x i64> %vecinit1601, i64 undef, i32 10 - %5 = load i64, i64* undef, align 8 - %vecinit1605 = insertelement <16 x i64> %vecinit1603, i64 %5, i32 11 - %vecinit1608 = shufflevector <16 x i64> %vecinit1605, <16 x i64> , <16 x i32> - %6 = load i64, i64* undef, align 8 - %vecinit1610 = insertelement <16 x i64> %vecinit1608, i64 %6, i32 15 - store <16 x i64> %vecinit1610, <16 x i64>* undef, align 128 - %vecinit2226 = shufflevector <16 x i64> undef, <16 x i64> , <16 x i32> - %7 = load i64, i64* undef, align 8 - %vecinit2228 = insertelement <16 x i64> %vecinit2226, i64 %7, i32 8 - %vecinit2230 = insertelement <16 x i64> %vecinit2228, i64 undef, i32 9 - %vecinit2233 = shufflevector <16 x i64> %vecinit2230, <16 x i64> , <16 x i32> - %vecinit2236 = shufflevector <16 x i64> %vecinit2233, <16 x i64> , <16 x i32> - store <16 x i64> %vecinit2236, <16 x i64>* undef, align 128 - %vecinit2246 = shufflevector <16 x i64> undef, <16 x i64> , <16 x i32> - %vecinit2249 = shufflevector <16 
x i64> %vecinit2246, <16 x i64> , <16 x i32> - %vecinit2252 = shufflevector <16 x i64> %vecinit2249, <16 x i64> , <16 x i32> - %vecinit2255 = shufflevector <16 x i64> %vecinit2252, <16 x i64> , <16 x i32> - %8 = load i64, i64* %arrayidx39, align 8 - %vecinit2257 = insertelement <16 x i64> %vecinit2255, i64 %8, i32 13 - %vecinit2260 = shufflevector <16 x i64> %vecinit2257, <16 x i64> , <16 x i32> - store <16 x i64> %vecinit2260, <16 x i64>* null, align 128 - ret void + %addr.0 = getelementptr inbounds i64, i64* %src0, i32 0 + %el.0 = load i64, i64* %addr.0, align 8 + %addr.1 = getelementptr inbounds i64, i64* %src0, i32 1 + %el.1 = load i64, i64* %addr.1, align 8 + %addr.2 = getelementptr inbounds i64, i64* %src0, i32 2 + %el.2 = load i64, i64* %addr.2, align 8 + %addr.3 = getelementptr inbounds i64, i64* %src0, i32 3 + %el.3 = load i64, i64* %addr.3, align 8 + %addr.4 = getelementptr inbounds i64, i64* %src0, i32 4 + %el.4 = load i64, i64* %addr.4, align 8 + %addr.5 = getelementptr inbounds i64, i64* %src0, i32 5 + %el.5 = load i64, i64* %addr.5, align 8 + %addr.6 = getelementptr inbounds i64, i64* %src0, i32 6 + %el.6 = load i64, i64* %addr.6, align 8 + %addr.7 = getelementptr inbounds i64, i64* %src0, i32 7 + %el.7 = load i64, i64* %addr.7, align 8 + %addr.8 = getelementptr inbounds i64, i64* %src0, i32 8 + %el.8 = load i64, i64* %addr.8, align 8 + %addr.9 = getelementptr inbounds i64, i64* %src0, i32 9 + %el.9 = load i64, i64* %addr.9, align 8 + %addr.10 = getelementptr inbounds i64, i64* %src0, i32 10 + %el.10 = load i64, i64* %addr.10, align 8 + %addr.11 = getelementptr inbounds i64, i64* %src0, i32 11 + %el.11 = load i64, i64* %addr.11, align 8 + %addr.12 = getelementptr inbounds i64, i64* %src0, i32 12 + %el.12 = load i64, i64* %addr.12, align 8 + %addr.13 = getelementptr inbounds i64, i64* %src0, i32 13 + %el.13 = load i64, i64* %addr.13, align 8 + %addr.14 = getelementptr inbounds i64, i64* %src0, i32 14 + %el.14 = load i64, i64* %addr.14, align 8 + 
%addr.15 = getelementptr inbounds i64, i64* %src0, i32 15 + %el.15 = load i64, i64* %addr.15, align 8 + + %addr.0.1 = getelementptr inbounds i64, i64* %src1, i32 0 + %el.0.1 = load i64, i64* %addr.0.1, align 8 + %addr.1.1 = getelementptr inbounds i64, i64* %src1, i32 1 + %el.1.1 = load i64, i64* %addr.1.1, align 8 + %addr.2.1 = getelementptr inbounds i64, i64* %src1, i32 2 + %el.2.1 = load i64, i64* %addr.2.1, align 8 + %addr.3.1 = getelementptr inbounds i64, i64* %src1, i32 3 + %el.3.1 = load i64, i64* %addr.3.1, align 8 + %addr.4.1 = getelementptr inbounds i64, i64* %src1, i32 4 + %el.4.1 = load i64, i64* %addr.4.1, align 8 + %addr.5.1 = getelementptr inbounds i64, i64* %src1, i32 5 + %el.5.1 = load i64, i64* %addr.5.1, align 8 + %addr.6.1 = getelementptr inbounds i64, i64* %src1, i32 6 + %el.6.1 = load i64, i64* %addr.6.1, align 8 + %addr.7.1 = getelementptr inbounds i64, i64* %src1, i32 7 + %el.7.1 = load i64, i64* %addr.7.1, align 8 + %addr.8.1 = getelementptr inbounds i64, i64* %src1, i32 8 + %el.8.1 = load i64, i64* %addr.8.1, align 8 + %addr.9.1 = getelementptr inbounds i64, i64* %src1, i32 9 + %el.9.1 = load i64, i64* %addr.9.1, align 8 + %addr.10.1 = getelementptr inbounds i64, i64* %src1, i32 10 + %el.10.1 = load i64, i64* %addr.10.1, align 8 + %addr.11.1 = getelementptr inbounds i64, i64* %src1, i32 11 + %el.11.1 = load i64, i64* %addr.11.1, align 8 + %addr.12.1 = getelementptr inbounds i64, i64* %src1, i32 12 + %el.12.1 = load i64, i64* %addr.12.1, align 8 + %addr.13.1 = getelementptr inbounds i64, i64* %src1, i32 13 + %el.13.1 = load i64, i64* %addr.13.1, align 8 + %addr.14.1 = getelementptr inbounds i64, i64* %src1, i32 14 + %el.14.1 = load i64, i64* %addr.14.1, align 8 + %addr.15.1 = getelementptr inbounds i64, i64* %src1, i32 15 + %el.15.1 = load i64, i64* %addr.15.1, align 8 + %vec.0 = insertelement <16 x i64> undef, i64 %el.0, i32 0 + %vec.1 = insertelement <16 x i64> %vec.0, i64 %el.1, i32 1 + %vec.2 = insertelement <16 x i64> %vec.1, i64 %el.2, 
i32 2 + %vec.3 = insertelement <16 x i64> %vec.2, i64 %el.3, i32 3 + %vec.4 = insertelement <16 x i64> %vec.3, i64 %el.4, i32 4 + %vec.5 = insertelement <16 x i64> %vec.4, i64 %el.5, i32 5 + %vec.6 = insertelement <16 x i64> %vec.5, i64 %el.6, i32 6 + %vec.7 = insertelement <16 x i64> %vec.6, i64 %el.7, i32 7 + %vec.8 = insertelement <16 x i64> %vec.7, i64 %el.8, i32 8 + %vec.9 = insertelement <16 x i64> %vec.8, i64 %el.9, i32 9 + %vec.10 = insertelement <16 x i64> %vec.9, i64 %el.10, i32 10 + %vec.11 = insertelement <16 x i64> %vec.10, i64 %el.11, i32 11 + %vec.12 = insertelement <16 x i64> %vec.11, i64 %el.12, i32 12 + %vec.13 = insertelement <16 x i64> %vec.12, i64 %el.13, i32 13 + %vec.14 = insertelement <16 x i64> %vec.13, i64 %el.14, i32 14 + %vec.15 = insertelement <16 x i64> %vec.14, i64 %el.15, i32 15 + call void @capture(i64* %src0, i64* %src1) + %vec.0.1 = insertelement <16 x i64> undef, i64 %el.0.1, i32 0 + %vec.1.1 = insertelement <16 x i64> %vec.0.1, i64 %el.1.1, i32 1 + %vec.2.1 = insertelement <16 x i64> %vec.1.1, i64 %el.2.1, i32 2 + %vec.3.1 = insertelement <16 x i64> %vec.2.1, i64 %el.3.1, i32 3 + %vec.4.1 = insertelement <16 x i64> %vec.3.1, i64 %el.4.1, i32 4 + %vec.5.1 = insertelement <16 x i64> %vec.4.1, i64 %el.5.1, i32 5 + %vec.6.1 = insertelement <16 x i64> %vec.5.1, i64 %el.6.1, i32 6 + %vec.7.1 = insertelement <16 x i64> %vec.6.1, i64 %el.7.1, i32 7 + %vec.8.1 = insertelement <16 x i64> %vec.7.1, i64 %el.7.1, i32 8 + %vec.9.1 = insertelement <16 x i64> %vec.8.1, i64 %el.8.1, i32 9 + %vec.10.1 = insertelement <16 x i64> %vec.9.1, i64 %el.9.1, i32 10 + %vec.11.1 = insertelement <16 x i64> %vec.10.1, i64 %el.10.1, i32 11 + %vec.12.1 = insertelement <16 x i64> %vec.11.1, i64 %el.11.1, i32 12 + %vec.13.1 = insertelement <16 x i64> %vec.12.1, i64 %el.12.1, i32 13 + %vec.14.1 = insertelement <16 x i64> %vec.13.1, i64 %el.13.1, i32 14 + %vec.15.1 = insertelement <16 x i64> %vec.14.1, i64 %el.14.1, i32 15 + %res = add <16 x i64> %vec.15, 
%vec.15.1 + ret <16 x i64> %res } + +declare void @capture(i64*, i64*) + attributes #0 = { noredzone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: llvm/trunk/test/CodeGen/Thumb2/2010-04-15-DynAllocBug.ll =================================================================== --- llvm/trunk/test/CodeGen/Thumb2/2010-04-15-DynAllocBug.ll +++ llvm/trunk/test/CodeGen/Thumb2/2010-04-15-DynAllocBug.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -arm-atomic-cfg-tidy=0 -O3 | FileCheck %s +; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -arm-atomic-cfg-tidy=0 -O2 | FileCheck %s ; rdar://7493908 ; Make sure the result of the first dynamic_alloc isn't copied back to sp more Index: llvm/trunk/test/CodeGen/Thumb2/2010-06-21-TailMergeBug.ll =================================================================== --- llvm/trunk/test/CodeGen/Thumb2/2010-06-21-TailMergeBug.ll +++ llvm/trunk/test/CodeGen/Thumb2/2010-06-21-TailMergeBug.ll @@ -32,13 +32,12 @@ define fastcc i32 @parse_percent_token() nounwind { entry: -; CHECK: pop -; CHECK: pop -; CHECK: pop -; CHECK: pop -; CHECK: pop -; CHECK: pop -; CHECK: pop +; CHECK: bx lr +; CHECK: bx lr +; CHECK: bx lr +; CHECK: bx lr +; CHECK: bx lr +; CHECK: bx lr ; Do not convert into single stream code. BranchProbability Analysis assumes ; that branches which goes to "ret" instruction have lower probabilities. 
switch i32 undef, label %bb7 [ Index: llvm/trunk/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll =================================================================== --- llvm/trunk/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll +++ llvm/trunk/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll @@ -302,7 +302,6 @@ ; A9: vld1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]] ; A9: vld1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]] ; A9: vld1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]] -; A9: vld1.8 {d{{[0-9]+}}}, [[BASE]], {{r[0-9]}} ; A9: vst1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]] ; A9: vst1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]] ; A9: vst1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]]