Index: lib/Target/ARM/ARMTargetMachine.cpp
===================================================================
--- lib/Target/ARM/ARMTargetMachine.cpp
+++ lib/Target/ARM/ARMTargetMachine.cpp
@@ -20,6 +20,8 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/ScopedNoAliasAA.h"
+#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/ExecutionDomainFix.h"
 #include "llvm/CodeGen/GlobalISel/CallLowering.h"
@@ -403,6 +405,15 @@
   TargetPassConfig::addIRPasses();
 
+  // Run the parallel DSP pass and its helpers.
+  if (getOptLevel() == CodeGenOpt::Aggressive) {
+    addPass(createTypeBasedAAWrapperPass());
+    addPass(createScopedNoAliasAAWrapperPass());
+    addPass(createBasicAAWrapperPass());
+    addPass(createEarlyCSEPass());
+    addPass(createARMParallelDSPPass());
+  }
+
   // Match interleaved memory accesses to ldN/stN intrinsics.
   if (TM->getOptLevel() != CodeGenOpt::None)
     addPass(createInterleavedAccessPass());
 
@@ -415,9 +426,6 @@
 }
 
 bool ARMPassConfig::addPreISel() {
-  if (getOptLevel() != CodeGenOpt::None)
-    addPass(createARMParallelDSPPass());
-
   if ((TM->getOptLevel() != CodeGenOpt::None &&
        EnableGlobalMerge == cl::BOU_UNSET) ||
       EnableGlobalMerge == cl::BOU_TRUE) {
Index: test/CodeGen/ARM/2011-02-04-AntidepMultidef.ll
===================================================================
--- test/CodeGen/ARM/2011-02-04-AntidepMultidef.ll
+++ test/CodeGen/ARM/2011-02-04-AntidepMultidef.ll
@@ -103,12 +103,6 @@
   %35 = add i8 %33, 87
   %iftmp.5.0.7 = select i1 %32, i8 %34, i8 %35
   store volatile i8 %iftmp.5.0.7, i8* %p8, align 1
-  ; CHECK: umull [[REGISTER:lr|r[0-9]+]],
-  ; CHECK-NOT: [[REGISTER]],
-  ; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}}
-  ; CHECK: umull [[REGISTER:lr|r[0-9]+]],
-  ; CHECK-NOT: [[REGISTER]],
-  ; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}}
   %36 = udiv i32 %2, 100000000
   %37 = urem i32 %36, 10
   %38 = icmp ult i32 %37, 10
Index: test/CodeGen/ARM/loop-indexing.ll
===================================================================
--- test/CodeGen/ARM/loop-indexing.ll
+++ test/CodeGen/ARM/loop-indexing.ll
@@ -1,9 +1,9 @@
-; RUN: llc -mtriple=thumbv7em -mattr=+fp-armv8 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2
-; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2
+; RUN: llc -mtriple=thumbv7em -mattr=+fp-armv8 -O3 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2
+; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -O3 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2
 ; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -lsr-backedge-indexing=false %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
 ; RUN: llc -mtriple=thumbv8m.base %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
 ; RUN: llc -mtriple=thumbv8 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
-; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -lsr-complexity-limit=2147483647 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-COMPLEX --check-prefix=CHECK-T2
+; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -O3 -lsr-complexity-limit=2147483647 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-COMPLEX --check-prefix=CHECK-T2
 
 ; Tests to check that post increment addressing modes are used instead of
 ; updating base pointers with add instructions.
Index: test/CodeGen/ARM/vldm-sched-a9.ll
===================================================================
--- test/CodeGen/ARM/vldm-sched-a9.ll
+++ test/CodeGen/ARM/vldm-sched-a9.ll
@@ -5,67 +5,133 @@
 ; This test used to test vector spilling using vstmia/vldmia instructions, but
 ; the changes for PR:18825 prevent that spilling.
+; VST1 and VLD1 are now used for spilling/restoring.
+;
+; TODO:
+; I think more vldm instructions should be generated; the initial ones are used
+; to load some elements and then a sequence of vldr instructions follows:
+;   vldr d15, [r1, #104]
+;   vldr d13, [r2, #96]
+;   vldr d9, [r1, #120]
+;   vldr d11, [r2, #112]
+;   vldr d14, [r1, #96]
+;   vldr d12, [r2, #88]
+;   vldr d8, [r1, #112]
+;   vldr d10, [r2, #104]
+
+; This pattern also repeats several times, which strongly suggests that a
+; vld1.64 should be used to load the data:
+;   vldr d16, [r1, #16]
+;   vldr d17, [r1, #24]
+;   vst1.64 {d16, d17}, [lr:128] @ 16-byte Spill
+
 ; CHECK: test:
-; CHECK: vstmia
-; CHECK: vldmia
-define void @test(i64* %src) #0 {
+; CHECK: vldmia r{{.*}}, {d{{.*}}, d{{.*}}}
+; CHECK: vldmia r{{.*}}, {d{{.*}}, d{{.*}}}
+define <16 x i64> @test(i64* %src0, i64* %src1) #0 {
 entry:
-  %arrayidx39 = getelementptr inbounds i64, i64* %src, i32 13
-  %vecinit285 = shufflevector <16 x i64> undef, <16 x i64> , <16 x i32>
-  store <16 x i64> %vecinit285, <16 x i64>* undef, align 128
-  %0 = load i64, i64* undef, align 8
-  %vecinit379 = insertelement <16 x i64> undef, i64 %0, i32 9
-  %1 = load i64, i64* undef, align 8
-  %vecinit419 = insertelement <16 x i64> undef, i64 %1, i32 15
-  store <16 x i64> %vecinit419, <16 x i64>* undef, align 128
-  %vecinit579 = insertelement <16 x i64> undef, i64 0, i32 4
-  %vecinit582 = shufflevector <16 x i64> %vecinit579, <16 x i64> , <16 x i32>
-  %vecinit584 = insertelement <16 x i64> %vecinit582, i64 undef, i32 9
-  %vecinit586 = insertelement <16 x i64> %vecinit584, i64 0, i32 10
-  %vecinit589 = shufflevector <16 x i64> %vecinit586, <16 x i64> , <16 x i32>
-  %2 = load i64, i64* undef, align 8
-  %vecinit591 = insertelement <16 x i64> %vecinit589, i64 %2, i32 15
-  store <16 x i64> %vecinit591, <16 x i64>* undef, align 128
-  %vecinit694 = shufflevector <16 x i64> undef, <16 x i64> , <16 x i32>
-  store <16 x i64> %vecinit694, <16 x i64>* undef, align 128
-  %3 = load i64, i64* undef, align 8
-  %vecinit1331 = insertelement <16 x i64> undef, i64 %3, i32 14
-  %4 = load i64, i64* undef, align 8
-  %vecinit1468 = insertelement <16 x i64> undef, i64 %4, i32 11
-  %vecinit1471 = shufflevector <16 x i64> %vecinit1468, <16 x i64> , <16 x i32>
-  %vecinit1474 = shufflevector <16 x i64> %vecinit1471, <16 x i64> , <16 x i32>
-  store <16 x i64> %vecinit1474, <16 x i64>* undef, align 128
-  %vecinit1552 = shufflevector <16 x i64> undef, <16 x i64> , <16 x i32>
-  %vecinit1555 = shufflevector <16 x i64> %vecinit1552, <16 x i64> , <16 x i32>
-  %vecinit1558 = shufflevector <16 x i64> %vecinit1555, <16 x i64> , <16 x i32>
-  store <16 x i64> %vecinit1558, <16 x i64>* undef, align 128
-  %vecinit1591 = shufflevector <16 x i64> undef, <16 x i64> , <16 x i32>
-  %vecinit1594 = shufflevector <16 x i64> %vecinit1591, <16 x i64> , <16 x i32>
-  %vecinit1597 = shufflevector <16 x i64> %vecinit1594, <16 x i64> , <16 x i32>
-  %vecinit1599 = insertelement <16 x i64> %vecinit1597, i64 undef, i32 8
-  %vecinit1601 = insertelement <16 x i64> %vecinit1599, i64 undef, i32 9
-  %vecinit1603 = insertelement <16 x i64> %vecinit1601, i64 undef, i32 10
-  %5 = load i64, i64* undef, align 8
-  %vecinit1605 = insertelement <16 x i64> %vecinit1603, i64 %5, i32 11
-  %vecinit1608 = shufflevector <16 x i64> %vecinit1605, <16 x i64> , <16 x i32>
-  %6 = load i64, i64* undef, align 8
-  %vecinit1610 = insertelement <16 x i64> %vecinit1608, i64 %6, i32 15
-  store <16 x i64> %vecinit1610, <16 x i64>* undef, align 128
-  %vecinit2226 = shufflevector <16 x i64> undef, <16 x i64> , <16 x i32>
-  %7 = load i64, i64* undef, align 8
-  %vecinit2228 = insertelement <16 x i64> %vecinit2226, i64 %7, i32 8
-  %vecinit2230 = insertelement <16 x i64> %vecinit2228, i64 undef, i32 9
-  %vecinit2233 = shufflevector <16 x i64> %vecinit2230, <16 x i64> , <16 x i32>
-  %vecinit2236 = shufflevector <16 x i64> %vecinit2233, <16 x i64> , <16 x i32>
-  store <16 x i64> %vecinit2236, <16 x i64>* undef, align 128
-  %vecinit2246 = shufflevector <16 x i64> undef, <16 x i64> , <16 x i32>
-  %vecinit2249 = shufflevector <16 x i64> %vecinit2246, <16 x i64> , <16 x i32>
-  %vecinit2252 = shufflevector <16 x i64> %vecinit2249, <16 x i64> , <16 x i32>
-  %vecinit2255 = shufflevector <16 x i64> %vecinit2252, <16 x i64> , <16 x i32>
-  %8 = load i64, i64* %arrayidx39, align 8
-  %vecinit2257 = insertelement <16 x i64> %vecinit2255, i64 %8, i32 13
-  %vecinit2260 = shufflevector <16 x i64> %vecinit2257, <16 x i64> , <16 x i32>
-  store <16 x i64> %vecinit2260, <16 x i64>* null, align 128
-  ret void
+  %addr.0 = getelementptr inbounds i64, i64* %src0, i32 0
+  %el.0 = load i64, i64* %addr.0, align 8
+  %addr.1 = getelementptr inbounds i64, i64* %src0, i32 1
+  %el.1 = load i64, i64* %addr.1, align 8
+  %addr.2 = getelementptr inbounds i64, i64* %src0, i32 2
+  %el.2 = load i64, i64* %addr.2, align 8
+  %addr.3 = getelementptr inbounds i64, i64* %src0, i32 3
+  %el.3 = load i64, i64* %addr.3, align 8
+  %addr.4 = getelementptr inbounds i64, i64* %src0, i32 4
+  %el.4 = load i64, i64* %addr.4, align 8
+  %addr.5 = getelementptr inbounds i64, i64* %src0, i32 5
+  %el.5 = load i64, i64* %addr.5, align 8
+  %addr.6 = getelementptr inbounds i64, i64* %src0, i32 6
+  %el.6 = load i64, i64* %addr.6, align 8
+  %addr.7 = getelementptr inbounds i64, i64* %src0, i32 7
+  %el.7 = load i64, i64* %addr.7, align 8
+  %addr.8 = getelementptr inbounds i64, i64* %src0, i32 8
+  %el.8 = load i64, i64* %addr.8, align 8
+  %addr.9 = getelementptr inbounds i64, i64* %src0, i32 9
+  %el.9 = load i64, i64* %addr.9, align 8
+  %addr.10 = getelementptr inbounds i64, i64* %src0, i32 10
+  %el.10 = load i64, i64* %addr.10, align 8
+  %addr.11 = getelementptr inbounds i64, i64* %src0, i32 11
+  %el.11 = load i64, i64* %addr.11, align 8
+  %addr.12 = getelementptr inbounds i64, i64* %src0, i32 12
+  %el.12 = load i64, i64* %addr.12, align 8
+  %addr.13 = getelementptr inbounds i64, i64* %src0, i32 13
+  %el.13 = load i64, i64* %addr.13, align 8
+  %addr.14 = getelementptr inbounds i64, i64* %src0, i32 14
+  %el.14 = load i64, i64* %addr.14, align 8
+  %addr.15 = getelementptr inbounds i64, i64* %src0, i32 15
+  %el.15 = load i64, i64* %addr.15, align 8
+
+  %addr.0.1 = getelementptr inbounds i64, i64* %src1, i32 0
+  %el.0.1 = load i64, i64* %addr.0.1, align 8
+  %addr.1.1 = getelementptr inbounds i64, i64* %src1, i32 1
+  %el.1.1 = load i64, i64* %addr.1.1, align 8
+  %addr.2.1 = getelementptr inbounds i64, i64* %src1, i32 2
+  %el.2.1 = load i64, i64* %addr.2.1, align 8
+  %addr.3.1 = getelementptr inbounds i64, i64* %src1, i32 3
+  %el.3.1 = load i64, i64* %addr.3.1, align 8
+  %addr.4.1 = getelementptr inbounds i64, i64* %src1, i32 4
+  %el.4.1 = load i64, i64* %addr.4.1, align 8
+  %addr.5.1 = getelementptr inbounds i64, i64* %src1, i32 5
+  %el.5.1 = load i64, i64* %addr.5.1, align 8
+  %addr.6.1 = getelementptr inbounds i64, i64* %src1, i32 6
+  %el.6.1 = load i64, i64* %addr.6.1, align 8
+  %addr.7.1 = getelementptr inbounds i64, i64* %src1, i32 7
+  %el.7.1 = load i64, i64* %addr.7.1, align 8
+  %addr.8.1 = getelementptr inbounds i64, i64* %src1, i32 8
+  %el.8.1 = load i64, i64* %addr.8.1, align 8
+  %addr.9.1 = getelementptr inbounds i64, i64* %src1, i32 9
+  %el.9.1 = load i64, i64* %addr.9.1, align 8
+  %addr.10.1 = getelementptr inbounds i64, i64* %src1, i32 10
+  %el.10.1 = load i64, i64* %addr.10.1, align 8
+  %addr.11.1 = getelementptr inbounds i64, i64* %src1, i32 11
+  %el.11.1 = load i64, i64* %addr.11.1, align 8
+  %addr.12.1 = getelementptr inbounds i64, i64* %src1, i32 12
+  %el.12.1 = load i64, i64* %addr.12.1, align 8
+  %addr.13.1 = getelementptr inbounds i64, i64* %src1, i32 13
+  %el.13.1 = load i64, i64* %addr.13.1, align 8
+  %addr.14.1 = getelementptr inbounds i64, i64* %src1, i32 14
+  %el.14.1 = load i64, i64* %addr.14.1, align 8
+  %addr.15.1 = getelementptr inbounds i64, i64* %src1, i32 15
+  %el.15.1 = load i64, i64* %addr.15.1, align 8
+  %vec.0 = insertelement <16 x i64> undef, i64 %el.0, i32 0
+  %vec.1 = insertelement <16 x i64> %vec.0, i64 %el.1, i32 1
+  %vec.2 = insertelement <16 x i64> %vec.1, i64 %el.2, i32 2
+  %vec.3 = insertelement <16 x i64> %vec.2, i64 %el.3, i32 3
+  %vec.4 = insertelement <16 x i64> %vec.3, i64 %el.4, i32 4
+  %vec.5 = insertelement <16 x i64> %vec.4, i64 %el.5, i32 5
+  %vec.6 = insertelement <16 x i64> %vec.5, i64 %el.6, i32 6
+  %vec.7 = insertelement <16 x i64> %vec.6, i64 %el.7, i32 7
+  %vec.8 = insertelement <16 x i64> %vec.7, i64 %el.8, i32 8
+  %vec.9 = insertelement <16 x i64> %vec.8, i64 %el.9, i32 9
+  %vec.10 = insertelement <16 x i64> %vec.9, i64 %el.10, i32 10
+  %vec.11 = insertelement <16 x i64> %vec.10, i64 %el.11, i32 11
+  %vec.12 = insertelement <16 x i64> %vec.11, i64 %el.12, i32 12
+  %vec.13 = insertelement <16 x i64> %vec.12, i64 %el.13, i32 13
+  %vec.14 = insertelement <16 x i64> %vec.13, i64 %el.14, i32 14
+  %vec.15 = insertelement <16 x i64> %vec.14, i64 %el.15, i32 15
+  call void @capture(i64* %src0, i64* %src1)
+  %vec.0.1 = insertelement <16 x i64> undef, i64 %el.0.1, i32 0
+  %vec.1.1 = insertelement <16 x i64> %vec.0.1, i64 %el.1.1, i32 1
+  %vec.2.1 = insertelement <16 x i64> %vec.1.1, i64 %el.2.1, i32 2
+  %vec.3.1 = insertelement <16 x i64> %vec.2.1, i64 %el.3.1, i32 3
+  %vec.4.1 = insertelement <16 x i64> %vec.3.1, i64 %el.4.1, i32 4
+  %vec.5.1 = insertelement <16 x i64> %vec.4.1, i64 %el.5.1, i32 5
+  %vec.6.1 = insertelement <16 x i64> %vec.5.1, i64 %el.6.1, i32 6
+  %vec.7.1 = insertelement <16 x i64> %vec.6.1, i64 %el.7.1, i32 7
+  %vec.8.1 = insertelement <16 x i64> %vec.7.1, i64 %el.7.1, i32 8
+  %vec.9.1 = insertelement <16 x i64> %vec.8.1, i64 %el.8.1, i32 9
+  %vec.10.1 = insertelement <16 x i64> %vec.9.1, i64 %el.9.1, i32 10
+  %vec.11.1 = insertelement <16 x i64> %vec.10.1, i64 %el.10.1, i32 11
+  %vec.12.1 = insertelement <16 x i64> %vec.11.1, i64 %el.11.1, i32 12
+  %vec.13.1 = insertelement <16 x i64> %vec.12.1, i64 %el.12.1, i32 13
+  %vec.14.1 = insertelement <16 x i64> %vec.13.1, i64 %el.13.1, i32 14
+  %vec.15.1 = insertelement <16 x i64> %vec.14.1, i64 %el.14.1, i32 15
+  %res = add <16 x i64> %vec.15, %vec.15.1
+  ret <16 x i64> %res
 }
+
+declare void @capture(i64*, i64*)
+
 attributes #0 = { noredzone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
"no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/CodeGen/Thumb2/2010-04-15-DynAllocBug.ll =================================================================== --- test/CodeGen/Thumb2/2010-04-15-DynAllocBug.ll +++ test/CodeGen/Thumb2/2010-04-15-DynAllocBug.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -arm-atomic-cfg-tidy=0 -O3 | FileCheck %s +; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -arm-atomic-cfg-tidy=0 -O2 | FileCheck %s ; rdar://7493908 ; Make sure the result of the first dynamic_alloc isn't copied back to sp more Index: test/CodeGen/Thumb2/2010-06-21-TailMergeBug.ll =================================================================== --- test/CodeGen/Thumb2/2010-06-21-TailMergeBug.ll +++ test/CodeGen/Thumb2/2010-06-21-TailMergeBug.ll @@ -32,13 +32,12 @@ define fastcc i32 @parse_percent_token() nounwind { entry: -; CHECK: pop -; CHECK: pop -; CHECK: pop -; CHECK: pop -; CHECK: pop -; CHECK: pop -; CHECK: pop +; CHECK: bx lr +; CHECK: bx lr +; CHECK: bx lr +; CHECK: bx lr +; CHECK: bx lr +; CHECK: bx lr ; Do not convert into single stream code. BranchProbability Analysis assumes ; that branches which goes to "ret" instruction have lower probabilities. switch i32 undef, label %bb7 [