diff --git a/llvm/lib/Target/ARM/ARMParallelDSP.cpp b/llvm/lib/Target/ARM/ARMParallelDSP.cpp
--- a/llvm/lib/Target/ARM/ARMParallelDSP.cpp
+++ b/llvm/lib/Target/ARM/ARMParallelDSP.cpp
@@ -18,6 +18,7 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/OrderedBasicBlock.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/NoFolder.h"
 #include "llvm/Transforms/Scalar.h"
@@ -42,6 +43,10 @@
 DisableParallelDSP("disable-arm-parallel-dsp", cl::Hidden, cl::init(false),
                    cl::desc("Disable the ARM Parallel DSP pass"));
 
+static cl::opt<unsigned>
+NumLoadLimit("arm-parallel-dsp-load-limit", cl::Hidden, cl::init(16),
+             cl::desc("Limit the number of loads analysed"));
+
 namespace {
   struct MulCandidate;
   class Reduction;
@@ -346,6 +351,7 @@
   SmallVector<Instruction*, 8> Writes;
   LoadPairs.clear();
   WideLoads.clear();
+  OrderedBasicBlock OrderedBB(BB);
 
   // Collect loads and instruction that may write to memory. For now we only
   // record loads which are simple, sign-extended and have a single user.
@@ -360,21 +366,24 @@
       Loads.push_back(Ld);
   }
 
+  if (Loads.empty() || Loads.size() > NumLoadLimit)
+    return false;
+
   using InstSet = std::set<Instruction*>;
   using DepMap = std::map<Instruction*, InstSet>;
   DepMap RAWDeps;
 
   // Record any writes that may alias a load.
   const auto Size = LocationSize::unknown();
-  for (auto Read : Loads) {
-    for (auto Write : Writes) {
+  for (auto Write : Writes) {
+    for (auto Read : Loads) {
       MemoryLocation ReadLoc =
         MemoryLocation(Read->getPointerOperand(), Size);
 
       if (!isModOrRefSet(intersectModRef(AA->getModRefInfo(Write, ReadLoc),
           ModRefInfo::ModRef)))
         continue;
-      if (DT->dominates(Write, Read))
+      if (OrderedBB.dominates(Write, Read))
         RAWDeps[Read].insert(Write);
     }
   }
 
@@ -382,8 +391,8 @@
   // Check whether there's not a write between the two loads which would
   // prevent them from being safely merged.
   auto SafeToPair = [&](LoadInst *Base, LoadInst *Offset) {
-    LoadInst *Dominator = DT->dominates(Base, Offset) ? 
Base : Offset;
-    LoadInst *Dominated = DT->dominates(Base, Offset) ? Offset : Base;
+    LoadInst *Dominator = OrderedBB.dominates(Base, Offset) ? Base : Offset;
+    LoadInst *Dominated = OrderedBB.dominates(Base, Offset) ? Offset : Base;
 
     if (RAWDeps.count(Dominated)) {
       InstSet &WritesBefore = RAWDeps[Dominated];
@@ -391,7 +400,7 @@
       for (auto Before : WritesBefore) {
         // We can't move the second load backward, past a write, to merge
         // with the first load.
-        if (DT->dominates(Dominator, Before))
+        if (OrderedBB.dominates(Dominator, Before))
           return false;
       }
     }
@@ -401,7 +410,7 @@
   // Record base, offset load pairs.
   for (auto *Base : Loads) {
     for (auto *Offset : Loads) {
-      if (Base == Offset)
+      if (Base == Offset || OffsetLoads.count(Offset))
         continue;
 
       if (AreSequentialAccesses(Base, Offset, *DL, *SE) &&
@@ -613,7 +622,6 @@
   return !R.getMulPairs().empty();
 }
 
-
 void ARMParallelDSP::InsertParallelMACs(Reduction &R) {
 
   auto CreateSMLAD = [&](LoadInst* WideLd0, LoadInst *WideLd1,
@@ -633,30 +641,45 @@
       Intrinsic::getDeclaration(M, Intrinsic::arm_smlald);
 
     IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
-                                ++BasicBlock::iterator(InsertAfter));
+                                BasicBlock::iterator(InsertAfter));
     Instruction *Call = Builder.CreateCall(SMLAD, Args);
     NumSMLAD++;
     return Call;
   };
 
-  Instruction *InsertAfter = R.getRoot();
+  // Return the instruction after the dominated instruction.
+  auto GetInsertPoint = [this](Value *A, Value *B) {
+    assert((isa<Instruction>(A) || isa<Instruction>(B)) &&
+           "expected at least one instruction");
+
+    Value *V = nullptr;
+    if (!isa<Instruction>(A))
+      V = B;
+    else if (!isa<Instruction>(B))
+      V = A;
+    else
+      V = DT->dominates(cast<Instruction>(A), cast<Instruction>(B)) ? B : A;
+
+    return &*++BasicBlock::iterator(cast<Instruction>(V));
+  };
+
   Value *Acc = R.getAccumulator();
 
   // For any muls that were discovered but not paired, accumulate their values
   // as before. 
-  IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
-                              ++BasicBlock::iterator(InsertAfter));
+  IRBuilder<NoFolder> Builder(R.getRoot()->getParent());
 
   MulCandList &MulCands = R.getMuls();
   for (auto &MulCand : MulCands) {
     if (MulCand->Paired)
       continue;
 
-    Value *Mul = MulCand->Root;
+    Instruction *Mul = cast<Instruction>(MulCand->Root);
     LLVM_DEBUG(dbgs() << "Accumulating unpaired mul: " << *Mul << "\n");
 
     if (R.getType() != Mul->getType()) {
       assert(R.is64Bit() && "expected 64-bit result");
-      Mul = Builder.CreateSExt(Mul, R.getType());
+      Builder.SetInsertPoint(&*++BasicBlock::iterator(Mul));
+      Mul = cast<Instruction>(Builder.CreateSExt(Mul, R.getRoot()->getType()));
     }
 
     if (!Acc) {
@@ -664,8 +687,11 @@
       continue;
     }
 
+    // If Acc is the original incoming value to the reduction, it could be a
+    // phi. But the phi will dominate Mul, meaning that Mul will be the
+    // insertion point.
+    Builder.SetInsertPoint(GetInsertPoint(Mul, Acc));
     Acc = Builder.CreateAdd(Mul, Acc);
-    InsertAfter = cast<Instruction>(Acc);
   }
 
   if (!Acc) {
@@ -677,6 +703,14 @@
     Acc = Builder.CreateSExt(Acc, R.getType());
   }
 
+  // Roughly sort the mul pairs in their program order.
+  OrderedBasicBlock OrderedBB(R.getRoot()->getParent());
+  llvm::sort(R.getMulPairs(), [&OrderedBB](auto &PairA, auto &PairB) {
+    const Instruction *A = PairA.first->Root;
+    const Instruction *B = PairB.first->Root;
+    return OrderedBB.dominates(A, B);
+  });
+
   IntegerType *Ty = IntegerType::get(M->getContext(), 32);
   for (auto &Pair : R.getMulPairs()) {
     MulCandidate *LHSMul = Pair.first;
@@ -688,8 +722,9 @@
     LoadInst *WideRHS = WideLoads.count(BaseRHS) ? 
WideLoads[BaseRHS]->getLoad() : CreateWideLoad(RHSMul->VecLd, Ty); + Instruction *InsertAfter = GetInsertPoint(WideLHS, WideRHS); + InsertAfter = GetInsertPoint(InsertAfter, Acc); Acc = CreateSMLAD(WideLHS, WideRHS, Acc, RHSMul->Exchange, InsertAfter); - InsertAfter = cast(Acc); } R.UpdateRoot(cast(Acc)); } diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/blocks.ll b/llvm/test/CodeGen/ARM/ParallelDSP/blocks.ll --- a/llvm/test/CodeGen/ARM/ParallelDSP/blocks.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/blocks.ll @@ -134,3 +134,162 @@ ret i32 %res } +; TODO: Four smlads should be generated here, but mul.0 and mul.3 remain as +; scalars. +; CHECK-LABEL: num_load_limit +; CHECK: call i32 @llvm.arm.smlad +; CHECK: call i32 @llvm.arm.smlad +; CHECK: call i32 @llvm.arm.smlad +; CHECK-NOT: call i32 @llvm.arm.smlad +define i32 @num_load_limit(i16* %a, i16* %b, i32 %acc) { +entry: + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.0 + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.1 = load i16, i16* %addr.a.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %mul.1 = mul i32 %sext.a.1, %sext.b.1 + %add.0 = add i32 %mul.0, %mul.1 + + %addr.a.2 = getelementptr i16, i16* %a, i32 2 + %addr.b.2 = getelementptr i16, i16* %b, i32 2 + %ld.a.2 = load i16, i16* %addr.a.2 + %sext.a.2 = sext i16 %ld.a.2 to i32 + %ld.b.2 = load i16, i16* %addr.b.2 + %sext.b.2 = sext i16 %ld.b.2 to i32 + %mul.2 = mul i32 %sext.a.0, %sext.b.0 + %addr.a.3 = getelementptr i16, i16* %a, i32 3 + %addr.b.3 = getelementptr i16, i16* %b, i32 3 + %ld.a.3 = load i16, i16* %addr.a.3 + %sext.a.3 = sext i16 %ld.a.3 to i32 + %ld.b.3 = load i16, i16* %addr.b.3 + %sext.b.3 = sext i16 %ld.b.3 to i32 + %mul.3 = mul i32 %sext.a.1, %sext.b.3 + %add.3 = add i32 %mul.2, %mul.3 + + %addr.a.4 = 
getelementptr i16, i16* %a, i32 4 + %addr.b.4 = getelementptr i16, i16* %b, i32 4 + %ld.a.4 = load i16, i16* %addr.a.4 + %sext.a.4 = sext i16 %ld.a.4 to i32 + %ld.b.4 = load i16, i16* %addr.b.4 + %sext.b.4 = sext i16 %ld.b.4 to i32 + %mul.4 = mul i32 %sext.a.4, %sext.b.4 + %addr.a.5 = getelementptr i16, i16* %a, i32 5 + %addr.b.5 = getelementptr i16, i16* %b, i32 5 + %ld.a.5 = load i16, i16* %addr.a.5 + %sext.a.5 = sext i16 %ld.a.5 to i32 + %ld.b.5 = load i16, i16* %addr.b.5 + %sext.b.5 = sext i16 %ld.b.5 to i32 + %mul.5 = mul i32 %sext.a.5, %sext.b.5 + %add.5 = add i32 %mul.4, %mul.5 + + %addr.a.6 = getelementptr i16, i16* %a, i32 6 + %addr.b.6 = getelementptr i16, i16* %b, i32 6 + %ld.a.6 = load i16, i16* %addr.a.6 + %sext.a.6 = sext i16 %ld.a.6 to i32 + %ld.b.6 = load i16, i16* %addr.b.6 + %sext.b.6 = sext i16 %ld.b.6 to i32 + %mul.6 = mul i32 %sext.a.6, %sext.b.6 + %addr.a.7 = getelementptr i16, i16* %a, i32 7 + %addr.b.7 = getelementptr i16, i16* %b, i32 7 + %ld.a.7 = load i16, i16* %addr.a.7 + %sext.a.7 = sext i16 %ld.a.7 to i32 + %ld.b.7 = load i16, i16* %addr.b.7 + %sext.b.7 = sext i16 %ld.b.7 to i32 + %mul.7 = mul i32 %sext.a.7, %sext.b.7 + %add.7 = add i32 %mul.6, %mul.7 + + %add.10 = add i32 %add.7, %add.5 + %add.11 = add i32 %add.3, %add.0 + %add.12 = add i32 %add.10, %add.11 + %res = add i32 %add.12, %acc + ret i32 %res +} + +; CHECK-LABEL: too_many_loads +; CHECK-NOT: call i32 @llvm.arm.smlad +define i32 @too_many_loads(i16* %a, i16* %b, i32 %acc) { +entry: + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.0 + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.1 = load i16, i16* %addr.a.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %mul.1 = mul i32 %sext.a.1, %sext.b.1 + %add.0 = add i32 %mul.0, %mul.1 + + 
%addr.a.2 = getelementptr i16, i16* %a, i32 2 + %addr.b.2 = getelementptr i16, i16* %b, i32 2 + %ld.a.2 = load i16, i16* %addr.a.2 + %sext.a.2 = sext i16 %ld.a.2 to i32 + %ld.b.2 = load i16, i16* %addr.b.2 + %sext.b.2 = sext i16 %ld.b.2 to i32 + %mul.2 = mul i32 %sext.a.0, %sext.b.0 + %addr.a.3 = getelementptr i16, i16* %a, i32 3 + %addr.b.3 = getelementptr i16, i16* %b, i32 3 + %ld.a.3 = load i16, i16* %addr.a.3 + %sext.a.3 = sext i16 %ld.a.3 to i32 + %ld.b.3 = load i16, i16* %addr.b.3 + %sext.b.3 = sext i16 %ld.b.3 to i32 + %mul.3 = mul i32 %sext.a.1, %sext.b.3 + %add.3 = add i32 %mul.2, %mul.3 + + %addr.a.4 = getelementptr i16, i16* %a, i32 4 + %addr.b.4 = getelementptr i16, i16* %b, i32 4 + %ld.a.4 = load i16, i16* %addr.a.4 + %sext.a.4 = sext i16 %ld.a.4 to i32 + %ld.b.4 = load i16, i16* %addr.b.4 + %sext.b.4 = sext i16 %ld.b.4 to i32 + %mul.4 = mul i32 %sext.a.4, %sext.b.4 + %addr.a.5 = getelementptr i16, i16* %a, i32 5 + %addr.b.5 = getelementptr i16, i16* %b, i32 5 + %ld.a.5 = load i16, i16* %addr.a.5 + %sext.a.5 = sext i16 %ld.a.5 to i32 + %ld.b.5 = load i16, i16* %addr.b.5 + %sext.b.5 = sext i16 %ld.b.5 to i32 + %mul.5 = mul i32 %sext.a.5, %sext.b.5 + %add.5 = add i32 %mul.4, %mul.5 + + %addr.a.6 = getelementptr i16, i16* %a, i32 6 + %addr.b.6 = getelementptr i16, i16* %b, i32 6 + %ld.a.6 = load i16, i16* %addr.a.6 + %sext.a.6 = sext i16 %ld.a.6 to i32 + %ld.b.6 = load i16, i16* %addr.b.6 + %sext.b.6 = sext i16 %ld.b.6 to i32 + %mul.6 = mul i32 %sext.a.6, %sext.b.6 + %addr.a.7 = getelementptr i16, i16* %a, i32 7 + %addr.b.7 = getelementptr i16, i16* %b, i32 7 + %ld.a.7 = load i16, i16* %addr.a.7 + %sext.a.7 = sext i16 %ld.a.7 to i32 + %ld.b.7 = load i16, i16* %addr.b.7 + %sext.b.7 = sext i16 %ld.b.7 to i32 + %mul.7 = mul i32 %sext.a.7, %sext.b.7 + %add.7 = add i32 %mul.6, %mul.7 + + %addr.a.8 = getelementptr i16, i16* %a, i32 7 + %addr.b.8 = getelementptr i16, i16* %b, i32 7 + %ld.a.8 = load i16, i16* %addr.a.8 + %sext.a.8 = sext i16 %ld.a.8 to i32 + 
%ld.b.8 = load i16, i16* %addr.b.8
+  %sext.b.8 = sext i16 %ld.b.8 to i32
+  %mul.8 = mul i32 %sext.a.8, %sext.b.8
+
+  %add.10 = add i32 %add.7, %add.5
+  %add.11 = add i32 %add.3, %add.0
+  %add.12 = add i32 %add.10, %add.11
+  %add.13 = add i32 %add.12, %acc
+  %res = add i32 %add.13, %mul.8
+  ret i32 %res
+}
diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/complex_dot_prod.ll b/llvm/test/CodeGen/ARM/ParallelDSP/complex_dot_prod.ll
--- a/llvm/test/CodeGen/ARM/ParallelDSP/complex_dot_prod.ll
+++ b/llvm/test/CodeGen/ARM/ParallelDSP/complex_dot_prod.ll
@@ -1,21 +1,51 @@
-; RUN: llc -mtriple=thumbv7em -mcpu=cortex-m4 -O3 %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv7em -mcpu=cortex-m4 -O3 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LLC
+; RUN: opt -S -mtriple=armv7-a -arm-parallel-dsp -dce %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-OPT
 
 ; TODO: Think we should be able to use smlsdx/smlsldx here.
 
 ; CHECK-LABEL: complex_dot_prod
-; CHECK: smulbb
-; CHECK: smultt
-; CHECK: smlalbb
-; CHECK: smultt
-; CHECK: smlalbb
-; CHECK: smultt
-; CHECK: smlalbb
-; CHECK: smultt
-; CHECK: smlaldx
-; CHECK: smlaldx
-; CHECK: smlaldx
-; CHECK: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+; CHECK-LLC: smlaldx
+; CHECK-LLC: smulbb
+; CHECK-LLC: smultt
+; CHECK-LLC: smlaldx
+; CHECK-LLC: smlalbb
+; CHECK-LLC: smultt
+; CHECK-LLC: smlalbb
+; CHECK-LLC: smultt
+; CHECK-LLC: smlaldx
+; CHECK-LLC: smlalbb
+; CHECK-LLC: smultt
+; CHECK-LLC: smlaldx
+; CHECK-LLC: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
+
+; CHECK-OPT: [[ADDR_A:%[^ ]+]] = bitcast i16* %pSrcA to i32*
+; CHECK-OPT: [[A:%[^ ]+]] = load i32, i32* [[ADDR_A]], align 2
+; CHECK-OPT: [[ADDR_A_2:%[^ ]+]] = getelementptr inbounds i16, i16* %pSrcA, i32 2
+; CHECK-OPT: [[ADDR_B:%[^ ]+]] = bitcast i16* %pSrcB to i32*
+; CHECK-OPT: [[B:%[^ ]+]] = load i32, i32* [[ADDR_B]], align 2
+; CHECK-OPT: [[ACC0:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[A]], i32 [[B]], i64 0)
+; CHECK-OPT: [[ADDR_B_2:%[^ ]+]] = 
getelementptr inbounds i16, i16* %pSrcB, i32 2 +; CHECK-OPT: [[CAST_ADDR_A_2:%[^ ]+]] = bitcast i16* [[ADDR_A_2]] to i32* +; CHECK-OPT: [[A_2:%[^ ]+]] = load i32, i32* [[CAST_ADDR_A_2]], align 2 +; CHECK-OPT: [[ADDR_A_4:%[^ ]+]] = getelementptr inbounds i16, i16* %pSrcA, i32 4 +; CHECK-OPT: [[CAST_ADDR_B_2:%[^ ]+]] = bitcast i16* [[ADDR_B_2]] to i32* +; CHECK-OPT: [[B_2:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_2]], align 2 +; CHECK-OPT: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[A_2]], i32 [[B_2]], i64 [[ACC0]]) +; CHECK-OPT: [[ADDR_B_4:%[^ ]+]] = getelementptr inbounds i16, i16* %pSrcB, i32 4 +; CHECK-OPT: [[CAST_ADDR_A_4:%[^ ]+]] = bitcast i16* [[ADDR_A_4]] to i32* +; CHECK-OPT: [[A_4:%[^ ]+]] = load i32, i32* [[CAST_ADDR_A_4]], align 2 +; CHECK-OPT: [[ADDR_A_6:%[^ ]+]] = getelementptr inbounds i16, i16* %pSrcA, i32 6 +; CHECK-OPT: [[CAST_ADDR_B_4:%[^ ]+]] = bitcast i16* [[ADDR_B_4]] to i32* +; CHECK-OPT: [[B_4:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_4]], align 2 +; CHECK-OPT: [[ACC2:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[A_4]], i32 [[B_4]], i64 [[ACC1]]) +; CHECK-OPT: [[ADDR_B_6:%[^ ]+]] = getelementptr inbounds i16, i16* %pSrcB, i32 6 +; CHECK-OPT: [[CAST_ADDR_A_6:%[^ ]+]] = bitcast i16* [[ADDR_A_6]] to i32* +; CHECK-OPT: [[A_6:%[^ ]+]] = load i32, i32* [[CAST_ADDR_A_6]], align 2 +; CHECK-OPT: [[CAST_ADDR_B_6:%[^ ]+]] = bitcast i16* [[ADDR_B_6]] to i32* +; CHECK-OPT: [[B_6:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_6]], align 2 +; CHECK-OPT: call i64 @llvm.arm.smlaldx(i32 [[A_6]], i32 [[B_6]], i64 [[ACC2]]) + define dso_local arm_aapcscc void @complex_dot_prod(i16* nocapture readonly %pSrcA, i16* nocapture readonly %pSrcB, i32* nocapture %realResult, i32* nocapture %imagResult) { entry: %incdec.ptr = getelementptr inbounds i16, i16* %pSrcA, i32 1 @@ -107,7 +137,7 @@ %conv85 = sext i32 %mul84 to i64 %sub86 = sub nsw i64 %add76, %conv85 %mul89 = mul nsw i32 %conv73, %conv82 - %conv90 = sext i32 %mul89 to i64 + %conv90 = sext i32 %mul89 to i64 %add81 = 
add nsw i64 %add67, %conv90 %add91 = add nsw i64 %add81, %conv80 %16 = lshr i64 %sub86, 6 diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/exchange.ll b/llvm/test/CodeGen/ARM/ParallelDSP/exchange.ll --- a/llvm/test/CodeGen/ARM/ParallelDSP/exchange.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/exchange.ll @@ -105,10 +105,10 @@ ; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* ; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]], i32 %acc ; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 ; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32* ; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] -; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]], i32 %acc ; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B]], i32 [[X]]) define i32 @exchange_multi_use_1(i16* %a, i16* %b, i32 %acc) { entry: @@ -144,10 +144,10 @@ ; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* ; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_A]], i32 [[LD_B]], i64 %acc ; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 ; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32* ; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] -; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_A]], i32 [[LD_B]], i64 %acc ; CHECK: call i64 @llvm.arm.smlald(i32 [[LD_A_2]], i32 [[LD_B]], i64 [[X]]) define i64 @exchange_multi_use_64_1(i16* %a, i16* %b, i64 %acc) { entry: @@ -184,10 +184,10 @@ ; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* ; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_A]], i32 [[LD_B]], i64 %acc ; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 ; 
CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32* ; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] -; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_A]], i32 [[LD_B]], i64 %acc ; CHECK: call i64 @llvm.arm.smlald(i32 [[LD_A_2]], i32 [[LD_B]], i64 [[X]]) define i64 @exchange_multi_use_64_2(i16* %a, i16* %b, i64 %acc) { entry: @@ -225,10 +225,10 @@ ; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* ; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc ; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 ; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32* ; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] -; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc ; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A_2]], i32 [[X]]) define i32 @exchange_multi_use_2(i16* %a, i16* %b, i32 %acc) { entry: @@ -306,8 +306,8 @@ ; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 ; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32* ; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] -; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A]], i32 [[LD_B]], i64 0) -; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_B]], i32 [[LD_A_2]], i64 [[ACC]]) +; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[LD_B]], i32 [[LD_A_2]], i64 0) +; CHECK: [[X:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A]], i32 [[LD_B]], i64 [[ACC]]) define i64 @exchange_multi_use_64_3(i16* %a, i16* %b, i64 %acc) { entry: %addr.a.1 = getelementptr i16, i16* %a, i32 1 diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/inner-full-unroll.ll b/llvm/test/CodeGen/ARM/ParallelDSP/inner-full-unroll.ll --- a/llvm/test/CodeGen/ARM/ParallelDSP/inner-full-unroll.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/inner-full-unroll.ll @@ -11,14 +11,14 @@ ; CHECK: 
[[BIJ_LD:%[^ ]+]] = load i32, i32* [[BIJ_CAST]], align 2 ; CHECK: [[CIJ_CAST:%[^ ]+]] = bitcast i16* [[CIJ]] to i32* ; CHECK: [[CIJ_LD:%[^ ]+]] = load i32, i32* [[CIJ_CAST]], align 2 +; CHECK: [[SMLAD0:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[CIJ_LD]], i32 [[BIJ_LD]], i32 0) ; CHECK: [[BIJ_2:%[^ ]+]] = getelementptr inbounds i16, i16* [[BIJ]], i32 2 ; CHECK: [[BIJ_2_CAST:%[^ ]+]] = bitcast i16* [[BIJ_2]] to i32* ; CHECK: [[BIJ_2_LD:%[^ ]+]] = load i32, i32* [[BIJ_2_CAST]], align 2 ; CHECK: [[CIJ_2:%[^ ]+]] = getelementptr inbounds i16, i16* [[CIJ]], i32 2 ; CHECK: [[CIJ_2_CAST:%[^ ]+]] = bitcast i16* [[CIJ_2]] to i32* ; CHECK: [[CIJ_2_LD:%[^ ]+]] = load i32, i32* [[CIJ_2_CAST]], align 2 -; CHECK: [[SMLAD0:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[CIJ_2_LD]], i32 [[BIJ_2_LD]], i32 0) -; CHECK: [[SMLAD1:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[CIJ_LD]], i32 [[BIJ_LD]], i32 [[SMLAD0]]) +; CHECK: [[SMLAD1:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[CIJ_2_LD]], i32 [[BIJ_2_LD]], i32 [[SMLAD0]]) ; CHECK: store i32 [[SMLAD1]], i32* %arrayidx, align 4 define void @full_unroll(i32* noalias nocapture %a, i16** noalias nocapture readonly %b, i16** noalias nocapture readonly %c, i32 %N) { diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll b/llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll --- a/llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll @@ -19,9 +19,9 @@ ; CHECK-LE-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-LE-NEXT: ldr lr, [r3, #2]! ; CHECK-LE-NEXT: ldr r4, [r2, #2]! 
-; CHECK-LE-NEXT: sxtah r1, r1, lr ; CHECK-LE-NEXT: subs r0, #1 ; CHECK-LE-NEXT: smlad r12, r4, lr, r12 +; CHECK-LE-NEXT: sxtah r1, r1, lr ; CHECK-LE-NEXT: bne .LBB0_2 ; CHECK-LE-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-LE-NEXT: add.w r0, r12, r1 @@ -210,33 +210,33 @@ define i32 @mul_top_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) { ; CHECK-LE-LABEL: mul_top_user: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .save {r4, r5, r7, lr} -; CHECK-LE-NEXT: push {r4, r5, r7, lr} +; CHECK-LE-NEXT: .save {r4, lr} +; CHECK-LE-NEXT: push {r4, lr} ; CHECK-LE-NEXT: cmp r0, #1 ; CHECK-LE-NEXT: blt .LBB2_4 ; CHECK-LE-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-LE-NEXT: sub.w lr, r2, #2 +; CHECK-LE-NEXT: subs r2, #2 ; CHECK-LE-NEXT: subs r3, #2 ; CHECK-LE-NEXT: mov.w r12, #0 ; CHECK-LE-NEXT: movs r1, #0 ; CHECK-LE-NEXT: .p2align 2 ; CHECK-LE-NEXT: .LBB2_2: @ %for.body ; CHECK-LE-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-LE-NEXT: ldr r2, [lr, #2]! -; CHECK-LE-NEXT: ldr r4, [r3, #2]! -; CHECK-LE-NEXT: asrs r5, r2, #16 -; CHECK-LE-NEXT: smlad r12, r2, r4, r12 +; CHECK-LE-NEXT: ldr lr, [r3, #2]! +; CHECK-LE-NEXT: ldr r4, [r2, #2]! 
; CHECK-LE-NEXT: subs r0, #1 -; CHECK-LE-NEXT: mul r1, r5, r1 +; CHECK-LE-NEXT: smlad r12, r4, lr, r12 +; CHECK-LE-NEXT: asr.w r4, r4, #16 +; CHECK-LE-NEXT: mul r1, r4, r1 ; CHECK-LE-NEXT: bne .LBB2_2 ; CHECK-LE-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-LE-NEXT: add.w r0, r12, r1 -; CHECK-LE-NEXT: pop {r4, r5, r7, pc} +; CHECK-LE-NEXT: pop {r4, pc} ; CHECK-LE-NEXT: .LBB2_4: ; CHECK-LE-NEXT: mov.w r12, #0 ; CHECK-LE-NEXT: movs r1, #0 ; CHECK-LE-NEXT: add.w r0, r12, r1 -; CHECK-LE-NEXT: pop {r4, r5, r7, pc} +; CHECK-LE-NEXT: pop {r4, pc} ; ; CHECK-BE-LABEL: mul_top_user: ; CHECK-BE: @ %bb.0: @ %entry @@ -313,8 +313,8 @@ define i32 @and_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) { ; CHECK-LE-LABEL: and_user: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: .save {r4, r5, r7, lr} -; CHECK-LE-NEXT: push {r4, r5, r7, lr} +; CHECK-LE-NEXT: .save {r4, lr} +; CHECK-LE-NEXT: push {r4, lr} ; CHECK-LE-NEXT: cmp r0, #1 ; CHECK-LE-NEXT: blt .LBB3_4 ; CHECK-LE-NEXT: @ %bb.1: @ %for.body.preheader @@ -327,19 +327,19 @@ ; CHECK-LE-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-LE-NEXT: ldr r2, [r3, #2]! ; CHECK-LE-NEXT: ldr r4, [lr, #2]! 
-; CHECK-LE-NEXT: uxth r5, r2 -; CHECK-LE-NEXT: smlad r12, r4, r2, r12 ; CHECK-LE-NEXT: subs r0, #1 -; CHECK-LE-NEXT: mul r1, r5, r1 +; CHECK-LE-NEXT: smlad r12, r4, r2, r12 +; CHECK-LE-NEXT: uxth r2, r2 +; CHECK-LE-NEXT: mul r1, r2, r1 ; CHECK-LE-NEXT: bne .LBB3_2 ; CHECK-LE-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-LE-NEXT: add.w r0, r12, r1 -; CHECK-LE-NEXT: pop {r4, r5, r7, pc} +; CHECK-LE-NEXT: pop {r4, pc} ; CHECK-LE-NEXT: .LBB3_4: ; CHECK-LE-NEXT: mov.w r12, #0 ; CHECK-LE-NEXT: movs r1, #0 ; CHECK-LE-NEXT: add.w r0, r12, r1 -; CHECK-LE-NEXT: pop {r4, r5, r7, pc} +; CHECK-LE-NEXT: pop {r4, pc} ; ; CHECK-BE-LABEL: and_user: ; CHECK-BE: @ %bb.0: @ %entry diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/overlapping.ll b/llvm/test/CodeGen/ARM/ParallelDSP/overlapping.ll --- a/llvm/test/CodeGen/ARM/ParallelDSP/overlapping.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/overlapping.ll @@ -7,12 +7,12 @@ ; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* ; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc) ; CHECK: [[CAST_A_1:%[^ ]+]] = bitcast i16* [[ADDR_A_1]] to i32* ; CHECK: [[LD_A_1:%[^ ]+]] = load i32, i32* [[CAST_A_1]] ; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[ADDR_B_1]] to i32* ; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]] -; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_1]], i32 [[LD_B_1]], i32 %acc) -; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[ACC]]) +; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_1]], i32 [[LD_B_1]], i32 [[ACC]]) ; CHECK: ret i32 [[RES]] define i32 @overlap_1(i16* %a, i16* %b, i32 %acc) { entry: @@ -51,12 +51,12 @@ ; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* ; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[ACC:%[^ ]+]] = call i64 
@llvm.arm.smlald(i32 [[LD_A]], i32 [[LD_B]], i64 %acc) ; CHECK: [[CAST_A_1:%[^ ]+]] = bitcast i16* [[ADDR_A_1]] to i32* ; CHECK: [[LD_A_1:%[^ ]+]] = load i32, i32* [[CAST_A_1]] ; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[ADDR_B_1]] to i32* ; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]] -; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A_1]], i32 [[LD_B_1]], i64 %acc) -; CHECK: [[RES:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A]], i32 [[LD_B]], i64 [[ACC]]) +; CHECK: [[RES:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A_1]], i32 [[LD_B_1]], i64 [[ACC]]) ; CHECK: ret i64 [[RES]] define i64 @overlap_64_1(i16* %a, i16* %b, i64 %acc) { entry: @@ -133,13 +133,14 @@ ; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* ; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc) ; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32* ; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]] ; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 ; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32* ; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] -; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 %acc) -; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[SMLAD]]) +; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 [[SMLAD]]) +; CHECK: ret i32 [[RES]] define i32 @overlap_3(i16* %a, i16* %b, i32 %acc) { entry: %addr.a.1 = getelementptr i16, i16* %a, i32 1 @@ -178,13 +179,14 @@ ; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] ; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* ; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc) ; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32* ; CHECK: 
[[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]] ; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 ; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32* ; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] -; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 %acc) -; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[SMLAD]]) +; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 [[SMLAD]]) +; CHECK: ret i32 [[RES]] define i32 @overlap_4(i16* %a, i16* %b, i32 %acc) { entry: %addr.a.1 = getelementptr i16, i16* %a, i32 1 diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/pr43073.ll b/llvm/test/CodeGen/ARM/ParallelDSP/pr43073.ll --- a/llvm/test/CodeGen/ARM/ParallelDSP/pr43073.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/pr43073.ll @@ -15,14 +15,14 @@ ; CHECK: [[ADDR_B_PLUS_2:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 2 ; CHECK: [[CAST_ADDR_B_PLUS_2:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_2]] to i32* ; CHECK: [[B_PLUS_2:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_2]], align 2 +; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i32 [[ADD0]]) ; CHECK: [[ADDR_IN_MINUS_5:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -5 ; CHECK: [[CAST_ADDR_IN_MINUS_5:%[^ ]+]] = bitcast i16* [[ADDR_IN_MINUS_5]] to i32* ; CHECK: [[IN_MINUS_5:%[^ ]+]] = load i32, i32* [[CAST_ADDR_IN_MINUS_5]], align 2 ; CHECK: [[ADDR_B_PLUS_4:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 4 ; CHECK: [[CAST_ADDR_B_PLUS_4:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_4]] to i32* ; CHECK: [[B_PLUS_4:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_4]], align 2 -; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i32 [[ADD0]]) -; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i32 [[ACC]]) +; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i32 
[[ACC]]) ; CHECK: ret i32 [[RES]] define i32 @first_mul_invalid(i16* nocapture readonly %in, i16* nocapture readonly %b) { entry: @@ -88,14 +88,14 @@ ; CHECK: [[ADDR_B_PLUS_2:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 2 ; CHECK: [[CAST_ADDR_B_PLUS_2:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_2]] to i32* ; CHECK: [[B_PLUS_2:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_2]], align 2 +; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i32 [[MUL0]]) ; CHECK: [[ADDR_IN_MINUS_5:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -5 ; CHECK: [[CAST_ADDR_IN_MINUS_5:%[^ ]+]] = bitcast i16* [[ADDR_IN_MINUS_5]] to i32* ; CHECK: [[IN_MINUS_5:%[^ ]+]] = load i32, i32* [[CAST_ADDR_IN_MINUS_5]], align 2 ; CHECK: [[ADDR_B_PLUS_4:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 4 ; CHECK: [[CAST_ADDR_B_PLUS_4:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_4]] to i32* ; CHECK: [[B_PLUS_4:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_4]], align 2 -; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i32 [[MUL0]]) -; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i32 [[ACC]]) +; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i32 [[ACC]]) ; CHECK: ret i32 [[RES]] define i32 @with_no_acc_input(i16* nocapture readonly %in, i16* nocapture readonly %b) { entry: @@ -157,14 +157,14 @@ ; CHECK: [[ADDR_B_PLUS_2:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 2 ; CHECK: [[CAST_ADDR_B_PLUS_2:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_2]] to i32* ; CHECK: [[B_PLUS_2:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_2]], align 2 +; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i64 [[ADD0]]) ; CHECK: [[ADDR_IN_MINUS_5:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -5 ; CHECK: [[CAST_ADDR_IN_MINUS_5:%[^ ]+]] = bitcast i16* [[ADDR_IN_MINUS_5]] to i32* ; CHECK: [[IN_MINUS_5:%[^ ]+]] = load i32, i32* 
[[CAST_ADDR_IN_MINUS_5]], align 2 ; CHECK: [[ADDR_B_PLUS_4:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 4 ; CHECK: [[CAST_ADDR_B_PLUS_4:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_4]] to i32* ; CHECK: [[B_PLUS_4:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_4]], align 2 -; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i64 [[ADD0]]) -; CHECK: [[RES:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i64 [[ACC]]) +; CHECK: [[RES:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i64 [[ACC]]) ; CHECK: ret i64 [[RES]] define i64 @with_64bit_acc(i16* nocapture readonly %in, i16* nocapture readonly %b) { entry: @@ -238,13 +238,13 @@ ; CHECK: [[Y_1:%[^ ]+]] = load i16, i16* [[ADDR_Y_MINUS_1]], align 2 ; CHECK: [[SEXT_Y_1:%[^ ]+]] = sext i16 [[Y_1]] to i32 ; CHECK: [[UNPAIRED:%[^ ]+]] = mul nsw i32 [[SEXT_Y_1]], [[SEXT_X_1]] +; CHECK: [[SEXT:%[^ ]+]] = sext i32 [[UNPAIRED]] to i64 +; CHECK: [[ACC:%[^ ]+]] = add i64 [[SEXT]], [[ADD_1]] ; CHECK: [[ADDR_X_PLUS_2:%[^ ]+]] = bitcast i16* [[X_PLUS_2]] to i32* ; CHECK: [[X_2:%[^ ]+]] = load i32, i32* [[ADDR_X_PLUS_2]], align 2 ; CHECK: [[Y_MINUS_3:%[^ ]+]] = getelementptr inbounds i16, i16* %py.8757.unr, i32 -3 ; CHECK: [[ADDR_Y_MINUS_3:%[^ ]+]] = bitcast i16* [[Y_MINUS_3]] to i32* ; CHECK: [[Y_3:%[^ ]+]] = load i32, i32* [[ADDR_Y_MINUS_3]], align 2 -; CHECK: [[SEXT:%[^ ]+]] = sext i32 [[UNPAIRED]] to i64 -; CHECK: [[ACC:%[^ ]+]] = add i64 [[SEXT]], [[ADD_1]] ; CHECK: [[RES:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[Y_3]], i32 [[X_2]], i64 [[ACC]]) ; CHECK: ret i64 [[RES]] define i64 @with_64bit_add_acc(i16* nocapture readonly %px.10756.unr, i16* nocapture readonly %py.8757.unr, i32 %acc) { diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/smlad11.ll b/llvm/test/CodeGen/ARM/ParallelDSP/smlad11.ll --- a/llvm/test/CodeGen/ARM/ParallelDSP/smlad11.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/smlad11.ll @@ -10,10 +10,10 @@ ; CHECK: [[V16:%[0-9]+]] 
= load i32, i32* [[V15]], align 2 ; CHECK: [[V8:%[0-9]+]] = bitcast i16* %arrayidx8 to i32* ; CHECK: [[V9:%[0-9]+]] = load i32, i32* [[V8]], align 2 +; CHECK: [[ACC:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V9]], i32 [[V11]], i32 %mac1{{\.}}054) ; CHECK: [[V13:%[0-9]+]] = bitcast i16* %arrayidx17 to i32* ; CHECK: [[V14:%[0-9]+]] = load i32, i32* [[V13]], align 2 -; CHECK: [[V12:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V14]], i32 [[V16]], i32 %mac1{{\.}}054) -; CHECK: [[V17:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V9]], i32 [[V11]], i32 [[V12]]) +; CHECK: [[V12:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V14]], i32 [[V16]], i32 [[ACC]]) ; ; And we don't want to see a 3rd smlad: ; CHECK-NOT: call i32 @llvm.arm.smlad diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/smladx-1.ll b/llvm/test/CodeGen/ARM/ParallelDSP/smladx-1.ll --- a/llvm/test/CodeGen/ARM/ParallelDSP/smladx-1.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/smladx-1.ll @@ -12,12 +12,13 @@ ; CHECK: [[IN21:%[^ ]+]] = load i32, i32* [[PIN21]], align 2 ; CHECK: [[PIN10:%[^ ]+]] = bitcast i16* %pIn1.0 to i32* ; CHECK: [[IN10:%[^ ]+]] = load i32, i32* [[PIN10]], align 2 +; CHECK: [[ACC1:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN21]], i32 [[IN10]], i32 [[ACC0]]) + ; CHECK: [[PIN23:%[^ ]+]] = bitcast i16* %pIn2.3 to i32* ; CHECK: [[IN23:%[^ ]+]] = load i32, i32* [[PIN23]], align 2 ; CHECK: [[PIN12:%[^ ]+]] = bitcast i16* %pIn1.2 to i32* ; CHECK: [[IN12:%[^ ]+]] = load i32, i32* [[PIN12]], align 2 -; CHECK: [[ACC1:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN23]], i32 [[IN12]], i32 [[ACC0]]) -; CHECK: [[ACC2]] = call i32 @llvm.arm.smladx(i32 [[IN21]], i32 [[IN10]], i32 [[ACC1]]) +; CHECK: [[ACC2]] = call i32 @llvm.arm.smladx(i32 [[IN23]], i32 [[IN12]], i32 [[ACC1]]) ; CHECK-NOT: call i32 @llvm.arm.smlad ; CHECK-UNSUPPORTED-NOT: call i32 @llvm.arm.smlad @@ -130,6 +131,7 @@ ; CHECK: [[PIN1_2:%[^ ]+]] = getelementptr i16, i16* [[PIN1]], i32 -2 ; CHECK: [[PIN1_2_CAST:%[^ ]+]] = bitcast i16* [[PIN1_2]] to i32* ; CHECK: 
[[IN1_2:%[^ ]+]] = load i32, i32* [[PIN1_2_CAST]], align 2 +; CHECK: [[ACC1:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN2]], i32 [[IN1_2]], i32 [[ACC0]]) ; CHECK: [[PIN2_2:%[^ ]+]] = getelementptr i16, i16* [[PIN2]], i32 -2 ; CHECK: [[PIN2_2_CAST:%[^ ]+]] = bitcast i16* [[PIN2_2]] to i32* @@ -138,8 +140,7 @@ ; CHECK: [[PIN1_CAST:%[^ ]+]] = bitcast i16* [[PIN1]] to i32* ; CHECK: [[IN1:%[^ ]+]] = load i32, i32* [[PIN1_CAST]], align 2 -; CHECK: [[ACC1:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN2_2]], i32 [[IN1]], i32 [[ACC0]]) -; CHECK: [[ACC2]] = call i32 @llvm.arm.smladx(i32 [[IN2]], i32 [[IN1_2]], i32 [[ACC1]]) +; CHECK: [[ACC2]] = call i32 @llvm.arm.smladx(i32 [[IN2_2]], i32 [[IN1]], i32 [[ACC1]]) ; CHECK: [[PIN1_NEXT]] = getelementptr i16, i16* [[PIN1]], i32 4 ; CHECK: [[PIN2_NEXT]] = getelementptr i16, i16* [[PIN2]], i32 -4 diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/smlaldx-1.ll b/llvm/test/CodeGen/ARM/ParallelDSP/smlaldx-1.ll --- a/llvm/test/CodeGen/ARM/ParallelDSP/smlaldx-1.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/smlaldx-1.ll @@ -11,12 +11,12 @@ ; CHECK: [[IN21:%[^ ]+]] = load i32, i32* [[PIN21]], align 2 ; CHECK: [[PIN10:%[^ ]+]] = bitcast i16* %pIn1.0 to i32* ; CHECK: [[IN10:%[^ ]+]] = load i32, i32* [[PIN10]], align 2 +; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN21]], i32 [[IN10]], i64 [[ACC0]]) ; CHECK: [[PIN23:%[^ ]+]] = bitcast i16* %pIn2.3 to i32* ; CHECK: [[IN23:%[^ ]+]] = load i32, i32* [[PIN23]], align 2 ; CHECK: [[PIN12:%[^ ]+]] = bitcast i16* %pIn1.2 to i32* ; CHECK: [[IN12:%[^ ]+]] = load i32, i32* [[PIN12]], align 2 -; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN23]], i32 [[IN12]], i64 [[ACC0]]) -; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN21]], i32 [[IN10]], i64 [[ACC1]]) +; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN23]], i32 [[IN12]], i64 [[ACC1]]) ; CHECK-NOT: call i64 @llvm.arm.smlad ; CHECK-UNSUPPORTED-NOT: call i64 @llvm.arm.smlad @@ -187,6 +187,7 @@ ; CHECK: [[PIN1_2:%[^ 
]+]] = getelementptr i16, i16* [[PIN1]], i32 -2 ; CHECK: [[PIN1_2_CAST:%[^ ]+]] = bitcast i16* [[PIN1_2]] to i32* ; CHECK: [[IN1_2:%[^ ]+]] = load i32, i32* [[PIN1_2_CAST]], align 2 +; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2]], i32 [[IN1_2]], i64 [[ACC0]]) ; CHECK: [[PIN2_2:%[^ ]+]] = getelementptr i16, i16* [[PIN2]], i32 -2 ; CHECK: [[PIN2_2_CAST:%[^ ]+]] = bitcast i16* [[PIN2_2]] to i32* @@ -194,9 +195,7 @@ ; CHECK: [[PIN1_CAST:%[^ ]+]] = bitcast i16* [[PIN1]] to i32* ; CHECK: [[IN1:%[^ ]+]] = load i32, i32* [[PIN1_CAST]], align 2 - -; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2_2]], i32 [[IN1]], i64 [[ACC0]]) -; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN2]], i32 [[IN1_2]], i64 [[ACC1]]) +; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN2_2]], i32 [[IN1]], i64 [[ACC1]]) ; CHECK: [[PIN1_NEXT]] = getelementptr i16, i16* [[PIN1]], i32 4 ; CHECK: [[PIN2_NEXT]] = getelementptr i16, i16* [[PIN2]], i32 -4 diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/smlaldx-2.ll b/llvm/test/CodeGen/ARM/ParallelDSP/smlaldx-2.ll --- a/llvm/test/CodeGen/ARM/ParallelDSP/smlaldx-2.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/smlaldx-2.ll @@ -11,12 +11,12 @@ ; CHECK: [[IN21:%[^ ]+]] = load i32, i32* [[PIN21]], align 2 ; CHECK: [[PIN10:%[^ ]+]] = bitcast i16* %pIn1.0 to i32* ; CHECK: [[IN10:%[^ ]+]] = load i32, i32* [[PIN10]], align 2 +; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN21]], i32 [[IN10]], i64 [[ACC0]]) ; CHECK: [[PIN23:%[^ ]+]] = bitcast i16* %pIn2.3 to i32* ; CHECK: [[IN23:%[^ ]+]] = load i32, i32* [[PIN23]], align 2 ; CHECK: [[PIN12:%[^ ]+]] = bitcast i16* %pIn1.2 to i32* ; CHECK: [[IN12:%[^ ]+]] = load i32, i32* [[PIN12]], align 2 -; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN23]], i32 [[IN12]], i64 [[ACC0]]) -; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN21]], i32 [[IN10]], i64 [[ACC1]]) +; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN23]], i32 [[IN12]], i64 [[ACC1]]) ; 
CHECK-NOT: call i64 @llvm.arm.smlad ; CHECK-UNSUPPORTED-NOT: call i64 @llvm.arm.smlad @@ -187,6 +187,7 @@ ; CHECK: [[PIN1_2:%[^ ]+]] = getelementptr i16, i16* [[PIN1]], i32 -2 ; CHECK: [[PIN1_2_CAST:%[^ ]+]] = bitcast i16* [[PIN1_2]] to i32* ; CHECK: [[IN1_2:%[^ ]+]] = load i32, i32* [[PIN1_2_CAST]], align 2 +; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2]], i32 [[IN1_2]], i64 [[ACC0]]) ; CHECK: [[PIN2_2:%[^ ]+]] = getelementptr i16, i16* [[PIN2]], i32 -2 ; CHECK: [[PIN2_2_CAST:%[^ ]+]] = bitcast i16* [[PIN2_2]] to i32* @@ -194,9 +195,7 @@ ; CHECK: [[PIN1_CAST:%[^ ]+]] = bitcast i16* [[PIN1]] to i32* ; CHECK: [[IN1:%[^ ]+]] = load i32, i32* [[PIN1_CAST]], align 2 - -; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2_2]], i32 [[IN1]], i64 [[ACC0]]) -; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN2]], i32 [[IN1_2]], i64 [[ACC1]]) +; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN2_2]], i32 [[IN1]], i64 [[ACC1]]) ; CHECK: [[PIN1_NEXT]] = getelementptr i16, i16* [[PIN1]], i32 4 ; CHECK: [[PIN2_NEXT]] = getelementptr i16, i16* [[PIN2]], i32 -4 diff --git a/llvm/test/CodeGen/ARM/ParallelDSP/unroll-n-jam-smlad.ll b/llvm/test/CodeGen/ARM/ParallelDSP/unroll-n-jam-smlad.ll --- a/llvm/test/CodeGen/ARM/ParallelDSP/unroll-n-jam-smlad.ll +++ b/llvm/test/CodeGen/ARM/ParallelDSP/unroll-n-jam-smlad.ll @@ -1,6 +1,6 @@ ; RUN: llc -O3 -mtriple=thumbv7em -mcpu=cortex-m4 %s -o - | FileCheck %s --check-prefix=CHECK-REG-PRESSURE ; RUN: llc -O3 -mtriple=thumbv7eb %s -o - | FileCheck %s --check-prefix=CHECK-UNSUPPORTED -; RUN: llc -O3 -mtriple=thumbv8m.main -mattr=+dsp %s -o - | FileCheck %s --check-prefix=CHECK +; RUN: llc -O3 -mtriple=thumbv8m.main -mattr=+dsp -arm-parallel-dsp-load-limit=20 %s -o - | FileCheck %s --check-prefix=CHECK ; CHECK-UNSUPPORTED-LABEL: unroll_n_jam_smlad ; CHECK-UNSUPPORTED-NOT: smlad r{{.}} @@ -38,13 +38,13 @@ ; CHECK-NOT: smlad r{{.*}} ; CHECK-REG-PRESSURE: .LBB0_1: +; CHECK-REG-PRESSURE-NOT: call i32 @llvm.arm.smlad 
; CHECK-REG-PRESSURE: ldr{{.*}}, [sp ; CHECK-REG-PRESSURE: ldr{{.*}}, [sp ; CHECK-REG-PRESSURE: ldr{{.*}}, [sp ; CHECK-REG-PRESSURE: ldr{{.*}}, [sp ; CHECK-REG-PRESSURE: ldr{{.*}}, [sp -; CHECK-REG-PRESSURE: ldr{{.*}}, [sp -; CHECK-REG-PRESSURE: ldr{{.*}}, [sp +; CHECK-REG-PRESSURE-NOT: ldr{{.*}}, [sp ; CHECK-REG-PRESSURE: bne .LBB0_1 for.body: