Index: lib/Target/ARM/ARMParallelDSP.cpp =================================================================== --- lib/Target/ARM/ARMParallelDSP.cpp +++ lib/Target/ARM/ARMParallelDSP.cpp @@ -1,4 +1,4 @@ -//===- ParallelDSP.cpp - Parallel DSP Pass --------------------------------===// +//===- ARMParallelDSP.cpp - Parallel DSP Pass -----------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -18,13 +18,10 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/LoopAccessAnalysis.h" -#include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/NoFolder.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Pass.h" #include "llvm/PassRegistry.h" #include "llvm/PassSupport.h" @@ -61,6 +58,7 @@ Value* RHS; bool Exchange = false; bool ReadOnly = true; + bool Paired = false; SmallVector VecLd; // Container for loads to widen. MulCandidate(Instruction *I, Value *lhs, Value *rhs) : @@ -71,7 +69,7 @@ } LoadInst *getBaseLoad() const { - return cast(LHS); + return VecLd.front(); } }; @@ -82,7 +80,7 @@ Value *Acc = nullptr; MulCandList Muls; MulPairList MulPairs; - SmallPtrSet Adds; + SetVector Adds; public: Reduction() = delete; @@ -92,10 +90,35 @@ /// Record an Add instruction that is a part of the this reduction. void InsertAdd(Instruction *I) { Adds.insert(I); } - /// Record a MulCandidate, rooted at a Mul instruction, that is a part of - /// this reduction. - void InsertMul(Instruction *I, Value *LHS, Value *RHS) { - Muls.push_back(std::make_unique(I, LHS, RHS)); + /// Create MulCandidates, each rooted at a Mul instruction, that is a part + /// of this reduction. 
+ void InsertMuls() { + auto GetMulOperand = [](Value *V) -> Instruction* { + if (auto *SExt = dyn_cast(V)) { + if (auto *I = dyn_cast(SExt->getOperand(0))) + if (I->getOpcode() == Instruction::Mul) + return I; + } else if (auto *I = dyn_cast(V)) { + if (I->getOpcode() == Instruction::Mul) + return I; + } + return nullptr; + }; + + auto InsertMul = [this](Instruction *I) { + Value *LHS = cast(I->getOperand(0))->getOperand(0); + Value *RHS = cast(I->getOperand(1))->getOperand(0); + Muls.push_back(std::make_unique(I, LHS, RHS)); + }; + + for (auto *Add : Adds) { + if (Add == Acc) + continue; + if (auto *Mul = GetMulOperand(Add->getOperand(0))) + InsertMul(Mul); + if (auto *Mul = GetMulOperand(Add->getOperand(1))) + InsertMul(Mul); + } } /// Add the incoming accumulator value, returns true if a value had not @@ -110,7 +133,15 @@ /// Set two MulCandidates, rooted at muls, that can be executed as a single /// parallel operation. - void AddMulPair(MulCandidate *Mul0, MulCandidate *Mul1) { + void AddMulPair(MulCandidate *Mul0, MulCandidate *Mul1, + bool Exchange = false) { + LLVM_DEBUG(dbgs() << "Pairing:\n" + << *Mul0->Root << "\n" + << *Mul1->Root << "\n"); + Mul0->Paired = true; + Mul1->Paired = true; + if (Exchange) + Mul1->Exchange = true; MulPairs.push_back(std::make_pair(Mul0, Mul1)); } @@ -127,7 +158,7 @@ Value *getAccumulator() { return Acc; } /// Return the set of adds that comprise the reduction. - SmallPtrSetImpl &getAdds() { return Adds; } + SetVector &getAdds() { return Adds; } /// Return the MulCandidate, rooted at mul instruction, that comprise the /// the reduction. 
@@ -141,6 +172,17 @@ void UpdateRoot(Instruction *SMLAD) { Root->replaceAllUsesWith(SMLAD); } + + void dump() { + LLVM_DEBUG(dbgs() << "Reduction:\n"); + for (auto *Add : Adds) + LLVM_DEBUG(dbgs() << *Add << "\n"); + for (auto &Mul : Muls) + LLVM_DEBUG(dbgs() << *Mul->Root << "\n" + << " " << *Mul->LHS << "\n" + << " " << *Mul->RHS << "\n"); + LLVM_DEBUG(if (Acc) dbgs() << "Acc in: " << *Acc << "\n"); + } }; class WidenedLoad { @@ -158,13 +200,11 @@ } }; - class ARMParallelDSP : public LoopPass { + class ARMParallelDSP : public FunctionPass { ScalarEvolution *SE; AliasAnalysis *AA; TargetLibraryInfo *TLI; DominatorTree *DT; - LoopInfo *LI; - Loop *L; const DataLayout *DL; Module *M; std::map LoadPairs; @@ -172,8 +212,8 @@ std::map> WideLoads; template - bool IsNarrowSequence(Value *V, Value *&Src); - + bool IsNarrowSequence(Value *V); + bool Search(Value *V, BasicBlock *BB, Reduction &R); bool RecordMemoryOps(BasicBlock *BB); void InsertParallelMACs(Reduction &Reduction); bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem); @@ -185,63 +225,38 @@ /// products to a 32-bit accumulate operand. Optionally, the instruction can /// exchange the halfwords of the second operand before performing the /// arithmetic. 
- bool MatchSMLAD(Loop *L); + bool MatchSMLAD(Function &F); public: static char ID; - ARMParallelDSP() : LoopPass(ID) { } - - bool doInitialization(Loop *L, LPPassManager &LPM) override { - LoadPairs.clear(); - WideLoads.clear(); - return true; - } + ARMParallelDSP() : FunctionPass(ID) { } void getAnalysisUsage(AnalysisUsage &AU) const override { - LoopPass::getAnalysisUsage(AU); + FunctionPass::getAnalysisUsage(AU); AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addRequired(); - AU.addRequired(); AU.addRequired(); AU.addRequired(); - AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); AU.setPreservesCFG(); } - bool runOnLoop(Loop *TheLoop, LPPassManager &) override { + bool runOnFunction(Function &F) override { if (DisableParallelDSP) return false; - if (skipLoop(TheLoop)) + if (skipFunction(F)) return false; - L = TheLoop; SE = &getAnalysis().getSE(); AA = &getAnalysis().getAAResults(); TLI = &getAnalysis().getTLI(); DT = &getAnalysis().getDomTree(); - LI = &getAnalysis().getLoopInfo(); auto &TPC = getAnalysis(); - BasicBlock *Header = TheLoop->getHeader(); - if (!Header) - return false; - - // TODO: We assume the loop header and latch to be the same block. - // This is not a fundamental restriction, but lifting this would just - // require more work to do the transformation and then patch up the CFG. 
- if (Header != TheLoop->getLoopLatch()) { - LLVM_DEBUG(dbgs() << "The loop header is not the loop latch: not " - "running pass ARMParallelDSP\n"); - return false; - } - - if (!TheLoop->getLoopPreheader()) - InsertPreheaderForLoop(L, DT, LI, nullptr, true); - - Function &F = *Header->getParent(); M = F.getParent(); DL = &M->getDataLayout(); @@ -266,17 +281,10 @@ return false; } - LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI); - LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n"); LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n"); - if (!RecordMemoryOps(Header)) { - LLVM_DEBUG(dbgs() << " - No sequential loads found.\n"); - return false; - } - - bool Changes = MatchSMLAD(L); + bool Changes = MatchSMLAD(F); return Changes; } }; @@ -315,18 +323,14 @@ // TODO: we currently only collect i16, and will support i8 later, so that's // why we check that types are equal to MaxBitWidth, and not <= MaxBitWidth. template -bool ARMParallelDSP::IsNarrowSequence(Value *V, Value *&Src) { +bool ARMParallelDSP::IsNarrowSequence(Value *V) { if (auto *SExt = dyn_cast(V)) { if (SExt->getSrcTy()->getIntegerBitWidth() != MaxBitWidth) return false; if (auto *Ld = dyn_cast(SExt->getOperand(0))) { - // Check that these load could be paired. - if (!LoadPairs.count(Ld) && !OffsetLoads.count(Ld)) - return false; - - Src = Ld; - return true; + // Check that this load could be paired. + return LoadPairs.count(Ld) || OffsetLoads.count(Ld); } } return false; @@ -337,6 +341,8 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) { SmallVector Loads; SmallVector Writes; + LoadPairs.clear(); + WideLoads.clear(); // Collect loads and instruction that may write to memory. For now we only // record loads which are simple, sign-extended and have a single user. 
@@ -414,7 +420,54 @@ return LoadPairs.size() > 1; } -// Loop Pass that needs to identify integer add/sub reductions of 16-bit vector +// Search recursively back through the operands to find a tree of values that +// form a multiply-accumulate chain. The search records the Add and Mul +// instructions that form the reduction and allows us to find a single value +// to be used as the initial input to the accumulator. +bool ARMParallelDSP::Search(Value *V, BasicBlock *BB, Reduction &R) { + // If we find a non-instruction, try to use it as the initial accumulator + // value. This may have already been found during the search in which case + // this function will return false, signaling a search fail. + auto *I = dyn_cast(V); + if (!I) + return R.InsertAcc(V); + + if (I->getParent() != BB) + return false; + + switch (I->getOpcode()) { + default: + break; + case Instruction::PHI: + // Could be the accumulator value. + return R.InsertAcc(V); + case Instruction::Add: { + // Adds should be adding together two muls, or another add and a mul to + // be within the mac chain. One of the operands may also be the + // accumulator value at which point we should stop searching. + R.InsertAdd(I); + Value *LHS = I->getOperand(0); + Value *RHS = I->getOperand(1); + bool ValidLHS = Search(LHS, BB, R); + bool ValidRHS = Search(RHS, BB, R); + + if (ValidLHS && ValidRHS) + return true; + + return R.InsertAcc(I); + } + case Instruction::Mul: { + Value *MulOp0 = I->getOperand(0); + Value *MulOp1 = I->getOperand(1); + return IsNarrowSequence<16>(MulOp0) && IsNarrowSequence<16>(MulOp1); + } + case Instruction::SExt: + return Search(I->getOperand(0), BB, R); + } + return false; +} + +// The pass needs to identify integer add/sub reductions of 16-bit vector // multiplications. // To use SMLAD: // 1) we first need to find integer add then look for this pattern: @@ -445,88 +498,39 @@ // If loop invariants are used instead of loads, these need to be packed // before the loop begins.
// -bool ARMParallelDSP::MatchSMLAD(Loop *L) { - // Search recursively back through the operands to find a tree of values that - // form a multiply-accumulate chain. The search records the Add and Mul - // instructions that form the reduction and allows us to find a single value - // to be used as the initial input to the accumlator. - std::function Search = [&] - (Value *V, Reduction &R) -> bool { - - // If we find a non-instruction, try to use it as the initial accumulator - // value. This may have already been found during the search in which case - // this function will return false, signaling a search fail. - auto *I = dyn_cast(V); - if (!I) - return R.InsertAcc(V); - - switch (I->getOpcode()) { - default: - break; - case Instruction::PHI: - // Could be the accumulator value. - return R.InsertAcc(V); - case Instruction::Add: { - // Adds should be adding together two muls, or another add and a mul to - // be within the mac chain. One of the operands may also be the - // accumulator value at which point we should stop searching. 
- bool ValidLHS = Search(I->getOperand(0), R); - bool ValidRHS = Search(I->getOperand(1), R); - if (!ValidLHS && !ValidLHS) - return false; - else if (ValidLHS && ValidRHS) { - R.InsertAdd(I); - return true; - } else { - R.InsertAdd(I); - return R.InsertAcc(I); - } - } - case Instruction::Mul: { - Value *MulOp0 = I->getOperand(0); - Value *MulOp1 = I->getOperand(1); - if (isa(MulOp0) && isa(MulOp1)) { - Value *LHS = nullptr; - Value *RHS = nullptr; - if (IsNarrowSequence<16>(MulOp0, LHS) && - IsNarrowSequence<16>(MulOp1, RHS)) { - R.InsertMul(I, LHS, RHS); - return true; - } - } - return false; - } - case Instruction::SExt: - return Search(I->getOperand(0), R); - } - return false; - }; - +bool ARMParallelDSP::MatchSMLAD(Function &F) { bool Changed = false; - SmallPtrSet AllAdds; - BasicBlock *Latch = L->getLoopLatch(); - for (Instruction &I : reverse(*Latch)) { - if (I.getOpcode() != Instruction::Add) + for (auto &BB : F) { + SmallPtrSet AllAdds; + if (!RecordMemoryOps(&BB)) continue; - if (AllAdds.count(&I)) - continue; + for (Instruction &I : reverse(BB)) { + if (I.getOpcode() != Instruction::Add) + continue; - const auto *Ty = I.getType(); - if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64)) - continue; + if (AllAdds.count(&I)) + continue; - Reduction R(&I); - if (!Search(&I, R)) - continue; + const auto *Ty = I.getType(); + if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64)) + continue; - if (!CreateParallelPairs(R)) - continue; + Reduction R(&I); + if (!Search(&I, &BB, R)) + continue; + + R.InsertMuls(); + LLVM_DEBUG(dbgs() << "After search, Reduction:\n"; R.dump()); - InsertParallelMACs(R); - Changed = true; - AllAdds.insert(R.getAdds().begin(), R.getAdds().end()); + if (!CreateParallelPairs(R)) + continue; + + InsertParallelMACs(R); + Changed = true; + AllAdds.insert(R.getAdds().begin(), R.getAdds().end()); + } } return Changed; @@ -554,12 +558,6 @@ auto Ld2 = static_cast(PMul0->RHS); auto Ld3 = static_cast(PMul1->RHS); - LLVM_DEBUG(dbgs() << "Loads:\n" - << " 
- " << *Ld0 << "\n" - << " - " << *Ld1 << "\n" - << " - " << *Ld2 << "\n" - << " - " << *Ld3 << "\n"); - if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) { if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) { LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n"); @@ -568,8 +566,7 @@ } else if (AreSequentialLoads(Ld3, Ld2, PMul1->VecLd)) { LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n"); LLVM_DEBUG(dbgs() << " exchanging Ld2 and Ld3\n"); - PMul1->Exchange = true; - R.AddMulPair(PMul0, PMul1); + R.AddMulPair(PMul0, PMul1, true); return true; } } else if (AreSequentialLoads(Ld1, Ld0, PMul0->VecLd) && @@ -577,9 +574,8 @@ LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n"); LLVM_DEBUG(dbgs() << " exchanging Ld0 and Ld1\n"); LLVM_DEBUG(dbgs() << " and swapping muls\n"); - PMul0->Exchange = true; // Only the second operand can be exchanged, so swap the muls. - R.AddMulPair(PMul1, PMul0); + R.AddMulPair(PMul1, PMul0, true); return true; } return false; @@ -587,10 +583,9 @@ MulCandList &Muls = R.getMuls(); const unsigned Elems = Muls.size(); - SmallPtrSet Paired; for (unsigned i = 0; i < Elems; ++i) { MulCandidate *PMul0 = static_cast(Muls[i].get()); - if (Paired.count(PMul0->Root)) + if (PMul0->Paired) continue; for (unsigned j = 0; j < Elems; ++j) { @@ -598,7 +593,7 @@ continue; MulCandidate *PMul1 = static_cast(Muls[j].get()); - if (Paired.count(PMul1->Root)) + if (PMul1->Paired) continue; const Instruction *Mul0 = PMul0->Root; @@ -608,11 +603,8 @@ assert(PMul0 != PMul1 && "expected different chains"); - if (CanPair(R, PMul0, PMul1)) { - Paired.insert(Mul0); - Paired.insert(Mul1); + if (CanPair(R, PMul0, PMul1)) break; - } } } return !R.getMulPairs().empty(); @@ -649,15 +641,23 @@ if (!Acc) Acc = ConstantInt::get(IntegerType::get(M->getContext(), 32), 0); + // For any muls that were discovered but not paired, accumulate their values + // as before. 
+ IRBuilder Builder(InsertAfter->getParent(), + ++BasicBlock::iterator(InsertAfter)); + MulCandList &MulCands = R.getMuls(); + for (auto &MulCand : MulCands) { + if (MulCand->Paired) + continue; + + Acc = Builder.CreateAdd(MulCand->Root, Acc); + InsertAfter = cast(Acc); + } IntegerType *Ty = IntegerType::get(M->getContext(), 32); - LLVM_DEBUG(dbgs() << "Root: " << *InsertAfter << "\n" - << "Acc: " << *Acc << "\n"); + for (auto &Pair : R.getMulPairs()) { MulCandidate *LHSMul = Pair.first; MulCandidate *RHSMul = Pair.second; - LLVM_DEBUG(dbgs() << "Muls:\n" - << "- " << *LHSMul->Root << "\n" - << "- " << *RHSMul->Root << "\n"); LoadInst *BaseLHS = LHSMul->getBaseLoad(); LoadInst *BaseRHS = RHSMul->getBaseLoad(); LoadInst *WideLHS = WideLoads.count(BaseLHS) ? @@ -724,14 +724,25 @@ // Loads[0] needs trunc while Loads[1] needs a lshr and trunc. // TODO: Support big-endian as well. Value *Bottom = IRB.CreateTrunc(WideLoad, Base->getType()); - BaseSExt->setOperand(0, Bottom); + Value *NewBaseSExt = IRB.CreateSExt(Bottom, BaseSExt->getType()); + BaseSExt->replaceAllUsesWith(NewBaseSExt); IntegerType *OffsetTy = cast(Offset->getType()); Value *ShiftVal = ConstantInt::get(LoadTy, OffsetTy->getBitWidth()); Value *Top = IRB.CreateLShr(WideLoad, ShiftVal); Value *Trunc = IRB.CreateTrunc(Top, OffsetTy); - OffsetSExt->setOperand(0, Trunc); - + Value *NewOffsetSExt = IRB.CreateSExt(Trunc, OffsetSExt->getType()); + OffsetSExt->replaceAllUsesWith(NewOffsetSExt); + + LLVM_DEBUG(dbgs() << "From Base and Offset:\n" + << *Base << "\n" << *Offset << "\n" + << "Created Wide Load:\n" + << *WideLoad << "\n" + << *Bottom << "\n" + << *NewBaseSExt << "\n" + << *Top << "\n" + << *Trunc << "\n" + << *NewOffsetSExt << "\n"); WideLoads.emplace(std::make_pair(Base, std::make_unique(Loads, WideLoad))); return WideLoad; @@ -744,6 +755,6 @@ char ARMParallelDSP::ID = 0; INITIALIZE_PASS_BEGIN(ARMParallelDSP, "arm-parallel-dsp", - "Transform loops to use DSP intrinsics", false, false) + "Transform 
functions to use DSP intrinsics", false, false) INITIALIZE_PASS_END(ARMParallelDSP, "arm-parallel-dsp", - "Transform loops to use DSP intrinsics", false, false) + "Transform functions to use DSP intrinsics", false, false) Index: test/CodeGen/ARM/O3-pipeline.ll =================================================================== --- test/CodeGen/ARM/O3-pipeline.ll +++ test/CodeGen/ARM/O3-pipeline.ll @@ -37,8 +37,7 @@ ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results -; CHECK-NEXT: Loop Pass Manager -; CHECK-NEXT: Transform loops to use DSP intrinsics +; CHECK-NEXT: Transform functions to use DSP intrinsics ; CHECK-NEXT: Interleaved Access Pass ; CHECK-NEXT: ARM IR optimizations ; CHECK-NEXT: Dominator Tree Construction Index: test/CodeGen/ARM/ParallelDSP/blocks.ll =================================================================== --- /dev/null +++ test/CodeGen/ARM/ParallelDSP/blocks.ll @@ -0,0 +1,79 @@ +; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s + +; CHECK-LABEL: single_block +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 %acc) +define i32 @single_block(i16* %a, i16* %b, i32 %acc) { +entry: + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.0 + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.1 = load i16, i16* %addr.a.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %mul.1 = mul i32 %sext.a.1, %sext.b.1 + %add = add i32 %mul.0, %mul.1 + %res = add i32 %add, %acc + ret i32 %res +} + +; 
CHECK-LABEL: multi_block +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: call i32 @llvm.arm.smlad(i32 [[A]], i32 [[B]], i32 0) +define i32 @multi_block(i16* %a, i16* %b, i32 %acc) { +entry: + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.0 + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.1 = load i16, i16* %addr.a.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %mul.1 = mul i32 %sext.a.1, %sext.b.1 + %add = add i32 %mul.0, %mul.1 + br label %bb.1 + +bb.1: + %res = add i32 %add, %acc + ret i32 %res +} + +; CHECK-LABEL: multi_block_1 +; CHECK-NOT: call i32 @llvm.arm.smlad +define i32 @multi_block_1(i16* %a, i16* %b, i32 %acc) { +entry: + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.0 + br label %bb.1 + +bb.1: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.1 = load i16, i16* %addr.a.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %mul.1 = mul i32 %sext.a.1, %sext.b.1 + %add = add i32 %mul.0, %mul.1 + %res = add i32 %add, %acc + ret i32 %res +} + Index: test/CodeGen/ARM/ParallelDSP/exchange.ll =================================================================== --- /dev/null +++ test/CodeGen/ARM/ParallelDSP/exchange.ll @@ -0,0 +1,329 @@ +; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s + +; CHECK-LABEL: exchange_1 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: 
[[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]] +define i32 @exchange_1(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.1 + %mul.1 = mul i32 %sext.a.1, %sext.b.0 + %add = add i32 %mul.0, %mul.1 + %res = add i32 %add, %acc + ret i32 %res +} + +; CHECK-LABEL: exchange_2 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]] +define i32 @exchange_2(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.b.1, %sext.a.0 + %mul.1 = mul i32 %sext.b.0, %sext.a.1 + %add = add i32 %mul.0, %mul.1 + %res = add i32 %add, %acc + ret i32 %res +} + +; CHECK-LABEL: exchange_3 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: call i32 
@llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A]] +define i32 @exchange_3(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.1 + %mul.1 = mul i32 %sext.a.1, %sext.b.0 + %add = add i32 %mul.1, %mul.0 + %res = add i32 %add, %acc + ret i32 %res +} + +; CHECK-LABEL: exchange_4 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A]] +define i32 @exchange_4(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.b.1, %sext.a.0 + %mul.1 = mul i32 %sext.b.0, %sext.a.1 + %add = add i32 %mul.1, %mul.0 + %res = add i32 %add, %acc + ret i32 %res +} + +; CHECK-LABEL: exchange_multi_use_1 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 +; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32* +; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] +; 
CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]], i32 %acc +; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B]], i32 [[X]]) +define i32 @exchange_multi_use_1(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.1 + %mul.1 = mul i32 %sext.a.1, %sext.b.0 + %add = add i32 %mul.0, %mul.1 + %addr.a.2 = getelementptr i16, i16* %a, i32 2 + %addr.a.3 = getelementptr i16, i16* %a, i32 3 + %ld.a.2 = load i16, i16* %addr.a.2 + %ld.a.3 = load i16, i16* %addr.a.3 + %sext.a.2 = sext i16 %ld.a.2 to i32 + %sext.a.3 = sext i16 %ld.a.3 to i32 + %mul.2 = mul i32 %sext.a.3, %sext.b.1 + %mul.3 = mul i32 %sext.a.2, %sext.b.0 + %add.1 = add i32 %mul.2, %mul.3 + %add.2 = add i32 %add, %add.1 + %res = add i32 %add.2, %acc + ret i32 %res +} + +; CHECK-LABEL: exchange_multi_use_2 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 +; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32* +; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] +; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc +; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A_2]], i32 [[X]]) +define i32 @exchange_multi_use_2(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to 
i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.0 + %mul.1 = mul i32 %sext.a.1, %sext.b.1 + %add = add i32 %mul.0, %mul.1 + %addr.a.2 = getelementptr i16, i16* %a, i32 2 + %addr.a.3 = getelementptr i16, i16* %a, i32 3 + %ld.a.2 = load i16, i16* %addr.a.2 + %ld.a.3 = load i16, i16* %addr.a.3 + %sext.a.2 = sext i16 %ld.a.2 to i32 + %sext.a.3 = sext i16 %ld.a.3 to i32 + %mul.2 = mul i32 %sext.b.0, %sext.a.3 + %mul.3 = mul i32 %sext.b.1, %sext.a.2 + %add.1 = add i32 %mul.2, %mul.3 + %add.2 = add i32 %add, %add.1 + %res = add i32 %add.2, %acc + ret i32 %res +} + +; TODO: Why aren't two intrinsics generated? +; CHECK-LABEL: exchange_multi_use_3 +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[GEP:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 +; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP]] to i32* +; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] +; CHECK-NOT: call i32 @llvm.arm.smlad +; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_B]], i32 [[LD_A_2]], i32 0 +define i32 @exchange_multi_use_3(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %addr.a.2 = getelementptr i16, i16* %a, i32 2 + %addr.a.3 = getelementptr i16, i16* %a, i32 3 + %ld.a.2 = load i16, i16* %addr.a.2 + %ld.a.3 = load i16, i16* %addr.a.3 + %sext.a.2 = sext i16 %ld.a.2 to i32 + %sext.a.3 = sext i16 %ld.a.3 to i32 + %mul.2 = mul i32 
%sext.b.0, %sext.a.3 + %mul.3 = mul i32 %sext.b.1, %sext.a.2 + %mul.0 = mul i32 %sext.a.0, %sext.b.0 + %mul.1 = mul i32 %sext.a.1, %sext.b.1 + %add = add i32 %mul.0, %mul.1 + %add.1 = add i32 %mul.2, %mul.3 + %sub = sub i32 %add, %add.1 + %res = add i32 %acc, %sub + ret i32 %res +} + +; TODO: Why isn't smladx generated too? +; CHECK-LABEL: exchange_multi_use_4 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[X:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 0 +; CHECK-NOT: call i32 @llvm.arm.smlad +define i32 @exchange_multi_use_4(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %addr.a.2 = getelementptr i16, i16* %a, i32 2 + %addr.a.3 = getelementptr i16, i16* %a, i32 3 + %ld.a.2 = load i16, i16* %addr.a.2 + %ld.a.3 = load i16, i16* %addr.a.3 + %sext.a.2 = sext i16 %ld.a.2 to i32 + %sext.a.3 = sext i16 %ld.a.3 to i32 + %mul.2 = mul i32 %sext.b.0, %sext.a.3 + %mul.3 = mul i32 %sext.b.1, %sext.a.2 + %mul.0 = mul i32 %sext.a.0, %sext.b.0 + %mul.1 = mul i32 %sext.a.1, %sext.b.1 + %add.1 = add i32 %mul.2, %mul.3 + %add = add i32 %mul.0, %mul.1 + %sub = sub i32 %add, %add.1 + %res = add i32 %acc, %sub + ret i32 %res +} + +; CHECK-LABEL: exchange_swap +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: call i32 @llvm.arm.smladx(i32 
[[LD_B]], i32 [[LD_A]] +define i32 @exchange_swap(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.1, %sext.b.0 + %mul.1 = mul i32 %sext.a.0, %sext.b.1 + %add = add i32 %mul.0, %mul.1 + %res = add i32 %add, %acc + ret i32 %res +} + +; CHECK-LABEL: exchange_swap_2 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]] +define i32 @exchange_swap_2(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.1, %sext.b.0 + %mul.1 = mul i32 %sext.a.0, %sext.b.1 + %add = add i32 %mul.1, %mul.0 + %res = add i32 %add, %acc + ret i32 %res +} + +; CHECK-LABEL: exchange_swap_3 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: call i32 @llvm.arm.smladx(i32 [[LD_A]], i32 [[LD_B]] +define i32 @exchange_swap_3(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = 
getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.b.0, %sext.a.1 + %mul.1 = mul i32 %sext.b.1, %sext.a.0 + %add = add i32 %mul.1, %mul.0 + %res = add i32 %add, %acc + ret i32 %res +} Index: test/CodeGen/ARM/ParallelDSP/overlapping.ll =================================================================== --- /dev/null +++ test/CodeGen/ARM/ParallelDSP/overlapping.ll @@ -0,0 +1,172 @@ +; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s + +; CHECK-LABEL: overlap_1 +; CHECK: [[ADDR_A_1:%[^ ]+]] = getelementptr i16, i16* %a, i32 1 +; CHECK: [[ADDR_B_1:%[^ ]+]] = getelementptr i16, i16* %b, i32 1 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[CAST_A_1:%[^ ]+]] = bitcast i16* [[ADDR_A_1]] to i32* +; CHECK: [[LD_A_1:%[^ ]+]] = load i32, i32* [[CAST_A_1]] +; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[ADDR_B_1]] to i32* +; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]] +; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_1]], i32 [[LD_B_1]], i32 %acc) +; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[ACC]]) +; CHECK: ret i32 [[RES]] +define i32 @overlap_1(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + 
%sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.0 + %mul.1 = mul i32 %sext.a.1, %sext.b.1 + %addr.a.2 = getelementptr i16, i16* %a, i32 2 + %addr.b.2 = getelementptr i16, i16* %b, i32 2 + %ld.a.2 = load i16, i16* %addr.a.2 + %ld.b.2 = load i16, i16* %addr.b.2 + %sext.a.2 = sext i16 %ld.a.2 to i32 + %sext.b.2 = sext i16 %ld.b.2 to i32 + %mul.2 = mul i32 %sext.a.2, %sext.b.2 + %add = add i32 %mul.0, %mul.1 + %add.1 = add i32 %mul.1, %mul.2 + %add.2 = add i32 %add.1, %add + %res = add i32 %add.2, %acc + ret i32 %res +} + +; CHECK-LABEL: overlap_2 +; CHECK: [[ADDR_A_1:%[^ ]+]] = getelementptr i16, i16* %a, i32 1 +; CHECK: [[ADDR_B_1:%[^ ]+]] = getelementptr i16, i16* %b, i32 1 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[ACC1:%[^ ]+]] = add i32 %mul.1, %acc +; CHECK: [[ACC2:%[^ ]+]] = add i32 %mul.2, [[ACC1]] +; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[ACC2]]) +; CHECK: ret i32 [[RES]] +define i32 @overlap_2(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.0 + %mul.1 = mul i32 %sext.a.1, %sext.b.1 + %addr.a.2 = getelementptr i16, i16* %a, i32 2 + %addr.b.2 = getelementptr i16, i16* %b, i32 2 + %ld.a.2 = load i16, i16* %addr.a.2 + %ld.b.2 = load i16, i16* %addr.b.2 + %sext.a.2 = sext i16 %ld.a.2 to i32 + %sext.b.2 = sext i16 %ld.b.2 to i32 + %mul.2 = mul i32 %sext.b.2, %sext.a.2 + %add = add i32 %mul.0, %mul.1 + %add.1 = 
add i32 %mul.1, %mul.2 + %add.2 = add i32 %add, %add.1 + %res = add i32 %add.2, %acc + ret i32 %res +} + +; CHECK-LABEL: overlap_3 +; CHECK: [[GEP_B:%[^ ]+]] = getelementptr i16, i16* %b, i32 1 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32* +; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]] +; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 +; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32* +; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] +; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 %acc) +; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[SMLAD]]) +define i32 @overlap_3(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.0 + %mul.1 = mul i32 %sext.a.1, %sext.b.1 + %addr.a.2 = getelementptr i16, i16* %a, i32 2 + %addr.b.2 = getelementptr i16, i16* %b, i32 2 + %addr.a.3 = getelementptr i16, i16* %a, i32 3 + %ld.a.2 = load i16, i16* %addr.a.2 + %ld.b.2 = load i16, i16* %addr.b.2 + %ld.a.3 = load i16, i16* %addr.a.3 + %sext.a.2 = sext i16 %ld.a.2 to i32 + %sext.b.2 = sext i16 %ld.b.2 to i32 + %sext.a.3 = sext i16 %ld.a.3 to i32 + %mul.2 = mul i32 %sext.a.2, %sext.b.1 + %mul.3 = mul i32 %sext.a.3, %sext.b.2 + %add = add i32 %mul.0, %mul.1 + %add.1 = add i32 %mul.2, %mul.3 + %add.2 = add i32 %add.1, %add + %res = add i32 %add.2, %acc + ret i32 
%res +} + +; CHECK-LABEL: overlap_4 +; CHECK: [[GEP_B:%[^ ]+]] = getelementptr i16, i16* %b, i32 1 +; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32* +; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]] +; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32* +; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]] +; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32* +; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]] +; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 2 +; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32* +; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]] +; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 %acc) +; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[SMLAD]]) +define i32 @overlap_4(i16* %a, i16* %b, i32 %acc) { +entry: + %addr.a.1 = getelementptr i16, i16* %a, i32 1 + %addr.b.1 = getelementptr i16, i16* %b, i32 1 + %ld.a.0 = load i16, i16* %a + %sext.a.0 = sext i16 %ld.a.0 to i32 + %ld.b.0 = load i16, i16* %b + %ld.a.1 = load i16, i16* %addr.a.1 + %ld.b.1 = load i16, i16* %addr.b.1 + %sext.a.1 = sext i16 %ld.a.1 to i32 + %sext.b.1 = sext i16 %ld.b.1 to i32 + %sext.b.0 = sext i16 %ld.b.0 to i32 + %mul.0 = mul i32 %sext.a.0, %sext.b.0 + %mul.1 = mul i32 %sext.a.1, %sext.b.1 + %addr.a.2 = getelementptr i16, i16* %a, i32 2 + %addr.b.2 = getelementptr i16, i16* %b, i32 2 + %addr.a.3 = getelementptr i16, i16* %a, i32 3 + %ld.a.2 = load i16, i16* %addr.a.2 + %ld.b.2 = load i16, i16* %addr.b.2 + %ld.a.3 = load i16, i16* %addr.a.3 + %sext.a.2 = sext i16 %ld.a.2 to i32 + %sext.b.2 = sext i16 %ld.b.2 to i32 + %sext.a.3 = sext i16 %ld.a.3 to i32 + %mul.2 = mul i32 %sext.b.2, %sext.a.2 + %mul.3 = mul i32 %sext.b.1, %sext.a.3 + %add = add i32 %mul.0, %mul.1 + %add.1 = add i32 %mul.2, %mul.3 + %add.2 = add i32 %add.1, %add + %res = add i32 %add.2, %acc + ret i32 %res +} Index: test/CodeGen/ARM/ParallelDSP/pr42729.ll 
=================================================================== --- test/CodeGen/ARM/ParallelDSP/pr42729.ll +++ test/CodeGen/ARM/ParallelDSP/pr42729.ll @@ -9,7 +9,7 @@ ; CHECK: [[GEP16:%[^ ]+]] = getelementptr i16, i16* [[CAST_GEP8]], i32 6 ; CHECK: [[CAST_GEP16:%[^ ]+]] = bitcast i16* [[GEP16]] to i32* ; CHECK: [[LOAD_UNDEF:%[^ ]+]] = load i32, i32* [[CAST_GEP16]], align 2 -; CHECK: call i32 @llvm.arm.smladx(i32 [[LOAD_A]], i32 [[LOAD_UNDEF]], i32 undef) +; CHECK: call i32 @llvm.arm.smladx(i32 [[LOAD_UNDEF]], i32 [[LOAD_A]], i32 undef) define void @undef_no_return(i16* %a) { entry: %incdec.ptr21 = getelementptr inbounds i16, i16* %a, i32 3 @@ -48,7 +48,7 @@ ; CHECK: [[GEP16:%[^ ]+]] = getelementptr i16, i16* [[CAST_GEP8]], i32 %iv ; CHECK: [[CAST_GEP16:%[^ ]+]] = bitcast i16* [[GEP16]] to i32* ; CHECK: [[LOAD_B:%[^ ]+]] = load i32, i32* [[CAST_GEP16]], align 2 -; CHECK: [[ACC_NEXT]] = call i32 @llvm.arm.smladx(i32 [[LOAD_A]], i32 [[LOAD_B]], i32 [[ACC]]) +; CHECK: [[ACC_NEXT]] = call i32 @llvm.arm.smladx(i32 [[LOAD_B]], i32 [[LOAD_A]], i32 [[ACC]]) define i32 @return(i16* %a, i8* %b, i32 %N) { entry: %incdec.ptr21 = getelementptr inbounds i16, i16* %a, i32 3 Index: test/CodeGen/ARM/ParallelDSP/pr43073.ll =================================================================== --- /dev/null +++ test/CodeGen/ARM/ParallelDSP/pr43073.ll @@ -0,0 +1,78 @@ +; RUN: opt -mtriple=thumbv7-unknown-linux-gnueabihf -arm-parallel-dsp -dce %s -S -o - | FileCheck %s + +; CHECK-LABEL: first_mul_invalid +; CHECK: [[ADDR_IN_MINUS_1:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -1 +; CHECK: [[LD_IN_MINUS_1:%[^ ]+]] = load i16, i16* [[ADDR_IN_MINUS_1]], align 2 +; CHECK: [[IN_MINUS_1:%[^ ]+]] = sext i16 [[LD_IN_MINUS_1]] to i32 +; CHECK: [[ADDR_B_PLUS_1:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 1 +; CHECK: [[LD_B_PLUS_1:%[^ ]+]] = load i16, i16* [[ADDR_B_PLUS_1]], align 2 +; CHECK: [[B_PLUS_1:%[^ ]+]] = sext i16 [[LD_B_PLUS_1]] to i32 +; CHECK: [[MUL0:%[^ ]+]] = mul nsw 
i32 [[B_PLUS_1]], [[IN_MINUS_1]] +; CHECK: [[ADD0:%[^ ]+]] = add i32 [[MUL0]] +; CHECK: [[ADDR_IN_MINUS_3:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -3 +; CHECK: [[CAST_ADDR_IN_MINUS_3:%[^ ]+]] = bitcast i16* [[ADDR_IN_MINUS_3]] to i32* +; CHECK: [[IN_MINUS_3:%[^ ]+]] = load i32, i32* [[CAST_ADDR_IN_MINUS_3]], align 2 +; CHECK: [[ADDR_B_PLUS_2:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 2 +; CHECK: [[CAST_ADDR_B_PLUS_2:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_2]] to i32* +; CHECK: [[B_PLUS_2:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_2]], align 2 +; CHECK: [[ADDR_IN_MINUS_5:%[^ ]+]] = getelementptr inbounds i16, i16* %in, i32 -5 +; CHECK: [[CAST_ADDR_IN_MINUS_5:%[^ ]+]] = bitcast i16* [[ADDR_IN_MINUS_5]] to i32* +; CHECK: [[IN_MINUS_5:%[^ ]+]] = load i32, i32* [[CAST_ADDR_IN_MINUS_5]], align 2 +; CHECK: [[ADDR_B_PLUS_4:%[^ ]+]] = getelementptr inbounds i16, i16* %b, i32 4 +; CHECK: [[CAST_ADDR_B_PLUS_4:%[^ ]+]] = bitcast i16* [[ADDR_B_PLUS_4]] to i32* +; CHECK: [[B_PLUS_4:%[^ ]+]] = load i32, i32* [[CAST_ADDR_B_PLUS_4]], align 2 +; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_5]], i32 [[B_PLUS_4]], i32 [[ADD0]]) +; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN_MINUS_3]], i32 [[B_PLUS_2]], i32 [[ACC]]) +; CHECK: ret i32 [[RES]] +define i32 @first_mul_invalid(i16* nocapture readonly %in, i16* nocapture readonly %b) { +entry: + %0 = load i16, i16* %in, align 2 + %conv = sext i16 %0 to i32 + %1 = load i16, i16* %b, align 2 + %conv2 = sext i16 %1 to i32 + %call = tail call i32 @bar(i32 %conv, i32 %conv2) + %arrayidx3 = getelementptr inbounds i16, i16* %in, i32 -1 + %2 = load i16, i16* %arrayidx3, align 2 + %conv4 = sext i16 %2 to i32 + %arrayidx5 = getelementptr inbounds i16, i16* %b, i32 1 + %3 = load i16, i16* %arrayidx5, align 2 + %conv6 = sext i16 %3 to i32 + %mul = mul nsw i32 %conv6, %conv4 + %add = add i32 %mul, %call + %arrayidx7 = getelementptr inbounds i16, i16* %in, i32 -2 + %4 = load i16, i16* %arrayidx7, 
align 2 + %conv8 = sext i16 %4 to i32 + %arrayidx9 = getelementptr inbounds i16, i16* %b, i32 2 + %5 = load i16, i16* %arrayidx9, align 2 + %conv10 = sext i16 %5 to i32 + %mul11 = mul nsw i32 %conv10, %conv8 + %add12 = add i32 %add, %mul11 + %arrayidx13 = getelementptr inbounds i16, i16* %in, i32 -3 + %6 = load i16, i16* %arrayidx13, align 2 + %conv14 = sext i16 %6 to i32 + %arrayidx15 = getelementptr inbounds i16, i16* %b, i32 3 + %7 = load i16, i16* %arrayidx15, align 2 + %conv16 = sext i16 %7 to i32 + %mul17 = mul nsw i32 %conv16, %conv14 + %add18 = add i32 %add12, %mul17 + %arrayidx19 = getelementptr inbounds i16, i16* %in, i32 -4 + %8 = load i16, i16* %arrayidx19, align 2 + %conv20 = sext i16 %8 to i32 + %arrayidx21 = getelementptr inbounds i16, i16* %b, i32 4 + %9 = load i16, i16* %arrayidx21, align 2 + %conv22 = sext i16 %9 to i32 + %mul23 = mul nsw i32 %conv22, %conv20 + %add24 = add i32 %add18, %mul23 + %arrayidx25 = getelementptr inbounds i16, i16* %in, i32 -5 + %10 = load i16, i16* %arrayidx25, align 2 + %conv26 = sext i16 %10 to i32 + %arrayidx27 = getelementptr inbounds i16, i16* %b, i32 5 + %11 = load i16, i16* %arrayidx27, align 2 + %conv28 = sext i16 %11 to i32 + %mul29 = mul nsw i32 %conv28, %conv26 + %add30 = add i32 %add24, %mul29 + ret i32 %add30 +} + +declare dso_local i32 @bar(i32, i32) local_unnamed_addr + Index: test/CodeGen/ARM/ParallelDSP/smlad11.ll =================================================================== --- test/CodeGen/ARM/ParallelDSP/smlad11.ll +++ test/CodeGen/ARM/ParallelDSP/smlad11.ll @@ -12,8 +12,8 @@ ; CHECK: [[V9:%[0-9]+]] = load i32, i32* [[V8]], align 2 ; CHECK: [[V13:%[0-9]+]] = bitcast i16* %arrayidx17 to i32* ; CHECK: [[V14:%[0-9]+]] = load i32, i32* [[V13]], align 2 -; CHECK: [[V12:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V9]], i32 [[V11]], i32 %mac1{{\.}}054) -; CHECK: [[V17:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V14]], i32 [[V16]], i32 [[V12]]) +; CHECK: [[V12:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 
[[V14]], i32 [[V16]], i32 %mac1{{\.}}054) +; CHECK: [[V17:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V9]], i32 [[V11]], i32 [[V12]]) ; ; And we don't want to see a 3rd smlad: ; CHECK-NOT: call i32 @llvm.arm.smlad Index: test/CodeGen/ARM/ParallelDSP/smlad12.ll =================================================================== --- test/CodeGen/ARM/ParallelDSP/smlad12.ll +++ test/CodeGen/ARM/ParallelDSP/smlad12.ll @@ -2,7 +2,7 @@ ; ; The loop header is not the loop latch. ; -; CHECK-NOT: call i32 @llvm.arm.smlad +; CHECK: call i32 @llvm.arm.smlad ; define dso_local i32 @test(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) { entry: Index: test/CodeGen/ARM/ParallelDSP/smlaldx-2.ll =================================================================== --- test/CodeGen/ARM/ParallelDSP/smlaldx-2.ll +++ test/CodeGen/ARM/ParallelDSP/smlaldx-2.ll @@ -195,8 +195,8 @@ ; CHECK: [[PIN1_CAST:%[^ ]+]] = bitcast i16* [[PIN1]] to i32* ; CHECK: [[IN1:%[^ ]+]] = load i32, i32* [[PIN1_CAST]], align 2 -; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2]], i32 [[IN1_2]], i64 [[ACC0]]) -; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN1]], i32 [[IN2_2]], i64 [[ACC1]]) +; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2_2]], i32 [[IN1]], i64 [[ACC0]]) +; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN2]], i32 [[IN1_2]], i64 [[ACC1]]) ; CHECK: [[PIN1_NEXT]] = getelementptr i16, i16* [[PIN1]], i32 4 ; CHECK: [[PIN2_NEXT]] = getelementptr i16, i16* [[PIN2]], i32 -4