Index: lib/Target/ARM/ARMParallelDSP.cpp
===================================================================
--- lib/Target/ARM/ARMParallelDSP.cpp
+++ lib/Target/ARM/ARMParallelDSP.cpp
@@ -47,9 +47,9 @@
   struct BinOpChain;
   struct Reduction;
 
-  using OpChainList     = SmallVector<OpChain *, 8>;
+  using OpChainList     = SmallVector<std::unique_ptr<OpChain>, 8>;
   using ReductionList   = SmallVector<Reduction, 8>;
-  using ValueList       = SmallVector<Value *, 8>;
+  using ValueList       = SmallSetVector<Value *, 8>;
   using MemInstList     = SmallVector<Instruction *, 8>;
   using PMACPair        = std::pair<BinOpChain *, BinOpChain *>;
   using PMACPairList    = SmallVector<PMACPair, 8>;
@@ -59,25 +59,45 @@
   struct OpChain {
     Instruction *Root;
     ValueList AllValues;
-    MemInstList VecLd;          // List of all load instructions.
-    MemLocList MemLocs;         // All memory locations read by this tree.
+    MemInstList VecLd;          // List of all load instructions.
+    MemLocList MemReadLocs;     // All memory locations read by this tree.
+    MemLocList MemWriteLocs;    // All memory locations written.
     bool ReadOnly = true;
 
+    OpChain(Instruction *I) : Root(I) { }
     OpChain(Instruction *I, ValueList &vl) : Root(I), AllValues(vl) { }
 
-    void SetMemoryLocations() {
+    void Finalise() {
+      AllValues.insert(Root);
       const auto Size = MemoryLocation::UnknownSize;
       for (auto *V : AllValues) {
         if (auto *I = dyn_cast<Instruction>(V)) {
-          if (I->mayWriteToMemory())
+          if (I->mayWriteToMemory()) {
+            assert(isa<StoreInst>(I) && "Expect only stores to write memory");
+            MemWriteLocs.push_back(MemoryLocation(
+              cast<StoreInst>(I)->getPointerOperand(), Size));
             ReadOnly = false;
-          if (auto *Ld = dyn_cast<LoadInst>(V))
-            MemLocs.push_back(MemoryLocation(Ld->getPointerOperand(), Size));
+          }
+          if (auto *Ld = dyn_cast<LoadInst>(V)) {
+            MemReadLocs.push_back(
+              MemoryLocation(Ld->getPointerOperand(), Size));
+          }
         }
       }
     }
 
+    MemLocList &Reads() {
+      return MemReadLocs;
+    }
+
+    MemLocList &Writes() {
+      return MemWriteLocs;
+    }
+
     unsigned size() const { return AllValues.size(); }
+
+    bool contains(Value *V) const { return AllValues.count(V) != 0; }
   };
 
   // 'BinOpChain' and 'Reduction' are just some bookkeeping data structures.
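A minimal sketch (not part of the patch) of why ValueList becomes a SmallSetVector in the hunk above: the chain walk can reach the same extend or load through both operands of a mul, and Finalise()/Parallelise() want each value recorded exactly once, in insertion order. This assumes the usual llvm/ADT/SetVector.h interface; the helper name below is hypothetical.

  #include "llvm/ADT/SetVector.h"
  #include "llvm/IR/Value.h"
  using namespace llvm;

  // Record a value for widening at most once: insert() is a no-op returning
  // false when V is already tracked, yet iteration still follows insertion
  // order, which the widening loop relies on when it walks AllValues.
  static bool recordForWidening(SmallSetVector<Value *, 8> &VL, Value *V) {
    return VL.insert(V);
  }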
@@ -91,33 +111,119 @@
     BinOpChain(Instruction *I, ValueList &lhs, ValueList &rhs) :
       OpChain(I, lhs), LHS(lhs), RHS(rhs) {
       for (auto *V : RHS)
-        AllValues.push_back(V);
+        AllValues.insert(V);
+    }
+  };
+
+  class ParallelChains {
+  protected:
+    SmallVector<Instruction *, 8> Roots;
+    std::map<Instruction *, unsigned> ChainMap;
+    OpChainList Chains;
+
+  public:
+    ParallelChains() { }
+
+    void addRoot(Instruction *I, OpChain *Chain) {
+      assert(!ChainMap.count(I) &&
+             "Root already added to parallel sequence");
+      ChainMap[I] = Roots.size();
+      Roots.push_back(I);
+      Chain->Finalise();
+      Chains.push_back(std::unique_ptr<OpChain>{Chain});
+    }
+
+    OpChain *getSequence(unsigned i) {
+      assert(ChainMap.count(Roots[i]) &&
+             "trying to get unknown sequence");
+      return Chains[i].get();
+    }
+
+    OpChainList &getAllCandidates() {
+      return Chains;
+    }
+
+    bool contains(Value *V) {
+      for (unsigned i = 0; i < Chains.size(); ++i) {
+        OpChain *C = Chains[i].get();
+        if (C->contains(V))
+          return true;
       }
+      return false;
+    }
   };
 
-  struct Reduction {
+  class SuperWord : public ParallelChains {
+  public:
+
+    SuperWord(Instruction *I, OpChain *Chain) {
+      ChainMap[I] = Roots.size();
+      Roots.push_back(I);
+      Chains.push_back(std::unique_ptr<OpChain>{Chain});
+      Chain->Finalise();
+    }
+
+    ~SuperWord() {
+      for (auto *I : Roots) {
+        ValueList &AllValues = Chains[ChainMap[I]].get()->AllValues;
+        AllValues.remove(I);
+        I->dropAllReferences();
+        I->removeFromParent();
+
+        for (auto VI = AllValues.rbegin(), E = AllValues.rend(); VI != E;
+             ++VI) {
+          Value *V = *VI;
+          if (V->hasNUses(0)) {
+            if (auto *I = dyn_cast<Instruction>(V)) {
+              I->dropAllReferences();
+              I->removeFromParent();
+            }
+          }
+        }
+      }
+    }
+
+    unsigned getNumLanes() const { return Roots.size(); }
+
+    unsigned getElementSize() const {
+      assert(!Roots.empty() && "Invalid access of Roots");
+
+      Type *Ty = Roots.front()->getType();
+      if (auto *Store = dyn_cast<StoreInst>(Roots.front()))
+        Ty = Store->getValueOperand()->getType();
+      return Ty->getPrimitiveSizeInBits();
+    }
+
+    unsigned getVectorLength() const {
+      unsigned Length = 32 / getElementSize();
+      return Length > Roots.size() ? Roots.size() : Length;
+    }
+  };
+
+  struct Reduction : public ParallelChains {
     PHINode *Phi;             // The Phi-node from where we start
                               // pattern matching.
     Instruction *AccIntAdd;   // The accumulating integer add statement,
                               // i.e, the reduction statement.
-    OpChainList MACCandidates;    // The MAC candidates associated with
-                                  // this reduction statement.
-    Reduction (PHINode *P, Instruction *Acc) : Phi(P), AccIntAdd(Acc) { };
+    Reduction (PHINode *P, Instruction *Acc) : Phi(P), AccIntAdd(Acc) { }
+
+    bool isValidMemoryAccess();
   };
 
   class ARMParallelDSP : public LoopPass {
-    ScalarEvolution   *SE;
-    AliasAnalysis     *AA;
-    TargetLibraryInfo *TLI;
-    DominatorTree     *DT;
-    LoopInfo          *LI;
-    Loop              *L;
-    const DataLayout  *DL;
-    Module            *M;
+    ScalarEvolution   *SE = nullptr;
+    AliasAnalysis     *AA = nullptr;
+    TargetLibraryInfo *TLI = nullptr;
+    DominatorTree     *DT = nullptr;
+    Loop              *L = nullptr;
+    const DataLayout  *DL = nullptr;
+    Module            *M = nullptr;
 
     bool InsertParallelMACs(Reduction &Reduction, PMACPairList &PMACPairs);
     bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
+    bool AreSequentialStores(StoreInst *St0, StoreInst *St1,
+                             MemInstList &VecMem);
     PMACPairList CreateParallelMACPairs(OpChainList &Candidates);
     Instruction *CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
                                  Instruction *Acc, Instruction *InsertAfter);
@@ -128,6 +234,8 @@
     /// exchange the halfwords of the second operand before performing the
     /// arithmetic.
     bool MatchSMLAD(Function &F);
+    void Parallelise(OpChain *ParallelInsts, unsigned BitWidth);
+    bool FindParallelChains();
 
   public:
     static char ID;
@@ -153,7 +261,6 @@
       AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
       TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
       DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-      LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
       auto &TPC = getAnalysis<TargetPassConfig>();
 
       BasicBlock *Header = TheLoop->getHeader();
@@ -188,11 +295,12 @@
         return false;
       }
 
-      LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI);
       bool Changes = false;
 
-      LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n\n");
+      LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
+      LLVM_DEBUG(dbgs() << "Function: " << F.getName() << ", " << *L << "\n");
       Changes = MatchSMLAD(F);
+      Changes |= FindParallelChains();
       return Changes;
     }
   };
@@ -206,7 +314,13 @@
 template<unsigned MaxBitWidth>
 static bool IsNarrowSequence(Value *V, ValueList &VL) {
   LLVM_DEBUG(dbgs() << "Is narrow sequence? "; V->dump());
-  ConstantInt *CInt;
+  ConstantInt *CInt = nullptr;
+
+  auto IsNarrowType = [&](Type *Ty) {
+    if (auto *PtrTy = dyn_cast<PointerType>(Ty))
+      Ty = PtrTy->getElementType();
+    return Ty->getPrimitiveSizeInBits() == MaxBitWidth;
+  };
 
   if (match(V, m_ConstantInt(CInt))) {
     // TODO: if a constant is used, it needs to fit within the bit width.
@@ -218,25 +332,28 @@
     return false;
 
   Value *Val, *LHS, *RHS;
-  if (match(V, m_Trunc(m_Value(Val)))) {
-    if (cast<TruncInst>(I)->getDestTy()->getIntegerBitWidth() == MaxBitWidth)
-      return IsNarrowSequence<MaxBitWidth>(Val, VL);
-  } else if (match(V, m_Add(m_Value(LHS), m_Value(RHS)))) {
-    // TODO: we need to implement sadd16/sadd8 for this, which enables to
-    // also do the rewrite for smlad8.ll, but it is unsupported for now.
-    LLVM_DEBUG(dbgs() << "No, unsupported Op:\t"; I->dump());
-    return false;
-  } else if (match(V, m_ZExtOrSExt(m_Value(Val)))) {
-    if (cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() != MaxBitWidth) {
-      LLVM_DEBUG(dbgs() << "No, wrong SrcTy size: " <<
-        cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() << "\n");
-      return false;
+  if (match(I, m_Trunc(m_Value(Val)))) {
+    if (IsNarrowType(cast<TruncInst>(I)->getDestTy()))
+      return IsNarrowSequence<MaxBitWidth>(Val, VL);
+  } else if (match(I, m_Add(m_Value(LHS), m_Value(RHS)))) {
+    LLVM_DEBUG(dbgs() << "Found add in sequence: " << *V << "\n");
+    if (IsNarrowSequence<MaxBitWidth>(LHS, VL) &&
+        IsNarrowSequence<MaxBitWidth>(RHS, VL)) {
+      VL.insert(I);
+      return true;
     }
-
-    if (match(Val, m_Load(m_Value()))) {
-      LLVM_DEBUG(dbgs() << "Yes, found narrow Load:\t"; Val->dump());
-      VL.push_back(Val);
-      VL.push_back(I);
+  } else if (match(I, m_ZExtOrSExt(m_Value(Val)))) {
+    if (IsNarrowType(cast<CastInst>(I)->getSrcTy()) &&
+        IsNarrowSequence<MaxBitWidth>(Val, VL)) {
+      VL.insert(I);
+      return true;
+    } else
+      LLVM_DEBUG(dbgs() << "No, wrong SrcTy size: " <<
+        cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() << "\n");
+  } else if (auto *Ld = dyn_cast<LoadInst>(I)) {
+    if (Ld->isSimple() && IsNarrowType(Ld->getPointerOperandType())) {
+      LLVM_DEBUG(dbgs() << "Yes, found narrow Load:\t"; V->dump());
+      VL.insert(I);
       return true;
     }
   }
@@ -320,6 +437,19 @@
   return AreSequentialAccesses<LoadInst>(Ld0, Ld1, VecMem, *DL, *SE);
 }
 
+bool ARMParallelDSP::AreSequentialStores(StoreInst *St0, StoreInst *St1,
+                                         MemInstList &VecMem) {
+  if (!St0 || !St1)
+    return false;
+
+  LLVM_DEBUG(dbgs() << "Are consecutive stores:\n";
+             dbgs() << "St0:"; St0->dump();
+             dbgs() << "St1:"; St1->dump();
+            );
+
+  return AreSequentialAccesses<StoreInst>(St0, St1, VecMem, *DL, *SE);
+}
+
 PMACPairList
 ARMParallelDSP::CreateParallelMACPairs(OpChainList &Candidates) {
   const unsigned Elems = Candidates.size();
@@ -332,8 +462,8 @@
   // We can compare all elements, but then we need to compare and evaluate
   // different solutions.
   for(unsigned i=0; i<Elems; i+=2) {
-    BinOpChain *PMul0 = static_cast<BinOpChain*>(Candidates[i]);
-    BinOpChain *PMul1 = static_cast<BinOpChain*>(Candidates[i+1]);
+    BinOpChain *PMul0 = static_cast<BinOpChain*>(Candidates[i].get());
+    BinOpChain *PMul1 = static_cast<BinOpChain*>(Candidates[i+1].get());
     const Instruction *Mul0 = PMul0->Root;
     const Instruction *Mul1 = PMul1->Root;
 
@@ -349,6 +479,11 @@
     const ValueList &Mul1_LHS = PMul1->LHS;
     const ValueList &Mul1_RHS = PMul1->RHS;
 
+    if (Mul0_LHS.size() != Mul1_LHS.size() ||
+        Mul0_RHS.size() != Mul1_RHS.size() ||
+        Mul0_LHS.size() != Mul1_RHS.size())
+      continue;
+
     if (!AreSymmetrical(Mul0_LHS, Mul1_LHS) ||
         !AreSymmetrical(Mul0_RHS, Mul1_RHS))
       continue;
@@ -445,46 +580,45 @@
   );
 }
 
-static void AddMACCandidate(OpChainList &Candidates,
-                            const Instruction *Acc,
-                            Value *MulOp0, Value *MulOp1, int MulOpNum) {
-  Instruction *Mul = dyn_cast<Instruction>(Acc->getOperand(MulOpNum));
-  LLVM_DEBUG(dbgs() << "OK, found acc mul:\t"; Mul->dump());
-  ValueList LHS;
-  ValueList RHS;
-  if (IsNarrowSequence<16>(MulOp0, LHS) &&
-      IsNarrowSequence<16>(MulOp1, RHS)) {
-    LLVM_DEBUG(dbgs() << "OK, found narrow mul: "; Mul->dump());
-    Candidates.push_back(new BinOpChain(Mul, LHS, RHS));
-  }
-}
-
-static void MatchParallelMACSequences(Reduction &R,
-                                      OpChainList &Candidates) {
-  const Instruction *Acc = R.AccIntAdd;
+static void MatchParallelMACSequences(Reduction &R) {
+  Instruction *Acc = R.AccIntAdd;
   Value *A, *MulOp0, *MulOp1;
   LLVM_DEBUG(dbgs() << "\n- Analysing:\t"; Acc->dump());
 
+  auto AddMACCandidate = [&](Instruction *Acc, Value *MulOp0, Value *MulOp1,
+                             int MulOpNum) {
+    Instruction *Mul = dyn_cast<Instruction>(Acc->getOperand(MulOpNum));
+    LLVM_DEBUG(dbgs() << "OK, found acc mul:\t"; Mul->dump());
+    ValueList LHS;
+    ValueList RHS;
+    if (IsNarrowSequence<16>(MulOp0, LHS) &&
+        IsNarrowSequence<16>(MulOp1, RHS)) {
+      LLVM_DEBUG(dbgs() << "OK, found narrow mul: "; Mul->dump());
+      R.addRoot(Acc, new BinOpChain(Mul, LHS, RHS));
+    }
+  };
+
   // Pattern 1: the accumulator is the RHS of the mul.
   while(match(Acc, m_Add(m_Mul(m_Value(MulOp0), m_Value(MulOp1)),
                          m_Value(A)))){
-    AddMACCandidate(Candidates, Acc, MulOp0, MulOp1, 0);
+    AddMACCandidate(Acc, MulOp0, MulOp1, 0);
     Acc = dyn_cast<Instruction>(A);
   }
   // Pattern 2: the accumulator is the LHS of the mul.
   while(match(Acc, m_Add(m_Value(A),
                          m_Mul(m_Value(MulOp0), m_Value(MulOp1))))) {
-    AddMACCandidate(Candidates, Acc, MulOp0, MulOp1, 1);
+    AddMACCandidate(Acc, MulOp0, MulOp1, 1);
     Acc = dyn_cast<Instruction>(A);
   }
 
   // The last mul in the chain has a slightly different pattern:
   // the mul is the first operand
   if (match(Acc, m_Add(m_Mul(m_Value(MulOp0), m_Value(MulOp1)), m_Value(A))))
-    AddMACCandidate(Candidates, Acc, MulOp0, MulOp1, 0);
+    AddMACCandidate(Acc, MulOp0, MulOp1, 0);
 
   // Because we start at the bottom of the chain, and we work our way up,
   // the muls are added in reverse program order to the list.
+  OpChainList &Candidates = R.getAllCandidates();
   std::reverse(Candidates.begin(), Candidates.end());
 }
 
@@ -502,49 +636,69 @@
 // Check whether statements in the basic block that write to memory alias with
 // the memory locations accessed by the MAC-chains.
-// TODO: we need the read statements when we accept more complicated chains.
 static bool AreAliased(AliasAnalysis *AA, Instructions &Reads,
-                       Instructions &Writes, OpChainList &MACCandidates) {
+                       Instructions &Writes, ParallelChains &ParallelInsts) {
   LLVM_DEBUG(dbgs() << "Alias checks:\n");
-  for (auto *MAC : MACCandidates) {
-    LLVM_DEBUG(dbgs() << "mul: "; MAC->Root->dump());
-    // At the moment, we allow only simple chains that only consist of reads,
-    // accumulate their result with an integer add, and thus that don't write
-    // memory, and simply bail if they do.
-    if (!MAC->ReadOnly)
-      return true;
+  auto DoAlias = [&](OpChain *C, Instructions &Insts, MemLocList &MemLocs) {
+    for (auto *I : Insts) {
+
+      // Any writes (stores) within ParallelInsts will be sequential and of
+      // the same size, so they won't be accessing the same memory location.
+      if (I->mayWriteToMemory() && ParallelInsts.contains(I))
+        continue;
 
-    // Now for all writes in the basic block, check that they don't alias with
-    // the memory locations accessed by our MAC-chain:
-    for (auto *I : Writes) {
-      LLVM_DEBUG(dbgs() << "- "; I->dump());
-      assert(MAC->MemLocs.size() >= 2 && "expecting at least 2 memlocs");
-      for (auto &MemLoc : MAC->MemLocs) {
+      for (auto &MemLoc : MemLocs) {
         if (isModOrRefSet(intersectModRef(AA->getModRefInfo(I, MemLoc),
                                           ModRefInfo::ModRef))) {
-          LLVM_DEBUG(dbgs() << "Yes, aliases found\n");
+          LLVM_DEBUG(dbgs() << "Yes, aliases found with: " << *I << "\n");
          return true;
        }
      }
    }
+    return false;
+  };
+
+  OpChainList &Candidates = ParallelInsts.getAllCandidates();
+  for (unsigned i = 0; i < Candidates.size(); ++i) {
+    OpChain *Chain = Candidates[i].get();
+    LLVM_DEBUG(dbgs() << "Root: " << *Chain->Root << "\n");
+
+    if (DoAlias(Chain, Writes, Chain->Reads()))
+      return true;
+
+    LLVM_DEBUG(dbgs() << "Chain reads are ok.\n");
+
+    if (Chain->ReadOnly)
+      continue;
+
+    if (DoAlias(Chain, Reads, Chain->Writes())) {
+      LLVM_DEBUG(dbgs() << "Chain writes interfere with reads.\n");
+      return true;
+    }
+    if (DoAlias(Chain, Writes, Chain->Writes())) {
+      LLVM_DEBUG(dbgs() << "Chain writes interfere with other writes.\n");
+      return true;
+    }
   }
   LLVM_DEBUG(dbgs() << "OK: no aliases found!\n");
   return false;
 }
 
-static bool CheckMACMemory(OpChainList &Candidates) {
-  for (auto *C : Candidates) {
+bool Reduction::isValidMemoryAccess() {
+  for (unsigned i = 0; i < Chains.size(); ++i) {
+    OpChain *C = Chains[i].get();
     // A mul has 2 operands, and a narrow op consist of sext and a load; thus
     // we expect at least 4 items in this operand value list.
    if (C->size() < 4) {
      LLVM_DEBUG(dbgs() << "Operand list too short.\n");
      return false;
    }
-    C->SetMemoryLocations();
    ValueList &LHS = static_cast<BinOpChain*>(C)->LHS;
    ValueList &RHS = static_cast<BinOpChain*>(C)->RHS;
+    if (LHS.size() != RHS.size())
+      return false;
 
    // Use +=2 to skip over the expected extend instructions.
    for (unsigned i = 0, e = LHS.size(); i < e; i += 2) {
@@ -555,6 +709,163 @@
   return true;
 }
 
+static LoadInst *CreateWideLoad(IRBuilder<NoFolder> &IRB, const Type *WideTy,
+                                LoadInst *VecLd) {
+  const unsigned AddrSpace = VecLd->getPointerAddressSpace();
+
+  Value *VecPtr = IRB.CreateBitCast(VecLd->getPointerOperand(),
+                                    WideTy->getPointerTo(AddrSpace));
+  return IRB.CreateAlignedLoad(VecPtr, VecLd->getAlignment());
+}
+
+static StoreInst *CreateWideStore(IRBuilder<NoFolder> &IRB, const Type *WideTy,
+                                  StoreInst *VecSt, Value *V) {
+  const unsigned AddrSpace = VecSt->getPointerAddressSpace();
+
+  Value *VecPtr = IRB.CreateBitCast(VecSt->getPointerOperand(),
+                                    WideTy->getPointerTo(AddrSpace));
+  return IRB.CreateAlignedStore(V, VecPtr, VecSt->getAlignment());
+}
+
+void ARMParallelDSP::Parallelise(OpChain *ParallelInsts,
+                                 unsigned BitWidth) {
+
+  ValueToValueMap WideInsts;
+  Instruction *InsertAfter = ParallelInsts->Root;
+  IRBuilder<NoFolder> Builder(L->getLoopLatch(),
+                              ++BasicBlock::iterator(InsertAfter));
+
+  auto CreateParallelBinOp = [&](Instruction *I, Intrinsic::ID IntNo) {
+    Function *DSPInst = Intrinsic::getDeclaration(M, IntNo);
+    Value *Args[] = { WideInsts[I->getOperand(0)],
+                      WideInsts[I->getOperand(1)] };
+    return Builder.CreateCall(DSPInst, Args);
+  };
+
+  Type *WideTy = Type::getInt32Ty(M->getContext());
+
+  for (auto *V : ParallelInsts->AllValues) {
+    LLVM_DEBUG(dbgs() << "Widening: " << *V << "\n");
+    if (auto *I = dyn_cast<Instruction>(V)) {
+      switch (I->getOpcode()) {
+      case Instruction::Add: {
+        Intrinsic::ID SADD = BitWidth == 8 ?
+          Intrinsic::arm_sadd8 : Intrinsic::arm_sadd16;
+        WideInsts[I] = CreateParallelBinOp(I, SADD);
+        break;
+      }
+      case Instruction::Load:
+        WideInsts[I] = CreateWideLoad(Builder, WideTy, cast<LoadInst>(I));
+        break;
+      case Instruction::Store: {
+        auto *St = cast<StoreInst>(I);
+        WideInsts[I] = CreateWideStore(Builder, WideTy, St,
+                                       WideInsts[St->getValueOperand()]);
+        break;
+      }
+      }
+    }
+  }
+}
+
+/// Search the given loop for store instructions, then search up from them to
+/// find valid narrow sequences. From those, we then build maximal sets of
+/// sequential stores.
+bool ARMParallelDSP::FindParallelChains() {
+  SmallVector<OpChain*, 8> Candidates;
+  std::map<Instruction*, OpChain*> CandidateMap;
+
+  for (auto &I : *L->getLoopLatch()) {
+    if (!isa<StoreInst>(I))
+      continue;
+
+    ValueList VL;
+    Value *V = cast<StoreInst>(I).getValueOperand();
+    if (IsNarrowSequence<16>(V, VL)) {
+      LLVM_DEBUG(dbgs() << "NarrowSequence:\n");
+      LLVM_DEBUG(for (auto *V : VL) V->dump(););
+      Candidates.push_back(new OpChain(&I, VL));
+      CandidateMap[&I] = Candidates.back();
+    }
+  }
+
+  if (Candidates.empty())
+    return false;
+
+  // Check every store against the other stores to find sequential ones.
+  // Map stores to their base store (if it exists) and also record all the
+  // subsequent stores of that base. SequentialStores will use the base store
+  // as the key to a vector of its subsequent, and sequential, accesses.
+  std::map<StoreInst*, StoreInst*> BaseStores;
+  std::map<StoreInst*, SmallVector<StoreInst*, 4>> SequentialStores;
+
+  for (auto *Cand0 : Candidates) {
+    for (auto *Cand1 : Candidates) {
+      if (Cand0->Root == Cand1->Root ||
+          !AreSymmetrical(Cand0->AllValues, Cand1->AllValues))
+        continue;
+
+      auto *St0 = cast<StoreInst>(Cand0->Root);
+      auto *St1 = cast<StoreInst>(Cand1->Root);
+      MemInstList VecMem;
+
+      if (AreSequentialStores(St0, St1, VecMem)) {
+        if (BaseStores.count(St0)) {
+          StoreInst *Base = BaseStores[St0];
+          BaseStores[St1] = Base;
+          SequentialStores[Base].push_back(St1);
+        } else {
+          BaseStores[St1] = St0;
+          SequentialStores[St0].push_back(St1);
+        }
+      }
+    }
+  }
+  if (SequentialStores.empty())
+    return false;
+
+  LLVM_DEBUG(dbgs() << "Found " << SequentialStores.size()
+             << " group(s) of sequential stores:\n";
+             for (auto &I : SequentialStores) {
+               LLVM_DEBUG(dbgs() << "  " << *I.first << "\n");
+               for (auto *St : I.second)
+                 LLVM_DEBUG(dbgs() << "  " << *St << "\n");
+             });
+
+  // We've found some sequential stores, which could be:
+  //   i8 x 2,3,4, ...
+  //   i16 x 2,3,4, ...
+  // So we need to find the element width to calculate the vector width. The
+  // vector width will then determine how we step over the list of stores as
+  // we parallelise their operand chains.
+  for (auto &I : SequentialStores) {
+    StoreInst *BaseStore = I.first;
+    SmallVectorImpl<StoreInst*> &SubsequentStores = I.second;
+    SuperWord SW(BaseStore, CandidateMap[BaseStore]);
+    for (auto *St : SubsequentStores)
+      SW.addRoot(St, CandidateMap[St]);
+
+    Instructions Reads;
+    Instructions Writes;
+    AliasCandidates(L->getLoopLatch(), Reads, Writes);
+    if (AreAliased(AA, Reads, Writes, SW))
+      continue;
+
+    unsigned VecLength = SW.getVectorLength();
+    unsigned NumVecInsts = SW.getNumLanes() / SW.getVectorLength();
+    unsigned BitWidth = SW.getElementSize();
+
+    LLVM_DEBUG(dbgs() << "Parallelising " << SW.getNumLanes()
+               << " chains using " << NumVecInsts << " intrinsics.\n");
+    LLVM_DEBUG(dbgs() << "Vector length = " << SW.getVectorLength() << "\n");
+
+    for (unsigned i = 0; i < NumVecInsts; ++i)
+      Parallelise(SW.getSequence(i*VecLength), BitWidth);
+  }
+
+  return true;
+}
+
 // Loop Pass that needs to identify integer add/sub reductions of 16-bit vector
 // multiplications.
// To use SMLAD: @@ -598,15 +909,12 @@ MatchReductions(F, L, Header, Reductions); for (auto &R : Reductions) { - OpChainList MACCandidates; - MatchParallelMACSequences(R, MACCandidates); - if (!CheckMACMemory(MACCandidates)) + MatchParallelMACSequences(R); + if (!R.isValidMemoryAccess()) continue; - R.MACCandidates = MACCandidates; - LLVM_DEBUG(dbgs() << "MAC candidates:\n"; - for (auto &M : R.MACCandidates) + for (auto &M : R.getAllCandidates()) M->Root->dump(); dbgs() << "\n";); } @@ -618,28 +926,16 @@ AliasCandidates(Header, Reads, Writes); for (auto &R : Reductions) { - if (AreAliased(AA, Reads, Writes, R.MACCandidates)) + if (AreAliased(AA, Reads, Writes, R)) return false; - PMACPairList PMACPairs = CreateParallelMACPairs(R.MACCandidates); + PMACPairList PMACPairs = CreateParallelMACPairs(R.getAllCandidates()); Changed |= InsertParallelMACs(R, PMACPairs); - for (auto *C : R.MACCandidates) - delete C; } LLVM_DEBUG(if (Changed) dbgs() << "Header block:\n"; Header->dump();); return Changed; } -static void CreateLoadIns(IRBuilder &IRB, Instruction *Acc, - LoadInst **VecLd) { - const Type *AccTy = Acc->getType(); - const unsigned AddrSpace = (*VecLd)->getPointerAddressSpace(); - - Value *VecPtr = IRB.CreateBitCast((*VecLd)->getPointerOperand(), - AccTy->getPointerTo(AddrSpace)); - *VecLd = IRB.CreateAlignedLoad(VecPtr, (*VecLd)->getAlignment()); -} - Instruction *ARMParallelDSP::CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1, Instruction *Acc, Instruction *InsertAfter) { @@ -652,9 +948,9 @@ ++BasicBlock::iterator(InsertAfter)); // Replace the reduction chain with an intrinsic call - CreateLoadIns(Builder, Acc, &VecLd0); - CreateLoadIns(Builder, Acc, &VecLd1); - Value* Args[] = { VecLd0, VecLd1, Acc }; + LoadInst *Ld0 = CreateWideLoad(Builder, Acc->getType(), VecLd0); + LoadInst *Ld1 = CreateWideLoad(Builder, Acc->getType(), VecLd1); + Value* Args[] = { Ld0, Ld1, Acc }; Function *SMLAD = Intrinsic::getDeclaration(M, Intrinsic::arm_smlad); CallInst *Call = Builder.CreateCall(SMLAD, Args); NumSMLAD++; Index: test/CodeGen/ARM/sadd16-alias.ll =================================================================== --- /dev/null +++ test/CodeGen/ARM/sadd16-alias.ll @@ -0,0 +1,130 @@ +; RUN: opt -mtriple=thumbv8.main -mcpu=cortex-m33 -arm-parallel-dsp -S %s -o - | FileCheck %s + +; CHECK-LABEL: two_seq_add_i16 +; CHECK-NOT: call i32 @llvm.arm.sadd +define void @two_seq_add_i16(i32 %N, i16* readonly %A, + i16* readonly %B, i16* nocapture %Out) { +entry: + %cmp10 = icmp eq i32 %N, 0 + br i1 %cmp10, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret void + +for.body: + %i.011 = phi i32 [ %inc.1, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.011 + %0 = load i16, i16* %arrayidx, align 2 + %arrayidx1 = getelementptr inbounds i16, i16* %B, i32 %i.011 + %1 = load i16, i16* %arrayidx1, align 2 + %add = add i16 %1, %0 + %arrayidx4 = getelementptr inbounds i16, i16* %Out, i32 %i.011 + store i16 %add, i16* %arrayidx4, align 2 + %inc = add nuw i32 %i.011, 1 + %arrayidx.1 = getelementptr inbounds i16, i16* %A, i32 %inc + %2 = load i16, i16* %arrayidx.1, align 2 + %arrayidx1.1 = getelementptr inbounds i16, i16* %B, i32 %inc + %3 = load i16, i16* %arrayidx1.1, align 2 + %add.1 = add i16 %3, %2 + %arrayidx4.1 = getelementptr inbounds i16, i16* %Out, i32 %inc + store i16 %add.1, i16* %arrayidx4.1, align 2 + %inc.1 = add i32 %i.011, 2 + %exitcond.2 = icmp eq i32 %inc.1, %N + br i1 %exitcond.2, label %for.cond.cleanup, label %for.body +} + +; Acc could alias 
with Out +; CHECK-LABEL: @sadd16_acc +; CHECK-NOT: call i32 @llvm.arm.sadd +define void @sadd16_acc(i32 %N, i16* nocapture readonly %A, i16* nocapture readonly %B, + i16* noalias nocapture %Out, i16* nocapture %Acc) { +entry: + %cmp17 = icmp eq i32 %N, 0 + br i1 %cmp17, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %.pre = load i16, i16* %Acc, align 2 + %0 = add i32 %N, -1 + %xtraiter = and i32 %N, 3 + %1 = icmp ult i32 %0, 3 + br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new + +for.body.preheader.new: ; preds = %for.body.preheader + %unroll_iter = sub i32 %N, %xtraiter + br label %for.body + +for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader + %.unr = phi i16 [ %.pre, %for.body.preheader ], [ %add8.3, %for.body ] + %i.018.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ] + %lcmp.mod = icmp eq i32 %xtraiter, 0 + br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil + +for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil + %2 = phi i16 [ %add8.epil, %for.body.epil ], [ %.unr, %for.cond.cleanup.loopexit.unr-lcssa ] + %i.018.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.018.unr, %for.cond.cleanup.loopexit.unr-lcssa ] + %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ] + %arrayidx.epil = getelementptr inbounds i16, i16* %A, i32 %i.018.epil + %3 = load i16, i16* %arrayidx.epil, align 2 + %arrayidx1.epil = getelementptr inbounds i16, i16* %B, i32 %i.018.epil + %4 = load i16, i16* %arrayidx1.epil, align 2 + %add.epil = add i16 %4, %3 + %arrayidx4.epil = getelementptr inbounds i16, i16* %Out, i32 %i.018.epil + store i16 %add.epil, i16* %arrayidx4.epil, align 2 + %add8.epil = add i16 %2, %add.epil + store i16 %add8.epil, i16* %Acc, align 2 + %inc.epil = add nuw i32 %i.018.epil, 1 + %epil.iter.sub = add i32 %epil.iter, -1 + %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0 + br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry + ret void + +for.body: ; preds = %for.body, %for.body.preheader.new + %5 = phi i16 [ %.pre, %for.body.preheader.new ], [ %add8.3, %for.body ] + %i.018 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ] + %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ] + %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.018 + %6 = load i16, i16* %arrayidx, align 2 + %arrayidx1 = getelementptr inbounds i16, i16* %B, i32 %i.018 + %7 = load i16, i16* %arrayidx1, align 2 + %add = add i16 %7, %6 + %arrayidx4 = getelementptr inbounds i16, i16* %Out, i32 %i.018 + store i16 %add, i16* %arrayidx4, align 2 + %add8 = add i16 %5, %add + store i16 %add8, i16* %Acc, align 2 + %inc = or i32 %i.018, 1 + %arrayidx.1 = getelementptr inbounds i16, i16* %A, i32 %inc + %8 = load i16, i16* %arrayidx.1, align 2 + %arrayidx1.1 = getelementptr inbounds i16, i16* %B, i32 %inc + %9 = load i16, i16* %arrayidx1.1, align 2 + %add.1 = add i16 %9, %8 + %arrayidx4.1 = getelementptr inbounds i16, i16* %Out, i32 %inc + store i16 %add.1, i16* %arrayidx4.1, align 2 + %add8.1 = add i16 %add8, %add.1 + store i16 %add8.1, i16* %Acc, align 2 + %inc.1 = or i32 %i.018, 2 + %arrayidx.2 = getelementptr inbounds i16, i16* %A, i32 %inc.1 + %10 = load i16, i16* %arrayidx.2, align 2 + %arrayidx1.2 = getelementptr inbounds i16, i16* %B, i32 %inc.1 + %11 = load 
i16, i16* %arrayidx1.2, align 2 + %add.2 = add i16 %11, %10 + %arrayidx4.2 = getelementptr inbounds i16, i16* %Out, i32 %inc.1 + store i16 %add.2, i16* %arrayidx4.2, align 2 + %add8.2 = add i16 %add8.1, %add.2 + store i16 %add8.2, i16* %Acc, align 2 + %inc.2 = or i32 %i.018, 3 + %arrayidx.3 = getelementptr inbounds i16, i16* %A, i32 %inc.2 + %12 = load i16, i16* %arrayidx.3, align 2 + %arrayidx1.3 = getelementptr inbounds i16, i16* %B, i32 %inc.2 + %13 = load i16, i16* %arrayidx1.3, align 2 + %add.3 = add i16 %13, %12 + %arrayidx4.3 = getelementptr inbounds i16, i16* %Out, i32 %inc.2 + store i16 %add.3, i16* %arrayidx4.3, align 2 + %add8.3 = add i16 %add8.2, %add.3 + store i16 %add8.3, i16* %Acc, align 2 + %inc.3 = add i32 %i.018, 4 + %niter.nsub.3 = add i32 %niter, -4 + %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0 + br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body +} Index: test/CodeGen/ARM/sadd16.ll =================================================================== --- /dev/null +++ test/CodeGen/ARM/sadd16.ll @@ -0,0 +1,428 @@ +; RUN: opt -mtriple=thumbv8.main -mcpu=cortex-m33 -arm-parallel-dsp -S %s -o - | FileCheck %s + +; CHECK: @two_seq_add_i16 +define void @two_seq_add_i16(i32 %N, i16* readonly %A, + i16* readonly %B, i16* noalias nocapture %Out) { +entry: + %cmp10 = icmp eq i32 %N, 0 + br i1 %cmp10, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret void + +; CHECK-LABEL: for.body: +; CHECK: [[B0:[^ ]+]] = bitcast i16* %arrayidx1 to i32* +; CHECK: [[LD_B0:[^ ]+]] = load i32, i32* [[B0]], align 2 +; CHECK: [[A0:[^ ]+]] = bitcast i16* %arrayidx to i32* +; CHECK: [[LD_A0:[^ ]+]] = load i32, i32* [[A0]], align 2 +; CHECK: [[RES:[^ ]+]] = call i32 @llvm.arm.sadd16(i32 [[LD_B0]], i32 [[LD_A0]]) +; CHECK: [[C0:[^ ]+]] = bitcast i16* %arrayidx4 to i32* +; CHECK: store i32 [[RES]], i32* [[C0]], align 2 +for.body: + %i.011 = phi i32 [ %inc.1, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.011 + %0 = load i16, i16* %arrayidx, align 2 + %arrayidx1 = getelementptr inbounds i16, i16* %B, i32 %i.011 + %1 = load i16, i16* %arrayidx1, align 2 + %add = add i16 %1, %0 + %arrayidx4 = getelementptr inbounds i16, i16* %Out, i32 %i.011 + store i16 %add, i16* %arrayidx4, align 2 + %inc = add nuw i32 %i.011, 1 + %arrayidx.1 = getelementptr inbounds i16, i16* %A, i32 %inc + %2 = load i16, i16* %arrayidx.1, align 2 + %arrayidx1.1 = getelementptr inbounds i16, i16* %B, i32 %inc + %3 = load i16, i16* %arrayidx1.1, align 2 + %add.1 = add i16 %3, %2 + %arrayidx4.1 = getelementptr inbounds i16, i16* %Out, i32 %inc + store i16 %add.1, i16* %arrayidx4.1, align 2 + %inc.1 = add i32 %i.011, 2 + %exitcond.2 = icmp eq i32 %inc.1, %N + br i1 %exitcond.2, label %for.cond.cleanup, label %for.body +} + +; CHECK: @four_seq_add_i16 +define void @four_seq_add_i16(i32 %N, i16* noalias nocapture readonly %A, + i16* noalias nocapture readonly %B, + i16* noalias nocapture %Out) { +entry: + %cmp12 = icmp eq i32 %N, 0 + br i1 %cmp12, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: + %0 = add i32 %N, -1 + %xtraiter = and i32 %N, 3 + %1 = icmp ult i32 %0, 3 + br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new + +for.body.preheader.new: + %unroll_iter = sub i32 %N, %xtraiter + br label %for.body + +for.cond.cleanup.loopexit.unr-lcssa: + %i.013.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ] + %lcmp.mod = icmp eq i32 %xtraiter, 0 + br i1 %lcmp.mod, label %for.cond.cleanup, 
label %for.body.epil + +for.body.epil: + %arrayidx.epil = getelementptr inbounds i16, i16* %A, i32 %i.013.unr + %2 = load i16, i16* %arrayidx.epil, align 2 + %arrayidx1.epil = getelementptr inbounds i16, i16* %B, i32 %i.013.unr + %3 = load i16, i16* %arrayidx1.epil, align 2 + %add.epil = add i16 %3, %2 + %arrayidx4.epil = getelementptr inbounds i16, i16* %Out, i32 %i.013.unr + store i16 %add.epil, i16* %arrayidx4.epil, align 2 + %inc.epil = add nuw i32 %i.013.unr, 1 + %epil.iter.cmp = icmp eq i32 %xtraiter, 1 + br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil.1 + +for.cond.cleanup: + ret void + +; CHECK-LABEL: for.body: +; CHECK: [[B0:%[^ ]+]] = bitcast i16* %arrayidx1 to i32* +; CHECK: [[LD_B0:%[^ ]+]] = load i32, i32* [[B0]], align 2 +; CHECK: [[A0:%[^ ]+]] = bitcast i16* %arrayidx to i32* +; CHECK: [[LD_A0:%[^ ]+]] = load i32, i32* [[A0]], align 2 +; CHECK: [[SADD0:%[^ ]+]] = call i32 @llvm.arm.sadd16(i32 [[LD_B0]], i32 [[LD_A0]]) +; CHECK: [[C0:%[^ ]+]] = bitcast i16* %arrayidx4 to i32* +; CHECK: store i32 [[SADD0]], i32* [[C0]], align 2 + +; CHECK: [[B2:%[^ ]+]] = bitcast i16* %arrayidx1.2 to i32* +; CHECK: [[LD_B2:%[^ ]+]] = load i32, i32* [[B2]], align 2 +; CHECK: [[A2:%[^ ]+]] = bitcast i16* %arrayidx.2 to i32* +; CHECK: [[LD_A2:%[^ ]+]] = load i32, i32* [[A2]], align 2 +; CHECK: [[SADD1:%[^ ]+]] = call i32 @llvm.arm.sadd16(i32 [[LD_B2]], i32 [[LD_A2]]) +; CHECK: [[C2:%[^ ]+]] = bitcast i16* %arrayidx4.2 to i32* +; CHECK: store i32 [[SADD1]], i32* [[C2]], align 2 +for.body: + %i.013 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ] + %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ] + %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.013 + %4 = load i16, i16* %arrayidx, align 2 + %arrayidx1 = getelementptr inbounds i16, i16* %B, i32 %i.013 + %5 = load i16, i16* %arrayidx1, align 2 + %add = add i16 %5, %4 + %arrayidx4 = getelementptr inbounds i16, i16* %Out, i32 %i.013 + store i16 %add, i16* %arrayidx4, align 2 + %inc = or i32 %i.013, 1 + %arrayidx.1 = getelementptr inbounds i16, i16* %A, i32 %inc + %6 = load i16, i16* %arrayidx.1, align 2 + %arrayidx1.1 = getelementptr inbounds i16, i16* %B, i32 %inc + %7 = load i16, i16* %arrayidx1.1, align 2 + %add.1 = add i16 %7, %6 + %arrayidx4.1 = getelementptr inbounds i16, i16* %Out, i32 %inc + store i16 %add.1, i16* %arrayidx4.1, align 2 + %inc.1 = or i32 %i.013, 2 + %arrayidx.2 = getelementptr inbounds i16, i16* %A, i32 %inc.1 + %8 = load i16, i16* %arrayidx.2, align 2 + %arrayidx1.2 = getelementptr inbounds i16, i16* %B, i32 %inc.1 + %9 = load i16, i16* %arrayidx1.2, align 2 + %add.2 = add i16 %9, %8 + %arrayidx4.2 = getelementptr inbounds i16, i16* %Out, i32 %inc.1 + store i16 %add.2, i16* %arrayidx4.2, align 2 + %inc.2 = or i32 %i.013, 3 + %arrayidx.3 = getelementptr inbounds i16, i16* %A, i32 %inc.2 + %10 = load i16, i16* %arrayidx.3, align 2 + %arrayidx1.3 = getelementptr inbounds i16, i16* %B, i32 %inc.2 + %11 = load i16, i16* %arrayidx1.3, align 2 + %add.3 = add i16 %11, %10 + %arrayidx4.3 = getelementptr inbounds i16, i16* %Out, i32 %inc.2 + store i16 %add.3, i16* %arrayidx4.3, align 2 + %inc.3 = add i32 %i.013, 4 + %niter.nsub.3 = add i32 %niter, -4 + %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0 + br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body + +for.body.epil.1: + %arrayidx.epil.1 = getelementptr inbounds i16, i16* %A, i32 %inc.epil + %12 = load i16, i16* %arrayidx.epil.1, align 2 + %arrayidx1.epil.1 = getelementptr 
inbounds i16, i16* %B, i32 %inc.epil + %13 = load i16, i16* %arrayidx1.epil.1, align 2 + %add.epil.1 = add i16 %13, %12 + %arrayidx4.epil.1 = getelementptr inbounds i16, i16* %Out, i32 %inc.epil + store i16 %add.epil.1, i16* %arrayidx4.epil.1, align 2 + %inc.epil.1 = add i32 %i.013.unr, 2 + %epil.iter.cmp.1 = icmp eq i32 %xtraiter, 2 + br i1 %epil.iter.cmp.1, label %for.cond.cleanup, label %for.body.epil.2 + +for.body.epil.2: + %arrayidx.epil.2 = getelementptr inbounds i16, i16* %A, i32 %inc.epil.1 + %14 = load i16, i16* %arrayidx.epil.2, align 2 + %arrayidx1.epil.2 = getelementptr inbounds i16, i16* %B, i32 %inc.epil.1 + %15 = load i16, i16* %arrayidx1.epil.2, align 2 + %add.epil.2 = add i16 %15, %14 + %arrayidx4.epil.2 = getelementptr inbounds i16, i16* %Out, i32 %inc.epil.1 + store i16 %add.epil.2, i16* %arrayidx4.epil.2, align 2 + br label %for.cond.cleanup +} + +; CHECK: @sadd16_restrict_acc +define void @sadd16_restrict_acc(i32 %N, i16* nocapture readonly %A, i16* nocapture readonly %B, + i16* noalias nocapture %Out, i16* noalias nocapture %Acc) { +entry: + %cmp17 = icmp eq i32 %N, 0 + br i1 %cmp17, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %.pre = load i16, i16* %Acc, align 2 + %0 = add i32 %N, -1 + %xtraiter = and i32 %N, 3 + %1 = icmp ult i32 %0, 3 + br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new + +for.body.preheader.new: ; preds = %for.body.preheader + %unroll_iter = sub i32 %N, %xtraiter + br label %for.body + +for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader + %.unr = phi i16 [ %.pre, %for.body.preheader ], [ %add8.3, %for.body ] + %i.018.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ] + %lcmp.mod = icmp eq i32 %xtraiter, 0 + br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil + +for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil + %2 = phi i16 [ %add8.epil, %for.body.epil ], [ %.unr, %for.cond.cleanup.loopexit.unr-lcssa ] + %i.018.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.018.unr, %for.cond.cleanup.loopexit.unr-lcssa ] + %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ] + %arrayidx.epil = getelementptr inbounds i16, i16* %A, i32 %i.018.epil + %3 = load i16, i16* %arrayidx.epil, align 2 + %arrayidx1.epil = getelementptr inbounds i16, i16* %B, i32 %i.018.epil + %4 = load i16, i16* %arrayidx1.epil, align 2 + %add.epil = add i16 %4, %3 + %arrayidx4.epil = getelementptr inbounds i16, i16* %Out, i32 %i.018.epil + store i16 %add.epil, i16* %arrayidx4.epil, align 2 + %add8.epil = add i16 %2, %add.epil + store i16 %add8.epil, i16* %Acc, align 2 + %inc.epil = add nuw i32 %i.018.epil, 1 + %epil.iter.sub = add i32 %epil.iter, -1 + %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0 + br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry + ret void + +; CHECK-LABEL: for.body: +; CHECK: [[B0:%[^ ]+]] = bitcast i16* %arrayidx1 to i32* +; CHECK: [[LD_B0:%[^ ]+]] = load i32, i32* [[B0]], align 2 +; CHECK: [[A0:%[^ ]+]] = bitcast i16* %arrayidx to i32* +; CHECK: [[LD_A0:%[^ ]+]] = load i32, i32* [[A0]], align 2 +; CHECK: [[SADD0:%[^ ]+]] = call i32 @llvm.arm.sadd16(i32 [[LD_B0]], i32 [[LD_A0]]) +; CHECK: [[C0:%[^ ]+]] = bitcast i16* %arrayidx4 to i32* +; CHECK: store i32 [[SADD0]], i32* [[C0]], align 2 + +; CHECK: [[B2:%[^ ]+]] = bitcast i16* %arrayidx1.2 to 
i32* +; CHECK: [[LD_B2:%[^ ]+]] = load i32, i32* [[B2]], align 2 +; CHECK: [[A2:%[^ ]+]] = bitcast i16* %arrayidx.2 to i32* +; CHECK: [[LD_A2:%[^ ]+]] = load i32, i32* [[A2]], align 2 +; CHECK: [[SADD1:%[^ ]+]] = call i32 @llvm.arm.sadd16(i32 [[LD_B2]], i32 [[LD_A2]]) +; CHECK: [[C2:%[^ ]+]] = bitcast i16* %arrayidx4.2 to i32* +; CHECK: store i32 [[SADD1]], i32* [[C2]], align 2 +for.body: ; preds = %for.body, %for.body.preheader.new + %5 = phi i16 [ %.pre, %for.body.preheader.new ], [ %add8.3, %for.body ] + %i.018 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ] + %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ] + %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.018 + %6 = load i16, i16* %arrayidx, align 2 + %arrayidx1 = getelementptr inbounds i16, i16* %B, i32 %i.018 + %7 = load i16, i16* %arrayidx1, align 2 + %add = add i16 %7, %6 + %arrayidx4 = getelementptr inbounds i16, i16* %Out, i32 %i.018 + store i16 %add, i16* %arrayidx4, align 2 + %add8 = add i16 %5, %add + store i16 %add8, i16* %Acc, align 2 + %inc = or i32 %i.018, 1 + %arrayidx.1 = getelementptr inbounds i16, i16* %A, i32 %inc + %8 = load i16, i16* %arrayidx.1, align 2 + %arrayidx1.1 = getelementptr inbounds i16, i16* %B, i32 %inc + %9 = load i16, i16* %arrayidx1.1, align 2 + %add.1 = add i16 %9, %8 + %arrayidx4.1 = getelementptr inbounds i16, i16* %Out, i32 %inc + store i16 %add.1, i16* %arrayidx4.1, align 2 + %add8.1 = add i16 %add8, %add.1 + store i16 %add8.1, i16* %Acc, align 2 + %inc.1 = or i32 %i.018, 2 + %arrayidx.2 = getelementptr inbounds i16, i16* %A, i32 %inc.1 + %10 = load i16, i16* %arrayidx.2, align 2 + %arrayidx1.2 = getelementptr inbounds i16, i16* %B, i32 %inc.1 + %11 = load i16, i16* %arrayidx1.2, align 2 + %add.2 = add i16 %11, %10 + %arrayidx4.2 = getelementptr inbounds i16, i16* %Out, i32 %inc.1 + store i16 %add.2, i16* %arrayidx4.2, align 2 + %add8.2 = add i16 %add8.1, %add.2 + store i16 %add8.2, i16* %Acc, align 2 + %inc.2 = or i32 %i.018, 3 + %arrayidx.3 = getelementptr inbounds i16, i16* %A, i32 %inc.2 + %12 = load i16, i16* %arrayidx.3, align 2 + %arrayidx1.3 = getelementptr inbounds i16, i16* %B, i32 %inc.2 + %13 = load i16, i16* %arrayidx1.3, align 2 + %add.3 = add i16 %13, %12 + %arrayidx4.3 = getelementptr inbounds i16, i16* %Out, i32 %inc.2 + store i16 %add.3, i16* %arrayidx4.3, align 2 + %add8.3 = add i16 %add8.2, %add.3 + store i16 %add8.3, i16* %Acc, align 2 + %inc.3 = add i32 %i.018, 4 + %niter.nsub.3 = add i32 %niter, -4 + %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0 + br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body +} + +; CHECK: @sadd16_unroll5 +define void @sadd16_unroll5(i16* nocapture readonly %A, i16* nocapture readonly %B, + i16* noalias nocapture %Out) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +; CHECK-LABEL: for.body: +; CHECK: [[B0:%[^ ]+]] = bitcast i16* %arrayidx1 to i32* +; CHECK: [[LD_B0:%[^ ]+]] = load i32, i32* [[B0]], align 2 +; CHECK: [[A0:%[^ ]+]] = bitcast i16* %arrayidx to i32* +; CHECK: [[LD_A0:%[^ ]+]] = load i32, i32* [[A0]], align 2 +; CHECK: [[SADD0:%[^ ]+]] = call i32 @llvm.arm.sadd16(i32 [[LD_B0]], i32 [[LD_A0]]) +; CHECK: [[C0:%[^ ]+]] = bitcast i16* %arrayidx4 to i32* +; CHECK: store i32 [[SADD0]], i32* [[C0]], align 2 + +; CHECK: [[B2:%[^ ]+]] = bitcast i16* %arrayidx1.2 to i32* +; CHECK: [[LD_B2:%[^ ]+]] = load i32, i32* [[B2]], align 2 +; CHECK: [[A2:%[^ ]+]] = bitcast i16* %arrayidx.2 to i32* +; CHECK: [[LD_A2:%[^ ]+]] = 
load i32, i32* [[A2]], align 2 +; CHECK: [[SADD1:%[^ ]+]] = call i32 @llvm.arm.sadd16(i32 [[LD_B2]], i32 [[LD_A2]]) +; CHECK: [[C2:%[^ ]+]] = bitcast i16* %arrayidx4.2 to i32* +; CHECK: store i32 [[SADD1]], i32* [[C2]], align 2 + +; CHECK-NOT: call i32 @llvm.arm.sadd +; CHECK: br i1 %exitcond +for.body: ; preds = %for.body, %entry + %i.010 = phi i32 [ 0, %entry ], [ %inc.4, %for.body ] + %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.010 + %0 = load i16, i16* %arrayidx, align 2 + %arrayidx1 = getelementptr inbounds i16, i16* %B, i32 %i.010 + %1 = load i16, i16* %arrayidx1, align 2 + %add = add i16 %1, %0 + %arrayidx4 = getelementptr inbounds i16, i16* %Out, i32 %i.010 + store i16 %add, i16* %arrayidx4, align 2 + %inc = add nuw nsw i32 %i.010, 1 + %arrayidx.1 = getelementptr inbounds i16, i16* %A, i32 %inc + %2 = load i16, i16* %arrayidx.1, align 2 + %arrayidx1.1 = getelementptr inbounds i16, i16* %B, i32 %inc + %3 = load i16, i16* %arrayidx1.1, align 2 + %add.1 = add i16 %3, %2 + %arrayidx4.1 = getelementptr inbounds i16, i16* %Out, i32 %inc + store i16 %add.1, i16* %arrayidx4.1, align 2 + %inc.1 = add nuw nsw i32 %i.010, 2 + %arrayidx.2 = getelementptr inbounds i16, i16* %A, i32 %inc.1 + %4 = load i16, i16* %arrayidx.2, align 2 + %arrayidx1.2 = getelementptr inbounds i16, i16* %B, i32 %inc.1 + %5 = load i16, i16* %arrayidx1.2, align 2 + %add.2 = add i16 %5, %4 + %arrayidx4.2 = getelementptr inbounds i16, i16* %Out, i32 %inc.1 + store i16 %add.2, i16* %arrayidx4.2, align 2 + %inc.2 = add nuw nsw i32 %i.010, 3 + %arrayidx.3 = getelementptr inbounds i16, i16* %A, i32 %inc.2 + %6 = load i16, i16* %arrayidx.3, align 2 + %arrayidx1.3 = getelementptr inbounds i16, i16* %B, i32 %inc.2 + %7 = load i16, i16* %arrayidx1.3, align 2 + %add.3 = add i16 %7, %6 + %arrayidx4.3 = getelementptr inbounds i16, i16* %Out, i32 %inc.2 + store i16 %add.3, i16* %arrayidx4.3, align 2 + %inc.3 = add nuw nsw i32 %i.010, 4 + %arrayidx.4 = getelementptr inbounds i16, i16* %A, i32 %inc.3 + %8 = load i16, i16* %arrayidx.4, align 2 + %arrayidx1.4 = getelementptr inbounds i16, i16* %B, i32 %inc.3 + %9 = load i16, i16* %arrayidx1.4, align 2 + %add.4 = add i16 %9, %8 + %arrayidx4.4 = getelementptr inbounds i16, i16* %Out, i32 %inc.3 + store i16 %add.4, i16* %arrayidx4.4, align 2 + %inc.4 = add nuw nsw i32 %i.010, 5 + %exitcond.4 = icmp eq i32 %inc.4, 120 + br i1 %exitcond.4, label %for.cond.cleanup, label %for.body +} + +; CHECK: @sadd16_unroll6 +define void @sadd16_unroll6(i16* nocapture readonly %A, i16* nocapture readonly %B, + i16* noalias nocapture %Out) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +; CHECK-LABEL: for.body: +; CHECK: [[B0:%[^ ]+]] = bitcast i16* %arrayidx1 to i32* +; CHECK: [[LD_B0:%[^ ]+]] = load i32, i32* [[B0]], align 2 +; CHECK: [[A0:%[^ ]+]] = bitcast i16* %arrayidx to i32* +; CHECK: [[LD_A0:%[^ ]+]] = load i32, i32* [[A0]], align 2 +; CHECK: [[SADD0:%[^ ]+]] = call i32 @llvm.arm.sadd16(i32 [[LD_B0]], i32 [[LD_A0]]) +; CHECK: [[C0:%[^ ]+]] = bitcast i16* %arrayidx4 to i32* +; CHECK: store i32 [[SADD0]], i32* [[C0]], align 2 + +; CHECK: [[B2:%[^ ]+]] = bitcast i16* %arrayidx1.2 to i32* +; CHECK: [[LD_B2:%[^ ]+]] = load i32, i32* [[B2]], align 2 +; CHECK: [[A2:%[^ ]+]] = bitcast i16* %arrayidx.2 to i32* +; CHECK: [[LD_A2:%[^ ]+]] = load i32, i32* [[A2]], align 2 +; CHECK: [[SADD1:%[^ ]+]] = call i32 @llvm.arm.sadd16(i32 [[LD_B2]], i32 [[LD_A2]]) +; CHECK: [[C2:%[^ ]+]] = bitcast i16* %arrayidx4.2 to i32* +; CHECK: store i32 [[SADD1]], i32* [[C2]], 
align 2 + +; CHECK: [[B3:%[^ ]+]] = bitcast i16* %arrayidx1.4 to i32* +; CHECK: [[LD_B3:%[^ ]+]] = load i32, i32* [[B3]], align 2 +; CHECK: [[A3:%[^ ]+]] = bitcast i16* %arrayidx.4 to i32* +; CHECK: [[LD_A3:%[^ ]+]] = load i32, i32* [[A3]], align 2 +; CHECK: [[SADD2:%[^ ]+]] = call i32 @llvm.arm.sadd16(i32 [[LD_B3]], i32 [[LD_A3]]) +; CHECK: [[C3:%[^ ]+]] = bitcast i16* %arrayidx4.4 to i32* +; CHECK: store i32 [[SADD2]], i32* [[C3]], align 2 +for.body: ; preds = %for.body, %entry + %i.010 = phi i32 [ 0, %entry ], [ %inc.5, %for.body ] + %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.010 + %0 = load i16, i16* %arrayidx, align 2 + %arrayidx1 = getelementptr inbounds i16, i16* %B, i32 %i.010 + %1 = load i16, i16* %arrayidx1, align 2 + %add = add i16 %1, %0 + %arrayidx4 = getelementptr inbounds i16, i16* %Out, i32 %i.010 + store i16 %add, i16* %arrayidx4, align 2 + %inc = or i32 %i.010, 1 + %arrayidx.1 = getelementptr inbounds i16, i16* %A, i32 %inc + %2 = load i16, i16* %arrayidx.1, align 2 + %arrayidx1.1 = getelementptr inbounds i16, i16* %B, i32 %inc + %3 = load i16, i16* %arrayidx1.1, align 2 + %add.1 = add i16 %3, %2 + %arrayidx4.1 = getelementptr inbounds i16, i16* %Out, i32 %inc + store i16 %add.1, i16* %arrayidx4.1, align 2 + %inc.1 = add nuw nsw i32 %i.010, 2 + %arrayidx.2 = getelementptr inbounds i16, i16* %A, i32 %inc.1 + %4 = load i16, i16* %arrayidx.2, align 2 + %arrayidx1.2 = getelementptr inbounds i16, i16* %B, i32 %inc.1 + %5 = load i16, i16* %arrayidx1.2, align 2 + %add.2 = add i16 %5, %4 + %arrayidx4.2 = getelementptr inbounds i16, i16* %Out, i32 %inc.1 + store i16 %add.2, i16* %arrayidx4.2, align 2 + %inc.2 = add nuw nsw i32 %i.010, 3 + %arrayidx.3 = getelementptr inbounds i16, i16* %A, i32 %inc.2 + %6 = load i16, i16* %arrayidx.3, align 2 + %arrayidx1.3 = getelementptr inbounds i16, i16* %B, i32 %inc.2 + %7 = load i16, i16* %arrayidx1.3, align 2 + %add.3 = add i16 %7, %6 + %arrayidx4.3 = getelementptr inbounds i16, i16* %Out, i32 %inc.2 + store i16 %add.3, i16* %arrayidx4.3, align 2 + %inc.3 = add nuw nsw i32 %i.010, 4 + %arrayidx.4 = getelementptr inbounds i16, i16* %A, i32 %inc.3 + %8 = load i16, i16* %arrayidx.4, align 2 + %arrayidx1.4 = getelementptr inbounds i16, i16* %B, i32 %inc.3 + %9 = load i16, i16* %arrayidx1.4, align 2 + %add.4 = add i16 %9, %8 + %arrayidx4.4 = getelementptr inbounds i16, i16* %Out, i32 %inc.3 + store i16 %add.4, i16* %arrayidx4.4, align 2 + %inc.4 = add nuw nsw i32 %i.010, 5 + %arrayidx.5 = getelementptr inbounds i16, i16* %A, i32 %inc.4 + %10 = load i16, i16* %arrayidx.5, align 2 + %arrayidx1.5 = getelementptr inbounds i16, i16* %B, i32 %inc.4 + %11 = load i16, i16* %arrayidx1.5, align 2 + %add.5 = add i16 %11, %10 + %arrayidx4.5 = getelementptr inbounds i16, i16* %Out, i32 %inc.4 + store i16 %add.5, i16* %arrayidx4.5, align 2 + %inc.5 = add nuw nsw i32 %i.010, 6 + %exitcond.5 = icmp eq i32 %inc.5, 120 + br i1 %exitcond.5, label %for.cond.cleanup, label %for.body +} Index: test/CodeGen/ARM/sadd8.ll =================================================================== --- /dev/null +++ test/CodeGen/ARM/sadd8.ll @@ -0,0 +1,83 @@ +; RUN: opt -mtriple=thumbv8.main -mcpu=cortex-m33 -arm-parallel-dsp -S %s -o - | FileCheck %s + +; CHECK: @four_seq_add_i8 +define void @four_seq_add_i8(i32 %N, i8* noalias nocapture readonly %A, + i8* noalias nocapture readonly %B, + i8* noalias nocapture %Out) local_unnamed_addr #0 { +entry: + %cmp10 = icmp eq i32 %N, 0 + br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; 
preds = %entry + %0 = add i32 %N, -1 + %xtraiter = and i32 %N, 3 + %1 = icmp ult i32 %0, 3 + br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new + +for.body.preheader.new: ; preds = %for.body.preheader + %unroll_iter = sub i32 %N, %xtraiter + br label %for.body + +for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader + %i.011.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ] + %lcmp.mod = icmp eq i32 %xtraiter, 0 + br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil + +for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil + %i.011.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.011.unr, %for.cond.cleanup.loopexit.unr-lcssa ] + %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ] + %arrayidx.epil = getelementptr inbounds i8, i8* %A, i32 %i.011.epil + %2 = load i8, i8* %arrayidx.epil, align 1 + %arrayidx1.epil = getelementptr inbounds i8, i8* %B, i32 %i.011.epil + %3 = load i8, i8* %arrayidx1.epil, align 1 + %add.epil = add i8 %3, %2 + %arrayidx4.epil = getelementptr inbounds i8, i8* %Out, i32 %i.011.epil + store i8 %add.epil, i8* %arrayidx4.epil, align 1 + %inc.epil = add nuw i32 %i.011.epil, 1 + %epil.iter.sub = add i32 %epil.iter, -1 + %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0 + br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry + ret void + +; CHECK-NOT: call i32 @llvm.arm.sadd +for.body: ; preds = %for.body, %for.body.preheader.new + %i.011 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ] + %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ] + %arrayidx = getelementptr inbounds i8, i8* %A, i32 %i.011 + %4 = load i8, i8* %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, i8* %B, i32 %i.011 + %5 = load i8, i8* %arrayidx1, align 1 + %add = add i8 %5, %4 + %arrayidx4 = getelementptr inbounds i8, i8* %Out, i32 %i.011 + store i8 %add, i8* %arrayidx4, align 1 + %inc = or i32 %i.011, 1 + %arrayidx.1 = getelementptr inbounds i8, i8* %A, i32 %inc + %6 = load i8, i8* %arrayidx.1, align 1 + %arrayidx1.1 = getelementptr inbounds i8, i8* %B, i32 %inc + %7 = load i8, i8* %arrayidx1.1, align 1 + %add.1 = add i8 %7, %6 + %arrayidx4.1 = getelementptr inbounds i8, i8* %Out, i32 %inc + store i8 %add.1, i8* %arrayidx4.1, align 1 + %inc.1 = or i32 %i.011, 2 + %arrayidx.2 = getelementptr inbounds i8, i8* %A, i32 %inc.1 + %8 = load i8, i8* %arrayidx.2, align 1 + %arrayidx1.2 = getelementptr inbounds i8, i8* %B, i32 %inc.1 + %9 = load i8, i8* %arrayidx1.2, align 1 + %add.2 = add i8 %9, %8 + %arrayidx4.2 = getelementptr inbounds i8, i8* %Out, i32 %inc.1 + store i8 %add.2, i8* %arrayidx4.2, align 1 + %inc.2 = or i32 %i.011, 3 + %arrayidx.3 = getelementptr inbounds i8, i8* %A, i32 %inc.2 + %10 = load i8, i8* %arrayidx.3, align 1 + %arrayidx1.3 = getelementptr inbounds i8, i8* %B, i32 %inc.2 + %11 = load i8, i8* %arrayidx1.3, align 1 + %add.3 = add i8 %11, %10 + %arrayidx4.3 = getelementptr inbounds i8, i8* %Out, i32 %inc.2 + store i8 %add.3, i8* %arrayidx4.3, align 1 + %inc.3 = add i32 %i.011, 4 + %niter.nsub.3 = add i32 %niter, -4 + %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0 + br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body +}
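
For reference, a hypothetical C-level loop that the two_seq_add_i16 / four_seq_add_i16 tests above correspond to (an assumption reconstructed from the IR, not taken from the patch): each iteration performs two or four sequential i16 lanes of Out[i] = A[i] + B[i], and when Out cannot alias A and B (the noalias cases in sadd16.ll) FindParallelChains() widens each pair of lanes into one i32 load per input array, a call to llvm.arm.sadd16, and one i32 store.

  #include <cstdint>

  // Hypothetical source for the unrolled-by-2 case (two_seq_add_i16);
  // the i8 tests follow the same shape with int8_t elements.
  void two_seq_add_i16(uint32_t N, const int16_t *A, const int16_t *B,
                       int16_t *__restrict Out) {
    for (uint32_t i = 0; i != N; i += 2) {
      Out[i]     = int16_t(A[i] + B[i]);         // lane 0
      Out[i + 1] = int16_t(A[i + 1] + B[i + 1]); // lane 1
    }
  }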