Index: lib/Target/ARM/ARMParallelDSP.cpp
===================================================================
--- lib/Target/ARM/ARMParallelDSP.cpp
+++ lib/Target/ARM/ARMParallelDSP.cpp
@@ -39,29 +39,31 @@
 #define DEBUG_TYPE "parallel-dsp"
 
 namespace {
-  struct ParallelMAC;
+  struct BinOpSequence;
   struct Reduction;
 
-  using ParallelMACList = SmallVector<ParallelMAC, 8>;
+  using BinOpSequenceList = SmallVector<BinOpSequence, 8>;
   using ReductionList = SmallVector<Reduction, 8>;
   using ValueList = SmallVector<Value*, 8>;
   using MemInstList = SmallVector<Instruction*, 8>;
-  using PMACPair = std::pair<ParallelMAC*, ParallelMAC*>;
+  using PMACPair = std::pair<BinOpSequence*, BinOpSequence*>;
   using PMACPairList = SmallVector<PMACPair, 8>;
   using Instructions = SmallVector<Instruction*, 16>;
   using MemLocList = SmallVector<MemoryLocation, 4>;
 
-  // 'ParallelMAC' and 'Reduction' are just some bookkeeping data structures.
+  // 'BinOpSequence' and 'Reduction' are just some bookkeeping data structures.
   // 'Reduction' contains the phi-node and accumulator statement from where we
-  // start pattern matching, and 'ParallelMAC' the multiplication
+  // start pattern matching, and 'BinOpSequence' the multiplication
   // instructions that are candidates for parallel execution.
-  struct ParallelMAC {
-    Instruction *Mul;
-    ValueList VL;       // List of all (narrow) operands of this Mul
-    MemInstList VecLd;  // List of all load instructions of this Mul
-    MemLocList MemLocs; // All memory locations read by this Mul
-
-    ParallelMAC(Instruction *I, ValueList &V) : Mul(I), VL(V) {};
+  struct BinOpSequence {
+    Instruction *Root;
+    ValueList LHS;      // List of all (narrow) left hand operands.
+    ValueList RHS;      // List of all (narrow) right hand operands.
+    MemInstList VecLd;  // List of all load instructions.
+    MemLocList MemLocs; // All memory locations read by this tree.
+
+    BinOpSequence(Instruction *I, ValueList &lhs, ValueList &rhs) :
+      Root(I), LHS(lhs), RHS(rhs) {};
   };
 
   struct Reduction {
@@ -85,7 +87,7 @@
     bool InsertParallelMACs(Reduction &Reduction, PMACPairList &PMACPairs);
     bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
-    PMACPairList CreateParallelMACPairs(ParallelMACList &Candidates);
+    PMACPairList CreateParallelMACPairs(BinOpSequenceList &Candidates);
     Instruction *CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
                                  Instruction *Acc, Instruction *InsertAfter);
@@ -288,7 +290,7 @@
 }
 
 PMACPairList
-ARMParallelDSP::CreateParallelMACPairs(ParallelMACList &Candidates) {
+ARMParallelDSP::CreateParallelMACPairs(BinOpSequenceList &Candidates) {
   const unsigned Elems = Candidates.size();
   PMACPairList PMACPairs;
@@ -299,10 +301,10 @@
   // We can compare all elements, but then we need to compare and evaluate
   // different solutions.
   for(unsigned i=0; i<Elems; i+=2) {
-    const ParallelMAC &PMul0 = Candidates[i];
-    const ParallelMAC &PMul1 = Candidates[i+1];
-    const Instruction *Mul0 = PMul0.Mul;
-    const Instruction *Mul1 = PMul1.Mul;
+    const BinOpSequence &PMul0 = Candidates[i];
+    const BinOpSequence &PMul1 = Candidates[i+1];
+    const Instruction *Mul0 = PMul0.Root;
+    const Instruction *Mul1 = PMul1.Root;
 
     LLVM_DEBUG(dbgs() << "\nCheck parallel muls:\n";
               dbgs() << "- "; Mul0->dump();
               dbgs() << "- "; Mul1->dump());
 
-    const ValueList &VL0 = PMul0.VL;
-    const ValueList &VL1 = PMul1.VL;
+    const ValueList &Mul0_LHS = PMul0.LHS;
+    const ValueList &Mul0_RHS = PMul0.RHS;
+    const ValueList &Mul1_LHS = PMul1.LHS;
+    const ValueList &Mul1_RHS = PMul1.RHS;
 
-    if (!AreSymmetrical(VL0, VL1))
+    if (!AreSymmetrical(Mul0_LHS, Mul1_LHS) ||
+        !AreSymmetrical(Mul0_RHS, Mul1_RHS))
       continue;
 
     LLVM_DEBUG(dbgs() << "OK: mul operands list match:\n");
@@ -322,18 +327,18 @@
       // that its two pairs of consecutive loads, then these can be transformed
       // into two wider loads and the users can be replaced with DSP
      // intrinsics.
-      for (unsigned x = 0; x < VL0.size(); x += 4) {
-        auto *Ld0 = dyn_cast<LoadInst>(VL0[x]);
-        auto *Ld1 = dyn_cast<LoadInst>(VL1[x]);
-        auto *Ld2 = dyn_cast<LoadInst>(VL0[x+2]);
-        auto *Ld3 = dyn_cast<LoadInst>(VL1[x+2]);
+      for (unsigned x = 0; x < Mul0_LHS.size(); x += 2) {
+        auto *Ld0 = dyn_cast<LoadInst>(Mul0_LHS[x]);
+        auto *Ld1 = dyn_cast<LoadInst>(Mul1_LHS[x]);
+        auto *Ld2 = dyn_cast<LoadInst>(Mul0_RHS[x]);
+        auto *Ld3 = dyn_cast<LoadInst>(Mul1_RHS[x]);
 
         LLVM_DEBUG(dbgs() << "Looking at operands " << x << ":\n";
-                   dbgs() << "\t mul1: "; VL0[x]->dump();
-                   dbgs() << "\t mul2: "; VL1[x]->dump();
+                   dbgs() << "\t mul1: "; Mul0_LHS[x]->dump();
+                   dbgs() << "\t mul2: "; Mul1_LHS[x]->dump();
                    dbgs() << "and operands " << x + 2 << ":\n";
-                   dbgs() << "\t mul1: "; VL0[x+2]->dump();
-                   dbgs() << "\t mul2: "; VL1[x+2]->dump());
+                   dbgs() << "\t mul1: "; Mul0_RHS[x]->dump();
+                   dbgs() << "\t mul2: "; Mul1_RHS[x]->dump());
 
         if (AreSequentialLoads(Ld0, Ld1, Candidates[i].VecLd) &&
             AreSequentialLoads(Ld2, Ld3, Candidates[i+1].VecLd)) {
@@ -352,8 +357,8 @@
   for (auto &Pair : PMACPairs) {
     LLVM_DEBUG(dbgs() << "Found parallel MACs!!\n";
-               dbgs() << "- "; Pair.first->Mul->dump();
-               dbgs() << "- "; Pair.second->Mul->dump());
+               dbgs() << "- "; Pair.first->Root->dump();
+               dbgs() << "- "; Pair.second->Root->dump());
     auto *VecLd0 = cast<LoadInst>(Pair.first->VecLd[0]);
     auto *VecLd1 = cast<LoadInst>(Pair.second->VecLd[0]);
     Acc = CreateSMLADCall(VecLd0, VecLd1, Acc, InsertAfter);
@@ -368,9 +373,8 @@
   return false;
 }
 
-static ReductionList MatchReductions(Function &F, Loop *TheLoop,
-                                     BasicBlock *Header) {
-  ReductionList Reductions;
+static void MatchReductions(Function &F, Loop *TheLoop, BasicBlock *Header,
+                            ReductionList &Reductions) {
   RecurrenceDescriptor RecDesc;
   const bool HasFnNoNaNAttr =
     F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
@@ -378,7 +382,7 @@
   // We need a preheader as getIncomingValueForBlock assumes there is one.
   if (!TheLoop->getLoopPreheader())
-    return Reductions;
+    return;
 
   for (PHINode &Phi : Header->phis()) {
     const auto *Ty = Phi.getType();
@@ -406,23 +410,24 @@
              dbgs() << "-> "; R.AccIntAdd->dump(); } );
-  return Reductions;
 }
 
-static void AddCandidateMAC(ParallelMACList &Candidates, const Instruction *Acc,
+static void AddCandidateMAC(BinOpSequenceList &Candidates,
+                            const Instruction *Acc,
                             Value *MulOp0, Value *MulOp1, int MulOpNum) {
   Instruction *Mul = dyn_cast<Instruction>(Acc->getOperand(MulOpNum));
   LLVM_DEBUG(dbgs() << "OK, found acc mul:\t"; Mul->dump());
-  ValueList VL;
-  if (IsNarrowSequence<16>(MulOp0, VL) &&
-      IsNarrowSequence<16>(MulOp1, VL)) {
+  ValueList LHS;
+  ValueList RHS;
+  if (IsNarrowSequence<16>(MulOp0, LHS) &&
+      IsNarrowSequence<16>(MulOp1, RHS)) {
     LLVM_DEBUG(dbgs() << "OK, found narrow mul: "; Mul->dump());
-    Candidates.push_back(ParallelMAC(Mul, VL));
+    Candidates.push_back(BinOpSequence(Mul, LHS, RHS));
   }
 }
 
-static ParallelMACList MatchParallelMACs(Reduction &R) {
-  ParallelMACList Candidates;
+static void MatchParallelMACSequences(Reduction &R,
+                                      BinOpSequenceList &Candidates) {
   const Instruction *Acc = R.AccIntAdd;
   Value *A, *MulOp0, *MulOp1;
   LLVM_DEBUG(dbgs() << "\n- Analysing:\t"; Acc->dump());
@@ -448,20 +453,27 @@
   // Because we start at the bottom of the chain, and we work our way up,
   // the muls are added in reverse program order to the list.
   std::reverse(Candidates.begin(), Candidates.end());
-  return Candidates;
 }
 
 // Collects all instructions that are not part of the MAC chains, which is the
 // set of instructions that can potentially alias with the MAC operands.
-static Instructions AliasCandidates(BasicBlock *Header,
-                                    ParallelMACList &MACCandidates) {
-  Instructions Aliases;
-  auto IsMACCandidate = [] (Instruction *I, ParallelMACList &MACCandidates) {
-    for (auto &MAC : MACCandidates)
-      for (auto *Val : MAC.VL)
-        if (I == MAC.Mul || Val == I)
+static void AliasCandidates(BasicBlock *Header,
+                            BinOpSequenceList &MACCandidates,
+                            Instructions &Aliases) {
+  auto IsMACCandidate = [] (Instruction *I, BinOpSequenceList &MACCandidates) {
+    for (auto &MAC : MACCandidates) {
+      if (I == MAC.Root)
+        return true;
+      for (auto *Val : MAC.LHS) {
+        if (Val == I)
          return true;
-    return false;
+      }
+      for (auto *Val : MAC.RHS) {
+        if (Val == I)
+          return true;
+      }
+    }
+    return false;
   };
   std::for_each(Header->begin(), Header->end(),
@@ -469,19 +481,18 @@
                   if (I.mayReadOrWriteMemory() &&
                       !IsMACCandidate(&I, MACCandidates))
                     Aliases.push_back(&I); });
-  return Aliases;
 }
 
 // This compares all instructions from the "alias candidates" set, i.e., all
 // instructions that are not part of the MAC-chain, with all instructions in
 // the MAC candidate set, to see if instructions are aliased.
 static bool AreAliased(AliasAnalysis *AA, Instructions AliasCandidates,
-                       ParallelMACList &MACCandidates) {
+                       BinOpSequenceList &MACCandidates) {
   LLVM_DEBUG(dbgs() << "Alias checks:\n");
   for (auto *I : AliasCandidates) {
     LLVM_DEBUG(dbgs() << "- "; I->dump());
     for (auto &MAC : MACCandidates) {
-      LLVM_DEBUG(dbgs() << "mul: "; MAC.Mul->dump());
+      LLVM_DEBUG(dbgs() << "mul: "; MAC.Root->dump());
       assert(MAC.MemLocs.size() >= 2 && "expecting at least 2 memlocs");
       for (auto &MemLoc : MAC.MemLocs) {
         if (isModOrRefSet(intersectModRef(AA->getModRefInfo(I, MemLoc),
@@ -496,19 +507,19 @@
   return false;
 }
 
-static bool SetMemoryLocations(ParallelMACList &Candidates) {
+static bool SetMemoryLocations(BinOpSequenceList &Candidates) {
   const auto Size = MemoryLocation::UnknownSize;
   for (auto &C : Candidates) {
     // A mul has 2 operands, and a narrow op consist of sext and a load; thus
     // we expect at least 4 items in this operand value list.
-    if (C.VL.size() < 4) {
+    if (C.LHS.size() < 2 || C.LHS.size() != C.RHS.size()) {
       LLVM_DEBUG(dbgs() << "Operand list too short.\n");
       return false;
     }
 
-    for (unsigned i = 0; i < C.VL.size(); i += 4) {
-      auto *LdOp0 = dyn_cast<LoadInst>(C.VL[i]);
-      auto *LdOp1 = dyn_cast<LoadInst>(C.VL[i+2]);
+    for (unsigned i = 0, e = C.LHS.size(); i < e; i += 2) {
+      auto *LdOp0 = dyn_cast<LoadInst>(C.LHS[i]);
+      auto *LdOp1 = dyn_cast<LoadInst>(C.RHS[i]);
 
       if (!LdOp0 || !LdOp1)
         return false;
@@ -560,13 +571,16 @@
                 dbgs() << "Loop info:\n\n"; L->dump());
 
   bool Changed = false;
-  ReductionList Reductions = MatchReductions(F, L, Header);
+  ReductionList Reductions;
+  MatchReductions(F, L, Header, Reductions);
 
   for (auto &R : Reductions) {
-    ParallelMACList MACCandidates = MatchParallelMACs(R);
+    BinOpSequenceList MACCandidates;
+    MatchParallelMACSequences(R, MACCandidates);
    if (!SetMemoryLocations(MACCandidates))
       continue;
 
-    Instructions Aliases = AliasCandidates(Header, MACCandidates);
+    Instructions Aliases;
+    AliasCandidates(Header, MACCandidates, Aliases);
     if (AreAliased(AA, Aliases, MACCandidates))
       continue;
 
     PMACPairList PMACPairs = CreateParallelMACPairs(MACCandidates);