Index: llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp
+++ llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp
@@ -43,38 +43,56 @@
 STATISTIC(NumSMLAD , "Number of smlad instructions generated");

 namespace {
-  struct ParallelMAC;
+  struct OpChain;
+  struct BinOpChain;
   struct Reduction;

-  using ParallelMACList = SmallVector<ParallelMAC, 8>;
+  using OpChainList     = SmallVector<OpChain*, 8>;
   using ReductionList   = SmallVector<Reduction, 8>;
   using ValueList       = SmallVector<Value*, 8>;
   using MemInstList     = SmallVector<Instruction*, 8>;
-  using PMACPair        = std::pair<ParallelMAC*,ParallelMAC*>;
+  using PMACPair        = std::pair<BinOpChain*,BinOpChain*>;
   using PMACPairList    = SmallVector<PMACPair, 8>;
   using Instructions    = SmallVector<Instruction*,16>;
   using MemLocList      = SmallVector<MemoryLocation, 4>;

-  // 'ParallelMAC' and 'Reduction' are just some bookkeeping data structures.
+  struct OpChain {
+    Instruction *Root;
+    ValueList AllValues;
+    MemInstList VecLd;    // List of all load instructions.
+    MemLocList MemLocs;   // All memory locations read by this tree.
+    bool ReadOnly = true;
+
+    OpChain(Instruction *I, ValueList &vl) : Root(I), AllValues(vl) { }
+
+    void SetMemoryLocations() {
+      const auto Size = MemoryLocation::UnknownSize;
+      for (auto *V : AllValues) {
+        if (auto *I = dyn_cast<Instruction>(V)) {
+          if (I->mayWriteToMemory())
+            ReadOnly = false;
+          if (auto *Ld = dyn_cast<LoadInst>(V))
+            MemLocs.push_back(MemoryLocation(Ld->getPointerOperand(), Size));
+        }
+      }
+    }
+
+    unsigned size() const { return AllValues.size(); }
+  };
+
+  // 'BinOpChain' and 'Reduction' are just some bookkeeping data structures.
   // 'Reduction' contains the phi-node and accumulator statement from where we
-  // start pattern matching, and 'ParallelMAC' the multiplication
+  // start pattern matching, and 'BinOpChain' the multiplication
   // instructions that are candidates for parallel execution.
-  struct ParallelMAC {
-    Instruction *Mul;
-    ValueList VL;        // List of all (narrow) operands of this Mul
-    MemInstList VecLd;   // List of all load instructions of this Mul
-    MemLocList MemLocs;  // All memory locations read by this Mul
-
-    // The MAC-chains we currently recognise are simple chains that accumulate
-    // their results with a reducing integer add statement, and consist of
-    // a chain of adds and muls, which have only sext and load instructions as
-    // operands. Thus, these chains don't write memory. We check that this is
-    // true when we collect the operands, and use this in alias analysis checks
-    // that different parallel MACs don't interfere with each other.
-    bool ReadOnly;
-
-    ParallelMAC(Instruction *I, ValueList &V, bool RdOnly)
-      : Mul(I), VL(V), ReadOnly(RdOnly) {};
+  struct BinOpChain : public OpChain {
+    ValueList LHS;        // List of all (narrow) left hand operands.
+    ValueList RHS;        // List of all (narrow) right hand operands.
+
+    BinOpChain(Instruction *I, ValueList &lhs, ValueList &rhs) :
+      OpChain(I, lhs), LHS(lhs), RHS(rhs) {
+        for (auto *V : RHS)
+          AllValues.push_back(V);
+      }
   };

   struct Reduction {
@@ -83,7 +101,7 @@
     Instruction *AccIntAdd;         // The accumulating integer add statement,
                                     // i.e, the reduction statement.
-    ParallelMACList MACCandidates;  // The MAC candidates associated with
+    OpChainList MACCandidates;      // The MAC candidates associated with
                                     // this reduction statement.

     Reduction (PHINode *P, Instruction *Acc) : Phi(P), AccIntAdd(Acc) { };
   };
@@ -100,7 +118,7 @@
     bool InsertParallelMACs(Reduction &Reduction, PMACPairList &PMACPairs);
     bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
-    PMACPairList CreateParallelMACPairs(ParallelMACList &Candidates);
+    PMACPairList CreateParallelMACPairs(OpChainList &Candidates);
     Instruction *CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
                                  Instruction *Acc, Instruction *InsertAfter);
@@ -303,7 +321,7 @@
 }

 PMACPairList
-ARMParallelDSP::CreateParallelMACPairs(ParallelMACList &Candidates) {
+ARMParallelDSP::CreateParallelMACPairs(OpChainList &Candidates) {
   const unsigned Elems = Candidates.size();
   PMACPairList PMACPairs;
@@ -314,10 +332,10 @@
   // We can compare all elements, but then we need to compare and evaluate
   // different solutions.
   for(unsigned i=0; i<Elems; i+=2) {
-    ParallelMAC &PMul0 = Candidates[i];
-    ParallelMAC &PMul1 = Candidates[i+1];
-    const Instruction *Mul0 = PMul0.Mul;
-    const Instruction *Mul1 = PMul1.Mul;
+    BinOpChain *PMul0 = static_cast<BinOpChain*>(Candidates[i]);
+    BinOpChain *PMul1 = static_cast<BinOpChain*>(Candidates[i+1]);
+    const Instruction *Mul0 = PMul0->Root;
+    const Instruction *Mul1 = PMul1->Root;

     if (Mul0 == Mul1)
       continue;
@@ -326,10 +344,13 @@
                dbgs() << "- "; Mul0->dump();
                dbgs() << "- "; Mul1->dump());

-    const ValueList &VL0 = PMul0.VL;
-    const ValueList &VL1 = PMul1.VL;
+    const ValueList &Mul0_LHS = PMul0->LHS;
+    const ValueList &Mul0_RHS = PMul0->RHS;
+    const ValueList &Mul1_LHS = PMul1->LHS;
+    const ValueList &Mul1_RHS = PMul1->RHS;

-    if (!AreSymmetrical(VL0, VL1))
+    if (!AreSymmetrical(Mul0_LHS, Mul1_LHS) ||
+        !AreSymmetrical(Mul0_RHS, Mul1_RHS))
       continue;

     LLVM_DEBUG(dbgs() << "OK: mul operands list match:\n");
@@ -337,23 +358,23 @@
     // that its two pairs of consecutive loads, then these can be transformed
     // into two wider loads and the users can be replaced with DSP
     // intrinsics.
-    for (unsigned x = 0; x < VL0.size(); x += 4) {
-      auto *Ld0 = dyn_cast<LoadInst>(VL0[x]);
-      auto *Ld1 = dyn_cast<LoadInst>(VL1[x]);
-      auto *Ld2 = dyn_cast<LoadInst>(VL0[x+2]);
-      auto *Ld3 = dyn_cast<LoadInst>(VL1[x+2]);
+    for (unsigned x = 0; x < Mul0_LHS.size(); x += 2) {
+      auto *Ld0 = dyn_cast<LoadInst>(Mul0_LHS[x]);
+      auto *Ld1 = dyn_cast<LoadInst>(Mul1_LHS[x]);
+      auto *Ld2 = dyn_cast<LoadInst>(Mul0_RHS[x]);
+      auto *Ld3 = dyn_cast<LoadInst>(Mul1_RHS[x]);

       LLVM_DEBUG(dbgs() << "Looking at operands " << x << ":\n";
-                 dbgs() << "\t mul1: "; VL0[x]->dump();
-                 dbgs() << "\t mul2: "; VL1[x]->dump();
+                 dbgs() << "\t mul1: "; Mul0_LHS[x]->dump();
+                 dbgs() << "\t mul2: "; Mul1_LHS[x]->dump();
                  dbgs() << "and operands " << x + 2 << ":\n";
-                 dbgs() << "\t mul1: "; VL0[x+2]->dump();
-                 dbgs() << "\t mul2: "; VL1[x+2]->dump());
+                 dbgs() << "\t mul1: "; Mul0_RHS[x]->dump();
+                 dbgs() << "\t mul2: "; Mul1_RHS[x]->dump());

-      if (AreSequentialLoads(Ld0, Ld1, Candidates[i].VecLd) &&
-          AreSequentialLoads(Ld2, Ld3, Candidates[i+1].VecLd)) {
+      if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd) &&
+          AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
         LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
-        PMACPairs.push_back(std::make_pair(&PMul0, &PMul1));
+        PMACPairs.push_back(std::make_pair(PMul0, PMul1));
       }
     }
   }
@@ -367,8 +388,8 @@
   for (auto &Pair : PMACPairs) {
     LLVM_DEBUG(dbgs() << "Found parallel MACs!!\n";
-               dbgs() << "- "; Pair.first->Mul->dump();
-               dbgs() << "- "; Pair.second->Mul->dump());
+               dbgs() << "- "; Pair.first->Root->dump();
+               dbgs() << "- "; Pair.second->Root->dump());
     auto *VecLd0 = cast<LoadInst>(Pair.first->VecLd[0]);
     auto *VecLd1 = cast<LoadInst>(Pair.second->VecLd[0]);
     Acc = CreateSMLADCall(VecLd0, VecLd1, Acc, InsertAfter);
@@ -383,9 +404,8 @@
   return false;
 }

-static ReductionList MatchReductions(Function &F, Loop *TheLoop,
-                                     BasicBlock *Header) {
-  ReductionList Reductions;
+static void MatchReductions(Function &F, Loop *TheLoop, BasicBlock *Header,
+                            ReductionList &Reductions) {
   RecurrenceDescriptor RecDesc;
   const bool HasFnNoNaNAttr =
     F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
@@ -394,7 +414,7 @@
   // We need a preheader as getIncomingValueForBlock assumes there is one.
   if (!TheLoop->getLoopPreheader()) {
     LLVM_DEBUG(dbgs() << "No preheader found, bailing out\n");
-    return Reductions;
+    return;
   }

   for (PHINode &Phi : Header->phis()) {
@@ -418,36 +438,29 @@
   LLVM_DEBUG(
     dbgs() << "\nAccumulating integer additions (reductions) found:\n";
-    for (auto R : Reductions) {
+    for (auto &R : Reductions) {
       dbgs() << "- "; R.Phi->dump();
       dbgs() << "-> "; R.AccIntAdd->dump();
     }
   );
-  return Reductions;
 }

-static void AddMACCandidate(ParallelMACList &Candidates, const Instruction *Acc,
+static void AddMACCandidate(OpChainList &Candidates,
+                            const Instruction *Acc,
                             Value *MulOp0, Value *MulOp1, int MulOpNum) {
   Instruction *Mul = dyn_cast<Instruction>(Acc->getOperand(MulOpNum));
   LLVM_DEBUG(dbgs() << "OK, found acc mul:\t"; Mul->dump());
-  ValueList VL;
-  if (IsNarrowSequence<16>(MulOp0, VL) &&
-      IsNarrowSequence<16>(MulOp1, VL)) {
+  ValueList LHS;
+  ValueList RHS;
+  if (IsNarrowSequence<16>(MulOp0, LHS) &&
+      IsNarrowSequence<16>(MulOp1, RHS)) {
     LLVM_DEBUG(dbgs() << "OK, found narrow mul: "; Mul->dump());
-
-    bool MayWriteMem = false;
-    for (auto &V : VL) {
-      if (dyn_cast<Instruction>(V)->mayWriteToMemory()) {
-        MayWriteMem = true;
-        break;
-      }
-    }
-    Candidates.push_back(ParallelMAC(Mul, VL, !MayWriteMem));
+    Candidates.push_back(new BinOpChain(Mul, LHS, RHS));
   }
 }

-static ParallelMACList MatchParallelMACs(Reduction &R) {
-  ParallelMACList Candidates;
+static void MatchParallelMACSequences(Reduction &R,
+                                      OpChainList &Candidates) {
   const Instruction *Acc = R.AccIntAdd;
   Value *A, *MulOp0, *MulOp1;
   LLVM_DEBUG(dbgs() << "\n- Analysing:\t"; Acc->dump());
@@ -473,7 +486,6 @@
   // Because we start at the bottom of the chain, and we work our way up,
   // the muls are added in reverse program order to the list.
   std::reverse(Candidates.begin(), Candidates.end());
-  return Candidates;
 }

 // Collects all instructions that are not part of the MAC chains, which is the
@@ -492,23 +504,23 @@
 // the memory locations accessed by the MAC-chains.
 // TODO: we need the read statements when we accept more complicated chains.
 static bool AreAliased(AliasAnalysis *AA, Instructions &Reads,
-                       Instructions &Writes, ParallelMACList &MACCandidates) {
+                       Instructions &Writes, OpChainList &MACCandidates) {
   LLVM_DEBUG(dbgs() << "Alias checks:\n");
-  for (auto &MAC : MACCandidates) {
-    LLVM_DEBUG(dbgs() << "mul: "; MAC.Mul->dump());
+  for (auto *MAC : MACCandidates) {
+    LLVM_DEBUG(dbgs() << "mul: "; MAC->Root->dump());

     // At the moment, we allow only simple chains that only consist of reads,
     // accumulate their result with an integer add, and thus that don't write
     // memory, and simply bail if they do.
-    if (!MAC.ReadOnly)
+    if (!MAC->ReadOnly)
       return true;

     // Now for all writes in the basic block, check that they don't alias with
     // the memory locations accessed by our MAC-chain:
     for (auto *I : Writes) {
       LLVM_DEBUG(dbgs() << "- "; I->dump());
-      assert(MAC.MemLocs.size() >= 2 && "expecting at least 2 memlocs");
-      for (auto &MemLoc : MAC.MemLocs) {
+      assert(MAC->MemLocs.size() >= 2 && "expecting at least 2 memlocs");
+      for (auto &MemLoc : MAC->MemLocs) {
         if (isModOrRefSet(intersectModRef(AA->getModRefInfo(I, MemLoc),
                                           ModRefInfo::ModRef))) {
           LLVM_DEBUG(dbgs() << "Yes, aliases found\n");
@@ -522,24 +534,22 @@
   return false;
 }

-static bool SetMemoryLocations(ParallelMACList &Candidates) {
-  const auto Size = MemoryLocation::UnknownSize;
-  for (auto &C : Candidates) {
+static bool CheckMACMemory(OpChainList &Candidates) {
+  for (auto *C : Candidates) {
     // A mul has 2 operands, and a narrow op consist of sext and a load; thus
     // we expect at least 4 items in this operand value list.
-    if (C.VL.size() < 4) {
+    if (C->size() < 4) {
       LLVM_DEBUG(dbgs() << "Operand list too short.\n");
       return false;
     }
-
-    for (unsigned i = 0; i < C.VL.size(); i += 4) {
-      auto *LdOp0 = dyn_cast<LoadInst>(C.VL[i]);
-      auto *LdOp1 = dyn_cast<LoadInst>(C.VL[i+2]);
-      if (!LdOp0 || !LdOp1)
+    C->SetMemoryLocations();
+    ValueList &LHS = static_cast<BinOpChain*>(C)->LHS;
+    ValueList &RHS = static_cast<BinOpChain*>(C)->RHS;
+
+    // Use +=2 to skip over the expected extend instructions.
+    for (unsigned i = 0, e = LHS.size(); i < e; i += 2) {
+      if (!isa<LoadInst>(LHS[i]) || !isa<LoadInst>(RHS[i]))
         return false;
-
-      C.MemLocs.push_back(MemoryLocation(LdOp0->getPointerOperand(), Size));
-      C.MemLocs.push_back(MemoryLocation(LdOp1->getPointerOperand(), Size));
     }
   }
   return true;
@@ -584,17 +594,20 @@
              dbgs() << "Loop info:\n\n"; L->dump());

   bool Changed = false;
-  ReductionList Reductions = MatchReductions(F, L, Header);
+  ReductionList Reductions;
+  MatchReductions(F, L, Header, Reductions);

   for (auto &R : Reductions) {
-    ParallelMACList MACCandidates = MatchParallelMACs(R);
-    if (!SetMemoryLocations(MACCandidates))
+    OpChainList MACCandidates;
+    MatchParallelMACSequences(R, MACCandidates);
+    if (!CheckMACMemory(MACCandidates))
       continue;
+    R.MACCandidates = MACCandidates;

     LLVM_DEBUG(dbgs() << "MAC candidates:\n";
                for (auto &M : R.MACCandidates)
-                 M.Mul->dump();
+                 M->Root->dump();
                dbgs() << "\n";);
   }
@@ -609,6 +622,8 @@
       return false;
     PMACPairList PMACPairs = CreateParallelMACPairs(R.MACCandidates);
     Changed |= InsertParallelMACs(R, PMACPairs);
+    for (auto *C : R.MACCandidates)
+      delete C;
   }

   LLVM_DEBUG(if (Changed) dbgs() << "Header block:\n"; Header->dump(););
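
For context, the loop shape this pass targets is a reduction over sign-extended 16-bit operands. Below is a minimal sketch in C; the kernel and its name are hypothetical, not part of the patch. Each product of two sign-extended i16 loads is one MAC candidate, and CreateParallelMACPairs pairs up two candidates whose loads sit next to each other in memory.

// Dot-product kernel of the shape MatchReductions and
// MatchParallelMACSequences recognise: narrow (16-bit) loads, sign-extended
// and multiplied, feeding a 32-bit accumulating integer add (the reduction).
// n is assumed even for brevity.
int dot_product(const short *a, const short *b, int n) {
  int acc = 0;
  for (int i = 0; i < n; i += 2) {
    acc += a[i] * b[i];          // first MAC candidate
    acc += a[i + 1] * b[i + 1];  // second candidate: its loads are sequential
                                 // with the first pair, so AreSequentialLoads
                                 // accepts them and the pair maps to SMLAD
  }
  return acc;
}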
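
The transformation then rewrites each such pair into two widened 32-bit loads and one SMLAD call (at the IR level, the llvm.arm.smlad intrinsic that CreateSMLADCall inserts and the NumSMLAD statistic counts). A hand-written equivalent is sketched below, assuming clang's arm_acle.h on a target with the DSP extension, where int16x2_t is a 32-bit value holding two packed 16-bit lanes.

#include <arm_acle.h>
#include <string.h>

// Equivalent of the transformed loop: one 32-bit load replaces each pair of
// sequential 16-bit loads, and __smlad multiplies both 16-bit lanes and adds
// the two products to the accumulator in a single operation.
int dot_product_smlad(const short *a, const short *b, int n) {
  int acc = 0;
  for (int i = 0; i < n; i += 2) {
    int16x2_t va, vb;                // two 16-bit lanes packed into 32 bits
    memcpy(&va, &a[i], sizeof(va));  // wide load of a[i] and a[i+1]
    memcpy(&vb, &b[i], sizeof(vb));  // wide load of b[i] and b[i+1]
    acc = __smlad(va, vb, acc);
  }
  return acc;
}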