Index: llvm/include/llvm/Analysis/IRSimilarityIdentifier.h =================================================================== --- llvm/include/llvm/Analysis/IRSimilarityIdentifier.h +++ llvm/include/llvm/Analysis/IRSimilarityIdentifier.h @@ -705,6 +705,8 @@ /// \param [in] A - The first IRInstructionCandidate to compare. /// \param [in] B - The second IRInstructionCandidate to compare. + /// \param [in,out] OneToOne - A mapping of value numbers from candidate + /// \p A to candidate \B using the structure of the original instructions. /// \param [in,out] ValueNumberMappingA - A mapping of value numbers from /// candidate \p A to candidate \B. /// \param [in,out] ValueNumberMappingB - A mapping of value numbers from @@ -714,6 +716,7 @@ static bool compareStructure(const IRSimilarityCandidate &A, const IRSimilarityCandidate &B, + DenseMap &OneToOne, DenseMap> &ValueNumberMappingA, DenseMap> &ValueNumberMappingB); @@ -820,7 +823,8 @@ /// the canonical ordering in \p SourceCand. These are defined based on the /// found mappings in \p ToSourceMapping and \p FromSourceMapping. Both of /// these relationships should have the same information, just in opposite - /// directions. + /// directions. This has some amount of non-determinism, as there is no + /// fallback if a two pairs GVNs match one another. /// /// \param [in, out] SourceCand - The IRSimilarityCandidate to create a /// canonical numbering from. @@ -832,6 +836,49 @@ IRSimilarityCandidate &SourceCand, DenseMap> &ToSourceMapping, DenseMap> &FromSourceMapping); + + /// Create a mapping for the value numbering of the calling + /// IRSimilarityCandidate, to a different separate set of numbers, based on + /// the canonical ordering in \p SourceCand. These are defined based on the + /// found mappings in \p ToSourceMapping and \p FromSourceMapping. Both of + /// these relationships should have the same information, just in opposite + /// directions. Uses the \p OneToOne mapping from target candidate to \p + /// SourceCand GVNs to determine the mapping first for values with multiple + /// mappings. This mapping is created by the ordering of operands in the + /// instruction they are first seen in the candidates. + /// + /// \param [in, out] SourceCand - The IRSimilarityCandidate to create a + /// canonical numbering from. + /// \param [in,out] OneToOne - A mapping of value numbers from candidate + /// \p A to candidate \B using the structure of the original instructions. + /// \param ToSourceMapping - The mapping of value numbers from this candidate + /// to \p SourceCand. + /// \param FromSourceMapping - The mapping of value numbers from \p SoureCand + /// to this candidate. + void createCanonicalRelationFrom( + IRSimilarityCandidate &SourceCand, + DenseMap &OneToOne, + DenseMap> &ToSourceMapping, + DenseMap> &FromSourceMapping); + + /// Create a mapping for the value numbering of the calling + /// IRSimilarityCandidate, to a different separate set of numbers, based on + /// the canonical ordering in \p SourceCand. These are defined based on the + /// canonical mapping defined between \p SoureCandLarge and + /// \p TargetCandLarge. These IRSimilarityCandidates are already structurally + /// similar, ad fully encasulate the IRSimilarityCandidates in question. + /// These are used as a "bridge" from the \p SourceCand to the target. + /// + /// \param [in, out] SourceCand - The IRSimilarityCandidate to create a + /// canonical numbering from. + /// \param SoureCandLarge - The IRSimilarityCandidate fully containing + /// \p SourceCand. + /// \param TargetCandLarge - The IRSimilarityCandidate fully containing + /// this Candidate. + void createCanonicalRelationFrom( + IRSimilarityCandidate &SourceCand, + IRSimilarityCandidate &SourceCandLarge, + IRSimilarityCandidate &TargetCandLarge); /// \param [in,out] BBSet - The set to track the basic blocks. void getBasicBlocks(DenseSet &BBSet) const { Index: llvm/lib/Analysis/IRSimilarityIdentifier.cpp =================================================================== --- llvm/lib/Analysis/IRSimilarityIdentifier.cpp +++ llvm/lib/Analysis/IRSimilarityIdentifier.cpp @@ -754,7 +754,9 @@ const IRSimilarityCandidate &B) { DenseMap> MappingA; DenseMap> MappingB; - return IRSimilarityCandidate::compareStructure(A, B, MappingA, MappingB); + DenseMap OneToOne; + return IRSimilarityCandidate::compareStructure(A, B, OneToOne, MappingA, + MappingB); } typedef detail::zippy &, @@ -762,8 +764,13 @@ ArrayRef &> ZippedRelativeLocationsT; +typedef detail::zippy &, + ArrayRef &> + ZippedOperValsT; + bool IRSimilarityCandidate::compareStructure( const IRSimilarityCandidate &A, const IRSimilarityCandidate &B, + DenseMap &OneToOne, DenseMap> &ValueNumberMappingA, DenseMap> &ValueNumberMappingB) { if (A.getLength() != B.getLength()) @@ -814,6 +821,13 @@ std::make_pair(InstValB, DenseSet({InstValA}))); if (!WasInserted && !ValueMappingIt->second.contains(InstValA)) return false; + + ZippedOperValsT ZippedOperVals = zip(OperValsA, OperValsB); + for (const std::tuple &OperPair : ZippedOperVals) { + unsigned GVNSource = A.ValueToNumber.find(std::get<0>(OperPair))->second; + unsigned GVNTarg = B.ValueToNumber.find(std::get<1>(OperPair))->second; + OneToOne.insert(std::make_pair(GVNTarg, GVNSource)); + } // We have different paths for commutative instructions and non-commutative // instructions since commutative instructions could allow multiple mappings @@ -988,6 +1002,16 @@ IRSimilarityCandidate &SourceCand, DenseMap> &ToSourceMapping, DenseMap> &FromSourceMapping) { + DenseMap OneToOne; + createCanonicalRelationFrom(SourceCand, OneToOne, ToSourceMapping, + FromSourceMapping); +} + +void IRSimilarityCandidate::createCanonicalRelationFrom( + IRSimilarityCandidate &SourceCand, + DenseMap &OneToOne, + DenseMap> &ToSourceMapping, + DenseMap> &FromSourceMapping) { assert(SourceCand.CanonNumToNumber.size() != 0 && "Base canonical relationship is empty!"); assert(SourceCand.NumberToCanonNum.size() != 0 && @@ -1012,7 +1036,24 @@ // here to ensure a one-to-one mapping. if (GVNMapping.second.size() > 1) { bool Found = false; + DenseMap::iterator OTOIt = OneToOne.find(SourceGVN); + + // If possible use the One-to-One mapping from the original instructions + // to determine the mapping between global value numbers. + if (OTOIt != OneToOne.end()) { + DenseMap>::iterator It = + FromSourceMapping.find(OTOIt->second); + + if (It->second.contains(SourceGVN)) { + Found = true; + ResultGVN = OTOIt->second; + } + } + for (unsigned Val : GVNMapping.second) { + if (Found) + break; + // We make sure the target value number hasn't already been reserved. if (UsedGVNs.contains(Val)) continue; @@ -1079,6 +1120,94 @@ } } +void IRSimilarityCandidate::createCanonicalRelationFrom( + IRSimilarityCandidate &SourceCand, IRSimilarityCandidate &SourceCandLarge, + IRSimilarityCandidate &TargetCandLarge) { + assert(SourceCand.CanonNumToNumber.size() != 0 && + "Canonical Relationship is non-empty"); + assert(SourceCand.NumberToCanonNum.size() != 0 && + "Canonical Relationship is non-empty"); + + assert(SourceCandLarge.CanonNumToNumber.size() != 0 && + "Canonical Relationship is non-empty"); + assert(SourceCandLarge.NumberToCanonNum.size() != 0 && + "Canonical Relationship is non-empty"); + + assert(TargetCandLarge.CanonNumToNumber.size() != 0 && + "Canonical Relationship is non-empty"); + assert(TargetCandLarge.NumberToCanonNum.size() != 0 && + "Canonical Relationship is non-empty"); + + assert(CanonNumToNumber.size() == 0 && "Canonical Relationship is non-empty"); + assert(NumberToCanonNum.size() == 0 && "Canonical Relationship is non-empty"); + + // We're going to use the larger candidates as a "bridge" to create the + // canonical number for the target candidate since we have idetified two + // candidates as subsequences of larger sequences, and therefore must be + // structurally similar. + for (std::pair ValueNumPair : ValueToNumber) { + Value *CurrVal = ValueNumPair.first; + unsigned TargetCandGVN = ValueNumPair.second; + + // Find the numbering in the large candidate that surrounds the + // current candidate. + Optional OLargeTargetGVN = TargetCandLarge.getGVN(CurrVal); + assert(OLargeTargetGVN.has_value() && "GVN not found for Value"); + + // Get the canonical numbering in the large target candidate. + Optional OTargetCandCanon = + TargetCandLarge.getCanonicalNum(OLargeTargetGVN.value()); + assert(OTargetCandCanon.has_value() && + "Canononical Number not found for GVN"); + + // Get the GVN in the large source candidate from the canonical numbering. + Optional OLargeSourceGVN = + SourceCandLarge.fromCanonicalNum(OTargetCandCanon.value()); + assert(OLargeSourceGVN.has_value() && + "GVN Number not found for Canonical Number"); + + // Get the Value from the GVN in the large source candidate. + Optional OLargeSourceV = + SourceCandLarge.fromGVN(OLargeSourceGVN.value()); + assert(OLargeSourceV.has_value() && "Value not found for GVN"); + + // Get the GVN number for the Value in the source candidate. + Optional OSourceGVN = + SourceCand.getGVN(OLargeSourceV.value()); + if (!OSourceGVN.has_value()) { + TargetCandLarge.frontInstruction()->getParent()->getParent()->getParent()->dump(); + errs() << "------\n"; + CurrVal->dump(); + TargetCandLarge.frontInstruction()->dump(); + TargetCandLarge.backInstruction()->dump(); + frontInstruction()->dump(); + backInstruction()->dump(); + errs() << "------\n"; + errs() << "------\n"; + OLargeSourceV.value()->dump(); + SourceCandLarge.frontInstruction()->dump(); + SourceCandLarge.backInstruction()->dump(); + SourceCand.frontInstruction()->dump(); + SourceCand.backInstruction()->dump(); + errs() << "------\n"; + + } + assert(OSourceGVN.has_value() && "GVN Number not found for Value"); + + // Get the canonical numbering from the GVN/ + Optional OSourceCanon = + SourceCand.getCanonicalNum(OSourceGVN.value()); + assert(OSourceCanon.has_value() && "Canon Number not found for GVN"); + + // Insert the canonical numbering and GVN pair into their respective + // mappings. + CanonNumToNumber.insert( + std::make_pair(OSourceCanon.value(), TargetCandGVN)); + NumberToCanonNum.insert( + std::make_pair(TargetCandGVN, OSourceCanon.value())); + } +} + void IRSimilarityCandidate::createCanonicalMappingFor( IRSimilarityCandidate &CurrCand) { assert(CurrCand.CanonNumToNumber.size() == 0 && @@ -1096,6 +1225,76 @@ } } +/// Look for larger IRSimilarityCandidates From the previously matched +/// IRSimilarityCandidates that fully contain \p CandA or \p CandB. If there is +/// an overlap, return a pair of structurally similar, larger +/// IRSimilarityCandidates. +/// +/// \param [in] CandA - The first candidate we are trying to determine the +/// structure of. +/// \param [in] CandB - The second candidate we are trying to determine the +/// structure of. +/// \param [in] IndexToIncludedCand - Mapping of index of the an instruction in +/// a circuit to the IRSimilarityCandidates that include this instruction. +/// \param [in] CandToOverallGroup - Mapping of IRSimilarityCandidate to a +/// number representing the structural group assigned to it. +static Optional> +CheckLargerCands( + IRSimilarityCandidate &CandA, IRSimilarityCandidate &CandB, + DenseMap> &IndexToIncludedCand, + DenseMap &CandToGroup) { + DenseMap IncludedGroupAndCandA; + DenseMap IncludedGroupAndCandB; + DenseSet IncludedGroupsA; + DenseSet IncludedGroupsB; + + // Find the overall similarity group numbers that fully contain the candidate, + // and record the larger candidate for each group. + DenseMap>::iterator IdxIt; + IdxIt = IndexToIncludedCand.find(CandA.getStartIdx()); + Optional> Result; + if (IdxIt == IndexToIncludedCand.end()) + return Result; + for (IRSimilarityCandidate *MatchedCand : IdxIt->second) { + if (MatchedCand->getStartIdx() > CandA.getStartIdx() || + (MatchedCand->getEndIdx() < CandA.getEndIdx())) + continue; + unsigned GroupNum = CandToGroup.find(MatchedCand)->second; + IncludedGroupAndCandA.insert(std::make_pair(GroupNum, MatchedCand)); + IncludedGroupsA.insert(GroupNum); + } + + // Find the overall similarity group numbers that fully contain the next + // candidate, and record the larger candidate for each group. + IdxIt = IndexToIncludedCand.find(CandB.getStartIdx()); + if (IdxIt == IndexToIncludedCand.end()) + return Result; + for (IRSimilarityCandidate *MatchedCand : IdxIt->second) { + if (MatchedCand->getStartIdx() > CandB.getStartIdx() || + MatchedCand->getEndIdx() < CandB.getEndIdx()) + continue; + unsigned GroupNum = CandToGroup.find(MatchedCand)->second; + IncludedGroupAndCandB.insert(std::make_pair(GroupNum, MatchedCand)); + IncludedGroupsB.insert(GroupNum); + } + + // Find the intersection between the two groups, these are the groups where + // the larger candidates exist. + set_intersect(IncludedGroupsA, IncludedGroupsB); + + // If there is no intersection between the sets, then we cannot determine + // whether or not there is a match. + if (IncludedGroupsA.empty()) + return Result; + + // Create a pair that contains the larger candidates. + DenseMap::iterator ItA, ItB; + ItA = IncludedGroupAndCandA.find(*IncludedGroupsA.begin()); + ItB = IncludedGroupAndCandB.find(*IncludedGroupsA.begin()); + Result = std::make_pair(ItA->second, ItB->second); + return Result; +} + /// From the list of IRSimilarityCandidates, perform a comparison between each /// IRSimilarityCandidate to determine if there are overlapping /// IRInstructionData, or if they do not have the same structure. @@ -1105,9 +1304,16 @@ /// \param [out] StructuralGroups - the mapping of unsigned integers to vector /// of IRSimilarityCandidates where each of the IRSimilarityCandidates in the /// vector are structurally similar to one another. +/// \param [in] IndexToIncludedCand - Mapping of index of the an instruction in +/// a circuit to the IRSimilarityCandidates that include this instruction. +/// \param [in] CandToOverallGroup - Mapping of IRSimilarityCandidate to a +/// number representing the structural group assigned to it. static void findCandidateStructures( std::vector &CandsForRepSubstring, - DenseMap &StructuralGroups) { + DenseMap &StructuralGroups, + DenseMap> &IndexToIncludedCand, + DenseMap &CandToOverallGroup + ) { std::vector::iterator CandIt, CandEndIt, InnerCandIt, InnerCandEndIt; @@ -1133,6 +1339,7 @@ // compatibility with other instructions DenseMap> ValueNumberMappingA; DenseMap> ValueNumberMappingB; + DenseMap OneToOne; for (CandIt = CandsForRepSubstring.begin(), CandEndIt = CandsForRepSubstring.end(); CandIt != CandEndIt; CandIt++) { @@ -1170,17 +1377,37 @@ if (CandToGroupItInner != CandToGroup.end()) continue; + // Check if we have found structural similarity between two candidates + // that fully contains the first and second candidates. + Optional> + LargerPair = CheckLargerCands( + *CandIt, *InnerCandIt, IndexToIncludedCand, CandToOverallGroup); + + // If a pair was found, it means that we can assume that these smaller + // substrings are also structurally similar. Use the larger candidates to + // determine the canonical mapping between the two sections. + if (LargerPair.has_value()) { + SameStructure = true; + InnerCandIt->createCanonicalRelationFrom( + *CandIt, *LargerPair.value().first, *LargerPair.value().second); + CandToGroup.insert(std::make_pair(&*InnerCandIt, OuterGroupNum)); + CurrentGroupPair->second.push_back(*InnerCandIt); + continue; + } + // Otherwise we determine if they have the same structure and add it to // vector if they match. ValueNumberMappingA.clear(); ValueNumberMappingB.clear(); + OneToOne.clear(); SameStructure = IRSimilarityCandidate::compareStructure( - *CandIt, *InnerCandIt, ValueNumberMappingA, ValueNumberMappingB); + *CandIt, *InnerCandIt, OneToOne, ValueNumberMappingA, + ValueNumberMappingB); if (!SameStructure) continue; - InnerCandIt->createCanonicalRelationFrom(*CandIt, ValueNumberMappingA, - ValueNumberMappingB); + InnerCandIt->createCanonicalRelationFrom( + *CandIt, OneToOne, ValueNumberMappingA, ValueNumberMappingB); CandToGroup.insert(std::make_pair(&*InnerCandIt, OuterGroupNum)); CurrentGroupPair->second.push_back(*InnerCandIt); } @@ -1196,24 +1423,58 @@ std::vector NewCandidateGroups; DenseMap StructuralGroups; + DenseMap> IndexToIncludedCand; + DenseMap CandToGroup; // Iterate over the subsequences found by the Suffix Tree to create // IRSimilarityCandidates for each repeated subsequence and determine which // instances are structurally similar to one another. - for (SuffixTree::RepeatedSubstring &RS : ST) { + + // Sort the suffix tree from longest substring to shortest. + std::vector RSes; + for (SuffixTree::RepeatedSubstring &RS : ST) + RSes.push_back(RS); + + llvm::stable_sort(RSes, [](const SuffixTree::RepeatedSubstring &LHS, + const SuffixTree::RepeatedSubstring &RHS) { + return LHS.Length > RHS.Length; + }); + for (SuffixTree::RepeatedSubstring &RS : RSes) { createCandidatesFromSuffixTree(Mapper, InstrList, IntegerMapping, RS, CandsForRepSubstring); if (CandsForRepSubstring.size() < 2) continue; - findCandidateStructures(CandsForRepSubstring, StructuralGroups); - for (std::pair &Group : StructuralGroups) + findCandidateStructures(CandsForRepSubstring, StructuralGroups, + IndexToIncludedCand, CandToGroup); + for (std::pair &Group : StructuralGroups) { // We only add the group if it contains more than one // IRSimilarityCandidate. If there is only one, that means there is no // other repeated subsequence with the same structure. - if (Group.second.size() > 1) + if (Group.second.size() > 1) { SimilarityCandidates->push_back(Group.second); + // Iterate over each candidate in the group, and add an entry for each + // instruction included with a mapping to a set of + // IRSimilarityCandidates that include that instruction. + for (IRSimilarityCandidate &IRCand : SimilarityCandidates->back()) { + for (unsigned Idx = IRCand.getStartIdx(), Edx = IRCand.getEndIdx(); + Idx <= Edx; ++Idx) { + DenseMap>::iterator + IdIt; + IdIt = IndexToIncludedCand.find(Idx); + bool Inserted = false; + if (IdIt == IndexToIncludedCand.end()) + std::tie(IdIt, Inserted) = IndexToIncludedCand.insert( + std::make_pair(Idx, DenseSet())); + IdIt->second.insert(&IRCand); + } + // Add mapping of candidate to the overall similarity group number. + CandToGroup.insert( + std::make_pair(&IRCand, SimilarityCandidates->size() - 1)); + } + } + } CandsForRepSubstring.clear(); StructuralGroups.clear();