diff --git a/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h b/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h --- a/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h +++ b/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h @@ -41,6 +41,9 @@ // An IRSimilarityCandidate is a region of IRInstructionData (wrapped // Instructions), usually used to denote a region of similarity has been found. // +// A SimilarityGroup is a set of IRSimilarityCandidates that are structurally +// similar to one another. +// //===----------------------------------------------------------------------===// #ifndef LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H @@ -575,6 +578,122 @@ iterator end() const { return std::next(iterator(back())); } }; +typedef std::vector SimilarityGroup; +typedef std::vector SimilarityGroupList; + +/// This class puts all the pieces of the IRInstructionData, +/// IRInstructionMapper, IRSimilarityCandidate together. +/// +/// It first feeds the Module or vector of Modules into the IRInstructionMapper, +/// and puts all the mapped instructions into a single long list of +/// IRInstructionData. +/// +/// The list of unsigned integers is given to the Suffix Tree or similar data +/// structure to find repeated subsequences. We construct an +/// IRSimilarityCandidate for each instance of the subsequence. We compare them +/// against one another since These repeated subsequences can have different +/// structure. For each different kind of structure found, we create a +/// similarity group. +/// +/// If we had four IRSimilarityCandidates A, B, C, and D where A, B and D are +/// structurally similar to one another, while C is different we would have two +/// SimilarityGroups: +/// +/// SimilarityGroup 1: SimilarityGroup 2 +/// A, B, D C +/// +/// A list of the different similarity groups is then returned after +/// analyzing the module. +class IRSimilarityIdentifier { +public: + IRSimilarityIdentifier() + : Mapper(&InstDataAllocator, &InstDataListAllocator) {} + + /// \param M the module to find similarity in. + explicit IRSimilarityIdentifier(Module &M) + : Mapper(&InstDataAllocator, &InstDataListAllocator) { + findSimilarity(M); + } + +private: + /// Map the instructions in the module to unsigned integers, using mapping + /// already present in the Mapper if possible. + /// + /// \param [in] M Module - To map to integers. + /// \param [in,out] InstrList - The vector to append IRInstructionData to. + /// \param [in,out] IntegerMapping - The vector to append integers to. + void populateMapper(Module &M, std::vector &InstrList, + std::vector &IntegerMapping); + + /// Map the instructions in the modules vector to unsigned integers, using + /// mapping already present in the mapper if possible. + /// + /// \param [in] Modules - The list of modules to use to populate the mapper + /// \param [in,out] InstrList - The vector to append IRInstructionData to. + /// \param [in,out] IntegerMapping - The vector to append integers to. + void populateMapper(ArrayRef> &Modules, + std::vector &InstrList, + std::vector &IntegerMapping); + + /// Find the similarity candidates in \p InstrList and corresponding + /// \p UnsignedVec + /// + /// \param [in,out] InstrList - The vector to append IRInstructionData to. + /// \param [in,out] IntegerMapping - The vector to append integers to. + /// candidates found in the program. + void findCandidates(std::vector &InstrList, + std::vector &IntegerMapping); + +public: + // Find the IRSimilarityCandidates in the \p Modules and group by structural + // similarity in a SimilarityGroup, each group is returned in a + // SimilarityGroupList. + // + // \param [in] Modules - the modules to analyze. + // \returns The groups of similarity ranges found in the modules. + SimilarityGroupList & + findSimilarity(ArrayRef> Modules); + + // Find the IRSimilarityCandidates in the given Module grouped by structural + // similarity in a SimilarityGroup, contained inside a SimilarityGroupList. + // + // \param [in] M - the module to analyze. + // \returns The groups of similarity ranges found in the module. + SimilarityGroupList &findSimilarity(Module &M); + + // Clears \ref SimilarityCandidates if it is already filled by a previous run. + void resetSimilarityCandidates() { + // If we've already analyzed a Module or set of Modules, so we must clear + // the SimilarityCandidates to make sure we do not have only old values + // hanging around. + if (SimilarityCandidates.hasValue()) + SimilarityCandidates->clear(); + else + SimilarityCandidates = SimilarityGroupList(); + } + + // \returns The groups of similarity ranges found in the most recently passed + // set of modules. + Optional &getSimilarity() { + return SimilarityCandidates; + } + +private: + /// The allocator for IRInstructionData. + SpecificBumpPtrAllocator InstDataAllocator; + + /// The allocator for IRInstructionDataLists. + SpecificBumpPtrAllocator InstDataListAllocator; + + /// Map Instructions to unsigned integers and wraps the Instruction in an + /// instance of IRInstructionData. + IRInstructionMapper Mapper; + + /// The SimilarityGroups found with the most recent run of \ref + /// findSimilarity. None if there is no recent run. + Optional SimilarityCandidates; +}; + } // end namespace IRSimilarity } // end namespace llvm diff --git a/llvm/lib/Analysis/IRSimilarityIdentifier.cpp b/llvm/lib/Analysis/IRSimilarityIdentifier.cpp --- a/llvm/lib/Analysis/IRSimilarityIdentifier.cpp +++ b/llvm/lib/Analysis/IRSimilarityIdentifier.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/User.h" +#include "llvm/Support/SuffixTree.h" using namespace llvm; using namespace IRSimilarity; @@ -389,7 +390,6 @@ {B, OperValsB, ValueNumberMappingB})) return false; } - return true; } @@ -405,3 +405,242 @@ return DoesOverlap(A, B) || DoesOverlap(B, A); } + +void IRSimilarityIdentifier::populateMapper( + Module &M, std::vector &InstrList, + std::vector &IntegerMapping) { + + std::vector InstrListForModule; + std::vector IntegerMappingForModule; + // Iterate over the functions in the module to map each Instruction in each + // BasicBlock to an unsigned integer. + for (Function &F : M) { + + if (F.empty()) + continue; + + for (BasicBlock &BB : F) { + + if (BB.sizeWithoutDebug() < 2) + continue; + + // BB has potential to have similarity since it has a size greater than 2 + // and can therefore match other regions greater than 2. Map it to a list + // of unsigned integers. + Mapper.convertToUnsignedVec(BB, InstrListForModule, + IntegerMappingForModule); + } + } + + // Insert the InstrListForModule at the end of the overall InstrList so that + // we can have a long InstrList for the entire set of Modules being analyzed. + InstrList.insert(InstrList.end(), InstrListForModule.begin(), + InstrListForModule.end()); + // Do the same as above, but for IntegerMapping. + IntegerMapping.insert(IntegerMapping.end(), IntegerMappingForModule.begin(), + IntegerMappingForModule.end()); +} + +void IRSimilarityIdentifier::populateMapper( + ArrayRef> &Modules, + std::vector &InstrList, + std::vector &IntegerMapping) { + + // Iterate over, and map the instructions in each module. + for (const std::unique_ptr &M : Modules) + populateMapper(*M, InstrList, IntegerMapping); +} + +/// From a repeated subsequence, find all the different instances of the +/// subsequence from the \p InstrList, and create an IRSimilarityCandidate from +/// the IRInstructionData in subsequence. +/// +/// \param [in] Mapper - The instruction mapper for sanity checks. +/// \param [in] InstrList - The vector that holds the instruction data. +/// \param [in] IntegerMapping - The vector that holds the mapped integers. +/// \param [out] CandsForRepSubstring - The vector to store the generated +/// IRSimilarityCandidates. +static void createCandidatesFromSuffixTree( + IRInstructionMapper Mapper, std::vector &InstrList, + std::vector &IntegerMapping, SuffixTree::RepeatedSubstring &RS, + std::vector &CandsForRepSubstring) { + + unsigned StringLen = RS.Length; + unsigned NumFound = 0; + unsigned PreviousIdx = 0; + + // Create an IRSimilarityCandidate for instance of this subsequence \p RS. + for (const unsigned &StartIdx : RS.StartIndices) { + unsigned EndIdx = StartIdx + StringLen - 1; + + // Check that this subsequence does not contain an illegal instruction. + bool ContainsIllegal = false; + for (unsigned CurrIdx = StartIdx; CurrIdx <= EndIdx; CurrIdx++) { + unsigned Key = IntegerMapping[CurrIdx]; + if (Key > Mapper.IllegalInstrNumber) { + ContainsIllegal = true; + break; + } + } + + // If we have an illegal instruction, we should not create an + // IRSimilarityCandidate for this region. + if (ContainsIllegal) + continue; + + PreviousIdx = EndIdx; + NumFound++; + + // We are getting iterators to the instructions in this region of code + // by advancing the start and end indices from the start of the + // InstrList. + std::vector::iterator StartIt = InstrList.begin(); + std::advance(StartIt, StartIdx); + std::vector::iterator EndIt = InstrList.begin(); + std::advance(EndIt, EndIdx); + + CandsForRepSubstring.emplace_back(StartIdx, StringLen, *StartIt, *EndIt); + } +} + +/// From the list of IRSimilarityCandidates, perform a comparison between each +/// IRSimilarityCandidate to determine if there are overlapping +/// IRInstructionData, or if they do not have the same structure. +/// +/// \param [in] CandsForRepSubstring - The vector containing the +/// IRSimilarityCandidates. +/// \param [out] StructuralGroups - the mapping of unsigned integers to vector +/// of IRSimilarityCandidates where each of the IRSimilarityCandidates in the +/// vector are structurally similar to one another. +static void findCandidateStructures( + std::vector &CandsForRepSubstring, + DenseMap &StructuralGroups) { + std::vector::iterator CandIt, CandEndIt, InnerCandIt, + InnerCandEndIt; + + // IRSimilarityCandidates each have a structure for operand use. It is + // possible that two instances of the same subsequences have different + // structure. Each type of structure found is assigned a number. This + // DenseMap maps an IRSimilarityCandidate to which type of similarity + // discovered it fits within. + DenseMap CandToGroup; + + // Find the compatibility from each candidate to the others to determine + // which candidates overlap and which have the same structure by mapping + // each structure to a different group. + bool SameStructure; + bool Inserted; + unsigned CurrentGroupNum = 0; + unsigned OuterGroupNum; + DenseMap::iterator CandToGroupIt; + DenseMap::iterator CandToGroupItInner; + DenseMap::iterator CurrentGroupPair; + + // Iterate over the candidates to determine its structural and overlapping + // compatibility with other instructions + for (CandIt = CandsForRepSubstring.begin(), + CandEndIt = CandsForRepSubstring.end(); + CandIt != CandEndIt; CandIt++) { + + // Determine if it has an assigned structural group already. + CandToGroupIt = CandToGroup.find(&*CandIt); + if (CandToGroupIt == CandToGroup.end()) { + // If not, we assign it one, and add it to our mapping. + std::tie(CandToGroupIt, Inserted) = + CandToGroup.insert(std::make_pair(&*CandIt, CurrentGroupNum++)); + } + + // Get the structural group number from the iterator. + OuterGroupNum = CandToGroupIt->second; + + // Check if we already have a list of IRSimilarityCandidates for the current + // structural group. Create one if one does not exist. + CurrentGroupPair = StructuralGroups.find(OuterGroupNum); + if (CurrentGroupPair == StructuralGroups.end()) + std::tie(CurrentGroupPair, Inserted) = StructuralGroups.insert( + std::make_pair(OuterGroupNum, SimilarityGroup({*CandIt}))); + + // Iterate over the IRSimilarityCandidates following the current + // IRSimilarityCandidate in the list to determine whether the two + // IRSimilarityCandidates are compatible. This is so we do not repeat pairs + // of IRSimilarityCandidates. + for (InnerCandIt = std::next(CandIt), + InnerCandEndIt = CandsForRepSubstring.end(); + InnerCandIt != InnerCandEndIt; InnerCandIt++) { + + // We check if the inner item has a group already, if it does, we skip it. + CandToGroupItInner = CandToGroup.find(&*InnerCandIt); + if (CandToGroupItInner != CandToGroup.end()) + continue; + + // Otherwise we determine if they have the same structure and add it to + // vector if they match. + SameStructure = + IRSimilarityCandidate::compareStructure(*CandIt, *InnerCandIt); + if (!SameStructure) + continue; + + CandToGroup.insert(std::make_pair(&*InnerCandIt, OuterGroupNum)); + CurrentGroupPair->second.push_back(*InnerCandIt); + } + } +} + +void IRSimilarityIdentifier::findCandidates( + std::vector &InstrList, + std::vector &IntegerMapping) { + SuffixTree ST(IntegerMapping); + + std::vector CandsForRepSubstring; + std::vector NewCandidateGroups; + + DenseMap StructuralGroups; + + // Iterate over the subsequences found by the Suffix Tree to create + // IRSimilarityCandidates for each repeated subsequence and determine which + // instances are structurally similar to one another. + for (auto It = ST.begin(), Et = ST.end(); It != Et; ++It) { + createCandidatesFromSuffixTree(Mapper, InstrList, IntegerMapping, *It, + CandsForRepSubstring); + + if (CandsForRepSubstring.size() < 2) + continue; + + findCandidateStructures(CandsForRepSubstring, StructuralGroups); + for (std::pair &Group : StructuralGroups) + // We only add the group if it contains more than one + // IRSimilarityCandidate. If there is only one, that means there is no + // other repeated subsequence with the same structure. + if (Group.second.size() > 1) + SimilarityCandidates->push_back(Group.second); + + CandsForRepSubstring.clear(); + StructuralGroups.clear(); + NewCandidateGroups.clear(); + } +} + +SimilarityGroupList &IRSimilarityIdentifier::findSimilarity( + ArrayRef> Modules) { + resetSimilarityCandidates(); + + std::vector InstrList; + std::vector IntegerMapping; + + populateMapper(Modules, InstrList, IntegerMapping); + findCandidates(InstrList, IntegerMapping); + + return SimilarityCandidates.getValue(); +} + +SimilarityGroupList &IRSimilarityIdentifier::findSimilarity(Module &M) { + resetSimilarityCandidates(); + + std::vector InstrList; + std::vector IntegerMapping; + + populateMapper(M, InstrList, IntegerMapping); + findCandidates(InstrList, IntegerMapping); + + return SimilarityCandidates.getValue(); +} diff --git a/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp b/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp --- a/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp +++ b/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp @@ -38,7 +38,14 @@ Mapper.convertToUnsignedVec(BB, InstrList, UnsignedVec); } -// Checks that different opcodes are mapped to different values. +void getSimilarities( + Module &M, + std::vector> &SimilarityCandidates) { + IRSimilarityIdentifier Identifier; + SimilarityCandidates = Identifier.findSimilarity(M); +} + +// Checks that different opcodes are mapped to different values TEST(IRInstructionMapper, OpcodeDifferentiation) { StringRef ModuleString = R"( define i32 @f(i32 %a, i32 %b) { @@ -1625,3 +1632,193 @@ ASSERT_TRUE(longSimCandCompare(InstrList, true)); } + +// Checks that two sets of identical instructions are found to be the same. +// Both sequences of adds have the same operand ordering, and the same +// instructions, making them strcturally equivalent. +TEST(IRSimilarityIdentifier, IdentitySimilarity) { + StringRef ModuleString = R"( + define i32 @f(i32 %a, i32 %b) { + bb0: + %0 = add i32 %a, %b + %1 = sub i32 %b, %a + br label %bb1 + bb1: + %2 = add i32 %a, %b + %3 = sub i32 %b, %a + ret i32 0 + })"; + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleString); + + std::vector> SimilarityCandidates; + getSimilarities(*M, SimilarityCandidates); + + ASSERT_TRUE(SimilarityCandidates.size() == 1); + for (std::vector &Cands : SimilarityCandidates) { + ASSERT_TRUE(Cands.size() == 2); + unsigned InstIdx = 0; + for (IRSimilarityCandidate &Cand : Cands) { + ASSERT_TRUE(Cand.getStartIdx() == InstIdx); + InstIdx += 3; + } + } +} + +// Checks that incorrect sequences are not found as similar. In this case, +// we have different sequences of instructions. +TEST(IRSimilarityIdentifier, InstructionDifference) { + StringRef ModuleString = R"( + define i32 @f(i32 %a, i32 %b, i32 %c, i32 %d) { + bb0: + %0 = sub i32 %a, %b + %1 = add i32 %b, %a + br label %bb1 + bb1: + %2 = add i32 %c, %d + %3 = sub i32 %d, %c + ret i32 0 + })"; + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleString); + + std::vector> SimilarityCandidates; + getSimilarities(*M, SimilarityCandidates); + + ASSERT_TRUE(SimilarityCandidates.size() == 0); +} + +// This test checks to see whether we can detect similarity for commutativen +// instructions where the operands have been reversed. Right now, we cannot. +TEST(IRSimilarityIdentifier, CommutativeSimilarity) { + StringRef ModuleString = R"( + define i32 @f(i32 %a, i32 %b) { + bb0: + %0 = add i32 %a, %b + %1 = add i32 %b, %a + br label %bb1 + bb1: + %2 = add i32 %a, %b + %3 = add i32 %a, %b + ret i32 0 + })"; + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleString); + + std::vector> SimilarityCandidates; + getSimilarities(*M, SimilarityCandidates); + + ASSERT_TRUE(SimilarityCandidates.size() == 0); +} + +// Check that we are not finding commutative similarity in non commutative +// instructions. That is, while the instruction and operands used are the same +// in the two subtraction sequences, they cannot be counted as the same since +// a subtraction is not commutative. +TEST(IRSimilarityIdentifier, NonCommutativeDifference) { + StringRef ModuleString = R"( + define i32 @f(i32 %a, i32 %b) { + bb0: + %0 = sub i32 %a, %b + %1 = sub i32 %b, %a + br label %bb1 + bb1: + %2 = sub i32 %a, %b + %3 = sub i32 %a, %b + ret i32 0 + })"; + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleString); + + std::vector> SimilarityCandidates; + getSimilarities(*M, SimilarityCandidates); + + ASSERT_TRUE(SimilarityCandidates.size() == 0); +} + +// Check that we find similarity despite changing the register names. +TEST(IRSimilarityIdentifier, MappingSimilarity) { + StringRef ModuleString = R"( + define i32 @f(i32 %a, i32 %b, i32 %c, i32 %d) { + bb0: + %0 = add i32 %a, %b + %1 = sub i32 %b, %a + br label %bb1 + bb1: + %2 = add i32 %c, %d + %3 = sub i32 %d, %c + ret i32 0 + })"; + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleString); + + std::vector> SimilarityCandidates; + getSimilarities(*M, SimilarityCandidates); + + ASSERT_TRUE(SimilarityCandidates.size() == 1); + for (std::vector &Cands : SimilarityCandidates) { + ASSERT_TRUE(Cands.size() == 2); + unsigned InstIdx = 0; + for (IRSimilarityCandidate &Cand : Cands) { + ASSERT_TRUE(Cand.getStartIdx() == InstIdx); + InstIdx += 3; + } + } +} + +// Checks that constants are detected as the same operand in each use in the +// sequences of instructions. Also checks that we can find structural +// equivalence using constants. In this case the 1 has the same use pattern as +// %a. +TEST(IRSimilarityIdentifier, ConstantMappingSimilarity) { + StringRef ModuleString = R"( + define i32 @f(i32 %a, i32 %b) { + bb0: + %0 = add i32 1, %b + %1 = icmp sgt i32 %b, 1 + br label %bb1 + bb1: + %2 = add i32 %a, %b + %3 = icmp sgt i32 %b, %a + ret i32 0 + })"; + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleString); + + std::vector> SimilarityCandidates; + getSimilarities(*M, SimilarityCandidates); + + ASSERT_TRUE(SimilarityCandidates.size() == 1); + for (std::vector &Cands : SimilarityCandidates) { + ASSERT_TRUE(Cands.size() == 2); + unsigned InstIdx = 0; + for (IRSimilarityCandidate &Cand : Cands) { + ASSERT_TRUE(Cand.getStartIdx() == InstIdx); + InstIdx += 3; + } + } +} + +// Check that constants are uniquely identified. i.e. two different constants +// are not considered the same. This means that this should not find any +// structural similarity. +TEST(IRSimilarityIdentifier, ConstantMappingDifference) { + StringRef ModuleString = R"( + define i32 @f(i32 %a, i32 %b) { + bb0: + %0 = add i32 1, %b + %1 = icmp sgt i32 %b, 2 + br label %bb1 + bb1: + %2 = add i32 %a, %b + %3 = icmp slt i32 %a, %b + ret i32 0 + })"; + LLVMContext Context; + std::unique_ptr M = makeLLVMModule(Context, ModuleString); + + std::vector> SimilarityCandidates; + getSimilarities(*M, SimilarityCandidates); + + ASSERT_TRUE(SimilarityCandidates.size() == 0); +}