Index: include/clang/Analysis/CloneDetection.h =================================================================== --- include/clang/Analysis/CloneDetection.h +++ include/clang/Analysis/CloneDetection.h @@ -17,7 +17,6 @@ #include "clang/Basic/SourceLocation.h" #include "llvm/ADT/Hashing.h" -#include "llvm/ADT/StringMap.h" #include @@ -158,6 +157,7 @@ /// (e.g. function bodies). Other clones (e.g. cloned comments or declarations) /// are not supported. class CloneDetector { + public: typedef unsigned DataPiece; @@ -194,8 +194,7 @@ }; /// Holds group of StmtSequences that are clones of each other and the - /// complexity value (see CloneSignature::Complexity) that all stored - /// StmtSequences have in common. + /// CloneSignature that all clones have in common. struct CloneGroup { std::vector Sequences; CloneSignature Signature; @@ -206,13 +205,6 @@ : Signature(Signature) { Sequences.push_back(Seq); } - - /// \brief Returns false if and only if this group should be skipped when - /// searching for clones. - bool isValid() const { - // A clone group with only one member makes no sense, so we skip them. - return Sequences.size() > 1; - } }; /// \brief Generates and stores search data for all statements in the body of @@ -222,17 +214,129 @@ /// \brief Stores the CloneSignature to allow future querying. void add(const StmtSequence &S, const CloneSignature &Signature); - /// \brief Searches the provided statements for clones. + /// \brief Represents a way to filter clones from a list of clone groups. + /// + /// A reoccuring task in the CloneDetector is filtering out clones that don't + /// have certain properties. This filtering is done in a pass architecture + /// in which each pass is supposed to remove a certain kind of clones. + /// Usually, passes are grouped together in a PassList and are applied one + /// after another until all undesireable clones are removed. + /// + /// To create a new pass, inherit from this class and overwrite the respective + /// virtual functions. + class Pass { + public: + virtual ~Pass() {} + + /// \brief Allows skipping the processing of a given hash group. + /// \param HashGroup The given hash group. + /// \return Return true if the CloneDetector should start creating actual + /// clone groups from the given hash group. + virtual bool passHashGroup(const CloneGroup &HashGroup); + + /// \brief Allows skipping the creation of clone groups from a given + /// prototype StmtSequence. + /// \param Prototype The given prototype StmtSequence. + /// \param PrototypeGroup The clone group to which the Prototype belongs. + /// \return Return true if the CloneDetector should search for candidates + /// that will fit into the given prototype group. + virtual bool passPrototype(const StmtSequence &Prototype, + const CloneGroup &PrototypeGroup); + + /// \brief Decides if a StmtSequence belongs into a certain clone group. + /// \param Prototype The prototype that + /// \param Candidate + /// \return Return true if the candidate fits to the prototype. + virtual bool passCandidate(const StmtSequence &Prototype, + const StmtSequence &Candidate); + + /// \brief Decides if a given clone group should be added to the result. + /// \param Group The given clone group. + /// \return Return true if the given group should be included in the result. + virtual bool passGroup(const CloneGroup &Group); + + /// \brief Can process and modify the clone groups before they are returned. + /// \param Result The list of clone groups. + virtual void processResult(std::vector &Result); + }; + + /// \brief Stores a list of passes. /// - /// \param Result Output parameter that is filled with a list of found - /// clone groups. Each group contains multiple StmtSequences - /// that were identified to be clones of each other. - /// \param MinGroupComplexity Only return clones which have at least this - /// complexity value. - /// \param CheckPatterns Returns only clone groups in which the referenced - /// variables follow the same pattern. - void findClones(std::vector &Result, unsigned MinGroupComplexity, - bool CheckPatterns = true); + /// This class also takes over the memory ownership of all given passes. + class PassList { + std::vector Passes; + + public: + PassList(std::initializer_list IL) : Passes(IL.begin(), IL.end()) {} + ~PassList() { + for (Pass *P : Passes) { + delete P; + } + } + + // Non-copyable because we free all passes in the destructor. + PassList(const PassList &other) = delete; + PassList &operator=(const PassList &) = delete; + + std::vector::iterator begin() { return Passes.begin(); } + std::vector::iterator end() { return Passes.end(); } + }; + + /// \brief Generates hash groups and applies the given passes to them. + /// \param Result Output parameter to which all created clone groups are + /// added. + /// \param Passes The passes that should be used to filter the hash groups. + void findClones(std::vector &Result, PassList &Passes); + + /// \brief Applies the given passes on a list of clone groups. + /// \param CloneGroups The list of clone groups. + /// \param Result Output parameter to which all created clone groups are + /// added. + /// \param Passes The passes that should be applied. + static void applyPasses(const std::vector &CloneGroups, + std::vector &Result, PassList &Passes); + +private: + /// Stores all encountered StmtSequences alongside their CloneSignature. + std::vector> Sequences; +}; + +/// \brief Analyzes the pattern of the referenced variables in a statement. +class VariablePattern { + + /// \brief Describes an occurence of a variable reference in a statement. + struct VariableOccurence { + /// The index of the associated VarDecl in the Variables vector. + size_t KindID; + /// The location in the code where the variable was references. + SourceRange Location; + + VariableOccurence(size_t KindID, SourceRange Location) + : KindID(KindID), Location(Location) {} + }; + + /// All occurences of referenced variables in the order of appearance. + std::vector Occurences; + /// List of referenced variables in the order of appearance. + /// Every item in this list is unique. + std::vector Variables; + + /// \brief Adds a new variable referenced to this pattern. + /// \param VarDecl The declaration of the variable that is referenced. + /// \param Location The SourceRange where this variable is referenced. + void addVariableOccurence(const VarDecl *VarDecl, SourceRange Location); + + /// \brief Adds each referenced variable from the given statement. + void addVariables(const Stmt *S); + +public: + /// \brief Creates an VariablePattern object with information about the given + /// StmtSequence. + VariablePattern(const StmtSequence &Sequence) { + for (const Stmt *S : Sequence) + addVariables(S); + } + VariablePattern() {} /// \brief Describes two clones that reference their variables in a different /// pattern which could indicate a programming error. @@ -259,17 +363,110 @@ SuspiciousClone SecondClone; }; - /// \brief Searches the provided statements for pairs of clones that don't - /// follow the same pattern when referencing variables. - /// \param Result Output parameter that will contain the clone pairs. - /// \param MinGroupComplexity Only clone pairs in which the clones have at - /// least this complexity value. - void findSuspiciousClones(std::vector &Result, - unsigned MinGroupComplexity); + /// \brief Compares this pattern with the given one. + /// \param Other The given VariablePattern to compare with. + /// \param FirstMismatch Output parameter that will be filled with information + /// about the first difference between the two patterns. This parameter + /// can be a nullptr, in which case it will be ignored. + /// \return Returns the number of differences between the pattern this object + /// is following and the given VariablePattern. + /// + /// For example, the following statements all have the same pattern and this + /// function would return zero: + /// + /// if (a < b) return a; return b; + /// if (x < y) return x; return y; + /// if (u2 < u1) return u2; return u1; + /// + /// But the following statement has a different pattern (note the changed + /// variables in the return statements) and would have two differences when + /// compared with one of the statements above. + /// + /// if (a < b) return b; return a; + /// + /// This function should only be called if the related statements of the given + /// pattern and the statements of this objects are clones of each other. + unsigned getPatternDifferences(const VariablePattern &Other, + SuspiciousClonePair *FirstMismatch = nullptr); +}; + +/// \brief Filters out all clones that don't reference variables in the same +/// pattern. +class MatchingVariablePatternPass : public CloneDetector::Pass { + VariablePattern PrototypePattern; -private: - /// Stores all encountered StmtSequences alongside their CloneSignature. - std::vector> Sequences; +public: + virtual ~MatchingVariablePatternPass() {} + + virtual bool + passPrototype(const StmtSequence &Prototype, + const CloneDetector::CloneGroup &PrototypeGroup) override { + PrototypePattern = VariablePattern(Prototype); + return true; + } + + virtual bool passCandidate(const StmtSequence &Prototype, + const StmtSequence &Candidate) override { + VariablePattern CandidatePattern(Candidate); + return PrototypePattern.getPatternDifferences(CandidatePattern) == 0; + } +}; + +/// \brief Filters all clone groups that don't have at least the given size. +class MinGroupSizePass : public CloneDetector::Pass { + unsigned MinGroupSize; + +public: + virtual ~MinGroupSizePass() {} + MinGroupSizePass(unsigned MinGroupSize = 2) : MinGroupSize(MinGroupSize) {} + + virtual bool + passHashGroup(const CloneDetector::CloneGroup &HashGroup) override { + return HashGroup.Sequences.size() >= MinGroupSize; + } + + virtual bool + passGroup(const CloneDetector::CloneGroup &PrototypeGroup) override { + return PrototypeGroup.Sequences.size() >= MinGroupSize; + } +}; + +/// \brief Filters all clones that don't have at least the given complexity +/// value. +class MinComplexityPass : public CloneDetector::Pass { + unsigned MinComplexity; + +public: + virtual ~MinComplexityPass() {} + MinComplexityPass(unsigned MinComplexity = 10) + : MinComplexity(MinComplexity) {} + + virtual bool + passPrototype(const StmtSequence &Prototype, + const CloneDetector::CloneGroup &PrototypeGroup) override { + return PrototypeGroup.Signature.Complexity >= MinComplexity; + } +}; + +/// \brief Filters all presumed clones from a hash group that aren't actually +/// clones but landed in the hash group due to unintended hash code +/// collisions. +class VerifyCloneHashesPass : public CloneDetector::Pass { +public: + virtual ~VerifyCloneHashesPass() {} + + virtual bool passCandidate(const StmtSequence &Prototype, + const StmtSequence &Candidate) override; +}; + +/// \brief This pass removes all clone groups which are a full subset of another +/// found clone group. +class OnlyLargestClonePass : public CloneDetector::Pass { +public: + virtual ~OnlyLargestClonePass() {} + + virtual void + processResult(std::vector &Result) override; }; } // end namespace clang Index: lib/Analysis/CloneDetection.cpp =================================================================== --- lib/Analysis/CloneDetection.cpp +++ lib/Analysis/CloneDetection.cpp @@ -80,164 +80,6 @@ SourceLocation StmtSequence::getEndLoc() const { return back()->getLocEnd(); } namespace { - -/// \brief Analyzes the pattern of the referenced variables in a statement. -class VariablePattern { - - /// \brief Describes an occurence of a variable reference in a statement. - struct VariableOccurence { - /// The index of the associated VarDecl in the Variables vector. - size_t KindID; - /// The location in the code where the variable was references. - SourceRange Location; - - VariableOccurence(size_t KindID, SourceRange Location) - : KindID(KindID), Location(Location) {} - }; - - /// All occurences of referenced variables in the order of appearance. - std::vector Occurences; - /// List of referenced variables in the order of appearance. - /// Every item in this list is unique. - std::vector Variables; - - /// \brief Adds a new variable referenced to this pattern. - /// \param VarDecl The declaration of the variable that is referenced. - /// \param Location The SourceRange where this variable is referenced. - void addVariableOccurence(const VarDecl *VarDecl, SourceRange Location) { - // First check if we already reference this variable - for (size_t KindIndex = 0; KindIndex < Variables.size(); ++KindIndex) { - if (Variables[KindIndex] == VarDecl) { - // If yes, add a new occurence that points to the existing entry in - // the Variables vector. - Occurences.emplace_back(KindIndex, Location); - return; - } - } - // If this variable wasn't already referenced, add it to the list of - // referenced variables and add a occurence that points to this new entry. - Occurences.emplace_back(Variables.size(), Location); - Variables.push_back(VarDecl); - } - - /// \brief Adds each referenced variable from the given statement. - void addVariables(const Stmt *S) { - // Sometimes we get a nullptr (such as from IfStmts which often have nullptr - // children). We skip such statements as they don't reference any - // variables. - if (!S) - return; - - // Check if S is a reference to a variable. If yes, add it to the pattern. - if (auto D = dyn_cast(S)) { - if (auto VD = dyn_cast(D->getDecl()->getCanonicalDecl())) - addVariableOccurence(VD, D->getSourceRange()); - } - - // Recursively check all children of the given statement. - for (const Stmt *Child : S->children()) { - addVariables(Child); - } - } - -public: - /// \brief Creates an VariablePattern object with information about the given - /// StmtSequence. - VariablePattern(const StmtSequence &Sequence) { - for (const Stmt *S : Sequence) - addVariables(S); - } - - /// \brief Compares this pattern with the given one. - /// \param Other The given VariablePattern to compare with. - /// \param FirstMismatch Output parameter that will be filled with information - /// about the first difference between the two patterns. This parameter - /// can be a nullptr, in which case it will be ignored. - /// \return Returns the number of differences between the pattern this object - /// is following and the given VariablePattern. - /// - /// For example, the following statements all have the same pattern and this - /// function would return zero: - /// - /// if (a < b) return a; return b; - /// if (x < y) return x; return y; - /// if (u2 < u1) return u2; return u1; - /// - /// But the following statement has a different pattern (note the changed - /// variables in the return statements) and would have two differences when - /// compared with one of the statements above. - /// - /// if (a < b) return b; return a; - /// - /// This function should only be called if the related statements of the given - /// pattern and the statements of this objects are clones of each other. - unsigned getPatternDifferences( - const VariablePattern &Other, - CloneDetector::SuspiciousClonePair *FirstMismatch = nullptr) { - unsigned NumberOfDifferences = 0; - - assert(Other.Occurences.size() == Occurences.size()); - for (unsigned i = 0; i < Occurences.size(); ++i) { - auto ThisOccurence = Occurences[i]; - auto OtherOccurence = Other.Occurences[i]; - if (ThisOccurence.KindID != OtherOccurence.KindID) { - ++NumberOfDifferences; - - // If FirstMismatch is not a nullptr, we need to store information about - // the first difference between the two patterns. - if (FirstMismatch == nullptr) - continue; - - // Only proceed if we just found the first difference as we only store - // information about the first difference. - if (NumberOfDifferences != 1) - continue; - - const VarDecl *FirstSuggestion = nullptr; - // If there is a variable available in the list of referenced variables - // which wouldn't break the pattern if it is used in place of the - // current variable, we provide this variable as the suggested fix. - if (OtherOccurence.KindID < Variables.size()) - FirstSuggestion = Variables[OtherOccurence.KindID]; - - // Store information about the first clone. - FirstMismatch->FirstClone = - CloneDetector::SuspiciousClonePair::SuspiciousClone( - Variables[ThisOccurence.KindID], ThisOccurence.Location, - FirstSuggestion); - - // Same as above but with the other clone. We do this for both clones as - // we don't know which clone is the one containing the unintended - // pattern error. - const VarDecl *SecondSuggestion = nullptr; - if (ThisOccurence.KindID < Other.Variables.size()) - SecondSuggestion = Other.Variables[ThisOccurence.KindID]; - - // Store information about the second clone. - FirstMismatch->SecondClone = - CloneDetector::SuspiciousClonePair::SuspiciousClone( - Variables[ThisOccurence.KindID], OtherOccurence.Location, - SecondSuggestion); - - // SuspiciousClonePair guarantees that the first clone always has a - // suggested variable associated with it. As we know that one of the two - // clones in the pair always has suggestion, we swap the two clones - // in case the first clone has no suggested variable which means that - // the second clone has a suggested variable and should be first. - if (!FirstMismatch->FirstClone.Suggestion) - std::swap(FirstMismatch->FirstClone, FirstMismatch->SecondClone); - - // This ensures that we always have at least one suggestion in a pair. - assert(FirstMismatch->FirstClone.Suggestion); - } - } - - return NumberOfDifferences; - } -}; -} - -namespace { /// \brief Collects the data of a single Stmt. /// /// This class defines what a code clone is: If it collects for two statements @@ -529,6 +371,73 @@ Sequences.push_back(std::make_pair(Signature, S)); } +void CloneDetector::applyPasses(const std::vector &CloneGroups, + std::vector &Result, + PassList &Passes) { + for (auto &HashGroup : CloneGroups) { + // Contains all indexes in HashGroup that were already added to a + // CloneGroup. + std::vector Indexes; + Indexes.resize(HashGroup.Sequences.size()); + + for (unsigned i = 0; i < HashGroup.Sequences.size(); ++i) { + // Skip indexes that are already part of a CloneGroup. + if (Indexes[i]) + continue; + + // Pick the first unhandled StmtSequence and consider it as the beginning + // of a new CloneGroup for now. + // We don't add i to Indexes because we never iterate back. + StmtSequence Prototype = HashGroup.Sequences[i]; + CloneDetector::CloneGroup PotentialGroup(Prototype, HashGroup.Signature); + ++Indexes[i]; + + if (!std::all_of(Passes.begin(), Passes.end(), + [&Prototype, &PotentialGroup](Pass *Pass) { + return Pass->passPrototype(Prototype, PotentialGroup); + })) + continue; + + // Check all following StmtSequences for clones. + for (unsigned j = i + 1; j < HashGroup.Sequences.size(); ++j) { + // Skip indexes that are already part of a CloneGroup. + if (Indexes[j]) + continue; + + // If a following StmtSequence belongs to our CloneGroup, we add it to + // it. + const StmtSequence &Candidate = HashGroup.Sequences[j]; + + if (!std::all_of(Passes.begin(), Passes.end(), + [&Prototype, &Candidate](Pass *Pass) { + return Pass->passCandidate(Prototype, Candidate); + })) + continue; + + PotentialGroup.Sequences.push_back(Candidate); + // Make sure we never visit this StmtSequence again. + ++Indexes[j]; + } + + if (!std::all_of(Passes.begin(), Passes.end(), + [&PotentialGroup](Pass *Pass) { + return Pass->passGroup(PotentialGroup); + })) + continue; + + // Otherwise, add it to the result and continue searching for more groups. + Result.push_back(PotentialGroup); + } + + assert(std::all_of(Indexes.begin(), Indexes.end(), + [](char c) { return c == 1; })); + } + + for (Pass *P : Passes) { + P->processResult(Result); + } +} + namespace { /// \brief Returns true if and only if \p Stmt contains at least one other /// sequence in the \p Group. @@ -559,11 +468,69 @@ } } // end anonymous namespace +void CloneDetector::findClones(std::vector &Result, + PassList &Passes) { + // Shortcut and necessary for the for-loop later in this function. + if (Sequences.empty()) + return; + + std::stable_sort(Sequences.begin(), Sequences.end(), + [](std::pair LHS, + std::pair RHS) { + return LHS.first.Hash < RHS.first.Hash; + }); + + std::vector CloneGroups; + + // Check for each CloneSignature if its successor has the same hash value. + // We don't check the last CloneSignature as it has no successor. + // Note: The 'size - 1 ' in the condition is safe because we check for an + // empty Sequences vector at the beginning of this function. + for (unsigned i = 0; i < Sequences.size() - 1; ++i) { + const auto Current = Sequences[i]; + const auto Next = Sequences[i + 1]; + + if (Current.first.Hash != Next.first.Hash) + continue; + + // It's likely that we just found an sequence of CloneSignatures that + // represent a CloneGroup, so we create a new group and start checking and + // adding the CloneSignatures in this sequence. + CloneGroup Group; + Group.Signature = Current.first; + + for (; i < Sequences.size(); ++i) { + const auto &Signature = Sequences[i]; + + // A different hash value means we have reached the end of the sequence. + if (Current.first.Hash != Signature.first.Hash) { + // The current Signature could be the start of a new CloneGroup. So we + // decrement i so that we visit it again in the outer loop. + // Note: i can never be 0 at this point because we are just comparing + // the hash of the Current CloneSignature with itself in the 'if' above. + assert(i != 0); + --i; + break; + } + + Group.Sequences.push_back(Signature.second); + } + + if (!std::all_of(Passes.begin(), Passes.end(), [&Group](Pass *Pass) { + return Pass->passHashGroup(Group); + })) + continue; + + CloneGroups.push_back(Group); + } + + applyPasses(CloneGroups, Result, Passes); +} + namespace { /// \brief Wrapper around FoldingSetNodeID that it can be used as the template /// argument of the StmtDataCollector. class FoldingSetWrapper { - llvm::FoldingSetNodeID &FS; public: @@ -610,144 +577,134 @@ return DataLHS == DataRHS; } -/// \brief Finds all actual clone groups in a single group of presumed clones. -/// \param Result Output parameter to which all found groups are added. -/// \param Group A group of presumed clones. The clones are allowed to have a -/// different variable pattern and may not be actual clones of each -/// other. -/// \param CheckVariablePatterns If true, every clone in a group that was added -/// to the output follows the same variable pattern as the other -/// clones in its group. -static void createCloneGroups(std::vector &Result, - const CloneDetector::CloneGroup &Group, - bool CheckVariablePattern) { - // We remove the Sequences one by one, so a list is more appropriate. - std::list UnassignedSequences(Group.Sequences.begin(), - Group.Sequences.end()); - - // Search for clones as long as there could be clones in UnassignedSequences. - while (UnassignedSequences.size() > 1) { - - // Pick the first Sequence as a protoype for a new clone group. - StmtSequence Prototype = UnassignedSequences.front(); - UnassignedSequences.pop_front(); - - CloneDetector::CloneGroup FilteredGroup(Prototype, Group.Signature); - - // Analyze the variable pattern of the prototype. Every other StmtSequence - // needs to have the same pattern to get into the new clone group. - VariablePattern PrototypeFeatures(Prototype); - - // Search all remaining StmtSequences for an identical variable pattern - // and assign them to our new clone group. - auto I = UnassignedSequences.begin(), E = UnassignedSequences.end(); - while (I != E) { - // If the sequence doesn't fit to the prototype, we have encountered - // an unintended hash code collision and we skip it. - if (!areSequencesClones(Prototype, *I)) { - ++I; - continue; - } - - // If we weren't asked to check for a matching variable pattern in clone - // groups we can add the sequence now to the new clone group. - // If we were asked to check for matching variable pattern, we first have - // to check that there are no differences between the two patterns and - // only proceed if they match. - if (!CheckVariablePattern || - VariablePattern(*I).getPatternDifferences(PrototypeFeatures) == 0) { - FilteredGroup.Sequences.push_back(*I); - I = UnassignedSequences.erase(I); - continue; - } - - // We didn't found a matching variable pattern, so we continue with the - // next sequence. - ++I; +void VariablePattern::addVariableOccurence(const VarDecl *VarDecl, + SourceRange Location) { + // First check if we already reference this variable + for (size_t KindIndex = 0; KindIndex < Variables.size(); ++KindIndex) { + if (Variables[KindIndex] == VarDecl) { + // If yes, add a new occurence that points to the existing entry in + // the Variables vector. + Occurences.emplace_back(KindIndex, Location); + return; } - - // Add a valid clone group to the list of found clone groups. - if (!FilteredGroup.isValid()) - continue; - Result.push_back(FilteredGroup); } + // If this variable wasn't already referenced, add it to the list of + // referenced variables and add a occurence that points to this new entry. + Occurences.emplace_back(Variables.size(), Location); + Variables.push_back(VarDecl); } -void CloneDetector::findClones(std::vector &Result, - unsigned MinGroupComplexity, - bool CheckPatterns) { - // A shortcut (and necessary for the for-loop later in this function). - if (Sequences.empty()) +void VariablePattern::addVariables(const Stmt *S) { + // Sometimes we get a nullptr (such as from IfStmts which often have nullptr + // children). We skip such statements as they don't reference any + // variables. + if (!S) return; - // We need to search for groups of StmtSequences with the same hash code to - // create our initial clone groups. By sorting all known StmtSequences by - // their hash value we make sure that StmtSequences with the same hash code - // are grouped together in the Sequences vector. - // Note: We stable sort here because the StmtSequences are added in the order - // in which they appear in the source file. We want to preserve that order - // because we also want to report them in that order in the CloneChecker. - std::stable_sort(Sequences.begin(), Sequences.end(), - [](std::pair LHS, - std::pair RHS) { - return LHS.first.Hash < RHS.first.Hash; - }); + // Check if S is a reference to a variable. If yes, add it to the pattern. + if (auto D = dyn_cast(S)) { + if (auto VD = dyn_cast(D->getDecl()->getCanonicalDecl())) + addVariableOccurence(VD, D->getSourceRange()); + } - std::vector CloneGroups; + // Recursively check all children of the given statement. + for (const Stmt *Child : S->children()) { + addVariables(Child); + } +} - // Check for each CloneSignature if its successor has the same hash value. - // We don't check the last CloneSignature as it has no successor. - // Note: The 'size - 1' in the condition is safe because we check for an empty - // Sequences vector at the beginning of this function. - for (unsigned i = 0; i < Sequences.size() - 1; ++i) { - const auto Current = Sequences[i]; - const auto Next = Sequences[i + 1]; +unsigned +VariablePattern::getPatternDifferences(const VariablePattern &Other, + SuspiciousClonePair *FirstMismatch) { + unsigned NumberOfDifferences = 0; + + assert(Other.Occurences.size() == Occurences.size()); + for (unsigned i = 0; i < Occurences.size(); ++i) { + auto ThisOccurence = Occurences[i]; + auto OtherOccurence = Other.Occurences[i]; + if (ThisOccurence.KindID != OtherOccurence.KindID) { + ++NumberOfDifferences; + + // If FirstMismatch is not a nullptr, we need to store information about + // the first difference between the two patterns. + if (FirstMismatch == nullptr) + continue; - if (Current.first.Hash != Next.first.Hash) - continue; + // Only proceed if we just found the first difference as we only store + // information about the first difference. + if (NumberOfDifferences != 1) + continue; - // It's likely that we just found an sequence of CloneSignatures that - // represent a CloneGroup, so we create a new group and start checking and - // adding the CloneSignatures in this sequence. - CloneGroup Group; - Group.Signature = Current.first; + const VarDecl *FirstSuggestion = nullptr; + // If there is a variable available in the list of referenced variables + // which wouldn't break the pattern if it is used in place of the + // current variable, we provide this variable as the suggested fix. + if (OtherOccurence.KindID < Variables.size()) + FirstSuggestion = Variables[OtherOccurence.KindID]; + + // Store information about the first clone. + FirstMismatch->FirstClone = SuspiciousClonePair::SuspiciousClone( + Variables[ThisOccurence.KindID], ThisOccurence.Location, + FirstSuggestion); + + // Same as above but with the other clone. We do this for both clones as + // we don't know which clone is the one containing the unintended + // pattern error. + const VarDecl *SecondSuggestion = nullptr; + if (ThisOccurence.KindID < Other.Variables.size()) + SecondSuggestion = Other.Variables[ThisOccurence.KindID]; + + // Store information about the second clone. + FirstMismatch->SecondClone = SuspiciousClonePair::SuspiciousClone( + Variables[ThisOccurence.KindID], OtherOccurence.Location, + SecondSuggestion); + + // SuspiciousClonePair guarantees that the first clone always has a + // suggested variable associated with it. As we know that one of the two + // clones in the pair always has suggestion, we swap the two clones + // in case the first clone has no suggested variable which means that + // the second clone has a suggested variable and should be first. + if (!FirstMismatch->FirstClone.Suggestion) + std::swap(FirstMismatch->FirstClone, FirstMismatch->SecondClone); + + // This ensures that we always have at least one suggestion in a pair. + assert(FirstMismatch->FirstClone.Suggestion); + } + } - for (; i < Sequences.size(); ++i) { - const auto &Signature = Sequences[i]; + return NumberOfDifferences; +} - // A different hash value means we have reached the end of the sequence. - if (Current.first.Hash != Signature.first.Hash) { - // The current Signature could be the start of a new CloneGroup. So we - // decrement i so that we visit it again in the outer loop. - // Note: i can never be 0 at this point because we are just comparing - // the hash of the Current CloneSignature with itself in the 'if' above. - assert(i != 0); - --i; - break; - } +bool CloneDetector::Pass::passHashGroup( + const CloneDetector::CloneGroup &HashGroup) { + return true; +} - // Skip CloneSignatures that won't pass the complexity requirement. - if (Signature.first.Complexity < MinGroupComplexity) - continue; +bool CloneDetector::Pass::passPrototype( + const StmtSequence &Prototype, + const CloneDetector::CloneGroup &PrototypeGroup) { + return true; +} - Group.Sequences.push_back(Signature.second); - } +bool CloneDetector::Pass::passCandidate(const StmtSequence &Prototype, + const StmtSequence &Candidate) { + return true; +} - // There is a chance that we haven't found more than two fitting - // CloneSignature because not enough CloneSignatures passed the complexity - // requirement. As a CloneGroup with less than two members makes no sense, - // we ignore this CloneGroup and won't add it to the result. - if (!Group.isValid()) - continue; +bool CloneDetector::Pass::passGroup(const CloneDetector::CloneGroup &Group) { + return true; +} - CloneGroups.push_back(Group); - } +void CloneDetector::Pass::processResult( + std::vector &Result) {} - // Add every valid clone group that fulfills the complexity requirement. - for (const CloneGroup &Group : CloneGroups) { - createCloneGroups(Result, Group, CheckPatterns); - } +bool VerifyCloneHashesPass::passCandidate(const StmtSequence &Prototype, + const StmtSequence &Candidate) { + return areSequencesClones(Prototype, Candidate); +} +void OnlyLargestClonePass::processResult( + std::vector &Result) { std::vector IndexesToRemove; // Compare every group in the result with the rest. If one groups contains @@ -774,37 +731,3 @@ Result.erase(Result.begin() + *I); } } - -void CloneDetector::findSuspiciousClones( - std::vector &Result, - unsigned MinGroupComplexity) { - std::vector Clones; - // Reuse the normal search for clones but specify that the clone groups don't - // need to have a common referenced variable pattern so that we can manually - // search for the kind of pattern errors this function is supposed to find. - findClones(Clones, MinGroupComplexity, false); - - for (const CloneGroup &Group : Clones) { - for (unsigned i = 0; i < Group.Sequences.size(); ++i) { - VariablePattern PatternA(Group.Sequences[i]); - - for (unsigned j = i + 1; j < Group.Sequences.size(); ++j) { - VariablePattern PatternB(Group.Sequences[j]); - - CloneDetector::SuspiciousClonePair ClonePair; - // For now, we only report clones which break the variable pattern just - // once because multiple differences in a pattern are an indicator that - // those differences are maybe intended (e.g. because it's actually - // a different algorithm). - // TODO: In very big clones even multiple variables can be unintended, - // so replacing this number with a percentage could better handle such - // cases. On the other hand it could increase the false-positive rate - // for all clones if the percentage is too high. - if (PatternA.getPatternDifferences(PatternB, &ClonePair) == 1) { - Result.push_back(ClonePair); - break; - } - } - } - } -} Index: lib/StaticAnalyzer/Checkers/CloneChecker.cpp =================================================================== --- lib/StaticAnalyzer/Checkers/CloneChecker.cpp +++ lib/StaticAnalyzer/Checkers/CloneChecker.cpp @@ -36,11 +36,9 @@ AnalysisManager &Mgr, BugReporter &BR) const; /// \brief Reports all clones to the user. - void reportClones(SourceManager &SM, AnalysisManager &Mgr, - int MinComplexity) const { - - std::vector CloneGroups; - CloneDetector.findClones(CloneGroups, MinComplexity); + void + reportClones(SourceManager &SM, AnalysisManager &Mgr, + const std::vector &Clones) const { DiagnosticsEngine &DiagEngine = Mgr.getDiagnostic(); @@ -50,7 +48,7 @@ unsigned NoteID = DiagEngine.getCustomDiagID(DiagnosticsEngine::Note, "Related code clone is here."); - for (CloneDetector::CloneGroup &Group : CloneGroups) { + for (const CloneDetector::CloneGroup &Group : Clones) { // We group the clones by printing the first as a warning and all others // as a note. DiagEngine.Report(Group.Sequences.front().getStartLoc(), WarnID); @@ -62,11 +60,37 @@ /// \brief Reports only suspicious clones to the user along with informaton /// that explain why they are suspicious. - void reportSuspiciousClones(SourceManager &SM, AnalysisManager &Mgr, - int MinComplexity) const { - - std::vector Clones; - CloneDetector.findSuspiciousClones(Clones, MinComplexity); + void reportSuspiciousClones( + SourceManager &SM, AnalysisManager &Mgr, + const std::vector &Clones) const { + + std::vector Pairs; + + for (const CloneDetector::CloneGroup &Group : Clones) { + for (unsigned i = 0; i < Group.Sequences.size(); ++i) { + VariablePattern PatternA(Group.Sequences[i]); + + for (unsigned j = i + 1; j < Group.Sequences.size(); ++j) { + VariablePattern PatternB(Group.Sequences[j]); + + VariablePattern::SuspiciousClonePair ClonePair; + // For now, we only report clones which break the variable pattern + // just + // once because multiple differences in a pattern are an indicator + // that + // those differences are maybe intended (e.g. because it's actually + // a different algorithm). + // TODO: In very big clones even multiple variables can be unintended, + // so replacing this number with a percentage could better handle such + // cases. On the other hand it could increase the false-positive rate + // for all clones if the percentage is too high. + if (PatternA.getPatternDifferences(PatternB, &ClonePair) == 1) { + Pairs.push_back(ClonePair); + break; + } + } + } + } DiagnosticsEngine &DiagEngine = Mgr.getDiagnostic(); @@ -81,7 +105,7 @@ DiagnosticsEngine::Note, "Or maybe you wanted to use %0 here in this " "similar piece of code?"); - for (CloneDetector::SuspiciousClonePair &Pair : Clones) { + for (VariablePattern::SuspiciousClonePair &Pair : Pairs) { // The first clone always has a suggestion and we report it to the user // along with the place where the suggestion should be used. DiagEngine.Report(Pair.FirstClone.Location.getBegin(), WarnID) @@ -118,6 +142,8 @@ // At this point, every statement in the translation unit has been analyzed by // the CloneDetector. The only thing left to do is to report the found clones. + SourceManager &SM = BR.getSourceManager(); + int MinComplexity = Mgr.getAnalyzerOptions().getOptionAsInteger( "MinimumCloneComplexity", 10, this); assert(MinComplexity >= 0); @@ -128,11 +154,24 @@ bool ReportNormalClones = Mgr.getAnalyzerOptions().getBooleanOption( "ReportNormalClones", true, this); + std::vector AllCloneGroups; + CloneDetector::PassList Passes = { + new MinComplexityPass(MinComplexity), new MinGroupSizePass(2), + new VerifyCloneHashesPass(), new OnlyLargestClonePass()}; + CloneDetector.findClones(AllCloneGroups, Passes); + if (ReportSuspiciousClones) - reportSuspiciousClones(BR.getSourceManager(), Mgr, MinComplexity); + reportSuspiciousClones(SM, Mgr, AllCloneGroups); + + CloneDetector::PassList OnlyMatchingVariablePatterns = { + new MinGroupSizePass(2), new MatchingVariablePatternPass()}; + + std::vector CloneGroups; + CloneDetector::applyPasses(AllCloneGroups, CloneGroups, + OnlyMatchingVariablePatterns); if (ReportNormalClones) - reportClones(BR.getSourceManager(), Mgr, MinComplexity); + reportClones(SM, Mgr, CloneGroups); } //===----------------------------------------------------------------------===//