Index: include/clang/Analysis/CloneDetection.h =================================================================== --- include/clang/Analysis/CloneDetection.h +++ include/clang/Analysis/CloneDetection.h @@ -17,7 +17,6 @@ #include "clang/Basic/SourceLocation.h" #include "llvm/ADT/Hashing.h" -#include "llvm/ADT/StringMap.h" #include @@ -158,6 +157,7 @@ /// (e.g. function bodies). Other clones (e.g. cloned comments or declarations) /// are not supported. class CloneDetector { + public: typedef unsigned DataPiece; @@ -201,8 +201,7 @@ }; /// Holds group of StmtSequences that are clones of each other and the - /// complexity value (see CloneSignature::Complexity) that all stored - /// StmtSequences have in common. + /// CloneSignature that all clones have in common. struct CloneGroup { std::vector Sequences; CloneSignature Signature; @@ -213,13 +212,6 @@ : Signature(Signature) { Sequences.push_back(Seq); } - - /// \brief Returns false if and only if this group should be skipped when - /// searching for clones. - bool isValid() const { - // A clone group with only one member makes no sense, so we skip them. - return Sequences.size() > 1; - } }; /// \brief Generates and stores search data for all statements in the body of @@ -229,17 +221,140 @@ /// \brief Stores the CloneSignature to allow future querying. void add(const StmtSequence &S, const CloneSignature &Signature); - /// \brief Searches the provided statements for clones. + /// \brief Represents a way to filter clones from a list of clone groups. /// - /// \param Result Output parameter that is filled with a list of found - /// clone groups. Each group contains multiple StmtSequences - /// that were identified to be clones of each other. - /// \param MinGroupComplexity Only return clones which have at least this - /// complexity value. - /// \param CheckPatterns Returns only clone groups in which the referenced - /// variables follow the same pattern. - void findClones(std::vector &Result, unsigned MinGroupComplexity, - bool CheckPatterns = true); + /// A reoccuring task in the CloneDetector is filtering out clones that don't + /// have certain properties. This filtering is done by reusable constraints + /// of which each removes clones with a undesireable property. + /// Usually, constraints are grouped together in a ConstraintList and are + /// applied one after another by the CloneDetector until all undesireable + /// clones are removed. + /// + /// To create a new constraint, inherit from this class and overwrite the + /// respective virtual methods. + class Constraint { + public: + virtual ~Constraint() {} + + /// \brief Allows skipping the processing of a given hash group. + /// \param HashGroup The given hash group. + /// \return Return true if the CloneDetector should start creating actual + /// clone groups from the given hash group. + virtual bool acceptsHashGroup(const CloneGroup &HashGroup); + + /// \brief Allows skipping the creation of clone groups from a given + /// prototype StmtSequence. + /// \param Prototype The given prototype StmtSequence. + /// \param PrototypeGroup The clone group to which the Prototype belongs. + /// \return Return true if the CloneDetector should search for candidates + /// that will fit into the given prototype group. + virtual bool acceptsPrototype(const StmtSequence &Prototype, + const CloneGroup &PrototypeGroup); + + /// \brief Decides if a StmtSequence belongs into a certain clone group. + /// \param Prototype The prototype that + /// \param Candidate + /// \return Return true if the candidate fits to the prototype. + virtual bool acceptsCandidate(const StmtSequence &Prototype, + const StmtSequence &Candidate); + + /// \brief Decides if a given clone group should be added to the result. + /// \param Group The given clone group. + /// \return Return true if the given group should be included in the result. + /// + /// This function should be used to enforce context-sensitive constraints + /// for clones (i.e. where the constraint needs to access other clones in + /// the current clone group). + virtual bool acceptsGroup(const CloneGroup &Group); + + /// \brief Can process and modify the clone groups before they are returned. + /// \param Result The list of clone groups. + /// + /// This function should be used to enforce context-sensitive constraints + /// for clone groups (i.e. where the constraint needs to access other clone + /// groups). + virtual void constrainResult(std::vector &Result); + }; + + /// \brief Stores a list of constraints. + /// + /// This class also takes over the memory ownership of all given constraints. + class ConstraintList { + std::vector Constraints; + + public: + ConstraintList(std::initializer_list IL) + : Constraints(IL.begin(), IL.end()) {} + ~ConstraintList() { + for (Constraint *P : Constraints) { + delete P; + } + } + + // Non-copyable because we free all passes in the destructor. + ConstraintList(const ConstraintList &other) = delete; + ConstraintList &operator=(const ConstraintList &) = delete; + + std::vector::iterator begin() { return Constraints.begin(); } + std::vector::iterator end() { return Constraints.end(); } + }; + + /// \brief Generates hash groups and applies the given passes to them. + /// \param Result Output parameter to which all created clone groups are + /// added. + /// \param Passes The passes that should be used to filter the hash groups. + void findClones(std::vector &Result, ConstraintList &Passes); + + /// \brief Applies the given passes on a list of clone groups. + /// \param CloneGroups The list of clone groups. + /// \param Result Output parameter to which all created clone groups are + /// added. + /// \param Passes The passes that should be applied. + static void constrainClones(const std::vector &CloneGroups, + std::vector &Result, + ConstraintList &Passes); + +private: + /// Stores all encountered StmtSequences alongside their CloneSignature. + std::vector> Sequences; +}; + +/// \brief Analyzes the pattern of the referenced variables in a statement. +class VariablePattern { + + /// \brief Describes an occurence of a variable reference in a statement. + struct VariableOccurence { + /// The index of the associated VarDecl in the Variables vector. + size_t KindID; + /// The location in the code where the variable was references. + SourceRange Location; + + VariableOccurence(size_t KindID, SourceRange Location) + : KindID(KindID), Location(Location) {} + }; + + /// All occurences of referenced variables in the order of appearance. + std::vector Occurences; + /// List of referenced variables in the order of appearance. + /// Every item in this list is unique. + std::vector Variables; + + /// \brief Adds a new variable referenced to this pattern. + /// \param VarDecl The declaration of the variable that is referenced. + /// \param Location The SourceRange where this variable is referenced. + void addVariableOccurence(const VarDecl *VarDecl, SourceRange Location); + + /// \brief Adds each referenced variable from the given statement. + void addVariables(const Stmt *S); + +public: + /// \brief Creates an VariablePattern object with information about the given + /// StmtSequence. + VariablePattern(const StmtSequence &Sequence) { + for (const Stmt *S : Sequence) + addVariables(S); + } + VariablePattern() {} /// \brief Describes two clones that reference their variables in a different /// pattern which could indicate a programming error. @@ -266,17 +381,111 @@ SuspiciousCloneInfo SecondCloneInfo; }; - /// \brief Searches the provided statements for pairs of clones that don't - /// follow the same pattern when referencing variables. - /// \param Result Output parameter that will contain the clone pairs. - /// \param MinGroupComplexity Only clone pairs in which the clones have at - /// least this complexity value. - void findSuspiciousClones(std::vector &Result, - unsigned MinGroupComplexity); + /// \brief Compares this pattern with the given one. + /// \param Other The given VariablePattern to compare with. + /// \param FirstMismatch Output parameter that will be filled with information + /// about the first difference between the two patterns. This parameter + /// can be a nullptr, in which case it will be ignored. + /// \return Returns the number of differences between the pattern this object + /// is following and the given VariablePattern. + /// + /// For example, the following statements all have the same pattern and this + /// function would return zero: + /// + /// if (a < b) return a; return b; + /// if (x < y) return x; return y; + /// if (u2 < u1) return u2; return u1; + /// + /// But the following statement has a different pattern (note the changed + /// variables in the return statements) and would have two differences when + /// compared with one of the statements above. + /// + /// if (a < b) return b; return a; + /// + /// This function should only be called if the related statements of the given + /// pattern and the statements of this objects are clones of each other. + unsigned getPatternDifferences(const VariablePattern &Other, + SuspiciousClonePair *FirstMismatch = nullptr); +}; + +/// \brief Filters out all clones that don't reference variables in the same +/// pattern. +class MatchingVariablePatternConstraint : public CloneDetector::Constraint { + VariablePattern PrototypePattern; -private: - /// Stores all encountered StmtSequences alongside their CloneSignature. - std::vector> Sequences; +public: + virtual ~MatchingVariablePatternConstraint() {} + + virtual bool + acceptsPrototype(const StmtSequence &Prototype, + const CloneDetector::CloneGroup &PrototypeGroup) override { + PrototypePattern = VariablePattern(Prototype); + return true; + } + + virtual bool acceptsCandidate(const StmtSequence &Prototype, + const StmtSequence &Candidate) override { + VariablePattern CandidatePattern(Candidate); + return PrototypePattern.getPatternDifferences(CandidatePattern) == 0; + } +}; + +/// \brief Filters all clone groups that don't have at least the given size. +class MinGroupSizeConstraint : public CloneDetector::Constraint { + unsigned MinGroupSize; + +public: + virtual ~MinGroupSizeConstraint() {} + MinGroupSizeConstraint(unsigned MinGroupSize = 2) + : MinGroupSize(MinGroupSize) {} + + virtual bool + acceptsHashGroup(const CloneDetector::CloneGroup &HashGroup) override { + return HashGroup.Sequences.size() >= MinGroupSize; + } + + virtual bool + acceptsGroup(const CloneDetector::CloneGroup &PrototypeGroup) override { + return PrototypeGroup.Sequences.size() >= MinGroupSize; + } +}; + +/// \brief Filters all clones that don't have at least the given complexity +/// value. +class MinComplexityConstraint : public CloneDetector::Constraint { + unsigned MinComplexity; + +public: + virtual ~MinComplexityConstraint() {} + MinComplexityConstraint(unsigned MinComplexity = 10) + : MinComplexity(MinComplexity) {} + + virtual bool + acceptsPrototype(const StmtSequence &Prototype, + const CloneDetector::CloneGroup &PrototypeGroup) override { + return PrototypeGroup.Signature.Complexity >= MinComplexity; + } +}; + +/// \brief Filters all presumed clones from a hash group that aren't actually +/// clones but landed in the hash group due to unintended hash code +/// collisions. +class VerifyCloneHashesConstraint : public CloneDetector::Constraint { +public: + virtual ~VerifyCloneHashesConstraint() {} + + virtual bool acceptsCandidate(const StmtSequence &Prototype, + const StmtSequence &Candidate) override; +}; + +/// \brief This constraint removes all clone groups which are a full subset of +/// another found clone group. +class OnlyLargestCloneConstraint : public CloneDetector::Constraint { +public: + virtual ~OnlyLargestCloneConstraint() {} + + virtual void + constrainResult(std::vector &Result) override; }; } // end namespace clang Index: lib/Analysis/CloneDetection.cpp =================================================================== --- lib/Analysis/CloneDetection.cpp +++ lib/Analysis/CloneDetection.cpp @@ -82,166 +82,6 @@ SourceLocation StmtSequence::getEndLoc() const { return back()->getLocEnd(); } -namespace { - -/// \brief Analyzes the pattern of the referenced variables in a statement. -class VariablePattern { - - /// \brief Describes an occurence of a variable reference in a statement. - struct VariableOccurence { - /// The index of the associated VarDecl in the Variables vector. - size_t KindID; - /// The source range in the code where the variable was referenced. - SourceRange Range; - - VariableOccurence(size_t KindID, SourceRange Range) - : KindID(KindID), Range(Range) {} - }; - - /// All occurences of referenced variables in the order of appearance. - std::vector Occurences; - /// List of referenced variables in the order of appearance. - /// Every item in this list is unique. - std::vector Variables; - - /// \brief Adds a new variable referenced to this pattern. - /// \param VarDecl The declaration of the variable that is referenced. - /// \param Range The SourceRange where this variable is referenced. - void addVariableOccurence(const VarDecl *VarDecl, SourceRange Range) { - // First check if we already reference this variable - for (size_t KindIndex = 0; KindIndex < Variables.size(); ++KindIndex) { - if (Variables[KindIndex] == VarDecl) { - // If yes, add a new occurence that points to the existing entry in - // the Variables vector. - Occurences.emplace_back(KindIndex, Range); - return; - } - } - // If this variable wasn't already referenced, add it to the list of - // referenced variables and add a occurence that points to this new entry. - Occurences.emplace_back(Variables.size(), Range); - Variables.push_back(VarDecl); - } - - /// \brief Adds each referenced variable from the given statement. - void addVariables(const Stmt *S) { - // Sometimes we get a nullptr (such as from IfStmts which often have nullptr - // children). We skip such statements as they don't reference any - // variables. - if (!S) - return; - - // Check if S is a reference to a variable. If yes, add it to the pattern. - if (auto D = dyn_cast(S)) { - if (auto VD = dyn_cast(D->getDecl()->getCanonicalDecl())) - addVariableOccurence(VD, D->getSourceRange()); - } - - // Recursively check all children of the given statement. - for (const Stmt *Child : S->children()) { - addVariables(Child); - } - } - -public: - /// \brief Creates an VariablePattern object with information about the given - /// StmtSequence. - VariablePattern(const StmtSequence &Sequence) { - for (const Stmt *S : Sequence) - addVariables(S); - } - - /// \brief Counts the differences between this pattern and the given one. - /// \param Other The given VariablePattern to compare with. - /// \param FirstMismatch Output parameter that will be filled with information - /// about the first difference between the two patterns. This parameter - /// can be a nullptr, in which case it will be ignored. - /// \return Returns the number of differences between the pattern this object - /// is following and the given VariablePattern. - /// - /// For example, the following statements all have the same pattern and this - /// function would return zero: - /// - /// if (a < b) return a; return b; - /// if (x < y) return x; return y; - /// if (u2 < u1) return u2; return u1; - /// - /// But the following statement has a different pattern (note the changed - /// variables in the return statements) and would have two differences when - /// compared with one of the statements above. - /// - /// if (a < b) return b; return a; - /// - /// This function should only be called if the related statements of the given - /// pattern and the statements of this objects are clones of each other. - unsigned countPatternDifferences( - const VariablePattern &Other, - CloneDetector::SuspiciousClonePair *FirstMismatch = nullptr) { - unsigned NumberOfDifferences = 0; - - assert(Other.Occurences.size() == Occurences.size()); - for (unsigned i = 0; i < Occurences.size(); ++i) { - auto ThisOccurence = Occurences[i]; - auto OtherOccurence = Other.Occurences[i]; - if (ThisOccurence.KindID == OtherOccurence.KindID) - continue; - - ++NumberOfDifferences; - - // If FirstMismatch is not a nullptr, we need to store information about - // the first difference between the two patterns. - if (FirstMismatch == nullptr) - continue; - - // Only proceed if we just found the first difference as we only store - // information about the first difference. - if (NumberOfDifferences != 1) - continue; - - const VarDecl *FirstSuggestion = nullptr; - // If there is a variable available in the list of referenced variables - // which wouldn't break the pattern if it is used in place of the - // current variable, we provide this variable as the suggested fix. - if (OtherOccurence.KindID < Variables.size()) - FirstSuggestion = Variables[OtherOccurence.KindID]; - - // Store information about the first clone. - FirstMismatch->FirstCloneInfo = - CloneDetector::SuspiciousClonePair::SuspiciousCloneInfo( - Variables[ThisOccurence.KindID], ThisOccurence.Range, - FirstSuggestion); - - // Same as above but with the other clone. We do this for both clones as - // we don't know which clone is the one containing the unintended - // pattern error. - const VarDecl *SecondSuggestion = nullptr; - if (ThisOccurence.KindID < Other.Variables.size()) - SecondSuggestion = Other.Variables[ThisOccurence.KindID]; - - // Store information about the second clone. - FirstMismatch->SecondCloneInfo = - CloneDetector::SuspiciousClonePair::SuspiciousCloneInfo( - Variables[ThisOccurence.KindID], OtherOccurence.Range, - SecondSuggestion); - - // SuspiciousClonePair guarantees that the first clone always has a - // suggested variable associated with it. As we know that one of the two - // clones in the pair always has suggestion, we swap the two clones - // in case the first clone has no suggested variable which means that - // the second clone has a suggested variable and should be first. - if (!FirstMismatch->FirstCloneInfo.Suggestion) - std::swap(FirstMismatch->FirstCloneInfo, - FirstMismatch->SecondCloneInfo); - - // This ensures that we always have at least one suggestion in a pair. - assert(FirstMismatch->FirstCloneInfo.Suggestion); - } - - return NumberOfDifferences; - } -}; -} - /// \brief Prints the macro name that contains the given SourceLocation into /// the given raw_string_ostream. static void printMacroName(llvm::raw_string_ostream &MacroStack, @@ -584,6 +424,76 @@ Sequences.push_back(std::make_pair(Signature, S)); } +void CloneDetector::constrainClones( + const std::vector &CloneGroups, + std::vector &Result, + ConstraintList &Constraints) { + for (auto &HashGroup : CloneGroups) { + // Contains all indexes in HashGroup that were already added to a + // CloneGroup. + std::vector Indexes; + Indexes.resize(HashGroup.Sequences.size()); + + for (unsigned i = 0; i < HashGroup.Sequences.size(); ++i) { + // Skip indexes that are already part of a CloneGroup. + if (Indexes[i]) + continue; + + // Pick the first unhandled StmtSequence and consider it as the beginning + // of a new CloneGroup for now. + // We don't add i to Indexes because we never iterate back. + StmtSequence Prototype = HashGroup.Sequences[i]; + CloneDetector::CloneGroup PotentialGroup(Prototype, HashGroup.Signature); + ++Indexes[i]; + + if (!std::all_of(Constraints.begin(), Constraints.end(), + [&Prototype, &PotentialGroup](Constraint *Constraint) { + return Constraint->acceptsPrototype(Prototype, + PotentialGroup); + })) + continue; + + // Check all following StmtSequences for clones. + for (unsigned j = i + 1; j < HashGroup.Sequences.size(); ++j) { + // Skip indexes that are already part of a CloneGroup. + if (Indexes[j]) + continue; + + // If a following StmtSequence belongs to our CloneGroup, we add it to + // it. + const StmtSequence &Candidate = HashGroup.Sequences[j]; + + if (!std::all_of(Constraints.begin(), Constraints.end(), + [&Prototype, &Candidate](Constraint *Constraint) { + return Constraint->acceptsCandidate(Prototype, + Candidate); + })) + continue; + + PotentialGroup.Sequences.push_back(Candidate); + // Make sure we never visit this StmtSequence again. + ++Indexes[j]; + } + + if (!std::all_of(Constraints.begin(), Constraints.end(), + [&PotentialGroup](Constraint *Constraint) { + return Constraint->acceptsGroup(PotentialGroup); + })) + continue; + + // Otherwise, add it to the result and continue searching for more groups. + Result.push_back(PotentialGroup); + } + + assert(std::all_of(Indexes.begin(), Indexes.end(), + [](char c) { return c == 1; })); + } + + for (Constraint *P : Constraints) { + P->constrainResult(Result); + } +} + namespace { /// \brief Returns true if and only if \p Stmt contains at least one other /// sequence in the \p Group. @@ -614,6 +524,66 @@ } } // end anonymous namespace +void CloneDetector::findClones(std::vector &Result, + ConstraintList &Constraints) { + // Shortcut and necessary for the for-loop later in this function. + if (Sequences.empty()) + return; + + std::stable_sort(Sequences.begin(), Sequences.end(), + [](std::pair LHS, + std::pair RHS) { + return LHS.first.Hash < RHS.first.Hash; + }); + + std::vector CloneGroups; + + // Check for each CloneSignature if its successor has the same hash value. + // We don't check the last CloneSignature as it has no successor. + // Note: The 'size - 1 ' in the condition is safe because we check for an + // empty Sequences vector at the beginning of this function. + for (unsigned i = 0; i < Sequences.size() - 1; ++i) { + const auto Current = Sequences[i]; + const auto Next = Sequences[i + 1]; + + if (Current.first.Hash != Next.first.Hash) + continue; + + // It's likely that we just found an sequence of CloneSignatures that + // represent a CloneGroup, so we create a new group and start checking and + // adding the CloneSignatures in this sequence. + CloneGroup Group; + Group.Signature = Current.first; + + for (; i < Sequences.size(); ++i) { + const auto &Signature = Sequences[i]; + + // A different hash value means we have reached the end of the sequence. + if (Current.first.Hash != Signature.first.Hash) { + // The current Signature could be the start of a new CloneGroup. So we + // decrement i so that we visit it again in the outer loop. + // Note: i can never be 0 at this point because we are just comparing + // the hash of the Current CloneSignature with itself in the 'if' above. + assert(i != 0); + --i; + break; + } + + Group.Sequences.push_back(Signature.second); + } + + if (!std::all_of(Constraints.begin(), Constraints.end(), + [&Group](Constraint *Constraint) { + return Constraint->acceptsHashGroup(Group); + })) + continue; + + CloneGroups.push_back(Group); + } + + constrainClones(CloneGroups, Result, Constraints); +} + namespace { /// \brief Wrapper around FoldingSetNodeID that it can be used as the template /// argument of the StmtDataCollector. @@ -663,145 +633,129 @@ return DataLHS == DataRHS; } -/// \brief Finds all actual clone groups in a single group of presumed clones. -/// \param Result Output parameter to which all found groups are added. -/// \param Group A group of presumed clones. The clones are allowed to have a -/// different variable pattern and may not be actual clones of each -/// other. -/// \param CheckVariablePatterns If true, every clone in a group that was added -/// to the output follows the same variable pattern as the other -/// clones in its group. -static void createCloneGroups(std::vector &Result, - const CloneDetector::CloneGroup &Group, - bool CheckVariablePattern) { - // We remove the Sequences one by one, so a list is more appropriate. - std::list UnassignedSequences(Group.Sequences.begin(), - Group.Sequences.end()); - - // Search for clones as long as there could be clones in UnassignedSequences. - while (UnassignedSequences.size() > 1) { - - // Pick the first Sequence as a protoype for a new clone group. - StmtSequence Prototype = UnassignedSequences.front(); - UnassignedSequences.pop_front(); - - CloneDetector::CloneGroup FilteredGroup(Prototype, Group.Signature); - - // Analyze the variable pattern of the prototype. Every other StmtSequence - // needs to have the same pattern to get into the new clone group. - VariablePattern PrototypeFeatures(Prototype); - - // Search all remaining StmtSequences for an identical variable pattern - // and assign them to our new clone group. - auto I = UnassignedSequences.begin(), E = UnassignedSequences.end(); - while (I != E) { - // If the sequence doesn't fit to the prototype, we have encountered - // an unintended hash code collision and we skip it. - if (!areSequencesClones(Prototype, *I)) { - ++I; - continue; - } - - // If we weren't asked to check for a matching variable pattern in clone - // groups we can add the sequence now to the new clone group. - // If we were asked to check for matching variable pattern, we first have - // to check that there are no differences between the two patterns and - // only proceed if they match. - if (!CheckVariablePattern || - VariablePattern(*I).countPatternDifferences(PrototypeFeatures) == 0) { - FilteredGroup.Sequences.push_back(*I); - I = UnassignedSequences.erase(I); - continue; - } - - // We didn't found a matching variable pattern, so we continue with the - // next sequence. - ++I; +void VariablePattern::addVariableOccurence(const VarDecl *VarDecl, + SourceRange Location) { + // First check if we already reference this variable + for (size_t KindIndex = 0; KindIndex < Variables.size(); ++KindIndex) { + if (Variables[KindIndex] == VarDecl) { + // If yes, add a new occurence that points to the existing entry in + // the Variables vector. + Occurences.emplace_back(KindIndex, Location); + return; } - - // Add a valid clone group to the list of found clone groups. - if (!FilteredGroup.isValid()) - continue; - - Result.push_back(FilteredGroup); } + // If this variable wasn't already referenced, add it to the list of + // referenced variables and add a occurence that points to this new entry. + Occurences.emplace_back(Variables.size(), Location); + Variables.push_back(VarDecl); } -void CloneDetector::findClones(std::vector &Result, - unsigned MinGroupComplexity, - bool CheckPatterns) { - // A shortcut (and necessary for the for-loop later in this function). - if (Sequences.empty()) +void VariablePattern::addVariables(const Stmt *S) { + // Sometimes we get a nullptr (such as from IfStmts which often have nullptr + // children). We skip such statements as they don't reference any + // variables. + if (!S) return; - // We need to search for groups of StmtSequences with the same hash code to - // create our initial clone groups. By sorting all known StmtSequences by - // their hash value we make sure that StmtSequences with the same hash code - // are grouped together in the Sequences vector. - // Note: We stable sort here because the StmtSequences are added in the order - // in which they appear in the source file. We want to preserve that order - // because we also want to report them in that order in the CloneChecker. - std::stable_sort(Sequences.begin(), Sequences.end(), - [](std::pair LHS, - std::pair RHS) { - return LHS.first.Hash < RHS.first.Hash; - }); + // Check if S is a reference to a variable. If yes, add it to the pattern. + if (auto D = dyn_cast(S)) { + if (auto VD = dyn_cast(D->getDecl()->getCanonicalDecl())) + addVariableOccurence(VD, D->getSourceRange()); + } + // Recursively check all children of the given statement. + for (const Stmt *Child : S->children()) { + addVariables(Child); + } +} - std::vector CloneGroups; +unsigned +VariablePattern::getPatternDifferences(const VariablePattern &Other, + SuspiciousClonePair *FirstMismatch) { + unsigned NumberOfDifferences = 0; - // Check for each CloneSignature if its successor has the same hash value. - // We don't check the last CloneSignature as it has no successor. - // Note: The 'size - 1' in the condition is safe because we check for an empty - // Sequences vector at the beginning of this function. - for (unsigned i = 0; i < Sequences.size() - 1; ++i) { - const auto Current = Sequences[i]; - const auto Next = Sequences[i + 1]; + assert(Other.Occurences.size() == Occurences.size()); + for (unsigned i = 0; i < Occurences.size(); ++i) { + auto ThisOccurence = Occurences[i]; + auto OtherOccurence = Other.Occurences[i]; + if (ThisOccurence.KindID != OtherOccurence.KindID) { + ++NumberOfDifferences; - if (Current.first.Hash != Next.first.Hash) - continue; + // If FirstMismatch is not a nullptr, we need to store information about + // the first difference between the two patterns. + if (FirstMismatch == nullptr) + continue; + const VarDecl *FirstSuggestion = nullptr; + // If there is a variable available in the list of referenced variables + // which wouldn't break the pattern if it is used in place of the + // current variable, we provide this variable as the suggested fix. + if (OtherOccurence.KindID < Variables.size()) + FirstSuggestion = Variables[OtherOccurence.KindID]; - // It's likely that we just found an sequence of CloneSignatures that - // represent a CloneGroup, so we create a new group and start checking and - // adding the CloneSignatures in this sequence. - CloneGroup Group; - Group.Signature = Current.first; + // Store information about the first clone. + FirstMismatch->FirstCloneInfo = SuspiciousClonePair::SuspiciousCloneInfo( + Variables[ThisOccurence.KindID], ThisOccurence.Location, + FirstSuggestion); - for (; i < Sequences.size(); ++i) { - const auto &Signature = Sequences[i]; + // Same as above but with the other clone. We do this for both clones as + // we don't know which clone is the one containing the unintended + // pattern error. + const VarDecl *SecondSuggestion = nullptr; + if (ThisOccurence.KindID < Other.Variables.size()) + SecondSuggestion = Other.Variables[ThisOccurence.KindID]; - // A different hash value means we have reached the end of the sequence. - if (Current.first.Hash != Signature.first.Hash) { - // The current Signature could be the start of a new CloneGroup. So we - // decrement i so that we visit it again in the outer loop. - // Note: i can never be 0 at this point because we are just comparing - // the hash of the Current CloneSignature with itself in the 'if' above. - assert(i != 0); - --i; - break; - } + // Store information about the second clone. + FirstMismatch->SecondCloneInfo = SuspiciousClonePair::SuspiciousCloneInfo( + Variables[ThisOccurence.KindID], OtherOccurence.Location, + SecondSuggestion); - // Skip CloneSignatures that won't pass the complexity requirement. - if (Signature.first.Complexity < MinGroupComplexity) - continue; + // SuspiciousClonePair guarantees that the first clone always has a + // suggested variable associated with it. As we know that one of the two + // clones in the pair always has suggestion, we swap the two clones + // in case the first clone has no suggested variable which means that + // the second clone has a suggested variable and should be first. + if (!FirstMismatch->FirstCloneInfo.Suggestion) + std::swap(FirstMismatch->FirstCloneInfo, + FirstMismatch->SecondCloneInfo); - Group.Sequences.push_back(Signature.second); + // This ensures that we always have at least one suggestion in a pair. + assert(FirstMismatch->FirstCloneInfo.Suggestion); } + } - // There is a chance that we haven't found more than two fitting - // CloneSignature because not enough CloneSignatures passed the complexity - // requirement. As a CloneGroup with less than two members makes no sense, - // we ignore this CloneGroup and won't add it to the result. - if (!Group.isValid()) - continue; + return NumberOfDifferences; +} - CloneGroups.push_back(Group); - } +bool CloneDetector::Constraint::acceptsHashGroup( + const CloneDetector::CloneGroup &HashGroup) { + return true; +} - // Add every valid clone group that fulfills the complexity requirement. - for (const CloneGroup &Group : CloneGroups) { - createCloneGroups(Result, Group, CheckPatterns); - } +bool CloneDetector::Constraint::acceptsPrototype( + const StmtSequence &Prototype, + const CloneDetector::CloneGroup &PrototypeGroup) { + return true; +} +bool CloneDetector::Constraint::acceptsCandidate( + const StmtSequence &Prototype, const StmtSequence &Candidate) { + return true; +} + +bool CloneDetector::Constraint::acceptsGroup( + const CloneDetector::CloneGroup &Group) { + return true; +} + +void CloneDetector::Constraint::constrainResult( + std::vector &Result) {} + +bool VerifyCloneHashesConstraint::acceptsCandidate( + const StmtSequence &Prototype, const StmtSequence &Candidate) { + return areSequencesClones(Prototype, Candidate); +} + +void OnlyLargestCloneConstraint::constrainResult( + std::vector &Result) { std::vector IndexesToRemove; // Compare every group in the result with the rest. If one groups contains @@ -828,37 +782,3 @@ Result.erase(Result.begin() + *I); } } - -void CloneDetector::findSuspiciousClones( - std::vector &Result, - unsigned MinGroupComplexity) { - std::vector Clones; - // Reuse the normal search for clones but specify that the clone groups don't - // need to have a common referenced variable pattern so that we can manually - // search for the kind of pattern errors this function is supposed to find. - findClones(Clones, MinGroupComplexity, false); - - for (const CloneGroup &Group : Clones) { - for (unsigned i = 0; i < Group.Sequences.size(); ++i) { - VariablePattern PatternA(Group.Sequences[i]); - - for (unsigned j = i + 1; j < Group.Sequences.size(); ++j) { - VariablePattern PatternB(Group.Sequences[j]); - - CloneDetector::SuspiciousClonePair ClonePair; - // For now, we only report clones which break the variable pattern just - // once because multiple differences in a pattern are an indicator that - // those differences are maybe intended (e.g. because it's actually - // a different algorithm). - // TODO: In very big clones even multiple variables can be unintended, - // so replacing this number with a percentage could better handle such - // cases. On the other hand it could increase the false-positive rate - // for all clones if the percentage is too high. - if (PatternA.countPatternDifferences(PatternB, &ClonePair) == 1) { - Result.push_back(ClonePair); - break; - } - } - } - } -} Index: lib/StaticAnalyzer/Checkers/CloneChecker.cpp =================================================================== --- lib/StaticAnalyzer/Checkers/CloneChecker.cpp +++ lib/StaticAnalyzer/Checkers/CloneChecker.cpp @@ -37,12 +37,13 @@ /// \brief Reports all clones to the user. void reportClones(SourceManager &SM, AnalysisManager &Mgr, - int MinComplexity) const; + const std::vector &Clones) const; /// \brief Reports only suspicious clones to the user along with informaton /// that explain why they are suspicious. - void reportSuspiciousClones(SourceManager &SM, AnalysisManager &Mgr, - int MinComplexity) const; + void reportSuspiciousClones( + SourceManager &SM, AnalysisManager &Mgr, + const std::vector &Clones) const; }; } // end anonymous namespace @@ -59,6 +60,8 @@ // At this point, every statement in the translation unit has been analyzed by // the CloneDetector. The only thing left to do is to report the found clones. + SourceManager &SM = BR.getSourceManager(); + int MinComplexity = Mgr.getAnalyzerOptions().getOptionAsInteger( "MinimumCloneComplexity", 10, this); assert(MinComplexity >= 0); @@ -69,18 +72,50 @@ bool ReportNormalClones = Mgr.getAnalyzerOptions().getBooleanOption( "ReportNormalClones", true, this); - if (ReportSuspiciousClones) - reportSuspiciousClones(BR.getSourceManager(), Mgr, MinComplexity); + // Let the CloneDetector create a list of clones from all the analyzed + // statements. We don't filter for matching variable patterns at this point + // because reportSuspiciousClones() wants to search them for errors. + std::vector AllCloneGroups; + + CloneDetector::ConstraintList GenericConstraints = { + // Remove all clones that are below the user-specified complexity value. + new MinComplexityConstraint(MinComplexity), + // We are ok with at least two clones in a group. + new MinGroupSizeConstraint(2), + // Remove all false-positive clones due to unintended collisions in our + // hash function. + new VerifyCloneHashesConstraint(), + // If one clone group contains another clone group, only report the bigger + // one. It's obvious to the user that if the child statements of a clone + // are also clones. + new OnlyLargestCloneConstraint()}; + + Detector.findClones(AllCloneGroups, GenericConstraints); - if (ReportNormalClones) - reportClones(BR.getSourceManager(), Mgr, MinComplexity); -} + if (ReportSuspiciousClones) + reportSuspiciousClones(SM, Mgr, AllCloneGroups); -void CloneChecker::reportClones(SourceManager &SM, AnalysisManager &Mgr, - int MinComplexity) const { + // We are done for this translation unit unless we also need to report normal + // clones. + if (!ReportNormalClones) + return; + // Now that the suspicious clone detector has checked for pattern errors, + // we also filter all clones who don't have matching patterns std::vector CloneGroups; - Detector.findClones(CloneGroups, MinComplexity); + + CloneDetector::ConstraintList OnlyMatchingVariablePatterns = { + new MinGroupSizeConstraint(2), new MatchingVariablePatternConstraint()}; + + CloneDetector::constrainClones(AllCloneGroups, CloneGroups, + OnlyMatchingVariablePatterns); + + reportClones(SM, Mgr, CloneGroups); +} + +void CloneChecker::reportClones( + SourceManager &SM, AnalysisManager &Mgr, + const std::vector &Clones) const { DiagnosticsEngine &DiagEngine = Mgr.getDiagnostic(); @@ -90,7 +125,7 @@ unsigned NoteID = DiagEngine.getCustomDiagID(DiagnosticsEngine::Note, "Related code clone is here."); - for (CloneDetector::CloneGroup &Group : CloneGroups) { + for (const CloneDetector::CloneGroup &Group : Clones) { // We group the clones by printing the first as a warning and all others // as a note. DiagEngine.Report(Group.Sequences.front().getStartLoc(), WarnID); @@ -100,12 +135,35 @@ } } -void CloneChecker::reportSuspiciousClones(SourceManager &SM, - AnalysisManager &Mgr, - int MinComplexity) const { - - std::vector Clones; - Detector.findSuspiciousClones(Clones, MinComplexity); +void CloneChecker::reportSuspiciousClones( + SourceManager &SM, AnalysisManager &Mgr, + const std::vector &Clones) const { + + std::vector Pairs; + + for (const CloneDetector::CloneGroup &Group : Clones) { + for (unsigned i = 0; i < Group.Sequences.size(); ++i) { + VariablePattern PatternA(Group.Sequences[i]); + + for (unsigned j = i + 1; j < Group.Sequences.size(); ++j) { + VariablePattern PatternB(Group.Sequences[j]); + + VariablePattern::SuspiciousClonePair ClonePair; + // For now, we only report clones which break the variable pattern just + // once because multiple differences in a pattern are an indicator that + // those differences are maybe intended (e.g. because it's actually a + // different algorithm). + // TODO: In very big clones even multiple variables can be unintended, + // so replacing this number with a percentage could better handle such + // cases. On the other hand it could increase the false-positive rate + // for all clones if the percentage is too high. + if (PatternA.getPatternDifferences(PatternB, &ClonePair) == 1) { + Pairs.push_back(ClonePair); + break; + } + } + } + } DiagnosticsEngine &DiagEngine = Mgr.getDiagnostic(); @@ -122,7 +180,7 @@ "variable in a similar piece of code; did you " "mean to use %0?"); - for (CloneDetector::SuspiciousClonePair &Pair : Clones) { + for (VariablePattern::SuspiciousClonePair &Pair : Pairs) { // The first clone always has a suggestion and we report it to the user // along with the place where the suggestion should be used. DiagEngine.Report(Pair.FirstCloneInfo.VarRange.getBegin(),