Index: include/clang/AST/CloneDetection.h =================================================================== --- /dev/null +++ include/clang/AST/CloneDetection.h @@ -0,0 +1,224 @@ +//===--- CloneDetection.h - Analyses the structure of an AST ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// /file +/// This file defines classes for searching and anlyzing source code clones. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_AST_CLONEDETECTION_H +#define LLVM_CLANG_AST_CLONEDETECTION_H + +#include "clang/Basic/SourceLocation.h" + +#include +#include + +namespace clang { + +class Stmt; +class ASTContext; +class CompoundStmt; +class TranslationUnitDecl; + +/// \brief Identifies a list of statements. +/// +/// Can either identify a single arbitrary Stmt object, a continuous sequence of +/// child statements inside a CompoundStmt or no statements at all. +class StmtSequence { + // If this object identifies a sequence of statements inside a CompoundStmt, + // S points to this CompoundStmt. + // If this object only identifies a single Stmt, then S is a pointer to this + // Stmt. + Stmt *S; + + // The related ASTContext for S. + ASTContext *Context; + + /// If EndIndex is non-zero, then S is a CompoundStmt and this StmtSequence + /// instance is representing the CompoundStmt children inside the array + /// [StartIndex, EndIndex). + unsigned StartIndex; + unsigned EndIndex; + +public: + /// \brief Constructs a StmtSequence holding multiple statements. + /// + /// The resulting StmtSequence identifies a continuous sequence of statements + /// in the body of the given CompoundStmt. + /// Which statements of the body should be identified needs to be specified + /// by providing a start and end index that describe a non-empty sub-array + /// in the body of the given CompoundStmt. + /// + /// \param Stmt A CompoundStmt that contains all statements in its body. + /// \param Context The ASTContext for the given CompoundStmt. + /// \param StartIndex the inclusive start index in the children array of + /// \p Stmt + /// \param EndIndex the exclusive end index in the children array of \p Stmt. + StmtSequence(CompoundStmt *Stmt, ASTContext *Context, unsigned StartIndex, + unsigned EndIndex); + + /// \brief Constructs a StmtSequence holding a single statement. + /// + /// \param Stmt An arbitrary Stmt. + /// \param Context The ASTContext for the given Stmt. + StmtSequence(Stmt *Stmt, ASTContext *Context); + + /// \brief Constructs an empty StmtSequence. + StmtSequence() : StmtSequence(nullptr, nullptr) {} + + /// Returns an iterator-style pointer to the first statement in this sequence. + Stmt *const *begin() const; + + /// Returns an iterator-style pointer that points behind the last statement + /// in this sequence. + Stmt *const *end() const; + + /// Returns the first statement in this sequence. + /// This method should only be called on a non-empty StmtSequence object. + Stmt *front() const { + assert(!empty()); + return begin()[0]; + } + + /// Returns the last statement in this sequence. + /// This method should only be called on a non-empty StmtSequence object. + Stmt *back() const { + assert(!empty()); + return begin()[size() - 1]; + } + + /// Returns the number of statements this object holds. + unsigned size() const { + if (holdsSequence()) + return EndIndex - StartIndex; + if (S == nullptr) + return 0; + return 1; + } + + /// Returns true if and only if this StmtSequence contains no statements. + bool empty() const { return size() == 0; } + + /// Returns the related ASTContext for the stored Stmts. + ASTContext &getASTContext() const { + assert(Context); + return *Context; + } + + /// Returns true if this objects holds a list of statements. + bool holdsSequence() const { return EndIndex != 0; } + + /// Returns the start sourcelocation of the first statement in this sequence. + /// This method should only be called on a non-empty StmtSequence object. + SourceLocation getStartLoc() const; + + /// Returns the end sourcelocation of the last statement in this sequence. + /// This method should only be called on a non-empty StmtSequence object. + SourceLocation getEndLoc() const; + + /// Provides an strict weak ordering operator. + bool operator<(const StmtSequence &Other) const { + return std::tie(S, StartIndex, EndIndex) < + std::tie(Other.S, Other.StartIndex, Other.EndIndex); + } + + /// Returns true if and only if this sequence covers a source range that + /// contains the source range of the given sequence. + /// + /// This method should only be called on a non-empty StmtSequence object + /// and passed a non-empty StmtSequence object. + bool contains(const StmtSequence &Other) const; +}; + +/// \brief Searches clones in source code. +/// +/// This class first needs to be provided with (possibly multiple) translation +/// units by passing them \p AnalyzeTranslationUnitDecl . +/// Afterwards all provided translation units can be searched for clones +/// by calling \p findClones . +/// +/// This class only searches for clones in exectuable source code +/// (e.g. function bodies). Other clones (e.g. cloned comments or declarations) +/// are not supported. +/// +/// CloneDetector generates search data for efficiently finding clones +/// in the given statements. +/// This is done by hashing all statements using a locality-sensitive hash +/// function that generates identical hash values for similar statement +/// trees. +class CloneDetector { +public: + /// Holds the generated data for a StmtSequence + struct StmtData { + /// The generated hash value. + unsigned Hash; + /// \brief The complexity of the StmtSequence. + /// + /// This scalar values serves as a simple way of filtering clones + /// that are too small to be reported. + /// A greater value indicates that the related StmtSequence is probably + /// more interesting to the user. + unsigned Complexity; + StmtData() : Hash(0), Complexity(0) {} + + StmtData(unsigned Hash, unsigned Complexity) + : Hash(Hash), Complexity(Complexity) {} + }; + + /// \brief Generates and stores search data for all statements in the + /// given translation unit declarations. + /// + /// This method can be called multiple times on the same CloneDetector + /// to search clones across translation unit boundaries. + void AnalyzeTranslationUnitDecl(TranslationUnitDecl *TU); + + /// \brief Returns the generated search data for the given StmtSequence. + /// + /// \param S the given StmtSequence + /// \param Error output parameter that is set to true if no + /// data was generated for the given StmtSequence. + /// This can happen if the given StmtSequence is not part of a + /// translation unit that was passed to \p AnalyzeTranslationUnitDecl + /// so far. + StmtData findHash(const StmtSequence &S, bool &Error) { + auto I = HashedStmts.find(S); + if (I == HashedStmts.end()) { + Error = true; + return StmtData(); + } + Error = false; + return I->second; + } + + /// \brief Stores the StmtSequence with its associated data in the search + /// database of this CloneDetector. + /// + /// StmtSequence objects added by this method are included in the clone + /// search. + void add(const StmtSequence &S, const StmtData &Data); + + typedef std::vector CloneGroup; + + /// \brief Searches all translation units in this CloneDetector for clones. + /// + /// \param MinGroupComplexity Only return clones which have at least this + /// complexity value. + /// \return a list of found clone groups. Each group contains multiple + /// StmtSequences that were identified to be clones of each other. + std::vector findClones(unsigned MinGroupComplexity = 10); + +private: + // Maps each StmtSequence to the search data that was generated for it. + std::map HashedStmts; +}; + +} // end namespace clang + +#endif // LLVM_CLANG_AST_CLONEDETECTION_H Index: include/clang/StaticAnalyzer/Checkers/Checkers.td =================================================================== --- include/clang/StaticAnalyzer/Checkers/Checkers.td +++ include/clang/StaticAnalyzer/Checkers/Checkers.td @@ -77,6 +77,8 @@ def LLVM : Package<"llvm">; def Debug : Package<"debug">; +def CloneDetection : Package<"clone">; + //===----------------------------------------------------------------------===// // Core Checkers. //===----------------------------------------------------------------------===// @@ -657,3 +659,17 @@ DescFile<"DebugCheckers.cpp">; } // end "debug" + + +//===----------------------------------------------------------------------===// +// Clone Detection +//===----------------------------------------------------------------------===// + +let ParentPackage = CloneDetection in { + +def CloneChecker : Checker<"CloneChecker">, + HelpText<"Reports similar pieces of code.">, + DescFile<"CloneChecker.cpp">; + +} // end "clone" + Index: lib/AST/CMakeLists.txt =================================================================== --- lib/AST/CMakeLists.txt +++ lib/AST/CMakeLists.txt @@ -9,6 +9,7 @@ ASTImporter.cpp ASTTypeTraits.cpp AttrImpl.cpp + CloneDetection.cpp CXXInheritance.cpp Comment.cpp CommentBriefParser.cpp Index: lib/AST/CloneDetection.cpp =================================================================== --- /dev/null +++ lib/AST/CloneDetection.cpp @@ -0,0 +1,310 @@ +//===--- CloneDetection.cpp - Analyses the structure of an AST --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// This file implements classes for searching and anlyzing source code clones. +/// +//===----------------------------------------------------------------------===// + +#include "clang/AST/CloneDetection.h" + +#include "clang/AST/ASTContext.h" +#include "clang/AST/RecursiveASTVisitor.h" +#include "clang/AST/Stmt.h" + +using namespace clang; + +StmtSequence::StmtSequence(CompoundStmt *Stmt, ASTContext *Context, + unsigned StartIndex, unsigned EndIndex) + : S(Stmt), Context(Context), StartIndex(StartIndex), EndIndex(EndIndex) { + assert(Stmt && Context && "Both Stmt and Context need to be valid pointers"); + assert(StartIndex < EndIndex && "Given array should not be empty"); + assert(EndIndex <= Stmt->size() && "Given array too big for this Stmt"); +} + +StmtSequence::StmtSequence(Stmt *Stmt, ASTContext *Context) + : S(Stmt), Context(Context), StartIndex(0), EndIndex(0) { + // Test against passing StmtSequence(nullptr, valid_ptr) or + // StmtSequence(valid_ptr, nullptr). + assert((!Stmt && !Context) || (Stmt && Context)); +} + +bool StmtSequence::contains(const StmtSequence &Other) const { + // If both sequences reside in different translation units, they can + // never contain each other. + if (Context != Other.Context) + return false; + + // Otherwise check if the start and end locations of the current sequence + // surround the other sequence. + bool StartIsInBounds = Context->getSourceManager().isBeforeInTranslationUnit( + getStartLoc(), Other.getStartLoc()) || + getStartLoc() == Other.getStartLoc(); + bool EndIsInBounds = Context->getSourceManager().isBeforeInTranslationUnit( + Other.getEndLoc(), getEndLoc()) || + Other.getEndLoc() == getEndLoc(); + return StartIsInBounds && EndIsInBounds; +} + +Stmt *const *StmtSequence::begin() const { + if (holdsSequence()) { + auto CS = static_cast(S); + return CS->body_begin() + StartIndex; + } + return &S; +} + +Stmt *const *StmtSequence::end() const { + if (holdsSequence()) { + auto CS = static_cast(S); + return CS->body_begin() + EndIndex; + } + return &S; +} + +SourceLocation StmtSequence::getStartLoc() const { + return front()->getLocStart(); +} + +SourceLocation StmtSequence::getEndLoc() const { return back()->getLocEnd(); } + +namespace { +/// Utility class for computing a hash value. +class HashValue { + unsigned Value = 0; + +public: + /// Computes the given data into the hash value. + void addData(unsigned Data) { + Value *= 53; + Value += Data; + } + + /// Returns the hash value. + unsigned getValue() { return Value; } +}; +} // end anonymous namespace + +namespace { +/// Traverses the AST and calculates hash values and complexity values +/// for all statements. The results are stored in a CloneDetector +/// object. +/// +/// This calculation happens in linear time and each statement is only +/// visited a fixed amount of times during this process. +/// The hash value for a statement is first calculated from the data stored +/// directly in the statement object. +/// Afterwards, the hash values of the children are calculated into the +/// computed hash value. +class HashVisitor : public RecursiveASTVisitor { + + static const unsigned NullChildHashValue = 313; + +public: + explicit HashVisitor(CloneDetector &CD, ASTContext &Context) + : CD(CD), Context(Context) {} + + // We need to traverse postorder over the AST for our algorithm + // to work as each parent expects that its children were already hashed. + bool shouldTraversePostOrder() const { return true; } + + bool VisitStmt(Stmt *S); + +private: + void HandleChildHashes(HashValue &Hash, unsigned &Complexity, + Stmt::child_iterator Iterator, unsigned StartPos, + unsigned Length) { + auto StartIterator = Iterator; + for (unsigned I = 0; I < StartPos; ++I) + ++StartIterator; + auto EndIterator = Iterator; + for (unsigned I = 0; I < Length; ++I) + ++EndIterator; + HandleChildHashes(Hash, Complexity, StartIterator, EndIterator); + } + + /// Iterates over the specified Stmt array and computes the hash + /// values of all statements into the given hash value. + /// Also, the complexity of all statements is added to the given + /// complexity value. + void HandleChildHashes(HashValue &Hash, unsigned &Complexity, + Stmt::child_iterator StartIterator, + Stmt::child_iterator EndIterator) { + // Iterate over each child in the sub-sequence. + for (auto I = StartIterator; I != EndIterator; ++I) { + Stmt *Child = *I; + if (Child == nullptr) { + ++Complexity; + // We use an placeholder hash value for null children. + Hash.addData(NullChildHashValue); + } else { + StmtSequence ChildSequence(StmtSequence(Child, &Context)); + + bool Error; + CloneDetector::StmtData Data = CD.findHash(ChildSequence, Error); + assert(!Error && "Child statement wasn't hashed before parent."); + + Complexity += Data.Complexity; + Hash.addData(Data.Hash); + } + } + } + + // Saves the current hash value into the persistent storage of this + // CloneDetector. + void SaveData(StmtSequence CurrentStmt, HashValue Hash, + unsigned Complexity); + + CloneDetector &CD; + ASTContext &Context; +}; +} // end anonymous namespace + +bool HashVisitor::VisitStmt(Stmt *S) { + StmtSequence CurrentStmt(S, &Context); + HashValue Hash; + unsigned Complexity = 1; + + // The only relevant data for now is the class of the statement, + // so we calculate the hash class into the current hash value. + // TODO: Handle statement class specific data. + Hash.addData(S->getStmtClass()); + + // Incorporate the hash values of all child Stmts into the current + // hash value. + HandleChildHashes(Hash, Complexity, S->child_begin(), S->child_end()); + + SaveData(CurrentStmt, Hash, Complexity); + return true; +} + +void HashVisitor::SaveData(StmtSequence CurrentStmt, HashValue Hash, + unsigned Complexity) { + // Save current dat to the CloneDetector. + CloneDetector::StmtData Data(Hash.getValue(), Complexity); + CD.add(CurrentStmt, Data); + + // If the currently hashed statement is a CompoundStmt, we also + // need to calculate the hash values for all possible sub-sequences + // in the body of the CompoundStmt. + auto CS = dyn_cast(CurrentStmt.front()); + if (!CS) + return; + + // The length of the sub-sequence. We don't need to hash sequences + // with the length 1 as they are already handled when we hashed the + // children. + // We also don't need to hash the sub-sequence that contains the + // whole body as this would be equal to the hash of the CompoundStmt itself. + for (unsigned Length = 2; Length < CS->size(); ++Length) { + // The start index in the body of the CompoundStmt. + // We increase the position and rehash until the end of the sub-sequence + // reaches the end of the CompoundStmt body. + for (unsigned Pos = 0; Pos <= CS->size() - Length; ++Pos) { + HashValue SubHash; + unsigned SubComplexity = 1; + StmtSequence Sequence(CS, &Context, Pos, Pos + Length); + + // We consider a sub-sequence to be CompoundStmt that only contains + // the children in the sub-sequence. This is necessary for matching + // sub-sequences with full CompoundStmts. + SubHash.addData(CS->getStmtClass()); + + HandleChildHashes(SubHash, SubComplexity, CS->child_begin(), Pos, Length); + + CloneDetector::StmtData SubData(SubHash.getValue(), SubComplexity); + CD.add(Sequence, SubData); + } + } +} + +void CloneDetector::AnalyzeTranslationUnitDecl(TranslationUnitDecl *TU) { + HashVisitor visitor(*this, TU->getASTContext()); + visitor.TraverseDecl(TU); +} + +void CloneDetector::add(const StmtSequence &S, const StmtData &Data) { + assert(!S.empty()); + HashedStmts[S] = Data; +} + +namespace { +/// Returns true if and only if \p Stmt contains at least one +/// other sequence in the \p Group. +bool StmtContainsAnyInGroup(StmtSequence &Stmt, + CloneDetector::CloneGroup &Group) { + for (StmtSequence &GroupStmt : Group) { + if (Stmt.contains(GroupStmt)) + return true; + } + return false; +} + +/// Returns true if and only if all sequences in \p OtherGroup are contained +/// by a sequence in \p Group. +bool GroupContains(CloneDetector::CloneGroup &Group, + CloneDetector::CloneGroup &OtherGroup) { + // We have less sequences in the current group than we have in the other, + // so we will never fulfill the requirement for returning true. + // This is only possible because we know that a sequence in Group can + // contain at most one sequence in OtherGroup. + if (Group.size() < OtherGroup.size()) + return false; + + for (StmtSequence &Stmt : Group) { + if (!StmtContainsAnyInGroup(Stmt, OtherGroup)) + return false; + } + return true; +} +} // end anonymous namespace + +std::vector +CloneDetector::findClones(unsigned MinGroupComplexity) { + std::vector Result; + + std::map GroupsByHash; + + // Create a CloneGroup for every known statement. + for (auto &Pair : HashedStmts) { + // but ignore statements that don't fulfill the complexity requirement. + if (Pair.second.Complexity > MinGroupComplexity) { + GroupsByHash[Pair.second.Hash].push_back(Pair.first); + } + } + + // Put all groups that now contain at least one sequence into the + // result. We still need to filter the Result. + for (auto &HashGroupPair : GroupsByHash) { + if (HashGroupPair.second.size() > 1) { + Result.push_back(HashGroupPair.second); + } + } + + std::set IndexesToRemove; + + // Compare every group in the result with the others. If one groups + // contains another group, we only need to return the bigger group. + for (unsigned I = 0; I < Result.size(); ++I) { + for (unsigned J = 0; J < Result.size(); ++J) { + if (I != J) { + if (GroupContains(Result[J], Result[I])) { + IndexesToRemove.insert(I); + break; + } + } + } + } + + for (auto Iter = IndexesToRemove.rbegin(); Iter != IndexesToRemove.rend(); + ++Iter) { + Result.erase(Result.begin() + (*Iter)); + } + + return Result; +} Index: lib/StaticAnalyzer/Checkers/CMakeLists.txt =================================================================== --- lib/StaticAnalyzer/Checkers/CMakeLists.txt +++ lib/StaticAnalyzer/Checkers/CMakeLists.txt @@ -22,6 +22,7 @@ CheckerDocumentation.cpp ChrootChecker.cpp ClangCheckers.cpp + CloneChecker.cpp DeadStoresChecker.cpp DebugCheckers.cpp DereferenceChecker.cpp Index: lib/StaticAnalyzer/Checkers/CloneChecker.cpp =================================================================== --- /dev/null +++ lib/StaticAnalyzer/Checkers/CloneChecker.cpp @@ -0,0 +1,67 @@ +//===--- CloneDetection.cpp - Clone detection checkers ----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// CloneChecker is a checker that reports clones in the current translation +/// unit. +/// +//===----------------------------------------------------------------------===// + +#include "ClangSACheckers.h" +#include "clang/AST/CloneDetection.h" +#include "clang/Basic/Diagnostic.h" +#include "clang/StaticAnalyzer/Core/Checker.h" +#include "clang/StaticAnalyzer/Core/CheckerManager.h" +#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h" + +using namespace clang; +using namespace ento; + +namespace { +class CloneChecker : public Checker { + +public: + void checkEndOfTranslationUnit(const TranslationUnitDecl *TU, + AnalysisManager &Mgr, BugReporter &BR) const; +}; +} // end anonymous namespace + +void CloneChecker::checkEndOfTranslationUnit(const TranslationUnitDecl *TU, + AnalysisManager &Mgr, + BugReporter &BR) const { + CloneDetector CloneDetector; + CloneDetector.AnalyzeTranslationUnitDecl( + TU->getASTContext().getTranslationUnitDecl()); + + std::vector CloneGroups = + CloneDetector.findClones(); + + DiagnosticsEngine &DiagEngine = Mgr.getDiagnostic(); + + unsigned WarnID = DiagEngine.getCustomDiagID(DiagnosticsEngine::Warning, + "Detected code clone."); + + unsigned NoteID = DiagEngine.getCustomDiagID(DiagnosticsEngine::Note, + "Related code clone is here."); + + for (CloneDetector::CloneGroup &Group : CloneGroups) { + DiagEngine.Report(Group.front().getStartLoc(), WarnID); + for (unsigned J = 1; J < Group.size(); ++J) { + DiagEngine.Report(Group[J].getStartLoc(), NoteID); + } + } +} + +//===----------------------------------------------------------------------===// +// Register CloneChecker +//===----------------------------------------------------------------------===// + +void ento::registerCloneChecker(CheckerManager &Mgr) { + Mgr.registerChecker(); +} Index: test/Analysis/copypaste/test-min-max.cpp =================================================================== --- /dev/null +++ test/Analysis/copypaste/test-min-max.cpp @@ -0,0 +1,41 @@ +// RUN: %clang_cc1 -analyze -std=c++11 -analyzer-checker=clone.CloneChecker -verify %s + +void log(); + +int max(int a, int b) { // expected-warning{{Detected code clone.}} + log(); + if (a > b) + return a; + return b; +} + +int maxClone(int a, int b) { // expected-note{{Related code clone is here.}} + log(); + if (a > b) + return a; + return b; +} + +// False positives below. These clones should not be reported. + +// FIXME: Detect different binary operator kinds. +int min1(int a, int b) { // expected-note{{Related code clone is here.}} + log(); + if (a < b) + return a; + return b; +} + +// FIXME: Detect different variable patterns. +int min2(int a, int b) { // expected-note{{Related code clone is here.}} + log(); + if (b > a) + return a; + return b; +} + +// Functions below are not clones and should not be reported. + +int foo(int a, int b) { + return a + b; +} Index: test/Analysis/copypaste/test-sub-sequences.cpp =================================================================== --- /dev/null +++ test/Analysis/copypaste/test-sub-sequences.cpp @@ -0,0 +1,28 @@ +// RUN: %clang_cc1 -analyze -std=c++11 -analyzer-checker=clone.CloneChecker -verify %s + +// We test if sub-sequences can match with normal sequences containing only +// a single statement. + +void log2(int a); +void log(); + +int max(int a, int b) { + log2(a); + log(); // expected-warning{{Detected code clone.}} + if (a > b) + return a; + return b; +} + +int maxClone(int a, int b) { + log(); // expected-note{{Related code clone is here.}} + if (a > b) + return a; + return b; +} + +// Functions below are not clones and should not be reported. + +int foo(int a, int b) { + return a + b; +}