Index: include/clang/Analysis/CloneDetection.h =================================================================== --- /dev/null +++ include/clang/Analysis/CloneDetection.h @@ -0,0 +1,234 @@ +//===--- CloneDetection.h - Finds code clones in an AST ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// /file +/// This file defines classes for searching and anlyzing source code clones. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_AST_CLONEDETECTION_H +#define LLVM_CLANG_AST_CLONEDETECTION_H + +#include "clang/Basic/SourceLocation.h" +#include "llvm/ADT/StringMap.h" + +#include + +namespace clang { + +class Stmt; +class ASTContext; +class CompoundStmt; +class TranslationUnitDecl; + +/// \brief Identifies a list of statements. +/// +/// Can either identify a single arbitrary Stmt object, a continuous sequence of +/// child statements inside a CompoundStmt or no statements at all. +class StmtSequence { + /// If this object identifies a sequence of statements inside a CompoundStmt, + /// S points to this CompoundStmt. If this object only identifies a single + /// Stmt, then S is a pointer to this Stmt. + const Stmt *S; + + /// The related ASTContext for S. + ASTContext *Context; + + /// If EndIndex is non-zero, then S is a CompoundStmt and this StmtSequence + /// instance is representing the CompoundStmt children inside the array + /// [StartIndex, EndIndex). + unsigned StartIndex; + unsigned EndIndex; + +public: + /// \brief Constructs a StmtSequence holding multiple statements. + /// + /// The resulting StmtSequence identifies a continuous sequence of statements + /// in the body of the given CompoundStmt. Which statements of the body should + /// be identified needs to be specified by providing a start and end index + /// that describe a non-empty sub-array in the body of the given CompoundStmt. + /// + /// \param Stmt A CompoundStmt that contains all statements in its body. + /// \param Context The ASTContext for the given CompoundStmt. + /// \param StartIndex The inclusive start index in the children array of + /// \p Stmt + /// \param EndIndex The exclusive end index in the children array of \p Stmt. + StmtSequence(const CompoundStmt *Stmt, ASTContext &Context, + unsigned StartIndex, unsigned EndIndex); + + /// \brief Constructs a StmtSequence holding a single statement. + /// + /// \param Stmt An arbitrary Stmt. + /// \param Context The ASTContext for the given Stmt. + StmtSequence(const Stmt *Stmt, ASTContext &Context); + + /// \brief Constructs an empty StmtSequence. + StmtSequence(); + + typedef const Stmt *const *iterator; + + /// Returns an iterator pointing to the first statement in this sequence. + iterator begin() const; + + /// Returns an iterator pointing behind the last statement in this sequence. + iterator end() const; + + /// Returns the first statement in this sequence. + /// + /// This method should only be called on a non-empty StmtSequence object. + const Stmt *front() const { + assert(!empty()); + return begin()[0]; + } + + /// Returns the last statement in this sequence. + /// + /// This method should only be called on a non-empty StmtSequence object. + const Stmt *back() const { + assert(!empty()); + return begin()[size() - 1]; + } + + /// Returns the number of statements this object holds. + unsigned size() const { + if (holdsSequence()) + return EndIndex - StartIndex; + if (S == nullptr) + return 0; + return 1; + } + + /// Returns true if and only if this StmtSequence contains no statements. + bool empty() const { return size() == 0; } + + /// Returns the related ASTContext for the stored Stmts. + ASTContext &getASTContext() const { + assert(Context); + return *Context; + } + + /// Returns true if this objects holds a list of statements. + bool holdsSequence() const { return EndIndex != 0; } + + /// Returns the start sourcelocation of the first statement in this sequence. + /// + /// This method should only be called on a non-empty StmtSequence object. + SourceLocation getStartLoc() const; + + /// Returns the end sourcelocation of the last statement in this sequence. + /// + /// This method should only be called on a non-empty StmtSequence object. + SourceLocation getEndLoc() const; + + bool operator==(const StmtSequence &Other) const { + return std::tie(S, StartIndex, EndIndex) == + std::tie(Other.S, Other.StartIndex, Other.EndIndex); + } + + bool operator!=(const StmtSequence &Other) const { + return std::tie(S, StartIndex, EndIndex) != + std::tie(Other.S, Other.StartIndex, Other.EndIndex); + } + + /// Returns true if and only if this sequence covers a source range that + /// contains the source range of the given sequence \p Other. + /// + /// This method should only be called on a non-empty StmtSequence object + /// and passed a non-empty StmtSequence object. + bool contains(const StmtSequence &Other) const; +}; + +/// \brief Searches for clones in source code. +/// +/// First, this class needs a translation unit which is passed via +/// \p analyzeTranslationUnit . It will then generate and store search data +/// for all statements inside the given translation unit. +/// Afterwards the generated data can be used to find code clones by calling +/// \p findClones . +/// +/// This class only searches for clones in exectuable source code +/// (e.g. function bodies). Other clones (e.g. cloned comments or declarations) +/// are not supported. +class CloneDetector { +public: + /// Holds the data about a StmtSequence that is needed during the search for + /// code clones. + struct CloneSignature { + /// \brief Holds all relevant data of a StmtSequence. + /// + /// If this variable is equal for two different StmtSequences, then they can + /// be considered clones of each other. + std::vector Data; + + /// \brief The complexity of the StmtSequence. + /// + /// This scalar value serves as a simple way of filtering clones that are + /// too small to be reported. A greater value indicates that the related + /// StmtSequence is probably more interesting to the user. + unsigned Complexity; + + /// \brief Creates an empty CloneSignature without any data. + CloneSignature() : Complexity(1) {} + + CloneSignature(const std::vector &Data, unsigned Complexity) + : Data(Data), Complexity(Complexity) {} + + /// \brief Adds the data from the given CloneSignature to this one. + void add(const CloneSignature &Other) { + Data.insert(Data.end(), Other.Data.begin(), Other.Data.end()); + Complexity += Other.Complexity; + } + }; + + /// Holds group of StmtSequences that are clones of each other and a single + /// CloneSignature that all stored StmtSequences have in common. + struct CloneGroup { + std::vector Sequences; + CloneSignature Signature; + + CloneGroup(const StmtSequence &Seq, const CloneSignature &Signature) + : Signature(Signature) { + Sequences.push_back(Seq); + } + + /// \brief Returns false if and only if this group should be skipped when + /// searching for clones. + bool IsValid() const { + // A clone group with only one member makes no sense, so we skip them. + return Sequences.size() > 1; + } + }; + + /// \brief Generates and stores search data for all statements in the given + /// translation unit. + void analyzeTranslationUnit(TranslationUnitDecl *TU); + + /// \brief Stores the CloneSignature to allow future querying. + void add(const StmtSequence &S, const CloneSignature &Signature); + + /// \brief Searches the provided statements for clones. + /// + /// \param Result Output parameter that is filled with a list of found + /// clone groups. Each group contains multiple StmtSequences + /// that were identified to be clones of each other. + /// \param MinGroupComplexity Only return clones which have at least this + /// complexity value. + void findClones(std::vector &Result, unsigned MinGroupComplexity); + +private: + /// Stores all found clone groups including invalid groups with only a single + /// statement. + std::vector CloneGroups; + /// Maps search data to its related index in the \p CloneGroups vector. + llvm::StringMap CloneGroupIndexes; +}; + +} // end namespace clang + +#endif // LLVM_CLANG_AST_CLONEDETECTION_H Index: include/clang/StaticAnalyzer/Checkers/Checkers.td =================================================================== --- include/clang/StaticAnalyzer/Checkers/Checkers.td +++ include/clang/StaticAnalyzer/Checkers/Checkers.td @@ -77,6 +77,8 @@ def LLVM : Package<"llvm">; def Debug : Package<"debug">; +def CloneDetectionAlpha : Package<"clone">, InPackage, Hidden; + //===----------------------------------------------------------------------===// // Core Checkers. //===----------------------------------------------------------------------===// @@ -657,3 +659,17 @@ DescFile<"DebugCheckers.cpp">; } // end "debug" + + +//===----------------------------------------------------------------------===// +// Clone Detection +//===----------------------------------------------------------------------===// + +let ParentPackage = CloneDetectionAlpha in { + +def CloneChecker : Checker<"CloneChecker">, + HelpText<"Reports similar pieces of code.">, + DescFile<"CloneChecker.cpp">; + +} // end "clone" + Index: lib/Analysis/CMakeLists.txt =================================================================== --- lib/Analysis/CMakeLists.txt +++ lib/Analysis/CMakeLists.txt @@ -9,6 +9,7 @@ CFGReachabilityAnalysis.cpp CFGStmtMap.cpp CallGraph.cpp + CloneDetection.cpp CocoaConventions.cpp Consumed.cpp CodeInjector.cpp Index: lib/Analysis/CloneDetection.cpp =================================================================== --- /dev/null +++ lib/Analysis/CloneDetection.cpp @@ -0,0 +1,280 @@ +//===--- CloneDetection.cpp - Finds code clones in an AST -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// This file implements classes for searching and anlyzing source code clones. +/// +//===----------------------------------------------------------------------===// + +#include "clang/Analysis/CloneDetection.h" + +#include "clang/AST/ASTContext.h" +#include "clang/AST/RecursiveASTVisitor.h" +#include "clang/AST/Stmt.h" +#include "llvm/ADT/StringRef.h" + +using namespace clang; + +StmtSequence::StmtSequence(const CompoundStmt *Stmt, ASTContext &Context, + unsigned StartIndex, unsigned EndIndex) + : S(Stmt), Context(&Context), StartIndex(StartIndex), EndIndex(EndIndex) { + assert(Stmt && "Stmt must not be a nullptr"); + assert(StartIndex < EndIndex && "Given array should not be empty"); + assert(EndIndex <= Stmt->size() && "Given array too big for this Stmt"); +} + +StmtSequence::StmtSequence(const Stmt *Stmt, ASTContext &Context) + : S(Stmt), Context(&Context), StartIndex(0), EndIndex(0) {} + +StmtSequence::StmtSequence() + : S(nullptr), Context(nullptr), StartIndex(0), EndIndex(0) {} + +bool StmtSequence::contains(const StmtSequence &Other) const { + // If both sequences reside in different translation units, they can never + // contain each other. + if (Context != Other.Context) + return false; + + const SourceManager &SM = Context->getSourceManager(); + + // Otherwise check if the start and end locations of the current sequence + // surround the other sequence. + bool StartIsInBounds = + SM.isBeforeInTranslationUnit(getStartLoc(), Other.getStartLoc()) || + getStartLoc() == Other.getStartLoc(); + if (!StartIsInBounds) + return false; + + bool EndIsInBounds = + SM.isBeforeInTranslationUnit(Other.getEndLoc(), getEndLoc()) || + Other.getEndLoc() == getEndLoc(); + return EndIsInBounds; +} + +StmtSequence::iterator StmtSequence::begin() const { + if (!holdsSequence()) { + return &S; + } + auto CS = cast(S); + return CS->body_begin() + StartIndex; +} + +StmtSequence::iterator StmtSequence::end() const { + if (!holdsSequence()) { + return &S + 1; + } + auto CS = cast(S); + return CS->body_begin() + EndIndex; +} + +SourceLocation StmtSequence::getStartLoc() const { + return front()->getLocStart(); +} + +SourceLocation StmtSequence::getEndLoc() const { return back()->getLocEnd(); } + +namespace { +/// Traverses all statements in an AST and collects their relevant data. +/// The results are stored in a CloneDetector object. Each statement is only +/// visited a fixed amount of times during this process. +class DataCollectVisitor : public RecursiveASTVisitor { + + CloneDetector &CD; + ASTContext &Context; + + /// \brief Collects the relevant data of all statements in the given statement + /// tree and stores it in the CloneDetector. + /// + /// \param S The root of the given statement tree. + /// \return The CloneSignature of the root statement. + CloneDetector::CloneSignature CollectData(const Stmt *S) { + // Create an empty signature that will be filled in this method. + CloneDetector::CloneSignature Signature; + + // The only relevant data for now is the class of the statement. + // TODO: Collect statement class specific data. + Signature.Data.push_back(S->getStmtClass()); + + // Storage for the signatures of the direct child statements. This is only + // needed if the current statemnt is a CompoundStmt. + std::vector ChildSignatures; + const CompoundStmt *CS = dyn_cast(S); + + // The signature of a statement includes the signatures of its children. + // Therefore we create the signatures for every child and add them to the + // current signature. + for (const Stmt *Child : S->children()) { + // Some statements like 'if' can have nullptr children that we will skip. + if (!Child) + continue; + + // Recursive call to create the signature of the child statement. This + // will also create and store all clone groups in this child statement. + auto ChildSignature = CollectData(Child); + + // Add the collected data to the signature of the current statement. + Signature.add(ChildSignature); + + // If the current statement is a CompoundStatement, we need to store the + // signature for the generation of the sub-sequences. + if (CS) + ChildSignatures.push_back(ChildSignature); + } + + // If the current statement is a CompoundStmt, we also need to create the + // clone groups from the sub-sequences inside the children. + if (CS) + HandleSubSequences(CS, ChildSignatures); + + // Save the signature for the current statement in the CloneDetector object. + CD.add(StmtSequence(S, Context), Signature); + + return Signature; + } + + /// \brief Adds all possible sub-sequences in the child array of the given + /// CompoundStmt to the CloneDetector. + /// \param CS The given CompoundStmt. + /// \param ChildSignatures A list of calculated signatures for each child in + /// the given CompoundStmt. + void HandleSubSequences( + const CompoundStmt *CS, + std::vector ChildSignatures) { + + // FIXME: This function has quadratic runtime right now. Check if skipping + // this function for too long CompoundStmts is an option. + + // The length of the sub-sequence. We don't need to handle sequences with + // the length 1 as they are already handled in CollectData(). + for (unsigned Length = 2; Length <= CS->size(); ++Length) { + // The start index in the body of the CompoundStmt. We increase the + // position until the end of the sub-sequence reaches the end of the + // CompoundStmt body. + for (unsigned Pos = 0; Pos <= CS->size() - Length; ++Pos) { + // Create an empty signature and add the signatures of all selected + // child statements to it. + CloneDetector::CloneSignature SubSignature; + + for (unsigned i = Pos; i < Pos + Length; ++i) { + SubSignature.add(ChildSignatures[i]); + } + + // Save the signature together with the information about what children + // sequence we selected. + CD.add(StmtSequence(CS, Context, Pos, Pos + Length), SubSignature); + } + } + } + +public: + explicit DataCollectVisitor(CloneDetector &CD, ASTContext &Context) + : CD(CD), Context(Context) {} + + bool VisitFunctionDecl(FunctionDecl *D) { + // If we found a function, we start the clone search on its body statement. + if (D->hasBody()) + CollectData(D->getBody()); + return true; + } +}; +} // end anonymous namespace + +void CloneDetector::analyzeTranslationUnit(TranslationUnitDecl *TU) { + DataCollectVisitor visitor(*this, TU->getASTContext()); + visitor.TraverseDecl(TU); +} + +void CloneDetector::add(const StmtSequence &S, + const CloneSignature &Signature) { + // StringMap only works with StringRefs, so we create one for our data vector. + auto &Data = Signature.Data; + StringRef DataRef = StringRef(reinterpret_cast(Data.data()), + Data.size() * sizeof(unsigned)); + + // Search with the help of the signature if we already have encountered a + // clone of the given StmtSequence. + auto I = CloneGroupIndexes.find(DataRef); + if (I == CloneGroupIndexes.end()) { + // We haven't found an existing clone group, so we create a new clone group + // for this StmtSequence and store the index of it in our search map. + CloneGroupIndexes[DataRef] = CloneGroups.size(); + CloneGroups.emplace_back(S, Signature); + return; + } + + // We have found an existing clone group and can expand it with the given + // StmtSequence. + CloneGroups[I->getValue()].Sequences.push_back(S); +} + +namespace { +/// \brief Returns true if and only if \p Stmt contains at least one other +/// sequence in the \p Group. +bool StmtContainsAnyInGroup(StmtSequence &Stmt, + CloneDetector::CloneGroup &Group) { + for (StmtSequence &GroupStmt : Group.Sequences) { + if (Stmt.contains(GroupStmt)) + return true; + } + return false; +} + +/// \brief Returns true if and only if all sequences in \p OtherGroup are +/// contained by a sequence in \p Group. +bool GroupContains(CloneDetector::CloneGroup &Group, + CloneDetector::CloneGroup &OtherGroup) { + // We have less sequences in the current group than we have in the other, + // so we will never fulfill the requirement for returning true. This is only + // possible because we know that a sequence in Group can contain at most + // one sequence in OtherGroup. + if (Group.Sequences.size() < OtherGroup.Sequences.size()) + return false; + + for (StmtSequence &Stmt : Group.Sequences) { + if (!StmtContainsAnyInGroup(Stmt, OtherGroup)) + return false; + } + return true; +} +} // end anonymous namespace + +void CloneDetector::findClones(std::vector &Result, + unsigned MinGroupComplexity) { + // Add every valid clone group that fulfills the complexity requirement. + for (const CloneGroup &Group : CloneGroups) { + if (Group.IsValid() && Group.Signature.Complexity >= MinGroupComplexity) { + Result.push_back(Group); + } + } + + std::vector IndexesToRemove; + + // Compare every group in the result with the rest. If one groups contains + // another group, we only need to return the bigger group. + // Note: This doesn't scale well, so if possible avoid calling any heavy + // function from this loop to minimize the performance impact. + for (unsigned i = 0; i < Result.size(); ++i) { + for (unsigned j = 0; j < Result.size(); ++j) { + // Don't compare a group with itself. + if (i == j) + continue; + + if (GroupContains(Result[j], Result[i])) { + IndexesToRemove.push_back(i); + break; + } + } + } + + // Erasing a list of indexes from the vector should be done with decreasing + // indexes. As IndexesToRemove is constructed with increasing values, we just + // reverse iterate over it to get the desired order. + for (auto I = IndexesToRemove.rbegin(); I != IndexesToRemove.rend(); ++I) { + Result.erase(Result.begin() + *I); + } +} Index: lib/StaticAnalyzer/Checkers/CMakeLists.txt =================================================================== --- lib/StaticAnalyzer/Checkers/CMakeLists.txt +++ lib/StaticAnalyzer/Checkers/CMakeLists.txt @@ -22,6 +22,7 @@ CheckerDocumentation.cpp ChrootChecker.cpp ClangCheckers.cpp + CloneChecker.cpp DeadStoresChecker.cpp DebugCheckers.cpp DereferenceChecker.cpp Index: lib/StaticAnalyzer/Checkers/CloneChecker.cpp =================================================================== --- /dev/null +++ lib/StaticAnalyzer/Checkers/CloneChecker.cpp @@ -0,0 +1,85 @@ +//===--- CloneChecker.cpp - Clone detection checker -------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// CloneChecker is a checker that reports clones in the current translation +/// unit. +/// +//===----------------------------------------------------------------------===// + +#include "ClangSACheckers.h" +#include "clang/Analysis/CloneDetection.h" +#include "clang/Basic/Diagnostic.h" +#include "clang/StaticAnalyzer/Core/Checker.h" +#include "clang/StaticAnalyzer/Core/CheckerManager.h" +#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h" + +using namespace clang; +using namespace ento; + +namespace { +class CloneChecker : public Checker { + +public: + void checkEndOfTranslationUnit(const TranslationUnitDecl *TU, + AnalysisManager &Mgr, BugReporter &BR) const; +}; +} // end anonymous namespace + +void CloneChecker::checkEndOfTranslationUnit(const TranslationUnitDecl *TU, + AnalysisManager &Mgr, + BugReporter &BR) const { + + int MinComplexity = Mgr.getAnalyzerOptions().getOptionAsInteger( + "MinimumCloneComplexity", 10, this); + + assert(MinComplexity >= 0); + + SourceManager &SM = BR.getSourceManager(); + + CloneDetector CloneDetector; + CloneDetector.analyzeTranslationUnit(const_cast(TU)); + + std::vector CloneGroups; + CloneDetector.findClones(CloneGroups, MinComplexity); + + DiagnosticsEngine &DiagEngine = Mgr.getDiagnostic(); + + unsigned WarnID = DiagEngine.getCustomDiagID(DiagnosticsEngine::Warning, + "Detected code clone."); + + unsigned NoteID = DiagEngine.getCustomDiagID(DiagnosticsEngine::Note, + "Related code clone is here."); + + for (CloneDetector::CloneGroup &Group : CloneGroups) { + // For readability reasons we sort the clones by line numbers. + std::sort(Group.Sequences.begin(), Group.Sequences.end(), + [&SM](const StmtSequence &LHS, const StmtSequence &RHS) { + return SM.isBeforeInTranslationUnit(LHS.getStartLoc(), + RHS.getStartLoc()) && + SM.isBeforeInTranslationUnit(LHS.getEndLoc(), + RHS.getEndLoc()); + }); + + // We group the clones by printing the first as a warning and all others + // as a note. + DiagEngine.Report(Group.Sequences.front().getStartLoc(), WarnID); + for (unsigned J = 1; J < Group.Sequences.size(); ++J) { + DiagEngine.Report(Group.Sequences[J].getStartLoc(), NoteID); + } + } +} + +//===----------------------------------------------------------------------===// +// Register CloneChecker +//===----------------------------------------------------------------------===// + +void ento::registerCloneChecker(CheckerManager &Mgr) { + Mgr.registerChecker(); +} Index: test/Analysis/copypaste/test-min-max.cpp =================================================================== --- /dev/null +++ test/Analysis/copypaste/test-min-max.cpp @@ -0,0 +1,41 @@ +// RUN: %clang_cc1 -analyze -std=c++11 -analyzer-checker=alpha.clone.CloneChecker -verify %s + +void log(); + +int max(int a, int b) { // expected-warning{{Detected code clone.}} + log(); + if (a > b) + return a; + return b; +} + +int maxClone(int x, int y) { // expected-note{{Related code clone is here.}} + log(); + if (x > y) + return x; + return y; +} + +// False positives below. These clones should not be reported. + +// FIXME: Detect different binary operator kinds. +int min1(int a, int b) { // expected-note{{Related code clone is here.}} + log(); + if (a < b) + return a; + return b; +} + +// FIXME: Detect different variable patterns. +int min2(int a, int b) { // expected-note{{Related code clone is here.}} + log(); + if (b > a) + return a; + return b; +} + +// Functions below are not clones and should not be reported. + +int foo(int a, int b) { // no-warning + return a + b; +} Index: test/Analysis/copypaste/test-sub-sequences.cpp =================================================================== --- /dev/null +++ test/Analysis/copypaste/test-sub-sequences.cpp @@ -0,0 +1,28 @@ +// RUN: %clang_cc1 -analyze -std=c++11 -analyzer-checker=alpha.clone.CloneChecker -verify %s + +// We test if sub-sequences can match with normal sequences containing only +// a single statement. + +void log2(int a); +void log(); + +int max(int a, int b) { + log2(a); + log(); // expected-warning{{Detected code clone.}} + if (a > b) + return a; + return b; +} + +int maxClone(int a, int b) { + log(); // expected-note{{Related code clone is here.}} + if (a > b) + return a; + return b; +} + +// Functions below are not clones and should not be reported. + +int foo(int a, int b) { // no-warning + return a + b; +}