blob: 693e4a0bfad456d5392d7a2951ed2db7daf652be [file] [log] [blame]
//===--- CloneDetection.h - Finds code clones in an AST ---------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// /file
/// This file defines classes for searching and anlyzing source code clones.
///
//===----------------------------------------------------------------------===//
#ifndef LLVM_CLANG_AST_CLONEDETECTION_H
#define LLVM_CLANG_AST_CLONEDETECTION_H
#include "clang/Basic/SourceLocation.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/StringMap.h"
#include <vector>
namespace clang {
class Stmt;
class Decl;
class VarDecl;
class ASTContext;
class CompoundStmt;
/// \brief Identifies a list of statements.
///
/// Can either identify a single arbitrary Stmt object, a continuous sequence of
/// child statements inside a CompoundStmt or no statements at all.
class StmtSequence {
/// If this object identifies a sequence of statements inside a CompoundStmt,
/// S points to this CompoundStmt. If this object only identifies a single
/// Stmt, then S is a pointer to this Stmt.
const Stmt *S;
/// The related ASTContext for S.
ASTContext *Context;
/// If EndIndex is non-zero, then S is a CompoundStmt and this StmtSequence
/// instance is representing the CompoundStmt children inside the array
/// [StartIndex, EndIndex).
unsigned StartIndex;
unsigned EndIndex;
public:
/// \brief Constructs a StmtSequence holding multiple statements.
///
/// The resulting StmtSequence identifies a continuous sequence of statements
/// in the body of the given CompoundStmt. Which statements of the body should
/// be identified needs to be specified by providing a start and end index
/// that describe a non-empty sub-array in the body of the given CompoundStmt.
///
/// \param Stmt A CompoundStmt that contains all statements in its body.
/// \param Context The ASTContext for the given CompoundStmt.
/// \param StartIndex The inclusive start index in the children array of
/// \p Stmt
/// \param EndIndex The exclusive end index in the children array of \p Stmt.
StmtSequence(const CompoundStmt *Stmt, ASTContext &Context,
unsigned StartIndex, unsigned EndIndex);
/// \brief Constructs a StmtSequence holding a single statement.
///
/// \param Stmt An arbitrary Stmt.
/// \param Context The ASTContext for the given Stmt.
StmtSequence(const Stmt *Stmt, ASTContext &Context);
/// \brief Constructs an empty StmtSequence.
StmtSequence();
typedef const Stmt *const *iterator;
/// Returns an iterator pointing to the first statement in this sequence.
iterator begin() const;
/// Returns an iterator pointing behind the last statement in this sequence.
iterator end() const;
/// Returns the first statement in this sequence.
///
/// This method should only be called on a non-empty StmtSequence object.
const Stmt *front() const {
assert(!empty());
return begin()[0];
}
/// Returns the last statement in this sequence.
///
/// This method should only be called on a non-empty StmtSequence object.
const Stmt *back() const {
assert(!empty());
return begin()[size() - 1];
}
/// Returns the number of statements this object holds.
unsigned size() const {
if (holdsSequence())
return EndIndex - StartIndex;
if (S == nullptr)
return 0;
return 1;
}
/// Returns true if and only if this StmtSequence contains no statements.
bool empty() const { return size() == 0; }
/// Returns the related ASTContext for the stored Stmts.
ASTContext &getASTContext() const {
assert(Context);
return *Context;
}
/// Returns true if this objects holds a list of statements.
bool holdsSequence() const { return EndIndex != 0; }
/// Returns the start sourcelocation of the first statement in this sequence.
///
/// This method should only be called on a non-empty StmtSequence object.
SourceLocation getStartLoc() const;
/// Returns the end sourcelocation of the last statement in this sequence.
///
/// This method should only be called on a non-empty StmtSequence object.
SourceLocation getEndLoc() const;
bool operator==(const StmtSequence &Other) const {
return std::tie(S, StartIndex, EndIndex) ==
std::tie(Other.S, Other.StartIndex, Other.EndIndex);
}
bool operator!=(const StmtSequence &Other) const {
return std::tie(S, StartIndex, EndIndex) !=
std::tie(Other.S, Other.StartIndex, Other.EndIndex);
}
/// Returns true if and only if this sequence covers a source range that
/// contains the source range of the given sequence \p Other.
///
/// This method should only be called on a non-empty StmtSequence object
/// and passed a non-empty StmtSequence object.
bool contains(const StmtSequence &Other) const;
};
/// \brief Searches for clones in source code.
///
/// First, this class needs a translation unit which is passed via
/// \p analyzeTranslationUnit . It will then generate and store search data
/// for all statements inside the given translation unit.
/// Afterwards the generated data can be used to find code clones by calling
/// \p findClones .
///
/// This class only searches for clones in exectuable source code
/// (e.g. function bodies). Other clones (e.g. cloned comments or declarations)
/// are not supported.
class CloneDetector {
public:
typedef unsigned DataPiece;
/// Holds the data about a StmtSequence that is needed during the search for
/// code clones.
struct CloneSignature {
/// \brief The hash code of the StmtSequence.
///
/// The initial clone groups that are formed during the search for clones
/// consist only of Sequences that share the same hash code. This makes this
/// value the central part of this heuristic that is needed to find clones
/// in a performant way. For this to work, the type of this variable
/// always needs to be small and fast to compare.
///
/// Also, StmtSequences that are clones of each others have to share
/// the same hash code. StmtSequences that are not clones of each other
/// shouldn't share the same hash code, but if they do, it will only
/// degrade the performance of the hash search but doesn't influence
/// the correctness of the result.
size_t Hash;
/// \brief The complexity of the StmtSequence.
///
/// This value gives an approximation on how many direct or indirect child
/// statements are contained in the related StmtSequence. In general, the
/// greater this value, the greater the amount of statements. However, this
/// is only an approximation and the actual amount of statements can be
/// higher or lower than this value. Statements that are generated by the
/// compiler (e.g. macro expansions) for example barely influence the
/// complexity value.
///
/// The main purpose of this value is to filter clones that are too small
/// and therefore probably not interesting enough for the user.
unsigned Complexity;
/// \brief Creates an empty CloneSignature without any data.
CloneSignature() : Complexity(1) {}
CloneSignature(llvm::hash_code Hash, unsigned Complexity)
: Hash(Hash), Complexity(Complexity) {}
};
/// Holds group of StmtSequences that are clones of each other and the
/// complexity value (see CloneSignature::Complexity) that all stored
/// StmtSequences have in common.
struct CloneGroup {
std::vector<StmtSequence> Sequences;
CloneSignature Signature;
CloneGroup() {}
CloneGroup(const StmtSequence &Seq, CloneSignature Signature)
: Signature(Signature) {
Sequences.push_back(Seq);
}
/// \brief Returns false if and only if this group should be skipped when
/// searching for clones.
bool isValid() const {
// A clone group with only one member makes no sense, so we skip them.
return Sequences.size() > 1;
}
};
/// \brief Generates and stores search data for all statements in the body of
/// the given Decl.
void analyzeCodeBody(const Decl *D);
/// \brief Stores the CloneSignature to allow future querying.
void add(const StmtSequence &S, const CloneSignature &Signature);
/// \brief Searches the provided statements for clones.
///
/// \param Result Output parameter that is filled with a list of found
/// clone groups. Each group contains multiple StmtSequences
/// that were identified to be clones of each other.
/// \param MinGroupComplexity Only return clones which have at least this
/// complexity value.
/// \param CheckPatterns Returns only clone groups in which the referenced
/// variables follow the same pattern.
void findClones(std::vector<CloneGroup> &Result, unsigned MinGroupComplexity,
bool CheckPatterns = true);
/// \brief Describes two clones that reference their variables in a different
/// pattern which could indicate a programming error.
struct SuspiciousClonePair {
/// \brief Utility class holding the relevant information about a single
/// clone in this pair.
struct SuspiciousCloneInfo {
/// The variable which referencing in this clone was against the pattern.
const VarDecl *Variable;
/// Where the variable was referenced.
SourceRange VarRange;
/// The variable that should have been referenced to follow the pattern.
/// If Suggestion is a nullptr then it's not possible to fix the pattern
/// by referencing a different variable in this clone.
const VarDecl *Suggestion;
SuspiciousCloneInfo(const VarDecl *Variable, SourceRange Range,
const VarDecl *Suggestion)
: Variable(Variable), VarRange(Range), Suggestion(Suggestion) {}
SuspiciousCloneInfo() {}
};
/// The first clone in the pair which always has a suggested variable.
SuspiciousCloneInfo FirstCloneInfo;
/// This other clone in the pair which can have a suggested variable.
SuspiciousCloneInfo SecondCloneInfo;
};
/// \brief Searches the provided statements for pairs of clones that don't
/// follow the same pattern when referencing variables.
/// \param Result Output parameter that will contain the clone pairs.
/// \param MinGroupComplexity Only clone pairs in which the clones have at
/// least this complexity value.
void findSuspiciousClones(std::vector<SuspiciousClonePair> &Result,
unsigned MinGroupComplexity);
private:
/// Stores all encountered StmtSequences alongside their CloneSignature.
std::vector<std::pair<CloneSignature, StmtSequence>> Sequences;
};
} // end namespace clang
#endif // LLVM_CLANG_AST_CLONEDETECTION_H