blob: d016464691f192f978b4b36ab046a6ae2789c2c1 [file] [log] [blame]
//===--- FuzzyStringMatcher.cpp -------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2020 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
#include "swift/IDE/FuzzyStringMatcher.h"
#include "clang/Basic/CharInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallString.h"
using namespace swift;
using namespace swift::ide;
using clang::toUppercase;
using clang::toLowercase;
using clang::isUppercase;
using clang::isLowercase;
FuzzyStringMatcher::FuzzyStringMatcher(StringRef pattern_)
: pattern(pattern_), charactersInPattern(1 << (sizeof(char) * 8)) {
lowercasePattern.reserve(pattern.size());
unsigned upperCharCount = 0;
for (char c : pattern) {
char lower = toLowercase(c);
upperCharCount += (c == lower) ? 0 : 1;
lowercasePattern.push_back(lower);
charactersInPattern.set(static_cast<unsigned char>(lower));
charactersInPattern.set(static_cast<unsigned char>(toUppercase(c)));
}
assert(pattern.size() == lowercasePattern.size());
// FIXME: pull out the magic constants.
// This depends on the inner details of the matching algorithm and will need
// to be updated if we substantially alter it.
if (pattern.size() == 1) {
maxScore = 3.0 + // uppercase match
0.001; // size bonus
} else {
maxScore = 0.25 + // percent match bonus
2.5 + // match at start bonus
pattern.size() * pattern.size(); // max run length score
if (upperCharCount) // max uppercase match score
maxScore += (upperCharCount + 1) * (upperCharCount + 1);
maxScore *= 1.1 * 2.5; // exact prefix match bonus
}
}
bool FuzzyStringMatcher::matchesCandidate(StringRef candidate) const {
unsigned patternLength = pattern.size();
unsigned candidateLength = candidate.size();
if (patternLength > candidateLength)
return false;
// Do all of the pattern characters match the candidate in order?
unsigned pidx = 0, cidx = 0;
while (pidx < patternLength && cidx < candidateLength) {
char c = candidate[cidx];
char p = lowercasePattern[pidx];
if (p == c || p == toLowercase(c))
++pidx;
++cidx;
}
return pidx == patternLength;
}
static bool isTokenizingChar(char c) {
switch (c) {
case '/':
case '.':
case '_':
case '+':
case '-':
case ':':
case ',':
case ' ':
case '(':
case ')':
case '!':
case '?':
return true;
default:
return false;
}
}
namespace {
/// A simple index range.
struct Range {
unsigned location;
unsigned length;
};
} // end anonymous namespace
static void
populateTokenTable(SmallVectorImpl<Range> &tokens,
llvm::MutableArrayRef<unsigned> characterToTokenIndex,
StringRef candidate) {
unsigned start = 0;
characterToTokenIndex[0] = 0;
for (unsigned cidx = 1; cidx < candidate.size(); ++cidx) {
char current = candidate[cidx];
char prev = candidate[cidx - 1];
// Is this a special tokenizing character like '_', or the start of a camel
// case word? The uppercase character should start a new token.
if (isTokenizingChar(prev) ||
(isUppercase(current) && !isUppercase(prev)) ||
(clang::isDigit(current) && !clang::isDigit(prev))) {
tokens.push_back({start, cidx - start});
start = cidx;
} else if (isLowercase(current) && isUppercase(prev) && start != cidx - 1) {
// Or is this the end of a run of uppercase characters?
// E.g. in NSWindow, the 'W' should start a new token.
tokens.push_back({start, cidx - start - 1});
characterToTokenIndex[cidx - 1] = tokens.size();
start = cidx - 1;
}
characterToTokenIndex[cidx] = tokens.size();
}
tokens.push_back({start, static_cast<unsigned>(candidate.size() - start)});
}
static constexpr unsigned notFound = ~0U;
namespace {
/// The candidate-specific matching data and algorithms.
struct CandidateSpecificMatcher {
// The following StringRefs are owned by FuzzyStringMatcher and must outlive
// this object.
StringRef pattern;
StringRef lowercasePattern;
StringRef candidate;
SmallVector<char, 128> lowercaseCandidate;
SmallVector<unsigned, 128> jumpTable; ///< The next matching character index.
SmallVector<Range, 128> tokens; ///< Tokenized ranges from the candidate.
SmallVector<unsigned, 128> characterToTokenIndex;
SmallVector<Range, 128> runs;
CandidateSpecificMatcher(StringRef pattern, StringRef lowercasePattern,
StringRef candidate,
const llvm::BitVector &charactersInPattern,
unsigned &firstPatternPos);
/// Calculates the candidate's score, matching the candidate from
/// \p firstPatternPos or later.
///
/// This drives scoreCandidateTrial by trying the possible matches.
double scoreCandidate(unsigned firstPatternPos);
/// Calculates the candidate's score, matching the candidate from
/// exactly \p firstPatternPos.
double scoreCandidateTrial(unsigned firstPatternPos);
};
} // end anonymous namespace
double FuzzyStringMatcher::scoreCandidate(StringRef candidate) const {
double finalScore = 0.0;
if (candidate.empty() || pattern.empty() || candidate.size() < pattern.size())
return finalScore;
// Single character pattern matching should be simple and fast. Just look at
// the first character.
if (pattern.size() == 1) {
char c = candidate[0];
if (c == pattern[0] && isUppercase(c)) {
finalScore = 3.0;
} else if (toLowercase(c) == lowercasePattern[0]) {
finalScore = 2.0;
}
// Make sure shorter results come first;
if (finalScore)
finalScore += (1 / static_cast<double>(candidate.size())) * (1 / 1000.0);
if (normalize)
finalScore /= maxScore;
return finalScore;
}
// FIXME: path separators would be handled here, jumping straight to the last
// component if the pattern doesn't contain a separator.
unsigned firstPatternPos = 0;
CandidateSpecificMatcher CSM(pattern, lowercasePattern, candidate,
charactersInPattern, firstPatternPos);
finalScore = CSM.scoreCandidate(firstPatternPos);
if (normalize)
finalScore /= maxScore;
return finalScore;
}
CandidateSpecificMatcher::CandidateSpecificMatcher(
StringRef pattern, StringRef lowercasePattern, StringRef candidate,
const llvm::BitVector &charactersInPattern, unsigned &firstPatternPos)
: pattern(pattern), lowercasePattern(lowercasePattern),
candidate(candidate) {
assert(!pattern.empty() && pattern.size() <= candidate.size());
assert(pattern.size() == lowercasePattern.size());
// Build a table that points at the next pattern character so we skip
// through candidate faster.
unsigned candidateLength = candidate.size();
jumpTable.resize(candidateLength);
lowercaseCandidate.resize(candidateLength);
unsigned lastPatternPos = notFound;
for (unsigned cidx = candidateLength - 1;; --cidx) {
char c = candidate[cidx];
lowercaseCandidate[cidx] = toLowercase(c);
jumpTable[cidx] = lastPatternPos;
if (charactersInPattern[static_cast<unsigned char>(c)])
lastPatternPos = cidx;
if (!cidx)
break;
}
firstPatternPos = lastPatternPos;
// Build the token table.
characterToTokenIndex.resize(candidate.size());
populateTokenTable(tokens, characterToTokenIndex, candidate);
}
double CandidateSpecificMatcher::scoreCandidate(unsigned firstPatternPos) {
double finalScore = 0.0;
// The outer matching loop. We run multiple trials so that "a_b_c_abc"
// matching "abc" is matched on the "abc" part instead of the "a_b_c" part.
while (firstPatternPos != notFound) {
// Quickly skip to the first character that matches. We need
// the loop in case the first pattern-character in the
// candidate is not the first character in the pattern.
while (firstPatternPos != notFound) {
if (lowercasePattern[0] == lowercaseCandidate[firstPatternPos])
break;
firstPatternPos = jumpTable[firstPatternPos];
}
if (firstPatternPos == notFound)
break;
double trialScore = scoreCandidateTrial(firstPatternPos);
if (trialScore > finalScore) {
finalScore = trialScore;
// FIXME: update output ranges, if necessary
}
firstPatternPos = jumpTable[firstPatternPos];
}
return finalScore;
}
static double scoreRun(unsigned runStart, unsigned runLength,
unsigned prevTokenStart, unsigned tokenIndex,
unsigned uppercaseMatches, bool isTokenizingChar) {
if (runLength == 0)
return 0.0;
// We really don't like not matching at token starts, but if it's a long match
// give some credit.
if (runStart != prevTokenStart && !isTokenizingChar) {
if (runLength < 5) {
return (runLength < 3) ? 0.0 : runLength;
}
// For really long matches, we'll give a high score. Pretend it's a bit
// shorter.
runLength -= 2;
}
// Bonus if the match is the first or second token.
double prefixBonus = (runStart == 0) ? 2.5 : ((tokenIndex < 2) ? 1.0 : 0.0);
double uppercaseBonus =
uppercaseMatches ? (uppercaseMatches + 1) * (uppercaseMatches + 1) : 0.0;
return (runLength * runLength) + uppercaseBonus + prefixBonus;
}
double
CandidateSpecificMatcher::scoreCandidateTrial(unsigned firstPatternPos) {
double trialScore = 0.0; /// We run multiple trials so that "a_b_c_abc"
/// matching "abc" is matched on the "abc" part
/// instead of the "a_b_c" part.
unsigned uppercaseMatches = 0;
unsigned cidx = firstPatternPos;
unsigned pidx = 0;
unsigned runLength = 0;
unsigned runStart = cidx;
unsigned nonTokenRuns = 0;
unsigned camelCaseLen = 0;
unsigned camelCaseLastToken = 0;
double camelCaseStartBonus = 0.0;
unsigned camelCaseSkips = 0;
unsigned patternLength = pattern.size();
unsigned candidateLength = candidate.size();
while (pidx < patternLength && cidx < candidateLength) {
char lowerPatternChar = lowercasePattern[pidx];
char lowerCandidateChar = lowercaseCandidate[cidx];
unsigned nextCidx = jumpTable[cidx];
bool matched = lowerPatternChar == lowerCandidateChar;
if (matched) {
if (isUppercase(pattern[pidx]) && isUppercase(candidate[cidx])) {
++uppercaseMatches;
}
++runLength;
++pidx;
if (pidx < patternLength)
lowerPatternChar = lowercasePattern[pidx];
}
// If we're skipping forward and were running, the run ended.
if (((cidx + 1) != nextCidx) || !matched) {
if (runLength) {
double runValue =
scoreRun(runStart, runLength,
tokens[characterToTokenIndex[runStart]].location,
characterToTokenIndex[runStart], uppercaseMatches,
isTokenizingChar(candidate[runStart]));
// If it's a poor match in the middle of a token, see if the next char
// starts a token and also matches. If so, use it.
if (runLength == 1 && pidx > 1 && runValue == 0.0 &&
nextCidx != notFound &&
characterToTokenIndex[runStart] < tokens.size() - 1) {
bool foundIt = false;
unsigned charToCheck = matched ? nextCidx : cidx;
while (charToCheck != notFound) {
if (tokens[characterToTokenIndex[charToCheck]].location ==
charToCheck &&
lowercasePattern[pidx - 1] == lowercaseCandidate[charToCheck]) {
foundIt = true;
break;
}
charToCheck = jumpTable[charToCheck];
}
if (foundIt) {
--pidx;
lowerPatternChar = lowercasePattern[pidx];
runStart = cidx = charToCheck;
runLength = 0;
continue;
}
}
// We really don't like matches that don't start at a token.
if (runValue == 0.0) {
++nonTokenRuns;
} else {
unsigned tokenIndex = characterToTokenIndex[runStart];
if (runStart == tokens[tokenIndex].location ||
isTokenizingChar(lowerCandidateChar)) {
camelCaseLen += runLength;
// Bonus for matching the beginning of the candidate.
if (tokenIndex <= 1) {
camelCaseStartBonus = 2.0;
// Penalty for skipping a token.
} else if (tokenIndex != camelCaseLastToken + 1) {
camelCaseSkips += tokenIndex - camelCaseLastToken - 1;
}
camelCaseLastToken = tokenIndex;
if (isTokenizingChar(lowerCandidateChar) && runLength == 1) {
--camelCaseLastToken;
}
}
}
// Accumulate run and reset for next run.
trialScore += runValue;
runs.push_back({runStart, runLength});
uppercaseMatches = 0;
runLength = 0;
}
runStart = nextCidx;
}
cidx = nextCidx;
}
// The trial is done, did we find a match?
// FIXME: this can happen spuriously in foo => ufDownOb.
if (pidx != patternLength)
return 0.0;
// Okay, we found a match.
// FIXME: this code is largely duplicated with the previous block. There are
// some subtle differences that can be seen if you try to remove this one and
// check for pidx == patternLength for the other block.
if (runLength) {
double runValue = scoreRun(
runStart, runLength, tokens[characterToTokenIndex[runStart]].location,
characterToTokenIndex[runStart], uppercaseMatches,
isTokenizingChar(candidate[runStart]));
// If it's a poor match in the middle of a token, see if the next char
// starts a token and also matches. If so, use it.
if (runLength == 1 && runValue == 0.0) {
unsigned nextCidx = jumpTable[runStart];
if (nextCidx != notFound &&
characterToTokenIndex[runStart] < tokens.size() - 1) {
bool foundIt = false;
while (nextCidx != notFound) {
if (tokens[characterToTokenIndex[nextCidx]].location == nextCidx &&
lowercasePattern[pidx - 1] == lowercaseCandidate[nextCidx]) {
foundIt = true;
break;
}
nextCidx = jumpTable[nextCidx];
}
if (foundIt) {
runStart = nextCidx;
uppercaseMatches +=
(isUppercase(pattern[pidx - 1]) &&
isUppercase(candidate[runStart])) ? 1 : 0;
runValue = scoreRun(runStart, runLength,
tokens[characterToTokenIndex[runStart]].location,
characterToTokenIndex[runStart], uppercaseMatches,
isTokenizingChar(candidate[runStart]));
}
}
}
// We really don't like matches that don't start at a token.
if (runValue == 0.0) {
++nonTokenRuns;
} else {
unsigned tokenIndex = characterToTokenIndex[runStart];
if (runStart == tokens[tokenIndex].location ||
isTokenizingChar(lowercaseCandidate[runStart])) {
camelCaseLen += runLength;
if (tokenIndex <= 1) {
// Bonus for matching the beginning of the candidate.
camelCaseStartBonus = 2.0;
} else if (tokenIndex != camelCaseLastToken + 1) {
// Penalty for skipping a token.
camelCaseSkips += tokenIndex - camelCaseLastToken - 1;
}
}
}
// Accumulate run.
trialScore += runValue;
runs.push_back({runStart, runLength});
}
// Unless there were bad matches, prefer camel case matches.
if (nonTokenRuns == 0 && camelCaseSkips < 3) {
double camelCaseScore = (camelCaseLen * camelCaseLen) + camelCaseStartBonus;
if (camelCaseSkips == 1) {
camelCaseScore *= 0.9;
} else if (camelCaseSkips == 2) {
camelCaseScore *= 0.8;
}
if (trialScore < camelCaseScore) {
// Camel case matched better.
trialScore = camelCaseScore;
}
}
// FIXME: using the range up to a dot is silly when candidate isn't a file.
auto dotLoc = candidate.find_last_of('.');
unsigned baseNameLength =
dotLoc != StringRef::npos && dotLoc > 1 ? dotLoc : candidateLength;
// FIXME: file type bonus if we're checking a file path.
// Add a bit for the percentage of the candidate matched.
trialScore += patternLength / static_cast<double>(baseNameLength) * 0.25;
// Exact matches are even better.
if (patternLength >= baseNameLength && !runs.empty() &&
runs[0].location == 0) {
trialScore *= 1.1;
}
// Exact prefix matches are the best.
if (!runs.empty() && runs[0].location == 0 && runs[0].length == patternLength)
trialScore *= 2.5;
// FIXME: popular/unpopular API.
// We really don't like matches that don't start at a token.
switch (nonTokenRuns) {
case 0:
break;
case 1:
trialScore *= 0.8125;
break;
case 2:
trialScore *= 0.5;
break;
case 3:
trialScore *= 0.25;
break;
default:
trialScore *= 0.0625;
break;
}
// FIXME: matched ranges output.
return trialScore;
}