lib/IDE/FuzzyStringMatcher.cpp - third_party/swift - Git at Google

 //===--- FuzzyStringMatcher.cpp -------------------------------------------===//
 //
 // This source file is part of the Swift.org open source project
 //
 // Copyright (c) 2014 - 2020 Apple Inc. and the Swift project authors
 // Licensed under Apache License v2.0 with Runtime Library Exception
 //
 // See https://swift.org/LICENSE.txt for license information
 // See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
 //
 //===----------------------------------------------------------------------===//

 #include "swift/IDE/FuzzyStringMatcher.h"
 #include "clang/Basic/CharInfo.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallString.h"

 using namespace swift;
 using namespace swift::ide;
 using clang::toUppercase;
 using clang::toLowercase;
 using clang::isUppercase;
 using clang::isLowercase;

 /// Precomputes the per-pattern data reused by every candidate query: the
 /// lowercased pattern, the set of byte values that can match some pattern
 /// character (in either case), and the score ceiling stored in maxScore.
 FuzzyStringMatcher::FuzzyStringMatcher(StringRef pattern_)
     : pattern(pattern_), charactersInPattern(1 << (sizeof(char) * 8)) {
   const auto patternLength = pattern.size();
   lowercasePattern.reserve(patternLength);

   unsigned uppercaseCount = 0;
   for (const char original : pattern) {
     const char lowered = toLowercase(original);
     // A character that changes when lowercased was uppercase.
     if (lowered != original)
       ++uppercaseCount;
     lowercasePattern.push_back(lowered);
     // Mark both cases so raw candidate characters can be tested directly.
     charactersInPattern.set(static_cast<unsigned char>(toUppercase(original)));
     charactersInPattern.set(static_cast<unsigned char>(lowered));
   }
   assert(pattern.size() == lowercasePattern.size());

   // FIXME: pull out the magic constants.
   // The ceilings below mirror the bonuses handed out by the matching code and
   // have to be revisited whenever that code changes substantially.
   if (patternLength != 1) {
     // Percent match bonus, match at start bonus, and the longest possible run.
     maxScore = 0.25 + 2.5 + patternLength * patternLength;
     if (uppercaseCount != 0) {
       // Best possible uppercase match score.
       const unsigned boosted = uppercaseCount + 1;
       maxScore += boosted * boosted;
     }
     // Exact match and exact prefix match multipliers.
     maxScore *= 1.1 * 2.5;
   } else {
     // Uppercase match plus the size bonus.
     maxScore = 3.0 + 0.001;
   }
 }

 /// Returns true when every pattern character occurs in \p candidate, in
 /// order and ignoring case (i.e. the pattern is a subsequence of it).
 bool FuzzyStringMatcher::matchesCandidate(StringRef candidate) const {
   const unsigned patternLength = pattern.size();
   const unsigned candidateLength = candidate.size();
   // A candidate shorter than the pattern can never contain all of it.
   if (candidateLength < patternLength)
     return false;

   // Walk the candidate once, consuming one pattern character per hit.
   unsigned matchedCount = 0;
   for (unsigned index = 0;
        index != candidateLength && matchedCount != patternLength; ++index) {
     const char candidateChar = candidate[index];
     const char wanted = lowercasePattern[matchedCount];
     if (wanted == candidateChar || wanted == toLowercase(candidateChar))
       ++matchedCount;
   }

   return matchedCount == patternLength;
 }

 /// Returns true if \p c is one of the punctuation characters that separate
 /// tokens inside a candidate.
 static bool isTokenizingChar(char c) {
   static constexpr char separators[] = {'/', '.', '_', '+', '-', ':',
                                         ',', ' ', '(', ')', '!', '?'};
   for (const char separator : separators) {
     if (c == separator)
       return true;
   }
   return false;
 }

 namespace {
 /// A simple index range.
 ///
 /// Used both for tokens of a candidate and for matched runs; it is always
 /// built with brace initialization as {location, length}.
 struct Range {
   unsigned location; ///< Index of the first character in the range.
   unsigned length;   ///< Number of characters in the range.
 };
 } // end anonymous namespace

 /// Splits \p candidate into tokens and records, for every character, the
 /// index of the token it belongs to.
 ///
 /// A new token begins after a tokenizing character, at a non-uppercase to
 /// uppercase transition, at a non-digit to digit transition, and at the last
 /// capital of an uppercase run that is followed by a lowercase letter.
 ///
 /// \param tokens receives the token ranges in candidate order.
 /// \param characterToTokenIndex must have one slot per candidate character;
 ///        index 0 is written unconditionally, so \p candidate must not be
 ///        empty.
 static void
 populateTokenTable(SmallVectorImpl<Range> &tokens,
                    llvm::MutableArrayRef<unsigned> characterToTokenIndex,
                    StringRef candidate) {
   unsigned tokenStart = 0;
   characterToTokenIndex[0] = 0;

   for (unsigned index = 1; index < candidate.size(); ++index) {
     const char here = candidate[index];
     const char before = candidate[index - 1];

     const bool afterSeparator = isTokenizingChar(before);
     const bool startsWord = isUppercase(here) && !isUppercase(before);
     const bool startsNumber = clang::isDigit(here) && !clang::isDigit(before);

     if (afterSeparator || startsWord || startsNumber) {
       // The current character opens a new token.
       tokens.push_back(Range{tokenStart, index - tokenStart});
       tokenStart = index;
     } else if (isLowercase(here) && isUppercase(before) &&
                tokenStart != index - 1) {
       // An uppercase run just ended: its last capital belongs to the new
       // token, e.g. the 'W' in NSWindow.
       tokens.push_back(Range{tokenStart, index - tokenStart - 1});
       characterToTokenIndex[index - 1] = tokens.size();
       tokenStart = index - 1;
     }
     characterToTokenIndex[index] = tokens.size();
   }

   // Close the final token, which runs to the end of the candidate.
   tokens.push_back(
       Range{tokenStart, static_cast<unsigned>(candidate.size() - tokenStart)});
 }

 /// Sentinel index (all bits set) meaning "no position"; it terminates the
 /// jump-table chains and marks a failed search.
 static constexpr unsigned notFound = ~0U;

 namespace {
 /// The candidate-specific matching data and algorithms.
 struct CandidateSpecificMatcher {
   // The following StringRefs are owned by FuzzyStringMatcher and must outlive
   // this object.
   StringRef pattern;
   StringRef lowercasePattern;
   StringRef candidate;
   /// The candidate with every character passed through toLowercase.
   SmallVector<char, 128> lowercaseCandidate;
   SmallVector<unsigned, 128> jumpTable; ///< The next matching character index.
   SmallVector<Range, 128> tokens; ///< Tokenized ranges from the candidate.
   /// For each candidate character, the index into \c tokens of its token.
   SmallVector<unsigned, 128> characterToTokenIndex;
   /// Matched runs, appended by scoreCandidateTrial.
   SmallVector<Range, 128> runs;

   /// Builds the lowercase candidate, the jump table and the token table.
   ///
   /// \param firstPatternPos receives the index of the first candidate
   /// character that occurs in the pattern, or notFound if there is none.
   CandidateSpecificMatcher(StringRef pattern, StringRef lowercasePattern,
                            StringRef candidate,
                            const llvm::BitVector &charactersInPattern,
                            unsigned &firstPatternPos);

   /// Calculates the candidate's score, matching the candidate from
   /// \p firstPatternPos or later.
   ///
   /// This drives scoreCandidateTrial by trying the possible matches.
   double scoreCandidate(unsigned firstPatternPos);

   /// Calculates the candidate's score, matching the candidate from
   /// exactly \p firstPatternPos.
   double scoreCandidateTrial(unsigned firstPatternPos);
 };
 } // end anonymous namespace

 /// Scores \p candidate against the pattern; higher is better and 0.0 means
 /// no match. The result is divided by maxScore when \c normalize is set.
 double FuzzyStringMatcher::scoreCandidate(StringRef candidate) const {
   // Nothing can match an empty side, or a candidate shorter than the pattern.
   if (pattern.empty() || candidate.empty() ||
       pattern.size() > candidate.size())
     return 0.0;

   double score = 0.0;
   if (pattern.size() != 1) {
     // FIXME: path separators would be handled here, jumping straight to the
     // last component if the pattern doesn't contain a separator.
     unsigned firstPatternPos = 0;
     CandidateSpecificMatcher matcher(pattern, lowercasePattern, candidate,
                                      charactersInPattern, firstPatternPos);
     score = matcher.scoreCandidate(firstPatternPos);
   } else {
     // Single character patterns stay simple and fast: only the first
     // candidate character is inspected.
     const char first = candidate[0];
     if (isUppercase(first) && first == pattern[0])
       score = 3.0;
     else if (lowercasePattern[0] == toLowercase(first))
       score = 2.0;

     // Tiny length-based bonus so that shorter results come first.
     if (score != 0.0)
       score += (1 / static_cast<double>(candidate.size())) * (1 / 1000.0);
   }

   return normalize ? score / maxScore : score;
 }

 /// Prepares the per-candidate tables: the lowercase copy of the candidate,
 /// the jump table linking each position to the next candidate character that
 /// occurs in the pattern, and the token table.
 ///
 /// \p firstPatternPos is set to the first candidate index whose character
 /// occurs in the pattern, or to notFound when there is no such character.
 CandidateSpecificMatcher::CandidateSpecificMatcher(
     StringRef pattern, StringRef lowercasePattern, StringRef candidate,
     const llvm::BitVector &charactersInPattern, unsigned &firstPatternPos)
     : pattern(pattern), lowercasePattern(lowercasePattern),
       candidate(candidate) {

   assert(!pattern.empty() && pattern.size() <= candidate.size());
   assert(pattern.size() == lowercasePattern.size());

   const unsigned length = candidate.size();
   lowercaseCandidate.resize(length);
   jumpTable.resize(length);

   // Scan right to left so each slot can point at the closest later hit.
   unsigned nextHit = notFound;
   for (unsigned index = length; index-- > 0;) {
     const char ch = candidate[index];
     jumpTable[index] = nextHit;
     lowercaseCandidate[index] = toLowercase(ch);
     if (charactersInPattern[static_cast<unsigned char>(ch)])
       nextHit = index;
   }
   firstPatternPos = nextHit;

   // Build the token table.
   characterToTokenIndex.resize(candidate.size());
   populateTokenTable(tokens, characterToTokenIndex, candidate);
 }

 /// Runs one trial from every candidate position at or after
 /// \p firstPatternPos whose character equals the first pattern character
 /// (case-insensitively) and returns the best trial score, or 0.0 if no trial
 /// scores above zero.
 ///
 /// Several trials are needed so that "a_b_c_abc" matching "abc" is matched
 /// on the "abc" part instead of the "a_b_c" part.
 double CandidateSpecificMatcher::scoreCandidate(unsigned firstPatternPos) {
   double bestScore = 0.0;

   // Follow the jump table, which only visits characters that occur somewhere
   // in the pattern; of those, only the ones equal to the first pattern
   // character can start a trial.
   for (unsigned position = firstPatternPos; position != notFound;
        position = jumpTable[position]) {
     if (lowercaseCandidate[position] != lowercasePattern[0])
       continue;

     const double trialScore = scoreCandidateTrial(position);
     if (trialScore > bestScore) {
       // FIXME: update output ranges, if necessary
       bestScore = trialScore;
     }
   }

   return bestScore;
 }

 /// Scores one run of consecutively matched characters.
 ///
 /// \param runStart candidate index where the run begins.
 /// \param runLength number of matched characters in the run.
 /// \param prevTokenStart start index of the token containing \p runStart.
 /// \param tokenIndex index of that token within the candidate.
 /// \param uppercaseMatches matches where pattern and candidate characters
 ///        were both uppercase.
 /// \param isTokenizingChar whether the character at \p runStart is a
 ///        tokenizing character.
 /// \returns 0.0 for an empty run or a run shorter than 3 that starts inside
 ///          a token; otherwise a score that grows with the square of the
 ///          run length.
 static double scoreRun(unsigned runStart, unsigned runLength,
                        unsigned prevTokenStart, unsigned tokenIndex,
                        unsigned uppercaseMatches, bool isTokenizingChar) {
   if (runLength == 0)
     return 0.0;

   // We really don't like not matching at token starts, but if it's a long
   // match give some credit.
   if (runStart != prevTokenStart && !isTokenizingChar) {
     if (runLength < 5) {
       return (runLength < 3) ? 0.0 : runLength;
     }

     // For really long matches, we'll give a high score.  Pretend it's a bit
     // shorter.
     runLength -= 2;
   }

   // Bonus if the match is the first or second token.
   double prefixBonus = (runStart == 0) ? 2.5 : ((tokenIndex < 2) ? 1.0 : 0.0);

   // Square in double rather than unsigned: the unsigned products wrap
   // around once the operand reaches 65536, which would yield a tiny score
   // for a very long run.
   const double length = runLength;
   const double uppercase = uppercaseMatches;
   double uppercaseBonus =
       uppercaseMatches ? (uppercase + 1.0) * (uppercase + 1.0) : 0.0;

   return (length * length) + uppercaseBonus + prefixBonus;
 }

 /// Scores a single trial in which the first pattern character is matched at
 /// candidate index \p firstPatternPos.
 ///
 /// The candidate is walked through the jump table, consuming pattern
 /// characters greedily. Consecutive matches form runs, each scored by
 /// scoreRun. If the pattern is not fully consumed the trial scores 0.0.
 /// Otherwise the summed run scores are compared with a camel-case score,
 /// a bonus for the fraction of the candidate matched is added, exact and
 /// exact-prefix matches are multiplied up, and runs that did not start at a
 /// token are penalized.
 ///
 /// NOTE(review): \c runs is a member that this function appends to, and
 /// nothing visible in this file clears it, so on the second and later trials
 /// for one candidate \c runs[0] appears to refer to a run recorded by an
 /// earlier trial. Confirm whether the exact-match and exact-prefix checks
 /// below are meant to look at the current trial only.
 double
 CandidateSpecificMatcher::scoreCandidateTrial(unsigned firstPatternPos) {
   double trialScore = 0.0; /// We run multiple trials so that "a_b_c_abc"
                            /// matching "abc" is matched on the "abc" part
                            /// instead of the "a_b_c" part.
   unsigned uppercaseMatches = 0;
   unsigned cidx = firstPatternPos;
   unsigned pidx = 0;
   unsigned runLength = 0;
   unsigned runStart = cidx;
   unsigned nonTokenRuns = 0;
   unsigned camelCaseLen = 0;
   unsigned camelCaseLastToken = 0;
   double camelCaseStartBonus = 0.0;
   unsigned camelCaseSkips = 0;

   unsigned patternLength = pattern.size();
   unsigned candidateLength = candidate.size();

   // cidx becomes notFound (all bits set) when the jump table runs out, which
   // also fails the cidx < candidateLength test.
   while (pidx < patternLength && cidx < candidateLength) {
     char lowerPatternChar = lowercasePattern[pidx];
     char lowerCandidateChar = lowercaseCandidate[cidx];
     unsigned nextCidx = jumpTable[cidx];
     bool matched = lowerPatternChar == lowerCandidateChar;
     if (matched) {
       if (isUppercase(pattern[pidx]) && isUppercase(candidate[cidx])) {
         ++uppercaseMatches;
       }

       ++runLength;
       ++pidx;
       if (pidx < patternLength)
         lowerPatternChar = lowercasePattern[pidx];
     }

     // If we're skipping forward and were running, the run ended.
     if (((cidx + 1) != nextCidx) || !matched) {
       if (runLength) {
         double runValue =
             scoreRun(runStart, runLength,
                      tokens[characterToTokenIndex[runStart]].location,
                      characterToTokenIndex[runStart], uppercaseMatches,
                      isTokenizingChar(candidate[runStart]));

         // If it's a poor match in the middle of a token, see if the next char
         // starts a token and also matches. If so, use it.
         if (runLength == 1 && pidx > 1 && runValue == 0.0 &&
             nextCidx != notFound &&
             characterToTokenIndex[runStart] < tokens.size() - 1) {
           bool foundIt = false;
           unsigned charToCheck = matched ? nextCidx : cidx;
           while (charToCheck != notFound) {
             if (tokens[characterToTokenIndex[charToCheck]].location ==
                     charToCheck &&
                 lowercasePattern[pidx - 1] == lowercaseCandidate[charToCheck]) {
               foundIt = true;
               break;
             }
             charToCheck = jumpTable[charToCheck];
           }

           if (foundIt) {
             // Un-consume the pattern character and retry it at the token
             // start that was found.
             --pidx;
             lowerPatternChar = lowercasePattern[pidx];
             runStart = cidx = charToCheck;
             runLength = 0;
             continue;
           }
         }

         // We really don't like matches that don't start at a token.
         if (runValue == 0.0) {
           ++nonTokenRuns;

         } else {
           unsigned tokenIndex = characterToTokenIndex[runStart];
           // Note: the tokenizing-character test here uses the character at
           // cidx, while the post-loop block below tests the one at runStart.
           if (runStart == tokens[tokenIndex].location ||
               isTokenizingChar(lowerCandidateChar)) {
             camelCaseLen += runLength;

             // Bonus for matching the beginning of the candidate.
             if (tokenIndex <= 1) {
               camelCaseStartBonus = 2.0;

               // Penalty for skipping a token.
             } else if (tokenIndex != camelCaseLastToken + 1) {
               camelCaseSkips += tokenIndex - camelCaseLastToken - 1;
             }

             camelCaseLastToken = tokenIndex;

             if (isTokenizingChar(lowerCandidateChar) && runLength == 1) {
               --camelCaseLastToken;
             }
           }
         }

         // Accumulate run and reset for next run.
         trialScore += runValue;
         runs.push_back({runStart, runLength});
         uppercaseMatches = 0;
         runLength = 0;
       }

       runStart = nextCidx;
     }

     cidx = nextCidx;
   }

   // The trial is done, did we find a match?
   // FIXME: this can happen spuriously in foo => ufDownOb.
   if (pidx != patternLength)
     return 0.0;

   // Okay, we found a match.

   // FIXME: this code is largely duplicated with the previous block. There are
   // some subtle differences that can be seen if you try to remove this one and
   // check for pidx == patternLength for the other block.
   if (runLength) {
     double runValue = scoreRun(
         runStart, runLength, tokens[characterToTokenIndex[runStart]].location,
         characterToTokenIndex[runStart], uppercaseMatches,
         isTokenizingChar(candidate[runStart]));

     // If it's a poor match in the middle of a token, see if the next char
     // starts a token and also matches. If so, use it.
     if (runLength == 1 && runValue == 0.0) {
       unsigned nextCidx = jumpTable[runStart];
       if (nextCidx != notFound &&
           characterToTokenIndex[runStart] < tokens.size() - 1) {
         bool foundIt = false;
         while (nextCidx != notFound) {
           if (tokens[characterToTokenIndex[nextCidx]].location == nextCidx &&
               lowercasePattern[pidx - 1] == lowercaseCandidate[nextCidx]) {
             foundIt = true;
             break;
           }
           nextCidx = jumpTable[nextCidx];
         }

         if (foundIt) {
           // Move the one-character run to the token start and rescore it.
           runStart = nextCidx;
           uppercaseMatches +=
               (isUppercase(pattern[pidx - 1]) &&
                isUppercase(candidate[runStart])) ? 1 : 0;
           runValue = scoreRun(runStart, runLength,
                               tokens[characterToTokenIndex[runStart]].location,
                               characterToTokenIndex[runStart], uppercaseMatches,
                               isTokenizingChar(candidate[runStart]));
         }
       }
     }

     // We really don't like matches that don't start at a token.
     if (runValue == 0.0) {
       ++nonTokenRuns;

     } else {
       unsigned tokenIndex = characterToTokenIndex[runStart];
       if (runStart == tokens[tokenIndex].location ||
           isTokenizingChar(lowercaseCandidate[runStart])) {
         camelCaseLen += runLength;

         if (tokenIndex <= 1) {
           // Bonus for matching the beginning of the candidate.
           camelCaseStartBonus = 2.0;
         } else if (tokenIndex != camelCaseLastToken + 1) {
           // Penalty for skipping a token.
           camelCaseSkips += tokenIndex - camelCaseLastToken - 1;
         }
       }
     }

     // Accumulate run.
     trialScore += runValue;
     runs.push_back({runStart, runLength});
   }

   // Unless there were bad matches, prefer camel case matches.
   if (nonTokenRuns == 0 && camelCaseSkips < 3) {
     double camelCaseScore = (camelCaseLen * camelCaseLen) + camelCaseStartBonus;
     if (camelCaseSkips == 1) {
       camelCaseScore *= 0.9;
     } else if (camelCaseSkips == 2) {
       camelCaseScore *= 0.8;
     }

     if (trialScore < camelCaseScore) {
       // Camel case matched better.
       trialScore = camelCaseScore;
     }
   }

   // FIXME: using the range up to a dot is silly when candidate isn't a file.
   auto dotLoc = candidate.find_last_of('.');
   unsigned baseNameLength =
       dotLoc != StringRef::npos && dotLoc > 1 ? dotLoc : candidateLength;

   // FIXME: file type bonus if we're checking a file path.

   // Add a bit for the percentage of the candidate matched.
   trialScore += patternLength / static_cast<double>(baseNameLength) * 0.25;

   // Exact matches are even better.
   if (patternLength >= baseNameLength && !runs.empty() &&
       runs[0].location == 0) {
     trialScore *= 1.1;
   }

   // Exact prefix matches are the best.
   if (!runs.empty() && runs[0].location == 0 && runs[0].length == patternLength)
     trialScore *= 2.5;

   // FIXME: popular/unpopular API.

   // We really don't like matches that don't start at a token.
   switch (nonTokenRuns) {
   case 0:
     break;
   case 1:
     trialScore *= 0.8125;
     break;
   case 2:
     trialScore *= 0.5;
     break;
   case 3:
     trialScore *= 0.25;
     break;
   default:
     trialScore *= 0.0625;
     break;
   }

   // FIXME: matched ranges output.

   return trialScore;
 }
	//===--- FuzzyStringMatcher.cpp -------------------------------------------===//
	//
	// This source file is part of the Swift.org open source project
	//
	// Copyright (c) 2014 - 2020 Apple Inc. and the Swift project authors
	// Licensed under Apache License v2.0 with Runtime Library Exception
	//
	// See https://swift.org/LICENSE.txt for license information
	// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
	//
	//===----------------------------------------------------------------------===//

	#include "swift/IDE/FuzzyStringMatcher.h"
	#include "clang/Basic/CharInfo.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/SmallString.h"

	using namespace swift;
	using namespace swift::ide;
	using clang::toUppercase;
	using clang::toLowercase;
	using clang::isUppercase;
	using clang::isLowercase;

	/// Precomputes the per-pattern data reused by every candidate query: the
	/// lowercased pattern, the set of byte values that can match some pattern
	/// character (in either case), and the score ceiling stored in maxScore.
	FuzzyStringMatcher::FuzzyStringMatcher(StringRef pattern_)
	    : pattern(pattern_), charactersInPattern(1 << (sizeof(char) * 8)) {
	  lowercasePattern.reserve(pattern.size());
	  unsigned upperCharCount = 0;
	  for (char c : pattern) {
	    char lower = toLowercase(c);
	    // A character that changes when lowercased was uppercase.
	    upperCharCount += (c == lower) ? 0 : 1;
	    lowercasePattern.push_back(lower);
	    // Mark both cases so raw candidate characters can be tested directly.
	    charactersInPattern.set(static_cast<unsigned char>(lower));
	    charactersInPattern.set(static_cast<unsigned char>(toUppercase(c)));
	  }
	  assert(pattern.size() == lowercasePattern.size());

	  // FIXME: pull out the magic constants.
	  // This depends on the inner details of the matching algorithm and will need
	  // to be updated if we substantially alter it.
	  if (pattern.size() == 1) {
	    maxScore = 3.0 +  // uppercase match
	               0.001; // size bonus
	  } else {
	    maxScore = 0.25 +                           // percent match bonus
	               2.5 +                            // match at start bonus
	               pattern.size() * pattern.size(); // max run length score
	    if (upperCharCount)                         // max uppercase match score
	      maxScore += (upperCharCount + 1) * (upperCharCount + 1);
	    // Scale by the exact match (1.1) and exact prefix match (2.5)
	    // multipliers; the operators had been lost from this statement.
	    maxScore *= 1.1 * 2.5;
	  }
	}

	/// Returns true when every pattern character occurs in \p candidate, in
	/// order and ignoring case (i.e. the pattern is a subsequence of it).
	bool FuzzyStringMatcher::matchesCandidate(StringRef candidate) const {
	  unsigned patternLength = pattern.size();
	  unsigned candidateLength = candidate.size();
	  // A candidate shorter than the pattern can never contain all of it.
	  if (patternLength > candidateLength)
	    return false;

	  // Do all of the pattern characters match the candidate in order?
	  unsigned pidx = 0, cidx = 0;
	  while (pidx < patternLength && cidx < candidateLength) {
	    char c = candidate[cidx];
	    char p = lowercasePattern[pidx];
	    if (p == c || p == toLowercase(c))
	      ++pidx;
	    ++cidx;
	  }

	  return pidx == patternLength;
	}

	/// Returns true if \p c is one of the punctuation characters that separate
	/// tokens inside a candidate.
	static bool isTokenizingChar(char c) {
	  static constexpr char separators[] = {'/', '.', '_', '+', '-', ':',
	                                        ',', ' ', '(', ')', '!', '?'};
	  for (const char separator : separators) {
	    if (c == separator)
	      return true;
	  }
	  return false;
	}

	namespace {
	/// A simple index range.
	///
	/// Used both for tokens of a candidate and for matched runs; it is always
	/// built with brace initialization as {location, length}.
	struct Range {
	// Index of the first character in the range.
	unsigned location;
	// Number of characters in the range.
	unsigned length;
	};
	} // end anonymous namespace

	/// Splits \p candidate into tokens and records, for every character, the
	/// index of the token it belongs to.
	///
	/// A new token begins after a tokenizing character, at a non-uppercase to
	/// uppercase transition, at a non-digit to digit transition, and at the last
	/// capital of an uppercase run that is followed by a lowercase letter.
	///
	/// \param tokens receives the token ranges in candidate order.
	/// \param characterToTokenIndex must have one slot per candidate character;
	///        index 0 is written unconditionally, so \p candidate must not be
	///        empty.
	static void
	populateTokenTable(SmallVectorImpl<Range> &tokens,
	                   llvm::MutableArrayRef<unsigned> characterToTokenIndex,
	                   StringRef candidate) {
	  unsigned start = 0;
	  characterToTokenIndex[0] = 0;

	  for (unsigned cidx = 1; cidx < candidate.size(); ++cidx) {
	    char current = candidate[cidx];
	    char prev = candidate[cidx - 1];

	    // Is this a special tokenizing character like '_', or the start of a camel
	    // case word?  The uppercase character should start a new token.
	    if (isTokenizingChar(prev) ||
	        (isUppercase(current) && !isUppercase(prev)) ||
	        (clang::isDigit(current) && !clang::isDigit(prev))) {
	      tokens.push_back({start, cidx - start});
	      start = cidx;
	    } else if (isLowercase(current) && isUppercase(prev) && start != cidx - 1) {
	      // Or is this the end of a run of uppercase characters?
	      // E.g. in NSWindow, the 'W' should start a new token.
	      tokens.push_back({start, cidx - start - 1});
	      characterToTokenIndex[cidx - 1] = tokens.size();
	      start = cidx - 1;
	    }
	    characterToTokenIndex[cidx] = tokens.size();
	  }

	  // Close the final token, which runs to the end of the candidate.
	  tokens.push_back({start, static_cast<unsigned>(candidate.size() - start)});
	}

	/// Sentinel index (all bits set) meaning "no position"; it terminates the
	/// jump-table chains and marks a failed search.
	static constexpr unsigned notFound = ~0U;

	namespace {
	/// The candidate-specific matching data and algorithms.
	struct CandidateSpecificMatcher {
	// The following StringRefs are owned by FuzzyStringMatcher and must outlive
	// this object.
	StringRef pattern;
	StringRef lowercasePattern;
	StringRef candidate;
	/// The candidate with every character passed through toLowercase.
	SmallVector<char, 128> lowercaseCandidate;
	SmallVector<unsigned, 128> jumpTable; ///< The next matching character index.
	SmallVector<Range, 128> tokens; ///< Tokenized ranges from the candidate.
	/// For each candidate character, the index into \c tokens of its token.
	SmallVector<unsigned, 128> characterToTokenIndex;
	/// Matched runs, appended by scoreCandidateTrial.
	SmallVector<Range, 128> runs;

	/// Builds the lowercase candidate, the jump table and the token table.
	///
	/// \param firstPatternPos receives the index of the first candidate
	/// character that occurs in the pattern, or notFound if there is none.
	CandidateSpecificMatcher(StringRef pattern, StringRef lowercasePattern,
	StringRef candidate,
	const llvm::BitVector &charactersInPattern,
	unsigned &firstPatternPos);

	/// Calculates the candidate's score, matching the candidate from
	/// \p firstPatternPos or later.
	///
	/// This drives scoreCandidateTrial by trying the possible matches.
	double scoreCandidate(unsigned firstPatternPos);

	/// Calculates the candidate's score, matching the candidate from
	/// exactly \p firstPatternPos.
	double scoreCandidateTrial(unsigned firstPatternPos);
	};
	} // end anonymous namespace

	/// Scores \p candidate against the pattern; higher is better and 0.0 means
	/// no match. The result is divided by maxScore when \c normalize is set.
	double FuzzyStringMatcher::scoreCandidate(StringRef candidate) const {
	  double finalScore = 0.0;
	  // Nothing can match an empty side, or a candidate shorter than the pattern.
	  if (candidate.empty() || pattern.empty() ||
	      candidate.size() < pattern.size())
	    return finalScore;

	  // Single character pattern matching should be simple and fast.  Just look
	  // at the first character.
	  if (pattern.size() == 1) {
	    char c = candidate[0];
	    if (c == pattern[0] && isUppercase(c)) {
	      finalScore = 3.0;
	    } else if (toLowercase(c) == lowercasePattern[0]) {
	      finalScore = 2.0;
	    }

	    // Make sure shorter results come first;
	    if (finalScore)
	      finalScore += (1 / static_cast<double>(candidate.size())) * (1 / 1000.0);

	    if (normalize)
	      finalScore /= maxScore;

	    return finalScore;
	  }

	  // FIXME: path separators would be handled here, jumping straight to the
	  // last component if the pattern doesn't contain a separator.

	  unsigned firstPatternPos = 0;
	  CandidateSpecificMatcher CSM(pattern, lowercasePattern, candidate,
	                               charactersInPattern, firstPatternPos);
	  finalScore = CSM.scoreCandidate(firstPatternPos);

	  if (normalize)
	    finalScore /= maxScore;

	  return finalScore;
	}

	/// Prepares the per-candidate tables: the lowercase copy of the candidate,
	/// the jump table linking each position to the next candidate character that
	/// occurs in the pattern, and the token table.
	///
	/// \p firstPatternPos is set to the first candidate index whose character
	/// occurs in the pattern, or to notFound when there is no such character.
	CandidateSpecificMatcher::CandidateSpecificMatcher(
	    StringRef pattern, StringRef lowercasePattern, StringRef candidate,
	    const llvm::BitVector &charactersInPattern, unsigned &firstPatternPos)
	    : pattern(pattern), lowercasePattern(lowercasePattern),
	      candidate(candidate) {

	  assert(!pattern.empty() && pattern.size() <= candidate.size());
	  assert(pattern.size() == lowercasePattern.size());

	  const unsigned length = candidate.size();
	  lowercaseCandidate.resize(length);
	  jumpTable.resize(length);

	  // Scan right to left so each slot can point at the closest later hit.
	  unsigned nextHit = notFound;
	  for (unsigned index = length; index-- > 0;) {
	    const char ch = candidate[index];
	    jumpTable[index] = nextHit;
	    lowercaseCandidate[index] = toLowercase(ch);
	    if (charactersInPattern[static_cast<unsigned char>(ch)])
	      nextHit = index;
	  }
	  firstPatternPos = nextHit;

	  // Build the token table.
	  characterToTokenIndex.resize(candidate.size());
	  populateTokenTable(tokens, characterToTokenIndex, candidate);
	}

	/// Runs one trial from every candidate position at or after
	/// \p firstPatternPos whose character equals the first pattern character
	/// (case-insensitively) and returns the best trial score, or 0.0 if no trial
	/// scores above zero.
	///
	/// Several trials are needed so that "a_b_c_abc" matching "abc" is matched
	/// on the "abc" part instead of the "a_b_c" part.
	double CandidateSpecificMatcher::scoreCandidate(unsigned firstPatternPos) {
	  double bestScore = 0.0;

	  // Follow the jump table, which only visits characters that occur somewhere
	  // in the pattern; of those, only the ones equal to the first pattern
	  // character can start a trial.
	  for (unsigned position = firstPatternPos; position != notFound;
	       position = jumpTable[position]) {
	    if (lowercaseCandidate[position] != lowercasePattern[0])
	      continue;

	    const double trialScore = scoreCandidateTrial(position);
	    if (trialScore > bestScore) {
	      // FIXME: update output ranges, if necessary
	      bestScore = trialScore;
	    }
	  }

	  return bestScore;
	}

	/// Scores one run of consecutively matched characters.
	///
	/// \param runStart candidate index where the run begins.
	/// \param runLength number of matched characters in the run.
	/// \param prevTokenStart start index of the token containing \p runStart.
	/// \param tokenIndex index of that token within the candidate.
	/// \param uppercaseMatches matches where pattern and candidate characters
	///        were both uppercase.
	/// \param isTokenizingChar whether the character at \p runStart is a
	///        tokenizing character.
	/// \returns 0.0 for an empty run or a run shorter than 3 that starts inside
	///          a token; otherwise a score that grows with the square of the
	///          run length.
	static double scoreRun(unsigned runStart, unsigned runLength,
	                       unsigned prevTokenStart, unsigned tokenIndex,
	                       unsigned uppercaseMatches, bool isTokenizingChar) {
	  if (runLength == 0)
	    return 0.0;

	  // We really don't like not matching at token starts, but if it's a long
	  // match give some credit.
	  if (runStart != prevTokenStart && !isTokenizingChar) {
	    if (runLength < 5) {
	      return (runLength < 3) ? 0.0 : runLength;
	    }

	    // For really long matches, we'll give a high score.  Pretend it's a bit
	    // shorter.
	    runLength -= 2;
	  }

	  // Bonus if the match is the first or second token.
	  double prefixBonus = (runStart == 0) ? 2.5 : ((tokenIndex < 2) ? 1.0 : 0.0);

	  // Square in double rather than unsigned: the unsigned products wrap
	  // around once the operand reaches 65536, which would yield a tiny score
	  // for a very long run.
	  const double length = runLength;
	  const double uppercase = uppercaseMatches;
	  double uppercaseBonus =
	      uppercaseMatches ? (uppercase + 1.0) * (uppercase + 1.0) : 0.0;

	  return (length * length) + uppercaseBonus + prefixBonus;
	}

	/// Scores a single trial in which the first pattern character is matched at
	/// candidate index \p firstPatternPos.
	///
	/// The candidate is walked through the jump table, consuming pattern
	/// characters greedily. Consecutive matches form runs, each scored by
	/// scoreRun. If the pattern is not fully consumed the trial scores 0.0.
	/// Otherwise the summed run scores are compared with a camel-case score,
	/// a bonus for the fraction of the candidate matched is added, exact and
	/// exact-prefix matches are multiplied up, and runs that did not start at a
	/// token are penalized.
	///
	/// NOTE(review): \c runs is a member that this function appends to, and
	/// nothing visible in this file clears it, so on the second and later trials
	/// for one candidate \c runs[0] appears to refer to a run recorded by an
	/// earlier trial. Confirm whether the exact-match and exact-prefix checks
	/// below are meant to look at the current trial only.
	double
	CandidateSpecificMatcher::scoreCandidateTrial(unsigned firstPatternPos) {
	  double trialScore = 0.0; /// We run multiple trials so that "a_b_c_abc"
	                           /// matching "abc" is matched on the "abc" part
	                           /// instead of the "a_b_c" part.
	  unsigned uppercaseMatches = 0;
	  unsigned cidx = firstPatternPos;
	  unsigned pidx = 0;
	  unsigned runLength = 0;
	  unsigned runStart = cidx;
	  unsigned nonTokenRuns = 0;
	  unsigned camelCaseLen = 0;
	  unsigned camelCaseLastToken = 0;
	  double camelCaseStartBonus = 0.0;
	  unsigned camelCaseSkips = 0;

	  unsigned patternLength = pattern.size();
	  unsigned candidateLength = candidate.size();

	  // cidx becomes notFound (all bits set) when the jump table runs out, which
	  // also fails the cidx < candidateLength test.
	  while (pidx < patternLength && cidx < candidateLength) {
	    char lowerPatternChar = lowercasePattern[pidx];
	    char lowerCandidateChar = lowercaseCandidate[cidx];
	    unsigned nextCidx = jumpTable[cidx];
	    bool matched = lowerPatternChar == lowerCandidateChar;
	    if (matched) {
	      if (isUppercase(pattern[pidx]) && isUppercase(candidate[cidx])) {
	        ++uppercaseMatches;
	      }

	      ++runLength;
	      ++pidx;
	      if (pidx < patternLength)
	        lowerPatternChar = lowercasePattern[pidx];
	    }

	    // If we're skipping forward and were running, the run ended.
	    if (((cidx + 1) != nextCidx) || !matched) {
	      if (runLength) {
	        double runValue =
	            scoreRun(runStart, runLength,
	                     tokens[characterToTokenIndex[runStart]].location,
	                     characterToTokenIndex[runStart], uppercaseMatches,
	                     isTokenizingChar(candidate[runStart]));

	        // If it's a poor match in the middle of a token, see if the next char
	        // starts a token and also matches. If so, use it.
	        if (runLength == 1 && pidx > 1 && runValue == 0.0 &&
	            nextCidx != notFound &&
	            characterToTokenIndex[runStart] < tokens.size() - 1) {
	          bool foundIt = false;
	          unsigned charToCheck = matched ? nextCidx : cidx;
	          while (charToCheck != notFound) {
	            if (tokens[characterToTokenIndex[charToCheck]].location ==
	                    charToCheck &&
	                lowercasePattern[pidx - 1] == lowercaseCandidate[charToCheck]) {
	              foundIt = true;
	              break;
	            }
	            charToCheck = jumpTable[charToCheck];
	          }

	          if (foundIt) {
	            // Un-consume the pattern character and retry it at the token
	            // start that was found.
	            --pidx;
	            lowerPatternChar = lowercasePattern[pidx];
	            runStart = cidx = charToCheck;
	            runLength = 0;
	            continue;
	          }
	        }

	        // We really don't like matches that don't start at a token.
	        if (runValue == 0.0) {
	          ++nonTokenRuns;

	        } else {
	          unsigned tokenIndex = characterToTokenIndex[runStart];
	          // Note: the tokenizing-character test here uses the character at
	          // cidx, while the post-loop block below tests the one at runStart.
	          if (runStart == tokens[tokenIndex].location ||
	              isTokenizingChar(lowerCandidateChar)) {
	            camelCaseLen += runLength;

	            // Bonus for matching the beginning of the candidate.
	            if (tokenIndex <= 1) {
	              camelCaseStartBonus = 2.0;

	              // Penalty for skipping a token.
	            } else if (tokenIndex != camelCaseLastToken + 1) {
	              camelCaseSkips += tokenIndex - camelCaseLastToken - 1;
	            }

	            camelCaseLastToken = tokenIndex;

	            if (isTokenizingChar(lowerCandidateChar) && runLength == 1) {
	              --camelCaseLastToken;
	            }
	          }
	        }

	        // Accumulate run and reset for next run.
	        trialScore += runValue;
	        runs.push_back({runStart, runLength});
	        uppercaseMatches = 0;
	        runLength = 0;
	      }

	      runStart = nextCidx;
	    }

	    cidx = nextCidx;
	  }

	  // The trial is done, did we find a match?
	  // FIXME: this can happen spuriously in foo => ufDownOb.
	  if (pidx != patternLength)
	    return 0.0;

	  // Okay, we found a match.

	  // FIXME: this code is largely duplicated with the previous block. There are
	  // some subtle differences that can be seen if you try to remove this one and
	  // check for pidx == patternLength for the other block.
	  if (runLength) {
	    double runValue = scoreRun(
	        runStart, runLength, tokens[characterToTokenIndex[runStart]].location,
	        characterToTokenIndex[runStart], uppercaseMatches,
	        isTokenizingChar(candidate[runStart]));

	    // If it's a poor match in the middle of a token, see if the next char
	    // starts a token and also matches. If so, use it.
	    if (runLength == 1 && runValue == 0.0) {
	      unsigned nextCidx = jumpTable[runStart];
	      if (nextCidx != notFound &&
	          characterToTokenIndex[runStart] < tokens.size() - 1) {
	        bool foundIt = false;
	        while (nextCidx != notFound) {
	          if (tokens[characterToTokenIndex[nextCidx]].location == nextCidx &&
	              lowercasePattern[pidx - 1] == lowercaseCandidate[nextCidx]) {
	            foundIt = true;
	            break;
	          }
	          nextCidx = jumpTable[nextCidx];
	        }

	        if (foundIt) {
	          // Move the one-character run to the token start and rescore it.
	          runStart = nextCidx;
	          uppercaseMatches +=
	              (isUppercase(pattern[pidx - 1]) &&
	               isUppercase(candidate[runStart])) ? 1 : 0;
	          runValue = scoreRun(runStart, runLength,
	                              tokens[characterToTokenIndex[runStart]].location,
	                              characterToTokenIndex[runStart], uppercaseMatches,
	                              isTokenizingChar(candidate[runStart]));
	        }
	      }
	    }

	    // We really don't like matches that don't start at a token.
	    if (runValue == 0.0) {
	      ++nonTokenRuns;

	    } else {
	      unsigned tokenIndex = characterToTokenIndex[runStart];
	      if (runStart == tokens[tokenIndex].location ||
	          isTokenizingChar(lowercaseCandidate[runStart])) {
	        camelCaseLen += runLength;

	        if (tokenIndex <= 1) {
	          // Bonus for matching the beginning of the candidate.
	          camelCaseStartBonus = 2.0;
	        } else if (tokenIndex != camelCaseLastToken + 1) {
	          // Penalty for skipping a token.
	          camelCaseSkips += tokenIndex - camelCaseLastToken - 1;
	        }
	      }
	    }

	    // Accumulate run.
	    trialScore += runValue;
	    runs.push_back({runStart, runLength});
	  }

	  // Unless there were bad matches, prefer camel case matches.
	  if (nonTokenRuns == 0 && camelCaseSkips < 3) {
	    double camelCaseScore = (camelCaseLen * camelCaseLen) + camelCaseStartBonus;
	    if (camelCaseSkips == 1) {
	      camelCaseScore *= 0.9;
	    } else if (camelCaseSkips == 2) {
	      camelCaseScore *= 0.8;
	    }

	    if (trialScore < camelCaseScore) {
	      // Camel case matched better.
	      trialScore = camelCaseScore;
	    }
	  }

	  // FIXME: using the range up to a dot is silly when candidate isn't a file.
	  auto dotLoc = candidate.find_last_of('.');
	  unsigned baseNameLength =
	      dotLoc != StringRef::npos && dotLoc > 1 ? dotLoc : candidateLength;

	  // FIXME: file type bonus if we're checking a file path.

	  // Add a bit for the percentage of the candidate matched.
	  trialScore += patternLength / static_cast<double>(baseNameLength) * 0.25;

	  // Exact matches are even better.
	  if (patternLength >= baseNameLength && !runs.empty() &&
	      runs[0].location == 0) {
	    trialScore *= 1.1;
	  }

	  // Exact prefix matches are the best.
	  if (!runs.empty() && runs[0].location == 0 && runs[0].length == patternLength)
	    trialScore *= 2.5;

	  // FIXME: popular/unpopular API.

	  // We really don't like matches that don't start at a token.
	  switch (nonTokenRuns) {
	  case 0:
	    break;
	  case 1:
	    trialScore *= 0.8125;
	    break;
	  case 2:
	    trialScore *= 0.5;
	    break;
	  case 3:
	    trialScore *= 0.25;
	    break;
	  default:
	    trialScore *= 0.0625;
	    break;
	  }

	  // FIXME: matched ranges output.

	  return trialScore;
	}