src/developer/debug/zxdb/expr/expr_tokenizer.cc - fuchsia - Git at Google

 // Copyright 2018 The Fuchsia Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "src/developer/debug/zxdb/expr/expr_tokenizer.h"

 #include <ctype.h>
 #include <lib/syslog/cpp/macros.h>

 #include <type_traits>

 #include "src/developer/debug/zxdb/expr/number_parser.h"
 #include "src/developer/debug/zxdb/expr/parse_special_identifier.h"
 #include "src/developer/debug/zxdb/expr/parse_string.h"
 #include "src/lib/fxl/strings/string_printf.h"

 namespace zxdb {

 namespace {

 // Returns true if |c| is accepted as the first character of a name token: an ASCII letter, an
 // underscore, or a tilde.
 bool IsNameFirstChar(char c) {
   if (c == '_' || c == '~')
     return true;

   const bool is_upper = c >= 'A' && c <= 'Z';
   const bool is_lower = c >= 'a' && c <= 'z';
   return is_upper || is_lower;
 }

 // Returns true if |c| is accepted after the first character of a name token: anything that
 // IsNameFirstChar() accepts, plus the ASCII digits.
 bool IsNameContinuingChar(char c) {
   const bool is_digit = c >= '0' && c <= '9';
   return is_digit || IsNameFirstChar(c);
 }

 // Returns true if |c| can begin an integer literal, i.e. it is a decimal digit.
 //
 // The <ctype.h> classifiers have undefined behavior when given a negative value other than EOF,
 // and plain char may be signed, so the character is converted to unsigned char before the call.
 bool IsIntegerFirstChar(char c) { return isdigit(static_cast<unsigned char>(c)) != 0; }

 // This allows all alphanumeric characters for simplicity. Integer literals aren't validated at the
 // tokenizer level and will be checked later. Our job is to find the extent of the literal.
 //
 // The character goes through unsigned char because isalnum() has undefined behavior for negative
 // values other than EOF and plain char may be signed.
 bool IsIntegerContinuingChar(char c) { return isalnum(static_cast<unsigned char>(c)) != 0; }

 // Returns a list of all tokens sharing the given first character.
 const std::vector<const ExprTokenRecord*>& TokensWithFirstChar(char c) {
   // Lookup table for all 7-bit characters.
   constexpr unsigned char kMaxLookupChar = 0x80;
   static std::vector<const ExprTokenRecord*> mapping[kMaxLookupChar];
   static bool initialized = false;

   if (!initialized) {
     // Construct the lookup table.
     initialized = true;
     for (size_t i = 0; i < kNumExprTokenTypes; i++) {
       const ExprTokenRecord& record = RecordForTokenType(static_cast<ExprTokenType>(i));
       if (!record.static_value.empty())
         mapping[static_cast<size_t>(record.static_value[0])].push_back(&record);
     }
   }

   if (static_cast<unsigned char>(c) >= kMaxLookupChar) {
     static std::vector<const ExprTokenRecord*> empty_records;
     return empty_records;
   }
   return mapping[static_cast<size_t>(c)];
 }

 }  // namespace

 // Creates a tokenizer for |input| in the expression language |lang|. The two arguments are only
 // stored in input_ and language_ here; no tokenizing happens until Tokenize() is called.
 ExprTokenizer::ExprTokenizer(const std::string& input, ExprLanguage lang)
     : input_(input), language_(lang) {}

 bool ExprTokenizer::Tokenize() {
   while (!done()) {
     AdvanceToNextToken();
     if (done())
       break;

     if (auto string_info = DoesBeginStringLiteral(language_, input_, cur_)) {
       // String literals are handled specially by the string parser.
       auto result = ParseStringLiteral(input_, *string_info, &cur_, &error_location_);
       if (result.has_error()) {
         err_ = result.err();
         break;
       }

       tokens_.emplace_back(ExprTokenType::kStringLiteral, result.value(),
                            string_info->string_begin);
       continue;
     }

     // Special escaped identifiers.
     if (cur_char() == '$') {
       // Here just discard the name and contents. These will be re-extracted by the parser. This
       // could be optimized to avoid the extra work, but we don't need that level of optimization.
       size_t token_begin = cur_;
       SpecialIdentifier special = SpecialIdentifier::kNone;
       std::string special_cont;
       err_ = ParseSpecialIdentifier(input_, &cur_, &special, &special_cont, &error_location_);
       if (err_.has_error())
         break;
       tokens_.emplace_back(ExprTokenType::kSpecialName,
                            input_.substr(token_begin, cur_ - token_begin), token_begin);
       continue;
     }

     // Floats.
     if (size_t float_len = GetFloatTokenLength(language_, input_.substr(cur_))) {
       tokens_.emplace_back(ExprTokenType::kFloat, input_.substr(cur_, float_len), cur_);
       cur_ += float_len;
       continue;
     }

     const ExprTokenRecord& record = ClassifyCurrent();
     if (has_error())
       break;

     size_t token_begin = cur_;
     AdvanceToEndOfToken(record);
     if (has_error())
       break;

     size_t token_end = cur_;
     std::string token_value(&input_[token_begin], token_end - token_begin);
     tokens_.emplace_back(record.type, token_value, token_begin);
   }
   return !has_error();
 }

 // static
 //
 // Returns true if the whole of |input| is a single name: non-empty, starting with a character
 // accepted by IsNameFirstChar() and continuing only with characters accepted by
 // IsNameContinuingChar().
 bool ExprTokenizer::IsNameToken(std::string_view input) {
   if (input.empty() || !IsNameFirstChar(input.front()))
     return false;

   for (char c : input.substr(1)) {
     if (!IsNameContinuingChar(c))
       return false;
   }
   return true;
 }

 // static
 //
 // Builds a two-line excerpt for error messages: |input| indented by two spaces, followed by a
 // second line with a '^' placed under the character at |byte_offset|.
 std::string ExprTokenizer::GetErrorContext(const std::string& input, size_t byte_offset) {
   // The offset must be inside the input. Pointing one character past the end is also allowed.
   FX_DCHECK(byte_offset <= input.size());

   // Possible future work:
   // - If multiline expressions are ever allowed, the context must not cross newlines or the caret
   //   will no longer line up.
   // - Clip input longer than 80 chars so the output is guaranteed not to wrap.

   std::string context("  ");
   context += input;
   context += "\n  ";
   context.append(byte_offset, ' ');  // Padding so the caret lands under the offending byte.
   context += '^';
   return context;
 }

 // Moves the cursor by |n| characters. No bounds check is performed here.
 void ExprTokenizer::AdvanceChars(int n) {
   cur_ = cur_ + n;
 }

 // Moves the cursor forward by exactly one character. No bounds check is performed here.
 void ExprTokenizer::AdvanceOneChar() {
   ++cur_;
 }

 // Skips whitespace at the cursor, stopping at the first non-whitespace character or at the end of
 // the input, whichever comes first.
 void ExprTokenizer::AdvanceToNextToken() {
   for (;;) {
     if (at_end() || !IsCurrentWhitespace())
       return;
     AdvanceOneChar();
   }
 }

 // Moves the cursor from the first character of a token of kind |record| to just past its end.
 //
 // Tokens with a static_value advance by that value's length. Integers and names are scanned one
 // character at a time. Any other record type records an internal error in err_ and
 // error_location_.
 void ExprTokenizer::AdvanceToEndOfToken(const ExprTokenRecord& record) {
   const size_t static_length = record.static_value.size();
   if (static_length != 0) {
     // Known sizes. Because the token matched we should always have enough characters.
     FX_DCHECK(input_.size() >= cur_ + record.static_value.size());
     cur_ += static_length;
     return;
   }

   // Variable-length tokens. The first character is always consumed; after that the scan
   // continues while the matching "continuing" predicate holds.
   if (record.type == ExprTokenType::kInteger) {
     AdvanceOneChar();
     while (!at_end() && IsIntegerContinuingChar(cur_char()))
       AdvanceOneChar();
     return;
   }

   if (record.type == ExprTokenType::kName) {
     AdvanceOneChar();
     while (!at_end() && IsNameContinuingChar(cur_char()))
       AdvanceOneChar();
     return;
   }

   // No other record type is expected to reach this function.
   FX_NOTREACHED();
   err_ = Err("Internal parser error.");
   error_location_ = cur_;
 }

 // Returns true if the statically-known token described by |record| begins at the cursor.
 //
 // The match fails if there is not enough input left, if the record does not apply to the current
 // language, if the text differs from record.static_value, or if the record is alphanumeric and
 // the following character would continue an identifier.
 bool ExprTokenizer::CurrentMatchesTokenRecord(const ExprTokenRecord& record) const {
   // Non-statically-known tokens shouldn't use this code path.
   FX_DCHECK(!record.static_value.empty());

   const size_t size = record.static_value.size();
   if (!can_advance(size))
     return false;  // Not enough room.

   if (!(record.languages & static_cast<unsigned>(language_)))
     return false;  // Doesn't apply to this language.

   if (std::string_view(&input_[cur_], size) != record.static_value)
     return false;  // Doesn't match the token static value.

   if (record.is_alphanum && cur_ + size < input_.size()) {
     // An alphanumeric token must not be a prefix of a longer identifier. An underscore continues
     // an identifier just like a letter or digit does, so it is checked too; otherwise a name
     // that starts with a keyword followed by '_' would be split in two. The cast avoids
     // undefined behavior in isalnum() when plain char is signed and the byte is >= 0x80.
     const char next = input_[cur_ + size];
     if (next == '_' || isalnum(static_cast<unsigned char>(next)))
       return false;  // Identifier character follows so won't match.
   }

   return true;
 }

 // Returns true if the character at the cursor is a line feed, carriage return, or space. These
 // are the only characters treated as whitespace. Must not be called at the end of the input.
 bool ExprTokenizer::IsCurrentWhitespace() const {
   FX_DCHECK(!at_end());
   switch (input_[cur_]) {
     case 0x0A:
     case 0x0D:
     case 0x20:
       return true;
     default:
       return false;
   }
 }

 // Determines which kind of token starts at the cursor and returns its record.
 //
 // Statically-known tokens are preferred, choosing the longest one that matches. Failing that, the
 // first character decides between an integer and a name. If nothing fits, err_ and
 // error_location_ are set and the kInvalid record is returned.
 const ExprTokenRecord& ExprTokenizer::ClassifyCurrent() {
   FX_DCHECK(!at_end());
   const char first = cur_char();

   // Only records whose static value begins with |first| can possibly match.
   const ExprTokenRecord* best = nullptr;
   for (const ExprTokenRecord* candidate : TokensWithFirstChar(first)) {
     if (CurrentMatchesTokenRecord(*candidate) &&
         (!best || candidate->static_value.size() > best->static_value.size())) {
       best = candidate;
     }
   }
   if (best)
     return *best;

   // Integers.
   if (IsIntegerFirstChar(first))
     return RecordForTokenType(ExprTokenType::kInteger);

   // Everything else is a general name.
   if (IsNameFirstChar(first))
     return RecordForTokenType(ExprTokenType::kName);

   // Nothing can start with this character.
   error_location_ = cur_;
   err_ = Err(fxl::StringPrintf("Invalid character '%c' in expression.\n", first) +
              GetErrorContext(input_, cur_));
   return RecordForTokenType(ExprTokenType::kInvalid);
 }

 }  // namespace zxdb
	// Copyright 2018 The Fuchsia Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include "src/developer/debug/zxdb/expr/expr_tokenizer.h"

	#include <ctype.h>
	#include <lib/syslog/cpp/macros.h>

	#include <type_traits>

	#include "src/developer/debug/zxdb/expr/number_parser.h"
	#include "src/developer/debug/zxdb/expr/parse_special_identifier.h"
	#include "src/developer/debug/zxdb/expr/parse_string.h"
	#include "src/lib/fxl/strings/string_printf.h"

	namespace zxdb {

	namespace {

	// Returns true if |c| is accepted as the first character of a name token: an ASCII letter, an
	// underscore, or a tilde.
	bool IsNameFirstChar(char c) {
	  if (c == '_' || c == '~')
	    return true;

	  const bool is_upper = c >= 'A' && c <= 'Z';
	  const bool is_lower = c >= 'a' && c <= 'z';
	  return is_upper || is_lower;
	}

	// Returns true if |c| is accepted after the first character of a name token: anything that
	// IsNameFirstChar() accepts, plus the ASCII digits.
	bool IsNameContinuingChar(char c) {
	  const bool is_digit = c >= '0' && c <= '9';
	  return is_digit || IsNameFirstChar(c);
	}

	// Returns true if |c| can begin an integer literal, i.e. it is a decimal digit.
	//
	// The <ctype.h> classifiers have undefined behavior when given a negative value other than EOF,
	// and plain char may be signed, so the character is converted to unsigned char before the call.
	bool IsIntegerFirstChar(char c) { return isdigit(static_cast<unsigned char>(c)) != 0; }

	// This allows all alphanumeric characters for simplicity. Integer literals aren't validated at the
	// tokenizer level and will be checked later. Our job is to find the extent of the literal.
	//
	// The character goes through unsigned char because isalnum() has undefined behavior for negative
	// values other than EOF and plain char may be signed.
	bool IsIntegerContinuingChar(char c) { return isalnum(static_cast<unsigned char>(c)) != 0; }

	// Returns a list of all tokens sharing the given first character.
	const std::vector<const ExprTokenRecord*>& TokensWithFirstChar(char c) {
	// Lookup table for all 7-bit characters.
	constexpr unsigned char kMaxLookupChar = 0x80;
	static std::vector<const ExprTokenRecord*> mapping[kMaxLookupChar];
	static bool initialized = false;

	if (!initialized) {
	// Construct the lookup table.
	initialized = true;
	for (size_t i = 0; i < kNumExprTokenTypes; i++) {
	const ExprTokenRecord& record = RecordForTokenType(static_cast<ExprTokenType>(i));
	if (!record.static_value.empty())
	mapping[static_cast<size_t>(record.static_value[0])].push_back(&record);
	}
	}

	if (static_cast<unsigned char>(c) >= kMaxLookupChar) {
	static std::vector<const ExprTokenRecord*> empty_records;
	return empty_records;
	}
	return mapping[static_cast<size_t>(c)];
	}

	} // namespace

	// Creates a tokenizer for |input| in the expression language |lang|. The two arguments are only
	// stored in input_ and language_ here; no tokenizing happens until Tokenize() is called.
	ExprTokenizer::ExprTokenizer(const std::string& input, ExprLanguage lang)
	: input_(input), language_(lang) {}

	bool ExprTokenizer::Tokenize() {
	while (!done()) {
	AdvanceToNextToken();
	if (done())
	break;

	if (auto string_info = DoesBeginStringLiteral(language_, input_, cur_)) {
	// String literals are handled specially by the string parser.
	auto result = ParseStringLiteral(input_, *string_info, &cur_, &error_location_);
	if (result.has_error()) {
	err_ = result.err();
	break;
	}

	tokens_.emplace_back(ExprTokenType::kStringLiteral, result.value(),
	string_info->string_begin);
	continue;
	}

	// Special escaped identifiers.
	if (cur_char() == '$') {
	// Here just discard the name and contents. These will be re-extracted by the parser. This
	// could be optimized to avoid the extra work, but we don't need that level of optimization.
	size_t token_begin = cur_;
	SpecialIdentifier special = SpecialIdentifier::kNone;
	std::string special_cont;
	err_ = ParseSpecialIdentifier(input_, &cur_, &special, &special_cont, &error_location_);
	if (err_.has_error())
	break;
	tokens_.emplace_back(ExprTokenType::kSpecialName,
	input_.substr(token_begin, cur_ - token_begin), token_begin);
	continue;
	}

	// Floats.
	if (size_t float_len = GetFloatTokenLength(language_, input_.substr(cur_))) {
	tokens_.emplace_back(ExprTokenType::kFloat, input_.substr(cur_, float_len), cur_);
	cur_ += float_len;
	continue;
	}

	const ExprTokenRecord& record = ClassifyCurrent();
	if (has_error())
	break;

	size_t token_begin = cur_;
	AdvanceToEndOfToken(record);
	if (has_error())
	break;

	size_t token_end = cur_;
	std::string token_value(&input_[token_begin], token_end - token_begin);
	tokens_.emplace_back(record.type, token_value, token_begin);
	}
	return !has_error();
	}

	// static
	//
	// Returns true if the whole of |input| is a single name: non-empty, starting with a character
	// accepted by IsNameFirstChar() and continuing only with characters accepted by
	// IsNameContinuingChar().
	bool ExprTokenizer::IsNameToken(std::string_view input) {
	  if (input.empty() || !IsNameFirstChar(input.front()))
	    return false;

	  for (char c : input.substr(1)) {
	    if (!IsNameContinuingChar(c))
	      return false;
	  }
	  return true;
	}

	// static
	//
	// Builds a two-line excerpt for error messages: |input| indented by one space, followed by a
	// second line with a '^' placed under the character at |byte_offset|.
	std::string ExprTokenizer::GetErrorContext(const std::string& input, size_t byte_offset) {
	  // The offset must be inside the input. Pointing one character past the end is also allowed.
	  FX_DCHECK(byte_offset <= input.size());

	  // Possible future work:
	  // - If multiline expressions are ever allowed, the context must not cross newlines or the caret
	  //   will no longer line up.
	  // - Clip input longer than 80 chars so the output is guaranteed not to wrap.

	  std::string context(" ");
	  context += input;
	  context += "\n ";
	  context.append(byte_offset, ' ');  // Padding so the caret lands under the offending byte.
	  context += '^';
	  return context;
	}

	// Moves the cursor by |n| characters. No bounds check is performed here.
	void ExprTokenizer::AdvanceChars(int n) {
	  cur_ = cur_ + n;
	}

	// Moves the cursor forward by exactly one character. No bounds check is performed here.
	void ExprTokenizer::AdvanceOneChar() {
	  ++cur_;
	}

	// Skips whitespace at the cursor, stopping at the first non-whitespace character or at the end of
	// the input, whichever comes first.
	void ExprTokenizer::AdvanceToNextToken() {
	  for (;;) {
	    if (at_end() || !IsCurrentWhitespace())
	      return;
	    AdvanceOneChar();
	  }
	}

	// Moves the cursor from the first character of a token of kind |record| to just past its end.
	//
	// Tokens with a static_value advance by that value's length. Integers and names are scanned one
	// character at a time. Any other record type records an internal error in err_ and
	// error_location_.
	void ExprTokenizer::AdvanceToEndOfToken(const ExprTokenRecord& record) {
	  const size_t static_length = record.static_value.size();
	  if (static_length != 0) {
	    // Known sizes. Because the token matched we should always have enough characters.
	    FX_DCHECK(input_.size() >= cur_ + record.static_value.size());
	    cur_ += static_length;
	    return;
	  }

	  // Variable-length tokens. The first character is always consumed; after that the scan
	  // continues while the matching "continuing" predicate holds.
	  if (record.type == ExprTokenType::kInteger) {
	    AdvanceOneChar();
	    while (!at_end() && IsIntegerContinuingChar(cur_char()))
	      AdvanceOneChar();
	    return;
	  }

	  if (record.type == ExprTokenType::kName) {
	    AdvanceOneChar();
	    while (!at_end() && IsNameContinuingChar(cur_char()))
	      AdvanceOneChar();
	    return;
	  }

	  // No other record type is expected to reach this function.
	  FX_NOTREACHED();
	  err_ = Err("Internal parser error.");
	  error_location_ = cur_;
	}

	// Returns true if the statically-known token described by |record| begins at the cursor.
	//
	// The match fails if there is not enough input left, if the record does not apply to the current
	// language, if the text differs from record.static_value, or if the record is alphanumeric and
	// the following character would continue an identifier.
	bool ExprTokenizer::CurrentMatchesTokenRecord(const ExprTokenRecord& record) const {
	  // Non-statically-known tokens shouldn't use this code path.
	  FX_DCHECK(!record.static_value.empty());

	  const size_t size = record.static_value.size();
	  if (!can_advance(size))
	    return false;  // Not enough room.

	  if (!(record.languages & static_cast<unsigned>(language_)))
	    return false;  // Doesn't apply to this language.

	  if (std::string_view(&input_[cur_], size) != record.static_value)
	    return false;  // Doesn't match the token static value.

	  if (record.is_alphanum && cur_ + size < input_.size()) {
	    // An alphanumeric token must not be a prefix of a longer identifier. An underscore continues
	    // an identifier just like a letter or digit does, so it is checked too; otherwise a name
	    // that starts with a keyword followed by '_' would be split in two. The cast avoids
	    // undefined behavior in isalnum() when plain char is signed and the byte is >= 0x80.
	    const char next = input_[cur_ + size];
	    if (next == '_' || isalnum(static_cast<unsigned char>(next)))
	      return false;  // Identifier character follows so won't match.
	  }

	  return true;
	}

	// Returns true if the character at the cursor is a line feed, carriage return, or space. These
	// are the only characters treated as whitespace. Must not be called at the end of the input.
	bool ExprTokenizer::IsCurrentWhitespace() const {
	  FX_DCHECK(!at_end());
	  switch (input_[cur_]) {
	    case 0x0A:
	    case 0x0D:
	    case 0x20:
	      return true;
	    default:
	      return false;
	  }
	}

	// Determines which kind of token starts at the cursor and returns its record.
	//
	// Statically-known tokens are preferred, choosing the longest one that matches. Failing that, the
	// first character decides between an integer and a name. If nothing fits, err_ and
	// error_location_ are set and the kInvalid record is returned.
	const ExprTokenRecord& ExprTokenizer::ClassifyCurrent() {
	  FX_DCHECK(!at_end());
	  const char first = cur_char();

	  // Only records whose static value begins with |first| can possibly match.
	  const ExprTokenRecord* best = nullptr;
	  for (const ExprTokenRecord* candidate : TokensWithFirstChar(first)) {
	    if (CurrentMatchesTokenRecord(*candidate) &&
	        (!best || candidate->static_value.size() > best->static_value.size())) {
	      best = candidate;
	    }
	  }
	  if (best)
	    return *best;

	  // Integers.
	  if (IsIntegerFirstChar(first))
	    return RecordForTokenType(ExprTokenType::kInteger);

	  // Everything else is a general name.
	  if (IsNameFirstChar(first))
	    return RecordForTokenType(ExprTokenType::kName);

	  // Nothing can start with this character.
	  error_location_ = cur_;
	  err_ = Err(fxl::StringPrintf("Invalid character '%c' in expression.\n", first) +
	             GetErrorContext(input_, cur_));
	  return RecordForTokenType(ExprTokenType::kInvalid);
	}

	} // namespace zxdb