blob: 832e9d6644c4cce4f6c737e31fe6efc7eca60529 [file] [log] [blame]
// Copyright 2018 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "src/developer/debug/zxdb/expr/expr_tokenizer.h"
#include <ctype.h>
#include <lib/syslog/cpp/macros.h>
#include <type_traits>
#include "src/developer/debug/zxdb/expr/number_parser.h"
#include "src/developer/debug/zxdb/expr/parse_special_identifier.h"
#include "src/developer/debug/zxdb/expr/parse_string.h"
#include "src/lib/fxl/strings/string_printf.h"
namespace zxdb {
namespace {
// Returns true when |c| may start a name token: an ASCII letter, an underscore, or a tilde.
bool IsNameFirstChar(char c) {
  if (c == '_' || c == '~')
    return true;

  const bool is_upper = 'A' <= c && c <= 'Z';
  const bool is_lower = 'a' <= c && c <= 'z';
  return is_upper || is_lower;
}
// Returns true when |c| may appear after the first character of a name token: any ASCII digit, or
// anything accepted as a first character.
bool IsNameContinuingChar(char c) {
  if (c >= '0' && c <= '9')
    return true;
  return IsNameFirstChar(c);
}
// Returns true when |c| is a decimal digit and can therefore start an integer literal.
//
// The value is converted to unsigned char before calling isdigit() because passing a negative
// value (any non-ASCII byte when char is signed) to the <ctype.h> functions is undefined behavior.
bool IsIntegerFirstChar(char c) { return isdigit(static_cast<unsigned char>(c)) != 0; }
// Returns true when |c| can continue an integer literal.
//
// This allows all alphanumeric characters for simplicity. Integer literals aren't validated at the
// tokenizer level and will be checked later. Our job is to find the extent of the literal.
//
// The value is converted to unsigned char before calling isalnum() because passing a negative
// value (any non-ASCII byte when char is signed) to the <ctype.h> functions is undefined behavior.
bool IsIntegerContinuingChar(char c) { return isalnum(static_cast<unsigned char>(c)) != 0; }
// Returns a list of all tokens sharing the given first character.
const std::vector<const ExprTokenRecord*>& TokensWithFirstChar(char c) {
// Lookup table for all 7-bit characters.
constexpr unsigned char kMaxLookupChar = 0x80;
static std::vector<const ExprTokenRecord*> mapping[kMaxLookupChar];
static bool initialized = false;
if (!initialized) {
// Construct the lookup table.
initialized = true;
for (size_t i = 0; i < kNumExprTokenTypes; i++) {
const ExprTokenRecord& record = RecordForTokenType(static_cast<ExprTokenType>(i));
if (!record.static_value.empty())
mapping[static_cast<size_t>(record.static_value[0])].push_back(&record);
}
}
if (static_cast<unsigned char>(c) >= kMaxLookupChar) {
static std::vector<const ExprTokenRecord*> empty_records;
return empty_records;
}
return mapping[static_cast<size_t>(c)];
}
} // namespace
// Sets up a tokenizer for |input|, remembering |lang| as the language consulted while tokenizing.
// Nothing is tokenized here; the constructor only initializes the two members.
ExprTokenizer::ExprTokenizer(const std::string& input, ExprLanguage lang)
: input_(input), language_(lang) {}
bool ExprTokenizer::Tokenize() {
while (!done()) {
AdvanceToNextToken();
if (done())
break;
if (auto string_info = DoesBeginStringLiteral(language_, input_, cur_)) {
// String literals are handled specially by the string parser.
auto result = ParseStringLiteral(input_, *string_info, &cur_, &error_location_);
if (result.has_error()) {
err_ = result.err();
break;
}
tokens_.emplace_back(ExprTokenType::kStringLiteral, result.value(),
string_info->string_begin);
continue;
}
// Special escaped identifiers.
if (cur_char() == '$') {
// Here just discard the name and contents. These will be re-extracted by the parser. This
// could be optimized to avoid the extra work, but we don't need that level of optimization.
size_t token_begin = cur_;
SpecialIdentifier special = SpecialIdentifier::kNone;
std::string special_cont;
err_ = ParseSpecialIdentifier(input_, &cur_, &special, &special_cont, &error_location_);
if (err_.has_error())
break;
tokens_.emplace_back(ExprTokenType::kSpecialName,
input_.substr(token_begin, cur_ - token_begin), token_begin);
continue;
}
// Floats.
if (size_t float_len = GetFloatTokenLength(language_, input_.substr(cur_))) {
tokens_.emplace_back(ExprTokenType::kFloat, input_.substr(cur_, float_len), cur_);
cur_ += float_len;
continue;
}
const ExprTokenRecord& record = ClassifyCurrent();
if (has_error())
break;
size_t token_begin = cur_;
AdvanceToEndOfToken(record);
if (has_error())
break;
size_t token_end = cur_;
std::string token_value(&input_[token_begin], token_end - token_begin);
tokens_.emplace_back(record.type, token_value, token_begin);
}
return !has_error();
}
// Returns true when |input| is non-empty, starts with a valid name-first character, and every
// following character is a valid name-continuing character.
//
// static
bool ExprTokenizer::IsNameToken(std::string_view input) {
  if (input.empty() || !IsNameFirstChar(input.front()))
    return false;

  for (char c : input.substr(1)) {
    if (!IsNameContinuingChar(c))
      return false;
  }
  return true;
}
// Builds a two-line string for error messages: the input prefixed by a space, followed by a line
// with a '^' positioned under the character at |byte_offset|.
//
// |byte_offset| must lie within the input; one past the end is also accepted so that an error at
// the end of the input can be indicated.
//
// Future enhancements:
//  - If we allow multiline expressions in the input, the returned context should not cross
//    newlines or it will be messed up.
//  - Input longer than 80 chars should be clipped to guarantee it doesn't wrap.
//
// static
std::string ExprTokenizer::GetErrorContext(const std::string& input, size_t byte_offset) {
  FX_DCHECK(byte_offset <= input.size());

  // First line: the input itself.
  std::string context;
  context.append(" ").append(input).append("\n ");

  // Second line: padding up to the offset, then the marker.
  context.append(byte_offset, ' ');
  context += '^';
  return context;
}
// Moves the current position forward by |n| characters. No bounds check is performed here.
void ExprTokenizer::AdvanceChars(int n) {
  cur_ += n;
}
// Moves the current position forward by exactly one character. No bounds check is performed here.
void ExprTokenizer::AdvanceOneChar() {
  ++cur_;
}
// Skips over whitespace starting at the current position, stopping at the first non-whitespace
// character or at the end of the input.
void ExprTokenizer::AdvanceToNextToken() {
  for (; !at_end(); AdvanceOneChar()) {
    if (!IsCurrentWhitespace())
      break;
  }
}
// Moves the current position past the token described by |record|, which must already have been
// matched at the current position.
//
// Tokens with a static value are skipped by their known length. Integers and names are scanned one
// character at a time. Any other variable-length type is an internal error, recorded in err_ and
// error_location_.
void ExprTokenizer::AdvanceToEndOfToken(const ExprTokenRecord& record) {
  const size_t fixed_length = record.static_value.size();
  if (fixed_length > 0) {
    // Known sizes. Because the token matched we should always have enough characters.
    FX_DCHECK(input_.size() >= cur_ + fixed_length);
    cur_ += fixed_length;
    return;
  }

  // Variable-length tokens: the first character was already accepted by the classifier, so consume
  // it unconditionally and then keep going while continuing characters follow.
  if (record.type == ExprTokenType::kInteger) {
    AdvanceOneChar();
    while (!at_end() && IsIntegerContinuingChar(cur_char()))
      AdvanceOneChar();
    return;
  }

  if (record.type == ExprTokenType::kName) {
    AdvanceOneChar();
    while (!at_end() && IsNameContinuingChar(cur_char()))
      AdvanceOneChar();
    return;
  }

  FX_NOTREACHED();
  err_ = Err("Internal parser error.");
  error_location_ = cur_;
}
// Returns true when the statically-known token described by |record| appears at the current
// position and applies to the current language.
//
// For alphanumeric tokens (record.is_alphanum) the match is rejected when the token is immediately
// followed by a character that would continue an identifier, so that a token which is only the
// prefix of a longer word (for example "true" in "true_value" or "truest") is not matched.
bool ExprTokenizer::CurrentMatchesTokenRecord(const ExprTokenRecord& record) const {
  // Non-statically-known tokens shouldn't use this code path.
  FX_DCHECK(!record.static_value.empty());
  const size_t size = record.static_value.size();

  if (!can_advance(size))
    return false;  // Not enough room.

  if (!(record.languages & static_cast<unsigned>(language_)))
    return false;  // Doesn't apply to this language.

  if (std::string_view(&input_[cur_], size) != record.static_value)
    return false;  // Doesn't match the token static value.

  if (record.is_alphanum && cur_ + size < input_.size()) {
    // Convert to unsigned char first: passing a negative value (a non-ASCII byte when char is
    // signed) to isalnum() is undefined behavior.
    const unsigned char next = static_cast<unsigned char>(input_[cur_ + size]);

    // An underscore continues an identifier just like a letter or digit does.
    if (isalnum(next) || next == '_')
      return false;  // Identifier character follows so won't match.
  }

  return true;
}
// Returns true when the character at the current position is a line feed, a carriage return, or a
// space. Must not be called at the end of the input.
bool ExprTokenizer::IsCurrentWhitespace() const {
  FX_DCHECK(!at_end());
  switch (input_[cur_]) {
    case '\n':  // 0x0A
    case '\r':  // 0x0D
    case ' ':   // 0x20
      return true;
    default:
      return false;
  }
}
// Determines what kind of token begins at the current position and returns its record. Must not
// be called at the end of the input.
//
// Statically-known tokens are tried first, and the longest one that matches wins. Otherwise a
// digit starts an integer and a name-first character starts a name. If nothing matches, the error
// and its location are recorded and the kInvalid record is returned.
const ExprTokenRecord& ExprTokenizer::ClassifyCurrent() {
  FX_DCHECK(!at_end());
  const char first = cur_char();

  // Pick the longest static token matching here; on a length tie the earlier candidate is kept.
  const ExprTokenRecord* best = nullptr;
  for (const ExprTokenRecord* candidate : TokensWithFirstChar(first)) {
    if (CurrentMatchesTokenRecord(*candidate) &&
        (!best || candidate->static_value.size() > best->static_value.size())) {
      best = candidate;
    }
  }
  if (best)
    return *best;

  // Integers.
  if (IsIntegerFirstChar(first))
    return RecordForTokenType(ExprTokenType::kInteger);

  // Everything else is a general name.
  if (IsNameFirstChar(first))
    return RecordForTokenType(ExprTokenType::kName);

  // Nothing can start with this character.
  error_location_ = cur_;
  err_ = Err(fxl::StringPrintf("Invalid character '%c' in expression.\n", first) +
             GetErrorContext(input_, cur_));
  return RecordForTokenType(ExprTokenType::kInvalid);
}
} // namespace zxdb