garnet/bin/zxdb/expr/template_type_extractor.cc - fuchsia/ - Git at Google

 // Copyright 2018 The Fuchsia Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.

 #include "garnet/bin/zxdb/expr/template_type_extractor.h"

 #include "garnet/bin/zxdb/expr/expr_token.h"
 #include "lib/fxl/logging.h"

 namespace zxdb {

 namespace {

 // One entry on the stack of brackets that are currently open.
 struct Nesting {
   // Index of the token that opened the bracket.
   size_t opening_index = 0;

   // Token type that will close this bracket.
   ExprTokenType end = ExprTokenType::kInvalid;

   Nesting(size_t open_at, ExprTokenType closer)
       : opening_index(open_at), end(closer) {}
 };

 // Operators that need special handling when they follow the "operator"
 // keyword, because their punctuation would otherwise be mistaken for a
 // template bracket or for the end of the type. Things like "operator+" are
 // skipped fine using the normal code path of "word" + "punctuation" so they
 // don't need to be here for the current limited use case.
 //
 // The table is scanned from top to bottom and the first matching entry wins,
 // so if one operator is a prefix of another (e.g. "<" is a prefix of "<<"),
 // the longer, more specific one must be listed first.
 struct OperatorRecord {
   // Type of the token immediately following "operator".
   ExprTokenType first;
   // Type of the token after that, or kInvalid when the operator consists of
   // a single token (in which case whatever follows is not examined).
   ExprTokenType second;
 };
 const OperatorRecord kOperators[] = {
     {ExprTokenType::kLess, ExprTokenType::kLess},        // operator<<
     {ExprTokenType::kLess, ExprTokenType::kInvalid},     // operator<
     {ExprTokenType::kGreater, ExprTokenType::kGreater},  // operator>>
     {ExprTokenType::kGreater, ExprTokenType::kInvalid},  // operator>
     {ExprTokenType::kComma, ExprTokenType::kInvalid},    // operator,
 };

 // Reports whether |token| counts as a "word" for spacing purposes, i.e. its
 // type is one of kName, kTrue, kFalse, kConst or kVolatile.
 bool IsNamelikeToken(const ExprToken& token) {
   switch (token.type()) {
     case ExprTokenType::kName:
     case ExprTokenType::kTrue:
     case ExprTokenType::kFalse:
     case ExprTokenType::kConst:
     case ExprTokenType::kVolatile:
       return true;
     default:
       return false;
   }
 }

 // Returns true if the token at the given index needs a space before it to
 // separate it from the previous token. The first_index is the index of the
 // first token being considered for type extraction (so we don't consider the
 // boundary before this).
 bool NeedsSpaceBefore(const std::vector<ExprToken>& tokens, size_t first_index,
                       size_t index) {
   FXL_DCHECK(first_index <= index);
   if (first_index == index)
     return false;  // Also catches index == 0.

   // Names always need a space between then. A name here is any word, so
   // "const Foo" would be an example.
   if (IsNamelikeToken(tokens[index - 1]) && IsNamelikeToken(tokens[index]))
     return true;

   // Put a space after a comma. This is undesirable in the case of "operator,"
   // appearing as in "template<CmpOp a = operator,>" but not a big deal.
   if (tokens[index - 1].type() == ExprTokenType::kComma)
     return true;

   // Most other things can go next to each other as far as valid C++ goes.
   // These are some cases that this does incorrectly, see the comment above
   // ExtractTemplateType() for why this isn't so bad and how it could be
   // improved.
   return false;
 }

 // |*index| points to the index of the operator token. It will be updated to
 // point to the last token consumed.
 void HandleOperator(const std::vector<ExprToken>& tokens, size_t* index,
                     std::string* result) {
   // Always append "operator" itself.
   result->append(tokens[*index].value());
   if (tokens.size() - 1 == *index)
     return;  // "operator" at end of stream, just append it.

   // 0 when not found, otherwise # tokens matched after "operator".
   int matched_tokens = 0;

   // The second token we're looking for.
   ExprTokenType second_type = tokens.size() > *index + 2
                                   ? tokens[*index + 2].type()
                                   : ExprTokenType::kInvalid;
   for (const auto& cur_op : kOperators) {
     if (cur_op.first == tokens[*index + 1].type()) {
       // First character matched.
       if (cur_op.second == ExprTokenType::kInvalid) {
         // Anything matches, we found it.
         matched_tokens = 1;
         break;
       }

       // The following token should also match, and the two tokens should be
       // adjacent in the input stream.
       if (cur_op.second == second_type &&
           tokens[*index + 1].byte_offset() + 1 ==
               tokens[*index + 2].byte_offset()) {
         matched_tokens = 2;
         break;
       }
     }
   }

   // Append any matched tokens. If no token is matched, it's probably an invalid
   // operator specification (doesn't matter since we're just identifying and
   // canonicalizing).
   if (matched_tokens >= 1) {
     result->append(tokens[*index + 1].value());
     if (matched_tokens == 2)
       result->append(tokens[*index + 2].value());
   }
   *index += matched_tokens;
 }

 }  // namespace

 // Extracts the type that starts at tokens[begin_token] and builds a canonical
 // spelling of it in the result's |canonical_name|.
 //
 // Scanning stops at the first ">", ")" or "," seen outside of any bracket
 // (usually the end of the enclosing cast or template), or at the end of the
 // token stream. On success |success| is true and |end_token| is the index of
 // the token that stopped the scan (tokens.size() when the input ran out). If
 // a "[", "(" or "<" is never closed, |success| is false,
 // |unmatched_error_token| is the index of the innermost unclosed bracket,
 // |canonical_name| is cleared and |end_token| is tokens.size().
 //
 // The "operator" keyword is passed to HandleOperator() so that a following
 // "<", "<<", ">", ">>" or "," is consumed as part of the operator name rather
 // than being treated as a bracket or as the end of the type.
 //
 // Spacing is approximate: a space is inserted between adjacent words and
 // after commas, and everything else is put next to each other, e.g. the
 // tokens of
 //
 //   auto foo = operator + + 1;
 //
 // would be run together. This assumes operators can be put next to each
 // other without affecting meaning. When we're canonicalizing types for the
 // purposes of string comparisons, this is almost certainly the case. If we
 // start using the output from this function for more things, operator names
 // should be looked up in a complete table of valid C++ operator function
 // names, with careful handling of spaces (ExprToken.byte_offset), since
 // "operator++" and "operator ++" are the same thing but "operator ++" and
 // "operator + +" are different.
 TemplateTypeResult ExtractTemplateType(const std::vector<ExprToken>& tokens,
                                        size_t begin_token) {
   TemplateTypeResult result;

   // Set when the previous iteration emitted operator punctuation, so that
   // the spacing rules are not applied to whatever follows it.
   bool inhibit_next_space = false;

   std::vector<Nesting> nesting;
   size_t i = begin_token;
   for (; i < tokens.size(); i++) {
     ExprTokenType type = tokens[i].type();
     if (type == ExprTokenType::kLeftSquare) {
       // [
       nesting.emplace_back(i, ExprTokenType::kRightSquare);
     } else if (type == ExprTokenType::kLeftParen) {
       // (
       nesting.emplace_back(i, ExprTokenType::kRightParen);
     } else if (type == ExprTokenType::kLess) {
       // < (the sequences "operator<" and "operator<<" were handled when we
       //    got the "operator" token).
       nesting.emplace_back(i, ExprTokenType::kGreater);
     } else if (nesting.empty() && (type == ExprTokenType::kGreater ||
                                    type == ExprTokenType::kRightParen ||
                                    type == ExprTokenType::kComma)) {
       // These tokens mark the end of a type when seen without nesting. Usually
       // this marks the end of the enclosing cast or template.
       break;
     } else if (!nesting.empty() && type == nesting.back().end) {
       // Found the closing token for a previous opening one.
       nesting.pop_back();
     } else if (type == ExprTokenType::kName &&
                tokens[i].value() == "operator") {
       // Possible space before "operator".
       if (NeedsSpaceBefore(tokens, begin_token, i))
         result.canonical_name.push_back(' ');

       const size_t operator_index = i;
       HandleOperator(tokens, &i, &result.canonical_name);

       // When operator punctuation was consumed, don't apply the normal
       // spacing rules to the next token: this prevents adding the space that
       // would normally go after the "," of "operator,". When nothing was
       // consumed (e.g. the conversion function "operator bool") the keyword
       // is just a word and must stay separated from a following word.
       inhibit_next_space = i != operator_index;
       continue;  // Skip the code at the bottom that appends the token.
     }

     if (!inhibit_next_space && NeedsSpaceBefore(tokens, begin_token, i))
       result.canonical_name.push_back(' ');
     inhibit_next_space = false;

     result.canonical_name += tokens[i].value();
   }

   if (nesting.empty()) {
     result.success = true;
     result.end_token = i;
   } else {
     // Unterminated thing, tell the caller where it started.
     result.success = false;
     result.unmatched_error_token = nesting.back().opening_index;
     result.canonical_name.clear();
     result.end_token = tokens.size();
   }
   return result;
 }

 }  // namespace zxdb
	// Copyright 2018 The Fuchsia Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style license that can be
	// found in the LICENSE file.

	#include "garnet/bin/zxdb/expr/template_type_extractor.h"

	#include "garnet/bin/zxdb/expr/expr_token.h"
	#include "lib/fxl/logging.h"

	namespace zxdb {

	namespace {

	// One entry on the stack of brackets that are currently open.
	struct Nesting {
	  // Index of the token that opened the bracket.
	  size_t opening_index = 0;

	  // Token type that will close this bracket.
	  ExprTokenType end = ExprTokenType::kInvalid;

	  Nesting(size_t open_at, ExprTokenType closer)
	      : opening_index(open_at), end(closer) {}
	};

	// Operators that need special handling when they follow the "operator"
	// keyword, because their punctuation would otherwise be mistaken for a
	// template bracket or for the end of the type. Things like "operator+" are
	// skipped fine using the normal code path of "word" + "punctuation" so they
	// don't need to be here for the current limited use case.
	//
	// The table is scanned from top to bottom and the first matching entry wins,
	// so if one operator is a prefix of another (e.g. "<" is a prefix of "<<"),
	// the longer, more specific one must be listed first.
	struct OperatorRecord {
	// Type of the token immediately following "operator".
	ExprTokenType first;
	// Type of the token after that, or kInvalid when the operator consists of
	// a single token (in which case whatever follows is not examined).
	ExprTokenType second;
	};
	const OperatorRecord kOperators[] = {
	{ExprTokenType::kLess, ExprTokenType::kLess}, // operator<<
	{ExprTokenType::kLess, ExprTokenType::kInvalid}, // operator<
	{ExprTokenType::kGreater, ExprTokenType::kGreater}, // operator>>
	{ExprTokenType::kGreater, ExprTokenType::kInvalid}, // operator>
	{ExprTokenType::kComma, ExprTokenType::kInvalid}, // operator,
	};

	// Reports whether |token| counts as a "word" for spacing purposes, i.e. its
	// type is one of kName, kTrue, kFalse, kConst or kVolatile.
	bool IsNamelikeToken(const ExprToken& token) {
	  switch (token.type()) {
	    case ExprTokenType::kName:
	    case ExprTokenType::kTrue:
	    case ExprTokenType::kFalse:
	    case ExprTokenType::kConst:
	    case ExprTokenType::kVolatile:
	      return true;
	    default:
	      return false;
	  }
	}

	// Returns true if the token at the given index needs a space before it to
	// separate it from the previous token. The first_index is the index of the
	// first token being considered for type extraction (so we don't consider the
	// boundary before this).
	bool NeedsSpaceBefore(const std::vector<ExprToken>& tokens, size_t first_index,
	size_t index) {
	FXL_DCHECK(first_index <= index);
	if (first_index == index)
	return false; // Also catches index == 0.

	// Names always need a space between then. A name here is any word, so
	// "const Foo" would be an example.
	if (IsNamelikeToken(tokens[index - 1]) && IsNamelikeToken(tokens[index]))
	return true;

	// Put a space after a comma. This is undesirable in the case of "operator,"
	// appearing as in "template<CmpOp a = operator,>" but not a big deal.
	if (tokens[index - 1].type() == ExprTokenType::kComma)
	return true;

	// Most other things can go next to each other as far as valid C++ goes.
	// These are some cases that this does incorrectly, see the comment above
	// ExtractTemplateType() for why this isn't so bad and how it could be
	// improved.
	return false;
	}

	// \|*index\| points to the index of the operator token. It will be updated to
	// point to the last token consumed.
	void HandleOperator(const std::vector<ExprToken>& tokens, size_t* index,
	std::string* result) {
	// Always append "operator" itself.
	result->append(tokens[*index].value());
	if (tokens.size() - 1 == *index)
	return; // "operator" at end of stream, just append it.

	// 0 when not found, otherwise # tokens matched after "operator".
	int matched_tokens = 0;

	// The second token we're looking for.
	ExprTokenType second_type = tokens.size() > *index + 2
	? tokens[*index + 2].type()
	: ExprTokenType::kInvalid;
	for (const auto& cur_op : kOperators) {
	if (cur_op.first == tokens[*index + 1].type()) {
	// First character matched.
	if (cur_op.second == ExprTokenType::kInvalid) {
	// Anything matches, we found it.
	matched_tokens = 1;
	break;
	}

	// The following token should also match, and the two tokens should be
	// adjacent in the input stream.
	if (cur_op.second == second_type &&
	tokens[*index + 1].byte_offset() + 1 ==
	tokens[*index + 2].byte_offset()) {
	matched_tokens = 2;
	break;
	}
	}
	}

	// Append any matched tokens. If no token is matched, it's probably an invalid
	// operator specification (doesn't matter since we're just identifying and
	// canonicalizing).
	if (matched_tokens >= 1) {
	result->append(tokens[*index + 1].value());
	if (matched_tokens == 2)
	result->append(tokens[*index + 2].value());
	}
	*index += matched_tokens;
	}

	} // namespace

	// Extracts the type that starts at tokens[begin_token] and builds a canonical
	// spelling of it in the result's |canonical_name|.
	//
	// Scanning stops at the first ">", ")" or "," seen outside of any bracket
	// (usually the end of the enclosing cast or template), or at the end of the
	// token stream. On success |success| is true and |end_token| is the index of
	// the token that stopped the scan (tokens.size() when the input ran out). If
	// a "[", "(" or "<" is never closed, |success| is false,
	// |unmatched_error_token| is the index of the innermost unclosed bracket,
	// |canonical_name| is cleared and |end_token| is tokens.size().
	//
	// The "operator" keyword is passed to HandleOperator() so that a following
	// "<", "<<", ">", ">>" or "," is consumed as part of the operator name rather
	// than being treated as a bracket or as the end of the type.
	//
	// Spacing is approximate: a space is inserted between adjacent words and
	// after commas, and everything else is put next to each other, e.g. the
	// tokens of
	//
	//   auto foo = operator + + 1;
	//
	// would be run together. This assumes operators can be put next to each
	// other without affecting meaning. When we're canonicalizing types for the
	// purposes of string comparisons, this is almost certainly the case. If we
	// start using the output from this function for more things, operator names
	// should be looked up in a complete table of valid C++ operator function
	// names, with careful handling of spaces (ExprToken.byte_offset), since
	// "operator++" and "operator ++" are the same thing but "operator ++" and
	// "operator + +" are different.
	TemplateTypeResult ExtractTemplateType(const std::vector<ExprToken>& tokens,
	                                       size_t begin_token) {
	  TemplateTypeResult result;

	  // Set when the previous iteration emitted operator punctuation, so that
	  // the spacing rules are not applied to whatever follows it.
	  bool inhibit_next_space = false;

	  std::vector<Nesting> nesting;
	  size_t i = begin_token;
	  for (; i < tokens.size(); i++) {
	    ExprTokenType type = tokens[i].type();
	    if (type == ExprTokenType::kLeftSquare) {
	      // [
	      nesting.emplace_back(i, ExprTokenType::kRightSquare);
	    } else if (type == ExprTokenType::kLeftParen) {
	      // (
	      nesting.emplace_back(i, ExprTokenType::kRightParen);
	    } else if (type == ExprTokenType::kLess) {
	      // < (the sequences "operator<" and "operator<<" were handled when we
	      //    got the "operator" token).
	      nesting.emplace_back(i, ExprTokenType::kGreater);
	    } else if (nesting.empty() && (type == ExprTokenType::kGreater ||
	                                   type == ExprTokenType::kRightParen ||
	                                   type == ExprTokenType::kComma)) {
	      // These tokens mark the end of a type when seen without nesting. Usually
	      // this marks the end of the enclosing cast or template.
	      break;
	    } else if (!nesting.empty() && type == nesting.back().end) {
	      // Found the closing token for a previous opening one.
	      nesting.pop_back();
	    } else if (type == ExprTokenType::kName &&
	               tokens[i].value() == "operator") {
	      // Possible space before "operator".
	      if (NeedsSpaceBefore(tokens, begin_token, i))
	        result.canonical_name.push_back(' ');

	      const size_t operator_index = i;
	      HandleOperator(tokens, &i, &result.canonical_name);

	      // When operator punctuation was consumed, don't apply the normal
	      // spacing rules to the next token: this prevents adding the space that
	      // would normally go after the "," of "operator,". When nothing was
	      // consumed (e.g. the conversion function "operator bool") the keyword
	      // is just a word and must stay separated from a following word.
	      inhibit_next_space = i != operator_index;
	      continue;  // Skip the code at the bottom that appends the token.
	    }

	    if (!inhibit_next_space && NeedsSpaceBefore(tokens, begin_token, i))
	      result.canonical_name.push_back(' ');
	    inhibit_next_space = false;

	    result.canonical_name += tokens[i].value();
	  }

	  if (nesting.empty()) {
	    result.success = true;
	    result.end_token = i;
	  } else {
	    // Unterminated thing, tell the caller where it started.
	    result.success = false;
	    result.unmatched_error_token = nesting.back().opening_index;
	    result.canonical_name.clear();
	    result.end_token = tokens.size();
	  }
	  return result;
	}

	} // namespace zxdb