| //===--- CXX.cpp - Define public interfaces for C++ grammar ---------------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "clang-pseudo/cxx/CXX.h" |
| #include "clang-pseudo/Forest.h" |
| #include "clang-pseudo/Language.h" |
| #include "clang-pseudo/grammar/Grammar.h" |
| #include "clang-pseudo/grammar/LRTable.h" |
| #include "clang/Basic/CharInfo.h" |
| #include "clang/Basic/TokenKinds.h" |
| #include "llvm/ADT/StringSwitch.h" |
| #include "llvm/Support/Debug.h" |
| #include <utility> |
| #define DEBUG_TYPE "CXX.cpp" |
| |
| namespace clang { |
| namespace pseudo { |
| namespace cxx { |
| namespace { |
| static const char *CXXBNF = |
| #include "CXXBNF.inc" |
| ; |
| |
| // User-defined string literals look like `""suffix`. |
| bool isStringUserDefined(const Token &Tok) { |
| return !Tok.text().ends_with("\""); |
| } |
| bool isCharUserDefined(const Token &Tok) { return !Tok.text().ends_with("'"); } |
| |
// Bit flags describing a numeric literal; the flags may be OR-ed together.
// Clang has just one numeric_constant token kind, while the grammar
// distinguishes 4 kinds of number.
enum NumericKind {
  Integer = 0x0,     // No flag set.
  Floating = 0x1,    // Bit 0.
  UserDefined = 0x2, // Bit 1.
};
| // Determine the kind of numeric_constant we have. |
| // We can assume it's something valid, as it has been lexed. |
| // FIXME: is this expensive enough that we should set flags on the token |
| // and reuse them rather than computing it for each guard? |
| unsigned numKind(const Token &Tok) { |
| assert(Tok.Kind == tok::numeric_constant); |
| llvm::StringRef Text = Tok.text(); |
| if (Text.size() <= 1) |
| return Integer; |
| bool Hex = |
| Text.size() > 2 && Text[0] == '0' && (Text[1] == 'x' || Text[1] == 'X'); |
| uint8_t K = Integer; |
| |
| for (char C : Text) { |
| switch (C) { |
| case '.': |
| K |= Floating; |
| break; |
| case 'e': |
| case 'E': |
| if (!Hex) |
| K |= Floating; |
| break; |
| case 'p': |
| case 'P': |
| if (Hex) |
| K |= Floating; |
| break; |
| case '_': |
| K |= UserDefined; |
| break; |
| default: |
| break; |
| } |
| } |
| |
| // We would be done here, but there are stdlib UDLs that lack _. |
| // We must distinguish these from the builtin suffixes. |
| unsigned LastLetter = Text.size(); |
| while (LastLetter > 0 && isLetter(Text[LastLetter - 1])) |
| --LastLetter; |
| if (LastLetter == Text.size()) // Common case |
| return NumericKind(K); |
| // Trailing d/e/f are not part of the suffix in hex numbers. |
| while (Hex && LastLetter < Text.size() && isHexDigit(Text[LastLetter])) |
| ++LastLetter; |
| return llvm::StringSwitch<int, unsigned>(Text.substr(LastLetter)) |
| // std::chrono |
| .Cases("h", "min", "s", "ms", "us", "ns", "d", "y", K | UserDefined) |
| // complex |
| .Cases("il", "i", "if", K | UserDefined) |
| .Default(K); |
| } |
| |
| // RHS is expected to contain a single terminal. |
| // Returns the corresponding token. |
| const Token &onlyToken(tok::TokenKind Kind, |
| const ArrayRef<const ForestNode *> RHS, |
| const TokenStream &Tokens) { |
| assert(RHS.size() == 1 && RHS.front()->symbol() == tokenSymbol(Kind)); |
| return Tokens.tokens()[RHS.front()->startTokenIndex()]; |
| } |
| // RHS is expected to contain a single symbol. |
| // Returns the corresponding ForestNode. |
| const ForestNode &onlySymbol(SymbolID Kind, |
| const ArrayRef<const ForestNode *> RHS, |
| const TokenStream &Tokens) { |
| assert(RHS.size() == 1 && RHS.front()->symbol() == Kind); |
| return *RHS.front(); |
| } |
| |
| bool isFunctionDeclarator(const ForestNode *Declarator) { |
| assert(Declarator->symbol() == cxx::Symbol::declarator); |
| bool IsFunction = false; |
| while (true) { |
| // not well-formed code, return the best guess. |
| if (Declarator->kind() != ForestNode::Sequence) |
| return IsFunction; |
| |
| switch (Declarator->rule()) { |
| case rule::noptr_declarator::declarator_id: // reached the bottom |
| return IsFunction; |
| // *X is a nonfunction (unless X is a function). |
| case rule::ptr_declarator::ptr_operator__ptr_declarator: |
| Declarator = Declarator->elements()[1]; |
| IsFunction = false; |
| continue; |
| // X() is a function (unless X is a pointer or similar). |
| case rule::declarator:: |
| noptr_declarator__parameters_and_qualifiers__trailing_return_type: |
| case rule::noptr_declarator::noptr_declarator__parameters_and_qualifiers: |
| Declarator = Declarator->elements()[0]; |
| IsFunction = true; |
| continue; |
| // X[] is an array (unless X is a pointer or function). |
| case rule::noptr_declarator:: |
| noptr_declarator__L_SQUARE__constant_expression__R_SQUARE: |
| case rule::noptr_declarator::noptr_declarator__L_SQUARE__R_SQUARE: |
| Declarator = Declarator->elements()[0]; |
| IsFunction = false; |
| continue; |
| // (X) is whatever X is. |
| case rule::noptr_declarator::L_PAREN__ptr_declarator__R_PAREN: |
| Declarator = Declarator->elements()[1]; |
| continue; |
| case rule::ptr_declarator::noptr_declarator: |
| case rule::declarator::ptr_declarator: |
| Declarator = Declarator->elements()[0]; |
| continue; |
| |
| default: |
| assert(false && "unhandled declarator for IsFunction"); |
| return IsFunction; |
| } |
| } |
| llvm_unreachable("unreachable"); |
| } |
| |
| bool guardNextTokenNotElse(const GuardParams &P) { |
| return symbolToToken(P.Lookahead) != tok::kw_else; |
| } |
| |
| bool specifiesStructuredBinding(const GuardParams &P) { |
| const auto DSS = P.RHS[0]; |
| assert(DSS->symbol() == Symbol::decl_specifier_seq); |
| |
| auto Length = P.RHS[1]->startTokenIndex() - DSS->startTokenIndex(); |
| for (const auto &T : |
| P.Tokens.tokens().slice(DSS->startTokenIndex(), Length)) { |
| switch (T.Kind) { |
| case clang::tok::kw_static: |
| case clang::tok::kw_thread_local: |
| case clang::tok::kw_auto: |
| case clang::tok::kw_const: |
| case clang::tok::kw_volatile: |
| break; |
| default: |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| // Whether this e.g. decl-specifier contains an "exclusive" type such as a class |
| // name, and thus can't combine with a second exclusive type. |
| // |
| // Returns false for |
| // - non-types |
| // - "unsigned" etc that may suffice as types but may modify others |
| // - cases of uncertainty (e.g. due to ambiguity) |
| bool hasExclusiveType(const ForestNode *N) { |
| // FIXME: every time we apply this check, we walk the whole subtree. |
| // Add per-node caching instead. |
| while (true) { |
| assert(N->symbol() == Symbol::decl_specifier_seq || |
| N->symbol() == Symbol::type_specifier_seq || |
| N->symbol() == Symbol::defining_type_specifier_seq || |
| N->symbol() == Symbol::decl_specifier || |
| N->symbol() == Symbol::type_specifier || |
| N->symbol() == Symbol::defining_type_specifier || |
| N->symbol() == Symbol::simple_type_specifier); |
| if (N->kind() == ForestNode::Opaque) |
| return false; // conservative |
| if (N->kind() == ForestNode::Ambiguous) |
| return llvm::all_of(N->alternatives(), hasExclusiveType); // conservative |
| // All supported symbols are nonterminals. |
| assert(N->kind() == ForestNode::Sequence); |
| switch (N->rule()) { |
| // seq := element seq: check element then continue into seq |
| case rule::decl_specifier_seq::decl_specifier__decl_specifier_seq: |
| case rule::defining_type_specifier_seq::defining_type_specifier__defining_type_specifier_seq: |
| case rule::type_specifier_seq::type_specifier__type_specifier_seq: |
| if (hasExclusiveType(N->children()[0])) |
| return true; |
| N = N->children()[1]; |
| continue; |
| // seq := element: continue into element |
| case rule::decl_specifier_seq::decl_specifier: |
| case rule::type_specifier_seq::type_specifier: |
| case rule::defining_type_specifier_seq::defining_type_specifier: |
| N = N->children()[0]; |
| continue; |
| |
| // defining-type-specifier |
| case rule::defining_type_specifier::type_specifier: |
| N = N->children()[0]; |
| continue; |
| case rule::defining_type_specifier::class_specifier: |
| case rule::defining_type_specifier::enum_specifier: |
| return true; |
| |
| // decl-specifier |
| case rule::decl_specifier::defining_type_specifier: |
| N = N->children()[0]; |
| continue; |
| case rule::decl_specifier::CONSTEVAL: |
| case rule::decl_specifier::CONSTEXPR: |
| case rule::decl_specifier::CONSTINIT: |
| case rule::decl_specifier::INLINE: |
| case rule::decl_specifier::FRIEND: |
| case rule::decl_specifier::storage_class_specifier: |
| case rule::decl_specifier::TYPEDEF: |
| case rule::decl_specifier::function_specifier: |
| return false; |
| |
| // type-specifier |
| case rule::type_specifier::elaborated_type_specifier: |
| case rule::type_specifier::typename_specifier: |
| return true; |
| case rule::type_specifier::simple_type_specifier: |
| N = N->children()[0]; |
| continue; |
| case rule::type_specifier::cv_qualifier: |
| return false; |
| |
| // simple-type-specifier |
| case rule::simple_type_specifier::type_name: |
| case rule::simple_type_specifier::template_name: |
| case rule::simple_type_specifier::builtin_type: |
| case rule::simple_type_specifier::nested_name_specifier__TEMPLATE__simple_template_id: |
| case rule::simple_type_specifier::nested_name_specifier__template_name: |
| case rule::simple_type_specifier::nested_name_specifier__type_name: |
| case rule::simple_type_specifier::decltype_specifier: |
| case rule::simple_type_specifier::placeholder_type_specifier: |
| return true; |
| case rule::simple_type_specifier::LONG: |
| case rule::simple_type_specifier::SHORT: |
| case rule::simple_type_specifier::SIGNED: |
| case rule::simple_type_specifier::UNSIGNED: |
| return false; |
| |
| default: |
| LLVM_DEBUG(llvm::errs() << "Unhandled rule " << N->rule() << "\n"); |
| llvm_unreachable("hasExclusiveType be exhaustive!"); |
| } |
| } |
| } |
| |
| llvm::DenseMap<ExtensionID, RuleGuard> buildGuards() { |
| #define GUARD(cond) \ |
| { \ |
| [](const GuardParams &P) { return cond; } \ |
| } |
| #define TOKEN_GUARD(kind, cond) \ |
| [](const GuardParams& P) { \ |
| const Token &Tok = onlyToken(tok::kind, P.RHS, P.Tokens); \ |
| return cond; \ |
| } |
| #define SYMBOL_GUARD(kind, cond) \ |
| [](const GuardParams& P) { \ |
| const ForestNode &N = onlySymbol(Symbol::kind, P.RHS, P.Tokens); \ |
| return cond; \ |
| } |
| return { |
| {rule::function_declarator::declarator, |
| SYMBOL_GUARD(declarator, isFunctionDeclarator(&N))}, |
| {rule::non_function_declarator::declarator, |
| SYMBOL_GUARD(declarator, !isFunctionDeclarator(&N))}, |
| |
| // A {decl,type,defining-type}-specifier-sequence cannot have multiple |
| // "exclusive" types (like class names): a value has only one type. |
| {rule::defining_type_specifier_seq:: |
| defining_type_specifier__defining_type_specifier_seq, |
| GUARD(!hasExclusiveType(P.RHS[0]) || !hasExclusiveType(P.RHS[1]))}, |
| {rule::type_specifier_seq::type_specifier__type_specifier_seq, |
| GUARD(!hasExclusiveType(P.RHS[0]) || !hasExclusiveType(P.RHS[1]))}, |
| {rule::decl_specifier_seq::decl_specifier__decl_specifier_seq, |
| GUARD(!hasExclusiveType(P.RHS[0]) || !hasExclusiveType(P.RHS[1]))}, |
| |
| {rule::contextual_override::IDENTIFIER, |
| TOKEN_GUARD(identifier, Tok.text() == "override")}, |
| {rule::contextual_final::IDENTIFIER, |
| TOKEN_GUARD(identifier, Tok.text() == "final")}, |
| {rule::import_keyword::IDENTIFIER, |
| TOKEN_GUARD(identifier, Tok.text() == "import")}, |
| {rule::export_keyword::IDENTIFIER, |
| TOKEN_GUARD(identifier, Tok.text() == "export")}, |
| {rule::module_keyword::IDENTIFIER, |
| TOKEN_GUARD(identifier, Tok.text() == "module")}, |
| {rule::contextual_zero::NUMERIC_CONSTANT, |
| TOKEN_GUARD(numeric_constant, Tok.text() == "0")}, |
| |
| {rule::selection_statement::IF__L_PAREN__condition__R_PAREN__statement, |
| guardNextTokenNotElse}, |
| {rule::selection_statement:: |
| IF__L_PAREN__init_statement__condition__R_PAREN__statement, |
| guardNextTokenNotElse}, |
| {rule::selection_statement:: |
| IF__CONSTEXPR__L_PAREN__condition__R_PAREN__statement, |
| guardNextTokenNotElse}, |
| {rule::selection_statement:: |
| IF__CONSTEXPR__L_PAREN__init_statement__condition__R_PAREN__statement, |
| guardNextTokenNotElse}, |
| |
| // Implement C++ [basic.lookup.qual.general]: |
| // If a name, template-id, or decltype-specifier is followed by a |
| // ::, it shall designate a namespace, class, enumeration, or |
| // dependent type, and the :: is never interpreted as a complete |
| // nested-name-specifier. |
| {rule::nested_name_specifier::COLONCOLON, |
| TOKEN_GUARD(coloncolon, Tok.prev().Kind != tok::identifier)}, |
| |
| // Implement C++ [dcl.pre#6]: |
| // A simple-declaration with an identifier-list is called a structured |
| // binding declaration ([dcl.struct.bind]). If the decl-specifier-seq |
| // contains any decl-specifier other than static, thread_local, auto, |
| // or cv-qualifiers, the program is ill-formed. |
| {rule::simple_declaration:: |
| decl_specifier_seq__ref_qualifier__L_SQUARE__identifier_list__R_SQUARE__initializer__SEMI, |
| specifiesStructuredBinding}, |
| {rule::simple_declaration:: |
| decl_specifier_seq__L_SQUARE__identifier_list__R_SQUARE__initializer__SEMI, |
| specifiesStructuredBinding}, |
| |
| // The grammar distinguishes (only) user-defined vs plain string literals, |
| // where the clang lexer distinguishes (only) encoding types. |
| {rule::user_defined_string_literal_chunk::STRING_LITERAL, |
| TOKEN_GUARD(string_literal, isStringUserDefined(Tok))}, |
| {rule::user_defined_string_literal_chunk::UTF8_STRING_LITERAL, |
| TOKEN_GUARD(utf8_string_literal, isStringUserDefined(Tok))}, |
| {rule::user_defined_string_literal_chunk::UTF16_STRING_LITERAL, |
| TOKEN_GUARD(utf16_string_literal, isStringUserDefined(Tok))}, |
| {rule::user_defined_string_literal_chunk::UTF32_STRING_LITERAL, |
| TOKEN_GUARD(utf32_string_literal, isStringUserDefined(Tok))}, |
| {rule::user_defined_string_literal_chunk::WIDE_STRING_LITERAL, |
| TOKEN_GUARD(wide_string_literal, isStringUserDefined(Tok))}, |
| {rule::string_literal_chunk::STRING_LITERAL, |
| TOKEN_GUARD(string_literal, !isStringUserDefined(Tok))}, |
| {rule::string_literal_chunk::UTF8_STRING_LITERAL, |
| TOKEN_GUARD(utf8_string_literal, !isStringUserDefined(Tok))}, |
| {rule::string_literal_chunk::UTF16_STRING_LITERAL, |
| TOKEN_GUARD(utf16_string_literal, !isStringUserDefined(Tok))}, |
| {rule::string_literal_chunk::UTF32_STRING_LITERAL, |
| TOKEN_GUARD(utf32_string_literal, !isStringUserDefined(Tok))}, |
| {rule::string_literal_chunk::WIDE_STRING_LITERAL, |
| TOKEN_GUARD(wide_string_literal, !isStringUserDefined(Tok))}, |
| // And the same for chars. |
| {rule::user_defined_character_literal::CHAR_CONSTANT, |
| TOKEN_GUARD(char_constant, isCharUserDefined(Tok))}, |
| {rule::user_defined_character_literal::UTF8_CHAR_CONSTANT, |
| TOKEN_GUARD(utf8_char_constant, isCharUserDefined(Tok))}, |
| {rule::user_defined_character_literal::UTF16_CHAR_CONSTANT, |
| TOKEN_GUARD(utf16_char_constant, isCharUserDefined(Tok))}, |
| {rule::user_defined_character_literal::UTF32_CHAR_CONSTANT, |
| TOKEN_GUARD(utf32_char_constant, isCharUserDefined(Tok))}, |
| {rule::user_defined_character_literal::WIDE_CHAR_CONSTANT, |
| TOKEN_GUARD(wide_char_constant, isCharUserDefined(Tok))}, |
| {rule::character_literal::CHAR_CONSTANT, |
| TOKEN_GUARD(char_constant, !isCharUserDefined(Tok))}, |
| {rule::character_literal::UTF8_CHAR_CONSTANT, |
| TOKEN_GUARD(utf8_char_constant, !isCharUserDefined(Tok))}, |
| {rule::character_literal::UTF16_CHAR_CONSTANT, |
| TOKEN_GUARD(utf16_char_constant, !isCharUserDefined(Tok))}, |
| {rule::character_literal::UTF32_CHAR_CONSTANT, |
| TOKEN_GUARD(utf32_char_constant, !isCharUserDefined(Tok))}, |
| {rule::character_literal::WIDE_CHAR_CONSTANT, |
| TOKEN_GUARD(wide_char_constant, !isCharUserDefined(Tok))}, |
| // clang just has one NUMERIC_CONSTANT token for {ud,plain}x{float,int} |
| {rule::user_defined_integer_literal::NUMERIC_CONSTANT, |
| TOKEN_GUARD(numeric_constant, numKind(Tok) == (Integer | UserDefined))}, |
| {rule::user_defined_floating_point_literal::NUMERIC_CONSTANT, |
| TOKEN_GUARD(numeric_constant, numKind(Tok) == (Floating | UserDefined))}, |
| {rule::integer_literal::NUMERIC_CONSTANT, |
| TOKEN_GUARD(numeric_constant, numKind(Tok) == Integer)}, |
| {rule::floating_point_literal::NUMERIC_CONSTANT, |
| TOKEN_GUARD(numeric_constant, numKind(Tok) == Floating)}, |
| }; |
| #undef TOKEN_GUARD |
| #undef SYMBOL_GUARD |
| } |
| |
| Token::Index recoverBrackets(Token::Index Begin, const TokenStream &Tokens) { |
| assert(Begin > 0); |
| const Token &Left = Tokens.tokens()[Begin - 1]; |
| assert(Left.Kind == tok::l_brace || Left.Kind == tok::l_paren || |
| Left.Kind == tok::l_square); |
| if (const Token *Right = Left.pair()) { |
| assert(Tokens.index(*Right) > Begin - 1); |
| return Tokens.index(*Right); |
| } |
| return Token::Invalid; |
| } |
| |
| llvm::DenseMap<ExtensionID, RecoveryStrategy> buildRecoveryStrategies() { |
| return { |
| {Extension::Brackets, recoverBrackets}, |
| }; |
| } |
| |
| } // namespace |
| |
| const Language &getLanguage() { |
| static const auto &CXXLanguage = []() -> const Language & { |
| std::vector<std::string> Diags; |
| auto G = Grammar::parseBNF(CXXBNF, Diags); |
| assert(Diags.empty()); |
| LRTable Table = LRTable::buildSLR(G); |
| const Language *PL = new Language{ |
| std::move(G), |
| std::move(Table), |
| buildGuards(), |
| buildRecoveryStrategies(), |
| }; |
| return *PL; |
| }(); |
| return CXXLanguage; |
| } |
| |
| } // namespace cxx |
| } // namespace pseudo |
| } // namespace clang |