From 1470fed809be7b11f147f8a6ca924a252b473c97 Mon Sep 17 00:00:00 2001
From: Boris Kolpackov
Date: Sat, 22 Aug 2009 10:24:18 +0200
Subject: Implement the CLI language parser

---
 cli/makefile   |   2 +-
 cli/parser.cxx | 664 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 cli/parser.hxx |  66 ++++++
 3 files changed, 731 insertions(+), 1 deletion(-)
 create mode 100644 cli/parser.cxx
 create mode 100644 cli/parser.hxx

diff --git a/cli/makefile b/cli/makefile
index d9be4f2..faea51c 100644
--- a/cli/makefile
+++ b/cli/makefile
@@ -5,7 +5,7 @@

 include $(dir $(lastword $(MAKEFILE_LIST)))../build/bootstrap.make

-cxx_tun := cli.cxx lexer.cxx
+cxx_tun := cli.cxx lexer.cxx parser.cxx

 #
 #
diff --git a/cli/parser.cxx b/cli/parser.cxx
new file mode 100644
index 0000000..937bf34
--- /dev/null
+++ b/cli/parser.cxx
@@ -0,0 +1,664 @@
+// file      : cli/parser.cxx
+// author    : Boris Kolpackov
+// copyright : Copyright (c) 2009 Code Synthesis Tools CC
+// license   : MIT; see accompanying LICENSE file
+
+#include <iostream>
+
+#include "token.hxx"
+#include "lexer.hxx"
+#include "parser.hxx"
+
+using namespace std;
+
+// Keyword and punctuation spellings used by operator<< below. They are
+// indexed with Token::keyword () and Token::punctuation (), so the order
+// presumably has to match the enumerations in token.hxx (confirm).
+//
+const char* keywords[] =
+{
+  "include",
+  "namespace",
+  "class",
+  "signed",
+  "unsigned",
+  "bool",
+  "char",
+  "wchar_t",
+  "short",
+  "int",
+  "long",
+  "float",
+  "double"
+};
+
+const char* punctuation[] = {";", ",", "::", "{", "}", /*"(", ")",*/ "=", "|"};
+
+// Output the token type and value in a format suitable for diagnostics.
+//
+std::ostream&
+operator<< (std::ostream& os, Token const& t)
+{
+  switch (t.type ())
+  {
+  case Token::t_eos:
+    {
+      os << "end-of-stream";
+      break;
+    }
+  case Token::t_keyword:
+    {
+      os << "keyword '" << keywords[t.keyword ()] << "'";
+      break;
+    }
+  case Token::t_identifier:
+    {
+      os << "identifier '" << t.identifier () << "'";
+      break;
+    }
+  case Token::t_punctuation:
+    {
+      os << "'" << punctuation[t.punctuation ()] << "'";
+      break;
+    }
+  case Token::t_path_lit:
+    {
+      os << "path literal";
+      break;
+    }
+  case Token::t_string_lit:
+    {
+      os << "string literal";
+      break;
+    }
+  case Token::t_char_lit:
+    {
+      os << "char literal";
+      break;
+    }
+  case Token::t_bool_lit:
+    {
+      os << "bool literal";
+      break;
+    }
+  case Token::t_int_lit:
+    {
+      os << "integer literal";
+      break;
+    }
+  case Token::t_float_lit:
+    {
+      os << "floating point literal";
+      break;
+    }
+  case Token::t_call_expr:
+    {
+      os << "call expression";
+      break;
+    }
+  case Token::t_template_expr:
+    {
+      os << "template expression";
+      break;
+    }
+  }
+
+  return os;
+}
+
+// Error recovery: starting with t, discard tokens up to and including the
+// next ';' or until end-of-stream. On return t is either the token that
+// follows the ';' or the end-of-stream token.
+//
+void Parser::
+recover (Token& t)
+{
+  // Recover by skipping past next ';'.
+  //
+  for (;; t = lexer_->next ())
+  {
+    if (t.type () == Token::t_eos)
+      break;
+
+    if (t.punctuation () == Token::p_semi)
+    {
+      t = lexer_->next ();
+      break;
+    }
+  }
+}
+
+// Parse the definition unit read from is. The id string is passed to the
+// lexer and prefixes every diagnostic written to cerr. Throws InvalidInput
+// if either the parser or the lexer found the input to be invalid.
+//
+void Parser::
+parse (std::istream& is, std::string const& id)
+{
+  Lexer lexer (is, id);
+  lexer_ = &lexer;
+  id_ = &id;
+  valid_ = true;
+  def_unit ();
+
+  if (!valid_ || !lexer.valid ())
+    throw InvalidInput ();
+}
+
+// def-unit: include-decl-seq followed by decl-seq. A malformed include is
+// skipped with recover () so that parsing can continue; an error in the
+// decl-seq stops parsing. In both cases valid_ is set to false.
+//
+void Parser::
+def_unit ()
+{
+  Token t (lexer_->next ());
+
+  // include-decl-seq
+  //
+  while (t.keyword () == Token::k_include)
+  {
+    try
+    {
+      include_decl ();
+      t = lexer_->next ();
+    }
+    catch (Error const&)
+    {
+      valid_ = false;
+      recover (t);
+    }
+  }
+
+  // decl-seq
+  //
+  while (t.type () != Token::t_eos)
+  {
+    try
+    {
+      if (decl (t))
+      {
+        t = lexer_->next ();
+        continue;
+      }
+
+      cerr << *id_ << ':' << t.line () << ':' << t.column () << ": error: "
+           << "expected namespace or class declaration instead of " << t
+           << endl;
+      throw Error ();
+    }
+    catch (Error const&)
+    {
+      valid_ = false;
+      break; // Non-recoverable error.
+    }
+  }
+}
+
+// include-decl: the caller has already seen the 'include' keyword; expect
+// a path literal followed by ';'. Throws Error otherwise.
+//
+void Parser::
+include_decl ()
+{
+  Token t (lexer_->next ());
+
+  if (t.type () != Token::t_path_lit)
+  {
+    cerr << *id_ << ':' << t.line () << ':' << t.column () << ": error: "
+         << "expected path literal instead of " << t << endl;
+    throw Error ();
+  }
+
+  t = lexer_->next ();
+
+  if (t.punctuation () != Token::p_semi)
+  {
+    cerr << *id_ << ':' << t.line () << ':' << t.column () << ": error: "
+         << "expected ';' instead of " << t << endl;
+    throw Error ();
+  }
+}
+
+// decl: if t is the 'namespace' or 'class' keyword, parse the corresponding
+// definition and return true. Otherwise return false with t untouched.
+//
+bool Parser::
+decl (Token& t)
+{
+  if (t.type () == Token::t_keyword)
+  {
+    switch (t.keyword ())
+    {
+    case Token::k_namespace:
+      {
+        namespace_def ();
+        return true;
+      }
+    case Token::k_class:
+      {
+        class_def ();
+        return true;
+      }
+    default:
+      break;
+    }
+  }
+  return false;
+}
+
+// namespace-def: the caller has already seen the 'namespace' keyword;
+// expect identifier '{' decl-seq '}'. Throws Error on anything else.
+//
+void Parser::
+namespace_def ()
+{
+  Token t (lexer_->next ());
+
+  if (t.type () != Token::t_identifier)
+  {
+    cerr << *id_ << ':' << t.line () << ':' << t.column () << ": error: "
+         << "expected identifier instead of " << t << endl;
+    throw Error ();
+  }
+
+  t = lexer_->next ();
+
+  if (t.punctuation () != Token::p_lcbrace)
+  {
+    cerr << *id_ << ':' << t.line () << ':' << t.column () << ": error: "
+         << "expected '{' instead of " << t << endl;
+    throw Error ();
+  }
+
+  // decl-seq
+  //
+  t = lexer_->next ();
+
+  while (decl (t))
+    t = lexer_->next ();
+
+  if (t.punctuation () != Token::p_rcbrace)
+  {
+    cerr << *id_ << ':' << t.line () << ':' << t.column () << ": error: "
+         << "expected namespace declaration, class declaration, or '}' "
+         << "instead of " << t << endl;
+    throw Error ();
+  }
+}
+
+// class-def: the caller has already seen the 'class' keyword; expect
+// identifier '{' option-def-seq '}' ';'. A malformed option is skipped
+// with recover () and only marks the input invalid.
+//
+void Parser::
+class_def ()
+{
+  Token t (lexer_->next ());
+
+  if (t.type () != Token::t_identifier)
+  {
+    cerr << *id_ << ':' << t.line () << ':' << t.column () << ": error: "
+         << "expected identifier instead of " << t << endl;
+    throw Error ();
+  }
+
+  t = lexer_->next ();
+
+  if (t.punctuation () != Token::p_lcbrace)
+  {
+    cerr << *id_ << ':' << t.line () << ':' << t.column () << ": error: "
+         << "expected '{' instead of " << t << endl;
+    throw Error ();
+  }
+
+  // decl-seq
+  //
+  t = lexer_->next ();
+
+  while (true)
+  {
+    try
+    {
+      if (!option_def (t))
+        break;
+
+      t = lexer_->next ();
+    }
+    catch (Error const&)
+    {
+      valid_ = false;
+      recover (t);
+    }
+  }
+
+  if (t.punctuation () != Token::p_rcbrace)
+  {
+    cerr << *id_ << ':' << t.line () << ':' << t.column () << ": error: "
+         << "expected option declaration or '}' instead of " << t << endl;
+    throw Error ();
+  }
+
+  t = lexer_->next ();
+
+  if (t.punctuation () != Token::p_semi)
+  {
+    cerr << *id_ << ':' << t.line () << ':' << t.column () << ": error: "
+         << "expected ';' instead of " << t << endl;
+    throw Error ();
+  }
+}
+
+// option-def: type-spec option-name-seq [initializer] ';'. Returns false
+// without a diagnostic if t does not start a type-spec; otherwise returns
+// true with t being the terminating ';' or throws Error.
+//
+bool Parser::
+option_def (Token& t)
+{
+  // type-spec
+  //
+  // These two functions set t to the next token if they return
+  // true.
+  //
+  if (!qualified_name (t) && !fundamental_type (t))
+    return false;
+
+  // option-name-seq
+  //
+  while (true)
+  {
+    if (t.type () != Token::t_identifier)
+    {
+      cerr << *id_ << ':' << t.line () << ':' << t.column () << ": error: "
+           << "option name expected instead of " << t << endl;
+      throw Error ();
+    }
+
+    t = lexer_->next ();
+
+    if (t.punctuation () == Token::p_or)
+      t = lexer_->next ();
+    else
+      break;
+  }
+
+  // initializer
+  //
+  if (t.punctuation () == Token::p_eq)
+  {
+    t = lexer_->next ();
+
+    // assignment initializer
+    //
+    if (qualified_name (t))
+    {
+    }
+    else
+    {
+      switch (t.type ())
+      {
+      case Token::t_string_lit:
+      case Token::t_char_lit:
+      case Token::t_bool_lit:
+      case Token::t_int_lit:
+      case Token::t_float_lit:
+      case Token::t_call_expr:
+        {
+          t = lexer_->next ();
+          break;
+        }
+      default:
+        {
+          cerr << *id_ << ':' << t.line () << ':' << t.column ()
+               << ": error: expected initializer instead of " << t << endl;
+          throw Error ();
+        }
+      }
+    }
+  }
+  else if (t.type () == Token::t_call_expr)
+  {
+    // c-tor initializer
+    //
+    t = lexer_->next ();
+  }
+
+  if (t.punctuation () != Token::p_semi)
+  {
+    cerr << *id_ << ':' << t.line () << ':' << t.column () << ": error: "
+         << "expected ';' instead of " << t << endl;
+    throw Error ();
+  }
+
+  return true;
+}
+
+// qualified-name: ['::'] identifier [template-expr] {'::' identifier
+// [template-expr]}. Returns false if t cannot start such a name; otherwise
+// consumes the name, leaving t at the first token after it.
+//
+bool Parser::
+qualified_name (Token& t)
+{
+  if (t.type () != Token::t_identifier && t.punctuation () != Token::p_dcolon)
+    return false;
+
+  if (t.punctuation () == Token::p_dcolon)
+    t = lexer_->next ();
+
+  while (true)
+  {
+    if (t.type () != Token::t_identifier)
+    {
+      cerr << *id_ << ':' << t.line () << ':' << t.column () << ": error: "
+           << "expected identifier after '::'" << endl;
+      throw Error ();
+    }
+
+    t = lexer_->next ();
+
+    if (t.type () == Token::t_template_expr)
+    {
+      // Template-id.
+      //
+      t = lexer_->next ();
+    }
+
+    if (t.punctuation () == Token::p_dcolon)
+      t = lexer_->next ();
+    else
+      break;
+  }
+
+  return true;
+}
+
+// fundamental-type: accept a C++ fundamental type spelled in any of the
+// keyword orders handled below (for example, 'unsigned long int',
+// 'int long unsigned', or 'long double'). Returns false if t is not one
+// of these keywords; otherwise leaves t at the first token that is not
+// part of the type.
+//
+bool Parser::
+fundamental_type (Token& t)
+{
+  switch (t.keyword ())
+  {
+  case Token::k_signed:
+  case Token::k_unsigned:
+    {
+      switch ((t = lexer_->next ()).keyword ())
+      {
+      case Token::k_short:
+      case Token::k_long:
+        {
+          switch ((t = lexer_->next ()).keyword ())
+          {
+          case Token::k_int:
+            {
+              t = lexer_->next ();
+              break;
+            }
+          default:
+            break;
+          }
+          break;
+        }
+      case Token::k_int:
+        {
+          switch ((t = lexer_->next ()).keyword ())
+          {
+          case Token::k_short:
+          case Token::k_long:
+            {
+              t = lexer_->next ();
+              break;
+            }
+          default:
+            break;
+          }
+          break;
+        }
+      case Token::k_char:
+        {
+          t = lexer_->next ();
+          break;
+        }
+      default:
+        break;
+      }
+      break;
+    }
+  case Token::k_short:
+  case Token::k_long:
+    {
+      bool l (t.keyword () == Token::k_long);
+
+      switch ((t = lexer_->next ()).keyword ())
+      {
+      case Token::k_signed:
+      case Token::k_unsigned:
+        {
+          switch ((t = lexer_->next ()).keyword ())
+          {
+          case Token::k_int:
+            {
+              t = lexer_->next ();
+              break;
+            }
+          default:
+            break;
+          }
+          break;
+        }
+      case Token::k_int:
+        {
+          switch ((t = lexer_->next ()).keyword ())
+          {
+          case Token::k_signed:
+          case Token::k_unsigned:
+            {
+              t = lexer_->next ();
+              break;
+            }
+          default:
+            break;
+          }
+          break;
+        }
+      case Token::k_double:
+        {
+          if (l)
+            t = lexer_->next ();
+
+          break;
+        }
+      default:
+        break;
+      }
+      break;
+    }
+  case Token::k_int:
+    {
+      switch ((t = lexer_->next ()).keyword ())
+      {
+      case Token::k_signed:
+      case Token::k_unsigned:
+        {
+          switch ((t = lexer_->next ()).keyword ())
+          {
+          case Token::k_short:
+          case Token::k_long:
+            {
+              t = lexer_->next ();
+              break;
+            }
+          default:
+            break;
+          }
+          break;
+        }
+      case Token::k_short:
+      case Token::k_long:
+        {
+          switch ((t = lexer_->next ()).keyword ())
+          {
+          case Token::k_signed:
+          case Token::k_unsigned:
+            {
+              t = lexer_->next ();
+              break;
+            }
+          default:
+            break;
+          }
+          break;
+        }
+      default:
+        break;
+      }
+      break;
+    }
+  case Token::k_char:
+    {
+      switch ((t = lexer_->next ()).keyword ())
+      {
+      case Token::k_signed:
+      case Token::k_unsigned:
+        {
+          t = lexer_->next ();
+          break;
+        }
+      default:
+        break;
+      }
+      break;
+    }
+  case Token::k_bool:
+  case Token::k_wchar:
+  case Token::k_float:
+    {
+      t = lexer_->next ();
+      break;
+    }
+  case Token::k_double:
+    {
+      switch ((t = lexer_->next ()).keyword ())
+      {
+      case Token::k_long:
+        {
+          t = lexer_->next ();
+          break;
+        }
+      default:
+        break;
+      }
+      break;
+    }
+  default:
+    return false;
+  }
+
+  return true;
+}
diff --git a/cli/parser.hxx b/cli/parser.hxx
new file mode 100644
index 0000000..d1f4b4b
--- /dev/null
+++ b/cli/parser.hxx
@@ -0,0 +1,66 @@
+// file      : cli/parser.hxx
+// author    : Boris Kolpackov
+// copyright : Copyright (c) 2009 Code Synthesis Tools CC
+// license   : MIT; see accompanying LICENSE file
+
+#ifndef CLI_PARSER_HXX
+#define CLI_PARSER_HXX
+
+#include <string>
+#include <iosfwd>
+
+class Token;
+class Lexer;
+
+// Recursive-descent parser for the CLI language. Syntax errors are
+// reported by the implementation (parser.cxx) on std::cerr.
+//
+class Parser
+{
+public:
+  struct InvalidInput {};
+
+  // Parse the input stream is; id identifies the input in diagnostics.
+  // Throws InvalidInput if the input contains errors.
+  //
+  void
+  parse (std::istream& is, std::string const& id);
+
+private:
+  struct Error {};
+
+  void
+  def_unit ();
+
+  void
+  include_decl ();
+
+  bool
+  decl (Token&);
+
+  void
+  namespace_def ();
+
+  void
+  class_def ();
+
+  bool
+  option_def (Token&);
+
+  bool
+  qualified_name (Token&);
+
+  bool
+  fundamental_type (Token&);
+
+private:
+  void
+  recover (Token& t);
+
+private:
+  bool valid_;
+  Lexer* lexer_;
+  std::string const* id_;
+};
+
+#endif // CLI_PARSER_HXX
-- 
cgit v1.1