From 720c5a33b6a49cf328fdd7611f49153cf8f60247 Mon Sep 17 00:00:00 2001 From: Karen Arutyunov Date: Wed, 8 Apr 2020 14:51:57 +0300 Subject: Separate tests and examples into individual packages Also make cli module to be explicitly enabled via the config.cli configuration variable. --- cli/cli/lexer.cxx | 604 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 604 insertions(+) create mode 100644 cli/cli/lexer.cxx (limited to 'cli/cli/lexer.cxx') diff --git a/cli/cli/lexer.cxx b/cli/cli/lexer.cxx new file mode 100644 index 0000000..573c76b --- /dev/null +++ b/cli/cli/lexer.cxx @@ -0,0 +1,604 @@ +// file : cli/lexer.cxx +// author : Boris Kolpackov +// license : MIT; see accompanying LICENSE file + +#include + +#include + +using namespace std; + +lexer:: +lexer (istream& is, string const& id) + : loc_ ("C"), + is_ (is), + id_ (id), + l_ (1), + c_(1), + eos_ (false), + include_ (false), + valid_ (true), + buf_ (0, 0, 0), + unget_ (false) +{ + keyword_map_["source"] = token::k_source; + keyword_map_["include"] = token::k_include; + keyword_map_["namespace"] = token::k_namespace; + keyword_map_["class"] = token::k_class; + keyword_map_["signed"] = token::k_signed; + keyword_map_["unsigned"] = token::k_unsigned; + keyword_map_["bool"] = token::k_bool; + keyword_map_["char"] = token::k_char; + keyword_map_["wchar_t"] = token::k_wchar; + keyword_map_["short"] = token::k_short; + keyword_map_["int"] = token::k_int; + keyword_map_["long"] = token::k_long; + keyword_map_["float"] = token::k_float; + keyword_map_["double"] = token::k_double; +} + +lexer::xchar lexer:: +peek () +{ + if (unget_) + return buf_; + else + { + if (eos_) + return xchar (xchar::traits_type::eof (), l_, c_); + else + { + xchar::int_type i (is_.peek ()); + + if (i == xchar::traits_type::eof ()) + eos_ = true; + + return xchar (i, l_, c_); + } + } +} + +lexer::xchar lexer:: +get () +{ + if (unget_) + { + unget_ = false; + return buf_; + } + else + { + // When is_.get () returns eof, the failbit is also set (stupid, + // isn't?) which may trigger an exception. To work around this + // we will call peek() first and only call get() if it is not + // eof. But we can only call peek() on eof once; any subsequent + // calls will spoil the failbit (even more stupid). + // + xchar c (peek ()); + + if (!is_eos (c)) + { + is_.get (); + + if (c == '\n') + { + l_++; + c_ = 1; + } + else + c_++; + } + + return c; + } +} + +void lexer:: +unget (xchar c) +{ + // Because iostream::unget cannot work once eos is reached, + // we have to provide our own implementation. + // + buf_ = c; + unget_ = true; +} + +token lexer:: +next () +{ + while (true) // Recovery loop. + { + bool include (include_); + include_ = false; + + skip_spaces (); + + xchar c (get ()); + + if (is_eos (c)) + return token (c.line (), c.column ()); + + try + { + switch (c) + { + case '\'': + { + return char_literal (c); + } + case '\"': + { + if (include) + return path_literal (c); + else + return string_literal (c); + } + case '<': + { + if (include) + return path_literal (c); + else + return template_expression (c); + } + case ';': + { + return token (token::p_semi, c.line (), c.column ()); + } + case ',': + { + return token (token::p_comma, c.line (), c.column ()); + } + case ':': + { + if (peek () == ':') + { + get (); + return token (token::p_dcolon, c.line (), c.column ()); + } + + return token (token::p_colon, c.line (), c.column ()); + } + case '{': + { + return token (token::p_lcbrace, c.line (), c.column ()); + } + case '}': + { + return token (token::p_rcbrace, c.line (), c.column ()); + } + case '(': + { + return call_expression (c); + } + case '=': + { + return token (token::p_eq, c.line (), c.column ()); + } + case '|': + { + return token (token::p_or, c.line (), c.column ()); + } + case '-': + { + // This can be a beginning of an identifier or a an integer + // literal. Figure out which one it is. + // + xchar p (peek ()); + + if (is_dec_digit (p)) + return int_literal (get (), true, c.line (), c.column ()); + else if (is_space (p)) + { + skip_spaces (); + p = peek (); + + if (is_dec_digit (p)) + return int_literal (get (), true, c.line (), c.column ()); + + // Stray '-'. + // + cerr << id_ << ':' << c.line () << ':' << c.column () + << ": error: unexpected character '-'" << endl; + throw invalid_input (); + } + + break; + } + } + + if (is_alpha (c) || c == '_' || c == '-' || c == '/') + { + return identifier (c); + } + + if (is_dec_digit (c)) + { + return int_literal (c); + } + + cerr << id_ << ':' << c.line () << ':' << c.column () + << ": error: unexpected character '" << c << "'" << endl; + throw invalid_input (); + } + catch (invalid_input const&) + { + valid_ = false; + } + + // Try to recover. + // + do + { + c = get (); + + if (is_eos (c)) + return token (c.line (), c.column ()); + } while (c != ';'); + } +} + +void lexer:: +skip_spaces () +{ + for (xchar c (peek ());; c = peek ()) + { + if (is_eos (c)) + break; + + if (c == '/') + { + c = get (); + xchar p (peek ()); + + if (p == '/') + { + get (); + + // C++ comment. Read until newline or eos. + // + for (c = get (); !is_eos (c) && c != '\n'; c = get ()) ; + continue; + } + else if (p == '*') + { + get (); + + // C comment. + // + for (c = get ();; c = get ()) + { + if (is_eos (c)) + { + cerr << id_ << ':' << c.line () << ':' << c.column () + << ": error: end of stream reached while reading " + << "C-style comment" << endl; + throw invalid_input (); + } + + if (c == '*') + { + c = peek (); + if (c == '/') + { + get (); + break; + } + } + } + continue; + } + else + { + unget (c); + break; + } + } + + if (!is_space (c)) + break; + + get (); + } +} + +token lexer:: +identifier (xchar c) +{ + size_t ln (c.line ()), cl (c.column ()); + string lexeme; + lexeme += c; + + bool check (c == '-' || c == '/'); + + for (c = peek (); + !is_eos (c) && (is_alnum (c) || c == '_' || c == '-'); + c = peek ()) + { + get (); + lexeme += c; + } + + // Check for invalid identifiers. + // + if (check) + { + size_t i (1); + + for (; i < lexeme.size (); ++i) + if (is_alnum (lexeme[i]) || lexeme[i] == '_') + break; + + if (i == lexeme.size ()) + { + cerr << id_ << ':' << c.line () << ':' << c.column () << ": error: " + << "invalid character sequence '" << lexeme << "'" << endl; + throw invalid_input (); + } + } + + keyword_map::const_iterator i (keyword_map_.find (lexeme)); + + if (i != keyword_map_.end ()) + { + if (i->second == token::k_include || i->second == token::k_source) + include_ = true; + + return token (i->second, ln, cl); + } + + if (lexeme == "true" || lexeme == "false") + return token (token::t_bool_lit, lexeme, ln, cl); + + return token (token::t_identifier, lexeme, ln, cl); +} + +token lexer:: +int_literal (xchar c, bool neg, size_t ml, size_t mc) +{ + size_t ln (neg ? ml : c.line ()), cl (neg ? mc : c.column ()); + string lexeme; + + if (neg) + lexeme += '-'; + + lexeme += c; + + for (c = peek (); !is_eos (c) && is_dec_digit (c); c = peek ()) + { + get (); + lexeme += c; + } + + return token (token::t_int_lit, lexeme, ln, cl); +} + +token lexer:: +char_literal (xchar c) +{ + size_t ln (c.line ()), cl (c.column ()); + string lexeme; + lexeme += c; + + char p (c); + + while (true) + { + c = get (); + + if (is_eos (c)) + { + cerr << id_ << ':' << c.line () << ':' << c.column () << ": error: " + << "end of stream reached while reading character literal" << endl; + throw invalid_input (); + } + + lexeme += c; + + if (c == '\'' && p != '\\') + break; + + // We need to keep track of \\ escapings so we don't confuse + // them with \', as in '\\'. + // + if (c == '\\' && p == '\\') + p = '\0'; + else + p = c; + } + + return token (token::t_char_lit, lexeme, ln, cl); +} + +token lexer:: +string_literal (xchar c) +{ + size_t ln (c.line ()), cl (c.column ()); + string lexeme; + lexeme += c; + + while (true) + { + lexeme += string_literal_trailer (); + + // Check if there are more strings. + // + skip_spaces (); + + c = peek (); + + if (is_eos (c) || c != '"') + break; + + get (); + lexeme += "\""; + } + + return token (token::t_string_lit, lexeme, ln, cl); +} + +string lexer:: +string_literal_trailer () +{ + string r; + char p ('\0'); + + while (true) + { + xchar c = get (); + + if (is_eos (c)) + { + cerr << id_ << ':' << c.line () << ':' << c.column () << ": error: " + << "end of stream reached while reading string literal" << endl; + throw invalid_input (); + } + + r += c; + + if (c == '"' && p != '\\') + break; + + // We need to keep track of \\ escapings so we don't confuse + // them with \", as in "\\". + // + if (c == '\\' && p == '\\') + p = '\0'; + else + p = c; + } + + return r; +} + +token lexer:: +path_literal (xchar c) +{ + size_t ln (c.line ()), cl (c.column ()); + string lexeme; + lexeme += c; + + char end (c == '<' ? '>' : '"'); + + while (true) + { + c = get (); + + if (is_eos (c)) + { + cerr << id_ << ':' << c.line () << ':' << c.column () << ": error: " + << "end of stream reached while reading path literal" << endl; + throw invalid_input (); + } + + lexeme += c; + + if (c == end) + break; + } + + token::token_type tt; + + if (lexeme.compare (1, 4, "c++:") == 0) + { + tt = token::t_cxx_path_lit; + lexeme = lexeme[0] + string (lexeme, 5, string::npos); + } + else if (lexeme.compare (1, 4, "cli:") == 0) + { + tt = token::t_cli_path_lit; + lexeme = lexeme[0] + string (lexeme, 5, string::npos); + } + else + { + // See if the path ends with .cli. If not, then we assume this is + // a C++ inclusion. + // + size_t n (lexeme.size ()); + + if (n > 5 && lexeme.compare (n - 5, 4, ".cli") == 0) + tt = token::t_cli_path_lit; + else + tt = token::t_cxx_path_lit; + } + + return token (tt, lexeme, ln, cl); +} + +token lexer:: +call_expression (xchar c) +{ + size_t ln (c.line ()), cl (c.column ()); + string lexeme; + lexeme += c; + size_t balance (1); + + while (balance != 0) + { + c = get (); + + if (is_eos (c)) + { + cerr << id_ << ':' << c.line () << ':' << c.column () << ": error: " + << "end of stream reached while reading call expression" << endl; + throw invalid_input (); + } + + lexeme += c; + + switch (c) + { + case '(': + { + balance++; + break; + } + case ')': + { + balance--; + break; + } + } + } + + return token (token::t_call_expr, lexeme, ln, cl); +} + +token lexer:: +template_expression (xchar c) +{ + size_t ln (c.line ()), cl (c.column ()); + string lexeme; + lexeme += c; + size_t balance (1); + + while (balance != 0) + { + c = get (); + + if (is_eos (c)) + { + cerr << id_ << ':' << c.line () << ':' << c.column () << ": error: " + << "end of stream reached while reading template expression" + << endl; + throw invalid_input (); + } + + lexeme += c; + + switch (c) + { + case '<': + { + balance++; + break; + } + case '>': + { + balance--; + break; + } + } + } + + return token (token::t_template_expr, lexeme, ln, cl); +} -- cgit v1.1