diff options
author | Karen Arutyunov <karen@codesynthesis.com> | 2020-04-08 14:51:57 +0300 |
---|---|---|
committer | Karen Arutyunov <karen@codesynthesis.com> | 2020-04-27 11:38:53 +0300 |
commit | 720c5a33b6a49cf328fdd7611f49153cf8f60247 (patch) | |
tree | 9725f3d1f42ec90fde84520f49647edea013ce5e /cli/lexer.cxx | |
parent | 3183f3bb927a90783ae0aeaf190a0919377aabe4 (diff) |
Separate tests and examples into individual packages
Also require the cli module to be explicitly enabled via the config.cli
configuration variable.
Diffstat (limited to 'cli/lexer.cxx')
-rw-r--r-- | cli/lexer.cxx | 604 |
1 files changed, 0 insertions, 604 deletions
diff --git a/cli/lexer.cxx b/cli/lexer.cxx deleted file mode 100644 index 573c76b..0000000 --- a/cli/lexer.cxx +++ /dev/null @@ -1,604 +0,0 @@ -// file : cli/lexer.cxx -// author : Boris Kolpackov <boris@codesynthesis.com> -// license : MIT; see accompanying LICENSE file - -#include <iostream> - -#include <cli/lexer.hxx> - -using namespace std; - -lexer:: -lexer (istream& is, string const& id) - : loc_ ("C"), - is_ (is), - id_ (id), - l_ (1), - c_(1), - eos_ (false), - include_ (false), - valid_ (true), - buf_ (0, 0, 0), - unget_ (false) -{ - keyword_map_["source"] = token::k_source; - keyword_map_["include"] = token::k_include; - keyword_map_["namespace"] = token::k_namespace; - keyword_map_["class"] = token::k_class; - keyword_map_["signed"] = token::k_signed; - keyword_map_["unsigned"] = token::k_unsigned; - keyword_map_["bool"] = token::k_bool; - keyword_map_["char"] = token::k_char; - keyword_map_["wchar_t"] = token::k_wchar; - keyword_map_["short"] = token::k_short; - keyword_map_["int"] = token::k_int; - keyword_map_["long"] = token::k_long; - keyword_map_["float"] = token::k_float; - keyword_map_["double"] = token::k_double; -} - -lexer::xchar lexer:: -peek () -{ - if (unget_) - return buf_; - else - { - if (eos_) - return xchar (xchar::traits_type::eof (), l_, c_); - else - { - xchar::int_type i (is_.peek ()); - - if (i == xchar::traits_type::eof ()) - eos_ = true; - - return xchar (i, l_, c_); - } - } -} - -lexer::xchar lexer:: -get () -{ - if (unget_) - { - unget_ = false; - return buf_; - } - else - { - // When is_.get () returns eof, the failbit is also set (stupid, - // isn't?) which may trigger an exception. To work around this - // we will call peek() first and only call get() if it is not - // eof. But we can only call peek() on eof once; any subsequent - // calls will spoil the failbit (even more stupid). 
- // - xchar c (peek ()); - - if (!is_eos (c)) - { - is_.get (); - - if (c == '\n') - { - l_++; - c_ = 1; - } - else - c_++; - } - - return c; - } -} - -void lexer:: -unget (xchar c) -{ - // Because iostream::unget cannot work once eos is reached, - // we have to provide our own implementation. - // - buf_ = c; - unget_ = true; -} - -token lexer:: -next () -{ - while (true) // Recovery loop. - { - bool include (include_); - include_ = false; - - skip_spaces (); - - xchar c (get ()); - - if (is_eos (c)) - return token (c.line (), c.column ()); - - try - { - switch (c) - { - case '\'': - { - return char_literal (c); - } - case '\"': - { - if (include) - return path_literal (c); - else - return string_literal (c); - } - case '<': - { - if (include) - return path_literal (c); - else - return template_expression (c); - } - case ';': - { - return token (token::p_semi, c.line (), c.column ()); - } - case ',': - { - return token (token::p_comma, c.line (), c.column ()); - } - case ':': - { - if (peek () == ':') - { - get (); - return token (token::p_dcolon, c.line (), c.column ()); - } - - return token (token::p_colon, c.line (), c.column ()); - } - case '{': - { - return token (token::p_lcbrace, c.line (), c.column ()); - } - case '}': - { - return token (token::p_rcbrace, c.line (), c.column ()); - } - case '(': - { - return call_expression (c); - } - case '=': - { - return token (token::p_eq, c.line (), c.column ()); - } - case '|': - { - return token (token::p_or, c.line (), c.column ()); - } - case '-': - { - // This can be a beginning of an identifier or a an integer - // literal. Figure out which one it is. - // - xchar p (peek ()); - - if (is_dec_digit (p)) - return int_literal (get (), true, c.line (), c.column ()); - else if (is_space (p)) - { - skip_spaces (); - p = peek (); - - if (is_dec_digit (p)) - return int_literal (get (), true, c.line (), c.column ()); - - // Stray '-'. 
- // - cerr << id_ << ':' << c.line () << ':' << c.column () - << ": error: unexpected character '-'" << endl; - throw invalid_input (); - } - - break; - } - } - - if (is_alpha (c) || c == '_' || c == '-' || c == '/') - { - return identifier (c); - } - - if (is_dec_digit (c)) - { - return int_literal (c); - } - - cerr << id_ << ':' << c.line () << ':' << c.column () - << ": error: unexpected character '" << c << "'" << endl; - throw invalid_input (); - } - catch (invalid_input const&) - { - valid_ = false; - } - - // Try to recover. - // - do - { - c = get (); - - if (is_eos (c)) - return token (c.line (), c.column ()); - } while (c != ';'); - } -} - -void lexer:: -skip_spaces () -{ - for (xchar c (peek ());; c = peek ()) - { - if (is_eos (c)) - break; - - if (c == '/') - { - c = get (); - xchar p (peek ()); - - if (p == '/') - { - get (); - - // C++ comment. Read until newline or eos. - // - for (c = get (); !is_eos (c) && c != '\n'; c = get ()) ; - continue; - } - else if (p == '*') - { - get (); - - // C comment. - // - for (c = get ();; c = get ()) - { - if (is_eos (c)) - { - cerr << id_ << ':' << c.line () << ':' << c.column () - << ": error: end of stream reached while reading " - << "C-style comment" << endl; - throw invalid_input (); - } - - if (c == '*') - { - c = peek (); - if (c == '/') - { - get (); - break; - } - } - } - continue; - } - else - { - unget (c); - break; - } - } - - if (!is_space (c)) - break; - - get (); - } -} - -token lexer:: -identifier (xchar c) -{ - size_t ln (c.line ()), cl (c.column ()); - string lexeme; - lexeme += c; - - bool check (c == '-' || c == '/'); - - for (c = peek (); - !is_eos (c) && (is_alnum (c) || c == '_' || c == '-'); - c = peek ()) - { - get (); - lexeme += c; - } - - // Check for invalid identifiers. 
- // - if (check) - { - size_t i (1); - - for (; i < lexeme.size (); ++i) - if (is_alnum (lexeme[i]) || lexeme[i] == '_') - break; - - if (i == lexeme.size ()) - { - cerr << id_ << ':' << c.line () << ':' << c.column () << ": error: " - << "invalid character sequence '" << lexeme << "'" << endl; - throw invalid_input (); - } - } - - keyword_map::const_iterator i (keyword_map_.find (lexeme)); - - if (i != keyword_map_.end ()) - { - if (i->second == token::k_include || i->second == token::k_source) - include_ = true; - - return token (i->second, ln, cl); - } - - if (lexeme == "true" || lexeme == "false") - return token (token::t_bool_lit, lexeme, ln, cl); - - return token (token::t_identifier, lexeme, ln, cl); -} - -token lexer:: -int_literal (xchar c, bool neg, size_t ml, size_t mc) -{ - size_t ln (neg ? ml : c.line ()), cl (neg ? mc : c.column ()); - string lexeme; - - if (neg) - lexeme += '-'; - - lexeme += c; - - for (c = peek (); !is_eos (c) && is_dec_digit (c); c = peek ()) - { - get (); - lexeme += c; - } - - return token (token::t_int_lit, lexeme, ln, cl); -} - -token lexer:: -char_literal (xchar c) -{ - size_t ln (c.line ()), cl (c.column ()); - string lexeme; - lexeme += c; - - char p (c); - - while (true) - { - c = get (); - - if (is_eos (c)) - { - cerr << id_ << ':' << c.line () << ':' << c.column () << ": error: " - << "end of stream reached while reading character literal" << endl; - throw invalid_input (); - } - - lexeme += c; - - if (c == '\'' && p != '\\') - break; - - // We need to keep track of \\ escapings so we don't confuse - // them with \', as in '\\'. - // - if (c == '\\' && p == '\\') - p = '\0'; - else - p = c; - } - - return token (token::t_char_lit, lexeme, ln, cl); -} - -token lexer:: -string_literal (xchar c) -{ - size_t ln (c.line ()), cl (c.column ()); - string lexeme; - lexeme += c; - - while (true) - { - lexeme += string_literal_trailer (); - - // Check if there are more strings. 
- // - skip_spaces (); - - c = peek (); - - if (is_eos (c) || c != '"') - break; - - get (); - lexeme += "\""; - } - - return token (token::t_string_lit, lexeme, ln, cl); -} - -string lexer:: -string_literal_trailer () -{ - string r; - char p ('\0'); - - while (true) - { - xchar c = get (); - - if (is_eos (c)) - { - cerr << id_ << ':' << c.line () << ':' << c.column () << ": error: " - << "end of stream reached while reading string literal" << endl; - throw invalid_input (); - } - - r += c; - - if (c == '"' && p != '\\') - break; - - // We need to keep track of \\ escapings so we don't confuse - // them with \", as in "\\". - // - if (c == '\\' && p == '\\') - p = '\0'; - else - p = c; - } - - return r; -} - -token lexer:: -path_literal (xchar c) -{ - size_t ln (c.line ()), cl (c.column ()); - string lexeme; - lexeme += c; - - char end (c == '<' ? '>' : '"'); - - while (true) - { - c = get (); - - if (is_eos (c)) - { - cerr << id_ << ':' << c.line () << ':' << c.column () << ": error: " - << "end of stream reached while reading path literal" << endl; - throw invalid_input (); - } - - lexeme += c; - - if (c == end) - break; - } - - token::token_type tt; - - if (lexeme.compare (1, 4, "c++:") == 0) - { - tt = token::t_cxx_path_lit; - lexeme = lexeme[0] + string (lexeme, 5, string::npos); - } - else if (lexeme.compare (1, 4, "cli:") == 0) - { - tt = token::t_cli_path_lit; - lexeme = lexeme[0] + string (lexeme, 5, string::npos); - } - else - { - // See if the path ends with .cli. If not, then we assume this is - // a C++ inclusion. 
- // - size_t n (lexeme.size ()); - - if (n > 5 && lexeme.compare (n - 5, 4, ".cli") == 0) - tt = token::t_cli_path_lit; - else - tt = token::t_cxx_path_lit; - } - - return token (tt, lexeme, ln, cl); -} - -token lexer:: -call_expression (xchar c) -{ - size_t ln (c.line ()), cl (c.column ()); - string lexeme; - lexeme += c; - size_t balance (1); - - while (balance != 0) - { - c = get (); - - if (is_eos (c)) - { - cerr << id_ << ':' << c.line () << ':' << c.column () << ": error: " - << "end of stream reached while reading call expression" << endl; - throw invalid_input (); - } - - lexeme += c; - - switch (c) - { - case '(': - { - balance++; - break; - } - case ')': - { - balance--; - break; - } - } - } - - return token (token::t_call_expr, lexeme, ln, cl); -} - -token lexer:: -template_expression (xchar c) -{ - size_t ln (c.line ()), cl (c.column ()); - string lexeme; - lexeme += c; - size_t balance (1); - - while (balance != 0) - { - c = get (); - - if (is_eos (c)) - { - cerr << id_ << ':' << c.line () << ':' << c.column () << ": error: " - << "end of stream reached while reading template expression" - << endl; - throw invalid_input (); - } - - lexeme += c; - - switch (c) - { - case '<': - { - balance++; - break; - } - case '>': - { - balance--; - break; - } - } - } - - return token (token::t_template_expr, lexeme, ln, cl); -} |