From 720c5a33b6a49cf328fdd7611f49153cf8f60247 Mon Sep 17 00:00:00 2001
From: Karen Arutyunov <karen@codesynthesis.com>
Date: Wed, 8 Apr 2020 14:51:57 +0300
Subject: Separate tests and examples into individual packages

Also make cli module to be explicitly enabled via the config.cli configuration
variable.
---
 cli/cli/lexer.cxx | 604 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 604 insertions(+)
 create mode 100644 cli/cli/lexer.cxx

(limited to 'cli/cli/lexer.cxx')
diff --git a/cli/cli/lexer.cxx b/cli/cli/lexer.cxx
new file mode 100644
index 0000000..573c76b
--- /dev/null
+++ b/cli/cli/lexer.cxx
@@ -0,0 +1,604 @@
+// file      : cli/lexer.cxx
+// author    : Boris Kolpackov <boris@codesynthesis.com>
+// license   : MIT; see accompanying LICENSE file
+
+#include <iostream>
+
+#include <cli/lexer.hxx>
+
+using namespace std;
+
+lexer::
+lexer (istream& is, string const& id)
+    : loc_ ("C"),
+      is_ (is),
+      id_ (id),
+      l_ (1),
+      c_(1),
+      eos_ (false),
+      include_ (false),
+      valid_ (true),
+      buf_ (0, 0, 0),
+      unget_ (false)
+{
+  keyword_map_["source"]    = token::k_source;
+  keyword_map_["include"]   = token::k_include;
+  keyword_map_["namespace"] = token::k_namespace;
+  keyword_map_["class"]     = token::k_class;
+  keyword_map_["signed"]    = token::k_signed;
+  keyword_map_["unsigned"]  = token::k_unsigned;
+  keyword_map_["bool"]      = token::k_bool;
+  keyword_map_["char"]      = token::k_char;
+  keyword_map_["wchar_t"]   = token::k_wchar;
+  keyword_map_["short"]     = token::k_short;
+  keyword_map_["int"]       = token::k_int;
+  keyword_map_["long"]      = token::k_long;
+  keyword_map_["float"]     = token::k_float;
+  keyword_map_["double"]    = token::k_double;
+}
+
+lexer::xchar lexer::
+peek ()
+{
+  if (unget_)
+    return buf_;
+  else
+  {
+    if (eos_)
+      return xchar (xchar::traits_type::eof (), l_, c_);
+    else
+    {
+      xchar::int_type i (is_.peek ());
+
+      if (i == xchar::traits_type::eof ())
+        eos_ = true;
+
+      return xchar (i, l_, c_);
+    }
+  }
+}
+
+lexer::xchar lexer::
+get ()
+{
+  if (unget_)
+  {
+    unget_ = false;
+    return buf_;
+  }
+  else
+  {
+    // When is_.get () returns eof, the failbit is also set (stupid,
+    // isn't?) which may trigger an exception. To work around this
+    // we will call peek() first and only call get() if it is not
+    // eof. But we can only call peek() on eof once; any subsequent
+    // calls will spoil the failbit (even more stupid).
+    //
+    xchar c (peek ());
+
+    if (!is_eos (c))
+    {
+      is_.get ();
+
+      if (c == '\n')
+      {
+        l_++;
+        c_ = 1;
+      }
+      else
+        c_++;
+    }
+
+    return c;
+  }
+}
+
+void lexer::
+unget (xchar c)
+{
+  // Because iostream::unget cannot work once eos is reached,
+  // we have to provide our own implementation.
+  //
+  buf_ = c;
+  unget_ = true;
+}
+
+token lexer::
+next ()
+{
+  while (true) // Recovery loop.
+  {
+    bool include (include_);
+    include_ = false;
+
+    skip_spaces ();
+
+    xchar c (get ());
+
+    if (is_eos (c))
+      return token (c.line (), c.column ());
+
+    try
+    {
+      switch (c)
+      {
+      case '\'':
+        {
+          return char_literal (c);
+        }
+      case '\"':
+        {
+          if (include)
+            return path_literal (c);
+          else
+            return string_literal (c);
+        }
+      case '<':
+        {
+          if (include)
+            return path_literal (c);
+          else
+            return template_expression (c);
+        }
+      case ';':
+        {
+          return token (token::p_semi, c.line (), c.column ());
+        }
+      case ',':
+        {
+          return token (token::p_comma, c.line (), c.column ());
+        }
+      case ':':
+        {
+          if (peek () == ':')
+          {
+            get ();
+            return token (token::p_dcolon, c.line (), c.column ());
+          }
+
+          return token (token::p_colon, c.line (), c.column ());
+        }
+      case '{':
+        {
+          return token (token::p_lcbrace, c.line (), c.column ());
+        }
+      case '}':
+        {
+          return token (token::p_rcbrace, c.line (), c.column ());
+        }
+      case '(':
+        {
+          return call_expression (c);
+        }
+      case '=':
+        {
+          return token (token::p_eq, c.line (), c.column ());
+        }
+      case '|':
+        {
+          return token (token::p_or, c.line (), c.column ());
+        }
+      case '-':
+        {
+          // This can be a beginning of an identifier or a an integer
+          // literal. Figure out which one it is.
+          //
+          xchar p (peek ());
+
+          if (is_dec_digit (p))
+            return int_literal (get (), true, c.line (), c.column ());
+          else if (is_space (p))
+          {
+            skip_spaces ();
+            p = peek ();
+
+            if (is_dec_digit (p))
+              return int_literal (get (), true, c.line (), c.column ());
+
+            // Stray '-'.
+            //
+            cerr << id_ << ':' << c.line () << ':' << c.column ()
+                 << ": error: unexpected character '-'" << endl;
+            throw invalid_input ();
+          }
+
+          break;
+        }
+      }
+
+      if (is_alpha (c) || c == '_' || c == '-' || c == '/')
+      {
+        return identifier (c);
+      }
+
+      if (is_dec_digit (c))
+      {
+        return int_literal (c);
+      }
+
+      cerr << id_ << ':' << c.line () << ':' << c.column ()
+           << ": error: unexpected character '" << c << "'" << endl;
+      throw invalid_input ();
+    }
+    catch (invalid_input const&)
+    {
+      valid_ = false;
+    }
+
+    // Try to recover.
+    //
+    do
+    {
+      c = get ();
+
+      if (is_eos (c))
+        return token (c.line (), c.column ());
+    } while (c != ';');
+  }
+}
+
+void lexer::
+skip_spaces ()
+{
+  for (xchar c (peek ());; c = peek ())
+  {
+    if (is_eos (c))
+      break;
+
+    if (c == '/')
+    {
+      c = get ();
+      xchar p (peek ());
+
+      if (p == '/')
+      {
+        get ();
+
+        // C++ comment. Read until newline or eos.
+        //
+        for (c = get (); !is_eos (c) && c != '\n'; c = get ()) ;
+        continue;
+      }
+      else if (p == '*')
+      {
+        get ();
+
+        // C comment.
+        //
+        for (c = get ();; c = get ())
+        {
+          if (is_eos (c))
+          {
+            cerr << id_ << ':' << c.line () << ':' << c.column ()
+                 << ": error: end of stream reached while reading "
+                 << "C-style comment" << endl;
+            throw invalid_input ();
+          }
+
+          if (c == '*')
+          {
+            c = peek ();
+            if (c == '/')
+            {
+              get ();
+              break;
+            }
+          }
+        }
+        continue;
+      }
+      else
+      {
+        unget (c);
+        break;
+      }
+    }
+
+    if (!is_space (c))
+      break;
+
+    get ();
+  }
+}
+
+token lexer::
+identifier (xchar c)
+{
+  size_t ln (c.line ()), cl (c.column ());
+  string lexeme;
+  lexeme += c;
+
+  bool check (c == '-' || c == '/');
+
+  for (c = peek ();
+       !is_eos (c) && (is_alnum (c) || c == '_' || c == '-');
+       c = peek ())
+  {
+    get ();
+    lexeme += c;
+  }
+
+  // Check for invalid identifiers.
+  //
+  if (check)
+  {
+    size_t i (1);
+
+    for (; i < lexeme.size (); ++i)
+      if (is_alnum (lexeme[i]) || lexeme[i] == '_')
+        break;
+
+    if (i == lexeme.size ())
+    {
+      cerr << id_ << ':' << c.line () << ':' << c.column () << ": error: "
+           << "invalid character sequence '" << lexeme << "'" << endl;
+      throw invalid_input ();
+    }
+  }
+
+  keyword_map::const_iterator i (keyword_map_.find (lexeme));
+
+  if (i != keyword_map_.end ())
+  {
+    if (i->second == token::k_include || i->second == token::k_source)
+      include_ = true;
+
+    return token (i->second, ln, cl);
+  }
+
+  if (lexeme == "true" || lexeme == "false")
+    return token (token::t_bool_lit, lexeme, ln, cl);
+
+  return token (token::t_identifier, lexeme, ln, cl);
+}
+
+token lexer::
+int_literal (xchar c, bool neg, size_t ml, size_t mc)
+{
+  size_t ln (neg ? ml : c.line ()), cl (neg ? mc : c.column ());
+  string lexeme;
+
+  if (neg)
+    lexeme += '-';
+
+  lexeme += c;
+
+  for (c = peek (); !is_eos (c) && is_dec_digit (c); c = peek ())
+  {
+    get ();
+    lexeme += c;
+  }
+
+  return token (token::t_int_lit, lexeme, ln, cl);
+}
+
+token lexer::
+char_literal (xchar c)
+{
+  size_t ln (c.line ()), cl (c.column ());
+  string lexeme;
+  lexeme += c;
+
+  char p (c);
+
+  while (true)
+  {
+    c = get ();
+
+    if (is_eos (c))
+    {
+      cerr << id_ << ':' << c.line () << ':' << c.column () << ": error: "
+           << "end of stream reached while reading character literal" << endl;
+      throw invalid_input ();
+    }
+
+    lexeme += c;
+
+    if (c == '\'' && p != '\\')
+      break;
+
+    // We need to keep track of \\ escapings so we don't confuse
+    // them with \', as in '\\'.
+    //
+    if (c == '\\' && p == '\\')
+      p = '\0';
+    else
+      p = c;
+  }
+
+  return token (token::t_char_lit, lexeme, ln, cl);
+}
+
+token lexer::
+string_literal (xchar c)
+{
+  size_t ln (c.line ()), cl (c.column ());
+  string lexeme;
+  lexeme += c;
+
+  while (true)
+  {
+    lexeme += string_literal_trailer ();
+
+    // Check if there are more strings.
+    //
+    skip_spaces ();
+
+    c = peek ();
+
+    if (is_eos (c) || c != '"')
+      break;
+
+    get ();
+    lexeme += "\"";
+  }
+
+  return token (token::t_string_lit, lexeme, ln, cl);
+}
+
+string lexer::
+string_literal_trailer ()
+{
+  string r;
+  char p ('\0');
+
+  while (true)
+  {
+    xchar c = get ();
+
+    if (is_eos (c))
+    {
+      cerr << id_ << ':' << c.line () << ':' << c.column () << ": error: "
+           << "end of stream reached while reading string literal" << endl;
+      throw invalid_input ();
+    }
+
+    r += c;
+
+    if (c == '"' && p != '\\')
+      break;
+
+    // We need to keep track of \\ escapings so we don't confuse
+    // them with \", as in "\\".
+    //
+    if (c == '\\' && p == '\\')
+      p = '\0';
+    else
+      p = c;
+  }
+
+  return r;
+}
+
+token lexer::
+path_literal (xchar c)
+{
+  size_t ln (c.line ()), cl (c.column ());
+  string lexeme;
+  lexeme += c;
+
+  char end (c == '<' ? '>' : '"');
+
+  while (true)
+  {
+    c = get ();
+
+    if (is_eos (c))
+    {
+      cerr << id_ << ':' << c.line () << ':' << c.column () << ": error: "
+           << "end of stream reached while reading path literal" << endl;
+      throw invalid_input ();
+    }
+
+    lexeme += c;
+
+    if (c == end)
+      break;
+  }
+
+  token::token_type tt;
+
+  if (lexeme.compare (1, 4, "c++:") == 0)
+  {
+    tt = token::t_cxx_path_lit;
+    lexeme = lexeme[0] + string (lexeme, 5, string::npos);
+  }
+  else if (lexeme.compare (1, 4, "cli:") == 0)
+  {
+    tt = token::t_cli_path_lit;
+    lexeme = lexeme[0] + string (lexeme, 5, string::npos);
+  }
+  else
+  {
+    // See if the path ends with .cli. If not, then we assume this is
+    // a C++ inclusion.
+    //
+    size_t n (lexeme.size ());
+
+    if (n > 5 && lexeme.compare (n - 5, 4, ".cli") == 0)
+      tt = token::t_cli_path_lit;
+    else
+      tt = token::t_cxx_path_lit;
+  }
+
+  return token (tt, lexeme, ln, cl);
+}
+
+token lexer::
+call_expression (xchar c)
+{
+  size_t ln (c.line ()), cl (c.column ());
+  string lexeme;
+  lexeme += c;
+  size_t balance (1);
+
+  while (balance != 0)
+  {
+    c = get ();
+
+    if (is_eos (c))
+    {
+      cerr << id_ << ':' << c.line () << ':' << c.column () << ": error: "
+           << "end of stream reached while reading call expression" << endl;
+      throw invalid_input ();
+    }
+
+    lexeme += c;
+
+    switch (c)
+    {
+    case '(':
+      {
+        balance++;
+        break;
+      }
+    case ')':
+      {
+        balance--;
+        break;
+      }
+    }
+  }
+
+  return token (token::t_call_expr, lexeme, ln, cl);
+}
+
+token lexer::
+template_expression (xchar c)
+{
+  size_t ln (c.line ()), cl (c.column ());
+  string lexeme;
+  lexeme += c;
+  size_t balance (1);
+
+  while (balance != 0)
+  {
+    c = get ();
+
+    if (is_eos (c))
+    {
+      cerr << id_ << ':' << c.line () << ':' << c.column () << ": error: "
+           << "end of stream reached while reading template expression"
+           << endl;
+      throw invalid_input ();
+    }
+
+    lexeme += c;
+
+    switch (c)
+    {
+    case '<':
+      {
+        balance++;
+        break;
+      }
+    case '>':
+      {
+        balance--;
+        break;
+      }
+    }
+  }
+
+  return token (token::t_template_expr, lexeme, ln, cl);
+}
-- 
cgit v1.1