From 1470fed809be7b11f147f8a6ca924a252b473c97 Mon Sep 17 00:00:00 2001
From: Boris Kolpackov <boris@codesynthesis.com>
Date: Sat, 22 Aug 2009 10:24:18 +0200
Subject: Implement the CLI language parser

---
 cli/makefile   |   2 +-
 cli/parser.cxx | 613 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 cli/parser.hxx |  60 ++++++
 3 files changed, 674 insertions(+), 1 deletion(-)
 create mode 100644 cli/parser.cxx
 create mode 100644 cli/parser.hxx
diff --git a/cli/makefile b/cli/makefile
index d9be4f2..faea51c 100644
--- a/cli/makefile
+++ b/cli/makefile
@@ -5,7 +5,7 @@
 
 include $(dir $(lastword $(MAKEFILE_LIST)))../build/bootstrap.make
 
-cxx_tun := cli.cxx lexer.cxx
+cxx_tun := cli.cxx lexer.cxx parser.cxx
 
 #
 #
diff --git a/cli/parser.cxx b/cli/parser.cxx
new file mode 100644
index 0000000..937bf34
--- /dev/null
+++ b/cli/parser.cxx
@@ -0,0 +1,613 @@
+// file      : cli/parser.cxx
+// author    : Boris Kolpackov <boris@codesynthesis.com>
+// copyright : Copyright (c) 2009 Code Synthesis Tools CC
+// license   : MIT; see accompanying LICENSE file
+
+#include <iostream>
+
+#include "token.hxx"
+#include "lexer.hxx"
+#include "parser.hxx"
+
+using namespace std;
+
+const char* keywords[] =
+{
+  "include",
+  "namespace",
+  "class",
+  "signed",
+  "unsigned",
+  "bool",
+  "char",
+  "wchar_t",
+  "short",
+  "int",
+  "long",
+  "float",
+  "double"
+};
+
+const char* punctuation[] = {";", ",", "::", "{", "}", /*"(", ")",*/ "=", "|"};
+
+// Output the token type and value in a format suitable for diagnostics.
+//
+std::ostream&
+operator<< (std::ostream& os, Token const& t)
+{
+  switch (t.type ())
+  {
+  case Token::t_eos:
+    {
+      os << "end-of-stream";
+      break;
+    }
+  case Token::t_keyword:
+    {
+      os << "keyword '" << keywords[t.keyword ()] << "'";
+      break;
+    }
+  case Token::t_identifier:
+    {
+      os << "identifier '" << t.identifier () << "'";
+      break;
+    }
+  case Token::t_punctuation:
+    {
+      os << "'" << punctuation[t.punctuation ()] << "'";
+      break;
+    }
+  case Token::t_path_lit:
+    {
+      os << "path literal";
+      break;
+    }
+  case Token::t_string_lit:
+    {
+      os << "string literal";
+      break;
+    }
+  case Token::t_char_lit:
+    {
+      os << "char literal";
+      break;
+    }
+  case Token::t_bool_lit:
+    {
+      os << "bool literal";
+      break;
+    }
+  case Token::t_int_lit:
+    {
+      os << "integer literal";
+      break;
+    }
+  case Token::t_float_lit:
+    {
+      os << "floating point literal";
+      break;
+    }
+  case Token::t_call_expr:
+    {
+      os << "call expression";
+      break;
+    }
+  case Token::t_template_expr:
+    {
+      os << "template expression";
+      break;
+    }
+  }
+
+  return os;
+}
+
+void Parser::
+recover (Token& t)
+{
+  // Recover by skipping past next ';'.
+  //
+  for (;; t = lexer_->next ())
+  {
+    if (t.type () == Token::t_eos)
+      break;
+
+    if (t.punctuation () == Token::p_semi)
+    {
+      t = lexer_->next ();
+      break;
+    }
+  }
+}
+
+void Parser::
+parse (std::istream& is, std::string const& id)
+{
+  Lexer lexer (is, id);
+  lexer_ = &lexer;
+  id_ = &id;
+  valid_ = true;
+  def_unit ();
+
+  if (!valid_ || !lexer.valid ())
+    throw InvalidInput ();
+}
+
+void Parser::
+def_unit ()
+{
+  Token t (lexer_->next ());
+
+  // include-decl-seq
+  //
+  while (t.keyword () == Token::k_include)
+  {
+    try
+    {
+      include_decl ();
+      t = lexer_->next ();
+    }
+    catch (Error const&)
+    {
+      valid_ = false;
+      recover (t);
+    }
+  }
+
+  // decl-seq
+  //
+  while (t.type () != Token::t_eos)
+  {
+    try
+    {
+      if (decl (t))
+      {
+        t = lexer_->next ();
+        continue;
+      }
+
+      cerr << *id_ << ':' << t.line () << ':' << t.column () << ": error: "
+           << "expected namespace or class declaration instead of " << t
+           << endl;
+      throw Error ();
+    }
+    catch (Error const&)
+    {
+      valid_ = false;
+      break; // Non-recoverable error.
+    }
+  }
+}
+
+void Parser::
+include_decl ()
+{
+  Token t (lexer_->next ());
+
+  if (t.type () != Token::t_path_lit)
+  {
+    cerr << *id_ << ':' << t.line () << ':' << t.column () << ": error: "
+         << "expected path literal instead of " << t << endl;
+    throw Error ();
+  }
+
+  t = lexer_->next ();
+
+  if (t.punctuation () != Token::p_semi)
+  {
+    cerr << *id_ << ':' << t.line () << ':' << t.column () << ": error: "
+         << "expected ';' instead of " << t << endl;
+    throw Error ();
+  }
+}
+
+bool Parser::
+decl (Token& t)
+{
+  if (t.type () == Token::t_keyword)
+  {
+    switch (t.keyword ())
+    {
+    case Token::k_namespace:
+      {
+        namespace_def ();
+        return true;
+      }
+    case Token::k_class:
+      {
+        class_def ();
+        return true;
+      }
+    default:
+      break;
+    }
+  }
+  return false;
+}
+
+void Parser::
+namespace_def ()
+{
+  Token t (lexer_->next ());
+
+  if (t.type () != Token::t_identifier)
+  {
+    cerr << *id_ << ':' << t.line () << ':' << t.column () << ": error: "
+         << "expected identifier instead of " << t << endl;
+    throw Error ();
+  }
+
+  t = lexer_->next ();
+
+  if (t.punctuation () != Token::p_lcbrace)
+  {
+    cerr << *id_ << ':' << t.line () << ':' << t.column () << ": error: "
+         << "expected '{' instead of " << t << endl;
+    throw Error ();
+  }
+
+  // decl-seq
+  //
+  t = lexer_->next ();
+
+  while (decl (t))
+    t = lexer_->next ();
+
+  if (t.punctuation () != Token::p_rcbrace)
+  {
+    cerr << *id_ << ':' << t.line () << ':' << t.column () << ": error: "
+         << "expected namespace declaration, class declaration, or '}' "
+         << "instead of " << t << endl;
+    throw Error ();
+  }
+}
+
+void Parser::
+class_def ()
+{
+  Token t (lexer_->next ());
+
+  if (t.type () != Token::t_identifier)
+  {
+    cerr << *id_ << ':' << t.line () << ':' << t.column () << ": error: "
+         << "expected identifier instead of " << t << endl;
+    throw Error ();
+  }
+
+  t = lexer_->next ();
+
+  if (t.punctuation () != Token::p_lcbrace)
+  {
+    cerr << *id_ << ':' << t.line () << ':' << t.column () << ": error: "
+         << "expected '{' instead of " << t << endl;
+    throw Error ();
+  }
+
+  // decl-seq
+  //
+  t = lexer_->next ();
+
+  while (true)
+  {
+    try
+    {
+      if (!option_def (t))
+        break;
+
+      t = lexer_->next ();
+    }
+    catch (Error const&)
+    {
+      valid_ = false;
+      recover (t);
+    }
+  }
+
+  if (t.punctuation () != Token::p_rcbrace)
+  {
+    cerr << *id_ << ':' << t.line () << ':' << t.column () << ": error: "
+         << "expected option declaration or '}' instead of " << t << endl;
+    throw Error ();
+  }
+
+  t = lexer_->next ();
+
+  if (t.punctuation () != Token::p_semi)
+  {
+    cerr << *id_ << ':' << t.line () << ':' << t.column () << ": error: "
+         << "expected ';' instead of " << t << endl;
+    throw Error ();
+  }
+}
+
+bool Parser::
+option_def (Token& t)
+{
+  // type-spec
+  //
+  // These two functions set t to the next token if they return
+  // true.
+  //
+  if (!qualified_name (t) && !fundamental_type (t))
+    return false;
+
+  // option-name-seq
+  //
+  while (true)
+  {
+    if (t.type () != Token::t_identifier)
+    {
+      cerr << *id_ << ':' << t.line () << ':' << t.column () << ": error: "
+           << "option name expected instead of " << t << endl;
+      throw Error ();
+    }
+
+    t = lexer_->next ();
+
+    if (t.punctuation () == Token::p_or)
+      t = lexer_->next ();
+    else
+      break;
+  }
+
+  // initializer
+  //
+  if (t.punctuation () == Token::p_eq)
+  {
+    t = lexer_->next ();
+
+    // assignment initiaizer
+    //
+    if (qualified_name (t))
+    {
+    }
+    else
+    {
+      switch (t.type ())
+      {
+      case Token::t_string_lit:
+      case Token::t_char_lit:
+      case Token::t_bool_lit:
+      case Token::t_int_lit:
+      case Token::t_float_lit:
+      case Token::t_call_expr:
+        {
+          t = lexer_->next ();
+          break;
+        }
+      default:
+        {
+          cerr << *id_ << ':' << t.line () << ':' << t.column ()
+               << ": error: expected intializer instead of " << t << endl;
+          throw Error ();
+        }
+      }
+    }
+  }
+  else if (t.type () == Token::t_call_expr)
+  {
+    // c-tor initializer
+    //
+    t = lexer_->next ();
+  }
+
+  if (t.punctuation () != Token::p_semi)
+  {
+    cerr << *id_ << ':' << t.line () << ':' << t.column () << ": error: "
+         << "expected ';' instead of " << t << endl;
+    throw Error ();
+  }
+
+  return true;
+}
+
+bool Parser::
+qualified_name (Token& t)
+{
+  if (t.type () != Token::t_identifier && t.punctuation () != Token::p_dcolon)
+    return false;
+
+  if (t.punctuation () == Token::p_dcolon)
+    t = lexer_->next ();
+
+  while (true)
+  {
+    if (t.type () != Token::t_identifier)
+    {
+      cerr << *id_ << ':' << t.line () << ':' << t.column () << ": error: "
+           << "expected identifier after '::'" << endl;
+      throw Error ();
+    }
+
+    t = lexer_->next ();
+
+    if (t.type () == Token::t_template_expr)
+    {
+      // Template-id.
+      //
+      t = lexer_->next ();
+    }
+
+    if (t.punctuation () == Token::p_dcolon)
+      t = lexer_->next ();
+    else
+      break;
+  }
+
+  return true;
+}
+
+bool Parser::
+fundamental_type (Token& t)
+{
+  switch (t.keyword ())
+  {
+  case Token::k_signed:
+  case Token::k_unsigned:
+    {
+      switch ((t = lexer_->next ()).keyword ())
+      {
+      case Token::k_short:
+      case Token::k_long:
+        {
+          switch ((t = lexer_->next ()).keyword ())
+          {
+          case Token::k_int:
+            {
+              t = lexer_->next ();
+            }
+          default:
+            break;
+          }
+          break;
+        }
+      case Token::k_int:
+        {
+          switch ((t = lexer_->next ()).keyword ())
+          {
+          case Token::k_short:
+          case Token::k_long:
+            {
+              t = lexer_->next ();
+            }
+          default:
+            break;
+          }
+          break;
+        }
+      case Token::k_char:
+        {
+          t = lexer_->next ();
+          break;
+        }
+      default:
+        break;
+      }
+      break;
+    }
+  case Token::k_short:
+  case Token::k_long:
+    {
+      bool l (t.keyword () == Token::k_long);
+
+      switch ((t = lexer_->next ()).keyword ())
+      {
+      case Token::k_signed:
+      case Token::k_unsigned:
+        {
+          switch ((t = lexer_->next ()).keyword ())
+          {
+          case Token::k_int:
+            {
+              t = lexer_->next ();
+            }
+          default:
+            break;
+          }
+          break;
+        }
+      case Token::k_int:
+        {
+          switch ((t = lexer_->next ()).keyword ())
+          {
+          case Token::k_signed:
+          case Token::k_unsigned:
+            {
+              t = lexer_->next ();
+            }
+          default:
+            break;
+          }
+          break;
+        }
+      case Token::k_double:
+        {
+          if (l)
+            t = lexer_->next ();
+
+          break;
+        }
+      default:
+        break;
+      }
+      break;
+    }
+  case Token::k_int:
+    {
+      switch ((t = lexer_->next ()).keyword ())
+      {
+      case Token::k_signed:
+      case Token::k_unsigned:
+        {
+          switch ((t = lexer_->next ()).keyword ())
+          {
+          case Token::k_short:
+          case Token::k_long:
+            {
+              t = lexer_->next ();
+            }
+          default:
+            break;
+          }
+          break;
+        }
+      case Token::k_short:
+      case Token::k_long:
+        {
+          switch ((t = lexer_->next ()).keyword ())
+          {
+          case Token::k_signed:
+          case Token::k_unsigned:
+            {
+              t = lexer_->next ();
+            }
+          default:
+            break;
+          }
+          break;
+        }
+      default:
+        break;
+      }
+      break;
+    }
+  case Token::k_char:
+    {
+      switch ((t = lexer_->next ()).keyword ())
+      {
+      case Token::k_signed:
+      case Token::k_unsigned:
+        {
+          t = lexer_->next ();
+        }
+      default:
+        break;
+      }
+      break;
+    }
+  case Token::k_bool:
+  case Token::k_wchar:
+  case Token::k_float:
+    {
+      t = lexer_->next ();
+      break;
+    }
+  case Token::k_double:
+    {
+      switch ((t = lexer_->next ()).keyword ())
+      {
+      case Token::k_long:
+        {
+          t = lexer_->next ();
+        }
+      default:
+        break;
+      }
+      break;
+    }
+  default:
+    return false;
+  }
+
+  return true;
+}
diff --git a/cli/parser.hxx b/cli/parser.hxx
new file mode 100644
index 0000000..d1f4b4b
--- /dev/null
+++ b/cli/parser.hxx
@@ -0,0 +1,60 @@
+// file      : cli/parser.hxx
+// author    : Boris Kolpackov <boris@codesynthesis.com>
+// copyright : Copyright (c) 2009 Code Synthesis Tools CC
+// license   : MIT; see accompanying LICENSE file
+
+#ifndef CLI_PARSER_HXX
+#define CLI_PARSER_HXX
+
+#include <string>
+#include <istream>
+
+class Token;
+class Lexer;
+
+class Parser
+{
+public:
+  struct InvalidInput {};
+
+  void
+  parse (std::istream& is, std::string const& id);
+
+private:
+  struct Error {};
+
+  void
+  def_unit ();
+
+  void
+  include_decl ();
+
+  bool
+  decl (Token&);
+
+  void
+  namespace_def ();
+
+  void
+  class_def ();
+
+  bool
+  option_def (Token&);
+
+  bool
+  qualified_name (Token&);
+
+  bool
+  fundamental_type (Token&);
+
+private:
+  void
+  recover (Token& t);
+
+private:
+  bool valid_;
+  Lexer* lexer_;
+  std::string const* id_;
+};
+
+#endif // CLI_PARSER_HXX
-- 
cgit v1.1