summaryrefslogtreecommitdiff
path: root/cli
diff options
context:
space:
mode:
author: Boris Kolpackov <boris@codesynthesis.com> 2009-08-09 15:20:02 +0200
committer: Boris Kolpackov <boris@codesynthesis.com> 2009-08-09 15:20:02 +0200
commit: d311e253ca3dcabb9e52d273110dea8b950571f7 (patch)
tree: 3f8503433cba6425259d897d5008e7a2497a6430 /cli
parent: caa0969db07e6f252dc5c270222107a591c1ca7d (diff)
Implement lexical analyzer for the CLI language
Diffstat (limited to 'cli')
-rw-r--r-- cli/lexer.cxx 413
-rw-r--r-- cli/lexer.hxx 127
-rw-r--r-- cli/lexer.ixx 86
3 files changed, 626 insertions, 0 deletions
diff --git a/cli/lexer.cxx b/cli/lexer.cxx
new file mode 100644
index 0000000..203667a
--- /dev/null
+++ b/cli/lexer.cxx
@@ -0,0 +1,413 @@
+// file : cli/lexer.cxx
+// author : Boris Kolpackov <boris@codesynthesis.com>
+// copyright : Copyright (c) 2009 Code Synthesis Tools CC
+// license : MIT; see accompanying LICENSE file
+
+#include <iostream>
+
+#include "lexer.hxx"
+
+using namespace std;
+
+// Create a lexer reading from the stream is. The id string is stored
+// and used as the leading "<id>:<line>:<column>" part of diagnostics.
+// Position counting starts at line 1, column 1 and characters are
+// classified according to the "C" locale.
+//
+Lexer::
+Lexer (istream& is, string const& id)
+  : loc_ ("C"),
+    is_ (is),
+    id_ (id),
+    l_ (1),
+    c_ (1),
+    eos_ (false),
+    include_ (false)
+{
+  // Keyword spellings and the token codes they map to.
+  //
+  struct Entry
+  {
+    char const* name;
+    Token::Keyword code;
+  };
+
+  static Entry const table[] =
+  {
+    {"include",   Token::k_include},
+    {"namespace", Token::k_namespace},
+    {"class",     Token::k_class},
+    {"signed",    Token::k_signed},
+    {"unsigned",  Token::k_unsigned},
+    {"bool",      Token::k_bool},
+    {"char",      Token::k_char},
+    {"wchar_t",   Token::k_wchar},
+    {"short",     Token::k_short},
+    {"int",       Token::k_int},
+    {"long",      Token::k_long},
+    {"float",     Token::k_float},
+    {"double",    Token::k_double}
+  };
+
+  for (size_t i (0); i < sizeof (table) / sizeof (table[0]); ++i)
+    keyword_map_[table[i].name] = table[i].code;
+}
+
+// Extract the next character from the stream and advance the current
+// line/column position accordingly. At the end of the stream the eos
+// character is returned and nothing is extracted.
+//
+Lexer::Char Lexer::
+get ()
+{
+  // Calling is_.get () at eof also sets the failbit, which may trigger
+  // an exception. To avoid that we peek() first and only extract if
+  // there is something to extract. Note that peek() remembers that eof
+  // has been seen so that the stream is not peeked at eof twice (which
+  // would set the failbit as well).
+  //
+  Char r (peek ());
+
+  if (is_eos (r))
+    return r;
+
+  is_.get ();
+
+  if (r == '\n')
+  {
+    // Start of a new line.
+    //
+    ++l_;
+    c_ = 1;
+  }
+  else
+    ++c_;
+
+  return r;
+}
+
+// Return the next character, tagged with the current line and column,
+// without extracting it. Once eof has been observed it is remembered
+// in eos_ and the underlying stream is not consulted again.
+//
+Lexer::Char Lexer::
+peek ()
+{
+  if (!eos_)
+  {
+    Char::IntType const v (is_.peek ());
+
+    if (v == Char::Traits::eof ())
+      eos_ = true;
+
+    return Char (v, l_, c_);
+  }
+
+  return Char (Char::Traits::eof (), l_, c_);
+}
+
+// Scan and return the next token from the input.
+//
+// When the end of the stream is reached, a token constructed from just
+// the current line and column is returned (presumably the end-of-stream
+// token; confirm against token.hxx).
+//
+// On a lexical error a diagnostic is written to cerr, the input is
+// discarded up to and including the next ';' and scanning is retried
+// from there. If the stream ends during recovery, the same
+// line/column-only token is returned.
+//
+Token Lexer::
+next ()
+{
+ while (true) // Recovery loop.
+ {
+ // Remember whether the previous token was the 'include' keyword (the
+ // flag is set by identifier()) and reset it so that it only affects
+ // the token we are about to scan.
+ //
+ bool include (include_);
+ include_ = false;
+
+ skip_spaces ();
+
+ Char c (get ());
+
+ if (is_eos (c))
+ return Token (c.line (), c.column ());
+
+ try
+ {
+ switch (c)
+ {
+ case '\'':
+ {
+ return char_literal (c);
+ }
+ case '\"':
+ {
+ // Right after 'include' a quoted sequence is a path, not a string.
+ //
+ if (include)
+ return path_literal (c);
+ else
+ return string_literal (c);
+ }
+ case '<':
+ {
+ // '<' is only meaningful as the start of an include path. Otherwise
+ // we break out of the switch and, since '<' matches none of the
+ // checks below, end up reporting it as an unexpected character.
+ //
+ if (include)
+ return path_literal (c);
+
+ break;
+ }
+ case ';':
+ {
+ return Token (Token::p_semi, c.line (), c.column ());
+ }
+ case ',':
+ {
+ return Token (Token::p_comma, c.line (), c.column ());
+ }
+ case ':':
+ {
+ return Token (Token::p_colon, c.line (), c.column ());
+ }
+ case '{':
+ {
+ return Token (Token::p_lcbrace, c.line (), c.column ());
+ }
+ case '}':
+ {
+ return Token (Token::p_rcbrace, c.line (), c.column ());
+ }
+ case '(':
+ {
+ return Token (Token::p_lparen, c.line (), c.column ());
+ }
+ case ')':
+ {
+ return Token (Token::p_rparen, c.line (), c.column ());
+ }
+ case '=':
+ {
+ return Token (Token::p_eq, c.line (), c.column ());
+ }
+ case '|':
+ {
+ return Token (Token::p_or, c.line (), c.column ());
+ }
+ case '-':
+ {
+ // This can be the beginning of an identifier or of an integer
+ // literal. Figure out which one it is.
+ //
+ Char p (peek ());
+
+ // The literal is positioned at the '-' rather than at its first digit.
+ //
+ if (is_dec_digit (p))
+ return int_literal (get (), true, c.line (), c.column ());
+ else if (is_space (p))
+ {
+ // Whitespace is allowed between the '-' and the digits.
+ //
+ skip_spaces ();
+ p = peek ();
+
+ if (is_dec_digit (p))
+ return int_literal (get (), true, c.line (), c.column ());
+
+ // Stray '-'.
+ //
+ cerr << id_ << ':' << c.line () << ':' << c.column ()
+ << ": error: unexpected character '-'" << endl;
+ throw invalid_input ();
+ }
+
+ // Neither a digit nor a space follows: fall out of the switch and let
+ // the identifier check below handle the '-'.
+ //
+ break;
+ }
+ }
+
+ // Identifiers may start with a letter, '_', '-', or '/'.
+ //
+ if (is_alpha (c) || c == '_' || c == '-' || c == '/')
+ {
+ return identifier (c);
+ }
+
+ if (is_dec_digit (c))
+ {
+ return int_literal (c);
+ }
+
+ // Nothing matched. Report and fall through to recovery.
+ //
+ cerr << id_ << ':' << c.line () << ':' << c.column ()
+ << ": error: unexpected character '" << c << "'" << endl;
+ }
+ catch (invalid_input const&)
+ {
+ // The diagnostic has already been issued at the throw site; continue
+ // with recovery below.
+ }
+
+ // Try to recover.
+ //
+ // Skip everything up to and including the next ';' and then go back to
+ // the top of the recovery loop to scan the next token.
+ //
+ do
+ {
+ c = get ();
+
+ if (is_eos (c))
+ return Token (c.line (), c.column ());
+ } while (c != ';');
+ }
+}
+
+// Consume characters for as long as the next one is whitespace. Stops
+// (without consuming) at the first non-space character or at the end
+// of the stream.
+//
+void Lexer::
+skip_spaces ()
+{
+  while (true)
+  {
+    Char next (peek ());
+
+    if (is_eos (next) || !is_space (next))
+      break;
+
+    get ();
+  }
+}
+
+// Scan an identifier, keyword, or boolean literal whose first (already
+// extracted) character is c. The rest of the lexeme consists of
+// alphanumeric characters, '_', and '-'.
+//
+// Returns a keyword token if the lexeme is in keyword_map_ (recognizing
+// 'include' also sets include_ so that the next token can be scanned as
+// a path), a t_bool_lit token for "true" and "false", and a
+// t_identifier token otherwise. The token is positioned at the first
+// character of the lexeme.
+//
+// Throws invalid_input, after issuing a diagnostic, if the lexeme starts
+// with '-' or '/' and contains no alphanumeric character or '_' after
+// the first character (for example, "-", "--", or "/").
+//
+Token Lexer::
+identifier (Char c)
+{
+  size_t ln (c.line ()), cl (c.column ());
+  string lexeme;
+  lexeme += c;
+
+  bool check (c == '-' || c == '/');
+
+  for (c = peek ();
+       !is_eos (c) && (is_alnum (c) || c == '_' || c == '-');
+       c = peek ())
+  {
+    get ();
+    lexeme += c;
+  }
+
+  // Check for invalid identifiers.
+  //
+  if (check)
+  {
+    size_t i (1);
+
+    for (; i < lexeme.size (); ++i)
+      if (is_alnum (lexeme[i]) || lexeme[i] == '_')
+        break;
+
+    if (i == lexeme.size ())
+    {
+      // Report the position where the offending sequence starts. Note
+      // that by now c is the character that follows the lexeme (or eos)
+      // so its position would point past the problem.
+      //
+      cerr << id_ << ':' << ln << ':' << cl << ": error: "
+           << "invalid character sequence '" << lexeme << "'" << endl;
+      throw invalid_input ();
+    }
+  }
+
+  KeywordMap::const_iterator i (keyword_map_.find (lexeme));
+
+  if (i != keyword_map_.end ())
+  {
+    if (i->second == Token::k_include)
+      include_ = true;
+
+    return Token (i->second, ln, cl);
+  }
+
+  if (lexeme == "true" || lexeme == "false")
+    return Token (Token::t_bool_lit, lexeme, ln, cl);
+
+  return Token (Token::t_identifier, lexeme, ln, cl);
+}
+
+// Scan a decimal integer literal whose first (already extracted) digit
+// is c. If neg is true, the lexeme is prefixed with '-' and the token is
+// positioned at line ml, column mc (the position of the minus sign)
+// instead of at the first digit.
+//
+Token Lexer::
+int_literal (Char c, bool neg, size_t ml, size_t mc)
+{
+  size_t ln, cl;
+  string lexeme;
+
+  if (neg)
+  {
+    ln = ml;
+    cl = mc;
+    lexeme += '-';
+  }
+  else
+  {
+    ln = c.line ();
+    cl = c.column ();
+  }
+
+  lexeme += c;
+
+  // Append the remaining digits.
+  //
+  while (true)
+  {
+    c = peek ();
+
+    if (is_eos (c) || !is_dec_digit (c))
+      break;
+
+    get ();
+    lexeme += c;
+  }
+
+  return Token (Token::t_int_lit, lexeme, ln, cl);
+}
+
+// Scan a character literal. c is the already extracted opening quote.
+// Everything up to and including the first unescaped single quote is
+// consumed and the resulting t_char_lit token contains the literal
+// verbatim, quotes included.
+//
+// Throws invalid_input, after issuing a diagnostic, if the stream ends
+// before the closing quote.
+//
+Token Lexer::
+char_literal (Char c)
+{
+  size_t ln (c.line ()), cl (c.column ());
+  string lexeme;
+  lexeme += c;
+
+  // True if the previous character was a backslash that itself was not
+  // escaped. This is what lets us tell '\'' (quote is escaped) from
+  // '\\' (quote closes the literal).
+  //
+  bool escaped (c == '\\');
+
+  while (true)
+  {
+    c = get ();
+
+    if (is_eos (c))
+    {
+      cerr << id_ << ':' << c.line () << ':' << c.column () << ": error: "
+           << "end of stream reached while reading character literal" << endl;
+      throw invalid_input ();
+    }
+
+    lexeme += c;
+
+    if (c == '\'' && !escaped)
+      break;
+
+    // A backslash starts an escape unless it is the second half of a
+    // \\ pair.
+    //
+    escaped = (c == '\\' && !escaped);
+  }
+
+  return Token (Token::t_char_lit, lexeme, ln, cl);
+}
+
+// Scan a string literal. c is the already extracted opening quote.
+//
+// Adjacent string literals separated only by whitespace are collected
+// into a single token: each piece is kept verbatim (quotes included)
+// and the pieces are joined with a single space, so that the input
+// "a"   "b" yields the lexeme "a" "b".
+//
+// Exceptions thrown by string_literal_trailer() (invalid_input on a
+// premature end of stream) propagate to the caller.
+//
+Token Lexer::
+string_literal (Char c)
+{
+  size_t ln (c.line ()), cl (c.column ());
+  string lexeme;
+  lexeme += c;
+
+  while (true)
+  {
+    lexeme += string_literal_trailer ();
+
+    // Check if there are more strings.
+    //
+    skip_spaces ();
+
+    c = peek ();
+
+    if (is_eos (c) || c != '"')
+      break;
+
+    get ();
+    lexeme += " \"";
+  }
+
+  // This is a string, not a character, literal.
+  //
+  return Token (Token::t_string_lit, lexeme, ln, cl);
+}
+
+// Consume the remainder of a string literal whose opening quote has
+// already been extracted and return it verbatim, closing quote
+// included.
+//
+// Throws invalid_input, after issuing a diagnostic, if the stream ends
+// before the closing quote.
+//
+string Lexer::
+string_literal_trailer ()
+{
+  string r;
+
+  // True if the previous character was a backslash that itself was not
+  // escaped. We need to keep track of \\ escapes so that we don't
+  // confuse them with \", as in "\\".
+  //
+  bool escaped (false);
+
+  while (true)
+  {
+    Char c = get ();
+
+    if (is_eos (c))
+    {
+      cerr << id_ << ':' << c.line () << ':' << c.column () << ": error: "
+           << "end of stream reached while reading string literal" << endl;
+      throw invalid_input ();
+    }
+
+    r += c;
+
+    if (c == '"' && !escaped)
+      break;
+
+    escaped = (c == '\\' && !escaped);
+  }
+
+  return r;
+}
+
+// Scan an include path. c is the already extracted opening delimiter:
+// a path opened with '<' is closed by '>', any other opener is closed
+// by '"'. The t_path_lit token contains the path verbatim, delimiters
+// included. No escape processing is performed.
+//
+// Throws invalid_input, after issuing a diagnostic, if the stream ends
+// before the closing delimiter.
+//
+Token Lexer::
+path_literal (Char c)
+{
+  size_t ln (c.line ()), cl (c.column ());
+  string lexeme;
+  lexeme += c;
+
+  char end;
+
+  if (c == '<')
+    end = '>';
+  else
+    end = '"';
+
+  do
+  {
+    c = get ();
+
+    if (is_eos (c))
+    {
+      cerr << id_ << ':' << c.line () << ':' << c.column () << ": error: "
+           << "end of stream reached while reading path literal" << endl;
+      throw invalid_input ();
+    }
+
+    lexeme += c;
+  }
+  while (c != end);
+
+  return Token (Token::t_path_lit, lexeme, ln, cl);
+}
diff --git a/cli/lexer.hxx b/cli/lexer.hxx
new file mode 100644
index 0000000..1caceb3
--- /dev/null
+++ b/cli/lexer.hxx
@@ -0,0 +1,127 @@
+// file : cli/lexer.hxx
+// author : Boris Kolpackov <boris@codesynthesis.com>
+// copyright : Copyright (c) 2009 Code Synthesis Tools CC
+// license : MIT; see accompanying LICENSE file
+
+#ifndef CLI_LEXER_HXX
+#define CLI_LEXER_HXX
+
+#include <map>
+#include <string>
+#include <locale>
+#include <cstddef> // std::size_t
+#include <istream>
+
+#include "token.hxx"
+
+// Lexical analyzer for the CLI language. Reads characters from an
+// input stream and splits them into tokens, one per call to next().
+// Diagnostics are written to cerr by the implementation.
+//
+class Lexer
+{
+public:
+ // Create a lexer for the stream is. The stream is held by reference.
+ // The id string is used as the first component of the
+ // "<id>:<line>:<column>" prefix of diagnostics.
+ //
+ Lexer (std::istream& is, std::string const& id);
+
+ // Return the next token from the input.
+ //
+ Token
+ next ();
+
+protected:
+ // A character (or the eof value) together with the line and column
+ // at which it was read. Implicitly converts to the plain character
+ // type so that it can be compared to character literals and passed
+ // to the is_*() predicates.
+ //
+ class Char
+ {
+ public:
+ typedef std::char_traits<char> Traits;
+ typedef Traits::int_type IntType;
+ typedef Traits::char_type CharType;
+
+ Char (IntType v, std::size_t l, std::size_t c);
+
+ operator CharType () const;
+
+ // The integer value, which can be Traits::eof ().
+ //
+ IntType
+ value () const;
+
+ std::size_t
+ line () const;
+
+ std::size_t
+ column () const;
+
+ private:
+ IntType v_;
+ std::size_t l_;
+ std::size_t c_;
+ };
+
+ // Extract the next character, updating the current line and column.
+ //
+ Char
+ get ();
+
+ // Return the next character without extracting it.
+ //
+ Char
+ peek ();
+
+protected:
+ // Thrown by the scanning functions after a diagnostic has been
+ // issued; caught in next() which then attempts to recover.
+ //
+ class invalid_input {};
+
+ void
+ skip_spaces ();
+
+ // Scanning functions. The Char argument is the already extracted
+ // first character of the token.
+ //
+ Token
+ identifier (Char);
+
+ // If neg is true, the literal is negative and ml/mc are the line and
+ // column of the minus sign, used as the position of the token.
+ //
+ Token
+ int_literal (Char,
+ bool neg = false,
+ std::size_t ml = 0,
+ std::size_t mc = 0);
+
+ Token
+ char_literal (Char);
+
+ Token
+ string_literal (Char);
+
+ // Read the rest of a string literal after its opening quote.
+ //
+ std::string
+ string_literal_trailer ();
+
+ Token
+ path_literal (Char);
+
+protected:
+ // Character classification and conversion according to loc_.
+ //
+ bool
+ is_alpha (char c) const;
+
+ bool
+ is_oct_digit (char c) const;
+
+ bool
+ is_dec_digit (char c) const;
+
+ bool
+ is_hex_digit (char c) const;
+
+ bool
+ is_alnum (char c) const;
+
+ bool
+ is_space (char c) const;
+
+ // True if c holds the eof value.
+ //
+ bool
+ is_eos (Char const& c) const;
+
+ char
+ to_upper (char c) const;
+
+private:
+ typedef std::map<std::string, Token::Keyword> KeywordMap;
+
+ // Note: members are initialized in the order they are declared here.
+ //
+ std::locale loc_;
+ std::istream& is_;
+ std::string id_;
+ std::size_t l_; // Current line.
+ std::size_t c_; // Current column.
+
+ // Keyword spelling to keyword code.
+ //
+ KeywordMap keyword_map_;
+
+ bool eos_; // Set once peek() has seen eof.
+ bool include_; // Set when the 'include' keyword has just been scanned.
+};
+
+#include "lexer.ixx"
+
+#endif // CLI_LEXER_HXX
diff --git a/cli/lexer.ixx b/cli/lexer.ixx
new file mode 100644
index 0000000..f7ff77e
--- /dev/null
+++ b/cli/lexer.ixx
@@ -0,0 +1,86 @@
+// file : cli/lexer.ixx
+// author : Boris Kolpackov <boris@codesynthesis.com>
+// copyright : Copyright (c) 2009 Code Synthesis Tools CC
+// license : MIT; see accompanying LICENSE file
+
+// Lexer::Char
+//
+// Create a character with the integer value v positioned at line l,
+// column c.
+//
+inline Lexer::Char::
+Char (IntType v, std::size_t l, std::size_t c)
+  : v_ (v),
+    l_ (l),
+    c_ (c)
+{
+}
+
+// Convert the stored integer value to the plain character type.
+//
+inline Lexer::Char::
+operator CharType () const
+{
+  CharType const ch (Traits::to_char_type (v_));
+  return ch;
+}
+
+// The integer value as stored at construction (can be the eof value).
+//
+inline Lexer::Char::IntType Lexer::Char::
+value () const
+{
+  return this->v_;
+}
+
+// The line number this character was created with.
+//
+inline std::size_t Lexer::Char::
+line () const
+{
+  return this->l_;
+}
+
+// The column number this character was created with.
+//
+inline std::size_t Lexer::Char::
+column () const
+{
+  return this->c_;
+}
+
+// Lexer
+//
+// True if c is classified as alphabetic by the lexer's locale.
+//
+inline bool Lexer::
+is_alpha (char c) const
+{
+  return std::use_facet<std::ctype<char> > (loc_).is (
+    std::ctype_base::alpha, c);
+}
+
+// True if c is a digit according to the lexer's locale, other than
+// '8' and '9'.
+//
+inline bool Lexer::
+is_oct_digit (char c) const
+{
+  if (!std::isdigit (c, loc_))
+    return false;
+
+  return !(c == '8' || c == '9');
+}
+
+// True if c is classified as a digit by the lexer's locale.
+//
+inline bool Lexer::
+is_dec_digit (char c) const
+{
+  return std::use_facet<std::ctype<char> > (loc_).is (
+    std::ctype_base::digit, c);
+}
+
+// True if c is classified as a hexadecimal digit by the lexer's locale.
+//
+inline bool Lexer::
+is_hex_digit (char c) const
+{
+  return std::use_facet<std::ctype<char> > (loc_).is (
+    std::ctype_base::xdigit, c);
+}
+
+// True if c is classified as alphanumeric by the lexer's locale.
+//
+inline bool Lexer::
+is_alnum (char c) const
+{
+  return std::use_facet<std::ctype<char> > (loc_).is (
+    std::ctype_base::alnum, c);
+}
+
+// True if c is classified as whitespace by the lexer's locale.
+//
+inline bool Lexer::
+is_space (char c) const
+{
+  return std::use_facet<std::ctype<char> > (loc_).is (
+    std::ctype_base::space, c);
+}
+
+// True if c holds the end-of-stream (eof) value rather than a real
+// character.
+//
+inline bool Lexer::
+is_eos (Char const& c) const
+{
+  return Char::Traits::eq_int_type (c.value (), Char::Traits::eof ());
+}
+
+// Convert c to upper case according to the lexer's locale.
+//
+inline char Lexer::
+to_upper (char c) const
+{
+  return std::use_facet<std::ctype<char> > (loc_).toupper (c);
+}