From d311e253ca3dcabb9e52d273110dea8b950571f7 Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Sun, 9 Aug 2009 15:20:02 +0200 Subject: Implement lexical analyzer for the CLI language --- cli/lexer.cxx | 413 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ cli/lexer.hxx | 127 ++++++++++++++++++ cli/lexer.ixx | 86 ++++++++++++ 3 files changed, 626 insertions(+) create mode 100644 cli/lexer.cxx create mode 100644 cli/lexer.hxx create mode 100644 cli/lexer.ixx (limited to 'cli') diff --git a/cli/lexer.cxx b/cli/lexer.cxx new file mode 100644 index 0000000..203667a --- /dev/null +++ b/cli/lexer.cxx @@ -0,0 +1,413 @@ +// file : cli/lexer.cxx +// author : Boris Kolpackov +// copyright : Copyright (c) 2009 Code Synthesis Tools CC +// license : MIT; see accompanying LICENSE file + +#include + +#include "lexer.hxx" + +using namespace std; + +Lexer:: +Lexer (istream& is, string const& id) + : loc_ ("C"), + is_ (is), + id_ (id), + l_ (1), + c_(1), + eos_ (false), + include_ (false) +{ + keyword_map_["include"] = Token::k_include; + keyword_map_["namespace"] = Token::k_namespace; + keyword_map_["class"] = Token::k_class; + keyword_map_["signed"] = Token::k_signed; + keyword_map_["unsigned"] = Token::k_unsigned; + keyword_map_["bool"] = Token::k_bool; + keyword_map_["char"] = Token::k_char; + keyword_map_["wchar_t"] = Token::k_wchar; + keyword_map_["short"] = Token::k_short; + keyword_map_["int"] = Token::k_int; + keyword_map_["long"] = Token::k_long; + keyword_map_["float"] = Token::k_float; + keyword_map_["double"] = Token::k_double; +} + +Lexer::Char Lexer:: +get () +{ + // When is_.get () returns eof, the failbit is also set (stupid, + // isn't?) which may trigger an exception. To work around this + // we will call peek() first and only call get() if it is not + // eof. But we can only call peek() on eof once; any subsequent + // calls will spoil the failbit (even more stupid). + // + Char c (peek ()); + + if (!is_eos (c)) + { + is_.get (); + + if (c == '\n') + { + l_++; + c_ = 1; + } + else + c_++; + } + + return c; +} + +Lexer::Char Lexer:: +peek () +{ + if (eos_) + return Char (Char::Traits::eof (), l_, c_); + else + { + Char::IntType i (is_.peek ()); + + if (i == Char::Traits::eof ()) + eos_ = true; + + return Char (i, l_, c_); + } +} + +Token Lexer:: +next () +{ + while (true) // Recovery loop. + { + bool include (include_); + include_ = false; + + skip_spaces (); + + Char c (get ()); + + if (is_eos (c)) + return Token (c.line (), c.column ()); + + try + { + switch (c) + { + case '\'': + { + return char_literal (c); + } + case '\"': + { + if (include) + return path_literal (c); + else + return string_literal (c); + } + case '<': + { + if (include) + return path_literal (c); + + break; + } + case ';': + { + return Token (Token::p_semi, c.line (), c.column ()); + } + case ',': + { + return Token (Token::p_comma, c.line (), c.column ()); + } + case ':': + { + return Token (Token::p_colon, c.line (), c.column ()); + } + case '{': + { + return Token (Token::p_lcbrace, c.line (), c.column ()); + } + case '}': + { + return Token (Token::p_rcbrace, c.line (), c.column ()); + } + case '(': + { + return Token (Token::p_lparen, c.line (), c.column ()); + } + case ')': + { + return Token (Token::p_rparen, c.line (), c.column ()); + } + case '=': + { + return Token (Token::p_eq, c.line (), c.column ()); + } + case '|': + { + return Token (Token::p_or, c.line (), c.column ()); + } + case '-': + { + // This can be a beginning of an identifier or a an integer + // literal. Figure out which one it is. + // + Char p (peek ()); + + if (is_dec_digit (p)) + return int_literal (get (), true, c.line (), c.column ()); + else if (is_space (p)) + { + skip_spaces (); + p = peek (); + + if (is_dec_digit (p)) + return int_literal (get (), true, c.line (), c.column ()); + + // Stray '-'. + // + cerr << id_ << ':' << c.line () << ':' << c.column () + << ": error: unexpected character '-'" << endl; + throw invalid_input (); + } + + break; + } + } + + if (is_alpha (c) || c == '_' || c == '-' || c == '/') + { + return identifier (c); + } + + if (is_dec_digit (c)) + { + return int_literal (c); + } + + cerr << id_ << ':' << c.line () << ':' << c.column () + << ": error: unexpected character '" << c << "'" << endl; + } + catch (invalid_input const&) + { + } + + // Try to recover. + // + do + { + c = get (); + + if (is_eos (c)) + return Token (c.line (), c.column ()); + } while (c != ';'); + } +} + +void Lexer:: +skip_spaces () +{ + for (Char c (peek ()); !is_eos (c) && is_space (c); c = peek ()) + get (); +} + +Token Lexer:: +identifier (Char c) +{ + size_t ln (c.line ()), cl (c.column ()); + string lexeme; + lexeme += c; + + bool check (c == '-' || c == '/'); + + for (c = peek (); + !is_eos (c) && (is_alnum (c) || c == '_' || c == '-'); + c = peek ()) + { + get (); + lexeme += c; + } + + // Check for invalid identifiers. + // + if (check) + { + size_t i (1); + + for (; i < lexeme.size (); ++i) + if (is_alnum (lexeme[i]) || lexeme[i] == '_') + break; + + if (i == lexeme.size ()) + { + cerr << id_ << ':' << c.line () << ':' << c.column () << ": error: " + << "invalid character sequence '" << lexeme << "'" << endl; + throw invalid_input (); + } + } + + KeywordMap::const_iterator i (keyword_map_.find (lexeme)); + + if (i != keyword_map_.end ()) + { + if (i->second == Token::k_include) + include_ = true; + + return Token (i->second, ln, cl); + } + + if (lexeme == "true" || lexeme == "false") + return Token (Token::t_bool_lit, lexeme, ln, cl); + + return Token (Token::t_identifier, lexeme, ln, cl); +} + +Token Lexer:: +int_literal (Char c, bool neg, size_t ml, size_t mc) +{ + size_t ln (neg ? ml : c.line ()), cl (neg ? mc : c.column ()); + string lexeme; + + if (neg) + lexeme += '-'; + + lexeme += c; + + for (c = peek (); !is_eos (c) && is_dec_digit (c); c = peek ()) + { + get (); + lexeme += c; + } + + return Token (Token::t_int_lit, lexeme, ln, cl); +} + +Token Lexer:: +char_literal (Char c) +{ + size_t ln (c.line ()), cl (c.column ()); + string lexeme; + lexeme += c; + + char p (c); + + while (true) + { + c = get (); + + if (is_eos (c)) + { + cerr << id_ << ':' << c.line () << ':' << c.column () << ": error: " + << "end of stream reached while reading character literal" << endl; + throw invalid_input (); + } + + lexeme += c; + + if (c == '\'' && p != '\\') + break; + + // We need to keep track of \\ escapings so we don't confuse + // them with \', as in '\\'. + // + if (c == '\\' && p == '\\') + p = '.'; + else + p = c; + } + + return Token (Token::t_char_lit, lexeme, ln, cl); +} + +Token Lexer:: +string_literal (Char c) +{ + size_t ln (c.line ()), cl (c.column ()); + string lexeme; + lexeme += c; + + while (true) + { + lexeme += string_literal_trailer (); + + // Check if there are more strings. + // + skip_spaces (); + + c = peek (); + + if (is_eos (c) || c != '"') + break; + + get (); + lexeme += " \""; + } + + return Token (Token::t_char_lit, lexeme, ln, cl); +} + +string Lexer:: +string_literal_trailer () +{ + string r; + char p ('"'); + + while (true) + { + Char c = get (); + + if (is_eos (c)) + { + cerr << id_ << ':' << c.line () << ':' << c.column () << ": error: " + << "end of stream reached while reading string literal" << endl; + throw invalid_input (); + } + + r += c; + + if (c == '"' && p != '\\') + break; + + // We need to keep track of \\ escapings so we don't confuse + // them with \', as in '\\'. + // + if (c == '\\' && p == '\\') + p = '.'; + else + p = c; + } + + return r; +} + +Token Lexer:: +path_literal (Char c) +{ + size_t ln (c.line ()), cl (c.column ()); + string lexeme; + lexeme += c; + + char end (c == '<' ? '>' : '"'); + + while (true) + { + c = get (); + + if (is_eos (c)) + { + cerr << id_ << ':' << c.line () << ':' << c.column () << ": error: " + << "end of stream reached while reading path literal" << endl; + throw invalid_input (); + } + + lexeme += c; + + if (c == end) + break; + } + + return Token (Token::t_path_lit, lexeme, ln, cl); +} diff --git a/cli/lexer.hxx b/cli/lexer.hxx new file mode 100644 index 0000000..1caceb3 --- /dev/null +++ b/cli/lexer.hxx @@ -0,0 +1,127 @@ +// file : cli/lexer.hxx +// author : Boris Kolpackov +// copyright : Copyright (c) 2009 Code Synthesis Tools CC +// license : MIT; see accompanying LICENSE file + +#ifndef CLI_LEXER_HXX +#define CLI_LEXER_HXX + +#include +#include +#include +#include // std::size_t +#include + +#include "token.hxx" + +class Lexer +{ +public: + Lexer (std::istream& is, std::string const& id); + + Token + next (); + +protected: + class Char + { + public: + typedef std::char_traits Traits; + typedef Traits::int_type IntType; + typedef Traits::char_type CharType; + + Char (IntType v, std::size_t l, std::size_t c); + + operator CharType () const; + + IntType + value () const; + + std::size_t + line () const; + + std::size_t + column () const; + + private: + IntType v_; + std::size_t l_; + std::size_t c_; + }; + + Char + get (); + + Char + peek (); + +protected: + class invalid_input {}; + + void + skip_spaces (); + + Token + identifier (Char); + + Token + int_literal (Char, + bool neg = false, + std::size_t ml = 0, + std::size_t mc = 0); + + Token + char_literal (Char); + + Token + string_literal (Char); + + std::string + string_literal_trailer (); + + Token + path_literal (Char); + +protected: + bool + is_alpha (char c) const; + + bool + is_oct_digit (char c) const; + + bool + is_dec_digit (char c) const; + + bool + is_hex_digit (char c) const; + + bool + is_alnum (char c) const; + + bool + is_space (char c) const; + + bool + is_eos (Char const& c) const; + + char + to_upper (char c) const; + +private: + typedef std::map KeywordMap; + + std::locale loc_; + std::istream& is_; + std::string id_; + std::size_t l_; + std::size_t c_; + + KeywordMap keyword_map_; + + bool eos_; + bool include_; +}; + +#include "lexer.ixx" + +#endif // CLI_LEXER_HXX diff --git a/cli/lexer.ixx b/cli/lexer.ixx new file mode 100644 index 0000000..f7ff77e --- /dev/null +++ b/cli/lexer.ixx @@ -0,0 +1,86 @@ +// file : cli/lexer.ixx +// author : Boris Kolpackov +// copyright : Copyright (c) 2009 Code Synthesis Tools CC +// license : MIT; see accompanying LICENSE file + +// Lexer::Char +// +inline Lexer::Char:: +Char (IntType v, std::size_t l, std::size_t c) + : v_ (v), l_ (l), c_ (c) +{ +} + +inline Lexer::Char:: +operator CharType () const +{ + return Traits::to_char_type (v_); +} + +inline Lexer::Char::IntType Lexer::Char:: +value () const +{ + return v_; +} + +inline std::size_t Lexer::Char:: +line () const +{ + return l_; +} + +inline std::size_t Lexer::Char:: +column () const +{ + return c_; +} + +// Lexer +// +inline bool Lexer:: +is_alpha (char c) const +{ + return std::isalpha (c, loc_); +} + +inline bool Lexer:: +is_oct_digit (char c) const +{ + return std::isdigit (c, loc_) && c != '8' && c != '9'; +} + +inline bool Lexer:: +is_dec_digit (char c) const +{ + return std::isdigit (c, loc_); +} + +inline bool Lexer:: +is_hex_digit (char c) const +{ + return std::isxdigit (c, loc_); +} + +inline bool Lexer:: +is_alnum (char c) const +{ + return std::isalnum (c, loc_); +} + +inline bool Lexer:: +is_space (char c) const +{ + return std::isspace (c, loc_); +} + +inline bool Lexer:: +is_eos (Char const& c) const +{ + return c.value () == Char::Traits::eof (); +} + +inline char Lexer:: +to_upper (char c) const +{ + return std::toupper (c, loc_); +} -- cgit v1.1