diff options
Diffstat (limited to 'libcutl/re/re.cxx')
-rw-r--r-- | libcutl/re/re.cxx | 434 |
1 files changed, 434 insertions, 0 deletions
diff --git a/libcutl/re/re.cxx b/libcutl/re/re.cxx new file mode 100644 index 0000000..0bcbe96 --- /dev/null +++ b/libcutl/re/re.cxx @@ -0,0 +1,434 @@ +// file : libcutl/re/re.cxx +// license : MIT; see accompanying LICENSE file + +#include <libcutl/re.hxx> + +#include <regex> +#include <locale> +#include <cstddef> // size_t + +using namespace std; + +namespace cutl +{ + namespace re + { + // + // format_base + // + + char const* format_base:: + what () const noexcept + { + return description_.c_str (); + } + + // + // basic_regex + // + template <typename C> + struct basic_regex<C>::impl + { + typedef basic_string<C> string_type; + typedef std::basic_regex<C> regex_type; + typedef typename regex_type::flag_type flag_type; + + impl () {} + impl (regex_type const& r): r (r) {} + impl (string_type const& s, bool icase) + { + flag_type f (std::regex_constants::ECMAScript); + + if (icase) + f |= std::regex_constants::icase; + + r.assign (s, f); + } + + regex_type r; + }; + + template <> + LIBCUTL_EXPORT basic_regex<char>:: + ~basic_regex () + { + delete impl_; + } + + template <> + LIBCUTL_EXPORT basic_regex<wchar_t>:: + ~basic_regex () + { + delete impl_; + } + + template <> + LIBCUTL_EXPORT basic_regex<char>:: + basic_regex (basic_regex const& r) + : str_ (r.str_), impl_ (new impl (r.impl_->r)) + { + } + + template <> + LIBCUTL_EXPORT basic_regex<wchar_t>:: + basic_regex (basic_regex const& r) + : str_ (r.str_), impl_ (new impl (r.impl_->r)) + { + } + + template <> + LIBCUTL_EXPORT basic_regex<char>& basic_regex<char>:: + operator= (basic_regex const& r) + { + string_type tmp (r.str_); + impl_->r = r.impl_->r; + str_.swap (tmp); + return *this; + } + + template <> + LIBCUTL_EXPORT basic_regex<wchar_t>& basic_regex<wchar_t>:: + operator= (basic_regex const& r) + { + string_type tmp (r.str_); + impl_->r = r.impl_->r; + str_.swap (tmp); + return *this; + } + + template <> + LIBCUTL_EXPORT void basic_regex<char>:: + init (string_type const* s, bool icase) + { + string_type tmp (s == 0 ? string_type () : *s); + + try + { + if (impl_ == 0) + impl_ = s == 0 ? new impl : new impl (*s, icase); + else + { + impl::flag_type f (std::regex_constants::ECMAScript); + + if (icase) + f |= std::regex_constants::icase; + + impl_->r.assign (*s, f); + } + } + catch (std::regex_error const& e) + { + throw basic_format<char> (s == 0 ? "" : *s, e.what ()); + } + + str_.swap (tmp); + } + + template <> + LIBCUTL_EXPORT void basic_regex<wchar_t>:: + init (string_type const* s, bool icase) + { + string_type tmp (s == 0 ? string_type () : *s); + + try + { + if (impl_ == 0) + impl_ = s == 0 ? new impl : new impl (*s, icase); + else + { + impl::flag_type f (std::regex_constants::ECMAScript); + + if (icase) + f |= std::regex_constants::icase; + + impl_->r.assign (*s, f); + } + } + catch (std::regex_error const& e) + { + throw basic_format<wchar_t> (s == 0 ? L"" : *s, e.what ()); + } + + str_.swap (tmp); + } + + template <> + LIBCUTL_EXPORT bool basic_regex<char>:: + match (string_type const& s) const + { + return std::regex_match (s, impl_->r); + } + + template <> + LIBCUTL_EXPORT bool basic_regex<wchar_t>:: + match (string_type const& s) const + { + return std::regex_match (s, impl_->r); + } + + template <> + LIBCUTL_EXPORT bool basic_regex<char>:: + search (string_type const& s) const + { + return std::regex_search (s, impl_->r); + } + + template <> + LIBCUTL_EXPORT bool basic_regex<wchar_t>:: + search (string_type const& s) const + { + return std::regex_search (s, impl_->r); + } + + // If we are using C++11 regex then extend the standard ECMA-262 + // substitution escape sequences with a subset of Perl sequences: + // + // \\, \u, \l, \U, \L, \E, \1, ..., \9 + // + // Notes and limitations: + // + // - The only valid regex_constants flags are match_default, + // format_first_only (format_no_copy can easily be supported). + // + // - If backslash doesn't start any of the listed sequences then it is + // silently dropped and the following character is copied as is. + // + // - The character case conversion is performed according to the global + // C++ locale (which is, unless changed, is the same as C locale and + // both default to the POSIX locale aka "C"). + // + template <typename C> + static basic_string<C> + regex_replace_ex (const basic_string<C>& s, + const std::basic_regex<C>& re, + const basic_string<C>& fmt, + std::regex_constants::match_flag_type flags) + { + using string_type = basic_string<C>; + using str_it = typename string_type::const_iterator; + using regex_it = regex_iterator<str_it>; + + bool first_only ((flags & regex_constants::format_first_only) == + regex_constants::format_first_only); + + locale cl; // Copy of the global C++ locale. + string_type r; + + // Beginning of the last unmatched substring. + // + str_it ub (s.begin ()); + + for (regex_it b (s.begin (), s.end (), re, flags), i (b), e; i != e; ++i) + { + const match_results<str_it>& m (*i); + + // Copy the preceeding unmatched substring, save the beginning of the + // one that follows. + // + r.append (ub, m.prefix ().second); + ub = m.suffix ().first; + + if (first_only && i != b) + r.append (m[0].first, m[0].second); // Append matched substring. + else + { + // The standard implementation calls m.format() here. We perform our + // own formatting. + // + // Note that we are using char type literals with the assumption + // that being ASCII characters they will be properly "widened" to + // the corresponding literals of the C template parameter type. + // + auto digit = [] (C c) -> int + { + return c >= '0' && c <= '9' ? c - '0' : -1; + }; + + enum class case_conv {none, upper, lower, upper_once, lower_once} + mode (case_conv::none); + + auto conv_chr = [&mode, &cl] (C c) -> C + { + switch (mode) + { + case case_conv::upper_once: mode = case_conv::none; // Fall through. + case case_conv::upper: c = toupper (c, cl); break; + case case_conv::lower_once: mode = case_conv::none; // Fall through. + case case_conv::lower: c = tolower (c, cl); break; + case case_conv::none: break; + } + return c; + }; + + auto append_chr = [&r, &conv_chr] (C c) + { + r.push_back (conv_chr (c)); + }; + + auto append_str = [&r, &mode, &conv_chr] (str_it b, str_it e) + { + // Optimize for the common case. + // + if (mode == case_conv::none) + r.append (b, e); + else + { + for (str_it i (b); i != e; ++i) + r.push_back (conv_chr (*i)); + } + }; + + size_t n (fmt.size ()); + for (size_t i (0); i < n; ++i) + { + C c (fmt[i]); + + switch (c) + { + case '$': + { + // Check if this is a $-based escape sequence. Interpret it + // accordingly if that's the case, treat '$' as a regular + // character otherwise. + // + c = fmt[++i]; // '\0' if last. + + switch (c) + { + case '$': append_chr (c); break; + case '&': append_str (m[0].first, m[0].second); break; + case '`': + { + append_str (m.prefix ().first, m.prefix ().second); + break; + } + case '\'': + { + append_str (m.suffix ().first, m.suffix ().second); + break; + } + default: + { + // Check if this is a sub-expression 1-based index ($n or + // $nn). Append the matching substring if that's the case. + // Treat '$' as a regular character otherwise. Index + // greater than the sub-expression count is silently + // ignored. + // + int si (digit (c)); + if (si >= 0) + { + int d; + if ((d = digit (fmt[i + 1])) >= 0) // '\0' if last. + { + si = si * 10 + d; + ++i; + } + } + + if (si > 0) + { + // m[0] refers to the matched substring. + // + if (static_cast<size_t> (si) < m.size ()) + append_str (m[si].first, m[si].second); + } + else + { + // Not a $-based escape sequence so treat '$' as a + // regular character. + // + --i; + append_chr ('$'); + } + + break; + } + } + + break; + } + case '\\': + { + c = fmt[++i]; // '\0' if last. + + switch (c) + { + case '\\': append_chr (c); break; + + case 'u': mode = case_conv::upper_once; break; + case 'l': mode = case_conv::lower_once; break; + case 'U': mode = case_conv::upper; break; + case 'L': mode = case_conv::lower; break; + case 'E': mode = case_conv::none; break; + default: + { + // Check if this is a sub-expression 1-based index. Append + // the matching substring if that's the case, Skip '\\' + // otherwise. Index greater than the sub-expression count + // is silently ignored. + // + int si (digit (c)); + if (si > 0) + { + // m[0] refers to the matched substring. + // + if (static_cast<size_t> (si) < m.size ()) + append_str (m[si].first, m[si].second); + } + else + --i; + + break; + } + } + + break; + } + default: + { + // Append a regular character. + // + append_chr (c); + break; + } + } + } + } + } + + r.append (ub, s.end ()); // Append the rightmost non-matched substring. + return r; + } + + template <> + LIBCUTL_EXPORT string basic_regex<char>:: + replace (string_type const& s, + string_type const& sub, + bool first_only) const + { + std::regex_constants::match_flag_type f ( + std::regex_constants::format_default); + + if (first_only) + f |= std::regex_constants::format_first_only; + + return regex_replace_ex (s, impl_->r, sub, f); + } + + template <> + LIBCUTL_EXPORT wstring basic_regex<wchar_t>:: + replace (string_type const& s, + string_type const& sub, + bool first_only) const + { + std::regex_constants::match_flag_type f ( + std::regex_constants::format_default); + + if (first_only) + f |= std::regex_constants::format_first_only; + + return regex_replace_ex (s, impl_->r, sub, f); + } + } +} |