From 96ccbbd113efbfe6a0a300a1971e4a6f12c9249c Mon Sep 17 00:00:00 2001 From: Karen Arutyunov Date: Sat, 19 Nov 2016 00:31:40 +0300 Subject: Add support for Perl regex escape sequences subset to C++11-based implementation --- cutl/details/config.hxx | 3 + cutl/re/re.cxx | 254 ++++++++++++++++++++++++++++++++++++++++++++++-- tests/re/buildfile | 1 - tests/re/driver.cxx | 82 +++++++++++++++- 4 files changed, 331 insertions(+), 9 deletions(-) diff --git a/cutl/details/config.hxx b/cutl/details/config.hxx index 40c8fba..6bb83af 100644 --- a/cutl/details/config.hxx +++ b/cutl/details/config.hxx @@ -32,6 +32,9 @@ #endif #ifdef LIBCUTL_BUILD2 +# ifndef LIBCUTL_CXX11 +# error C++ compiler does not support (enough of) C++11 +# endif # ifdef _MSC_VER # include # else diff --git a/cutl/re/re.cxx b/cutl/re/re.cxx index 2c835a2..f480f19 100644 --- a/cutl/re/re.cxx +++ b/cutl/re/re.cxx @@ -8,13 +8,28 @@ // For build2 build it is either C++11 regex or external Boost. // +// Note that some compiler/runtime combinations don't have usable C++11 +// regex. For example Clang 3.5 with libstdc++ from GCC 4.9. In this case +// you can fall back to using Boost regex by passing -DLIBCUTL_BOOST_REGEX +// preprocessor option when building libcutl. +// +// @@ Should this rather be a (custom) config.* variable? 
+// #ifdef LIBCUTL_BUILD2 -# ifdef LIBCUTL_CXX11 +# if defined(LIBCUTL_CXX11) && !defined(LIBCUTL_BOOST_REGEX) # include +# include +# include // size_t # else +# ifndef LIBCUTL_BOOST_REGEX +# define LIBCUTL_BOOST_REGEX +# endif # include # endif #else +# ifndef LIBCUTL_BOOST_REGEX +# define LIBCUTL_BOOST_REGEX +# endif # ifndef LIBCUTL_EXTERNAL_BOOST # include # else @@ -28,10 +43,10 @@ namespace cutl { namespace re { -#if defined(LIBCUTL_BUILD2) && defined(LIBCUTL_CXX11) - namespace ire = std; -#else +#ifdef LIBCUTL_BOOST_REGEX namespace ire = std::tr1; +#else + namespace ire = std; #endif // @@ -201,6 +216,233 @@ namespace cutl return ire::regex_search (s, impl_->r); } + // If we are using C++11 regex then extend the standard ECMA-262 + // substitution escape sequences with a subset of Perl sequences: + // + // \\, \u, \l, \U, \L, \E, \1, ..., \9 + // + // Notes and limitations: + // + // - The only valid regex_constants flags are match_default, + // format_first_only (format_no_copy can easily be supported). + // + // - If backslash doesn't start any of the listed sequences then it is + // silently dropped and the following character is copied as is. + // + // - The character case conversion is performed according to the global + // C++ locale (which, unless changed, is the same as C locale and + // both default to the POSIX locale aka "C"). + // + template + static basic_string + regex_replace_ex (const basic_string& s, + const ire::basic_regex& re, + const basic_string& fmt, + ire::regex_constants::match_flag_type flags) + { +#ifdef LIBCUTL_BOOST_REGEX + // Boost regex already does what we need. + // + return ire::regex_replace (s, re, fmt, flags); +#else + using string_type = basic_string; + using str_it = typename string_type::const_iterator; + using regex_it = regex_iterator; + + bool first_only ((flags & regex_constants::format_first_only) == + regex_constants::format_first_only); + + locale cl; // Copy of the global C++ locale. 
+ string_type r; + + // Beginning of the last unmatched substring. + // + str_it ub (s.begin ()); + + for (regex_it b (s.begin (), s.end (), re, flags), i (b), e; i != e; ++i) + { + const match_results& m (*i); + + // Copy the preceding unmatched substring, save the beginning of the + // one that follows. + // + r.append (ub, m.prefix ().second); + ub = m.suffix ().first; + + if (first_only && i != b) + r.append (m[0].first, m[0].second); // Append matched substring. + else + { + // The standard implementation calls m.format() here. We perform our + // own formatting. + // + // Note that we are using char type literals with the assumption + // that being ASCII characters they will be properly "widened" to + // the corresponding literals of the C template parameter type. + // + auto digit = [] (C c) -> int + { + return c >= '0' && c <= '9' ? c - '0' : -1; + }; + + enum class case_conv {none, upper, lower, upper_once, lower_once} + mode (case_conv::none); + + auto conv_chr = [&mode, &cl] (C c) -> C + { + switch (mode) + { + case case_conv::upper_once: mode = case_conv::none; + case case_conv::upper: c = toupper (c, cl); break; + case case_conv::lower_once: mode = case_conv::none; + case case_conv::lower: c = tolower (c, cl); break; + case case_conv::none: break; + } + return c; + }; + + auto append_chr = [&r, &conv_chr] (C c) + { + r.push_back (conv_chr (c)); + }; + + auto append_str = [&r, &mode, &conv_chr] (str_it b, str_it e) + { + // Optimize for the common case. + // + if (mode == case_conv::none) + r.append (b, e); + else + { + for (str_it i (b); i != e; ++i) + r.push_back (conv_chr (*i)); + } + }; + + size_t n (fmt.size ()); + for (size_t i (0); i < n; ++i) + { + C c (fmt[i]); + + switch (c) + { + case '$': + { + // Check if this is a $-based escape sequence. Interpret it + // accordingly if that's the case, treat '$' as a regular + // character otherwise. + // + c = fmt[++i]; // '\0' if last. 
+ + switch (c) + { + case '$': append_chr (c); break; + case '&': append_str (m[0].first, m[0].second); break; + case '`': + { + append_str (m.prefix ().first, m.prefix ().second); + break; + } + case '\'': + { + append_str (m.suffix ().first, m.suffix ().second); + break; + } + default: + { + // Check if this is a sub-expression 1-based index ($n or + // $nn). Append the matching substring if that's the case. + // Treat '$' as a regular character otherwise. Index + // greater than the sub-expression count is silently + // ignored. + // + int si (digit (c)); + if (si >= 0) + { + int d; + if ((d = digit (fmt[i + 1])) >= 0) // '\0' if last. + { + si = si * 10 + d; + ++i; + } + } + + if (si > 0) + { + // m[0] refers to the matched substring. + // + if (static_cast (si) < m.size ()) + append_str (m[si].first, m[si].second); + } + else + { + // Not a $-based escape sequence so treat '$' as a + // regular character. + // + --i; + append_chr ('$'); + } + + break; + } + } + + break; + } + case '\\': + { + c = fmt[++i]; // '\0' if last. + + switch (c) + { + case '\\': append_chr (c); break; + + case 'u': mode = case_conv::upper_once; break; + case 'l': mode = case_conv::lower_once; break; + case 'U': mode = case_conv::upper; break; + case 'L': mode = case_conv::lower; break; + case 'E': mode = case_conv::none; break; + default: + { + // Check if this is a sub-expression 1-based index. Append + // the matching substring if that's the case, Skip '\\' + // otherwise. Index greater than the sub-expression count + // is silently ignored. + // + int si (digit (c)); + if (si > 0) + { + // m[0] refers to the matched substring. + // + if (static_cast (si) < m.size ()) + append_str (m[si].first, m[si].second); + } + else + --i; + + break; + } + } + + break; + } + default: + { + // Append a regular character. + // + append_chr (c); + break; + } + } + } + } + } + + r.append (ub, s.end ()); // Append the rightmost non-matched substring. 
+ return r; +#endif + } + template <> LIBCUTL_EXPORT string basic_regex:: replace (string_type const& s, @@ -213,7 +455,7 @@ namespace cutl if (first_only) f |= ire::regex_constants::format_first_only; - return ire::regex_replace (s, impl_->r, sub, f); + return regex_replace_ex (s, impl_->r, sub, f); } template <> @@ -228,7 +470,7 @@ namespace cutl if (first_only) f |= ire::regex_constants::format_first_only; - return ire::regex_replace (s, impl_->r, sub, f); + return regex_replace_ex (s, impl_->r, sub, f); } } } diff --git a/tests/re/buildfile b/tests/re/buildfile index d31eb62..80d077a 100644 --- a/tests/re/buildfile +++ b/tests/re/buildfile @@ -5,4 +5,3 @@ import libs = libcutl%lib{cutl} exe{driver}: cxx{driver} $libs -exe{driver}: test = false # @@ TMP, fails. diff --git a/tests/re/driver.cxx b/tests/re/driver.cxx index 4b070dd..c8b06d2 100644 --- a/tests/re/driver.cxx +++ b/tests/re/driver.cxx @@ -51,6 +51,84 @@ main () assert (r.replace ("'foofoxbar'", "\\u$1") == "Fox"); } + // replace() using escape sequences. + // + { + regex r ("([aA][bB][cC])"); + + // $-based escape sequences. + // + assert (r.replace ("xabcyz", "v$") == "xv$yz"); + assert (r.replace ("xabcyz", "v$d") == "xv$dyz"); + assert (r.replace ("xabcyz", "v$1d") == "xvabcdyz"); + assert (r.replace ("xabcyabcz", "v$2d") == "xvdyvdz"); + assert (r.replace ("xabcyz", "v$&d") == "xvabcdyz"); + assert (r.replace ("xabcyz", "$`$$$\'") == "xx$yzyz"); + + { + regex r ("(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)"); + + assert (r.replace ("xabcdefghijy", "$10$9$8$7$6$5$4$3$2$1") == + "xjihgfedcbay"); + } + + // \-based escape sequences. 
+ // + assert (r.replace ("xabcyz", "v\\d") == "xvdyz"); + assert (r.replace ("xabcyz", "v\\1d") == "xvabcdyz"); + assert (r.replace ("xabcyabcz", "v\\2d") == "xvdyvdz"); + assert (r.replace ("xabcyz", "v\\\\d") == "xv\\dyz"); + assert (r.replace ("xabcyz", "v\\$d") == "xv$dyz"); + + { + regex r ("(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)"); + + assert (r.replace ("xabcdefghijy", "\\10\\9\\8\\7\\6\\5\\4\\3\\2\\1") == + "xa0ihgfedcbay"); + } + + assert (r.replace ("xabcyz", "\\u") == "xyz"); + assert (r.replace ("xabcyz", "\\uv") == "xVyz"); + assert (r.replace ("xabcyz", "\\u\\1") == "xAbcyz"); + assert (r.replace ("xabcyz", "\\lV") == "xvyz"); + assert (r.replace ("xAbcyz", "\\l\\1") == "xabcyz"); + + assert (r.replace ("xabcyz", "\\U") == "xyz"); + assert (r.replace ("xabcyz", "\\Uv") == "xVyz"); + assert (r.replace ("xabcyz", "\\U\\1v") == "xABCVyz"); + assert (r.replace ("xabcyz", "\\U\\1\\Ev") == "xABCvyz"); + + assert (r.replace ("xabcyz", "\\L") == "xyz"); + assert (r.replace ("xabcyz", "\\LV") == "xvyz"); + assert (r.replace ("xABCyz", "\\L\\1V") == "xabcvyz"); + assert (r.replace ("xabcyz", "\\L\\1\\EV") == "xabcVyz"); + + assert (r.replace ("xabcyz", "\\Uv\\LV") == "xVvyz"); + assert (r.replace ("xabcyz", "\\U\\1\\LV") == "xABCvyz"); + + { + regex r ("(b?)-"); + assert (r.replace ("a-b-", "\\u\\1x") == "aXBx"); + } + } + + // wregex::replace(). 
+ // + { + { + wregex r (L"['`]foo([^ ]*)bar['`]"); + assert (r.replace (L"'foofoxbar'", L"\\u$1") == L"Fox"); + } +/* + { + std::locale::global (std::locale ("en_US.utf8")); + + wregex r (L"(a)"); + assert (r.replace (L"a", L"\\l\u0190") == L"\u025b"); + } +*/ + } + // regexsub // { @@ -71,12 +149,12 @@ main () // try { - regexsub r ("/['`]foo([^ ]*)bar['`]#\\u$1/"); + regexsub r ("/['`]foo([^ ]*)bar['`]#$1/"); assert (false); } catch (format const& e) { - assert (e.regex () == "/['`]foo([^ ]*)bar['`]#\\u$1/"); + assert (e.regex () == "/['`]foo([^ ]*)bar['`]#$1/"); assert (!e.description ().empty ()); //std::cerr << e.description () << std::endl; } -- cgit v1.1