From 1ca6396a3dd284241de11bcaa210ad5836e8e5a8 Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Tue, 8 Dec 2009 16:18:01 +0200 Subject: Multiple object model character encodings support Also add support for ISO-8859-1. --- libxsd/xsd/cxx/xml/char-iso8859-1.hxx | 72 +++++++++ libxsd/xsd/cxx/xml/char-iso8859-1.txx | 101 ++++++++++++ libxsd/xsd/cxx/xml/char-lcp.hxx | 56 +++++++ libxsd/xsd/cxx/xml/char-lcp.txx | 55 +++++++ libxsd/xsd/cxx/xml/char-utf8.hxx | 57 +++++++ libxsd/xsd/cxx/xml/char-utf8.txx | 293 +++++++++++++++++++++++++++++++++ libxsd/xsd/cxx/xml/exceptions.hxx | 20 +++ libxsd/xsd/cxx/xml/string.hxx | 9 +- libxsd/xsd/cxx/xml/string.ixx | 88 ++-------- libxsd/xsd/cxx/xml/string.txx | 294 +--------------------------------- 10 files changed, 675 insertions(+), 370 deletions(-) create mode 100644 libxsd/xsd/cxx/xml/char-iso8859-1.hxx create mode 100644 libxsd/xsd/cxx/xml/char-iso8859-1.txx create mode 100644 libxsd/xsd/cxx/xml/char-lcp.hxx create mode 100644 libxsd/xsd/cxx/xml/char-lcp.txx create mode 100644 libxsd/xsd/cxx/xml/char-utf8.hxx create mode 100644 libxsd/xsd/cxx/xml/char-utf8.txx create mode 100644 libxsd/xsd/cxx/xml/exceptions.hxx (limited to 'libxsd/xsd') diff --git a/libxsd/xsd/cxx/xml/char-iso8859-1.hxx b/libxsd/xsd/cxx/xml/char-iso8859-1.hxx new file mode 100644 index 0000000..38b633f --- /dev/null +++ b/libxsd/xsd/cxx/xml/char-iso8859-1.hxx @@ -0,0 +1,72 @@ +// file : xsd/cxx/xml/char-iso8859-1.hxx +// author : Boris Kolpackov +// copyright : Copyright (c) 2005-2009 Code Synthesis Tools CC +// license : GNU GPL v2 + exceptions; see accompanying LICENSE file + +#ifndef XSD_CXX_XML_TRANSCODER +#define XSD_CXX_XML_TRANSCODER +#define XSD_CXX_XML_TRANSCODER_CHAR_ISO8859_1 + +#include +#include // std::size_t + +#include // XMLCh + +#include // invalid_utf16_string + +namespace xsd +{ + namespace cxx + { + namespace xml + { + struct iso8859_1_unrepresentable {}; + + // UTF-16 to/from ISO-8859-1 transcoder. 
+ // + template + struct char_iso8859_1_transcoder + { + static std::basic_string + to (const XMLCh* s, std::size_t length); + + static XMLCh* + from (const C* s, std::size_t length); + + // Get/set a replacement for unrepresentable characters. If set to + // 0 (the default value), throw iso8859_1_unrepresentable instead. + // + static C + unrep_char () + { + return unrep_char_; + } + + static void + unrep_char (C c) + { + unrep_char_ = c; + } + + private: + static C unrep_char_; + }; + + typedef char_iso8859_1_transcoder char_transcoder; + } + } +} + +#include + +#else +# ifndef XSD_CXX_XML_TRANSCODER_CHAR_ISO8859_1 + // + // If you get this error, it usually means that either you compiled + // your schemas with different --char-encoding values or you included + // some of the libxsd headers (e.g., xsd/cxx/xml/string.hxx) directly + // without first including the correct xsd/cxx/xml/char-*.hxx header. + // +# error conflicting character encoding detected +# endif +#endif // XSD_CXX_XML_TRANSCODER diff --git a/libxsd/xsd/cxx/xml/char-iso8859-1.txx b/libxsd/xsd/cxx/xml/char-iso8859-1.txx new file mode 100644 index 0000000..6b20f01 --- /dev/null +++ b/libxsd/xsd/cxx/xml/char-iso8859-1.txx @@ -0,0 +1,101 @@ +// file : xsd/cxx/xml/char-iso8859-1.txx +// author : Boris Kolpackov +// copyright : Copyright (c) 2005-2009 Code Synthesis Tools CC +// license : GNU GPL v2 + exceptions; see accompanying LICENSE file + +#include + +namespace xsd +{ + namespace cxx + { + namespace xml + { + template + C char_iso8859_1_transcoder::unrep_char_ = 0; + + template + std::basic_string char_iso8859_1_transcoder:: + to (const XMLCh* s, std::size_t len) + { + const XMLCh* end (s + len); + + // Find what the resulting buffer size will be. + // + std::size_t rl (0); + unsigned int u (0); // Four byte UCS-4 char. 
+ + bool valid (true); + const XMLCh* p (s); + + for (; p < end; ++p) + { + if (*p >= 0xD800 && *p <= 0xDBFF) + { + // Make sure we have one more char and it has a valid + // value for the second char in a surrogate pair. + // + if (++p == end || !((*p >= 0xDC00) && (*p <= 0xDFFF))) + { + valid = false; + break; + } + } + + rl++; + } + + if (!valid) + throw invalid_utf16_string (); + + std::basic_string r; + r.reserve (rl + 1); + r.resize (rl); + C* rs (const_cast (r.c_str ())); + std::size_t i (0); + + p = s; + + // Tight first loop for the common case. + // + for (; p < end && *p < 0x100; ++p) + rs[i++] = C (*p); + + if (p < end && unrep_char_ == 0) + throw iso8859_1_unrepresentable (); + + for (; p < end; ++p) + { + XMLCh x (*p); + + if ((x >= 0xD800) && (x <= 0xDBFF)) + { + u = ((x - 0xD800) << 10) + (*++p - 0xDC00) + 0x10000; + } + else + u = x; + + rs[i++] = u < 0x100 ? C (u) : unrep_char_; + } + + return r; + } + + template + XMLCh* char_iso8859_1_transcoder:: + from (const C* s, std::size_t len) + { + const C* end (s + len); + + auto_array r (new XMLCh[len + 1]); + XMLCh* ir (r.get ()); + + for (const C* p (s); p < end; ++p) + *ir++ = static_cast (*p); + + *ir = XMLCh (0); + return r.release (); + } + } + } +} diff --git a/libxsd/xsd/cxx/xml/char-lcp.hxx b/libxsd/xsd/cxx/xml/char-lcp.hxx new file mode 100644 index 0000000..2c41753 --- /dev/null +++ b/libxsd/xsd/cxx/xml/char-lcp.hxx @@ -0,0 +1,56 @@ +// file : xsd/cxx/xml/char-lcp.hxx +// author : Boris Kolpackov +// copyright : Copyright (c) 2005-2009 Code Synthesis Tools CC +// license : GNU GPL v2 + exceptions; see accompanying LICENSE file + +#ifndef XSD_CXX_XML_TRANSCODER +#define XSD_CXX_XML_TRANSCODER +#define XSD_CXX_XML_TRANSCODER_CHAR_LCP + +#include +#include // std::size_t + +#include // XMLCh + +namespace xsd +{ + namespace cxx + { + namespace xml + { + // UTF-16 to/from Xerces-C++ local code page (LCP) transcoder. 
+ // + // Note that this transcoder has a custom interface due to Xerces-C++ + // idiosyncrasies. Don't use it as a base for your custom transcoder. + // + template + struct char_lcp_transcoder + { + static std::basic_string + to (const XMLCh* s); + + static std::basic_string + to (const XMLCh* s, std::size_t length); + + static XMLCh* + from (const C* s); + }; + + typedef char_lcp_transcoder char_transcoder; + } + } +} + +#include + +#else +# ifndef XSD_CXX_XML_TRANSCODER_CHAR_LCP + // + // If you get this error, it usually means that either you compiled + // your schemas with different --char-encoding values or you included + // some of the libxsd headers (e.g., xsd/cxx/xml/string.hxx) directly + // without first including the correct xsd/cxx/xml/char-*.hxx header. + // +# error conflicting character encoding detected +# endif +#endif // XSD_CXX_XML_TRANSCODER diff --git a/libxsd/xsd/cxx/xml/char-lcp.txx b/libxsd/xsd/cxx/xml/char-lcp.txx new file mode 100644 index 0000000..01bb36e --- /dev/null +++ b/libxsd/xsd/cxx/xml/char-lcp.txx @@ -0,0 +1,55 @@ +// file : xsd/cxx/xml/char-lcp.txx +// author : Boris Kolpackov +// copyright : Copyright (c) 2005-2009 Code Synthesis Tools CC +// license : GNU GPL v2 + exceptions; see accompanying LICENSE file + +#include // std::memcpy + +#include + +#include +#include + +namespace xsd +{ + namespace cxx + { + namespace xml + { + template + std::basic_string char_lcp_transcoder:: + to (const XMLCh* s) + { + std_memory_manager mm; + auto_array r ( + xercesc::XMLString::transcode (s, &mm), mm); + return std::basic_string (r.get ()); + } + + template + std::basic_string char_lcp_transcoder:: + to (const XMLCh* s, std::size_t len) + { + auto_array tmp (new XMLCh[len + 1]); + std::memcpy (tmp.get (), s, len * sizeof (XMLCh)); + tmp[len] = XMLCh (0); + + std_memory_manager mm; + auto_array r ( + xercesc::XMLString::transcode (tmp.get (), &mm), mm); + + tmp.reset (); + + return std::basic_string (r.get ()); + } + + template + XMLCh* 
char_lcp_transcoder:: + from (const C* s) + { + std_memory_manager mm; + return xercesc::XMLString::transcode (s, &mm); + } + } + } +} diff --git a/libxsd/xsd/cxx/xml/char-utf8.hxx b/libxsd/xsd/cxx/xml/char-utf8.hxx new file mode 100644 index 0000000..c255b28 --- /dev/null +++ b/libxsd/xsd/cxx/xml/char-utf8.hxx @@ -0,0 +1,57 @@ +// file : xsd/cxx/xml/char-utf8.hxx +// author : Boris Kolpackov +// copyright : Copyright (c) 2005-2009 Code Synthesis Tools CC +// license : GNU GPL v2 + exceptions; see accompanying LICENSE file + +#ifndef XSD_CXX_XML_TRANSCODER +#define XSD_CXX_XML_TRANSCODER +#define XSD_CXX_XML_TRANSCODER_CHAR_UTF8 + +#include +#include // std::size_t + +#include // XMLCh + +#include // invalid_utf16_string + +namespace xsd +{ + namespace cxx + { + namespace xml + { + struct invalid_utf8_string {}; + + // UTF-16 to/from UTF-8 transcoder. + // + template + struct char_utf8_transcoder + { + static std::basic_string + to (const XMLCh* s, std::size_t length); + + static XMLCh* + from (const C* s, std::size_t length); + + private: + static const unsigned char first_byte_mask_[5]; + }; + + typedef char_utf8_transcoder char_transcoder; + } + } +} + +#include + +#else +# ifndef XSD_CXX_XML_TRANSCODER_CHAR_UTF8 + // + // If you get this error, it usually means that either you compiled + // your schemas with different --char-encoding values or you included + // some of the libxsd headers (e.g., xsd/cxx/xml/string.hxx) directly + // without first including the correct xsd/cxx/xml/char-*.hxx header. 
+ // +# error conflicting character encoding detected +# endif +#endif // XSD_CXX_XML_TRANSCODER diff --git a/libxsd/xsd/cxx/xml/char-utf8.txx b/libxsd/xsd/cxx/xml/char-utf8.txx new file mode 100644 index 0000000..96b36a4 --- /dev/null +++ b/libxsd/xsd/cxx/xml/char-utf8.txx @@ -0,0 +1,293 @@ +// file : xsd/cxx/xml/char-utf8.txx +// author : Boris Kolpackov +// copyright : Copyright (c) 2005-2009 Code Synthesis Tools CC +// license : GNU GPL v2 + exceptions; see accompanying LICENSE file + +#include + +namespace xsd +{ + namespace cxx + { + namespace xml + { + template + const unsigned char char_utf8_transcoder::first_byte_mask_[5] = + { + 0x00, 0x00, 0xC0, 0xE0, 0xF0 + }; + + template + std::basic_string char_utf8_transcoder:: + to (const XMLCh* s, std::size_t len) + { + const XMLCh* end (s + len); + + // Find what the resulting buffer size will be. + // + std::size_t rl (0); + unsigned int u (0); // Four byte UCS-4 char. + + bool valid (true); + const XMLCh* p (s); + + for (; p < end; ++p) + { + XMLCh x (*p); + + if (x < 0xD800 || x > 0xDBFF) + u = x; + else + { + // Make sure we have one more char and it has a valid + // value for the second char in a surrogate pair. + // + if (++p == end || !((*p >= 0xDC00) && (*p <= 0xDFFF))) + { + valid = false; + break; + } + + u = ((x - 0xD800) << 10) + (*p - 0xDC00) + 0x10000; + } + + if (u < 0x80) + rl++; + else if (u < 0x800) + rl += 2; + else if (u < 0x10000) + rl += 3; + else if (u < 0x110000) + rl += 4; + else + { + valid = false; + break; + } + } + + if (!valid) + throw invalid_utf16_string (); + + std::basic_string r; + r.reserve (rl + 1); + r.resize (rl); + C* rs (const_cast (r.c_str ())); + + std::size_t i (0); + unsigned int count (0); + + p = s; + + // Tight first loop for the common case. 
+ // + for (; p < end && *p < 0x80; ++p) + rs[i++] = C (*p); + + for (; p < end; ++p) + { + XMLCh x (*p); + + if ((x >= 0xD800) && (x <= 0xDBFF)) + { + u = ((x - 0xD800) << 10) + (*++p - 0xDC00) + 0x10000; + } + else + u = x; + + if (u < 0x80) + count = 1; + else if (u < 0x800) + count = 2; + else if (u < 0x10000) + count = 3; + else if (u < 0x110000) + count = 4; + + switch(count) + { + case 4: + { + rs[i + 3] = C ((u | 0x80UL) & 0xBFUL); + u >>= 6; + } + case 3: + { + rs[i + 2] = C ((u | 0x80UL) & 0xBFUL); + u >>= 6; + } + case 2: + { + rs[i + 1] = C ((u | 0x80UL) & 0xBFUL); + u >>= 6; + } + case 1: + { + rs[i] = C (u | first_byte_mask_[count]); + } + } + + i += count; + } + + return r; + } + + template + XMLCh* char_utf8_transcoder:: + from (const C* s, std::size_t len) + { + bool valid (true); + const C* end (s + len); + + // Find what the resulting buffer size will be. + // + std::size_t rl (0); + unsigned int count (0); + + for (const C* p (s); p < end; ++p) + { + unsigned char c (*p); + + if (c < 0x80) + { + // Fast path. + // + rl += 1; + continue; + } + else if ((c >> 5) == 0x06) + count = 2; + else if ((c >> 4) == 0x0E) + count = 3; + else if ((c >> 3) == 0x1E) + count = 4; + else + { + valid = false; + break; + } + + p += count - 1; // One will be added in the for loop + + if (p + 1 > end) + { + valid = false; + break; + } + + // BMP is represented by up to 3 bytes in UTF-8. + // + rl += count > 3 ? 2 : 1; + } + + if (!valid) + throw invalid_utf8_string (); + + auto_array r (new XMLCh[rl + 1]); + XMLCh* ir (r.get ()); + + unsigned int u (0); // Four byte UCS-4 char. + + for (const C* p (s); p < end; ++p) + { + unsigned char c (*p); + + if (c < 0x80) + { + // Fast path. 
+ // + *ir++ = static_cast (c); + continue; + } + else if ((c >> 5) == 0x06) + { + // UTF-8: 110yyyyy 10zzzzzz + // Unicode: 00000yyy yyzzzzzz + // + u = (c & 0x1F) << 6; + + c = *++p; + if ((c >> 6) != 2) + { + valid = false; + break; + } + u |= c & 0x3F; + } + else if ((c >> 4) == 0x0E) + { + // UTF-8: 1110xxxx 10yyyyyy 10zzzzzz + // Unicode: xxxxyyyy yyzzzzzz + // + u = (c & 0x0F) << 6; + + c = *++p; + if ((c >> 6) != 2) + { + valid = false; + break; + } + u = (u | (c & 0x3F)) << 6; + + c = *++p; + if ((c >> 6) != 2) + { + valid = false; + break; + } + u |= c & 0x3F; + } + else if ((c >> 3) == 0x1E) + { + // UTF-8: 11110www 10xxxxxx 10yyyyyy 10zzzzzz + // Unicode: 000wwwxx xxxxyyyy yyzzzzzz + // + u = (c & 0x07) << 6; + + c = *++p; + if ((c >> 6) != 2) + { + valid = false; + break; + } + u = (u | (c & 0x3F)) << 6; + + c = *++p; + if ((c >> 6) != 2) + { + valid = false; + break; + } + u = (u | (c & 0x3F)) << 6; + + c = *++p; + if ((c >> 6) != 2) + { + valid = false; + break; + } + u |= c & 0x3F; + } + + if (u & 0xFFFF0000) + { + // Surrogate pair. 
+ // + *ir++ = static_cast (((u - 0x10000) >> 10) + 0xD800); + *ir++ = static_cast ((u & 0x3FF) + 0xDC00); + } + else + *ir++ = static_cast (u); + } + + if (!valid) + throw invalid_utf8_string (); + + *ir = XMLCh (0); + + return r.release (); + } + } + } +} diff --git a/libxsd/xsd/cxx/xml/exceptions.hxx b/libxsd/xsd/cxx/xml/exceptions.hxx new file mode 100644 index 0000000..6c2e029 --- /dev/null +++ b/libxsd/xsd/cxx/xml/exceptions.hxx @@ -0,0 +1,20 @@ +// file : xsd/cxx/xml/exceptions.hxx +// author : Boris Kolpackov +// copyright : Copyright (c) 2005-2009 Code Synthesis Tools CC +// license : GNU GPL v2 + exceptions; see accompanying LICENSE file + +#ifndef XSD_CXX_XML_EXCEPTIONS_HXX +#define XSD_CXX_XML_EXCEPTIONS_HXX + +namespace xsd +{ + namespace cxx + { + namespace xml + { + struct invalid_utf16_string {}; + } + } +} + +#endif // XSD_CXX_XML_EXCEPTIONS_HXX diff --git a/libxsd/xsd/cxx/xml/string.hxx b/libxsd/xsd/cxx/xml/string.hxx index 2d08134..ec666ee 100644 --- a/libxsd/xsd/cxx/xml/string.hxx +++ b/libxsd/xsd/cxx/xml/string.hxx @@ -7,6 +7,7 @@ #define XSD_CXX_XML_STRING_HXX #include +#include // std::size_t #include #include // XMLCh @@ -17,12 +18,6 @@ namespace xsd { namespace xml { - // - // - struct invalid_utf8_string {}; - struct invalid_utf16_string {}; - - // Transcode a null-terminated string. // template @@ -84,7 +79,7 @@ namespace xsd } } -#endif // XSD_CXX_XML_STRING_HXX +#endif // XSD_CXX_XML_STRING_HXX #include #include diff --git a/libxsd/xsd/cxx/xml/string.ixx b/libxsd/xsd/cxx/xml/string.ixx index bde86d8..056a15f 100644 --- a/libxsd/xsd/cxx/xml/string.ixx +++ b/libxsd/xsd/cxx/xml/string.ixx @@ -6,11 +6,13 @@ #ifndef XSD_CXX_XML_STRING_IXX #define XSD_CXX_XML_STRING_IXX -#include -#include // std::memcpy - #include -#include + +// If no transcoder has been included, use the default UTF-8. +// +#ifndef XSD_CXX_XML_TRANSCODER +# include +#endif // We sometimes need this functionality even if we are building for // wchar_t. 
@@ -21,43 +23,17 @@ namespace xsd { namespace xml { -#ifndef XSD_USE_LCP - namespace bits - { - // UTF-16 to/from UTF-8 transcoder. - // - template - struct char_transcoder - { - static std::basic_string - to (const XMLCh* s, std::size_t length); - - static XMLCh* - from (const C* s, std::size_t length); - - private: - static const unsigned char first_byte_mask_[5]; - }; - } -#endif - template <> inline std::basic_string transcode (const XMLCh* s) { - if (s == 0) + if (s == 0 || *s == XMLCh (0)) return std::basic_string (); -#ifndef XSD_USE_LCP - return bits::char_transcoder::to ( - s, xercesc::XMLString::stringLen (s)); +#ifndef XSD_CXX_XML_TRANSCODER_CHAR_LCP + return char_transcoder::to (s, xercesc::XMLString::stringLen (s)); #else - // Use Xerces-C++ local code page transcoding. - // - std_memory_manager mm; - auto_array r ( - xercesc::XMLString::transcode (s, &mm), mm); - return std::basic_string (r.get ()); + return char_transcoder::to (s); #endif } @@ -68,41 +44,17 @@ namespace xsd if (s == 0 || len == 0) return std::basic_string (); -#ifndef XSD_USE_LCP - // Convert UTF-16 to UTF-8 - // - return bits::char_transcoder::to (s, len); -#else - // Use Xerces-C++ local code page transcoding. - // - auto_array tmp (new XMLCh[len + 1]); - std::memcpy (tmp.get (), s, len * sizeof (XMLCh)); - tmp[len] = XMLCh (0); - - std_memory_manager mm; - auto_array r ( - xercesc::XMLString::transcode (tmp.get (), &mm), mm); - - tmp.reset (); - - return std::basic_string (r.get ()); -#endif + return char_transcoder::to (s, len); } template <> inline XMLCh* transcode_to_xmlch (const char* s) { -#ifndef XSD_USE_LCP - // Convert UTF-8 to UTF-16 - // - return bits::char_transcoder::from ( - s, std::char_traits::length (s)); +#ifndef XSD_CXX_XML_TRANSCODER_CHAR_LCP + return char_transcoder::from (s, std::char_traits::length (s)); #else - // Use Xerces-C++ local code page transcoding. 
- // - std_memory_manager mm; - return xercesc::XMLString::transcode (s, &mm); + return char_transcoder::from (s); #endif } @@ -110,16 +62,10 @@ namespace xsd inline XMLCh* transcode_to_xmlch (const std::basic_string& s) { -#ifndef XSD_USE_LCP - // Convert UTF-8 to UTF-16 - // - return bits::char_transcoder::from ( - s.c_str (), s.length ()); +#ifndef XSD_CXX_XML_TRANSCODER_CHAR_LCP + return char_transcoder::from (s.c_str (), s.length ()); #else - // Use Xerces-C++ local code page transcoding. - // - std_memory_manager mm; - return xercesc::XMLString::transcode (s.c_str (), &mm); + return char_transcoder::from (s.c_str ()); #endif } } diff --git a/libxsd/xsd/cxx/xml/string.txx b/libxsd/xsd/cxx/xml/string.txx index cdef87e..f71480e 100644 --- a/libxsd/xsd/cxx/xml/string.txx +++ b/libxsd/xsd/cxx/xml/string.txx @@ -6,306 +6,16 @@ #ifndef XSD_CXX_XML_STRING_TXX #define XSD_CXX_XML_STRING_TXX -#ifndef XSD_USE_LCP -namespace xsd -{ - namespace cxx - { - namespace xml - { - namespace bits - { - template - const unsigned char char_transcoder::first_byte_mask_[5] = - { - 0x00, 0x00, 0xC0, 0xE0, 0xF0 - }; - - template - std::basic_string char_transcoder:: - to (const XMLCh* s, std::size_t len) - { - const XMLCh* end (s + len); - - // Find what the resulting buffer size will be. - // - std::size_t rl (0); - unsigned int u (0); // Four byte UCS-4 char. - - bool valid (true); - const XMLCh* p (s); - for (; p < end; ++p) - { - XMLCh x (*p); - - if (x < 0xD800 || x > 0xDBFF) - u = x; - else - { - // Make sure we have one more char and it has a valid - // value for the second char in a surrogate pair. 
- // - if (++p == end || !((*p >= 0xDC00) && (*p <= 0xDFFF))) - { - valid = false; - break; - } - - u = ((x - 0xD800) << 10) + (*p - 0xDC00) + 0x10000; - } - - if (u < 0x80) - rl++; - else if (u < 0x800) - rl += 2; - else if (u < 0x10000) - rl += 3; - else if (u < 0x110000) - rl += 4; - else - { - valid = false; - break; - } - } - - if (!valid) - throw invalid_utf16_string (); - - std::basic_string r; - r.reserve (rl + 1); - r.resize (rl); - C* rs (const_cast (r.c_str ())); - - std::size_t i (0); - unsigned int count (0); - - p = s; - - // Tight first loop for the common case. - // - for (; p < end && *p < 0x80; ++p) - rs[i++] = C (*p); - - for (; p < end; ++p) - { - XMLCh x (*p); - - if ((x >= 0xD800) && (x <= 0xDBFF)) - { - u = ((x - 0xD800) << 10) + (*++p - 0xDC00) + 0x10000; - } - else - u = x; - - if (u < 0x80) - count = 1; - else if (u < 0x800) - count = 2; - else if (u < 0x10000) - count = 3; - else if (u < 0x110000) - count = 4; - - switch(count) - { - case 4: - { - rs[i + 3] = C ((u | 0x80UL) & 0xBFUL); - u >>= 6; - } - case 3: - { - rs[i + 2] = C ((u | 0x80UL) & 0xBFUL); - u >>= 6; - } - case 2: - { - rs[i + 1] = C ((u | 0x80UL) & 0xBFUL); - u >>= 6; - } - case 1: - { - rs[i] = C (u | first_byte_mask_[count]); - } - } - - i += count; - } - - return r; - } - - template - XMLCh* char_transcoder:: - from (const C* s, std::size_t len) - { - bool valid (true); - const C* end (s + len); - - // Find what the resulting buffer size will be. - // - std::size_t rl (0); - unsigned int count (0); - - for (const C* p (s); p < end; ++p) - { - unsigned char c (*p); - - if (c < 0x80) - { - // Fast path. - // - rl += 1; - continue; - } - else if ((c >> 5) == 0x06) - count = 2; - else if ((c >> 4) == 0x0E) - count = 3; - else if ((c >> 3) == 0x1E) - count = 4; - else - { - valid = false; - break; - } - - p += count - 1; // One will be added in the for loop - - if (p + 1 > end) - { - valid = false; - break; - } - - // BMP is represented by up to 3 code points in UTF-8. 
- // - rl += count > 3 ? 2 : 1; - } - - if (!valid) - throw invalid_utf8_string (); - - auto_array r (new XMLCh[rl + 1]); - XMLCh* ir (r.get ()); - - unsigned int u (0); // Four byte UCS-4 char. - - for (const C* p (s); p < end; ++p) - { - unsigned char c (*p); - - if (c < 0x80) - { - // Fast path. - // - *ir++ = static_cast (c); - continue; - } - else if ((c >> 5) == 0x06) - { - // UTF-8: 110yyyyy 10zzzzzz - // Unicode: 00000yyy yyzzzzzz - // - u = (c & 0x1F) << 6; - - c = *++p; - if ((c >> 6) != 2) - { - valid = false; - break; - } - u |= c & 0x3F; - } - else if ((c >> 4) == 0x0E) - { - // UTF-8: 1110xxxx 10yyyyyy 10zzzzzz - // Unicode: xxxxyyyy yyzzzzzz - // - u = (c & 0x0F) << 6; - - c = *++p; - if ((c >> 6) != 2) - { - valid = false; - break; - } - u = (u | (c & 0x3F)) << 6; - - c = *++p; - if ((c >> 6) != 2) - { - valid = false; - break; - } - u |= c & 0x3F; - } - else if ((c >> 3) == 0x1E) - { - // UTF-8: 000wwwxx xxxxyyyy yyzzzzzz - // Unicode: 11110www 10xxxxxx 10yyyyyy 10zzzzzz - // - u = (c & 0x07) << 6; - - c = *++p; - if ((c >> 6) != 2) - { - valid = false; - break; - } - u = (u | (c & 0x3F)) << 6; - - c = *++p; - if ((c >> 6) != 2) - { - valid = false; - break; - } - u = (u | (c & 0x3F)) << 6; - - c = *++p; - if ((c >> 6) != 2) - { - valid = false; - break; - } - u |= c & 0x3F; - } - - if (u & 0xFFFF0000) - { - // Surrogate pair. - // - *ir++ = static_cast (((u - 0x10000) >> 10) + 0xD800); - *ir++ = static_cast ((u & 0x3FF) + 0xDC00); - } - else - *ir++ = static_cast (u); - } - - if (!valid) - throw invalid_utf8_string (); - - *ir = XMLCh (0); - - return r.release (); - } - } - } - } -} - -#endif // XSD_USE_LCP #endif // XSD_CXX_XML_STRING_TXX - #if defined(XSD_USE_WCHAR) || !defined(XSD_USE_CHAR) #ifndef XSD_CXX_XML_STRING_TXX_WCHAR #define XSD_CXX_XML_STRING_TXX_WCHAR +#include + namespace xsd { namespace cxx -- cgit v1.1