From 1ca6396a3dd284241de11bcaa210ad5836e8e5a8 Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Tue, 8 Dec 2009 16:18:01 +0200 Subject: Multiple object model character encodings support Also add support for ISO-8859-1. --- libxsd/xsd/cxx/xml/string.txx | 294 +----------------------------------------- 1 file changed, 2 insertions(+), 292 deletions(-) (limited to 'libxsd/xsd/cxx/xml/string.txx') diff --git a/libxsd/xsd/cxx/xml/string.txx b/libxsd/xsd/cxx/xml/string.txx index cdef87e..f71480e 100644 --- a/libxsd/xsd/cxx/xml/string.txx +++ b/libxsd/xsd/cxx/xml/string.txx @@ -6,306 +6,16 @@ #ifndef XSD_CXX_XML_STRING_TXX #define XSD_CXX_XML_STRING_TXX -#ifndef XSD_USE_LCP -namespace xsd -{ - namespace cxx - { - namespace xml - { - namespace bits - { - template - const unsigned char char_transcoder::first_byte_mask_[5] = - { - 0x00, 0x00, 0xC0, 0xE0, 0xF0 - }; - - template - std::basic_string char_transcoder:: - to (const XMLCh* s, std::size_t len) - { - const XMLCh* end (s + len); - - // Find what the resulting buffer size will be. - // - std::size_t rl (0); - unsigned int u (0); // Four byte UCS-4 char. - - bool valid (true); - const XMLCh* p (s); - for (; p < end; ++p) - { - XMLCh x (*p); - - if (x < 0xD800 || x > 0xDBFF) - u = x; - else - { - // Make sure we have one more char and it has a valid - // value for the second char in a surrogate pair. - // - if (++p == end || !((*p >= 0xDC00) && (*p <= 0xDFFF))) - { - valid = false; - break; - } - - u = ((x - 0xD800) << 10) + (*p - 0xDC00) + 0x10000; - } - - if (u < 0x80) - rl++; - else if (u < 0x800) - rl += 2; - else if (u < 0x10000) - rl += 3; - else if (u < 0x110000) - rl += 4; - else - { - valid = false; - break; - } - } - - if (!valid) - throw invalid_utf16_string (); - - std::basic_string r; - r.reserve (rl + 1); - r.resize (rl); - C* rs (const_cast (r.c_str ())); - - std::size_t i (0); - unsigned int count (0); - - p = s; - - // Tight first loop for the common case. - // - for (; p < end && *p < 0x80; ++p) - rs[i++] = C (*p); - - for (; p < end; ++p) - { - XMLCh x (*p); - - if ((x >= 0xD800) && (x <= 0xDBFF)) - { - u = ((x - 0xD800) << 10) + (*++p - 0xDC00) + 0x10000; - } - else - u = x; - - if (u < 0x80) - count = 1; - else if (u < 0x800) - count = 2; - else if (u < 0x10000) - count = 3; - else if (u < 0x110000) - count = 4; - - switch(count) - { - case 4: - { - rs[i + 3] = C ((u | 0x80UL) & 0xBFUL); - u >>= 6; - } - case 3: - { - rs[i + 2] = C ((u | 0x80UL) & 0xBFUL); - u >>= 6; - } - case 2: - { - rs[i + 1] = C ((u | 0x80UL) & 0xBFUL); - u >>= 6; - } - case 1: - { - rs[i] = C (u | first_byte_mask_[count]); - } - } - - i += count; - } - - return r; - } - - template - XMLCh* char_transcoder:: - from (const C* s, std::size_t len) - { - bool valid (true); - const C* end (s + len); - - // Find what the resulting buffer size will be. - // - std::size_t rl (0); - unsigned int count (0); - - for (const C* p (s); p < end; ++p) - { - unsigned char c (*p); - - if (c < 0x80) - { - // Fast path. - // - rl += 1; - continue; - } - else if ((c >> 5) == 0x06) - count = 2; - else if ((c >> 4) == 0x0E) - count = 3; - else if ((c >> 3) == 0x1E) - count = 4; - else - { - valid = false; - break; - } - - p += count - 1; // One will be added in the for loop - - if (p + 1 > end) - { - valid = false; - break; - } - - // BMP is represented by up to 3 code points in UTF-8. - // - rl += count > 3 ? 2 : 1; - } - - if (!valid) - throw invalid_utf8_string (); - - auto_array r (new XMLCh[rl + 1]); - XMLCh* ir (r.get ()); - - unsigned int u (0); // Four byte UCS-4 char. - - for (const C* p (s); p < end; ++p) - { - unsigned char c (*p); - - if (c < 0x80) - { - // Fast path. - // - *ir++ = static_cast (c); - continue; - } - else if ((c >> 5) == 0x06) - { - // UTF-8: 110yyyyy 10zzzzzz - // Unicode: 00000yyy yyzzzzzz - // - u = (c & 0x1F) << 6; - - c = *++p; - if ((c >> 6) != 2) - { - valid = false; - break; - } - u |= c & 0x3F; - } - else if ((c >> 4) == 0x0E) - { - // UTF-8: 1110xxxx 10yyyyyy 10zzzzzz - // Unicode: xxxxyyyy yyzzzzzz - // - u = (c & 0x0F) << 6; - - c = *++p; - if ((c >> 6) != 2) - { - valid = false; - break; - } - u = (u | (c & 0x3F)) << 6; - - c = *++p; - if ((c >> 6) != 2) - { - valid = false; - break; - } - u |= c & 0x3F; - } - else if ((c >> 3) == 0x1E) - { - // UTF-8: 000wwwxx xxxxyyyy yyzzzzzz - // Unicode: 11110www 10xxxxxx 10yyyyyy 10zzzzzz - // - u = (c & 0x07) << 6; - - c = *++p; - if ((c >> 6) != 2) - { - valid = false; - break; - } - u = (u | (c & 0x3F)) << 6; - - c = *++p; - if ((c >> 6) != 2) - { - valid = false; - break; - } - u = (u | (c & 0x3F)) << 6; - - c = *++p; - if ((c >> 6) != 2) - { - valid = false; - break; - } - u |= c & 0x3F; - } - - if (u & 0xFFFF0000) - { - // Surrogate pair. - // - *ir++ = static_cast (((u - 0x10000) >> 10) + 0xD800); - *ir++ = static_cast ((u & 0x3FF) + 0xDC00); - } - else - *ir++ = static_cast (u); - } - - if (!valid) - throw invalid_utf8_string (); - - *ir = XMLCh (0); - - return r.release (); - } - } - } - } -} - -#endif // XSD_USE_LCP #endif // XSD_CXX_XML_STRING_TXX - #if defined(XSD_USE_WCHAR) || !defined(XSD_USE_CHAR) #ifndef XSD_CXX_XML_STRING_TXX_WCHAR #define XSD_CXX_XML_STRING_TXX_WCHAR +#include + namespace xsd { namespace cxx -- cgit v1.1