From 1ca6396a3dd284241de11bcaa210ad5836e8e5a8 Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Tue, 8 Dec 2009 16:18:01 +0200 Subject: Multiple object model character encodings support Also add support for ISO-8859-1. --- NEWS | 9 + documentation/custom-literals.xsd | 49 ++++ documentation/cxx/parser/guide/index.xhtml | 40 +-- documentation/cxx/tree/guide/index.xhtml | 25 +- documentation/cxx/tree/manual/index.xhtml | 18 +- documentation/makefile | 2 + documentation/xsd.1 | 39 +++ documentation/xsd.xhtml | 27 ++ libxsd/xsd/cxx/xml/char-iso8859-1.hxx | 72 +++++ libxsd/xsd/cxx/xml/char-iso8859-1.txx | 101 +++++++ libxsd/xsd/cxx/xml/char-lcp.hxx | 56 ++++ libxsd/xsd/cxx/xml/char-lcp.txx | 55 ++++ libxsd/xsd/cxx/xml/char-utf8.hxx | 57 ++++ libxsd/xsd/cxx/xml/char-utf8.txx | 293 ++++++++++++++++++++ libxsd/xsd/cxx/xml/exceptions.hxx | 20 ++ libxsd/xsd/cxx/xml/string.hxx | 9 +- libxsd/xsd/cxx/xml/string.ixx | 88 ++---- libxsd/xsd/cxx/xml/string.txx | 294 +------------------- tests/cxx/tree/encoding/char/iso-8859-1/driver.cxx | 76 ++++++ tests/cxx/tree/encoding/char/iso-8859-1/makefile | 83 ++++++ tests/cxx/tree/encoding/char/iso-8859-1/test.std | 18 ++ tests/cxx/tree/encoding/char/iso-8859-1/test.xml | 14 + tests/cxx/tree/encoding/char/iso-8859-1/test.xsd | 31 +++ tests/cxx/tree/encoding/char/lcp/driver.cxx | 2 +- tests/cxx/tree/encoding/char/lcp/makefile | 4 +- tests/cxx/tree/encoding/char/makefile | 2 +- tests/cxx/tree/encoding/char/utf-8/makefile | 2 +- xsd/cxx/elements.cxx | 275 ++++++++++++++++++- xsd/cxx/elements.hxx | 39 ++- xsd/cxx/literal-map.cxx | 296 +++++++++++++++++++++ xsd/cxx/literal-map.hxx | 23 ++ xsd/cxx/parser/cli.hxx | 2 + xsd/cxx/parser/elements.cxx | 3 + xsd/cxx/parser/elements.hxx | 1 + xsd/cxx/parser/generator.cxx | 82 +++++- xsd/cxx/parser/generator.hxx | 2 + xsd/cxx/parser/name-processor.cxx | 16 +- xsd/cxx/parser/name-processor.hxx | 6 +- xsd/cxx/parser/parser-header.cxx | 7 + xsd/cxx/parser/validator.cxx | 17 +- xsd/cxx/tree/cli.hxx | 2 + 
xsd/cxx/tree/counter.cxx | 2 +- xsd/cxx/tree/elements.cxx | 3 + xsd/cxx/tree/elements.hxx | 1 + xsd/cxx/tree/generator.cxx | 73 ++++- xsd/cxx/tree/generator.hxx | 2 + xsd/cxx/tree/name-processor.cxx | 15 +- xsd/cxx/tree/name-processor.hxx | 9 +- xsd/cxx/tree/tree-forward.cxx | 7 + xsd/cxx/tree/tree-header.cxx | 14 + xsd/cxx/tree/validator.cxx | 1 + xsd/elements.hxx | 1 - xsd/makefile | 20 +- xsd/xsd.cxx | 70 ++++- 54 files changed, 2013 insertions(+), 462 deletions(-) create mode 100644 documentation/custom-literals.xsd create mode 100644 libxsd/xsd/cxx/xml/char-iso8859-1.hxx create mode 100644 libxsd/xsd/cxx/xml/char-iso8859-1.txx create mode 100644 libxsd/xsd/cxx/xml/char-lcp.hxx create mode 100644 libxsd/xsd/cxx/xml/char-lcp.txx create mode 100644 libxsd/xsd/cxx/xml/char-utf8.hxx create mode 100644 libxsd/xsd/cxx/xml/char-utf8.txx create mode 100644 libxsd/xsd/cxx/xml/exceptions.hxx create mode 100644 tests/cxx/tree/encoding/char/iso-8859-1/driver.cxx create mode 100644 tests/cxx/tree/encoding/char/iso-8859-1/makefile create mode 100644 tests/cxx/tree/encoding/char/iso-8859-1/test.std create mode 100644 tests/cxx/tree/encoding/char/iso-8859-1/test.xml create mode 100644 tests/cxx/tree/encoding/char/iso-8859-1/test.xsd create mode 100644 xsd/cxx/literal-map.cxx create mode 100644 xsd/cxx/literal-map.hxx diff --git a/NEWS b/NEWS index 9472972..dfbb46d 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,14 @@ Version 3.3.0 + * New option, --char-encoding, allows to specify the character encoding + that should be used in the object model. Valid values for the 'char' + character type are 'utf8' (default), 'iso8859-1' (new), 'lcp' (Xerces-C++ + local code page), and 'custom' (allows to support custom encodings). Note + that if you use a non-default character encoding and include some libxsd + headers (e.g., xsd/cxx/xml/string.hxx) directly, then you will need to + first include the correct xsd/cxx/xml/char-ENC.hxx header, where + ENC is iso8859-1, lcp, etc. 
This mechanism replaces the XSD_USE_LCP macro. + * When the XSD compiler is built with Xerces-C++ 3.1.0 or later, enable handling of multiple imports for the same namespace. Before all subsequent imports for a namespace were ignored which caused errors diff --git a/documentation/custom-literals.xsd b/documentation/custom-literals.xsd new file mode 100644 index 0000000..ab2d649 --- /dev/null +++ b/documentation/custom-literals.xsd @@ -0,0 +1,49 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/documentation/cxx/parser/guide/index.xhtml b/documentation/cxx/parser/guide/index.xhtml index 7379c96..9653e37 100644 --- a/documentation/cxx/parser/guide/index.xhtml +++ b/documentation/cxx/parser/guide/index.xhtml @@ -280,7 +280,7 @@ 5Mapping Configuration - + @@ -1615,8 +1615,8 @@ namespace http://www.example.com/xmlns/my following map files. The string-based XML Schema types are mapped to either std::string or std::wstring depending on the character type - selected (see Section 5.1, "Character Type" for - more information).

+ selected (see Section 5.1, "Character Type and + Encoding" for more information).

 namespace http://www.w3.org/2001/XMLSchema
@@ -1909,7 +1909,7 @@ age:    28
      Compiler Command Line Manual.
   

-

5.1 Character Type

+

5.1 Character Type and Encoding

The C++/Parser mapping has built-in support for two character types: char and wchar_t. You can select the @@ -1921,15 +1921,24 @@ age: 28

Another aspect of the mapping that depends on the character type is character encoding. For the char character type - the encoding is UTF-8. For the wchar_t character type - the encoding is automatically selected between UTF-16 and - UTF-32/UCS-4 depending on the size of the wchar_t type. - On some platforms (for example, Windows with Visual C++ and AIX with IBM XL - C++) wchar_t is 2 bytes long. For these platforms the + the default encoding is UTF-8. Other supported encodings are + ISO-8859-1, Xerces-C++ Local Code Page (LCP), as well as + custom encodings. You can select which encoding should be used + in the object model with the --char-encoding command + line option.

+ +

For the wchar_t character type the encoding is + automatically selected between UTF-16 and UTF-32/UCS-4 depending + on the size of the wchar_t type. On some platforms + (for example, Windows with Visual C++ and AIX with IBM XL C++) + wchar_t is 2 bytes long. For these platforms the encoding is UTF-16. On other platforms wchar_t is 4 bytes - long and UTF-32/UCS-4 is used. -

+ long and UTF-32/UCS-4 is used.

+

Note also that the character encoding that is used in the object model + is independent of the encodings used in input and output XML. In fact, + all three (object model, input XML, and output XML) can have different + encodings.

5.2 Underlying XML Parser

@@ -3306,7 +3315,7 @@ namespace xml_schema document type has the following interface. Note that if the character type is wchar_t, then the string type in the interface becomes std::wstring - (see Section 5.1, "Character Type").

+ (see Section 5.1, "Character Type and Encoding").

 namespace xml_schema
@@ -3601,7 +3610,7 @@ namespace xml_schema
      document type has the following interface. Note that
      if the character type is wchar_t, then the string type
      in the interface becomes std::wstring
-     (see Section 5.1, "Character Type").

+ (see Section 5.1, "Character Type and Encoding").

 namespace xml_schema
@@ -3886,7 +3895,8 @@ main (int argc, char* argv[])
      character type is wchar_t, then the string type
      and output stream type in the definition become
      std::wstring and std::wostream,
-     respectively (see Section 5.1, "Character Type").

+ respectively (see Section 5.1, "Character Type + and Encoding").

 namespace xml_schema
@@ -3998,7 +4008,7 @@ main (int argc, char* argv[])
      listing presents the definition of the error_handler
      interface. Note that if the character type is wchar_t,
      then the string type in the interface becomes std::wstring
-     (see Section 5.1, "Character Type").

+ (see Section 5.1, "Character Type and Encoding").

 namespace xml_schema
diff --git a/documentation/cxx/tree/guide/index.xhtml b/documentation/cxx/tree/guide/index.xhtml
index 787610a..f96b09b 100644
--- a/documentation/cxx/tree/guide/index.xhtml
+++ b/documentation/cxx/tree/guide/index.xhtml
@@ -226,7 +226,7 @@
     
5.1Character Type
5.1Character Type and Encoding
5.2Underlying XML Parser
5.3XML Schema Validation
5.4Support for Polymorphism
3Overall Mapping Configuration - + @@ -1148,7 +1148,7 @@ $ doxygen hello.doxygen Compiler Command Line Manual.

-

3.1 Character Type

+

3.1 Character Type and Encoding

The C++/Tree mapping has built-in support for two character types: char and wchar_t. You can select the @@ -1160,14 +1160,25 @@ $ doxygen hello.doxygen

Another aspect of the mapping that depends on the character type is character encoding. For the char character type - the encoding is UTF-8. For the wchar_t character type - the encoding is automatically selected between UTF-16 and - UTF-32/UCS-4 depending on the size of the wchar_t type. - On some platforms (for example, Windows with Visual C++ and AIX with IBM XL - C++) wchar_t is 2 bytes long. For these platforms the + the default encoding is UTF-8. Other supported encodings are + ISO-8859-1, Xerces-C++ Local Code Page (LCP), as well as + custom encodings. You can select which encoding should be used + in the object model with the --char-encoding command + line option.

+ +

For the wchar_t character type the encoding is + automatically selected between UTF-16 and UTF-32/UCS-4 depending + on the size of the wchar_t type. On some platforms + (for example, Windows with Visual C++ and AIX with IBM XL C++) + wchar_t is 2 bytes long. For these platforms the encoding is UTF-16. On other platforms wchar_t is 4 bytes long and UTF-32/UCS-4 is used.

+

Note also that the character encoding that is used in the object model + is independent of the encodings used in input and output XML. In fact, + all three (object model, input XML, and output XML) can have different + encodings.

+

3.2 Support for Polymorphism

By default XSD generates non-polymorphic code. If your vocabulary diff --git a/documentation/cxx/tree/manual/index.xhtml b/documentation/cxx/tree/manual/index.xhtml index d468fe3..91c6154 100644 --- a/documentation/cxx/tree/manual/index.xhtml +++ b/documentation/cxx/tree/manual/index.xhtml @@ -226,7 +226,7 @@

3.1Character Type
3.1Character Type and Encoding
3.2Support for Polymorphism
3.3Namespace Mapping
3.4Thread Safety
2.1Preliminary Information - +
2.1.1Identifiers
2.1.2Character Type
2.1.2Character Type and Encoding
2.1.3XML Schema Namespace
2.1.4Anonymous Types
@@ -567,7 +567,7 @@ CONVENTION section in the XSD Compiler Command Line Manual.

-

2.1.2 Character Type

+

2.1.2 Character Type and Encoding

The code that implements the mapping, depending on the --char-type option, is generated using either @@ -577,6 +577,20 @@ your schemas, for example std::basic_string<C>.

+

Another aspect of the mapping that depends on the character type + is character encoding. For the char character type + the default encoding is UTF-8. Other supported encodings are + ISO-8859-1, Xerces-C++ Local Code Page (LCP), as well as + custom encodings. The encoding can be selected with the + --char-encoding command line option.

+ +

For the wchar_t character type the encoding is + automatically selected between UTF-16 and UTF-32/UCS-4 depending + on the size of the wchar_t type. On some platforms + (for example, Windows with Visual C++ and AIX with IBM XL C++) + wchar_t is 2 bytes long. For these platforms the + encoding is UTF-16. On other platforms wchar_t is 4 bytes + long and UTF-32/UCS-4 is used.

2.1.3 XML Schema Namespace

diff --git a/documentation/makefile b/documentation/makefile index 0638928..81a26fe 100644 --- a/documentation/makefile +++ b/documentation/makefile @@ -20,6 +20,7 @@ $(install): $(out_base)/cxx/.install $(call install-data,$(src_base)/future.xhtml,$(install_doc_dir)/xsd/future.xhtml) $(call install-data,$(src_base)/schema-authoring-guide.xhtml,$(install_doc_dir)/xsd/schema-authoring-guide.xhtml) $(call install-data,$(src_base)/xsd.xhtml,$(install_doc_dir)/xsd/xsd.xhtml) + $(call install-data,$(src_base)/custom-literals.xsd,$(install_doc_dir)/xsd/custom-literals.xsd) $(call install-data,$(src_base)/xsd.1,$(install_man_dir)/man1/xsd.1) # Dist. @@ -32,6 +33,7 @@ $(dist-common): $(call install-data,$(src_base)/xsd.1,$(dist_prefix)/documentation/xsd.1) $(call install-data,$(src_base)/future.xhtml,$(dist_prefix)/documentation/future.xhtml) $(call install-data,$(src_base)/schema-authoring-guide.xhtml,$(dist_prefix)/documentation/schema-authoring-guide.xhtml) + $(call install-data,$(src_base)/custom-literals.xsd,$(dist_prefix)/documentation/custom-literals.xsd) $(dist): $(dist-common) $(out_base)/cxx/.dist $(dist-win): $(dist-common) $(out_base)/cxx/.dist-win diff --git a/documentation/xsd.1 b/documentation/xsd.1 index b84586d..1038d50 100644 --- a/documentation/xsd.1 +++ b/documentation/xsd.1 @@ -127,6 +127,34 @@ Valid values are and .BR wchar_t . . +.IP "\fB\--char-encoding \fIenc\fR" +Specify the character encoding that should be used in the object model. +Valid values for the +.B char +character type are +.B utf8 +(default), +.BR iso8859-1 , lcp +(Xerces-C++ local code page), +and +.BR custom . +If you pass +.B custom +as the value then you will need to include the transcoder implementation +header for your encoding at the beginning of the generated header files +(see the +.B --hxx-prologue +option). 
+ +For the +.B wchar_t +character type the only valid value is +.B auto +and the encoding is automatically selected between UTF-16 and UTF-32/UCS-4, +depending on the +.B wchar_t +type size. +. .IP "\fB\--output-dir \fIdir\fR" Write generated files to .I dir @@ -450,6 +478,17 @@ in places where DLL export/import control statements ( .BR __declspec(dllexport/dllimport) ) are necessary. +.IP "\fB\--custom-literals \fIfile\fR" +Load custom XML string to C++ literal mappings from +.IR file . +This mechanism can be useful if you are using a custom character encoding +and some of the strings in your schemas, for example element/attribute +names or enumeration values, contain non-ASCII characters. In this case +you will need to provide a custom mapping to C++ literals for such +strings. The format of this file is specified in the +.B custom-literals.xsd +XML Schema file that can be found in the documentation directory. + .IP "\fB\--export-xml-schema\fR" Export/import types in the XML Schema namespace using the export symbol provided with the diff --git a/documentation/xsd.xhtml b/documentation/xsd.xhtml index 49d6503..da2b52c 100644 --- a/documentation/xsd.xhtml +++ b/documentation/xsd.xhtml @@ -125,6 +125,21 @@ instead of the default char. Valid values are char and wchar_t. +
--char-encoding enc
+
Specify the character encoding that should be used in the object + model. Valid values for the char character type + are utf8 (default), iso8859-1, + lcp (Xerces-C++ local code page), and + custom. If you pass custom as + the value then you will need to include the transcoder implementation + header for your encoding at the beginning of the generated header + files (see the --hxx-prologue option). + +

For the wchar_t character type the only valid + value is auto and the encoding is automatically + selected between UTF-16 and UTF-32/UCS-4, depending on the + wchar_t type size.

+
--output-dir dir
Write generated files to dir instead of the current directory.
@@ -393,6 +408,18 @@ generated file for which there is no file-specific epilogue file. +
--custom-literals file
+
Load custom XML string to C++ literal mappings from + file. This mechanism can be useful if you + are using a custom character encoding and some of the strings + in your schemas, for example element/attribute names or enumeration + values, contain non-ASCII characters. In this case you will need + to provide a custom mapping to C++ literals for such + strings. The format of this file is specified in the + custom-literals.xsd XML Schema file that + can be found in the documentation directory. +
+
--export-symbol symbol
Insert symbol in places where DLL export/import control statements diff --git a/libxsd/xsd/cxx/xml/char-iso8859-1.hxx b/libxsd/xsd/cxx/xml/char-iso8859-1.hxx new file mode 100644 index 0000000..38b633f --- /dev/null +++ b/libxsd/xsd/cxx/xml/char-iso8859-1.hxx @@ -0,0 +1,72 @@ +// file : xsd/cxx/xml/char-iso8859-1.hxx +// author : Boris Kolpackov +// copyright : Copyright (c) 2005-2009 Code Synthesis Tools CC +// license : GNU GPL v2 + exceptions; see accompanying LICENSE file + +#ifndef XSD_CXX_XML_TRANSCODER +#define XSD_CXX_XML_TRANSCODER +#define XSD_CXX_XML_TRANSCODER_CHAR_ISO8859_1 + +#include +#include // std::size_t + +#include // XMLCh + +#include // invalid_utf16_string + +namespace xsd +{ + namespace cxx + { + namespace xml + { + struct iso8859_1_unrepresentable {}; + + // UTF-16 to/from ISO-8859-1 transcoder. + // + template + struct char_iso8859_1_transcoder + { + static std::basic_string + to (const XMLCh* s, std::size_t length); + + static XMLCh* + from (const C* s, std::size_t length); + + // Get/set a replacement for unrepresentable characters. If set to + // 0 (the default value), throw iso8859_1_unrepresentable instead. + // + static C + unrep_char () + { + return unrep_char_; + } + + static void + unrep_char (C c) + { + unrep_char_ = c; + } + + private: + static C unrep_char_; + }; + + typedef char_iso8859_1_transcoder char_transcoder; + } + } +} + +#include + +#else +# ifndef XSD_CXX_XML_TRANSCODER_CHAR_ISO8859_1 + // + // If you get this error, it usually means that either you compiled + // your schemas with different --char-encoding values or you included + // some of the libxsd headers (e.g., xsd/cxx/xml/string.hxx) directly + // without first including the correct xsd/cxx/xml/char-*.hxx header. 
+ // +# error conflicting character encoding detected +# endif +#endif // XSD_CXX_XML_TRANSCODER diff --git a/libxsd/xsd/cxx/xml/char-iso8859-1.txx b/libxsd/xsd/cxx/xml/char-iso8859-1.txx new file mode 100644 index 0000000..6b20f01 --- /dev/null +++ b/libxsd/xsd/cxx/xml/char-iso8859-1.txx @@ -0,0 +1,101 @@ +// file : xsd/cxx/xml/char-iso8859-1.txx +// author : Boris Kolpackov +// copyright : Copyright (c) 2005-2009 Code Synthesis Tools CC +// license : GNU GPL v2 + exceptions; see accompanying LICENSE file + +#include + +namespace xsd +{ + namespace cxx + { + namespace xml + { + template + C char_iso8859_1_transcoder::unrep_char_ = 0; + + template + std::basic_string char_iso8859_1_transcoder:: + to (const XMLCh* s, std::size_t len) + { + const XMLCh* end (s + len); + + // Find what the resulting buffer size will be. + // + std::size_t rl (0); + unsigned int u (0); // Four byte UCS-4 char. + + bool valid (true); + const XMLCh* p (s); + + for (; p < end; ++p) + { + if (*p >= 0xD800 && *p <= 0xDBFF) + { + // Make sure we have one more char and it has a valid + // value for the second char in a surrogate pair. + // + if (++p == end || !((*p >= 0xDC00) && (*p <= 0xDFFF))) + { + valid = false; + break; + } + } + + rl++; + } + + if (!valid) + throw invalid_utf16_string (); + + std::basic_string r; + r.reserve (rl + 1); + r.resize (rl); + C* rs (const_cast (r.c_str ())); + std::size_t i (0); + + p = s; + + // Tight first loop for the common case. + // + for (; p < end && *p < 0x100; ++p) + rs[i++] = C (*p); + + if (p < end && unrep_char_ == 0) + throw iso8859_1_unrepresentable (); + + for (; p < end; ++p) + { + XMLCh x (*p); + + if ((x >= 0xD800) && (x <= 0xDBFF)) + { + u = ((x - 0xD800) << 10) + (*++p - 0xDC00) + 0x10000; + } + else + u = x; + + rs[i++] = u < 0x100 ? 
C (u) : unrep_char_; + } + + return r; + } + + template + XMLCh* char_iso8859_1_transcoder:: + from (const C* s, std::size_t len) + { + const C* end (s + len); + + auto_array r (new XMLCh[len + 1]); + XMLCh* ir (r.get ()); + + for (const C* p (s); p < end; ++p) + *ir++ = static_cast (*p); + + *ir = XMLCh (0); + return r.release (); + } + } + } +} diff --git a/libxsd/xsd/cxx/xml/char-lcp.hxx b/libxsd/xsd/cxx/xml/char-lcp.hxx new file mode 100644 index 0000000..2c41753 --- /dev/null +++ b/libxsd/xsd/cxx/xml/char-lcp.hxx @@ -0,0 +1,56 @@ +// file : xsd/cxx/xml/char-lcp.hxx +// author : Boris Kolpackov +// copyright : Copyright (c) 2005-2009 Code Synthesis Tools CC +// license : GNU GPL v2 + exceptions; see accompanying LICENSE file + +#ifndef XSD_CXX_XML_TRANSCODER +#define XSD_CXX_XML_TRANSCODER +#define XSD_CXX_XML_TRANSCODER_CHAR_LCP + +#include +#include // std::size_t + +#include // XMLCh + +namespace xsd +{ + namespace cxx + { + namespace xml + { + // UTF-16 to/from Xerces-C++ local code page (LCP) transcoder. + // + // Note that this transcoder has a custom interface due to Xerces-C++ + // idiosyncrasies. Don't use it as a base for your custom transcoder. + // + template + struct char_lcp_transcoder + { + static std::basic_string + to (const XMLCh* s); + + static std::basic_string + to (const XMLCh* s, std::size_t length); + + static XMLCh* + from (const C* s); + }; + + typedef char_lcp_transcoder char_transcoder; + } + } +} + +#include + +#else +# ifndef XSD_CXX_XML_TRANSCODER_CHAR_LCP + // + // If you get this error, it usually means that either you compiled + // your schemas with different --char-encoding values or you included + // some of the libxsd headers (e.g., xsd/cxx/xml/string.hxx) directly + // without first including the correct xsd/cxx/xml/char-*.hxx header. 
+ // +# error conflicting character encoding detected +# endif +#endif // XSD_CXX_XML_TRANSCODER diff --git a/libxsd/xsd/cxx/xml/char-lcp.txx b/libxsd/xsd/cxx/xml/char-lcp.txx new file mode 100644 index 0000000..01bb36e --- /dev/null +++ b/libxsd/xsd/cxx/xml/char-lcp.txx @@ -0,0 +1,55 @@ +// file : xsd/cxx/xml/char-lcp.txx +// author : Boris Kolpackov +// copyright : Copyright (c) 2005-2009 Code Synthesis Tools CC +// license : GNU GPL v2 + exceptions; see accompanying LICENSE file + +#include // std::memcpy + +#include + +#include +#include + +namespace xsd +{ + namespace cxx + { + namespace xml + { + template + std::basic_string char_lcp_transcoder:: + to (const XMLCh* s) + { + std_memory_manager mm; + auto_array r ( + xercesc::XMLString::transcode (s, &mm), mm); + return std::basic_string (r.get ()); + } + + template + std::basic_string char_lcp_transcoder:: + to (const XMLCh* s, std::size_t len) + { + auto_array tmp (new XMLCh[len + 1]); + std::memcpy (tmp.get (), s, len * sizeof (XMLCh)); + tmp[len] = XMLCh (0); + + std_memory_manager mm; + auto_array r ( + xercesc::XMLString::transcode (tmp.get (), &mm), mm); + + tmp.reset (); + + return std::basic_string (r.get ()); + } + + template + XMLCh* char_lcp_transcoder:: + from (const C* s) + { + std_memory_manager mm; + return xercesc::XMLString::transcode (s, &mm); + } + } + } +} diff --git a/libxsd/xsd/cxx/xml/char-utf8.hxx b/libxsd/xsd/cxx/xml/char-utf8.hxx new file mode 100644 index 0000000..c255b28 --- /dev/null +++ b/libxsd/xsd/cxx/xml/char-utf8.hxx @@ -0,0 +1,57 @@ +// file : xsd/cxx/xml/char-utf8.hxx +// author : Boris Kolpackov +// copyright : Copyright (c) 2005-2009 Code Synthesis Tools CC +// license : GNU GPL v2 + exceptions; see accompanying LICENSE file + +#ifndef XSD_CXX_XML_TRANSCODER +#define XSD_CXX_XML_TRANSCODER +#define XSD_CXX_XML_TRANSCODER_CHAR_UTF8 + +#include +#include // std::size_t + +#include // XMLCh + +#include // invalid_utf16_string + +namespace xsd +{ + namespace cxx + { + 
namespace xml + { + struct invalid_utf8_string {}; + + // UTF-16 to/from UTF-8 transcoder. + // + template + struct char_utf8_transcoder + { + static std::basic_string + to (const XMLCh* s, std::size_t length); + + static XMLCh* + from (const C* s, std::size_t length); + + private: + static const unsigned char first_byte_mask_[5]; + }; + + typedef char_utf8_transcoder char_transcoder; + } + } +} + +#include + +#else +# ifndef XSD_CXX_XML_TRANSCODER_CHAR_UTF8 + // + // If you get this error, it usually means that either you compiled + // your schemas with different --char-encoding values or you included + // some of the libxsd headers (e.g., xsd/cxx/xml/string.hxx) directly + // without first including the correct xsd/cxx/xml/char-*.hxx header. + // +# error conflicting character encoding detected +# endif +#endif // XSD_CXX_XML_TRANSCODER diff --git a/libxsd/xsd/cxx/xml/char-utf8.txx b/libxsd/xsd/cxx/xml/char-utf8.txx new file mode 100644 index 0000000..96b36a4 --- /dev/null +++ b/libxsd/xsd/cxx/xml/char-utf8.txx @@ -0,0 +1,293 @@ +// file : xsd/cxx/xml/char-utf8.txx +// author : Boris Kolpackov +// copyright : Copyright (c) 2005-2009 Code Synthesis Tools CC +// license : GNU GPL v2 + exceptions; see accompanying LICENSE file + +#include + +namespace xsd +{ + namespace cxx + { + namespace xml + { + template + const unsigned char char_utf8_transcoder::first_byte_mask_[5] = + { + 0x00, 0x00, 0xC0, 0xE0, 0xF0 + }; + + template + std::basic_string char_utf8_transcoder:: + to (const XMLCh* s, std::size_t len) + { + const XMLCh* end (s + len); + + // Find what the resulting buffer size will be. + // + std::size_t rl (0); + unsigned int u (0); // Four byte UCS-4 char. + + bool valid (true); + const XMLCh* p (s); + + for (; p < end; ++p) + { + XMLCh x (*p); + + if (x < 0xD800 || x > 0xDBFF) + u = x; + else + { + // Make sure we have one more char and it has a valid + // value for the second char in a surrogate pair. 
+ // + if (++p == end || !((*p >= 0xDC00) && (*p <= 0xDFFF))) + { + valid = false; + break; + } + + u = ((x - 0xD800) << 10) + (*p - 0xDC00) + 0x10000; + } + + if (u < 0x80) + rl++; + else if (u < 0x800) + rl += 2; + else if (u < 0x10000) + rl += 3; + else if (u < 0x110000) + rl += 4; + else + { + valid = false; + break; + } + } + + if (!valid) + throw invalid_utf16_string (); + + std::basic_string r; + r.reserve (rl + 1); + r.resize (rl); + C* rs (const_cast (r.c_str ())); + + std::size_t i (0); + unsigned int count (0); + + p = s; + + // Tight first loop for the common case. + // + for (; p < end && *p < 0x80; ++p) + rs[i++] = C (*p); + + for (; p < end; ++p) + { + XMLCh x (*p); + + if ((x >= 0xD800) && (x <= 0xDBFF)) + { + u = ((x - 0xD800) << 10) + (*++p - 0xDC00) + 0x10000; + } + else + u = x; + + if (u < 0x80) + count = 1; + else if (u < 0x800) + count = 2; + else if (u < 0x10000) + count = 3; + else if (u < 0x110000) + count = 4; + + switch(count) + { + case 4: + { + rs[i + 3] = C ((u | 0x80UL) & 0xBFUL); + u >>= 6; + } + case 3: + { + rs[i + 2] = C ((u | 0x80UL) & 0xBFUL); + u >>= 6; + } + case 2: + { + rs[i + 1] = C ((u | 0x80UL) & 0xBFUL); + u >>= 6; + } + case 1: + { + rs[i] = C (u | first_byte_mask_[count]); + } + } + + i += count; + } + + return r; + } + + template + XMLCh* char_utf8_transcoder:: + from (const C* s, std::size_t len) + { + bool valid (true); + const C* end (s + len); + + // Find what the resulting buffer size will be. + // + std::size_t rl (0); + unsigned int count (0); + + for (const C* p (s); p < end; ++p) + { + unsigned char c (*p); + + if (c < 0x80) + { + // Fast path. + // + rl += 1; + continue; + } + else if ((c >> 5) == 0x06) + count = 2; + else if ((c >> 4) == 0x0E) + count = 3; + else if ((c >> 3) == 0x1E) + count = 4; + else + { + valid = false; + break; + } + + p += count - 1; // One will be added in the for loop + + if (p + 1 > end) + { + valid = false; + break; + } + + // BMP is represented by up to 3 code points in UTF-8. 
+ // + rl += count > 3 ? 2 : 1; + } + + if (!valid) + throw invalid_utf8_string (); + + auto_array r (new XMLCh[rl + 1]); + XMLCh* ir (r.get ()); + + unsigned int u (0); // Four byte UCS-4 char. + + for (const C* p (s); p < end; ++p) + { + unsigned char c (*p); + + if (c < 0x80) + { + // Fast path. + // + *ir++ = static_cast (c); + continue; + } + else if ((c >> 5) == 0x06) + { + // UTF-8: 110yyyyy 10zzzzzz + // Unicode: 00000yyy yyzzzzzz + // + u = (c & 0x1F) << 6; + + c = *++p; + if ((c >> 6) != 2) + { + valid = false; + break; + } + u |= c & 0x3F; + } + else if ((c >> 4) == 0x0E) + { + // UTF-8: 1110xxxx 10yyyyyy 10zzzzzz + // Unicode: xxxxyyyy yyzzzzzz + // + u = (c & 0x0F) << 6; + + c = *++p; + if ((c >> 6) != 2) + { + valid = false; + break; + } + u = (u | (c & 0x3F)) << 6; + + c = *++p; + if ((c >> 6) != 2) + { + valid = false; + break; + } + u |= c & 0x3F; + } + else if ((c >> 3) == 0x1E) + { + // UTF-8: 000wwwxx xxxxyyyy yyzzzzzz + // Unicode: 11110www 10xxxxxx 10yyyyyy 10zzzzzz + // + u = (c & 0x07) << 6; + + c = *++p; + if ((c >> 6) != 2) + { + valid = false; + break; + } + u = (u | (c & 0x3F)) << 6; + + c = *++p; + if ((c >> 6) != 2) + { + valid = false; + break; + } + u = (u | (c & 0x3F)) << 6; + + c = *++p; + if ((c >> 6) != 2) + { + valid = false; + break; + } + u |= c & 0x3F; + } + + if (u & 0xFFFF0000) + { + // Surrogate pair. 
+ // + *ir++ = static_cast (((u - 0x10000) >> 10) + 0xD800); + *ir++ = static_cast ((u & 0x3FF) + 0xDC00); + } + else + *ir++ = static_cast (u); + } + + if (!valid) + throw invalid_utf8_string (); + + *ir = XMLCh (0); + + return r.release (); + } + } + } +} diff --git a/libxsd/xsd/cxx/xml/exceptions.hxx b/libxsd/xsd/cxx/xml/exceptions.hxx new file mode 100644 index 0000000..6c2e029 --- /dev/null +++ b/libxsd/xsd/cxx/xml/exceptions.hxx @@ -0,0 +1,20 @@ +// file : xsd/cxx/xml/exceptions.hxx +// author : Boris Kolpackov +// copyright : Copyright (c) 2005-2009 Code Synthesis Tools CC +// license : GNU GPL v2 + exceptions; see accompanying LICENSE file + +#ifndef XSD_CXX_XML_EXCEPTIONS_HXX +#define XSD_CXX_XML_EXCEPTIONS_HXX + +namespace xsd +{ + namespace cxx + { + namespace xml + { + struct invalid_utf16_string {}; + } + } +} + +#endif // XSD_CXX_XML_EXCEPTIONS_HXX diff --git a/libxsd/xsd/cxx/xml/string.hxx b/libxsd/xsd/cxx/xml/string.hxx index 2d08134..ec666ee 100644 --- a/libxsd/xsd/cxx/xml/string.hxx +++ b/libxsd/xsd/cxx/xml/string.hxx @@ -7,6 +7,7 @@ #define XSD_CXX_XML_STRING_HXX #include +#include // std::size_t #include #include // XMLCh @@ -17,12 +18,6 @@ namespace xsd { namespace xml { - // - // - struct invalid_utf8_string {}; - struct invalid_utf16_string {}; - - // Transcode a null-terminated string. // template @@ -84,7 +79,7 @@ namespace xsd } } -#endif // XSD_CXX_XML_STRING_HXX +#endif // XSD_CXX_XML_STRING_HXX #include #include diff --git a/libxsd/xsd/cxx/xml/string.ixx b/libxsd/xsd/cxx/xml/string.ixx index bde86d8..056a15f 100644 --- a/libxsd/xsd/cxx/xml/string.ixx +++ b/libxsd/xsd/cxx/xml/string.ixx @@ -6,11 +6,13 @@ #ifndef XSD_CXX_XML_STRING_IXX #define XSD_CXX_XML_STRING_IXX -#include -#include // std::memcpy - #include -#include + +// If no transcoder has been included, use the default UTF-8. +// +#ifndef XSD_CXX_XML_TRANSCODER +# include +#endif // We sometimes need this functionality even if we are building for // wchar_t. 
@@ -21,43 +23,17 @@ namespace xsd { namespace xml { -#ifndef XSD_USE_LCP - namespace bits - { - // UTF-16 to/from UTF-8 transcoder. - // - template - struct char_transcoder - { - static std::basic_string - to (const XMLCh* s, std::size_t length); - - static XMLCh* - from (const C* s, std::size_t length); - - private: - static const unsigned char first_byte_mask_[5]; - }; - } -#endif - template <> inline std::basic_string transcode (const XMLCh* s) { - if (s == 0) + if (s == 0 || *s == XMLCh (0)) return std::basic_string (); -#ifndef XSD_USE_LCP - return bits::char_transcoder::to ( - s, xercesc::XMLString::stringLen (s)); +#ifndef XSD_CXX_XML_TRANSCODER_CHAR_LCP + return char_transcoder::to (s, xercesc::XMLString::stringLen (s)); #else - // Use Xerces-C++ local code page transcoding. - // - std_memory_manager mm; - auto_array r ( - xercesc::XMLString::transcode (s, &mm), mm); - return std::basic_string (r.get ()); + return char_transcoder::to (s); #endif } @@ -68,41 +44,17 @@ namespace xsd if (s == 0 || len == 0) return std::basic_string (); -#ifndef XSD_USE_LCP - // Convert UTF-16 to UTF-8 - // - return bits::char_transcoder::to (s, len); -#else - // Use Xerces-C++ local code page transcoding. - // - auto_array tmp (new XMLCh[len + 1]); - std::memcpy (tmp.get (), s, len * sizeof (XMLCh)); - tmp[len] = XMLCh (0); - - std_memory_manager mm; - auto_array r ( - xercesc::XMLString::transcode (tmp.get (), &mm), mm); - - tmp.reset (); - - return std::basic_string (r.get ()); -#endif + return char_transcoder::to (s, len); } template <> inline XMLCh* transcode_to_xmlch (const char* s) { -#ifndef XSD_USE_LCP - // Convert UTF-8 to UTF-16 - // - return bits::char_transcoder::from ( - s, std::char_traits::length (s)); +#ifndef XSD_CXX_XML_TRANSCODER_CHAR_LCP + return char_transcoder::from (s, std::char_traits::length (s)); #else - // Use Xerces-C++ local code page transcoding. 
- // - std_memory_manager mm; - return xercesc::XMLString::transcode (s, &mm); + return char_transcoder::from (s); #endif } @@ -110,16 +62,10 @@ namespace xsd inline XMLCh* transcode_to_xmlch (const std::basic_string& s) { -#ifndef XSD_USE_LCP - // Convert UTF-8 to UTF-16 - // - return bits::char_transcoder::from ( - s.c_str (), s.length ()); +#ifndef XSD_CXX_XML_TRANSCODER_CHAR_LCP + return char_transcoder::from (s.c_str (), s.length ()); #else - // Use Xerces-C++ local code page transcoding. - // - std_memory_manager mm; - return xercesc::XMLString::transcode (s.c_str (), &mm); + return char_transcoder::from (s.c_str ()); #endif } } diff --git a/libxsd/xsd/cxx/xml/string.txx b/libxsd/xsd/cxx/xml/string.txx index cdef87e..f71480e 100644 --- a/libxsd/xsd/cxx/xml/string.txx +++ b/libxsd/xsd/cxx/xml/string.txx @@ -6,306 +6,16 @@ #ifndef XSD_CXX_XML_STRING_TXX #define XSD_CXX_XML_STRING_TXX -#ifndef XSD_USE_LCP -namespace xsd -{ - namespace cxx - { - namespace xml - { - namespace bits - { - template - const unsigned char char_transcoder::first_byte_mask_[5] = - { - 0x00, 0x00, 0xC0, 0xE0, 0xF0 - }; - - template - std::basic_string char_transcoder:: - to (const XMLCh* s, std::size_t len) - { - const XMLCh* end (s + len); - - // Find what the resulting buffer size will be. - // - std::size_t rl (0); - unsigned int u (0); // Four byte UCS-4 char. - - bool valid (true); - const XMLCh* p (s); - for (; p < end; ++p) - { - XMLCh x (*p); - - if (x < 0xD800 || x > 0xDBFF) - u = x; - else - { - // Make sure we have one more char and it has a valid - // value for the second char in a surrogate pair. 
- // - if (++p == end || !((*p >= 0xDC00) && (*p <= 0xDFFF))) - { - valid = false; - break; - } - - u = ((x - 0xD800) << 10) + (*p - 0xDC00) + 0x10000; - } - - if (u < 0x80) - rl++; - else if (u < 0x800) - rl += 2; - else if (u < 0x10000) - rl += 3; - else if (u < 0x110000) - rl += 4; - else - { - valid = false; - break; - } - } - - if (!valid) - throw invalid_utf16_string (); - - std::basic_string r; - r.reserve (rl + 1); - r.resize (rl); - C* rs (const_cast (r.c_str ())); - - std::size_t i (0); - unsigned int count (0); - - p = s; - - // Tight first loop for the common case. - // - for (; p < end && *p < 0x80; ++p) - rs[i++] = C (*p); - - for (; p < end; ++p) - { - XMLCh x (*p); - - if ((x >= 0xD800) && (x <= 0xDBFF)) - { - u = ((x - 0xD800) << 10) + (*++p - 0xDC00) + 0x10000; - } - else - u = x; - - if (u < 0x80) - count = 1; - else if (u < 0x800) - count = 2; - else if (u < 0x10000) - count = 3; - else if (u < 0x110000) - count = 4; - - switch(count) - { - case 4: - { - rs[i + 3] = C ((u | 0x80UL) & 0xBFUL); - u >>= 6; - } - case 3: - { - rs[i + 2] = C ((u | 0x80UL) & 0xBFUL); - u >>= 6; - } - case 2: - { - rs[i + 1] = C ((u | 0x80UL) & 0xBFUL); - u >>= 6; - } - case 1: - { - rs[i] = C (u | first_byte_mask_[count]); - } - } - - i += count; - } - - return r; - } - - template - XMLCh* char_transcoder:: - from (const C* s, std::size_t len) - { - bool valid (true); - const C* end (s + len); - - // Find what the resulting buffer size will be. - // - std::size_t rl (0); - unsigned int count (0); - - for (const C* p (s); p < end; ++p) - { - unsigned char c (*p); - - if (c < 0x80) - { - // Fast path. - // - rl += 1; - continue; - } - else if ((c >> 5) == 0x06) - count = 2; - else if ((c >> 4) == 0x0E) - count = 3; - else if ((c >> 3) == 0x1E) - count = 4; - else - { - valid = false; - break; - } - - p += count - 1; // One will be added in the for loop - - if (p + 1 > end) - { - valid = false; - break; - } - - // BMP is represented by up to 3 code points in UTF-8. 
- // - rl += count > 3 ? 2 : 1; - } - - if (!valid) - throw invalid_utf8_string (); - - auto_array r (new XMLCh[rl + 1]); - XMLCh* ir (r.get ()); - - unsigned int u (0); // Four byte UCS-4 char. - - for (const C* p (s); p < end; ++p) - { - unsigned char c (*p); - - if (c < 0x80) - { - // Fast path. - // - *ir++ = static_cast (c); - continue; - } - else if ((c >> 5) == 0x06) - { - // UTF-8: 110yyyyy 10zzzzzz - // Unicode: 00000yyy yyzzzzzz - // - u = (c & 0x1F) << 6; - - c = *++p; - if ((c >> 6) != 2) - { - valid = false; - break; - } - u |= c & 0x3F; - } - else if ((c >> 4) == 0x0E) - { - // UTF-8: 1110xxxx 10yyyyyy 10zzzzzz - // Unicode: xxxxyyyy yyzzzzzz - // - u = (c & 0x0F) << 6; - - c = *++p; - if ((c >> 6) != 2) - { - valid = false; - break; - } - u = (u | (c & 0x3F)) << 6; - - c = *++p; - if ((c >> 6) != 2) - { - valid = false; - break; - } - u |= c & 0x3F; - } - else if ((c >> 3) == 0x1E) - { - // UTF-8: 000wwwxx xxxxyyyy yyzzzzzz - // Unicode: 11110www 10xxxxxx 10yyyyyy 10zzzzzz - // - u = (c & 0x07) << 6; - - c = *++p; - if ((c >> 6) != 2) - { - valid = false; - break; - } - u = (u | (c & 0x3F)) << 6; - - c = *++p; - if ((c >> 6) != 2) - { - valid = false; - break; - } - u = (u | (c & 0x3F)) << 6; - - c = *++p; - if ((c >> 6) != 2) - { - valid = false; - break; - } - u |= c & 0x3F; - } - - if (u & 0xFFFF0000) - { - // Surrogate pair. 
- // - *ir++ = static_cast (((u - 0x10000) >> 10) + 0xD800); - *ir++ = static_cast ((u & 0x3FF) + 0xDC00); - } - else - *ir++ = static_cast (u); - } - - if (!valid) - throw invalid_utf8_string (); - - *ir = XMLCh (0); - - return r.release (); - } - } - } - } -} - -#endif // XSD_USE_LCP #endif // XSD_CXX_XML_STRING_TXX - #if defined(XSD_USE_WCHAR) || !defined(XSD_USE_CHAR) #ifndef XSD_CXX_XML_STRING_TXX_WCHAR #define XSD_CXX_XML_STRING_TXX_WCHAR +#include + namespace xsd { namespace cxx diff --git a/tests/cxx/tree/encoding/char/iso-8859-1/driver.cxx b/tests/cxx/tree/encoding/char/iso-8859-1/driver.cxx new file mode 100644 index 0000000..9bd5725 --- /dev/null +++ b/tests/cxx/tree/encoding/char/iso-8859-1/driver.cxx @@ -0,0 +1,76 @@ +// file : tests/cxx/tree/encoding/char/iso-8859-1/driver.cxx +// author : Boris Kolpackov +// copyright : Copyright (c) 2006-2009 Code Synthesis Tools CC +// license : GNU GPL v2 + exceptions; see accompanying LICENSE file + +// Test ISO-8859-1 encoding. +// + +#include // std::auto_ptr +#include +#include + +#include "test.hxx" + +using namespace std; +using namespace test; + +int +main (int argc, char* argv[]) +{ + if (argc != 2) + { + cerr << "usage: " << argv[0] << " test.xml" << endl; + return 1; + } + + try + { + try + { + root (argv[1]); + return 1; + } + catch (xsd::cxx::xml::iso8859_1_unrepresentable const&) + { + } + + xsd::cxx::xml::char_transcoder::unrep_char ('?'); + auto_ptr r (root (argv[1])); + + { + type::a_sequence const& s (r->a ()); + + if (s[0] != "abc" || + s[1] != "\xE6" || + s[2] != "\xA2\xA3\xA4\xA5" || + s[3] != "???") + { + cerr << "invalid encoding" << endl; + return 1; + } + } + + { + type::b_sequence const& s (r->b ()); + + if (s[0] != strenum::abc || + s[1] != strenum::a_c || + s[2] != strenum::cxx__bc) + { + cerr << "invalid encoding" << endl; + return 1; + } + } + + xml_schema::namespace_infomap map; + map["t"].name = "test"; + + root (std::cout, *r, map, "ISO-8859-1"); + } + catch (xml_schema::exception 
const& e) + { + cerr << "xml_schema::exception: " << e.what () << endl; + return 1; + } +} diff --git a/tests/cxx/tree/encoding/char/iso-8859-1/makefile b/tests/cxx/tree/encoding/char/iso-8859-1/makefile new file mode 100644 index 0000000..dd48fc1 --- /dev/null +++ b/tests/cxx/tree/encoding/char/iso-8859-1/makefile @@ -0,0 +1,83 @@ +# file : tests/cxx/tree/encoding/char/iso-8859-1/makefile +# author : Boris Kolpackov +# copyright : Copyright (c) 2006-2009 Code Synthesis Tools CC +# license : GNU GPL v2 + exceptions; see accompanying LICENSE file + +include $(dir $(lastword $(MAKEFILE_LIST)))../../../../../../build/bootstrap.make + +xsd := test.xsd +cxx := driver.cxx + +obj := $(addprefix $(out_base)/,$(cxx:.cxx=.o) $(xsd:.xsd=.o)) +dep := $(obj:.o=.o.d) + +driver := $(out_base)/driver +test := $(out_base)/.test +clean := $(out_base)/.clean + + +# Import. +# +$(call import,\ + $(scf_root)/import/libxerces-c/stub.make,\ + l: xerces_c.l,cpp-options: xerces_c.l.cpp-options) + + +# Build. +# +$(driver): $(obj) $(xerces_c.l) + +$(obj) $(dep): cpp_options := -I$(src_root)/libxsd +$(obj) $(dep): $(xerces_c.l.cpp-options) + +genf := $(xsd:.xsd=.hxx) $(xsd:.xsd=.ixx) $(xsd:.xsd=.cxx) +gen := $(addprefix $(out_base)/,$(genf)) + +$(gen): xsd := $(out_root)/xsd/xsd +$(gen): xsd_options := --char-encoding iso8859-1 --generate-serialization \ +--generate-doxygen +$(gen): $(out_root)/xsd/xsd + +$(call include-dep,$(dep)) + +# Convenience alias for default target. +# +$(out_base)/: $(driver) + + +# Test. +# +$(test): driver := $(driver) +$(test): $(driver) $(src_base)/test.xml $(src_base)/test.std + $(call message,test $$1,$$1 $(src_base)/test.xml | diff -u $(src_base)/test.std -,$(driver)) + +# Clean. +# +$(clean): $(driver).o.clean \ + $(addsuffix .cxx.clean,$(obj)) \ + $(addsuffix .cxx.clean,$(dep)) \ + $(addprefix $(out_base)/,$(xsd:.xsd=.cxx.xsd.clean)) + +# Generated .gitignore. 
+# +ifeq ($(out_base),$(src_base)) +$(gen): | $(out_base)/.gitignore +$(driver): | $(out_base)/.gitignore + +$(out_base)/.gitignore: files := driver $(genf) +$(clean): $(out_base)/.gitignore.clean + +$(call include,$(bld_root)/git/gitignore.make) +endif + +# How to. +# +$(call include,$(bld_root)/cxx/o-e.make) +$(call include,$(bld_root)/cxx/cxx-o.make) +$(call include,$(bld_root)/cxx/cxx-d.make) +$(call include,$(scf_root)/xsd/tree/xsd-cxx.make) + + +# Dependencies. +# +$(call import,$(src_root)/xsd/makefile) diff --git a/tests/cxx/tree/encoding/char/iso-8859-1/test.std b/tests/cxx/tree/encoding/char/iso-8859-1/test.std new file mode 100644 index 0000000..ca6297f --- /dev/null +++ b/tests/cxx/tree/encoding/char/iso-8859-1/test.std @@ -0,0 +1,18 @@ + + + + abc + + æ + + ¢£¤¥ + + ??? + + abc + + aâc + + âòbc + + diff --git a/tests/cxx/tree/encoding/char/iso-8859-1/test.xml b/tests/cxx/tree/encoding/char/iso-8859-1/test.xml new file mode 100644 index 0000000..9c9e752 --- /dev/null +++ b/tests/cxx/tree/encoding/char/iso-8859-1/test.xml @@ -0,0 +1,14 @@ + + + abc + æ + ¢£¤¥ + Āꪪ򪪪 + + abc + aâc + âòbc + + diff --git a/tests/cxx/tree/encoding/char/iso-8859-1/test.xsd b/tests/cxx/tree/encoding/char/iso-8859-1/test.xsd new file mode 100644 index 0000000..31b8901 --- /dev/null +++ b/tests/cxx/tree/encoding/char/iso-8859-1/test.xsd @@ -0,0 +1,31 @@ + + + + + + + + Test enum. Valid values are: + abc + aâc + òbc + + + + + + + + + + + + + + + + + + + + diff --git a/tests/cxx/tree/encoding/char/lcp/driver.cxx b/tests/cxx/tree/encoding/char/lcp/driver.cxx index 3d30aa9..7bc4a2d 100644 --- a/tests/cxx/tree/encoding/char/lcp/driver.cxx +++ b/tests/cxx/tree/encoding/char/lcp/driver.cxx @@ -3,7 +3,7 @@ // copyright : Copyright (c) 2006-2009 Code Synthesis Tools CC // license : GNU GPL v2 + exceptions; see accompanying LICENSE file -// Test local code page encoding (XSD_USE_LCP defined). +// Test local code page encoding (--char-encoding lcp). 
// The test just makes sure it still compiles and works. // diff --git a/tests/cxx/tree/encoding/char/lcp/makefile b/tests/cxx/tree/encoding/char/lcp/makefile index 9520a42..324e331 100644 --- a/tests/cxx/tree/encoding/char/lcp/makefile +++ b/tests/cxx/tree/encoding/char/lcp/makefile @@ -27,14 +27,14 @@ $(call import,\ # $(driver): $(obj) $(xerces_c.l) -$(obj) $(dep): cpp_options := -I$(src_root)/libxsd -DXSD_USE_LCP +$(obj) $(dep): cpp_options := -I$(src_root)/libxsd $(obj) $(dep): $(xerces_c.l.cpp-options) genf := $(xsd:.xsd=.hxx) $(xsd:.xsd=.ixx) $(xsd:.xsd=.cxx) gen := $(addprefix $(out_base)/,$(genf)) $(gen): xsd := $(out_root)/xsd/xsd -$(gen): xsd_options := --generate-serialization +$(gen): xsd_options := --generate-serialization --char-encoding lcp $(gen): $(out_root)/xsd/xsd $(call include-dep,$(dep)) diff --git a/tests/cxx/tree/encoding/char/makefile b/tests/cxx/tree/encoding/char/makefile index 78b6e7a..ef25ad3 100644 --- a/tests/cxx/tree/encoding/char/makefile +++ b/tests/cxx/tree/encoding/char/makefile @@ -5,7 +5,7 @@ include $(dir $(lastword $(MAKEFILE_LIST)))../../../../../build/bootstrap.make -tests := lcp utf-8 +tests := lcp utf-8 iso-8859-1 default := $(out_base)/ test := $(out_base)/.test diff --git a/tests/cxx/tree/encoding/char/utf-8/makefile b/tests/cxx/tree/encoding/char/utf-8/makefile index 9fbbc7c..da5d7b4 100644 --- a/tests/cxx/tree/encoding/char/utf-8/makefile +++ b/tests/cxx/tree/encoding/char/utf-8/makefile @@ -1,4 +1,4 @@ -# file : tests/cxx/tree/encoding/char/lcp/makefile +# file : tests/cxx/tree/encoding/char/utf-8/makefile # author : Boris Kolpackov # copyright : Copyright (c) 2006-2009 Code Synthesis Tools CC # license : GNU GPL v2 + exceptions; see accompanying LICENSE file diff --git a/xsd/cxx/elements.cxx b/xsd/cxx/elements.cxx index fd23fc0..764d7da 100644 --- a/xsd/cxx/elements.cxx +++ b/xsd/cxx/elements.cxx @@ -8,7 +8,9 @@ #include #include // std::toupper +#include #include +#include #include using std::wcerr; @@ -111,7 
+113,9 @@ namespace CXX Context:: Context (std::wostream& o, SemanticGraph::Schema& root, + StringLiteralMap const* string_literal_map_, NarrowString const& char_type__, + NarrowString const& char_encoding__, Boolean include_with_brackets__, NarrowString const& include_prefix__, NarrowString const& esymbol, @@ -125,8 +129,10 @@ namespace CXX : os (o), schema_root (root), char_type (char_type_), + char_encoding (char_encoding_), L (L_), string_type (string_type_), + string_literal_map (string_literal_map_), include_with_brackets (include_with_brackets_), include_prefix (include_prefix_), type_exp (type_exp_), @@ -135,6 +141,7 @@ namespace CXX ns_mapping_cache (ns_mapping_cache_), xs_ns_ (0), char_type_ (char_type__), + char_encoding_ (char_encoding__), L_ (char_type == L"wchar_t" ? L"L" : L""), include_with_brackets_ (include_with_brackets__), include_prefix_ (include_prefix__), @@ -177,7 +184,7 @@ namespace CXX xs_ns_ = dynamic_cast (n); } - // + // String type. // if (char_type == L"char") string_type_ = L"::std::string"; @@ -186,6 +193,16 @@ namespace CXX else string_type_ = L"::std::basic_string< " + char_type + L" >"; + // Default encoding. + // + if (!char_encoding) + { + if (char_type == L"char") + char_encoding = L"utf8"; + else + char_encoding = L"auto"; + } + // Default mapping. // nsr_mapping_.push_back ( @@ -615,6 +632,121 @@ namespace CXX return r; } + String + strlit_ascii (String const& str) + { + String r; + Size n (str.size ()); + + // In most common cases we will have that many chars. + // + r.reserve (n + 2); + + r += '"'; + + Boolean escape (false); + + for (Size i (0); i < n; ++i) + { + UnsignedLong u (Context::unicode_char (str, i)); // May advance i. 
+ + // [128 - ] - unrepresentable + // 127 - \x7F + // [32 - 126] - as is + // [0 - 31] - \X or \xXX + // + + if (u < 32 || u == 127) + { + switch (u) + { + case L'\n': + { + r += L"\\n"; + break; + } + case L'\t': + { + r += L"\\t"; + break; + } + case L'\v': + { + r += L"\\v"; + break; + } + case L'\b': + { + r += L"\\b"; + break; + } + case L'\r': + { + r += L"\\r"; + break; + } + case L'\f': + { + r += L"\\f"; + break; + } + case L'\a': + { + r += L"\\a"; + break; + } + default: + { + r += charlit (u); + escape = true; + break; + } + } + } + else if (u < 127) + { + if (escape) + { + // Close and open the string so there are no clashes. + // + r += '"'; + r += '"'; + + escape = false; + } + + switch (u) + { + case L'"': + { + r += L"\\\""; + break; + } + case L'\\': + { + r += L"\\\\"; + break; + } + default: + { + r += static_cast (u); + break; + } + } + } + else + { + // Unrepresentable character. + // + throw UnrepresentableCharacter (str, i + 1); + } + } + + r += '"'; + + return r; + } + const UnsignedLong utf8_first_char_mask[5] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0 @@ -770,6 +902,126 @@ namespace CXX } String + strlit_iso8859_1 (String const& str) + { + String r; + Size n (str.size ()); + + // In most common cases we will have that many chars. + // + r.reserve (n + 2); + + r += '"'; + + Boolean escape (false); + + for (Size i (0); i < n; ++i) + { + UnsignedLong u (Context::unicode_char (str, i)); // May advance i. 
+ + // [256 - ] - unrepresentable + // [127 - 255] - \xXX + // [32 - 126] - as is + // [0 - 31] - \X or \xXX + // + + if (u < 32) + { + switch (u) + { + case L'\n': + { + r += L"\\n"; + break; + } + case L'\t': + { + r += L"\\t"; + break; + } + case L'\v': + { + r += L"\\v"; + break; + } + case L'\b': + { + r += L"\\b"; + break; + } + case L'\r': + { + r += L"\\r"; + break; + } + case L'\f': + { + r += L"\\f"; + break; + } + case L'\a': + { + r += L"\\a"; + break; + } + default: + { + r += charlit (u); + escape = true; + break; + } + } + } + else if (u < 127) + { + if (escape) + { + // Close and open the string so there are no clashes. + // + r += '"'; + r += '"'; + + escape = false; + } + + switch (u) + { + case L'"': + { + r += L"\\\""; + break; + } + case L'\\': + { + r += L"\\\\"; + break; + } + default: + { + r += static_cast (u); + break; + } + } + } + else if (u < 256) + { + r += charlit (u); + escape = true; + } + else + { + // Unrepresentable character. + // + throw UnrepresentableCharacter (str, i + 1); + } + } + + r += '"'; + + return r; + } + + String strlit_utf32 (String const& str) { String r; @@ -886,8 +1138,27 @@ namespace CXX String Context:: strlit (String const& str) { + // First see if we have a custom mapping. + // + assert (string_literal_map != 0); + StringLiteralMap::ConstIterator i (string_literal_map->find (str)); + + if (i != string_literal_map->end ()) + return i->second; + if (char_type == L"char") - return strlit_utf8 (str); + { + if (char_encoding == L"utf8") + return strlit_utf8 (str); + else if (char_encoding == L"iso8859-1") + return strlit_iso8859_1 (str); + else + { + // For LCP, custom, and other unknown encodings, use ASCII. 
+ // + return strlit_ascii (str); + } + } else return strlit_utf32 (str); } diff --git a/xsd/cxx/elements.hxx b/xsd/cxx/elements.hxx index 39eee77..3bbacd0 100644 --- a/xsd/cxx/elements.hxx +++ b/xsd/cxx/elements.hxx @@ -6,6 +6,8 @@ #ifndef CXX_ELEMENTS_HXX #define CXX_ELEMENTS_HXX +#include + #include #include #include @@ -17,8 +19,7 @@ #include #include - -#include +#include namespace CXX { @@ -36,6 +37,30 @@ namespace CXX // Exceptions. // + struct UnrepresentableCharacter + { + UnrepresentableCharacter (String const& str, Size pos) + : str_ (str), pos_ (pos) + { + } + + String const& + string () const + { + return str_; + } + + Size + position () const + { + return pos_; + } + + private: + String str_; + Size pos_; + }; + struct NoNamespaceMapping { NoNamespaceMapping (SemanticGraph::Path const& file, @@ -106,7 +131,6 @@ namespace CXX String reason_; }; - // // class Context @@ -124,7 +148,9 @@ namespace CXX public: Context (std::wostream& o, SemanticGraph::Schema& root, + StringLiteralMap const* custom_literals_map, NarrowString const& char_type__, + NarrowString const& char_encoding__, Boolean include_with_brackets__, NarrowString const& include_prefix__, NarrowString const& esymbol, @@ -141,8 +167,10 @@ namespace CXX : os (c.os), schema_root (c.schema_root), char_type (c.char_type), + char_encoding (c.char_encoding), L (c.L), string_type (c.string_type), + string_literal_map (c.string_literal_map), include_with_brackets (c.include_with_brackets), include_prefix (c.include_prefix), type_exp (c.type_exp), @@ -166,8 +194,10 @@ namespace CXX : os (o), schema_root (c.schema_root), char_type (c.char_type), + char_encoding (c.char_encoding), L (c.L), string_type (c.string_type), + string_literal_map (c.string_literal_map), include_with_brackets (c.include_with_brackets), include_prefix (c.include_prefix), type_exp (c.type_exp), @@ -309,8 +339,10 @@ namespace CXX SemanticGraph::Schema& schema_root; String& char_type; + String& char_encoding; String& L; // string 
literal prefix String& string_type; + StringLiteralMap const* string_literal_map; Boolean& include_with_brackets; String& include_prefix; @@ -326,6 +358,7 @@ namespace CXX SemanticGraph::Namespace* xs_ns_; String char_type_; + String char_encoding_; String L_; String string_type_; diff --git a/xsd/cxx/literal-map.cxx b/xsd/cxx/literal-map.cxx new file mode 100644 index 0000000..f3f7ee0 --- /dev/null +++ b/xsd/cxx/literal-map.cxx @@ -0,0 +1,296 @@ +// file : xsd/cxx/literal-map.cxx +// author : Boris Kolpackov +// copyright : Copyright (c) 2005-2009 Code Synthesis Tools CC +// license : GNU GPL v2 + exceptions; see accompanying LICENSE file + +#include // std::auto_ptr +#include // std::size_t +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include + +#include + +#include + +using namespace std; +using namespace xercesc; +namespace XML = XSDFrontend::XML; + +namespace CXX +{ + class Handler: public DefaultHandler + { + public: + struct Failed {}; + + Handler (String const& file, StringLiteralMap& map) + : state_ (s_init), file_ (file), map_ (map) + { + } + + virtual void + setDocumentLocator (const Locator* const l) + { + locator_ = l; + } + + virtual Void + startElement (const XMLCh* const, + const XMLCh* const lname, + const XMLCh* const, + const xercesc::Attributes&) + { + String n (XML::transcode (lname)); + + if (n == L"string-literal-map" && state_ == s_init) + state_ = s_map; + else if (n == L"entry" && state_ == s_map) + { + str_seen_ = false; + lit_seen_ = false; + state_ = s_entry; + } + else if (n == L"string" && state_ == s_entry) + { + str_seen_ = true; + str_.clear (); + state_ = s_string; + } + else if (n == L"literal" && state_ == s_entry) + { + lit_seen_ = true; + lit_.clear (); + state_ = s_literal; + } + else + { + wcerr << file_ << ":" << line () << ":" << col () << ": error: " + << "unexpected element '" << n << "'" << endl; + throw Failed (); + } + } + + virtual Void + endElement (const XMLCh* 
const, + const XMLCh* const lname, + const XMLCh* const) + { + String n (XML::transcode (lname)); + + if (n == L"string-literal-map") + state_ = s_init; + else if (n == L"entry") + { + if (!str_seen_) + { + wcerr << file_ << ":" << line () << ":" << col () << ": error: " + << "expected 'string' element" << endl; + throw Failed (); + } + + if (!lit_seen_) + { + wcerr << file_ << ":" << line () << ":" << col () << ": error: " + << "expected 'literal' element" << endl; + throw Failed (); + } + + map_[str_] = lit_; + state_ = s_map; + } + else if (n == L"string") + state_ = s_entry; + else if (n == L"literal") + state_ = s_entry; + } + +#if _XERCES_VERSION >= 30000 + virtual Void + characters (const XMLCh* const s, const XMLSize_t length) +#else + virtual Void + characters (const XMLCh* const s, const unsigned int length) +#endif + { + String str (XML::transcode (s, length)); + + if (state_ == s_string) + str_ += str; + else if (state_ == s_literal) + lit_ += str; + else + { + for (Size i (0); i < str.size (); ++i) + { + WideChar c (str[i]); + + if (c != 0x20 && c != 0x0A && c != 0x0D && c != 0x09) + { + wcerr << file_ << ":" << line () << ":" << col () << ": error: " + << "unexpected character data" << endl; + throw Failed (); + } + } + } + } + + // Error handling. + // + enum Severity {s_warning, s_error, s_fatal}; + + virtual Void + warning (const SAXParseException& e) + { + handle (e, s_warning); + } + + virtual Void + error (const SAXParseException& e) + { + handle (e, s_error); + } + + virtual Void + fatalError (const SAXParseException& e) + { + handle (e, s_fatal); + } + + virtual Void + resetErrors () + { + } + + Void + handle (const SAXParseException& e, Severity s) + { + wcerr << file_ << ":"; + +#if _XERCES_VERSION >= 30000 + wcerr << e.getLineNumber () << ":" << e.getColumnNumber () << ": "; +#else + XMLSSize_t l (e.getLineNumber ()); + XMLSSize_t c (e.getColumnNumber ()); + wcerr << (l == -1 ? 0 : l) << ":" << (c == -1 ?
0 : c) << ": "; +#endif + + String msg (XML::transcode (e.getMessage ())); + wcerr << (s == s_warning ? "warning: " : "error: ") << msg << endl; + + if (s != s_warning) + throw Failed (); + } + + size_t + line () const + { + size_t r (0); + + if (locator_ != 0) + { +#if _XERCES_VERSION >= 30000 + r = static_cast (locator_->getLineNumber ()); +#else + XMLSSize_t l (locator_->getLineNumber ()); + r = l == -1 ? 0 : static_cast (l); +#endif + } + + return r; + } + + size_t + col () const + { + size_t r (0); + + if (locator_ != 0) + { +#if _XERCES_VERSION >= 30000 + r = static_cast (locator_->getColumnNumber ()); +#else + XMLSSize_t c (locator_->getColumnNumber ()); + r = c == -1 ? 0 : static_cast (c); +#endif + } + + return r; + } + + private: + const Locator* locator_; + + enum + { + s_init, + s_map, + s_entry, + s_string, + s_literal + } state_; + + String file_; + StringLiteralMap& map_; + + Boolean str_seen_; + Boolean lit_seen_; + + String str_; + String lit_; + }; + + bool + read_literal_map (NarrowString const& file, StringLiteralMap& map) + { + try + { + // Try to open the file with fstream. This way we get to + // report the error in a consistent manner. 
+ // + { + ifstream ifs (file.c_str ()); + if (!ifs.is_open ()) + { + wcerr << file.c_str () << ": unable to open in read mode" << endl; + return false; + } + } + + String wfile (file); + + LocalFileInputSource is (XML::XMLChString (wfile).c_str ()); + Handler h (wfile, map); + + auto_ptr parser ( + XMLReaderFactory::createXMLReader ()); + + parser->setFeature (XMLUni::fgSAX2CoreNameSpaces, true); + parser->setFeature (XMLUni::fgSAX2CoreNameSpacePrefixes, true); + parser->setFeature (XMLUni::fgSAX2CoreValidation, false); + parser->setFeature (XMLUni::fgXercesSchema, false); + parser->setFeature (XMLUni::fgXercesSchemaFullChecking, false); + + parser->setErrorHandler (&h); + parser->setContentHandler (&h); + + parser->parse (is); + } + catch (Handler::Failed const&) + { + return false; + } + + return true; + } +} diff --git a/xsd/cxx/literal-map.hxx b/xsd/cxx/literal-map.hxx new file mode 100644 index 0000000..1120045 --- /dev/null +++ b/xsd/cxx/literal-map.hxx @@ -0,0 +1,23 @@ +// file : xsd/cxx/literal-map.hxx +// author : Boris Kolpackov +// copyright : Copyright (c) 2005-2009 Code Synthesis Tools CC +// license : GNU GPL v2 + exceptions; see accompanying LICENSE file + +#ifndef CXX_LITERAL_MAP_HXX +#define CXX_LITERAL_MAP_HXX + +#include +#include + +namespace CXX +{ + using namespace Cult; + typedef WideString String; + + typedef Cult::Containers::Map StringLiteralMap; + + bool + read_literal_map (NarrowString const& file, StringLiteralMap& map); +} + +#endif // CXX_LITERAL_MAP_HXX diff --git a/xsd/cxx/parser/cli.hxx b/xsd/cxx/parser/cli.hxx index 504de43..5f31af7 100644 --- a/xsd/cxx/parser/cli.hxx +++ b/xsd/cxx/parser/cli.hxx @@ -24,6 +24,7 @@ namespace CXX typedef Char const Key[]; extern Key type_map; + extern Key char_encoding; extern Key char_type; extern Key output_dir; extern Key xml_parser; @@ -85,6 +86,7 @@ namespace CXX typedef Cult::CLI::Options< type_map, Cult::Containers::Vector, char_type, NarrowString, + char_encoding, NarrowString, output_dir, 
NarrowString, xml_parser, NarrowString, generate_inline, Boolean, diff --git a/xsd/cxx/parser/elements.cxx b/xsd/cxx/parser/elements.cxx index 8a02ffb..09d1008 100644 --- a/xsd/cxx/parser/elements.cxx +++ b/xsd/cxx/parser/elements.cxx @@ -42,12 +42,15 @@ namespace CXX Context (std::wostream& o, SemanticGraph::Schema& root, CLI::Options const& ops, + StringLiteralMap const* map, Regex const* he, Regex const* ie, Regex const* hie) : CXX::Context (o, root, + map, ops.value (), + ops.value (), ops.value (), ops.value (), ops.value (), diff --git a/xsd/cxx/parser/elements.hxx b/xsd/cxx/parser/elements.hxx index 90ff84e..61cde69 100644 --- a/xsd/cxx/parser/elements.hxx +++ b/xsd/cxx/parser/elements.hxx @@ -39,6 +39,7 @@ namespace CXX Context (std::wostream&, SemanticGraph::Schema&, CLI::Options const&, + StringLiteralMap const*, Regex const* hxx_expr, Regex const* ixx_expr, Regex const* hxx_impl_expr); diff --git a/xsd/cxx/parser/generator.cxx b/xsd/cxx/parser/generator.cxx index 342e3f2..ec08af4 100644 --- a/xsd/cxx/parser/generator.cxx +++ b/xsd/cxx/parser/generator.cxx @@ -126,9 +126,9 @@ namespace CXX { namespace CLI { - extern Key char_type; extern Key type_map = "type-map"; extern Key char_type = "char-type"; + extern Key char_encoding = "char-encoding"; extern Key output_dir = "output-dir"; extern Key xml_parser = "xml-parser"; extern Key generate_inline = "generate-inline"; @@ -206,6 +206,14 @@ namespace CXX << " values are 'char' (default) and 'wchar_t'." << endl; + e << "--char-encoding " << endl + << " Specify the character encoding that should be used\n" + << " in the object model. Valid values for the 'char'\n" + << " character type are 'utf8' (default), 'iso8859-1',\n" + << " 'lcp', and 'custom'. For the 'wchar_t' character\n" + << " type the only valid value is 'auto'." + << endl; + e << "--output-dir " << endl << " Write generated files to instead of current\n" << " directory." @@ -471,6 +479,11 @@ namespace CXX // Misc. 
// + e << "--custom-literals " << endl + << " Load custom XML string to C++ literal mappings\n" + << " from ." + << endl; + e << "--export-symbol " << endl << " Export symbol for Win32 DLL export/import control." << endl; @@ -600,6 +613,7 @@ namespace CXX generate (Parser::CLI::Options const& ops, Schema& schema, Path const& file_path, + StringLiteralMap const& string_literal_map, Boolean gen_driver, const WarningSet& disabled_warnings, FileList& file_list, @@ -648,7 +662,7 @@ namespace CXX // { NameProcessor proc; - proc.process (ops, schema, file_path); + proc.process (ops, schema, file_path, string_literal_map); } Boolean validation ((ops.value () == "expat" || @@ -701,7 +715,7 @@ namespace CXX String xns; { - Context ctx (std::wcerr, schema, ops, 0, 0, 0); + Context ctx (std::wcerr, schema, ops, 0, 0, 0, 0); xns = ctx.xs_ns_name (); } @@ -1144,7 +1158,13 @@ namespace CXX // HXX // { - Context ctx (hxx, schema, ops, &hxx_expr, &ixx_expr, &hxx_impl_expr); + Context ctx (hxx, + schema, + ops, + &string_literal_map, + &hxx_expr, + &ixx_expr, + &hxx_impl_expr); Indentation::Clip hxx_sloc (hxx); @@ -1231,7 +1251,13 @@ namespace CXX // if (inline_) { - Context ctx (ixx, schema, ops, &hxx_expr, &ixx_expr, &hxx_impl_expr); + Context ctx (ixx, + schema, + ops, + &string_literal_map, + &hxx_expr, + &ixx_expr, + &hxx_impl_expr); Indentation::Clip ixx_sloc (ixx); @@ -1287,7 +1313,13 @@ namespace CXX // if (source) { - Context ctx (cxx, schema, ops, &hxx_expr, &ixx_expr, &hxx_impl_expr); + Context ctx (cxx, + schema, + ops, + &string_literal_map, + &hxx_expr, + &ixx_expr, + &hxx_impl_expr); Indentation::Clip cxx_sloc (cxx); @@ -1351,8 +1383,13 @@ namespace CXX // if (impl) { - Context ctx (hxx_impl, schema, ops, - &hxx_expr, &ixx_expr, &hxx_impl_expr); + Context ctx (hxx_impl, + schema, + ops, + &string_literal_map, + &hxx_expr, + &ixx_expr, + &hxx_impl_expr); String guard (guard_expr.merge (guard_prefix + hxx_impl_name)); guard = ctx.escape (guard); // Make it a C++ id. 
@@ -1380,8 +1417,13 @@ namespace CXX // if (impl) { - Context ctx (cxx_impl, schema, ops, - &hxx_expr, &ixx_expr, &hxx_impl_expr); + Context ctx (cxx_impl, + schema, + ops, + &string_literal_map, + &hxx_expr, + &ixx_expr, + &hxx_impl_expr); // Set auto-indentation. // @@ -1397,8 +1439,13 @@ namespace CXX // if (driver) { - Context ctx (cxx_driver, schema, ops, - &hxx_expr, &ixx_expr, &hxx_impl_expr); + Context ctx (cxx_driver, + schema, + ops, + &string_literal_map, + &hxx_expr, + &ixx_expr, + &hxx_impl_expr); // Set auto-indentation. // @@ -1412,6 +1459,17 @@ namespace CXX return sloc; } + catch (UnrepresentableCharacter const& e) + { + wcerr << "error: character at position " << e.position () << " " + << "in string '" << e.string () << "' is unrepresentable in " + << "the target encoding" << endl; + + wcerr << "info: use the --custom-literals option to provide custom " + << "string literals mapping" << endl; + + throw Failed (); + } catch (NoNamespaceMapping const& e) { wcerr << e.file () << ":" << e.line () << ":" << e.column () diff --git a/xsd/cxx/parser/generator.hxx b/xsd/cxx/parser/generator.hxx index aaab3b8..8c5631d 100644 --- a/xsd/cxx/parser/generator.hxx +++ b/xsd/cxx/parser/generator.hxx @@ -18,6 +18,7 @@ #include +#include #include namespace CXX @@ -41,6 +42,7 @@ namespace CXX generate (CLI::Options const& options, XSDFrontend::SemanticGraph::Schema&, XSDFrontend::SemanticGraph::Path const& file, + StringLiteralMap const&, Boolean gen_driver, const WarningSet& disabled_warnings, FileList& file_list, diff --git a/xsd/cxx/parser/name-processor.cxx b/xsd/cxx/parser/name-processor.cxx index e9ba876..5f9209e 100644 --- a/xsd/cxx/parser/name-processor.cxx +++ b/xsd/cxx/parser/name-processor.cxx @@ -3,7 +3,6 @@ // copyright : Copyright (c) 2006-2009 Code Synthesis Tools CC // license : GNU GPL v2 + exceptions; see accompanying LICENSE file -#include #include #include @@ -35,10 +34,13 @@ namespace CXX public: Context (CLI::Options const& ops, 
SemanticGraph::Schema& root, - SemanticGraph::Path const& file) + SemanticGraph::Path const& file, + StringLiteralMap const* map) : CXX::Context (std::wcerr, root, + map, ops.value (), + ops.value (), ops.value (), ops.value (), ops.value (), @@ -1101,9 +1103,10 @@ namespace CXX Void process_impl (CLI::Options const& ops, SemanticGraph::Schema& tu, - SemanticGraph::Path const& file) + SemanticGraph::Path const& file, + StringLiteralMap const& map) { - Context ctx (ops, tu, file); + Context ctx (ops, tu, file, &map); if (tu.names_begin ()->named ().name () == L"http://www.w3.org/2001/XMLSchema") @@ -1196,9 +1199,10 @@ namespace CXX Void NameProcessor:: process (CLI::Options const& ops, SemanticGraph::Schema& tu, - SemanticGraph::Path const& file) + SemanticGraph::Path const& file, + StringLiteralMap const& map) { - process_impl (ops, tu, file); + process_impl (ops, tu, file, map); } } } diff --git a/xsd/cxx/parser/name-processor.hxx b/xsd/cxx/parser/name-processor.hxx index f7849c8..fee7027 100644 --- a/xsd/cxx/parser/name-processor.hxx +++ b/xsd/cxx/parser/name-processor.hxx @@ -6,10 +6,9 @@ #ifndef CXX_PARSER_NAME_PROCESSOR_HXX #define CXX_PARSER_NAME_PROCESSOR_HXX -#include - #include +#include #include namespace CXX @@ -26,7 +25,8 @@ namespace CXX Void process (CLI::Options const& ops, XSDFrontend::SemanticGraph::Schema&, - XSDFrontend::SemanticGraph::Path const& file); + XSDFrontend::SemanticGraph::Path const& file, + StringLiteralMap const& map); }; } } diff --git a/xsd/cxx/parser/parser-header.cxx b/xsd/cxx/parser/parser-header.cxx index 878a891..8ecd898 100644 --- a/xsd/cxx/parser/parser-header.cxx +++ b/xsd/cxx/parser/parser-header.cxx @@ -1324,6 +1324,13 @@ namespace CXX } else { + if (ctx.char_type == L"char" && + ctx.xml_parser == L"xerces" && + ctx.char_encoding != L"custom") + { + ctx.os << "#include " << endl; + } + ctx.os << "#include " << endl << "#include " << endl << "#include " << endl diff --git a/xsd/cxx/parser/validator.cxx 
b/xsd/cxx/parser/validator.cxx index 526c941..9b5d967 100644 --- a/xsd/cxx/parser/validator.cxx +++ b/xsd/cxx/parser/validator.cxx @@ -27,7 +27,7 @@ namespace CXX CLI::Options const& options, const WarningSet& disabled_warnings, Boolean& valid_) - : Context (std::wcerr, root, options, 0, 0, 0), + : Context (std::wcerr, root, options, 0, 0, 0, 0), disabled_warnings_ (disabled_warnings), disabled_warnings_all_ (false), valid (valid_), @@ -584,7 +584,20 @@ namespace CXX if (options.value () == "expat" && options.value () == "wchar_t") { - wcerr << "error: using expat with wchar_t is not yet supported" + wcerr << "error: using expat with wchar_t is not supported" + << endl; + + return false; + } + + // + // + if (options.value () == "expat" && + !options.value ().empty () && + options.value () != "utf8") + { + wcerr << "error: using expat with character encoding other than " + << "utf8 is not supported" << endl; return false; diff --git a/xsd/cxx/tree/cli.hxx b/xsd/cxx/tree/cli.hxx index 9ccf405..c9078e7 100644 --- a/xsd/cxx/tree/cli.hxx +++ b/xsd/cxx/tree/cli.hxx @@ -24,6 +24,7 @@ namespace CXX typedef Char const Key[]; extern Key char_type; + extern Key char_encoding; extern Key output_dir; extern Key generate_polymorphic; extern Key generate_serialization; @@ -119,6 +120,7 @@ namespace CXX typedef Cult::CLI::Options< char_type, NarrowString, + char_encoding, NarrowString, output_dir, NarrowString, generate_polymorphic, Boolean, generate_serialization, Boolean, diff --git a/xsd/cxx/tree/counter.cxx b/xsd/cxx/tree/counter.cxx index d8223bb..a9649b5 100644 --- a/xsd/cxx/tree/counter.cxx +++ b/xsd/cxx/tree/counter.cxx @@ -239,7 +239,7 @@ namespace CXX count (CLI::Options const& options, SemanticGraph::Schema& tu) { Counts counts; - Context ctx (std::wcerr, tu, options, counts, false, 0, 0, 0); + Context ctx (std::wcerr, tu, options, counts, false, 0, 0, 0, 0); Traversal::Schema schema; Traversal::Sources sources; diff --git a/xsd/cxx/tree/elements.cxx 
b/xsd/cxx/tree/elements.cxx index db1d858..444caa4 100644 --- a/xsd/cxx/tree/elements.cxx +++ b/xsd/cxx/tree/elements.cxx @@ -39,12 +39,15 @@ namespace CXX CLI::Options const& ops, Counts const& counts_, Boolean generate_xml_schema__, + StringLiteralMap const* map, Regex const* fe, Regex const* he, Regex const* ie) : CXX::Context (o, root, + map, ops.value (), + ops.value (), ops.value (), ops.value (), ops.value (), diff --git a/xsd/cxx/tree/elements.hxx b/xsd/cxx/tree/elements.hxx index 602291d..a0cb1d9 100644 --- a/xsd/cxx/tree/elements.hxx +++ b/xsd/cxx/tree/elements.hxx @@ -117,6 +117,7 @@ namespace CXX CLI::Options const& ops, Counts const& counts_, Boolean generate_xml_schema, + StringLiteralMap const*, Regex const* fwd_expr, Regex const* hxx_expr, Regex const* ixx_expr); diff --git a/xsd/cxx/tree/generator.cxx b/xsd/cxx/tree/generator.cxx index f9b055e..b81504c 100644 --- a/xsd/cxx/tree/generator.cxx +++ b/xsd/cxx/tree/generator.cxx @@ -116,6 +116,7 @@ namespace CXX namespace CLI { extern Key char_type = "char-type"; + extern Key char_encoding = "char-encoding"; extern Key output_dir = "output-dir"; extern Key generate_polymorphic = "generate-polymorphic"; extern Key generate_serialization = "generate-serialization"; @@ -220,12 +221,19 @@ namespace CXX << " values are 'char' (default) and 'wchar_t'." << endl; + e << "--char-encoding " << endl + << " Specify the character encoding that should be used\n" + << " in the object model. Valid values for the 'char'\n" + << " character type are 'utf8' (default), 'iso8859-1',\n" + << " 'lcp', and 'custom'. For the 'wchar_t' character\n" + << " type the only valid value is 'auto'." + << endl; + e << "--output-dir " << endl << " Write generated files to instead of current\n" << " directory." << endl; - e << "--generate-polymorphic" << endl << " Generate polymorphism-aware code. Specify this\n" << " option if you use substitution groups or xsi:type." 
@@ -670,6 +678,11 @@ namespace CXX << " separate the file name from the part number." << endl; + e << "--custom-literals " << endl + << " Load custom XML string to C++ literal mappings\n" + << " from ." + << endl; + e << "--export-symbol " << endl << " Export symbol for Win32 DLL export/import control." << endl; @@ -803,6 +816,7 @@ namespace CXX generate (Tree::CLI::Options const& ops, Schema& schema, Path const& file_path, + StringLiteralMap const& string_literal_map, const WarningSet& disabled_warnings, FileList& file_list, AutoUnlinks& unlinks) @@ -860,7 +874,7 @@ namespace CXX // { NameProcessor proc; - if (!proc.process (ops, schema, file_path)) + if (!proc.process (ops, schema, file_path, string_literal_map)) throw Failed (); } @@ -1179,8 +1193,15 @@ namespace CXX // if (forward) { - Context ctx (fwd, schema, ops, counts, generate_xml_schema, - &fwd_expr, &hxx_expr, &ixx_expr); + Context ctx (fwd, + schema, + ops, + counts, + generate_xml_schema, + &string_literal_map, + &fwd_expr, + &hxx_expr, + &ixx_expr); Indentation::Clip fwd_sloc (fwd); @@ -1287,8 +1308,15 @@ namespace CXX // HXX // { - Context ctx (hxx, schema, ops, counts, generate_xml_schema, - &fwd_expr, &hxx_expr, &ixx_expr); + Context ctx (hxx, + schema, + ops, + counts, + generate_xml_schema, + &string_literal_map, + &fwd_expr, + &hxx_expr, + &ixx_expr); Indentation::Clip hxx_sloc (hxx); @@ -1434,8 +1462,15 @@ namespace CXX // if (inline_) { - Context ctx (ixx, schema, ops, counts, generate_xml_schema, - &fwd_expr, &hxx_expr, &ixx_expr); + Context ctx (ixx, + schema, + ops, + counts, + generate_xml_schema, + &string_literal_map, + &fwd_expr, + &hxx_expr, + &ixx_expr); Indentation::Clip ixx_sloc (ixx); @@ -1560,8 +1595,15 @@ namespace CXX WideOutputFileStream& os (*cxx[part]); - Context ctx (os, schema, ops, counts, generate_xml_schema, - &fwd_expr, &hxx_expr, &ixx_expr); + Context ctx (os, + schema, + ops, + counts, + generate_xml_schema, + &string_literal_map, + &fwd_expr, + &hxx_expr, + 
&ixx_expr); Indentation::Clip cxx_sloc (os); @@ -1644,6 +1686,17 @@ namespace CXX return sloc; } + catch (UnrepresentableCharacter const& e) + { + wcerr << "error: character at position " << e.position () << " " + << "in string '" << e.string () << "' is unrepresentable in " + << "the target encoding" << endl; + + wcerr << "info: use the --custom-literals option to provide custom " + << "string literals mapping" << endl; + + throw Failed (); + } catch (NoNamespaceMapping const& e) { wcerr << e.file () << ":" << e.line () << ":" << e.column () diff --git a/xsd/cxx/tree/generator.hxx b/xsd/cxx/tree/generator.hxx index 1aa3c60..a66ede0 100644 --- a/xsd/cxx/tree/generator.hxx +++ b/xsd/cxx/tree/generator.hxx @@ -13,6 +13,7 @@ #include +#include #include namespace CXX @@ -36,6 +37,7 @@ namespace CXX generate (CLI::Options const& options, XSDFrontend::SemanticGraph::Schema&, XSDFrontend::SemanticGraph::Path const& file, + StringLiteralMap const&, const WarningSet& disabled_warnings, FileList& file_list, AutoUnlinks& unlinks); diff --git a/xsd/cxx/tree/name-processor.cxx b/xsd/cxx/tree/name-processor.cxx index 53027af..e15b072 100644 --- a/xsd/cxx/tree/name-processor.cxx +++ b/xsd/cxx/tree/name-processor.cxx @@ -4,7 +4,6 @@ // license : GNU GPL v2 + exceptions; see accompanying LICENSE file #include -#include #include @@ -43,12 +42,14 @@ namespace CXX Counts const& counts, Boolean generate_xml_schema, SemanticGraph::Schema& root, - SemanticGraph::Path const& file) + SemanticGraph::Path const& file, + StringLiteralMap const& map) : Tree::Context (std::wcerr, root, options, counts, generate_xml_schema, + &map, 0, 0, 0), @@ -1970,12 +1971,13 @@ namespace CXX Boolean process_impl (CLI::Options const& ops, SemanticGraph::Schema& tu, - SemanticGraph::Path const& file) + SemanticGraph::Path const& file, + StringLiteralMap const& map) { try { Counts counts; - Context ctx (ops, counts, false, tu, file); + Context ctx (ops, counts, false, tu, file, map); if (tu.names_begin 
()->named ().name () == L"http://www.w3.org/2001/XMLSchema") @@ -2096,9 +2098,10 @@ namespace CXX Boolean NameProcessor:: process (CLI::Options const& ops, SemanticGraph::Schema& tu, - SemanticGraph::Path const& file) + SemanticGraph::Path const& file, + StringLiteralMap const& map) { - return process_impl (ops, tu, file); + return process_impl (ops, tu, file, map); } } } diff --git a/xsd/cxx/tree/name-processor.hxx b/xsd/cxx/tree/name-processor.hxx index 9b8eac9..18c3b82 100644 --- a/xsd/cxx/tree/name-processor.hxx +++ b/xsd/cxx/tree/name-processor.hxx @@ -6,11 +6,7 @@ #ifndef CXX_TREE_NAME_PROCESSOR_HXX #define CXX_TREE_NAME_PROCESSOR_HXX -#include - -#include - -#include +#include namespace CXX { @@ -26,7 +22,8 @@ namespace CXX Boolean process (CLI::Options const&, XSDFrontend::SemanticGraph::Schema&, - XSDFrontend::SemanticGraph::Path const& file); + XSDFrontend::SemanticGraph::Path const& file, + StringLiteralMap const&); }; } } diff --git a/xsd/cxx/tree/tree-forward.cxx b/xsd/cxx/tree/tree-forward.cxx index cceedb7..02c4317 100644 --- a/xsd/cxx/tree/tree-forward.cxx +++ b/xsd/cxx/tree/tree-forward.cxx @@ -152,6 +152,13 @@ namespace CXX } else { + if (ctx.char_type == L"char" && ctx.char_encoding != L"custom") + { + ctx.os << "#include " << endl + << endl; + } + ctx.os << "#include " << endl << "#include " << endl << "#include " << endl diff --git a/xsd/cxx/tree/tree-header.cxx b/xsd/cxx/tree/tree-header.cxx index 7bb630c..9b39739 100644 --- a/xsd/cxx/tree/tree-header.cxx +++ b/xsd/cxx/tree/tree-header.cxx @@ -3539,6 +3539,13 @@ namespace CXX { if (ctx.generate_xml_schema) { + if (ctx.char_type == L"char" && ctx.char_encoding != L"custom") + { + ctx.os << "#include " << endl + << endl; + } + ctx.os << "#include " << endl << "#include " << endl << "#include " << endl @@ -3671,6 +3678,13 @@ namespace CXX << "#include // std::binary_search" << endl << endl; + if (ctx.char_type == L"char" && ctx.char_encoding != L"custom") + { + ctx.os << "#include " << endl + << 
endl; + } + ctx.os << "#include " << endl << "#include " << endl << "#include " << endl diff --git a/xsd/cxx/tree/validator.cxx b/xsd/cxx/tree/validator.cxx index 7ef23fa..5742e7a 100644 --- a/xsd/cxx/tree/validator.cxx +++ b/xsd/cxx/tree/validator.cxx @@ -38,6 +38,7 @@ namespace CXX generate_xml_schema, 0, 0, + 0, 0), disabled_warnings_ (disabled_warnings), disabled_warnings_all_ (false), diff --git a/xsd/elements.hxx b/xsd/elements.hxx index bfde527..3948479 100644 --- a/xsd/elements.hxx +++ b/xsd/elements.hxx @@ -132,4 +132,3 @@ private: }; #endif // ELEMENTS_HXX - diff --git a/xsd/makefile b/xsd/makefile index e58b9dd..12990a6 100644 --- a/xsd/makefile +++ b/xsd/makefile @@ -7,7 +7,8 @@ include $(dir $(lastword $(MAKEFILE_LIST)))../build/bootstrap.make cxx_tun := xsd.cxx -cxx_tun += cxx/elements.cxx +cxx_tun += cxx/elements.cxx \ + cxx/literal-map.cxx cxx_tun += cxx/parser/elements.cxx \ cxx/parser/validator.cxx \ @@ -88,17 +89,22 @@ $(call import,\ $(scf_root)/import/libxsd-frontend/stub.make,\ l: xsd_fe.l,cpp-options: xsd_fe.l.cpp-options) +$(call import,\ + $(scf_root)/import/libxerces-c/stub.make,\ + l: xerces_c.l,cpp-options: xerces_c.l.cpp-options) + # Build. 
# $(xsd): $(cxx_obj) $(xsd_fe.l) $(be.l) $(cult.l) $(fs.l) $(re.l) $(xerces_c.l) $(cxx_obj) $(cxx_od): cpp_options := -I$(src_base) -$(cxx_obj) $(cxx_od): \ - $(xsd_fe.l.cpp-options) \ - $(be.l.cpp-options) \ - $(cult.l.cpp-options) \ - $(fs.l.cpp-options) \ - $(re.l.cpp-options) +$(cxx_obj) $(cxx_od): \ + $(xsd_fe.l.cpp-options) \ + $(be.l.cpp-options) \ + $(cult.l.cpp-options) \ + $(fs.l.cpp-options) \ + $(re.l.cpp-options) \ + $(xerces_c.l.cpp-options) $(call include-dep,$(cxx_od)) diff --git a/xsd/xsd.cxx b/xsd/xsd.cxx index 7aa18e6..2a67ae9 100644 --- a/xsd/xsd.cxx +++ b/xsd/xsd.cxx @@ -34,6 +34,8 @@ #include #include +#include + #include #include @@ -79,6 +81,7 @@ namespace CLI extern Key location_map = "location-map"; extern Key location_regex = "location-regex"; extern Key location_regex_trace = "location-regex-trace"; + extern Key custom_literals = "custom-literals"; extern Key file_per_type = "file-per-type"; extern Key type_file_regex = "type-file-regex"; extern Key type_file_regex_trace = "type-file-regex-trace"; @@ -101,6 +104,7 @@ namespace CLI location_map, NarrowStrings, location_regex, NarrowStrings, location_regex_trace, Boolean, + custom_literals, NarrowString, file_per_type, Boolean, type_file_regex, NarrowStrings, type_file_regex_trace, Boolean, @@ -188,11 +192,27 @@ private: Boolean trace_; }; +// +// +struct XercesInitializer +{ + XercesInitializer () + { + xercesc::XMLPlatformUtils::Initialize (); + } + + ~XercesInitializer () + { + xercesc::XMLPlatformUtils::Terminate (); + } +}; + // Expand the \n escape sequence. // Void expand_nl (NarrowString& s); + Int main (Int argc, Char* argv[]) { @@ -557,6 +577,22 @@ main (Int argc, Char* argv[]) common_ops.value (), common_ops.value ()); + // Load custom string literals, if any. 
+ // + CXX::StringLiteralMap string_literal_map; + + if (NarrowString file = common_ops.value ()) + { + XercesInitializer xerces_init; + + if (!CXX::read_literal_map (file, string_literal_map)) + { + // Diagnostics has already been issued. + // + return 1; + } + } + if (!fpt) { // File-per-schema compilation mode. @@ -703,7 +739,13 @@ main (Int argc, Char* argv[]) try { sloc += CXX::Tree::Generator::generate ( - *tree_ops, *schema, tu, disabled_w, file_list, unlinks); + *tree_ops, + *schema, + tu, + string_literal_map, + disabled_w, + file_list, + unlinks); } catch (CXX::Tree::Generator::Failed const&) { @@ -717,7 +759,14 @@ main (Int argc, Char* argv[]) try { sloc += CXX::Parser::Generator::generate ( - *parser_ops, *schema, tu, true, disabled_w, file_list, unlinks); + *parser_ops, + *schema, + tu, + string_literal_map, + true, + disabled_w, + file_list, + unlinks); } catch (CXX::Parser::Generator::Failed const&) { @@ -837,7 +886,13 @@ main (Int argc, Char* argv[]) try { sloc += CXX::Tree::Generator::generate ( - *tree_ops, s, path, disabled_w, file_list, unlinks); + *tree_ops, + s, + path, + string_literal_map, + disabled_w, + file_list, + unlinks); } catch (CXX::Tree::Generator::Failed const&) { @@ -853,7 +908,14 @@ main (Int argc, Char* argv[]) // Only generate driver for the first schema. // sloc += CXX::Parser::Generator::generate ( - *parser_ops, s, path, i == b, disabled_w, file_list, unlinks); + *parser_ops, + s, + path, + string_literal_map, + i == b, + disabled_w, + file_list, + unlinks); } catch (CXX::Parser::Generator::Failed const&) { -- cgit v1.1