From 99b98c43b71501854ed930fb1ec5bcebc7cf57a5 Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Mon, 12 May 2014 15:53:21 -0700 Subject: Add introduction documentation --- doc/Makefile.am | 5 + doc/default.css | 323 ++++++++++ doc/intro.xhtml | 1762 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ doc/makefile | 18 + 4 files changed, 2108 insertions(+) create mode 100644 doc/Makefile.am create mode 100644 doc/default.css create mode 100644 doc/intro.xhtml create mode 100644 doc/makefile (limited to 'doc') diff --git a/doc/Makefile.am b/doc/Makefile.am new file mode 100644 index 0000000..88c346e --- /dev/null +++ b/doc/Makefile.am @@ -0,0 +1,5 @@ +# file : doc/Makefile.am +# copyright : Copyright (c) 2013-2014 Code Synthesis Tools CC +# license : MIT; see accompanying LICENSE file + +dist_html_DATA = __file__(html_docs) diff --git a/doc/default.css b/doc/default.css new file mode 100644 index 0000000..889f46b --- /dev/null +++ b/doc/default.css @@ -0,0 +1,323 @@ +html { + margin : 0; + padding : 0; + background : white; +} + +body { + font-family : "Lucida Grande", Verdana, "Bitstream Vera Sans", sans-serif; + font-weight : normal; + font-size : 13px; + line-height : 19px; + + color : black; + + margin : 0 2em 0 2em; + padding : 0; +} + + +body { + min-width: 40em; +} + +#container { + max-width : 46em; + margin : 0 auto; + padding : 0 1em 0 1em; +} + + + +/* + * Footer + * + */ +#footer { + color : #3a84a7; + + padding : 1em 0 0.5em 0; + + font-size : 10px; + line-height : 15px; + + text-align: center; +} + +#footer a:link, #footer a:visited { + + color:#1d6699; + text-decoration: underline; +} + +#footer a { + margin-left: 0.7em; + margin-right: 0.7em; +} + +#footer p { + padding: 0; + margin: 0.3em 0 0 0; +} + +/* Distribution terms. 
*/ +#footer #terms { + text-align: justify; + + font-size : 110%; + font-family : monospace; + + padding : 1em 0 0.5em 0; +} + + +/* + * Content + * + */ + +#content { + padding : 0em 0.1em 0 1.3em; + margin : 1.4em 0 0 0; +} + +#content p, +#content ol, +#content ul, +#content dl { + text-align: justify; +} + +#content h1 { + margin-left: -0.89em; +} + +a:link { + color:#0536d2; +} + + +/* + * Headings + * + */ + +h1, h2, h3, h4, h5, h6 { + font-weight : 500; +} + +h1 { font-size : 155%; } +h2 { font-size : 130%; } +h3 { font-size : 125%; } +h4 { font-size : 110%; } +h5 { font-size : 106%; } +h6 { font-size : 100%; } + +h1 { margin : 1.8em 0 0.8em 0;} +h2 { margin-top : 1.4em;} +h3 { margin-top : 1em;} + +p.indent { + margin-left : 1.5em; +} + + +/* + * Fix for IE 5.5 table font problem + * + */ + +table { + font-size : 13px; +} + + +/* + * table of content + * + */ + +ul.toc li { + padding : .4em 0em 0em 0em; +} + + +/* Toc links don't need to show when they are visited. */ +.toc a:visited { + color:#0536d2; +} + + +/* + * lists + * + */ + + +/* list of links */ +ul.menu { + list-style-type : none; +} + +ul.menu li { + padding-top : 0.3em; + padding-bottom : 0.3em; +} + + + +/* @@ I should probably use child selector here */ +/* list with multiline list-elements */ +ul.multiline li, ol.multiline li, dl.multiline dd { + padding-top : 0.16em; + padding-bottom : 0.16em; + + font-size : 11px; + line-height : 15px; +} + + + +/* C++ code snippet */ +pre.cxx { + margin-top : 0em; + margin-bottom : 2em; + + margin-left : 1em; +} + +/* SQL code snippet */ +pre.sql { + margin-top : 0em; + margin-bottom : 2em; + + margin-left : 1em; +} + +/* make code snippet */ +pre.make { + margin-top : 0em; + margin-bottom : 2em; + + margin-left : 1em; +} + +/* terminal output */ +pre.term { + margin-top : 0em; + margin-bottom : 2em; + + margin-left : 1em; +} + + +/* Images */ +div.center { + text-align: center; +} + +/* Document info. 
*/ +#docinfo { + margin-top: 4em; + border-top: 1px dashed #000000; + font-size: 70%; +} + + +/* Footnote */ + +#footnote { + margin-top : 2.5em; +} + +#footnote hr, hr.footnote { + margin-left: 0; + margin-bottom: 0.6em; + width: 8em; + border-top: 1px solid #000000; + border-right: none; + border-bottom: none; + border-left: none; + +} + +#footnote ol { + margin-left: 0; + padding-left: 1.45em; +} + +#footnote li { + text-align : left; + font-size : 11px; + line-height : 15px; + + padding : .4em 0 .4em 0; +} + + +/* Normal table with borders, etc. */ + +table.std { + margin: 2em 0 2em 0; + + border-collapse : collapse; + border : 1px solid; + border-color : #000000; + + font-size : 11px; + line-height : 14px; +} + +table.std th, table.std td { + border : 1px solid; + padding : 0.6em 0.8em 0.6em 0.8em; +} + +table.std th { + background : #cde8f6; +} + +table.std td { + text-align: left; +} + + +/* + * "item | description" table. + * + */ + +table.description { + border-style : none; + border-collapse : separate; + border-spacing : 0; + + font-size : 13px; + + margin : 0.6em 0 0.6em 0; + padding : 0 0 0 0; +} + +table.description tr { + padding : 0 0 0 0; + margin : 0 0 0 0; +} + +table.description * td, table.description * th { + border-style : none; + margin : 0 0 0 0; + vertical-align : top; +} + +table.description * th { + font-weight : normal; + padding : 0.4em 1em 0.4em 0; + text-align : left; + white-space : nowrap; + background : none; +} + +table.description * td { + padding : 0.4em 0 0.4em 1em; + text-align : justify; +} diff --git a/doc/intro.xhtml b/doc/intro.xhtml new file mode 100644 index 0000000..930736b --- /dev/null +++ b/doc/intro.xhtml @@ -0,0 +1,1762 @@ + + + + + + XML Parsing and Serialization in C++ with libstudxml + + + + + + + + + + + + + + + +
+
+ +
+ +
+
XML Parsing and Serialization in C++
+
With libstudxml
+ +

Copyright © 2013-2014 Code Synthesis Tools CC. Permission is + granted to copy, distribute and/or modify this document under the + terms of the MIT license.

+ + +

Revision 1.0, May 2014

+

This revision of the document describes libstudxml 1.0.0.

+
+ +
+

Table of Contents

+ + + + + + + + + + + + + + + + + + + + + + + +
About This Document
1Terminology
2Low-Level API
3High-Level API
4Object Persistence
5Inheritance
6Implementation Notes
+
+ +
+

About This Document

+ +

This document is based on the talk given by Boris Kolpackov at + the C++Now 2014 conference where libstudxml was + first made publicly available. Its goal is to introduce a new, + modern C++ API for XML by showing how to handle the most common + use cases. Compared to the talk, this introduction omits some of + the broader discussion of XML and its handling + in C++. It also provides more complete code examples that would not + fit onto slides during the presentation. If, however, you would + like to get a more complete picture of the "state of XML in C++", then + you may prefer to first watch the video of the talk (when it becomes + available).

+ +

While this document uses some C++11 features in examples, the + library itself can be used in C++98 applications.

+ +

Terminology

+ +

Before we begin, let's define a few terms to make sure we are on + the same page.

+ +

When we say "XML format" that is a bit loose. XML is actually + a meta-format that we specialize for our needs. That is, we decide + what element and attribute names we will use. Which elements will + be valid where. What they will mean, and so on. This specialization + of XML to a specific format is called an XML Vocabulary.

+ +

Often, but not always, when we parse XML, we store extracted data + in the application's memory. Usually, we would create classes + specific to our XML vocabulary. For example, if we have an element + called person then we may create a C++ class also + called person. We will call such classes an + Object Model.

+ +

The content of an element in XML can be empty, text, nested + elements, or a mixture of the two:

+ +
+<empty name="a" id="1"/>
+
+<simple name="b" id="2">text</simple>
+
+<complex name="c" id="3">
+  <nested>...</nested>
+  <nested>...</nested>
+</complex>
+
+<mixed name="d" id="4">
+  te<nested>...</nested>
+  x
+  <nested>...</nested>t
+</mixed>
+  
+ +

These are called the empty, simple, + complex, and mixed content models, + respectively.

+ +

Low-Level API

+ +

libstudxml provides the streaming XML pull parser and + streaming XML serializer. The parser is a conforming, non-validating + XML 1.0 implementation (see Implementation Notes + for details). The application character encoding (that is, the + encoding used in the application's memory) for both parser and + serializer is UTF-8. The output encoding of the serializer is + UTF-8 as well. The parser supports UTF-8, UTF-16, ISO-8859-1, + and US-ASCII input encodings.

+ +
+#include <xml/parser.hxx>
+
+namespace xml
+{
+  class parser;
+}
+  
+ +
+#include <xml/serializer.hxx>
+
+namespace xml
+{
+  class serializer;
+}
+  
+ +

C++ is often used to implement XML converters and filters, especially + where speed is a concern. Such applications require the lowest-level + API with minimum overhead. So we will start there (see the + roundtrip example in the libstudxml + distribution).

+ +
+class parser
+{
+  typedef unsigned short feature_type;
+
+  static const feature_type receive_elements;
+  static const feature_type receive_characters;
+  static const feature_type receive_attributes;
+  static const feature_type receive_namespace_decls;
+
+  static const feature_type receive_default =
+    receive_elements |
+    receive_characters |
+    receive_attributes;
+
+  parser (std::istream&,
+          const std::string& input_name,
+          feature_type = receive_default);
+  ...
+};
+  
+ +

The parser constructor takes three arguments: the stream to parse, + input name that is used in diagnostics to identify the document + being parsed, and the list of events we want the parser to report.

+ +

As an example of an XML filter, let's write one that removes a + specific attribute from the document, say id. The + first step in our filter would then be to create the parser + instance:

+ +
+int main (int argc, char* argv[])
+{
+  ...
+
+  try
+  {
+    using namespace xml;
+
+    ifstream ifs (argv[1]);
+    parser p (ifs, argv[1]);
+
+    ...
+  }
+  catch (const xml::parsing& e)
+  {
+    cerr << e.what () << endl;
+    return 1;
+  }
+}
+  
+ +

Here we also see how to handle parsing errors. So far so good. + Let's see the next piece of the API.

+ +
+class parser
+{
+  enum event_type
+  {
+    start_element,
+    end_element,
+    start_attribute,
+    end_attribute,
+    characters,
+    start_namespace_decl,
+    end_namespace_decl,
+    eof
+  };
+
+  event_type next ();
+};
+  
+ +

We call the next() function when we are ready to handle + the next piece of XML. And now we can implement our filter a bit + further:

+ +
+parser p (ifs, argv[1]);
+
+for (parser::event_type e (p.next ());
+     e != parser::eof;
+     e = p.next ())
+{
+  switch (e)
+  {
+  case parser::start_element:
+    ...
+  case parser::end_element:
+    ...
+  case parser::start_attribute:
+    ...
+  case parser::end_attribute:
+    ...
+  case parser::characters:
+    ...
+  }
+}
+  
+ +

In C++11 we can use the range-based for loop to tidy + things up a bit:

+ +
+parser p (ifs, argv[1]);
+
+for (parser::event_type e: p)
+{
+  switch (e)
+  {
+    ...
+  }
+}
+  
+ +

The next piece of the API puzzle:

+ +
+class parser
+{
+  const std::string& name () const;
+  const std::string& value () const;
+
+  unsigned long long line () const;
+  unsigned long long column () const;
+};
+  
+ +

The name() accessor returns the name of the current element + or attribute. The value() function returns the text of the + characters event for an element or attribute. The line() and + column() accessors return the current position in the document. + Here is how we could print all the element positions for debugging:

+ +
+switch (e)
+{
+case parser::start_element:
+  cerr << p.line () << ':' << p.column () << ": start "
+       << p.name () << endl;
+  break;
+case parser::end_element:
+  cerr << p.line () << ':' << p.column () << ": end "
+       << p.name () << endl;
+  break;
+}
+  
+ +

We have now seen enough of the parsing side to complete our filter. + What's missing is the serialization. So let's switch to that for a + moment:

+ +
+class serializer
+{
+  serializer (std::ostream&,
+              const std::string& output_name,
+              unsigned short indentation = 2);
+
+  ...
+};
+  
+ +

The constructor is pretty similar to the parser's. The + indentation argument specifies the number of indentation + spaces that should be used for pretty-printing. We can disable it by + passing 0.

+ +

Now we can create the serializer instance for our filter:

+ +
+int main (int argc, char* argv[])
+{
+  ...
+
+  try
+  {
+    using namespace xml;
+
+    ifstream ifs (argv[1]);
+    parser p (ifs, argv[1]);
+    serializer s (cout, "output", 0);
+
+    ...
+  }
+  catch (const xml::parsing& e)
+  {
+    cerr << e.what () << endl;
+    return 1;
+  }
+  catch (const xml::serialization& e)
+  {
+    cerr << e.what () << endl;
+    return 1;
+  }
+}
+  
+ +

Notice that we have also added an exception handler for the + serialization exception. Instead of handling + the parsing and serialization + exceptions separately, we can catch just + xml::exception, which is a common base for the + other two:

+ +
+int main (int argc, char* argv[])
+{
+  try
+  {
+    ...
+  }
+  catch (const xml::exception& e)
+  {
+    cerr << e.what () << endl;
+    return 1;
+  }
+}
+  
+ +

The next chunk of the serializer API:

+ +
+class serializer
+{
+  void start_element (const std::string& name);
+  void end_element ();
+
+  void start_attribute (const std::string& name);
+  void end_attribute ();
+
+  void characters (const std::string& value);
+};
+  
+ +

Everything should be pretty self-explanatory here. And we have + now seen enough to finish our filter:

+ +
+parser p (ifs, argv[1]);
+serializer s (cout, "output", 0);
+
+bool skip (false);
+
+for (parser::event_type e: p)
+{
+  switch (e)
+  {
+  case parser::start_element:
+    {
+      s.start_element (p.name ());
+      break;
+    }
+  case parser::end_element:
+    {
+      s.end_element ();
+      break;
+    }
+  case parser::start_attribute:
+    {
+      if (p.name () == "id")
+        skip = true;
+      else
+        s.start_attribute (p.name ());
+      break;
+    }
+  case parser::end_attribute:
+    {
+      if (skip)
+        skip = false;
+      else
+        s.end_attribute ();
+      break;
+    }
+  case parser::characters:
+    {
+      if (!skip)
+        s.characters (p.value ());
+      break;
+    }
+  }
+}
+  
+ +

Do you see any problems with our filter? Well, one problem is + that this implementation doesn't handle XML namespaces. Let's + see how we can fix this. The first issue is with the element + and attribute names. When namespaces are used, those may be + qualified. libstudxml uses the qname + class to represent such names:

+ +
+#include <xml/qname.hxx>
+
+namespace xml
+{
+  class qname
+  {
+  public:
+    qname ();
+    qname (const std::string& name);
+    qname (const std::string& namespace_,
+           const std::string& name);
+
+    const std::string& namespace_ () const;
+    const std::string& name () const;
+  };
+}
+  
+ +

The parser, in addition to the name() accessor also + has qname() which returns the potentially qualified + name. Similarly, the start_element() and + start_attribute() functions in the serializer are + overloaded to accept qname:

+ +
+class parser
+{
+  const qname& qname () const;
+};
+
+class serializer
+{
+  void start_element (const qname&);
+  void start_attribute (const qname&);
+};
+  
+ +

The first thing we need to do to make our filter namespace-aware + is to use qualified names instead of the local ones. This one is + easy:

+ +
+switch (e)
+{
+case parser::start_element:
+  {
+    s.start_element (p.qname ());
+    break;
+  }
+case parser::start_attribute:
+  {
+    if (p.qname () == "id") // Unqualified name.
+      skip = true;
+    else
+      s.start_attribute (p.qname ());
+    break;
+  }
+}
+  
+ + +

There is, however, another thing that we have to do. Right now our + code does not propagate the namespace-prefix mappings from the input + document to the output. At the moment, where the input XML might have + meaningful prefixes assigned to namespaces, the output will have + automatically generated ones like g1, g2, + and so on.

+ +

To fix this, first we need to tell the parser to report namespace-prefix + mappings, called namespace declarations in XML, to us:

+ +
+parser p (ifs,
+          argv[1],
+          parser::receive_default |
+          parser::receive_namespace_decls);
+  
+ +

We then also need to propagate this information to the serializer by + handling the start_namespace_decl event:

+ +
+for (...)
+{
+  switch (e)
+  {
+    ...
+
+  case parser::start_namespace_decl:
+    s.namespace_decl (p.namespace_ (), p.prefix ());
+    break;
+
+    ...
+  }
+}
+  
+ +

Well, that wasn't too bad.

+ +

High-Level API

+ +

So that was pretty low-level XML work where we didn't care about + the semantics of the stored data, or, in fact, the XML vocabulary that + we dealt with.

+ +

However, this API will quickly become tedious once we try to handle + a specific XML vocabulary and do something useful with the stored + data. Why is that? There are several areas where we could use some + help:

+ +
    +
  • Validation and error handling
  • +
  • Attribute access
  • +
  • Data extraction
  • +
  • Content model processing
  • +
  • Control flow
  • +
+ +

Let's examine each area using our object position vocabulary as a + test case (see the processing example in the + libstudxml distribution).

+ +
+<object id="123">
+  <name>Lion's Head</name>
+  <type>mountain</type>
+
+  <position lat="-33.8569" lon="18.5083"/>
+  <position lat="-33.8568" lon="18.5083"/>
+  <position lat="-33.8568" lon="18.5082"/>
+</object>
+  
+ +

If you cannot assume the XML you are parsing is valid, and you + generally shouldn't, then you will quickly realize that the biggest + pain in dealing with XML is making sure that what we got is actually + valid.

+ +

This stuff is pervasive. What if the root element is spelled + wrong? Maybe the id attribute is missing? Or there + is some stray text before the name element? Things + can be broken in an infinite number of ways.

+ +

To illustrate this point, here is the parsing code of just the + root element with proper error handling:

+ +
+parser p (ifs, argv[1]);
+
+if (p.next () != parser::start_element ||
+    p.qname () != "object")
+{
+  // error
+}
+
+...
+
+if (p.next () != parser::end_element) // object
+{
+  // error
+}
+  
+ +

Not very pretty. To help with this, the parser API provides the + next_expect() function:

+ +
+class parser
+{
+  void next_expect (event_type);
+  void next_expect (event_type, const std::string& name);
+};
+  
+ +

This function gets the next event and makes sure it is what's + expected. If not, it throws an appropriate parsing exception. + This simplifies our root element parsing quite a bit:

+ +
+parser p (ifs, argv[1]);
+
+p.next_expect (parser::start_element, "object");
+...
+p.next_expect (parser::end_element); // object
+  
+ +

Let's now take the next step and try to handle the id + attribute. According to what we have seen so far, it will look + something along these lines:

+ +
+p.next_expect (parser::start_element, "object");
+
+p.next_expect (parser::start_attribute, "id");
+p.next_expect (parser::characters);
+cout << "id: " << p.value () << endl;
+p.next_expect (parser::end_attribute);
+
+...
+
+p.next_expect (parser::end_element); // object
+  
+ +

Not too bad but there is a bit of a problem. What if our object + element had several attributes? The order of attributes in XML + is arbitrary so we should be prepared to get them in any order. + This fact complicates our attribute parsing code quite a bit:

+ +
+while (p.next () == parser::start_attribute)
+{
+  if (p.qname () == "id")
+  {
+    p.next_expect (parser::characters);
+    cout << "id: " << p.value () << endl;
+  }
+  else if (...)
+  {
+  }
+  else
+  {
+    // error: unknown attribute
+  }
+
+  p.next_expect (parser::end_attribute);
+}
+  
+ +

There is also a bug in this version. Can you see it? We now + don't make sure that the id attribute was actually + specified.

+ +

If you think about it, at this level, it is actually not that + convenient to receive attributes as events. In fact, a map of + attributes would be much more usable.

+ +

Remember we talked about the parser features that specify which + events we want to see:

+ +
+class parser
+{
+  static const feature_type receive_elements;
+  static const feature_type receive_characters;
+  static const feature_type receive_attributes;
+
+  ...
+};
+  
+ +

Well, in reality, there is no receive_attributes. Rather, + there are these two options: + +

+class parser
+{
+  static const feature_type receive_attributes_map;
+  static const feature_type receive_attributes_event;
+
+  ...
+};
+  
+ +

That is, we can ask the parser to send us attributes as events or + as a map. And the default is to send them as a map.

+ +

In case of a map, we have the following attribute access API to work + with:

+ +
+class parser
+{
+  const std::string& attribute (const std::string& name) const;
+
+  std::string attribute (const std::string& name,
+                         const std::string& default_value) const;
+
+  bool attribute_present (const std::string& name) const;
+};
+  
+ +

If the attribute is not found, then the version without the default + value throws an appropriate parsing exception while the version with + the default value returns that value. There are also the + qname versions of these functions.

+ +

Let's see how this simplifies our code:

+ +
+p.next_expect (parser::start_element, "object");
+
+cout << "id: " << p.attribute ("id") << endl;
+
+...
+
+p.next_expect (parser::end_element); // object
+  
+ +

Much better.

+ +

If the id attribute is not present, then we get an + exception. But what happens if we have a stray attribute in our + document? The attribute map is magical in this sense. After + the end_element event for the object + element the parser will examine the attribute map. If there is + an attribute that hasn't been retrieved with one of the attribute + access functions, then the parser will throw the unexpected + attribute exception.

+ +

Error handling out of the way, the next thing that will annoy us is data + extraction. In XML everything is text. While our id value + is an integer, XML stores it as text and the low-level API returns it to + us as text. To help with this the parser provides the following data + extraction functions:

+ +
+class parser
+{
+  template <typename T>
+  T value () const;
+
+  template <typename T>
+  T attribute (const std::string& name) const;
+
+  template <typename T>
+  T attribute (const std::string& name,
+               const T& default_value) const;
+};
+  
+ +

Now we can get the id as an integer without much fuss:

+ +
+p.next_expect (parser::start_element, "object");
+
+unsigned int id = p.attribute<unsigned int> ("id");
+
+...
+
+p.next_expect (parser::end_element); // object
+  
+ +

Ok, let's try to parse our vocabulary a bit further:

+ +
+p.next_expect (parser::start_element, "object");
+unsigned int id = p.attribute<unsigned int> ("id");
+
+p.next_expect (parser::start_element, "name");
+
+...
+
+p.next_expect (parser::end_element); // name
+
+p.next_expect (parser::end_element); // object
+  
+ +

Here is the part of the document that we are parsing:

+ +
+<object id="123">
+  <name>Lion's Head</name>
+  
+ +

What do you think, is everything alright with our code? When we + try to parse our document, we will get an exception here:

+ +
+p.next_expect (parser::start_element, "name");
+  
+ +

Any idea why? Let's try to print the event that we get:

+ +
+// p.next_expect (parser::start_element, "name");
+cerr << p.next () << endl;
+  
+ +

We expect start_element but get characters! + Wait a minute, but there are characters after object and + before name. There is a newline and two spaces that are + replaced with hashes for illustration here:

+ +
+<object id="123">#
+##<name>Lion's Head</name>
+  
+ +

If you go to a forum or a mailing list for any XML parser, this will + be the most common question. Why do I get text when I should clearly + get an element!?

+ +

The reason why we get this whitespace text is because the parser has no + idea whether it is significant or not. The significance of whitespaces is + determined by the XML content model that we talked about earlier. Here is + the table:

+ +
+namespace xml
+{
+  enum class content
+  {          //  element   characters  whitespaces
+    empty,   //    no          no        ignored
+    simple,  //    no          yes       preserved
+    complex, //    yes         no        ignored
+    mixed    //    yes         yes       preserved
+  };
+}
+  
+ +

In empty content neither nested elements nor characters are allowed with + whitespaces ignored. Simple content allows no nested elements with + whitespaces preserved. Complex content allows nested elements only with + whitespaces which are ignored. Finally, the mixed content allows anything + in any order with everything preserved.

+ +

If we specify the content model for an element, then the parser + will do automatic whitespace processing for us:

+ +
+class parser
+{
+  void content (content);
+};
+  
+ +

That is, in empty and complex content, whitespaces will be silently + ignored. By knowing the content model, the parser also has a chance to do + more error handling for us. It will automatically throw appropriate + exceptions if there are nested elements in empty or simple content or + non-whitespace characters in complex content.

+ +

Ok, let's now see how we can take advantage of this feature in + our code:

+ +
+p.next_expect (parser::start_element, "object");
+p.content (content::complex);
+
+unsigned int id = p.attribute<unsigned int> ("id");
+
+p.next_expect (parser::start_element, "name"); // Ok.
+
+...
+
+p.next_expect (parser::end_element); // name
+
+p.next_expect (parser::end_element); // object
+  
+ +

Now whitespaces are ignored and everything works as we expected. + Here is how we can parse the content of the name + element:

+ +
+p.next_expect (parser::start_element, "name");
+p.content (content::simple);
+
+p.next_expect (parser::characters);
+string name = p.value ();
+
+p.next_expect (parser::end_element); // name
+  
+ +

As you can see, parsing a simple content element is quite a bit more + involved compared to getting a value of an attribute. Element markup also + has higher overhead in the resulting XML. That's why in our case it would + have been wiser to make name and type + attributes.

+ +

But if we are stuck with a lot of simple content elements, then + the parser provides the following helper functions:

+ +
+class parser
+{
+  std::string element ();
+
+  template <typename T>
+  T element ();
+
+  std::string element (const std::string& name);
+
+  template <typename T>
+  T element (const std::string& name);
+
+  std::string element (const std::string& name,
+                       const std::string& default_value);
+
+  template <typename T>
+  T element (const std::string& name,
+             const T& default_value);
+};
+  
+ +

The first two assume that you have already handled the + start_element event. They should be used if the element also + has attributes. The other four parse the complete element. Overloaded + qname versions are also provided.

+ +

Here is how we can simplify our parsing code thanks to these + functions:

+ +
+p.next_expect (parser::start_element, "object");
+p.content (content::complex);
+
+unsigned int id = p.attribute<unsigned int> ("id");
+string name = p.element ("name");
+
+p.next_expect (parser::end_element); // object
+  
+ +

For the type element we would like to use this enum + class:

+ +
+enum class object_type
+{
+  building,
+  mountain,
+  ...
+};
+  
+ +

The parsing code is similar to the name element. Now + we use the data extracting version of the element() + function:

+ +
+object_type type = p.element<object_type> ("type");
+  
+ +

Except that this won't compile. The parser doesn't know how to + convert the text representation to our enum. By + default the parser will try to use the iostream + extraction operator but we haven't provided any.

+ +

We can provide conversion code specifically for XML by specializing + the value_traits class template:

+ +
+namespace xml
+{
+  template <>
+  struct value_traits<object_type>
+  {
+    static object_type
+    parse (std::string, const parser&)
+    {
+      ...
+    }
+
+    static std::string
+    serialize (object_type, const serializer&)
+    {
+      ...
+    }
+  };
+}
+  
+ +

The last bit that we need to handle is the position + element. The interesting part here is how to stop without going + too far since there can be several of them. To help with this task + the parser allows us to peek into the next event:

+ +
+p.next_expect (parser::start_element, "object");
+p.content (content::complex);
+...
+
+do
+{
+  p.next_expect (parser::start_element, "position");
+  p.content (content::empty);
+
+  float lat = p.attribute<float> ("lat");
+  float lon = p.attribute<float> ("lon");
+
+  p.next_expect (parser::end_element);
+
+} while (p.peek () == parser::start_element);
+
+p.next_expect (parser::end_element); // object
+  
+ +

Do you see anything else that we can improve? Actually, there is + one thing. Look at the next_expect() calls in the + above code. They are both immediately followed by the setting + of the content model. We can tidy this up a bit by passing the + content model as a third argument to next_expect(). + This even reads like prose: "Next we expect the start of an + element called position that shall have empty + content."

+ +

Here is the complete, production-quality parsing code for our XML + vocabulary. 13 lines. With validation and everything:

+ +
+parser p (ifs, argv[1]);
+
+p.next_expect (parser::start_element, "object", content::complex);
+
+unsigned int id = p.attribute<unsigned int> ("id");
+string name = p.element ("name");
+object_type type = p.element<object_type> ("type");
+
+do
+{
+  p.next_expect (parser::start_element, "position", content::empty);
+
+  float lat = p.attribute<float> ("lat");
+  float lon = p.attribute<float> ("lon");
+
+  p.next_expect (parser::end_element); // position
+} while (p.peek () == parser::start_element);
+
+p.next_expect (parser::end_element); // object
+  
+ +

So that was the high-level parsing API. Let's now catch up with the + corresponding additions to the serializer.

+ +

Similar to parsing, calling start_attribute(), + characters(), and then end_attribute() + might not be convenient. Instead we can add an attribute with + a single call:

+ +
+class serializer
+{
+  void attribute (const std::string& name,
+                  const std::string& value);
+
+  void element (const std::string& value);
+
+  void element (const std::string& name,
+                const std::string& value);
+};
+  
+ +

The same for elements with simple content. The first version finishes + the element that we have started, while the second writes the complete + element. There are also the qname versions of these + functions that are not shown.

+ +

Instead of strings we can also serialize value types. This uses the + same value_traits specialization mechanism that we have + used for parsing:

+ +
+class serializer
+{
+  template <typename T>
+  void attribute (const std::string& name,
+                  const T& value);
+
+  template <typename T>
+  void element (const T& value);
+
+  template <typename T>
+  void element (const std::string& name,
+                const T& value);
+
+  template <typename T>
+  void characters (const T& value);
+};
+  
+ +

Let's now see how we can serialize a complete sample document for + our object position vocabulary using this high-level API:

+ +
+serializer s (cout, "output");
+
+s.start_element ("object");
+
+s.attribute ("id", 123);
+s.element ("name", "Lion's Head");
+s.element ("type", object_type::mountain);
+
+for (...)
+{
+  s.start_element ("position");
+
+  float lat (...), lon (...);
+
+  s.attribute ("lat", lat);
+  s.attribute ("lon", lon);
+
+  s.end_element (); // position
+}
+
+s.end_element (); // object
+  
+ +

Pretty straightforward stuff.

+ +

Object Persistence

+ +

So far we have used our API to first implement a filter that doesn't + really care about the data and then an application that processes the + data without creating any kind of object model. Let's now try to handle + the other end of the spectrum: objects that know how to persist + themselves into XML (see the persistence example in + the libstudxml distribution).

+ +

But before we continue, let's fix our XML to be slightly more idiomatic. + That is, we make name and type attributes + rather than elements:

+ +
+<object name="Lion's Head" type="mountain" id="123">
+  <position lat="-33.8569" lon="18.5083"/>
+  <position lat="-33.8568" lon="18.5083"/>
+  <position lat="-33.8568" lon="18.5082"/>
+</object>
+  
+ +

Generally, the API works best with idiomatic XML and will nudge you + gently in that direction with minor inconveniences.

+ +

For this vocabulary, the object model might look like this:

+ +
+enum class object_type {...};
+
+class position
+{
+  ...
+
+  float lat_;
+  float lon_;
+};
+
+class object
+{
+  ...
+
+  std::string name_;
+  object_type type_;
+  unsigned int id_;
+  std::vector<position> positions_;
+};
+  
+ +

Here I omit sensible constructors, accessors and modifiers that our + classes would probably have.

+ +

Let me also mention that what I am going to show next is what I + believe is the sensible structure for XML persistence using this + API. But that doesn't mean that's the only way. For example, we + are going to do parsing in a constructor:

+ +
+class position
+{
+  position (xml::parser&);
+
+  void
+  serialize (xml::serializer&) const;
+
+  ...
+};
+
+class object
+{
+  object (xml::parser&);
+
+  void
+  serialize (xml::serializer&) const;
+
+  ...
+};
+  
+ +

But you may prefer to first create an instance, say with the default + constructor, and then have a separate function do the parsing. + Nothing wrong with this approach.

+ +

Let's start with the position constructor. Here, we are + immediately confronted with this choice: do we parse the start and end + element events in position or expect our caller to handle them?

+ +

I suggest that we let our caller do this. We may have different elements + in our vocabulary that use the same position type. If we + assume the element name in the constructor, then we won't be able to use + the same class for all these elements. We will see the second advantage + of this arrangement in a moment, when we deal with inheritance. But, if + you have a simple model with one-to-one mapping between types and + elements and no inheritance, then there is nothing wrong with going the + other route.

+ +
+position::
+position (parser& p)
+  : lat_ (p.attribute<float> ("lat")),
+    lon_ (p.attribute<float> ("lon"))
+{
+  p.content (content::empty);
+}
+  
+ +

Ok, nice and clean so far. Let's look at the object + constructor:

+ +
+object::
+object (parser& p)
+  : name_ (p.attribute ("name")),
+    type_ (p.attribute<object_type> ("type")),
+    id_ (p.attribute<unsigned int> ("id"))
+{
+  p.content (content::complex);
+
+  do
+  {
+    p.next_expect (parser::start_element, "position");
+    positions_.push_back (position (p));
+    p.next_expect (parser::end_element);
+
+  } while (p.peek () == parser::start_element);
+}
+  
+ +

The only mildly interesting line here is where we call the position + constructor to parse the content of the nested elements.

+ +

Before we look into serialization, let me also mention one other + thing. In our vocabulary all the attributes are required but it is + quite common to have optional attributes. The API functions with + default values make it really convenient to handle such attributes + in the initializer lists.

+ +

Let's say the type attribute is optional. Then we + could do this:

+ +
+object::
+object (parser& p)
+  : ...
+    type_ (p.attribute ("type", object_type::other))
+    ...
+  
+ +

We use the same arrangement for serialization, that is, the + containing object starts and ends the element allowing us to + reuse the same type for different elements:

+ +
+void position::serialize (serializer& s) const
+{
+  s.attribute ("lat", lat_);
+  s.attribute ("lon", lon_);
+}
+
+void object::serialize (serializer& s) const
+{
+  s.attribute ("name", name_);
+  s.attribute ("type", type_);
+  s.attribute ("id", id_);
+
+  for (const auto& p: positions_)
+  {
+    s.start_element ("position");
+    p.serialize (s);
+    s.end_element ();
+  }
+}
+  
+ +

Ok, also nice and tidy.

+ + There is one thing, however, that is not so nice: the start of + the parser or serializer. Here is the code:

+ +
+parser p (ifs, argv[1]);
+p.next_expect (parser::start_element, "object");
+object o (p);
+p.next_expect (parser::end_element);
+
+serializer s (cout, "output");
+s.start_element ("object");
+o.serialize (s);
+s.end_element ();
+  
+ +

Remember, we made the caller responsible for handling the start and + end of the element. This works beautifully inside the object model but + not so much in the client code. What we would like to see instead + is this:

+ +
+parser p (ifs, argv[1]);
+object o (p);
+
+serializer s (cout, "output");
+o.serialize (s);
+  
+ +

The main reason for choosing this structure was the ability to reuse the + same type for different elements. The other reason was inheritance which + we haven't gotten to yet. If we think about it, it is very unlikely for a + class corresponding to the root of our vocabulary to also be used inside + as a local element. I can't remember ever seeing a vocabulary like + this.

+ +

So what we can do here is make an exception: the root type of our + object model handles the top-level element. Here is the parser:

+ +
+object::
+object (parser& p)
+{
+  p.next_expect (
+    parser::start_element, "object", content::complex);
+
+  name_ = p.attribute ("name");
+  type_ = p.attribute<object_type> ("type");
+  id_ = p.attribute<unsigned int> ("id");
+
+  ...
+
+  p.next_expect (parser::end_element);
+}
+  
+ +

And here is the serializer:

+ +
+void object::
+serialize (serializer& s) const
+{
+  s.start_element ("object");
+
+  ...
+
+  s.end_element ();
+}
+  
+ +

The only minor drawback of going this route is that we can no longer + parse attributes in the initializer list for the root object.

+ +

Inheritance

+ +

So far we have had smooth sailing with the streaming approach but things get + a bit bumpy once we start dealing with inheritance. This is normally + where the in-memory approach has its day.

+ +

Say we have elevated-object which adds the + units attribute and the elevation elements. + Here is the XML:

+ +
+<elevated-object name="Lion's Head" type="mountain"
+                 units="m" id="123">
+  <position lat="-33.8569" lon="18.5083"/>
+  <position lat="-33.8568" lon="18.5083"/>
+  <position lat="-33.8568" lon="18.5082"/>
+
+  <elevation val="668.9"/>
+  <elevation val="669"/>
+  <elevation val="669.1"/>
+</elevated-object>
+  
+ +

And here is the object model:

+ +
+enum class units {...};
+
+class elevation {...};
+
+class elevated_object: public object
+{
+  ...
+
+  units units_;
+  std::vector<elevation> elevations_;
+};
+  
+ +

Streaming assumes linearity. We start an element, add some attributes, + add some nested elements, and end the element. In contrast, with an + in-memory approach we can add some attributes, then add some nested + elements, then go back and add more attributes. This kind of back and + forth is exactly what inheritance often requires. So this is a bit of a + problem for us.

+ +

Consider the elevated_object constructor:

+ +
+elevated_object::
+elevated_object (parser& p)
+  : object (p),
+    units_ (p.attribute<units> ("units"))
+{
+  do
+  {
+    p.next_expect (parser::start_element, "elevation");
+    elevations_.push_back (elevation (p));
+    p.next_expect (parser::end_element);
+
+  } while (p.peek () == parser::start_element &&
+           p.name () == "elevation");
+}
+  
+ +

Note that here I assume we went back to our original architecture + where the caller handles the start and end of the element (this is + the other advantage of this architecture: it allows us to reuse + base parsing and serialization code in derived classes).

+ +

So we would like to reuse the parsing code from object + so we call the base constructor first.

+ +

Then we parse the derived attribute and elements. Do you see + the problem? The object constructor will parse its + attributes and then move on to nested elements. When this constructor + returns, we need to go back to parsing attributes! This is not + something that a streaming approach would normally allow.

+ +

To resolve this, the lifetime of the attribute map was extended until + after the end_element event. That is, we can access + attributes any time we are at the element's level. As a result, + the above code just works.

+ +

We have the same problem in serialization. Let's say we write + the straightforward code like this:

+ +
+void elevated_object::
+serialize (serializer& s) const
+{
+  object::serialize (s);
+
+  s.attribute ("units", units_);
+
+  for (const auto& e: elevations_)
+  {
+    s.start_element ("elevation");
+    e.serialize (s);
+    s.end_element ();
+  }
+}
+  
+ +

This is not going to work since we will try to add the units + attribute after the nested position elements have already + been written.

+ +

To handle inheritance in serialization we have to split the + serialize() function into two. One serializes + the attributes while the other — content:

+ +
+void object::
+serialize_attributes (serializer& s) const
+{
+  s.attribute ("name", name_);
+  s.attribute ("type", type_);
+  s.attribute ("id", id_);
+}
+
+void object::
+serialize_content (serializer& s) const
+{
+  for (const auto& p: positions_)
+  {
+    s.start_element ("position");
+    p.serialize (s);
+    s.end_element ();
+  }
+}
+  
+ +

The serialize() function then simply calls these two + in the correct order.

+ +
+void object::
+serialize (serializer& s) const
+{
+  serialize_attributes (s);
+  serialize_content (s);
+}
+  
+ +

I bet you can guess what the elevated_object's + implementation looks like:

+ +
+void elevated_object::
+serialize_attributes (serializer& s) const
+{
+  object::serialize_attributes (s);
+  s.attribute ("units", units_);
+}
+
+void elevated_object::
+serialize_content (serializer& s) const
+{
+  object::serialize_content (s);
+
+  for (const auto& e: elevations_)
+  {
+    s.start_element ("elevation");
+    e.serialize (s);
+    s.end_element ();
+  }
+}
+  
+ +

The serialize() function for elevated_object + is exactly the same:

+ +
+void elevated_object::
+serialize (serializer& s) const
+{
+  serialize_attributes (s);
+  serialize_content (s);
+}
+  
+ +

Implementation Notes

+ +

libstudxml is an open source (MIT license), portable + (autotools and VC++ projects provided), and external dependency-free + implementation.

+ +

It provides a conforming, non-validating XML 1.0 parser by using + the mature and tested Expat XML parser. libstudxml + includes the Expat source code (also distributed under the MIT + license) as an implementation detail. However, you can link to + an external Expat library if you prefer.

+ +

If you are familiar with Expat, you are probably wondering how + the push interface provided by Expat was adapted to the pull + API shown earlier. Expat allows us to suspend and resume parsing + after every event and that's exactly what this implementation + does. The performance cost of this constant suspension and + resumption is about 35% of Expat's performance, which is not + negligible but not the end of the world either.

+ +

All in, with all the name splitting and string constructions, + parsing throughput on a 2010 Intel Core i7 laptop is about + 35 MByte/sec, which should be sufficient for most applications.

+ +

While it is much easier to implement a conforming serializer + from scratch, libstudxml reuses an existing and + tested implementation in this case as well. It includes source + code of a small C library for XML serialization called Genx + (also MIT licensed) that was initially created by Tim Bray + and significantly improved and extended over the past years + as part of the XSD/e project.

+ +
+
+ + + diff --git a/doc/makefile b/doc/makefile new file mode 100644 index 0000000..a40e0bf --- /dev/null +++ b/doc/makefile @@ -0,0 +1,18 @@ +# file : doc/makefile +# copyright : Copyright (c) 2013-2014 Code Synthesis Tools CC +# license : MIT; see accompanying LICENSE file + +include $(dir $(lastword $(MAKEFILE_LIST)))../build/bootstrap.make + +dist := $(out_base)/.dist + +# Dist. +# +$(dist): data_dist := default.css intro.xhtml +$(dist): export html_docs := $(data_dist) +$(dist): + $(call dist-data,$(html_docs)) + $(call meta-automake) + +$(call include,$(bld_root)/dist.make) +$(call include,$(bld_root)/meta/automake.make) -- cgit v1.1