about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Boris Kolpackov <boris@codesynthesis.com> 2014-04-29 13:06:50 +0200
committer Boris Kolpackov <boris@codesynthesis.com> 2014-04-29 13:06:50 +0200
commit f6313f17cb87d62c4a73f9d135baafd076431311 (patch)
tree 1acd7c0ee273f2f423a835641f2d100a3a0d6202
parent 818bedc799073966a4c56fd83ab1df358b9e9c24 (diff)
Accumulate characters in simple content
This makes the high-level data extraction (e.g., value<T>()) much more usable since without this functionality the content can be delivered in multiple chunks.
-rw-r--r-- tests/parser/driver.cxx 57
-rw-r--r-- xml/parser.cxx 68
-rw-r--r-- xml/parser.hxx 7
3 files changed, 111 insertions, 21 deletions
diff --git a/tests/parser/driver.cxx b/tests/parser/driver.cxx
index c5d18f0..e4ecd69 100644
--- a/tests/parser/driver.cxx
+++ b/tests/parser/driver.cxx
@@ -84,6 +84,24 @@ main ()
// cerr << e.what () << endl;
}
+ // Test namespace declarations.
+ //
+ {
+ // Followup end element event that should be preceded by end
+ // namespace declaration.
+ //
+ istringstream is ("<root xmlns:a='a'/>");
+ parser p (is,
+ "test",
+ parser::receive_default |
+ parser::receive_namespace_decls);
+
+ p.next_expect (parser::start_element, "root");
+ p.next_expect (parser::start_namespace_decl);
+ p.next_expect (parser::end_namespace_decl);
+ p.next_expect (parser::end_element);
+ }
+
// Test value extraction.
//
{
@@ -269,6 +287,45 @@ main ()
// cerr << e.what () << endl;
}
+ {
+ // Test content accumulation in simple content.
+ //
+ istringstream is ("<root xmlns:a='a'>1&#x32;3</root>");
+ parser p (is,
+ "simple",
+ parser::receive_default |
+ parser::receive_namespace_decls);
+
+ assert (p.next () == parser::start_element);
+ p.next_expect (parser::start_namespace_decl);
+ p.content (parser::simple);
+ assert (p.next () == parser::characters && p.value () == "123");
+ p.next_expect (parser::end_namespace_decl);
+ assert (p.next () == parser::end_element);
+ assert (p.next () == parser::eof);
+ }
+
+ try
+ {
+ // Test error handling in accumulation in simple content.
+ //
+ istringstream is ("<root xmlns:a='a'>1&#x32;<nested/>3</root>");
+ parser p (is,
+ "simple",
+ parser::receive_default |
+ parser::receive_namespace_decls);
+
+ assert (p.next () == parser::start_element);
+ p.next_expect (parser::start_namespace_decl);
+ p.content (parser::simple);
+ p.next ();
+ assert (false);
+ }
+ catch (const xml::exception&)
+ {
+ // cerr << e.what () << endl;
+ }
+
// complex
//
{
diff --git a/xml/parser.cxx b/xml/parser.cxx
index 85ee19b..4079d3f 100644
--- a/xml/parser.cxx
+++ b/xml/parser.cxx
@@ -134,12 +134,15 @@ namespace xml
if (e == XML_ERROR_ABORTED)
{
- // For now we only abort the parser in the characters_() handler.
+ // For now we only abort the parser in the characters_() and
+ // start_element_() handlers.
//
switch (content ())
{
case empty:
throw parsing (*this, "character in empty content");
+ case simple:
+ throw parsing (*this, "element in simple content");
case complex:
throw parsing (*this, "character in complex content");
default:
@@ -495,9 +498,17 @@ namespace xml
{
event_ = queue_;
queue_ = eof;
+
+ line_ = XML_GetCurrentLineNumber (p_);
+ column_ = XML_GetCurrentColumnNumber (p_);
+
return event_;
}
+ // Reset the character accumulation flag.
+ //
+ accumulate_ = false;
+
XML_ParsingStatus ps;
XML_GetParsingStatus (p_, &ps);
@@ -630,6 +641,21 @@ namespace xml
//
assert (ps.parsing == XML_PARSING);
+ // When accumulating characters in simple content, we expect to
+ // see more characters or an end element. Seeing a start element
+ // is possible but means a violation of the content model.
+ //
+ if (p.accumulate_)
+ {
+ // It would have been easier to throw the exception directly,
+ // however, the Expat code is most likely not exception safe.
+ //
+ p.line_ = XML_GetCurrentLineNumber (p.p_);
+ p.column_ = XML_GetCurrentColumnNumber (p.p_);
+ XML_StopParser (p.p_, false);
+ return;
+ }
+
p.event_ = start_element;
split_name (name, p.qname_);
@@ -702,23 +728,19 @@ namespace xml
p.queue_ = end_element;
else
{
- // We may also have the end namespace declaration events which
- // should come before the end element. If that's the case, then
- // queue the end element and return the end namespace as the next
- // event.
+ split_name (name, p.qname_);
+
+ // If we are accumulating characters, then queue this event.
//
- if (p.end_ns_i_ < p.end_ns_.size ())
- {
- p.event_ = end_namespace_decl;
+ if (p.accumulate_)
p.queue_ = end_element;
- }
else
+ {
p.event_ = end_element;
- split_name (name, p.qname_);
-
- p.line_ = XML_GetCurrentLineNumber (p.p_);
- p.column_ = XML_GetCurrentColumnNumber (p.p_);
+ p.line_ = XML_GetCurrentLineNumber (p.p_);
+ p.column_ = XML_GetCurrentColumnNumber (p.p_);
+ }
XML_StopParser (p.p_, true);
}
@@ -738,9 +760,11 @@ namespace xml
if (ps.parsing == XML_FINISHED)
return;
+ content_type cont (p.content ());
+
// If this is empty or complex content, see if these are whitespaces.
//
- switch (p.content ())
+ switch (cont)
{
case empty:
case complex:
@@ -765,10 +789,11 @@ namespace xml
break;
}
- // This can be a followup event for another character event. In
- // this case simply append the data.
+ // Append the characters if we are accumulating. This can also be a
+ // followup event for another character event. In this case also
+ // append the data.
//
- if (ps.parsing != XML_PARSING)
+ if (p.accumulate_ || ps.parsing != XML_PARSING)
{
assert (p.event_ == characters);
p.value_.append (s, n);
@@ -781,7 +806,14 @@ namespace xml
p.line_ = XML_GetCurrentLineNumber (p.p_);
p.column_ = XML_GetCurrentColumnNumber (p.p_);
- XML_StopParser (p.p_, true);
+ // In simple content we need to accumulate all the characters
+ // into a single event. To do this we will let the parser run
+ // until we reach the end of the element.
+ //
+ if (cont == simple)
+ p.accumulate_ = true;
+ else
+ XML_StopParser (p.p_, true);
}
}
diff --git a/xml/parser.hxx b/xml/parser.hxx
index b9b5d4c..ab31959 100644
--- a/xml/parser.hxx
+++ b/xml/parser.hxx
@@ -128,7 +128,7 @@ namespace xml
event_type
next ()
{
- // Move to .ixx.
+ //@@ Move to .ixx.
if (state_ == state_next)
return next_ (false);
@@ -279,9 +279,9 @@ namespace xml
public:
enum content_type
{
- // element characters whitespaces
+ // element characters whitespaces notes
empty, // no no ignored
- simple, // no yes preserved
+ simple, // no yes preserved content accumulated
complex, // yes no ignored
mixed // yes yes preserved
};
@@ -345,6 +345,7 @@ namespace xml
XML_Parser p_;
std::size_t depth_;
+ bool accumulate_; // Whether we are accumulating character content.
enum {state_next, state_peek} state_;
event_type event_;
event_type queue_;