diff options
author | Boris Kolpackov <boris@codesynthesis.com> | 2014-04-29 13:06:50 +0200 |
---|---|---|
committer | Boris Kolpackov <boris@codesynthesis.com> | 2014-04-29 13:06:50 +0200 |
commit | f6313f17cb87d62c4a73f9d135baafd076431311 (patch) | |
tree | 1acd7c0ee273f2f423a835641f2d100a3a0d6202 | |
parent | 818bedc799073966a4c56fd83ab1df358b9e9c24 (diff) |
Accumulate characters in simple content
This makes high-level data extraction (e.g., value<T>()) much more
usable: without accumulation, the character content of a simple-content
element can be delivered in multiple chunks.
-rw-r--r-- | tests/parser/driver.cxx | 57 | ||||
-rw-r--r-- | xml/parser.cxx | 68 | ||||
-rw-r--r-- | xml/parser.hxx | 7 |
3 files changed, 111 insertions, 21 deletions
diff --git a/tests/parser/driver.cxx b/tests/parser/driver.cxx index c5d18f0..e4ecd69 100644 --- a/tests/parser/driver.cxx +++ b/tests/parser/driver.cxx @@ -84,6 +84,24 @@ main () // cerr << e.what () << endl; } + // Test namespace declarations. + // + { + // Followup end element event that should be preceded by end + // namespace declaration. + // + istringstream is ("<root xmlns:a='a'/>"); + parser p (is, + "test", + parser::receive_default | + parser::receive_namespace_decls); + + p.next_expect (parser::start_element, "root"); + p.next_expect (parser::start_namespace_decl); + p.next_expect (parser::end_namespace_decl); + p.next_expect (parser::end_element); + } + // Test value extraction. // { @@ -269,6 +287,45 @@ main () // cerr << e.what () << endl; } + { + // Test content accumulation in simple content. + // + istringstream is ("<root xmlns:a='a'>123</root>"); + parser p (is, + "simple", + parser::receive_default | + parser::receive_namespace_decls); + + assert (p.next () == parser::start_element); + p.next_expect (parser::start_namespace_decl); + p.content (parser::simple); + assert (p.next () == parser::characters && p.value () == "123"); + p.next_expect (parser::end_namespace_decl); + assert (p.next () == parser::end_element); + assert (p.next () == parser::eof); + } + + try + { + // Test error handling in accumulation in simple content. 
+ // + istringstream is ("<root xmlns:a='a'>12<nested/>3</root>"); + parser p (is, + "simple", + parser::receive_default | + parser::receive_namespace_decls); + + assert (p.next () == parser::start_element); + p.next_expect (parser::start_namespace_decl); + p.content (parser::simple); + p.next (); + assert (false); + } + catch (const xml::exception&) + { + // cerr << e.what () << endl; + } + // complex // { diff --git a/xml/parser.cxx b/xml/parser.cxx index 85ee19b..4079d3f 100644 --- a/xml/parser.cxx +++ b/xml/parser.cxx @@ -134,12 +134,15 @@ namespace xml if (e == XML_ERROR_ABORTED) { - // For now we only abort the parser in the characters_() handler. + // For now we only abort the parser in the characters_() and + // start_element_() handlers. // switch (content ()) { case empty: throw parsing (*this, "character in empty content"); + case simple: + throw parsing (*this, "element in simple content"); case complex: throw parsing (*this, "character in complex content"); default: @@ -495,9 +498,17 @@ namespace xml { event_ = queue_; queue_ = eof; + + line_ = XML_GetCurrentLineNumber (p_); + column_ = XML_GetCurrentColumnNumber (p_); + return event_; } + // Reset the character accumulation flag. + // + accumulate_ = false; + XML_ParsingStatus ps; XML_GetParsingStatus (p_, &ps); @@ -630,6 +641,21 @@ namespace xml // assert (ps.parsing == XML_PARSING); + // When accumulating characters in simple content, we expect to + // see more characters or end element. Seeing start element is + // possible but means violation of the content model. + // + if (p.accumulate_) + { + // It would have been easier to throw the exception directly, + // however, the Expat code is most likely not exception safe. 
+ // + p.line_ = XML_GetCurrentLineNumber (p.p_); + p.column_ = XML_GetCurrentColumnNumber (p.p_); + XML_StopParser (p.p_, false); + return; + } + p.event_ = start_element; split_name (name, p.qname_); @@ -702,23 +728,19 @@ namespace xml p.queue_ = end_element; else { - // We may also have the end namespace declaration events which - // should come before the end element. If that's the case, then - // queue the end element and return the end namespace as the next - // event. + split_name (name, p.qname_); + + // If we are accumulating characters, then queue this event. // - if (p.end_ns_i_ < p.end_ns_.size ()) - { - p.event_ = end_namespace_decl; + if (p.accumulate_) p.queue_ = end_element; - } else + { p.event_ = end_element; - split_name (name, p.qname_); - - p.line_ = XML_GetCurrentLineNumber (p.p_); - p.column_ = XML_GetCurrentColumnNumber (p.p_); + p.line_ = XML_GetCurrentLineNumber (p.p_); + p.column_ = XML_GetCurrentColumnNumber (p.p_); + } XML_StopParser (p.p_, true); } @@ -738,9 +760,11 @@ namespace xml if (ps.parsing == XML_FINISHED) return; + content_type cont (p.content ()); + // If this is empty or complex content, see if these are whitespaces. // - switch (p.content ()) + switch (cont) { case empty: case complex: @@ -765,10 +789,11 @@ namespace xml break; } - // This can be a followup event for another character event. In - // this case simply append the data. + // Append the characters if we are accumulating. This can also be a + // followup event for another character event. In this case also + // append the data. // - if (ps.parsing != XML_PARSING) + if (p.accumulate_ || ps.parsing != XML_PARSING) { assert (p.event_ == characters); p.value_.append (s, n); @@ -781,7 +806,14 @@ namespace xml p.line_ = XML_GetCurrentLineNumber (p.p_); p.column_ = XML_GetCurrentColumnNumber (p.p_); - XML_StopParser (p.p_, true); + // In simple content we need to accumulate all the characters + // into a single event. 
To do this we will let the parser run + // until we reach the end of the element. + // + if (cont == simple) + p.accumulate_ = true; + else + XML_StopParser (p.p_, true); } } diff --git a/xml/parser.hxx b/xml/parser.hxx index b9b5d4c..ab31959 100644 --- a/xml/parser.hxx +++ b/xml/parser.hxx @@ -128,7 +128,7 @@ namespace xml event_type next () { - // Move to .ixx. + //@@ Move to .ixx. if (state_ == state_next) return next_ (false); @@ -279,9 +279,9 @@ namespace xml public: enum content_type { - // element characters whitespaces + // element characters whitespaces notes empty, // no no ignored - simple, // no yes preserved + simple, // no yes preserved content accumulated complex, // yes no ignored mixed // yes yes preserved }; @@ -345,6 +345,7 @@ namespace xml XML_Parser p_; std::size_t depth_; + bool accumulate_; // Whether we are accumulating character content. enum {state_next, state_peek} state_; event_type event_; event_type queue_; |