From f6313f17cb87d62c4a73f9d135baafd076431311 Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Tue, 29 Apr 2014 13:06:50 +0200 Subject: Accumulate characters in simple content This makes the high-level data extraction (e.g., value()) much more usable since without this functionality the content can be delivered in multiple chunks. --- xml/parser.cxx | 68 ++++++++++++++++++++++++++++++++++++++++++---------------- xml/parser.hxx | 7 +++--- 2 files changed, 54 insertions(+), 21 deletions(-) (limited to 'xml') diff --git a/xml/parser.cxx b/xml/parser.cxx index 85ee19b..4079d3f 100644 --- a/xml/parser.cxx +++ b/xml/parser.cxx @@ -134,12 +134,15 @@ namespace xml if (e == XML_ERROR_ABORTED) { - // For now we only abort the parser in the characters_() handler. + // For now we only abort the parser in the characters_() and + // start_element_() handlers. // switch (content ()) { case empty: throw parsing (*this, "character in empty content"); + case simple: + throw parsing (*this, "element in simple content"); case complex: throw parsing (*this, "character in complex content"); default: @@ -495,9 +498,17 @@ namespace xml { event_ = queue_; queue_ = eof; + + line_ = XML_GetCurrentLineNumber (p_); + column_ = XML_GetCurrentColumnNumber (p_); + return event_; } + // Reset the character accumulation flag. + // + accumulate_ = false; + XML_ParsingStatus ps; XML_GetParsingStatus (p_, &ps); @@ -630,6 +641,21 @@ namespace xml // assert (ps.parsing == XML_PARSING); + // When accumulating characters in simple content, we expect to + // see more characters or end element. Seeing start element is + // possible but means violation of the content model. + // + if (p.accumulate_) + { + // It would have been easier to throw the exception directly, + // however, the Expat code is most likely not exception safe. + // + p.line_ = XML_GetCurrentLineNumber (p.p_); + p.column_ = XML_GetCurrentColumnNumber (p.p_); + XML_StopParser (p.p_, false); + return; + } + p.event_ = start_element; split_name (name, p.qname_); @@ -702,23 +728,19 @@ namespace xml p.queue_ = end_element; else { - // We may also have the end namespace declaration events which - // should come before the end element. If that's the case, then - // queue the end element and return the end namespace as the next - // event. + split_name (name, p.qname_); + + // If we are accumulating characters, then queue this event. // - if (p.end_ns_i_ < p.end_ns_.size ()) - { - p.event_ = end_namespace_decl; + if (p.accumulate_) p.queue_ = end_element; - } else + { p.event_ = end_element; - split_name (name, p.qname_); - - p.line_ = XML_GetCurrentLineNumber (p.p_); - p.column_ = XML_GetCurrentColumnNumber (p.p_); + p.line_ = XML_GetCurrentLineNumber (p.p_); + p.column_ = XML_GetCurrentColumnNumber (p.p_); + } XML_StopParser (p.p_, true); } @@ -738,9 +760,11 @@ namespace xml if (ps.parsing == XML_FINISHED) return; + content_type cont (p.content ()); + // If this is empty or complex content, see if these are whitespaces. // - switch (p.content ()) + switch (cont) { case empty: case complex: @@ -765,10 +789,11 @@ namespace xml break; } - // This can be a followup event for another character event. In - // this case simply append the data. + // Append the characters if we are accumulating. This can also be a + // followup event for another character event. In this case also + // append the data. // - if (ps.parsing != XML_PARSING) + if (p.accumulate_ || ps.parsing != XML_PARSING) { assert (p.event_ == characters); p.value_.append (s, n); @@ -781,7 +806,14 @@ namespace xml p.line_ = XML_GetCurrentLineNumber (p.p_); p.column_ = XML_GetCurrentColumnNumber (p.p_); - XML_StopParser (p.p_, true); + // In simple content we need to accumulate all the characters + // into a single event. To do this we will let the parser run + // until we reach the end of the element. + // + if (cont == simple) + p.accumulate_ = true; + else + XML_StopParser (p.p_, true); } } diff --git a/xml/parser.hxx b/xml/parser.hxx index b9b5d4c..ab31959 100644 --- a/xml/parser.hxx +++ b/xml/parser.hxx @@ -128,7 +128,7 @@ namespace xml event_type next () { - // Move to .ixx. + //@@ Move to .ixx. if (state_ == state_next) return next_ (false); @@ -279,9 +279,9 @@ namespace xml public: enum content_type { - // element characters whitespaces + // element characters whitespaces notes empty, // no no ignored - simple, // no yes preserved + simple, // no yes preserved content accumulated complex, // yes no ignored mixed // yes yes preserved }; @@ -345,6 +345,7 @@ namespace xml XML_Parser p_; std::size_t depth_; + bool accumulate_; // Whether we are accumulating character content. enum {state_next, state_peek} state_; event_type event_; event_type queue_; -- cgit v1.1