// path: root/cutl/xml/parser.hxx
// blob: 19a49e85e0b1b8402a824e08ef78884f549d2687
// file      : cutl/xml/parser.hxx
// copyright : Copyright (c) 2009-2017 Code Synthesis Tools CC
// license   : MIT; see accompanying LICENSE file

#ifndef CUTL_XML_PARSER_HXX
#define CUTL_XML_PARSER_HXX

#include <map>
#include <vector>
#include <string>
#include <iosfwd>
#include <cstddef> // std::size_t
#include <cassert>

#include <cutl/details/config.hxx> // LIBCUTL_EXTERNAL_EXPAT

#ifndef LIBCUTL_EXTERNAL_EXPAT
#  include <cutl/details/expat/expat.h>
#else
#  include <expat.h>
#endif

// We only support UTF-8 expat.
//
#ifdef XML_UNICODE
#  error UTF-16 expat (XML_UNICODE defined) is not supported
#endif

#include <cutl/xml/qname.hxx>
#include <cutl/xml/exception.hxx>

#include <cutl/details/export.hxx>

namespace cutl
{
  namespace xml
  {
    class parser;

    // Exception type used to report an XML parsing error. It stores the
    // name of the input, the position (line and column) the error refers
    // to, and a textual description of the problem.
    //
    struct LIBCUTL_EXPORT parsing: exception
    {
      virtual
      ~parsing () throw ();

      // Construct from an explicitly supplied input name, position, and
      // error description.
      //
      parsing (const std::string& name,
               unsigned long long line,
               unsigned long long column,
               const std::string& description);

      // Construct from a parser and an error description.
      //
      // NOTE(review): presumably the input name and position are obtained
      // from the parser; the definition is not part of this header.
      //
      parsing (const parser&, const std::string& description);

      // Name of the input in which the error was detected.
      //
      const std::string&
      name () const
      {
        return name_;
      }

      // Line of the error position.
      //
      unsigned long long
      line () const
      {
        return line_;
      }

      // Column of the error position.
      //
      unsigned long long
      column () const
      {
        return column_;
      }

      // Description of the error as passed to the constructor.
      //
      const std::string&
      description () const
      {
        return description_;
      }

      // Error message as a C string (defined out of line).
      //
      // NOTE(review): presumably returns the contents of what_ as prepared
      // by init() -- confirm against the implementation.
      //
      virtual const char*
      what () const throw ();

    private:
      // Helper shared by the constructors (defined out of line).
      //
      void
      init ();

      std::string name_;
      unsigned long long line_;
      unsigned long long column_;
      std::string description_;
      std::string what_;
    };

    class LIBCUTL_EXPORT parser
    {
    public:
      ~parser ();

      typedef xml::qname qname_type;
      typedef unsigned short feature_type;

      // If both receive_attributes_event and receive_attributes_map are
      // specified, then receive_attributes_event is assumed.
      //
      static const feature_type receive_elements = 0x0001;
      static const feature_type receive_characters = 0x0002;
      static const feature_type receive_attributes_map = 0x0004;
      static const feature_type receive_attributes_event = 0x0008;
      static const feature_type receive_namespace_decls = 0x0010;

      static const feature_type receive_default = receive_elements |
                                                  receive_characters |
                                                  receive_attributes_map;

      // Parse std::istream. Input name is used in diagnostics to identify
      // the document being parsed. std::ios_base::failure exception is
      // used to report io errors (badbit and failbit).
      //
      parser (std::istream&,
              const std::string& input_name,
              feature_type = receive_default);

      const std::string&
      input_name () const {return iname_;}

      // Parsing events.
      //
    public:
      enum event_type
      {
        // If adding new events, also update the stream insertion operator.
        //
        start_element,
        end_element,
        start_attribute,
        end_attribute,
        characters,
        start_namespace_decl,
        end_namespace_decl,
        eof
      };

      event_type
      next ()
      {
        if (state_ == state_next)
          return next_ (false);
        else
        {
          // If we previously peeked at start/end_element, then adjust
          // state accordingly.
          //
          switch (event_)
          {
          case end_element:
            {
              if (!element_state_.empty () &&
                  element_state_.back ().depth == depth_)
                pop_element ();

              depth_--;
              break;
            }
          case start_element:
            {
              depth_++;
              break;
            }
          default:
            break;
          }

          state_ = state_next;
          return event_;
        }
      }

      // Get the next event and make sure that it's what's expected. If it
      // is not, then throw an appropriate parsing exception.
      //
      void
      next_expect (event_type);

      void
      next_expect (event_type, const qname_type& qname);

      void
      next_expect (event_type, const std::string& name);

      void
      next_expect (event_type, const std::string& ns, const std::string& name);

      event_type
      peek ()
      {
        if (state_ == state_peek)
          return event_;
        else
        {
          event_type e (next_ (true));
          state_ = state_peek; // Set it after the call to next_().
          return e;
        }
      }

      // Return the even that was last returned by the call to next() or
      // peek().
      //
      event_type
      event () {return event_;}

      // Event data.
      //
    public:
      const qname_type& qname () const {return *pqname_;}

      const std::string& namespace_ () const {return pqname_->namespace_ ();}
      const std::string& name () const {return pqname_->name ();}
      const std::string& prefix () const {return pqname_->prefix ();}

      const std::string& value () const {return *pvalue_;}

      unsigned long long line () const {return line_;}
      unsigned long long column () const {return column_;}

      // Attribute map lookup. If attribute is not found, then the version
      // without the default value throws an appropriate parsing exception
      // while the version with the default value returns that value.
      //
      // Note also that there is no attribute(ns,name) version since it
      // would conflict with attribute(name,dv) (qualified attributes
      // are not very common).
      //
      // Attribute map is valid throughout at the "element level" until
      // end_element and not just during start_element. As a special case,
      // the map is still valid after peek() that returned end_element until
      // this end_element event is retrieved with next().
      //
      const std::string&
      attribute (const std::string& name) const;

      template <typename T>
      T
      attribute (const std::string& name) const;

      std::string
      attribute (const std::string& name, const std::string& dv) const;

      template <typename T>
      T
      attribute (const std::string& name, const T& dv) const;

      const std::string&
      attribute (const qname_type& qname) const;

      template <typename T>
      T
      attribute (const qname_type& qname) const;

      std::string
      attribute (const qname_type& qname, const std::string& dv) const;

      template <typename T>
      T
      attribute (const qname_type& qname, const T& dv) const;

      bool
      attribute_present (const std::string& name) const;

      bool
      attribute_present (const qname_type& qname) const;

      // Low-level attribute map access. Note that this API assumes
      // all attributes are handled.
      //
      struct attribute_value_type
      {
        std::string value;
        mutable bool handled;
      };

      typedef std::map<qname_type, attribute_value_type> attribute_map_type;

      const attribute_map_type&
      attribute_map () const;

      // Optional content processing.
      //
    public:
      enum content_type
      {
                 //  element   characters  whitespaces
        empty,   //    no          no        ignored
        simple,  //    no          yes       preserved
        complex, //    yes         no        ignored
        mixed    //    yes         yes       preserved
      };

      // Note that you cannot get/set content while peeking.
      //
      void
      content (content_type c)
      {
        assert (state_ == state_next);

        if (!element_state_.empty () && element_state_.back ().depth == depth_)
          element_state_.back ().content = c;
        else
          element_state_.push_back (element_entry (depth_, c));
      }

      content_type
      content () const
      {
        assert (state_ == state_next);

        return
          !element_state_.empty () && element_state_.back ().depth == depth_
          ? element_state_.back ().content
          : mixed;
      }

    private:
      static void XMLCALL
      start_element_ (void*, const XML_Char*, const XML_Char**);

      static void XMLCALL
      end_element_ (void*, const XML_Char*);

      static void XMLCALL
      characters_ (void*, const XML_Char*, int);

      static void XMLCALL
      start_namespace_decl_ (void*, const XML_Char*, const XML_Char*);

      static void XMLCALL
      end_namespace_decl_ (void*, const XML_Char*);

    private:
      event_type
      next_ (bool peek);

      event_type
      next_body ();

      void
      handle_error ();

    private:
      std::istream& is_;
      const std::string iname_;
      feature_type feature_;

      XML_Parser p_;
      std::size_t depth_;
      enum {state_next, state_peek} state_;
      event_type event_;
      event_type queue_;

      qname_type qname_;
      std::string value_;

      // These are used to avoid copying when we are handling attributes
      // and namespace decls.
      //
      const qname_type* pqname_;
      const std::string* pvalue_;

      unsigned long long line_;
      unsigned long long column_;

      // Attributes as events.
      //
      struct attribute_type
      {
        qname_type qname;
        std::string value;
      };

      typedef std::vector<attribute_type> attributes;

      attributes attr_;
      attributes::size_type attr_i_; // Index of the current attribute.

      // Namespace declarations.
      //
      typedef std::vector<qname_type> namespace_decls;

      namespace_decls start_ns_;
      namespace_decls::size_type start_ns_i_; // Index of the current decl.

      namespace_decls end_ns_;
      namespace_decls::size_type end_ns_i_; // Index of the current decl.

      // Element state consisting of the content model and attribute map.
      //
      struct element_entry
      {
        element_entry (std::size_t d, content_type c = mixed)
            : depth (d), content (c), attr_unhandled_ (0) {}

        std::size_t depth;
        content_type content;
        attribute_map_type attr_map_;
        mutable attribute_map_type::size_type attr_unhandled_;
      };

      typedef std::vector<element_entry> element_state;
      std::vector<element_entry> element_state_;

      // Empty attribute map to return when an element has no attributes.
      //
      const attribute_map_type empty_attr_map_;

      // Return the element entry corresponding to the current depth, if
      // exists, and NULL otherwise.
      //
      const element_entry*
      get_element () const;

      void
      pop_element ();
    };

    // Stream insertion operator for parser events. It is defined out of
    // line and has to be kept in sync with the parser::event_type
    // enumerators.
    //
    LIBCUTL_EXPORT
    std::ostream&
    operator<< (std::ostream&, parser::event_type);
  }
}

#include <cutl/xml/parser.ixx>
#include <cutl/xml/parser.txx>

#endif // CUTL_XML_PARSER_HXX