openMSX
rapidsax.hh
Go to the documentation of this file.
1 #ifndef RAPIDSAX_HH
2 #define RAPIDSAX_HH
3 
4 // This code is _heavily_ based on RapidXml 1.13
5 // http://rapidxml.sourceforge.net/
6 //
7 // RapidXml is a very fast XML parser.
8 // http://xmlbench.sourceforge.net/results/benchmark200910/index.html
9 // One of the main reasons it can be this fast is that doesn't do any string
10 // copies. Instead the XML input data is modified in-place (e.g. for stuff like
11 // < replacements). Though this also means the output produced by the parser
12 // is tied to the lifetime of the XML input data.
13 //
14 // RapidXml produces a DOM-like output. This parser has a SAX-like interface.
15 
16 #include "string_ref.hh"
17 
18 namespace rapidsax {
19 
20 // Parse given XML text and call callback functions in the given handler.
21 // - XML text must be zero-terminated
22 // - Handler must implement the methods defined in NullHandler (below). An
23 // easy way to do this is to inherit from NullHandler and only reimplement
24 // the methods that you need.
25 // - The behavior of the parser can be fine-tuned with the FLAGS parameter,
26 // see below for more details.
27 // - When a parse error is encounter, an instance of ParseError is thrown.
28 // - The lifetime of the string_ref's in the callback handler is the same as
29 // the lifetime of the input XML data (no string copies are made, instead
30 // the XML file is modified in-place and references to this data are passed).
31 template<int FLAGS, typename HANDLER> void parse(HANDLER& handler, char* xml);
32 
33 
34 // Flags that influence parsing behavior. The flags can be OR'ed together.
35 
36 // Should XML entities like &lt; be expanded or not?
37 static const int noEntityTranslation = 0x1;
38 // Should leading and trailing whitespace be trimmed?
39 static const int trimWhitespace = 0x2;
40 // Should sequences of whitespace characters be replaced with a single
41 // space character?
42 static const int normalizeWhitespace = 0x4;
43 
44 
45 // Callback handler with all empty implementations (can be used as a base
46 // class in case you only need to reimplement a few of the methods).
48 {
49 public:
50  // Called when an opening XML tag is encountered.
51  // 'name' is the name of the XML tag.
52  void start(string_ref /*name*/) {}
53 
54  // Called when a XML tag is closed.
55  // Note: the parser does currently not check whether the name of the
56  // opening nd closing tags matches.
57  void stop() {}
58 
59  // Called when text inside a tag is parsed.
60  // XML entities are replaced (optional)
61  // Whitespace is (optionally) trimmed or normalized.
62  // This method is not called for an empty text string.
63  // (Unlike other SAX parsers) the whole text string is always
64  // passed in a single chunk (so no need to concatenate this text
65  // with previous chunks in the callback).
66  void text(string_ref /*text*/) {}
67 
68  // Called for each parsed attribute.
69  // Attributes can occur inside xml tags or inside XML declarations.
70  void attribute(string_ref /*name*/, string_ref /*value*/) {}
71 
72  // Called for parsed CDATA sections.
73  void cdata(string_ref /*value*/) {}
74 
75  // Called when a XML comment (<!-- ... -->) is parsed.
76  void comment(string_ref /*value*/) {}
77 
78  // Called when XML declaration (<?xml .. ?>) is parsed.
79  // Inside a XML declaration there can be attributes.
80  void declarationStart() {}
81  void declarationStop() {}
82 
83  // Called when the <!DOCTYPE ..> is parsed.
84  void doctype(string_ref /*text*/) {}
85 
86  // Called when XML processing instructions (<? .. ?>) are parsed.
87  void procInstr(string_ref /*target*/, string_ref /*instr*/) {}
88 };
89 
90 
92 {
93 public:
94  ParseError(const char* what, char* where)
95  : m_what(what)
96  , m_where(where)
97  {
98  }
99 
100  const char* what() const { return m_what; }
101  char* where() const { return m_where; }
102 
103 private:
104  const char* m_what;
105  char* m_where;
106 };
107 
108 
109 namespace internal {
110 
111 typedef unsigned char u8;
112 
113 extern const bool lutWhitespace[256]; // Whitespace table
114 extern const bool lutNodeName[256]; // Node name table
115 extern const bool lutText[256]; // Text table
116 extern const bool lutTextPureNoWs[256]; // Text table
117 extern const bool lutTextPureWithWs[256]; // Text table
118 extern const bool lutAttributeName[256]; // Attribute name table
119 extern const bool lutAttributeData1[256]; // Attribute data table, single quote
120 extern const bool lutAttributeData1Pure[256]; // Attribute data table, single quote
121 extern const bool lutAttributeData2[256]; // Attribute data table, double quotes
122 extern const bool lutAttributeData2Pure[256]; // Attribute data table, double quotes
123 extern const u8 lutDigits[256]; // Digits
124 
125 // Detect whitespace character
127  static bool test(char ch) {
128  return lutWhitespace[u8(ch)];
129  }
130 };
131 
132 // Detect node name character
133 struct NodeNamePred {
134  static bool test(char ch) {
135  return lutNodeName[u8(ch)];
136  }
137 };
138 
139 // Detect attribute name character
141  static bool test(char ch) {
142  return lutAttributeName[u8(ch)];
143  }
144 };
145 
146 // Detect text character (PCDATA)
147 struct TextPred {
148  static bool test(char ch) {
149  return lutText[u8(ch)];
150  }
151 };
152 
153 // Detect text character (PCDATA) that does not require processing
155  static bool test(char ch) {
156  return lutTextPureNoWs[u8(ch)];
157  }
158 };
159 
160 // Detect text character (PCDATA) that does not require processing
162  static bool test(char ch) {
163  return lutTextPureWithWs[u8(ch)];
164  }
165 };
166 
167 // Detect attribute value character (single quote)
168 struct AttPred1 {
169  static bool test(char ch) {
170  return lutAttributeData1[u8(ch)];
171  }
172 };
173 // Detect attribute value character (double quote)
174 struct AttPred2 {
175  static bool test(char ch) {
176  return lutAttributeData2[u8(ch)];
177  }
178 };
179 
180 // Detect attribute value character (single quote)
181 struct AttPurePred1 {
182  static bool test(char ch) {
183  return lutAttributeData1Pure[u8(ch)];
184  }
185 };
186 // Detect attribute value character (double quote)
187 struct AttPurePred2 {
188  static bool test(char ch) {
189  return lutAttributeData2Pure[u8(ch)];
190  }
191 };
192 
193 // Insert coded character, using UTF8
194 static inline void insertUTF8char(char*& text, unsigned long code)
195 {
196  if (code < 0x80) { // 1 byte sequence
197  text[0] = char(code);
198  text += 1;
199  } else if (code < 0x800) {// 2 byte sequence
200  text[1] = char((code | 0x80) & 0xBF); code >>= 6;
201  text[0] = char (code | 0xC0);
202  text += 2;
203  } else if (code < 0x10000) { // 3 byte sequence
204  text[2] = char((code | 0x80) & 0xBF); code >>= 6;
205  text[1] = char((code | 0x80) & 0xBF); code >>= 6;
206  text[0] = char (code | 0xE0);
207  text += 3;
208  } else if (code < 0x110000) { // 4 byte sequence
209  text[3] = char((code | 0x80) & 0xBF); code >>= 6;
210  text[2] = char((code | 0x80) & 0xBF); code >>= 6;
211  text[1] = char((code | 0x80) & 0xBF); code >>= 6;
212  text[0] = char (code | 0xF0);
213  text += 4;
214  } else { // Invalid, only codes up to 0x10FFFF are allowed in Unicode
215  throw ParseError("invalid numeric character entity", text);
216  }
217 }
218 
219 template<char C0, char C1> static inline bool next(const char* p)
220 {
221  return (p[0] == C0) && (p[1] == C1);
222 }
223 template<char C0, char C1, char C2> static inline bool next(const char* p)
224 {
225  return (p[0] == C0) && (p[1] == C1) && (p[2] == C2);
226 }
227 template<char C0, char C1, char C2, char C3> static inline bool next(const char* p)
228 {
229  return (p[0] == C0) && (p[1] == C1) && (p[2] == C2) && (p[3] == C3);
230 }
231 template<char C0, char C1, char C2, char C3, char C4, char C5>
232 static inline bool next(const char* p)
233 {
234  return (p[0] == C0) && (p[1] == C1) && (p[2] == C2) &&
235  (p[3] == C3) && (p[4] == C4) && (p[5] == C5);
236 }
237 
238 
239 // Skip characters until predicate evaluates to true
240 template<class StopPred> static inline void skip(char*& text)
241 {
242  char* tmp = text;
243  while (StopPred::test(*tmp)) ++tmp;
244  text = tmp;
245 }
246 
247 // Skip characters until predicate evaluates to true while doing the following:
248 // - replacing XML character entity references with proper characters
249 // (&apos; &amp; &quot; &lt; &gt; &#...;)
250 // - condensing whitespace sequences to single space character
251 template<class StopPred, class StopPredPure, int FLAGS>
252 static inline char* skipAndExpand(char*& text)
253 {
254  // If entity translation, whitespace condense and whitespace
255  // trimming is disabled, use plain skip.
256  if ( (FLAGS & noEntityTranslation) &&
257  !(FLAGS & normalizeWhitespace) &&
258  !(FLAGS & trimWhitespace)) {
259  skip<StopPred>(text);
260  return text;
261  }
262 
263  // Use simple skip until first modification is detected
264  skip<StopPredPure>(text);
265 
266  // Use translation skip
267  char* src = text;
268  char* dest = src;
269  while (StopPred::test(*src)) {
270  // Test if replacement is needed
271  if (!(FLAGS & noEntityTranslation) &&
272  (src[0] == '&')) {
273  switch (src[1]) {
274  case 'a': // &amp; &apos;
275  if (next<'m','p',';'>(&src[2])) {
276  *dest = '&';
277  ++dest;
278  src += 5;
279  continue;
280  }
281  if (next<'p','o','s',';'>(&src[2])) {
282  *dest = '\'';
283  ++dest;
284  src += 6;
285  continue;
286  }
287  break;
288 
289  case 'q': // &quot;
290  if (next<'u','o','t',';'>(&src[2])) {
291  *dest = '"';
292  ++dest;
293  src += 6;
294  continue;
295  }
296  break;
297 
298  case 'g': // &gt;
299  if (next<'t',';'>(&src[2])) {
300  *dest = '>';
301  ++dest;
302  src += 4;
303  continue;
304  }
305  break;
306 
307  case 'l': // &lt;
308  if (next<'t',';'>(&src[2])) {
309  *dest = '<';
310  ++dest;
311  src += 4;
312  continue;
313  }
314  break;
315 
316  case '#': // &#...; - assumes ASCII
317  if (src[2] == 'x') {
318  unsigned long code = 0;
319  src += 3; // skip &#x
320  while (true) {
321  u8 digit = lutDigits[u8(*src)];
322  if (digit == 0xFF) break;
323  code = code * 16 + digit;
324  ++src;
325  }
326  insertUTF8char(dest, code);
327  } else {
328  unsigned long code = 0;
329  src += 2; // skip &#
330  while (1) {
331  u8 digit = lutDigits[u8(*src)];
332  if (digit == 0xFF) break;
333  code = code * 10 + digit;
334  ++src;
335  }
336  insertUTF8char(dest, code);
337  }
338  if (*src != ';') {
339  throw ParseError("expected ;", src);
340  }
341  ++src;
342  continue;
343 
344  default:
345  // Something else, ignore, just copy '&' verbatim
346  break;
347  }
348  }
349 
350  // Test if condensing is needed
351  if ((FLAGS & normalizeWhitespace) &&
352  (WhitespacePred::test(*src))) {
353  *dest++ = ' '; // single space in dest
354  ++src; // skip first whitespace char
355  // Skip remaining whitespace chars
356  while (WhitespacePred::test(*src)) ++src;
357  continue;
358  }
359 
360  // No replacement, only copy character
361  *dest++ = *src++;
362  }
363 
364  // Return new end
365  text = src;
366  return dest;
367 }
368 
369 static inline void skipBOM(char*& text)
370 {
371  if (next<char(0xEF), char(0xBB), char(0xBF)>(text)) {
372  text += 3; // skip utf-8 bom
373  }
374 }
375 
376 
377 template<int FLAGS, typename HANDLER> class Parser
378 {
379  HANDLER& handler;
380 
381 public:
382  Parser(HANDLER& handler_, char* text)
383  : handler(handler_)
384  {
385  skipBOM(text);
386  while (true) {
387  // Skip whitespace before node
388  skip<WhitespacePred>(text);
389  if (*text == 0) break;
390 
391  if (*text != '<') {
392  throw ParseError("expected <", text);
393  }
394  ++text; // skip '<'
395  parseNode(text);
396  }
397  }
398 
399 private:
400  // Parse XML declaration (<?xml...)
401  void parseDeclaration(char*& text)
402  {
403  handler.declarationStart();
404  skip<WhitespacePred>(text); // skip ws before attributes or ?>
405  parseAttributes(text);
406  handler.declarationStop();
407 
408  // skip ?>
409  if (!next<'?','>'>(text)) {
410  throw ParseError("expected ?>", text);
411  }
412  text += 2;
413  }
414 
415  // Parse XML comment (<!--...)
416  void parseComment(char*& text)
417  {
418  // Skip until end of comment
419  char* value = text; // remember value start
420  while (!next<'-','-','>'>(text)) {
421  if (text[0] == 0) {
422  throw ParseError("unexpected end of data", text);
423  }
424  ++text;
425  }
426  handler.comment(string_ref(value, text));
427  text += 3; // skip '-->'
428  }
429 
430  void parseDoctype(char*& text)
431  {
432  char* value = text; // remember value start
433 
434  // skip to >
435  while (*text != '>') {
436  switch (*text) {
437  case '[': {
438  // If '[' encountered, scan for matching ending
439  // ']' using naive algorithm with depth. This
440  // works for all W3C test files except for 2
441  // most wicked.
442  ++text; // skip '['
443  int depth = 1;
444  while (depth > 0) {
445  switch (*text) {
446  case char('['): ++depth; break;
447  case char(']'): --depth; break;
448  case 0: throw ParseError(
449  "unexpected end of data", text);
450  }
451  ++text;
452  }
453  break;
454  }
455  case '\0':
456  throw ParseError("unexpected end of data", text);
457 
458  default:
459  ++text;
460  }
461  }
462 
463  handler.doctype(string_ref(value, text));
464  text += 1; // skip '>'
465  }
466 
467  void parsePI(char*& text)
468  {
469  // Extract PI target name
470  char* name = text;
471  skip<NodeNamePred>(text);
472  char* nameEnd = text;
473  if (name == nameEnd) {
474  throw ParseError("expected PI target", text);
475  }
476 
477  // Skip whitespace between pi target and pi
478  skip<WhitespacePred>(text);
479 
480  // Skip to '?>'
481  char* value = text; // Remember start of pi
482  while (!next<'?','>'>(text)) {
483  if (*text == 0) {
484  throw ParseError("unexpected end of data", text);
485  }
486  ++text;
487  }
488  // Set pi value (verbatim, no entity expansion or ws normalization)
489  handler.procInstr(string_ref(name, nameEnd),
490  string_ref(value, text));
491  text += 2; // skip '?>'
492  }
493 
494  void parseText(char*& text, char* contentsStart)
495  {
496  // Backup to contents start if whitespace trimming is disabled
497  if (!(FLAGS & trimWhitespace)) {
498  text = contentsStart;
499  }
500  // Skip until end of data
501  char* value = text;
502  char* end = (FLAGS & normalizeWhitespace)
503  ? skipAndExpand<TextPred, TextPureWithWsPred, FLAGS>(text)
504  : skipAndExpand<TextPred, TextPureNoWsPred , FLAGS>(text);
505 
506  // Trim trailing whitespace; leading was already trimmed by
507  // whitespace skip after >
508  if (FLAGS & trimWhitespace) {
509  if (FLAGS & normalizeWhitespace) {
510  // Whitespace is already condensed to single
511  // space characters by skipping function, so
512  // just trim 1 char off the end.
513  if (end[-1] == ' ') {
514  --end;
515  }
516  } else {
517  // Backup until non-whitespace character is found
518  while (WhitespacePred::test(end[-1])) {
519  --end;
520  }
521  }
522  }
523 
524  // Handle text, but only if non-empty.
525  auto len = end - value;
526  if (len) handler.text(string_ref(value, len));
527  }
528 
529  void parseCdata(char*& text)
530  {
531  // Skip until end of cdata
532  char* value = text;
533  while (!next<']',']','>'>(text)) {
534  if (text[0] == 0) {
535  throw ParseError("unexpected end of data", text);
536  }
537  ++text;
538  }
539  handler.cdata(string_ref(value, text));
540  text += 3; // skip ]]>
541  }
542 
543  void parseElement(char*& text)
544  {
545  // Extract element name
546  char* name = text;
547  skip<NodeNamePred>(text);
548  char* nameEnd = text;
549  if (name == nameEnd) {
550  throw ParseError("expected element name", text);
551  }
552  handler.start(string_ref(name, nameEnd));
553 
554  skip<WhitespacePred>(text); // skip ws before attributes or >
555  parseAttributes(text);
556 
557  // Determine ending type
558  if (*text == '>') {
559  ++text;
560  parseNodeContents(text);
561  } else if (*text == '/') {
562  handler.stop();
563  ++text;
564  if (*text != '>') {
565  throw ParseError("expected >", text);
566  }
567  ++text;
568  } else {
569  throw ParseError("expected >", text);
570  }
571  }
572 
573  // Determine node type, and parse it
574  void parseNode(char*& text)
575  {
576  switch (text[0]) {
577  case '?': // <?...
578  ++text; // skip ?
579  // Note: this doesn't detect mixed case (xMl), does
580  // that matter?
581  if ((next<'x','m','l'>(text) ||
582  next<'X','M','L'>(text)) &&
583  WhitespacePred::test(text[3])) {
584  // '<?xml ' - xml declaration
585  text += 4; // skip 'xml '
586  parseDeclaration(text);
587  } else {
588  parsePI(text);
589  }
590  break;
591 
592  case '!': // <!...
593  // Parse proper subset of <! node
594  switch (text[1]) {
595  case '-': // <!-
596  if (text[2] == '-') {
597  // '<!--' - xml comment
598  text += 3; // skip '!--'
599  parseComment(text);
600  return;
601  }
602  break;
603 
604  case '[': // <![
605  if (next<'C','D','A','T','A','['>(&text[2])) {
606  // '<![CDATA[' - cdata
607  text += 8; // skip '![CDATA['
608  parseCdata(text);
609  return;
610  }
611  break;
612 
613  case 'D': // <!D
614  if (next<'O','C','T','Y','P','E'>(&text[2]) &&
615  WhitespacePred::test(text[8])) {
616  // '<!DOCTYPE ' - doctype
617  text += 9; // skip '!DOCTYPE '
618  parseDoctype(text);
619  return;
620  }
621  break;
622  }
623  // Attempt to skip other, unrecognized types starting with <!
624  ++text; // skip !
625  while (*text != '>') {
626  if (*text == 0) {
627  throw ParseError(
628  "unexpected end of data", text);
629  }
630  ++text;
631  }
632  ++text; // skip '>'
633  break;
634 
635  default: // <...
636  parseElement(text);
637  break;
638  }
639  }
640 
641  // Parse contents of the node - children, data etc.
642  void parseNodeContents(char*& text)
643  {
644  while (true) {
645  char* contentsStart = text; // start before ws is skipped
646  skip<WhitespacePred>(text); // Skip ws between > and contents
647 
648 afterText: // After parseText() jump here instead of continuing
649  // the loop, because skipping whitespace is unnecessary.
650  switch (*text) {
651  case '<': // Node closing or child node
652  if (text[1] == '/') {
653  // Node closing
654  text += 2; // skip '</'
655  skip<NodeNamePred>(text);
656  // TODO validate closing tag??
657  handler.stop();
658  // Skip remaining whitespace after node name
659  skip<WhitespacePred>(text);
660  if (*text != '>') {
661  throw ParseError("expected >", text);
662  }
663  ++text; // skip '>'
664  return;
665  } else {
666  // Child node
667  ++text; // skip '<'
668  parseNode(text);
669  }
670  break;
671 
672  case '\0':
673  throw ParseError("unexpected end of data", text);
674 
675  default:
676  parseText(text, contentsStart);
677  goto afterText;
678  }
679  }
680  }
681 
682  // Parse XML attributes of the node
683  void parseAttributes(char*& text)
684  {
685  // For all attributes
686  while (AttributeNamePred::test(*text)) {
687  // Extract attribute name
688  char* name = text;
689  ++text; // Skip first character of attribute name
690  skip<AttributeNamePred>(text);
691  char* nameEnd = text;
692  if (name == nameEnd) {
693  throw ParseError("expected attribute name", name);
694  }
695 
696  skip<WhitespacePred>(text); // skip ws after name
697  if (*text != '=') {
698  throw ParseError("expected =", text);
699  }
700  ++text; // skip =
701  skip<WhitespacePred>(text); // skip ws after =
702 
703  // Skip quote and remember if it was ' or "
704  char quote = *text;
705  if (quote != '\'' && quote != '"') {
706  throw ParseError("expected ' or \"", text);
707  }
708  ++text;
709 
710  // Extract attribute value and expand char refs in it
711  // No whitespace normalization in attributes
712  static const int FLAGS2 = FLAGS & ~normalizeWhitespace;
713  char* value = text;
714  char* valueEnd = (quote == '\'')
715  ? skipAndExpand<AttPred1, AttPurePred1, FLAGS2>(text)
716  : skipAndExpand<AttPred2, AttPurePred2, FLAGS2>(text);
717  handler.attribute(string_ref(name, nameEnd),
718  string_ref(value, valueEnd));
719 
720  // Make sure that end quote is present
721  if (*text != quote) {
722  throw ParseError("expected ' or \"", text);
723  }
724  ++text; // skip quote
725 
726  skip<WhitespacePred>(text); // skip ws after value
727  }
728  }
729 };
730 
731 } // namespace internal
732 
733 template<int FLAGS, typename HANDLER>
734 inline void parse(HANDLER& handler, char* xml)
735 {
736  internal::Parser<FLAGS, HANDLER> parser(handler, xml);
737 }
738 
739 } // namespace rapidsax
740 
741 #endif