// Entry point of the parser: processes the XML document in 'xml' and reports
// what it finds through member-function callbacks on 'handler'.
//
// Template parameters:
//   FLAGS    Bitwise OR of the parser option bits (noEntityTranslation,
//            trimWhitespace, normalizeWhitespace).
//   HANDLER  Type of the callback object.
// Parameters:
//   handler  Object that receives the callbacks.
//   xml      Mutable character buffer holding the document. The parser treats
//            a 0 character as the end of the data.
// Errors are reported by throwing ParseError.
template<int FLAGS, typename HANDLER>
void parse(HANDLER& handler, char* xml);
// Parser option bits. Each constant occupies its own bit, so they can be
// combined with bitwise OR and passed as the FLAGS template argument.

// When set, entity references are left untranslated.
static const int noEntityTranslation = 0x1;
// When set, whitespace at the edges of text is trimmed.
static const int trimWhitespace = 0x2;
// When set, whitespace in text is condensed to single space characters.
static const int normalizeWhitespace = 0x4;
// Returns the stored m_what pointer.
// NOTE(review): presumably the message string that was passed as the first
// argument of ParseError (e.g. "expected >") -- the constructor is not visible
// in this listing, confirm.
100 const char*
what()
const {
return m_what; }
// Returns the stored m_where pointer.
// NOTE(review): presumably the position in the input buffer that was passed as
// the second argument of ParseError -- the constructor is not visible in this
// listing, confirm.
101 char*
where()
const {
return m_where; }
// Shorthand for an unsigned 8-bit quantity (one byte of input).
typedef unsigned char u8;
// Lookup table with one bool per possible byte value; defined elsewhere.
// NOTE(review): presumably indexed with the character cast to u8 to decide
// whether that character belongs to plain text -- confirm against the
// definition and the predicate that uses it.
extern const bool lutText[256];
// Stores the UTF-8 encoding of 'code' at 'text', using 1 to 4 bytes depending
// on the magnitude of 'code'. Multi-byte sequences are written back-to-front:
// each trailing byte takes the low 6 bits of 'code' (bit 7 forced on, bit 6
// forced off), 'code' is shifted right by 6, and finally the lead byte gets
// the length marker (0xC0 / 0xE0 / 0xF0) OR-ed in.
// Throws ParseError("invalid numeric character entity").
// NOTE(review): the opening brace, the condition of the single-byte branch,
// the final else (presumably guarding the throw for codes >= 0x110000) and any
// advance of 'text' past the written bytes are not visible in this listing.
194 static inline void insertUTF8char(
char*& text,
unsigned long code)
// Single-byte form: the code is stored unchanged.
197 text[0] = char(code);
199 }
else if (code < 0x800) {
// Two-byte form.
200 text[1] = char((code | 0x80) & 0xBF); code >>= 6;
201 text[0] = char (code | 0xC0);
203 }
else if (code < 0x10000) {
// Three-byte form.
204 text[2] = char((code | 0x80) & 0xBF); code >>= 6;
205 text[1] = char((code | 0x80) & 0xBF); code >>= 6;
206 text[0] = char (code | 0xE0);
208 }
else if (code < 0x110000) {
// Four-byte form.
209 text[3] = char((code | 0x80) & 0xBF); code >>= 6;
210 text[2] = char((code | 0x80) & 0xBF); code >>= 6;
211 text[1] = char((code | 0x80) & 0xBF); code >>= 6;
212 text[0] = char (code | 0xF0);
215 throw ParseError(
"invalid numeric character entity", text);
// Returns true when the two characters starting at 'p' are exactly C0, C1.
// The comparison short-circuits: p[1] is only read when p[0] matched, so a
// 0-terminated buffer is never read past its terminator as long as C0 is not 0.
template<char C0, char C1>
static inline bool next(const char* p)
{
	return (p[0] == C0) && (p[1] == C1);
}
// Returns true when the three characters starting at 'p' are exactly
// C0, C1, C2. The comparison short-circuits at the first mismatch, so later
// characters are only read when all earlier ones matched.
template<char C0, char C1, char C2>
static inline bool next(const char* p)
{
	return (p[0] == C0) && (p[1] == C1) && (p[2] == C2);
}
// Returns true when the four characters starting at 'p' are exactly
// C0, C1, C2, C3. The comparison short-circuits at the first mismatch, so
// later characters are only read when all earlier ones matched.
template<char C0, char C1, char C2, char C3>
static inline bool next(const char* p)
{
	return (p[0] == C0) && (p[1] == C1) && (p[2] == C2) && (p[3] == C3);
}
// Returns true when the six characters starting at 'p' are exactly
// C0 .. C5. The comparison short-circuits at the first mismatch, so later
// characters are only read when all earlier ones matched.
template<char C0, char C1, char C2, char C3, char C4, char C5>
static inline bool next(const char* p)
{
	return (p[0] == C0) && (p[1] == C1) && (p[2] == C2) &&
	       (p[3] == C3) && (p[4] == C4) && (p[5] == C5);
}
// Moves forward over consecutive characters for which StopPred::test()
// returns true.
// NOTE(review): the declaration of 'tmp' and the write-back into 'text' are
// not visible in this listing; presumably 'tmp' starts at 'text' and 'text'
// is set to the final position afterwards -- confirm.
240 template<
class StopPred>
static inline void skip(
char*& text)
243 while (StopPred::test(*tmp)) ++tmp;
// Advances 'text' over the characters accepted by the predicates and, when
// FLAGS asks for it, translates entities and processes whitespace on the way.
// Returns a char*.
// NOTE(review): large parts of the body (declarations of 'src' and 'dest',
// the checks on src[0]/src[1], the return statements) are not visible in this
// listing; the returned pointer is presumably the end of the processed text --
// confirm.
251 template<
class StopPred,
class StopPredPure,
int FLAGS>
252 static inline char* skipAndExpand(
char*& text)
// Nothing has to be rewritten when entity translation is off and neither
// whitespace option is requested: a plain skip is enough.
256 if ( (FLAGS & noEntityTranslation) &&
257 !(FLAGS & normalizeWhitespace) &&
258 !(FLAGS & trimWhitespace)) {
259 skip<StopPred>(text);
// Otherwise first skip the characters accepted by StopPredPure.
264 skip<StopPredPure>(text);
// Main loop over the remaining characters accepted by StopPred.
269 while (StopPred::test(*src)) {
// Entity handling is compiled in only when translation is enabled.
271 if (!(FLAGS & noEntityTranslation) &&
// Named entities: the tail of the name (from src[2] on) is compared here.
// The tails match "amp;", "apos;", "quot;" and, for the two 't',';' checks,
// "gt;" / "lt;" (which is which depends on checks not visible here).
275 if (next<'m','p',';'>(&src[2])) {
281 if (next<'p','o','s',';'>(&src[2])) {
290 if (next<'u','o','t',';'>(&src[2])) {
299 if (next<'t',';'>(&src[2])) {
308 if (next<'t',';'>(&src[2])) {
// Numeric entity, base 16: accumulate digits until the 0xFF "not a digit"
// marker shows up, then emit the code point as UTF-8 at 'dest'.
318 unsigned long code = 0;
322 if (digit == 0xFF)
break;
323 code = code * 16 + digit;
326 insertUTF8char(dest, code);
// Numeric entity, base 10: same scheme with a multiplier of 10.
328 unsigned long code = 0;
332 if (digit == 0xFF)
break;
333 code = code * 10 + digit;
336 insertUTF8char(dest, code);
339 throw ParseError(
"expected ;", src);
// Whitespace condensing is compiled in only when normalizeWhitespace is set.
351 if ((FLAGS & normalizeWhitespace) &&
// Tests whether 'text' starts with the byte sequence EF BB BF (the UTF-8
// encoding of the byte order mark).
// NOTE(review): the statement executed on a match is not visible in this
// listing; presumably 'text' is advanced past those three bytes -- confirm.
369 static inline void skipBOM(
char*& text)
371 if (next<
char(0xEF),
char(0xBB),
char(0xBF)>(text)) {
// Callback-driven parser. FLAGS selects the option bits that are tested with
// 'FLAGS & ...' throughout the member functions; HANDLER is the type of the
// object whose member functions are called for each parsed item.
377 template<
int FLAGS,
typename HANDLER>
class Parser
// Fragment of a loop whose enclosing function is not visible in this listing:
// skip whitespace and leave the loop once the terminating 0 is reached.
388 skip<WhitespacePred>(text);
389 if (*text == 0)
break;
// Handles the declaration node: calls handler.declarationStart(), skips
// whitespace, parses the attributes, calls handler.declarationStop() and then
// requires the input to continue with "?>".
// Throws ParseError("expected ?>") when that terminator is missing.
// NOTE(review): whatever follows the check (presumably stepping over the
// two terminator characters) is not visible in this listing.
401 void parseDeclaration(
char*& text)
403 handler.declarationStart();
404 skip<WhitespacePred>(text);
405 parseAttributes(text);
406 handler.declarationStop();
409 if (!next<'?','>
'>(text)) {
410 throw ParseError("expected ?>", text);
415 // Parse XML comment (<!--...)
// Scans forward until the three-character terminator is found, passes the
// text between the start position and the terminator to handler.comment()
// and finally steps over the terminator.
// Throws ParseError("unexpected end of data").
416 void parseComment(char*& text)
418 // Skip until end of comment
419 char* value = text; // remember value start
420 while (!next<'-
','-
','>
'>(text)) {
// NOTE(review): the end-of-input test guarding this throw and the advance of
// 'text' inside the loop are not visible in this listing.
422 throw ParseError("unexpected end of data", text);
426 handler.comment(string_ref(value, text));
427 text += 3; // skip '-->
'
// Scans a DOCTYPE up to its closing '>' and reports the raw text between the
// start position and that '>' through handler.doctype(), then steps over the
// '>'. Square brackets are tracked with a depth counter so that a '>' inside
// a bracketed section does not end the scan prematurely.
// Throws ParseError("unexpected end of data") when a 0 character is reached
// first.
// NOTE(review): the declaration of 'depth', the switch header and the loop
// structure around the cases are not visible in this listing.
430 void parseDoctype(char*& text)
432 char* value = text; // remember value start
435 while (*text != '>
') {
438 // If '[
' encountered, scan for matching ending
439 // ']
' using naive algorithm with depth. This
440 // works for all W3C test files except for 2
446 case char('[
'): ++depth; break;
447 case char(']
'): --depth; break;
448 case 0: throw ParseError(
449 "unexpected end of data", text);
456 throw ParseError("unexpected end of data", text);
// Report everything from the remembered start up to (not including) the '>'.
463 handler.doctype(string_ref(value, text));
464 text += 1; // skip '>
'
// Parses a processing instruction: reads the target name (characters accepted
// by NodeNamePred), skips whitespace, then scans to the two-character
// terminator. Target and content are passed to handler.procInstr() and the
// terminator is stepped over.
// Throws ParseError("expected PI target") when the name is empty and
// ParseError("unexpected end of data").
// NOTE(review): the declaration of 'name', the end-of-input test guarding the
// second throw and the advance inside the loop are not visible in this
// listing.
467 void parsePI(char*& text)
469 // Extract PI target name
471 skip<NodeNamePred>(text);
472 char* nameEnd = text;
473 if (name == nameEnd) {
474 throw ParseError("expected PI target", text);
477 // Skip whitespace between pi target and pi
478 skip<WhitespacePred>(text);
481 char* value = text; // Remember start of pi
482 while (!next<'?
','>
'>(text)) {
484 throw ParseError("unexpected end of data", text);
488 // Set pi value (verbatim, no entity expansion or ws normalization)
489 handler.procInstr(string_ref(name, nameEnd),
490 string_ref(value, text));
491 text += 2; // skip '?>
'
// Parses a run of character data and reports it through handler.text(),
// unless the resulting text is empty.
//   text           current position; on entry it is past any leading
//                  whitespace
//   contentsStart  position before that whitespace; parsing restarts from here
//                  when trimWhitespace is not set, so the whitespace is kept
// The predicate pair handed to skipAndExpand depends on normalizeWhitespace.
// With trimWhitespace set, trailing whitespace is removed from the end of the
// result before it is reported.
// NOTE(review): the declaration of 'value' and the statements that move 'end'
// backwards are not visible in this listing.
494 void parseText(char*& text, char* contentsStart)
496 // Backup to contents start if whitespace trimming is disabled
497 if (!(FLAGS & trimWhitespace)) {
498 text = contentsStart;
500 // Skip until end of data
502 char* end = (FLAGS & normalizeWhitespace)
503 ? skipAndExpand<TextPred, TextPureWithWsPred, FLAGS>(text)
504 : skipAndExpand<TextPred, TextPureNoWsPred , FLAGS>(text);
506 // Trim trailing whitespace; leading was already trimmed by
507 // whitespace skip after >
508 if (FLAGS & trimWhitespace) {
509 if (FLAGS & normalizeWhitespace) {
510 // Whitespace is already condensed to single
511 // space characters by skipping function, so
512 // just trim 1 char off the end.
513 if (end[-1] == ' ') {
517 // Backup until non-whitespace character is found
518 while (WhitespacePred::test(end[-1])) {
524 // Handle text, but only if non-empty.
525 auto len = end - value;
526 if (len) handler.text(string_ref(value, len));
// Parses a CDATA section: scans forward to the three-character terminator,
// passes the text before it to handler.cdata() and steps over the terminator.
// Throws ParseError("unexpected end of data").
// NOTE(review): the declaration of 'value', the end-of-input test guarding the
// throw and the advance inside the loop are not visible in this listing.
529 void parseCdata(char*& text)
531 // Skip until end of cdata
533 while (!next<']
',']
','>
'>(text)) {
535 throw ParseError("unexpected end of data", text);
539 handler.cdata(string_ref(value, text));
540 text += 3; // skip ]]>
// Parses one element: reads the name (characters accepted by NodeNamePred),
// reports it through handler.start(), skips whitespace and parses the
// attributes. Depending on the character that follows, it either parses the
// element's contents via parseNodeContents() or takes the '/' branch
// (presumably the empty-element form).
// Throws ParseError("expected element name") when the name is empty and
// ParseError("expected >") when the tag is not closed as expected.
// NOTE(review): the declaration of 'name', the first condition of the
// "ending type" test and any handler call made at the end of the element are
// not visible in this listing.
543 void parseElement(char*& text)
545 // Extract element name
547 skip<NodeNamePred>(text);
548 char* nameEnd = text;
549 if (name == nameEnd) {
550 throw ParseError("expected element name", text);
552 handler.start(string_ref(name, nameEnd));
554 skip<WhitespacePred>(text); // skip ws before attributes or >
555 parseAttributes(text);
557 // Determine ending type
560 parseNodeContents(text);
561 } else if (*text == '/
') {
565 throw ParseError("expected >", text);
569 throw ParseError("expected >", text);
573 // Determine node type, and parse it
// Inspects the characters at 'text' to choose a specialised parser. Visible
// tests: "xml" / "XML" leads to parseDeclaration(); '-' at text[2], "CDATA["
// at text[2] and "OCTYPE" at text[2] are checked further down; the last
// visible loop scans forward to a '>' character.
// Throws ParseError("unexpected end of data").
// NOTE(review): the dispatching switch/if structure, the calls made in most
// branches and the conditions combined with the visible tests are not shown
// in this listing.
574 void parseNode(char*& text)
579 // Note: this doesn't detect mixed
case (xMl), does
581 if ((next<'x','m','l'>(text) ||
582 next<'X','M','L'>(text)) &&
586 parseDeclaration(text);
596 if (text[2] ==
'-') {
605 if (next<
'C',
'D',
'A',
'T',
'A',
'['>(&text[2])) {
614 if (next<'O','C','T','Y','P','E'>(&text[2]) &&
625 while (*text !=
'>') {
628 "unexpected end of data", text);
// Parses what sits between an element's start tag and its end tag.
// The position before the leading whitespace is remembered in 'contentsStart'
// and later handed to parseText(), which needs it when whitespace is not
// trimmed. The '/' test at text[1] selects the closing-tag path, which skips
// the name and trailing whitespace and then requires '>'.
// Throws ParseError("expected >") and ParseError("unexpected end of data").
// NOTE(review): the surrounding loop, the switch on the current character and
// the call that handles child nodes are not visible in this listing.
642 void parseNodeContents(
char*& text)
645 char* contentsStart = text;
646 skip<WhitespacePred>(text);
652 if (text[1] ==
'/') {
655 skip<NodeNamePred>(text);
659 skip<WhitespacePred>(text);
661 throw ParseError(
"expected >", text);
673 throw ParseError(
"unexpected end of data", text);
676 parseText(text, contentsStart);
// Parses attributes of the form  name = 'value'  or  name = "value".
// For each attribute: read the name (characters accepted by
// AttributeNamePred), skip whitespace, require '=', skip whitespace, require
// an opening quote, read the value up to the same quote character, then skip
// trailing whitespace.
// Attribute values are processed with normalizeWhitespace masked out of FLAGS
// (FLAGS2); the predicate pair passed to skipAndExpand depends on which quote
// character opened the value.
// Throws ParseError("expected attribute name"), ParseError("expected =") and
// ParseError("expected ' or \"").
// NOTE(review): the enclosing loop, the declarations of 'name' and 'quote',
// the '=' test itself and the handler call that reports the attribute are not
// visible in this listing.
683 void parseAttributes(
char*& text)
690 skip<AttributeNamePred>(text);
691 char* nameEnd = text;
692 if (name == nameEnd) {
693 throw ParseError(
"expected attribute name", name);
696 skip<WhitespacePred>(text);
698 throw ParseError(
"expected =", text);
701 skip<WhitespacePred>(text);
705 if (quote !=
'\'' && quote !=
'"') {
706 throw ParseError(
"expected ' or \"", text);
712 static const int FLAGS2 = FLAGS & ~normalizeWhitespace;
714 char* valueEnd = (quote ==
'\'')
715 ? skipAndExpand<AttPred1, AttPurePred1, FLAGS2>(text)
716 : skipAndExpand<AttPred2, AttPurePred2, FLAGS2>(text);
// The value must be closed by the same quote character that opened it.
721 if (*text != quote) {
722 throw ParseError(
"expected ' or \"", text);
726 skip<WhitespacePred>(text);
733 template<
int FLAGS,
typename HANDLER>
734 inline void parse(HANDLER& handler,
char* xml)