2 Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
3 See the file copying.txt for copying permission.
6 #undef INVALID_LEAD_CASE
8 #ifndef IS_INVALID_CHAR
9 #define IS_INVALID_CHAR(enc, ptr, n) (0)
10 #define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
13 return XML_TOK_PARTIAL_CHAR; \
17 #define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
20 return XML_TOK_PARTIAL_CHAR; \
21 if (IS_INVALID_CHAR(enc, ptr, n)) { \
22 *(nextTokPtr) = (ptr); \
23 return XML_TOK_INVALID; \
29 #define INVALID_CASES(ptr, nextTokPtr) \
30 INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
31 INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
32 INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
36 *(nextTokPtr) = (ptr); \
37 return XML_TOK_INVALID;
39 #define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
42 return XML_TOK_PARTIAL_CHAR; \
43 if (!IS_NAME_CHAR(enc, ptr, n)) { \
45 return XML_TOK_INVALID; \
50 #define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
52 if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
54 return XML_TOK_INVALID; \
63 CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
64 CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
65 CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
67 #define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
70 return XML_TOK_PARTIAL_CHAR; \
71 if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \
73 return XML_TOK_INVALID; \
78 #define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
80 if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
82 return XML_TOK_INVALID; \
88 CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
89 CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
90 CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
93 #define PREFIX(ident) ident
96 /* ptr points to character following "<!-" */
99 int PREFIX(scanComment
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
100 const char **nextTokPtr
)
103 if (!CHAR_MATCHES(enc
, ptr
, ASCII_MINUS
)) {
105 return XML_TOK_INVALID
;
109 switch (BYTE_TYPE(enc
, ptr
)) {
110 INVALID_CASES(ptr
, nextTokPtr
)
112 if ((ptr
+= MINBPC(enc
)) == end
)
113 return XML_TOK_PARTIAL
;
114 if (CHAR_MATCHES(enc
, ptr
, ASCII_MINUS
)) {
115 if ((ptr
+= MINBPC(enc
)) == end
)
116 return XML_TOK_PARTIAL
;
117 if (!CHAR_MATCHES(enc
, ptr
, ASCII_GT
)) {
119 return XML_TOK_INVALID
;
121 *nextTokPtr
= ptr
+ MINBPC(enc
);
122 return XML_TOK_COMMENT
;
131 return XML_TOK_PARTIAL
;
134 /* ptr points to character following "<!" */
137 int PREFIX(scanDecl
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
138 const char **nextTokPtr
)
141 return XML_TOK_PARTIAL
;
142 switch (BYTE_TYPE(enc
, ptr
)) {
144 return PREFIX(scanComment
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
146 *nextTokPtr
= ptr
+ MINBPC(enc
);
147 return XML_TOK_COND_SECT_OPEN
;
154 return XML_TOK_INVALID
;
157 switch (BYTE_TYPE(enc
, ptr
)) {
159 if (ptr
+ MINBPC(enc
) == end
)
160 return XML_TOK_PARTIAL
;
161 /* don't allow <!ENTITY% foo "whatever"> */
162 switch (BYTE_TYPE(enc
, ptr
+ MINBPC(enc
))) {
163 case BT_S
: case BT_CR
: case BT_LF
: case BT_PERCNT
:
165 return XML_TOK_INVALID
;
168 case BT_S
: case BT_CR
: case BT_LF
:
170 return XML_TOK_DECL_OPEN
;
177 return XML_TOK_INVALID
;
180 return XML_TOK_PARTIAL
;
184 int PREFIX(checkPiTarget
)(const ENCODING
*enc
, const char *ptr
, const char *end
, int *tokPtr
)
187 *tokPtr
= XML_TOK_PI
;
188 if (end
- ptr
!= MINBPC(enc
)*3)
190 switch (BYTE_TO_ASCII(enc
, ptr
)) {
200 switch (BYTE_TO_ASCII(enc
, ptr
)) {
210 switch (BYTE_TO_ASCII(enc
, ptr
)) {
221 *tokPtr
= XML_TOK_XML_DECL
;
225 /* ptr points to character following "<?" */
228 int PREFIX(scanPi
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
229 const char **nextTokPtr
)
232 const char *target
= ptr
;
234 return XML_TOK_PARTIAL
;
235 switch (BYTE_TYPE(enc
, ptr
)) {
236 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
239 return XML_TOK_INVALID
;
242 switch (BYTE_TYPE(enc
, ptr
)) {
243 CHECK_NAME_CASES(enc
, ptr
, end
, nextTokPtr
)
244 case BT_S
: case BT_CR
: case BT_LF
:
245 if (!PREFIX(checkPiTarget
)(enc
, target
, ptr
, &tok
)) {
247 return XML_TOK_INVALID
;
251 switch (BYTE_TYPE(enc
, ptr
)) {
252 INVALID_CASES(ptr
, nextTokPtr
)
256 return XML_TOK_PARTIAL
;
257 if (CHAR_MATCHES(enc
, ptr
, ASCII_GT
)) {
258 *nextTokPtr
= ptr
+ MINBPC(enc
);
267 return XML_TOK_PARTIAL
;
269 if (!PREFIX(checkPiTarget
)(enc
, target
, ptr
, &tok
)) {
271 return XML_TOK_INVALID
;
275 return XML_TOK_PARTIAL
;
276 if (CHAR_MATCHES(enc
, ptr
, ASCII_GT
)) {
277 *nextTokPtr
= ptr
+ MINBPC(enc
);
283 return XML_TOK_INVALID
;
286 return XML_TOK_PARTIAL
;
291 int PREFIX(scanCdataSection
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
292 const char **nextTokPtr
)
294 static const char CDATA_LSQB
[] = { ASCII_C
, ASCII_D
, ASCII_A
, ASCII_T
, ASCII_A
, ASCII_LSQB
};
297 if (end
- ptr
< 6 * MINBPC(enc
))
298 return XML_TOK_PARTIAL
;
299 for (i
= 0; i
< 6; i
++, ptr
+= MINBPC(enc
)) {
300 if (!CHAR_MATCHES(enc
, ptr
, CDATA_LSQB
[i
])) {
302 return XML_TOK_INVALID
;
306 return XML_TOK_CDATA_SECT_OPEN
;
310 int PREFIX(cdataSectionTok
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
311 const char **nextTokPtr
)
315 #if !(MINBPC(enc) == 1)
316 if (MINBPC(enc
) > 1) {
317 size_t n
= end
- ptr
;
318 if (n
& (MINBPC(enc
) - 1)) {
319 n
&= ~(MINBPC(enc
) - 1);
321 return XML_TOK_PARTIAL
;
326 switch (BYTE_TYPE(enc
, ptr
)) {
330 return XML_TOK_PARTIAL
;
331 if (!CHAR_MATCHES(enc
, ptr
, ASCII_RSQB
))
335 return XML_TOK_PARTIAL
;
336 if (!CHAR_MATCHES(enc
, ptr
, ASCII_GT
)) {
340 *nextTokPtr
= ptr
+ MINBPC(enc
);
341 return XML_TOK_CDATA_SECT_CLOSE
;
345 return XML_TOK_PARTIAL
;
346 if (BYTE_TYPE(enc
, ptr
) == BT_LF
)
349 return XML_TOK_DATA_NEWLINE
;
351 *nextTokPtr
= ptr
+ MINBPC(enc
);
352 return XML_TOK_DATA_NEWLINE
;
353 INVALID_CASES(ptr
, nextTokPtr
)
359 switch (BYTE_TYPE(enc
, ptr
)) {
360 #define LEAD_CASE(n) \
362 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
364 return XML_TOK_DATA_CHARS; \
368 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
377 return XML_TOK_DATA_CHARS
;
384 return XML_TOK_DATA_CHARS
;
387 /* ptr points to character following "</" */
390 int PREFIX(scanEndTag
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
391 const char **nextTokPtr
)
394 return XML_TOK_PARTIAL
;
395 switch (BYTE_TYPE(enc
, ptr
)) {
396 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
399 return XML_TOK_INVALID
;
402 switch (BYTE_TYPE(enc
, ptr
)) {
403 CHECK_NAME_CASES(enc
, ptr
, end
, nextTokPtr
)
404 case BT_S
: case BT_CR
: case BT_LF
:
405 for (ptr
+= MINBPC(enc
); ptr
!= end
; ptr
+= MINBPC(enc
)) {
406 switch (BYTE_TYPE(enc
, ptr
)) {
407 case BT_S
: case BT_CR
: case BT_LF
:
410 *nextTokPtr
= ptr
+ MINBPC(enc
);
411 return XML_TOK_END_TAG
;
414 return XML_TOK_INVALID
;
417 return XML_TOK_PARTIAL
;
420 /* no need to check qname syntax here, since end-tag must match exactly */
425 *nextTokPtr
= ptr
+ MINBPC(enc
);
426 return XML_TOK_END_TAG
;
429 return XML_TOK_INVALID
;
432 return XML_TOK_PARTIAL
;
435 /* ptr points to character following "&#X" */
438 int PREFIX(scanHexCharRef
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
439 const char **nextTokPtr
)
442 switch (BYTE_TYPE(enc
, ptr
)) {
448 return XML_TOK_INVALID
;
450 for (ptr
+= MINBPC(enc
); ptr
!= end
; ptr
+= MINBPC(enc
)) {
451 switch (BYTE_TYPE(enc
, ptr
)) {
456 *nextTokPtr
= ptr
+ MINBPC(enc
);
457 return XML_TOK_CHAR_REF
;
460 return XML_TOK_INVALID
;
464 return XML_TOK_PARTIAL
;
467 /* ptr points to character following "&#" */
470 int PREFIX(scanCharRef
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
471 const char **nextTokPtr
)
474 if (CHAR_MATCHES(enc
, ptr
, ASCII_x
))
475 return PREFIX(scanHexCharRef
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
476 switch (BYTE_TYPE(enc
, ptr
)) {
481 return XML_TOK_INVALID
;
483 for (ptr
+= MINBPC(enc
); ptr
!= end
; ptr
+= MINBPC(enc
)) {
484 switch (BYTE_TYPE(enc
, ptr
)) {
488 *nextTokPtr
= ptr
+ MINBPC(enc
);
489 return XML_TOK_CHAR_REF
;
492 return XML_TOK_INVALID
;
496 return XML_TOK_PARTIAL
;
499 /* ptr points to character following "&" */
502 int PREFIX(scanRef
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
503 const char **nextTokPtr
)
506 return XML_TOK_PARTIAL
;
507 switch (BYTE_TYPE(enc
, ptr
)) {
508 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
510 return PREFIX(scanCharRef
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
513 return XML_TOK_INVALID
;
516 switch (BYTE_TYPE(enc
, ptr
)) {
517 CHECK_NAME_CASES(enc
, ptr
, end
, nextTokPtr
)
519 *nextTokPtr
= ptr
+ MINBPC(enc
);
520 return XML_TOK_ENTITY_REF
;
523 return XML_TOK_INVALID
;
526 return XML_TOK_PARTIAL
;
529 /* ptr points to character following first character of attribute name */
532 int PREFIX(scanAtts
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
533 const char **nextTokPtr
)
539 switch (BYTE_TYPE(enc
, ptr
)) {
540 CHECK_NAME_CASES(enc
, ptr
, end
, nextTokPtr
)
545 return XML_TOK_INVALID
;
550 return XML_TOK_PARTIAL
;
551 switch (BYTE_TYPE(enc
, ptr
)) {
552 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
555 return XML_TOK_INVALID
;
559 case BT_S
: case BT_CR
: case BT_LF
:
565 return XML_TOK_PARTIAL
;
566 t
= BYTE_TYPE(enc
, ptr
);
576 return XML_TOK_INVALID
;
590 return XML_TOK_PARTIAL
;
591 open
= BYTE_TYPE(enc
, ptr
);
592 if (open
== BT_QUOT
|| open
== BT_APOS
)
601 return XML_TOK_INVALID
;
605 /* in attribute value */
609 return XML_TOK_PARTIAL
;
610 t
= BYTE_TYPE(enc
, ptr
);
614 INVALID_CASES(ptr
, nextTokPtr
)
617 int tok
= PREFIX(scanRef
)(enc
, ptr
+ MINBPC(enc
), end
, &ptr
);
619 if (tok
== XML_TOK_INVALID
)
627 return XML_TOK_INVALID
;
635 return XML_TOK_PARTIAL
;
636 switch (BYTE_TYPE(enc
, ptr
)) {
647 return XML_TOK_INVALID
;
649 /* ptr points to closing quote */
653 return XML_TOK_PARTIAL
;
654 switch (BYTE_TYPE(enc
, ptr
)) {
655 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
656 case BT_S
: case BT_CR
: case BT_LF
:
660 *nextTokPtr
= ptr
+ MINBPC(enc
);
661 return XML_TOK_START_TAG_WITH_ATTS
;
666 return XML_TOK_PARTIAL
;
667 if (!CHAR_MATCHES(enc
, ptr
, ASCII_GT
)) {
669 return XML_TOK_INVALID
;
671 *nextTokPtr
= ptr
+ MINBPC(enc
);
672 return XML_TOK_EMPTY_ELEMENT_WITH_ATTS
;
675 return XML_TOK_INVALID
;
683 return XML_TOK_INVALID
;
686 return XML_TOK_PARTIAL
;
689 /* ptr points to character following "<" */
692 int PREFIX(scanLt
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
693 const char **nextTokPtr
)
699 return XML_TOK_PARTIAL
;
700 switch (BYTE_TYPE(enc
, ptr
)) {
701 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
703 if ((ptr
+= MINBPC(enc
)) == end
)
704 return XML_TOK_PARTIAL
;
705 switch (BYTE_TYPE(enc
, ptr
)) {
707 return PREFIX(scanComment
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
709 return PREFIX(scanCdataSection
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
712 return XML_TOK_INVALID
;
714 return PREFIX(scanPi
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
716 return PREFIX(scanEndTag
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
719 return XML_TOK_INVALID
;
724 /* we have a start-tag */
726 switch (BYTE_TYPE(enc
, ptr
)) {
727 CHECK_NAME_CASES(enc
, ptr
, end
, nextTokPtr
)
732 return XML_TOK_INVALID
;
737 return XML_TOK_PARTIAL
;
738 switch (BYTE_TYPE(enc
, ptr
)) {
739 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
742 return XML_TOK_INVALID
;
746 case BT_S
: case BT_CR
: case BT_LF
:
750 switch (BYTE_TYPE(enc
, ptr
)) {
751 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
756 case BT_S
: case BT_CR
: case BT_LF
:
761 return XML_TOK_INVALID
;
763 return PREFIX(scanAtts
)(enc
, ptr
, end
, nextTokPtr
);
765 return XML_TOK_PARTIAL
;
769 *nextTokPtr
= ptr
+ MINBPC(enc
);
770 return XML_TOK_START_TAG_NO_ATTS
;
775 return XML_TOK_PARTIAL
;
776 if (!CHAR_MATCHES(enc
, ptr
, ASCII_GT
)) {
778 return XML_TOK_INVALID
;
780 *nextTokPtr
= ptr
+ MINBPC(enc
);
781 return XML_TOK_EMPTY_ELEMENT_NO_ATTS
;
784 return XML_TOK_INVALID
;
787 return XML_TOK_PARTIAL
;
791 int PREFIX(contentTok
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
792 const char **nextTokPtr
)
796 #if !(MINBPC(enc) == 1)
797 if (MINBPC(enc
) > 1) {
798 size_t n
= end
- ptr
;
799 if (n
& (MINBPC(enc
) - 1)) {
800 n
&= ~(MINBPC(enc
) - 1);
802 return XML_TOK_PARTIAL
;
807 switch (BYTE_TYPE(enc
, ptr
)) {
809 return PREFIX(scanLt
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
811 return PREFIX(scanRef
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
815 return XML_TOK_TRAILING_CR
;
816 if (BYTE_TYPE(enc
, ptr
) == BT_LF
)
819 return XML_TOK_DATA_NEWLINE
;
821 *nextTokPtr
= ptr
+ MINBPC(enc
);
822 return XML_TOK_DATA_NEWLINE
;
826 return XML_TOK_TRAILING_RSQB
;
827 if (!CHAR_MATCHES(enc
, ptr
, ASCII_RSQB
))
831 return XML_TOK_TRAILING_RSQB
;
832 if (!CHAR_MATCHES(enc
, ptr
, ASCII_GT
)) {
837 return XML_TOK_INVALID
;
838 INVALID_CASES(ptr
, nextTokPtr
)
844 switch (BYTE_TYPE(enc
, ptr
)) {
845 #define LEAD_CASE(n) \
847 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
849 return XML_TOK_DATA_CHARS; \
853 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
856 if (ptr
+ MINBPC(enc
) != end
) {
857 if (!CHAR_MATCHES(enc
, ptr
+ MINBPC(enc
), ASCII_RSQB
)) {
861 if (ptr
+ 2*MINBPC(enc
) != end
) {
862 if (!CHAR_MATCHES(enc
, ptr
+ 2*MINBPC(enc
), ASCII_GT
)) {
866 *nextTokPtr
= ptr
+ 2*MINBPC(enc
);
867 return XML_TOK_INVALID
;
879 return XML_TOK_DATA_CHARS
;
886 return XML_TOK_DATA_CHARS
;
889 /* ptr points to character following "%" */
892 int PREFIX(scanPercent
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
893 const char **nextTokPtr
)
896 return XML_TOK_PARTIAL
;
897 switch (BYTE_TYPE(enc
, ptr
)) {
898 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
899 case BT_S
: case BT_LF
: case BT_CR
: case BT_PERCNT
:
901 return XML_TOK_PERCENT
;
904 return XML_TOK_INVALID
;
907 switch (BYTE_TYPE(enc
, ptr
)) {
908 CHECK_NAME_CASES(enc
, ptr
, end
, nextTokPtr
)
910 *nextTokPtr
= ptr
+ MINBPC(enc
);
911 return XML_TOK_PARAM_ENTITY_REF
;
914 return XML_TOK_INVALID
;
917 return XML_TOK_PARTIAL
;
921 int PREFIX(scanPoundName
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
922 const char **nextTokPtr
)
925 return XML_TOK_PARTIAL
;
926 switch (BYTE_TYPE(enc
, ptr
)) {
927 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
930 return XML_TOK_INVALID
;
933 switch (BYTE_TYPE(enc
, ptr
)) {
934 CHECK_NAME_CASES(enc
, ptr
, end
, nextTokPtr
)
935 case BT_CR
: case BT_LF
: case BT_S
:
936 case BT_RPAR
: case BT_GT
: case BT_PERCNT
: case BT_VERBAR
:
938 return XML_TOK_POUND_NAME
;
941 return XML_TOK_INVALID
;
944 return -XML_TOK_POUND_NAME
;
948 int PREFIX(scanLit
)(int open
, const ENCODING
*enc
,
949 const char *ptr
, const char *end
,
950 const char **nextTokPtr
)
953 int t
= BYTE_TYPE(enc
, ptr
);
955 INVALID_CASES(ptr
, nextTokPtr
)
962 return -XML_TOK_LITERAL
;
964 switch (BYTE_TYPE(enc
, ptr
)) {
965 case BT_S
: case BT_CR
: case BT_LF
:
966 case BT_GT
: case BT_PERCNT
: case BT_LSQB
:
967 return XML_TOK_LITERAL
;
969 return XML_TOK_INVALID
;
976 return XML_TOK_PARTIAL
;
980 int PREFIX(prologTok
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
981 const char **nextTokPtr
)
986 #if !(MINBPC(enc) == 1)
987 if (MINBPC(enc
) > 1) {
988 size_t n
= end
- ptr
;
989 if (n
& (MINBPC(enc
) - 1)) {
990 n
&= ~(MINBPC(enc
) - 1);
992 return XML_TOK_PARTIAL
;
997 switch (BYTE_TYPE(enc
, ptr
)) {
999 return PREFIX(scanLit
)(BT_QUOT
, enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
1001 return PREFIX(scanLit
)(BT_APOS
, enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
1006 return XML_TOK_PARTIAL
;
1007 switch (BYTE_TYPE(enc
, ptr
)) {
1009 return PREFIX(scanDecl
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
1011 return PREFIX(scanPi
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
1018 *nextTokPtr
= ptr
- MINBPC(enc
);
1019 return XML_TOK_INSTANCE_START
;
1022 return XML_TOK_INVALID
;
1025 if (ptr
+ MINBPC(enc
) == end
)
1026 return -XML_TOK_PROLOG_S
;
1028 case BT_S
: case BT_LF
:
1033 switch (BYTE_TYPE(enc
, ptr
)) {
1034 case BT_S
: case BT_LF
:
1037 /* don't split CR/LF pair */
1038 if (ptr
+ MINBPC(enc
) != end
)
1043 return XML_TOK_PROLOG_S
;
1047 return XML_TOK_PROLOG_S
;
1049 return PREFIX(scanPercent
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
1051 *nextTokPtr
= ptr
+ MINBPC(enc
);
1052 return XML_TOK_COMMA
;
1054 *nextTokPtr
= ptr
+ MINBPC(enc
);
1055 return XML_TOK_OPEN_BRACKET
;
1059 return -XML_TOK_CLOSE_BRACKET
;
1060 if (CHAR_MATCHES(enc
, ptr
, ASCII_RSQB
)) {
1061 if (ptr
+ MINBPC(enc
) == end
)
1062 return XML_TOK_PARTIAL
;
1063 if (CHAR_MATCHES(enc
, ptr
+ MINBPC(enc
), ASCII_GT
)) {
1064 *nextTokPtr
= ptr
+ 2*MINBPC(enc
);
1065 return XML_TOK_COND_SECT_CLOSE
;
1069 return XML_TOK_CLOSE_BRACKET
;
1071 *nextTokPtr
= ptr
+ MINBPC(enc
);
1072 return XML_TOK_OPEN_PAREN
;
1076 return -XML_TOK_CLOSE_PAREN
;
1077 switch (BYTE_TYPE(enc
, ptr
)) {
1079 *nextTokPtr
= ptr
+ MINBPC(enc
);
1080 return XML_TOK_CLOSE_PAREN_ASTERISK
;
1082 *nextTokPtr
= ptr
+ MINBPC(enc
);
1083 return XML_TOK_CLOSE_PAREN_QUESTION
;
1085 *nextTokPtr
= ptr
+ MINBPC(enc
);
1086 return XML_TOK_CLOSE_PAREN_PLUS
;
1087 case BT_CR
: case BT_LF
: case BT_S
:
1088 case BT_GT
: case BT_COMMA
: case BT_VERBAR
:
1091 return XML_TOK_CLOSE_PAREN
;
1094 return XML_TOK_INVALID
;
1096 *nextTokPtr
= ptr
+ MINBPC(enc
);
1099 *nextTokPtr
= ptr
+ MINBPC(enc
);
1100 return XML_TOK_DECL_CLOSE
;
1102 return PREFIX(scanPoundName
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
1104 #define LEAD_CASE(n) \
1105 case BT_LEAD ## n: \
1106 if (end - ptr < n) \
1107 return XML_TOK_PARTIAL_CHAR; \
1108 if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
1110 tok = XML_TOK_NAME; \
1113 if (IS_NAME_CHAR(enc, ptr, n)) { \
1115 tok = XML_TOK_NMTOKEN; \
1118 *nextTokPtr = ptr; \
1119 return XML_TOK_INVALID;
1121 #define LEAD_CASE(n) \
1122 case BT_LEAD ## n: \
1123 if (end - ptr < n) \
1124 return XML_TOK_PARTIAL_CHAR; \
1125 *nextTokPtr = ptr; \
1126 return XML_TOK_INVALID;
1128 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1141 tok
= XML_TOK_NMTOKEN
;
1146 if (IS_NMSTRT_CHAR_MINBPC(enc
, ptr
)) {
1151 if (IS_NAME_CHAR_MINBPC(enc
, ptr
)) {
1153 tok
= XML_TOK_NMTOKEN
;
1160 return XML_TOK_INVALID
;
1162 while (ptr
!= end
) {
1163 switch (BYTE_TYPE(enc
, ptr
)) {
1164 CHECK_NAME_CASES(enc
, ptr
, end
, nextTokPtr
)
1165 case BT_GT
: case BT_RPAR
: case BT_COMMA
:
1166 case BT_VERBAR
: case BT_LSQB
: case BT_PERCNT
:
1167 case BT_S
: case BT_CR
: case BT_LF
:
1176 return XML_TOK_PARTIAL
;
1177 tok
= XML_TOK_PREFIXED_NAME
;
1178 switch (BYTE_TYPE(enc
, ptr
)) {
1179 CHECK_NAME_CASES(enc
, ptr
, end
, nextTokPtr
)
1181 tok
= XML_TOK_NMTOKEN
;
1185 case XML_TOK_PREFIXED_NAME
:
1186 tok
= XML_TOK_NMTOKEN
;
1192 if (tok
== XML_TOK_NMTOKEN
) {
1194 return XML_TOK_INVALID
;
1196 *nextTokPtr
= ptr
+ MINBPC(enc
);
1197 return XML_TOK_NAME_PLUS
;
1199 if (tok
== XML_TOK_NMTOKEN
) {
1201 return XML_TOK_INVALID
;
1203 *nextTokPtr
= ptr
+ MINBPC(enc
);
1204 return XML_TOK_NAME_ASTERISK
;
1206 if (tok
== XML_TOK_NMTOKEN
) {
1208 return XML_TOK_INVALID
;
1210 *nextTokPtr
= ptr
+ MINBPC(enc
);
1211 return XML_TOK_NAME_QUESTION
;
1214 return XML_TOK_INVALID
;
1221 int PREFIX(attributeValueTok
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
1222 const char **nextTokPtr
)
1226 return XML_TOK_NONE
;
1228 while (ptr
!= end
) {
1229 switch (BYTE_TYPE(enc
, ptr
)) {
1230 #define LEAD_CASE(n) \
1231 case BT_LEAD ## n: ptr += n; break;
1232 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1236 return PREFIX(scanRef
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
1238 return XML_TOK_DATA_CHARS
;
1240 /* this is for inside entity references */
1242 return XML_TOK_INVALID
;
1245 *nextTokPtr
= ptr
+ MINBPC(enc
);
1246 return XML_TOK_DATA_NEWLINE
;
1249 return XML_TOK_DATA_CHARS
;
1254 return XML_TOK_TRAILING_CR
;
1255 if (BYTE_TYPE(enc
, ptr
) == BT_LF
)
1258 return XML_TOK_DATA_NEWLINE
;
1261 return XML_TOK_DATA_CHARS
;
1264 *nextTokPtr
= ptr
+ MINBPC(enc
);
1265 return XML_TOK_ATTRIBUTE_VALUE_S
;
1268 return XML_TOK_DATA_CHARS
;
1275 return XML_TOK_DATA_CHARS
;
1279 int PREFIX(entityValueTok
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
1280 const char **nextTokPtr
)
1284 return XML_TOK_NONE
;
1286 while (ptr
!= end
) {
1287 switch (BYTE_TYPE(enc
, ptr
)) {
1288 #define LEAD_CASE(n) \
1289 case BT_LEAD ## n: ptr += n; break;
1290 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1294 return PREFIX(scanRef
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
1296 return XML_TOK_DATA_CHARS
;
1299 return PREFIX(scanPercent
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
1301 return XML_TOK_DATA_CHARS
;
1304 *nextTokPtr
= ptr
+ MINBPC(enc
);
1305 return XML_TOK_DATA_NEWLINE
;
1308 return XML_TOK_DATA_CHARS
;
1313 return XML_TOK_TRAILING_CR
;
1314 if (BYTE_TYPE(enc
, ptr
) == BT_LF
)
1317 return XML_TOK_DATA_NEWLINE
;
1320 return XML_TOK_DATA_CHARS
;
1327 return XML_TOK_DATA_CHARS
;
1333 int PREFIX(ignoreSectionTok
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
1334 const char **nextTokPtr
)
1337 if (MINBPC(enc
) > 1) {
1338 size_t n
= end
- ptr
;
1339 if (n
& (MINBPC(enc
) - 1)) {
1340 n
&= ~(MINBPC(enc
) - 1);
1344 while (ptr
!= end
) {
1345 switch (BYTE_TYPE(enc
, ptr
)) {
1346 INVALID_CASES(ptr
, nextTokPtr
)
1348 if ((ptr
+= MINBPC(enc
)) == end
)
1349 return XML_TOK_PARTIAL
;
1350 if (CHAR_MATCHES(enc
, ptr
, ASCII_EXCL
)) {
1351 if ((ptr
+= MINBPC(enc
)) == end
)
1352 return XML_TOK_PARTIAL
;
1353 if (CHAR_MATCHES(enc
, ptr
, ASCII_LSQB
)) {
1360 if ((ptr
+= MINBPC(enc
)) == end
)
1361 return XML_TOK_PARTIAL
;
1362 if (CHAR_MATCHES(enc
, ptr
, ASCII_RSQB
)) {
1363 if ((ptr
+= MINBPC(enc
)) == end
)
1364 return XML_TOK_PARTIAL
;
1365 if (CHAR_MATCHES(enc
, ptr
, ASCII_GT
)) {
1369 return XML_TOK_IGNORE_SECT
;
1380 return XML_TOK_PARTIAL
;
1383 #endif /* XML_DTD */
1386 int PREFIX(isPublicId
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
1387 const char **badPtr
)
1391 for (; ptr
!= end
; ptr
+= MINBPC(enc
)) {
1392 switch (BYTE_TYPE(enc
, ptr
)) {
1416 if (CHAR_MATCHES(enc
, ptr
, ASCII_TAB
)) {
1423 if (!(BYTE_TO_ASCII(enc
, ptr
) & ~0x7f))
1426 switch (BYTE_TO_ASCII(enc
, ptr
)) {
1440 /* This must only be called for a well-formed start-tag or empty element tag.
1441 Returns the number of attributes. Pointers to the first attsMax attributes
1442 are stored in atts. */
1445 int PREFIX(getAtts
)(const ENCODING
*enc
, const char *ptr
,
1446 int attsMax
, ATTRIBUTE
*atts
)
1448 enum { other
, inName
, inValue
} state
= inName
;
1450 int open
= 0; /* defined when state == inValue;
1451 initialization just to shut up compilers */
1453 for (ptr
+= MINBPC(enc
);; ptr
+= MINBPC(enc
)) {
1454 switch (BYTE_TYPE(enc
, ptr
)) {
1455 #define START_NAME \
1456 if (state == other) { \
1457 if (nAtts < attsMax) { \
1458 atts[nAtts].name = ptr; \
1459 atts[nAtts].normalized = 1; \
1463 #define LEAD_CASE(n) \
1464 case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
1465 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1474 if (state
!= inValue
) {
1475 if (nAtts
< attsMax
)
1476 atts
[nAtts
].valuePtr
= ptr
+ MINBPC(enc
);
1480 else if (open
== BT_QUOT
) {
1482 if (nAtts
< attsMax
)
1483 atts
[nAtts
].valueEnd
= ptr
;
1488 if (state
!= inValue
) {
1489 if (nAtts
< attsMax
)
1490 atts
[nAtts
].valuePtr
= ptr
+ MINBPC(enc
);
1494 else if (open
== BT_APOS
) {
1496 if (nAtts
< attsMax
)
1497 atts
[nAtts
].valueEnd
= ptr
;
1502 if (nAtts
< attsMax
)
1503 atts
[nAtts
].normalized
= 0;
1506 if (state
== inName
)
1508 else if (state
== inValue
1510 && atts
[nAtts
].normalized
1511 && (ptr
== atts
[nAtts
].valuePtr
1512 || BYTE_TO_ASCII(enc
, ptr
) != ASCII_SPACE
1513 || BYTE_TO_ASCII(enc
, ptr
+ MINBPC(enc
)) == ASCII_SPACE
1514 || BYTE_TYPE(enc
, ptr
+ MINBPC(enc
)) == open
))
1515 atts
[nAtts
].normalized
= 0;
1517 case BT_CR
: case BT_LF
:
1518 /* This case ensures that the first attribute name is counted
1519 Apart from that we could just change state on the quote. */
1520 if (state
== inName
)
1522 else if (state
== inValue
&& nAtts
< attsMax
)
1523 atts
[nAtts
].normalized
= 0;
1527 if (state
!= inValue
)
1538 int PREFIX(charRefNumber
)(const ENCODING
*enc
, const char *ptr
)
1542 ptr
+= 2*MINBPC(enc
);
1543 if (CHAR_MATCHES(enc
, ptr
, ASCII_x
)) {
1544 for (ptr
+= MINBPC(enc
); !CHAR_MATCHES(enc
, ptr
, ASCII_SEMI
); ptr
+= MINBPC(enc
)) {
1545 int c
= BYTE_TO_ASCII(enc
, ptr
);
1547 case ASCII_0
: case ASCII_1
: case ASCII_2
: case ASCII_3
: case ASCII_4
:
1548 case ASCII_5
: case ASCII_6
: case ASCII_7
: case ASCII_8
: case ASCII_9
:
1550 result
|= (c
- ASCII_0
);
1552 case ASCII_A
: case ASCII_B
: case ASCII_C
: case ASCII_D
: case ASCII_E
: case ASCII_F
:
1554 result
+= 10 + (c
- ASCII_A
);
1556 case ASCII_a
: case ASCII_b
: case ASCII_c
: case ASCII_d
: case ASCII_e
: case ASCII_f
:
1558 result
+= 10 + (c
- ASCII_a
);
1561 if (result
>= 0x110000)
1566 for (; !CHAR_MATCHES(enc
, ptr
, ASCII_SEMI
); ptr
+= MINBPC(enc
)) {
1567 int c
= BYTE_TO_ASCII(enc
, ptr
);
1569 result
+= (c
- ASCII_0
);
1570 if (result
>= 0x110000)
1574 return checkCharRefNumber(result
);
1578 int PREFIX(predefinedEntityName
)(const ENCODING
*enc
, const char *ptr
, const char *end
)
1580 switch ((end
- ptr
)/MINBPC(enc
)) {
1582 if (CHAR_MATCHES(enc
, ptr
+ MINBPC(enc
), ASCII_t
)) {
1583 switch (BYTE_TO_ASCII(enc
, ptr
)) {
1592 if (CHAR_MATCHES(enc
, ptr
, ASCII_a
)) {
1594 if (CHAR_MATCHES(enc
, ptr
, ASCII_m
)) {
1596 if (CHAR_MATCHES(enc
, ptr
, ASCII_p
))
1602 switch (BYTE_TO_ASCII(enc
, ptr
)) {
1605 if (CHAR_MATCHES(enc
, ptr
, ASCII_u
)) {
1607 if (CHAR_MATCHES(enc
, ptr
, ASCII_o
)) {
1609 if (CHAR_MATCHES(enc
, ptr
, ASCII_t
))
1616 if (CHAR_MATCHES(enc
, ptr
, ASCII_p
)) {
1618 if (CHAR_MATCHES(enc
, ptr
, ASCII_o
)) {
1620 if (CHAR_MATCHES(enc
, ptr
, ASCII_s
))
1631 int PREFIX(sameName
)(const ENCODING
*enc
, const char *ptr1
, const char *ptr2
)
1634 switch (BYTE_TYPE(enc
, ptr1
)) {
1635 #define LEAD_CASE(n) \
1636 case BT_LEAD ## n: \
1637 if (*ptr1++ != *ptr2++) \
1639 LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
1642 if (*ptr1
++ != *ptr2
++)
1654 if (*ptr2
++ != *ptr1
++)
1656 if (MINBPC(enc
) > 1) {
1657 if (*ptr2
++ != *ptr1
++)
1659 if (MINBPC(enc
) > 2) {
1660 if (*ptr2
++ != *ptr1
++)
1662 if (MINBPC(enc
) > 3) {
1663 if (*ptr2
++ != *ptr1
++)
1670 if (MINBPC(enc
) == 1 && *ptr1
== *ptr2
)
1672 switch (BYTE_TYPE(enc
, ptr2
)) {
1695 int PREFIX(nameMatchesAscii
)(const ENCODING
*enc
, const char *ptr1
,
1696 const char *end1
, const char *ptr2
)
1698 for (; *ptr2
; ptr1
+= MINBPC(enc
), ptr2
++) {
1701 if (!CHAR_MATCHES(enc
, ptr1
, *ptr2
))
1704 return ptr1
== end1
;
1708 int PREFIX(nameLength
)(const ENCODING
*enc
, const char *ptr
)
1710 const char *start
= ptr
;
1712 switch (BYTE_TYPE(enc
, ptr
)) {
1713 #define LEAD_CASE(n) \
1714 case BT_LEAD ## n: ptr += n; break;
1715 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1735 const char *PREFIX(skipS
)(const ENCODING
*enc
, const char *ptr
)
1738 switch (BYTE_TYPE(enc
, ptr
)) {
1751 void PREFIX(updatePosition
)(const ENCODING
*enc
,
1756 while (ptr
!= end
) {
1757 switch (BYTE_TYPE(enc
, ptr
)) {
1758 #define LEAD_CASE(n) \
1759 case BT_LEAD ## n: \
1762 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1765 pos
->columnNumber
= (unsigned)-1;
1772 if (ptr
!= end
&& BYTE_TYPE(enc
, ptr
) == BT_LF
)
1774 pos
->columnNumber
= (unsigned)-1;
1780 pos
->columnNumber
++;
1785 #undef MULTIBYTE_CASES
1786 #undef INVALID_CASES
1787 #undef CHECK_NAME_CASE
1788 #undef CHECK_NAME_CASES
1789 #undef CHECK_NMSTRT_CASE
1790 #undef CHECK_NMSTRT_CASES