]> git.saurik.com Git - wxWidgets.git/blob - src/xrc/expat/xmltok/xmltok_impl.c
case-insensitive sort of HTML help index
[wxWidgets.git] / src / xrc / expat / xmltok / xmltok_impl.c
1 /*
2 Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
3 See the file copying.txt for copying permission.
4 */
5
/*
 * INVALID_LEAD_CASE(n, ptr, nextTokPtr)
 *   Expands to the `case BT_LEAD<n>:` arm of a byte-type switch.  If fewer
 *   than n bytes remain before `end`, the enclosing function returns
 *   XML_TOK_PARTIAL_CHAR.  When the includer supplied IS_INVALID_CHAR and it
 *   reports the sequence as invalid, *nextTokPtr is set to ptr and the
 *   enclosing function returns XML_TOK_INVALID.  Otherwise ptr is advanced
 *   by n bytes and the switch is left.
 *
 *   The expansion relies on `end` (and, in the validating variant, `enc`)
 *   being visible at the point of use.  When IS_INVALID_CHAR was not
 *   supplied it is defined here as the constant (0).
 */
#undef INVALID_LEAD_CASE

#ifdef IS_INVALID_CHAR

#define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
  case BT_LEAD ## n: \
    if (end - (ptr) < (n)) \
      return XML_TOK_PARTIAL_CHAR; \
    if (IS_INVALID_CHAR(enc, ptr, n)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    (ptr) += (n); \
    break;

#else /* the includer gave no per-encoding validity check */

#define IS_INVALID_CHAR(enc, ptr, n) (0)
#define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
  case BT_LEAD ## n: \
    if (end - (ptr) < (n)) \
      return XML_TOK_PARTIAL_CHAR; \
    (ptr) += (n); \
    break;

#endif

/*
 * INVALID_CASES(ptr, nextTokPtr)
 *   Switch arms shared by the scanners that skip over arbitrary content:
 *   the three multi-byte lead cases above, followed by the byte types that
 *   are never acceptable (BT_NONXML, BT_MALFORM, BT_TRAIL), for which
 *   *nextTokPtr is set to ptr and XML_TOK_INVALID is returned.
 */
#define INVALID_CASES(ptr, nextTokPtr) \
  INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
  case BT_NONXML: \
  case BT_MALFORM: \
  case BT_TRAIL: \
    *(nextTokPtr) = (ptr); \
    return XML_TOK_INVALID;
38
/*
 * CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr)
 *   `case BT_LEAD<n>:` arm for a position where a name character is
 *   required.  Returns XML_TOK_PARTIAL_CHAR from the enclosing function if
 *   fewer than n bytes remain, XML_TOK_INVALID (with *nextTokPtr = ptr) if
 *   IS_NAME_CHAR rejects the sequence, and otherwise advances ptr by n.
 */
#define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
  case BT_LEAD ## n: \
    if ((end) - (ptr) < (n)) \
      return XML_TOK_PARTIAL_CHAR; \
    if (!IS_NAME_CHAR(enc, ptr, n)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    (ptr) += (n); \
    break;

/*
 * CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
 *   All switch arms that accept one name character.  BT_NONASCII is first
 *   checked with IS_NAME_CHAR_MINBPC and then deliberately falls into the
 *   single-unit arms (BT_NMSTRT, BT_HEX, BT_DIGIT, BT_NAME, BT_MINUS), which
 *   advance ptr by MINBPC(enc).  The multi-byte lead arms follow.
 */
#define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
  case BT_NMSTRT: \
  case BT_HEX: \
  case BT_DIGIT: \
  case BT_NAME: \
  case BT_MINUS: \
    (ptr) += MINBPC(enc); \
    break; \
  CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
66
/*
 * CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr)
 *   `case BT_LEAD<n>:` arm for a position where a name-start character is
 *   required.  Returns XML_TOK_PARTIAL_CHAR from the enclosing function if
 *   fewer than n bytes remain, XML_TOK_INVALID (with *nextTokPtr = ptr) if
 *   IS_NMSTRT_CHAR rejects the sequence, and otherwise advances ptr by n.
 */
#define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
  case BT_LEAD ## n: \
    if ((end) - (ptr) < (n)) \
      return XML_TOK_PARTIAL_CHAR; \
    if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    (ptr) += (n); \
    break;

/*
 * CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
 *   All switch arms that accept one name-start character.  BT_NONASCII is
 *   checked with IS_NMSTRT_CHAR_MINBPC and then deliberately falls into the
 *   BT_NMSTRT / BT_HEX arm, which advances ptr by MINBPC(enc).  The
 *   multi-byte lead arms follow.
 */
#define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
  case BT_NMSTRT: \
  case BT_HEX: \
    (ptr) += MINBPC(enc); \
    break; \
  CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
91
/* Unless the includer supplied its own PREFIX, function names are used
   unchanged. */
#if !defined(PREFIX)
#define PREFIX(ident) ident
#endif
95
96 /* ptr points to character following "<!-" */
97
98 static
99 int PREFIX(scanComment)(const ENCODING *enc, const char *ptr, const char *end,
100 const char **nextTokPtr)
101 {
102 if (ptr != end) {
103 if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
104 *nextTokPtr = ptr;
105 return XML_TOK_INVALID;
106 }
107 ptr += MINBPC(enc);
108 while (ptr != end) {
109 switch (BYTE_TYPE(enc, ptr)) {
110 INVALID_CASES(ptr, nextTokPtr)
111 case BT_MINUS:
112 if ((ptr += MINBPC(enc)) == end)
113 return XML_TOK_PARTIAL;
114 if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
115 if ((ptr += MINBPC(enc)) == end)
116 return XML_TOK_PARTIAL;
117 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
118 *nextTokPtr = ptr;
119 return XML_TOK_INVALID;
120 }
121 *nextTokPtr = ptr + MINBPC(enc);
122 return XML_TOK_COMMENT;
123 }
124 break;
125 default:
126 ptr += MINBPC(enc);
127 break;
128 }
129 }
130 }
131 return XML_TOK_PARTIAL;
132 }
133
134 /* ptr points to character following "<!" */
135
136 static
137 int PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, const char *end,
138 const char **nextTokPtr)
139 {
140 if (ptr == end)
141 return XML_TOK_PARTIAL;
142 switch (BYTE_TYPE(enc, ptr)) {
143 case BT_MINUS:
144 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
145 case BT_LSQB:
146 *nextTokPtr = ptr + MINBPC(enc);
147 return XML_TOK_COND_SECT_OPEN;
148 case BT_NMSTRT:
149 case BT_HEX:
150 ptr += MINBPC(enc);
151 break;
152 default:
153 *nextTokPtr = ptr;
154 return XML_TOK_INVALID;
155 }
156 while (ptr != end) {
157 switch (BYTE_TYPE(enc, ptr)) {
158 case BT_PERCNT:
159 if (ptr + MINBPC(enc) == end)
160 return XML_TOK_PARTIAL;
161 /* don't allow <!ENTITY% foo "whatever"> */
162 switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
163 case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
164 *nextTokPtr = ptr;
165 return XML_TOK_INVALID;
166 }
167 /* fall through */
168 case BT_S: case BT_CR: case BT_LF:
169 *nextTokPtr = ptr;
170 return XML_TOK_DECL_OPEN;
171 case BT_NMSTRT:
172 case BT_HEX:
173 ptr += MINBPC(enc);
174 break;
175 default:
176 *nextTokPtr = ptr;
177 return XML_TOK_INVALID;
178 }
179 }
180 return XML_TOK_PARTIAL;
181 }
182
183 static
184 int PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr, const char *end, int *tokPtr)
185 {
186 int upper = 0;
187 *tokPtr = XML_TOK_PI;
188 if (end - ptr != MINBPC(enc)*3)
189 return 1;
190 switch (BYTE_TO_ASCII(enc, ptr)) {
191 case ASCII_x:
192 break;
193 case ASCII_X:
194 upper = 1;
195 break;
196 default:
197 return 1;
198 }
199 ptr += MINBPC(enc);
200 switch (BYTE_TO_ASCII(enc, ptr)) {
201 case ASCII_m:
202 break;
203 case ASCII_M:
204 upper = 1;
205 break;
206 default:
207 return 1;
208 }
209 ptr += MINBPC(enc);
210 switch (BYTE_TO_ASCII(enc, ptr)) {
211 case ASCII_l:
212 break;
213 case ASCII_L:
214 upper = 1;
215 break;
216 default:
217 return 1;
218 }
219 if (upper)
220 return 0;
221 *tokPtr = XML_TOK_XML_DECL;
222 return 1;
223 }
224
225 /* ptr points to character following "<?" */
226
227 static
228 int PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end,
229 const char **nextTokPtr)
230 {
231 int tok;
232 const char *target = ptr;
233 if (ptr == end)
234 return XML_TOK_PARTIAL;
235 switch (BYTE_TYPE(enc, ptr)) {
236 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
237 default:
238 *nextTokPtr = ptr;
239 return XML_TOK_INVALID;
240 }
241 while (ptr != end) {
242 switch (BYTE_TYPE(enc, ptr)) {
243 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
244 case BT_S: case BT_CR: case BT_LF:
245 if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
246 *nextTokPtr = ptr;
247 return XML_TOK_INVALID;
248 }
249 ptr += MINBPC(enc);
250 while (ptr != end) {
251 switch (BYTE_TYPE(enc, ptr)) {
252 INVALID_CASES(ptr, nextTokPtr)
253 case BT_QUEST:
254 ptr += MINBPC(enc);
255 if (ptr == end)
256 return XML_TOK_PARTIAL;
257 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
258 *nextTokPtr = ptr + MINBPC(enc);
259 return tok;
260 }
261 break;
262 default:
263 ptr += MINBPC(enc);
264 break;
265 }
266 }
267 return XML_TOK_PARTIAL;
268 case BT_QUEST:
269 if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
270 *nextTokPtr = ptr;
271 return XML_TOK_INVALID;
272 }
273 ptr += MINBPC(enc);
274 if (ptr == end)
275 return XML_TOK_PARTIAL;
276 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
277 *nextTokPtr = ptr + MINBPC(enc);
278 return tok;
279 }
280 /* fall through */
281 default:
282 *nextTokPtr = ptr;
283 return XML_TOK_INVALID;
284 }
285 }
286 return XML_TOK_PARTIAL;
287 }
288
289
290 static
291 int PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end,
292 const char **nextTokPtr)
293 {
294 static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A, ASCII_T, ASCII_A, ASCII_LSQB };
295 int i;
296 /* CDATA[ */
297 if (end - ptr < 6 * MINBPC(enc))
298 return XML_TOK_PARTIAL;
299 for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
300 if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
301 *nextTokPtr = ptr;
302 return XML_TOK_INVALID;
303 }
304 }
305 *nextTokPtr = ptr;
306 return XML_TOK_CDATA_SECT_OPEN;
307 }
308
309 static
310 int PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
311 const char **nextTokPtr)
312 {
313 if (ptr == end)
314 return XML_TOK_NONE;
315 #if !(MINBPC(enc) == 1)
316 if (MINBPC(enc) > 1) {
317 size_t n = end - ptr;
318 if (n & (MINBPC(enc) - 1)) {
319 n &= ~(MINBPC(enc) - 1);
320 if (n == 0)
321 return XML_TOK_PARTIAL;
322 end = ptr + n;
323 }
324 }
325 #endif
326 switch (BYTE_TYPE(enc, ptr)) {
327 case BT_RSQB:
328 ptr += MINBPC(enc);
329 if (ptr == end)
330 return XML_TOK_PARTIAL;
331 if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
332 break;
333 ptr += MINBPC(enc);
334 if (ptr == end)
335 return XML_TOK_PARTIAL;
336 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
337 ptr -= MINBPC(enc);
338 break;
339 }
340 *nextTokPtr = ptr + MINBPC(enc);
341 return XML_TOK_CDATA_SECT_CLOSE;
342 case BT_CR:
343 ptr += MINBPC(enc);
344 if (ptr == end)
345 return XML_TOK_PARTIAL;
346 if (BYTE_TYPE(enc, ptr) == BT_LF)
347 ptr += MINBPC(enc);
348 *nextTokPtr = ptr;
349 return XML_TOK_DATA_NEWLINE;
350 case BT_LF:
351 *nextTokPtr = ptr + MINBPC(enc);
352 return XML_TOK_DATA_NEWLINE;
353 INVALID_CASES(ptr, nextTokPtr)
354 default:
355 ptr += MINBPC(enc);
356 break;
357 }
358 while (ptr != end) {
359 switch (BYTE_TYPE(enc, ptr)) {
360 #define LEAD_CASE(n) \
361 case BT_LEAD ## n: \
362 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
363 *nextTokPtr = ptr; \
364 return XML_TOK_DATA_CHARS; \
365 } \
366 ptr += n; \
367 break;
368 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
369 #undef LEAD_CASE
370 case BT_NONXML:
371 case BT_MALFORM:
372 case BT_TRAIL:
373 case BT_CR:
374 case BT_LF:
375 case BT_RSQB:
376 *nextTokPtr = ptr;
377 return XML_TOK_DATA_CHARS;
378 default:
379 ptr += MINBPC(enc);
380 break;
381 }
382 }
383 *nextTokPtr = ptr;
384 return XML_TOK_DATA_CHARS;
385 }
386
387 /* ptr points to character following "</" */
388
389 static
390 int PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, const char *end,
391 const char **nextTokPtr)
392 {
393 if (ptr == end)
394 return XML_TOK_PARTIAL;
395 switch (BYTE_TYPE(enc, ptr)) {
396 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
397 default:
398 *nextTokPtr = ptr;
399 return XML_TOK_INVALID;
400 }
401 while (ptr != end) {
402 switch (BYTE_TYPE(enc, ptr)) {
403 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
404 case BT_S: case BT_CR: case BT_LF:
405 for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
406 switch (BYTE_TYPE(enc, ptr)) {
407 case BT_S: case BT_CR: case BT_LF:
408 break;
409 case BT_GT:
410 *nextTokPtr = ptr + MINBPC(enc);
411 return XML_TOK_END_TAG;
412 default:
413 *nextTokPtr = ptr;
414 return XML_TOK_INVALID;
415 }
416 }
417 return XML_TOK_PARTIAL;
418 #ifdef XML_NS
419 case BT_COLON:
420 /* no need to check qname syntax here, since end-tag must match exactly */
421 ptr += MINBPC(enc);
422 break;
423 #endif
424 case BT_GT:
425 *nextTokPtr = ptr + MINBPC(enc);
426 return XML_TOK_END_TAG;
427 default:
428 *nextTokPtr = ptr;
429 return XML_TOK_INVALID;
430 }
431 }
432 return XML_TOK_PARTIAL;
433 }
434
435 /* ptr points to character following "&#X" */
436
437 static
438 int PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr, const char *end,
439 const char **nextTokPtr)
440 {
441 if (ptr != end) {
442 switch (BYTE_TYPE(enc, ptr)) {
443 case BT_DIGIT:
444 case BT_HEX:
445 break;
446 default:
447 *nextTokPtr = ptr;
448 return XML_TOK_INVALID;
449 }
450 for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
451 switch (BYTE_TYPE(enc, ptr)) {
452 case BT_DIGIT:
453 case BT_HEX:
454 break;
455 case BT_SEMI:
456 *nextTokPtr = ptr + MINBPC(enc);
457 return XML_TOK_CHAR_REF;
458 default:
459 *nextTokPtr = ptr;
460 return XML_TOK_INVALID;
461 }
462 }
463 }
464 return XML_TOK_PARTIAL;
465 }
466
467 /* ptr points to character following "&#" */
468
469 static
470 int PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr, const char *end,
471 const char **nextTokPtr)
472 {
473 if (ptr != end) {
474 if (CHAR_MATCHES(enc, ptr, ASCII_x))
475 return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
476 switch (BYTE_TYPE(enc, ptr)) {
477 case BT_DIGIT:
478 break;
479 default:
480 *nextTokPtr = ptr;
481 return XML_TOK_INVALID;
482 }
483 for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
484 switch (BYTE_TYPE(enc, ptr)) {
485 case BT_DIGIT:
486 break;
487 case BT_SEMI:
488 *nextTokPtr = ptr + MINBPC(enc);
489 return XML_TOK_CHAR_REF;
490 default:
491 *nextTokPtr = ptr;
492 return XML_TOK_INVALID;
493 }
494 }
495 }
496 return XML_TOK_PARTIAL;
497 }
498
499 /* ptr points to character following "&" */
500
501 static
502 int PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
503 const char **nextTokPtr)
504 {
505 if (ptr == end)
506 return XML_TOK_PARTIAL;
507 switch (BYTE_TYPE(enc, ptr)) {
508 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
509 case BT_NUM:
510 return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
511 default:
512 *nextTokPtr = ptr;
513 return XML_TOK_INVALID;
514 }
515 while (ptr != end) {
516 switch (BYTE_TYPE(enc, ptr)) {
517 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
518 case BT_SEMI:
519 *nextTokPtr = ptr + MINBPC(enc);
520 return XML_TOK_ENTITY_REF;
521 default:
522 *nextTokPtr = ptr;
523 return XML_TOK_INVALID;
524 }
525 }
526 return XML_TOK_PARTIAL;
527 }
528
529 /* ptr points to character following first character of attribute name */
530
531 static
532 int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
533 const char **nextTokPtr)
534 {
535 #ifdef XML_NS
536 int hadColon = 0;
537 #endif
538 while (ptr != end) {
539 switch (BYTE_TYPE(enc, ptr)) {
540 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
541 #ifdef XML_NS
542 case BT_COLON:
543 if (hadColon) {
544 *nextTokPtr = ptr;
545 return XML_TOK_INVALID;
546 }
547 hadColon = 1;
548 ptr += MINBPC(enc);
549 if (ptr == end)
550 return XML_TOK_PARTIAL;
551 switch (BYTE_TYPE(enc, ptr)) {
552 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
553 default:
554 *nextTokPtr = ptr;
555 return XML_TOK_INVALID;
556 }
557 break;
558 #endif
559 case BT_S: case BT_CR: case BT_LF:
560 for (;;) {
561 int t;
562
563 ptr += MINBPC(enc);
564 if (ptr == end)
565 return XML_TOK_PARTIAL;
566 t = BYTE_TYPE(enc, ptr);
567 if (t == BT_EQUALS)
568 break;
569 switch (t) {
570 case BT_S:
571 case BT_LF:
572 case BT_CR:
573 break;
574 default:
575 *nextTokPtr = ptr;
576 return XML_TOK_INVALID;
577 }
578 }
579 /* fall through */
580 case BT_EQUALS:
581 {
582 int open;
583 #ifdef XML_NS
584 hadColon = 0;
585 #endif
586 for (;;) {
587
588 ptr += MINBPC(enc);
589 if (ptr == end)
590 return XML_TOK_PARTIAL;
591 open = BYTE_TYPE(enc, ptr);
592 if (open == BT_QUOT || open == BT_APOS)
593 break;
594 switch (open) {
595 case BT_S:
596 case BT_LF:
597 case BT_CR:
598 break;
599 default:
600 *nextTokPtr = ptr;
601 return XML_TOK_INVALID;
602 }
603 }
604 ptr += MINBPC(enc);
605 /* in attribute value */
606 for (;;) {
607 int t;
608 if (ptr == end)
609 return XML_TOK_PARTIAL;
610 t = BYTE_TYPE(enc, ptr);
611 if (t == open)
612 break;
613 switch (t) {
614 INVALID_CASES(ptr, nextTokPtr)
615 case BT_AMP:
616 {
617 int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
618 if (tok <= 0) {
619 if (tok == XML_TOK_INVALID)
620 *nextTokPtr = ptr;
621 return tok;
622 }
623 break;
624 }
625 case BT_LT:
626 *nextTokPtr = ptr;
627 return XML_TOK_INVALID;
628 default:
629 ptr += MINBPC(enc);
630 break;
631 }
632 }
633 ptr += MINBPC(enc);
634 if (ptr == end)
635 return XML_TOK_PARTIAL;
636 switch (BYTE_TYPE(enc, ptr)) {
637 case BT_S:
638 case BT_CR:
639 case BT_LF:
640 break;
641 case BT_SOL:
642 goto sol;
643 case BT_GT:
644 goto gt;
645 default:
646 *nextTokPtr = ptr;
647 return XML_TOK_INVALID;
648 }
649 /* ptr points to closing quote */
650 for (;;) {
651 ptr += MINBPC(enc);
652 if (ptr == end)
653 return XML_TOK_PARTIAL;
654 switch (BYTE_TYPE(enc, ptr)) {
655 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
656 case BT_S: case BT_CR: case BT_LF:
657 continue;
658 case BT_GT:
659 gt:
660 *nextTokPtr = ptr + MINBPC(enc);
661 return XML_TOK_START_TAG_WITH_ATTS;
662 case BT_SOL:
663 sol:
664 ptr += MINBPC(enc);
665 if (ptr == end)
666 return XML_TOK_PARTIAL;
667 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
668 *nextTokPtr = ptr;
669 return XML_TOK_INVALID;
670 }
671 *nextTokPtr = ptr + MINBPC(enc);
672 return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
673 default:
674 *nextTokPtr = ptr;
675 return XML_TOK_INVALID;
676 }
677 break;
678 }
679 break;
680 }
681 default:
682 *nextTokPtr = ptr;
683 return XML_TOK_INVALID;
684 }
685 }
686 return XML_TOK_PARTIAL;
687 }
688
689 /* ptr points to character following "<" */
690
691 static
692 int PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
693 const char **nextTokPtr)
694 {
695 #ifdef XML_NS
696 int hadColon;
697 #endif
698 if (ptr == end)
699 return XML_TOK_PARTIAL;
700 switch (BYTE_TYPE(enc, ptr)) {
701 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
702 case BT_EXCL:
703 if ((ptr += MINBPC(enc)) == end)
704 return XML_TOK_PARTIAL;
705 switch (BYTE_TYPE(enc, ptr)) {
706 case BT_MINUS:
707 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
708 case BT_LSQB:
709 return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc), end, nextTokPtr);
710 }
711 *nextTokPtr = ptr;
712 return XML_TOK_INVALID;
713 case BT_QUEST:
714 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
715 case BT_SOL:
716 return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
717 default:
718 *nextTokPtr = ptr;
719 return XML_TOK_INVALID;
720 }
721 #ifdef XML_NS
722 hadColon = 0;
723 #endif
724 /* we have a start-tag */
725 while (ptr != end) {
726 switch (BYTE_TYPE(enc, ptr)) {
727 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
728 #ifdef XML_NS
729 case BT_COLON:
730 if (hadColon) {
731 *nextTokPtr = ptr;
732 return XML_TOK_INVALID;
733 }
734 hadColon = 1;
735 ptr += MINBPC(enc);
736 if (ptr == end)
737 return XML_TOK_PARTIAL;
738 switch (BYTE_TYPE(enc, ptr)) {
739 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
740 default:
741 *nextTokPtr = ptr;
742 return XML_TOK_INVALID;
743 }
744 break;
745 #endif
746 case BT_S: case BT_CR: case BT_LF:
747 {
748 ptr += MINBPC(enc);
749 while (ptr != end) {
750 switch (BYTE_TYPE(enc, ptr)) {
751 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
752 case BT_GT:
753 goto gt;
754 case BT_SOL:
755 goto sol;
756 case BT_S: case BT_CR: case BT_LF:
757 ptr += MINBPC(enc);
758 continue;
759 default:
760 *nextTokPtr = ptr;
761 return XML_TOK_INVALID;
762 }
763 return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
764 }
765 return XML_TOK_PARTIAL;
766 }
767 case BT_GT:
768 gt:
769 *nextTokPtr = ptr + MINBPC(enc);
770 return XML_TOK_START_TAG_NO_ATTS;
771 case BT_SOL:
772 sol:
773 ptr += MINBPC(enc);
774 if (ptr == end)
775 return XML_TOK_PARTIAL;
776 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
777 *nextTokPtr = ptr;
778 return XML_TOK_INVALID;
779 }
780 *nextTokPtr = ptr + MINBPC(enc);
781 return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
782 default:
783 *nextTokPtr = ptr;
784 return XML_TOK_INVALID;
785 }
786 }
787 return XML_TOK_PARTIAL;
788 }
789
790 static
791 int PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
792 const char **nextTokPtr)
793 {
794 if (ptr == end)
795 return XML_TOK_NONE;
796 #if !(MINBPC(enc) == 1)
797 if (MINBPC(enc) > 1) {
798 size_t n = end - ptr;
799 if (n & (MINBPC(enc) - 1)) {
800 n &= ~(MINBPC(enc) - 1);
801 if (n == 0)
802 return XML_TOK_PARTIAL;
803 end = ptr + n;
804 }
805 }
806 #endif
807 switch (BYTE_TYPE(enc, ptr)) {
808 case BT_LT:
809 return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
810 case BT_AMP:
811 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
812 case BT_CR:
813 ptr += MINBPC(enc);
814 if (ptr == end)
815 return XML_TOK_TRAILING_CR;
816 if (BYTE_TYPE(enc, ptr) == BT_LF)
817 ptr += MINBPC(enc);
818 *nextTokPtr = ptr;
819 return XML_TOK_DATA_NEWLINE;
820 case BT_LF:
821 *nextTokPtr = ptr + MINBPC(enc);
822 return XML_TOK_DATA_NEWLINE;
823 case BT_RSQB:
824 ptr += MINBPC(enc);
825 if (ptr == end)
826 return XML_TOK_TRAILING_RSQB;
827 if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
828 break;
829 ptr += MINBPC(enc);
830 if (ptr == end)
831 return XML_TOK_TRAILING_RSQB;
832 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
833 ptr -= MINBPC(enc);
834 break;
835 }
836 *nextTokPtr = ptr;
837 return XML_TOK_INVALID;
838 INVALID_CASES(ptr, nextTokPtr)
839 default:
840 ptr += MINBPC(enc);
841 break;
842 }
843 while (ptr != end) {
844 switch (BYTE_TYPE(enc, ptr)) {
845 #define LEAD_CASE(n) \
846 case BT_LEAD ## n: \
847 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
848 *nextTokPtr = ptr; \
849 return XML_TOK_DATA_CHARS; \
850 } \
851 ptr += n; \
852 break;
853 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
854 #undef LEAD_CASE
855 case BT_RSQB:
856 if (ptr + MINBPC(enc) != end) {
857 if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
858 ptr += MINBPC(enc);
859 break;
860 }
861 if (ptr + 2*MINBPC(enc) != end) {
862 if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
863 ptr += MINBPC(enc);
864 break;
865 }
866 *nextTokPtr = ptr + 2*MINBPC(enc);
867 return XML_TOK_INVALID;
868 }
869 }
870 /* fall through */
871 case BT_AMP:
872 case BT_LT:
873 case BT_NONXML:
874 case BT_MALFORM:
875 case BT_TRAIL:
876 case BT_CR:
877 case BT_LF:
878 *nextTokPtr = ptr;
879 return XML_TOK_DATA_CHARS;
880 default:
881 ptr += MINBPC(enc);
882 break;
883 }
884 }
885 *nextTokPtr = ptr;
886 return XML_TOK_DATA_CHARS;
887 }
888
889 /* ptr points to character following "%" */
890
891 static
892 int PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
893 const char **nextTokPtr)
894 {
895 if (ptr == end)
896 return XML_TOK_PARTIAL;
897 switch (BYTE_TYPE(enc, ptr)) {
898 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
899 case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
900 *nextTokPtr = ptr;
901 return XML_TOK_PERCENT;
902 default:
903 *nextTokPtr = ptr;
904 return XML_TOK_INVALID;
905 }
906 while (ptr != end) {
907 switch (BYTE_TYPE(enc, ptr)) {
908 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
909 case BT_SEMI:
910 *nextTokPtr = ptr + MINBPC(enc);
911 return XML_TOK_PARAM_ENTITY_REF;
912 default:
913 *nextTokPtr = ptr;
914 return XML_TOK_INVALID;
915 }
916 }
917 return XML_TOK_PARTIAL;
918 }
919
920 static
921 int PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
922 const char **nextTokPtr)
923 {
924 if (ptr == end)
925 return XML_TOK_PARTIAL;
926 switch (BYTE_TYPE(enc, ptr)) {
927 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
928 default:
929 *nextTokPtr = ptr;
930 return XML_TOK_INVALID;
931 }
932 while (ptr != end) {
933 switch (BYTE_TYPE(enc, ptr)) {
934 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
935 case BT_CR: case BT_LF: case BT_S:
936 case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
937 *nextTokPtr = ptr;
938 return XML_TOK_POUND_NAME;
939 default:
940 *nextTokPtr = ptr;
941 return XML_TOK_INVALID;
942 }
943 }
944 return -XML_TOK_POUND_NAME;
945 }
946
947 static
948 int PREFIX(scanLit)(int open, const ENCODING *enc,
949 const char *ptr, const char *end,
950 const char **nextTokPtr)
951 {
952 while (ptr != end) {
953 int t = BYTE_TYPE(enc, ptr);
954 switch (t) {
955 INVALID_CASES(ptr, nextTokPtr)
956 case BT_QUOT:
957 case BT_APOS:
958 ptr += MINBPC(enc);
959 if (t != open)
960 break;
961 if (ptr == end)
962 return -XML_TOK_LITERAL;
963 *nextTokPtr = ptr;
964 switch (BYTE_TYPE(enc, ptr)) {
965 case BT_S: case BT_CR: case BT_LF:
966 case BT_GT: case BT_PERCNT: case BT_LSQB:
967 return XML_TOK_LITERAL;
968 default:
969 return XML_TOK_INVALID;
970 }
971 default:
972 ptr += MINBPC(enc);
973 break;
974 }
975 }
976 return XML_TOK_PARTIAL;
977 }
978
979 static
980 int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
981 const char **nextTokPtr)
982 {
983 int tok;
984 if (ptr == end)
985 return XML_TOK_NONE;
986 #if !(MINBPC(enc) == 1)
987 if (MINBPC(enc) > 1) {
988 size_t n = end - ptr;
989 if (n & (MINBPC(enc) - 1)) {
990 n &= ~(MINBPC(enc) - 1);
991 if (n == 0)
992 return XML_TOK_PARTIAL;
993 end = ptr + n;
994 }
995 }
996 #endif
997 switch (BYTE_TYPE(enc, ptr)) {
998 case BT_QUOT:
999 return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
1000 case BT_APOS:
1001 return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
1002 case BT_LT:
1003 {
1004 ptr += MINBPC(enc);
1005 if (ptr == end)
1006 return XML_TOK_PARTIAL;
1007 switch (BYTE_TYPE(enc, ptr)) {
1008 case BT_EXCL:
1009 return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1010 case BT_QUEST:
1011 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1012 case BT_NMSTRT:
1013 case BT_HEX:
1014 case BT_NONASCII:
1015 case BT_LEAD2:
1016 case BT_LEAD3:
1017 case BT_LEAD4:
1018 *nextTokPtr = ptr - MINBPC(enc);
1019 return XML_TOK_INSTANCE_START;
1020 }
1021 *nextTokPtr = ptr;
1022 return XML_TOK_INVALID;
1023 }
1024 case BT_CR:
1025 if (ptr + MINBPC(enc) == end)
1026 return -XML_TOK_PROLOG_S;
1027 /* fall through */
1028 case BT_S: case BT_LF:
1029 for (;;) {
1030 ptr += MINBPC(enc);
1031 if (ptr == end)
1032 break;
1033 switch (BYTE_TYPE(enc, ptr)) {
1034 case BT_S: case BT_LF:
1035 break;
1036 case BT_CR:
1037 /* don't split CR/LF pair */
1038 if (ptr + MINBPC(enc) != end)
1039 break;
1040 /* fall through */
1041 default:
1042 *nextTokPtr = ptr;
1043 return XML_TOK_PROLOG_S;
1044 }
1045 }
1046 *nextTokPtr = ptr;
1047 return XML_TOK_PROLOG_S;
1048 case BT_PERCNT:
1049 return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1050 case BT_COMMA:
1051 *nextTokPtr = ptr + MINBPC(enc);
1052 return XML_TOK_COMMA;
1053 case BT_LSQB:
1054 *nextTokPtr = ptr + MINBPC(enc);
1055 return XML_TOK_OPEN_BRACKET;
1056 case BT_RSQB:
1057 ptr += MINBPC(enc);
1058 if (ptr == end)
1059 return -XML_TOK_CLOSE_BRACKET;
1060 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1061 if (ptr + MINBPC(enc) == end)
1062 return XML_TOK_PARTIAL;
1063 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
1064 *nextTokPtr = ptr + 2*MINBPC(enc);
1065 return XML_TOK_COND_SECT_CLOSE;
1066 }
1067 }
1068 *nextTokPtr = ptr;
1069 return XML_TOK_CLOSE_BRACKET;
1070 case BT_LPAR:
1071 *nextTokPtr = ptr + MINBPC(enc);
1072 return XML_TOK_OPEN_PAREN;
1073 case BT_RPAR:
1074 ptr += MINBPC(enc);
1075 if (ptr == end)
1076 return -XML_TOK_CLOSE_PAREN;
1077 switch (BYTE_TYPE(enc, ptr)) {
1078 case BT_AST:
1079 *nextTokPtr = ptr + MINBPC(enc);
1080 return XML_TOK_CLOSE_PAREN_ASTERISK;
1081 case BT_QUEST:
1082 *nextTokPtr = ptr + MINBPC(enc);
1083 return XML_TOK_CLOSE_PAREN_QUESTION;
1084 case BT_PLUS:
1085 *nextTokPtr = ptr + MINBPC(enc);
1086 return XML_TOK_CLOSE_PAREN_PLUS;
1087 case BT_CR: case BT_LF: case BT_S:
1088 case BT_GT: case BT_COMMA: case BT_VERBAR:
1089 case BT_RPAR:
1090 *nextTokPtr = ptr;
1091 return XML_TOK_CLOSE_PAREN;
1092 }
1093 *nextTokPtr = ptr;
1094 return XML_TOK_INVALID;
1095 case BT_VERBAR:
1096 *nextTokPtr = ptr + MINBPC(enc);
1097 return XML_TOK_OR;
1098 case BT_GT:
1099 *nextTokPtr = ptr + MINBPC(enc);
1100 return XML_TOK_DECL_CLOSE;
1101 case BT_NUM:
1102 return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1103 #ifdef XML_MIN_SIZE
1104 #define LEAD_CASE(n) \
1105 case BT_LEAD ## n: \
1106 if (end - ptr < n) \
1107 return XML_TOK_PARTIAL_CHAR; \
1108 if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
1109 ptr += n; \
1110 tok = XML_TOK_NAME; \
1111 break; \
1112 } \
1113 if (IS_NAME_CHAR(enc, ptr, n)) { \
1114 ptr += n; \
1115 tok = XML_TOK_NMTOKEN; \
1116 break; \
1117 } \
1118 *nextTokPtr = ptr; \
1119 return XML_TOK_INVALID;
1120 #else
1121 #define LEAD_CASE(n) \
1122 case BT_LEAD ## n: \
1123 if (end - ptr < n) \
1124 return XML_TOK_PARTIAL_CHAR; \
1125 *nextTokPtr = ptr; \
1126 return XML_TOK_INVALID;
1127 #endif
1128 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1129 #undef LEAD_CASE
1130 case BT_NMSTRT:
1131 case BT_HEX:
1132 tok = XML_TOK_NAME;
1133 ptr += MINBPC(enc);
1134 break;
1135 case BT_DIGIT:
1136 case BT_NAME:
1137 case BT_MINUS:
1138 #ifdef XML_NS
1139 case BT_COLON:
1140 #endif
1141 tok = XML_TOK_NMTOKEN;
1142 ptr += MINBPC(enc);
1143 break;
1144 case BT_NONASCII:
1145 #ifdef XML_MIN_SIZE
1146 if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
1147 ptr += MINBPC(enc);
1148 tok = XML_TOK_NAME;
1149 break;
1150 }
1151 if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
1152 ptr += MINBPC(enc);
1153 tok = XML_TOK_NMTOKEN;
1154 break;
1155 }
1156 #endif
1157 /* fall through */
1158 default:
1159 *nextTokPtr = ptr;
1160 return XML_TOK_INVALID;
1161 }
1162 while (ptr != end) {
1163 switch (BYTE_TYPE(enc, ptr)) {
1164 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1165 case BT_GT: case BT_RPAR: case BT_COMMA:
1166 case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
1167 case BT_S: case BT_CR: case BT_LF:
1168 *nextTokPtr = ptr;
1169 return tok;
1170 #ifdef XML_NS
1171 case BT_COLON:
1172 ptr += MINBPC(enc);
1173 switch (tok) {
1174 case XML_TOK_NAME:
1175 if (ptr == end)
1176 return XML_TOK_PARTIAL;
1177 tok = XML_TOK_PREFIXED_NAME;
1178 switch (BYTE_TYPE(enc, ptr)) {
1179 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1180 default:
1181 tok = XML_TOK_NMTOKEN;
1182 break;
1183 }
1184 break;
1185 case XML_TOK_PREFIXED_NAME:
1186 tok = XML_TOK_NMTOKEN;
1187 break;
1188 }
1189 break;
1190 #endif
1191 case BT_PLUS:
1192 if (tok == XML_TOK_NMTOKEN) {
1193 *nextTokPtr = ptr;
1194 return XML_TOK_INVALID;
1195 }
1196 *nextTokPtr = ptr + MINBPC(enc);
1197 return XML_TOK_NAME_PLUS;
1198 case BT_AST:
1199 if (tok == XML_TOK_NMTOKEN) {
1200 *nextTokPtr = ptr;
1201 return XML_TOK_INVALID;
1202 }
1203 *nextTokPtr = ptr + MINBPC(enc);
1204 return XML_TOK_NAME_ASTERISK;
1205 case BT_QUEST:
1206 if (tok == XML_TOK_NMTOKEN) {
1207 *nextTokPtr = ptr;
1208 return XML_TOK_INVALID;
1209 }
1210 *nextTokPtr = ptr + MINBPC(enc);
1211 return XML_TOK_NAME_QUESTION;
1212 default:
1213 *nextTokPtr = ptr;
1214 return XML_TOK_INVALID;
1215 }
1216 }
1217 return -tok;
1218 }
1219
1220 static
1221 int PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1222 const char **nextTokPtr)
1223 {
1224 const char *start;
1225 if (ptr == end)
1226 return XML_TOK_NONE;
1227 start = ptr;
1228 while (ptr != end) {
1229 switch (BYTE_TYPE(enc, ptr)) {
1230 #define LEAD_CASE(n) \
1231 case BT_LEAD ## n: ptr += n; break;
1232 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1233 #undef LEAD_CASE
1234 case BT_AMP:
1235 if (ptr == start)
1236 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1237 *nextTokPtr = ptr;
1238 return XML_TOK_DATA_CHARS;
1239 case BT_LT:
1240 /* this is for inside entity references */
1241 *nextTokPtr = ptr;
1242 return XML_TOK_INVALID;
1243 case BT_LF:
1244 if (ptr == start) {
1245 *nextTokPtr = ptr + MINBPC(enc);
1246 return XML_TOK_DATA_NEWLINE;
1247 }
1248 *nextTokPtr = ptr;
1249 return XML_TOK_DATA_CHARS;
1250 case BT_CR:
1251 if (ptr == start) {
1252 ptr += MINBPC(enc);
1253 if (ptr == end)
1254 return XML_TOK_TRAILING_CR;
1255 if (BYTE_TYPE(enc, ptr) == BT_LF)
1256 ptr += MINBPC(enc);
1257 *nextTokPtr = ptr;
1258 return XML_TOK_DATA_NEWLINE;
1259 }
1260 *nextTokPtr = ptr;
1261 return XML_TOK_DATA_CHARS;
1262 case BT_S:
1263 if (ptr == start) {
1264 *nextTokPtr = ptr + MINBPC(enc);
1265 return XML_TOK_ATTRIBUTE_VALUE_S;
1266 }
1267 *nextTokPtr = ptr;
1268 return XML_TOK_DATA_CHARS;
1269 default:
1270 ptr += MINBPC(enc);
1271 break;
1272 }
1273 }
1274 *nextTokPtr = ptr;
1275 return XML_TOK_DATA_CHARS;
1276 }
1277
1278 static
1279 int PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1280 const char **nextTokPtr)
1281 {
1282 const char *start;
1283 if (ptr == end)
1284 return XML_TOK_NONE;
1285 start = ptr;
1286 while (ptr != end) {
1287 switch (BYTE_TYPE(enc, ptr)) {
1288 #define LEAD_CASE(n) \
1289 case BT_LEAD ## n: ptr += n; break;
1290 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1291 #undef LEAD_CASE
1292 case BT_AMP:
1293 if (ptr == start)
1294 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1295 *nextTokPtr = ptr;
1296 return XML_TOK_DATA_CHARS;
1297 case BT_PERCNT:
1298 if (ptr == start)
1299 return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1300 *nextTokPtr = ptr;
1301 return XML_TOK_DATA_CHARS;
1302 case BT_LF:
1303 if (ptr == start) {
1304 *nextTokPtr = ptr + MINBPC(enc);
1305 return XML_TOK_DATA_NEWLINE;
1306 }
1307 *nextTokPtr = ptr;
1308 return XML_TOK_DATA_CHARS;
1309 case BT_CR:
1310 if (ptr == start) {
1311 ptr += MINBPC(enc);
1312 if (ptr == end)
1313 return XML_TOK_TRAILING_CR;
1314 if (BYTE_TYPE(enc, ptr) == BT_LF)
1315 ptr += MINBPC(enc);
1316 *nextTokPtr = ptr;
1317 return XML_TOK_DATA_NEWLINE;
1318 }
1319 *nextTokPtr = ptr;
1320 return XML_TOK_DATA_CHARS;
1321 default:
1322 ptr += MINBPC(enc);
1323 break;
1324 }
1325 }
1326 *nextTokPtr = ptr;
1327 return XML_TOK_DATA_CHARS;
1328 }
1329
1330 #ifdef XML_DTD
1331
1332 static
1333 int PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
1334 const char **nextTokPtr)
1335 {
1336 int level = 0;
1337 if (MINBPC(enc) > 1) {
1338 size_t n = end - ptr;
1339 if (n & (MINBPC(enc) - 1)) {
1340 n &= ~(MINBPC(enc) - 1);
1341 end = ptr + n;
1342 }
1343 }
1344 while (ptr != end) {
1345 switch (BYTE_TYPE(enc, ptr)) {
1346 INVALID_CASES(ptr, nextTokPtr)
1347 case BT_LT:
1348 if ((ptr += MINBPC(enc)) == end)
1349 return XML_TOK_PARTIAL;
1350 if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
1351 if ((ptr += MINBPC(enc)) == end)
1352 return XML_TOK_PARTIAL;
1353 if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
1354 ++level;
1355 ptr += MINBPC(enc);
1356 }
1357 }
1358 break;
1359 case BT_RSQB:
1360 if ((ptr += MINBPC(enc)) == end)
1361 return XML_TOK_PARTIAL;
1362 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1363 if ((ptr += MINBPC(enc)) == end)
1364 return XML_TOK_PARTIAL;
1365 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
1366 ptr += MINBPC(enc);
1367 if (level == 0) {
1368 *nextTokPtr = ptr;
1369 return XML_TOK_IGNORE_SECT;
1370 }
1371 --level;
1372 }
1373 }
1374 break;
1375 default:
1376 ptr += MINBPC(enc);
1377 break;
1378 }
1379 }
1380 return XML_TOK_PARTIAL;
1381 }
1382
1383 #endif /* XML_DTD */
1384
1385 static
1386 int PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1387 const char **badPtr)
1388 {
1389 ptr += MINBPC(enc);
1390 end -= MINBPC(enc);
1391 for (; ptr != end; ptr += MINBPC(enc)) {
1392 switch (BYTE_TYPE(enc, ptr)) {
1393 case BT_DIGIT:
1394 case BT_HEX:
1395 case BT_MINUS:
1396 case BT_APOS:
1397 case BT_LPAR:
1398 case BT_RPAR:
1399 case BT_PLUS:
1400 case BT_COMMA:
1401 case BT_SOL:
1402 case BT_EQUALS:
1403 case BT_QUEST:
1404 case BT_CR:
1405 case BT_LF:
1406 case BT_SEMI:
1407 case BT_EXCL:
1408 case BT_AST:
1409 case BT_PERCNT:
1410 case BT_NUM:
1411 #ifdef XML_NS
1412 case BT_COLON:
1413 #endif
1414 break;
1415 case BT_S:
1416 if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
1417 *badPtr = ptr;
1418 return 0;
1419 }
1420 break;
1421 case BT_NAME:
1422 case BT_NMSTRT:
1423 if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1424 break;
1425 default:
1426 switch (BYTE_TO_ASCII(enc, ptr)) {
1427 case 0x24: /* $ */
1428 case 0x40: /* @ */
1429 break;
1430 default:
1431 *badPtr = ptr;
1432 return 0;
1433 }
1434 break;
1435 }
1436 }
1437 return 1;
1438 }
1439
1440 /* This must only be called for a well-formed start-tag or empty element tag.
1441 Returns the number of attributes. Pointers to the first attsMax attributes
1442 are stored in atts. */
1443
1444 static
1445 int PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
1446 int attsMax, ATTRIBUTE *atts)
1447 {
1448 enum { other, inName, inValue } state = inName;
1449 int nAtts = 0;
1450 int open = 0; /* defined when state == inValue;
1451 initialization just to shut up compilers */
1452
1453 for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
1454 switch (BYTE_TYPE(enc, ptr)) {
1455 #define START_NAME \
1456 if (state == other) { \
1457 if (nAtts < attsMax) { \
1458 atts[nAtts].name = ptr; \
1459 atts[nAtts].normalized = 1; \
1460 } \
1461 state = inName; \
1462 }
1463 #define LEAD_CASE(n) \
1464 case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
1465 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1466 #undef LEAD_CASE
1467 case BT_NONASCII:
1468 case BT_NMSTRT:
1469 case BT_HEX:
1470 START_NAME
1471 break;
1472 #undef START_NAME
1473 case BT_QUOT:
1474 if (state != inValue) {
1475 if (nAtts < attsMax)
1476 atts[nAtts].valuePtr = ptr + MINBPC(enc);
1477 state = inValue;
1478 open = BT_QUOT;
1479 }
1480 else if (open == BT_QUOT) {
1481 state = other;
1482 if (nAtts < attsMax)
1483 atts[nAtts].valueEnd = ptr;
1484 nAtts++;
1485 }
1486 break;
1487 case BT_APOS:
1488 if (state != inValue) {
1489 if (nAtts < attsMax)
1490 atts[nAtts].valuePtr = ptr + MINBPC(enc);
1491 state = inValue;
1492 open = BT_APOS;
1493 }
1494 else if (open == BT_APOS) {
1495 state = other;
1496 if (nAtts < attsMax)
1497 atts[nAtts].valueEnd = ptr;
1498 nAtts++;
1499 }
1500 break;
1501 case BT_AMP:
1502 if (nAtts < attsMax)
1503 atts[nAtts].normalized = 0;
1504 break;
1505 case BT_S:
1506 if (state == inName)
1507 state = other;
1508 else if (state == inValue
1509 && nAtts < attsMax
1510 && atts[nAtts].normalized
1511 && (ptr == atts[nAtts].valuePtr
1512 || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
1513 || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
1514 || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
1515 atts[nAtts].normalized = 0;
1516 break;
1517 case BT_CR: case BT_LF:
1518 /* This case ensures that the first attribute name is counted
1519 Apart from that we could just change state on the quote. */
1520 if (state == inName)
1521 state = other;
1522 else if (state == inValue && nAtts < attsMax)
1523 atts[nAtts].normalized = 0;
1524 break;
1525 case BT_GT:
1526 case BT_SOL:
1527 if (state != inValue)
1528 return nAtts;
1529 break;
1530 default:
1531 break;
1532 }
1533 }
1534 /* not reached */
1535 }
1536
1537 static
1538 int PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr)
1539 {
1540 int result = 0;
1541 /* skip &# */
1542 ptr += 2*MINBPC(enc);
1543 if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
1544 for (ptr += MINBPC(enc); !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1545 int c = BYTE_TO_ASCII(enc, ptr);
1546 switch (c) {
1547 case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4:
1548 case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9:
1549 result <<= 4;
1550 result |= (c - ASCII_0);
1551 break;
1552 case ASCII_A: case ASCII_B: case ASCII_C: case ASCII_D: case ASCII_E: case ASCII_F:
1553 result <<= 4;
1554 result += 10 + (c - ASCII_A);
1555 break;
1556 case ASCII_a: case ASCII_b: case ASCII_c: case ASCII_d: case ASCII_e: case ASCII_f:
1557 result <<= 4;
1558 result += 10 + (c - ASCII_a);
1559 break;
1560 }
1561 if (result >= 0x110000)
1562 return -1;
1563 }
1564 }
1565 else {
1566 for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1567 int c = BYTE_TO_ASCII(enc, ptr);
1568 result *= 10;
1569 result += (c - ASCII_0);
1570 if (result >= 0x110000)
1571 return -1;
1572 }
1573 }
1574 return checkCharRefNumber(result);
1575 }
1576
1577 static
1578 int PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr, const char *end)
1579 {
1580 switch ((end - ptr)/MINBPC(enc)) {
1581 case 2:
1582 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1583 switch (BYTE_TO_ASCII(enc, ptr)) {
1584 case ASCII_l:
1585 return ASCII_LT;
1586 case ASCII_g:
1587 return ASCII_GT;
1588 }
1589 }
1590 break;
1591 case 3:
1592 if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1593 ptr += MINBPC(enc);
1594 if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
1595 ptr += MINBPC(enc);
1596 if (CHAR_MATCHES(enc, ptr, ASCII_p))
1597 return ASCII_AMP;
1598 }
1599 }
1600 break;
1601 case 4:
1602 switch (BYTE_TO_ASCII(enc, ptr)) {
1603 case ASCII_q:
1604 ptr += MINBPC(enc);
1605 if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
1606 ptr += MINBPC(enc);
1607 if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1608 ptr += MINBPC(enc);
1609 if (CHAR_MATCHES(enc, ptr, ASCII_t))
1610 return ASCII_QUOT;
1611 }
1612 }
1613 break;
1614 case ASCII_a:
1615 ptr += MINBPC(enc);
1616 if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
1617 ptr += MINBPC(enc);
1618 if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1619 ptr += MINBPC(enc);
1620 if (CHAR_MATCHES(enc, ptr, ASCII_s))
1621 return ASCII_APOS;
1622 }
1623 }
1624 break;
1625 }
1626 }
1627 return 0;
1628 }
1629
1630 static
1631 int PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
1632 {
1633 for (;;) {
1634 switch (BYTE_TYPE(enc, ptr1)) {
1635 #define LEAD_CASE(n) \
1636 case BT_LEAD ## n: \
1637 if (*ptr1++ != *ptr2++) \
1638 return 0;
1639 LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
1640 #undef LEAD_CASE
1641 /* fall through */
1642 if (*ptr1++ != *ptr2++)
1643 return 0;
1644 break;
1645 case BT_NONASCII:
1646 case BT_NMSTRT:
1647 #ifdef XML_NS
1648 case BT_COLON:
1649 #endif
1650 case BT_HEX:
1651 case BT_DIGIT:
1652 case BT_NAME:
1653 case BT_MINUS:
1654 if (*ptr2++ != *ptr1++)
1655 return 0;
1656 if (MINBPC(enc) > 1) {
1657 if (*ptr2++ != *ptr1++)
1658 return 0;
1659 if (MINBPC(enc) > 2) {
1660 if (*ptr2++ != *ptr1++)
1661 return 0;
1662 if (MINBPC(enc) > 3) {
1663 if (*ptr2++ != *ptr1++)
1664 return 0;
1665 }
1666 }
1667 }
1668 break;
1669 default:
1670 if (MINBPC(enc) == 1 && *ptr1 == *ptr2)
1671 return 1;
1672 switch (BYTE_TYPE(enc, ptr2)) {
1673 case BT_LEAD2:
1674 case BT_LEAD3:
1675 case BT_LEAD4:
1676 case BT_NONASCII:
1677 case BT_NMSTRT:
1678 #ifdef XML_NS
1679 case BT_COLON:
1680 #endif
1681 case BT_HEX:
1682 case BT_DIGIT:
1683 case BT_NAME:
1684 case BT_MINUS:
1685 return 0;
1686 default:
1687 return 1;
1688 }
1689 }
1690 }
1691 /* not reached */
1692 }
1693
1694 static
1695 int PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
1696 const char *end1, const char *ptr2)
1697 {
1698 for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
1699 if (ptr1 == end1)
1700 return 0;
1701 if (!CHAR_MATCHES(enc, ptr1, *ptr2))
1702 return 0;
1703 }
1704 return ptr1 == end1;
1705 }
1706
1707 static
1708 int PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
1709 {
1710 const char *start = ptr;
1711 for (;;) {
1712 switch (BYTE_TYPE(enc, ptr)) {
1713 #define LEAD_CASE(n) \
1714 case BT_LEAD ## n: ptr += n; break;
1715 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1716 #undef LEAD_CASE
1717 case BT_NONASCII:
1718 case BT_NMSTRT:
1719 #ifdef XML_NS
1720 case BT_COLON:
1721 #endif
1722 case BT_HEX:
1723 case BT_DIGIT:
1724 case BT_NAME:
1725 case BT_MINUS:
1726 ptr += MINBPC(enc);
1727 break;
1728 default:
1729 return ptr - start;
1730 }
1731 }
1732 }
1733
1734 static
1735 const char *PREFIX(skipS)(const ENCODING *enc, const char *ptr)
1736 {
1737 for (;;) {
1738 switch (BYTE_TYPE(enc, ptr)) {
1739 case BT_LF:
1740 case BT_CR:
1741 case BT_S:
1742 ptr += MINBPC(enc);
1743 break;
1744 default:
1745 return ptr;
1746 }
1747 }
1748 }
1749
1750 static
1751 void PREFIX(updatePosition)(const ENCODING *enc,
1752 const char *ptr,
1753 const char *end,
1754 POSITION *pos)
1755 {
1756 while (ptr != end) {
1757 switch (BYTE_TYPE(enc, ptr)) {
1758 #define LEAD_CASE(n) \
1759 case BT_LEAD ## n: \
1760 ptr += n; \
1761 break;
1762 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1763 #undef LEAD_CASE
1764 case BT_LF:
1765 pos->columnNumber = (unsigned)-1;
1766 pos->lineNumber++;
1767 ptr += MINBPC(enc);
1768 break;
1769 case BT_CR:
1770 pos->lineNumber++;
1771 ptr += MINBPC(enc);
1772 if (ptr != end && BYTE_TYPE(enc, ptr) == BT_LF)
1773 ptr += MINBPC(enc);
1774 pos->columnNumber = (unsigned)-1;
1775 break;
1776 default:
1777 ptr += MINBPC(enc);
1778 break;
1779 }
1780 pos->columnNumber++;
1781 }
1782 }
1783
1784 #undef DO_LEAD_CASE
1785 #undef MULTIBYTE_CASES
1786 #undef INVALID_CASES
1787 #undef CHECK_NAME_CASE
1788 #undef CHECK_NAME_CASES
1789 #undef CHECK_NMSTRT_CASE
1790 #undef CHECK_NMSTRT_CASES