]> git.saurik.com Git - wxWidgets.git/blob - contrib/src/xml/expat/xmltok/xmltok_impl.c
minor change: buf[(size_t)pos] instead of buf.c_str()[pos]
[wxWidgets.git] / contrib / src / xml / expat / xmltok / xmltok_impl.c
1 /*
2 Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
3 See the file copying.txt for copying permission.
4 */
5
/* Fallback for includers that did not supply IS_INVALID_CHAR: with this
   definition no multibyte character is ever reported as invalid. */
#ifndef IS_INVALID_CHAR
#define IS_INVALID_CHAR(enc, ptr, n) (0)
#endif

/* Switch case for the lead byte of an n-byte character.  Relies on `enc`
   and `end` being visible where the macro is expanded.  Returns
   XML_TOK_PARTIAL_CHAR when fewer than n bytes remain, returns
   XML_TOK_INVALID (storing ptr through nextTokPtr) when IS_INVALID_CHAR
   rejects the character, and otherwise advances ptr by n bytes.
   `n` is deliberately left unparenthesised because it is token-pasted. */
#define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
  case BT_LEAD ## n: \
    if (end - (ptr) < n) \
      return XML_TOK_PARTIAL_CHAR; \
    if (IS_INVALID_CHAR(enc, ptr, n)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    (ptr) += n; \
    break;

/* Switch cases shared by scanners that accept arbitrary character data:
   validates 2-, 3- and 4-byte characters and rejects the BT_NONXML,
   BT_MALFORM and BT_TRAIL byte types with XML_TOK_INVALID. */
#define INVALID_CASES(ptr, nextTokPtr) \
  INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
  case BT_NONXML: \
  case BT_MALFORM: \
  case BT_TRAIL: \
    *(nextTokPtr) = (ptr); \
    return XML_TOK_INVALID;
30
/* Switch case for an n-byte character inside a name.  Returns
   XML_TOK_PARTIAL_CHAR when fewer than n bytes remain, XML_TOK_INVALID
   (storing ptr through nextTokPtr) when IS_NAME_CHAR rejects the
   character, and otherwise advances ptr by n bytes.
   `n` is left unparenthesised because it is token-pasted. */
#define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
  case BT_LEAD ## n: \
    if ((end) - (ptr) < n) \
      return XML_TOK_PARTIAL_CHAR; \
    if (!IS_NAME_CHAR(enc, ptr, n)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    (ptr) += n; \
    break;

/* Switch cases that consume one name character.  A BT_NONASCII unit is
   first checked with IS_NAME_CHAR_MINBPC and then deliberately falls
   through into the single-unit cases, which advance ptr by MINBPC(enc). */
#define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_NMSTRT: \
  case BT_HEX: \
  case BT_DIGIT: \
  case BT_NAME: \
  case BT_MINUS: \
    (ptr) += MINBPC(enc); \
    break; \
  CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
58
/* Switch case for an n-byte character at the start of a name.  Returns
   XML_TOK_PARTIAL_CHAR when fewer than n bytes remain, XML_TOK_INVALID
   (storing ptr through nextTokPtr) when IS_NMSTRT_CHAR rejects the
   character, and otherwise advances ptr by n bytes.
   `n` is left unparenthesised because it is token-pasted. */
#define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
  case BT_LEAD ## n: \
    if ((end) - (ptr) < n) \
      return XML_TOK_PARTIAL_CHAR; \
    if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    (ptr) += n; \
    break;

/* Switch cases that consume one name-start character.  A BT_NONASCII unit
   is first checked with IS_NMSTRT_CHAR_MINBPC and then deliberately falls
   through into the single-unit cases, which advance ptr by MINBPC(enc). */
#define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
      *(nextTokPtr) = (ptr); \
      return XML_TOK_INVALID; \
    } \
    /* fall through */ \
  case BT_NMSTRT: \
  case BT_HEX: \
    (ptr) += MINBPC(enc); \
    break; \
  CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)

/* Function-name decorator.  When the includer has not defined PREFIX the
   identifiers in this file are used undecorated. */
#ifndef PREFIX
#define PREFIX(ident) ident
#endif
87
88 /* ptr points to character following "<!-" */
89
90 static
91 int PREFIX(scanComment)(const ENCODING *enc, const char *ptr, const char *end,
92 const char **nextTokPtr)
93 {
94 if (ptr != end) {
95 if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
96 *nextTokPtr = ptr;
97 return XML_TOK_INVALID;
98 }
99 ptr += MINBPC(enc);
100 while (ptr != end) {
101 switch (BYTE_TYPE(enc, ptr)) {
102 INVALID_CASES(ptr, nextTokPtr)
103 case BT_MINUS:
104 if ((ptr += MINBPC(enc)) == end)
105 return XML_TOK_PARTIAL;
106 if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
107 if ((ptr += MINBPC(enc)) == end)
108 return XML_TOK_PARTIAL;
109 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
110 *nextTokPtr = ptr;
111 return XML_TOK_INVALID;
112 }
113 *nextTokPtr = ptr + MINBPC(enc);
114 return XML_TOK_COMMENT;
115 }
116 break;
117 default:
118 ptr += MINBPC(enc);
119 break;
120 }
121 }
122 }
123 return XML_TOK_PARTIAL;
124 }
125
126 /* ptr points to character following "<!" */
127
128 static
129 int PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, const char *end,
130 const char **nextTokPtr)
131 {
132 if (ptr == end)
133 return XML_TOK_PARTIAL;
134 switch (BYTE_TYPE(enc, ptr)) {
135 case BT_MINUS:
136 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
137 case BT_LSQB:
138 *nextTokPtr = ptr + MINBPC(enc);
139 return XML_TOK_COND_SECT_OPEN;
140 case BT_NMSTRT:
141 case BT_HEX:
142 ptr += MINBPC(enc);
143 break;
144 default:
145 *nextTokPtr = ptr;
146 return XML_TOK_INVALID;
147 }
148 while (ptr != end) {
149 switch (BYTE_TYPE(enc, ptr)) {
150 case BT_PERCNT:
151 if (ptr + MINBPC(enc) == end)
152 return XML_TOK_PARTIAL;
153 /* don't allow <!ENTITY% foo "whatever"> */
154 switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
155 case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
156 *nextTokPtr = ptr;
157 return XML_TOK_INVALID;
158 }
159 /* fall through */
160 case BT_S: case BT_CR: case BT_LF:
161 *nextTokPtr = ptr;
162 return XML_TOK_DECL_OPEN;
163 case BT_NMSTRT:
164 case BT_HEX:
165 ptr += MINBPC(enc);
166 break;
167 default:
168 *nextTokPtr = ptr;
169 return XML_TOK_INVALID;
170 }
171 }
172 return XML_TOK_PARTIAL;
173 }
174
175 static
176 int PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr, const char *end, int *tokPtr)
177 {
178 int upper = 0;
179 *tokPtr = XML_TOK_PI;
180 if (end - ptr != MINBPC(enc)*3)
181 return 1;
182 switch (BYTE_TO_ASCII(enc, ptr)) {
183 case ASCII_x:
184 break;
185 case ASCII_X:
186 upper = 1;
187 break;
188 default:
189 return 1;
190 }
191 ptr += MINBPC(enc);
192 switch (BYTE_TO_ASCII(enc, ptr)) {
193 case ASCII_m:
194 break;
195 case ASCII_M:
196 upper = 1;
197 break;
198 default:
199 return 1;
200 }
201 ptr += MINBPC(enc);
202 switch (BYTE_TO_ASCII(enc, ptr)) {
203 case ASCII_l:
204 break;
205 case ASCII_L:
206 upper = 1;
207 break;
208 default:
209 return 1;
210 }
211 if (upper)
212 return 0;
213 *tokPtr = XML_TOK_XML_DECL;
214 return 1;
215 }
216
217 /* ptr points to character following "<?" */
218
219 static
220 int PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end,
221 const char **nextTokPtr)
222 {
223 int tok;
224 const char *target = ptr;
225 if (ptr == end)
226 return XML_TOK_PARTIAL;
227 switch (BYTE_TYPE(enc, ptr)) {
228 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
229 default:
230 *nextTokPtr = ptr;
231 return XML_TOK_INVALID;
232 }
233 while (ptr != end) {
234 switch (BYTE_TYPE(enc, ptr)) {
235 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
236 case BT_S: case BT_CR: case BT_LF:
237 if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
238 *nextTokPtr = ptr;
239 return XML_TOK_INVALID;
240 }
241 ptr += MINBPC(enc);
242 while (ptr != end) {
243 switch (BYTE_TYPE(enc, ptr)) {
244 INVALID_CASES(ptr, nextTokPtr)
245 case BT_QUEST:
246 ptr += MINBPC(enc);
247 if (ptr == end)
248 return XML_TOK_PARTIAL;
249 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
250 *nextTokPtr = ptr + MINBPC(enc);
251 return tok;
252 }
253 break;
254 default:
255 ptr += MINBPC(enc);
256 break;
257 }
258 }
259 return XML_TOK_PARTIAL;
260 case BT_QUEST:
261 if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
262 *nextTokPtr = ptr;
263 return XML_TOK_INVALID;
264 }
265 ptr += MINBPC(enc);
266 if (ptr == end)
267 return XML_TOK_PARTIAL;
268 if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
269 *nextTokPtr = ptr + MINBPC(enc);
270 return tok;
271 }
272 /* fall through */
273 default:
274 *nextTokPtr = ptr;
275 return XML_TOK_INVALID;
276 }
277 }
278 return XML_TOK_PARTIAL;
279 }
280
281
282 static
283 int PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end,
284 const char **nextTokPtr)
285 {
286 static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A, ASCII_T, ASCII_A, ASCII_LSQB };
287 int i;
288 /* CDATA[ */
289 if (end - ptr < 6 * MINBPC(enc))
290 return XML_TOK_PARTIAL;
291 for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
292 if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
293 *nextTokPtr = ptr;
294 return XML_TOK_INVALID;
295 }
296 }
297 *nextTokPtr = ptr;
298 return XML_TOK_CDATA_SECT_OPEN;
299 }
300
301 static
302 int PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
303 const char **nextTokPtr)
304 {
305 if (ptr == end)
306 return XML_TOK_NONE;
307 if (MINBPC(enc) > 1) {
308 size_t n = end - ptr;
309 if (n & (MINBPC(enc) - 1)) {
310 n &= ~(MINBPC(enc) - 1);
311 if (n == 0)
312 return XML_TOK_PARTIAL;
313 end = ptr + n;
314 }
315 }
316 switch (BYTE_TYPE(enc, ptr)) {
317 case BT_RSQB:
318 ptr += MINBPC(enc);
319 if (ptr == end)
320 return XML_TOK_PARTIAL;
321 if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
322 break;
323 ptr += MINBPC(enc);
324 if (ptr == end)
325 return XML_TOK_PARTIAL;
326 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
327 ptr -= MINBPC(enc);
328 break;
329 }
330 *nextTokPtr = ptr + MINBPC(enc);
331 return XML_TOK_CDATA_SECT_CLOSE;
332 case BT_CR:
333 ptr += MINBPC(enc);
334 if (ptr == end)
335 return XML_TOK_PARTIAL;
336 if (BYTE_TYPE(enc, ptr) == BT_LF)
337 ptr += MINBPC(enc);
338 *nextTokPtr = ptr;
339 return XML_TOK_DATA_NEWLINE;
340 case BT_LF:
341 *nextTokPtr = ptr + MINBPC(enc);
342 return XML_TOK_DATA_NEWLINE;
343 INVALID_CASES(ptr, nextTokPtr)
344 default:
345 ptr += MINBPC(enc);
346 break;
347 }
348 while (ptr != end) {
349 switch (BYTE_TYPE(enc, ptr)) {
350 #define LEAD_CASE(n) \
351 case BT_LEAD ## n: \
352 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
353 *nextTokPtr = ptr; \
354 return XML_TOK_DATA_CHARS; \
355 } \
356 ptr += n; \
357 break;
358 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
359 #undef LEAD_CASE
360 case BT_NONXML:
361 case BT_MALFORM:
362 case BT_TRAIL:
363 case BT_CR:
364 case BT_LF:
365 case BT_RSQB:
366 *nextTokPtr = ptr;
367 return XML_TOK_DATA_CHARS;
368 default:
369 ptr += MINBPC(enc);
370 break;
371 }
372 }
373 *nextTokPtr = ptr;
374 return XML_TOK_DATA_CHARS;
375 }
376
377 /* ptr points to character following "</" */
378
379 static
380 int PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, const char *end,
381 const char **nextTokPtr)
382 {
383 if (ptr == end)
384 return XML_TOK_PARTIAL;
385 switch (BYTE_TYPE(enc, ptr)) {
386 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
387 default:
388 *nextTokPtr = ptr;
389 return XML_TOK_INVALID;
390 }
391 while (ptr != end) {
392 switch (BYTE_TYPE(enc, ptr)) {
393 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
394 case BT_S: case BT_CR: case BT_LF:
395 for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
396 switch (BYTE_TYPE(enc, ptr)) {
397 case BT_S: case BT_CR: case BT_LF:
398 break;
399 case BT_GT:
400 *nextTokPtr = ptr + MINBPC(enc);
401 return XML_TOK_END_TAG;
402 default:
403 *nextTokPtr = ptr;
404 return XML_TOK_INVALID;
405 }
406 }
407 return XML_TOK_PARTIAL;
408 #ifdef XML_NS
409 case BT_COLON:
410 /* no need to check qname syntax here, since end-tag must match exactly */
411 ptr += MINBPC(enc);
412 break;
413 #endif
414 case BT_GT:
415 *nextTokPtr = ptr + MINBPC(enc);
416 return XML_TOK_END_TAG;
417 default:
418 *nextTokPtr = ptr;
419 return XML_TOK_INVALID;
420 }
421 }
422 return XML_TOK_PARTIAL;
423 }
424
425 /* ptr points to character following "&#X" */
426
427 static
428 int PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr, const char *end,
429 const char **nextTokPtr)
430 {
431 if (ptr != end) {
432 switch (BYTE_TYPE(enc, ptr)) {
433 case BT_DIGIT:
434 case BT_HEX:
435 break;
436 default:
437 *nextTokPtr = ptr;
438 return XML_TOK_INVALID;
439 }
440 for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
441 switch (BYTE_TYPE(enc, ptr)) {
442 case BT_DIGIT:
443 case BT_HEX:
444 break;
445 case BT_SEMI:
446 *nextTokPtr = ptr + MINBPC(enc);
447 return XML_TOK_CHAR_REF;
448 default:
449 *nextTokPtr = ptr;
450 return XML_TOK_INVALID;
451 }
452 }
453 }
454 return XML_TOK_PARTIAL;
455 }
456
457 /* ptr points to character following "&#" */
458
459 static
460 int PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr, const char *end,
461 const char **nextTokPtr)
462 {
463 if (ptr != end) {
464 if (CHAR_MATCHES(enc, ptr, ASCII_x))
465 return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
466 switch (BYTE_TYPE(enc, ptr)) {
467 case BT_DIGIT:
468 break;
469 default:
470 *nextTokPtr = ptr;
471 return XML_TOK_INVALID;
472 }
473 for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
474 switch (BYTE_TYPE(enc, ptr)) {
475 case BT_DIGIT:
476 break;
477 case BT_SEMI:
478 *nextTokPtr = ptr + MINBPC(enc);
479 return XML_TOK_CHAR_REF;
480 default:
481 *nextTokPtr = ptr;
482 return XML_TOK_INVALID;
483 }
484 }
485 }
486 return XML_TOK_PARTIAL;
487 }
488
489 /* ptr points to character following "&" */
490
491 static
492 int PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
493 const char **nextTokPtr)
494 {
495 if (ptr == end)
496 return XML_TOK_PARTIAL;
497 switch (BYTE_TYPE(enc, ptr)) {
498 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
499 case BT_NUM:
500 return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
501 default:
502 *nextTokPtr = ptr;
503 return XML_TOK_INVALID;
504 }
505 while (ptr != end) {
506 switch (BYTE_TYPE(enc, ptr)) {
507 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
508 case BT_SEMI:
509 *nextTokPtr = ptr + MINBPC(enc);
510 return XML_TOK_ENTITY_REF;
511 default:
512 *nextTokPtr = ptr;
513 return XML_TOK_INVALID;
514 }
515 }
516 return XML_TOK_PARTIAL;
517 }
518
519 /* ptr points to character following first character of attribute name */
520
521 static
522 int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
523 const char **nextTokPtr)
524 {
525 #ifdef XML_NS
526 int hadColon = 0;
527 #endif
528 while (ptr != end) {
529 switch (BYTE_TYPE(enc, ptr)) {
530 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
531 #ifdef XML_NS
532 case BT_COLON:
533 if (hadColon) {
534 *nextTokPtr = ptr;
535 return XML_TOK_INVALID;
536 }
537 hadColon = 1;
538 ptr += MINBPC(enc);
539 if (ptr == end)
540 return XML_TOK_PARTIAL;
541 switch (BYTE_TYPE(enc, ptr)) {
542 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
543 default:
544 *nextTokPtr = ptr;
545 return XML_TOK_INVALID;
546 }
547 break;
548 #endif
549 case BT_S: case BT_CR: case BT_LF:
550 for (;;) {
551 int t;
552
553 ptr += MINBPC(enc);
554 if (ptr == end)
555 return XML_TOK_PARTIAL;
556 t = BYTE_TYPE(enc, ptr);
557 if (t == BT_EQUALS)
558 break;
559 switch (t) {
560 case BT_S:
561 case BT_LF:
562 case BT_CR:
563 break;
564 default:
565 *nextTokPtr = ptr;
566 return XML_TOK_INVALID;
567 }
568 }
569 /* fall through */
570 case BT_EQUALS:
571 {
572 int open;
573 #ifdef XML_NS
574 hadColon = 0;
575 #endif
576 for (;;) {
577
578 ptr += MINBPC(enc);
579 if (ptr == end)
580 return XML_TOK_PARTIAL;
581 open = BYTE_TYPE(enc, ptr);
582 if (open == BT_QUOT || open == BT_APOS)
583 break;
584 switch (open) {
585 case BT_S:
586 case BT_LF:
587 case BT_CR:
588 break;
589 default:
590 *nextTokPtr = ptr;
591 return XML_TOK_INVALID;
592 }
593 }
594 ptr += MINBPC(enc);
595 /* in attribute value */
596 for (;;) {
597 int t;
598 if (ptr == end)
599 return XML_TOK_PARTIAL;
600 t = BYTE_TYPE(enc, ptr);
601 if (t == open)
602 break;
603 switch (t) {
604 INVALID_CASES(ptr, nextTokPtr)
605 case BT_AMP:
606 {
607 int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
608 if (tok <= 0) {
609 if (tok == XML_TOK_INVALID)
610 *nextTokPtr = ptr;
611 return tok;
612 }
613 break;
614 }
615 case BT_LT:
616 *nextTokPtr = ptr;
617 return XML_TOK_INVALID;
618 default:
619 ptr += MINBPC(enc);
620 break;
621 }
622 }
623 ptr += MINBPC(enc);
624 if (ptr == end)
625 return XML_TOK_PARTIAL;
626 switch (BYTE_TYPE(enc, ptr)) {
627 case BT_S:
628 case BT_CR:
629 case BT_LF:
630 break;
631 case BT_SOL:
632 goto sol;
633 case BT_GT:
634 goto gt;
635 default:
636 *nextTokPtr = ptr;
637 return XML_TOK_INVALID;
638 }
639 /* ptr points to closing quote */
640 for (;;) {
641 ptr += MINBPC(enc);
642 if (ptr == end)
643 return XML_TOK_PARTIAL;
644 switch (BYTE_TYPE(enc, ptr)) {
645 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
646 case BT_S: case BT_CR: case BT_LF:
647 continue;
648 case BT_GT:
649 gt:
650 *nextTokPtr = ptr + MINBPC(enc);
651 return XML_TOK_START_TAG_WITH_ATTS;
652 case BT_SOL:
653 sol:
654 ptr += MINBPC(enc);
655 if (ptr == end)
656 return XML_TOK_PARTIAL;
657 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
658 *nextTokPtr = ptr;
659 return XML_TOK_INVALID;
660 }
661 *nextTokPtr = ptr + MINBPC(enc);
662 return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
663 default:
664 *nextTokPtr = ptr;
665 return XML_TOK_INVALID;
666 }
667 break;
668 }
669 break;
670 }
671 default:
672 *nextTokPtr = ptr;
673 return XML_TOK_INVALID;
674 }
675 }
676 return XML_TOK_PARTIAL;
677 }
678
679 /* ptr points to character following "<" */
680
681 static
682 int PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
683 const char **nextTokPtr)
684 {
685 #ifdef XML_NS
686 int hadColon;
687 #endif
688 if (ptr == end)
689 return XML_TOK_PARTIAL;
690 switch (BYTE_TYPE(enc, ptr)) {
691 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
692 case BT_EXCL:
693 if ((ptr += MINBPC(enc)) == end)
694 return XML_TOK_PARTIAL;
695 switch (BYTE_TYPE(enc, ptr)) {
696 case BT_MINUS:
697 return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
698 case BT_LSQB:
699 return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc), end, nextTokPtr);
700 }
701 *nextTokPtr = ptr;
702 return XML_TOK_INVALID;
703 case BT_QUEST:
704 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
705 case BT_SOL:
706 return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
707 default:
708 *nextTokPtr = ptr;
709 return XML_TOK_INVALID;
710 }
711 #ifdef XML_NS
712 hadColon = 0;
713 #endif
714 /* we have a start-tag */
715 while (ptr != end) {
716 switch (BYTE_TYPE(enc, ptr)) {
717 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
718 #ifdef XML_NS
719 case BT_COLON:
720 if (hadColon) {
721 *nextTokPtr = ptr;
722 return XML_TOK_INVALID;
723 }
724 hadColon = 1;
725 ptr += MINBPC(enc);
726 if (ptr == end)
727 return XML_TOK_PARTIAL;
728 switch (BYTE_TYPE(enc, ptr)) {
729 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
730 default:
731 *nextTokPtr = ptr;
732 return XML_TOK_INVALID;
733 }
734 break;
735 #endif
736 case BT_S: case BT_CR: case BT_LF:
737 {
738 ptr += MINBPC(enc);
739 while (ptr != end) {
740 switch (BYTE_TYPE(enc, ptr)) {
741 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
742 case BT_GT:
743 goto gt;
744 case BT_SOL:
745 goto sol;
746 case BT_S: case BT_CR: case BT_LF:
747 ptr += MINBPC(enc);
748 continue;
749 default:
750 *nextTokPtr = ptr;
751 return XML_TOK_INVALID;
752 }
753 return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
754 }
755 return XML_TOK_PARTIAL;
756 }
757 case BT_GT:
758 gt:
759 *nextTokPtr = ptr + MINBPC(enc);
760 return XML_TOK_START_TAG_NO_ATTS;
761 case BT_SOL:
762 sol:
763 ptr += MINBPC(enc);
764 if (ptr == end)
765 return XML_TOK_PARTIAL;
766 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
767 *nextTokPtr = ptr;
768 return XML_TOK_INVALID;
769 }
770 *nextTokPtr = ptr + MINBPC(enc);
771 return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
772 default:
773 *nextTokPtr = ptr;
774 return XML_TOK_INVALID;
775 }
776 }
777 return XML_TOK_PARTIAL;
778 }
779
780 static
781 int PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
782 const char **nextTokPtr)
783 {
784 if (ptr == end)
785 return XML_TOK_NONE;
786 if (MINBPC(enc) > 1) {
787 size_t n = end - ptr;
788 if (n & (MINBPC(enc) - 1)) {
789 n &= ~(MINBPC(enc) - 1);
790 if (n == 0)
791 return XML_TOK_PARTIAL;
792 end = ptr + n;
793 }
794 }
795 switch (BYTE_TYPE(enc, ptr)) {
796 case BT_LT:
797 return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
798 case BT_AMP:
799 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
800 case BT_CR:
801 ptr += MINBPC(enc);
802 if (ptr == end)
803 return XML_TOK_TRAILING_CR;
804 if (BYTE_TYPE(enc, ptr) == BT_LF)
805 ptr += MINBPC(enc);
806 *nextTokPtr = ptr;
807 return XML_TOK_DATA_NEWLINE;
808 case BT_LF:
809 *nextTokPtr = ptr + MINBPC(enc);
810 return XML_TOK_DATA_NEWLINE;
811 case BT_RSQB:
812 ptr += MINBPC(enc);
813 if (ptr == end)
814 return XML_TOK_TRAILING_RSQB;
815 if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
816 break;
817 ptr += MINBPC(enc);
818 if (ptr == end)
819 return XML_TOK_TRAILING_RSQB;
820 if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
821 ptr -= MINBPC(enc);
822 break;
823 }
824 *nextTokPtr = ptr;
825 return XML_TOK_INVALID;
826 INVALID_CASES(ptr, nextTokPtr)
827 default:
828 ptr += MINBPC(enc);
829 break;
830 }
831 while (ptr != end) {
832 switch (BYTE_TYPE(enc, ptr)) {
833 #define LEAD_CASE(n) \
834 case BT_LEAD ## n: \
835 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
836 *nextTokPtr = ptr; \
837 return XML_TOK_DATA_CHARS; \
838 } \
839 ptr += n; \
840 break;
841 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
842 #undef LEAD_CASE
843 case BT_RSQB:
844 if (ptr + MINBPC(enc) != end) {
845 if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
846 ptr += MINBPC(enc);
847 break;
848 }
849 if (ptr + 2*MINBPC(enc) != end) {
850 if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
851 ptr += MINBPC(enc);
852 break;
853 }
854 *nextTokPtr = ptr + 2*MINBPC(enc);
855 return XML_TOK_INVALID;
856 }
857 }
858 /* fall through */
859 case BT_AMP:
860 case BT_LT:
861 case BT_NONXML:
862 case BT_MALFORM:
863 case BT_TRAIL:
864 case BT_CR:
865 case BT_LF:
866 *nextTokPtr = ptr;
867 return XML_TOK_DATA_CHARS;
868 default:
869 ptr += MINBPC(enc);
870 break;
871 }
872 }
873 *nextTokPtr = ptr;
874 return XML_TOK_DATA_CHARS;
875 }
876
877 /* ptr points to character following "%" */
878
879 static
880 int PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
881 const char **nextTokPtr)
882 {
883 if (ptr == end)
884 return XML_TOK_PARTIAL;
885 switch (BYTE_TYPE(enc, ptr)) {
886 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
887 case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
888 *nextTokPtr = ptr;
889 return XML_TOK_PERCENT;
890 default:
891 *nextTokPtr = ptr;
892 return XML_TOK_INVALID;
893 }
894 while (ptr != end) {
895 switch (BYTE_TYPE(enc, ptr)) {
896 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
897 case BT_SEMI:
898 *nextTokPtr = ptr + MINBPC(enc);
899 return XML_TOK_PARAM_ENTITY_REF;
900 default:
901 *nextTokPtr = ptr;
902 return XML_TOK_INVALID;
903 }
904 }
905 return XML_TOK_PARTIAL;
906 }
907
908 static
909 int PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
910 const char **nextTokPtr)
911 {
912 if (ptr == end)
913 return XML_TOK_PARTIAL;
914 switch (BYTE_TYPE(enc, ptr)) {
915 CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
916 default:
917 *nextTokPtr = ptr;
918 return XML_TOK_INVALID;
919 }
920 while (ptr != end) {
921 switch (BYTE_TYPE(enc, ptr)) {
922 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
923 case BT_CR: case BT_LF: case BT_S:
924 case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
925 *nextTokPtr = ptr;
926 return XML_TOK_POUND_NAME;
927 default:
928 *nextTokPtr = ptr;
929 return XML_TOK_INVALID;
930 }
931 }
932 return -XML_TOK_POUND_NAME;
933 }
934
935 static
936 int PREFIX(scanLit)(int open, const ENCODING *enc,
937 const char *ptr, const char *end,
938 const char **nextTokPtr)
939 {
940 while (ptr != end) {
941 int t = BYTE_TYPE(enc, ptr);
942 switch (t) {
943 INVALID_CASES(ptr, nextTokPtr)
944 case BT_QUOT:
945 case BT_APOS:
946 ptr += MINBPC(enc);
947 if (t != open)
948 break;
949 if (ptr == end)
950 return -XML_TOK_LITERAL;
951 *nextTokPtr = ptr;
952 switch (BYTE_TYPE(enc, ptr)) {
953 case BT_S: case BT_CR: case BT_LF:
954 case BT_GT: case BT_PERCNT: case BT_LSQB:
955 return XML_TOK_LITERAL;
956 default:
957 return XML_TOK_INVALID;
958 }
959 default:
960 ptr += MINBPC(enc);
961 break;
962 }
963 }
964 return XML_TOK_PARTIAL;
965 }
966
967 static
968 int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
969 const char **nextTokPtr)
970 {
971 int tok;
972 if (ptr == end)
973 return XML_TOK_NONE;
974 if (MINBPC(enc) > 1) {
975 size_t n = end - ptr;
976 if (n & (MINBPC(enc) - 1)) {
977 n &= ~(MINBPC(enc) - 1);
978 if (n == 0)
979 return XML_TOK_PARTIAL;
980 end = ptr + n;
981 }
982 }
983 switch (BYTE_TYPE(enc, ptr)) {
984 case BT_QUOT:
985 return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
986 case BT_APOS:
987 return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
988 case BT_LT:
989 {
990 ptr += MINBPC(enc);
991 if (ptr == end)
992 return XML_TOK_PARTIAL;
993 switch (BYTE_TYPE(enc, ptr)) {
994 case BT_EXCL:
995 return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
996 case BT_QUEST:
997 return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
998 case BT_NMSTRT:
999 case BT_HEX:
1000 case BT_NONASCII:
1001 case BT_LEAD2:
1002 case BT_LEAD3:
1003 case BT_LEAD4:
1004 *nextTokPtr = ptr - MINBPC(enc);
1005 return XML_TOK_INSTANCE_START;
1006 }
1007 *nextTokPtr = ptr;
1008 return XML_TOK_INVALID;
1009 }
1010 case BT_CR:
1011 if (ptr + MINBPC(enc) == end)
1012 return -XML_TOK_PROLOG_S;
1013 /* fall through */
1014 case BT_S: case BT_LF:
1015 for (;;) {
1016 ptr += MINBPC(enc);
1017 if (ptr == end)
1018 break;
1019 switch (BYTE_TYPE(enc, ptr)) {
1020 case BT_S: case BT_LF:
1021 break;
1022 case BT_CR:
1023 /* don't split CR/LF pair */
1024 if (ptr + MINBPC(enc) != end)
1025 break;
1026 /* fall through */
1027 default:
1028 *nextTokPtr = ptr;
1029 return XML_TOK_PROLOG_S;
1030 }
1031 }
1032 *nextTokPtr = ptr;
1033 return XML_TOK_PROLOG_S;
1034 case BT_PERCNT:
1035 return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1036 case BT_COMMA:
1037 *nextTokPtr = ptr + MINBPC(enc);
1038 return XML_TOK_COMMA;
1039 case BT_LSQB:
1040 *nextTokPtr = ptr + MINBPC(enc);
1041 return XML_TOK_OPEN_BRACKET;
1042 case BT_RSQB:
1043 ptr += MINBPC(enc);
1044 if (ptr == end)
1045 return -XML_TOK_CLOSE_BRACKET;
1046 if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
1047 if (ptr + MINBPC(enc) == end)
1048 return XML_TOK_PARTIAL;
1049 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
1050 *nextTokPtr = ptr + 2*MINBPC(enc);
1051 return XML_TOK_COND_SECT_CLOSE;
1052 }
1053 }
1054 *nextTokPtr = ptr;
1055 return XML_TOK_CLOSE_BRACKET;
1056 case BT_LPAR:
1057 *nextTokPtr = ptr + MINBPC(enc);
1058 return XML_TOK_OPEN_PAREN;
1059 case BT_RPAR:
1060 ptr += MINBPC(enc);
1061 if (ptr == end)
1062 return -XML_TOK_CLOSE_PAREN;
1063 switch (BYTE_TYPE(enc, ptr)) {
1064 case BT_AST:
1065 *nextTokPtr = ptr + MINBPC(enc);
1066 return XML_TOK_CLOSE_PAREN_ASTERISK;
1067 case BT_QUEST:
1068 *nextTokPtr = ptr + MINBPC(enc);
1069 return XML_TOK_CLOSE_PAREN_QUESTION;
1070 case BT_PLUS:
1071 *nextTokPtr = ptr + MINBPC(enc);
1072 return XML_TOK_CLOSE_PAREN_PLUS;
1073 case BT_CR: case BT_LF: case BT_S:
1074 case BT_GT: case BT_COMMA: case BT_VERBAR:
1075 case BT_RPAR:
1076 *nextTokPtr = ptr;
1077 return XML_TOK_CLOSE_PAREN;
1078 }
1079 *nextTokPtr = ptr;
1080 return XML_TOK_INVALID;
1081 case BT_VERBAR:
1082 *nextTokPtr = ptr + MINBPC(enc);
1083 return XML_TOK_OR;
1084 case BT_GT:
1085 *nextTokPtr = ptr + MINBPC(enc);
1086 return XML_TOK_DECL_CLOSE;
1087 case BT_NUM:
1088 return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1089 #define LEAD_CASE(n) \
1090 case BT_LEAD ## n: \
1091 if (end - ptr < n) \
1092 return XML_TOK_PARTIAL_CHAR; \
1093 if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
1094 ptr += n; \
1095 tok = XML_TOK_NAME; \
1096 break; \
1097 } \
1098 if (IS_NAME_CHAR(enc, ptr, n)) { \
1099 ptr += n; \
1100 tok = XML_TOK_NMTOKEN; \
1101 break; \
1102 } \
1103 *nextTokPtr = ptr; \
1104 return XML_TOK_INVALID;
1105 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1106 #undef LEAD_CASE
1107 case BT_NMSTRT:
1108 case BT_HEX:
1109 tok = XML_TOK_NAME;
1110 ptr += MINBPC(enc);
1111 break;
1112 case BT_DIGIT:
1113 case BT_NAME:
1114 case BT_MINUS:
1115 #ifdef XML_NS
1116 case BT_COLON:
1117 #endif
1118 tok = XML_TOK_NMTOKEN;
1119 ptr += MINBPC(enc);
1120 break;
1121 case BT_NONASCII:
1122 if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
1123 ptr += MINBPC(enc);
1124 tok = XML_TOK_NAME;
1125 break;
1126 }
1127 if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
1128 ptr += MINBPC(enc);
1129 tok = XML_TOK_NMTOKEN;
1130 break;
1131 }
1132 /* fall through */
1133 default:
1134 *nextTokPtr = ptr;
1135 return XML_TOK_INVALID;
1136 }
1137 while (ptr != end) {
1138 switch (BYTE_TYPE(enc, ptr)) {
1139 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1140 case BT_GT: case BT_RPAR: case BT_COMMA:
1141 case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
1142 case BT_S: case BT_CR: case BT_LF:
1143 *nextTokPtr = ptr;
1144 return tok;
1145 #ifdef XML_NS
1146 case BT_COLON:
1147 ptr += MINBPC(enc);
1148 switch (tok) {
1149 case XML_TOK_NAME:
1150 if (ptr == end)
1151 return XML_TOK_PARTIAL;
1152 tok = XML_TOK_PREFIXED_NAME;
1153 switch (BYTE_TYPE(enc, ptr)) {
1154 CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
1155 default:
1156 tok = XML_TOK_NMTOKEN;
1157 break;
1158 }
1159 break;
1160 case XML_TOK_PREFIXED_NAME:
1161 tok = XML_TOK_NMTOKEN;
1162 break;
1163 }
1164 break;
1165 #endif
1166 case BT_PLUS:
1167 if (tok == XML_TOK_NMTOKEN) {
1168 *nextTokPtr = ptr;
1169 return XML_TOK_INVALID;
1170 }
1171 *nextTokPtr = ptr + MINBPC(enc);
1172 return XML_TOK_NAME_PLUS;
1173 case BT_AST:
1174 if (tok == XML_TOK_NMTOKEN) {
1175 *nextTokPtr = ptr;
1176 return XML_TOK_INVALID;
1177 }
1178 *nextTokPtr = ptr + MINBPC(enc);
1179 return XML_TOK_NAME_ASTERISK;
1180 case BT_QUEST:
1181 if (tok == XML_TOK_NMTOKEN) {
1182 *nextTokPtr = ptr;
1183 return XML_TOK_INVALID;
1184 }
1185 *nextTokPtr = ptr + MINBPC(enc);
1186 return XML_TOK_NAME_QUESTION;
1187 default:
1188 *nextTokPtr = ptr;
1189 return XML_TOK_INVALID;
1190 }
1191 }
1192 return -tok;
1193 }
1194
1195 static
1196 int PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1197 const char **nextTokPtr)
1198 {
1199 const char *start;
1200 if (ptr == end)
1201 return XML_TOK_NONE;
1202 start = ptr;
1203 while (ptr != end) {
1204 switch (BYTE_TYPE(enc, ptr)) {
1205 #define LEAD_CASE(n) \
1206 case BT_LEAD ## n: ptr += n; break;
1207 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1208 #undef LEAD_CASE
1209 case BT_AMP:
1210 if (ptr == start)
1211 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1212 *nextTokPtr = ptr;
1213 return XML_TOK_DATA_CHARS;
1214 case BT_LT:
1215 /* this is for inside entity references */
1216 *nextTokPtr = ptr;
1217 return XML_TOK_INVALID;
1218 case BT_LF:
1219 if (ptr == start) {
1220 *nextTokPtr = ptr + MINBPC(enc);
1221 return XML_TOK_DATA_NEWLINE;
1222 }
1223 *nextTokPtr = ptr;
1224 return XML_TOK_DATA_CHARS;
1225 case BT_CR:
1226 if (ptr == start) {
1227 ptr += MINBPC(enc);
1228 if (ptr == end)
1229 return XML_TOK_TRAILING_CR;
1230 if (BYTE_TYPE(enc, ptr) == BT_LF)
1231 ptr += MINBPC(enc);
1232 *nextTokPtr = ptr;
1233 return XML_TOK_DATA_NEWLINE;
1234 }
1235 *nextTokPtr = ptr;
1236 return XML_TOK_DATA_CHARS;
1237 case BT_S:
1238 if (ptr == start) {
1239 *nextTokPtr = ptr + MINBPC(enc);
1240 return XML_TOK_ATTRIBUTE_VALUE_S;
1241 }
1242 *nextTokPtr = ptr;
1243 return XML_TOK_DATA_CHARS;
1244 default:
1245 ptr += MINBPC(enc);
1246 break;
1247 }
1248 }
1249 *nextTokPtr = ptr;
1250 return XML_TOK_DATA_CHARS;
1251 }
1252
1253 static
1254 int PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end,
1255 const char **nextTokPtr)
1256 {
1257 const char *start;
1258 if (ptr == end)
1259 return XML_TOK_NONE;
1260 start = ptr;
1261 while (ptr != end) {
1262 switch (BYTE_TYPE(enc, ptr)) {
1263 #define LEAD_CASE(n) \
1264 case BT_LEAD ## n: ptr += n; break;
1265 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1266 #undef LEAD_CASE
1267 case BT_AMP:
1268 if (ptr == start)
1269 return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1270 *nextTokPtr = ptr;
1271 return XML_TOK_DATA_CHARS;
1272 case BT_PERCNT:
1273 if (ptr == start)
1274 return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
1275 *nextTokPtr = ptr;
1276 return XML_TOK_DATA_CHARS;
1277 case BT_LF:
1278 if (ptr == start) {
1279 *nextTokPtr = ptr + MINBPC(enc);
1280 return XML_TOK_DATA_NEWLINE;
1281 }
1282 *nextTokPtr = ptr;
1283 return XML_TOK_DATA_CHARS;
1284 case BT_CR:
1285 if (ptr == start) {
1286 ptr += MINBPC(enc);
1287 if (ptr == end)
1288 return XML_TOK_TRAILING_CR;
1289 if (BYTE_TYPE(enc, ptr) == BT_LF)
1290 ptr += MINBPC(enc);
1291 *nextTokPtr = ptr;
1292 return XML_TOK_DATA_NEWLINE;
1293 }
1294 *nextTokPtr = ptr;
1295 return XML_TOK_DATA_CHARS;
1296 default:
1297 ptr += MINBPC(enc);
1298 break;
1299 }
1300 }
1301 *nextTokPtr = ptr;
1302 return XML_TOK_DATA_CHARS;
1303 }
1304
#ifdef XML_DTD

/* Skip the body of an ignored conditional section.
   Each "<![" encountered raises the nesting level; each "]]>" either
   lowers it or, at level zero, ends the section.
   Returns XML_TOK_IGNORE_SECT with *nextTokPtr just past the final "]]>",
   XML_TOK_INVALID (or XML_TOK_PARTIAL_CHAR) from INVALID_CASES for bad or
   truncated characters, or XML_TOK_PARTIAL when the input ends first. */
static
int PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
                             const char **nextTokPtr)
{
  int depth = 0;

  /* For multi-byte code units, ignore a trailing fragment of a unit. */
  if (MINBPC(enc) > 1) {
    size_t avail = end - ptr;
    if (avail & (MINBPC(enc) - 1)) {
      avail &= ~(MINBPC(enc) - 1);
      end = ptr + avail;
    }
  }

  while (ptr != end) {
    switch (BYTE_TYPE(enc, ptr)) {
    INVALID_CASES(ptr, nextTokPtr)
    case BT_LT:
      /* look for "<![" */
      ptr += MINBPC(enc);
      if (ptr == end)
        return XML_TOK_PARTIAL;
      if (!CHAR_MATCHES(enc, ptr, ASCII_EXCL))
        break;
      ptr += MINBPC(enc);
      if (ptr == end)
        return XML_TOK_PARTIAL;
      if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
        ++depth;
        ptr += MINBPC(enc);
      }
      break;
    case BT_RSQB:
      /* look for "]]>" */
      ptr += MINBPC(enc);
      if (ptr == end)
        return XML_TOK_PARTIAL;
      if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
        break;
      ptr += MINBPC(enc);
      if (ptr == end)
        return XML_TOK_PARTIAL;
      if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
        ptr += MINBPC(enc);
        if (depth == 0) {
          *nextTokPtr = ptr;
          return XML_TOK_IGNORE_SECT;
        }
        --depth;
      }
      break;
    default:
      ptr += MINBPC(enc);
      break;
    }
  }
  return XML_TOK_PARTIAL;
}

#endif /* XML_DTD */
1359
1360 static
1361 int PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
1362 const char **badPtr)
1363 {
1364 ptr += MINBPC(enc);
1365 end -= MINBPC(enc);
1366 for (; ptr != end; ptr += MINBPC(enc)) {
1367 switch (BYTE_TYPE(enc, ptr)) {
1368 case BT_DIGIT:
1369 case BT_HEX:
1370 case BT_MINUS:
1371 case BT_APOS:
1372 case BT_LPAR:
1373 case BT_RPAR:
1374 case BT_PLUS:
1375 case BT_COMMA:
1376 case BT_SOL:
1377 case BT_EQUALS:
1378 case BT_QUEST:
1379 case BT_CR:
1380 case BT_LF:
1381 case BT_SEMI:
1382 case BT_EXCL:
1383 case BT_AST:
1384 case BT_PERCNT:
1385 case BT_NUM:
1386 #ifdef XML_NS
1387 case BT_COLON:
1388 #endif
1389 break;
1390 case BT_S:
1391 if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
1392 *badPtr = ptr;
1393 return 0;
1394 }
1395 break;
1396 case BT_NAME:
1397 case BT_NMSTRT:
1398 if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
1399 break;
1400 default:
1401 switch (BYTE_TO_ASCII(enc, ptr)) {
1402 case 0x24: /* $ */
1403 case 0x40: /* @ */
1404 break;
1405 default:
1406 *badPtr = ptr;
1407 return 0;
1408 }
1409 break;
1410 }
1411 }
1412 return 1;
1413 }
1414
1415 /* This must only be called for a well-formed start-tag or empty element tag.
1416 Returns the number of attributes. Pointers to the first attsMax attributes
1417 are stored in atts. */
1418
1419 static
1420 int PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
1421 int attsMax, ATTRIBUTE *atts)
1422 {
1423 enum { other, inName, inValue } state = inName;
1424 int nAtts = 0;
1425 int open = 0; /* defined when state == inValue;
1426 initialization just to shut up compilers */
1427
1428 for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
1429 switch (BYTE_TYPE(enc, ptr)) {
1430 #define START_NAME \
1431 if (state == other) { \
1432 if (nAtts < attsMax) { \
1433 atts[nAtts].name = ptr; \
1434 atts[nAtts].normalized = 1; \
1435 } \
1436 state = inName; \
1437 }
1438 #define LEAD_CASE(n) \
1439 case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
1440 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1441 #undef LEAD_CASE
1442 case BT_NONASCII:
1443 case BT_NMSTRT:
1444 case BT_HEX:
1445 START_NAME
1446 break;
1447 #undef START_NAME
1448 case BT_QUOT:
1449 if (state != inValue) {
1450 if (nAtts < attsMax)
1451 atts[nAtts].valuePtr = ptr + MINBPC(enc);
1452 state = inValue;
1453 open = BT_QUOT;
1454 }
1455 else if (open == BT_QUOT) {
1456 state = other;
1457 if (nAtts < attsMax)
1458 atts[nAtts].valueEnd = ptr;
1459 nAtts++;
1460 }
1461 break;
1462 case BT_APOS:
1463 if (state != inValue) {
1464 if (nAtts < attsMax)
1465 atts[nAtts].valuePtr = ptr + MINBPC(enc);
1466 state = inValue;
1467 open = BT_APOS;
1468 }
1469 else if (open == BT_APOS) {
1470 state = other;
1471 if (nAtts < attsMax)
1472 atts[nAtts].valueEnd = ptr;
1473 nAtts++;
1474 }
1475 break;
1476 case BT_AMP:
1477 if (nAtts < attsMax)
1478 atts[nAtts].normalized = 0;
1479 break;
1480 case BT_S:
1481 if (state == inName)
1482 state = other;
1483 else if (state == inValue
1484 && nAtts < attsMax
1485 && atts[nAtts].normalized
1486 && (ptr == atts[nAtts].valuePtr
1487 || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
1488 || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
1489 || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
1490 atts[nAtts].normalized = 0;
1491 break;
1492 case BT_CR: case BT_LF:
1493 /* This case ensures that the first attribute name is counted
1494 Apart from that we could just change state on the quote. */
1495 if (state == inName)
1496 state = other;
1497 else if (state == inValue && nAtts < attsMax)
1498 atts[nAtts].normalized = 0;
1499 break;
1500 case BT_GT:
1501 case BT_SOL:
1502 if (state != inValue)
1503 return nAtts;
1504 break;
1505 default:
1506 break;
1507 }
1508 }
1509 /* not reached */
1510 }
1511
1512 static
1513 int PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr)
1514 {
1515 int result = 0;
1516 /* skip &# */
1517 ptr += 2*MINBPC(enc);
1518 if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
1519 for (ptr += MINBPC(enc); !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1520 int c = BYTE_TO_ASCII(enc, ptr);
1521 switch (c) {
1522 case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4:
1523 case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9:
1524 result <<= 4;
1525 result |= (c - ASCII_0);
1526 break;
1527 case ASCII_A: case ASCII_B: case ASCII_C: case ASCII_D: case ASCII_E: case ASCII_F:
1528 result <<= 4;
1529 result += 10 + (c - ASCII_A);
1530 break;
1531 case ASCII_a: case ASCII_b: case ASCII_c: case ASCII_d: case ASCII_e: case ASCII_f:
1532 result <<= 4;
1533 result += 10 + (c - ASCII_a);
1534 break;
1535 }
1536 if (result >= 0x110000)
1537 return -1;
1538 }
1539 }
1540 else {
1541 for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
1542 int c = BYTE_TO_ASCII(enc, ptr);
1543 result *= 10;
1544 result += (c - ASCII_0);
1545 if (result >= 0x110000)
1546 return -1;
1547 }
1548 }
1549 return checkCharRefNumber(result);
1550 }
1551
1552 static
1553 int PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr, const char *end)
1554 {
1555 switch ((end - ptr)/MINBPC(enc)) {
1556 case 2:
1557 if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
1558 switch (BYTE_TO_ASCII(enc, ptr)) {
1559 case ASCII_l:
1560 return ASCII_LT;
1561 case ASCII_g:
1562 return ASCII_GT;
1563 }
1564 }
1565 break;
1566 case 3:
1567 if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
1568 ptr += MINBPC(enc);
1569 if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
1570 ptr += MINBPC(enc);
1571 if (CHAR_MATCHES(enc, ptr, ASCII_p))
1572 return ASCII_AMP;
1573 }
1574 }
1575 break;
1576 case 4:
1577 switch (BYTE_TO_ASCII(enc, ptr)) {
1578 case ASCII_q:
1579 ptr += MINBPC(enc);
1580 if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
1581 ptr += MINBPC(enc);
1582 if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1583 ptr += MINBPC(enc);
1584 if (CHAR_MATCHES(enc, ptr, ASCII_t))
1585 return ASCII_QUOT;
1586 }
1587 }
1588 break;
1589 case ASCII_a:
1590 ptr += MINBPC(enc);
1591 if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
1592 ptr += MINBPC(enc);
1593 if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
1594 ptr += MINBPC(enc);
1595 if (CHAR_MATCHES(enc, ptr, ASCII_s))
1596 return ASCII_APOS;
1597 }
1598 }
1599 break;
1600 }
1601 }
1602 return 0;
1603 }
1604
1605 static
1606 int PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
1607 {
1608 for (;;) {
1609 switch (BYTE_TYPE(enc, ptr1)) {
1610 #define LEAD_CASE(n) \
1611 case BT_LEAD ## n: \
1612 if (*ptr1++ != *ptr2++) \
1613 return 0;
1614 LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
1615 #undef LEAD_CASE
1616 /* fall through */
1617 if (*ptr1++ != *ptr2++)
1618 return 0;
1619 break;
1620 case BT_NONASCII:
1621 case BT_NMSTRT:
1622 #ifdef XML_NS
1623 case BT_COLON:
1624 #endif
1625 case BT_HEX:
1626 case BT_DIGIT:
1627 case BT_NAME:
1628 case BT_MINUS:
1629 if (*ptr2++ != *ptr1++)
1630 return 0;
1631 if (MINBPC(enc) > 1) {
1632 if (*ptr2++ != *ptr1++)
1633 return 0;
1634 if (MINBPC(enc) > 2) {
1635 if (*ptr2++ != *ptr1++)
1636 return 0;
1637 if (MINBPC(enc) > 3) {
1638 if (*ptr2++ != *ptr1++)
1639 return 0;
1640 }
1641 }
1642 }
1643 break;
1644 default:
1645 if (MINBPC(enc) == 1 && *ptr1 == *ptr2)
1646 return 1;
1647 switch (BYTE_TYPE(enc, ptr2)) {
1648 case BT_LEAD2:
1649 case BT_LEAD3:
1650 case BT_LEAD4:
1651 case BT_NONASCII:
1652 case BT_NMSTRT:
1653 #ifdef XML_NS
1654 case BT_COLON:
1655 #endif
1656 case BT_HEX:
1657 case BT_DIGIT:
1658 case BT_NAME:
1659 case BT_MINUS:
1660 return 0;
1661 default:
1662 return 1;
1663 }
1664 }
1665 }
1666 /* not reached */
1667 }
1668
1669 static
1670 int PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
1671 const char *end1, const char *ptr2)
1672 {
1673 for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
1674 if (ptr1 == end1)
1675 return 0;
1676 if (!CHAR_MATCHES(enc, ptr1, *ptr2))
1677 return 0;
1678 }
1679 return ptr1 == end1;
1680 }
1681
1682 static
1683 int PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
1684 {
1685 const char *start = ptr;
1686 for (;;) {
1687 switch (BYTE_TYPE(enc, ptr)) {
1688 #define LEAD_CASE(n) \
1689 case BT_LEAD ## n: ptr += n; break;
1690 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1691 #undef LEAD_CASE
1692 case BT_NONASCII:
1693 case BT_NMSTRT:
1694 #ifdef XML_NS
1695 case BT_COLON:
1696 #endif
1697 case BT_HEX:
1698 case BT_DIGIT:
1699 case BT_NAME:
1700 case BT_MINUS:
1701 ptr += MINBPC(enc);
1702 break;
1703 default:
1704 return ptr - start;
1705 }
1706 }
1707 }
1708
1709 static
1710 const char *PREFIX(skipS)(const ENCODING *enc, const char *ptr)
1711 {
1712 for (;;) {
1713 switch (BYTE_TYPE(enc, ptr)) {
1714 case BT_LF:
1715 case BT_CR:
1716 case BT_S:
1717 ptr += MINBPC(enc);
1718 break;
1719 default:
1720 return ptr;
1721 }
1722 }
1723 }
1724
1725 static
1726 void PREFIX(updatePosition)(const ENCODING *enc,
1727 const char *ptr,
1728 const char *end,
1729 POSITION *pos)
1730 {
1731 while (ptr != end) {
1732 switch (BYTE_TYPE(enc, ptr)) {
1733 #define LEAD_CASE(n) \
1734 case BT_LEAD ## n: \
1735 ptr += n; \
1736 break;
1737 LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
1738 #undef LEAD_CASE
1739 case BT_LF:
1740 pos->columnNumber = (unsigned)-1;
1741 pos->lineNumber++;
1742 ptr += MINBPC(enc);
1743 break;
1744 case BT_CR:
1745 pos->lineNumber++;
1746 ptr += MINBPC(enc);
1747 if (ptr != end && BYTE_TYPE(enc, ptr) == BT_LF)
1748 ptr += MINBPC(enc);
1749 pos->columnNumber = (unsigned)-1;
1750 break;
1751 default:
1752 ptr += MINBPC(enc);
1753 break;
1754 }
1755 pos->columnNumber++;
1756 }
1757 }
1758
1759 #undef DO_LEAD_CASE
1760 #undef MULTIBYTE_CASES
1761 #undef INVALID_CASES
1762 #undef CHECK_NAME_CASE
1763 #undef CHECK_NAME_CASES
1764 #undef CHECK_NMSTRT_CASE
1765 #undef CHECK_NMSTRT_CASES