1 /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd 
   2    See the file COPYING for copying permission. 
   7 #ifdef COMPILED_FROM_DSP 
  10 #include "os2config.h" 
  11 #elif defined(__MSDOS__) 
  12 #include "dosconfig.h" 
  13 #elif defined(MACOS_CLASSIC) 
  14 #include "macconfig.h" 
  15 #elif defined(__amigaos__) 
  16 #include "amigaconfig.h" 
  17 #elif defined(__WATCOMC__) 
  18 #include "watcomconfig.h" 
  20 #ifdef HAVE_EXPAT_CONFIG_H 
  21 #include <expat_config.h> 
  23 #endif /* ndef COMPILED_FROM_DSP */ 
  25 #include "expat_external.h" 
  31 #define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok) 
  33 #define IGNORE_SECTION_TOK_VTABLE /* as nothing */ 
  37   { PREFIX(prologTok), PREFIX(contentTok), \ 
  38     PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \ 
  39   { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \ 
  41   PREFIX(nameMatchesAscii), \ 
  45   PREFIX(charRefNumber), \ 
  46   PREFIX(predefinedEntityName), \ 
  47   PREFIX(updatePosition), \ 
  50 #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16) 
  52 #define UCS2_GET_NAMING(pages, hi, lo) \ 
  53    (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F))) 
  55 /* A 2 byte UTF-8 representation splits the characters 11 bits between 
  56    the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into 
  57    pages, 3 bits to add to that index and 5 bits to generate the mask. 
  59 #define UTF8_GET_NAMING2(pages, byte) \ 
  60     (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \ 
  61                       + ((((byte)[0]) & 3) << 1) \ 
  62                       + ((((byte)[1]) >> 5) & 1)] \ 
  63          & (1 << (((byte)[1]) & 0x1F))) 
  65 /* A 3 byte UTF-8 representation splits the characters 16 bits between 
  66    the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index 
  67    into pages, 3 bits to add to that index and 5 bits to generate the 
  70 #define UTF8_GET_NAMING3(pages, byte) \ 
  71   (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \ 
  72                              + ((((byte)[1]) >> 2) & 0xF)] \ 
  74                       + ((((byte)[1]) & 3) << 1) \ 
  75                       + ((((byte)[2]) >> 5) & 1)] \ 
  76          & (1 << (((byte)[2]) & 0x1F))) 
  78 #define UTF8_GET_NAMING(pages, p, n) \ 
  80   ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \ 
  82      ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \ 
  85 /* Detection of invalid UTF-8 sequences is based on Table 3.1B 
  86    of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/ 
  87    with the additional restriction of not allowing the Unicode 
  88    code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE). 
  89    Implementation details: 
  90      (A & 0x80) == 0     means A < 0x80 
  92      (A & 0xC0) == 0xC0  means A > 0xBF 
  95 #define UTF8_INVALID2(p) \ 
  96   ((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0) 
  98 #define UTF8_INVALID3(p) \ 
  99   (((p)[2] & 0x80) == 0 \ 
 101   ((*p) == 0xEF && (p)[1] == 0xBF \ 
 105     ((p)[2] & 0xC0) == 0xC0) \ 
 109     (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \ 
 111     ((p)[1] & 0x80) == 0 \ 
 113     ((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0))) 
 115 #define UTF8_INVALID4(p) \ 
 116   (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \ 
 118   ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \ 
 122     (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \ 
 124     ((p)[1] & 0x80) == 0 \ 
 126     ((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0))) 
 128 static int PTRFASTCALL
 
 129 isNever(const ENCODING 
*enc
, const char *p
) 
 134 static int PTRFASTCALL
 
 135 utf8_isName2(const ENCODING 
*enc
, const char *p
) 
 137   return UTF8_GET_NAMING2(namePages
, (const unsigned char *)p
); 
 140 static int PTRFASTCALL
 
 141 utf8_isName3(const ENCODING 
*enc
, const char *p
) 
 143   return UTF8_GET_NAMING3(namePages
, (const unsigned char *)p
); 
 146 #define utf8_isName4 isNever 
 148 static int PTRFASTCALL
 
 149 utf8_isNmstrt2(const ENCODING 
*enc
, const char *p
) 
 151   return UTF8_GET_NAMING2(nmstrtPages
, (const unsigned char *)p
); 
 154 static int PTRFASTCALL
 
 155 utf8_isNmstrt3(const ENCODING 
*enc
, const char *p
) 
 157   return UTF8_GET_NAMING3(nmstrtPages
, (const unsigned char *)p
); 
 160 #define utf8_isNmstrt4 isNever 
 162 static int PTRFASTCALL
 
 163 utf8_isInvalid2(const ENCODING 
*enc
, const char *p
) 
 165   return UTF8_INVALID2((const unsigned char *)p
); 
 168 static int PTRFASTCALL
 
 169 utf8_isInvalid3(const ENCODING 
*enc
, const char *p
) 
 171   return UTF8_INVALID3((const unsigned char *)p
); 
 174 static int PTRFASTCALL
 
 175 utf8_isInvalid4(const ENCODING 
*enc
, const char *p
) 
 177   return UTF8_INVALID4((const unsigned char *)p
); 
 180 struct normal_encoding 
{ 
 182   unsigned char type
[256]; 
 184   int (PTRFASTCALL 
*byteType
)(const ENCODING 
*, const char *); 
 185   int (PTRFASTCALL 
*isNameMin
)(const ENCODING 
*, const char *); 
 186   int (PTRFASTCALL 
*isNmstrtMin
)(const ENCODING 
*, const char *); 
 187   int (PTRFASTCALL 
*byteToAscii
)(const ENCODING 
*, const char *); 
 188   int (PTRCALL 
*charMatches
)(const ENCODING 
*, const char *, int); 
 189 #endif /* XML_MIN_SIZE */ 
 190   int (PTRFASTCALL 
*isName2
)(const ENCODING 
*, const char *); 
 191   int (PTRFASTCALL 
*isName3
)(const ENCODING 
*, const char *); 
 192   int (PTRFASTCALL 
*isName4
)(const ENCODING 
*, const char *); 
 193   int (PTRFASTCALL 
*isNmstrt2
)(const ENCODING 
*, const char *); 
 194   int (PTRFASTCALL 
*isNmstrt3
)(const ENCODING 
*, const char *); 
 195   int (PTRFASTCALL 
*isNmstrt4
)(const ENCODING 
*, const char *); 
 196   int (PTRFASTCALL 
*isInvalid2
)(const ENCODING 
*, const char *); 
 197   int (PTRFASTCALL 
*isInvalid3
)(const ENCODING 
*, const char *); 
 198   int (PTRFASTCALL 
*isInvalid4
)(const ENCODING 
*, const char *); 
 201 #define AS_NORMAL_ENCODING(enc)   ((const struct normal_encoding *) (enc)) 
 205 #define STANDARD_VTABLE(E) \ 
 214 #define STANDARD_VTABLE(E) /* as nothing */ 
 218 #define NORMAL_VTABLE(E) \ 
 229 static int FASTCALL 
checkCharRefNumber(int); 
 231 #include "xmltok_impl.h" 
 235 #define sb_isNameMin isNever 
 236 #define sb_isNmstrtMin isNever 
 240 #define MINBPC(enc) ((enc)->minBytesPerChar) 
 242 /* minimum bytes per character */ 
 243 #define MINBPC(enc) 1 
 246 #define SB_BYTE_TYPE(enc, p) \ 
 247   (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)]) 
 250 static int PTRFASTCALL
 
 251 sb_byteType(const ENCODING 
*enc
, const char *p
) 
 253   return SB_BYTE_TYPE(enc
, p
); 
 255 #define BYTE_TYPE(enc, p) \ 
 256  (AS_NORMAL_ENCODING(enc)->byteType(enc, p)) 
 258 #define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p) 
 262 #define BYTE_TO_ASCII(enc, p) \ 
 263  (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p)) 
 264 static int PTRFASTCALL
 
 265 sb_byteToAscii(const ENCODING 
*enc
, const char *p
) 
 270 #define BYTE_TO_ASCII(enc, p) (*(p)) 
 273 #define IS_NAME_CHAR(enc, p, n) \ 
 274  (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p)) 
 275 #define IS_NMSTRT_CHAR(enc, p, n) \ 
 276  (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p)) 
 277 #define IS_INVALID_CHAR(enc, p, n) \ 
 278  (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p)) 
 281 #define IS_NAME_CHAR_MINBPC(enc, p) \ 
 282  (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p)) 
 283 #define IS_NMSTRT_CHAR_MINBPC(enc, p) \ 
 284  (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p)) 
 286 #define IS_NAME_CHAR_MINBPC(enc, p) (0) 
 287 #define IS_NMSTRT_CHAR_MINBPC(enc, p) (0) 
 291 #define CHAR_MATCHES(enc, p, c) \ 
 292  (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c)) 
 294 sb_charMatches(const ENCODING 
*enc
, const char *p
, int c
) 
 299 /* c is an ASCII character */ 
 300 #define CHAR_MATCHES(enc, p, c) (*(p) == c) 
 303 #define PREFIX(ident) normal_ ## ident 
 304 #define XML_TOK_IMPL_C 
 305 #include "xmltok_impl.c" 
 306 #undef XML_TOK_IMPL_C 
 313 #undef IS_NAME_CHAR_MINBPC 
 314 #undef IS_NMSTRT_CHAR 
 315 #undef IS_NMSTRT_CHAR_MINBPC 
 316 #undef IS_INVALID_CHAR 
 318 enum {  /* UTF8_cvalN is value of masked first byte of N byte sequence */ 
 326 utf8_toUtf8(const ENCODING 
*enc
, 
 327             const char **fromP
, const char *fromLim
, 
 328             char **toP
, const char *toLim
) 
 332   if (fromLim 
- *fromP 
> toLim 
- *toP
) { 
 333     /* Avoid copying partial characters. */ 
 334     for (fromLim 
= *fromP 
+ (toLim 
- *toP
); fromLim 
> *fromP
; fromLim
--) 
 335       if (((unsigned char)fromLim
[-1] & 0xc0) != 0x80) 
 338   for (to 
= *toP
, from 
= *fromP
; from 
!= fromLim
; from
++, to
++) 
 345 utf8_toUtf16(const ENCODING 
*enc
, 
 346              const char **fromP
, const char *fromLim
, 
 347              unsigned short **toP
, const unsigned short *toLim
) 
 349   unsigned short *to 
= *toP
; 
 350   const char *from 
= *fromP
; 
 351   while (from 
!= fromLim 
&& to 
!= toLim
) { 
 352     switch (((struct normal_encoding 
*)enc
)->type
[(unsigned char)*from
]) { 
 354       *to
++ = (unsigned short)(((from
[0] & 0x1f) << 6) | (from
[1] & 0x3f)); 
 358       *to
++ = (unsigned short)(((from
[0] & 0xf) << 12) 
 359                                | ((from
[1] & 0x3f) << 6) | (from
[2] & 0x3f)); 
 367         n 
= ((from
[0] & 0x7) << 18) | ((from
[1] & 0x3f) << 12) 
 368             | ((from
[2] & 0x3f) << 6) | (from
[3] & 0x3f); 
 370         to
[0] = (unsigned short)((n 
>> 10) | 0xD800); 
 371         to
[1] = (unsigned short)((n 
& 0x3FF) | 0xDC00); 
 387 static const struct normal_encoding utf8_encoding_ns 
= { 
 388   { VTABLE1
, utf8_toUtf8
, utf8_toUtf16
, 1, 1, 0 }, 
 390 #include "asciitab.h" 
 393   STANDARD_VTABLE(sb_
) NORMAL_VTABLE(utf8_
) 
 397 static const struct normal_encoding utf8_encoding 
= { 
 398   { VTABLE1
, utf8_toUtf8
, utf8_toUtf16
, 1, 1, 0 }, 
 400 #define BT_COLON BT_NMSTRT 
 401 #include "asciitab.h" 
 405   STANDARD_VTABLE(sb_
) NORMAL_VTABLE(utf8_
) 
 410 static const struct normal_encoding internal_utf8_encoding_ns 
= { 
 411   { VTABLE1
, utf8_toUtf8
, utf8_toUtf16
, 1, 1, 0 }, 
 413 #include "iasciitab.h" 
 416   STANDARD_VTABLE(sb_
) NORMAL_VTABLE(utf8_
) 
 421 static const struct normal_encoding internal_utf8_encoding 
= { 
 422   { VTABLE1
, utf8_toUtf8
, utf8_toUtf16
, 1, 1, 0 }, 
 424 #define BT_COLON BT_NMSTRT 
 425 #include "iasciitab.h" 
 429   STANDARD_VTABLE(sb_
) NORMAL_VTABLE(utf8_
) 
 433 latin1_toUtf8(const ENCODING 
*enc
, 
 434               const char **fromP
, const char *fromLim
, 
 435               char **toP
, const char *toLim
) 
 439     if (*fromP 
== fromLim
) 
 441     c 
= (unsigned char)**fromP
; 
 443       if (toLim 
- *toP 
< 2) 
 445       *(*toP
)++ = (char)((c 
>> 6) | UTF8_cval2
); 
 446       *(*toP
)++ = (char)((c 
& 0x3f) | 0x80); 
 452       *(*toP
)++ = *(*fromP
)++; 
 458 latin1_toUtf16(const ENCODING 
*enc
, 
 459                const char **fromP
, const char *fromLim
, 
 460                unsigned short **toP
, const unsigned short *toLim
) 
 462   while (*fromP 
!= fromLim 
&& *toP 
!= toLim
) 
 463     *(*toP
)++ = (unsigned char)*(*fromP
)++; 
 468 static const struct normal_encoding latin1_encoding_ns 
= { 
 469   { VTABLE1
, latin1_toUtf8
, latin1_toUtf16
, 1, 0, 0 }, 
 471 #include "asciitab.h" 
 472 #include "latin1tab.h" 
 479 static const struct normal_encoding latin1_encoding 
= { 
 480   { VTABLE1
, latin1_toUtf8
, latin1_toUtf16
, 1, 0, 0 }, 
 482 #define BT_COLON BT_NMSTRT 
 483 #include "asciitab.h" 
 485 #include "latin1tab.h" 
 491 ascii_toUtf8(const ENCODING 
*enc
, 
 492              const char **fromP
, const char *fromLim
, 
 493              char **toP
, const char *toLim
) 
 495   while (*fromP 
!= fromLim 
&& *toP 
!= toLim
) 
 496     *(*toP
)++ = *(*fromP
)++; 
 501 static const struct normal_encoding ascii_encoding_ns 
= { 
 502   { VTABLE1
, ascii_toUtf8
, latin1_toUtf16
, 1, 1, 0 }, 
 504 #include "asciitab.h" 
 512 static const struct normal_encoding ascii_encoding 
= { 
 513   { VTABLE1
, ascii_toUtf8
, latin1_toUtf16
, 1, 1, 0 }, 
 515 #define BT_COLON BT_NMSTRT 
 516 #include "asciitab.h" 
 523 static int PTRFASTCALL
 
 524 unicode_byte_type(char hi
, char lo
) 
 526   switch ((unsigned char)hi
) { 
 527   case 0xD8: case 0xD9: case 0xDA: case 0xDB: 
 529   case 0xDC: case 0xDD: case 0xDE: case 0xDF: 
 532     switch ((unsigned char)lo
) { 
 542 #define DEFINE_UTF16_TO_UTF8(E) \ 
 543 static void  PTRCALL \ 
 544 E ## toUtf8(const ENCODING *enc, \ 
 545             const char **fromP, const char *fromLim, \ 
 546             char **toP, const char *toLim) \ 
 549   for (from = *fromP; from != fromLim; from += 2) { \ 
 552     unsigned char lo = GET_LO(from); \ 
 553     unsigned char hi = GET_HI(from); \ 
 557         if (*toP == toLim) { \ 
 565     case 0x1: case 0x2: case 0x3: \ 
 566     case 0x4: case 0x5: case 0x6: case 0x7: \ 
 567       if (toLim -  *toP < 2) { \ 
 571       *(*toP)++ = ((lo >> 6) | (hi << 2) |  UTF8_cval2); \ 
 572       *(*toP)++ = ((lo & 0x3f) | 0x80); \ 
 575       if (toLim -  *toP < 3)  { \ 
 579       /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \ 
 580       *(*toP)++ = ((hi >> 4) | UTF8_cval3); \ 
 581       *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \ 
 582       *(*toP)++ = ((lo & 0x3f) | 0x80); \ 
 584     case 0xD8: case 0xD9: case 0xDA: case 0xDB: \ 
 585       if (toLim -  *toP < 4) { \ 
 589       plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \ 
 590       *(*toP)++ = ((plane >> 2) | UTF8_cval4); \ 
 591       *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \ 
 593       lo2 = GET_LO(from); \ 
 594       *(*toP)++ = (((lo & 0x3) << 4) \ 
 595                    | ((GET_HI(from) & 0x3) << 2) \ 
 598       *(*toP)++ = ((lo2 & 0x3f) | 0x80); \ 
 605 #define DEFINE_UTF16_TO_UTF16(E) \ 
 606 static void  PTRCALL \ 
 607 E ## toUtf16(const ENCODING *enc, \ 
 608              const char **fromP, const char *fromLim, \ 
 609              unsigned short **toP, const unsigned short *toLim) \ 
 611   /* Avoid copying first half only of surrogate */ \ 
 612   if (fromLim - *fromP > ((toLim - *toP) << 1) \ 
 613       && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \ 
 615   for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \ 
 616     *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \ 
 619 #define SET2(ptr, ch) \ 
 620   (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8))) 
 621 #define GET_LO(ptr) ((unsigned char)(ptr)[0]) 
 622 #define GET_HI(ptr) ((unsigned char)(ptr)[1]) 
 624 DEFINE_UTF16_TO_UTF8(little2_
) 
 625 DEFINE_UTF16_TO_UTF16(little2_
) 
 631 #define SET2(ptr, ch) \ 
 632   (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF))) 
 633 #define GET_LO(ptr) ((unsigned char)(ptr)[1]) 
 634 #define GET_HI(ptr) ((unsigned char)(ptr)[0]) 
 636 DEFINE_UTF16_TO_UTF8(big2_
) 
 637 DEFINE_UTF16_TO_UTF16(big2_
) 
 643 #define LITTLE2_BYTE_TYPE(enc, p) \ 
 645   ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \ 
 646   : unicode_byte_type((p)[1], (p)[0])) 
 647 #define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1) 
 648 #define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c) 
 649 #define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \ 
 650   UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0]) 
 651 #define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \ 
 652   UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0]) 
 656 static int PTRFASTCALL
 
 657 little2_byteType(const ENCODING 
*enc
, const char *p
) 
 659   return LITTLE2_BYTE_TYPE(enc
, p
); 
 662 static int PTRFASTCALL
 
 663 little2_byteToAscii(const ENCODING 
*enc
, const char *p
) 
 665   return LITTLE2_BYTE_TO_ASCII(enc
, p
); 
 669 little2_charMatches(const ENCODING 
*enc
, const char *p
, int c
) 
 671   return LITTLE2_CHAR_MATCHES(enc
, p
, c
); 
 674 static int PTRFASTCALL
 
 675 little2_isNameMin(const ENCODING 
*enc
, const char *p
) 
 677   return LITTLE2_IS_NAME_CHAR_MINBPC(enc
, p
); 
 680 static int PTRFASTCALL
 
 681 little2_isNmstrtMin(const ENCODING 
*enc
, const char *p
) 
 683   return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc
, p
); 
 687 #define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16 
 689 #else /* not XML_MIN_SIZE */ 
 692 #define PREFIX(ident) little2_ ## ident 
 693 #define MINBPC(enc) 2 
 694 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */ 
 695 #define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p) 
 696 #define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p) 
 697 #define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c) 
 698 #define IS_NAME_CHAR(enc, p, n) 0 
 699 #define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) 
 700 #define IS_NMSTRT_CHAR(enc, p, n) (0) 
 701 #define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) 
 703 #define XML_TOK_IMPL_C 
 704 #include "xmltok_impl.c" 
 705 #undef XML_TOK_IMPL_C 
 712 #undef IS_NAME_CHAR_MINBPC 
 713 #undef IS_NMSTRT_CHAR 
 714 #undef IS_NMSTRT_CHAR_MINBPC 
 715 #undef IS_INVALID_CHAR 
 717 #endif /* not XML_MIN_SIZE */ 
 721 static const struct normal_encoding little2_encoding_ns 
= { 
 723 #if BYTEORDER == 1234 
 730 #include "asciitab.h" 
 731 #include "latin1tab.h" 
 733   STANDARD_VTABLE(little2_
) 
 738 static const struct normal_encoding little2_encoding 
= { 
 740 #if BYTEORDER == 1234 
 747 #define BT_COLON BT_NMSTRT 
 748 #include "asciitab.h" 
 750 #include "latin1tab.h" 
 752   STANDARD_VTABLE(little2_
) 
 755 #if BYTEORDER != 4321 
 759 static const struct normal_encoding internal_little2_encoding_ns 
= { 
 762 #include "iasciitab.h" 
 763 #include "latin1tab.h" 
 765   STANDARD_VTABLE(little2_
) 
 770 static const struct normal_encoding internal_little2_encoding 
= { 
 773 #define BT_COLON BT_NMSTRT 
 774 #include "iasciitab.h" 
 776 #include "latin1tab.h" 
 778   STANDARD_VTABLE(little2_
) 
 784 #define BIG2_BYTE_TYPE(enc, p) \ 
 786   ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \ 
 787   : unicode_byte_type((p)[0], (p)[1])) 
 788 #define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1) 
 789 #define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c) 
 790 #define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \ 
 791   UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1]) 
 792 #define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \ 
 793   UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1]) 
 797 static int PTRFASTCALL
 
 798 big2_byteType(const ENCODING 
*enc
, const char *p
) 
 800   return BIG2_BYTE_TYPE(enc
, p
); 
 803 static int PTRFASTCALL
 
 804 big2_byteToAscii(const ENCODING 
*enc
, const char *p
) 
 806   return BIG2_BYTE_TO_ASCII(enc
, p
); 
 810 big2_charMatches(const ENCODING 
*enc
, const char *p
, int c
) 
 812   return BIG2_CHAR_MATCHES(enc
, p
, c
); 
 815 static int PTRFASTCALL
 
 816 big2_isNameMin(const ENCODING 
*enc
, const char *p
) 
 818   return BIG2_IS_NAME_CHAR_MINBPC(enc
, p
); 
 821 static int PTRFASTCALL
 
 822 big2_isNmstrtMin(const ENCODING 
*enc
, const char *p
) 
 824   return BIG2_IS_NMSTRT_CHAR_MINBPC(enc
, p
); 
 828 #define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16 
 830 #else /* not XML_MIN_SIZE */ 
 833 #define PREFIX(ident) big2_ ## ident 
 834 #define MINBPC(enc) 2 
 835 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */ 
 836 #define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p) 
 837 #define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p) 
 838 #define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c) 
 839 #define IS_NAME_CHAR(enc, p, n) 0 
 840 #define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p) 
 841 #define IS_NMSTRT_CHAR(enc, p, n) (0) 
 842 #define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) 
 844 #define XML_TOK_IMPL_C 
 845 #include "xmltok_impl.c" 
 846 #undef XML_TOK_IMPL_C 
 853 #undef IS_NAME_CHAR_MINBPC 
 854 #undef IS_NMSTRT_CHAR 
 855 #undef IS_NMSTRT_CHAR_MINBPC 
 856 #undef IS_INVALID_CHAR 
 858 #endif /* not XML_MIN_SIZE */ 
 862 static const struct normal_encoding big2_encoding_ns 
= { 
 864 #if BYTEORDER == 4321 
 871 #include "asciitab.h" 
 872 #include "latin1tab.h" 
 874   STANDARD_VTABLE(big2_
) 
 879 static const struct normal_encoding big2_encoding 
= { 
 881 #if BYTEORDER == 4321 
 888 #define BT_COLON BT_NMSTRT 
 889 #include "asciitab.h" 
 891 #include "latin1tab.h" 
 893   STANDARD_VTABLE(big2_
) 
 896 #if BYTEORDER != 1234 
 900 static const struct normal_encoding internal_big2_encoding_ns 
= { 
 903 #include "iasciitab.h" 
 904 #include "latin1tab.h" 
 906   STANDARD_VTABLE(big2_
) 
 911 static const struct normal_encoding internal_big2_encoding 
= { 
 914 #define BT_COLON BT_NMSTRT 
 915 #include "iasciitab.h" 
 917 #include "latin1tab.h" 
 919   STANDARD_VTABLE(big2_
) 
 927 streqci(const char *s1
, const char *s2
) 
 932     if (ASCII_a 
<= c1 
&& c1 
<= ASCII_z
) 
 933       c1 
+= ASCII_A 
- ASCII_a
; 
 934     if (ASCII_a 
<= c2 
&& c2 
<= ASCII_z
) 
 935       c2 
+= ASCII_A 
- ASCII_a
; 
 945 initUpdatePosition(const ENCODING 
*enc
, const char *ptr
, 
 946                    const char *end
, POSITION 
*pos
) 
 948   normal_updatePosition(&utf8_encoding
.enc
, ptr
, end
, pos
); 
 952 toAscii(const ENCODING 
*enc
, const char *ptr
, const char *end
) 
 956   XmlUtf8Convert(enc
, &ptr
, end
, &p
, p 
+ 1); 
 976 /* Return 1 if there's just optional white space or there's an S 
 977    followed by name=val. 
 980 parsePseudoAttribute(const ENCODING 
*enc
, 
 983                      const char **namePtr
, 
 984                      const char **nameEndPtr
, 
 986                      const char **nextTokPtr
) 
 994   if (!isSpace(toAscii(enc
, ptr
, end
))) { 
 999     ptr 
+= enc
->minBytesPerChar
; 
1000   } while (isSpace(toAscii(enc
, ptr
, end
))); 
1007     c 
= toAscii(enc
, ptr
, end
); 
1012     if (c 
== ASCII_EQUALS
) { 
1019         ptr 
+= enc
->minBytesPerChar
; 
1020       } while (isSpace(c 
= toAscii(enc
, ptr
, end
))); 
1021       if (c 
!= ASCII_EQUALS
) { 
1027     ptr 
+= enc
->minBytesPerChar
; 
1029   if (ptr 
== *namePtr
) { 
1033   ptr 
+= enc
->minBytesPerChar
; 
1034   c 
= toAscii(enc
, ptr
, end
); 
1035   while (isSpace(c
)) { 
1036     ptr 
+= enc
->minBytesPerChar
; 
1037     c 
= toAscii(enc
, ptr
, end
); 
1039   if (c 
!= ASCII_QUOT 
&& c 
!= ASCII_APOS
) { 
1044   ptr 
+= enc
->minBytesPerChar
; 
1046   for (;; ptr 
+= enc
->minBytesPerChar
) { 
1047     c 
= toAscii(enc
, ptr
, end
); 
1050     if (!(ASCII_a 
<= c 
&& c 
<= ASCII_z
) 
1051         && !(ASCII_A 
<= c 
&& c 
<= ASCII_Z
) 
1052         && !(ASCII_0 
<= c 
&& c 
<= ASCII_9
) 
1053         && c 
!= ASCII_PERIOD
 
1055         && c 
!= ASCII_UNDERSCORE
) { 
1060   *nextTokPtr 
= ptr 
+ enc
->minBytesPerChar
; 
1064 static const char KW_version
[] = { 
1065   ASCII_v
, ASCII_e
, ASCII_r
, ASCII_s
, ASCII_i
, ASCII_o
, ASCII_n
, '\0' 
1068 static const char KW_encoding
[] = { 
1069   ASCII_e
, ASCII_n
, ASCII_c
, ASCII_o
, ASCII_d
, ASCII_i
, ASCII_n
, ASCII_g
, '\0' 
1072 static const char KW_standalone
[] = { 
1073   ASCII_s
, ASCII_t
, ASCII_a
, ASCII_n
, ASCII_d
, ASCII_a
, ASCII_l
, ASCII_o
, 
1074   ASCII_n
, ASCII_e
, '\0' 
1077 static const char KW_yes
[] = { 
1078   ASCII_y
, ASCII_e
, ASCII_s
,  '\0' 
1081 static const char KW_no
[] = { 
1082   ASCII_n
, ASCII_o
,  '\0' 
1086 doParseXmlDecl(const ENCODING 
*(*encodingFinder
)(const ENCODING 
*, 
1089                int isGeneralTextEntity
, 
1090                const ENCODING 
*enc
, 
1093                const char **badPtr
, 
1094                const char **versionPtr
, 
1095                const char **versionEndPtr
, 
1096                const char **encodingName
, 
1097                const ENCODING 
**encoding
, 
1100   const char *val 
= NULL
; 
1101   const char *name 
= NULL
; 
1102   const char *nameEnd 
= NULL
; 
1103   ptr 
+= 5 * enc
->minBytesPerChar
; 
1104   end 
-= 2 * enc
->minBytesPerChar
; 
1105   if (!parsePseudoAttribute(enc
, ptr
, end
, &name
, &nameEnd
, &val
, &ptr
) 
1110   if (!XmlNameMatchesAscii(enc
, name
, nameEnd
, KW_version
)) { 
1111     if (!isGeneralTextEntity
) { 
1120       *versionEndPtr 
= ptr
; 
1121     if (!parsePseudoAttribute(enc
, ptr
, end
, &name
, &nameEnd
, &val
, &ptr
)) { 
1126       if (isGeneralTextEntity
) { 
1127         /* a TextDecl must have an EncodingDecl */ 
1134   if (XmlNameMatchesAscii(enc
, name
, nameEnd
, KW_encoding
)) { 
1135     int c 
= toAscii(enc
, val
, end
); 
1136     if (!(ASCII_a 
<= c 
&& c 
<= ASCII_z
) && !(ASCII_A 
<= c 
&& c 
<= ASCII_Z
)) { 
1141       *encodingName 
= val
; 
1143       *encoding 
= encodingFinder(enc
, val
, ptr 
- enc
->minBytesPerChar
); 
1144     if (!parsePseudoAttribute(enc
, ptr
, end
, &name
, &nameEnd
, &val
, &ptr
)) { 
1151   if (!XmlNameMatchesAscii(enc
, name
, nameEnd
, KW_standalone
) 
1152       || isGeneralTextEntity
) { 
1156   if (XmlNameMatchesAscii(enc
, val
, ptr 
- enc
->minBytesPerChar
, KW_yes
)) { 
1160   else if (XmlNameMatchesAscii(enc
, val
, ptr 
- enc
->minBytesPerChar
, KW_no
)) { 
1168   while (isSpace(toAscii(enc
, ptr
, end
))) 
1169     ptr 
+= enc
->minBytesPerChar
; 
1178 checkCharRefNumber(int result
) 
1180   switch (result 
>> 8) { 
1181   case 0xD8: case 0xD9: case 0xDA: case 0xDB: 
1182   case 0xDC: case 0xDD: case 0xDE: case 0xDF: 
1185     if (latin1_encoding
.type
[result
] == BT_NONXML
) 
1189     if (result 
== 0xFFFE || result 
== 0xFFFF) 
1197 XmlUtf8Encode(int c
, char *buf
) 
1200     /* minN is minimum legal resulting value for N byte sequence */ 
1209     buf
[0] = (char)(c 
| UTF8_cval1
); 
1213     buf
[0] = (char)((c 
>> 6) | UTF8_cval2
); 
1214     buf
[1] = (char)((c 
& 0x3f) | 0x80); 
1218     buf
[0] = (char)((c 
>> 12) | UTF8_cval3
); 
1219     buf
[1] = (char)(((c 
>> 6) & 0x3f) | 0x80); 
1220     buf
[2] = (char)((c 
& 0x3f) | 0x80); 
1224     buf
[0] = (char)((c 
>> 18) | UTF8_cval4
); 
1225     buf
[1] = (char)(((c 
>> 12) & 0x3f) | 0x80); 
1226     buf
[2] = (char)(((c 
>> 6) & 0x3f) | 0x80); 
1227     buf
[3] = (char)((c 
& 0x3f) | 0x80); 
1234 XmlUtf16Encode(int charNum
, unsigned short *buf
) 
1238   if (charNum 
< 0x10000) { 
1239     buf
[0] = (unsigned short)charNum
; 
1242   if (charNum 
< 0x110000) { 
1244     buf
[0] = (unsigned short)((charNum 
>> 10) + 0xD800); 
1245     buf
[1] = (unsigned short)((charNum 
& 0x3FF) + 0xDC00); 
1251 struct unknown_encoding 
{ 
1252   struct normal_encoding normal
; 
1255   unsigned short utf16
[256]; 
1259 #define AS_UNKNOWN_ENCODING(enc)  ((const struct unknown_encoding *) (enc)) 
1262 XmlSizeOfUnknownEncoding(void) 
1264   return sizeof(struct unknown_encoding
); 
1267 static int PTRFASTCALL
 
1268 unknown_isName(const ENCODING 
*enc
, const char *p
) 
1270   const struct unknown_encoding 
*uenc 
= AS_UNKNOWN_ENCODING(enc
); 
1271   int c 
= uenc
->convert(uenc
->userData
, p
); 
1274   return UCS2_GET_NAMING(namePages
, c 
>> 8, c 
& 0xFF); 
1277 static int PTRFASTCALL
 
1278 unknown_isNmstrt(const ENCODING 
*enc
, const char *p
) 
1280   const struct unknown_encoding 
*uenc 
= AS_UNKNOWN_ENCODING(enc
); 
1281   int c 
= uenc
->convert(uenc
->userData
, p
); 
1284   return UCS2_GET_NAMING(nmstrtPages
, c 
>> 8, c 
& 0xFF); 
1287 static int PTRFASTCALL
 
1288 unknown_isInvalid(const ENCODING 
*enc
, const char *p
) 
1290   const struct unknown_encoding 
*uenc 
= AS_UNKNOWN_ENCODING(enc
); 
1291   int c 
= uenc
->convert(uenc
->userData
, p
); 
1292   return (c 
& ~0xFFFF) || checkCharRefNumber(c
) < 0; 
1296 unknown_toUtf8(const ENCODING 
*enc
, 
1297                const char **fromP
, const char *fromLim
, 
1298                char **toP
, const char *toLim
) 
1300   const struct unknown_encoding 
*uenc 
= AS_UNKNOWN_ENCODING(enc
); 
1301   char buf
[XML_UTF8_ENCODE_MAX
]; 
1305     if (*fromP 
== fromLim
) 
1307     utf8 
= uenc
->utf8
[(unsigned char)**fromP
]; 
1310       int c 
= uenc
->convert(uenc
->userData
, *fromP
); 
1311       n 
= XmlUtf8Encode(c
, buf
); 
1312       if (n 
> toLim 
- *toP
) 
1315       *fromP 
+= (AS_NORMAL_ENCODING(enc
)->type
[(unsigned char)**fromP
] 
1319       if (n 
> toLim 
- *toP
) 
1324       *(*toP
)++ = *utf8
++; 
1330 unknown_toUtf16(const ENCODING 
*enc
, 
1331                 const char **fromP
, const char *fromLim
, 
1332                 unsigned short **toP
, const unsigned short *toLim
) 
1334   const struct unknown_encoding 
*uenc 
= AS_UNKNOWN_ENCODING(enc
); 
1335   while (*fromP 
!= fromLim 
&& *toP 
!= toLim
) { 
1336     unsigned short c 
= uenc
->utf16
[(unsigned char)**fromP
]; 
1338       c 
= (unsigned short) 
1339           uenc
->convert(uenc
->userData
, *fromP
); 
1340       *fromP 
+= (AS_NORMAL_ENCODING(enc
)->type
[(unsigned char)**fromP
] 
1350 XmlInitUnknownEncoding(void *mem
, 
1356   struct unknown_encoding 
*e 
= (struct unknown_encoding 
*)mem
; 
1357   for (i 
= 0; i 
< (int)sizeof(struct normal_encoding
); i
++) 
1358     ((char *)mem
)[i
] = ((char *)&latin1_encoding
)[i
]; 
1359   for (i 
= 0; i 
< 128; i
++) 
1360     if (latin1_encoding
.type
[i
] != BT_OTHER
 
1361         && latin1_encoding
.type
[i
] != BT_NONXML
 
1364   for (i 
= 0; i 
< 256; i
++) { 
1367       e
->normal
.type
[i
] = BT_MALFORM
; 
1368       /* This shouldn't really get used. */ 
1369       e
->utf16
[i
] = 0xFFFF; 
1376       e
->normal
.type
[i
] = (unsigned char)(BT_LEAD2 
- (c 
+ 2)); 
1380     else if (c 
< 0x80) { 
1381       if (latin1_encoding
.type
[c
] != BT_OTHER
 
1382           && latin1_encoding
.type
[c
] != BT_NONXML
 
1385       e
->normal
.type
[i
] = latin1_encoding
.type
[c
]; 
1387       e
->utf8
[i
][1] = (char)c
; 
1388       e
->utf16
[i
] = (unsigned short)(c 
== 0 ? 0xFFFF : c
); 
1390     else if (checkCharRefNumber(c
) < 0) { 
1391       e
->normal
.type
[i
] = BT_NONXML
; 
1392       /* This shouldn't really get used. */ 
1393       e
->utf16
[i
] = 0xFFFF; 
1400       if (UCS2_GET_NAMING(nmstrtPages
, c 
>> 8, c 
& 0xff)) 
1401         e
->normal
.type
[i
] = BT_NMSTRT
; 
1402       else if (UCS2_GET_NAMING(namePages
, c 
>> 8, c 
& 0xff)) 
1403         e
->normal
.type
[i
] = BT_NAME
; 
1405         e
->normal
.type
[i
] = BT_OTHER
; 
1406       e
->utf8
[i
][0] = (char)XmlUtf8Encode(c
, e
->utf8
[i
] + 1); 
1407       e
->utf16
[i
] = (unsigned short)c
; 
1410   e
->userData 
= userData
; 
1411   e
->convert 
= convert
; 
1413     e
->normal
.isName2 
= unknown_isName
; 
1414     e
->normal
.isName3 
= unknown_isName
; 
1415     e
->normal
.isName4 
= unknown_isName
; 
1416     e
->normal
.isNmstrt2 
= unknown_isNmstrt
; 
1417     e
->normal
.isNmstrt3 
= unknown_isNmstrt
; 
1418     e
->normal
.isNmstrt4 
= unknown_isNmstrt
; 
1419     e
->normal
.isInvalid2 
= unknown_isInvalid
; 
1420     e
->normal
.isInvalid3 
= unknown_isInvalid
; 
1421     e
->normal
.isInvalid4 
= unknown_isInvalid
; 
1423   e
->normal
.enc
.utf8Convert 
= unknown_toUtf8
; 
1424   e
->normal
.enc
.utf16Convert 
= unknown_toUtf16
; 
1425   return &(e
->normal
.enc
); 
1428 /* If this enumeration is changed, getEncodingIndex and encodings 
1429 must also be changed. */ 
1438   /* must match encodingNames up to here */ 
1442 static const char KW_ISO_8859_1
[] = { 
1443   ASCII_I
, ASCII_S
, ASCII_O
, ASCII_MINUS
, ASCII_8
, ASCII_8
, ASCII_5
, ASCII_9
, 
1444   ASCII_MINUS
, ASCII_1
, '\0' 
1446 static const char KW_US_ASCII
[] = { 
1447   ASCII_U
, ASCII_S
, ASCII_MINUS
, ASCII_A
, ASCII_S
, ASCII_C
, ASCII_I
, ASCII_I
, 
1450 static const char KW_UTF_8
[] =  { 
1451   ASCII_U
, ASCII_T
, ASCII_F
, ASCII_MINUS
, ASCII_8
, '\0' 
1453 static const char KW_UTF_16
[] = { 
1454   ASCII_U
, ASCII_T
, ASCII_F
, ASCII_MINUS
, ASCII_1
, ASCII_6
, '\0' 
1456 static const char KW_UTF_16BE
[] = { 
1457   ASCII_U
, ASCII_T
, ASCII_F
, ASCII_MINUS
, ASCII_1
, ASCII_6
, ASCII_B
, ASCII_E
, 
1460 static const char KW_UTF_16LE
[] = { 
1461   ASCII_U
, ASCII_T
, ASCII_F
, ASCII_MINUS
, ASCII_1
, ASCII_6
, ASCII_L
, ASCII_E
, 
/* getEncodingIndex: search the static encodingNames table for `name`.
   Every slot of the table is compared against `name` with streqci
   (presumably a case-insensitive string comparison -- confirm at its
   definition).  The table entries and the return statements are not
   part of this listing; NOTE(review): presumably the index of the
   matching entry is returned -- confirm against the full source.
   The comments on the encoding enumeration require its leading entries
   to stay in step with encodingNames. */
1466 getEncodingIndex(const char *name
) 
1468   static const char * const encodingNames
[] = { 
/* The element count is derived from the array itself, so the loop
   bound follows the table when an entry is added or removed. */
1479   for (i 
= 0; i 
< (int)(sizeof(encodingNames
)/sizeof(encodingNames
[0])); i
++) 
1480     if (streqci(name
, encodingNames
[i
])) 
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX(enc) reads that index back as an int.
   SET_INIT_ENC_INDEX(enc, i) stores i, narrowed to char.  The argument
   is parenthesized before the cast so that an expression argument
   (for example `a >> 8`) is converted as a whole rather than having
   only its first operand cast.
*/
#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1492 /* This is what detects the encoding.  encodingTable maps from 
1493    encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of 
1494    the external (protocol) specified encoding; state is 
1495    XML_CONTENT_STATE if we're parsing an external text entity, and 
1496    XML_PROLOG_STATE otherwise. 
/* initScan: choose an encoding for the input and tokenize with it.
   The chosen entry of encodingTable is stored through enc->encPtr and
   the input (ptr .. end) is then passed to XmlTok with that encoding.
   - One early path returns XML_TOK_NONE; its condition is not part of
     this listing.
   - Several paths return XML_TOK_PARTIAL, including the case where only
     one byte is available (ptr + 1 == end).
   - Two of the two-byte cases advance *nextTokPtr to ptr + 2 and select
     UTF_16BE_ENC or UTF_16LE_ENC; their case labels are not shown here
     (NOTE(review): presumably the UTF-16 byte-order marks -- confirm).
     The UTF-8 BOM case advances *nextTokPtr to ptr + 3 and selects
     UTF_8_ENC.
   - If no case overrides it, the encoding indexed by
     INIT_ENC_INDEX(enc) is used.
   The lines declaring state, ptr and end are not part of this listing. */
1501 initScan(const ENCODING 
* const *encodingTable
, 
1502          const INIT_ENCODING 
*enc
, 
1506          const char **nextTokPtr
) 
1508   const ENCODING 
**encPtr
; 
1511     return XML_TOK_NONE
; 
1512   encPtr 
= enc
->encPtr
; 
1513   if (ptr 
+ 1 == end
) { 
1514     /* only a single byte available for auto-detection */ 
1515 #ifndef XML_DTD /* FIXME */ 
1516     /* a well-formed document entity must have more than one byte */ 
1517     if (state 
!= XML_CONTENT_STATE
) 
1518       return XML_TOK_PARTIAL
; 
1520     /* so we're parsing an external text entity... */ 
1521     /* if UTF-16 was externally specified, then we need at least 2 bytes */ 
1522     switch (INIT_ENC_INDEX(enc
)) { 
1526       return XML_TOK_PARTIAL
; 
/* Dispatch on the value of the first byte. */
1528     switch ((unsigned char)*ptr
) { 
1531     case 0xEF: /* possibly first byte of UTF-8 BOM */ 
1532       if (INIT_ENC_INDEX(enc
) == ISO_8859_1_ENC
 
1533           && state 
== XML_CONTENT_STATE
) 
1538       return XML_TOK_PARTIAL
; 
/* Combine the first two bytes into one 16-bit value and dispatch on it. */
1542     switch (((unsigned char)ptr
[0] << 8) | (unsigned char)ptr
[1]) { 
1544       if (INIT_ENC_INDEX(enc
) == ISO_8859_1_ENC
 
1545           && state 
== XML_CONTENT_STATE
) 
1547       *nextTokPtr 
= ptr 
+ 2; 
1548       *encPtr 
= encodingTable
[UTF_16BE_ENC
]; 
1550     /* 00 3C is handled in the default case */ 
1552       if ((INIT_ENC_INDEX(enc
) == UTF_16BE_ENC
 
1553            || INIT_ENC_INDEX(enc
) == UTF_16_ENC
) 
1554           && state 
== XML_CONTENT_STATE
) 
1556       *encPtr 
= encodingTable
[UTF_16LE_ENC
]; 
1557       return XmlTok(*encPtr
, state
, ptr
, end
, nextTokPtr
); 
1559       if (INIT_ENC_INDEX(enc
) == ISO_8859_1_ENC
 
1560           && state 
== XML_CONTENT_STATE
) 
1562       *nextTokPtr 
= ptr 
+ 2; 
1563       *encPtr 
= encodingTable
[UTF_16LE_ENC
]; 
1566       /* Maybe a UTF-8 BOM (EF BB BF) */ 
1567       /* If there's an explicitly specified (external) encoding 
1568          of ISO-8859-1 or some flavour of UTF-16 
1569          and this is an external text entity, 
1570          don't look for the BOM, 
1571          because it might be legal data. 
1573       if (state 
== XML_CONTENT_STATE
) { 
1574         int e 
= INIT_ENC_INDEX(enc
); 
1575         if (e 
== ISO_8859_1_ENC 
|| e 
== UTF_16BE_ENC
 
1576             || e 
== UTF_16LE_ENC 
|| e 
== UTF_16_ENC
) 
1580         return XML_TOK_PARTIAL
; 
1581       if ((unsigned char)ptr
[2] == 0xBF) { 
1582         *nextTokPtr 
= ptr 
+ 3; 
1583         *encPtr 
= encodingTable
[UTF_8_ENC
]; 
1588       if (ptr
[0] == '\0') { 
1589         /* 0 isn't a legal data character. Furthermore a document 
1590            entity can only start with ASCII characters.  So the only 
1591            way this can fail to be big-endian UTF-16 is if it's an 
1592            external parsed general entity that's labelled as 
1595         if (state 
== XML_CONTENT_STATE 
&& INIT_ENC_INDEX(enc
) == UTF_16LE_ENC
) 
1597         *encPtr 
= encodingTable
[UTF_16BE_ENC
]; 
1598         return XmlTok(*encPtr
, state
, ptr
, end
, nextTokPtr
); 
1600       else if (ptr
[1] == '\0') { 
1601         /* We could recover here in the case: 
1602             - parsing an external entity 
1604             - no externally specified encoding 
1605             - no encoding declaration 
1606            by assuming UTF-16LE.  But we don't, because this would mean when 
1607            presented just with a single byte, we couldn't reliably determine 
1608            whether we needed further bytes. 
1610         if (state 
== XML_CONTENT_STATE
) 
1612         *encPtr 
= encodingTable
[UTF_16LE_ENC
]; 
1613         return XmlTok(*encPtr
, state
, ptr
, end
, nextTokPtr
); 
1618   *encPtr 
= encodingTable
[INIT_ENC_INDEX(enc
)]; 
1619   return XmlTok(*encPtr
, state
, ptr
, end
, nextTokPtr
); 
/* xmltok_ns.c is included twice, with XML_TOK_NS_C defined before each
   inclusion.  The NS()/ns() definitions in effect for the first
   inclusion are not part of this listing. */
1625 #define XML_TOK_NS_C 
1626 #include "xmltok_ns.c" 
/* For the second inclusion, NS(x) pastes the suffix NS onto x and
   ns(x) pastes the suffix _ns onto x. */
1633 #define NS(x) x ## NS 
1634 #define ns(x) x ## _ns 
1636 #define XML_TOK_NS_C 
1637 #include "xmltok_ns.c" 
1644 XmlInitUnknownEncodingNS(void *mem
, 
1649   ENCODING 
*enc 
= XmlInitUnknownEncoding(mem
, table
, convert
, userData
); 
1651     ((struct normal_encoding 
*)enc
)->type
[ASCII_COLON
] = BT_COLON
;