1 /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd 
   2    See the file COPYING for copying permission. 
   5 #ifdef COMPILED_FROM_DSP 
   9 #elif defined(__MSDOS__) 
  10 #include "dosconfig.h" 
  11 #elif defined(MACOS_CLASSIC) 
  12 #include "macconfig.h" 
  14 #include "expat_config.h" 
  15 #endif /* ndef COMPILED_FROM_DSP */ 
  22 #define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok) 
  24 #define IGNORE_SECTION_TOK_VTABLE /* as nothing */ 
  28   { PREFIX(prologTok), PREFIX(contentTok), \ 
  29     PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \ 
  30   { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \ 
  32   PREFIX(nameMatchesAscii), \ 
  36   PREFIX(charRefNumber), \ 
  37   PREFIX(predefinedEntityName), \ 
  38   PREFIX(updatePosition), \ 
  41 #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16) 
  43 #define UCS2_GET_NAMING(pages, hi, lo) \ 
  44    (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F))) 
  46 /* A 2 byte UTF-8 representation splits the characters 11 bits between 
  47    the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into 
  48    pages, 3 bits to add to that index and 5 bits to generate the mask. 
  50 #define UTF8_GET_NAMING2(pages, byte) \ 
  51     (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \ 
  52                       + ((((byte)[0]) & 3) << 1) \ 
  53                       + ((((byte)[1]) >> 5) & 1)] \ 
  54          & (1 << (((byte)[1]) & 0x1F))) 
  56 /* A 3 byte UTF-8 representation splits the characters 16 bits between 
  57    the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index 
  58    into pages, 3 bits to add to that index and 5 bits to generate the 
  61 #define UTF8_GET_NAMING3(pages, byte) \ 
  62   (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \ 
  63                              + ((((byte)[1]) >> 2) & 0xF)] \ 
  65                       + ((((byte)[1]) & 3) << 1) \ 
  66                       + ((((byte)[2]) >> 5) & 1)] \ 
  67          & (1 << (((byte)[2]) & 0x1F))) 
  69 #define UTF8_GET_NAMING(pages, p, n) \ 
  71   ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \ 
  73      ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \ 
  76 /* Detection of invalid UTF-8 sequences is based on Table 3.1B 
  77    of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/ 
  78    with the additional restriction of not allowing the Unicode 
  79    code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE). 
  80    Implementation details: 
  81      (A & 0x80) == 0     means A < 0x80 
  83      (A & 0xC0) == 0xC0  means A > 0xBF 
  86 #define UTF8_INVALID2(p) \ 
  87   ((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0) 
  89 #define UTF8_INVALID3(p) \ 
  90   (((p)[2] & 0x80) == 0 \ 
  92   ((*p) == 0xEF && (p)[1] == 0xBF \ 
  96     ((p)[2] & 0xC0) == 0xC0) \ 
 100     (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \ 
 102     ((p)[1] & 0x80) == 0 \ 
 104     ((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0))) 
 106 #define UTF8_INVALID4(p) \ 
 107   (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \ 
 109   ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \ 
 113     (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \ 
 115     ((p)[1] & 0x80) == 0 \ 
 117     ((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0))) 
 119 static int PTRFASTCALL
 
 120 isNever(const ENCODING 
*enc
, const char *p
) 
 125 static int PTRFASTCALL
 
 126 utf8_isName2(const ENCODING 
*enc
, const char *p
) 
 128   return UTF8_GET_NAMING2(namePages
, (const unsigned char *)p
); 
 131 static int PTRFASTCALL
 
 132 utf8_isName3(const ENCODING 
*enc
, const char *p
) 
 134   return UTF8_GET_NAMING3(namePages
, (const unsigned char *)p
); 
 137 #define utf8_isName4 isNever 
 139 static int PTRFASTCALL
 
 140 utf8_isNmstrt2(const ENCODING 
*enc
, const char *p
) 
 142   return UTF8_GET_NAMING2(nmstrtPages
, (const unsigned char *)p
); 
 145 static int PTRFASTCALL
 
 146 utf8_isNmstrt3(const ENCODING 
*enc
, const char *p
) 
 148   return UTF8_GET_NAMING3(nmstrtPages
, (const unsigned char *)p
); 
 151 #define utf8_isNmstrt4 isNever 
 153 static int PTRFASTCALL
 
 154 utf8_isInvalid2(const ENCODING 
*enc
, const char *p
) 
 156   return UTF8_INVALID2((const unsigned char *)p
); 
 159 static int PTRFASTCALL
 
 160 utf8_isInvalid3(const ENCODING 
*enc
, const char *p
) 
 162   return UTF8_INVALID3((const unsigned char *)p
); 
 165 static int PTRFASTCALL
 
 166 utf8_isInvalid4(const ENCODING 
*enc
, const char *p
) 
 168   return UTF8_INVALID4((const unsigned char *)p
); 
 171 struct normal_encoding 
{ 
 173   unsigned char type
[256]; 
 175   int (PTRFASTCALL 
*byteType
)(const ENCODING 
*, const char *); 
 176   int (PTRFASTCALL 
*isNameMin
)(const ENCODING 
*, const char *); 
 177   int (PTRFASTCALL 
*isNmstrtMin
)(const ENCODING 
*, const char *); 
 178   int (PTRFASTCALL 
*byteToAscii
)(const ENCODING 
*, const char *); 
 179   int (PTRCALL 
*charMatches
)(const ENCODING 
*, const char *, int); 
 180 #endif /* XML_MIN_SIZE */ 
 181   int (PTRFASTCALL 
*isName2
)(const ENCODING 
*, const char *); 
 182   int (PTRFASTCALL 
*isName3
)(const ENCODING 
*, const char *); 
 183   int (PTRFASTCALL 
*isName4
)(const ENCODING 
*, const char *); 
 184   int (PTRFASTCALL 
*isNmstrt2
)(const ENCODING 
*, const char *); 
 185   int (PTRFASTCALL 
*isNmstrt3
)(const ENCODING 
*, const char *); 
 186   int (PTRFASTCALL 
*isNmstrt4
)(const ENCODING 
*, const char *); 
 187   int (PTRFASTCALL 
*isInvalid2
)(const ENCODING 
*, const char *); 
 188   int (PTRFASTCALL 
*isInvalid3
)(const ENCODING 
*, const char *); 
 189   int (PTRFASTCALL 
*isInvalid4
)(const ENCODING 
*, const char *); 
 192 #define AS_NORMAL_ENCODING(enc)   ((const struct normal_encoding *) (enc)) 
 196 #define STANDARD_VTABLE(E) \ 
 205 #define STANDARD_VTABLE(E) /* as nothing */ 
 209 #define NORMAL_VTABLE(E) \ 
 220 static int FASTCALL 
checkCharRefNumber(int); 
 222 #include "xmltok_impl.h" 
 226 #define sb_isNameMin isNever 
 227 #define sb_isNmstrtMin isNever 
 231 #define MINBPC(enc) ((enc)->minBytesPerChar) 
 233 /* minimum bytes per character */ 
 234 #define MINBPC(enc) 1 
 237 #define SB_BYTE_TYPE(enc, p) \ 
 238   (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)]) 
 241 static int PTRFASTCALL
 
 242 sb_byteType(const ENCODING 
*enc
, const char *p
) 
 244   return SB_BYTE_TYPE(enc
, p
); 
 246 #define BYTE_TYPE(enc, p) \ 
 247  (AS_NORMAL_ENCODING(enc)->byteType(enc, p)) 
 249 #define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p) 
 253 #define BYTE_TO_ASCII(enc, p) \ 
 254  (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p)) 
 255 static int PTRFASTCALL
 
 256 sb_byteToAscii(const ENCODING 
*enc
, const char *p
) 
 261 #define BYTE_TO_ASCII(enc, p) (*(p)) 
 264 #define IS_NAME_CHAR(enc, p, n) \ 
 265  (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p)) 
 266 #define IS_NMSTRT_CHAR(enc, p, n) \ 
 267  (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p)) 
 268 #define IS_INVALID_CHAR(enc, p, n) \ 
 269  (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p)) 
 272 #define IS_NAME_CHAR_MINBPC(enc, p) \ 
 273  (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p)) 
 274 #define IS_NMSTRT_CHAR_MINBPC(enc, p) \ 
 275  (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p)) 
 277 #define IS_NAME_CHAR_MINBPC(enc, p) (0) 
 278 #define IS_NMSTRT_CHAR_MINBPC(enc, p) (0) 
 282 #define CHAR_MATCHES(enc, p, c) \ 
 283  (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c)) 
 285 sb_charMatches(const ENCODING 
*enc
, const char *p
, int c
) 
 290 /* c is an ASCII character */ 
 291 #define CHAR_MATCHES(enc, p, c) (*(p) == c) 
 294 #define PREFIX(ident) normal_ ## ident 
 295 #include "xmltok_impl.c" 
 302 #undef IS_NAME_CHAR_MINBPC 
 303 #undef IS_NMSTRT_CHAR 
 304 #undef IS_NMSTRT_CHAR_MINBPC 
 305 #undef IS_INVALID_CHAR 
 307 enum {  /* UTF8_cvalN is value of masked first byte of N byte sequence */ 
 315 utf8_toUtf8(const ENCODING 
*enc
, 
 316             const char **fromP
, const char *fromLim
, 
 317             char **toP
, const char *toLim
) 
 321   if (fromLim 
- *fromP 
> toLim 
- *toP
) { 
 322     /* Avoid copying partial characters. */ 
 323     for (fromLim 
= *fromP 
+ (toLim 
- *toP
); fromLim 
> *fromP
; fromLim
--) 
 324       if (((unsigned char)fromLim
[-1] & 0xc0) != 0x80) 
 327   for (to 
= *toP
, from 
= *fromP
; from 
!= fromLim
; from
++, to
++) 
 334 utf8_toUtf16(const ENCODING 
*enc
, 
 335              const char **fromP
, const char *fromLim
, 
 336              unsigned short **toP
, const unsigned short *toLim
) 
 338   unsigned short *to 
= *toP
; 
 339   const char *from 
= *fromP
; 
 340   while (from 
!= fromLim 
&& to 
!= toLim
) { 
 341     switch (((struct normal_encoding 
*)enc
)->type
[(unsigned char)*from
]) { 
 343       *to
++ = (unsigned short)(((from
[0] & 0x1f) << 6) | (from
[1] & 0x3f)); 
 347       *to
++ = (unsigned short)(((from
[0] & 0xf) << 12) 
 348                                | ((from
[1] & 0x3f) << 6) | (from
[2] & 0x3f)); 
 356         n 
= ((from
[0] & 0x7) << 18) | ((from
[1] & 0x3f) << 12) 
 357             | ((from
[2] & 0x3f) << 6) | (from
[3] & 0x3f); 
 359         to
[0] = (unsigned short)((n 
>> 10) | 0xD800); 
 360         to
[1] = (unsigned short)((n 
& 0x3FF) | 0xDC00); 
 376 static const struct normal_encoding utf8_encoding_ns 
= { 
 377   { VTABLE1
, utf8_toUtf8
, utf8_toUtf16
, 1, 1, 0 }, 
 379 #include "asciitab.h" 
 382   STANDARD_VTABLE(sb_
) NORMAL_VTABLE(utf8_
) 
 386 static const struct normal_encoding utf8_encoding 
= { 
 387   { VTABLE1
, utf8_toUtf8
, utf8_toUtf16
, 1, 1, 0 }, 
 389 #define BT_COLON BT_NMSTRT 
 390 #include "asciitab.h" 
 394   STANDARD_VTABLE(sb_
) NORMAL_VTABLE(utf8_
) 
 399 static const struct normal_encoding internal_utf8_encoding_ns 
= { 
 400   { VTABLE1
, utf8_toUtf8
, utf8_toUtf16
, 1, 1, 0 }, 
 402 #include "iasciitab.h" 
 405   STANDARD_VTABLE(sb_
) NORMAL_VTABLE(utf8_
) 
 410 static const struct normal_encoding internal_utf8_encoding 
= { 
 411   { VTABLE1
, utf8_toUtf8
, utf8_toUtf16
, 1, 1, 0 }, 
 413 #define BT_COLON BT_NMSTRT 
 414 #include "iasciitab.h" 
 418   STANDARD_VTABLE(sb_
) NORMAL_VTABLE(utf8_
) 
 422 latin1_toUtf8(const ENCODING 
*enc
, 
 423               const char **fromP
, const char *fromLim
, 
 424               char **toP
, const char *toLim
) 
 428     if (*fromP 
== fromLim
) 
 430     c 
= (unsigned char)**fromP
; 
 432       if (toLim 
- *toP 
< 2) 
 434       *(*toP
)++ = (char)((c 
>> 6) | UTF8_cval2
); 
 435       *(*toP
)++ = (char)((c 
& 0x3f) | 0x80); 
 441       *(*toP
)++ = *(*fromP
)++; 
 447 latin1_toUtf16(const ENCODING 
*enc
, 
 448                const char **fromP
, const char *fromLim
, 
 449                unsigned short **toP
, const unsigned short *toLim
) 
 451   while (*fromP 
!= fromLim 
&& *toP 
!= toLim
) 
 452     *(*toP
)++ = (unsigned char)*(*fromP
)++; 
 457 static const struct normal_encoding latin1_encoding_ns 
= { 
 458   { VTABLE1
, latin1_toUtf8
, latin1_toUtf16
, 1, 0, 0 }, 
 460 #include "asciitab.h" 
 461 #include "latin1tab.h" 
 468 static const struct normal_encoding latin1_encoding 
= { 
 469   { VTABLE1
, latin1_toUtf8
, latin1_toUtf16
, 1, 0, 0 }, 
 471 #define BT_COLON BT_NMSTRT 
 472 #include "asciitab.h" 
 474 #include "latin1tab.h" 
 480 ascii_toUtf8(const ENCODING 
*enc
, 
 481              const char **fromP
, const char *fromLim
, 
 482              char **toP
, const char *toLim
) 
 484   while (*fromP 
!= fromLim 
&& *toP 
!= toLim
) 
 485     *(*toP
)++ = *(*fromP
)++; 
 490 static const struct normal_encoding ascii_encoding_ns 
= { 
 491   { VTABLE1
, ascii_toUtf8
, latin1_toUtf16
, 1, 1, 0 }, 
 493 #include "asciitab.h" 
 501 static const struct normal_encoding ascii_encoding 
= { 
 502   { VTABLE1
, ascii_toUtf8
, latin1_toUtf16
, 1, 1, 0 }, 
 504 #define BT_COLON BT_NMSTRT 
 505 #include "asciitab.h" 
 512 static int PTRFASTCALL
 
 513 unicode_byte_type(char hi
, char lo
) 
 515   switch ((unsigned char)hi
) { 
 516   case 0xD8: case 0xD9: case 0xDA: case 0xDB: 
 518   case 0xDC: case 0xDD: case 0xDE: case 0xDF: 
 521     switch ((unsigned char)lo
) { 
 531 #define DEFINE_UTF16_TO_UTF8(E) \ 
 532 static void  PTRCALL \ 
 533 E ## toUtf8(const ENCODING *enc, \ 
 534             const char **fromP, const char *fromLim, \ 
 535             char **toP, const char *toLim) \ 
 538   for (from = *fromP; from != fromLim; from += 2) { \ 
 541     unsigned char lo = GET_LO(from); \ 
 542     unsigned char hi = GET_HI(from); \ 
 546         if (*toP == toLim) { \ 
 554     case 0x1: case 0x2: case 0x3: \ 
 555     case 0x4: case 0x5: case 0x6: case 0x7: \ 
 556       if (toLim -  *toP < 2) { \ 
 560       *(*toP)++ = ((lo >> 6) | (hi << 2) |  UTF8_cval2); \ 
 561       *(*toP)++ = ((lo & 0x3f) | 0x80); \ 
 564       if (toLim -  *toP < 3)  { \ 
 568       /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \ 
 569       *(*toP)++ = ((hi >> 4) | UTF8_cval3); \ 
 570       *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \ 
 571       *(*toP)++ = ((lo & 0x3f) | 0x80); \ 
 573     case 0xD8: case 0xD9: case 0xDA: case 0xDB: \ 
 574       if (toLim -  *toP < 4) { \ 
 578       plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \ 
 579       *(*toP)++ = ((plane >> 2) | UTF8_cval4); \ 
 580       *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \ 
 582       lo2 = GET_LO(from); \ 
 583       *(*toP)++ = (((lo & 0x3) << 4) \ 
 584                    | ((GET_HI(from) & 0x3) << 2) \ 
 587       *(*toP)++ = ((lo2 & 0x3f) | 0x80); \ 
 594 #define DEFINE_UTF16_TO_UTF16(E) \ 
 595 static void  PTRCALL \ 
 596 E ## toUtf16(const ENCODING *enc, \ 
 597              const char **fromP, const char *fromLim, \ 
 598              unsigned short **toP, const unsigned short *toLim) \ 
 600   /* Avoid copying first half only of surrogate */ \ 
 601   if (fromLim - *fromP > ((toLim - *toP) << 1) \ 
 602       && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \ 
 604   for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \ 
 605     *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \ 
 608 #define SET2(ptr, ch) \ 
 609   (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8))) 
 610 #define GET_LO(ptr) ((unsigned char)(ptr)[0]) 
 611 #define GET_HI(ptr) ((unsigned char)(ptr)[1]) 
 613 DEFINE_UTF16_TO_UTF8(little2_
) 
 614 DEFINE_UTF16_TO_UTF16(little2_
) 
 620 #define SET2(ptr, ch) \ 
 621   (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF))) 
 622 #define GET_LO(ptr) ((unsigned char)(ptr)[1]) 
 623 #define GET_HI(ptr) ((unsigned char)(ptr)[0]) 
 625 DEFINE_UTF16_TO_UTF8(big2_
) 
 626 DEFINE_UTF16_TO_UTF16(big2_
) 
 632 #define LITTLE2_BYTE_TYPE(enc, p) \ 
 634   ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \ 
 635   : unicode_byte_type((p)[1], (p)[0])) 
 636 #define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1) 
 637 #define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c) 
 638 #define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \ 
 639   UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0]) 
 640 #define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \ 
 641   UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0]) 
 645 static int PTRFASTCALL
 
 646 little2_byteType(const ENCODING 
*enc
, const char *p
) 
 648   return LITTLE2_BYTE_TYPE(enc
, p
); 
 651 static int PTRFASTCALL
 
 652 little2_byteToAscii(const ENCODING 
*enc
, const char *p
) 
 654   return LITTLE2_BYTE_TO_ASCII(enc
, p
); 
 658 little2_charMatches(const ENCODING 
*enc
, const char *p
, int c
) 
 660   return LITTLE2_CHAR_MATCHES(enc
, p
, c
); 
 663 static int PTRFASTCALL
 
 664 little2_isNameMin(const ENCODING 
*enc
, const char *p
) 
 666   return LITTLE2_IS_NAME_CHAR_MINBPC(enc
, p
); 
 669 static int PTRFASTCALL
 
 670 little2_isNmstrtMin(const ENCODING 
*enc
, const char *p
) 
 672   return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc
, p
); 
 676 #define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16 
 678 #else /* not XML_MIN_SIZE */ 
 681 #define PREFIX(ident) little2_ ## ident 
 682 #define MINBPC(enc) 2 
 683 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */ 
 684 #define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p) 
 685 #define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p) 
 686 #define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c) 
 687 #define IS_NAME_CHAR(enc, p, n) 0 
 688 #define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) 
 689 #define IS_NMSTRT_CHAR(enc, p, n) (0) 
 690 #define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) 
 692 #include "xmltok_impl.c" 
 699 #undef IS_NAME_CHAR_MINBPC 
 700 #undef IS_NMSTRT_CHAR 
 701 #undef IS_NMSTRT_CHAR_MINBPC 
 702 #undef IS_INVALID_CHAR 
 704 #endif /* not XML_MIN_SIZE */ 
 708 static const struct normal_encoding little2_encoding_ns 
= { 
 710 #if BYTEORDER == 1234 
 717 #include "asciitab.h" 
 718 #include "latin1tab.h" 
 720   STANDARD_VTABLE(little2_
) 
 725 static const struct normal_encoding little2_encoding 
= { 
 727 #if BYTEORDER == 1234 
 734 #define BT_COLON BT_NMSTRT 
 735 #include "asciitab.h" 
 737 #include "latin1tab.h" 
 739   STANDARD_VTABLE(little2_
) 
 742 #if BYTEORDER != 4321 
 746 static const struct normal_encoding internal_little2_encoding_ns 
= { 
 749 #include "iasciitab.h" 
 750 #include "latin1tab.h" 
 752   STANDARD_VTABLE(little2_
) 
 757 static const struct normal_encoding internal_little2_encoding 
= { 
 760 #define BT_COLON BT_NMSTRT 
 761 #include "iasciitab.h" 
 763 #include "latin1tab.h" 
 765   STANDARD_VTABLE(little2_
) 
 771 #define BIG2_BYTE_TYPE(enc, p) \ 
 773   ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \ 
 774   : unicode_byte_type((p)[0], (p)[1])) 
 775 #define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1) 
 776 #define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c) 
 777 #define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \ 
 778   UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1]) 
 779 #define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \ 
 780   UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1]) 
 784 static int PTRFASTCALL
 
 785 big2_byteType(const ENCODING 
*enc
, const char *p
) 
 787   return BIG2_BYTE_TYPE(enc
, p
); 
 790 static int PTRFASTCALL
 
 791 big2_byteToAscii(const ENCODING 
*enc
, const char *p
) 
 793   return BIG2_BYTE_TO_ASCII(enc
, p
); 
 797 big2_charMatches(const ENCODING 
*enc
, const char *p
, int c
) 
 799   return BIG2_CHAR_MATCHES(enc
, p
, c
); 
 802 static int PTRFASTCALL
 
 803 big2_isNameMin(const ENCODING 
*enc
, const char *p
) 
 805   return BIG2_IS_NAME_CHAR_MINBPC(enc
, p
); 
 808 static int PTRFASTCALL
 
 809 big2_isNmstrtMin(const ENCODING 
*enc
, const char *p
) 
 811   return BIG2_IS_NMSTRT_CHAR_MINBPC(enc
, p
); 
 815 #define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16 
 817 #else /* not XML_MIN_SIZE */ 
 820 #define PREFIX(ident) big2_ ## ident 
 821 #define MINBPC(enc) 2 
 822 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */ 
 823 #define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p) 
 824 #define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p) 
 825 #define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c) 
 826 #define IS_NAME_CHAR(enc, p, n) 0 
 827 #define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p) 
 828 #define IS_NMSTRT_CHAR(enc, p, n) (0) 
 829 #define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) 
 831 #include "xmltok_impl.c" 
 838 #undef IS_NAME_CHAR_MINBPC 
 839 #undef IS_NMSTRT_CHAR 
 840 #undef IS_NMSTRT_CHAR_MINBPC 
 841 #undef IS_INVALID_CHAR 
 843 #endif /* not XML_MIN_SIZE */ 
 847 static const struct normal_encoding big2_encoding_ns 
= { 
 849 #if BYTEORDER == 4321 
 856 #include "asciitab.h" 
 857 #include "latin1tab.h" 
 859   STANDARD_VTABLE(big2_
) 
 864 static const struct normal_encoding big2_encoding 
= { 
 866 #if BYTEORDER == 4321 
 873 #define BT_COLON BT_NMSTRT 
 874 #include "asciitab.h" 
 876 #include "latin1tab.h" 
 878   STANDARD_VTABLE(big2_
) 
 881 #if BYTEORDER != 1234 
 885 static const struct normal_encoding internal_big2_encoding_ns 
= { 
 888 #include "iasciitab.h" 
 889 #include "latin1tab.h" 
 891   STANDARD_VTABLE(big2_
) 
 896 static const struct normal_encoding internal_big2_encoding 
= { 
 899 #define BT_COLON BT_NMSTRT 
 900 #include "iasciitab.h" 
 902 #include "latin1tab.h" 
 904   STANDARD_VTABLE(big2_
) 
 912 streqci(const char *s1
, const char *s2
) 
 917     if (ASCII_a 
<= c1 
&& c1 
<= ASCII_z
) 
 918       c1 
+= ASCII_A 
- ASCII_a
; 
 919     if (ASCII_a 
<= c2 
&& c2 
<= ASCII_z
) 
 920       c2 
+= ASCII_A 
- ASCII_a
; 
 930 initUpdatePosition(const ENCODING 
*enc
, const char *ptr
, 
 931                    const char *end
, POSITION 
*pos
) 
 933   normal_updatePosition(&utf8_encoding
.enc
, ptr
, end
, pos
); 
 937 toAscii(const ENCODING 
*enc
, const char *ptr
, const char *end
) 
 941   XmlUtf8Convert(enc
, &ptr
, end
, &p
, p 
+ 1); 
 961 /* Return 1 if there's just optional white space or there's an S 
 962    followed by name=val. 
 965 parsePseudoAttribute(const ENCODING 
*enc
, 
 968                      const char **namePtr
, 
 969                      const char **nameEndPtr
, 
 971                      const char **nextTokPtr
) 
 979   if (!isSpace(toAscii(enc
, ptr
, end
))) { 
 984     ptr 
+= enc
->minBytesPerChar
; 
 985   } while (isSpace(toAscii(enc
, ptr
, end
))); 
 992     c 
= toAscii(enc
, ptr
, end
); 
 997     if (c 
== ASCII_EQUALS
) { 
1004         ptr 
+= enc
->minBytesPerChar
; 
1005       } while (isSpace(c 
= toAscii(enc
, ptr
, end
))); 
1006       if (c 
!= ASCII_EQUALS
) { 
1012     ptr 
+= enc
->minBytesPerChar
; 
1014   if (ptr 
== *namePtr
) { 
1018   ptr 
+= enc
->minBytesPerChar
; 
1019   c 
= toAscii(enc
, ptr
, end
); 
1020   while (isSpace(c
)) { 
1021     ptr 
+= enc
->minBytesPerChar
; 
1022     c 
= toAscii(enc
, ptr
, end
); 
1024   if (c 
!= ASCII_QUOT 
&& c 
!= ASCII_APOS
) { 
1029   ptr 
+= enc
->minBytesPerChar
; 
1031   for (;; ptr 
+= enc
->minBytesPerChar
) { 
1032     c 
= toAscii(enc
, ptr
, end
); 
1035     if (!(ASCII_a 
<= c 
&& c 
<= ASCII_z
) 
1036         && !(ASCII_A 
<= c 
&& c 
<= ASCII_Z
) 
1037         && !(ASCII_0 
<= c 
&& c 
<= ASCII_9
) 
1038         && c 
!= ASCII_PERIOD
 
1040         && c 
!= ASCII_UNDERSCORE
) { 
1045   *nextTokPtr 
= ptr 
+ enc
->minBytesPerChar
; 
1049 static const char KW_version
[] = { 
1050   ASCII_v
, ASCII_e
, ASCII_r
, ASCII_s
, ASCII_i
, ASCII_o
, ASCII_n
, '\0' 
1053 static const char KW_encoding
[] = { 
1054   ASCII_e
, ASCII_n
, ASCII_c
, ASCII_o
, ASCII_d
, ASCII_i
, ASCII_n
, ASCII_g
, '\0' 
1057 static const char KW_standalone
[] = { 
1058   ASCII_s
, ASCII_t
, ASCII_a
, ASCII_n
, ASCII_d
, ASCII_a
, ASCII_l
, ASCII_o
, 
1059   ASCII_n
, ASCII_e
, '\0' 
1062 static const char KW_yes
[] = { 
1063   ASCII_y
, ASCII_e
, ASCII_s
,  '\0' 
1066 static const char KW_no
[] = { 
1067   ASCII_n
, ASCII_o
,  '\0' 
1071 doParseXmlDecl(const ENCODING 
*(*encodingFinder
)(const ENCODING 
*, 
1074                int isGeneralTextEntity
, 
1075                const ENCODING 
*enc
, 
1078                const char **badPtr
, 
1079                const char **versionPtr
, 
1080                const char **versionEndPtr
, 
1081                const char **encodingName
, 
1082                const ENCODING 
**encoding
, 
1085   const char *val 
= NULL
; 
1086   const char *name 
= NULL
; 
1087   const char *nameEnd 
= NULL
; 
1088   ptr 
+= 5 * enc
->minBytesPerChar
; 
1089   end 
-= 2 * enc
->minBytesPerChar
; 
1090   if (!parsePseudoAttribute(enc
, ptr
, end
, &name
, &nameEnd
, &val
, &ptr
) 
1095   if (!XmlNameMatchesAscii(enc
, name
, nameEnd
, KW_version
)) { 
1096     if (!isGeneralTextEntity
) { 
1105       *versionEndPtr 
= ptr
; 
1106     if (!parsePseudoAttribute(enc
, ptr
, end
, &name
, &nameEnd
, &val
, &ptr
)) { 
1111       if (isGeneralTextEntity
) { 
1112         /* a TextDecl must have an EncodingDecl */ 
1119   if (XmlNameMatchesAscii(enc
, name
, nameEnd
, KW_encoding
)) { 
1120     int c 
= toAscii(enc
, val
, end
); 
1121     if (!(ASCII_a 
<= c 
&& c 
<= ASCII_z
) && !(ASCII_A 
<= c 
&& c 
<= ASCII_Z
)) { 
1126       *encodingName 
= val
; 
1128       *encoding 
= encodingFinder(enc
, val
, ptr 
- enc
->minBytesPerChar
); 
1129     if (!parsePseudoAttribute(enc
, ptr
, end
, &name
, &nameEnd
, &val
, &ptr
)) { 
1136   if (!XmlNameMatchesAscii(enc
, name
, nameEnd
, KW_standalone
) 
1137       || isGeneralTextEntity
) { 
1141   if (XmlNameMatchesAscii(enc
, val
, ptr 
- enc
->minBytesPerChar
, KW_yes
)) { 
1145   else if (XmlNameMatchesAscii(enc
, val
, ptr 
- enc
->minBytesPerChar
, KW_no
)) { 
1153   while (isSpace(toAscii(enc
, ptr
, end
))) 
1154     ptr 
+= enc
->minBytesPerChar
; 
1163 checkCharRefNumber(int result
) 
1165   switch (result 
>> 8) { 
1166   case 0xD8: case 0xD9: case 0xDA: case 0xDB: 
1167   case 0xDC: case 0xDD: case 0xDE: case 0xDF: 
1170     if (latin1_encoding
.type
[result
] == BT_NONXML
) 
1174     if (result 
== 0xFFFE || result 
== 0xFFFF) 
1182 XmlUtf8Encode(int c
, char *buf
) 
1185     /* minN is minimum legal resulting value for N byte sequence */ 
1194     buf
[0] = (char)(c 
| UTF8_cval1
); 
1198     buf
[0] = (char)((c 
>> 6) | UTF8_cval2
); 
1199     buf
[1] = (char)((c 
& 0x3f) | 0x80); 
1203     buf
[0] = (char)((c 
>> 12) | UTF8_cval3
); 
1204     buf
[1] = (char)(((c 
>> 6) & 0x3f) | 0x80); 
1205     buf
[2] = (char)((c 
& 0x3f) | 0x80); 
1209     buf
[0] = (char)((c 
>> 18) | UTF8_cval4
); 
1210     buf
[1] = (char)(((c 
>> 12) & 0x3f) | 0x80); 
1211     buf
[2] = (char)(((c 
>> 6) & 0x3f) | 0x80); 
1212     buf
[3] = (char)((c 
& 0x3f) | 0x80); 
1219 XmlUtf16Encode(int charNum
, unsigned short *buf
) 
1223   if (charNum 
< 0x10000) { 
1224     buf
[0] = (unsigned short)charNum
; 
1227   if (charNum 
< 0x110000) { 
1229     buf
[0] = (unsigned short)((charNum 
>> 10) + 0xD800); 
1230     buf
[1] = (unsigned short)((charNum 
& 0x3FF) + 0xDC00); 
1236 struct unknown_encoding 
{ 
1237   struct normal_encoding normal
; 
1238   int (*convert
)(void *userData
, const char *p
); 
1240   unsigned short utf16
[256]; 
1244 #define AS_UNKNOWN_ENCODING(enc)  ((const struct unknown_encoding *) (enc)) 
1247 XmlSizeOfUnknownEncoding(void) 
1249   return sizeof(struct unknown_encoding
); 
1252 static int PTRFASTCALL
 
1253 unknown_isName(const ENCODING 
*enc
, const char *p
) 
1255   const struct unknown_encoding 
*uenc 
= AS_UNKNOWN_ENCODING(enc
); 
1256   int c 
= uenc
->convert(uenc
->userData
, p
); 
1259   return UCS2_GET_NAMING(namePages
, c 
>> 8, c 
& 0xFF); 
1262 static int PTRFASTCALL
 
1263 unknown_isNmstrt(const ENCODING 
*enc
, const char *p
) 
1265   const struct unknown_encoding 
*uenc 
= AS_UNKNOWN_ENCODING(enc
); 
1266   int c 
= uenc
->convert(uenc
->userData
, p
); 
1269   return UCS2_GET_NAMING(nmstrtPages
, c 
>> 8, c 
& 0xFF); 
1272 static int PTRFASTCALL
 
1273 unknown_isInvalid(const ENCODING 
*enc
, const char *p
) 
1275   const struct unknown_encoding 
*uenc 
= AS_UNKNOWN_ENCODING(enc
); 
1276   int c 
= uenc
->convert(uenc
->userData
, p
); 
1277   return (c 
& ~0xFFFF) || checkCharRefNumber(c
) < 0; 
1281 unknown_toUtf8(const ENCODING 
*enc
, 
1282                const char **fromP
, const char *fromLim
, 
1283                char **toP
, const char *toLim
) 
1285   const struct unknown_encoding 
*uenc 
= AS_UNKNOWN_ENCODING(enc
); 
1286   char buf
[XML_UTF8_ENCODE_MAX
]; 
1290     if (*fromP 
== fromLim
) 
1292     utf8 
= uenc
->utf8
[(unsigned char)**fromP
]; 
1295       int c 
= uenc
->convert(uenc
->userData
, *fromP
); 
1296       n 
= XmlUtf8Encode(c
, buf
); 
1297       if (n 
> toLim 
- *toP
) 
1300       *fromP 
+= (AS_NORMAL_ENCODING(enc
)->type
[(unsigned char)**fromP
] 
1304       if (n 
> toLim 
- *toP
) 
1309       *(*toP
)++ = *utf8
++; 
1315 unknown_toUtf16(const ENCODING 
*enc
, 
1316                 const char **fromP
, const char *fromLim
, 
1317                 unsigned short **toP
, const unsigned short *toLim
) 
1319   const struct unknown_encoding 
*uenc 
= AS_UNKNOWN_ENCODING(enc
); 
1320   while (*fromP 
!= fromLim 
&& *toP 
!= toLim
) { 
1321     unsigned short c 
= uenc
->utf16
[(unsigned char)**fromP
]; 
1323       c 
= (unsigned short) 
1324           uenc
->convert(uenc
->userData
, *fromP
); 
1325       *fromP 
+= (AS_NORMAL_ENCODING(enc
)->type
[(unsigned char)**fromP
] 
1335 XmlInitUnknownEncoding(void *mem
, 
1341   struct unknown_encoding 
*e 
= (struct unknown_encoding 
*)mem
; 
1342   for (i 
= 0; i 
< (int)sizeof(struct normal_encoding
); i
++) 
1343     ((char *)mem
)[i
] = ((char *)&latin1_encoding
)[i
]; 
1344   for (i 
= 0; i 
< 128; i
++) 
1345     if (latin1_encoding
.type
[i
] != BT_OTHER
 
1346         && latin1_encoding
.type
[i
] != BT_NONXML
 
1349   for (i 
= 0; i 
< 256; i
++) { 
1352       e
->normal
.type
[i
] = BT_MALFORM
; 
1353       /* This shouldn't really get used. */ 
1354       e
->utf16
[i
] = 0xFFFF; 
1361       e
->normal
.type
[i
] = (unsigned char)(BT_LEAD2 
- (c 
+ 2)); 
1365     else if (c 
< 0x80) { 
1366       if (latin1_encoding
.type
[c
] != BT_OTHER
 
1367           && latin1_encoding
.type
[c
] != BT_NONXML
 
1370       e
->normal
.type
[i
] = latin1_encoding
.type
[c
]; 
1372       e
->utf8
[i
][1] = (char)c
; 
1373       e
->utf16
[i
] = (unsigned short)(c 
== 0 ? 0xFFFF : c
); 
1375     else if (checkCharRefNumber(c
) < 0) { 
1376       e
->normal
.type
[i
] = BT_NONXML
; 
1377       /* This shouldn't really get used. */ 
1378       e
->utf16
[i
] = 0xFFFF; 
1385       if (UCS2_GET_NAMING(nmstrtPages
, c 
>> 8, c 
& 0xff)) 
1386         e
->normal
.type
[i
] = BT_NMSTRT
; 
1387       else if (UCS2_GET_NAMING(namePages
, c 
>> 8, c 
& 0xff)) 
1388         e
->normal
.type
[i
] = BT_NAME
; 
1390         e
->normal
.type
[i
] = BT_OTHER
; 
1391       e
->utf8
[i
][0] = (char)XmlUtf8Encode(c
, e
->utf8
[i
] + 1); 
1392       e
->utf16
[i
] = (unsigned short)c
; 
1395   e
->userData 
= userData
; 
1396   e
->convert 
= convert
; 
1398     e
->normal
.isName2 
= unknown_isName
; 
1399     e
->normal
.isName3 
= unknown_isName
; 
1400     e
->normal
.isName4 
= unknown_isName
; 
1401     e
->normal
.isNmstrt2 
= unknown_isNmstrt
; 
1402     e
->normal
.isNmstrt3 
= unknown_isNmstrt
; 
1403     e
->normal
.isNmstrt4 
= unknown_isNmstrt
; 
1404     e
->normal
.isInvalid2 
= unknown_isInvalid
; 
1405     e
->normal
.isInvalid3 
= unknown_isInvalid
; 
1406     e
->normal
.isInvalid4 
= unknown_isInvalid
; 
1408   e
->normal
.enc
.utf8Convert 
= unknown_toUtf8
; 
1409   e
->normal
.enc
.utf16Convert 
= unknown_toUtf16
; 
1410   return &(e
->normal
.enc
); 
1413 /* If this enumeration is changed, getEncodingIndex and encodings 
1414 must also be changed. */ 
1423   /* must match encodingNames up to here */ 
1427 static const char KW_ISO_8859_1
[] = { 
1428   ASCII_I
, ASCII_S
, ASCII_O
, ASCII_MINUS
, ASCII_8
, ASCII_8
, ASCII_5
, ASCII_9
, 
1429   ASCII_MINUS
, ASCII_1
, '\0' 
1431 static const char KW_US_ASCII
[] = { 
1432   ASCII_U
, ASCII_S
, ASCII_MINUS
, ASCII_A
, ASCII_S
, ASCII_C
, ASCII_I
, ASCII_I
, 
1435 static const char KW_UTF_8
[] =  { 
1436   ASCII_U
, ASCII_T
, ASCII_F
, ASCII_MINUS
, ASCII_8
, '\0' 
1438 static const char KW_UTF_16
[] = { 
1439   ASCII_U
, ASCII_T
, ASCII_F
, ASCII_MINUS
, ASCII_1
, ASCII_6
, '\0' 
1441 static const char KW_UTF_16BE
[] = { 
1442   ASCII_U
, ASCII_T
, ASCII_F
, ASCII_MINUS
, ASCII_1
, ASCII_6
, ASCII_B
, ASCII_E
, 
1445 static const char KW_UTF_16LE
[] = { 
1446   ASCII_U
, ASCII_T
, ASCII_F
, ASCII_MINUS
, ASCII_1
, ASCII_6
, ASCII_L
, ASCII_E
, 
1451 getEncodingIndex(const char *name
) 
1453   static const char *encodingNames
[] = { 
1464   for (i 
= 0; i 
< (int)(sizeof(encodingNames
)/sizeof(encodingNames
[0])); i
++) 
1465     if (streqci(name
, encodingNames
[i
])) 
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX(enc) reads that stored index back as an int.
   SET_INIT_ENC_INDEX(enc, i) stores i narrowed to char.  The argument
   is parenthesized so that the cast applies to the whole expression
   passed in, not just to its first operand.
*/
#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1477 /* This is what detects the encoding.  encodingTable maps from 
1478    encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of 
1479    the external (protocol) specified encoding; state is 
1480    XML_CONTENT_STATE if we're parsing an external text entity, and 
1481    XML_PROLOG_STATE otherwise. 
1486 initScan(const ENCODING 
**encodingTable
, 
1487          const INIT_ENCODING 
*enc
, 
1491          const char **nextTokPtr
) 
1493   const ENCODING 
**encPtr
; 
1496     return XML_TOK_NONE
; 
1497   encPtr 
= enc
->encPtr
; 
1498   if (ptr 
+ 1 == end
) { 
1499     /* only a single byte available for auto-detection */ 
1500 #ifndef XML_DTD /* FIXME */ 
1501     /* a well-formed document entity must have more than one byte */ 
1502     if (state 
!= XML_CONTENT_STATE
) 
1503       return XML_TOK_PARTIAL
; 
1505     /* so we're parsing an external text entity... */ 
1506     /* if UTF-16 was externally specified, then we need at least 2 bytes */ 
1507     switch (INIT_ENC_INDEX(enc
)) { 
1511       return XML_TOK_PARTIAL
; 
1513     switch ((unsigned char)*ptr
) { 
1516     case 0xEF: /* possibly first byte of UTF-8 BOM */ 
1517       if (INIT_ENC_INDEX(enc
) == ISO_8859_1_ENC
 
1518           && state 
== XML_CONTENT_STATE
) 
1523       return XML_TOK_PARTIAL
; 
1527     switch (((unsigned char)ptr
[0] << 8) | (unsigned char)ptr
[1]) { 
1529       if (INIT_ENC_INDEX(enc
) == ISO_8859_1_ENC
 
1530           && state 
== XML_CONTENT_STATE
) 
1532       *nextTokPtr 
= ptr 
+ 2; 
1533       *encPtr 
= encodingTable
[UTF_16BE_ENC
]; 
1535     /* 00 3C is handled in the default case */ 
1537       if ((INIT_ENC_INDEX(enc
) == UTF_16BE_ENC
 
1538            || INIT_ENC_INDEX(enc
) == UTF_16_ENC
) 
1539           && state 
== XML_CONTENT_STATE
) 
1541       *encPtr 
= encodingTable
[UTF_16LE_ENC
]; 
1542       return XmlTok(*encPtr
, state
, ptr
, end
, nextTokPtr
); 
1544       if (INIT_ENC_INDEX(enc
) == ISO_8859_1_ENC
 
1545           && state 
== XML_CONTENT_STATE
) 
1547       *nextTokPtr 
= ptr 
+ 2; 
1548       *encPtr 
= encodingTable
[UTF_16LE_ENC
]; 
1551       /* Maybe a UTF-8 BOM (EF BB BF) */ 
1552       /* If there's an explicitly specified (external) encoding 
1553          of ISO-8859-1 or some flavour of UTF-16 
1554          and this is an external text entity, 
1555          don't look for the BOM, 
1556          because it might be legal data.
1558       if (state 
== XML_CONTENT_STATE
) { 
1559         int e 
= INIT_ENC_INDEX(enc
); 
1560         if (e 
== ISO_8859_1_ENC 
|| e 
== UTF_16BE_ENC
 
1561             || e 
== UTF_16LE_ENC 
|| e 
== UTF_16_ENC
) 
1565         return XML_TOK_PARTIAL
; 
1566       if ((unsigned char)ptr
[2] == 0xBF) { 
1567         *nextTokPtr 
= ptr 
+ 3; 
1568         *encPtr 
= encodingTable
[UTF_8_ENC
]; 
1573       if (ptr
[0] == '\0') { 
1574         /* 0 isn't a legal data character. Furthermore a document 
1575            entity can only start with ASCII characters.  So the only 
1576            way this can fail to be big-endian UTF-16 is if it's an
1577            external parsed general entity that's labelled as 
1580         if (state 
== XML_CONTENT_STATE 
&& INIT_ENC_INDEX(enc
) == UTF_16LE_ENC
) 
1582         *encPtr 
= encodingTable
[UTF_16BE_ENC
]; 
1583         return XmlTok(*encPtr
, state
, ptr
, end
, nextTokPtr
); 
1585       else if (ptr
[1] == '\0') { 
1586         /* We could recover here in the case: 
1587             - parsing an external entity 
1589             - no externally specified encoding 
1590             - no encoding declaration 
1591            by assuming UTF-16LE.  But we don't, because this would mean when 
1592            presented just with a single byte, we couldn't reliably determine 
1593            whether we needed further bytes. 
1595         if (state 
== XML_CONTENT_STATE
) 
1597         *encPtr 
= encodingTable
[UTF_16LE_ENC
]; 
1598         return XmlTok(*encPtr
, state
, ptr
, end
, nextTokPtr
); 
1603   *encPtr 
= encodingTable
[INIT_ENC_INDEX(enc
)]; 
1604   return XmlTok(*encPtr
, state
, ptr
, end
, nextTokPtr
); 
1610 #include "xmltok_ns.c" 
1616 #define NS(x) x ## NS 
1617 #define ns(x) x ## _ns 
1619 #include "xmltok_ns.c" 
1625 XmlInitUnknownEncodingNS(void *mem
, 
1630   ENCODING 
*enc 
= XmlInitUnknownEncoding(mem
, table
, convert
, userData
); 
1632     ((struct normal_encoding 
*)enc
)->type
[ASCII_COLON
] = BT_COLON
;