icuSources/tools/toolutil/xmlparser.cpp

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 2004-2010, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  xmlparser.cpp
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2004jul21
  14 *   created by: Andy Heninger
  15 */
  16
  17 #include <stdio.h>
  18 #include "unicode/uchar.h"
  19 #include "unicode/ucnv.h"
  20 #include "unicode/regex.h"
  21 #include "filestrm.h"
  22 #include "xmlparser.h"
  23
  24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION
  25
// character constants
//   Code unit values for the ASCII characters that the parser needs to name explicitly.
enum {
    x_QUOT=0x22,    // double quote
    x_AMP=0x26,     // ampersand
    x_APOS=0x27,    // apostrophe (single quote)
    x_LT=0x3c,      // less-than sign
    x_GT=0x3e,      // greater-than sign
    x_l=0x6c        // lowercase letter l; parseFile() searches for it to step past "<?xml"
};

// Regular expression fragments.  These are string literals that get pasted together
//   (by adjacent string literal concatenation) into the patterns built in the
//   UXMLParser constructor.

// A set matching one XML white space character: space, tab, CR or LF.
#define  XML_SPACES "[ \\u0009\\u000d\\u000a]"

// XML #4
//   A set matching one character that may start a name.
#define  XML_NAMESTARTCHAR "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \
                    "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]" \
                    "[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]" \
                    "[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]"

//  XML #5
//   A set matching one character that may appear inside a name:
//   every name start character plus '-', '.', digits and a few more ranges.
#define  XML_NAMECHAR "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]"

//  XML #6
//   A complete name: one name start character followed by any number of name characters.
//   Note that this expands to a sequence without enclosing parentheses; users that
//   want to capture or group it add their own.
#define  XML_NAME    XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*"

U_NAMESPACE_BEGIN

// ICU boilerplate macros for the two classes implemented in this file
//   (presumably they supply the class ID / RTTI functions -- see the macro definition).
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser)
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement)
  54
//
//   UXMLParser constructor.   Mostly just initializes the ICU regexes that are
//                             used for parsing.
//
//       Every matcher below is built from a pattern string, the value 0 and the
//       caller's status, so a failure while building any of them is reported
//       through status.  All patterns except the one for mAmps are passed with
//       US_INV; mAmps uses the plain UnicodeString(const char *) constructor.
//
UXMLParser::UXMLParser(UErrorCode &status) :
      //  XML Declaration.  XML Production #23.
      //      example:  "<?xml version=1.0 encoding="utf-16" ?>
      //      This is a sloppy implementation - just look for the leading <?xml and the closing ?>
      //            allow for a possible leading BOM.
      mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status),

      //  XML Comment   production #15
      //     example:  "<!-- whatever -->
      //       note, does not detect an illegal "--" within comments
      //       note, ".+?" needs at least one character, so "<!---->" is not matched
      mXMLComment(UnicodeString("(?s)<!--.+?-->", -1, US_INV), 0, status),

      //  XML Spaces
      //      production [3]
      //      One or more white space characters.
      mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status),

      //  XML Doctype decl  production #28
      //     example   "<!DOCTYPE foo SYSTEM "somewhere" >
      //       or      "<!DOCTYPE foo [internal dtd]>
      //    TODO:  we don't actually parse the DOCTYPE or internal subsets.
      //           Some internal dtd subsets could confuse this simple-minded
      //           attempt at skipping over them, specifically, occurrences
      //           of closing square brackets.  These could appear in comments,
      //           or in parameter entity declarations, for example.
      mXMLDoctype(UnicodeString(
           "(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)", -1, US_INV
           ), 0, status),

      //  XML PI     production #16
      //     example   "<?target stuff?>
      //       note, this pattern also matches an XML declaration ("<?xml ... ?>")
      mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status),

      //  XML Element Start   Productions #40, #41
      //          example   <foo att1='abc'  att2="d e f" >
      //      capture #1:  the tag name
      //
      mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
          "(?:"
                XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
                "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
          ")*"                                                             //   * for zero or more attributes.
          XML_SPACES "*?>", -1, US_INV), 0, status),                               // match " >"

      //  XML Element End     production #42
      //     example   </foo>
      //      capture #1:  the tag name
      mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status),

      // XML Element Empty    production #44
      //     example   <foo att1="abc"   att2="d e f" />
      //      capture #1:  the tag name
      //      Same as the element start pattern except that it ends with "/>".
      mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
          "(?:"
                XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
                "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
          ")*"                                                             //   * for zero or more attributes.
          XML_SPACES "*?/>", -1, US_INV), 0, status),                              // match " />"


      // XMLCharData.  Everything but '<'.  Note that & will be dealt with later.
      //   The pattern can match an empty string.
      mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status),

      // Attribute name = "value".  XML Productions 10, 40/41
      //  Capture group 1 is name,
      //                2 is the attribute value, including the quotes.
      //
      //   Note that attributes are scanned twice.  The first time is with
      //        the regex for an entire element start.  There, the attributes
      //        are checked syntactically, but not separated out one by one.
      //        Here, we match a single attribute, and make its name and
      //        attribute value available to the parser code.
      mAttrValue(UnicodeString(XML_SPACES "+("  XML_NAME ")"  XML_SPACES "*=" XML_SPACES "*"
         "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status),


      // Matches a single XML white space character; createElement() replaces
      //   each match in an attribute value with a plain space.
      mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status),

      // Match any of the new-line sequences in content.
      //   All are changed to \u000a.
      mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status),

      // & char references
      //   We will figure out what we've got based on which capture group has content.
      //   The last one is a catchall for unrecognized entity references..
      //             1     2     3      4      5           6                    7          8
      mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"),
                0, status),

      fNames(status),
      fElementStack(status),
      fOneLF((UChar)0x0a)        // Plain new-line string, used in new line normalization.
      {
      }
 150
 151 UXMLParser *
 152 UXMLParser::createParser(UErrorCode &errorCode) {
 153     if (U_FAILURE(errorCode)) {
 154         return NULL;
 155     } else {
 156         return new UXMLParser(errorCode);
 157     }
 158 }
 159
// Destructor.  The body is empty; nothing is released explicitly here.
UXMLParser::~UXMLParser() {}
 161
 162 UXMLElement *
 163 UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) {
 164     char bytes[4096], charsetBuffer[100];
 165     FileStream *f;
 166     const char *charset, *pb;
 167     UnicodeString src;
 168     UConverter *cnv;
 169     UChar *buffer, *pu;
 170     int32_t fileLength, bytesLength, length, capacity;
 171     UBool flush;
 172
 173     if(U_FAILURE(errorCode)) {
 174         return NULL;
 175     }
 176
 177     f=T_FileStream_open(filename, "rb");
 178     if(f==NULL) {
 179         errorCode=U_FILE_ACCESS_ERROR;
 180         return NULL;
 181     }
 182
 183     bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
 184     if(bytesLength<(int32_t)sizeof(bytes)) {
 185         // we have already read the entire file
 186         fileLength=bytesLength;
 187     } else {
 188         // get the file length
 189         fileLength=T_FileStream_size(f);
 190     }
 191
 192     /*
 193      * get the charset:
 194      * 1. Unicode signature
 195      * 2. treat as ISO-8859-1 and read XML encoding="charser"
 196      * 3. default to UTF-8
 197      */
 198     charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode);
 199     if(U_SUCCESS(errorCode) && charset!=NULL) {
 200         // open converter according to Unicode signature
 201         cnv=ucnv_open(charset, &errorCode);
 202     } else {
 203         // read as Latin-1 and parse the XML declaration and encoding
 204         cnv=ucnv_open("ISO-8859-1", &errorCode);
 205         if(U_FAILURE(errorCode)) {
 206             // unexpected error opening Latin-1 converter
 207             goto exit;
 208         }
 209
 210         buffer=src.getBuffer(bytesLength);
 211         if(buffer==NULL) {
 212             // unexpected failure to reserve some string capacity
 213             errorCode=U_MEMORY_ALLOCATION_ERROR;
 214             goto exit;
 215         }
 216         pb=bytes;
 217         pu=buffer;
 218         ucnv_toUnicode(
 219             cnv,
 220             &pu, buffer+src.getCapacity(),
 221             &pb, bytes+bytesLength,
 222             NULL, TRUE, &errorCode);
 223         src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
 224         ucnv_close(cnv);
 225         cnv=NULL;
 226         if(U_FAILURE(errorCode)) {
 227             // unexpected error in conversion from Latin-1
 228             src.remove();
 229             goto exit;
 230         }
 231
 232         // parse XML declaration
 233         if(mXMLDecl.reset(src).lookingAt(0, errorCode)) {
 234             int32_t declEnd=mXMLDecl.end(errorCode);
 235             // go beyond <?xml
 236             int32_t pos=src.indexOf((UChar)x_l)+1;
 237
 238             mAttrValue.reset(src);
 239             while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) {  // loop runs once per attribute on this element.
 240                 UnicodeString attName  = mAttrValue.group(1, errorCode);
 241                 UnicodeString attValue = mAttrValue.group(2, errorCode);
 242
 243                 // Trim the quotes from the att value.  These are left over from the original regex
 244                 //   that parsed the attribue, which couldn't conveniently strip them.
 245                 attValue.remove(0,1);                    // one char from the beginning
 246                 attValue.truncate(attValue.length()-1);  // and one from the end.
 247
 248                 if(attName==UNICODE_STRING("encoding", 8)) {
 249                     length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer));
 250                     charset=charsetBuffer;
 251                     break;
 252                 }
 253                 pos = mAttrValue.end(2, errorCode);
 254             }
 255
 256             if(charset==NULL) {
 257                 // default to UTF-8
 258                 charset="UTF-8";
 259             }
 260             cnv=ucnv_open(charset, &errorCode);
 261         }
 262     }
 263
 264     if(U_FAILURE(errorCode)) {
 265         // unable to open the converter
 266         goto exit;
 267     }
 268
 269     // convert the file contents
 270     capacity=fileLength;        // estimated capacity
 271     src.getBuffer(capacity);
 272     src.releaseBuffer(0);       // zero length
 273     flush=FALSE;
 274     for(;;) {
 275         // convert contents of bytes[bytesLength]
 276         pb=bytes;
 277         for(;;) {
 278             length=src.length();
 279             buffer=src.getBuffer(capacity);
 280             if(buffer==NULL) {
 281                 // unexpected failure to reserve some string capacity
 282                 errorCode=U_MEMORY_ALLOCATION_ERROR;
 283                 goto exit;
 284             }
 285
 286             pu=buffer+length;
 287             ucnv_toUnicode(
 288                 cnv, &pu, buffer+src.getCapacity(),
 289                 &pb, bytes+bytesLength,
 290                 NULL, FALSE, &errorCode);
 291             src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
 292             if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
 293                 errorCode=U_ZERO_ERROR;
 294                 capacity=(3*src.getCapacity())/2; // increase capacity by 50%
 295             } else {
 296                 break;
 297             }
 298         }
 299
 300         if(U_FAILURE(errorCode)) {
 301             break; // conversion error
 302         }
 303
 304         if(flush) {
 305             break; // completely converted the file
 306         }
 307
 308         // read next block
 309         bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
 310         if(bytesLength==0) {
 311             // reached end of file, convert once more to flush the converter
 312             flush=TRUE;
 313         }
 314     };
 315
 316 exit:
 317     ucnv_close(cnv);
 318     T_FileStream_close(f);
 319
 320     if(U_SUCCESS(errorCode)) {
 321         return parse(src, errorCode);
 322     } else {
 323         return NULL;
 324     }
 325 }
 326
 327 UXMLElement *
 328 UXMLParser::parse(const UnicodeString &src, UErrorCode &status) {
 329     if(U_FAILURE(status)) {
 330         return NULL;
 331     }
 332
 333     UXMLElement   *root = NULL;
 334     fPos = 0; // TODO use just a local pos variable and pass it into functions
 335               // where necessary?
 336
 337     // set all matchers to work on the input string
 338     mXMLDecl.reset(src);
 339     mXMLComment.reset(src);
 340     mXMLSP.reset(src);
 341     mXMLDoctype.reset(src);
 342     mXMLPI.reset(src);
 343     mXMLElemStart.reset(src);
 344     mXMLElemEnd.reset(src);
 345     mXMLElemEmpty.reset(src);
 346     mXMLCharData.reset(src);
 347     mAttrValue.reset(src);
 348     mAttrNormalizer.reset(src);
 349     mNewLineNormalizer.reset(src);
 350     mAmps.reset(src);
 351
 352     // Consume the XML Declaration, if present.
 353     if (mXMLDecl.lookingAt(fPos, status)) {
 354         fPos = mXMLDecl.end(status);
 355     }
 356
 357     // Consume "misc" [XML production 27] appearing before DocType
 358     parseMisc(status);
 359
 360     // Consume a DocType declaration, if present.
 361     if (mXMLDoctype.lookingAt(fPos, status)) {
 362         fPos = mXMLDoctype.end(status);
 363     }
 364
 365     // Consume additional "misc" [XML production 27] appearing after the DocType
 366     parseMisc(status);
 367
 368     // Get the root element
 369     if (mXMLElemEmpty.lookingAt(fPos, status)) {
 370         // Root is an empty element (no nested elements or content)
 371         root = createElement(mXMLElemEmpty, status);
 372         fPos = mXMLElemEmpty.end(status);
 373     } else {
 374         if (mXMLElemStart.lookingAt(fPos, status) == FALSE) {
 375             error("Root Element expected", status);
 376             goto errorExit;
 377         }
 378         root = createElement(mXMLElemStart, status);
 379         UXMLElement  *el = root;
 380
 381         //
 382         // This is the loop that consumes the root element of the document,
 383         //      including all nested content.   Nested elements are handled by
 384         //      explicit pushes/pops of the element stack; there is no recursion
 385         //      in the control flow of this code.
 386         //      "el" always refers to the current element, the one to which content
 387         //      is being added.  It is above the top of the element stack.
 388         for (;;) {
 389             // Nested Element Start
 390             if (mXMLElemStart.lookingAt(fPos, status)) {
 391                 UXMLElement *t = createElement(mXMLElemStart, status);
 392                 el->fChildren.addElement(t, status);
 393                 t->fParent = el;
 394                 fElementStack.push(el, status);
 395                 el = t;
 396                 continue;
 397             }
 398
 399             // Text Content.  String is concatenated onto the current node's content,
 400             //                but only if it contains something other than spaces.
 401             UnicodeString s = scanContent(status);
 402             if (s.length() > 0) {
 403                 mXMLSP.reset(s);
 404                 if (mXMLSP.matches(status) == FALSE) {
 405                     // This chunk of text contains something other than just
 406                     //  white space. Make a child node for it.
 407                     replaceCharRefs(s, status);
 408                     el->fChildren.addElement(s.clone(), status);
 409                 }
 410                 mXMLSP.reset(src);    // The matchers need to stay set to the main input string.
 411                 continue;
 412             }
 413
 414             // Comments.  Discard.
 415             if (mXMLComment.lookingAt(fPos, status)) {
 416                 fPos = mXMLComment.end(status);
 417                 continue;
 418             }
 419
 420             // PIs.  Discard.
 421             if (mXMLPI.lookingAt(fPos, status)) {
 422                 fPos = mXMLPI.end(status);
 423                 continue;
 424             }
 425
 426             // Element End
 427             if (mXMLElemEnd.lookingAt(fPos, status)) {
 428                 fPos = mXMLElemEnd.end(0, status);
 429                 const UnicodeString name = mXMLElemEnd.group(1, status);
 430                 if (name != *el->fName) {
 431                     error("Element start / end tag mismatch", status);
 432                     goto errorExit;
 433                 }
 434                 if (fElementStack.empty()) {
 435                     // Close of the root element.  We're done with the doc.
 436                     el = NULL;
 437                     break;
 438                 }
 439                 el = (UXMLElement *)fElementStack.pop();
 440                 continue;
 441             }
 442
 443             // Empty Element.  Stored as a child of the current element, but not stacked.
 444             if (mXMLElemEmpty.lookingAt(fPos, status)) {
 445                 UXMLElement *t = createElement(mXMLElemEmpty, status);
 446                 el->fChildren.addElement(t, status);
 447                 continue;
 448             }
 449
 450             // Hit something within the document that doesn't match anything.
 451             //   It's an error.
 452             error("Unrecognized markup", status);
 453             break;
 454         }
 455
 456         if (el != NULL || !fElementStack.empty()) {
 457             // We bailed out early, for some reason.
 458             error("Root element not closed.", status);
 459             goto errorExit;
 460         }
 461     }
 462
 463     // Root Element parse is complete.
 464     // Consume the annoying xml "Misc" that can appear at the end of the doc.
 465     parseMisc(status);
 466
 467     // We should have reached the end of the input
 468     if (fPos != src.length()) {
 469         error("Extra content at the end of the document", status);
 470         goto errorExit;
 471     }
 472
 473     // Success!
 474     return root;
 475
 476 errorExit:
 477     delete root;
 478     return NULL;
 479 }
 480
 481 //
 482 //  createElement
 483 //      We've just matched an element start tag.  Create and fill in a UXMLElement object
 484 //      for it.
 485 //
 486 UXMLElement *
 487 UXMLParser::createElement(RegexMatcher  &mEl, UErrorCode &status) {
 488     // First capture group is the element's name.
 489     UXMLElement *el = new UXMLElement(this, intern(mEl.group(1, status), status), status);
 490
 491     // Scan for attributes.
 492     int32_t   pos = mEl.end(1, status);  // The position after the end of the tag name
 493
 494     while (mAttrValue.lookingAt(pos, status)) {  // loop runs once per attribute on this element.
 495         UnicodeString attName  = mAttrValue.group(1, status);
 496         UnicodeString attValue = mAttrValue.group(2, status);
 497
 498         // Trim the quotes from the att value.  These are left over from the original regex
 499         //   that parsed the attribue, which couldn't conveniently strip them.
 500         attValue.remove(0,1);                    // one char from the beginning
 501         attValue.truncate(attValue.length()-1);  // and one from the end.
 502
 503         // XML Attribue value normalization.
 504         // This is one of the really screwy parts of the XML spec.
 505         // See http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize
 506         // Note that non-validating parsers must treat all entities as type CDATA
 507         //   which simplifies things some.
 508
 509         // Att normalization step 1:  normalize any newlines in the attribute value
 510         mNewLineNormalizer.reset(attValue);
 511         attValue = mNewLineNormalizer.replaceAll(fOneLF, status);
 512
 513         // Next change all xml white space chars to plain \u0020 spaces.
 514         mAttrNormalizer.reset(attValue);
 515         UnicodeString oneSpace((UChar)0x0020);
 516         attValue = mAttrNormalizer.replaceAll(oneSpace, status);
 517
 518         // Replace character entities.
 519         replaceCharRefs(attValue, status);
 520
 521         // Save the attribute name and value in our document structure.
 522         el->fAttNames.addElement((void *)intern(attName, status), status);
 523         el->fAttValues.addElement(attValue.clone(), status);
 524         pos = mAttrValue.end(2, status);
 525     }
 526     fPos = mEl.end(0, status);
 527     return el;
 528 }
 529
 530 //
 531 //  parseMisc
 532 //     Consume XML "Misc" [production #27]
 533 //        which is any combination of space, PI and comments
 534 //      Need to watch end-of-input because xml MISC stuff is allowed after
 535 //        the document element, so we WILL scan off the end in this function
 536 //
 537 void
 538 UXMLParser::parseMisc(UErrorCode &status)  {
 539     for (;;) {
 540         if (fPos >= mXMLPI.input().length()) {
 541             break;
 542         }
 543         if (mXMLPI.lookingAt(fPos, status)) {
 544             fPos = mXMLPI.end(status);
 545             continue;
 546         }
 547         if (mXMLSP.lookingAt(fPos, status)) {
 548             fPos = mXMLSP.end(status);
 549             continue;
 550         }
 551         if (mXMLComment.lookingAt(fPos, status)) {
 552             fPos = mXMLComment.end(status);
 553             continue;
 554         }
 555         break;
 556     }
 557 }
 558
 559 //
 560 //  Scan for document content.
 561 //
 562 UnicodeString
 563 UXMLParser::scanContent(UErrorCode &status) {
 564     UnicodeString  result;
 565     if (mXMLCharData.lookingAt(fPos, status)) {
 566         result = mXMLCharData.group((int32_t)0, status);
 567         // Normalize the new-lines.  (Before char ref substitution)
 568         mNewLineNormalizer.reset(result);
 569         result = mNewLineNormalizer.replaceAll(fOneLF, status);
 570
 571         // TODO:  handle CDATA
 572         fPos = mXMLCharData.end(0, status);
 573     }
 574
 575     return result;
 576 }
 577
 578 //
 579 //   replaceCharRefs
 580 //
 581 //      replace the char entities &lt;  &amp; &#123; &#x12ab; etc. in a string
 582 //       with the corresponding actual character.
 583 //
 584 void
 585 UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) {
 586     UnicodeString result;
 587     UnicodeString replacement;
 588     int     i;
 589
 590     mAmps.reset(s);
 591     // See the initialization for the regex matcher mAmps.
 592     //    Which entity we've matched is determined by which capture group has content,
 593     //      which is flaged by start() of that group not being -1.
 594     while (mAmps.find()) {
 595         if (mAmps.start(1, status) != -1) {
 596             replacement.setTo((UChar)x_AMP);
 597         } else if (mAmps.start(2, status) != -1) {
 598             replacement.setTo((UChar)x_LT);
 599         } else if (mAmps.start(3, status) != -1) {
 600             replacement.setTo((UChar)x_GT);
 601         } else if (mAmps.start(4, status) != -1) {
 602             replacement.setTo((UChar)x_APOS);
 603         } else if (mAmps.start(5, status) != -1) {
 604             replacement.setTo((UChar)x_QUOT);
 605         } else if (mAmps.start(6, status) != -1) {
 606             UnicodeString hexString = mAmps.group(6, status);
 607             UChar32 val = 0;
 608             for (i=0; i<hexString.length(); i++) {
 609                 val = (val << 4) + u_digit(hexString.charAt(i), 16);
 610             }
 611             // TODO:  some verification that the character is valid
 612             replacement.setTo(val);
 613         } else if (mAmps.start(7, status) != -1) {
 614             UnicodeString decimalString = mAmps.group(7, status);
 615             UChar32 val = 0;
 616             for (i=0; i<decimalString.length(); i++) {
 617                 val = val*10 + u_digit(decimalString.charAt(i), 10);
 618             }
 619             // TODO:  some verification that the character is valid
 620             replacement.setTo(val);
 621         } else {
 622             // An unrecognized &entity;  Leave it alone.
 623             //  TODO:  check that it really looks like an entity, and is not some
 624             //         random & in the text.
 625             replacement = mAmps.group((int32_t)0, status);
 626         }
 627         mAmps.appendReplacement(result, replacement, status);
 628     }
 629     mAmps.appendTail(result);
 630     s = result;
 631 }
 632
 633 void
 634 UXMLParser::error(const char *message, UErrorCode &status) {
 635     // TODO:  something better here...
 636     const UnicodeString &src=mXMLDecl.input();
 637     int  line = 0;
 638     int  ci = 0;
 639     while (ci < fPos && ci>=0) {
 640         ci = src.indexOf((UChar)0x0a, ci+1);
 641         line++;
 642     }
 643     fprintf(stderr, "Error: %s at line %d\n", message, line);
 644     if (U_SUCCESS(status)) {
 645         status = U_PARSE_ERROR;
 646     }
 647 }
 648
 649 // intern strings like in Java
 650
 651 const UnicodeString *
 652 UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) {
 653     const UHashElement *he=fNames.find(s);
 654     if(he!=NULL) {
 655         // already a known name, return its hashed key pointer
 656         return (const UnicodeString *)he->key.pointer;
 657     } else {
 658         // add this new name and return its hashed key pointer
 659         fNames.puti(s, 0, errorCode);
 660         he=fNames.find(s);
 661         return (const UnicodeString *)he->key.pointer;
 662     }
 663 }
 664
 665 const UnicodeString *
 666 UXMLParser::findName(const UnicodeString &s) const {
 667     const UHashElement *he=fNames.find(s);
 668     if(he!=NULL) {
 669         // a known name, return its hashed key pointer
 670         return (const UnicodeString *)he->key.pointer;
 671     } else {
 672         // unknown name
 673         return NULL;
 674     }
 675 }
 676
 677 // UXMLElement ------------------------------------------------------------- ***
 678
// Constructor.
//   parser    the parser that is building this element; it is used later to look up names
//   name      the tag name; the pointer itself is stored, not a copy of the string
//   errorCode passed on to the constructors of the three vectors
//   The element starts out without attributes, children or a parent.
UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) :
   fParser(parser),
   fName(name),
   fAttNames(errorCode),
   fAttValues(errorCode),
   fChildren(errorCode),
   fParent(NULL)
{
}
 688
 689 UXMLElement::~UXMLElement() {
 690     int   i;
 691     // attribute names are owned by the UXMLParser, don't delete them here
 692     for (i=fAttValues.size()-1; i>=0; i--) {
 693         delete (UObject *)fAttValues.elementAt(i);
 694     }
 695     for (i=fChildren.size()-1; i>=0; i--) {
 696         delete (UObject *)fChildren.elementAt(i);
 697     }
 698 }
 699
// Returns the tag name of this element.
//   The reference is to the string that fName points to, not to a copy.
const UnicodeString &
UXMLElement::getTagName() const {
    return *fName;
}
 704
 705 UnicodeString
 706 UXMLElement::getText(UBool recurse) const {
 707     UnicodeString text;
 708     appendText(text, recurse);
 709     return text;
 710 }
 711
 712 void
 713 UXMLElement::appendText(UnicodeString &text, UBool recurse) const {
 714     const UObject *node;
 715     int32_t i, count=fChildren.size();
 716     for(i=0; i<count; ++i) {
 717         node=(const UObject *)fChildren.elementAt(i);
 718         const UnicodeString *s=dynamic_cast<const UnicodeString *>(node);
 719         if(s!=NULL) {
 720             text.append(*s);
 721         } else if(recurse) /* must be a UXMLElement */ {
 722             ((const UXMLElement *)node)->appendText(text, recurse);
 723         }
 724     }
 725 }
 726
// Returns the number of attributes of this element
//   (the number of entries in the list of attribute names).
int32_t
UXMLElement::countAttributes() const {
    return fAttNames.size();
}
 731
 732 const UnicodeString *
 733 UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const {
 734     if(0<=i && i<fAttNames.size()) {
 735         name.setTo(*(const UnicodeString *)fAttNames.elementAt(i));
 736         value.setTo(*(const UnicodeString *)fAttValues.elementAt(i));
 737         return &value; // or return (UnicodeString *)fAttValues.elementAt(i);
 738     } else {
 739         return NULL;
 740     }
 741 }
 742
 743 const UnicodeString *
 744 UXMLElement::getAttribute(const UnicodeString &name) const {
 745     // search for the attribute name by comparing the interned pointer,
 746     // not the string contents
 747     const UnicodeString *p=fParser->findName(name);
 748     if(p==NULL) {
 749         return NULL; // no such attribute seen by the parser at all
 750     }
 751
 752     int32_t i, count=fAttNames.size();
 753     for(i=0; i<count; ++i) {
 754         if(p==(const UnicodeString *)fAttNames.elementAt(i)) {
 755             return (const UnicodeString *)fAttValues.elementAt(i);
 756         }
 757     }
 758     return NULL;
 759 }
 760
// Returns the number of child nodes of this element.
//   parse() stores both nested elements and text strings as children.
int32_t
UXMLElement::countChildren() const {
    return fChildren.size();
}
 765
 766 const UObject *
 767 UXMLElement::getChild(int32_t i, UXMLNodeType &type) const {
 768     if(0<=i && i<fChildren.size()) {
 769         const UObject *node=(const UObject *)fChildren.elementAt(i);
 770         if(dynamic_cast<const UXMLElement *>(node)!=NULL) {
 771             type=UXML_NODE_TYPE_ELEMENT;
 772         } else {
 773             type=UXML_NODE_TYPE_STRING;
 774         }
 775         return node;
 776     } else {
 777         return NULL;
 778     }
 779 }
 780
 781 const UXMLElement *
 782 UXMLElement::nextChildElement(int32_t &i) const {
 783     if(i<0) {
 784         return NULL;
 785     }
 786
 787     const UObject *node;
 788     int32_t count=fChildren.size();
 789     while(i<count) {
 790         node=(const UObject *)fChildren.elementAt(i++);
 791         const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node);
 792         if(elem!=NULL) {
 793             return elem;
 794         }
 795     }
 796     return NULL;
 797 }
 798
 799 const UXMLElement *
 800 UXMLElement::getChildElement(const UnicodeString &name) const {
 801     // search for the element name by comparing the interned pointer,
 802     // not the string contents
 803     const UnicodeString *p=fParser->findName(name);
 804     if(p==NULL) {
 805         return NULL; // no such element seen by the parser at all
 806     }
 807
 808     const UObject *node;
 809     int32_t i, count=fChildren.size();
 810     for(i=0; i<count; ++i) {
 811         node=(const UObject *)fChildren.elementAt(i);
 812         const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node);
 813         if(elem!=NULL) {
 814             if(p==elem->fName) {
 815                 return elem;
 816             }
 817         }
 818     }
 819     return NULL;
 820 }
 821
 822 U_NAMESPACE_END
 823
 824 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
 825