icuSources/tools/toolutil/xmlparser.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 *******************************************************************************
   5 *
   6 *   Copyright (C) 2004-2010, International Business Machines
   7 *   Corporation and others.  All Rights Reserved.
   8 *
   9 *******************************************************************************
  10 *   file name:  xmlparser.cpp
  11 *   encoding:   UTF-8
  12 *   tab size:   8 (not used)
  13 *   indentation:4
  14 *
  15 *   created on: 2004jul21
  16 *   created by: Andy Heninger
  17 */
  18
  19 #include <stdio.h>
  20 #include "unicode/uchar.h"
  21 #include "unicode/ucnv.h"
  22 #include "unicode/regex.h"
  23 #include "filestrm.h"
  24 #include "xmlparser.h"
  25
  26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION
  27
// Character constants: code points of the ASCII characters that this file
// refers to by value.
enum {
    x_QUOT=0x22,    // '"'
    x_AMP=0x26,     // '&'
    x_APOS=0x27,    // '\''
    x_LT=0x3c,      // '<'
    x_GT=0x3e,      // '>'
    x_l=0x6c        // 'l'; parseFile() searches for it to step past "<?xml"
};
  37
// Regular expression fragments, written as C string literals so that they can be
// concatenated into the larger patterns used by the UXMLParser constructor.

// One white space character: space, tab, carriage return or line feed.
#define  XML_SPACES "[ \\u0009\\u000d\\u000a]"

// XML #4
//   A character that may start a name.
#define  XML_NAMESTARTCHAR "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \
                    "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]" \
                    "[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]" \
                    "[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]"

//  XML #5
//   A character that may appear in a name after the first one: any name start
//   character plus '-', '.', the digits and a few additional ranges.
#define  XML_NAMECHAR "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]"

//  XML #6
//   A complete name: one name start character followed by zero or more name characters.
#define  XML_NAME    XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*"
  51
  52 U_NAMESPACE_BEGIN
  53
  54 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser)
  55 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement)
  56
//
//   UXMLParser constructor.   Mostly just initializes the ICU regexes that are
//                             used for parsing.
//
//       status   in/out ICU error code; it is handed to every regex matcher
//                and container member constructed here.
//
UXMLParser::UXMLParser(UErrorCode &status) :
      //  XML Declaration.  XML Production #23.
      //      example:  <?xml version="1.0" encoding="utf-16" ?>
      //      This is a sloppy implementation - just look for the leading <?xml and the closing ?>
      //            allow for a possible leading BOM.
      mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status),

      //  XML Comment   production #15
      //     example:  <!-- whatever -->
      //       note, does not detect an illegal "--" within comments
      mXMLComment(UnicodeString("(?s)<!--.+?-->", -1, US_INV), 0, status),

      //  XML Spaces
      //      production [3]
      mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status),

      //  XML Doctype decl  production #28
      //     example   <!DOCTYPE foo SYSTEM "somewhere" >
      //       or      <!DOCTYPE foo [internal dtd]>
      //    TODO:  we don't actually parse the DOCTYPE or internal subsets.
      //           Some internal dtd subsets could confuse this simple-minded
      //           attempt at skipping over them, specifically, occurrences
      //           of closing square brackets.  These could appear in comments,
      //           or in parameter entity declarations, for example.
      mXMLDoctype(UnicodeString(
           "(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)", -1, US_INV
           ), 0, status),

      //  XML PI     production #16
      //     example   <?target stuff?>
      mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status),

      //  XML Element Start   Productions #40, #41
      //          example   <foo att1='abc'  att2="d e f" >
      //      capture #1:  the tag name
      //
      mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
          "(?:"
                XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
                "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
          ")*"                                                             //   * for zero or more attributes.
          XML_SPACES "*?>", -1, US_INV), 0, status),                               // match " >"

      //  XML Element End     production #42
      //     example   </foo>
      //      capture #1:  the tag name
      mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status),

      // XML Element Empty    production #44
      //     example   <foo att1="abc"   att2="d e f" />
      //     Same pattern as mXMLElemStart, except that it ends with "/>".
      mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
          "(?:"
                XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
                "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
          ")*"                                                             //   * for zero or more attributes.
          XML_SPACES "*?/>", -1, US_INV), 0, status),                              // match " />"


      // XMLCharData.  Everything but '<'.  Note that & will be dealt with later.
      mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status),

      // Attribute name = "value".  XML Productions 10, 40/41
      //  Capture group 1 is name,
      //                2 is the attribute value, including the quotes.
      //
      //   Note that attributes are scanned twice.  The first time is with
      //        the regex for an entire element start.  There, the attributes
      //        are checked syntactically, but not separated out one by one.
      //        Here, we match a single attribute, and make its name and
      //        attribute value available to the parser code.
      mAttrValue(UnicodeString(XML_SPACES "+("  XML_NAME ")"  XML_SPACES "*=" XML_SPACES "*"
         "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status),


      // Matches a single XML white space character; createElement() uses it to
      //   turn every white space character of an attribute value into a plain space.
      mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status),

      // Match any of the new-line sequences in content.
      //   All are changed to \u000a.
      mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status),

      // & char references
      //   We will figure out what we've got based on which capture group has content.
      //   Groups 1-5 are the predefined entities, 6 holds the digits of a hexadecimal
      //   character reference and 7 those of a decimal one.
      //   The last one is a catchall for unrecognized entity references..
      //             1     2     3      4      5           6                    7          8
      mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"),
                0, status),

      // Table of interned element and attribute names; see intern() and findName().
      fNames(status),
      // Stack of the ancestors of the element currently being parsed; see parse().
      fElementStack(status),
      fOneLF((UChar)0x0a)        // Plain new-line string, used in new line normalization.
      {
      }
 152
 153 UXMLParser *
 154 UXMLParser::createParser(UErrorCode &errorCode) {
 155     if (U_FAILURE(errorCode)) {
 156         return NULL;
 157     } else {
 158         return new UXMLParser(errorCode);
 159     }
 160 }
 161
// Destructor.  The body is empty: nothing is released explicitly here.
UXMLParser::~UXMLParser() {}
 163
 164 UXMLElement *
 165 UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) {
 166     char bytes[4096], charsetBuffer[100];
 167     FileStream *f;
 168     const char *charset, *pb;
 169     UnicodeString src;
 170     UConverter *cnv;
 171     UChar *buffer, *pu;
 172     int32_t fileLength, bytesLength, length, capacity;
 173     UBool flush;
 174
 175     if(U_FAILURE(errorCode)) {
 176         return NULL;
 177     }
 178
 179     f=T_FileStream_open(filename, "rb");
 180     if(f==NULL) {
 181         errorCode=U_FILE_ACCESS_ERROR;
 182         return NULL;
 183     }
 184
 185     bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
 186     if(bytesLength<(int32_t)sizeof(bytes)) {
 187         // we have already read the entire file
 188         fileLength=bytesLength;
 189     } else {
 190         // get the file length
 191         fileLength=T_FileStream_size(f);
 192     }
 193
 194     /*
 195      * get the charset:
 196      * 1. Unicode signature
 197      * 2. treat as ISO-8859-1 and read XML encoding="charser"
 198      * 3. default to UTF-8
 199      */
 200     charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode);
 201     if(U_SUCCESS(errorCode) && charset!=NULL) {
 202         // open converter according to Unicode signature
 203         cnv=ucnv_open(charset, &errorCode);
 204     } else {
 205         // read as Latin-1 and parse the XML declaration and encoding
 206         cnv=ucnv_open("ISO-8859-1", &errorCode);
 207         if(U_FAILURE(errorCode)) {
 208             // unexpected error opening Latin-1 converter
 209             goto exit;
 210         }
 211
 212         buffer=toUCharPtr(src.getBuffer(bytesLength));
 213         if(buffer==NULL) {
 214             // unexpected failure to reserve some string capacity
 215             errorCode=U_MEMORY_ALLOCATION_ERROR;
 216             goto exit;
 217         }
 218         pb=bytes;
 219         pu=buffer;
 220         ucnv_toUnicode(
 221             cnv,
 222             &pu, buffer+src.getCapacity(),
 223             &pb, bytes+bytesLength,
 224             NULL, TRUE, &errorCode);
 225         src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
 226         ucnv_close(cnv);
 227         cnv=NULL;
 228         if(U_FAILURE(errorCode)) {
 229             // unexpected error in conversion from Latin-1
 230             src.remove();
 231             goto exit;
 232         }
 233
 234         // parse XML declaration
 235         if(mXMLDecl.reset(src).lookingAt(0, errorCode)) {
 236             int32_t declEnd=mXMLDecl.end(errorCode);
 237             // go beyond <?xml
 238             int32_t pos=src.indexOf((UChar)x_l)+1;
 239
 240             mAttrValue.reset(src);
 241             while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) {  // loop runs once per attribute on this element.
 242                 UnicodeString attName  = mAttrValue.group(1, errorCode);
 243                 UnicodeString attValue = mAttrValue.group(2, errorCode);
 244
 245                 // Trim the quotes from the att value.  These are left over from the original regex
 246                 //   that parsed the attribue, which couldn't conveniently strip them.
 247                 attValue.remove(0,1);                    // one char from the beginning
 248                 attValue.truncate(attValue.length()-1);  // and one from the end.
 249
 250                 if(attName==UNICODE_STRING("encoding", 8)) {
 251                     length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer));
 252                     charset=charsetBuffer;
 253                     break;
 254                 }
 255                 pos = mAttrValue.end(2, errorCode);
 256             }
 257
 258             if(charset==NULL) {
 259                 // default to UTF-8
 260                 charset="UTF-8";
 261             }
 262             cnv=ucnv_open(charset, &errorCode);
 263         }
 264     }
 265
 266     if(U_FAILURE(errorCode)) {
 267         // unable to open the converter
 268         goto exit;
 269     }
 270
 271     // convert the file contents
 272     capacity=fileLength;        // estimated capacity
 273     src.getBuffer(capacity);
 274     src.releaseBuffer(0);       // zero length
 275     flush=FALSE;
 276     for(;;) {
 277         // convert contents of bytes[bytesLength]
 278         pb=bytes;
 279         for(;;) {
 280             length=src.length();
 281             buffer=toUCharPtr(src.getBuffer(capacity));
 282             if(buffer==NULL) {
 283                 // unexpected failure to reserve some string capacity
 284                 errorCode=U_MEMORY_ALLOCATION_ERROR;
 285                 goto exit;
 286             }
 287
 288             pu=buffer+length;
 289             ucnv_toUnicode(
 290                 cnv, &pu, buffer+src.getCapacity(),
 291                 &pb, bytes+bytesLength,
 292                 NULL, FALSE, &errorCode);
 293             src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
 294             if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
 295                 errorCode=U_ZERO_ERROR;
 296                 capacity=(3*src.getCapacity())/2; // increase capacity by 50%
 297             } else {
 298                 break;
 299             }
 300         }
 301
 302         if(U_FAILURE(errorCode)) {
 303             break; // conversion error
 304         }
 305
 306         if(flush) {
 307             break; // completely converted the file
 308         }
 309
 310         // read next block
 311         bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
 312         if(bytesLength==0) {
 313             // reached end of file, convert once more to flush the converter
 314             flush=TRUE;
 315         }
 316     };
 317
 318 exit:
 319     ucnv_close(cnv);
 320     T_FileStream_close(f);
 321
 322     if(U_SUCCESS(errorCode)) {
 323         return parse(src, errorCode);
 324     } else {
 325         return NULL;
 326     }
 327 }
 328
 329 UXMLElement *
 330 UXMLParser::parse(const UnicodeString &src, UErrorCode &status) {
 331     if(U_FAILURE(status)) {
 332         return NULL;
 333     }
 334
 335     UXMLElement   *root = NULL;
 336     fPos = 0; // TODO use just a local pos variable and pass it into functions
 337               // where necessary?
 338
 339     // set all matchers to work on the input string
 340     mXMLDecl.reset(src);
 341     mXMLComment.reset(src);
 342     mXMLSP.reset(src);
 343     mXMLDoctype.reset(src);
 344     mXMLPI.reset(src);
 345     mXMLElemStart.reset(src);
 346     mXMLElemEnd.reset(src);
 347     mXMLElemEmpty.reset(src);
 348     mXMLCharData.reset(src);
 349     mAttrValue.reset(src);
 350     mAttrNormalizer.reset(src);
 351     mNewLineNormalizer.reset(src);
 352     mAmps.reset(src);
 353
 354     // Consume the XML Declaration, if present.
 355     if (mXMLDecl.lookingAt(fPos, status)) {
 356         fPos = mXMLDecl.end(status);
 357     }
 358
 359     // Consume "misc" [XML production 27] appearing before DocType
 360     parseMisc(status);
 361
 362     // Consume a DocType declaration, if present.
 363     if (mXMLDoctype.lookingAt(fPos, status)) {
 364         fPos = mXMLDoctype.end(status);
 365     }
 366
 367     // Consume additional "misc" [XML production 27] appearing after the DocType
 368     parseMisc(status);
 369
 370     // Get the root element
 371     if (mXMLElemEmpty.lookingAt(fPos, status)) {
 372         // Root is an empty element (no nested elements or content)
 373         root = createElement(mXMLElemEmpty, status);
 374         fPos = mXMLElemEmpty.end(status);
 375     } else {
 376         if (mXMLElemStart.lookingAt(fPos, status) == FALSE) {
 377             error("Root Element expected", status);
 378             goto errorExit;
 379         }
 380         root = createElement(mXMLElemStart, status);
 381         UXMLElement  *el = root;
 382
 383         //
 384         // This is the loop that consumes the root element of the document,
 385         //      including all nested content.   Nested elements are handled by
 386         //      explicit pushes/pops of the element stack; there is no recursion
 387         //      in the control flow of this code.
 388         //      "el" always refers to the current element, the one to which content
 389         //      is being added.  It is above the top of the element stack.
 390         for (;;) {
 391             // Nested Element Start
 392             if (mXMLElemStart.lookingAt(fPos, status)) {
 393                 UXMLElement *t = createElement(mXMLElemStart, status);
 394                 el->fChildren.addElement(t, status);
 395                 t->fParent = el;
 396                 fElementStack.push(el, status);
 397                 el = t;
 398                 continue;
 399             }
 400
 401             // Text Content.  String is concatenated onto the current node's content,
 402             //                but only if it contains something other than spaces.
 403             UnicodeString s = scanContent(status);
 404             if (s.length() > 0) {
 405                 mXMLSP.reset(s);
 406                 if (mXMLSP.matches(status) == FALSE) {
 407                     // This chunk of text contains something other than just
 408                     //  white space. Make a child node for it.
 409                     replaceCharRefs(s, status);
 410                     el->fChildren.addElement(s.clone(), status);
 411                 }
 412                 mXMLSP.reset(src);    // The matchers need to stay set to the main input string.
 413                 continue;
 414             }
 415
 416             // Comments.  Discard.
 417             if (mXMLComment.lookingAt(fPos, status)) {
 418                 fPos = mXMLComment.end(status);
 419                 continue;
 420             }
 421
 422             // PIs.  Discard.
 423             if (mXMLPI.lookingAt(fPos, status)) {
 424                 fPos = mXMLPI.end(status);
 425                 continue;
 426             }
 427
 428             // Element End
 429             if (mXMLElemEnd.lookingAt(fPos, status)) {
 430                 fPos = mXMLElemEnd.end(0, status);
 431                 const UnicodeString name = mXMLElemEnd.group(1, status);
 432                 if (name != *el->fName) {
 433                     error("Element start / end tag mismatch", status);
 434                     goto errorExit;
 435                 }
 436                 if (fElementStack.empty()) {
 437                     // Close of the root element.  We're done with the doc.
 438                     el = NULL;
 439                     break;
 440                 }
 441                 el = (UXMLElement *)fElementStack.pop();
 442                 continue;
 443             }
 444
 445             // Empty Element.  Stored as a child of the current element, but not stacked.
 446             if (mXMLElemEmpty.lookingAt(fPos, status)) {
 447                 UXMLElement *t = createElement(mXMLElemEmpty, status);
 448                 el->fChildren.addElement(t, status);
 449                 continue;
 450             }
 451
 452             // Hit something within the document that doesn't match anything.
 453             //   It's an error.
 454             error("Unrecognized markup", status);
 455             break;
 456         }
 457
 458         if (el != NULL || !fElementStack.empty()) {
 459             // We bailed out early, for some reason.
 460             error("Root element not closed.", status);
 461             goto errorExit;
 462         }
 463     }
 464
 465     // Root Element parse is complete.
 466     // Consume the annoying xml "Misc" that can appear at the end of the doc.
 467     parseMisc(status);
 468
 469     // We should have reached the end of the input
 470     if (fPos != src.length()) {
 471         error("Extra content at the end of the document", status);
 472         goto errorExit;
 473     }
 474
 475     // Success!
 476     return root;
 477
 478 errorExit:
 479     delete root;
 480     return NULL;
 481 }
 482
 483 //
 484 //  createElement
 485 //      We've just matched an element start tag.  Create and fill in a UXMLElement object
 486 //      for it.
 487 //
 488 UXMLElement *
 489 UXMLParser::createElement(RegexMatcher  &mEl, UErrorCode &status) {
 490     // First capture group is the element's name.
 491     UXMLElement *el = new UXMLElement(this, intern(mEl.group(1, status), status), status);
 492
 493     // Scan for attributes.
 494     int32_t   pos = mEl.end(1, status);  // The position after the end of the tag name
 495
 496     while (mAttrValue.lookingAt(pos, status)) {  // loop runs once per attribute on this element.
 497         UnicodeString attName  = mAttrValue.group(1, status);
 498         UnicodeString attValue = mAttrValue.group(2, status);
 499
 500         // Trim the quotes from the att value.  These are left over from the original regex
 501         //   that parsed the attribue, which couldn't conveniently strip them.
 502         attValue.remove(0,1);                    // one char from the beginning
 503         attValue.truncate(attValue.length()-1);  // and one from the end.
 504
 505         // XML Attribue value normalization.
 506         // This is one of the really screwy parts of the XML spec.
 507         // See http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize
 508         // Note that non-validating parsers must treat all entities as type CDATA
 509         //   which simplifies things some.
 510
 511         // Att normalization step 1:  normalize any newlines in the attribute value
 512         mNewLineNormalizer.reset(attValue);
 513         attValue = mNewLineNormalizer.replaceAll(fOneLF, status);
 514
 515         // Next change all xml white space chars to plain \u0020 spaces.
 516         mAttrNormalizer.reset(attValue);
 517         UnicodeString oneSpace((UChar)0x0020);
 518         attValue = mAttrNormalizer.replaceAll(oneSpace, status);
 519
 520         // Replace character entities.
 521         replaceCharRefs(attValue, status);
 522
 523         // Save the attribute name and value in our document structure.
 524         el->fAttNames.addElement((void *)intern(attName, status), status);
 525         el->fAttValues.addElement(attValue.clone(), status);
 526         pos = mAttrValue.end(2, status);
 527     }
 528     fPos = mEl.end(0, status);
 529     return el;
 530 }
 531
 532 //
 533 //  parseMisc
 534 //     Consume XML "Misc" [production #27]
 535 //        which is any combination of space, PI and comments
 536 //      Need to watch end-of-input because xml MISC stuff is allowed after
 537 //        the document element, so we WILL scan off the end in this function
 538 //
 539 void
 540 UXMLParser::parseMisc(UErrorCode &status)  {
 541     for (;;) {
 542         if (fPos >= mXMLPI.input().length()) {
 543             break;
 544         }
 545         if (mXMLPI.lookingAt(fPos, status)) {
 546             fPos = mXMLPI.end(status);
 547             continue;
 548         }
 549         if (mXMLSP.lookingAt(fPos, status)) {
 550             fPos = mXMLSP.end(status);
 551             continue;
 552         }
 553         if (mXMLComment.lookingAt(fPos, status)) {
 554             fPos = mXMLComment.end(status);
 555             continue;
 556         }
 557         break;
 558     }
 559 }
 560
 561 //
 562 //  Scan for document content.
 563 //
 564 UnicodeString
 565 UXMLParser::scanContent(UErrorCode &status) {
 566     UnicodeString  result;
 567     if (mXMLCharData.lookingAt(fPos, status)) {
 568         result = mXMLCharData.group((int32_t)0, status);
 569         // Normalize the new-lines.  (Before char ref substitution)
 570         mNewLineNormalizer.reset(result);
 571         result = mNewLineNormalizer.replaceAll(fOneLF, status);
 572
 573         // TODO:  handle CDATA
 574         fPos = mXMLCharData.end(0, status);
 575     }
 576
 577     return result;
 578 }
 579
 580 //
 581 //   replaceCharRefs
 582 //
 583 //      replace the char entities &lt;  &amp; &#123; &#x12ab; etc. in a string
 584 //       with the corresponding actual character.
 585 //
 586 void
 587 UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) {
 588     UnicodeString result;
 589     UnicodeString replacement;
 590     int     i;
 591
 592     mAmps.reset(s);
 593     // See the initialization for the regex matcher mAmps.
 594     //    Which entity we've matched is determined by which capture group has content,
 595     //      which is flaged by start() of that group not being -1.
 596     while (mAmps.find()) {
 597         if (mAmps.start(1, status) != -1) {
 598             replacement.setTo((UChar)x_AMP);
 599         } else if (mAmps.start(2, status) != -1) {
 600             replacement.setTo((UChar)x_LT);
 601         } else if (mAmps.start(3, status) != -1) {
 602             replacement.setTo((UChar)x_GT);
 603         } else if (mAmps.start(4, status) != -1) {
 604             replacement.setTo((UChar)x_APOS);
 605         } else if (mAmps.start(5, status) != -1) {
 606             replacement.setTo((UChar)x_QUOT);
 607         } else if (mAmps.start(6, status) != -1) {
 608             UnicodeString hexString = mAmps.group(6, status);
 609             UChar32 val = 0;
 610             for (i=0; i<hexString.length(); i++) {
 611                 val = (val << 4) + u_digit(hexString.charAt(i), 16);
 612             }
 613             // TODO:  some verification that the character is valid
 614             replacement.setTo(val);
 615         } else if (mAmps.start(7, status) != -1) {
 616             UnicodeString decimalString = mAmps.group(7, status);
 617             UChar32 val = 0;
 618             for (i=0; i<decimalString.length(); i++) {
 619                 val = val*10 + u_digit(decimalString.charAt(i), 10);
 620             }
 621             // TODO:  some verification that the character is valid
 622             replacement.setTo(val);
 623         } else {
 624             // An unrecognized &entity;  Leave it alone.
 625             //  TODO:  check that it really looks like an entity, and is not some
 626             //         random & in the text.
 627             replacement = mAmps.group((int32_t)0, status);
 628         }
 629         mAmps.appendReplacement(result, replacement, status);
 630     }
 631     mAmps.appendTail(result);
 632     s = result;
 633 }
 634
 635 void
 636 UXMLParser::error(const char *message, UErrorCode &status) {
 637     // TODO:  something better here...
 638     const UnicodeString &src=mXMLDecl.input();
 639     int  line = 0;
 640     int  ci = 0;
 641     while (ci < fPos && ci>=0) {
 642         ci = src.indexOf((UChar)0x0a, ci+1);
 643         line++;
 644     }
 645     fprintf(stderr, "Error: %s at line %d\n", message, line);
 646     if (U_SUCCESS(status)) {
 647         status = U_PARSE_ERROR;
 648     }
 649 }
 650
 651 // intern strings like in Java
 652
 653 const UnicodeString *
 654 UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) {
 655     const UHashElement *he=fNames.find(s);
 656     if(he!=NULL) {
 657         // already a known name, return its hashed key pointer
 658         return (const UnicodeString *)he->key.pointer;
 659     } else {
 660         // add this new name and return its hashed key pointer
 661         fNames.puti(s, 0, errorCode);
 662         he=fNames.find(s);
 663         return (const UnicodeString *)he->key.pointer;
 664     }
 665 }
 666
 667 const UnicodeString *
 668 UXMLParser::findName(const UnicodeString &s) const {
 669     const UHashElement *he=fNames.find(s);
 670     if(he!=NULL) {
 671         // a known name, return its hashed key pointer
 672         return (const UnicodeString *)he->key.pointer;
 673     } else {
 674         // unknown name
 675         return NULL;
 676     }
 677 }
 678
 679 // UXMLElement ------------------------------------------------------------- ***
 680
// Constructor.  Stores the parser and the name pointer as given (neither is
// copied) and creates the element without attributes, children or parent.
//
//      parser     the parser this element belongs to; getAttribute() and
//                 getChildElement() use it to look up names.
//      name       the element's tag name.
//      errorCode  in/out ICU error code, passed to the member containers.
UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) :
   fParser(parser),
   fName(name),
   fAttNames(errorCode),
   fAttValues(errorCode),
   fChildren(errorCode),
   fParent(NULL)
{
}
 690
 691 UXMLElement::~UXMLElement() {
 692     int   i;
 693     // attribute names are owned by the UXMLParser, don't delete them here
 694     for (i=fAttValues.size()-1; i>=0; i--) {
 695         delete (UObject *)fAttValues.elementAt(i);
 696     }
 697     for (i=fChildren.size()-1; i>=0; i--) {
 698         delete (UObject *)fChildren.elementAt(i);
 699     }
 700 }
 701
 702 const UnicodeString &
 703 UXMLElement::getTagName() const {
 704     return *fName;
 705 }
 706
 707 UnicodeString
 708 UXMLElement::getText(UBool recurse) const {
 709     UnicodeString text;
 710     appendText(text, recurse);
 711     return text;
 712 }
 713
 714 void
 715 UXMLElement::appendText(UnicodeString &text, UBool recurse) const {
 716     const UObject *node;
 717     int32_t i, count=fChildren.size();
 718     for(i=0; i<count; ++i) {
 719         node=(const UObject *)fChildren.elementAt(i);
 720         const UnicodeString *s=dynamic_cast<const UnicodeString *>(node);
 721         if(s!=NULL) {
 722             text.append(*s);
 723         } else if(recurse) /* must be a UXMLElement */ {
 724             ((const UXMLElement *)node)->appendText(text, recurse);
 725         }
 726     }
 727 }
 728
 729 int32_t
 730 UXMLElement::countAttributes() const {
 731     return fAttNames.size();
 732 }
 733
 734 const UnicodeString *
 735 UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const {
 736     if(0<=i && i<fAttNames.size()) {
 737         name.setTo(*(const UnicodeString *)fAttNames.elementAt(i));
 738         value.setTo(*(const UnicodeString *)fAttValues.elementAt(i));
 739         return &value; // or return (UnicodeString *)fAttValues.elementAt(i);
 740     } else {
 741         return NULL;
 742     }
 743 }
 744
 745 const UnicodeString *
 746 UXMLElement::getAttribute(const UnicodeString &name) const {
 747     // search for the attribute name by comparing the interned pointer,
 748     // not the string contents
 749     const UnicodeString *p=fParser->findName(name);
 750     if(p==NULL) {
 751         return NULL; // no such attribute seen by the parser at all
 752     }
 753
 754     int32_t i, count=fAttNames.size();
 755     for(i=0; i<count; ++i) {
 756         if(p==(const UnicodeString *)fAttNames.elementAt(i)) {
 757             return (const UnicodeString *)fAttValues.elementAt(i);
 758         }
 759     }
 760     return NULL;
 761 }
 762
 763 int32_t
 764 UXMLElement::countChildren() const {
 765     return fChildren.size();
 766 }
 767
 768 const UObject *
 769 UXMLElement::getChild(int32_t i, UXMLNodeType &type) const {
 770     if(0<=i && i<fChildren.size()) {
 771         const UObject *node=(const UObject *)fChildren.elementAt(i);
 772         if(dynamic_cast<const UXMLElement *>(node)!=NULL) {
 773             type=UXML_NODE_TYPE_ELEMENT;
 774         } else {
 775             type=UXML_NODE_TYPE_STRING;
 776         }
 777         return node;
 778     } else {
 779         return NULL;
 780     }
 781 }
 782
 783 const UXMLElement *
 784 UXMLElement::nextChildElement(int32_t &i) const {
 785     if(i<0) {
 786         return NULL;
 787     }
 788
 789     const UObject *node;
 790     int32_t count=fChildren.size();
 791     while(i<count) {
 792         node=(const UObject *)fChildren.elementAt(i++);
 793         const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node);
 794         if(elem!=NULL) {
 795             return elem;
 796         }
 797     }
 798     return NULL;
 799 }
 800
 801 const UXMLElement *
 802 UXMLElement::getChildElement(const UnicodeString &name) const {
 803     // search for the element name by comparing the interned pointer,
 804     // not the string contents
 805     const UnicodeString *p=fParser->findName(name);
 806     if(p==NULL) {
 807         return NULL; // no such element seen by the parser at all
 808     }
 809
 810     const UObject *node;
 811     int32_t i, count=fChildren.size();
 812     for(i=0; i<count; ++i) {
 813         node=(const UObject *)fChildren.elementAt(i);
 814         const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node);
 815         if(elem!=NULL) {
 816             if(p==elem->fName) {
 817                 return elem;
 818             }
 819         }
 820     }
 821     return NULL;
 822 }
 823
 824 U_NAMESPACE_END
 825
#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION */
 827