icuSources/tools/toolutil/xmlparser.cpp

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 2004-2006, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *   file name:  xmlparser.cpp
   9 *   encoding:   US-ASCII
  10 *   tab size:   8 (not used)
  11 *   indentation:4
  12 *
  13 *   created on: 2004jul21
  14 *   created by: Andy Heninger
  15 */
  16
  17 #include <stdio.h>
  18 #include "unicode/uchar.h"
  19 #include "unicode/ucnv.h"
  20 #include "unicode/regex.h"
  21 #include "filestrm.h"
  22 #include "xmlparser.h"
  23
  24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION
  25
  26 // character constants
  27 enum {
  28     x_QUOT=0x22,
  29     x_AMP=0x26,
  30     x_APOS=0x27,
  31     x_LT=0x3c,
  32     x_GT=0x3e,
  33     x_l=0x6c
  34 };
  35
  36 #define  XML_SPACES "[ \\u0009\\u000d\\u000a]"
  37
  38 // XML #4
  39 #define  XML_NAMESTARTCHAR "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \
  40                     "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]" \
  41                     "[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]" \
  42                     "[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]"
  43
  44 //  XML #5
  45 #define  XML_NAMECHAR "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]"
  46
  47 //  XML #6
  48 #define  XML_NAME    XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*"
  49
  50 U_NAMESPACE_BEGIN
  51
  52 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser)
  53 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement)
  54
  55 //
  56 //   UXMLParser constructor.   Mostly just initializes the ICU regexes that are
  57 //                             used for parsing.
  58 //
  59 UXMLParser::UXMLParser(UErrorCode &status) :
  60       //  XML Declaration.  XML Production #23.
  61       //      example:  "<?xml version=1.0 encoding="utf-16" ?>
  62       //      This is a sloppy implementation - just look for the leading <?xml and the closing ?>
  63       //            allow for a possible leading BOM.
  64       mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>"), 0, status),
  65
  66       //  XML Comment   production #15
  67       //     example:  "<!-- whatever -->
  68       //       note, does not detect an illegal "--" within comments
  69       mXMLComment(UnicodeString("(?s)<!--.+?-->"), 0, status),
  70
  71       //  XML Spaces
  72       //      production [3]
  73       mXMLSP(UnicodeString(XML_SPACES "+"), 0, status),
  74
  75       //  XML Doctype decl  production #28
  76       //     example   "<!DOCTYPE foo SYSTEM "somewhere" >
  77       //    TODO:  we don't actually parse the DOCTYPE or internal subsets.
  78       //           Some internal dtd subsets could confuse this simple-minded
  79       //           attempt at skipping over them.
  80       mXMLDoctype(UnicodeString("(?s)<!DOCTYPE.+?>"), 0, status),
  81
  82       //  XML PI     production #16
  83       //     example   "<?target stuff?>
  84       mXMLPI(UnicodeString("(?s)<\\?.+?\\?>"), 0, status),
  85
  86       //  XML Element Start   Productions #40, #41
  87       //          example   <foo att1='abc'  att2="d e f" >
  88       //      capture #1:  the tag name
  89       //
  90       mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
  91           "(?:"
  92                 XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
  93                 "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
  94           ")*"                                                             //   * for zero or more attributes.
  95           XML_SPACES "*?>"), 0, status),                               // match " >"
  96
  97       //  XML Element End     production #42
  98       //     example   </foo>
  99       mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>"), 0, status),
 100
 101       // XML Element Empty    production #44
 102       //     example   <foo att1="abc"   att2="d e f" />
 103       mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")"                                 // match  "<tag_name"
 104           "(?:"
 105                 XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
 106                 "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
 107           ")*"                                                             //   * for zero or more attributes.
 108           XML_SPACES "*?/>"), 0, status),                              // match " />"
 109
 110
 111       // XMLCharData.  Everything but '<'.  Note that & will be dealt with later.
 112       mXMLCharData(UnicodeString("(?s)[^<]*"), 0, status),
 113
 114       // Attribute name = "value".  XML Productions 10, 40/41
 115       //  Capture group 1 is name,
 116       //                2 is the attribute value, including the quotes.
 117       //
 118       //   Note that attributes are scanned twice.  The first time is with
 119       //        the regex for an entire element start.  There, the attributes
 120       //        are checked syntactically, but not separted out one by one.
 121       //        Here, we match a single attribute, and make its name and
 122       //        attribute value available to the parser code.
 123       mAttrValue(UnicodeString(XML_SPACES "+("  XML_NAME ")"  XML_SPACES "*=" XML_SPACES "*"
 124          "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"), 0, status),
 125
 126
 127       mAttrNormalizer(UnicodeString(XML_SPACES), 0, status),
 128
 129       // Match any of the new-line sequences in content.
 130       //   All are changed to \u000a.
 131       mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028"), 0, status),
 132
 133       // & char references
 134       //   We will figure out what we've got based on which capture group has content.
 135       //   The last one is a catchall for unrecognized entity references..
 136       //             1     2     3      4      5           6                    7          8
 137       mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"),
 138                 0, status),
 139
 140       fNames(status),
 141       fElementStack(status),
 142       fOneLF((UChar)0x0a)        // Plain new-line string, used in new line normalization.
 143       {
 144       }
 145
 146 UXMLParser *
 147 UXMLParser::createParser(UErrorCode &errorCode) {
 148     if (U_FAILURE(errorCode)) {
 149         return NULL;
 150     } else {
 151         return new UXMLParser(errorCode);
 152     }
 153 }
 154
 155 UXMLParser::~UXMLParser() {}
 156
 157 UXMLElement *
 158 UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) {
 159     char bytes[4096], charsetBuffer[100];
 160     FileStream *f;
 161     const char *charset, *pb;
 162     UnicodeString src;
 163     UConverter *cnv;
 164     UChar *buffer, *pu;
 165     int32_t fileLength, bytesLength, length, capacity;
 166     UBool flush;
 167
 168     if(U_FAILURE(errorCode)) {
 169         return NULL;
 170     }
 171
 172     f=T_FileStream_open(filename, "rb");
 173     if(f==NULL) {
 174         errorCode=U_FILE_ACCESS_ERROR;
 175         return NULL;
 176     }
 177
 178     bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
 179     if(bytesLength<(int32_t)sizeof(bytes)) {
 180         // we have already read the entire file
 181         fileLength=bytesLength;
 182     } else {
 183         // get the file length
 184         fileLength=T_FileStream_size(f);
 185     }
 186
 187     /*
 188      * get the charset:
 189      * 1. Unicode signature
 190      * 2. treat as ISO-8859-1 and read XML encoding="charser"
 191      * 3. default to UTF-8
 192      */
 193     charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode);
 194     if(U_SUCCESS(errorCode) && charset!=NULL) {
 195         // open converter according to Unicode signature
 196         cnv=ucnv_open(charset, &errorCode);
 197     } else {
 198         // read as Latin-1 and parse the XML declaration and encoding
 199         cnv=ucnv_open("ISO-8859-1", &errorCode);
 200         if(U_FAILURE(errorCode)) {
 201             // unexpected error opening Latin-1 converter
 202             goto exit;
 203         }
 204
 205         buffer=src.getBuffer(bytesLength);
 206         if(buffer==NULL) {
 207             // unexpected failure to reserve some string capacity
 208             errorCode=U_MEMORY_ALLOCATION_ERROR;
 209             goto exit;
 210         }
 211         pb=bytes;
 212         pu=buffer;
 213         ucnv_toUnicode(
 214             cnv,
 215             &pu, buffer+src.getCapacity(),
 216             &pb, bytes+bytesLength,
 217             NULL, TRUE, &errorCode);
 218         src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
 219         ucnv_close(cnv);
 220         cnv=NULL;
 221         if(U_FAILURE(errorCode)) {
 222             // unexpected error in conversion from Latin-1
 223             src.remove();
 224             goto exit;
 225         }
 226
 227         // parse XML declaration
 228         if(mXMLDecl.reset(src).lookingAt(0, errorCode)) {
 229             int32_t declEnd=mXMLDecl.end(errorCode);
 230             // go beyond <?xml
 231             int32_t pos=src.indexOf((UChar)x_l)+1;
 232
 233             mAttrValue.reset(src);
 234             while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) {  // loop runs once per attribute on this element.
 235                 UnicodeString attName  = mAttrValue.group(1, errorCode);
 236                 UnicodeString attValue = mAttrValue.group(2, errorCode);
 237
 238                 // Trim the quotes from the att value.  These are left over from the original regex
 239                 //   that parsed the attribue, which couldn't conveniently strip them.
 240                 attValue.remove(0,1);                    // one char from the beginning
 241                 attValue.truncate(attValue.length()-1);  // and one from the end.
 242
 243                 if(attName==UNICODE_STRING("encoding", 8)) {
 244                     length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer));
 245                     charset=charsetBuffer;
 246                     break;
 247                 }
 248                 pos = mAttrValue.end(2, errorCode);
 249             }
 250
 251             if(charset==NULL) {
 252                 // default to UTF-8
 253                 charset="UTF-8";
 254             }
 255             cnv=ucnv_open(charset, &errorCode);
 256         }
 257     }
 258
 259     if(U_FAILURE(errorCode)) {
 260         // unable to open the converter
 261         goto exit;
 262     }
 263
 264     // convert the file contents
 265     capacity=fileLength;        // estimated capacity
 266     src.getBuffer(capacity);
 267     src.releaseBuffer(0);       // zero length
 268     flush=FALSE;
 269     for(;;) {
 270         // convert contents of bytes[bytesLength]
 271         pb=bytes;
 272         for(;;) {
 273             length=src.length();
 274             buffer=src.getBuffer(capacity);
 275             if(buffer==NULL) {
 276                 // unexpected failure to reserve some string capacity
 277                 errorCode=U_MEMORY_ALLOCATION_ERROR;
 278                 goto exit;
 279             }
 280
 281             pu=buffer+length;
 282             ucnv_toUnicode(
 283                 cnv, &pu, buffer+src.getCapacity(),
 284                 &pb, bytes+bytesLength,
 285                 NULL, FALSE, &errorCode);
 286             src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
 287             if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
 288                 errorCode=U_ZERO_ERROR;
 289                 capacity=(3*src.getCapacity())/2; // increase capacity by 50%
 290             } else {
 291                 break;
 292             }
 293         }
 294
 295         if(U_FAILURE(errorCode)) {
 296             break; // conversion error
 297         }
 298
 299         if(flush) {
 300             break; // completely converted the file
 301         }
 302
 303         // read next block
 304         bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
 305         if(bytesLength==0) {
 306             // reached end of file, convert once more to flush the converter
 307             flush=TRUE;
 308         }
 309     };
 310
 311 exit:
 312     ucnv_close(cnv);
 313     T_FileStream_close(f);
 314
 315     if(U_SUCCESS(errorCode)) {
 316         return parse(src, errorCode);
 317     } else {
 318         return NULL;
 319     }
 320 }
 321
 322 UXMLElement *
 323 UXMLParser::parse(const UnicodeString &src, UErrorCode &status) {
 324     if(U_FAILURE(status)) {
 325         return NULL;
 326     }
 327
 328     UXMLElement   *root = NULL;
 329     fPos = 0; // TODO use just a local pos variable and pass it into functions
 330               // where necessary?
 331
 332     // set all matchers to work on the input string
 333     mXMLDecl.reset(src);
 334     mXMLComment.reset(src);
 335     mXMLSP.reset(src);
 336     mXMLDoctype.reset(src);
 337     mXMLPI.reset(src);
 338     mXMLElemStart.reset(src);
 339     mXMLElemEnd.reset(src);
 340     mXMLElemEmpty.reset(src);
 341     mXMLCharData.reset(src);
 342     mAttrValue.reset(src);
 343     mAttrNormalizer.reset(src);
 344     mNewLineNormalizer.reset(src);
 345     mAmps.reset(src);
 346
 347     // Consume the XML Declaration, if present.
 348     if (mXMLDecl.lookingAt(fPos, status)) {
 349         fPos = mXMLDecl.end(status);
 350     }
 351
 352     // Consume "misc" [XML production 27] appearing before DocType
 353     parseMisc(status);
 354
 355     // Consume a DocType declaration, if present.
 356     if (mXMLDoctype.lookingAt(fPos, status)) {
 357         fPos = mXMLDoctype.end(status);
 358     }
 359
 360     // Consume additional "misc" [XML production 27] appearing after the DocType
 361     parseMisc(status);
 362
 363     // Get the root element
 364     if (mXMLElemEmpty.lookingAt(fPos, status)) {
 365         // Root is an empty element (no nested elements or content)
 366         root = createElement(mXMLElemEmpty, status);
 367         fPos = mXMLElemEmpty.end(status);
 368     } else {
 369         if (mXMLElemStart.lookingAt(fPos, status) == FALSE) {
 370             error("Root Element expected", status);
 371             goto errorExit;
 372         }
 373         root = createElement(mXMLElemStart, status);
 374         UXMLElement  *el = root;
 375
 376         //
 377         // This is the loop that consumes the root element of the document,
 378         //      including all nested content.   Nested elements are handled by
 379         //      explicit pushes/pops of the element stack; there is no recursion
 380         //      in the control flow of this code.
 381         //      "el" always refers to the current element, the one to which content
 382         //      is being added.  It is above the top of the element stack.
 383         for (;;) {
 384             // Nested Element Start
 385             if (mXMLElemStart.lookingAt(fPos, status)) {
 386                 UXMLElement *t = createElement(mXMLElemStart, status);
 387                 el->fChildren.addElement(t, status);
 388                 t->fParent = el;
 389                 fElementStack.push(el, status);
 390                 el = t;
 391                 continue;
 392             }
 393
 394             // Text Content.  String is concatenated onto the current node's content,
 395             //                but only if it contains something other than spaces.
 396             UnicodeString s = scanContent(status);
 397             if (s.length() > 0) {
 398                 mXMLSP.reset(s);
 399                 if (mXMLSP.matches(status) == FALSE) {
 400                     // This chunk of text contains something other than just
 401                     //  white space. Make a child node for it.
 402                     replaceCharRefs(s, status);
 403                     el->fChildren.addElement(s.clone(), status);
 404                 }
 405                 mXMLSP.reset(src);    // The matchers need to stay set to the main input string.
 406                 continue;
 407             }
 408
 409             // Comments.  Discard.
 410             if (mXMLComment.lookingAt(fPos, status)) {
 411                 fPos = mXMLComment.end(status);
 412                 continue;
 413             }
 414
 415             // PIs.  Discard.
 416             if (mXMLPI.lookingAt(fPos, status)) {
 417                 fPos = mXMLPI.end(status);
 418                 continue;
 419             }
 420
 421             // Element End
 422             if (mXMLElemEnd.lookingAt(fPos, status)) {
 423                 fPos = mXMLElemEnd.end(0, status);
 424                 const UnicodeString name = mXMLElemEnd.group(1, status);
 425                 if (name != *el->fName) {
 426                     error("Element start / end tag mismatch", status);
 427                     goto errorExit;
 428                 }
 429                 if (fElementStack.empty()) {
 430                     // Close of the root element.  We're done with the doc.
 431                     el = NULL;
 432                     break;
 433                 }
 434                 el = (UXMLElement *)fElementStack.pop();
 435                 continue;
 436             }
 437
 438             // Empty Element.  Stored as a child of the current element, but not stacked.
 439             if (mXMLElemEmpty.lookingAt(fPos, status)) {
 440                 UXMLElement *t = createElement(mXMLElemEmpty, status);
 441                 el->fChildren.addElement(t, status);
 442                 continue;
 443             }
 444
 445             // Hit something within the document that doesn't match anything.
 446             //   It's an error.
 447             error("Unrecognized markup", status);
 448             break;
 449         }
 450
 451         if (el != NULL || !fElementStack.empty()) {
 452             // We bailed out early, for some reason.
 453             error("Root element not closed.", status);
 454             goto errorExit;
 455         }
 456     }
 457
 458     // Root Element parse is complete.
 459     // Consume the annoying xml "Misc" that can appear at the end of the doc.
 460     parseMisc(status);
 461
 462     // We should have reached the end of the input
 463     if (fPos != src.length()) {
 464         error("Extra content at the end of the document", status);
 465         goto errorExit;
 466     }
 467
 468     // Success!
 469     return root;
 470
 471 errorExit:
 472     delete root;
 473     return NULL;
 474 }
 475
 476 //
 477 //  createElement
 478 //      We've just matched an element start tag.  Create and fill in a UXMLElement object
 479 //      for it.
 480 //
 481 UXMLElement *
 482 UXMLParser::createElement(RegexMatcher  &mEl, UErrorCode &status) {
 483     // First capture group is the element's name.
 484     UXMLElement *el = new UXMLElement(this, intern(mEl.group(1, status), status), status);
 485
 486     // Scan for attributes.
 487     int32_t   pos = mEl.end(1, status);  // The position after the end of the tag name
 488
 489     while (mAttrValue.lookingAt(pos, status)) {  // loop runs once per attribute on this element.
 490         UnicodeString attName  = mAttrValue.group(1, status);
 491         UnicodeString attValue = mAttrValue.group(2, status);
 492
 493         // Trim the quotes from the att value.  These are left over from the original regex
 494         //   that parsed the attribue, which couldn't conveniently strip them.
 495         attValue.remove(0,1);                    // one char from the beginning
 496         attValue.truncate(attValue.length()-1);  // and one from the end.
 497
 498         // XML Attribue value normalization.
 499         // This is one of the really screwy parts of the XML spec.
 500         // See http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize
 501         // Note that non-validating parsers must treat all entities as type CDATA
 502         //   which simplifies things some.
 503
 504         // Att normalization step 1:  normalize any newlines in the attribute value
 505         mNewLineNormalizer.reset(attValue);
 506         attValue = mNewLineNormalizer.replaceAll(fOneLF, status);
 507
 508         // Next change all xml white space chars to plain \u0020 spaces.
 509         mAttrNormalizer.reset(attValue);
 510         UnicodeString oneSpace((UChar)0x0020);
 511         attValue = mAttrNormalizer.replaceAll(oneSpace, status);
 512
 513         // Replace character entities.
 514         replaceCharRefs(attValue, status);
 515
 516         // Save the attribute name and value in our document structure.
 517         el->fAttNames.addElement((void *)intern(attName, status), status);
 518         el->fAttValues.addElement(attValue.clone(), status);
 519         pos = mAttrValue.end(2, status);
 520     }
 521     fPos = mEl.end(0, status);
 522     return el;
 523 }
 524
 525 //
 526 //  parseMisc
 527 //     Consume XML "Misc" [production #27]
 528 //        which is any combination of space, PI and comments
 529 //      Need to watch end-of-input because xml MISC stuff is allowed after
 530 //        the document element, so we WILL scan off the end in this function
 531 //
 532 void
 533 UXMLParser::parseMisc(UErrorCode &status)  {
 534     for (;;) {
 535         if (fPos >= mXMLPI.input().length()) {
 536             break;
 537         }
 538         if (mXMLPI.lookingAt(fPos, status)) {
 539             fPos = mXMLPI.end(status);
 540             continue;
 541         }
 542         if (mXMLSP.lookingAt(fPos, status)) {
 543             fPos = mXMLSP.end(status);
 544             continue;
 545         }
 546         if (mXMLComment.lookingAt(fPos, status)) {
 547             fPos = mXMLComment.end(status);
 548             continue;
 549         }
 550         break;
 551     }
 552 }
 553
 554 //
 555 //  Scan for document content.
 556 //
 557 UnicodeString
 558 UXMLParser::scanContent(UErrorCode &status) {
 559     UnicodeString  result;
 560     if (mXMLCharData.lookingAt(fPos, status)) {
 561         result = mXMLCharData.group(0, status);
 562         // Normalize the new-lines.  (Before char ref substitution)
 563         mNewLineNormalizer.reset(result);
 564         result = mNewLineNormalizer.replaceAll(fOneLF, status);
 565
 566         // TODO:  handle CDATA
 567         fPos = mXMLCharData.end(0, status);
 568     }
 569
 570     return result;
 571 }
 572
 573 //
 574 //   replaceCharRefs
 575 //
 576 //      replace the char entities &lt;  &amp; &#123; &#x12ab; etc. in a string
 577 //       with the corresponding actual character.
 578 //
 579 void
 580 UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) {
 581     UnicodeString result;
 582     UnicodeString replacement;
 583     int     i;
 584
 585     mAmps.reset(s);
 586     // See the initialization for the regex matcher mAmps.
 587     //    Which entity we've matched is determined by which capture group has content,
 588     //      which is flaged by start() of that group not being -1.
 589     while (mAmps.find()) {
 590         if (mAmps.start(1, status) != -1) {
 591             replacement.setTo((UChar)x_AMP);
 592         } else if (mAmps.start(2, status) != -1) {
 593             replacement.setTo((UChar)x_LT);
 594         } else if (mAmps.start(3, status) != -1) {
 595             replacement.setTo((UChar)x_GT);
 596         } else if (mAmps.start(4, status) != -1) {
 597             replacement.setTo((UChar)x_APOS);
 598         } else if (mAmps.start(5, status) != -1) {
 599             replacement.setTo((UChar)x_QUOT);
 600         } else if (mAmps.start(6, status) != -1) {
 601             UnicodeString hexString = mAmps.group(6, status);
 602             UChar32 val = 0;
 603             for (i=0; i<hexString.length(); i++) {
 604                 val = (val << 4) + u_digit(hexString.charAt(i), 16);
 605             }
 606             // TODO:  some verification that the character is valid
 607             replacement.setTo(val);
 608         } else if (mAmps.start(7, status) != -1) {
 609             UnicodeString decimalString = mAmps.group(7, status);
 610             UChar32 val = 0;
 611             for (i=0; i<decimalString.length(); i++) {
 612                 val = val*10 + u_digit(decimalString.charAt(i), 10);
 613             }
 614             // TODO:  some verification that the character is valid
 615             replacement.setTo(val);
 616         } else {
 617             // An unrecognized &entity;  Leave it alone.
 618             //  TODO:  check that it really looks like an entity, and is not some
 619             //         random & in the text.
 620             replacement = mAmps.group(0, status);
 621         }
 622         mAmps.appendReplacement(result, replacement, status);
 623     }
 624     mAmps.appendTail(result);
 625     s = result;
 626 }
 627
 628 void
 629 UXMLParser::error(const char *message, UErrorCode &status) {
 630     // TODO:  something better here...
 631     const UnicodeString &src=mXMLDecl.input();
 632     int  line = 0;
 633     int  ci = 0;
 634     while (ci < fPos && ci>=0) {
 635         ci = src.indexOf((UChar)0x0a, ci+1);
 636         line++;
 637     }
 638     fprintf(stderr, "Error: %s at line %d\n", message, line);
 639     if (U_SUCCESS(status)) {
 640         status = U_PARSE_ERROR;
 641     }
 642 }
 643
 644 // intern strings like in Java
 645
 646 const UnicodeString *
 647 UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) {
 648     const UHashElement *he=fNames.find(s);
 649     if(he!=NULL) {
 650         // already a known name, return its hashed key pointer
 651         return (const UnicodeString *)he->key.pointer;
 652     } else {
 653         // add this new name and return its hashed key pointer
 654         fNames.puti(s, 0, errorCode);
 655         he=fNames.find(s);
 656         return (const UnicodeString *)he->key.pointer;
 657     }
 658 }
 659
 660 const UnicodeString *
 661 UXMLParser::findName(const UnicodeString &s) const {
 662     const UHashElement *he=fNames.find(s);
 663     if(he!=NULL) {
 664         // a known name, return its hashed key pointer
 665         return (const UnicodeString *)he->key.pointer;
 666     } else {
 667         // unknown name
 668         return NULL;
 669     }
 670 }
 671
 672 // UXMLElement ------------------------------------------------------------- ***
 673
 674 UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) :
 675    fParser(parser),
 676    fName(name),
 677    fAttNames(errorCode),
 678    fAttValues(errorCode),
 679    fChildren(errorCode),
 680    fParent(NULL)
 681 {
 682 }
 683
 684 UXMLElement::~UXMLElement() {
 685     int   i;
 686     // attribute names are owned by the UXMLParser, don't delete them here
 687     for (i=fAttValues.size()-1; i>=0; i--) {
 688         delete (UObject *)fAttValues.elementAt(i);
 689     }
 690     for (i=fChildren.size()-1; i>=0; i--) {
 691         delete (UObject *)fChildren.elementAt(i);
 692     }
 693 }
 694
 695 const UnicodeString &
 696 UXMLElement::getTagName() const {
 697     return *fName;
 698 }
 699
 700 UnicodeString
 701 UXMLElement::getText(UBool recurse) const {
 702     UnicodeString text;
 703     appendText(text, recurse);
 704     return text;
 705 }
 706
 707 void
 708 UXMLElement::appendText(UnicodeString &text, UBool recurse) const {
 709     const UObject *node;
 710     int32_t i, count=fChildren.size();
 711     for(i=0; i<count; ++i) {
 712         node=(const UObject *)fChildren.elementAt(i);
 713         if(node->getDynamicClassID()==UnicodeString::getStaticClassID()) {
 714             text.append(*(const UnicodeString *)node);
 715         } else if(recurse) /* must be a UXMLElement */ {
 716             ((const UXMLElement *)node)->appendText(text, recurse);
 717         }
 718     }
 719 }
 720
 721 int32_t
 722 UXMLElement::countAttributes() const {
 723     return fAttNames.size();
 724 }
 725
 726 const UnicodeString *
 727 UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const {
 728     if(0<=i && i<fAttNames.size()) {
 729         name.setTo(*(const UnicodeString *)fAttNames.elementAt(i));
 730         value.setTo(*(const UnicodeString *)fAttValues.elementAt(i));
 731         return &value; // or return (UnicodeString *)fAttValues.elementAt(i);
 732     } else {
 733         return NULL;
 734     }
 735 }
 736
 737 const UnicodeString *
 738 UXMLElement::getAttribute(const UnicodeString &name) const {
 739     // search for the attribute name by comparing the interned pointer,
 740     // not the string contents
 741     const UnicodeString *p=fParser->findName(name);
 742     if(p==NULL) {
 743         return NULL; // no such attribute seen by the parser at all
 744     }
 745
 746     int32_t i, count=fAttNames.size();
 747     for(i=0; i<count; ++i) {
 748         if(p==(const UnicodeString *)fAttNames.elementAt(i)) {
 749             return (const UnicodeString *)fAttValues.elementAt(i);
 750         }
 751     }
 752     return NULL;
 753 }
 754
 755 int32_t
 756 UXMLElement::countChildren() const {
 757     return fChildren.size();
 758 }
 759
 760 const UObject *
 761 UXMLElement::getChild(int32_t i, UXMLNodeType &type) const {
 762     if(0<=i && i<fChildren.size()) {
 763         const UObject *node=(const UObject *)fChildren.elementAt(i);
 764         if(node->getDynamicClassID()==UXMLElement::getStaticClassID()) {
 765             type=UXML_NODE_TYPE_ELEMENT;
 766         } else {
 767             type=UXML_NODE_TYPE_STRING;
 768         }
 769         return node;
 770     } else {
 771         return NULL;
 772     }
 773 }
 774
 775 const UXMLElement *
 776 UXMLElement::nextChildElement(int32_t &i) const {
 777     if(i<0) {
 778         return NULL;
 779     }
 780
 781     const UObject *node;
 782     int32_t count=fChildren.size();
 783     while(i<count) {
 784         node=(const UObject *)fChildren.elementAt(i++);
 785         // TODO: see if ICU can use C++ instanceof instead of its own poor man's RTTI
 786         // if(node instanceof UXMLElement) {
 787         if(node->getDynamicClassID()==UXMLElement::getStaticClassID()) {
 788             return (const UXMLElement *)node;
 789         }
 790     }
 791     return NULL;
 792 }
 793
 794 const UXMLElement *
 795 UXMLElement::getChildElement(const UnicodeString &name) const {
 796     // search for the element name by comparing the interned pointer,
 797     // not the string contents
 798     const UnicodeString *p=fParser->findName(name);
 799     if(p==NULL) {
 800         return NULL; // no such element seen by the parser at all
 801     }
 802
 803     const UObject *node;
 804     int32_t i, count=fChildren.size();
 805     for(i=0; i<count; ++i) {
 806         node=(const UObject *)fChildren.elementAt(i);
 807         if(node->getDynamicClassID()==UXMLElement::getStaticClassID()) {
 808             const UXMLElement *elem=(const UXMLElement *)node;
 809             if(p==elem->fName) {
 810                 return elem;
 811             }
 812         }
 813     }
 814     return NULL;
 815 }
 816
 817 U_NAMESPACE_END
 818
 819 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
 820