]> git.saurik.com Git - apple/icu.git/blame - icuSources/tools/toolutil/xmlparser.cpp
ICU-8.11.4.tar.gz
[apple/icu.git] / icuSources / tools / toolutil / xmlparser.cpp
CommitLineData
73c04bcf
A
1/*
2*******************************************************************************
3*
4* Copyright (C) 2004-2006, International Business Machines
5* Corporation and others. All Rights Reserved.
6*
7*******************************************************************************
8* file name: xmlparser.cpp
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 2004jul21
14* created by: Andy Heninger
15*/
16
17#include <stdio.h>
18#include "unicode/uchar.h"
19#include "unicode/ucnv.h"
20#include "unicode/regex.h"
21#include "filestrm.h"
22#include "xmlparser.h"
23
24#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION
25
// Character constants, spelled as code point values.
// The names describe the ASCII character each value stands for.
enum {
    x_QUOT=0x22,    // '"'
    x_AMP=0x26,     // '&'
    x_APOS=0x27,    // '\''
    x_LT=0x3c,      // '<'
    x_GT=0x3e,      // '>'
    x_l=0x6c        // 'l'; parseFile() searches for it to step past "<?xml"
};

// Regex set matching one XML white space character (space, tab, CR, LF).
// The UXMLParser constructor labels this XML production [3].
#define XML_SPACES "[ \\u0009\\u000d\\u000a]"

// XML #4
// Regex set matching a character that may start an XML name.
#define XML_NAMESTARTCHAR "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \
    "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]" \
    "[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]" \
    "[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]"

// XML #5
// Regex set matching any character allowed inside an XML name: the name start
// characters plus '-', '.', digits, U+00B7 and the two listed ranges.
#define XML_NAMECHAR "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]"

// XML #6
// Regex fragment matching a complete XML name: one start character followed by
// zero or more name characters.  It contains no capture groups of its own.
#define XML_NAME XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*"
49
50U_NAMESPACE_BEGIN
51
// Class-ID support for both classes.  UXMLElement relies on it below, where
// getDynamicClassID()/getStaticClassID() are compared to tell child elements
// apart from child strings.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser)
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement)
54
55//
56// UXMLParser constructor. Mostly just initializes the ICU regexes that are
57// used for parsing.
58//
59UXMLParser::UXMLParser(UErrorCode &status) :
60 // XML Declaration. XML Production #23.
61 // example: "<?xml version=1.0 encoding="utf-16" ?>
62 // This is a sloppy implementation - just look for the leading <?xml and the closing ?>
63 // allow for a possible leading BOM.
64 mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>"), 0, status),
65
66 // XML Comment production #15
67 // example: "<!-- whatever -->
68 // note, does not detect an illegal "--" within comments
69 mXMLComment(UnicodeString("(?s)<!--.+?-->"), 0, status),
70
71 // XML Spaces
72 // production [3]
73 mXMLSP(UnicodeString(XML_SPACES "+"), 0, status),
74
75 // XML Doctype decl production #28
76 // example "<!DOCTYPE foo SYSTEM "somewhere" >
77 // TODO: we don't actually parse the DOCTYPE or internal subsets.
78 // Some internal dtd subsets could confuse this simple-minded
79 // attempt at skipping over them.
80 mXMLDoctype(UnicodeString("(?s)<!DOCTYPE.+?>"), 0, status),
81
82 // XML PI production #16
83 // example "<?target stuff?>
84 mXMLPI(UnicodeString("(?s)<\\?.+?\\?>"), 0, status),
85
86 // XML Element Start Productions #40, #41
87 // example <foo att1='abc' att2="d e f" >
88 // capture #1: the tag name
89 //
90 mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name"
91 "(?:"
92 XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = "
93 "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"'
94 ")*" // * for zero or more attributes.
95 XML_SPACES "*?>"), 0, status), // match " >"
96
97 // XML Element End production #42
98 // example </foo>
99 mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>"), 0, status),
100
101 // XML Element Empty production #44
102 // example <foo att1="abc" att2="d e f" />
103 mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name"
104 "(?:"
105 XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = "
106 "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"'
107 ")*" // * for zero or more attributes.
108 XML_SPACES "*?/>"), 0, status), // match " />"
109
110
111 // XMLCharData. Everything but '<'. Note that & will be dealt with later.
112 mXMLCharData(UnicodeString("(?s)[^<]*"), 0, status),
113
114 // Attribute name = "value". XML Productions 10, 40/41
115 // Capture group 1 is name,
116 // 2 is the attribute value, including the quotes.
117 //
118 // Note that attributes are scanned twice. The first time is with
119 // the regex for an entire element start. There, the attributes
120 // are checked syntactically, but not separted out one by one.
121 // Here, we match a single attribute, and make its name and
122 // attribute value available to the parser code.
123 mAttrValue(UnicodeString(XML_SPACES "+(" XML_NAME ")" XML_SPACES "*=" XML_SPACES "*"
124 "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"), 0, status),
125
126
127 mAttrNormalizer(UnicodeString(XML_SPACES), 0, status),
128
129 // Match any of the new-line sequences in content.
130 // All are changed to \u000a.
131 mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028"), 0, status),
132
133 // & char references
134 // We will figure out what we've got based on which capture group has content.
135 // The last one is a catchall for unrecognized entity references..
136 // 1 2 3 4 5 6 7 8
137 mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"),
138 0, status),
139
140 fNames(status),
141 fElementStack(status),
142 fOneLF((UChar)0x0a) // Plain new-line string, used in new line normalization.
143 {
144 }
145
146UXMLParser *
147UXMLParser::createParser(UErrorCode &errorCode) {
148 if (U_FAILURE(errorCode)) {
149 return NULL;
150 } else {
151 return new UXMLParser(errorCode);
152 }
153}
154
// Destructor.  The body is empty; no explicit cleanup is performed here.
UXMLParser::~UXMLParser() {}
156
157UXMLElement *
158UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) {
159 char bytes[4096], charsetBuffer[100];
160 FileStream *f;
161 const char *charset, *pb;
162 UnicodeString src;
163 UConverter *cnv;
164 UChar *buffer, *pu;
165 int32_t fileLength, bytesLength, length, capacity;
166 UBool flush;
167
168 if(U_FAILURE(errorCode)) {
169 return NULL;
170 }
171
172 f=T_FileStream_open(filename, "rb");
173 if(f==NULL) {
174 errorCode=U_FILE_ACCESS_ERROR;
175 return NULL;
176 }
177
178 bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
179 if(bytesLength<(int32_t)sizeof(bytes)) {
180 // we have already read the entire file
181 fileLength=bytesLength;
182 } else {
183 // get the file length
184 fileLength=T_FileStream_size(f);
185 }
186
187 /*
188 * get the charset:
189 * 1. Unicode signature
190 * 2. treat as ISO-8859-1 and read XML encoding="charser"
191 * 3. default to UTF-8
192 */
193 charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode);
194 if(U_SUCCESS(errorCode) && charset!=NULL) {
195 // open converter according to Unicode signature
196 cnv=ucnv_open(charset, &errorCode);
197 } else {
198 // read as Latin-1 and parse the XML declaration and encoding
199 cnv=ucnv_open("ISO-8859-1", &errorCode);
200 if(U_FAILURE(errorCode)) {
201 // unexpected error opening Latin-1 converter
202 goto exit;
203 }
204
205 buffer=src.getBuffer(bytesLength);
206 if(buffer==NULL) {
207 // unexpected failure to reserve some string capacity
208 errorCode=U_MEMORY_ALLOCATION_ERROR;
209 goto exit;
210 }
211 pb=bytes;
212 pu=buffer;
213 ucnv_toUnicode(
214 cnv,
215 &pu, buffer+src.getCapacity(),
216 &pb, bytes+bytesLength,
217 NULL, TRUE, &errorCode);
218 src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
219 ucnv_close(cnv);
220 cnv=NULL;
221 if(U_FAILURE(errorCode)) {
222 // unexpected error in conversion from Latin-1
223 src.remove();
224 goto exit;
225 }
226
227 // parse XML declaration
228 if(mXMLDecl.reset(src).lookingAt(0, errorCode)) {
229 int32_t declEnd=mXMLDecl.end(errorCode);
230 // go beyond <?xml
231 int32_t pos=src.indexOf((UChar)x_l)+1;
232
233 mAttrValue.reset(src);
234 while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) { // loop runs once per attribute on this element.
235 UnicodeString attName = mAttrValue.group(1, errorCode);
236 UnicodeString attValue = mAttrValue.group(2, errorCode);
237
238 // Trim the quotes from the att value. These are left over from the original regex
239 // that parsed the attribue, which couldn't conveniently strip them.
240 attValue.remove(0,1); // one char from the beginning
241 attValue.truncate(attValue.length()-1); // and one from the end.
242
243 if(attName==UNICODE_STRING("encoding", 8)) {
244 length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer));
245 charset=charsetBuffer;
246 break;
247 }
248 pos = mAttrValue.end(2, errorCode);
249 }
250
251 if(charset==NULL) {
252 // default to UTF-8
253 charset="UTF-8";
254 }
255 cnv=ucnv_open(charset, &errorCode);
256 }
257 }
258
259 if(U_FAILURE(errorCode)) {
260 // unable to open the converter
261 goto exit;
262 }
263
264 // convert the file contents
265 capacity=fileLength; // estimated capacity
266 src.getBuffer(capacity);
267 src.releaseBuffer(0); // zero length
268 flush=FALSE;
269 for(;;) {
270 // convert contents of bytes[bytesLength]
271 pb=bytes;
272 for(;;) {
273 length=src.length();
274 buffer=src.getBuffer(capacity);
275 if(buffer==NULL) {
276 // unexpected failure to reserve some string capacity
277 errorCode=U_MEMORY_ALLOCATION_ERROR;
278 goto exit;
279 }
280
281 pu=buffer+length;
282 ucnv_toUnicode(
283 cnv, &pu, buffer+src.getCapacity(),
284 &pb, bytes+bytesLength,
285 NULL, FALSE, &errorCode);
286 src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0);
287 if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
288 errorCode=U_ZERO_ERROR;
289 capacity=(3*src.getCapacity())/2; // increase capacity by 50%
290 } else {
291 break;
292 }
293 }
294
295 if(U_FAILURE(errorCode)) {
296 break; // conversion error
297 }
298
299 if(flush) {
300 break; // completely converted the file
301 }
302
303 // read next block
304 bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes));
305 if(bytesLength==0) {
306 // reached end of file, convert once more to flush the converter
307 flush=TRUE;
308 }
309 };
310
311exit:
312 ucnv_close(cnv);
313 T_FileStream_close(f);
314
315 if(U_SUCCESS(errorCode)) {
316 return parse(src, errorCode);
317 } else {
318 return NULL;
319 }
320}
321
322UXMLElement *
323UXMLParser::parse(const UnicodeString &src, UErrorCode &status) {
324 if(U_FAILURE(status)) {
325 return NULL;
326 }
327
328 UXMLElement *root = NULL;
329 fPos = 0; // TODO use just a local pos variable and pass it into functions
330 // where necessary?
331
332 // set all matchers to work on the input string
333 mXMLDecl.reset(src);
334 mXMLComment.reset(src);
335 mXMLSP.reset(src);
336 mXMLDoctype.reset(src);
337 mXMLPI.reset(src);
338 mXMLElemStart.reset(src);
339 mXMLElemEnd.reset(src);
340 mXMLElemEmpty.reset(src);
341 mXMLCharData.reset(src);
342 mAttrValue.reset(src);
343 mAttrNormalizer.reset(src);
344 mNewLineNormalizer.reset(src);
345 mAmps.reset(src);
346
347 // Consume the XML Declaration, if present.
348 if (mXMLDecl.lookingAt(fPos, status)) {
349 fPos = mXMLDecl.end(status);
350 }
351
352 // Consume "misc" [XML production 27] appearing before DocType
353 parseMisc(status);
354
355 // Consume a DocType declaration, if present.
356 if (mXMLDoctype.lookingAt(fPos, status)) {
357 fPos = mXMLDoctype.end(status);
358 }
359
360 // Consume additional "misc" [XML production 27] appearing after the DocType
361 parseMisc(status);
362
363 // Get the root element
364 if (mXMLElemEmpty.lookingAt(fPos, status)) {
365 // Root is an empty element (no nested elements or content)
366 root = createElement(mXMLElemEmpty, status);
367 fPos = mXMLElemEmpty.end(status);
368 } else {
369 if (mXMLElemStart.lookingAt(fPos, status) == FALSE) {
370 error("Root Element expected", status);
371 goto errorExit;
372 }
373 root = createElement(mXMLElemStart, status);
374 UXMLElement *el = root;
375
376 //
377 // This is the loop that consumes the root element of the document,
378 // including all nested content. Nested elements are handled by
379 // explicit pushes/pops of the element stack; there is no recursion
380 // in the control flow of this code.
381 // "el" always refers to the current element, the one to which content
382 // is being added. It is above the top of the element stack.
383 for (;;) {
384 // Nested Element Start
385 if (mXMLElemStart.lookingAt(fPos, status)) {
386 UXMLElement *t = createElement(mXMLElemStart, status);
387 el->fChildren.addElement(t, status);
388 t->fParent = el;
389 fElementStack.push(el, status);
390 el = t;
391 continue;
392 }
393
394 // Text Content. String is concatenated onto the current node's content,
395 // but only if it contains something other than spaces.
396 UnicodeString s = scanContent(status);
397 if (s.length() > 0) {
398 mXMLSP.reset(s);
399 if (mXMLSP.matches(status) == FALSE) {
400 // This chunk of text contains something other than just
401 // white space. Make a child node for it.
402 replaceCharRefs(s, status);
403 el->fChildren.addElement(s.clone(), status);
404 }
405 mXMLSP.reset(src); // The matchers need to stay set to the main input string.
406 continue;
407 }
408
409 // Comments. Discard.
410 if (mXMLComment.lookingAt(fPos, status)) {
411 fPos = mXMLComment.end(status);
412 continue;
413 }
414
415 // PIs. Discard.
416 if (mXMLPI.lookingAt(fPos, status)) {
417 fPos = mXMLPI.end(status);
418 continue;
419 }
420
421 // Element End
422 if (mXMLElemEnd.lookingAt(fPos, status)) {
423 fPos = mXMLElemEnd.end(0, status);
424 const UnicodeString name = mXMLElemEnd.group(1, status);
425 if (name != *el->fName) {
426 error("Element start / end tag mismatch", status);
427 goto errorExit;
428 }
429 if (fElementStack.empty()) {
430 // Close of the root element. We're done with the doc.
431 el = NULL;
432 break;
433 }
434 el = (UXMLElement *)fElementStack.pop();
435 continue;
436 }
437
438 // Empty Element. Stored as a child of the current element, but not stacked.
439 if (mXMLElemEmpty.lookingAt(fPos, status)) {
440 UXMLElement *t = createElement(mXMLElemEmpty, status);
441 el->fChildren.addElement(t, status);
442 continue;
443 }
444
445 // Hit something within the document that doesn't match anything.
446 // It's an error.
447 error("Unrecognized markup", status);
448 break;
449 }
450
451 if (el != NULL || !fElementStack.empty()) {
452 // We bailed out early, for some reason.
453 error("Root element not closed.", status);
454 goto errorExit;
455 }
456 }
457
458 // Root Element parse is complete.
459 // Consume the annoying xml "Misc" that can appear at the end of the doc.
460 parseMisc(status);
461
462 // We should have reached the end of the input
463 if (fPos != src.length()) {
464 error("Extra content at the end of the document", status);
465 goto errorExit;
466 }
467
468 // Success!
469 return root;
470
471errorExit:
472 delete root;
473 return NULL;
474}
475
476//
477// createElement
478// We've just matched an element start tag. Create and fill in a UXMLElement object
479// for it.
480//
481UXMLElement *
482UXMLParser::createElement(RegexMatcher &mEl, UErrorCode &status) {
483 // First capture group is the element's name.
484 UXMLElement *el = new UXMLElement(this, intern(mEl.group(1, status), status), status);
485
486 // Scan for attributes.
487 int32_t pos = mEl.end(1, status); // The position after the end of the tag name
488
489 while (mAttrValue.lookingAt(pos, status)) { // loop runs once per attribute on this element.
490 UnicodeString attName = mAttrValue.group(1, status);
491 UnicodeString attValue = mAttrValue.group(2, status);
492
493 // Trim the quotes from the att value. These are left over from the original regex
494 // that parsed the attribue, which couldn't conveniently strip them.
495 attValue.remove(0,1); // one char from the beginning
496 attValue.truncate(attValue.length()-1); // and one from the end.
497
498 // XML Attribue value normalization.
499 // This is one of the really screwy parts of the XML spec.
500 // See http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize
501 // Note that non-validating parsers must treat all entities as type CDATA
502 // which simplifies things some.
503
504 // Att normalization step 1: normalize any newlines in the attribute value
505 mNewLineNormalizer.reset(attValue);
506 attValue = mNewLineNormalizer.replaceAll(fOneLF, status);
507
508 // Next change all xml white space chars to plain \u0020 spaces.
509 mAttrNormalizer.reset(attValue);
510 UnicodeString oneSpace((UChar)0x0020);
511 attValue = mAttrNormalizer.replaceAll(oneSpace, status);
512
513 // Replace character entities.
514 replaceCharRefs(attValue, status);
515
516 // Save the attribute name and value in our document structure.
517 el->fAttNames.addElement((void *)intern(attName, status), status);
518 el->fAttValues.addElement(attValue.clone(), status);
519 pos = mAttrValue.end(2, status);
520 }
521 fPos = mEl.end(0, status);
522 return el;
523}
524
525//
526// parseMisc
527// Consume XML "Misc" [production #27]
528// which is any combination of space, PI and comments
529// Need to watch end-of-input because xml MISC stuff is allowed after
530// the document element, so we WILL scan off the end in this function
531//
532void
533UXMLParser::parseMisc(UErrorCode &status) {
534 for (;;) {
535 if (fPos >= mXMLPI.input().length()) {
536 break;
537 }
538 if (mXMLPI.lookingAt(fPos, status)) {
539 fPos = mXMLPI.end(status);
540 continue;
541 }
542 if (mXMLSP.lookingAt(fPos, status)) {
543 fPos = mXMLSP.end(status);
544 continue;
545 }
546 if (mXMLComment.lookingAt(fPos, status)) {
547 fPos = mXMLComment.end(status);
548 continue;
549 }
550 break;
551 }
552}
553
554//
555// Scan for document content.
556//
557UnicodeString
558UXMLParser::scanContent(UErrorCode &status) {
559 UnicodeString result;
560 if (mXMLCharData.lookingAt(fPos, status)) {
561 result = mXMLCharData.group(0, status);
562 // Normalize the new-lines. (Before char ref substitution)
563 mNewLineNormalizer.reset(result);
564 result = mNewLineNormalizer.replaceAll(fOneLF, status);
565
566 // TODO: handle CDATA
567 fPos = mXMLCharData.end(0, status);
568 }
569
570 return result;
571}
572
573//
574// replaceCharRefs
575//
576// replace the char entities &lt; &amp; &#123; &#x12ab; etc. in a string
577// with the corresponding actual character.
578//
579void
580UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) {
581 UnicodeString result;
582 UnicodeString replacement;
583 int i;
584
585 mAmps.reset(s);
586 // See the initialization for the regex matcher mAmps.
587 // Which entity we've matched is determined by which capture group has content,
588 // which is flaged by start() of that group not being -1.
589 while (mAmps.find()) {
590 if (mAmps.start(1, status) != -1) {
591 replacement.setTo((UChar)x_AMP);
592 } else if (mAmps.start(2, status) != -1) {
593 replacement.setTo((UChar)x_LT);
594 } else if (mAmps.start(3, status) != -1) {
595 replacement.setTo((UChar)x_GT);
596 } else if (mAmps.start(4, status) != -1) {
597 replacement.setTo((UChar)x_APOS);
598 } else if (mAmps.start(5, status) != -1) {
599 replacement.setTo((UChar)x_QUOT);
600 } else if (mAmps.start(6, status) != -1) {
601 UnicodeString hexString = mAmps.group(6, status);
602 UChar32 val = 0;
603 for (i=0; i<hexString.length(); i++) {
604 val = (val << 4) + u_digit(hexString.charAt(i), 16);
605 }
606 // TODO: some verification that the character is valid
607 replacement.setTo(val);
608 } else if (mAmps.start(7, status) != -1) {
609 UnicodeString decimalString = mAmps.group(7, status);
610 UChar32 val = 0;
611 for (i=0; i<decimalString.length(); i++) {
612 val = val*10 + u_digit(decimalString.charAt(i), 10);
613 }
614 // TODO: some verification that the character is valid
615 replacement.setTo(val);
616 } else {
617 // An unrecognized &entity; Leave it alone.
618 // TODO: check that it really looks like an entity, and is not some
619 // random & in the text.
620 replacement = mAmps.group(0, status);
621 }
622 mAmps.appendReplacement(result, replacement, status);
623 }
624 mAmps.appendTail(result);
625 s = result;
626}
627
628void
629UXMLParser::error(const char *message, UErrorCode &status) {
630 // TODO: something better here...
631 const UnicodeString &src=mXMLDecl.input();
632 int line = 0;
633 int ci = 0;
634 while (ci < fPos && ci>=0) {
635 ci = src.indexOf((UChar)0x0a, ci+1);
636 line++;
637 }
638 fprintf(stderr, "Error: %s at line %d\n", message, line);
639 if (U_SUCCESS(status)) {
640 status = U_PARSE_ERROR;
641 }
642}
643
644// intern strings like in Java
645
646const UnicodeString *
647UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) {
648 const UHashElement *he=fNames.find(s);
649 if(he!=NULL) {
650 // already a known name, return its hashed key pointer
651 return (const UnicodeString *)he->key.pointer;
652 } else {
653 // add this new name and return its hashed key pointer
654 fNames.puti(s, 0, errorCode);
655 he=fNames.find(s);
656 return (const UnicodeString *)he->key.pointer;
657 }
658}
659
660const UnicodeString *
661UXMLParser::findName(const UnicodeString &s) const {
662 const UHashElement *he=fNames.find(s);
663 if(he!=NULL) {
664 // a known name, return its hashed key pointer
665 return (const UnicodeString *)he->key.pointer;
666 } else {
667 // unknown name
668 return NULL;
669 }
670}
671
672// UXMLElement ------------------------------------------------------------- ***
673
// Constructor.
//   parser     the parser this element belongs to; it is consulted later for
//              name lookups (findName).
//   name       the element's tag name.  The pointer itself is stored; nothing
//              is copied here.
//   errorCode  in/out ICU error code, handed to the three member containers.
// The element starts out with no parent.
UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) :
   fParser(parser),
   fName(name),
   fAttNames(errorCode),
   fAttValues(errorCode),
   fChildren(errorCode),
   fParent(NULL)
{
}
683
684UXMLElement::~UXMLElement() {
685 int i;
686 // attribute names are owned by the UXMLParser, don't delete them here
687 for (i=fAttValues.size()-1; i>=0; i--) {
688 delete (UObject *)fAttValues.elementAt(i);
689 }
690 for (i=fChildren.size()-1; i>=0; i--) {
691 delete (UObject *)fChildren.elementAt(i);
692 }
693}
694
695const UnicodeString &
696UXMLElement::getTagName() const {
697 return *fName;
698}
699
700UnicodeString
701UXMLElement::getText(UBool recurse) const {
702 UnicodeString text;
703 appendText(text, recurse);
704 return text;
705}
706
707void
708UXMLElement::appendText(UnicodeString &text, UBool recurse) const {
709 const UObject *node;
710 int32_t i, count=fChildren.size();
711 for(i=0; i<count; ++i) {
712 node=(const UObject *)fChildren.elementAt(i);
713 if(node->getDynamicClassID()==UnicodeString::getStaticClassID()) {
714 text.append(*(const UnicodeString *)node);
715 } else if(recurse) /* must be a UXMLElement */ {
716 ((const UXMLElement *)node)->appendText(text, recurse);
717 }
718 }
719}
720
721int32_t
722UXMLElement::countAttributes() const {
723 return fAttNames.size();
724}
725
726const UnicodeString *
727UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const {
728 if(0<=i && i<fAttNames.size()) {
729 name.setTo(*(const UnicodeString *)fAttNames.elementAt(i));
730 value.setTo(*(const UnicodeString *)fAttValues.elementAt(i));
731 return &value; // or return (UnicodeString *)fAttValues.elementAt(i);
732 } else {
733 return NULL;
734 }
735}
736
737const UnicodeString *
738UXMLElement::getAttribute(const UnicodeString &name) const {
739 // search for the attribute name by comparing the interned pointer,
740 // not the string contents
741 const UnicodeString *p=fParser->findName(name);
742 if(p==NULL) {
743 return NULL; // no such attribute seen by the parser at all
744 }
745
746 int32_t i, count=fAttNames.size();
747 for(i=0; i<count; ++i) {
748 if(p==(const UnicodeString *)fAttNames.elementAt(i)) {
749 return (const UnicodeString *)fAttValues.elementAt(i);
750 }
751 }
752 return NULL;
753}
754
755int32_t
756UXMLElement::countChildren() const {
757 return fChildren.size();
758}
759
760const UObject *
761UXMLElement::getChild(int32_t i, UXMLNodeType &type) const {
762 if(0<=i && i<fChildren.size()) {
763 const UObject *node=(const UObject *)fChildren.elementAt(i);
764 if(node->getDynamicClassID()==UXMLElement::getStaticClassID()) {
765 type=UXML_NODE_TYPE_ELEMENT;
766 } else {
767 type=UXML_NODE_TYPE_STRING;
768 }
769 return node;
770 } else {
771 return NULL;
772 }
773}
774
775const UXMLElement *
776UXMLElement::nextChildElement(int32_t &i) const {
777 if(i<0) {
778 return NULL;
779 }
780
781 const UObject *node;
782 int32_t count=fChildren.size();
783 while(i<count) {
784 node=(const UObject *)fChildren.elementAt(i++);
785 // TODO: see if ICU can use C++ instanceof instead of its own poor man's RTTI
786 // if(node instanceof UXMLElement) {
787 if(node->getDynamicClassID()==UXMLElement::getStaticClassID()) {
788 return (const UXMLElement *)node;
789 }
790 }
791 return NULL;
792}
793
794const UXMLElement *
795UXMLElement::getChildElement(const UnicodeString &name) const {
796 // search for the element name by comparing the interned pointer,
797 // not the string contents
798 const UnicodeString *p=fParser->findName(name);
799 if(p==NULL) {
800 return NULL; // no such element seen by the parser at all
801 }
802
803 const UObject *node;
804 int32_t i, count=fChildren.size();
805 for(i=0; i<count; ++i) {
806 node=(const UObject *)fChildren.elementAt(i);
807 if(node->getDynamicClassID()==UXMLElement::getStaticClassID()) {
808 const UXMLElement *elem=(const UXMLElement *)node;
809 if(p==elem->fName) {
810 return elem;
811 }
812 }
813 }
814 return NULL;
815}
816
817U_NAMESPACE_END
818
819#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
820