]>
Commit | Line | Data |
---|---|---|
73c04bcf A |
1 | /* |
2 | ******************************************************************************* | |
3 | * | |
4 | * Copyright (C) 2004-2006, International Business Machines | |
5 | * Corporation and others. All Rights Reserved. | |
6 | * | |
7 | ******************************************************************************* | |
8 | * file name: xmlparser.cpp | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 2004jul21 | |
14 | * created by: Andy Heninger | |
15 | */ | |
16 | ||
17 | #include <stdio.h> | |
18 | #include "unicode/uchar.h" | |
19 | #include "unicode/ucnv.h" | |
20 | #include "unicode/regex.h" | |
21 | #include "filestrm.h" | |
22 | #include "xmlparser.h" | |
23 | ||
24 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION | |
25 | ||
// Numeric code point values of the individual characters that the
// parser code below refers to by name.
enum {
    x_QUOT = 0x22,   // quotation mark
    x_AMP  = 0x26,   // ampersand
    x_APOS = 0x27,   // apostrophe
    x_LT   = 0x3C,   // less-than sign
    x_GT   = 0x3E,   // greater-than sign
    x_l    = 0x6C    // lower case letter l
};
35 | ||
// Regular expression fragments (plain string literals) that are pasted
// together to build the parser's patterns.

// A set matching one XML white space character: space, tab, CR or LF.
#define XML_SPACES "[ \\u0009\\u000d\\u000a]"

// XML production #4: a set matching one character that may start a name.
#define XML_NAMESTARTCHAR \
    "[" \
    "[A-Z]:_[a-z]" \
    "[\\u00c0-\\u00d6][\\u00d8-\\u00f6][\\u00f8-\\u02ff]" \
    "[\\u0370-\\u037d][\\u037F-\\u1FFF]" \
    "[\\u200C-\\u200D][\\u2070-\\u218F]" \
    "[\\u2C00-\\u2FEF][\\u3001-\\uD7FF]" \
    "[\\uF900-\\uFDCF][\\uFDF0-\\uFFFD]" \
    "[\\U00010000-\\U000EFFFF]" \
    "]"

// XML production #5: a set matching one character that may continue a name.
#define XML_NAMECHAR \
    "[" XML_NAMESTARTCHAR \
    "\\-.[0-9]\\u00b7" \
    "[\\u0300-\\u036f][\\u203f-\\u2040]" \
    "]"

// XML production #6: a complete name, one start character followed by
// any number of name characters.
#define XML_NAME XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*"
49 | ||
50 | U_NAMESPACE_BEGIN | |
51 | ||
// RTTI boilerplate for the two classes implemented in this file.
// NOTE(review): presumably this is what supplies the getStaticClassID() /
// getDynamicClassID() functions that the UXMLElement code below uses to tell
// element children from text children - confirm against the macro definition.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser)
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement)
54 | ||
55 | // | |
56 | // UXMLParser constructor. Mostly just initializes the ICU regexes that are | |
57 | // used for parsing. | |
58 | // | |
59 | UXMLParser::UXMLParser(UErrorCode &status) : | |
60 | // XML Declaration. XML Production #23. | |
61 | // example: "<?xml version=1.0 encoding="utf-16" ?> | |
62 | // This is a sloppy implementation - just look for the leading <?xml and the closing ?> | |
63 | // allow for a possible leading BOM. | |
64 | mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>"), 0, status), | |
65 | ||
66 | // XML Comment production #15 | |
67 | // example: "<!-- whatever --> | |
68 | // note, does not detect an illegal "--" within comments | |
69 | mXMLComment(UnicodeString("(?s)<!--.+?-->"), 0, status), | |
70 | ||
71 | // XML Spaces | |
72 | // production [3] | |
73 | mXMLSP(UnicodeString(XML_SPACES "+"), 0, status), | |
74 | ||
75 | // XML Doctype decl production #28 | |
76 | // example "<!DOCTYPE foo SYSTEM "somewhere" > | |
77 | // TODO: we don't actually parse the DOCTYPE or internal subsets. | |
78 | // Some internal dtd subsets could confuse this simple-minded | |
79 | // attempt at skipping over them. | |
80 | mXMLDoctype(UnicodeString("(?s)<!DOCTYPE.+?>"), 0, status), | |
81 | ||
82 | // XML PI production #16 | |
83 | // example "<?target stuff?> | |
84 | mXMLPI(UnicodeString("(?s)<\\?.+?\\?>"), 0, status), | |
85 | ||
86 | // XML Element Start Productions #40, #41 | |
87 | // example <foo att1='abc' att2="d e f" > | |
88 | // capture #1: the tag name | |
89 | // | |
90 | mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name" | |
91 | "(?:" | |
92 | XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = " | |
93 | "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"' | |
94 | ")*" // * for zero or more attributes. | |
95 | XML_SPACES "*?>"), 0, status), // match " >" | |
96 | ||
97 | // XML Element End production #42 | |
98 | // example </foo> | |
99 | mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>"), 0, status), | |
100 | ||
101 | // XML Element Empty production #44 | |
102 | // example <foo att1="abc" att2="d e f" /> | |
103 | mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name" | |
104 | "(?:" | |
105 | XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = " | |
106 | "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"' | |
107 | ")*" // * for zero or more attributes. | |
108 | XML_SPACES "*?/>"), 0, status), // match " />" | |
109 | ||
110 | ||
111 | // XMLCharData. Everything but '<'. Note that & will be dealt with later. | |
112 | mXMLCharData(UnicodeString("(?s)[^<]*"), 0, status), | |
113 | ||
114 | // Attribute name = "value". XML Productions 10, 40/41 | |
115 | // Capture group 1 is name, | |
116 | // 2 is the attribute value, including the quotes. | |
117 | // | |
118 | // Note that attributes are scanned twice. The first time is with | |
119 | // the regex for an entire element start. There, the attributes | |
120 | // are checked syntactically, but not separted out one by one. | |
121 | // Here, we match a single attribute, and make its name and | |
122 | // attribute value available to the parser code. | |
123 | mAttrValue(UnicodeString(XML_SPACES "+(" XML_NAME ")" XML_SPACES "*=" XML_SPACES "*" | |
124 | "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"), 0, status), | |
125 | ||
126 | ||
127 | mAttrNormalizer(UnicodeString(XML_SPACES), 0, status), | |
128 | ||
129 | // Match any of the new-line sequences in content. | |
130 | // All are changed to \u000a. | |
131 | mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028"), 0, status), | |
132 | ||
133 | // & char references | |
134 | // We will figure out what we've got based on which capture group has content. | |
135 | // The last one is a catchall for unrecognized entity references.. | |
136 | // 1 2 3 4 5 6 7 8 | |
137 | mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"), | |
138 | 0, status), | |
139 | ||
140 | fNames(status), | |
141 | fElementStack(status), | |
142 | fOneLF((UChar)0x0a) // Plain new-line string, used in new line normalization. | |
143 | { | |
144 | } | |
145 | ||
146 | UXMLParser * | |
147 | UXMLParser::createParser(UErrorCode &errorCode) { | |
148 | if (U_FAILURE(errorCode)) { | |
149 | return NULL; | |
150 | } else { | |
151 | return new UXMLParser(errorCode); | |
152 | } | |
153 | } | |
154 | ||
155 | UXMLParser::~UXMLParser() {} | |
156 | ||
157 | UXMLElement * | |
158 | UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) { | |
159 | char bytes[4096], charsetBuffer[100]; | |
160 | FileStream *f; | |
161 | const char *charset, *pb; | |
162 | UnicodeString src; | |
163 | UConverter *cnv; | |
164 | UChar *buffer, *pu; | |
165 | int32_t fileLength, bytesLength, length, capacity; | |
166 | UBool flush; | |
167 | ||
168 | if(U_FAILURE(errorCode)) { | |
169 | return NULL; | |
170 | } | |
171 | ||
172 | f=T_FileStream_open(filename, "rb"); | |
173 | if(f==NULL) { | |
174 | errorCode=U_FILE_ACCESS_ERROR; | |
175 | return NULL; | |
176 | } | |
177 | ||
178 | bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes)); | |
179 | if(bytesLength<(int32_t)sizeof(bytes)) { | |
180 | // we have already read the entire file | |
181 | fileLength=bytesLength; | |
182 | } else { | |
183 | // get the file length | |
184 | fileLength=T_FileStream_size(f); | |
185 | } | |
186 | ||
187 | /* | |
188 | * get the charset: | |
189 | * 1. Unicode signature | |
190 | * 2. treat as ISO-8859-1 and read XML encoding="charser" | |
191 | * 3. default to UTF-8 | |
192 | */ | |
193 | charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode); | |
194 | if(U_SUCCESS(errorCode) && charset!=NULL) { | |
195 | // open converter according to Unicode signature | |
196 | cnv=ucnv_open(charset, &errorCode); | |
197 | } else { | |
198 | // read as Latin-1 and parse the XML declaration and encoding | |
199 | cnv=ucnv_open("ISO-8859-1", &errorCode); | |
200 | if(U_FAILURE(errorCode)) { | |
201 | // unexpected error opening Latin-1 converter | |
202 | goto exit; | |
203 | } | |
204 | ||
205 | buffer=src.getBuffer(bytesLength); | |
206 | if(buffer==NULL) { | |
207 | // unexpected failure to reserve some string capacity | |
208 | errorCode=U_MEMORY_ALLOCATION_ERROR; | |
209 | goto exit; | |
210 | } | |
211 | pb=bytes; | |
212 | pu=buffer; | |
213 | ucnv_toUnicode( | |
214 | cnv, | |
215 | &pu, buffer+src.getCapacity(), | |
216 | &pb, bytes+bytesLength, | |
217 | NULL, TRUE, &errorCode); | |
218 | src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0); | |
219 | ucnv_close(cnv); | |
220 | cnv=NULL; | |
221 | if(U_FAILURE(errorCode)) { | |
222 | // unexpected error in conversion from Latin-1 | |
223 | src.remove(); | |
224 | goto exit; | |
225 | } | |
226 | ||
227 | // parse XML declaration | |
228 | if(mXMLDecl.reset(src).lookingAt(0, errorCode)) { | |
229 | int32_t declEnd=mXMLDecl.end(errorCode); | |
230 | // go beyond <?xml | |
231 | int32_t pos=src.indexOf((UChar)x_l)+1; | |
232 | ||
233 | mAttrValue.reset(src); | |
234 | while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) { // loop runs once per attribute on this element. | |
235 | UnicodeString attName = mAttrValue.group(1, errorCode); | |
236 | UnicodeString attValue = mAttrValue.group(2, errorCode); | |
237 | ||
238 | // Trim the quotes from the att value. These are left over from the original regex | |
239 | // that parsed the attribue, which couldn't conveniently strip them. | |
240 | attValue.remove(0,1); // one char from the beginning | |
241 | attValue.truncate(attValue.length()-1); // and one from the end. | |
242 | ||
243 | if(attName==UNICODE_STRING("encoding", 8)) { | |
244 | length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer)); | |
245 | charset=charsetBuffer; | |
246 | break; | |
247 | } | |
248 | pos = mAttrValue.end(2, errorCode); | |
249 | } | |
250 | ||
251 | if(charset==NULL) { | |
252 | // default to UTF-8 | |
253 | charset="UTF-8"; | |
254 | } | |
255 | cnv=ucnv_open(charset, &errorCode); | |
256 | } | |
257 | } | |
258 | ||
259 | if(U_FAILURE(errorCode)) { | |
260 | // unable to open the converter | |
261 | goto exit; | |
262 | } | |
263 | ||
264 | // convert the file contents | |
265 | capacity=fileLength; // estimated capacity | |
266 | src.getBuffer(capacity); | |
267 | src.releaseBuffer(0); // zero length | |
268 | flush=FALSE; | |
269 | for(;;) { | |
270 | // convert contents of bytes[bytesLength] | |
271 | pb=bytes; | |
272 | for(;;) { | |
273 | length=src.length(); | |
274 | buffer=src.getBuffer(capacity); | |
275 | if(buffer==NULL) { | |
276 | // unexpected failure to reserve some string capacity | |
277 | errorCode=U_MEMORY_ALLOCATION_ERROR; | |
278 | goto exit; | |
279 | } | |
280 | ||
281 | pu=buffer+length; | |
282 | ucnv_toUnicode( | |
283 | cnv, &pu, buffer+src.getCapacity(), | |
284 | &pb, bytes+bytesLength, | |
285 | NULL, FALSE, &errorCode); | |
286 | src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0); | |
287 | if(errorCode==U_BUFFER_OVERFLOW_ERROR) { | |
288 | errorCode=U_ZERO_ERROR; | |
289 | capacity=(3*src.getCapacity())/2; // increase capacity by 50% | |
290 | } else { | |
291 | break; | |
292 | } | |
293 | } | |
294 | ||
295 | if(U_FAILURE(errorCode)) { | |
296 | break; // conversion error | |
297 | } | |
298 | ||
299 | if(flush) { | |
300 | break; // completely converted the file | |
301 | } | |
302 | ||
303 | // read next block | |
304 | bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes)); | |
305 | if(bytesLength==0) { | |
306 | // reached end of file, convert once more to flush the converter | |
307 | flush=TRUE; | |
308 | } | |
309 | }; | |
310 | ||
311 | exit: | |
312 | ucnv_close(cnv); | |
313 | T_FileStream_close(f); | |
314 | ||
315 | if(U_SUCCESS(errorCode)) { | |
316 | return parse(src, errorCode); | |
317 | } else { | |
318 | return NULL; | |
319 | } | |
320 | } | |
321 | ||
322 | UXMLElement * | |
323 | UXMLParser::parse(const UnicodeString &src, UErrorCode &status) { | |
324 | if(U_FAILURE(status)) { | |
325 | return NULL; | |
326 | } | |
327 | ||
328 | UXMLElement *root = NULL; | |
329 | fPos = 0; // TODO use just a local pos variable and pass it into functions | |
330 | // where necessary? | |
331 | ||
332 | // set all matchers to work on the input string | |
333 | mXMLDecl.reset(src); | |
334 | mXMLComment.reset(src); | |
335 | mXMLSP.reset(src); | |
336 | mXMLDoctype.reset(src); | |
337 | mXMLPI.reset(src); | |
338 | mXMLElemStart.reset(src); | |
339 | mXMLElemEnd.reset(src); | |
340 | mXMLElemEmpty.reset(src); | |
341 | mXMLCharData.reset(src); | |
342 | mAttrValue.reset(src); | |
343 | mAttrNormalizer.reset(src); | |
344 | mNewLineNormalizer.reset(src); | |
345 | mAmps.reset(src); | |
346 | ||
347 | // Consume the XML Declaration, if present. | |
348 | if (mXMLDecl.lookingAt(fPos, status)) { | |
349 | fPos = mXMLDecl.end(status); | |
350 | } | |
351 | ||
352 | // Consume "misc" [XML production 27] appearing before DocType | |
353 | parseMisc(status); | |
354 | ||
355 | // Consume a DocType declaration, if present. | |
356 | if (mXMLDoctype.lookingAt(fPos, status)) { | |
357 | fPos = mXMLDoctype.end(status); | |
358 | } | |
359 | ||
360 | // Consume additional "misc" [XML production 27] appearing after the DocType | |
361 | parseMisc(status); | |
362 | ||
363 | // Get the root element | |
364 | if (mXMLElemEmpty.lookingAt(fPos, status)) { | |
365 | // Root is an empty element (no nested elements or content) | |
366 | root = createElement(mXMLElemEmpty, status); | |
367 | fPos = mXMLElemEmpty.end(status); | |
368 | } else { | |
369 | if (mXMLElemStart.lookingAt(fPos, status) == FALSE) { | |
370 | error("Root Element expected", status); | |
371 | goto errorExit; | |
372 | } | |
373 | root = createElement(mXMLElemStart, status); | |
374 | UXMLElement *el = root; | |
375 | ||
376 | // | |
377 | // This is the loop that consumes the root element of the document, | |
378 | // including all nested content. Nested elements are handled by | |
379 | // explicit pushes/pops of the element stack; there is no recursion | |
380 | // in the control flow of this code. | |
381 | // "el" always refers to the current element, the one to which content | |
382 | // is being added. It is above the top of the element stack. | |
383 | for (;;) { | |
384 | // Nested Element Start | |
385 | if (mXMLElemStart.lookingAt(fPos, status)) { | |
386 | UXMLElement *t = createElement(mXMLElemStart, status); | |
387 | el->fChildren.addElement(t, status); | |
388 | t->fParent = el; | |
389 | fElementStack.push(el, status); | |
390 | el = t; | |
391 | continue; | |
392 | } | |
393 | ||
394 | // Text Content. String is concatenated onto the current node's content, | |
395 | // but only if it contains something other than spaces. | |
396 | UnicodeString s = scanContent(status); | |
397 | if (s.length() > 0) { | |
398 | mXMLSP.reset(s); | |
399 | if (mXMLSP.matches(status) == FALSE) { | |
400 | // This chunk of text contains something other than just | |
401 | // white space. Make a child node for it. | |
402 | replaceCharRefs(s, status); | |
403 | el->fChildren.addElement(s.clone(), status); | |
404 | } | |
405 | mXMLSP.reset(src); // The matchers need to stay set to the main input string. | |
406 | continue; | |
407 | } | |
408 | ||
409 | // Comments. Discard. | |
410 | if (mXMLComment.lookingAt(fPos, status)) { | |
411 | fPos = mXMLComment.end(status); | |
412 | continue; | |
413 | } | |
414 | ||
415 | // PIs. Discard. | |
416 | if (mXMLPI.lookingAt(fPos, status)) { | |
417 | fPos = mXMLPI.end(status); | |
418 | continue; | |
419 | } | |
420 | ||
421 | // Element End | |
422 | if (mXMLElemEnd.lookingAt(fPos, status)) { | |
423 | fPos = mXMLElemEnd.end(0, status); | |
424 | const UnicodeString name = mXMLElemEnd.group(1, status); | |
425 | if (name != *el->fName) { | |
426 | error("Element start / end tag mismatch", status); | |
427 | goto errorExit; | |
428 | } | |
429 | if (fElementStack.empty()) { | |
430 | // Close of the root element. We're done with the doc. | |
431 | el = NULL; | |
432 | break; | |
433 | } | |
434 | el = (UXMLElement *)fElementStack.pop(); | |
435 | continue; | |
436 | } | |
437 | ||
438 | // Empty Element. Stored as a child of the current element, but not stacked. | |
439 | if (mXMLElemEmpty.lookingAt(fPos, status)) { | |
440 | UXMLElement *t = createElement(mXMLElemEmpty, status); | |
441 | el->fChildren.addElement(t, status); | |
442 | continue; | |
443 | } | |
444 | ||
445 | // Hit something within the document that doesn't match anything. | |
446 | // It's an error. | |
447 | error("Unrecognized markup", status); | |
448 | break; | |
449 | } | |
450 | ||
451 | if (el != NULL || !fElementStack.empty()) { | |
452 | // We bailed out early, for some reason. | |
453 | error("Root element not closed.", status); | |
454 | goto errorExit; | |
455 | } | |
456 | } | |
457 | ||
458 | // Root Element parse is complete. | |
459 | // Consume the annoying xml "Misc" that can appear at the end of the doc. | |
460 | parseMisc(status); | |
461 | ||
462 | // We should have reached the end of the input | |
463 | if (fPos != src.length()) { | |
464 | error("Extra content at the end of the document", status); | |
465 | goto errorExit; | |
466 | } | |
467 | ||
468 | // Success! | |
469 | return root; | |
470 | ||
471 | errorExit: | |
472 | delete root; | |
473 | return NULL; | |
474 | } | |
475 | ||
476 | // | |
477 | // createElement | |
478 | // We've just matched an element start tag. Create and fill in a UXMLElement object | |
479 | // for it. | |
480 | // | |
481 | UXMLElement * | |
482 | UXMLParser::createElement(RegexMatcher &mEl, UErrorCode &status) { | |
483 | // First capture group is the element's name. | |
484 | UXMLElement *el = new UXMLElement(this, intern(mEl.group(1, status), status), status); | |
485 | ||
486 | // Scan for attributes. | |
487 | int32_t pos = mEl.end(1, status); // The position after the end of the tag name | |
488 | ||
489 | while (mAttrValue.lookingAt(pos, status)) { // loop runs once per attribute on this element. | |
490 | UnicodeString attName = mAttrValue.group(1, status); | |
491 | UnicodeString attValue = mAttrValue.group(2, status); | |
492 | ||
493 | // Trim the quotes from the att value. These are left over from the original regex | |
494 | // that parsed the attribue, which couldn't conveniently strip them. | |
495 | attValue.remove(0,1); // one char from the beginning | |
496 | attValue.truncate(attValue.length()-1); // and one from the end. | |
497 | ||
498 | // XML Attribue value normalization. | |
499 | // This is one of the really screwy parts of the XML spec. | |
500 | // See http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize | |
501 | // Note that non-validating parsers must treat all entities as type CDATA | |
502 | // which simplifies things some. | |
503 | ||
504 | // Att normalization step 1: normalize any newlines in the attribute value | |
505 | mNewLineNormalizer.reset(attValue); | |
506 | attValue = mNewLineNormalizer.replaceAll(fOneLF, status); | |
507 | ||
508 | // Next change all xml white space chars to plain \u0020 spaces. | |
509 | mAttrNormalizer.reset(attValue); | |
510 | UnicodeString oneSpace((UChar)0x0020); | |
511 | attValue = mAttrNormalizer.replaceAll(oneSpace, status); | |
512 | ||
513 | // Replace character entities. | |
514 | replaceCharRefs(attValue, status); | |
515 | ||
516 | // Save the attribute name and value in our document structure. | |
517 | el->fAttNames.addElement((void *)intern(attName, status), status); | |
518 | el->fAttValues.addElement(attValue.clone(), status); | |
519 | pos = mAttrValue.end(2, status); | |
520 | } | |
521 | fPos = mEl.end(0, status); | |
522 | return el; | |
523 | } | |
524 | ||
525 | // | |
526 | // parseMisc | |
527 | // Consume XML "Misc" [production #27] | |
528 | // which is any combination of space, PI and comments | |
529 | // Need to watch end-of-input because xml MISC stuff is allowed after | |
530 | // the document element, so we WILL scan off the end in this function | |
531 | // | |
532 | void | |
533 | UXMLParser::parseMisc(UErrorCode &status) { | |
534 | for (;;) { | |
535 | if (fPos >= mXMLPI.input().length()) { | |
536 | break; | |
537 | } | |
538 | if (mXMLPI.lookingAt(fPos, status)) { | |
539 | fPos = mXMLPI.end(status); | |
540 | continue; | |
541 | } | |
542 | if (mXMLSP.lookingAt(fPos, status)) { | |
543 | fPos = mXMLSP.end(status); | |
544 | continue; | |
545 | } | |
546 | if (mXMLComment.lookingAt(fPos, status)) { | |
547 | fPos = mXMLComment.end(status); | |
548 | continue; | |
549 | } | |
550 | break; | |
551 | } | |
552 | } | |
553 | ||
554 | // | |
555 | // Scan for document content. | |
556 | // | |
557 | UnicodeString | |
558 | UXMLParser::scanContent(UErrorCode &status) { | |
559 | UnicodeString result; | |
560 | if (mXMLCharData.lookingAt(fPos, status)) { | |
561 | result = mXMLCharData.group(0, status); | |
562 | // Normalize the new-lines. (Before char ref substitution) | |
563 | mNewLineNormalizer.reset(result); | |
564 | result = mNewLineNormalizer.replaceAll(fOneLF, status); | |
565 | ||
566 | // TODO: handle CDATA | |
567 | fPos = mXMLCharData.end(0, status); | |
568 | } | |
569 | ||
570 | return result; | |
571 | } | |
572 | ||
573 | // | |
574 | // replaceCharRefs | |
575 | // | |
576 | // replace the char entities < & { ካ etc. in a string | |
577 | // with the corresponding actual character. | |
578 | // | |
579 | void | |
580 | UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) { | |
581 | UnicodeString result; | |
582 | UnicodeString replacement; | |
583 | int i; | |
584 | ||
585 | mAmps.reset(s); | |
586 | // See the initialization for the regex matcher mAmps. | |
587 | // Which entity we've matched is determined by which capture group has content, | |
588 | // which is flaged by start() of that group not being -1. | |
589 | while (mAmps.find()) { | |
590 | if (mAmps.start(1, status) != -1) { | |
591 | replacement.setTo((UChar)x_AMP); | |
592 | } else if (mAmps.start(2, status) != -1) { | |
593 | replacement.setTo((UChar)x_LT); | |
594 | } else if (mAmps.start(3, status) != -1) { | |
595 | replacement.setTo((UChar)x_GT); | |
596 | } else if (mAmps.start(4, status) != -1) { | |
597 | replacement.setTo((UChar)x_APOS); | |
598 | } else if (mAmps.start(5, status) != -1) { | |
599 | replacement.setTo((UChar)x_QUOT); | |
600 | } else if (mAmps.start(6, status) != -1) { | |
601 | UnicodeString hexString = mAmps.group(6, status); | |
602 | UChar32 val = 0; | |
603 | for (i=0; i<hexString.length(); i++) { | |
604 | val = (val << 4) + u_digit(hexString.charAt(i), 16); | |
605 | } | |
606 | // TODO: some verification that the character is valid | |
607 | replacement.setTo(val); | |
608 | } else if (mAmps.start(7, status) != -1) { | |
609 | UnicodeString decimalString = mAmps.group(7, status); | |
610 | UChar32 val = 0; | |
611 | for (i=0; i<decimalString.length(); i++) { | |
612 | val = val*10 + u_digit(decimalString.charAt(i), 10); | |
613 | } | |
614 | // TODO: some verification that the character is valid | |
615 | replacement.setTo(val); | |
616 | } else { | |
617 | // An unrecognized &entity; Leave it alone. | |
618 | // TODO: check that it really looks like an entity, and is not some | |
619 | // random & in the text. | |
620 | replacement = mAmps.group(0, status); | |
621 | } | |
622 | mAmps.appendReplacement(result, replacement, status); | |
623 | } | |
624 | mAmps.appendTail(result); | |
625 | s = result; | |
626 | } | |
627 | ||
628 | void | |
629 | UXMLParser::error(const char *message, UErrorCode &status) { | |
630 | // TODO: something better here... | |
631 | const UnicodeString &src=mXMLDecl.input(); | |
632 | int line = 0; | |
633 | int ci = 0; | |
634 | while (ci < fPos && ci>=0) { | |
635 | ci = src.indexOf((UChar)0x0a, ci+1); | |
636 | line++; | |
637 | } | |
638 | fprintf(stderr, "Error: %s at line %d\n", message, line); | |
639 | if (U_SUCCESS(status)) { | |
640 | status = U_PARSE_ERROR; | |
641 | } | |
642 | } | |
643 | ||
644 | // intern strings like in Java | |
645 | ||
646 | const UnicodeString * | |
647 | UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) { | |
648 | const UHashElement *he=fNames.find(s); | |
649 | if(he!=NULL) { | |
650 | // already a known name, return its hashed key pointer | |
651 | return (const UnicodeString *)he->key.pointer; | |
652 | } else { | |
653 | // add this new name and return its hashed key pointer | |
654 | fNames.puti(s, 0, errorCode); | |
655 | he=fNames.find(s); | |
656 | return (const UnicodeString *)he->key.pointer; | |
657 | } | |
658 | } | |
659 | ||
660 | const UnicodeString * | |
661 | UXMLParser::findName(const UnicodeString &s) const { | |
662 | const UHashElement *he=fNames.find(s); | |
663 | if(he!=NULL) { | |
664 | // a known name, return its hashed key pointer | |
665 | return (const UnicodeString *)he->key.pointer; | |
666 | } else { | |
667 | // unknown name | |
668 | return NULL; | |
669 | } | |
670 | } | |
671 | ||
672 | // UXMLElement ------------------------------------------------------------- *** | |
673 | ||
674 | UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) : | |
675 | fParser(parser), | |
676 | fName(name), | |
677 | fAttNames(errorCode), | |
678 | fAttValues(errorCode), | |
679 | fChildren(errorCode), | |
680 | fParent(NULL) | |
681 | { | |
682 | } | |
683 | ||
684 | UXMLElement::~UXMLElement() { | |
685 | int i; | |
686 | // attribute names are owned by the UXMLParser, don't delete them here | |
687 | for (i=fAttValues.size()-1; i>=0; i--) { | |
688 | delete (UObject *)fAttValues.elementAt(i); | |
689 | } | |
690 | for (i=fChildren.size()-1; i>=0; i--) { | |
691 | delete (UObject *)fChildren.elementAt(i); | |
692 | } | |
693 | } | |
694 | ||
695 | const UnicodeString & | |
696 | UXMLElement::getTagName() const { | |
697 | return *fName; | |
698 | } | |
699 | ||
700 | UnicodeString | |
701 | UXMLElement::getText(UBool recurse) const { | |
702 | UnicodeString text; | |
703 | appendText(text, recurse); | |
704 | return text; | |
705 | } | |
706 | ||
707 | void | |
708 | UXMLElement::appendText(UnicodeString &text, UBool recurse) const { | |
709 | const UObject *node; | |
710 | int32_t i, count=fChildren.size(); | |
711 | for(i=0; i<count; ++i) { | |
712 | node=(const UObject *)fChildren.elementAt(i); | |
713 | if(node->getDynamicClassID()==UnicodeString::getStaticClassID()) { | |
714 | text.append(*(const UnicodeString *)node); | |
715 | } else if(recurse) /* must be a UXMLElement */ { | |
716 | ((const UXMLElement *)node)->appendText(text, recurse); | |
717 | } | |
718 | } | |
719 | } | |
720 | ||
721 | int32_t | |
722 | UXMLElement::countAttributes() const { | |
723 | return fAttNames.size(); | |
724 | } | |
725 | ||
726 | const UnicodeString * | |
727 | UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const { | |
728 | if(0<=i && i<fAttNames.size()) { | |
729 | name.setTo(*(const UnicodeString *)fAttNames.elementAt(i)); | |
730 | value.setTo(*(const UnicodeString *)fAttValues.elementAt(i)); | |
731 | return &value; // or return (UnicodeString *)fAttValues.elementAt(i); | |
732 | } else { | |
733 | return NULL; | |
734 | } | |
735 | } | |
736 | ||
737 | const UnicodeString * | |
738 | UXMLElement::getAttribute(const UnicodeString &name) const { | |
739 | // search for the attribute name by comparing the interned pointer, | |
740 | // not the string contents | |
741 | const UnicodeString *p=fParser->findName(name); | |
742 | if(p==NULL) { | |
743 | return NULL; // no such attribute seen by the parser at all | |
744 | } | |
745 | ||
746 | int32_t i, count=fAttNames.size(); | |
747 | for(i=0; i<count; ++i) { | |
748 | if(p==(const UnicodeString *)fAttNames.elementAt(i)) { | |
749 | return (const UnicodeString *)fAttValues.elementAt(i); | |
750 | } | |
751 | } | |
752 | return NULL; | |
753 | } | |
754 | ||
755 | int32_t | |
756 | UXMLElement::countChildren() const { | |
757 | return fChildren.size(); | |
758 | } | |
759 | ||
760 | const UObject * | |
761 | UXMLElement::getChild(int32_t i, UXMLNodeType &type) const { | |
762 | if(0<=i && i<fChildren.size()) { | |
763 | const UObject *node=(const UObject *)fChildren.elementAt(i); | |
764 | if(node->getDynamicClassID()==UXMLElement::getStaticClassID()) { | |
765 | type=UXML_NODE_TYPE_ELEMENT; | |
766 | } else { | |
767 | type=UXML_NODE_TYPE_STRING; | |
768 | } | |
769 | return node; | |
770 | } else { | |
771 | return NULL; | |
772 | } | |
773 | } | |
774 | ||
775 | const UXMLElement * | |
776 | UXMLElement::nextChildElement(int32_t &i) const { | |
777 | if(i<0) { | |
778 | return NULL; | |
779 | } | |
780 | ||
781 | const UObject *node; | |
782 | int32_t count=fChildren.size(); | |
783 | while(i<count) { | |
784 | node=(const UObject *)fChildren.elementAt(i++); | |
785 | // TODO: see if ICU can use C++ instanceof instead of its own poor man's RTTI | |
786 | // if(node instanceof UXMLElement) { | |
787 | if(node->getDynamicClassID()==UXMLElement::getStaticClassID()) { | |
788 | return (const UXMLElement *)node; | |
789 | } | |
790 | } | |
791 | return NULL; | |
792 | } | |
793 | ||
794 | const UXMLElement * | |
795 | UXMLElement::getChildElement(const UnicodeString &name) const { | |
796 | // search for the element name by comparing the interned pointer, | |
797 | // not the string contents | |
798 | const UnicodeString *p=fParser->findName(name); | |
799 | if(p==NULL) { | |
800 | return NULL; // no such element seen by the parser at all | |
801 | } | |
802 | ||
803 | const UObject *node; | |
804 | int32_t i, count=fChildren.size(); | |
805 | for(i=0; i<count; ++i) { | |
806 | node=(const UObject *)fChildren.elementAt(i); | |
807 | if(node->getDynamicClassID()==UXMLElement::getStaticClassID()) { | |
808 | const UXMLElement *elem=(const UXMLElement *)node; | |
809 | if(p==elem->fName) { | |
810 | return elem; | |
811 | } | |
812 | } | |
813 | } | |
814 | return NULL; | |
815 | } | |
816 | ||
817 | U_NAMESPACE_END | |
818 | ||
819 | #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ | |
820 |