]>
Commit | Line | Data |
---|---|---|
73c04bcf A |
1 | /* |
2 | ******************************************************************************* | |
3 | * | |
729e4ab9 | 4 | * Copyright (C) 2004-2010, International Business Machines |
73c04bcf A |
5 | * Corporation and others. All Rights Reserved. |
6 | * | |
7 | ******************************************************************************* | |
8 | * file name: xmlparser.cpp | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 2004jul21 | |
14 | * created by: Andy Heninger | |
15 | */ | |
16 | ||
17 | #include <stdio.h> | |
18 | #include "unicode/uchar.h" | |
19 | #include "unicode/ucnv.h" | |
20 | #include "unicode/regex.h" | |
21 | #include "filestrm.h" | |
22 | #include "xmlparser.h" | |
23 | ||
24 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION | |
25 | ||
// Code points of the ASCII characters that the parser refers to by value.
enum {
    x_QUOT = 0x22,    // "
    x_AMP  = 0x26,    // &
    x_APOS = 0x27,    // '
    x_LT   = 0x3C,    // <
    x_GT   = 0x3E,    // >
    x_l    = 0x6C     // l
};
35 | ||
// Regular-expression fragments (as C string literals) shared by the matchers
// that the UXMLParser constructor builds.

// A set matching one XML white space character: space, tab, CR or LF.
#define XML_SPACES "[ \\u0009\\u000d\\u000a]"

// XML production #4: a set matching one character that may start a name.
#define XML_NAMESTARTCHAR \
    "[" \
        "[A-Z]:_[a-z]" \
        "[\\u00c0-\\u00d6][\\u00d8-\\u00f6][\\u00f8-\\u02ff]" \
        "[\\u0370-\\u037d][\\u037F-\\u1FFF]" \
        "[\\u200C-\\u200D][\\u2070-\\u218F]" \
        "[\\u2C00-\\u2FEF][\\u3001-\\uD7FF]" \
        "[\\uF900-\\uFDCF][\\uFDF0-\\uFFFD]" \
        "[\\U00010000-\\U000EFFFF]" \
    "]"

// XML production #5: a set matching one character that may continue a name.
#define XML_NAMECHAR \
    "[" XML_NAMESTARTCHAR \
        "\\-.[0-9]\\u00b7" \
        "[\\u0300-\\u036f][\\u203f-\\u2040]" \
    "]"

// XML production #6: a complete name, i.e. one start character followed by
// any number of name characters.
#define XML_NAME    XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*"
49 | ||
50 | U_NAMESPACE_BEGIN | |
51 | ||
52 | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser) | |
53 | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement) | |
54 | ||
55 | // | |
56 | // UXMLParser constructor. Mostly just initializes the ICU regexes that are | |
57 | // used for parsing. | |
58 | // | |
59 | UXMLParser::UXMLParser(UErrorCode &status) : | |
60 | // XML Declaration. XML Production #23. | |
61 | // example: "<?xml version=1.0 encoding="utf-16" ?> | |
62 | // This is a sloppy implementation - just look for the leading <?xml and the closing ?> | |
63 | // allow for a possible leading BOM. | |
46f4442e | 64 | mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status), |
73c04bcf A |
65 | |
66 | // XML Comment production #15 | |
67 | // example: "<!-- whatever --> | |
68 | // note, does not detect an illegal "--" within comments | |
46f4442e | 69 | mXMLComment(UnicodeString("(?s)<!--.+?-->", -1, US_INV), 0, status), |
73c04bcf A |
70 | |
71 | // XML Spaces | |
72 | // production [3] | |
46f4442e | 73 | mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status), |
73c04bcf A |
74 | |
75 | // XML Doctype decl production #28 | |
76 | // example "<!DOCTYPE foo SYSTEM "somewhere" > | |
46f4442e | 77 | // or "<!DOCTYPE foo [internal dtd]> |
73c04bcf A |
78 | // TODO: we don't actually parse the DOCTYPE or internal subsets. |
79 | // Some internal dtd subsets could confuse this simple-minded | |
46f4442e A |
80 | // attempt at skipping over them, specifically, occcurences |
81 | // of closeing square brackets. These could appear in comments, | |
82 | // or in parameter entity declarations, for example. | |
83 | mXMLDoctype(UnicodeString( | |
84 | "(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)", -1, US_INV | |
85 | ), 0, status), | |
73c04bcf A |
86 | |
87 | // XML PI production #16 | |
88 | // example "<?target stuff?> | |
46f4442e | 89 | mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status), |
73c04bcf A |
90 | |
91 | // XML Element Start Productions #40, #41 | |
92 | // example <foo att1='abc' att2="d e f" > | |
93 | // capture #1: the tag name | |
94 | // | |
95 | mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name" | |
96 | "(?:" | |
97 | XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = " | |
98 | "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"' | |
99 | ")*" // * for zero or more attributes. | |
46f4442e | 100 | XML_SPACES "*?>", -1, US_INV), 0, status), // match " >" |
73c04bcf A |
101 | |
102 | // XML Element End production #42 | |
103 | // example </foo> | |
46f4442e | 104 | mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status), |
73c04bcf A |
105 | |
106 | // XML Element Empty production #44 | |
107 | // example <foo att1="abc" att2="d e f" /> | |
108 | mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name" | |
109 | "(?:" | |
110 | XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = " | |
111 | "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"' | |
112 | ")*" // * for zero or more attributes. | |
46f4442e | 113 | XML_SPACES "*?/>", -1, US_INV), 0, status), // match " />" |
73c04bcf A |
114 | |
115 | ||
116 | // XMLCharData. Everything but '<'. Note that & will be dealt with later. | |
46f4442e | 117 | mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status), |
73c04bcf A |
118 | |
119 | // Attribute name = "value". XML Productions 10, 40/41 | |
120 | // Capture group 1 is name, | |
121 | // 2 is the attribute value, including the quotes. | |
122 | // | |
123 | // Note that attributes are scanned twice. The first time is with | |
124 | // the regex for an entire element start. There, the attributes | |
125 | // are checked syntactically, but not separted out one by one. | |
126 | // Here, we match a single attribute, and make its name and | |
127 | // attribute value available to the parser code. | |
128 | mAttrValue(UnicodeString(XML_SPACES "+(" XML_NAME ")" XML_SPACES "*=" XML_SPACES "*" | |
46f4442e | 129 | "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status), |
73c04bcf A |
130 | |
131 | ||
46f4442e | 132 | mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status), |
73c04bcf A |
133 | |
134 | // Match any of the new-line sequences in content. | |
135 | // All are changed to \u000a. | |
46f4442e | 136 | mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status), |
73c04bcf A |
137 | |
138 | // & char references | |
139 | // We will figure out what we've got based on which capture group has content. | |
140 | // The last one is a catchall for unrecognized entity references.. | |
141 | // 1 2 3 4 5 6 7 8 | |
142 | mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"), | |
143 | 0, status), | |
144 | ||
145 | fNames(status), | |
146 | fElementStack(status), | |
147 | fOneLF((UChar)0x0a) // Plain new-line string, used in new line normalization. | |
148 | { | |
149 | } | |
150 | ||
151 | UXMLParser * | |
152 | UXMLParser::createParser(UErrorCode &errorCode) { | |
153 | if (U_FAILURE(errorCode)) { | |
154 | return NULL; | |
155 | } else { | |
156 | return new UXMLParser(errorCode); | |
157 | } | |
158 | } | |
159 | ||
160 | UXMLParser::~UXMLParser() {} | |
161 | ||
162 | UXMLElement * | |
163 | UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) { | |
164 | char bytes[4096], charsetBuffer[100]; | |
165 | FileStream *f; | |
166 | const char *charset, *pb; | |
167 | UnicodeString src; | |
168 | UConverter *cnv; | |
169 | UChar *buffer, *pu; | |
170 | int32_t fileLength, bytesLength, length, capacity; | |
171 | UBool flush; | |
172 | ||
173 | if(U_FAILURE(errorCode)) { | |
174 | return NULL; | |
175 | } | |
176 | ||
177 | f=T_FileStream_open(filename, "rb"); | |
178 | if(f==NULL) { | |
179 | errorCode=U_FILE_ACCESS_ERROR; | |
180 | return NULL; | |
181 | } | |
182 | ||
183 | bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes)); | |
184 | if(bytesLength<(int32_t)sizeof(bytes)) { | |
185 | // we have already read the entire file | |
186 | fileLength=bytesLength; | |
187 | } else { | |
188 | // get the file length | |
189 | fileLength=T_FileStream_size(f); | |
190 | } | |
191 | ||
192 | /* | |
193 | * get the charset: | |
194 | * 1. Unicode signature | |
195 | * 2. treat as ISO-8859-1 and read XML encoding="charser" | |
196 | * 3. default to UTF-8 | |
197 | */ | |
198 | charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode); | |
199 | if(U_SUCCESS(errorCode) && charset!=NULL) { | |
200 | // open converter according to Unicode signature | |
201 | cnv=ucnv_open(charset, &errorCode); | |
202 | } else { | |
203 | // read as Latin-1 and parse the XML declaration and encoding | |
204 | cnv=ucnv_open("ISO-8859-1", &errorCode); | |
205 | if(U_FAILURE(errorCode)) { | |
206 | // unexpected error opening Latin-1 converter | |
207 | goto exit; | |
208 | } | |
209 | ||
210 | buffer=src.getBuffer(bytesLength); | |
211 | if(buffer==NULL) { | |
212 | // unexpected failure to reserve some string capacity | |
213 | errorCode=U_MEMORY_ALLOCATION_ERROR; | |
214 | goto exit; | |
215 | } | |
216 | pb=bytes; | |
217 | pu=buffer; | |
218 | ucnv_toUnicode( | |
219 | cnv, | |
220 | &pu, buffer+src.getCapacity(), | |
221 | &pb, bytes+bytesLength, | |
222 | NULL, TRUE, &errorCode); | |
223 | src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0); | |
224 | ucnv_close(cnv); | |
225 | cnv=NULL; | |
226 | if(U_FAILURE(errorCode)) { | |
227 | // unexpected error in conversion from Latin-1 | |
228 | src.remove(); | |
229 | goto exit; | |
230 | } | |
231 | ||
232 | // parse XML declaration | |
233 | if(mXMLDecl.reset(src).lookingAt(0, errorCode)) { | |
234 | int32_t declEnd=mXMLDecl.end(errorCode); | |
235 | // go beyond <?xml | |
236 | int32_t pos=src.indexOf((UChar)x_l)+1; | |
237 | ||
238 | mAttrValue.reset(src); | |
239 | while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) { // loop runs once per attribute on this element. | |
240 | UnicodeString attName = mAttrValue.group(1, errorCode); | |
241 | UnicodeString attValue = mAttrValue.group(2, errorCode); | |
242 | ||
243 | // Trim the quotes from the att value. These are left over from the original regex | |
244 | // that parsed the attribue, which couldn't conveniently strip them. | |
245 | attValue.remove(0,1); // one char from the beginning | |
246 | attValue.truncate(attValue.length()-1); // and one from the end. | |
247 | ||
248 | if(attName==UNICODE_STRING("encoding", 8)) { | |
249 | length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer)); | |
250 | charset=charsetBuffer; | |
251 | break; | |
252 | } | |
253 | pos = mAttrValue.end(2, errorCode); | |
254 | } | |
255 | ||
256 | if(charset==NULL) { | |
257 | // default to UTF-8 | |
258 | charset="UTF-8"; | |
259 | } | |
260 | cnv=ucnv_open(charset, &errorCode); | |
261 | } | |
262 | } | |
263 | ||
264 | if(U_FAILURE(errorCode)) { | |
265 | // unable to open the converter | |
266 | goto exit; | |
267 | } | |
268 | ||
269 | // convert the file contents | |
270 | capacity=fileLength; // estimated capacity | |
271 | src.getBuffer(capacity); | |
272 | src.releaseBuffer(0); // zero length | |
273 | flush=FALSE; | |
274 | for(;;) { | |
275 | // convert contents of bytes[bytesLength] | |
276 | pb=bytes; | |
277 | for(;;) { | |
278 | length=src.length(); | |
279 | buffer=src.getBuffer(capacity); | |
280 | if(buffer==NULL) { | |
281 | // unexpected failure to reserve some string capacity | |
282 | errorCode=U_MEMORY_ALLOCATION_ERROR; | |
283 | goto exit; | |
284 | } | |
285 | ||
286 | pu=buffer+length; | |
287 | ucnv_toUnicode( | |
288 | cnv, &pu, buffer+src.getCapacity(), | |
289 | &pb, bytes+bytesLength, | |
290 | NULL, FALSE, &errorCode); | |
291 | src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0); | |
292 | if(errorCode==U_BUFFER_OVERFLOW_ERROR) { | |
293 | errorCode=U_ZERO_ERROR; | |
294 | capacity=(3*src.getCapacity())/2; // increase capacity by 50% | |
295 | } else { | |
296 | break; | |
297 | } | |
298 | } | |
299 | ||
300 | if(U_FAILURE(errorCode)) { | |
301 | break; // conversion error | |
302 | } | |
303 | ||
304 | if(flush) { | |
305 | break; // completely converted the file | |
306 | } | |
307 | ||
308 | // read next block | |
309 | bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes)); | |
310 | if(bytesLength==0) { | |
311 | // reached end of file, convert once more to flush the converter | |
312 | flush=TRUE; | |
313 | } | |
314 | }; | |
315 | ||
316 | exit: | |
317 | ucnv_close(cnv); | |
318 | T_FileStream_close(f); | |
319 | ||
320 | if(U_SUCCESS(errorCode)) { | |
321 | return parse(src, errorCode); | |
322 | } else { | |
323 | return NULL; | |
324 | } | |
325 | } | |
326 | ||
327 | UXMLElement * | |
328 | UXMLParser::parse(const UnicodeString &src, UErrorCode &status) { | |
329 | if(U_FAILURE(status)) { | |
330 | return NULL; | |
331 | } | |
332 | ||
333 | UXMLElement *root = NULL; | |
334 | fPos = 0; // TODO use just a local pos variable and pass it into functions | |
335 | // where necessary? | |
336 | ||
337 | // set all matchers to work on the input string | |
338 | mXMLDecl.reset(src); | |
339 | mXMLComment.reset(src); | |
340 | mXMLSP.reset(src); | |
341 | mXMLDoctype.reset(src); | |
342 | mXMLPI.reset(src); | |
343 | mXMLElemStart.reset(src); | |
344 | mXMLElemEnd.reset(src); | |
345 | mXMLElemEmpty.reset(src); | |
346 | mXMLCharData.reset(src); | |
347 | mAttrValue.reset(src); | |
348 | mAttrNormalizer.reset(src); | |
349 | mNewLineNormalizer.reset(src); | |
350 | mAmps.reset(src); | |
351 | ||
352 | // Consume the XML Declaration, if present. | |
353 | if (mXMLDecl.lookingAt(fPos, status)) { | |
354 | fPos = mXMLDecl.end(status); | |
355 | } | |
356 | ||
357 | // Consume "misc" [XML production 27] appearing before DocType | |
358 | parseMisc(status); | |
359 | ||
360 | // Consume a DocType declaration, if present. | |
361 | if (mXMLDoctype.lookingAt(fPos, status)) { | |
362 | fPos = mXMLDoctype.end(status); | |
363 | } | |
364 | ||
365 | // Consume additional "misc" [XML production 27] appearing after the DocType | |
366 | parseMisc(status); | |
367 | ||
368 | // Get the root element | |
369 | if (mXMLElemEmpty.lookingAt(fPos, status)) { | |
370 | // Root is an empty element (no nested elements or content) | |
371 | root = createElement(mXMLElemEmpty, status); | |
372 | fPos = mXMLElemEmpty.end(status); | |
373 | } else { | |
374 | if (mXMLElemStart.lookingAt(fPos, status) == FALSE) { | |
375 | error("Root Element expected", status); | |
376 | goto errorExit; | |
377 | } | |
378 | root = createElement(mXMLElemStart, status); | |
379 | UXMLElement *el = root; | |
380 | ||
381 | // | |
382 | // This is the loop that consumes the root element of the document, | |
383 | // including all nested content. Nested elements are handled by | |
384 | // explicit pushes/pops of the element stack; there is no recursion | |
385 | // in the control flow of this code. | |
386 | // "el" always refers to the current element, the one to which content | |
387 | // is being added. It is above the top of the element stack. | |
388 | for (;;) { | |
389 | // Nested Element Start | |
390 | if (mXMLElemStart.lookingAt(fPos, status)) { | |
391 | UXMLElement *t = createElement(mXMLElemStart, status); | |
392 | el->fChildren.addElement(t, status); | |
393 | t->fParent = el; | |
394 | fElementStack.push(el, status); | |
395 | el = t; | |
396 | continue; | |
397 | } | |
398 | ||
399 | // Text Content. String is concatenated onto the current node's content, | |
400 | // but only if it contains something other than spaces. | |
401 | UnicodeString s = scanContent(status); | |
402 | if (s.length() > 0) { | |
403 | mXMLSP.reset(s); | |
404 | if (mXMLSP.matches(status) == FALSE) { | |
405 | // This chunk of text contains something other than just | |
406 | // white space. Make a child node for it. | |
407 | replaceCharRefs(s, status); | |
408 | el->fChildren.addElement(s.clone(), status); | |
409 | } | |
410 | mXMLSP.reset(src); // The matchers need to stay set to the main input string. | |
411 | continue; | |
412 | } | |
413 | ||
414 | // Comments. Discard. | |
415 | if (mXMLComment.lookingAt(fPos, status)) { | |
416 | fPos = mXMLComment.end(status); | |
417 | continue; | |
418 | } | |
419 | ||
420 | // PIs. Discard. | |
421 | if (mXMLPI.lookingAt(fPos, status)) { | |
422 | fPos = mXMLPI.end(status); | |
423 | continue; | |
424 | } | |
425 | ||
426 | // Element End | |
427 | if (mXMLElemEnd.lookingAt(fPos, status)) { | |
428 | fPos = mXMLElemEnd.end(0, status); | |
429 | const UnicodeString name = mXMLElemEnd.group(1, status); | |
430 | if (name != *el->fName) { | |
431 | error("Element start / end tag mismatch", status); | |
432 | goto errorExit; | |
433 | } | |
434 | if (fElementStack.empty()) { | |
435 | // Close of the root element. We're done with the doc. | |
436 | el = NULL; | |
437 | break; | |
438 | } | |
439 | el = (UXMLElement *)fElementStack.pop(); | |
440 | continue; | |
441 | } | |
442 | ||
443 | // Empty Element. Stored as a child of the current element, but not stacked. | |
444 | if (mXMLElemEmpty.lookingAt(fPos, status)) { | |
445 | UXMLElement *t = createElement(mXMLElemEmpty, status); | |
446 | el->fChildren.addElement(t, status); | |
447 | continue; | |
448 | } | |
449 | ||
450 | // Hit something within the document that doesn't match anything. | |
451 | // It's an error. | |
452 | error("Unrecognized markup", status); | |
453 | break; | |
454 | } | |
455 | ||
456 | if (el != NULL || !fElementStack.empty()) { | |
457 | // We bailed out early, for some reason. | |
458 | error("Root element not closed.", status); | |
459 | goto errorExit; | |
460 | } | |
461 | } | |
462 | ||
463 | // Root Element parse is complete. | |
464 | // Consume the annoying xml "Misc" that can appear at the end of the doc. | |
465 | parseMisc(status); | |
466 | ||
467 | // We should have reached the end of the input | |
468 | if (fPos != src.length()) { | |
469 | error("Extra content at the end of the document", status); | |
470 | goto errorExit; | |
471 | } | |
472 | ||
473 | // Success! | |
474 | return root; | |
475 | ||
476 | errorExit: | |
477 | delete root; | |
478 | return NULL; | |
479 | } | |
480 | ||
481 | // | |
482 | // createElement | |
483 | // We've just matched an element start tag. Create and fill in a UXMLElement object | |
484 | // for it. | |
485 | // | |
486 | UXMLElement * | |
487 | UXMLParser::createElement(RegexMatcher &mEl, UErrorCode &status) { | |
488 | // First capture group is the element's name. | |
489 | UXMLElement *el = new UXMLElement(this, intern(mEl.group(1, status), status), status); | |
490 | ||
491 | // Scan for attributes. | |
492 | int32_t pos = mEl.end(1, status); // The position after the end of the tag name | |
493 | ||
494 | while (mAttrValue.lookingAt(pos, status)) { // loop runs once per attribute on this element. | |
495 | UnicodeString attName = mAttrValue.group(1, status); | |
496 | UnicodeString attValue = mAttrValue.group(2, status); | |
497 | ||
498 | // Trim the quotes from the att value. These are left over from the original regex | |
499 | // that parsed the attribue, which couldn't conveniently strip them. | |
500 | attValue.remove(0,1); // one char from the beginning | |
501 | attValue.truncate(attValue.length()-1); // and one from the end. | |
502 | ||
503 | // XML Attribue value normalization. | |
504 | // This is one of the really screwy parts of the XML spec. | |
505 | // See http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize | |
506 | // Note that non-validating parsers must treat all entities as type CDATA | |
507 | // which simplifies things some. | |
508 | ||
509 | // Att normalization step 1: normalize any newlines in the attribute value | |
510 | mNewLineNormalizer.reset(attValue); | |
511 | attValue = mNewLineNormalizer.replaceAll(fOneLF, status); | |
512 | ||
513 | // Next change all xml white space chars to plain \u0020 spaces. | |
514 | mAttrNormalizer.reset(attValue); | |
515 | UnicodeString oneSpace((UChar)0x0020); | |
516 | attValue = mAttrNormalizer.replaceAll(oneSpace, status); | |
517 | ||
518 | // Replace character entities. | |
519 | replaceCharRefs(attValue, status); | |
520 | ||
521 | // Save the attribute name and value in our document structure. | |
522 | el->fAttNames.addElement((void *)intern(attName, status), status); | |
523 | el->fAttValues.addElement(attValue.clone(), status); | |
524 | pos = mAttrValue.end(2, status); | |
525 | } | |
526 | fPos = mEl.end(0, status); | |
527 | return el; | |
528 | } | |
529 | ||
530 | // | |
531 | // parseMisc | |
532 | // Consume XML "Misc" [production #27] | |
533 | // which is any combination of space, PI and comments | |
534 | // Need to watch end-of-input because xml MISC stuff is allowed after | |
535 | // the document element, so we WILL scan off the end in this function | |
536 | // | |
537 | void | |
538 | UXMLParser::parseMisc(UErrorCode &status) { | |
539 | for (;;) { | |
540 | if (fPos >= mXMLPI.input().length()) { | |
541 | break; | |
542 | } | |
543 | if (mXMLPI.lookingAt(fPos, status)) { | |
544 | fPos = mXMLPI.end(status); | |
545 | continue; | |
546 | } | |
547 | if (mXMLSP.lookingAt(fPos, status)) { | |
548 | fPos = mXMLSP.end(status); | |
549 | continue; | |
550 | } | |
551 | if (mXMLComment.lookingAt(fPos, status)) { | |
552 | fPos = mXMLComment.end(status); | |
553 | continue; | |
554 | } | |
555 | break; | |
556 | } | |
557 | } | |
558 | ||
559 | // | |
560 | // Scan for document content. | |
561 | // | |
562 | UnicodeString | |
563 | UXMLParser::scanContent(UErrorCode &status) { | |
564 | UnicodeString result; | |
565 | if (mXMLCharData.lookingAt(fPos, status)) { | |
729e4ab9 | 566 | result = mXMLCharData.group((int32_t)0, status); |
73c04bcf A |
567 | // Normalize the new-lines. (Before char ref substitution) |
568 | mNewLineNormalizer.reset(result); | |
569 | result = mNewLineNormalizer.replaceAll(fOneLF, status); | |
570 | ||
571 | // TODO: handle CDATA | |
572 | fPos = mXMLCharData.end(0, status); | |
573 | } | |
574 | ||
575 | return result; | |
576 | } | |
577 | ||
578 | // | |
579 | // replaceCharRefs | |
580 | // | |
581 | // replace the char entities < & { ካ etc. in a string | |
582 | // with the corresponding actual character. | |
583 | // | |
584 | void | |
585 | UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) { | |
586 | UnicodeString result; | |
587 | UnicodeString replacement; | |
588 | int i; | |
589 | ||
590 | mAmps.reset(s); | |
591 | // See the initialization for the regex matcher mAmps. | |
592 | // Which entity we've matched is determined by which capture group has content, | |
593 | // which is flaged by start() of that group not being -1. | |
594 | while (mAmps.find()) { | |
595 | if (mAmps.start(1, status) != -1) { | |
596 | replacement.setTo((UChar)x_AMP); | |
597 | } else if (mAmps.start(2, status) != -1) { | |
598 | replacement.setTo((UChar)x_LT); | |
599 | } else if (mAmps.start(3, status) != -1) { | |
600 | replacement.setTo((UChar)x_GT); | |
601 | } else if (mAmps.start(4, status) != -1) { | |
602 | replacement.setTo((UChar)x_APOS); | |
603 | } else if (mAmps.start(5, status) != -1) { | |
604 | replacement.setTo((UChar)x_QUOT); | |
605 | } else if (mAmps.start(6, status) != -1) { | |
606 | UnicodeString hexString = mAmps.group(6, status); | |
607 | UChar32 val = 0; | |
608 | for (i=0; i<hexString.length(); i++) { | |
609 | val = (val << 4) + u_digit(hexString.charAt(i), 16); | |
610 | } | |
611 | // TODO: some verification that the character is valid | |
612 | replacement.setTo(val); | |
613 | } else if (mAmps.start(7, status) != -1) { | |
614 | UnicodeString decimalString = mAmps.group(7, status); | |
615 | UChar32 val = 0; | |
616 | for (i=0; i<decimalString.length(); i++) { | |
617 | val = val*10 + u_digit(decimalString.charAt(i), 10); | |
618 | } | |
619 | // TODO: some verification that the character is valid | |
620 | replacement.setTo(val); | |
621 | } else { | |
622 | // An unrecognized &entity; Leave it alone. | |
623 | // TODO: check that it really looks like an entity, and is not some | |
624 | // random & in the text. | |
729e4ab9 | 625 | replacement = mAmps.group((int32_t)0, status); |
73c04bcf A |
626 | } |
627 | mAmps.appendReplacement(result, replacement, status); | |
628 | } | |
629 | mAmps.appendTail(result); | |
630 | s = result; | |
631 | } | |
632 | ||
633 | void | |
634 | UXMLParser::error(const char *message, UErrorCode &status) { | |
635 | // TODO: something better here... | |
636 | const UnicodeString &src=mXMLDecl.input(); | |
637 | int line = 0; | |
638 | int ci = 0; | |
639 | while (ci < fPos && ci>=0) { | |
640 | ci = src.indexOf((UChar)0x0a, ci+1); | |
641 | line++; | |
642 | } | |
643 | fprintf(stderr, "Error: %s at line %d\n", message, line); | |
644 | if (U_SUCCESS(status)) { | |
645 | status = U_PARSE_ERROR; | |
646 | } | |
647 | } | |
648 | ||
649 | // intern strings like in Java | |
650 | ||
651 | const UnicodeString * | |
652 | UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) { | |
653 | const UHashElement *he=fNames.find(s); | |
654 | if(he!=NULL) { | |
655 | // already a known name, return its hashed key pointer | |
656 | return (const UnicodeString *)he->key.pointer; | |
657 | } else { | |
658 | // add this new name and return its hashed key pointer | |
659 | fNames.puti(s, 0, errorCode); | |
660 | he=fNames.find(s); | |
661 | return (const UnicodeString *)he->key.pointer; | |
662 | } | |
663 | } | |
664 | ||
665 | const UnicodeString * | |
666 | UXMLParser::findName(const UnicodeString &s) const { | |
667 | const UHashElement *he=fNames.find(s); | |
668 | if(he!=NULL) { | |
669 | // a known name, return its hashed key pointer | |
670 | return (const UnicodeString *)he->key.pointer; | |
671 | } else { | |
672 | // unknown name | |
673 | return NULL; | |
674 | } | |
675 | } | |
676 | ||
677 | // UXMLElement ------------------------------------------------------------- *** | |
678 | ||
679 | UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) : | |
680 | fParser(parser), | |
681 | fName(name), | |
682 | fAttNames(errorCode), | |
683 | fAttValues(errorCode), | |
684 | fChildren(errorCode), | |
685 | fParent(NULL) | |
686 | { | |
687 | } | |
688 | ||
689 | UXMLElement::~UXMLElement() { | |
690 | int i; | |
691 | // attribute names are owned by the UXMLParser, don't delete them here | |
692 | for (i=fAttValues.size()-1; i>=0; i--) { | |
693 | delete (UObject *)fAttValues.elementAt(i); | |
694 | } | |
695 | for (i=fChildren.size()-1; i>=0; i--) { | |
696 | delete (UObject *)fChildren.elementAt(i); | |
697 | } | |
698 | } | |
699 | ||
700 | const UnicodeString & | |
701 | UXMLElement::getTagName() const { | |
702 | return *fName; | |
703 | } | |
704 | ||
705 | UnicodeString | |
706 | UXMLElement::getText(UBool recurse) const { | |
707 | UnicodeString text; | |
708 | appendText(text, recurse); | |
709 | return text; | |
710 | } | |
711 | ||
712 | void | |
713 | UXMLElement::appendText(UnicodeString &text, UBool recurse) const { | |
714 | const UObject *node; | |
715 | int32_t i, count=fChildren.size(); | |
716 | for(i=0; i<count; ++i) { | |
717 | node=(const UObject *)fChildren.elementAt(i); | |
729e4ab9 A |
718 | const UnicodeString *s=dynamic_cast<const UnicodeString *>(node); |
719 | if(s!=NULL) { | |
720 | text.append(*s); | |
73c04bcf A |
721 | } else if(recurse) /* must be a UXMLElement */ { |
722 | ((const UXMLElement *)node)->appendText(text, recurse); | |
723 | } | |
724 | } | |
725 | } | |
726 | ||
727 | int32_t | |
728 | UXMLElement::countAttributes() const { | |
729 | return fAttNames.size(); | |
730 | } | |
731 | ||
732 | const UnicodeString * | |
733 | UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const { | |
734 | if(0<=i && i<fAttNames.size()) { | |
735 | name.setTo(*(const UnicodeString *)fAttNames.elementAt(i)); | |
736 | value.setTo(*(const UnicodeString *)fAttValues.elementAt(i)); | |
737 | return &value; // or return (UnicodeString *)fAttValues.elementAt(i); | |
738 | } else { | |
739 | return NULL; | |
740 | } | |
741 | } | |
742 | ||
743 | const UnicodeString * | |
744 | UXMLElement::getAttribute(const UnicodeString &name) const { | |
745 | // search for the attribute name by comparing the interned pointer, | |
746 | // not the string contents | |
747 | const UnicodeString *p=fParser->findName(name); | |
748 | if(p==NULL) { | |
749 | return NULL; // no such attribute seen by the parser at all | |
750 | } | |
751 | ||
752 | int32_t i, count=fAttNames.size(); | |
753 | for(i=0; i<count; ++i) { | |
754 | if(p==(const UnicodeString *)fAttNames.elementAt(i)) { | |
755 | return (const UnicodeString *)fAttValues.elementAt(i); | |
756 | } | |
757 | } | |
758 | return NULL; | |
759 | } | |
760 | ||
761 | int32_t | |
762 | UXMLElement::countChildren() const { | |
763 | return fChildren.size(); | |
764 | } | |
765 | ||
766 | const UObject * | |
767 | UXMLElement::getChild(int32_t i, UXMLNodeType &type) const { | |
768 | if(0<=i && i<fChildren.size()) { | |
769 | const UObject *node=(const UObject *)fChildren.elementAt(i); | |
729e4ab9 | 770 | if(dynamic_cast<const UXMLElement *>(node)!=NULL) { |
73c04bcf A |
771 | type=UXML_NODE_TYPE_ELEMENT; |
772 | } else { | |
773 | type=UXML_NODE_TYPE_STRING; | |
774 | } | |
775 | return node; | |
776 | } else { | |
777 | return NULL; | |
778 | } | |
779 | } | |
780 | ||
781 | const UXMLElement * | |
782 | UXMLElement::nextChildElement(int32_t &i) const { | |
783 | if(i<0) { | |
784 | return NULL; | |
785 | } | |
786 | ||
787 | const UObject *node; | |
788 | int32_t count=fChildren.size(); | |
789 | while(i<count) { | |
790 | node=(const UObject *)fChildren.elementAt(i++); | |
729e4ab9 A |
791 | const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node); |
792 | if(elem!=NULL) { | |
793 | return elem; | |
73c04bcf A |
794 | } |
795 | } | |
796 | return NULL; | |
797 | } | |
798 | ||
799 | const UXMLElement * | |
800 | UXMLElement::getChildElement(const UnicodeString &name) const { | |
801 | // search for the element name by comparing the interned pointer, | |
802 | // not the string contents | |
803 | const UnicodeString *p=fParser->findName(name); | |
804 | if(p==NULL) { | |
805 | return NULL; // no such element seen by the parser at all | |
806 | } | |
807 | ||
808 | const UObject *node; | |
809 | int32_t i, count=fChildren.size(); | |
810 | for(i=0; i<count; ++i) { | |
811 | node=(const UObject *)fChildren.elementAt(i); | |
729e4ab9 A |
812 | const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node); |
813 | if(elem!=NULL) { | |
73c04bcf A |
814 | if(p==elem->fName) { |
815 | return elem; | |
816 | } | |
817 | } | |
818 | } | |
819 | return NULL; | |
820 | } | |
821 | ||
822 | U_NAMESPACE_END | |
823 | ||
824 | #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION */ | |
825 |