]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
73c04bcf A |
3 | /* |
4 | ******************************************************************************* | |
5 | * | |
729e4ab9 | 6 | * Copyright (C) 2004-2010, International Business Machines |
73c04bcf A |
7 | * Corporation and others. All Rights Reserved. |
8 | * | |
9 | ******************************************************************************* | |
10 | * file name: xmlparser.cpp | |
f3c0d7a5 | 11 | * encoding: UTF-8 |
73c04bcf A |
12 | * tab size: 8 (not used) |
13 | * indentation:4 | |
14 | * | |
15 | * created on: 2004jul21 | |
16 | * created by: Andy Heninger | |
17 | */ | |
18 | ||
19 | #include <stdio.h> | |
20 | #include "unicode/uchar.h" | |
21 | #include "unicode/ucnv.h" | |
22 | #include "unicode/regex.h" | |
23 | #include "filestrm.h" | |
24 | #include "xmlparser.h" | |
25 | ||
26 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION | |
27 | ||
28 | // character constants | |
29 | enum { | |
30 | x_QUOT=0x22, | |
31 | x_AMP=0x26, | |
32 | x_APOS=0x27, | |
33 | x_LT=0x3c, | |
34 | x_GT=0x3e, | |
35 | x_l=0x6c | |
36 | }; | |
37 | ||
38 | #define XML_SPACES "[ \\u0009\\u000d\\u000a]" | |
39 | ||
40 | // XML #4 | |
41 | #define XML_NAMESTARTCHAR "[[A-Z]:_[a-z][\\u00c0-\\u00d6][\\u00d8-\\u00f6]" \ | |
42 | "[\\u00f8-\\u02ff][\\u0370-\\u037d][\\u037F-\\u1FFF][\\u200C-\\u200D]" \ | |
43 | "[\\u2070-\\u218F][\\u2C00-\\u2FEF][\\u3001-\\uD7FF][\\uF900-\\uFDCF]" \ | |
44 | "[\\uFDF0-\\uFFFD][\\U00010000-\\U000EFFFF]]" | |
45 | ||
46 | // XML #5 | |
47 | #define XML_NAMECHAR "[" XML_NAMESTARTCHAR "\\-.[0-9]\\u00b7[\\u0300-\\u036f][\\u203f-\\u2040]]" | |
48 | ||
49 | // XML #6 | |
50 | #define XML_NAME XML_NAMESTARTCHAR "(?:" XML_NAMECHAR ")*" | |
51 | ||
52 | U_NAMESPACE_BEGIN | |
53 | ||
54 | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLParser) | |
55 | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UXMLElement) | |
56 | ||
57 | // | |
58 | // UXMLParser constructor. Mostly just initializes the ICU regexes that are | |
59 | // used for parsing. | |
60 | // | |
61 | UXMLParser::UXMLParser(UErrorCode &status) : | |
62 | // XML Declaration. XML Production #23. | |
63 | // example: "<?xml version=1.0 encoding="utf-16" ?> | |
64 | // This is a sloppy implementation - just look for the leading <?xml and the closing ?> | |
65 | // allow for a possible leading BOM. | |
46f4442e | 66 | mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status), |
73c04bcf A |
67 | |
68 | // XML Comment production #15 | |
69 | // example: "<!-- whatever --> | |
70 | // note, does not detect an illegal "--" within comments | |
46f4442e | 71 | mXMLComment(UnicodeString("(?s)<!--.+?-->", -1, US_INV), 0, status), |
73c04bcf A |
72 | |
73 | // XML Spaces | |
74 | // production [3] | |
46f4442e | 75 | mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status), |
73c04bcf A |
76 | |
77 | // XML Doctype decl production #28 | |
78 | // example "<!DOCTYPE foo SYSTEM "somewhere" > | |
46f4442e | 79 | // or "<!DOCTYPE foo [internal dtd]> |
73c04bcf A |
80 | // TODO: we don't actually parse the DOCTYPE or internal subsets. |
81 | // Some internal dtd subsets could confuse this simple-minded | |
46f4442e A |
82 | // attempt at skipping over them, specifically, occcurences |
83 | // of closeing square brackets. These could appear in comments, | |
84 | // or in parameter entity declarations, for example. | |
85 | mXMLDoctype(UnicodeString( | |
86 | "(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)", -1, US_INV | |
87 | ), 0, status), | |
73c04bcf A |
88 | |
89 | // XML PI production #16 | |
90 | // example "<?target stuff?> | |
46f4442e | 91 | mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status), |
73c04bcf A |
92 | |
93 | // XML Element Start Productions #40, #41 | |
94 | // example <foo att1='abc' att2="d e f" > | |
95 | // capture #1: the tag name | |
96 | // | |
97 | mXMLElemStart (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name" | |
98 | "(?:" | |
99 | XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = " | |
100 | "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"' | |
101 | ")*" // * for zero or more attributes. | |
46f4442e | 102 | XML_SPACES "*?>", -1, US_INV), 0, status), // match " >" |
73c04bcf A |
103 | |
104 | // XML Element End production #42 | |
105 | // example </foo> | |
46f4442e | 106 | mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status), |
73c04bcf A |
107 | |
108 | // XML Element Empty production #44 | |
109 | // example <foo att1="abc" att2="d e f" /> | |
110 | mXMLElemEmpty (UnicodeString("(?s)<(" XML_NAME ")" // match "<tag_name" | |
111 | "(?:" | |
112 | XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = " | |
113 | "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"' | |
114 | ")*" // * for zero or more attributes. | |
46f4442e | 115 | XML_SPACES "*?/>", -1, US_INV), 0, status), // match " />" |
73c04bcf A |
116 | |
117 | ||
118 | // XMLCharData. Everything but '<'. Note that & will be dealt with later. | |
46f4442e | 119 | mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status), |
73c04bcf A |
120 | |
121 | // Attribute name = "value". XML Productions 10, 40/41 | |
122 | // Capture group 1 is name, | |
123 | // 2 is the attribute value, including the quotes. | |
124 | // | |
125 | // Note that attributes are scanned twice. The first time is with | |
126 | // the regex for an entire element start. There, the attributes | |
127 | // are checked syntactically, but not separted out one by one. | |
128 | // Here, we match a single attribute, and make its name and | |
129 | // attribute value available to the parser code. | |
130 | mAttrValue(UnicodeString(XML_SPACES "+(" XML_NAME ")" XML_SPACES "*=" XML_SPACES "*" | |
46f4442e | 131 | "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status), |
73c04bcf A |
132 | |
133 | ||
46f4442e | 134 | mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status), |
73c04bcf A |
135 | |
136 | // Match any of the new-line sequences in content. | |
137 | // All are changed to \u000a. | |
46f4442e | 138 | mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status), |
73c04bcf A |
139 | |
140 | // & char references | |
141 | // We will figure out what we've got based on which capture group has content. | |
142 | // The last one is a catchall for unrecognized entity references.. | |
143 | // 1 2 3 4 5 6 7 8 | |
144 | mAmps(UnicodeString("&(?:(amp;)|(lt;)|(gt;)|(apos;)|(quot;)|#x([0-9A-Fa-f]{1,8});|#([0-9]{1,8});|(.))"), | |
145 | 0, status), | |
146 | ||
147 | fNames(status), | |
148 | fElementStack(status), | |
149 | fOneLF((UChar)0x0a) // Plain new-line string, used in new line normalization. | |
150 | { | |
151 | } | |
152 | ||
153 | UXMLParser * | |
154 | UXMLParser::createParser(UErrorCode &errorCode) { | |
155 | if (U_FAILURE(errorCode)) { | |
156 | return NULL; | |
157 | } else { | |
158 | return new UXMLParser(errorCode); | |
159 | } | |
160 | } | |
161 | ||
162 | UXMLParser::~UXMLParser() {} | |
163 | ||
164 | UXMLElement * | |
165 | UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) { | |
166 | char bytes[4096], charsetBuffer[100]; | |
167 | FileStream *f; | |
168 | const char *charset, *pb; | |
169 | UnicodeString src; | |
170 | UConverter *cnv; | |
171 | UChar *buffer, *pu; | |
172 | int32_t fileLength, bytesLength, length, capacity; | |
173 | UBool flush; | |
174 | ||
175 | if(U_FAILURE(errorCode)) { | |
176 | return NULL; | |
177 | } | |
178 | ||
179 | f=T_FileStream_open(filename, "rb"); | |
180 | if(f==NULL) { | |
181 | errorCode=U_FILE_ACCESS_ERROR; | |
182 | return NULL; | |
183 | } | |
184 | ||
185 | bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes)); | |
186 | if(bytesLength<(int32_t)sizeof(bytes)) { | |
187 | // we have already read the entire file | |
188 | fileLength=bytesLength; | |
189 | } else { | |
190 | // get the file length | |
191 | fileLength=T_FileStream_size(f); | |
192 | } | |
193 | ||
194 | /* | |
195 | * get the charset: | |
196 | * 1. Unicode signature | |
197 | * 2. treat as ISO-8859-1 and read XML encoding="charser" | |
198 | * 3. default to UTF-8 | |
199 | */ | |
200 | charset=ucnv_detectUnicodeSignature(bytes, bytesLength, NULL, &errorCode); | |
201 | if(U_SUCCESS(errorCode) && charset!=NULL) { | |
202 | // open converter according to Unicode signature | |
203 | cnv=ucnv_open(charset, &errorCode); | |
204 | } else { | |
205 | // read as Latin-1 and parse the XML declaration and encoding | |
206 | cnv=ucnv_open("ISO-8859-1", &errorCode); | |
207 | if(U_FAILURE(errorCode)) { | |
208 | // unexpected error opening Latin-1 converter | |
209 | goto exit; | |
210 | } | |
211 | ||
f3c0d7a5 | 212 | buffer=toUCharPtr(src.getBuffer(bytesLength)); |
73c04bcf A |
213 | if(buffer==NULL) { |
214 | // unexpected failure to reserve some string capacity | |
215 | errorCode=U_MEMORY_ALLOCATION_ERROR; | |
216 | goto exit; | |
217 | } | |
218 | pb=bytes; | |
219 | pu=buffer; | |
220 | ucnv_toUnicode( | |
221 | cnv, | |
222 | &pu, buffer+src.getCapacity(), | |
223 | &pb, bytes+bytesLength, | |
224 | NULL, TRUE, &errorCode); | |
225 | src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0); | |
226 | ucnv_close(cnv); | |
227 | cnv=NULL; | |
228 | if(U_FAILURE(errorCode)) { | |
229 | // unexpected error in conversion from Latin-1 | |
230 | src.remove(); | |
231 | goto exit; | |
232 | } | |
233 | ||
234 | // parse XML declaration | |
235 | if(mXMLDecl.reset(src).lookingAt(0, errorCode)) { | |
236 | int32_t declEnd=mXMLDecl.end(errorCode); | |
237 | // go beyond <?xml | |
238 | int32_t pos=src.indexOf((UChar)x_l)+1; | |
239 | ||
240 | mAttrValue.reset(src); | |
241 | while(pos<declEnd && mAttrValue.lookingAt(pos, errorCode)) { // loop runs once per attribute on this element. | |
242 | UnicodeString attName = mAttrValue.group(1, errorCode); | |
243 | UnicodeString attValue = mAttrValue.group(2, errorCode); | |
244 | ||
245 | // Trim the quotes from the att value. These are left over from the original regex | |
246 | // that parsed the attribue, which couldn't conveniently strip them. | |
247 | attValue.remove(0,1); // one char from the beginning | |
248 | attValue.truncate(attValue.length()-1); // and one from the end. | |
249 | ||
250 | if(attName==UNICODE_STRING("encoding", 8)) { | |
251 | length=attValue.extract(0, 0x7fffffff, charsetBuffer, (int32_t)sizeof(charsetBuffer)); | |
252 | charset=charsetBuffer; | |
253 | break; | |
254 | } | |
255 | pos = mAttrValue.end(2, errorCode); | |
256 | } | |
257 | ||
258 | if(charset==NULL) { | |
259 | // default to UTF-8 | |
260 | charset="UTF-8"; | |
261 | } | |
262 | cnv=ucnv_open(charset, &errorCode); | |
263 | } | |
264 | } | |
265 | ||
266 | if(U_FAILURE(errorCode)) { | |
267 | // unable to open the converter | |
268 | goto exit; | |
269 | } | |
270 | ||
271 | // convert the file contents | |
272 | capacity=fileLength; // estimated capacity | |
273 | src.getBuffer(capacity); | |
274 | src.releaseBuffer(0); // zero length | |
275 | flush=FALSE; | |
276 | for(;;) { | |
277 | // convert contents of bytes[bytesLength] | |
278 | pb=bytes; | |
279 | for(;;) { | |
280 | length=src.length(); | |
f3c0d7a5 | 281 | buffer=toUCharPtr(src.getBuffer(capacity)); |
73c04bcf A |
282 | if(buffer==NULL) { |
283 | // unexpected failure to reserve some string capacity | |
284 | errorCode=U_MEMORY_ALLOCATION_ERROR; | |
285 | goto exit; | |
286 | } | |
287 | ||
288 | pu=buffer+length; | |
289 | ucnv_toUnicode( | |
290 | cnv, &pu, buffer+src.getCapacity(), | |
291 | &pb, bytes+bytesLength, | |
292 | NULL, FALSE, &errorCode); | |
293 | src.releaseBuffer(U_SUCCESS(errorCode) ? (int32_t)(pu-buffer) : 0); | |
294 | if(errorCode==U_BUFFER_OVERFLOW_ERROR) { | |
295 | errorCode=U_ZERO_ERROR; | |
296 | capacity=(3*src.getCapacity())/2; // increase capacity by 50% | |
297 | } else { | |
298 | break; | |
299 | } | |
300 | } | |
301 | ||
302 | if(U_FAILURE(errorCode)) { | |
303 | break; // conversion error | |
304 | } | |
305 | ||
306 | if(flush) { | |
307 | break; // completely converted the file | |
308 | } | |
309 | ||
310 | // read next block | |
311 | bytesLength=T_FileStream_read(f, bytes, (int32_t)sizeof(bytes)); | |
312 | if(bytesLength==0) { | |
313 | // reached end of file, convert once more to flush the converter | |
314 | flush=TRUE; | |
315 | } | |
316 | }; | |
317 | ||
318 | exit: | |
319 | ucnv_close(cnv); | |
320 | T_FileStream_close(f); | |
321 | ||
322 | if(U_SUCCESS(errorCode)) { | |
323 | return parse(src, errorCode); | |
324 | } else { | |
325 | return NULL; | |
326 | } | |
327 | } | |
328 | ||
329 | UXMLElement * | |
330 | UXMLParser::parse(const UnicodeString &src, UErrorCode &status) { | |
331 | if(U_FAILURE(status)) { | |
332 | return NULL; | |
333 | } | |
334 | ||
335 | UXMLElement *root = NULL; | |
336 | fPos = 0; // TODO use just a local pos variable and pass it into functions | |
337 | // where necessary? | |
338 | ||
339 | // set all matchers to work on the input string | |
340 | mXMLDecl.reset(src); | |
341 | mXMLComment.reset(src); | |
342 | mXMLSP.reset(src); | |
343 | mXMLDoctype.reset(src); | |
344 | mXMLPI.reset(src); | |
345 | mXMLElemStart.reset(src); | |
346 | mXMLElemEnd.reset(src); | |
347 | mXMLElemEmpty.reset(src); | |
348 | mXMLCharData.reset(src); | |
349 | mAttrValue.reset(src); | |
350 | mAttrNormalizer.reset(src); | |
351 | mNewLineNormalizer.reset(src); | |
352 | mAmps.reset(src); | |
353 | ||
354 | // Consume the XML Declaration, if present. | |
355 | if (mXMLDecl.lookingAt(fPos, status)) { | |
356 | fPos = mXMLDecl.end(status); | |
357 | } | |
358 | ||
359 | // Consume "misc" [XML production 27] appearing before DocType | |
360 | parseMisc(status); | |
361 | ||
362 | // Consume a DocType declaration, if present. | |
363 | if (mXMLDoctype.lookingAt(fPos, status)) { | |
364 | fPos = mXMLDoctype.end(status); | |
365 | } | |
366 | ||
367 | // Consume additional "misc" [XML production 27] appearing after the DocType | |
368 | parseMisc(status); | |
369 | ||
370 | // Get the root element | |
371 | if (mXMLElemEmpty.lookingAt(fPos, status)) { | |
372 | // Root is an empty element (no nested elements or content) | |
373 | root = createElement(mXMLElemEmpty, status); | |
374 | fPos = mXMLElemEmpty.end(status); | |
375 | } else { | |
376 | if (mXMLElemStart.lookingAt(fPos, status) == FALSE) { | |
377 | error("Root Element expected", status); | |
378 | goto errorExit; | |
379 | } | |
380 | root = createElement(mXMLElemStart, status); | |
381 | UXMLElement *el = root; | |
382 | ||
383 | // | |
384 | // This is the loop that consumes the root element of the document, | |
385 | // including all nested content. Nested elements are handled by | |
386 | // explicit pushes/pops of the element stack; there is no recursion | |
387 | // in the control flow of this code. | |
388 | // "el" always refers to the current element, the one to which content | |
389 | // is being added. It is above the top of the element stack. | |
390 | for (;;) { | |
391 | // Nested Element Start | |
392 | if (mXMLElemStart.lookingAt(fPos, status)) { | |
393 | UXMLElement *t = createElement(mXMLElemStart, status); | |
394 | el->fChildren.addElement(t, status); | |
395 | t->fParent = el; | |
396 | fElementStack.push(el, status); | |
397 | el = t; | |
398 | continue; | |
399 | } | |
400 | ||
401 | // Text Content. String is concatenated onto the current node's content, | |
402 | // but only if it contains something other than spaces. | |
403 | UnicodeString s = scanContent(status); | |
404 | if (s.length() > 0) { | |
405 | mXMLSP.reset(s); | |
406 | if (mXMLSP.matches(status) == FALSE) { | |
407 | // This chunk of text contains something other than just | |
408 | // white space. Make a child node for it. | |
409 | replaceCharRefs(s, status); | |
410 | el->fChildren.addElement(s.clone(), status); | |
411 | } | |
412 | mXMLSP.reset(src); // The matchers need to stay set to the main input string. | |
413 | continue; | |
414 | } | |
415 | ||
416 | // Comments. Discard. | |
417 | if (mXMLComment.lookingAt(fPos, status)) { | |
418 | fPos = mXMLComment.end(status); | |
419 | continue; | |
420 | } | |
421 | ||
422 | // PIs. Discard. | |
423 | if (mXMLPI.lookingAt(fPos, status)) { | |
424 | fPos = mXMLPI.end(status); | |
425 | continue; | |
426 | } | |
427 | ||
428 | // Element End | |
429 | if (mXMLElemEnd.lookingAt(fPos, status)) { | |
430 | fPos = mXMLElemEnd.end(0, status); | |
431 | const UnicodeString name = mXMLElemEnd.group(1, status); | |
432 | if (name != *el->fName) { | |
433 | error("Element start / end tag mismatch", status); | |
434 | goto errorExit; | |
435 | } | |
436 | if (fElementStack.empty()) { | |
437 | // Close of the root element. We're done with the doc. | |
438 | el = NULL; | |
439 | break; | |
440 | } | |
441 | el = (UXMLElement *)fElementStack.pop(); | |
442 | continue; | |
443 | } | |
444 | ||
445 | // Empty Element. Stored as a child of the current element, but not stacked. | |
446 | if (mXMLElemEmpty.lookingAt(fPos, status)) { | |
447 | UXMLElement *t = createElement(mXMLElemEmpty, status); | |
448 | el->fChildren.addElement(t, status); | |
449 | continue; | |
450 | } | |
451 | ||
452 | // Hit something within the document that doesn't match anything. | |
453 | // It's an error. | |
454 | error("Unrecognized markup", status); | |
455 | break; | |
456 | } | |
457 | ||
458 | if (el != NULL || !fElementStack.empty()) { | |
459 | // We bailed out early, for some reason. | |
460 | error("Root element not closed.", status); | |
461 | goto errorExit; | |
462 | } | |
463 | } | |
464 | ||
465 | // Root Element parse is complete. | |
466 | // Consume the annoying xml "Misc" that can appear at the end of the doc. | |
467 | parseMisc(status); | |
468 | ||
469 | // We should have reached the end of the input | |
470 | if (fPos != src.length()) { | |
471 | error("Extra content at the end of the document", status); | |
472 | goto errorExit; | |
473 | } | |
474 | ||
475 | // Success! | |
476 | return root; | |
477 | ||
478 | errorExit: | |
479 | delete root; | |
480 | return NULL; | |
481 | } | |
482 | ||
483 | // | |
484 | // createElement | |
485 | // We've just matched an element start tag. Create and fill in a UXMLElement object | |
486 | // for it. | |
487 | // | |
488 | UXMLElement * | |
489 | UXMLParser::createElement(RegexMatcher &mEl, UErrorCode &status) { | |
490 | // First capture group is the element's name. | |
491 | UXMLElement *el = new UXMLElement(this, intern(mEl.group(1, status), status), status); | |
492 | ||
493 | // Scan for attributes. | |
494 | int32_t pos = mEl.end(1, status); // The position after the end of the tag name | |
495 | ||
496 | while (mAttrValue.lookingAt(pos, status)) { // loop runs once per attribute on this element. | |
497 | UnicodeString attName = mAttrValue.group(1, status); | |
498 | UnicodeString attValue = mAttrValue.group(2, status); | |
499 | ||
500 | // Trim the quotes from the att value. These are left over from the original regex | |
501 | // that parsed the attribue, which couldn't conveniently strip them. | |
502 | attValue.remove(0,1); // one char from the beginning | |
503 | attValue.truncate(attValue.length()-1); // and one from the end. | |
504 | ||
505 | // XML Attribue value normalization. | |
506 | // This is one of the really screwy parts of the XML spec. | |
507 | // See http://www.w3.org/TR/2004/REC-xml11-20040204/#AVNormalize | |
508 | // Note that non-validating parsers must treat all entities as type CDATA | |
509 | // which simplifies things some. | |
510 | ||
511 | // Att normalization step 1: normalize any newlines in the attribute value | |
512 | mNewLineNormalizer.reset(attValue); | |
513 | attValue = mNewLineNormalizer.replaceAll(fOneLF, status); | |
514 | ||
515 | // Next change all xml white space chars to plain \u0020 spaces. | |
516 | mAttrNormalizer.reset(attValue); | |
517 | UnicodeString oneSpace((UChar)0x0020); | |
518 | attValue = mAttrNormalizer.replaceAll(oneSpace, status); | |
519 | ||
520 | // Replace character entities. | |
521 | replaceCharRefs(attValue, status); | |
522 | ||
523 | // Save the attribute name and value in our document structure. | |
524 | el->fAttNames.addElement((void *)intern(attName, status), status); | |
525 | el->fAttValues.addElement(attValue.clone(), status); | |
526 | pos = mAttrValue.end(2, status); | |
527 | } | |
528 | fPos = mEl.end(0, status); | |
529 | return el; | |
530 | } | |
531 | ||
532 | // | |
533 | // parseMisc | |
534 | // Consume XML "Misc" [production #27] | |
535 | // which is any combination of space, PI and comments | |
536 | // Need to watch end-of-input because xml MISC stuff is allowed after | |
537 | // the document element, so we WILL scan off the end in this function | |
538 | // | |
539 | void | |
540 | UXMLParser::parseMisc(UErrorCode &status) { | |
541 | for (;;) { | |
542 | if (fPos >= mXMLPI.input().length()) { | |
543 | break; | |
544 | } | |
545 | if (mXMLPI.lookingAt(fPos, status)) { | |
546 | fPos = mXMLPI.end(status); | |
547 | continue; | |
548 | } | |
549 | if (mXMLSP.lookingAt(fPos, status)) { | |
550 | fPos = mXMLSP.end(status); | |
551 | continue; | |
552 | } | |
553 | if (mXMLComment.lookingAt(fPos, status)) { | |
554 | fPos = mXMLComment.end(status); | |
555 | continue; | |
556 | } | |
557 | break; | |
558 | } | |
559 | } | |
560 | ||
561 | // | |
562 | // Scan for document content. | |
563 | // | |
564 | UnicodeString | |
565 | UXMLParser::scanContent(UErrorCode &status) { | |
566 | UnicodeString result; | |
567 | if (mXMLCharData.lookingAt(fPos, status)) { | |
729e4ab9 | 568 | result = mXMLCharData.group((int32_t)0, status); |
73c04bcf A |
569 | // Normalize the new-lines. (Before char ref substitution) |
570 | mNewLineNormalizer.reset(result); | |
571 | result = mNewLineNormalizer.replaceAll(fOneLF, status); | |
572 | ||
573 | // TODO: handle CDATA | |
574 | fPos = mXMLCharData.end(0, status); | |
575 | } | |
576 | ||
577 | return result; | |
578 | } | |
579 | ||
580 | // | |
581 | // replaceCharRefs | |
582 | // | |
583 | // replace the char entities < & { ካ etc. in a string | |
584 | // with the corresponding actual character. | |
585 | // | |
586 | void | |
587 | UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) { | |
588 | UnicodeString result; | |
589 | UnicodeString replacement; | |
590 | int i; | |
591 | ||
592 | mAmps.reset(s); | |
593 | // See the initialization for the regex matcher mAmps. | |
594 | // Which entity we've matched is determined by which capture group has content, | |
595 | // which is flaged by start() of that group not being -1. | |
596 | while (mAmps.find()) { | |
597 | if (mAmps.start(1, status) != -1) { | |
598 | replacement.setTo((UChar)x_AMP); | |
599 | } else if (mAmps.start(2, status) != -1) { | |
600 | replacement.setTo((UChar)x_LT); | |
601 | } else if (mAmps.start(3, status) != -1) { | |
602 | replacement.setTo((UChar)x_GT); | |
603 | } else if (mAmps.start(4, status) != -1) { | |
604 | replacement.setTo((UChar)x_APOS); | |
605 | } else if (mAmps.start(5, status) != -1) { | |
606 | replacement.setTo((UChar)x_QUOT); | |
607 | } else if (mAmps.start(6, status) != -1) { | |
608 | UnicodeString hexString = mAmps.group(6, status); | |
609 | UChar32 val = 0; | |
610 | for (i=0; i<hexString.length(); i++) { | |
611 | val = (val << 4) + u_digit(hexString.charAt(i), 16); | |
612 | } | |
613 | // TODO: some verification that the character is valid | |
614 | replacement.setTo(val); | |
615 | } else if (mAmps.start(7, status) != -1) { | |
616 | UnicodeString decimalString = mAmps.group(7, status); | |
617 | UChar32 val = 0; | |
618 | for (i=0; i<decimalString.length(); i++) { | |
619 | val = val*10 + u_digit(decimalString.charAt(i), 10); | |
620 | } | |
621 | // TODO: some verification that the character is valid | |
622 | replacement.setTo(val); | |
623 | } else { | |
624 | // An unrecognized &entity; Leave it alone. | |
625 | // TODO: check that it really looks like an entity, and is not some | |
626 | // random & in the text. | |
729e4ab9 | 627 | replacement = mAmps.group((int32_t)0, status); |
73c04bcf A |
628 | } |
629 | mAmps.appendReplacement(result, replacement, status); | |
630 | } | |
631 | mAmps.appendTail(result); | |
632 | s = result; | |
633 | } | |
634 | ||
635 | void | |
636 | UXMLParser::error(const char *message, UErrorCode &status) { | |
637 | // TODO: something better here... | |
638 | const UnicodeString &src=mXMLDecl.input(); | |
639 | int line = 0; | |
640 | int ci = 0; | |
641 | while (ci < fPos && ci>=0) { | |
642 | ci = src.indexOf((UChar)0x0a, ci+1); | |
643 | line++; | |
644 | } | |
645 | fprintf(stderr, "Error: %s at line %d\n", message, line); | |
646 | if (U_SUCCESS(status)) { | |
647 | status = U_PARSE_ERROR; | |
648 | } | |
649 | } | |
650 | ||
651 | // intern strings like in Java | |
652 | ||
653 | const UnicodeString * | |
654 | UXMLParser::intern(const UnicodeString &s, UErrorCode &errorCode) { | |
655 | const UHashElement *he=fNames.find(s); | |
656 | if(he!=NULL) { | |
657 | // already a known name, return its hashed key pointer | |
658 | return (const UnicodeString *)he->key.pointer; | |
659 | } else { | |
660 | // add this new name and return its hashed key pointer | |
661 | fNames.puti(s, 0, errorCode); | |
662 | he=fNames.find(s); | |
663 | return (const UnicodeString *)he->key.pointer; | |
664 | } | |
665 | } | |
666 | ||
667 | const UnicodeString * | |
668 | UXMLParser::findName(const UnicodeString &s) const { | |
669 | const UHashElement *he=fNames.find(s); | |
670 | if(he!=NULL) { | |
671 | // a known name, return its hashed key pointer | |
672 | return (const UnicodeString *)he->key.pointer; | |
673 | } else { | |
674 | // unknown name | |
675 | return NULL; | |
676 | } | |
677 | } | |
678 | ||
679 | // UXMLElement ------------------------------------------------------------- *** | |
680 | ||
681 | UXMLElement::UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode) : | |
682 | fParser(parser), | |
683 | fName(name), | |
684 | fAttNames(errorCode), | |
685 | fAttValues(errorCode), | |
686 | fChildren(errorCode), | |
687 | fParent(NULL) | |
688 | { | |
689 | } | |
690 | ||
691 | UXMLElement::~UXMLElement() { | |
692 | int i; | |
693 | // attribute names are owned by the UXMLParser, don't delete them here | |
694 | for (i=fAttValues.size()-1; i>=0; i--) { | |
695 | delete (UObject *)fAttValues.elementAt(i); | |
696 | } | |
697 | for (i=fChildren.size()-1; i>=0; i--) { | |
698 | delete (UObject *)fChildren.elementAt(i); | |
699 | } | |
700 | } | |
701 | ||
702 | const UnicodeString & | |
703 | UXMLElement::getTagName() const { | |
704 | return *fName; | |
705 | } | |
706 | ||
707 | UnicodeString | |
708 | UXMLElement::getText(UBool recurse) const { | |
709 | UnicodeString text; | |
710 | appendText(text, recurse); | |
711 | return text; | |
712 | } | |
713 | ||
714 | void | |
715 | UXMLElement::appendText(UnicodeString &text, UBool recurse) const { | |
716 | const UObject *node; | |
717 | int32_t i, count=fChildren.size(); | |
718 | for(i=0; i<count; ++i) { | |
719 | node=(const UObject *)fChildren.elementAt(i); | |
729e4ab9 A |
720 | const UnicodeString *s=dynamic_cast<const UnicodeString *>(node); |
721 | if(s!=NULL) { | |
722 | text.append(*s); | |
73c04bcf A |
723 | } else if(recurse) /* must be a UXMLElement */ { |
724 | ((const UXMLElement *)node)->appendText(text, recurse); | |
725 | } | |
726 | } | |
727 | } | |
728 | ||
729 | int32_t | |
730 | UXMLElement::countAttributes() const { | |
731 | return fAttNames.size(); | |
732 | } | |
733 | ||
734 | const UnicodeString * | |
735 | UXMLElement::getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const { | |
736 | if(0<=i && i<fAttNames.size()) { | |
737 | name.setTo(*(const UnicodeString *)fAttNames.elementAt(i)); | |
738 | value.setTo(*(const UnicodeString *)fAttValues.elementAt(i)); | |
739 | return &value; // or return (UnicodeString *)fAttValues.elementAt(i); | |
740 | } else { | |
741 | return NULL; | |
742 | } | |
743 | } | |
744 | ||
745 | const UnicodeString * | |
746 | UXMLElement::getAttribute(const UnicodeString &name) const { | |
747 | // search for the attribute name by comparing the interned pointer, | |
748 | // not the string contents | |
749 | const UnicodeString *p=fParser->findName(name); | |
750 | if(p==NULL) { | |
751 | return NULL; // no such attribute seen by the parser at all | |
752 | } | |
753 | ||
754 | int32_t i, count=fAttNames.size(); | |
755 | for(i=0; i<count; ++i) { | |
756 | if(p==(const UnicodeString *)fAttNames.elementAt(i)) { | |
757 | return (const UnicodeString *)fAttValues.elementAt(i); | |
758 | } | |
759 | } | |
760 | return NULL; | |
761 | } | |
762 | ||
763 | int32_t | |
764 | UXMLElement::countChildren() const { | |
765 | return fChildren.size(); | |
766 | } | |
767 | ||
768 | const UObject * | |
769 | UXMLElement::getChild(int32_t i, UXMLNodeType &type) const { | |
770 | if(0<=i && i<fChildren.size()) { | |
771 | const UObject *node=(const UObject *)fChildren.elementAt(i); | |
729e4ab9 | 772 | if(dynamic_cast<const UXMLElement *>(node)!=NULL) { |
73c04bcf A |
773 | type=UXML_NODE_TYPE_ELEMENT; |
774 | } else { | |
775 | type=UXML_NODE_TYPE_STRING; | |
776 | } | |
777 | return node; | |
778 | } else { | |
779 | return NULL; | |
780 | } | |
781 | } | |
782 | ||
783 | const UXMLElement * | |
784 | UXMLElement::nextChildElement(int32_t &i) const { | |
785 | if(i<0) { | |
786 | return NULL; | |
787 | } | |
788 | ||
789 | const UObject *node; | |
790 | int32_t count=fChildren.size(); | |
791 | while(i<count) { | |
792 | node=(const UObject *)fChildren.elementAt(i++); | |
729e4ab9 A |
793 | const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node); |
794 | if(elem!=NULL) { | |
795 | return elem; | |
73c04bcf A |
796 | } |
797 | } | |
798 | return NULL; | |
799 | } | |
800 | ||
801 | const UXMLElement * | |
802 | UXMLElement::getChildElement(const UnicodeString &name) const { | |
803 | // search for the element name by comparing the interned pointer, | |
804 | // not the string contents | |
805 | const UnicodeString *p=fParser->findName(name); | |
806 | if(p==NULL) { | |
807 | return NULL; // no such element seen by the parser at all | |
808 | } | |
809 | ||
810 | const UObject *node; | |
811 | int32_t i, count=fChildren.size(); | |
812 | for(i=0; i<count; ++i) { | |
813 | node=(const UObject *)fChildren.elementAt(i); | |
729e4ab9 A |
814 | const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node); |
815 | if(elem!=NULL) { | |
73c04bcf A |
816 | if(p==elem->fName) { |
817 | return elem; | |
818 | } | |
819 | } | |
820 | } | |
821 | return NULL; | |
822 | } | |
823 | ||
824 | U_NAMESPACE_END | |
825 | ||
826 | #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ | |
827 |