X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/73c04bcfe1096173b00431f0cdc742894b15eef0..340931cb2e044a2141d11567dd0f782524e32994:/icuSources/tools/toolutil/xmlparser.cpp diff --git a/icuSources/tools/toolutil/xmlparser.cpp b/icuSources/tools/toolutil/xmlparser.cpp index 195fa303..a9650cc5 100644 --- a/icuSources/tools/toolutil/xmlparser.cpp +++ b/icuSources/tools/toolutil/xmlparser.cpp @@ -1,12 +1,14 @@ +// © 2016 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * -* Copyright (C) 2004-2006, International Business Machines +* Copyright (C) 2004-2010, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: xmlparser.cpp -* encoding: US-ASCII +* encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * @@ -61,27 +63,32 @@ UXMLParser::UXMLParser(UErrorCode &status) : // example: " // This is a sloppy implementation - just look for the leading // allow for a possible leading BOM. - mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>"), 0, status), + mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status), // XML Comment production #15 // example: " // note, does not detect an illegal "--" within comments - mXMLComment(UnicodeString("(?s)"), 0, status), + mXMLComment(UnicodeString("(?s)", -1, US_INV), 0, status), // XML Spaces // production [3] - mXMLSP(UnicodeString(XML_SPACES "+"), 0, status), + mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status), // XML Doctype decl production #28 // example " + // or " // TODO: we don't actually parse the DOCTYPE or internal subsets. // Some internal dtd subsets could confuse this simple-minded - // attempt at skipping over them. - mXMLDoctype(UnicodeString("(?s)"), 0, status), + // attempt at skipping over them, specifically, occcurences + // of closeing square brackets. These could appear in comments, + // or in parameter entity declarations, for example. + mXMLDoctype(UnicodeString( + "(?s)|\\[.*?\\].*?>)", -1, US_INV + ), 0, status), // XML PI production #16 // example " - mXMLPI(UnicodeString("(?s)<\\?.+?\\?>"), 0, status), + mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status), // XML Element Start Productions #40, #41 // example @@ -92,11 +99,11 @@ UXMLParser::UXMLParser(UErrorCode &status) : XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = " "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"' ")*" // * for zero or more attributes. - XML_SPACES "*?>"), 0, status), // match " >" + XML_SPACES "*?>", -1, US_INV), 0, status), // match " >" // XML Element End production #42 // example - mXMLElemEnd (UnicodeString(""), 0, status), + mXMLElemEnd (UnicodeString("", -1, US_INV), 0, status), // XML Element Empty production #44 // example @@ -105,11 +112,11 @@ UXMLParser::UXMLParser(UErrorCode &status) : XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = " "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"' ")*" // * for zero or more attributes. - XML_SPACES "*?/>"), 0, status), // match " />" + XML_SPACES "*?/>", -1, US_INV), 0, status), // match " />" // XMLCharData. Everything but '<'. Note that & will be dealt with later. - mXMLCharData(UnicodeString("(?s)[^<]*"), 0, status), + mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status), // Attribute name = "value". XML Productions 10, 40/41 // Capture group 1 is name, @@ -121,14 +128,14 @@ UXMLParser::UXMLParser(UErrorCode &status) : // Here, we match a single attribute, and make its name and // attribute value available to the parser code. mAttrValue(UnicodeString(XML_SPACES "+(" XML_NAME ")" XML_SPACES "*=" XML_SPACES "*" - "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"), 0, status), + "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status), - mAttrNormalizer(UnicodeString(XML_SPACES), 0, status), + mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status), // Match any of the new-line sequences in content. // All are changed to \u000a. - mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028"), 0, status), + mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status), // & char references // We will figure out what we've got based on which capture group has content. @@ -202,7 +209,7 @@ UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) { goto exit; } - buffer=src.getBuffer(bytesLength); + buffer=toUCharPtr(src.getBuffer(bytesLength)); if(buffer==NULL) { // unexpected failure to reserve some string capacity errorCode=U_MEMORY_ALLOCATION_ERROR; @@ -271,7 +278,7 @@ UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) { pb=bytes; for(;;) { length=src.length(); - buffer=src.getBuffer(capacity); + buffer=toUCharPtr(src.getBuffer(capacity)); if(buffer==NULL) { // unexpected failure to reserve some string capacity errorCode=U_MEMORY_ALLOCATION_ERROR; @@ -306,7 +313,7 @@ UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) { // reached end of file, convert once more to flush the converter flush=TRUE; } - }; + } exit: ucnv_close(cnv); @@ -558,7 +565,7 @@ UnicodeString UXMLParser::scanContent(UErrorCode &status) { UnicodeString result; if (mXMLCharData.lookingAt(fPos, status)) { - result = mXMLCharData.group(0, status); + result = mXMLCharData.group((int32_t)0, status); // Normalize the new-lines. (Before char ref substitution) mNewLineNormalizer.reset(result); result = mNewLineNormalizer.replaceAll(fOneLF, status); @@ -617,7 +624,7 @@ UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) { // An unrecognized &entity; Leave it alone. // TODO: check that it really looks like an entity, and is not some // random & in the text. - replacement = mAmps.group(0, status); + replacement = mAmps.group((int32_t)0, status); } mAmps.appendReplacement(result, replacement, status); } @@ -710,8 +717,9 @@ UXMLElement::appendText(UnicodeString &text, UBool recurse) const { int32_t i, count=fChildren.size(); for(i=0; igetDynamicClassID()==UnicodeString::getStaticClassID()) { - text.append(*(const UnicodeString *)node); + const UnicodeString *s=dynamic_cast(node); + if(s!=NULL) { + text.append(*s); } else if(recurse) /* must be a UXMLElement */ { ((const UXMLElement *)node)->appendText(text, recurse); } @@ -761,7 +769,7 @@ const UObject * UXMLElement::getChild(int32_t i, UXMLNodeType &type) const { if(0<=i && igetDynamicClassID()==UXMLElement::getStaticClassID()) { + if(dynamic_cast(node)!=NULL) { type=UXML_NODE_TYPE_ELEMENT; } else { type=UXML_NODE_TYPE_STRING; @@ -782,10 +790,9 @@ UXMLElement::nextChildElement(int32_t &i) const { int32_t count=fChildren.size(); while(igetDynamicClassID()==UXMLElement::getStaticClassID()) { - return (const UXMLElement *)node; + const UXMLElement *elem=dynamic_cast(node); + if(elem!=NULL) { + return elem; } } return NULL; @@ -804,8 +811,8 @@ UXMLElement::getChildElement(const UnicodeString &name) const { int32_t i, count=fChildren.size(); for(i=0; igetDynamicClassID()==UXMLElement::getStaticClassID()) { - const UXMLElement *elem=(const UXMLElement *)node; + const UXMLElement *elem=dynamic_cast(node); + if(elem!=NULL) { if(p==elem->fName) { return elem; }