X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/73c04bcfe1096173b00431f0cdc742894b15eef0..cead9d4c898d4172af6769483c517e3bdb8be2bd:/icuSources/tools/toolutil/xmlparser.cpp diff --git a/icuSources/tools/toolutil/xmlparser.cpp b/icuSources/tools/toolutil/xmlparser.cpp index 195fa303..55688320 100644 --- a/icuSources/tools/toolutil/xmlparser.cpp +++ b/icuSources/tools/toolutil/xmlparser.cpp @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 2004-2006, International Business Machines +* Copyright (C) 2004-2008, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* @@ -61,27 +61,32 @@ UXMLParser::UXMLParser(UErrorCode &status) : // example: " // This is a sloppy implementation - just look for the leading // allow for a possible leading BOM. - mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>"), 0, status), + mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status), // XML Comment production #15 // example: " // note, does not detect an illegal "--" within comments - mXMLComment(UnicodeString("(?s)"), 0, status), + mXMLComment(UnicodeString("(?s)", -1, US_INV), 0, status), // XML Spaces // production [3] - mXMLSP(UnicodeString(XML_SPACES "+"), 0, status), + mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status), // XML Doctype decl production #28 // example " + // or " // TODO: we don't actually parse the DOCTYPE or internal subsets. // Some internal dtd subsets could confuse this simple-minded - // attempt at skipping over them. - mXMLDoctype(UnicodeString("(?s)"), 0, status), + // attempt at skipping over them, specifically, occurrences + // of closing square brackets. These could appear in comments, + // or in parameter entity declarations, for example. 
+ mXMLDoctype(UnicodeString( + "(?s)|\\[.*?\\].*?>)", -1, US_INV + ), 0, status), // XML PI production #16 // example " - mXMLPI(UnicodeString("(?s)<\\?.+?\\?>"), 0, status), + mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status), // XML Element Start Productions #40, #41 // example @@ -92,11 +97,11 @@ UXMLParser::UXMLParser(UErrorCode &status) : XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = " "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"' ")*" // * for zero or more attributes. - XML_SPACES "*?>"), 0, status), // match " >" + XML_SPACES "*?>", -1, US_INV), 0, status), // match " >" // XML Element End production #42 // example - mXMLElemEnd (UnicodeString(""), 0, status), + mXMLElemEnd (UnicodeString("", -1, US_INV), 0, status), // XML Element Empty production #44 // example @@ -105,11 +110,11 @@ UXMLParser::UXMLParser(UErrorCode &status) : XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = " "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"' ")*" // * for zero or more attributes. - XML_SPACES "*?/>"), 0, status), // match " />" + XML_SPACES "*?/>", -1, US_INV), 0, status), // match " />" // XMLCharData. Everything but '<'. Note that & will be dealt with later. - mXMLCharData(UnicodeString("(?s)[^<]*"), 0, status), + mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status), // Attribute name = "value". XML Productions 10, 40/41 // Capture group 1 is name, @@ -121,14 +126,14 @@ UXMLParser::UXMLParser(UErrorCode &status) : // Here, we match a single attribute, and make its name and // attribute value available to the parser code. 
mAttrValue(UnicodeString(XML_SPACES "+(" XML_NAME ")" XML_SPACES "*=" XML_SPACES "*" - "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"), 0, status), + "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status), - mAttrNormalizer(UnicodeString(XML_SPACES), 0, status), + mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status), // Match any of the new-line sequences in content. // All are changed to \u000a. - mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028"), 0, status), + mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status), // & char references // We will figure out what we've got based on which capture group has content.