X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/73c04bcfe1096173b00431f0cdc742894b15eef0..cead9d4c898d4172af6769483c517e3bdb8be2bd:/icuSources/tools/toolutil/xmlparser.cpp
diff --git a/icuSources/tools/toolutil/xmlparser.cpp b/icuSources/tools/toolutil/xmlparser.cpp
index 195fa303..55688320 100644
--- a/icuSources/tools/toolutil/xmlparser.cpp
+++ b/icuSources/tools/toolutil/xmlparser.cpp
@@ -1,7 +1,7 @@
/*
*******************************************************************************
*
-* Copyright (C) 2004-2006, International Business Machines
+* Copyright (C) 2004-2008, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@@ -61,27 +61,32 @@ UXMLParser::UXMLParser(UErrorCode &status) :
// example: "<?xml version="1.0" encoding="utf-8"?>"
// This is a sloppy implementation - just look for the leading "<?xml" and the closing "?>",
// allow for a possible leading BOM.
- mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>"), 0, status),
+ mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status),
// XML Comment production #15
// example: "<!-- whatever -->"
// note, does not detect an illegal "--" within comments
- mXMLComment(UnicodeString("(?s)"), 0, status),
+ mXMLComment(UnicodeString("(?s)", -1, US_INV), 0, status),
// XML Spaces
// production [3]
- mXMLSP(UnicodeString(XML_SPACES "+"), 0, status),
+ mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status),
// XML Doctype decl production #28
// example "<!DOCTYPE foo SYSTEM "somewhere" >"
+ // or "<!DOCTYPE foo [internal dtd]>"
// TODO: we don't actually parse the DOCTYPE or internal subsets.
// Some internal dtd subsets could confuse this simple-minded
- // attempt at skipping over them.
- mXMLDoctype(UnicodeString("(?s)"), 0, status),
+ // attempt at skipping over them, specifically, occurrences
+ // of closing square brackets. These could appear in comments,
+ // or in parameter entity declarations, for example.
+ mXMLDoctype(UnicodeString(
+ "(?s)|\\[.*?\\].*?>)", -1, US_INV
+ ), 0, status),
// XML PI production #16
// example "<?target stuff?>"
- mXMLPI(UnicodeString("(?s)<\\?.+?\\?>"), 0, status),
+ mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status),
// XML Element Start Productions #40, #41
// example <eName attr="value">
@@ -92,11 +97,11 @@ UXMLParser::UXMLParser(UErrorCode &status) :
XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = "
"(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"'
")*" // * for zero or more attributes.
- XML_SPACES "*?>"), 0, status), // match " >"
+ XML_SPACES "*?>", -1, US_INV), 0, status), // match " >"
// XML Element End production #42
// example </eName>
- mXMLElemEnd (UnicodeString("(" XML_NAME ")" XML_SPACES "*>"), 0, status),
+ mXMLElemEnd (UnicodeString("(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status),
// XML Element Empty production #44
// example <eName attr="value"/>
@@ -105,11 +110,11 @@ UXMLParser::UXMLParser(UErrorCode &status) :
XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = "
"(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"'
")*" // * for zero or more attributes.
- XML_SPACES "*?/>"), 0, status), // match " />"
+ XML_SPACES "*?/>", -1, US_INV), 0, status), // match " />"
// XMLCharData. Everything but '<'. Note that & will be dealt with later.
- mXMLCharData(UnicodeString("(?s)[^<]*"), 0, status),
+ mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status),
// Attribute name = "value". XML Productions 10, 40/41
// Capture group 1 is name,
@@ -121,14 +126,14 @@ UXMLParser::UXMLParser(UErrorCode &status) :
// Here, we match a single attribute, and make its name and
// attribute value available to the parser code.
mAttrValue(UnicodeString(XML_SPACES "+(" XML_NAME ")" XML_SPACES "*=" XML_SPACES "*"
- "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"), 0, status),
+ "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status),
- mAttrNormalizer(UnicodeString(XML_SPACES), 0, status),
+ mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status),
// Match any of the new-line sequences in content.
// All are changed to \u000a.
- mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028"), 0, status),
+ mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status),
// & char references
// We will figure out what we've got based on which capture group has content.