/*
*******************************************************************************
*
-* Copyright (C) 2004-2006, International Business Machines
+* Copyright (C) 2004-2008, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
// example: "<?xml version=1.0 encoding="utf-16" ?>
// This is a sloppy implementation - just look for the leading <?xml and the closing ?>
// allow for a possible leading BOM.
- mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>"), 0, status),
+ mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status),
// XML Comment production #15
// example: "<!-- whatever -->
// note, does not detect an illegal "--" within comments
- mXMLComment(UnicodeString("(?s)<!--.+?-->"), 0, status),
+ mXMLComment(UnicodeString("(?s)<!--.+?-->", -1, US_INV), 0, status),
// XML Spaces
// production [3]
- mXMLSP(UnicodeString(XML_SPACES "+"), 0, status),
+ mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status),
// XML Doctype decl production #28
// example "<!DOCTYPE foo SYSTEM "somewhere" >
+ // or "<!DOCTYPE foo [internal dtd]>
// TODO: we don't actually parse the DOCTYPE or internal subsets.
// Some internal dtd subsets could confuse this simple-minded
- // attempt at skipping over them.
- mXMLDoctype(UnicodeString("(?s)<!DOCTYPE.+?>"), 0, status),
+ // attempt at skipping over them, specifically, occcurences
+ // of closeing square brackets. These could appear in comments,
+ // or in parameter entity declarations, for example.
+ mXMLDoctype(UnicodeString(
+ "(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)", -1, US_INV
+ ), 0, status),
// XML PI production #16
// example "<?target stuff?>
- mXMLPI(UnicodeString("(?s)<\\?.+?\\?>"), 0, status),
+ mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status),
// XML Element Start Productions #40, #41
// example <foo att1='abc' att2="d e f" >
XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = "
"(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"'
")*" // * for zero or more attributes.
- XML_SPACES "*?>"), 0, status), // match " >"
+ XML_SPACES "*?>", -1, US_INV), 0, status), // match " >"
// XML Element End production #42
// example </foo>
- mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>"), 0, status),
+ mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status),
// XML Element Empty production #44
// example <foo att1="abc" att2="d e f" />
XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = "
"(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"'
")*" // * for zero or more attributes.
- XML_SPACES "*?/>"), 0, status), // match " />"
+ XML_SPACES "*?/>", -1, US_INV), 0, status), // match " />"
// XMLCharData. Everything but '<'. Note that & will be dealt with later.
- mXMLCharData(UnicodeString("(?s)[^<]*"), 0, status),
+ mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status),
// Attribute name = "value". XML Productions 10, 40/41
// Capture group 1 is name,
// Here, we match a single attribute, and make its name and
// attribute value available to the parser code.
mAttrValue(UnicodeString(XML_SPACES "+(" XML_NAME ")" XML_SPACES "*=" XML_SPACES "*"
- "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"), 0, status),
+ "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status),
- mAttrNormalizer(UnicodeString(XML_SPACES), 0, status),
+ mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status),
// Match any of the new-line sequences in content.
// All are changed to \u000a.
- mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028"), 0, status),
+ mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status),
// & char references
// We will figure out what we've got based on which capture group has content.