+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
-* Copyright (C) 2004-2006, International Business Machines
+* Copyright (C) 2004-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: xmlparser.cpp
-* encoding: US-ASCII
+* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
// example: "<?xml version=1.0 encoding="utf-16" ?>
// This is a sloppy implementation - just look for the leading <?xml and the closing ?>
// allow for a possible leading BOM.
- mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>"), 0, status),
+ mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status),
// XML Comment production #15
// example: "<!-- whatever -->
// note, does not detect an illegal "--" within comments
- mXMLComment(UnicodeString("(?s)<!--.+?-->"), 0, status),
+ mXMLComment(UnicodeString("(?s)<!--.+?-->", -1, US_INV), 0, status),
// XML Spaces
// production [3]
- mXMLSP(UnicodeString(XML_SPACES "+"), 0, status),
+ mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status),
// XML Doctype decl production #28
// example "<!DOCTYPE foo SYSTEM "somewhere" >
+ // or "<!DOCTYPE foo [internal dtd]>
// TODO: we don't actually parse the DOCTYPE or internal subsets.
// Some internal dtd subsets could confuse this simple-minded
- // attempt at skipping over them.
- mXMLDoctype(UnicodeString("(?s)<!DOCTYPE.+?>"), 0, status),
+ // attempt at skipping over them, specifically, occurrences
+ // of closing square brackets. These could appear in comments,
+ // or in parameter entity declarations, for example.
+ mXMLDoctype(UnicodeString(
+ "(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)", -1, US_INV
+ ), 0, status),
// XML PI production #16
// example "<?target stuff?>
- mXMLPI(UnicodeString("(?s)<\\?.+?\\?>"), 0, status),
+ mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status),
// XML Element Start Productions #40, #41
// example <foo att1='abc' att2="d e f" >
XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = "
"(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"'
")*" // * for zero or more attributes.
- XML_SPACES "*?>"), 0, status), // match " >"
+ XML_SPACES "*?>", -1, US_INV), 0, status), // match " >"
// XML Element End production #42
// example </foo>
- mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>"), 0, status),
+ mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status),
// XML Element Empty production #44
// example <foo att1="abc" att2="d e f" />
XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*" // match "ATTR_NAME = "
"(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))" // match '"attribute value"'
")*" // * for zero or more attributes.
- XML_SPACES "*?/>"), 0, status), // match " />"
+ XML_SPACES "*?/>", -1, US_INV), 0, status), // match " />"
// XMLCharData. Everything but '<'. Note that & will be dealt with later.
- mXMLCharData(UnicodeString("(?s)[^<]*"), 0, status),
+ mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status),
// Attribute name = "value". XML Productions 10, 40/41
// Capture group 1 is name,
// Here, we match a single attribute, and make its name and
// attribute value available to the parser code.
mAttrValue(UnicodeString(XML_SPACES "+(" XML_NAME ")" XML_SPACES "*=" XML_SPACES "*"
- "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"), 0, status),
+ "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status),
- mAttrNormalizer(UnicodeString(XML_SPACES), 0, status),
+ mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status),
// Match any of the new-line sequences in content.
// All are changed to \u000a.
- mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028"), 0, status),
+ mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status),
// & char references
// We will figure out what we've got based on which capture group has content.
goto exit;
}
- buffer=src.getBuffer(bytesLength);
+ buffer=toUCharPtr(src.getBuffer(bytesLength));
if(buffer==NULL) {
// unexpected failure to reserve some string capacity
errorCode=U_MEMORY_ALLOCATION_ERROR;
pb=bytes;
for(;;) {
length=src.length();
- buffer=src.getBuffer(capacity);
+ buffer=toUCharPtr(src.getBuffer(capacity));
if(buffer==NULL) {
// unexpected failure to reserve some string capacity
errorCode=U_MEMORY_ALLOCATION_ERROR;
// reached end of file, convert once more to flush the converter
flush=TRUE;
}
- };
+ }
exit:
ucnv_close(cnv);
UXMLParser::scanContent(UErrorCode &status) {
UnicodeString result;
if (mXMLCharData.lookingAt(fPos, status)) {
- result = mXMLCharData.group(0, status);
+ result = mXMLCharData.group((int32_t)0, status);
// Normalize the new-lines. (Before char ref substitution)
mNewLineNormalizer.reset(result);
result = mNewLineNormalizer.replaceAll(fOneLF, status);
// An unrecognized &entity; Leave it alone.
// TODO: check that it really looks like an entity, and is not some
// random & in the text.
- replacement = mAmps.group(0, status);
+ replacement = mAmps.group((int32_t)0, status);
}
mAmps.appendReplacement(result, replacement, status);
}
int32_t i, count=fChildren.size();
for(i=0; i<count; ++i) {
node=(const UObject *)fChildren.elementAt(i);
- if(node->getDynamicClassID()==UnicodeString::getStaticClassID()) {
- text.append(*(const UnicodeString *)node);
+ const UnicodeString *s=dynamic_cast<const UnicodeString *>(node);
+ if(s!=NULL) {
+ text.append(*s);
} else if(recurse) /* must be a UXMLElement */ {
((const UXMLElement *)node)->appendText(text, recurse);
}
UXMLElement::getChild(int32_t i, UXMLNodeType &type) const {
if(0<=i && i<fChildren.size()) {
const UObject *node=(const UObject *)fChildren.elementAt(i);
- if(node->getDynamicClassID()==UXMLElement::getStaticClassID()) {
+ if(dynamic_cast<const UXMLElement *>(node)!=NULL) {
type=UXML_NODE_TYPE_ELEMENT;
} else {
type=UXML_NODE_TYPE_STRING;
int32_t count=fChildren.size();
while(i<count) {
node=(const UObject *)fChildren.elementAt(i++);
- // TODO: see if ICU can use C++ instanceof instead of its own poor man's RTTI
- // if(node instanceof UXMLElement) {
- if(node->getDynamicClassID()==UXMLElement::getStaticClassID()) {
- return (const UXMLElement *)node;
+ const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node);
+ if(elem!=NULL) {
+ return elem;
}
}
return NULL;
int32_t i, count=fChildren.size();
for(i=0; i<count; ++i) {
node=(const UObject *)fChildren.elementAt(i);
- if(node->getDynamicClassID()==UXMLElement::getStaticClassID()) {
- const UXMLElement *elem=(const UXMLElement *)node;
+ const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node);
+ if(elem!=NULL) {
if(p==elem->fName) {
return elem;
}