ICU-66108.tar.gz

[apple/icu.git] / icuSources / tools / toolutil / xmlparser.cpp
diff --git a/icuSources/tools/toolutil/xmlparser.cpp b/icuSources/tools/toolutil/xmlparser.cpp

index 195fa30385c9f26422e5ccbffa1c045e9bdec59d..a9650cc599927f26be11a8f1d014ded0c534e312 100644 (file)
--- a/icuSources/tools/toolutil/xmlparser.cpp
+++ b/icuSources/tools/toolutil/xmlparser.cpp
@@ -1,12 +1,14 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
  /*
  *******************************************************************************
  *
-*   Copyright (C) 2004-2006, International Business Machines
+*   Copyright (C) 2004-2010, International Business Machines
  *   Corporation and others.  All Rights Reserved.
  *
  *******************************************************************************
  *   file name:  xmlparser.cpp
-*   encoding:   US-ASCII
+*   encoding:   UTF-8
  *   tab size:   8 (not used)
  *   indentation:4
  *
@@ -61,27 +63,32 @@ UXMLParser::UXMLParser(UErrorCode &status) :
        //      example:  "<?xml version=1.0 encoding="utf-16" ?>
        //      This is a sloppy implementation - just look for the leading <?xml and the closing ?>
        //            allow for a possible leading BOM.
-      mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>"), 0, status),
+      mXMLDecl(UnicodeString("(?s)\\uFEFF?<\\?xml.+?\\?>", -1, US_INV), 0, status),
        
        //  XML Comment   production #15
        //     example:  "<!-- whatever -->
        //       note, does not detect an illegal "--" within comments
-      mXMLComment(UnicodeString("(?s)<!--.+?-->"), 0, status),
+      mXMLComment(UnicodeString("(?s)<!--.+?-->", -1, US_INV), 0, status),
        
        //  XML Spaces
        //      production [3]
-      mXMLSP(UnicodeString(XML_SPACES "+"), 0, status),
+      mXMLSP(UnicodeString(XML_SPACES "+", -1, US_INV), 0, status),
        
        //  XML Doctype decl  production #28
        //     example   "<!DOCTYPE foo SYSTEM "somewhere" >
+      //       or      "<!DOCTYPE foo [internal dtd]>
        //    TODO:  we don't actually parse the DOCTYPE or internal subsets.
        //           Some internal dtd subsets could confuse this simple-minded
-      //           attempt at skipping over them.
-      mXMLDoctype(UnicodeString("(?s)<!DOCTYPE.+?>"), 0, status),
+      //           attempt at skipping over them, specifically, occurrences
+      //           of closing square brackets.  These could appear in comments, 
+      //           or in parameter entity declarations, for example.
+      mXMLDoctype(UnicodeString(
+           "(?s)<!DOCTYPE.*?(>|\\[.*?\\].*?>)", -1, US_INV
+           ), 0, status),
        
        //  XML PI     production #16
        //     example   "<?target stuff?>
-      mXMLPI(UnicodeString("(?s)<\\?.+?\\?>"), 0, status),
+      mXMLPI(UnicodeString("(?s)<\\?.+?\\?>", -1, US_INV), 0, status),
        
        //  XML Element Start   Productions #40, #41
        //          example   <foo att1='abc'  att2="d e f" >
@@ -92,11 +99,11 @@ UXMLParser::UXMLParser(UErrorCode &status) :
                  XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
                  "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
            ")*"                                                             //   * for zero or more attributes.
-          XML_SPACES "*?>"), 0, status),                               // match " >"
+          XML_SPACES "*?>", -1, US_INV), 0, status),                               // match " >"
        
        //  XML Element End     production #42
        //     example   </foo>
-      mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>"), 0, status),
+      mXMLElemEnd (UnicodeString("</(" XML_NAME ")" XML_SPACES "*>", -1, US_INV), 0, status),
        
        // XML Element Empty    production #44
        //     example   <foo att1="abc"   att2="d e f" />
@@ -105,11 +112,11 @@ UXMLParser::UXMLParser(UErrorCode &status) :
                  XML_SPACES "+" XML_NAME XML_SPACES "*=" XML_SPACES "*"     // match  "ATTR_NAME = "
                  "(?:(?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"        // match  '"attribute value"'
            ")*"                                                             //   * for zero or more attributes.
-          XML_SPACES "*?/>"), 0, status),                              // match " />"
+          XML_SPACES "*?/>", -1, US_INV), 0, status),                              // match " />"
        
  
        // XMLCharData.  Everything but '<'.  Note that & will be dealt with later.
-      mXMLCharData(UnicodeString("(?s)[^<]*"), 0, status),
+      mXMLCharData(UnicodeString("(?s)[^<]*", -1, US_INV), 0, status),
  
        // Attribute name = "value".  XML Productions 10, 40/41
        //  Capture group 1 is name, 
@@ -121,14 +128,14 @@ UXMLParser::UXMLParser(UErrorCode &status) :
        //        Here, we match a single attribute, and make its name and
        //        attribute value available to the parser code.
        mAttrValue(UnicodeString(XML_SPACES "+("  XML_NAME ")"  XML_SPACES "*=" XML_SPACES "*"
-         "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))"), 0, status),
+         "((?:\\\'[^<\\\']*?\\\')|(?:\\\"[^<\\\"]*?\\\"))", -1, US_INV), 0, status),
  
  
-      mAttrNormalizer(UnicodeString(XML_SPACES), 0, status),
+      mAttrNormalizer(UnicodeString(XML_SPACES, -1, US_INV), 0, status),
  
        // Match any of the new-line sequences in content.
        //   All are changed to \u000a.
-      mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028"), 0, status),
+      mNewLineNormalizer(UnicodeString("\\u000d\\u000a|\\u000d\\u0085|\\u000a|\\u000d|\\u0085|\\u2028", -1, US_INV), 0, status),
  
        // & char references
        //   We will figure out what we've got based on which capture group has content.
@@ -202,7 +209,7 @@ UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) {
              goto exit;
          }
  
-        buffer=src.getBuffer(bytesLength);
+        buffer=toUCharPtr(src.getBuffer(bytesLength));
          if(buffer==NULL) {
              // unexpected failure to reserve some string capacity
              errorCode=U_MEMORY_ALLOCATION_ERROR;
@@ -271,7 +278,7 @@ UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) {
          pb=bytes;
          for(;;) {
              length=src.length();
-            buffer=src.getBuffer(capacity);
+            buffer=toUCharPtr(src.getBuffer(capacity));
              if(buffer==NULL) {
                  // unexpected failure to reserve some string capacity
                  errorCode=U_MEMORY_ALLOCATION_ERROR;
@@ -306,7 +313,7 @@ UXMLParser::parseFile(const char *filename, UErrorCode &errorCode) {
              // reached end of file, convert once more to flush the converter
              flush=TRUE;
          }
-    };
+    }
  
  exit:
      ucnv_close(cnv);
@@ -558,7 +565,7 @@ UnicodeString
  UXMLParser::scanContent(UErrorCode &status) {
      UnicodeString  result;
      if (mXMLCharData.lookingAt(fPos, status)) {
-        result = mXMLCharData.group(0, status);
+        result = mXMLCharData.group((int32_t)0, status);
          // Normalize the new-lines.  (Before char ref substitution)
          mNewLineNormalizer.reset(result);
          result = mNewLineNormalizer.replaceAll(fOneLF, status);
@@ -617,7 +624,7 @@ UXMLParser::replaceCharRefs(UnicodeString &s, UErrorCode &status) {
              // An unrecognized &entity;  Leave it alone.
              //  TODO:  check that it really looks like an entity, and is not some
              //         random & in the text.
-            replacement = mAmps.group(0, status);
+            replacement = mAmps.group((int32_t)0, status);
          }
          mAmps.appendReplacement(result, replacement, status);
      }
@@ -710,8 +717,9 @@ UXMLElement::appendText(UnicodeString &text, UBool recurse) const {
      int32_t i, count=fChildren.size();
      for(i=0; i<count; ++i) {
          node=(const UObject *)fChildren.elementAt(i);
-        if(node->getDynamicClassID()==UnicodeString::getStaticClassID()) {
-            text.append(*(const UnicodeString *)node);
+        const UnicodeString *s=dynamic_cast<const UnicodeString *>(node);
+        if(s!=NULL) {
+            text.append(*s);
          } else if(recurse) /* must be a UXMLElement */ {
              ((const UXMLElement *)node)->appendText(text, recurse);
          }
@@ -761,7 +769,7 @@ const UObject *
  UXMLElement::getChild(int32_t i, UXMLNodeType &type) const {
      if(0<=i && i<fChildren.size()) {
          const UObject *node=(const UObject *)fChildren.elementAt(i);
-        if(node->getDynamicClassID()==UXMLElement::getStaticClassID()) {
+        if(dynamic_cast<const UXMLElement *>(node)!=NULL) {
              type=UXML_NODE_TYPE_ELEMENT;
          } else {
              type=UXML_NODE_TYPE_STRING;
@@ -782,10 +790,9 @@ UXMLElement::nextChildElement(int32_t &i) const {
      int32_t count=fChildren.size();
      while(i<count) {
          node=(const UObject *)fChildren.elementAt(i++);
-        // TODO: see if ICU can use C++ instanceof instead of its own poor man's RTTI
-        // if(node instanceof UXMLElement) {
-        if(node->getDynamicClassID()==UXMLElement::getStaticClassID()) {
-            return (const UXMLElement *)node;
+        const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node);
+        if(elem!=NULL) {
+            return elem;
          }
      }
      return NULL;
@@ -804,8 +811,8 @@ UXMLElement::getChildElement(const UnicodeString &name) const {
      int32_t i, count=fChildren.size();
      for(i=0; i<count; ++i) {
          node=(const UObject *)fChildren.elementAt(i);
-        if(node->getDynamicClassID()==UXMLElement::getStaticClassID()) {
-            const UXMLElement *elem=(const UXMLElement *)node;
+        const UXMLElement *elem=dynamic_cast<const UXMLElement *>(node);
+        if(elem!=NULL) {
              if(p==elem->fName) {
                  return elem;
              }