ICU-59180.0.1.tar.gz

[apple/icu.git] / icuSources / i18n / rbt_pars.cpp
diff --git a/icuSources/i18n/rbt_pars.cpp b/icuSources/i18n/rbt_pars.cpp

index f1c2cb97625682a9b486e6fa275187b2f3303fd7..6ed89b1fc584c9584630c4ec7f1573e744ccda7b 100644 (file)
--- a/icuSources/i18n/rbt_pars.cpp
+++ b/icuSources/i18n/rbt_pars.cpp
@@ -1,6 +1,8 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
  /*
   **********************************************************************
- *   Copyright (C) 1999-2006, International Business Machines
+ *   Copyright (C) 1999-2016, International Business Machines
   *   Corporation and others.  All Rights Reserved.
   **********************************************************************
   *   Date        Name        Description
@@ -19,6 +21,7 @@
  #include "unicode/uchar.h"
  #include "unicode/ustring.h"
  #include "unicode/uniset.h"
+#include "unicode/utf16.h"
  #include "cstring.h"
  #include "funcrepl.h"
  #include "hash.h"
@@ -33,6 +36,7 @@
  #include "tridpars.h"
  #include "uvector.h"
  #include "hash.h"
+#include "patternprops.h"
  #include "util.h"
  #include "cmemory.h"
  #include "uprops.h"
@@ -142,6 +146,8 @@ public:
                const UVector* variablesVector = 0,
                const Hashtable* variableNames = 0);
  
+    virtual ~ParseData();
+
      virtual const UnicodeString* lookup(const UnicodeString& s) const;
  
      virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const;
@@ -170,6 +176,8 @@ ParseData::ParseData(const TransliterationRuleData* d,
                       const Hashtable* vNames) :
      data(d), variablesVector(sets), variableNames(vNames) {}
  
+ParseData::~ParseData() {}
+
  /**
   * Implement SymbolTable API.
   */
@@ -356,7 +364,7 @@ RuleHalf::~RuleHalf() {
  int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status) {
      int32_t start = pos;
      text.truncate(0);
-    pos = parseSection(rule, pos, limit, text, ILLEGAL_TOP, FALSE, status);
+    pos = parseSection(rule, pos, limit, text, UnicodeString(TRUE, ILLEGAL_TOP, -1), FALSE, status);
  
      if (cursorOffset > 0 && cursor != cursorOffsetPos) {
          return syntaxError(U_MISPLACED_CURSOR_OFFSET, rule, start, status);
@@ -406,7 +414,7 @@ int32_t RuleHalf::parseSection(const UnicodeString& rule, int32_t pos, int32_t l
          // Since all syntax characters are in the BMP, fetching
          // 16-bit code units suffices here.
          UChar c = rule.charAt(pos++);
-        if (uprv_isRuleWhiteSpace(c)) {
+        if (PatternProps::isWhiteSpace(c)) {
              // Ignore whitespace.  Note that this is not Unicode
              // spaces, but Java spaces -- a subset, representing
              // whitespace likely to be seen in code.
@@ -521,7 +529,7 @@ int32_t RuleHalf::parseSection(const UnicodeString& rule, int32_t pos, int32_t l
                  int32_t segmentNumber = nextSegmentNumber++; // 1-based
                  
                  // Parse the segment
-                pos = parseSection(rule, pos, limit, buf, ILLEGAL_SEG, TRUE, status);
+                pos = parseSection(rule, pos, limit, buf, UnicodeString(TRUE, ILLEGAL_SEG, -1), TRUE, status);
                  
                  // After parsing a segment, the relevant characters are
                  // in buf, starting at offset bufSegStart.  Extract them
@@ -530,6 +538,9 @@ int32_t RuleHalf::parseSection(const UnicodeString& rule, int32_t pos, int32_t l
                  StringMatcher* m =
                      new StringMatcher(buf, bufSegStart, buf.length(),
                                        segmentNumber, *parser.curData);
+                if (m == NULL) {
+                    return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status);
+                }
                  
                  // Record and associate object and segment number
                  parser.setSegmentObject(segmentNumber, m, status);
@@ -560,7 +571,7 @@ int32_t RuleHalf::parseSection(const UnicodeString& rule, int32_t pos, int32_t l
                  int32_t bufSegStart = buf.length();
                  
                  // Parse the segment
-                pos = parseSection(rule, iref, limit, buf, ILLEGAL_FUNC, TRUE, status);
+                pos = parseSection(rule, iref, limit, buf, UnicodeString(TRUE, ILLEGAL_FUNC, -1), TRUE, status);
                  
                  // After parsing a segment, the relevant characters are
                  // in buf, starting at offset bufSegStart.
@@ -568,6 +579,9 @@ int32_t RuleHalf::parseSection(const UnicodeString& rule, int32_t pos, int32_t l
                  buf.extractBetween(bufSegStart, buf.length(), output);
                  FunctionReplacer *r =
                      new FunctionReplacer(t, new StringReplacer(output, parser.curData));
+                if (r == NULL) {
+                    return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status);
+                }
                  
                  // Replace the buffer contents with a stand-in
                  buf.truncate(bufSegStart);
@@ -659,6 +673,9 @@ int32_t RuleHalf::parseSection(const UnicodeString& rule, int32_t pos, int32_t l
  
                  UnicodeFunctor *m =
                      new StringMatcher(buf, qstart, qlimit, 0, *parser.curData);
+                if (m == NULL) {
+                    return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status);
+                }
                  int32_t min = 0;
                  int32_t max = Quantifier::MAX;
                  switch (c) {
@@ -673,6 +690,9 @@ int32_t RuleHalf::parseSection(const UnicodeString& rule, int32_t pos, int32_t l
                  //    do nothing -- min, max already set
                  }
                  m = new Quantifier(m, min, max);
+                if (m == NULL) {
+                    return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status);
+                }
                  buf.truncate(qstart);
                  buf.append(parser.generateStandInFor(m, status));
              }
@@ -776,7 +796,7 @@ void RuleHalf::removeContext() {
  UBool RuleHalf::isValidOutput(TransliteratorParser& transParser) {
      for (int32_t i=0; i<text.length(); ) {
          UChar32 c = text.char32At(i);
-        i += UTF_CHAR_LENGTH(c);
+        i += U16_LENGTH(c);
          if (!transParser.parseData->isReplacer(c)) {
              return FALSE;
          }
@@ -791,7 +811,7 @@ UBool RuleHalf::isValidOutput(TransliteratorParser& transParser) {
  UBool RuleHalf::isValidInput(TransliteratorParser& transParser) {
      for (int32_t i=0; i<text.length(); ) {
          UChar32 c = text.char32At(i);
-        i += UTF_CHAR_LENGTH(c);
+        i += U16_LENGTH(c);
          if (!transParser.parseData->isMatcher(c)) {
              return FALSE;
          }
@@ -812,11 +832,11 @@ idBlockVector(statusReturn),
  variablesVector(statusReturn),
  segmentObjects(statusReturn)
  {
-    idBlockVector.setDeleter(uhash_deleteUnicodeString);
+    idBlockVector.setDeleter(uprv_deleteUObject);
      curData = NULL;
      compoundFilter = NULL;
      parseData = NULL;
-    variableNames.setValueDeleter(uhash_deleteUnicodeString);
+    variableNames.setValueDeleter(uprv_deleteUObject);
  }
  
  /**
@@ -865,10 +885,11 @@ UnicodeSet* TransliteratorParser::orphanCompoundFilter() {
   */
  void TransliteratorParser::parseRules(const UnicodeString& rule,
                                        UTransDirection theDirection,
-                                      UErrorCode& status) {
+                                      UErrorCode& status)
+{
      // Clear error struct
+    uprv_memset(&parseError, 0, sizeof(parseError));
      parseError.line = parseError.offset = -1;
-    parseError.preContext[0] = parseError.postContext[0] = (UChar)0;
  
      UBool parsingIDs = TRUE;
      int32_t ruleCount = 0;
@@ -900,6 +921,7 @@ void TransliteratorParser::parseRules(const UnicodeString& rule,
  
      dotStandIn = (UChar) -1;
  
+    UnicodeString *tempstr = NULL; // used for memory allocation error checking
      UnicodeString str; // scratch
      UnicodeString idBlockResult;
      int32_t pos = 0;
@@ -915,7 +937,7 @@ void TransliteratorParser::parseRules(const UnicodeString& rule,
  
      while (pos < limit && U_SUCCESS(status)) {
          UChar c = rule.charAt(pos++);
-        if (uprv_isRuleWhiteSpace(c)) {
+        if (PatternProps::isWhiteSpace(c)) {
              // Ignore leading whitespace.
              continue;
          }
@@ -944,7 +966,7 @@ void TransliteratorParser::parseRules(const UnicodeString& rule,
                  rule.compare(pos, ID_TOKEN_LEN, ID_TOKEN) == 0) {
              pos += ID_TOKEN_LEN;
              c = rule.charAt(pos);
-            while (uprv_isRuleWhiteSpace(c) && pos < limit) {
+            while (PatternProps::isWhiteSpace(c) && pos < limit) {
                  ++pos;
                  c = rule.charAt(pos);
              }
@@ -1003,13 +1025,24 @@ void TransliteratorParser::parseRules(const UnicodeString& rule,
              pos = p;
          } else {
              if (parsingIDs) {
+                tempstr = new UnicodeString(idBlockResult);
+                // NULL pointer check
+                if (tempstr == NULL) {
+                    status = U_MEMORY_ALLOCATION_ERROR;
+                    return;
+                }
                  if (direction == UTRANS_FORWARD)
-                    idBlockVector.addElement(new UnicodeString(idBlockResult), status);
+                    idBlockVector.addElement(tempstr, status);
                  else
-                    idBlockVector.insertElementAt(new UnicodeString(idBlockResult), 0, status);
+                    idBlockVector.insertElementAt(tempstr, 0, status);
                  idBlockResult.remove();
                  parsingIDs = FALSE;
                  curData = new TransliterationRuleData(status);
+                // NULL pointer check
+                if (curData == NULL) {
+                    status = U_MEMORY_ALLOCATION_ERROR;
+                    return;
+                }
                  parseData->data = curData;
  
                  // By default, rules use part of the private use area
@@ -1033,10 +1066,16 @@ void TransliteratorParser::parseRules(const UnicodeString& rule,
      }
  
      if (parsingIDs && idBlockResult.length() > 0) {
+        tempstr = new UnicodeString(idBlockResult);
+        // NULL pointer check
+        if (tempstr == NULL) {
+            status = U_MEMORY_ALLOCATION_ERROR;
+            return;
+        }
          if (direction == UTRANS_FORWARD)
-            idBlockVector.addElement(new UnicodeString(idBlockResult), status);
+            idBlockVector.addElement(tempstr, status);
          else
-            idBlockVector.insertElementAt(new UnicodeString(idBlockResult), 0, status);
+            idBlockVector.insertElementAt(tempstr, 0, status);
      }
      else if (!parsingIDs && curData != NULL) {
          if (direction == UTRANS_FORWARD)
@@ -1055,20 +1094,30 @@ void TransliteratorParser::parseRules(const UnicodeString& rule,
                  data->variables = 0;
              } else {
                  data->variables = (UnicodeFunctor**)uprv_malloc(data->variablesLength * sizeof(UnicodeFunctor*));
+                // NULL pointer check
+                if (data->variables == NULL) {
+                    status = U_MEMORY_ALLOCATION_ERROR;
+                    return;
+                }
                  data->variablesAreOwned = (i == 0);
              }
  
              for (int32_t j = 0; j < data->variablesLength; j++) {
                  data->variables[j] =
-                    ((UnicodeSet*)variablesVector.elementAt(j));
+                    static_cast<UnicodeFunctor *>(variablesVector.elementAt(j));
              }
              
              data->variableNames.removeAll();
-            int32_t pos = -1;
+            int32_t pos = UHASH_FIRST;
              const UHashElement* he = variableNames.nextElement(pos);
              while (he != NULL) {
+                UnicodeString* tempus = (UnicodeString*)(((UnicodeString*)(he->value.pointer))->clone());
+                if (tempus == NULL) {
+                    status = U_MEMORY_ALLOCATION_ERROR;
+                    return;
+                }
                  data->variableNames.put(*((UnicodeString*)(he->key.pointer)),
-                    ((UnicodeString*)(he->value.pointer))->clone(), status);
+                    tempus, status);
                  he = variableNames.nextElement(pos);
              }
          }
@@ -1151,7 +1200,7 @@ static const UChar PRAGMA_NFC_RULES[] = {0x7E,0x6E,0x66,0x63,0x20,0x72,0x75,0x6C
   */
  UBool TransliteratorParser::resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit) {
      // Must start with /use\s/i
-    return ICU_Utility::parsePattern(rule, pos, limit, PRAGMA_USE, NULL) >= 0;
+    return ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(TRUE, PRAGMA_USE, 4), NULL) >= 0;
  }
  
  /**
@@ -1176,25 +1225,25 @@ int32_t TransliteratorParser::parsePragma(const UnicodeString& rule, int32_t pos
      // use maximum backup 16;
      // use nfd rules;
      // use nfc rules;
-    int p = ICU_Utility::parsePattern(rule, pos, limit, PRAGMA_VARIABLE_RANGE, array);
+    int p = ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(TRUE, PRAGMA_VARIABLE_RANGE, -1), array);
      if (p >= 0) {
          setVariableRange(array[0], array[1], status);
          return p;
      }
      
-    p = ICU_Utility::parsePattern(rule, pos, limit, PRAGMA_MAXIMUM_BACKUP, array);
+    p = ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(TRUE, PRAGMA_MAXIMUM_BACKUP, -1), array);
      if (p >= 0) {
          pragmaMaximumBackup(array[0]);
          return p;
      }
      
-    p = ICU_Utility::parsePattern(rule, pos, limit, PRAGMA_NFD_RULES, NULL);
+    p = ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(TRUE, PRAGMA_NFD_RULES, -1), NULL);
      if (p >= 0) {
          pragmaNormalizeRules(UNORM_NFD);
          return p;
      }
      
-    p = ICU_Utility::parsePattern(rule, pos, limit, PRAGMA_NFC_RULES, NULL);
+    p = ICU_Utility::parsePattern(rule, pos, limit, UnicodeString(TRUE, PRAGMA_NFC_RULES, -1), NULL);
      if (p >= 0) {
          pragmaNormalizeRules(UNORM_NFC);
          return p;
@@ -1299,6 +1348,10 @@ int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos,
          } 
          // We allow anything on the right, including an empty string.
          UnicodeString* value = new UnicodeString(right->text);
+        // NULL pointer check
+        if (value == NULL) {
+            return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status);
+        }
          variableNames.put(undefinedVariableName, value, status);
          ++variableLimit;
          return pos;
@@ -1383,17 +1436,27 @@ int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos,
      UnicodeFunctor** segmentsArray = NULL;
      if (segmentObjects.size() > 0) {
          segmentsArray = (UnicodeFunctor **)uprv_malloc(segmentObjects.size() * sizeof(UnicodeFunctor *));
+        // Null pointer check
+        if (segmentsArray == NULL) {
+            return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status);
+        }
          segmentObjects.toArray((void**) segmentsArray);
      }
+    TransliterationRule* temptr = new TransliterationRule(
+            left->text, left->ante, left->post,
+            right->text, right->cursor, right->cursorOffset,
+            segmentsArray,
+            segmentObjects.size(),
+            left->anchorStart, left->anchorEnd,
+            curData,
+            status);
+    //Null pointer check
+    if (temptr == NULL) {
+        uprv_free(segmentsArray);
+        return syntaxError(U_MEMORY_ALLOCATION_ERROR, rule, start, status);
+    }
  
-    curData->ruleSet.addRule(new TransliterationRule(
-                                 left->text, left->ante, left->post,
-                                 right->text, right->cursor, right->cursorOffset,
-                                 segmentsArray,
-                                 segmentObjects.size(),
-                                 left->anchorStart, left->anchorEnd,
-                                 curData,
-                                 status), status);
+    curData->ruleSet.addRule(temptr, status);
  
      return pos;
  }
@@ -1445,6 +1508,11 @@ UChar TransliteratorParser::parseSet(const UnicodeString& rule,
                                            ParsePosition& pos,
                                            UErrorCode& status) {
      UnicodeSet* set = new UnicodeSet(rule, pos, USET_IGNORE_SPACE, parseData, status);
+    // Null pointer check
+    if (set == NULL) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+        return (UChar)0x0000; // Return empty character with error.
+    }
      set->compact();
      return generateStandInFor(set, status);
  }
@@ -1507,7 +1575,7 @@ void TransliteratorParser::setSegmentObject(int32_t seg, StringMatcher* adopted,
      // and stored before segment i; be careful with the
      // vector handling here.
      if (segmentObjects.size() < seg) {
-        segmentObjects.setSize(seg);
+        segmentObjects.setSize(seg, status);
      }
      int32_t index = getSegmentStandin(seg, status) - curData->variablesBase;
      if (segmentObjects.elementAt(seg-1) != NULL ||
@@ -1526,7 +1594,13 @@ void TransliteratorParser::setSegmentObject(int32_t seg, StringMatcher* adopted,
   */
  UChar TransliteratorParser::getDotStandIn(UErrorCode& status) {
      if (dotStandIn == (UChar) -1) {
-        dotStandIn = generateStandInFor(new UnicodeSet(DOT_SET, status), status);
+        UnicodeSet* tempus = new UnicodeSet(UnicodeString(TRUE, DOT_SET, -1), status);
+        // Null pointer check.
+        if (tempus == NULL) {
+            status = U_MEMORY_ALLOCATION_ERROR;
+            return (UChar)0x0000;
+        }
+        dotStandIn = generateStandInFor(tempus, status);
      }
      return dotStandIn;
  }
@@ -1566,14 +1640,16 @@ void TransliteratorParser::appendVariableDef(const UnicodeString& name,
  /**
   * Glue method to get around access restrictions in C++.
   */
-Transliterator* TransliteratorParser::createBasicInstance(const UnicodeString& id, const UnicodeString* canonID) {
+/*Transliterator* TransliteratorParser::createBasicInstance(const UnicodeString& id, const UnicodeString* canonID) {
      return Transliterator::createBasicInstance(id, canonID);
-}
+}*/
  
  U_NAMESPACE_END
  
  U_CAPI int32_t
  utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status) {
+    U_NAMESPACE_USE
+
      //const UChar *sourceStart = source;
      const UChar *targetStart = target;
      const UChar *sourceLimit = source+sourceLen;
@@ -1600,11 +1676,18 @@ utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorC
                      target--;
                  }
                  do {
+                    if (source == sourceLimit) {
+                        c = U_SENTINEL;
+                        break;
+                    }
                      c = *(source++);
                  }
                  while (c != CR && c != LF);
+                if (c < 0) {
+                    break;
+                }
              }
-            else if (c == ESCAPE) {
+            else if (c == ESCAPE && source < sourceLimit) {
                  UChar32   c2 = *source;
                  if (c2 == CR || c2 == LF) {
                      /* A backslash at the end of a line. */
@@ -1622,7 +1705,7 @@ utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorC
                          *status = U_PARSE_ERROR;
                          return 0;
                      }
-                    if (!uprv_isRuleWhiteSpace(c2) && !u_iscntrl(c2) && !u_ispunct(c2)) {
+                    if (!PatternProps::isWhiteSpace(c2) && !u_iscntrl(c2) && !u_ispunct(c2)) {
                          /* It was escaped for a reason. Write what it was supposed to be. */
                          source+=5;
                          c = c2;