]> git.saurik.com Git - apple/icu.git/blobdiff - icuSources/i18n/repattrn.cpp
ICU-64243.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / repattrn.cpp
index 2bc7d0395dbca6435024d6006a39ccb4b8d938d2..b8aee1a028dabd83ef63e54aeff1568022280db9 100644 (file)
@@ -1,10 +1,12 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
 //
 //  file:  repattrn.cpp
 //
 /*
 ***************************************************************************
-*   Copyright (C) 2002-2013 International Business Machines Corporation   *
-*   and others. All rights reserved.                                      *
+*   Copyright (C) 2002-2016 International Business Machines Corporation
+*   and others. All rights reserved.
 ***************************************************************************
 */
 
 
 #include "unicode/regex.h"
 #include "unicode/uclean.h"
+#include "cmemory.h"
+#include "cstr.h"
 #include "uassert.h"
+#include "uhash.h"
 #include "uvector.h"
 #include "uvectr32.h"
 #include "uvectr64.h"
@@ -66,25 +71,32 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
     init();
 
     // Copy simple fields
-    if ( other.fPatternString == NULL ) {
+    fDeferredStatus   = other.fDeferredStatus;
+
+    if (U_FAILURE(fDeferredStatus)) {
+        return *this;
+    }
+
+    if (other.fPatternString == NULL) {
         fPatternString = NULL;
-        fPattern      = utext_clone(fPattern, other.fPattern, FALSE, TRUE, &fDeferredStatus);
+        fPattern = utext_clone(fPattern, other.fPattern, FALSE, TRUE, &fDeferredStatus);
     } else {
         fPatternString = new UnicodeString(*(other.fPatternString));
-        UErrorCode status = U_ZERO_ERROR;
-        fPattern      = utext_openConstUnicodeString(NULL, fPatternString, &status);
-        if (U_FAILURE(status)) {
+        if (fPatternString == NULL) {
             fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
-            return *this;
+        } else {
+            fPattern = utext_openConstUnicodeString(NULL, fPatternString, &fDeferredStatus);
         }
     }
+    if (U_FAILURE(fDeferredStatus)) {
+        return *this;
+    }
+
     fFlags            = other.fFlags;
     fLiteralText      = other.fLiteralText;
-    fDeferredStatus   = other.fDeferredStatus;
     fMinMatchLen      = other.fMinMatchLen;
     fFrameSize        = other.fFrameSize;
     fDataSize         = other.fDataSize;
-    fMaxCaptureDigits = other.fMaxCaptureDigits;
     fStaticSets       = other.fStaticSets;
     fStaticSets8      = other.fStaticSets8;
 
@@ -125,6 +137,21 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
         fSets8[i] = other.fSets8[i];
     }
 
+    // Copy the named capture group hash map.
+    int32_t hashPos = UHASH_FIRST;
+    while (const UHashElement *hashEl = uhash_nextElement(other.fNamedCaptureMap, &hashPos)) {
+        if (U_FAILURE(fDeferredStatus)) {
+            break;
+        }
+        const UnicodeString *name = (const UnicodeString *)hashEl->key.pointer;
+        UnicodeString *key = new UnicodeString(*name);
+        int32_t val = hashEl->value.integer;
+        if (key == NULL) {
+            fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
+        } else {
+            uhash_puti(fNamedCaptureMap, key, val, &fDeferredStatus);
+        }
+    }
     return *this;
 }
 
@@ -146,7 +173,6 @@ void RegexPattern::init() {
     fFrameSize        = 0;
     fDataSize         = 0;
     fGroupMap         = NULL;
-    fMaxCaptureDigits = 1;
     fStaticSets       = NULL;
     fStaticSets8      = NULL;
     fStartType        = START_NO_INFO;
@@ -156,6 +182,7 @@ void RegexPattern::init() {
     fInitialChar      = 0;
     fInitialChars8    = NULL;
     fNeedsAltInput    = FALSE;
+    fNamedCaptureMap  = NULL;
 
     fPattern          = NULL; // will be set later
     fPatternString    = NULL; // may be set later
@@ -164,17 +191,24 @@ void RegexPattern::init() {
     fSets             = new UVector(fDeferredStatus);
     fInitialChars     = new UnicodeSet;
     fInitialChars8    = new Regex8BitSet;
+    fNamedCaptureMap  = uhash_open(uhash_hashUnicodeString,     // Key hash function
+                                   uhash_compareUnicodeString,  // Key comparator function
+                                   uhash_compareLong,           // Value comparator function
+                                   &fDeferredStatus);
     if (U_FAILURE(fDeferredStatus)) {
         return;
     }
     if (fCompiledPat == NULL  || fGroupMap == NULL || fSets == NULL ||
-        fInitialChars == NULL || fInitialChars8 == NULL) {
+            fInitialChars == NULL || fInitialChars8 == NULL || fNamedCaptureMap == NULL) {
         fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
         return;
     }
 
     // Slot zero of the vector of sets is reserved.  Fill it here.
     fSets->addElement((int32_t)0, fDeferredStatus);
+
+    // fNamedCaptureMap owns its key strings, type (UnicodeString *)
+    uhash_setKeyDeleter(fNamedCaptureMap, uprv_deleteUObject);
 }
 
 
@@ -212,6 +246,8 @@ void RegexPattern::zap() {
         delete fPatternString;
         fPatternString = NULL;
     }
+    uhash_close(fNamedCaptureMap);
+    fNamedCaptureMap = NULL;
 }
 
 
@@ -569,6 +605,34 @@ UText *RegexPattern::patternText(UErrorCode      &status) const {
 }
 
 
+//--------------------------------------------------------------------------------
+//
+//  groupNumberFromName()   Look up groupName in fNamedCaptureMap and return its value.
+//                          Returns 0 and sets U_REGEX_INVALID_CAPTURE_GROUP_NAME if absent.
+//--------------------------------------------------------------------------------
+int32_t RegexPattern::groupNumberFromName(const UnicodeString &groupName, UErrorCode &status) const {
+    if (U_FAILURE(status)) {
+        return 0;
+    }
+
+    // Syntactically invalid names need no separate check: they are never in the map.
+    // init() can leave fNamedCaptureMap NULL (failed uhash_open), so guard the lookup.
+
+    int32_t number = (fNamedCaptureMap != NULL) ? uhash_geti(fNamedCaptureMap, &groupName) : 0;
+    if (number == 0) {
+        status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
+    }
+    return number;
+}
+
+int32_t RegexPattern::groupNumberFromName(const char *groupName, int32_t nameLength, UErrorCode &status) const {
+    // Widen the invariant-charset name, then defer to the UnicodeString overload.
+    if (U_SUCCESS(status)) {
+        return groupNumberFromName(UnicodeString(groupName, nameLength, US_INV), status);
+    }
+    return 0;
+}
+
 
 //---------------------------------------------------------------------
 //
@@ -615,7 +679,6 @@ int32_t  RegexPattern::split(UText *input,
 }
 
 
-
 //---------------------------------------------------------------------
 //
 //   dump    Output the compiled form of the pattern.
@@ -630,7 +693,7 @@ void   RegexPattern::dumpOp(int32_t index) const {
     int32_t val         = URX_VAL(op);
     int32_t type        = URX_TYPE(op);
     int32_t pinnedType  = type;
-    if ((uint32_t)pinnedType >= sizeof(opNames)/sizeof(char *)) {
+    if ((uint32_t)pinnedType >= UPRV_LENGTHOF(opNames)) {
         pinnedType = 0;
     }
 
@@ -682,13 +745,20 @@ void   RegexPattern::dumpOp(int32_t index) const {
     case URX_LBN_END:
     case URX_LOOP_C:
     case URX_LOOP_DOT_I:
+    case URX_BACKSLASH_H:
+    case URX_BACKSLASH_R:
+    case URX_BACKSLASH_V:
         // types with an integer operand field.
         printf("%d", val);
         break;
 
     case URX_ONECHAR:
     case URX_ONECHAR_I:
-        printf("%c", val<256?val:'?');
+        if (val < 0x20) {
+            printf("%#x", val);
+        } else {
+            printf("'%s'", CStr(UnicodeString(val))());
+        }
         break;
 
     case URX_STRING:
@@ -697,12 +767,8 @@ void   RegexPattern::dumpOp(int32_t index) const {
             int32_t lengthOp       = fCompiledPat->elementAti(index+1);
             U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN);
             int32_t length = URX_VAL(lengthOp);
-            int32_t i;
-            for (i=val; i<val+length; i++) {
-                UChar c = fLiteralText[i];
-                if (c < 32 || c >= 256) {c = '.';}
-                printf("%c", c);
-            }
+            UnicodeString str(fLiteralText, val, length);
+            printf("%s", CStr(str)());
         }
         break;
 
@@ -712,9 +778,7 @@ void   RegexPattern::dumpOp(int32_t index) const {
             UnicodeString s;
             UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val);
             set->toPattern(s, TRUE);
-            for (int32_t i=0; i<s.length(); i++) {
-                printf("%c", s.charAt(i));
-            }
+            printf("%s", CStr(s)());
         }
         break;
 
@@ -728,9 +792,7 @@ void   RegexPattern::dumpOp(int32_t index) const {
             }
             UnicodeSet *set = fStaticSets[val];
             set->toPattern(s, TRUE);
-            for (int32_t i=0; i<s.length(); i++) {
-                printf("%c", s.charAt(i));
-            }
+            printf("%s", CStr(s)());
         }
         break;
 
@@ -747,56 +809,44 @@ void   RegexPattern::dumpOp(int32_t index) const {
 void RegexPattern::dumpPattern() const {
 #if defined(REGEX_DEBUG)
     int      index;
-    int      i;
 
-    printf("Original Pattern:  ");
-    UChar32 c = utext_next32From(fPattern, 0);
-    while (c != U_SENTINEL) {
-        if (c<32 || c>256) {
-            c = '.';
-        }
-        printf("%c", c);
-
-        c = UTEXT_NEXT32(fPattern);
+    UnicodeString patStr;
+    for (UChar32 c = utext_next32From(fPattern, 0); c != U_SENTINEL; c = utext_next32(fPattern)) {
+        patStr.append(c);
     }
-    printf("\n");
+    printf("Original Pattern:  \"%s\"\n", CStr(patStr)());
     printf("   Min Match Length:  %d\n", fMinMatchLen);
     printf("   Match Start Type:  %s\n", START_OF_MATCH_STR(fStartType));
     if (fStartType == START_STRING) {
-        printf("    Initial match string: \"");
-        for (i=fInitialStringIdx; i<fInitialStringIdx+fInitialStringLen; i++) {
-            printf("%c", fLiteralText[i]);   // TODO:  non-printables, surrogates.
-        }
-        printf("\"\n");
-
+        UnicodeString initialString(fLiteralText,fInitialStringIdx, fInitialStringLen);
+        printf("   Initial match string: \"%s\"\n", CStr(initialString)());
     } else if (fStartType == START_SET) {
-        int32_t numSetChars = fInitialChars->size();
-        if (numSetChars > 20) {
-            numSetChars = 20;
-        }
-        printf("     Match First Chars : ");
-        for (i=0; i<numSetChars; i++) {
-            UChar32 c = fInitialChars->charAt(i);
-            if (0x20<c && c <0x7e) {
-                printf("%c ", c);
-            } else {
-                printf("%#x ", c);
-            }
-        }
-        if (numSetChars < fInitialChars->size()) {
-            printf(" ...");
-        }
-        printf("\n");
+        UnicodeString s;
+        fInitialChars->toPattern(s, TRUE);
+        printf("    Match First Chars: %s\n", CStr(s)());
 
     } else if (fStartType == START_CHAR) {
-        printf("    First char of Match : ");
-        if (0x20 < fInitialChar && fInitialChar<0x7e) {
-                printf("%c\n", fInitialChar);
+        printf("    First char of Match: ");
+        if (fInitialChar > 0x20) {
+                printf("'%s'\n", CStr(UnicodeString(fInitialChar))());
             } else {
                 printf("%#x\n", fInitialChar);
             }
     }
 
+    printf("Named Capture Groups:\n");
+    if (uhash_count(fNamedCaptureMap) == 0) {
+        printf("   None\n");
+    } else {
+        int32_t pos = UHASH_FIRST;
+        const UHashElement *el = NULL;
+        while ((el = uhash_nextElement(fNamedCaptureMap, &pos))) {
+            const UnicodeString *name = (const UnicodeString *)el->key.pointer;
+            int32_t number = el->value.integer;
+            printf("   %d\t%s\n", number, CStr(*name)());
+        }
+    }
+
     printf("\nIndex   Binary     Type             Operand\n" \
            "-------------------------------------------\n");
     for (index = 0; index<fCompiledPat->size(); index++) {