]> git.saurik.com Git - apple/icu.git/blobdiff - icuSources/i18n/repattrn.cpp
ICU-59117.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / repattrn.cpp
index 462a4b3ecbcadc2e88c867436227f52a2d624085..b8aee1a028dabd83ef63e54aeff1568022280db9 100644 (file)
@@ -1,10 +1,12 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
 //
-//  file:  repattrn.cpp    
+//  file:  repattrn.cpp
 //
 /*
 ***************************************************************************
-*   Copyright (C) 2002-2003 International Business Machines Corporation   *
-*   and others. All rights reserved.                                      *
+*   Copyright (C) 2002-2016 International Business Machines Corporation
+*   and others. All rights reserved.
 ***************************************************************************
 */
 
 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
 
 #include "unicode/regex.h"
+#include "unicode/uclean.h"
+#include "cmemory.h"
+#include "cstr.h"
 #include "uassert.h"
+#include "uhash.h"
 #include "uvector.h"
 #include "uvectr32.h"
+#include "uvectr64.h"
 #include "regexcmp.h"
 #include "regeximp.h"
 #include "regexst.h"
@@ -30,10 +37,7 @@ U_NAMESPACE_BEGIN
 RegexPattern::RegexPattern() {
     // Init all of this instances data.
     init();
-
-    // Lazy init of all shared global sets.
-    RegexStaticSets::initGlobals(&fDeferredStatus);
-};
+}
 
 
 //--------------------------------------------------------------------------
@@ -43,7 +47,7 @@ RegexPattern::RegexPattern() {
 //
 //--------------------------------------------------------------------------
 RegexPattern::RegexPattern(const RegexPattern &other) :  UObject(other) {
-    init(); 
+    init();
     *this = other;
 }
 
@@ -51,7 +55,7 @@ RegexPattern::RegexPattern(const RegexPattern &other) :  UObject(other) {
 
 //--------------------------------------------------------------------------
 //
-//    Assignmenet Operator
+//    Assignment Operator
 //
 //--------------------------------------------------------------------------
 RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
@@ -67,32 +71,58 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
     init();
 
     // Copy simple fields
-    fPattern          = other.fPattern;
+    fDeferredStatus   = other.fDeferredStatus;
+
+    if (U_FAILURE(fDeferredStatus)) {
+        return *this;
+    }
+
+    if (other.fPatternString == NULL) {
+        fPatternString = NULL;
+        fPattern = utext_clone(fPattern, other.fPattern, FALSE, TRUE, &fDeferredStatus);
+    } else {
+        fPatternString = new UnicodeString(*(other.fPatternString));
+        if (fPatternString == NULL) {
+            fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
+        } else {
+            fPattern = utext_openConstUnicodeString(NULL, fPatternString, &fDeferredStatus);
+        }
+    }
+    if (U_FAILURE(fDeferredStatus)) {
+        return *this;
+    }
+
     fFlags            = other.fFlags;
     fLiteralText      = other.fLiteralText;
-    fDeferredStatus   = other.fDeferredStatus;
     fMinMatchLen      = other.fMinMatchLen;
-    fMaxCaptureDigits = other.fMaxCaptureDigits;
-    fStaticSets       = other.fStaticSets; 
-    
+    fFrameSize        = other.fFrameSize;
+    fDataSize         = other.fDataSize;
+    fStaticSets       = other.fStaticSets;
+    fStaticSets8      = other.fStaticSets8;
+
     fStartType        = other.fStartType;
     fInitialStringIdx = other.fInitialStringIdx;
     fInitialStringLen = other.fInitialStringLen;
     *fInitialChars    = *other.fInitialChars;
-    *fInitialChars8   = *other.fInitialChars8;
     fInitialChar      = other.fInitialChar;
+    *fInitialChars8   = *other.fInitialChars8;
+    fNeedsAltInput    = other.fNeedsAltInput;
 
     //  Copy the pattern.  It's just values, nothing deep to copy.
     fCompiledPat->assign(*other.fCompiledPat, fDeferredStatus);
     fGroupMap->assign(*other.fGroupMap, fDeferredStatus);
 
-    //  Copy the Unicode Sets.  
+    //  Copy the Unicode Sets.
     //    Could be made more efficient if the sets were reference counted and shared,
-    //    but I doubt that pattern copying will be particularly common. 
+    //    but I doubt that pattern copying will be particularly common.
     //    Note:  init() already added an empty element zero to fSets
     int32_t i;
     int32_t  numSets = other.fSets->size();
     fSets8 = new Regex8BitSet[numSets];
+    if (fSets8 == NULL) {
+       fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
+       return *this;
+    }
     for (i=1; i<numSets; i++) {
         if (U_FAILURE(fDeferredStatus)) {
             return *this;
@@ -107,6 +137,21 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
         fSets8[i] = other.fSets8[i];
     }
 
+    // Copy the named capture group hash map.
+    int32_t hashPos = UHASH_FIRST;
+    while (const UHashElement *hashEl = uhash_nextElement(other.fNamedCaptureMap, &hashPos)) {
+        if (U_FAILURE(fDeferredStatus)) {
+            break;
+        }
+        const UnicodeString *name = (const UnicodeString *)hashEl->key.pointer;
+        UnicodeString *key = new UnicodeString(*name);
+        int32_t val = hashEl->value.integer;
+        if (key == NULL) {
+            fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
+        } else {
+            uhash_puti(fNamedCaptureMap, key, val, &fDeferredStatus);
+        }
+    }
     return *this;
 }
 
@@ -119,42 +164,57 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
 //--------------------------------------------------------------------------
 void RegexPattern::init() {
     fFlags            = 0;
+    fCompiledPat      = 0;
+    fLiteralText.remove();
+    fSets             = NULL;
+    fSets8            = NULL;
     fDeferredStatus   = U_ZERO_ERROR;
     fMinMatchLen      = 0;
-    fMaxCaptureDigits = 1;  
-    fStaticSets       = NULL;
     fFrameSize        = 0;
     fDataSize         = 0;
+    fGroupMap         = NULL;
+    fStaticSets       = NULL;
+    fStaticSets8      = NULL;
     fStartType        = START_NO_INFO;
     fInitialStringIdx = 0;
     fInitialStringLen = 0;
     fInitialChars     = NULL;
-    fInitialChars8    = NULL;
     fInitialChar      = 0;
-    fSets8            = NULL;
-    
-    fCompiledPat      = new UVector32(fDeferredStatus);
+    fInitialChars8    = NULL;
+    fNeedsAltInput    = FALSE;
+    fNamedCaptureMap  = NULL;
+
+    fPattern          = NULL; // will be set later
+    fPatternString    = NULL; // may be set later
+    fCompiledPat      = new UVector64(fDeferredStatus);
     fGroupMap         = new UVector32(fDeferredStatus);
     fSets             = new UVector(fDeferredStatus);
     fInitialChars     = new UnicodeSet;
     fInitialChars8    = new Regex8BitSet;
+    fNamedCaptureMap  = uhash_open(uhash_hashUnicodeString,     // Key hash function
+                                   uhash_compareUnicodeString,  // Key comparator function
+                                   uhash_compareLong,           // Value comparator function
+                                   &fDeferredStatus);
     if (U_FAILURE(fDeferredStatus)) {
         return;
     }
     if (fCompiledPat == NULL  || fGroupMap == NULL || fSets == NULL ||
-        fInitialChars == NULL || fInitialChars8 == NULL) {
+            fInitialChars == NULL || fInitialChars8 == NULL || fNamedCaptureMap == NULL) {
         fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
         return;
     }
 
     // Slot zero of the vector of sets is reserved.  Fill it here.
     fSets->addElement((int32_t)0, fDeferredStatus);
+
+    // fNamedCaptureMap owns its key strings, type (UnicodeString *)
+    uhash_setKeyDeleter(fNamedCaptureMap, uprv_deleteUObject);
 }
 
 
 //--------------------------------------------------------------------------
 //
-//   zap            Delete everything owned by this RegexPattern. 
+//   zap            Delete everything owned by this RegexPattern.
 //
 //--------------------------------------------------------------------------
 void RegexPattern::zap() {
@@ -170,14 +230,24 @@ void RegexPattern::zap() {
     }
     delete fSets;
     fSets = NULL;
+    delete[] fSets8;
+    fSets8 = NULL;
     delete fGroupMap;
     fGroupMap = NULL;
     delete fInitialChars;
     fInitialChars = NULL;
     delete fInitialChars8;
     fInitialChars8 = NULL;
-    delete[] fSets8;
-    fSets8 = NULL;
+    if (fPattern != NULL) {
+        utext_close(fPattern);
+        fPattern = NULL;
+    }
+    if (fPatternString != NULL) {
+        delete fPatternString;
+        fPatternString = NULL;
+    }
+    uhash_close(fNamedCaptureMap);
+    fNamedCaptureMap = NULL;
 }
 
 
@@ -188,7 +258,7 @@ void RegexPattern::zap() {
 //--------------------------------------------------------------------------
 RegexPattern::~RegexPattern() {
     zap();
-};
+}
 
 
 //--------------------------------------------------------------------------
@@ -196,42 +266,106 @@ RegexPattern::~RegexPattern() {
 //   Clone
 //
 //--------------------------------------------------------------------------
-RegexPattern  *RegexPattern::clone() const { 
+RegexPattern  *RegexPattern::clone() const {
     RegexPattern  *copy = new RegexPattern(*this);
     return copy;
-};
+}
 
 
 //--------------------------------------------------------------------------
 //
 //   operator ==   (comparison)    Consider two patterns to be == if the
 //                                 pattern strings and the flags are the same.
+//                                 Note that pattern strings with the same
+//                                 characters can still be considered different.
 //
 //--------------------------------------------------------------------------
 UBool   RegexPattern::operator ==(const RegexPattern &other) const {
-    UBool r = this->fFlags    == other.fFlags &&
-              this->fPattern  == other.fPattern &&
-              this->fDeferredStatus == other.fDeferredStatus;
-    return r;
+    if (this->fFlags == other.fFlags && this->fDeferredStatus == other.fDeferredStatus) {
+        if (this->fPatternString != NULL && other.fPatternString != NULL) {
+            return *(this->fPatternString) == *(other.fPatternString);
+        } else if (this->fPattern == NULL) {
+            if (other.fPattern == NULL) {
+                return TRUE;
+            }
+        } else if (other.fPattern != NULL) {
+            UTEXT_SETNATIVEINDEX(this->fPattern, 0);
+            UTEXT_SETNATIVEINDEX(other.fPattern, 0);
+            return utext_equals(this->fPattern, other.fPattern);
+        }
+    }
+    return FALSE;
 }
 
 //---------------------------------------------------------------------
 //
-//   compile        
+//   compile
 //
 //---------------------------------------------------------------------
-RegexPattern  *RegexPattern::compile(
-                             const UnicodeString &regex,
-                             uint32_t             flags,
-                             UParseError          &pe,
-                             UErrorCode           &status)  {
+RegexPattern * U_EXPORT2
+RegexPattern::compile(const UnicodeString &regex,
+                      uint32_t             flags,
+                      UParseError          &pe,
+                      UErrorCode           &status)
+{
+    // Honor the incoming error state: do nothing if a failure is already set.
+    if (U_FAILURE(status)) {
+        return NULL;
+    }
+
+    // Reject any flag bit outside the set of options listed here.
+    const uint32_t knownFlags =
+            UREGEX_CANON_EQ                 | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS |
+            UREGEX_DOTALL                   | UREGEX_MULTILINE        | UREGEX_UWORD    |
+            UREGEX_ERROR_ON_UNKNOWN_ESCAPES | UREGEX_UNIX_LINES       | UREGEX_LITERAL;
+    if ((flags & ~knownFlags) != 0) {
+        status = U_REGEX_INVALID_FLAG;
+        return NULL;
+    }
+
+    // UREGEX_CANON_EQ is accepted as a flag value but reported as unimplemented.
+    if ((flags & UREGEX_CANON_EQ) != 0) {
+        status = U_REGEX_UNIMPLEMENTED;
+        return NULL;
+    }
+
+    RegexPattern *newPattern = new RegexPattern;
+    if (newPattern == NULL) {
+        status = U_MEMORY_ALLOCATION_ERROR;
+        return NULL;
+    }
+
+    // Construction reports problems through fDeferredStatus; surface a failure here.
+    if (U_FAILURE(newPattern->fDeferredStatus)) {
+        status = newPattern->fDeferredStatus;
+        delete newPattern;
+        return NULL;
+    }
+    newPattern->fFlags = flags;
+
+    RegexCompile     compiler(newPattern, status);
+    compiler.compile(regex, pe, status);
+    if (U_SUCCESS(status)) {
+        return newPattern;
+    }
+
+    // Compilation failed: discard the pattern object and return NULL.
+    delete newPattern;
+    return NULL;
+}
+
+
+//
+//   compile, UText mode
+//
+RegexPattern * U_EXPORT2
+RegexPattern::compile(UText                *regex,
+                      uint32_t             flags,
+                      UParseError          &pe,
+                      UErrorCode           &status)
+{
     if (U_FAILURE(status)) {
         return NULL;
     }
 
     const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS |
-                              UREGEX_DOTALL   | UREGEX_MULTILINE;
+                              UREGEX_DOTALL   | UREGEX_MULTILINE        | UREGEX_UWORD |
+                              UREGEX_ERROR_ON_UNKNOWN_ESCAPES           | UREGEX_UNIX_LINES | UREGEX_LITERAL;
 
     if ((flags & ~allFlags) != 0) {
         status = U_REGEX_INVALID_FLAG;
@@ -250,6 +384,7 @@ RegexPattern  *RegexPattern::compile(
     }
     if (U_FAILURE(This->fDeferredStatus)) {
         status = This->fDeferredStatus;
+        delete This;
         return NULL;
     }
     This->fFlags = flags;
@@ -257,33 +392,63 @@ RegexPattern  *RegexPattern::compile(
     RegexCompile     compiler(This, status);
     compiler.compile(regex, pe, status);
 
+    if (U_FAILURE(status)) {
+        delete This;
+        This = NULL;
+    }
+
     return This;
-};
-    
+}
+
 //
 //   compile with default flags.
 //
-RegexPattern *RegexPattern::compile( const UnicodeString &regex,
-        UParseError          &pe,
-        UErrorCode           &err) 
+RegexPattern * U_EXPORT2
+RegexPattern::compile(const UnicodeString &regex,
+                      UParseError         &pe,
+                      UErrorCode          &err)
 {
-    return compile(regex, 0, pe, err); 
+    return compile(regex, 0, pe, err);
 }
 
 
+//
+//   compile, UText mode, with default flags (no option bits set).
+//   Forwards to the four-argument UText overload.
+//
+RegexPattern * U_EXPORT2
+RegexPattern::compile(UText               *regex,
+                      UParseError         &pe,
+                      UErrorCode          &err)
+{
+    const uint32_t noFlags = 0;
+    return compile(regex, noFlags, pe, err);
+}
+
 
 //
 //   compile with no UParseErr parameter.
 //
-RegexPattern *RegexPattern::compile( const UnicodeString &regex,
-        uint32_t             flags,
-        UErrorCode           &err) 
+RegexPattern * U_EXPORT2
+RegexPattern::compile(const UnicodeString &regex,
+                      uint32_t             flags,
+                      UErrorCode          &err)
 {
     UParseError pe;
-    return compile(regex, flags, pe, err); 
+    return compile(regex, flags, pe, err);
 }
 
 
+//
+//   compile, UText mode, for callers that do not supply a UParseError.
+//   A local UParseError is passed to the four-argument overload and
+//   is dropped when this function returns.
+//
+RegexPattern * U_EXPORT2
+RegexPattern::compile(UText                *regex,
+                      uint32_t             flags,
+                      UErrorCode           &err)
+{
+    UParseError discardedParseError;
+    return compile(regex, flags, discardedParseError, err);
+}
+
 
 //---------------------------------------------------------------------
 //
@@ -304,11 +469,11 @@ RegexMatcher *RegexPattern::matcher(const UnicodeString &input,
                                     UErrorCode          &status)  const {
     RegexMatcher    *retMatcher = matcher(status);
     if (retMatcher != NULL) {
+        retMatcher->fDeferredStatus = status;
         retMatcher->reset(input);
     }
     return retMatcher;
-};
-
+}
 
 
 //---------------------------------------------------------------------
@@ -327,13 +492,13 @@ RegexMatcher *RegexPattern::matcher(UErrorCode &status)  const {
         return NULL;
     }
 
-    retMatcher = new RegexMatcher(this); 
+    retMatcher = new RegexMatcher(this);
     if (retMatcher == NULL) {
         status = U_MEMORY_ALLOCATION_ERROR;
         return NULL;
     }
     return retMatcher;
-};
+}
 
 
 
@@ -343,7 +508,7 @@ RegexMatcher *RegexPattern::matcher(UErrorCode &status)  const {
 //                  with a pattern string and a data string.
 //
 //---------------------------------------------------------------------
-UBool RegexPattern::matches(const UnicodeString   &regex,
+UBool U_EXPORT2 RegexPattern::matches(const UnicodeString   &regex,
               const UnicodeString   &input,
                     UParseError     &pe,
                     UErrorCode      &status) {
@@ -364,6 +529,34 @@ UBool RegexPattern::matches(const UnicodeString   &regex,
 }
 
 
+//
+//   matches, UText mode
+//
+//   Convenience function: compiles `regex` with no flags, creates a matcher
+//   for it, resets the matcher onto `input` and returns the result of
+//   RegexMatcher::matches().  `pe` is passed through to compile().
+//   Returns FALSE if `status` is already a failure on entry, or if the
+//   pattern or the matcher could not be created; errors are reported
+//   through `status`.
+//
+UBool U_EXPORT2 RegexPattern::matches(UText                *regex,
+                    UText           *input,
+                    UParseError     &pe,
+                    UErrorCode      &status) {
+
+    if (U_FAILURE(status)) {return FALSE;}
+
+    // compile() returns NULL when it fails, so the result must be checked
+    // before it is used to create a matcher.
+    RegexPattern *pat = RegexPattern::compile(regex, 0, pe, status);
+    if (pat == NULL) {
+        return FALSE;
+    }
+
+    UBool         retVal  = FALSE;
+    RegexMatcher *matcher = pat->matcher(status);
+    if (U_SUCCESS(status) && matcher != NULL) {
+        matcher->reset(input);
+        retVal  = matcher->matches(status);
+    }
+
+    // Both objects are temporaries owned by this function.
+    delete matcher;
+    delete pat;
+    return retVal;
+}
+
+
+
 
 
 //---------------------------------------------------------------------
@@ -372,12 +565,75 @@ UBool RegexPattern::matches(const UnicodeString   &regex,
 //
 //---------------------------------------------------------------------
 UnicodeString RegexPattern::pattern() const {
-    return fPattern;
+    if (fPatternString != NULL) {
+        return *fPatternString;
+    } else if (fPattern == NULL) {
+        return UnicodeString();
+    } else {
+        UErrorCode status = U_ZERO_ERROR;
+        int64_t nativeLen = utext_nativeLength(fPattern);
+        int32_t len16 = utext_extract(fPattern, 0, nativeLen, NULL, 0, &status); // buffer overflow error
+        UnicodeString result;
+
+        status = U_ZERO_ERROR;
+        UChar *resultChars = result.getBuffer(len16);
+        utext_extract(fPattern, 0, nativeLen, resultChars, len16, &status); // unterminated warning
+        result.releaseBuffer(len16);
+
+        return result;
+    }
 }
 
 
 
 
+//---------------------------------------------------------------------
+//
+//   patternText    Return the pattern as a UText.
+//
+//                  When this pattern has a UText (fPattern) that same
+//                  object is returned, not a clone.  Otherwise the shared
+//                  empty text held by RegexStaticSets is returned.
+//                  Returns NULL if `status` is a failure on entry or if
+//                  the shared static data could not be initialized.
+//
+//---------------------------------------------------------------------
+UText *RegexPattern::patternText(UErrorCode      &status) const {
+    if (U_FAILURE(status)) {return NULL;}
+    // Failures were rejected above; this clears any warning code left in status.
+    status = U_ZERO_ERROR;
+
+    if (fPattern != NULL) {
+        return fPattern;
+    }
+
+    // No pattern text of our own: fall back to the shared empty UText.
+    // initGlobals() reports problems through status, so check it (and the
+    // global pointer itself) before dereferencing gStaticSets.
+    RegexStaticSets::initGlobals(&status);
+    if (U_FAILURE(status) || RegexStaticSets::gStaticSets == NULL) {
+        return NULL;
+    }
+    return RegexStaticSets::gStaticSets->fEmptyText;
+}
+
+
+//--------------------------------------------------------------------------------
+//
+//  groupNumberFromName()
+//
+//--------------------------------------------------------------------------------
+int32_t RegexPattern::groupNumberFromName(const UnicodeString &groupName, UErrorCode &status) const {
+    int32_t groupNumber = 0;
+    if (U_SUCCESS(status)) {
+        // Names are not validated syntactically first: a malformed name is
+        // simply absent from the map, so the lookup yields 0 for it as well.
+        groupNumber = uhash_geti(fNamedCaptureMap, &groupName);
+        if (groupNumber == 0) {
+            // 0 from the lookup is treated as "no such named group".
+            status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
+        }
+    }
+    return groupNumber;
+}
+
+int32_t RegexPattern::groupNumberFromName(const char *groupName, int32_t nameLength, UErrorCode &status) const {
+    if (U_SUCCESS(status)) {
+        // Build a UnicodeString from the char name (US_INV constructor) and
+        // delegate to the UnicodeString overload.
+        const UnicodeString unicodeName(groupName, nameLength, US_INV);
+        return groupNumberFromName(unicodeName, status);
+    }
+    return 0;
+}
+
+
 //---------------------------------------------------------------------
 //
 //   split
@@ -386,17 +642,41 @@ UnicodeString RegexPattern::pattern() const {
+// Runs RegexMatcher::split() on a temporary matcher built for this pattern
+// and returns its result.  Returns 0 if `status` is a failure on entry or
+// if the temporary matcher could not be set up; in the latter case the
+// matcher's deferred error is copied into `status`.
 int32_t  RegexPattern::split(const UnicodeString &input,
         UnicodeString    dest[],
         int32_t          destCapacity,
-        UErrorCode       &status) const
+        UErrorCode      &status) const
 {
     if (U_FAILURE(status)) {
         return 0;
     };
 
     RegexMatcher  m(this);
-    int32_t r = m.split(input, dest, destCapacity, status);
+    int32_t r = 0;
+    // Check m's status to make sure all is ok.  A failure recorded during
+    // construction is passed on to the caller instead of being dropped,
+    // so a 0 result is never paired with a success status in that case.
+    if (U_SUCCESS(m.fDeferredStatus)) {
+       r = m.split(input, dest, destCapacity, status);
+    } else {
+        status = m.fDeferredStatus;
+    }
     return r;
 }
 
+//
+//   split, UText mode
+//
+//   Runs RegexMatcher::split() on a temporary matcher built for this
+//   pattern and returns its result.  Returns 0 if `status` is a failure
+//   on entry or if the temporary matcher could not be set up; in the
+//   latter case the matcher's deferred error is copied into `status`.
+//
+int32_t  RegexPattern::split(UText *input,
+        UText           *dest[],
+        int32_t          destCapacity,
+        UErrorCode      &status) const
+{
+    if (U_FAILURE(status)) {
+        return 0;
+    }
+
+    RegexMatcher  m(this);
+    // Check m's status to make sure all is ok.  A failure recorded during
+    // construction is reported to the caller rather than silently dropped.
+    if (U_FAILURE(m.fDeferredStatus)) {
+        status = m.fDeferredStatus;
+        return 0;
+    }
+    return m.split(input, dest, destCapacity, status);
+}
 
 
 //---------------------------------------------------------------------
@@ -406,23 +686,22 @@ int32_t  RegexPattern::split(const UnicodeString &input,
 //
 //---------------------------------------------------------------------
 void   RegexPattern::dumpOp(int32_t index) const {
+    (void)index;  // Suppress warnings in non-debug build.
 #if defined(REGEX_DEBUG)
     static const char * const opNames[] = {URX_OPCODE_NAMES};
     int32_t op          = fCompiledPat->elementAti(index);
     int32_t val         = URX_VAL(op);
     int32_t type        = URX_TYPE(op);
     int32_t pinnedType  = type;
-    if (pinnedType >= sizeof(opNames)/sizeof(char *)) {
+    if ((uint32_t)pinnedType >= UPRV_LENGTHOF(opNames)) {
         pinnedType = 0;
     }
-    
-    REGEX_DUMP_DEBUG_PRINTF("%4d   %08x    %-15s  ", index, op, opNames[pinnedType]);
+
+    printf("%4d   %08x    %-15s  ", index, op, opNames[pinnedType]);
     switch (type) {
     case URX_NOP:
     case URX_DOTANY:
     case URX_DOTANY_ALL:
-    case URX_DOTANY_PL:
-    case URX_DOTANY_ALL_PL:
     case URX_FAIL:
     case URX_CARET:
     case URX_DOLLAR:
@@ -433,7 +712,7 @@ void   RegexPattern::dumpOp(int32_t index) const {
     case URX_CARET_M:
         // Types with no operand field of interest.
         break;
-        
+
     case URX_RESERVED_OP:
     case URX_START_CAPTURE:
     case URX_END_CAPTURE:
@@ -442,6 +721,7 @@ void   RegexPattern::dumpOp(int32_t index) const {
     case URX_JMP_SAV:
     case URX_JMP_SAV_X:
     case URX_BACKSLASH_B:
+    case URX_BACKSLASH_BU:
     case URX_BACKSLASH_D:
     case URX_BACKSLASH_Z:
     case URX_STRING_LEN:
@@ -465,27 +745,30 @@ void   RegexPattern::dumpOp(int32_t index) const {
     case URX_LBN_END:
     case URX_LOOP_C:
     case URX_LOOP_DOT_I:
+    case URX_BACKSLASH_H:
+    case URX_BACKSLASH_R:
+    case URX_BACKSLASH_V:
         // types with an integer operand field.
-        REGEX_DUMP_DEBUG_PRINTF("%d", val);
+        printf("%d", val);
         break;
-        
+
     case URX_ONECHAR:
     case URX_ONECHAR_I:
-        REGEX_DUMP_DEBUG_PRINTF("%c", val<256?val:'?');
+        if (val < 0x20) {
+            printf("%#x", val);
+        } else {
+            printf("'%s'", CStr(UnicodeString(val))());
+        }
         break;
-        
+
     case URX_STRING:
     case URX_STRING_I:
         {
             int32_t lengthOp       = fCompiledPat->elementAti(index+1);
             U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN);
             int32_t length = URX_VAL(lengthOp);
-            int32_t i;
-            for (i=val; i<val+length; i++) {
-                UChar c = fLiteralText[i];
-                if (c < 32 || c >= 256) {c = '.';}
-                REGEX_DUMP_DEBUG_PRINTF("%c", c);
-            }
+            UnicodeString str(fLiteralText, val, length);
+            printf("%s", CStr(str)());
         }
         break;
 
@@ -495,9 +778,7 @@ void   RegexPattern::dumpOp(int32_t index) const {
             UnicodeString s;
             UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val);
             set->toPattern(s, TRUE);
-            for (int32_t i=0; i<s.length(); i++) {
-                REGEX_DUMP_DEBUG_PRINTF("%c", s.charAt(i));
-            }
+            printf("%s", CStr(s)());
         }
         break;
 
@@ -506,98 +787,78 @@ void   RegexPattern::dumpOp(int32_t index) const {
         {
             UnicodeString s;
             if (val & URX_NEG_SET) {
-                REGEX_DUMP_DEBUG_PRINTF("NOT ");
+                printf("NOT ");
                 val &= ~URX_NEG_SET;
             }
             UnicodeSet *set = fStaticSets[val];
             set->toPattern(s, TRUE);
-            for (int32_t i=0; i<s.length(); i++) {
-                REGEX_DUMP_DEBUG_PRINTF("%c", s.charAt(i));
-            }
+            printf("%s", CStr(s)());
         }
         break;
 
-        
+
     default:
-        REGEX_DUMP_DEBUG_PRINTF("??????");
+        printf("??????");
         break;
     }
-    REGEX_DUMP_DEBUG_PRINTF("\n");
+    printf("\n");
 #endif
 }
 
 
-
-void   RegexPattern::dump() const {
+void RegexPattern::dumpPattern() const {
 #if defined(REGEX_DEBUG)
     int      index;
-    int      i;
 
-    REGEX_DUMP_DEBUG_PRINTF("Original Pattern:  ");
-    for (i=0; i<fPattern.length(); i++) {
-        REGEX_DUMP_DEBUG_PRINTF("%c", fPattern.charAt(i));
+    UnicodeString patStr;
+    for (UChar32 c = utext_next32From(fPattern, 0); c != U_SENTINEL; c = utext_next32(fPattern)) {
+        patStr.append(c);
     }
-    REGEX_DUMP_DEBUG_PRINTF("\n");
-    REGEX_DUMP_DEBUG_PRINTF("   Min Match Length:  %d\n", fMinMatchLen);
-    REGEX_DUMP_DEBUG_PRINTF("   Match Start Type:  %s\n", START_OF_MATCH_STR(fStartType));   
+    printf("Original Pattern:  \"%s\"\n", CStr(patStr)());
+    printf("   Min Match Length:  %d\n", fMinMatchLen);
+    printf("   Match Start Type:  %s\n", START_OF_MATCH_STR(fStartType));
     if (fStartType == START_STRING) {
-        REGEX_DUMP_DEBUG_PRINTF("    Initial match sting: \"");
-        for (i=fInitialStringIdx; i<fInitialStringIdx+fInitialStringLen; i++) {
-            REGEX_DUMP_DEBUG_PRINTF("%c", fLiteralText[i]);   // TODO:  non-printables, surrogates.
-        }
-
+        UnicodeString initialString(fLiteralText,fInitialStringIdx, fInitialStringLen);
+        printf("   Initial match string: \"%s\"\n", CStr(initialString)());
     } else if (fStartType == START_SET) {
-        int32_t numSetChars = fInitialChars->size();
-        if (numSetChars > 20) {
-            numSetChars = 20;
-        }
-        REGEX_DUMP_DEBUG_PRINTF("     Match First Chars : ");
-        for (i=0; i<numSetChars; i++) {
-            UChar32 c = fInitialChars->charAt(i);
-            if (0x20<c && c <0x7e) { 
-                REGEX_DUMP_DEBUG_PRINTF("%c ", c);
-            } else {
-                REGEX_DUMP_DEBUG_PRINTF("%#x ", c);
-            }
-        }
-        if (numSetChars < fInitialChars->size()) {
-            REGEX_DUMP_DEBUG_PRINTF(" ...");
-        }
-        REGEX_DUMP_DEBUG_PRINTF("\n");
+        UnicodeString s;
+        fInitialChars->toPattern(s, TRUE);
+        printf("    Match First Chars: %s\n", CStr(s)());
 
     } else if (fStartType == START_CHAR) {
-        REGEX_DUMP_DEBUG_PRINTF("    First char of Match : ");
-        if (0x20 < fInitialChar && fInitialChar<0x7e) {
-                REGEX_DUMP_DEBUG_PRINTF("%c\n", fInitialChar);
+        printf("    First char of Match: ");
+        if (fInitialChar > 0x20) {
+                printf("'%s'\n", CStr(UnicodeString(fInitialChar))());
             } else {
-                REGEX_DUMP_DEBUG_PRINTF("%#x\n", fInitialChar);
+                printf("%#x\n", fInitialChar);
             }
     }
 
-    REGEX_DUMP_DEBUG_PRINTF("\nIndex   Binary     Type             Operand\n"
+    printf("Named Capture Groups:\n");
+    if (uhash_count(fNamedCaptureMap) == 0) {
+        printf("   None\n");
+    } else {
+        int32_t pos = UHASH_FIRST;
+        const UHashElement *el = NULL;
+        while ((el = uhash_nextElement(fNamedCaptureMap, &pos))) {
+            const UnicodeString *name = (const UnicodeString *)el->key.pointer;
+            int32_t number = el->value.integer;
+            printf("   %d\t%s\n", number, CStr(*name)());
+        }
+    }
+
+    printf("\nIndex   Binary     Type             Operand\n" \
            "-------------------------------------------\n");
     for (index = 0; index<fCompiledPat->size(); index++) {
         dumpOp(index);
     }
-    REGEX_DUMP_DEBUG_PRINTF("\n\n");
+    printf("\n\n");
 #endif
-};
-
+}
 
 
-const char RegexPattern::fgClassID = 0;
 
-//----------------------------------------------------------------------------------
-//
-//   regex_cleanup      Memory cleanup function, free/delete all
-//                      cached memory.  Called by ICU's u_cleanup() function.
-//
-//----------------------------------------------------------------------------------
-U_CFUNC UBool 
-regex_cleanup(void) {
-    RegexCompile::cleanup();
-    return TRUE;
-};
+UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexPattern)
 
 U_NAMESPACE_END
 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS