5 ***************************************************************************
6 * Copyright (C) 2002-2016 International Business Machines Corporation
7 * and others. All rights reserved.
8 ***************************************************************************
11 #include "unicode/utypes.h"
13 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
15 #include "unicode/regex.h"
16 #include "unicode/uclean.h"
30 //--------------------------------------------------------------------------
32 // RegexPattern Default Constructor
34 //--------------------------------------------------------------------------
35 RegexPattern::RegexPattern() {
36 // Init all of this instance's data.
41 //--------------------------------------------------------------------------
43 // Copy Constructor Note: This is a rather inefficient implementation,
44 // but it probably doesn't matter.
46 //--------------------------------------------------------------------------
47 RegexPattern::RegexPattern(const RegexPattern
&other
) : UObject(other
) {
54 //--------------------------------------------------------------------------
56 // Assignment Operator
58 //--------------------------------------------------------------------------
59 RegexPattern
&RegexPattern::operator = (const RegexPattern
&other
) {
61 // Source and destination are the same. Don't do anything.
65 // Clean out any previous contents of object being assigned to.
68 // Give target object a default initialization
72 fDeferredStatus
= other
.fDeferredStatus
;
74 if (U_FAILURE(fDeferredStatus
)) {
78 if (other
.fPatternString
== NULL
) {
79 fPatternString
= NULL
;
80 fPattern
= utext_clone(fPattern
, other
.fPattern
, FALSE
, TRUE
, &fDeferredStatus
);
82 fPatternString
= new UnicodeString(*(other
.fPatternString
));
83 if (fPatternString
== NULL
) {
84 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
86 fPattern
= utext_openConstUnicodeString(NULL
, fPatternString
, &fDeferredStatus
);
89 if (U_FAILURE(fDeferredStatus
)) {
93 fFlags
= other
.fFlags
;
94 fLiteralText
= other
.fLiteralText
;
95 fMinMatchLen
= other
.fMinMatchLen
;
96 fFrameSize
= other
.fFrameSize
;
97 fDataSize
= other
.fDataSize
;
98 fStaticSets
= other
.fStaticSets
;
99 fStaticSets8
= other
.fStaticSets8
;
101 fStartType
= other
.fStartType
;
102 fInitialStringIdx
= other
.fInitialStringIdx
;
103 fInitialStringLen
= other
.fInitialStringLen
;
104 *fInitialChars
= *other
.fInitialChars
;
105 fInitialChar
= other
.fInitialChar
;
106 *fInitialChars8
= *other
.fInitialChars8
;
107 fNeedsAltInput
= other
.fNeedsAltInput
;
109 // Copy the pattern. It's just values, nothing deep to copy.
110 fCompiledPat
->assign(*other
.fCompiledPat
, fDeferredStatus
);
111 fGroupMap
->assign(*other
.fGroupMap
, fDeferredStatus
);
113 // Copy the Unicode Sets.
114 // Could be made more efficient if the sets were reference counted and shared,
115 // but I doubt that pattern copying will be particularly common.
116 // Note: init() already added an empty element zero to fSets
118 int32_t numSets
= other
.fSets
->size();
119 fSets8
= new Regex8BitSet
[numSets
];
120 if (fSets8
== NULL
) {
121 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
124 for (i
=1; i
<numSets
; i
++) {
125 if (U_FAILURE(fDeferredStatus
)) {
128 UnicodeSet
*sourceSet
= (UnicodeSet
*)other
.fSets
->elementAt(i
);
129 UnicodeSet
*newSet
= new UnicodeSet(*sourceSet
);
130 if (newSet
== NULL
) {
131 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
134 fSets
->addElement(newSet
, fDeferredStatus
);
135 fSets8
[i
] = other
.fSets8
[i
];
138 // Copy the named capture group hash map.
139 int32_t hashPos
= UHASH_FIRST
;
140 while (const UHashElement
*hashEl
= uhash_nextElement(other
.fNamedCaptureMap
, &hashPos
)) {
141 if (U_FAILURE(fDeferredStatus
)) {
144 const UnicodeString
*name
= (const UnicodeString
*)hashEl
->key
.pointer
;
145 UnicodeString
*key
= new UnicodeString(*name
);
146 int32_t val
= hashEl
->value
.integer
;
148 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
150 uhash_puti(fNamedCaptureMap
, key
, val
, &fDeferredStatus
);
157 //--------------------------------------------------------------------------
159 // init Shared initialization for use by constructors.
160 // Bring an uninitialized RegexPattern up to a default state.
162 //--------------------------------------------------------------------------
163 void RegexPattern::init() {
166 fLiteralText
.remove();
169 fDeferredStatus
= U_ZERO_ERROR
;
176 fStartType
= START_NO_INFO
;
177 fInitialStringIdx
= 0;
178 fInitialStringLen
= 0;
179 fInitialChars
= NULL
;
181 fInitialChars8
= NULL
;
182 fNeedsAltInput
= FALSE
;
183 fNamedCaptureMap
= NULL
;
185 fPattern
= NULL
; // will be set later
186 fPatternString
= NULL
; // may be set later
187 fCompiledPat
= new UVector64(fDeferredStatus
);
188 fGroupMap
= new UVector32(fDeferredStatus
);
189 fSets
= new UVector(fDeferredStatus
);
190 fInitialChars
= new UnicodeSet
;
191 fInitialChars8
= new Regex8BitSet
;
192 fNamedCaptureMap
= uhash_open(uhash_hashUnicodeString
, // Key hash function
193 uhash_compareUnicodeString
, // Key comparator function
194 uhash_compareLong
, // Value comparator function
196 if (U_FAILURE(fDeferredStatus
)) {
199 if (fCompiledPat
== NULL
|| fGroupMap
== NULL
|| fSets
== NULL
||
200 fInitialChars
== NULL
|| fInitialChars8
== NULL
|| fNamedCaptureMap
== NULL
) {
201 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
205 // Slot zero of the vector of sets is reserved. Fill it here.
206 fSets
->addElement((int32_t)0, fDeferredStatus
);
208 // fNamedCaptureMap owns its key strings, type (UnicodeString *)
209 uhash_setKeyDeleter(fNamedCaptureMap
, uprv_deleteUObject
);
213 //--------------------------------------------------------------------------
215 // zap Delete everything owned by this RegexPattern.
217 //--------------------------------------------------------------------------
218 void RegexPattern::zap() {
222 for (i
=1; i
<fSets
->size(); i
++) {
224 s
= (UnicodeSet
*)fSets
->elementAt(i
);
235 delete fInitialChars
;
236 fInitialChars
= NULL
;
237 delete fInitialChars8
;
238 fInitialChars8
= NULL
;
239 if (fPattern
!= NULL
) {
240 utext_close(fPattern
);
243 if (fPatternString
!= NULL
) {
244 delete fPatternString
;
245 fPatternString
= NULL
;
247 uhash_close(fNamedCaptureMap
);
248 fNamedCaptureMap
= NULL
;
252 //--------------------------------------------------------------------------
256 //--------------------------------------------------------------------------
257 RegexPattern::~RegexPattern() {
262 //--------------------------------------------------------------------------
266 //--------------------------------------------------------------------------
267 RegexPattern
*RegexPattern::clone() const {
268 RegexPattern
*copy
= new RegexPattern(*this);
273 //--------------------------------------------------------------------------
275 // operator == (comparison) Consider two patterns to be == if the
276 // pattern strings and the flags are the same.
277 // Note that pattern strings with the same
278 // characters can still be considered different.
280 //--------------------------------------------------------------------------
281 UBool
RegexPattern::operator ==(const RegexPattern
&other
) const {
282 if (this->fFlags
== other
.fFlags
&& this->fDeferredStatus
== other
.fDeferredStatus
) {
283 if (this->fPatternString
!= NULL
&& other
.fPatternString
!= NULL
) {
284 return *(this->fPatternString
) == *(other
.fPatternString
);
285 } else if (this->fPattern
== NULL
) {
286 if (other
.fPattern
== NULL
) {
289 } else if (other
.fPattern
!= NULL
) {
290 UTEXT_SETNATIVEINDEX(this->fPattern
, 0);
291 UTEXT_SETNATIVEINDEX(other
.fPattern
, 0);
292 return utext_equals(this->fPattern
, other
.fPattern
);
298 //---------------------------------------------------------------------
302 //---------------------------------------------------------------------
303 RegexPattern
* U_EXPORT2
304 RegexPattern::compile(const UnicodeString
®ex
,
309 if (U_FAILURE(status
)) {
313 const uint32_t allFlags
= UREGEX_CANON_EQ
| UREGEX_CASE_INSENSITIVE
| UREGEX_COMMENTS
|
314 UREGEX_DOTALL
| UREGEX_MULTILINE
| UREGEX_UWORD
|
315 UREGEX_ERROR_ON_UNKNOWN_ESCAPES
| UREGEX_UNIX_LINES
| UREGEX_LITERAL
;
317 if ((flags
& ~allFlags
) != 0) {
318 status
= U_REGEX_INVALID_FLAG
;
322 if ((flags
& UREGEX_CANON_EQ
) != 0) {
323 status
= U_REGEX_UNIMPLEMENTED
;
327 RegexPattern
*This
= new RegexPattern
;
329 status
= U_MEMORY_ALLOCATION_ERROR
;
332 if (U_FAILURE(This
->fDeferredStatus
)) {
333 status
= This
->fDeferredStatus
;
337 This
->fFlags
= flags
;
339 RegexCompile
compiler(This
, status
);
340 compiler
.compile(regex
, pe
, status
);
342 if (U_FAILURE(status
)) {
352 // compile, UText mode
354 RegexPattern
* U_EXPORT2
355 RegexPattern::compile(UText
*regex
,
360 if (U_FAILURE(status
)) {
364 const uint32_t allFlags
= UREGEX_CANON_EQ
| UREGEX_CASE_INSENSITIVE
| UREGEX_COMMENTS
|
365 UREGEX_DOTALL
| UREGEX_MULTILINE
| UREGEX_UWORD
|
366 UREGEX_ERROR_ON_UNKNOWN_ESCAPES
| UREGEX_UNIX_LINES
| UREGEX_LITERAL
;
368 if ((flags
& ~allFlags
) != 0) {
369 status
= U_REGEX_INVALID_FLAG
;
373 if ((flags
& UREGEX_CANON_EQ
) != 0) {
374 status
= U_REGEX_UNIMPLEMENTED
;
378 RegexPattern
*This
= new RegexPattern
;
380 status
= U_MEMORY_ALLOCATION_ERROR
;
383 if (U_FAILURE(This
->fDeferredStatus
)) {
384 status
= This
->fDeferredStatus
;
388 This
->fFlags
= flags
;
390 RegexCompile
compiler(This
, status
);
391 compiler
.compile(regex
, pe
, status
);
393 if (U_FAILURE(status
)) {
402 // compile with default flags.
404 RegexPattern
* U_EXPORT2
405 RegexPattern::compile(const UnicodeString
®ex
,
409 return compile(regex
, 0, pe
, err
);
414 // compile with default flags, UText mode
416 RegexPattern
* U_EXPORT2
417 RegexPattern::compile(UText
*regex
,
421 return compile(regex
, 0, pe
, err
);
426 // compile with no UParseErr parameter.
428 RegexPattern
* U_EXPORT2
429 RegexPattern::compile(const UnicodeString
®ex
,
434 return compile(regex
, flags
, pe
, err
);
439 // compile with no UParseErr parameter, UText mode
441 RegexPattern
* U_EXPORT2
442 RegexPattern::compile(UText
*regex
,
447 return compile(regex
, flags
, pe
, err
);
451 //---------------------------------------------------------------------
455 //---------------------------------------------------------------------
456 uint32_t RegexPattern::flags() const {
461 //---------------------------------------------------------------------
463 // matcher(UnicodeString, err)
465 //---------------------------------------------------------------------
466 RegexMatcher
*RegexPattern::matcher(const UnicodeString
&input
,
467 UErrorCode
&status
) const {
468 RegexMatcher
*retMatcher
= matcher(status
);
469 if (retMatcher
!= NULL
) {
470 retMatcher
->fDeferredStatus
= status
;
471 retMatcher
->reset(input
);
477 //---------------------------------------------------------------------
481 //---------------------------------------------------------------------
482 RegexMatcher
*RegexPattern::matcher(UErrorCode
&status
) const {
483 RegexMatcher
*retMatcher
= NULL
;
485 if (U_FAILURE(status
)) {
488 if (U_FAILURE(fDeferredStatus
)) {
489 status
= fDeferredStatus
;
493 retMatcher
= new RegexMatcher(this);
494 if (retMatcher
== NULL
) {
495 status
= U_MEMORY_ALLOCATION_ERROR
;
503 //---------------------------------------------------------------------
505 // matches Convenience function to test for a match, starting
506 // with a pattern string and a data string.
508 //---------------------------------------------------------------------
509 UBool U_EXPORT2
RegexPattern::matches(const UnicodeString
®ex
,
510 const UnicodeString
&input
,
512 UErrorCode
&status
) {
514 if (U_FAILURE(status
)) {return FALSE
;}
517 RegexPattern
*pat
= NULL
;
518 RegexMatcher
*matcher
= NULL
;
520 pat
= RegexPattern::compile(regex
, 0, pe
, status
);
521 matcher
= pat
->matcher(input
, status
);
522 retVal
= matcher
->matches(status
);
531 // matches, UText mode
533 UBool U_EXPORT2
RegexPattern::matches(UText
*regex
,
536 UErrorCode
&status
) {
538 if (U_FAILURE(status
)) {return FALSE
;}
540 UBool retVal
= FALSE
;
541 RegexPattern
*pat
= NULL
;
542 RegexMatcher
*matcher
= NULL
;
544 pat
= RegexPattern::compile(regex
, 0, pe
, status
);
545 matcher
= pat
->matcher(status
);
546 if (U_SUCCESS(status
)) {
547 matcher
->reset(input
);
548 retVal
= matcher
->matches(status
);
560 //---------------------------------------------------------------------
564 //---------------------------------------------------------------------
565 UnicodeString
RegexPattern::pattern() const {
566 if (fPatternString
!= NULL
) {
567 return *fPatternString
;
568 } else if (fPattern
== NULL
) {
569 return UnicodeString();
571 UErrorCode status
= U_ZERO_ERROR
;
572 int64_t nativeLen
= utext_nativeLength(fPattern
);
573 int32_t len16
= utext_extract(fPattern
, 0, nativeLen
, NULL
, 0, &status
); // buffer overflow error
574 UnicodeString result
;
576 status
= U_ZERO_ERROR
;
577 UChar
*resultChars
= result
.getBuffer(len16
);
578 utext_extract(fPattern
, 0, nativeLen
, resultChars
, len16
, &status
); // unterminated warning
579 result
.releaseBuffer(len16
);
588 //---------------------------------------------------------------------
592 //---------------------------------------------------------------------
593 UText
*RegexPattern::patternText(UErrorCode
&status
) const {
594 if (U_FAILURE(status
)) {return NULL
;}
595 status
= U_ZERO_ERROR
;
597 if (fPattern
!= NULL
) {
600 RegexStaticSets::initGlobals(&status
);
601 return RegexStaticSets::gStaticSets
->fEmptyText
;
606 //--------------------------------------------------------------------------------
608 // groupNumberFromName()
610 //--------------------------------------------------------------------------------
611 int32_t RegexPattern::groupNumberFromName(const UnicodeString
&groupName
, UErrorCode
&status
) const {
612 if (U_FAILURE(status
)) {
616 // No need to explicitly check for syntactically valid names.
617 // Invalid ones will never be in the map, and the lookup will fail.
619 int32_t number
= uhash_geti(fNamedCaptureMap
, &groupName
);
621 status
= U_REGEX_INVALID_CAPTURE_GROUP_NAME
;
626 int32_t RegexPattern::groupNumberFromName(const char *groupName
, int32_t nameLength
, UErrorCode
&status
) const {
627 if (U_FAILURE(status
)) {
630 UnicodeString
name(groupName
, nameLength
, US_INV
);
631 return groupNumberFromName(name
, status
);
635 //---------------------------------------------------------------------
639 //---------------------------------------------------------------------
640 int32_t RegexPattern::split(const UnicodeString
&input
,
641 UnicodeString dest
[],
642 int32_t destCapacity
,
643 UErrorCode
&status
) const
645 if (U_FAILURE(status
)) {
649 RegexMatcher
m(this);
651 // Check m's status to make sure all is ok.
652 if (U_SUCCESS(m
.fDeferredStatus
)) {
653 r
= m
.split(input
, dest
, destCapacity
, status
);
661 int32_t RegexPattern::split(UText
*input
,
663 int32_t destCapacity
,
664 UErrorCode
&status
) const
666 if (U_FAILURE(status
)) {
670 RegexMatcher
m(this);
672 // Check m's status to make sure all is ok.
673 if (U_SUCCESS(m
.fDeferredStatus
)) {
674 r
= m
.split(input
, dest
, destCapacity
, status
);
680 //---------------------------------------------------------------------
682 // dump Output the compiled form of the pattern.
683 // Debugging function only.
685 //---------------------------------------------------------------------
686 void RegexPattern::dumpOp(int32_t index
) const {
687 (void)index
; // Suppress warnings in non-debug build.
688 #if defined(REGEX_DEBUG)
689 static const char * const opNames
[] = {URX_OPCODE_NAMES
};
690 int32_t op
= fCompiledPat
->elementAti(index
);
691 int32_t val
= URX_VAL(op
);
692 int32_t type
= URX_TYPE(op
);
693 int32_t pinnedType
= type
;
694 if ((uint32_t)pinnedType
>= UPRV_LENGTHOF(opNames
)) {
698 printf("%4d %08x %-15s ", index
, op
, opNames
[pinnedType
]);
706 case URX_BACKSLASH_G
:
707 case URX_BACKSLASH_X
:
711 // Types with no operand field of interest.
714 case URX_RESERVED_OP
:
715 case URX_START_CAPTURE
:
716 case URX_END_CAPTURE
:
721 case URX_BACKSLASH_B
:
722 case URX_BACKSLASH_BU
:
723 case URX_BACKSLASH_D
:
724 case URX_BACKSLASH_Z
:
727 case URX_CTR_INIT_NG
:
729 case URX_CTR_LOOP_NG
:
730 case URX_RELOC_OPRND
:
734 case URX_STO_INP_LOC
:
746 case URX_BACKSLASH_H
:
747 case URX_BACKSLASH_R
:
748 case URX_BACKSLASH_V
:
749 // types with an integer operand field.
758 printf("'%s'", CStr(UnicodeString(val
))());
765 int32_t lengthOp
= fCompiledPat
->elementAti(index
+1);
766 U_ASSERT(URX_TYPE(lengthOp
) == URX_STRING_LEN
);
767 int32_t length
= URX_VAL(lengthOp
);
768 UnicodeString
str(fLiteralText
, val
, length
);
769 printf("%s", CStr(str
)());
777 UnicodeSet
*set
= (UnicodeSet
*)fSets
->elementAt(val
);
778 set
->toPattern(s
, TRUE
);
779 printf("%s", CStr(s
)());
783 case URX_STATIC_SETREF
:
784 case URX_STAT_SETREF_N
:
787 if (val
& URX_NEG_SET
) {
791 UnicodeSet
*set
= fStaticSets
[val
];
792 set
->toPattern(s
, TRUE
);
793 printf("%s", CStr(s
)());
807 void RegexPattern::dumpPattern() const {
808 #if defined(REGEX_DEBUG)
811 UnicodeString patStr
;
812 for (UChar32 c
= utext_next32From(fPattern
, 0); c
!= U_SENTINEL
; c
= utext_next32(fPattern
)) {
815 printf("Original Pattern: \"%s\"\n", CStr(patStr
)());
816 printf(" Min Match Length: %d\n", fMinMatchLen
);
817 printf(" Match Start Type: %s\n", START_OF_MATCH_STR(fStartType
));
818 if (fStartType
== START_STRING
) {
819 UnicodeString
initialString(fLiteralText
,fInitialStringIdx
, fInitialStringLen
);
820 printf(" Initial match string: \"%s\"\n", CStr(initialString
)());
821 } else if (fStartType
== START_SET
) {
823 fInitialChars
->toPattern(s
, TRUE
);
824 printf(" Match First Chars: %s\n", CStr(s
)());
826 } else if (fStartType
== START_CHAR
) {
827 printf(" First char of Match: ");
828 if (fInitialChar
> 0x20) {
829 printf("'%s'\n", CStr(UnicodeString(fInitialChar
))());
831 printf("%#x\n", fInitialChar
);
835 printf("Named Capture Groups:\n");
836 if (uhash_count(fNamedCaptureMap
) == 0) {
839 int32_t pos
= UHASH_FIRST
;
840 const UHashElement
*el
= NULL
;
841 while ((el
= uhash_nextElement(fNamedCaptureMap
, &pos
))) {
842 const UnicodeString
*name
= (const UnicodeString
*)el
->key
.pointer
;
843 int32_t number
= el
->value
.integer
;
844 printf(" %d\t%s\n", number
, CStr(*name
)());
848 printf("\nIndex Binary Type Operand\n" \
849 "-------------------------------------------\n");
850 for (index
= 0; index
<fCompiledPat
->size(); index
++) {
859 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexPattern
)
862 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS