1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
7 ***************************************************************************
8 * Copyright (C) 2002-2016 International Business Machines Corporation
9 * and others. All rights reserved.
10 ***************************************************************************
13 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
17 #include "unicode/regex.h"
18 #include "unicode/uclean.h"
32 //--------------------------------------------------------------------------
34 // RegexPattern Default Constructor
//
//     Per the in-body comment, sets up all of this instance's data.
//     NOTE(review): the statements of the body are not visible in this
//     listing; presumably it delegates to init(), which this file describes
//     as the shared initialization for constructors -- confirm.
36 //--------------------------------------------------------------------------
37 RegexPattern::RegexPattern() {
38 // Init all of this instance's data.
43 //--------------------------------------------------------------------------
45 // Copy Constructor Note: This is a rather inefficient implementation,
46 // but it probably doesn't matter.
//
//     The UObject base sub-object is initialized from `other` in the
//     member-initializer list.
//     NOTE(review): the constructor body is not visible in this listing;
//     presumably it initializes the object and then reuses the assignment
//     operator -- confirm against the full file.
48 //--------------------------------------------------------------------------
49 RegexPattern::RegexPattern(const RegexPattern
&other
) : UObject(other
) {
56 //--------------------------------------------------------------------------
58 // Assignment Operator
//
//     Makes this pattern a copy of `other`: the deferred status, the pattern
//     text (UnicodeString and/or UText form), the scalar fields, the compiled
//     pattern and group map, the UnicodeSets with their 8-bit companions, and
//     the named capture group map.  Failures along the way are recorded in
//     fDeferredStatus.
//     NOTE(review): this listing is missing lines (the embedded line numbers
//     skip), so the self-assignment test, the cleanup / re-initialization
//     calls announced by the comments below, the bodies of the status checks,
//     the closing braces and the return statement are not visible here.
60 //--------------------------------------------------------------------------
61 RegexPattern
&RegexPattern::operator = (const RegexPattern
&other
) {
63 // Source and destination are the same. Don't do anything.
67 // Clean out any previous contents of object being assigned to.
70 // Give target object a default initialization
// Take over the source's deferred status before anything else is copied.
74 fDeferredStatus
= other
.fDeferredStatus
;
// Check the status that was just copied from `other`.
76 if (U_FAILURE(fDeferredStatus
)) {
// Duplicate the pattern text.  A source without a UnicodeString form has only
// its UText cloned (deep = FALSE, readOnly = TRUE); otherwise the string is
// copied and a fresh UText is opened over that copy.
80 if (other
.fPatternString
== NULL
) {
81 fPatternString
= NULL
;
82 fPattern
= utext_clone(fPattern
, other
.fPattern
, FALSE
, TRUE
, &fDeferredStatus
);
84 fPatternString
= new UnicodeString(*(other
.fPatternString
));
85 if (fPatternString
== NULL
) {
86 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
88 fPattern
= utext_openConstUnicodeString(NULL
, fPatternString
, &fDeferredStatus
);
// Re-check the status after the pattern text has been duplicated.
91 if (U_FAILURE(fDeferredStatus
)) {
// Member-by-member copies of the value fields.
95 fFlags
= other
.fFlags
;
96 fLiteralText
= other
.fLiteralText
;
97 fMinMatchLen
= other
.fMinMatchLen
;
98 fFrameSize
= other
.fFrameSize
;
99 fDataSize
= other
.fDataSize
;
100 fStaticSets
= other
.fStaticSets
;
101 fStaticSets8
= other
.fStaticSets8
;
103 fStartType
= other
.fStartType
;
104 fInitialStringIdx
= other
.fInitialStringIdx
;
105 fInitialStringLen
= other
.fInitialStringLen
;
// fInitialChars / fInitialChars8 are copied by value through the pointers;
// the pointers themselves are not reassigned here.
106 *fInitialChars
= *other
.fInitialChars
;
107 fInitialChar
= other
.fInitialChar
;
108 *fInitialChars8
= *other
.fInitialChars8
;
109 fNeedsAltInput
= other
.fNeedsAltInput
;
111 // Copy the pattern. It's just values, nothing deep to copy.
112 fCompiledPat
->assign(*other
.fCompiledPat
, fDeferredStatus
);
113 fGroupMap
->assign(*other
.fGroupMap
, fDeferredStatus
);
115 // Copy the Unicode Sets.
116 // Could be made more efficient if the sets were reference counted and shared,
117 // but I doubt that pattern copying will be particularly common.
118 // Note: init() already added an empty element zero to fSets
// fSets8 gets one Regex8BitSet per entry of other.fSets.
120 int32_t numSets
= other
.fSets
->size();
121 fSets8
= new Regex8BitSet
[numSets
];
122 if (fSets8
== NULL
) {
123 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
// The loop starts at index 1; slot zero is the reserved element mentioned in
// the note above.  Each source set is copy-constructed into a new UnicodeSet
// that is appended to fSets, and the matching 8-bit set is copied by value.
126 for (i
=1; i
<numSets
; i
++) {
127 if (U_FAILURE(fDeferredStatus
)) {
130 UnicodeSet
*sourceSet
= (UnicodeSet
*)other
.fSets
->elementAt(i
);
131 UnicodeSet
*newSet
= new UnicodeSet(*sourceSet
);
132 if (newSet
== NULL
) {
133 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
136 fSets
->addElement(newSet
, fDeferredStatus
);
137 fSets8
[i
] = other
.fSets8
[i
];
140 // Copy the named capture group hash map.
// Only attempted when the source has a map and initNamedCaptureMap() reports
// success.  Every key string is duplicated with new and stored together with
// its integer value via uhash_puti.
141 if (other
.fNamedCaptureMap
!= nullptr && initNamedCaptureMap()) {
142 int32_t hashPos
= UHASH_FIRST
;
143 while (const UHashElement
*hashEl
= uhash_nextElement(other
.fNamedCaptureMap
, &hashPos
)) {
144 if (U_FAILURE(fDeferredStatus
)) {
147 const UnicodeString
*name
= (const UnicodeString
*)hashEl
->key
.pointer
;
148 UnicodeString
*key
= new UnicodeString(*name
);
149 int32_t val
= hashEl
->value
.integer
;
// NOTE(review): the condition guarding this assignment is not visible in the
// listing; presumably it is a NULL check of `key` -- confirm.
151 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
153 uhash_puti(fNamedCaptureMap
, key
, val
, &fDeferredStatus
);
161 //--------------------------------------------------------------------------
163 // init Shared initialization for use by constructors.
164 // Bring an uninitialized RegexPattern up to a default state.
//
//      Resets the visible scalar fields, clears the pointer members, then
//      allocates the owned containers (compiled pattern, group map, set
//      vector, initial-character sets).  Problems are recorded in
//      fDeferredStatus rather than reported directly.
//      NOTE(review): some lines of the body are missing from this listing
//      (the embedded line numbers skip), including the body of the
//      U_FAILURE check and the closing braces.
166 //--------------------------------------------------------------------------
167 void RegexPattern::init() {
170 fLiteralText
.remove();
173 fDeferredStatus
= U_ZERO_ERROR
;
180 fStartType
= START_NO_INFO
;
181 fInitialStringIdx
= 0;
182 fInitialStringLen
= 0;
183 fInitialChars
= NULL
;
185 fInitialChars8
= NULL
;
186 fNeedsAltInput
= FALSE
;
187 fNamedCaptureMap
= NULL
;
189 fPattern
= NULL
; // will be set later
190 fPatternString
= NULL
; // may be set later
// Allocate the owned helper objects.  The three vector constructors receive
// fDeferredStatus, so they can record a failure there.
191 fCompiledPat
= new UVector64(fDeferredStatus
);
192 fGroupMap
= new UVector32(fDeferredStatus
);
193 fSets
= new UVector(fDeferredStatus
);
194 fInitialChars
= new UnicodeSet
;
195 fInitialChars8
= new Regex8BitSet
;
196 if (U_FAILURE(fDeferredStatus
)) {
// Any allocation above that produced NULL is reported as an out-of-memory
// condition.
199 if (fCompiledPat
== NULL
|| fGroupMap
== NULL
|| fSets
== NULL
||
200 fInitialChars
== NULL
|| fInitialChars8
== NULL
) {
201 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
205 // Slot zero of the vector of sets is reserved. Fill it here.
206 fSets
->addElement((int32_t)0, fDeferredStatus
);
// initNamedCaptureMap
//
//     Sets up fNamedCaptureMap as a hash table opened with uhash_openSize,
//     using the UnicodeString hash / compare functions for keys,
//     uhash_compareLong for values and an initial capacity of 7.  The table
//     owns its key strings: uprv_deleteUObject is installed as key deleter.
//     NOTE(review): the return statements and the remaining uhash_openSize
//     argument are not visible in this listing.  The first test suggests that
//     an already existing map is kept as-is, and the U_FAILURE test suggests
//     that a failed open is reported to the caller -- confirm both.
210 bool RegexPattern::initNamedCaptureMap() {
211 if (fNamedCaptureMap
) {
214 fNamedCaptureMap
= uhash_openSize(uhash_hashUnicodeString
, // Key hash function
215 uhash_compareUnicodeString
, // Key comparator function
216 uhash_compareLong
, // Value comparator function
217 7, // Initial table capacity
219 if (U_FAILURE(fDeferredStatus
)) {
223 // fNamedCaptureMap owns its key strings, type (UnicodeString *)
224 uhash_setKeyDeleter(fNamedCaptureMap
, uprv_deleteUObject
);
228 //--------------------------------------------------------------------------
230 // zap Delete everything owned by this RegexPattern.
//
//      Visible here: the walk over fSets starting at index 1, deletion of
//      fInitialChars / fInitialChars8, closing of the fPattern UText,
//      deletion of fPatternString and closing of fNamedCaptureMap.  Released
//      pointers are reset to NULL where the listing shows it.
//      NOTE(review): several lines are missing from this listing (the
//      embedded line numbers skip), among them the declarations of `i` and
//      `s`, whatever is done with each set inside the loop, and the release
//      of the remaining members -- confirm against the full file.
232 //--------------------------------------------------------------------------
233 void RegexPattern::zap() {
// Index 0 of fSets is skipped by this loop.
237 for (i
=1; i
<fSets
->size(); i
++) {
239 s
= (UnicodeSet
*)fSets
->elementAt(i
);
250 delete fInitialChars
;
251 fInitialChars
= NULL
;
252 delete fInitialChars8
;
253 fInitialChars8
= NULL
;
254 if (fPattern
!= NULL
) {
255 utext_close(fPattern
);
258 if (fPatternString
!= NULL
) {
259 delete fPatternString
;
260 fPatternString
= NULL
;
262 if (fNamedCaptureMap
!= NULL
) {
263 uhash_close(fNamedCaptureMap
);
264 fNamedCaptureMap
= NULL
;
269 //--------------------------------------------------------------------------
//
//      Destructor
//
//      NOTE(review): the body is not visible in this listing; presumably it
//      releases the owned members through zap() -- confirm.
273 //--------------------------------------------------------------------------
274 RegexPattern::~RegexPattern() {
279 //--------------------------------------------------------------------------
//
//      clone
//
//      Allocates a new RegexPattern that is copy-constructed from *this.
//      NOTE(review): the return statement is not visible in this listing;
//      presumably `copy` is returned and the caller takes ownership --
//      confirm.
283 //--------------------------------------------------------------------------
284 RegexPattern
*RegexPattern::clone() const {
285 RegexPattern
*copy
= new RegexPattern(*this);
290 //--------------------------------------------------------------------------
292 // operator == (comparison) Consider two patterns to be == if the
293 // pattern strings and the flags are the same.
294 // Note that pattern strings with the same
295 // characters can still be considered different.
//
//      The text comparison is only attempted when both the flags and the
//      deferred status of the two patterns are equal.
//      NOTE(review): the return statements of the NULL-pattern branch and
//      the final fall-through return are not visible in this listing.
297 //--------------------------------------------------------------------------
298 UBool
RegexPattern::operator ==(const RegexPattern
&other
) const {
299 if (this->fFlags
== other
.fFlags
&& this->fDeferredStatus
== other
.fDeferredStatus
) {
// Both sides have a UnicodeString form of the pattern: compare those.
300 if (this->fPatternString
!= NULL
&& other
.fPatternString
!= NULL
) {
301 return *(this->fPatternString
) == *(other
.fPatternString
);
// This side has no pattern text at all.
302 } else if (this->fPattern
== NULL
) {
303 if (other
.fPattern
== NULL
) {
// Both sides have a UText: rewind each to native index 0 and compare them
// with utext_equals.
306 } else if (other
.fPattern
!= NULL
) {
307 UTEXT_SETNATIVEINDEX(this->fPattern
, 0);
308 UTEXT_SETNATIVEINDEX(other
.fPattern
, 0);
309 return utext_equals(this->fPattern
, other
.fPattern
);
315 //---------------------------------------------------------------------
//
//      compile (UnicodeString pattern, with flags)
//
//      Validates the flags, allocates a new RegexPattern, stores the flags
//      in it and runs a RegexCompile over `regex`, reporting through `pe`
//      and `status`.
//      NOTE(review): this listing is missing lines (the embedded line
//      numbers skip): the rest of the parameter list, the bodies of the
//      failure branches, the cleanup after a failed compile and the return
//      statement are not visible.
//      NOTE(review): `®ex` in the signature looks like a mis-encoded
//      `&regex` (the body refers to `regex`); restore it when this listing
//      is repaired.
319 //---------------------------------------------------------------------
320 RegexPattern
* U_EXPORT2
321 RegexPattern::compile(const UnicodeString
®ex
,
326 if (U_FAILURE(status
)) {
// Union of every flag bit this function accepts.
330 const uint32_t allFlags
= UREGEX_CANON_EQ
| UREGEX_CASE_INSENSITIVE
| UREGEX_COMMENTS
|
331 UREGEX_DOTALL
| UREGEX_MULTILINE
| UREGEX_UWORD
|
332 UREGEX_ERROR_ON_UNKNOWN_ESCAPES
| UREGEX_UNIX_LINES
| UREGEX_LITERAL
;
// Any bit outside allFlags is rejected.
334 if ((flags
& ~allFlags
) != 0) {
335 status
= U_REGEX_INVALID_FLAG
;
// UREGEX_CANON_EQ passes the mask above but is reported as unimplemented.
339 if ((flags
& UREGEX_CANON_EQ
) != 0) {
340 status
= U_REGEX_UNIMPLEMENTED
;
344 RegexPattern
*This
= new RegexPattern
;
// NOTE(review): the condition guarding this assignment is not visible;
// presumably it is a NULL check of `This` -- confirm.
346 status
= U_MEMORY_ALLOCATION_ERROR
;
// An error recorded while the pattern object was constructed is handed to the
// caller's status.
349 if (U_FAILURE(This
->fDeferredStatus
)) {
350 status
= This
->fDeferredStatus
;
354 This
->fFlags
= flags
;
356 RegexCompile
compiler(This
, status
);
357 compiler
.compile(regex
, pe
, status
);
359 if (U_FAILURE(status
)) {
369 // compile, UText mode
//
//      Same sequence as the UnicodeString overload visible in this file:
//      validate the flags, allocate a RegexPattern, store the flags and run
//      a RegexCompile over the UText `regex`.
//      NOTE(review): this listing is missing lines (the embedded line
//      numbers skip): the rest of the parameter list, the bodies of the
//      failure branches, the cleanup after a failed compile and the return
//      statement are not visible.
371 RegexPattern
* U_EXPORT2
372 RegexPattern::compile(UText
*regex
,
377 if (U_FAILURE(status
)) {
// Union of every flag bit this function accepts.
381 const uint32_t allFlags
= UREGEX_CANON_EQ
| UREGEX_CASE_INSENSITIVE
| UREGEX_COMMENTS
|
382 UREGEX_DOTALL
| UREGEX_MULTILINE
| UREGEX_UWORD
|
383 UREGEX_ERROR_ON_UNKNOWN_ESCAPES
| UREGEX_UNIX_LINES
| UREGEX_LITERAL
;
// Any bit outside allFlags is rejected.
385 if ((flags
& ~allFlags
) != 0) {
386 status
= U_REGEX_INVALID_FLAG
;
// UREGEX_CANON_EQ passes the mask above but is reported as unimplemented.
390 if ((flags
& UREGEX_CANON_EQ
) != 0) {
391 status
= U_REGEX_UNIMPLEMENTED
;
395 RegexPattern
*This
= new RegexPattern
;
// NOTE(review): the condition guarding this assignment is not visible;
// presumably it is a NULL check of `This` -- confirm.
397 status
= U_MEMORY_ALLOCATION_ERROR
;
// An error recorded while the pattern object was constructed is handed to the
// caller's status.
400 if (U_FAILURE(This
->fDeferredStatus
)) {
401 status
= This
->fDeferredStatus
;
405 This
->fFlags
= flags
;
407 RegexCompile
compiler(This
, status
);
408 compiler
.compile(regex
, pe
, status
);
410 if (U_FAILURE(status
)) {
419 // compile with default flags.
//
//      Forwards to the flags-taking overload with flags = 0, passing `pe`
//      and `err` through unchanged.
//      NOTE(review): the rest of the parameter list is not visible in this
//      listing, and `®ex` looks like a mis-encoded `&regex`.
421 RegexPattern
* U_EXPORT2
422 RegexPattern::compile(const UnicodeString
®ex
,
426 return compile(regex
, 0, pe
, err
);
431 // compile with default flags, UText mode
//
//      Forwards to the flags-taking UText overload with flags = 0, passing
//      `pe` and `err` through unchanged.
//      NOTE(review): the rest of the parameter list is not visible in this
//      listing.
433 RegexPattern
* U_EXPORT2
434 RegexPattern::compile(UText
*regex
,
438 return compile(regex
, 0, pe
, err
);
443 // compile with no UParseErr parameter.
//
//      Forwards to the full overload with the caller's flags and `err`.
//      NOTE(review): the declaration of `pe` is not visible in this listing;
//      presumably it is a local UParseError whose contents are discarded --
//      confirm.  `®ex` looks like a mis-encoded `&regex`.
445 RegexPattern
* U_EXPORT2
446 RegexPattern::compile(const UnicodeString
®ex
,
451 return compile(regex
, flags
, pe
, err
);
456 // compile with no UParseErr parameter, UText mode
//
//      Forwards to the full UText overload with the caller's flags and `err`.
//      NOTE(review): the declaration of `pe` is not visible in this listing;
//      presumably it is a local UParseError whose contents are discarded --
//      confirm.
458 RegexPattern
* U_EXPORT2
459 RegexPattern::compile(UText
*regex
,
464 return compile(regex
, flags
, pe
, err
);
468 //---------------------------------------------------------------------
//
//      flags
//
//      NOTE(review): the body is not visible in this listing; presumably it
//      returns the fFlags value stored by compile() -- confirm.
472 //---------------------------------------------------------------------
473 uint32_t RegexPattern::flags() const {
478 //---------------------------------------------------------------------
480 // matcher(UnicodeString, err)
//
//      Creates a matcher through matcher(status) and, when one was obtained,
//      stores `status` in its fDeferredStatus and resets it onto `input`.
//      NOTE(review): the return statement is not visible in this listing;
//      presumably retMatcher is returned -- confirm.
482 //---------------------------------------------------------------------
483 RegexMatcher
*RegexPattern::matcher(const UnicodeString
&input
,
484 UErrorCode
&status
) const {
485 RegexMatcher
*retMatcher
= matcher(status
);
486 if (retMatcher
!= NULL
) {
487 retMatcher
->fDeferredStatus
= status
;
488 retMatcher
->reset(input
);
494 //---------------------------------------------------------------------
//
//      matcher(status)
//
//      Checks the incoming status and this pattern's deferred status (which
//      is copied into `status` when it is a failure), then allocates a
//      RegexMatcher for this pattern.  A NULL allocation result is reported
//      as U_MEMORY_ALLOCATION_ERROR.
//      NOTE(review): the return statements and the rest of the allocation
//      handling are not visible in this listing.
498 //---------------------------------------------------------------------
499 RegexMatcher
*RegexPattern::matcher(UErrorCode
&status
) const {
500 RegexMatcher
*retMatcher
= NULL
;
502 if (U_FAILURE(status
)) {
505 if (U_FAILURE(fDeferredStatus
)) {
506 status
= fDeferredStatus
;
510 retMatcher
= new RegexMatcher(this);
511 if (retMatcher
== NULL
) {
512 status
= U_MEMORY_ALLOCATION_ERROR
;
520 //---------------------------------------------------------------------
522 // matches Convenience function to test for a match, starting
523 // with a pattern string and a data string.
//
//      Returns FALSE immediately when `status` already holds a failure.
//      Otherwise compiles `regex` with flags 0, creates a matcher on `input`
//      and calls its matches(status).
//      NOTE(review): the declaration of `retVal`, the cleanup of `pat` /
//      `matcher` and the return statement are not visible in this listing;
//      `®ex` looks like a mis-encoded `&regex`.
525 //---------------------------------------------------------------------
526 UBool U_EXPORT2
RegexPattern::matches(const UnicodeString
®ex
,
527 const UnicodeString
&input
,
529 UErrorCode
&status
) {
531 if (U_FAILURE(status
)) {return FALSE
;}
534 RegexPattern
*pat
= NULL
;
535 RegexMatcher
*matcher
= NULL
;
// NOTE(review): `pat` and `matcher` are dereferenced below with no visible
// NULL or status check in between.  If compile() / matcher() can return NULL
// on failure, these are member calls through a null pointer -- confirm.
537 pat
= RegexPattern::compile(regex
, 0, pe
, status
);
538 matcher
= pat
->matcher(input
, status
);
539 retVal
= matcher
->matches(status
);
548 // matches, UText mode
//
//      Returns FALSE immediately when `status` already holds a failure.
//      Otherwise compiles `regex` with flags 0 and creates a matcher; only
//      when `status` is still a success is the matcher reset onto `input`
//      and asked whether it matches.
//      NOTE(review): the `input` / `pe` parameters, the cleanup of `pat` /
//      `matcher` and the return statement are not visible in this listing.
550 UBool U_EXPORT2
RegexPattern::matches(UText
*regex
,
553 UErrorCode
&status
) {
555 if (U_FAILURE(status
)) {return FALSE
;}
557 UBool retVal
= FALSE
;
558 RegexPattern
*pat
= NULL
;
559 RegexMatcher
*matcher
= NULL
;
// NOTE(review): `pat` is dereferenced on the line after compile() with no
// visible NULL or status check.  If compile() can return NULL on failure,
// this is a member call through a null pointer -- confirm.
561 pat
= RegexPattern::compile(regex
, 0, pe
, status
);
562 matcher
= pat
->matcher(status
);
563 if (U_SUCCESS(status
)) {
564 matcher
->reset(input
);
565 retVal
= matcher
->matches(status
);
577 //---------------------------------------------------------------------
//
//      pattern
//
//      Returns the pattern text as a UnicodeString: the stored
//      fPatternString when there is one, an empty string when there is no
//      UText either, and otherwise the contents extracted from the fPattern
//      UText.
//      NOTE(review): the final return statement and the closing braces are
//      not visible in this listing; presumably `result` is returned.
581 //---------------------------------------------------------------------
582 UnicodeString
RegexPattern::pattern() const {
583 if (fPatternString
!= NULL
) {
584 return *fPatternString
;
585 } else if (fPattern
== NULL
) {
586 return UnicodeString();
// Two-pass extraction: the first utext_extract call passes a NULL buffer of
// capacity 0 and is used only for its returned length; the status is cleared
// again before the second call fills the buffer obtained from `result`.
588 UErrorCode status
= U_ZERO_ERROR
;
589 int64_t nativeLen
= utext_nativeLength(fPattern
);
590 int32_t len16
= utext_extract(fPattern
, 0, nativeLen
, NULL
, 0, &status
); // buffer overflow error
591 UnicodeString result
;
593 status
= U_ZERO_ERROR
;
594 UChar
*resultChars
= result
.getBuffer(len16
);
595 utext_extract(fPattern
, 0, nativeLen
, resultChars
, len16
, &status
); // unterminated warning
596 result
.releaseBuffer(len16
);
605 //---------------------------------------------------------------------
//
//      patternText
//
//      Returns NULL when `status` already holds a failure; otherwise resets
//      `status` to U_ZERO_ERROR.  Without a pattern UText, the globals of
//      RegexStaticSets are initialized and their shared fEmptyText is
//      returned.
//      NOTE(review): the body of the `fPattern != NULL` branch is not visible
//      in this listing; presumably it returns fPattern -- confirm.
609 //---------------------------------------------------------------------
610 UText
*RegexPattern::patternText(UErrorCode
&status
) const {
611 if (U_FAILURE(status
)) {return NULL
;}
612 status
= U_ZERO_ERROR
;
614 if (fPattern
!= NULL
) {
617 RegexStaticSets::initGlobals(&status
);
618 return RegexStaticSets::gStaticSets
->fEmptyText
;
623 //--------------------------------------------------------------------------------
625 // groupNumberFromName()
//
//      Looks `groupName` up in fNamedCaptureMap with uhash_geti; a pattern
//      without a map yields 0 for the lookup.
//      NOTE(review): the body of the failure check, the condition guarding
//      the U_REGEX_INVALID_CAPTURE_GROUP_NAME assignment and the return
//      statement are not visible in this listing; presumably the error is
//      set when the lookup result is 0 -- confirm.
627 //--------------------------------------------------------------------------------
628 int32_t RegexPattern::groupNumberFromName(const UnicodeString
&groupName
, UErrorCode
&status
) const {
629 if (U_FAILURE(status
)) {
633 // No need to explicitly check for syntactically valid names.
634 // Invalid ones will never be in the map, and the lookup will fail.
636 int32_t number
= fNamedCaptureMap
? uhash_geti(fNamedCaptureMap
, &groupName
) : 0;
638 status
= U_REGEX_INVALID_CAPTURE_GROUP_NAME
;
// groupNumberFromName(), char * flavor
//
//      Wraps the first `nameLength` chars of `groupName` in a UnicodeString
//      built with the US_INV constructor and forwards to the UnicodeString
//      overload.
//      NOTE(review): the body of the failure check is not visible in this
//      listing.
643 int32_t RegexPattern::groupNumberFromName(const char *groupName
, int32_t nameLength
, UErrorCode
&status
) const {
644 if (U_FAILURE(status
)) {
647 UnicodeString
name(groupName
, nameLength
, US_INV
);
648 return groupNumberFromName(name
, status
);
652 //---------------------------------------------------------------------
//
//      split
//
//      Builds a temporary RegexMatcher for this pattern and, when the
//      matcher's deferred status is a success, delegates to its split() with
//      the caller's arguments.
//      NOTE(review): the declaration of `r`, the body of the failure check,
//      the handling of a failed matcher and the return statement are not
//      visible in this listing.
656 //---------------------------------------------------------------------
657 int32_t RegexPattern::split(const UnicodeString
&input
,
658 UnicodeString dest
[],
659 int32_t destCapacity
,
660 UErrorCode
&status
) const
662 if (U_FAILURE(status
)) {
666 RegexMatcher
m(this);
668 // Check m's status to make sure all is ok.
669 if (U_SUCCESS(m
.fDeferredStatus
)) {
670 r
= m
.split(input
, dest
, destCapacity
, status
);
// split, UText mode
//
//      Builds a temporary RegexMatcher for this pattern and, when the
//      matcher's deferred status is a success, delegates to its split() with
//      the caller's arguments.
//      NOTE(review): the `dest` parameter, the declaration of `r`, the body
//      of the failure check and the return statement are not visible in this
//      listing.
678 int32_t RegexPattern::split(UText
*input
,
680 int32_t destCapacity
,
681 UErrorCode
&status
) const
683 if (U_FAILURE(status
)) {
687 RegexMatcher
m(this);
689 // Check m's status to make sure all is ok.
690 if (U_SUCCESS(m
.fDeferredStatus
)) {
691 r
= m
.split(input
, dest
, destCapacity
, status
);
697 //---------------------------------------------------------------------
699 // dump Output the compiled form of the pattern.
700 // Debugging function only.
//
//      dumpOp prints the compiled-pattern entry at `index`: the index, the
//      raw op word in hex and the opcode name, followed by an operand
//      rendering chosen by opcode type.  The printing code sits behind
//      `#if defined(REGEX_DEBUG)`; the (void) cast of `index` precedes that
//      guard.
//      NOTE(review): large parts of the switch are missing from this listing
//      (the embedded line numbers skip), so which case labels lead to which
//      printf is not reliably visible here.
702 //---------------------------------------------------------------------
703 void RegexPattern::dumpOp(int32_t index
) const {
704 (void)index
; // Suppress warnings in non-debug build.
705 #if defined(REGEX_DEBUG)
706 static const char * const opNames
[] = {URX_OPCODE_NAMES
};
// Split the op word into its value and type parts.
707 int32_t op
= fCompiledPat
->elementAti(index
);
708 int32_t val
= URX_VAL(op
);
709 int32_t type
= URX_TYPE(op
);
// pinnedType is the index used into opNames; it is range-checked against the
// table length first.
// NOTE(review): the body of that check is not visible in this listing.
710 int32_t pinnedType
= type
;
711 if ((uint32_t)pinnedType
>= UPRV_LENGTHOF(opNames
)) {
715 printf("%4d %08x %-15s ", index
, op
, opNames
[pinnedType
]);
723 case URX_BACKSLASH_G
:
724 case URX_BACKSLASH_X
:
728 // Types with no operand field of interest.
731 case URX_RESERVED_OP
:
732 case URX_START_CAPTURE
:
733 case URX_END_CAPTURE
:
738 case URX_BACKSLASH_B
:
739 case URX_BACKSLASH_BU
:
740 case URX_BACKSLASH_D
:
741 case URX_BACKSLASH_Z
:
744 case URX_CTR_INIT_NG
:
746 case URX_CTR_LOOP_NG
:
747 case URX_RELOC_OPRND
:
751 case URX_STO_INP_LOC
:
763 case URX_BACKSLASH_H
:
764 case URX_BACKSLASH_R
:
765 case URX_BACKSLASH_V
:
766 // types with an integer operand field.
// The operand value is printed as a single quoted character.
775 printf("'%s'", CStr(UnicodeString(val
))());
// String operand: the following op word supplies the length, and `val` is the
// start offset of the text inside fLiteralText.
782 int32_t lengthOp
= fCompiledPat
->elementAti(index
+1);
783 U_ASSERT(URX_TYPE(lengthOp
) == URX_STRING_LEN
);
784 int32_t length
= URX_VAL(lengthOp
);
785 UnicodeString
str(fLiteralText
, val
, length
);
786 printf("%s", CStr(str
)());
// Set operand: `val` indexes fSets; the set is printed in pattern form.
794 UnicodeSet
*set
= (UnicodeSet
*)fSets
->elementAt(val
);
795 set
->toPattern(s
, TRUE
);
796 printf("%s", CStr(s
)());
800 case URX_STATIC_SETREF
:
801 case URX_STAT_SETREF_N
:
// Static set operand: the URX_NEG_SET bit of `val` is tested before `val` is
// used to index fStaticSets.
// NOTE(review): the body of that test is not visible in this listing.
804 if (val
& URX_NEG_SET
) {
808 UnicodeSet
*set
= fStaticSets
[val
];
809 set
->toPattern(s
, TRUE
);
810 printf("%s", CStr(s
)());
// dumpPattern
//
//      Debugging aid, compiled only when REGEX_DEBUG is defined.  Prints the
//      original pattern text, the minimum match length, the match start type
//      with its type-specific detail, the named capture groups and a header
//      for the table of compiled operations, then walks fCompiledPat.
//      NOTE(review): several lines are missing from this listing (the
//      embedded line numbers skip), including the body of the pattern-text
//      loop, the declarations of `s` and `index`, and the body of the final
//      loop (presumably a dumpOp(index) call) -- confirm.
824 void RegexPattern::dumpPattern() const {
825 #if defined(REGEX_DEBUG)
828 UnicodeString patStr
;
// Iterate over the pattern UText one code point at a time, from native index 0
// until U_SENTINEL.
829 for (UChar32 c
= utext_next32From(fPattern
, 0); c
!= U_SENTINEL
; c
= utext_next32(fPattern
)) {
832 printf("Original Pattern: \"%s\"\n", CStr(patStr
)());
833 printf(" Min Match Length: %d\n", fMinMatchLen
);
834 printf(" Match Start Type: %s\n", START_OF_MATCH_STR(fStartType
));
// Extra detail per start type: the initial literal string, the set of
// possible first characters, or the single first character.
835 if (fStartType
== START_STRING
) {
836 UnicodeString
initialString(fLiteralText
,fInitialStringIdx
, fInitialStringLen
);
837 printf(" Initial match string: \"%s\"\n", CStr(initialString
)());
838 } else if (fStartType
== START_SET
) {
840 fInitialChars
->toPattern(s
, TRUE
);
841 printf(" Match First Chars: %s\n", CStr(s
)());
843 } else if (fStartType
== START_CHAR
) {
844 printf(" First char of Match: ");
// Characters above 0x20 are printed as text, others as a hex value.
845 if (fInitialChar
> 0x20) {
846 printf("'%s'\n", CStr(UnicodeString(fInitialChar
))());
848 printf("%#x\n", fInitialChar
);
852 printf("Named Capture Groups:\n");
// A missing or empty map is tested first; otherwise every (number, name) pair
// in the hash table is printed.
// NOTE(review): the body of the missing/empty branch is not visible in this
// listing.
853 if (!fNamedCaptureMap
|| uhash_count(fNamedCaptureMap
) == 0) {
856 int32_t pos
= UHASH_FIRST
;
857 const UHashElement
*el
= NULL
;
858 while ((el
= uhash_nextElement(fNamedCaptureMap
, &pos
))) {
859 const UnicodeString
*name
= (const UnicodeString
*)el
->key
.pointer
;
860 int32_t number
= el
->value
.integer
;
861 printf(" %d\t%s\n", number
, CStr(*name
)());
865 printf("\nIndex Binary Type Operand\n" \
866 "-------------------------------------------\n");
// Visit every entry of the compiled pattern in index order.
867 for (index
= 0; index
<fCompiledPat
->size(); index
++) {
// Invokes the UOBJECT_DEFINE_RTTI_IMPLEMENTATION macro for RegexPattern.
// NOTE(review): the macro is defined outside this file; presumably it emits
// the class-ID (ICU "poor man's RTTI") member definitions for the class --
// confirm against the UObject headers.
876 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexPattern
)
879 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS