1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
7 ***************************************************************************
8 * Copyright (C) 2002-2016 International Business Machines Corporation
9 * and others. All rights reserved.
10 ***************************************************************************
13 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
17 #include "unicode/regex.h"
18 #include "unicode/uclean.h"
32 //--------------------------------------------------------------------------
34 // RegexPattern Default Constructor
36 //--------------------------------------------------------------------------
// Default constructor.
// NOTE(review): only the comment below is visible in this listing; the
// statement(s) that perform the initialization and the closing brace are
// missing from the extraction.
37 RegexPattern::RegexPattern() {
38 // Init all of this instance's data.
43 //--------------------------------------------------------------------------
45 // Copy Constructor Note: This is a rather inefficient implementation,
46 // but it probably doesn't matter.
48 //--------------------------------------------------------------------------
// Copy constructor: passes `other` to the UObject base-class constructor.
// NOTE(review): the constructor body is not visible in this listing.
49 RegexPattern::RegexPattern(const RegexPattern
&other
) : UObject(other
) {
56 //--------------------------------------------------------------------------
58 // Assignment Operator
60 //--------------------------------------------------------------------------
// Copy-assignment operator: makes this pattern a copy of `other`.
// Errors are recorded in fDeferredStatus rather than reported directly.
// NOTE(review): this listing omits some original lines (the self-assignment
// test and reset that the first three comments describe, early returns,
// `else` keywords, the declaration of `i`, and closing braces).
61 RegexPattern
&RegexPattern::operator = (const RegexPattern
&other
) {
63 // Source and destination are the same. Don't do anything.
67 // Clean out any previous contents of object being assigned to.
70 // Give target object a default initialization
// Take over the source's deferred status first; a failed source is checked
// immediately below.
74 fDeferredStatus
= other
.fDeferredStatus
;
76 if (U_FAILURE(fDeferredStatus
)) {
// Duplicate the pattern text. When the source has no UnicodeString form only
// its UText is cloned (utext_clone is called with FALSE, TRUE for its two
// boolean arguments).
80 if (other
.fPatternString
== NULL
) {
81 fPatternString
= NULL
;
82 fPattern
= utext_clone(fPattern
, other
.fPattern
, FALSE
, TRUE
, &fDeferredStatus
);
// Presumably the `else` branch (keyword elided): copy the source string and
// open a UText over the new copy.
84 fPatternString
= new UnicodeString(*(other
.fPatternString
));
85 if (fPatternString
== NULL
) {
86 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
88 fPattern
= utext_openConstUnicodeString(NULL
, fPatternString
, &fDeferredStatus
);
91 if (U_FAILURE(fDeferredStatus
)) {
// Member-wise copies of the simple fields.
95 fFlags
= other
.fFlags
;
96 fLiteralText
= other
.fLiteralText
;
97 fMinMatchLen
= other
.fMinMatchLen
;
98 fFrameSize
= other
.fFrameSize
;
99 fDataSize
= other
.fDataSize
;
100 fStaticSets
= other
.fStaticSets
;
101 fStaticSets8
= other
.fStaticSets8
;
103 fStartType
= other
.fStartType
;
104 fInitialStringIdx
= other
.fInitialStringIdx
;
105 fInitialStringLen
= other
.fInitialStringLen
;
// fInitialChars / fInitialChars8 are assigned through the pointers, so the
// pointed-to objects are copied by value rather than the pointers shared.
106 *fInitialChars
= *other
.fInitialChars
;
107 fInitialChar
= other
.fInitialChar
;
108 *fInitialChars8
= *other
.fInitialChars8
;
109 fNeedsAltInput
= other
.fNeedsAltInput
;
111 // Copy the pattern. It's just values, nothing deep to copy.
112 fCompiledPat
->assign(*other
.fCompiledPat
, fDeferredStatus
);
113 fGroupMap
->assign(*other
.fGroupMap
, fDeferredStatus
);
115 // Copy the Unicode Sets.
116 // Could be made more efficient if the sets were reference counted and shared,
117 // but I doubt that pattern copying will be particularly common.
118 // Note: init() already added an empty element zero to fSets
// fSets8 is allocated with one entry per element of the source's fSets.
120 int32_t numSets
= other
.fSets
->size();
121 fSets8
= new Regex8BitSet
[numSets
];
122 if (fSets8
== NULL
) {
123 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
// The loop starts at 1 because element zero is the reserved slot mentioned in
// the note above. Each source set is copied into a newly allocated UnicodeSet.
126 for (i
=1; i
<numSets
; i
++) {
127 if (U_FAILURE(fDeferredStatus
)) {
130 UnicodeSet
*sourceSet
= (UnicodeSet
*)other
.fSets
->elementAt(i
);
131 UnicodeSet
*newSet
= new UnicodeSet(*sourceSet
);
132 if (newSet
== NULL
) {
133 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
136 fSets
->addElement(newSet
, fDeferredStatus
);
137 fSets8
[i
] = other
.fSets8
[i
];
140 // Copy the named capture group hash map.
// Each key string is duplicated with `new` before being inserted into this
// object's map with its integer value.
141 int32_t hashPos
= UHASH_FIRST
;
142 while (const UHashElement
*hashEl
= uhash_nextElement(other
.fNamedCaptureMap
, &hashPos
)) {
143 if (U_FAILURE(fDeferredStatus
)) {
146 const UnicodeString
*name
= (const UnicodeString
*)hashEl
->key
.pointer
;
147 UnicodeString
*key
= new UnicodeString(*name
);
148 int32_t val
= hashEl
->value
.integer
;
// NOTE(review): presumably guarded by a `key == NULL` test on an elided line,
// with the uhash_puti() call in the matching else branch - confirm.
150 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
152 uhash_puti(fNamedCaptureMap
, key
, val
, &fDeferredStatus
);
159 //--------------------------------------------------------------------------
161 // init Shared initialization for use by constructors.
162 // Bring an uninitialized RegexPattern up to a default state.
164 //--------------------------------------------------------------------------
// init(): resets the visible members to default values, allocates the
// containers this object uses, and records any failure in fDeferredStatus.
// NOTE(review): this listing omits some original lines (several member
// resets, the last uhash_open() argument, early returns, closing braces).
165 void RegexPattern::init() {
168 fLiteralText
.remove();
171 fDeferredStatus
= U_ZERO_ERROR
;
178 fStartType
= START_NO_INFO
;
179 fInitialStringIdx
= 0;
180 fInitialStringLen
= 0;
// The pointer members are nulled here and then assigned from `new` /
// uhash_open() further down.
181 fInitialChars
= NULL
;
183 fInitialChars8
= NULL
;
184 fNeedsAltInput
= FALSE
;
185 fNamedCaptureMap
= NULL
;
187 fPattern
= NULL
; // will be set later
188 fPatternString
= NULL
; // may be set later
// The three vectors take fDeferredStatus in their constructors, so a failure
// there is picked up by the U_FAILURE check below.
189 fCompiledPat
= new UVector64(fDeferredStatus
);
190 fGroupMap
= new UVector32(fDeferredStatus
);
191 fSets
= new UVector(fDeferredStatus
);
192 fInitialChars
= new UnicodeSet
;
193 fInitialChars8
= new Regex8BitSet
;
194 fNamedCaptureMap
= uhash_open(uhash_hashUnicodeString
, // Key hash function
195 uhash_compareUnicodeString
, // Key comparator function
196 uhash_compareLong
, // Value comparator function
198 if (U_FAILURE(fDeferredStatus
)) {
// Any allocation that came back NULL is reported as a memory allocation error.
201 if (fCompiledPat
== NULL
|| fGroupMap
== NULL
|| fSets
== NULL
||
202 fInitialChars
== NULL
|| fInitialChars8
== NULL
|| fNamedCaptureMap
== NULL
) {
203 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
207 // Slot zero of the vector of sets is reserved. Fill it here.
208 fSets
->addElement((int32_t)0, fDeferredStatus
);
210 // fNamedCaptureMap owns its key strings, type (UnicodeString *)
211 uhash_setKeyDeleter(fNamedCaptureMap
, uprv_deleteUObject
);
215 //--------------------------------------------------------------------------
217 // zap Delete everything owned by this RegexPattern.
219 //--------------------------------------------------------------------------
// zap(): releases what this object owns. In the visible lines that is the
// initial-character sets, the pattern UText, the pattern string and the
// named-capture hash table.
// NOTE(review): this listing omits some original lines (the declarations of
// `i` and `s`, whatever is done with each `s`, the release of the remaining
// members, and closing braces).
220 void RegexPattern::zap() {
// Walks fSets starting at index 1 (slot zero is skipped).
// NOTE(review): fSets is dereferenced with no visible null check - confirm
// this is safe when allocation failed earlier.
224 for (i
=1; i
<fSets
->size(); i
++) {
226 s
= (UnicodeSet
*)fSets
->elementAt(i
);
// Pointers are nulled after deletion.
237 delete fInitialChars
;
238 fInitialChars
= NULL
;
239 delete fInitialChars8
;
240 fInitialChars8
= NULL
;
241 if (fPattern
!= NULL
) {
242 utext_close(fPattern
);
245 if (fPatternString
!= NULL
) {
246 delete fPatternString
;
247 fPatternString
= NULL
;
249 uhash_close(fNamedCaptureMap
);
250 fNamedCaptureMap
= NULL
;
254 //--------------------------------------------------------------------------
258 //--------------------------------------------------------------------------
// Destructor.
// NOTE(review): the body and closing brace are not visible in this listing.
259 RegexPattern::~RegexPattern() {
264 //--------------------------------------------------------------------------
268 //--------------------------------------------------------------------------
// clone(): heap-allocates a new RegexPattern copy-constructed from *this.
// NOTE(review): the return statement and closing brace are not visible in
// this listing.
269 RegexPattern
*RegexPattern::clone() const {
270 RegexPattern
*copy
= new RegexPattern(*this);
275 //--------------------------------------------------------------------------
277 // operator == (comparison) Consider two patterns to be == if the
278 // pattern strings and the flags are the same.
279 // Note that pattern strings with the same
280 // characters can still be considered different.
282 //--------------------------------------------------------------------------
// Equality comparison.
// The visible logic first requires equal fFlags and equal fDeferredStatus.
// It then compares the UnicodeString forms when both objects have one, and
// otherwise compares the UText forms with utext_equals().
// NOTE(review): the return statements for the NULL/NULL case and for the
// mismatch cases, plus closing braces, are not visible in this listing.
283 UBool
RegexPattern::operator ==(const RegexPattern
&other
) const {
284 if (this->fFlags
== other
.fFlags
&& this->fDeferredStatus
== other
.fDeferredStatus
) {
285 if (this->fPatternString
!= NULL
&& other
.fPatternString
!= NULL
) {
286 return *(this->fPatternString
) == *(other
.fPatternString
);
287 } else if (this->fPattern
== NULL
) {
288 if (other
.fPattern
== NULL
) {
291 } else if (other
.fPattern
!= NULL
) {
// Both UTexts are moved to native index 0 before utext_equals() is called.
292 UTEXT_SETNATIVEINDEX(this->fPattern
, 0);
293 UTEXT_SETNATIVEINDEX(other
.fPattern
, 0);
294 return utext_equals(this->fPattern
, other
.fPattern
);
300 //---------------------------------------------------------------------
304 //---------------------------------------------------------------------
// compile() for a UnicodeString pattern: validates `flags`, allocates a
// RegexPattern, stores the flags on it and runs RegexCompile over the
// pattern text.
// NOTE(review): '®ex' in the parameter list looks like a mis-decoded
// '&regex' (the body refers to `regex`). The remaining parameters, the early
// returns, the handling after a failed compile and the closing braces are not
// visible in this listing.
305 RegexPattern
* U_EXPORT2
306 RegexPattern::compile(const UnicodeString
®ex
,
311 if (U_FAILURE(status
)) {
// Mask of every option bit this function accepts.
315 const uint32_t allFlags
= UREGEX_CANON_EQ
| UREGEX_CASE_INSENSITIVE
| UREGEX_COMMENTS
|
316 UREGEX_DOTALL
| UREGEX_MULTILINE
| UREGEX_UWORD
|
317 UREGEX_ERROR_ON_UNKNOWN_ESCAPES
| UREGEX_UNIX_LINES
| UREGEX_LITERAL
;
// Any bit outside the mask is reported as U_REGEX_INVALID_FLAG.
319 if ((flags
& ~allFlags
) != 0) {
320 status
= U_REGEX_INVALID_FLAG
;
// UREGEX_CANON_EQ is part of the mask but is reported as unimplemented.
324 if ((flags
& UREGEX_CANON_EQ
) != 0) {
325 status
= U_REGEX_UNIMPLEMENTED
;
329 RegexPattern
*This
= new RegexPattern
;
// NOTE(review): presumably guarded by a null check on `This` that is elided.
331 status
= U_MEMORY_ALLOCATION_ERROR
;
// An error stored on the new object (fDeferredStatus) is passed on to the
// caller's status.
334 if (U_FAILURE(This
->fDeferredStatus
)) {
335 status
= This
->fDeferredStatus
;
339 This
->fFlags
= flags
;
341 RegexCompile
compiler(This
, status
);
342 compiler
.compile(regex
, pe
, status
);
344 if (U_FAILURE(status
)) {
354 // compile, UText mode
// compile() for a UText pattern. The visible steps are the same as in the
// UnicodeString overload: validate `flags`, allocate a RegexPattern, store
// the flags on it and run RegexCompile over the pattern text.
// NOTE(review): the remaining parameters, the early returns, the handling
// after a failed compile and the closing braces are not visible in this
// listing.
356 RegexPattern
* U_EXPORT2
357 RegexPattern::compile(UText
*regex
,
362 if (U_FAILURE(status
)) {
// Mask of every option bit this function accepts.
366 const uint32_t allFlags
= UREGEX_CANON_EQ
| UREGEX_CASE_INSENSITIVE
| UREGEX_COMMENTS
|
367 UREGEX_DOTALL
| UREGEX_MULTILINE
| UREGEX_UWORD
|
368 UREGEX_ERROR_ON_UNKNOWN_ESCAPES
| UREGEX_UNIX_LINES
| UREGEX_LITERAL
;
// Any bit outside the mask is reported as U_REGEX_INVALID_FLAG.
370 if ((flags
& ~allFlags
) != 0) {
371 status
= U_REGEX_INVALID_FLAG
;
// UREGEX_CANON_EQ is part of the mask but is reported as unimplemented.
375 if ((flags
& UREGEX_CANON_EQ
) != 0) {
376 status
= U_REGEX_UNIMPLEMENTED
;
380 RegexPattern
*This
= new RegexPattern
;
// NOTE(review): presumably guarded by a null check on `This` that is elided.
382 status
= U_MEMORY_ALLOCATION_ERROR
;
// An error stored on the new object (fDeferredStatus) is passed on to the
// caller's status.
385 if (U_FAILURE(This
->fDeferredStatus
)) {
386 status
= This
->fDeferredStatus
;
390 This
->fFlags
= flags
;
392 RegexCompile
compiler(This
, status
);
393 compiler
.compile(regex
, pe
, status
);
395 if (U_FAILURE(status
)) {
404 // compile with default flags.
// Convenience overload: forwards to another compile() overload with 0 as the
// flags argument.
// NOTE(review): '®ex' looks like a mis-decoded '&regex'; the declarations
// of `pe` and `err` and the closing brace are not visible in this listing.
406 RegexPattern
* U_EXPORT2
407 RegexPattern::compile(const UnicodeString
®ex
,
411 return compile(regex
, 0, pe
, err
);
416 // compile with default flags, UText mode
// Convenience overload for a UText pattern: forwards to another compile()
// overload with 0 as the flags argument.
// NOTE(review): the declarations of `pe` and `err` and the closing brace are
// not visible in this listing.
418 RegexPattern
* U_EXPORT2
419 RegexPattern::compile(UText
*regex
,
423 return compile(regex
, 0, pe
, err
);
428 // compile with no UParseErr parameter.
// Convenience overload: forwards `regex`, `flags`, `pe` and `err` to another
// compile() overload.
// NOTE(review): '®ex' looks like a mis-decoded '&regex'. The preceding
// comment says this overload has no parse-error parameter, so `pe` is
// presumably a local declared on an elided line - confirm.
430 RegexPattern
* U_EXPORT2
431 RegexPattern::compile(const UnicodeString
®ex
,
436 return compile(regex
, flags
, pe
, err
);
441 // compile with no UParseErr parameter, UText mode
// Convenience overload for a UText pattern: forwards `regex`, `flags`, `pe`
// and `err` to another compile() overload.
// NOTE(review): the preceding comment says this overload has no parse-error
// parameter, so `pe` is presumably a local declared on an elided line -
// confirm.
443 RegexPattern
* U_EXPORT2
444 RegexPattern::compile(UText
*regex
,
449 return compile(regex
, flags
, pe
, err
);
453 //---------------------------------------------------------------------
457 //---------------------------------------------------------------------
// flags() accessor, returning uint32_t.
// NOTE(review): the body is not visible in this listing.
458 uint32_t RegexPattern::flags() const {
463 //---------------------------------------------------------------------
465 // matcher(UnicodeString, err)
467 //---------------------------------------------------------------------
// matcher(input, status): obtains a matcher from matcher(status) and, when
// one was returned, stores `status` in its fDeferredStatus and resets it onto
// `input`.
// NOTE(review): the return statement and closing braces are not visible in
// this listing.
468 RegexMatcher
*RegexPattern::matcher(const UnicodeString
&input
,
469 UErrorCode
&status
) const {
470 RegexMatcher
*retMatcher
= matcher(status
);
471 if (retMatcher
!= NULL
) {
472 retMatcher
->fDeferredStatus
= status
;
473 retMatcher
->reset(input
);
479 //---------------------------------------------------------------------
483 //---------------------------------------------------------------------
// matcher(status): allocates a RegexMatcher bound to this pattern.
// A failure already stored in this pattern's fDeferredStatus is copied into
// `status`, and a NULL result from `new` is reported as
// U_MEMORY_ALLOCATION_ERROR.
// NOTE(review): the early returns, the final return and the closing braces
// are not visible in this listing.
484 RegexMatcher
*RegexPattern::matcher(UErrorCode
&status
) const {
485 RegexMatcher
*retMatcher
= NULL
;
487 if (U_FAILURE(status
)) {
490 if (U_FAILURE(fDeferredStatus
)) {
491 status
= fDeferredStatus
;
495 retMatcher
= new RegexMatcher(this);
496 if (retMatcher
== NULL
) {
497 status
= U_MEMORY_ALLOCATION_ERROR
;
505 //---------------------------------------------------------------------
507 // matches Convenience function to test for a match, starting
508 // with a pattern string and a data string.
510 //---------------------------------------------------------------------
// matches(): one-shot helper that compiles `regex` with flags 0, creates a
// matcher on `input` and calls matches() on it. Returns FALSE immediately
// when `status` already indicates failure.
// NOTE(review): '®ex' looks like a mis-decoded '&regex'. The `pe`
// parameter, the declaration of `retVal`, any cleanup of `pat` / `matcher`,
// the return and the closing brace are not visible in this listing.
511 UBool U_EXPORT2
RegexPattern::matches(const UnicodeString
®ex
,
512 const UnicodeString
&input
,
514 UErrorCode
&status
) {
516 if (U_FAILURE(status
)) {return FALSE
;}
519 RegexPattern
*pat
= NULL
;
520 RegexMatcher
*matcher
= NULL
;
// NOTE(review): `pat` and `matcher` are dereferenced below with no visible
// NULL or status check after compile() - confirm behaviour when compilation
// fails.
522 pat
= RegexPattern::compile(regex
, 0, pe
, status
);
523 matcher
= pat
->matcher(input
, status
);
524 retVal
= matcher
->matches(status
);
533 // matches, UText mode
// matches() for UText: compiles `regex` with flags 0, creates a matcher and,
// only if `status` is still successful, resets it onto `input` and calls
// matches(). Returns FALSE immediately when `status` already indicates
// failure.
// NOTE(review): the `input` and `pe` parameters, any cleanup of `pat` /
// `matcher`, the return and the closing braces are not visible in this
// listing.
535 UBool U_EXPORT2
RegexPattern::matches(UText
*regex
,
538 UErrorCode
&status
) {
540 if (U_FAILURE(status
)) {return FALSE
;}
542 UBool retVal
= FALSE
;
543 RegexPattern
*pat
= NULL
;
544 RegexMatcher
*matcher
= NULL
;
// NOTE(review): `pat` is dereferenced with no visible NULL check after
// compile(); the U_SUCCESS test below guards only the calls made on `matcher`.
546 pat
= RegexPattern::compile(regex
, 0, pe
, status
);
547 matcher
= pat
->matcher(status
);
548 if (U_SUCCESS(status
)) {
549 matcher
->reset(input
);
550 retVal
= matcher
->matches(status
);
562 //---------------------------------------------------------------------
566 //---------------------------------------------------------------------
// pattern(): returns the pattern text as a UnicodeString.
// Returns the stored string when there is one, an empty string when there is
// no UText either, and otherwise extracts the text from the UText.
// NOTE(review): the final `else`, the return of `result` and the closing
// braces are not visible in this listing.
567 UnicodeString
RegexPattern::pattern() const {
568 if (fPatternString
!= NULL
) {
569 return *fPatternString
;
570 } else if (fPattern
== NULL
) {
571 return UnicodeString();
// Two extraction passes. The first passes a NULL buffer of capacity 0 and
// keeps the returned length in len16; per the trailing comment it is expected
// to report a buffer-overflow error, which is why status is reset before the
// second pass.
573 UErrorCode status
= U_ZERO_ERROR
;
574 int64_t nativeLen
= utext_nativeLength(fPattern
);
575 int32_t len16
= utext_extract(fPattern
, 0, nativeLen
, NULL
, 0, &status
); // buffer overflow error
576 UnicodeString result
;
578 status
= U_ZERO_ERROR
;
// The second pass extracts into the buffer obtained from `result`.
// NOTE(review): the pointer returned by getBuffer() is used with no visible
// NULL check - confirm.
579 UChar
*resultChars
= result
.getBuffer(len16
);
580 utext_extract(fPattern
, 0, nativeLen
, resultChars
, len16
, &status
); // unterminated warning
581 result
.releaseBuffer(len16
);
590 //---------------------------------------------------------------------
594 //---------------------------------------------------------------------
// patternText(): returns the pattern as a UText. Returns NULL when `status`
// already indicates failure; otherwise `status` is set to U_ZERO_ERROR.
// When fPattern is NULL, the visible code initializes the RegexStaticSets
// globals and returns their shared fEmptyText.
// NOTE(review): the statement inside the `fPattern != NULL` branch
// (presumably returning fPattern), the `else` and the closing braces are not
// visible in this listing.
595 UText
*RegexPattern::patternText(UErrorCode
&status
) const {
596 if (U_FAILURE(status
)) {return NULL
;}
597 status
= U_ZERO_ERROR
;
599 if (fPattern
!= NULL
) {
602 RegexStaticSets::initGlobals(&status
);
603 return RegexStaticSets::gStaticSets
->fEmptyText
;
608 //--------------------------------------------------------------------------------
610 // groupNumberFromName()
612 //--------------------------------------------------------------------------------
// groupNumberFromName(): looks `groupName` up in fNamedCaptureMap with
// uhash_geti().
// NOTE(review): the early return, the condition under which `status` is set
// to U_REGEX_INVALID_CAPTURE_GROUP_NAME (presumably a failed lookup), the
// final return and the closing braces are not visible in this listing.
613 int32_t RegexPattern::groupNumberFromName(const UnicodeString
&groupName
, UErrorCode
&status
) const {
614 if (U_FAILURE(status
)) {
618 // No need to explicitly check for syntactically valid names.
619 // Invalid ones will never be in the map, and the lookup will fail.
621 int32_t number
= uhash_geti(fNamedCaptureMap
, &groupName
);
623 status
= U_REGEX_INVALID_CAPTURE_GROUP_NAME
;
// groupNumberFromName() for a char* name: builds a UnicodeString from
// `groupName` / `nameLength` using the US_INV constructor argument and
// delegates to the UnicodeString overload.
// NOTE(review): the early return taken when `status` is a failure and the
// closing braces are not visible in this listing.
628 int32_t RegexPattern::groupNumberFromName(const char *groupName
, int32_t nameLength
, UErrorCode
&status
) const {
629 if (U_FAILURE(status
)) {
632 UnicodeString
name(groupName
, nameLength
, US_INV
);
633 return groupNumberFromName(name
, status
);
637 //---------------------------------------------------------------------
641 //---------------------------------------------------------------------
// split(): constructs a temporary RegexMatcher on this pattern and, when the
// matcher's fDeferredStatus is successful, delegates to its split().
// NOTE(review): the opening brace, the early return, the declaration of `r`,
// the final return and the closing braces are not visible in this listing.
642 int32_t RegexPattern::split(const UnicodeString
&input
,
643 UnicodeString dest
[],
644 int32_t destCapacity
,
645 UErrorCode
&status
) const
647 if (U_FAILURE(status
)) {
651 RegexMatcher
m(this);
653 // Check m's status to make sure all is ok.
654 if (U_SUCCESS(m
.fDeferredStatus
)) {
655 r
= m
.split(input
, dest
, destCapacity
, status
);
// split() for UText: constructs a temporary RegexMatcher on this pattern and,
// when the matcher's fDeferredStatus is successful, delegates to its split().
// NOTE(review): the `dest` parameter, the opening brace, the early return,
// the declaration of `r`, the final return and the closing braces are not
// visible in this listing.
663 int32_t RegexPattern::split(UText
*input
,
665 int32_t destCapacity
,
666 UErrorCode
&status
) const
668 if (U_FAILURE(status
)) {
672 RegexMatcher
m(this);
674 // Check m's status to make sure all is ok.
675 if (U_SUCCESS(m
.fDeferredStatus
)) {
676 r
= m
.split(input
, dest
, destCapacity
, status
);
682 //---------------------------------------------------------------------
684 // dump Output the compiled form of the pattern.
685 // Debugging function only.
687 //---------------------------------------------------------------------
// dumpOp(): debugging aid that prints one compiled-pattern operation with
// printf. Everything after the (void) cast is compiled only when REGEX_DEBUG
// is defined.
// NOTE(review): this listing omits many original lines (the `switch`
// statement, a number of `case` labels, `break`s, the declarations of `s`,
// the matching #endif and closing braces), so the grouping of the case labels
// below is incomplete.
688 void RegexPattern::dumpOp(int32_t index
) const {
689 (void)index
; // Suppress warnings in non-debug build.
690 #if defined(REGEX_DEBUG)
691 static const char * const opNames
[] = {URX_OPCODE_NAMES
};
// Split the operation word into its value and type fields.
692 int32_t op
= fCompiledPat
->elementAti(index
);
693 int32_t val
= URX_VAL(op
);
694 int32_t type
= URX_TYPE(op
);
// pinnedType is the index used for opNames; the range check below compares it
// against the table length (the statement inside that check is elided).
695 int32_t pinnedType
= type
;
696 if ((uint32_t)pinnedType
>= UPRV_LENGTHOF(opNames
)) {
700 printf("%4d %08x %-15s ", index
, op
, opNames
[pinnedType
]);
708 case URX_BACKSLASH_G
:
709 case URX_BACKSLASH_X
:
713 // Types with no operand field of interest.
716 case URX_RESERVED_OP
:
717 case URX_START_CAPTURE
:
718 case URX_END_CAPTURE
:
723 case URX_BACKSLASH_B
:
724 case URX_BACKSLASH_BU
:
725 case URX_BACKSLASH_D
:
726 case URX_BACKSLASH_Z
:
729 case URX_CTR_INIT_NG
:
731 case URX_CTR_LOOP_NG
:
732 case URX_RELOC_OPRND
:
736 case URX_STO_INP_LOC
:
748 case URX_BACKSLASH_H
:
749 case URX_BACKSLASH_R
:
750 case URX_BACKSLASH_V
:
751 // types with an integer operand field.
// Prints `val` as a quoted one-character string.
760 printf("'%s'", CStr(UnicodeString(val
))());
// Reads the following operation (index+1) for a length, asserts that it is a
// URX_STRING_LEN op, and prints that many units of fLiteralText from `val`.
767 int32_t lengthOp
= fCompiledPat
->elementAti(index
+1);
768 U_ASSERT(URX_TYPE(lengthOp
) == URX_STRING_LEN
);
769 int32_t length
= URX_VAL(lengthOp
);
770 UnicodeString
str(fLiteralText
, val
, length
);
771 printf("%s", CStr(str
)());
// Prints the pattern form of the set stored at fSets[val].
779 UnicodeSet
*set
= (UnicodeSet
*)fSets
->elementAt(val
);
780 set
->toPattern(s
, TRUE
);
781 printf("%s", CStr(s
)());
785 case URX_STATIC_SETREF
:
786 case URX_STAT_SETREF_N
:
// Tests the URX_NEG_SET bit of `val`; the branch body is elided.
789 if (val
& URX_NEG_SET
) {
// Prints the pattern form of the set at fStaticSets[val].
793 UnicodeSet
*set
= fStaticSets
[val
];
794 set
->toPattern(s
, TRUE
);
795 printf("%s", CStr(s
)());
// dumpPattern(): debugging aid, compiled only when REGEX_DEBUG is defined.
// It prints the original pattern text, the minimum match length, the
// match-start information, the named capture groups and then iterates over
// the compiled operations.
// NOTE(review): this listing omits some original lines (the body of the
// text-gathering loop, the declarations of `s` and `index`, some `else`
// branches, the body of the final loop, the matching #endif and closing
// braces).
809 void RegexPattern::dumpPattern() const {
810 #if defined(REGEX_DEBUG)
// Iterates over the pattern UText one code point at a time, starting at
// index 0; the loop body is elided.
813 UnicodeString patStr
;
814 for (UChar32 c
= utext_next32From(fPattern
, 0); c
!= U_SENTINEL
; c
= utext_next32(fPattern
)) {
817 printf("Original Pattern: \"%s\"\n", CStr(patStr
)());
818 printf(" Min Match Length: %d\n", fMinMatchLen
);
819 printf(" Match Start Type: %s\n", START_OF_MATCH_STR(fStartType
));
// What is printed next depends on fStartType: a literal string, a set of
// characters, or a single character.
820 if (fStartType
== START_STRING
) {
821 UnicodeString
initialString(fLiteralText
,fInitialStringIdx
, fInitialStringLen
);
822 printf(" Initial match string: \"%s\"\n", CStr(initialString
)());
823 } else if (fStartType
== START_SET
) {
825 fInitialChars
->toPattern(s
, TRUE
);
826 printf(" Match First Chars: %s\n", CStr(s
)());
828 } else if (fStartType
== START_CHAR
) {
829 printf(" First char of Match: ");
// Values above 0x20 are printed as a quoted character. The hex printf below
// is presumably the else branch (keyword elided).
830 if (fInitialChar
> 0x20) {
831 printf("'%s'\n", CStr(UnicodeString(fInitialChar
))());
833 printf("%#x\n", fInitialChar
);
837 printf("Named Capture Groups:\n");
838 if (uhash_count(fNamedCaptureMap
) == 0) {
// Walks the hash table and prints each group number with its name.
841 int32_t pos
= UHASH_FIRST
;
842 const UHashElement
*el
= NULL
;
843 while ((el
= uhash_nextElement(fNamedCaptureMap
, &pos
))) {
844 const UnicodeString
*name
= (const UnicodeString
*)el
->key
.pointer
;
845 int32_t number
= el
->value
.integer
;
846 printf(" %d\t%s\n", number
, CStr(*name
)());
// Table header, followed by a loop over every element of fCompiledPat.
850 printf("\nIndex Binary Type Operand\n" \
851 "-------------------------------------------\n");
852 for (index
= 0; index
<fCompiledPat
->size(); index
++) {
// Invokes the UOBJECT_DEFINE_RTTI_IMPLEMENTATION macro for RegexPattern.
// NOTE(review): the macro is defined outside this file; presumably it emits
// the class's RTTI support members - confirm against its definition.
861 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexPattern
)
864 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS