5 ***************************************************************************
6 * Copyright (C) 2002-2015 International Business Machines Corporation *
7 * and others. All rights reserved. *
8 ***************************************************************************
11 #include "unicode/utypes.h"
13 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
15 #include "unicode/regex.h"
16 #include "unicode/uclean.h"
28 //--------------------------------------------------------------------------
30 // RegexPattern Default Constructor
32 //--------------------------------------------------------------------------
33 RegexPattern::RegexPattern() {
34 // Init all of this instance's data.
39 //--------------------------------------------------------------------------
41 // Copy Constructor Note: This is a rather inefficient implementation,
42 // but it probably doesn't matter.
44 //--------------------------------------------------------------------------
45 RegexPattern::RegexPattern(const RegexPattern
&other
) : UObject(other
) {
52 //--------------------------------------------------------------------------
54 // Assignment Operator
56 //--------------------------------------------------------------------------
57 RegexPattern
&RegexPattern::operator = (const RegexPattern
&other
) {
59 // Source and destination are the same. Don't do anything.
63 // Clean out any previous contents of object being assigned to.
66 // Give target object a default initialization
70 fDeferredStatus
= other
.fDeferredStatus
;
72 if (U_FAILURE(fDeferredStatus
)) {
76 if (other
.fPatternString
== NULL
) {
77 fPatternString
= NULL
;
78 fPattern
= utext_clone(fPattern
, other
.fPattern
, FALSE
, TRUE
, &fDeferredStatus
);
80 fPatternString
= new UnicodeString(*(other
.fPatternString
));
81 if (fPatternString
== NULL
) {
82 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
84 fPattern
= utext_openConstUnicodeString(NULL
, fPatternString
, &fDeferredStatus
);
87 if (U_FAILURE(fDeferredStatus
)) {
91 fFlags
= other
.fFlags
;
92 fLiteralText
= other
.fLiteralText
;
93 fMinMatchLen
= other
.fMinMatchLen
;
94 fFrameSize
= other
.fFrameSize
;
95 fDataSize
= other
.fDataSize
;
96 fStaticSets
= other
.fStaticSets
;
97 fStaticSets8
= other
.fStaticSets8
;
99 fStartType
= other
.fStartType
;
100 fInitialStringIdx
= other
.fInitialStringIdx
;
101 fInitialStringLen
= other
.fInitialStringLen
;
102 *fInitialChars
= *other
.fInitialChars
;
103 fInitialChar
= other
.fInitialChar
;
104 *fInitialChars8
= *other
.fInitialChars8
;
105 fNeedsAltInput
= other
.fNeedsAltInput
;
107 // Copy the pattern. It's just values, nothing deep to copy.
108 fCompiledPat
->assign(*other
.fCompiledPat
, fDeferredStatus
);
109 fGroupMap
->assign(*other
.fGroupMap
, fDeferredStatus
);
111 // Copy the Unicode Sets.
112 // Could be made more efficient if the sets were reference counted and shared,
113 // but I doubt that pattern copying will be particularly common.
114 // Note: init() already added an empty element zero to fSets
116 int32_t numSets
= other
.fSets
->size();
117 fSets8
= new Regex8BitSet
[numSets
];
118 if (fSets8
== NULL
) {
119 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
122 for (i
=1; i
<numSets
; i
++) {
123 if (U_FAILURE(fDeferredStatus
)) {
126 UnicodeSet
*sourceSet
= (UnicodeSet
*)other
.fSets
->elementAt(i
);
127 UnicodeSet
*newSet
= new UnicodeSet(*sourceSet
);
128 if (newSet
== NULL
) {
129 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
132 fSets
->addElement(newSet
, fDeferredStatus
);
133 fSets8
[i
] = other
.fSets8
[i
];
136 // Copy the named capture group hash map.
137 int32_t hashPos
= UHASH_FIRST
;
138 while (const UHashElement
*hashEl
= uhash_nextElement(other
.fNamedCaptureMap
, &hashPos
)) {
139 if (U_FAILURE(fDeferredStatus
)) {
142 const UnicodeString
*name
= (const UnicodeString
*)hashEl
->key
.pointer
;
143 UnicodeString
*key
= new UnicodeString(*name
);
144 int32_t val
= hashEl
->value
.integer
;
146 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
148 uhash_puti(fNamedCaptureMap
, key
, val
, &fDeferredStatus
);
155 //--------------------------------------------------------------------------
157 // init Shared initialization for use by constructors.
158 // Bring an uninitialized RegexPattern up to a default state.
160 //--------------------------------------------------------------------------
161 void RegexPattern::init() {
164 fLiteralText
.remove();
167 fDeferredStatus
= U_ZERO_ERROR
;
174 fStartType
= START_NO_INFO
;
175 fInitialStringIdx
= 0;
176 fInitialStringLen
= 0;
177 fInitialChars
= NULL
;
179 fInitialChars8
= NULL
;
180 fNeedsAltInput
= FALSE
;
181 fNamedCaptureMap
= NULL
;
183 fPattern
= NULL
; // will be set later
184 fPatternString
= NULL
; // may be set later
185 fCompiledPat
= new UVector64(fDeferredStatus
);
186 fGroupMap
= new UVector32(fDeferredStatus
);
187 fSets
= new UVector(fDeferredStatus
);
188 fInitialChars
= new UnicodeSet
;
189 fInitialChars8
= new Regex8BitSet
;
190 fNamedCaptureMap
= uhash_open(uhash_hashUnicodeString
, // Key hash function
191 uhash_compareUnicodeString
, // Key comparator function
192 uhash_compareLong
, // Value comparator function
194 if (U_FAILURE(fDeferredStatus
)) {
197 if (fCompiledPat
== NULL
|| fGroupMap
== NULL
|| fSets
== NULL
||
198 fInitialChars
== NULL
|| fInitialChars8
== NULL
|| fNamedCaptureMap
== NULL
) {
199 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
203 // Slot zero of the vector of sets is reserved. Fill it here.
204 fSets
->addElement((int32_t)0, fDeferredStatus
);
206 // fNamedCaptureMap owns its key strings, type (UnicodeString *)
207 uhash_setKeyDeleter(fNamedCaptureMap
, uprv_deleteUObject
);
211 //--------------------------------------------------------------------------
213 // zap Delete everything owned by this RegexPattern.
215 //--------------------------------------------------------------------------
216 void RegexPattern::zap() {
220 for (i
=1; i
<fSets
->size(); i
++) {
222 s
= (UnicodeSet
*)fSets
->elementAt(i
);
233 delete fInitialChars
;
234 fInitialChars
= NULL
;
235 delete fInitialChars8
;
236 fInitialChars8
= NULL
;
237 if (fPattern
!= NULL
) {
238 utext_close(fPattern
);
241 if (fPatternString
!= NULL
) {
242 delete fPatternString
;
243 fPatternString
= NULL
;
245 uhash_close(fNamedCaptureMap
);
246 fNamedCaptureMap
= NULL
;
250 //--------------------------------------------------------------------------
252 // Destructor
254 //--------------------------------------------------------------------------
255 RegexPattern::~RegexPattern() {
260 //--------------------------------------------------------------------------
262 // clone Return a newly allocated copy of this RegexPattern.
264 //--------------------------------------------------------------------------
265 RegexPattern
*RegexPattern::clone() const {
266 RegexPattern
*copy
= new RegexPattern(*this);
271 //--------------------------------------------------------------------------
273 // operator == (comparison) Consider two patterns to be == if the
274 // pattern strings and the flags are the same.
275 // Note that pattern strings with the same
276 // characters can still be considered different.
278 //--------------------------------------------------------------------------
279 UBool
RegexPattern::operator ==(const RegexPattern
&other
) const {
280 if (this->fFlags
== other
.fFlags
&& this->fDeferredStatus
== other
.fDeferredStatus
) {
281 if (this->fPatternString
!= NULL
&& other
.fPatternString
!= NULL
) {
282 return *(this->fPatternString
) == *(other
.fPatternString
);
283 } else if (this->fPattern
== NULL
) {
284 if (other
.fPattern
== NULL
) {
287 } else if (other
.fPattern
!= NULL
) {
288 UTEXT_SETNATIVEINDEX(this->fPattern
, 0);
289 UTEXT_SETNATIVEINDEX(other
.fPattern
, 0);
290 return utext_equals(this->fPattern
, other
.fPattern
);
296 //---------------------------------------------------------------------
298 // compile Build a RegexPattern from a UnicodeString pattern and flags.
300 //---------------------------------------------------------------------
301 RegexPattern
* U_EXPORT2
302 RegexPattern::compile(const UnicodeString
®ex
,
307 if (U_FAILURE(status
)) {
311 const uint32_t allFlags
= UREGEX_CANON_EQ
| UREGEX_CASE_INSENSITIVE
| UREGEX_COMMENTS
|
312 UREGEX_DOTALL
| UREGEX_MULTILINE
| UREGEX_UWORD
|
313 UREGEX_ERROR_ON_UNKNOWN_ESCAPES
| UREGEX_UNIX_LINES
| UREGEX_LITERAL
;
315 if ((flags
& ~allFlags
) != 0) {
316 status
= U_REGEX_INVALID_FLAG
;
320 if ((flags
& UREGEX_CANON_EQ
) != 0) {
321 status
= U_REGEX_UNIMPLEMENTED
;
325 RegexPattern
*This
= new RegexPattern
;
327 status
= U_MEMORY_ALLOCATION_ERROR
;
330 if (U_FAILURE(This
->fDeferredStatus
)) {
331 status
= This
->fDeferredStatus
;
335 This
->fFlags
= flags
;
337 RegexCompile
compiler(This
, status
);
338 compiler
.compile(regex
, pe
, status
);
340 if (U_FAILURE(status
)) {
350 // compile, UText mode
352 RegexPattern
* U_EXPORT2
353 RegexPattern::compile(UText
*regex
,
358 if (U_FAILURE(status
)) {
362 const uint32_t allFlags
= UREGEX_CANON_EQ
| UREGEX_CASE_INSENSITIVE
| UREGEX_COMMENTS
|
363 UREGEX_DOTALL
| UREGEX_MULTILINE
| UREGEX_UWORD
|
364 UREGEX_ERROR_ON_UNKNOWN_ESCAPES
| UREGEX_UNIX_LINES
| UREGEX_LITERAL
;
366 if ((flags
& ~allFlags
) != 0) {
367 status
= U_REGEX_INVALID_FLAG
;
371 if ((flags
& UREGEX_CANON_EQ
) != 0) {
372 status
= U_REGEX_UNIMPLEMENTED
;
376 RegexPattern
*This
= new RegexPattern
;
378 status
= U_MEMORY_ALLOCATION_ERROR
;
381 if (U_FAILURE(This
->fDeferredStatus
)) {
382 status
= This
->fDeferredStatus
;
386 This
->fFlags
= flags
;
388 RegexCompile
compiler(This
, status
);
389 compiler
.compile(regex
, pe
, status
);
391 if (U_FAILURE(status
)) {
400 // compile with default flags.
402 RegexPattern
* U_EXPORT2
403 RegexPattern::compile(const UnicodeString
®ex
,
407 return compile(regex
, 0, pe
, err
);
412 // compile with default flags, UText mode
414 RegexPattern
* U_EXPORT2
415 RegexPattern::compile(UText
*regex
,
419 return compile(regex
, 0, pe
, err
);
424 // compile with no UParseErr parameter.
426 RegexPattern
* U_EXPORT2
427 RegexPattern::compile(const UnicodeString
®ex
,
432 return compile(regex
, flags
, pe
, err
);
437 // compile with no UParseErr parameter, UText mode
439 RegexPattern
* U_EXPORT2
440 RegexPattern::compile(UText
*regex
,
445 return compile(regex
, flags
, pe
, err
);
449 //---------------------------------------------------------------------
451 // flags
453 //---------------------------------------------------------------------
454 uint32_t RegexPattern::flags() const {
459 //---------------------------------------------------------------------
461 // matcher(UnicodeString, err)
463 //---------------------------------------------------------------------
464 RegexMatcher
*RegexPattern::matcher(const UnicodeString
&input
,
465 UErrorCode
&status
) const {
466 RegexMatcher
*retMatcher
= matcher(status
);
467 if (retMatcher
!= NULL
) {
468 retMatcher
->fDeferredStatus
= status
;
469 retMatcher
->reset(input
);
475 //---------------------------------------------------------------------
477 // matcher(status) Create a new RegexMatcher for this pattern.
479 //---------------------------------------------------------------------
480 RegexMatcher
*RegexPattern::matcher(UErrorCode
&status
) const {
481 RegexMatcher
*retMatcher
= NULL
;
483 if (U_FAILURE(status
)) {
486 if (U_FAILURE(fDeferredStatus
)) {
487 status
= fDeferredStatus
;
491 retMatcher
= new RegexMatcher(this);
492 if (retMatcher
== NULL
) {
493 status
= U_MEMORY_ALLOCATION_ERROR
;
501 //---------------------------------------------------------------------
503 // matches Convenience function to test for a match, starting
504 // with a pattern string and a data string.
506 //---------------------------------------------------------------------
507 UBool U_EXPORT2
RegexPattern::matches(const UnicodeString
®ex
,
508 const UnicodeString
&input
,
510 UErrorCode
&status
) {
512 if (U_FAILURE(status
)) {return FALSE
;}
515 RegexPattern
*pat
= NULL
;
516 RegexMatcher
*matcher
= NULL
;
518 pat
= RegexPattern::compile(regex
, 0, pe
, status
);
519 matcher
= pat
->matcher(input
, status
);
520 retVal
= matcher
->matches(status
);
529 // matches, UText mode
531 UBool U_EXPORT2
RegexPattern::matches(UText
*regex
,
534 UErrorCode
&status
) {
536 if (U_FAILURE(status
)) {return FALSE
;}
538 UBool retVal
= FALSE
;
539 RegexPattern
*pat
= NULL
;
540 RegexMatcher
*matcher
= NULL
;
542 pat
= RegexPattern::compile(regex
, 0, pe
, status
);
543 matcher
= pat
->matcher(status
);
544 if (U_SUCCESS(status
)) {
545 matcher
->reset(input
);
546 retVal
= matcher
->matches(status
);
558 //---------------------------------------------------------------------
560 // pattern Return the pattern as a UnicodeString.
562 //---------------------------------------------------------------------
563 UnicodeString
RegexPattern::pattern() const {
564 if (fPatternString
!= NULL
) {
565 return *fPatternString
;
566 } else if (fPattern
== NULL
) {
567 return UnicodeString();
569 UErrorCode status
= U_ZERO_ERROR
;
570 int64_t nativeLen
= utext_nativeLength(fPattern
);
571 int32_t len16
= utext_extract(fPattern
, 0, nativeLen
, NULL
, 0, &status
); // preflight: no buffer is supplied, so a buffer overflow error is expected here
572 UnicodeString result
;
574 status
= U_ZERO_ERROR
;
575 UChar
*resultChars
= result
.getBuffer(len16
);
576 utext_extract(fPattern
, 0, nativeLen
, resultChars
, len16
, &status
); // the buffer holds exactly len16 units, so an unterminated-string warning is expected here
577 result
.releaseBuffer(len16
);
586 //---------------------------------------------------------------------
588 // patternText Return the pattern as a UText.
590 //---------------------------------------------------------------------
591 UText
*RegexPattern::patternText(UErrorCode
&status
) const {
592 if (U_FAILURE(status
)) {return NULL
;}
593 status
= U_ZERO_ERROR
;
595 if (fPattern
!= NULL
) {
598 RegexStaticSets::initGlobals(&status
);
599 return RegexStaticSets::gStaticSets
->fEmptyText
;
604 //--------------------------------------------------------------------------------
606 // groupNumberFromName()
608 //--------------------------------------------------------------------------------
609 int32_t RegexPattern::groupNumberFromName(const UnicodeString
&groupName
, UErrorCode
&status
) const {
610 if (U_FAILURE(status
)) {
614 // No need to explicitly check for syntactically valid names.
615 // Invalid ones will never be in the map, and the lookup will fail.
617 int32_t number
= uhash_geti(fNamedCaptureMap
, &groupName
);
619 status
= U_REGEX_INVALID_CAPTURE_GROUP_NAME
;
624 int32_t RegexPattern::groupNumberFromName(const char *groupName
, int32_t nameLength
, UErrorCode
&status
) const {
625 if (U_FAILURE(status
)) {
628 UnicodeString
name(groupName
, nameLength
, US_INV
);
629 return groupNumberFromName(name
, status
);
633 //---------------------------------------------------------------------
635 // split Split the input into dest[] using a RegexMatcher for this pattern.
637 //---------------------------------------------------------------------
638 int32_t RegexPattern::split(const UnicodeString
&input
,
639 UnicodeString dest
[],
640 int32_t destCapacity
,
641 UErrorCode
&status
) const
643 if (U_FAILURE(status
)) {
647 RegexMatcher
m(this);
649 // Check m's status to make sure all is ok.
650 if (U_SUCCESS(m
.fDeferredStatus
)) {
651 r
= m
.split(input
, dest
, destCapacity
, status
);
659 int32_t RegexPattern::split(UText
*input
,
661 int32_t destCapacity
,
662 UErrorCode
&status
) const
664 if (U_FAILURE(status
)) {
668 RegexMatcher
m(this);
670 // Check m's status to make sure all is ok.
671 if (U_SUCCESS(m
.fDeferredStatus
)) {
672 r
= m
.split(input
, dest
, destCapacity
, status
);
679 //---------------------------------------------------------------------
681 // dump Output the compiled form of the pattern.
682 // Debugging function only.
684 //---------------------------------------------------------------------
685 void RegexPattern::dumpOp(int32_t index
) const {
686 (void)index
; // Suppress warnings in non-debug build.
687 #if defined(REGEX_DEBUG)
688 static const char * const opNames
[] = {URX_OPCODE_NAMES
};
689 int32_t op
= fCompiledPat
->elementAti(index
);
690 int32_t val
= URX_VAL(op
);
691 int32_t type
= URX_TYPE(op
);
692 int32_t pinnedType
= type
;
693 if ((uint32_t)pinnedType
>= sizeof(opNames
)/sizeof(char *)) {
697 printf("%4d %08x %-15s ", index
, op
, opNames
[pinnedType
]);
705 case URX_BACKSLASH_G
:
706 case URX_BACKSLASH_X
:
710 // Types with no operand field of interest.
713 case URX_RESERVED_OP
:
714 case URX_START_CAPTURE
:
715 case URX_END_CAPTURE
:
720 case URX_BACKSLASH_B
:
721 case URX_BACKSLASH_BU
:
722 case URX_BACKSLASH_D
:
723 case URX_BACKSLASH_Z
:
726 case URX_CTR_INIT_NG
:
728 case URX_CTR_LOOP_NG
:
729 case URX_RELOC_OPRND
:
733 case URX_STO_INP_LOC
:
745 case URX_BACKSLASH_H
:
746 case URX_BACKSLASH_R
:
747 case URX_BACKSLASH_V
:
748 // types with an integer operand field.
754 printf("%c", val
<256?val
:'?');
760 int32_t lengthOp
= fCompiledPat
->elementAti(index
+1);
761 U_ASSERT(URX_TYPE(lengthOp
) == URX_STRING_LEN
);
762 int32_t length
= URX_VAL(lengthOp
);
764 for (i
=val
; i
<val
+length
; i
++) {
765 UChar c
= fLiteralText
[i
];
766 if (c
< 32 || c
>= 256) {c
= '.';}
776 UnicodeSet
*set
= (UnicodeSet
*)fSets
->elementAt(val
);
777 set
->toPattern(s
, TRUE
);
778 for (int32_t i
=0; i
<s
.length(); i
++) {
779 printf("%c", s
.charAt(i
));
784 case URX_STATIC_SETREF
:
785 case URX_STAT_SETREF_N
:
788 if (val
& URX_NEG_SET
) {
792 UnicodeSet
*set
= fStaticSets
[val
];
793 set
->toPattern(s
, TRUE
);
794 for (int32_t i
=0; i
<s
.length(); i
++) {
795 printf("%c", s
.charAt(i
));
810 void RegexPattern::dumpPattern() const {
811 #if defined(REGEX_DEBUG)
812 // TODO: This function assumes an ASCII based charset.
816 printf("Original Pattern: ");
817 UChar32 c
= utext_next32From(fPattern
, 0);
818 while (c
!= U_SENTINEL
) {
824 c
= UTEXT_NEXT32(fPattern
);
827 printf(" Min Match Length: %d\n", fMinMatchLen
);
828 printf(" Match Start Type: %s\n", START_OF_MATCH_STR(fStartType
));
829 if (fStartType
== START_STRING
) {
830 printf(" Initial match string: \"");
831 for (i
=fInitialStringIdx
; i
<fInitialStringIdx
+fInitialStringLen
; i
++) {
832 printf("%c", fLiteralText
[i
]); // TODO: non-printables, surrogates.
836 } else if (fStartType
== START_SET
) {
837 int32_t numSetChars
= fInitialChars
->size();
838 if (numSetChars
> 20) {
841 printf(" Match First Chars : ");
842 for (i
=0; i
<numSetChars
; i
++) {
843 UChar32 c
= fInitialChars
->charAt(i
);
844 if (0x20<c
&& c
<0x7e) {
850 if (numSetChars
< fInitialChars
->size()) {
855 } else if (fStartType
== START_CHAR
) {
856 printf(" First char of Match : ");
857 if (0x20 < fInitialChar
&& fInitialChar
<0x7e) {
858 printf("%c\n", fInitialChar
);
860 printf("%#x\n", fInitialChar
);
864 printf("Named Capture Groups:\n");
865 if (uhash_count(fNamedCaptureMap
) == 0) {
868 int32_t pos
= UHASH_FIRST
;
869 const UHashElement
*el
= NULL
;
870 while ((el
= uhash_nextElement(fNamedCaptureMap
, &pos
))) {
871 const UnicodeString
*name
= (const UnicodeString
*)el
->key
.pointer
;
873 name
->extract(0, 99, s
, sizeof(s
), US_INV
); // capture group names are invariant.
874 int32_t number
= el
->value
.integer
;
875 printf(" %d\t%s\n", number
, s
);
879 printf("\nIndex Binary Type Operand\n" \
880 "-------------------------------------------\n");
881 for (index
= 0; index
<fCompiledPat
->size(); index
++) {
890 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexPattern
)
893 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS