5 ***************************************************************************
6 * Copyright (C) 2002-2013 International Business Machines Corporation *
7 * and others. All rights reserved. *
8 ***************************************************************************
11 #include "unicode/utypes.h"
13 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
15 #include "unicode/regex.h"
16 #include "unicode/uclean.h"
27 //--------------------------------------------------------------------------
29 // RegexPattern Default Constructor
31 //--------------------------------------------------------------------------
// Default constructor.
32 RegexPattern::RegexPattern() {
33 // Init all of this instance's data.
38 //--------------------------------------------------------------------------
40 // Copy Constructor Note: This is a rather inefficient implementation,
41 // but it probably doesn't matter.
43 //--------------------------------------------------------------------------
// Copy constructor. The UObject base sub-object is copy-constructed from `other`.
44 RegexPattern::RegexPattern(const RegexPattern
&other
) : UObject(other
) {
51 //--------------------------------------------------------------------------
53 // Assignment Operator
55 //--------------------------------------------------------------------------
// Assignment operator.
// Copies, from `other`: the pattern text (string form and/or UText), the flags,
// the match-start information, the compiled pattern, the group map and the
// Unicode sets (each set is duplicated with `new UnicodeSet(*sourceSet)`).
// Errors are recorded in fDeferredStatus.
56 RegexPattern
&RegexPattern::operator = (const RegexPattern
&other
) {
58 // Source and destination are the same. Don't do anything.
62 // Clean out any previous contents of object being assigned to.
65 // Give target object a default initialization
// Pattern text: when `other` has no UnicodeString form, clone its UText
// (deep = FALSE, readOnly = TRUE); otherwise copy the string and open a
// const UText over the copy.
69 if ( other
.fPatternString
== NULL
) {
70 fPatternString
= NULL
;
71 fPattern
= utext_clone(fPattern
, other
.fPattern
, FALSE
, TRUE
, &fDeferredStatus
);
73 fPatternString
= new UnicodeString(*(other
.fPatternString
));
74 UErrorCode status
= U_ZERO_ERROR
;
75 fPattern
= utext_openConstUnicodeString(NULL
, fPatternString
, &status
);
// Any failure opening the UText is reported as a memory allocation error.
76 if (U_FAILURE(status
)) {
77 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
81 fFlags
= other
.fFlags
;
82 fLiteralText
= other
.fLiteralText
;
// NOTE(review): this assignment replaces whatever fDeferredStatus held before it,
// including an error recorded by the pattern-text copy above -- confirm intended.
83 fDeferredStatus
= other
.fDeferredStatus
;
84 fMinMatchLen
= other
.fMinMatchLen
;
85 fFrameSize
= other
.fFrameSize
;
86 fDataSize
= other
.fDataSize
;
87 fMaxCaptureDigits
= other
.fMaxCaptureDigits
;
88 fStaticSets
= other
.fStaticSets
;
89 fStaticSets8
= other
.fStaticSets8
;
91 fStartType
= other
.fStartType
;
92 fInitialStringIdx
= other
.fInitialStringIdx
;
93 fInitialStringLen
= other
.fInitialStringLen
;
// fInitialChars / fInitialChars8 are copied through the pointers (object
// contents are assigned), unlike the plain member assignments around them.
94 *fInitialChars
= *other
.fInitialChars
;
95 fInitialChar
= other
.fInitialChar
;
96 *fInitialChars8
= *other
.fInitialChars8
;
97 fNeedsAltInput
= other
.fNeedsAltInput
;
99 // Copy the pattern. It's just values, nothing deep to copy.
100 fCompiledPat
->assign(*other
.fCompiledPat
, fDeferredStatus
);
101 fGroupMap
->assign(*other
.fGroupMap
, fDeferredStatus
);
103 // Copy the Unicode Sets.
104 // Could be made more efficient if the sets were reference counted and shared,
105 // but I doubt that pattern copying will be particularly common.
106 // Note: init() already added an empty element zero to fSets
108 int32_t numSets
= other
.fSets
->size();
109 fSets8
= new Regex8BitSet
[numSets
];
110 if (fSets8
== NULL
) {
111 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
// Start at index 1: slot zero is the reserved element mentioned above.
114 for (i
=1; i
<numSets
; i
++) {
115 if (U_FAILURE(fDeferredStatus
)) {
118 UnicodeSet
*sourceSet
= (UnicodeSet
*)other
.fSets
->elementAt(i
);
119 UnicodeSet
*newSet
= new UnicodeSet(*sourceSet
);
120 if (newSet
== NULL
) {
121 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
// Store the duplicated set and copy the 8-bit set with the same index.
124 fSets
->addElement(newSet
, fDeferredStatus
);
125 fSets8
[i
] = other
.fSets8
[i
];
132 //--------------------------------------------------------------------------
134 // init Shared initialization for use by constructors.
135 // Bring an uninitialized RegexPattern up to a default state.
137 //--------------------------------------------------------------------------
// Shared initialization: resets the scalar fields to their defaults and
// allocates the owned containers (fCompiledPat, fGroupMap, fSets,
// fInitialChars, fInitialChars8). Failures are recorded in fDeferredStatus.
138 void RegexPattern::init() {
141 fLiteralText
.remove();
144 fDeferredStatus
= U_ZERO_ERROR
;
149 fMaxCaptureDigits
= 1;
152 fStartType
= START_NO_INFO
;
153 fInitialStringIdx
= 0;
154 fInitialStringLen
= 0;
155 fInitialChars
= NULL
;
157 fInitialChars8
= NULL
;
158 fNeedsAltInput
= FALSE
;
160 fPattern
= NULL
; // will be set later
161 fPatternString
= NULL
; // may be set later
// The three vectors report construction problems through fDeferredStatus.
162 fCompiledPat
= new UVector64(fDeferredStatus
);
163 fGroupMap
= new UVector32(fDeferredStatus
);
164 fSets
= new UVector(fDeferredStatus
);
165 fInitialChars
= new UnicodeSet
;
166 fInitialChars8
= new Regex8BitSet
;
167 if (U_FAILURE(fDeferredStatus
)) {
// Any allocation that returned NULL is reported as a memory allocation error.
170 if (fCompiledPat
== NULL
|| fGroupMap
== NULL
|| fSets
== NULL
||
171 fInitialChars
== NULL
|| fInitialChars8
== NULL
) {
172 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
176 // Slot zero of the vector of sets is reserved. Fill it here.
177 fSets
->addElement((int32_t)0, fDeferredStatus
);
181 //--------------------------------------------------------------------------
183 // zap Delete everything owned by this RegexPattern.
185 //--------------------------------------------------------------------------
// Releases what this RegexPattern owns: walks the sets stored in fSets,
// deletes fInitialChars / fInitialChars8, closes the pattern UText and
// deletes the pattern string. Pointers are reset to NULL after deletion.
186 void RegexPattern::zap() {
// Index 0 of fSets is skipped; the loop starts at 1.
190 for (i
=1; i
<fSets
->size(); i
++) {
192 s
= (UnicodeSet
*)fSets
->elementAt(i
);
203 delete fInitialChars
;
204 fInitialChars
= NULL
;
205 delete fInitialChars8
;
206 fInitialChars8
= NULL
;
// The UText is released with utext_close(), not delete.
207 if (fPattern
!= NULL
) {
208 utext_close(fPattern
);
211 if (fPatternString
!= NULL
) {
212 delete fPatternString
;
213 fPatternString
= NULL
;
218 //--------------------------------------------------------------------------
222 //--------------------------------------------------------------------------
// Destructor.
223 RegexPattern::~RegexPattern() {
228 //--------------------------------------------------------------------------
232 //--------------------------------------------------------------------------
// clone: creates a heap-allocated duplicate of this pattern using the copy
// constructor.
233 RegexPattern
*RegexPattern::clone() const {
234 RegexPattern
*copy
= new RegexPattern(*this);
239 //--------------------------------------------------------------------------
241 // operator == (comparison) Consider two patterns to be == if the
242 // pattern strings and the flags are the same.
243 // Note that pattern strings with the same
244 // characters can still be considered different.
246 //--------------------------------------------------------------------------
// Equality comparison.
// Both fFlags and fDeferredStatus must match before the pattern text is looked at.
// Text comparison order:
//   1. both sides have a UnicodeString form -> compare the strings;
//   2. this->fPattern is NULL               -> outcome depends on whether
//                                              other.fPattern is NULL as well;
//   3. both UTexts are non-NULL             -> compare with utext_equals().
247 UBool
RegexPattern::operator ==(const RegexPattern
&other
) const {
248 if (this->fFlags
== other
.fFlags
&& this->fDeferredStatus
== other
.fDeferredStatus
) {
249 if (this->fPatternString
!= NULL
&& other
.fPatternString
!= NULL
) {
250 return *(this->fPatternString
) == *(other
.fPatternString
);
251 } else if (this->fPattern
== NULL
) {
252 if (other
.fPattern
== NULL
) {
255 } else if (other
.fPattern
!= NULL
) {
// Rewind both texts to native index 0 before comparing.
// NOTE(review): presumably so that differing iteration positions cannot
// influence utext_equals() -- confirm against the UText API documentation.
256 UTEXT_SETNATIVEINDEX(this->fPattern
, 0);
257 UTEXT_SETNATIVEINDEX(other
.fPattern
, 0);
258 return utext_equals(this->fPattern
, other
.fPattern
);
264 //---------------------------------------------------------------------
268 //---------------------------------------------------------------------
// compile (UnicodeString pattern, explicit flags).
// Validates `flags`, allocates a new RegexPattern and runs RegexCompile on
// `regex`. Errors are reported through `status`.
//   - any bit outside allFlags  -> U_REGEX_INVALID_FLAG
//   - UREGEX_CANON_EQ requested -> U_REGEX_UNIMPLEMENTED
269 RegexPattern
* U_EXPORT2
270 RegexPattern::compile(const UnicodeString
&regex
,
275 if (U_FAILURE(status
)) {
// Mask of every flag bit this function accepts.
279 const uint32_t allFlags
= UREGEX_CANON_EQ
| UREGEX_CASE_INSENSITIVE
| UREGEX_COMMENTS
|
280 UREGEX_DOTALL
| UREGEX_MULTILINE
| UREGEX_UWORD
|
281 UREGEX_ERROR_ON_UNKNOWN_ESCAPES
| UREGEX_UNIX_LINES
| UREGEX_LITERAL
;
283 if ((flags
& ~allFlags
) != 0) {
284 status
= U_REGEX_INVALID_FLAG
;
// Canonical-equivalence matching is accepted by the mask but rejected here.
288 if ((flags
& UREGEX_CANON_EQ
) != 0) {
289 status
= U_REGEX_UNIMPLEMENTED
;
293 RegexPattern
*This
= new RegexPattern
;
// NOTE(review): presumably reached only when the allocation above returned NULL.
295 status
= U_MEMORY_ALLOCATION_ERROR
;
// Surface any error the new object recorded while being constructed.
298 if (U_FAILURE(This
->fDeferredStatus
)) {
299 status
= This
->fDeferredStatus
;
303 This
->fFlags
= flags
;
305 RegexCompile
compiler(This
, status
);
306 compiler
.compile(regex
, pe
, status
);
308 if (U_FAILURE(status
)) {
318 // compile, UText mode
// compile (UText pattern, explicit flags).
// Validates `flags`, allocates a new RegexPattern and runs RegexCompile on
// `regex`. Errors are reported through `status`.
//   - any bit outside allFlags  -> U_REGEX_INVALID_FLAG
//   - UREGEX_CANON_EQ requested -> U_REGEX_UNIMPLEMENTED
320 RegexPattern
* U_EXPORT2
321 RegexPattern::compile(UText
*regex
,
326 if (U_FAILURE(status
)) {
// Mask of every flag bit this function accepts.
330 const uint32_t allFlags
= UREGEX_CANON_EQ
| UREGEX_CASE_INSENSITIVE
| UREGEX_COMMENTS
|
331 UREGEX_DOTALL
| UREGEX_MULTILINE
| UREGEX_UWORD
|
332 UREGEX_ERROR_ON_UNKNOWN_ESCAPES
| UREGEX_UNIX_LINES
| UREGEX_LITERAL
;
334 if ((flags
& ~allFlags
) != 0) {
335 status
= U_REGEX_INVALID_FLAG
;
// Canonical-equivalence matching is accepted by the mask but rejected here.
339 if ((flags
& UREGEX_CANON_EQ
) != 0) {
340 status
= U_REGEX_UNIMPLEMENTED
;
344 RegexPattern
*This
= new RegexPattern
;
// NOTE(review): presumably reached only when the allocation above returned NULL.
346 status
= U_MEMORY_ALLOCATION_ERROR
;
// Surface any error the new object recorded while being constructed.
349 if (U_FAILURE(This
->fDeferredStatus
)) {
350 status
= This
->fDeferredStatus
;
354 This
->fFlags
= flags
;
356 RegexCompile
compiler(This
, status
);
357 compiler
.compile(regex
, pe
, status
);
359 if (U_FAILURE(status
)) {
368 // compile with default flags.
// Convenience overload: forwards to the flags-taking compile() with flags = 0.
370 RegexPattern
* U_EXPORT2
371 RegexPattern::compile(const UnicodeString
&regex
,
375 return compile(regex
, 0, pe
, err
);
380 // compile with default flags, UText mode
// Convenience overload (UText): forwards to the flags-taking compile() with flags = 0.
382 RegexPattern
* U_EXPORT2
383 RegexPattern::compile(UText
*regex
,
387 return compile(regex
, 0, pe
, err
);
392 // compile with no UParseErr parameter.
// Convenience overload for callers that do not supply a UParseError:
// forwards `regex`, `flags` and `err` to the full compile().
// NOTE(review): `pe` is presumably a local UParseError whose contents are discarded -- confirm.
394 RegexPattern
* U_EXPORT2
395 RegexPattern::compile(const UnicodeString
&regex
,
400 return compile(regex
, flags
, pe
, err
);
405 // compile with no UParseErr parameter, UText mode
// Convenience overload (UText) for callers that do not supply a UParseError:
// forwards `regex`, `flags` and `err` to the full compile().
// NOTE(review): `pe` is presumably a local UParseError whose contents are discarded -- confirm.
407 RegexPattern
* U_EXPORT2
408 RegexPattern::compile(UText
*regex
,
413 return compile(regex
, flags
, pe
, err
);
417 //---------------------------------------------------------------------
421 //---------------------------------------------------------------------
// flags: const accessor returning a uint32_t.
422 uint32_t RegexPattern::flags() const {
427 //---------------------------------------------------------------------
429 // matcher(UnicodeString, err)
431 //---------------------------------------------------------------------
// Creates a matcher for this pattern via matcher(status) and, when one was
// created, records `status` in the matcher and resets it onto `input`.
432 RegexMatcher
*RegexPattern::matcher(const UnicodeString
&input
,
433 UErrorCode
&status
) const {
434 RegexMatcher
*retMatcher
= matcher(status
);
// matcher(status) can yield NULL; only touch the matcher when it exists.
435 if (retMatcher
!= NULL
) {
436 retMatcher
->fDeferredStatus
= status
;
437 retMatcher
->reset(input
);
443 //---------------------------------------------------------------------
447 //---------------------------------------------------------------------
// Creates a RegexMatcher bound to this pattern.
// An error recorded earlier in fDeferredStatus is copied into `status`;
// a failed allocation sets `status` to U_MEMORY_ALLOCATION_ERROR.
448 RegexMatcher
*RegexPattern::matcher(UErrorCode
&status
) const {
449 RegexMatcher
*retMatcher
= NULL
;
451 if (U_FAILURE(status
)) {
// Report a deferred (construction/compile-time) error to the caller.
454 if (U_FAILURE(fDeferredStatus
)) {
455 status
= fDeferredStatus
;
459 retMatcher
= new RegexMatcher(this);
460 if (retMatcher
== NULL
) {
461 status
= U_MEMORY_ALLOCATION_ERROR
;
469 //---------------------------------------------------------------------
471 // matches Convenience function to test for a match, starting
472 // with a pattern string and a data string.
474 //---------------------------------------------------------------------
// matches (UnicodeString pattern and input): compiles `regex` with flags = 0,
// creates a matcher over `input` and stores the result of matches() in retVal.
// Returns FALSE immediately when `status` already indicates failure.
475 UBool U_EXPORT2
RegexPattern::matches(const UnicodeString
&regex
,
476 const UnicodeString
&input
,
478 UErrorCode
&status
) {
480 if (U_FAILURE(status
)) {return FALSE
;}
483 RegexPattern
*pat
= NULL
;
484 RegexMatcher
*matcher
= NULL
;
// NOTE(review): `pat` and `matcher` are dereferenced below with no visible
// NULL check; confirm the callees tolerate a failed compile().
486 pat
= RegexPattern::compile(regex
, 0, pe
, status
);
487 matcher
= pat
->matcher(input
, status
);
488 retVal
= matcher
->matches(status
);
497 // matches, UText mode
// matches (UText mode): compiles `regex` with flags = 0, creates a matcher and,
// if everything succeeded, resets it onto `input` and runs matches().
// Returns FALSE immediately when `status` already indicates failure.
499 UBool U_EXPORT2
RegexPattern::matches(UText
*regex
,
502 UErrorCode
&status
) {
504 if (U_FAILURE(status
)) {return FALSE
;}
506 UBool retVal
= FALSE
;
507 RegexPattern
*pat
= NULL
;
508 RegexMatcher
*matcher
= NULL
;
// NOTE(review): `pat` is dereferenced with no visible NULL check; confirm
// matcher(status) tolerates a failed compile().
510 pat
= RegexPattern::compile(regex
, 0, pe
, status
);
511 matcher
= pat
->matcher(status
);
// The matcher itself is only used when no error has been reported.
512 if (U_SUCCESS(status
)) {
513 matcher
->reset(input
);
514 retVal
= matcher
->matches(status
);
526 //---------------------------------------------------------------------
530 //---------------------------------------------------------------------
// pattern: returns the pattern text as a UnicodeString.
//   - a stored fPatternString is returned as a copy;
//   - with neither a string nor a UText, an empty string is returned;
//   - otherwise the text is extracted from the UText in two passes.
531 UnicodeString
RegexPattern::pattern() const {
532 if (fPatternString
!= NULL
) {
533 return *fPatternString
;
534 } else if (fPattern
== NULL
) {
535 return UnicodeString();
537 UErrorCode status
= U_ZERO_ERROR
;
538 int64_t nativeLen
= utext_nativeLength(fPattern
);
// Pass 1: extract into a NULL, zero-capacity buffer to obtain the UTF-16 length.
539 int32_t len16
= utext_extract(fPattern
, 0, nativeLen
, NULL
, 0, &status
); // buffer overflow error
540 UnicodeString result
;
// Clear the expected error from pass 1 before extracting for real.
542 status
= U_ZERO_ERROR
;
// Pass 2: extract straight into result's internal buffer.
// NOTE(review): the pointer from getBuffer() is used without a visible NULL check -- confirm.
543 UChar
*resultChars
= result
.getBuffer(len16
);
544 utext_extract(fPattern
, 0, nativeLen
, resultChars
, len16
, &status
); // unterminated warning
545 result
.releaseBuffer(len16
);
554 //---------------------------------------------------------------------
558 //---------------------------------------------------------------------
// patternText: returns the pattern as a UText.
// Returns NULL when `status` already indicates failure. When there is no
// pattern UText, the shared empty text from RegexStaticSets is returned.
559 UText
*RegexPattern::patternText(UErrorCode
&status
) const {
560 if (U_FAILURE(status
)) {return NULL
;}
// Reset status; at this point it can only hold success or a warning.
561 status
= U_ZERO_ERROR
;
563 if (fPattern
!= NULL
) {
// Fallback: make sure the static sets exist, then hand out their empty text.
566 RegexStaticSets::initGlobals(&status
);
567 return RegexStaticSets::gStaticSets
->fEmptyText
;
573 //---------------------------------------------------------------------
577 //---------------------------------------------------------------------
// split (UnicodeString mode): builds a temporary RegexMatcher for this pattern
// and delegates to RegexMatcher::split(), storing its result in `r`.
578 int32_t RegexPattern::split(const UnicodeString
&input
,
579 UnicodeString dest
[],
580 int32_t destCapacity
,
581 UErrorCode
&status
) const
583 if (U_FAILURE(status
)) {
// Stack-allocated matcher; construction errors are held in m.fDeferredStatus.
587 RegexMatcher
m(this);
589 // Check m's status to make sure all is ok.
590 if (U_SUCCESS(m
.fDeferredStatus
)) {
591 r
= m
.split(input
, dest
, destCapacity
, status
);
// split (UText mode): builds a temporary RegexMatcher for this pattern
// and delegates to RegexMatcher::split(), storing its result in `r`.
599 int32_t RegexPattern::split(UText
*input
,
601 int32_t destCapacity
,
602 UErrorCode
&status
) const
604 if (U_FAILURE(status
)) {
// Stack-allocated matcher; construction errors are held in m.fDeferredStatus.
608 RegexMatcher
m(this);
610 // Check m's status to make sure all is ok.
611 if (U_SUCCESS(m
.fDeferredStatus
)) {
612 r
= m
.split(input
, dest
, destCapacity
, status
);
619 //---------------------------------------------------------------------
621 // dump Output the compiled form of the pattern.
622 // Debugging function only.
624 //---------------------------------------------------------------------
// dumpOp: debugging aid that prints one operation of the compiled pattern.
//   index - position of the operation in fCompiledPat.
// The printing code is compiled only when REGEX_DEBUG is defined; otherwise
// `index` is merely referenced to avoid an unused-parameter warning.
625 void RegexPattern::dumpOp(int32_t index
) const {
626 (void)index
; // Suppress warnings in non-debug build.
627 #if defined(REGEX_DEBUG)
628 static const char * const opNames
[] = {URX_OPCODE_NAMES
};
// Decode the operation into its type and value fields.
629 int32_t op
= fCompiledPat
->elementAti(index
);
630 int32_t val
= URX_VAL(op
);
631 int32_t type
= URX_TYPE(op
);
// Bounds-check the type before it is used as an index into opNames below;
// the unsigned cast makes a negative type fail the test as well.
632 int32_t pinnedType
= type
;
633 if ((uint32_t)pinnedType
>= sizeof(opNames
)/sizeof(char *)) {
637 printf("%4d %08x %-15s ", index
, op
, opNames
[pinnedType
]);
645 case URX_BACKSLASH_G
:
646 case URX_BACKSLASH_X
:
650 // Types with no operand field of interest.
653 case URX_RESERVED_OP
:
654 case URX_START_CAPTURE
:
655 case URX_END_CAPTURE
:
660 case URX_BACKSLASH_B
:
661 case URX_BACKSLASH_BU
:
662 case URX_BACKSLASH_D
:
663 case URX_BACKSLASH_Z
:
666 case URX_CTR_INIT_NG
:
668 case URX_CTR_LOOP_NG
:
669 case URX_RELOC_OPRND
:
673 case URX_STO_INP_LOC
:
685 // types with an integer operand field.
// Values of 256 and above are printed as '?'.
691 printf("%c", val
<256?val
:'?');
// The operation that follows is expected to carry the string's length.
697 int32_t lengthOp
= fCompiledPat
->elementAti(index
+1);
698 U_ASSERT(URX_TYPE(lengthOp
) == URX_STRING_LEN
);
699 int32_t length
= URX_VAL(lengthOp
);
701 for (i
=val
; i
<val
+length
; i
++) {
702 UChar c
= fLiteralText
[i
];
// Code units below 32 or at/above 256 are shown as '.'.
703 if (c
< 32 || c
>= 256) {c
= '.';}
// Set reference: `val` indexes fSets; print the set's pattern form.
713 UnicodeSet
*set
= (UnicodeSet
*)fSets
->elementAt(val
);
714 set
->toPattern(s
, TRUE
);
715 for (int32_t i
=0; i
<s
.length(); i
++) {
716 printf("%c", s
.charAt(i
));
721 case URX_STATIC_SETREF
:
722 case URX_STAT_SETREF_N
:
725 if (val
& URX_NEG_SET
) {
// Static set reference: `val` indexes fStaticSets.
729 UnicodeSet
*set
= fStaticSets
[val
];
730 set
->toPattern(s
, TRUE
);
731 for (int32_t i
=0; i
<s
.length(); i
++) {
732 printf("%c", s
.charAt(i
));
// dumpPattern: debugging aid (compiled only when REGEX_DEBUG is defined).
// Prints the original pattern text, the minimum match length, the
// match-start information, and then walks every operation in fCompiledPat.
747 void RegexPattern::dumpPattern() const {
748 #if defined(REGEX_DEBUG)
752 printf("Original Pattern: ");
// Iterate the pattern UText one code point at a time from index 0.
753 UChar32 c
= utext_next32From(fPattern
, 0);
754 while (c
!= U_SENTINEL
) {
760 c
= UTEXT_NEXT32(fPattern
);
763 printf(" Min Match Length: %d\n", fMinMatchLen
);
764 printf(" Match Start Type: %s\n", START_OF_MATCH_STR(fStartType
));
// The remaining start information depends on fStartType.
765 if (fStartType
== START_STRING
) {
766 printf(" Initial match string: \"");
767 for (i
=fInitialStringIdx
; i
<fInitialStringIdx
+fInitialStringLen
; i
++) {
768 printf("%c", fLiteralText
[i
]); // TODO: non-printables, surrogates.
772 } else if (fStartType
== START_SET
) {
773 int32_t numSetChars
= fInitialChars
->size();
// NOTE(review): presumably caps the number of characters printed at 20 -- confirm.
774 if (numSetChars
> 20) {
777 printf(" Match First Chars : ");
778 for (i
=0; i
<numSetChars
; i
++) {
779 UChar32 c
= fInitialChars
->charAt(i
);
780 if (0x20<c
&& c
<0x7e) {
// True when fewer characters were printed than the set contains.
786 if (numSetChars
< fInitialChars
->size()) {
791 } else if (fStartType
== START_CHAR
) {
792 printf(" First char of Match : ");
// Characters strictly between 0x20 and 0x7e are printed as-is, others in hex.
793 if (0x20 < fInitialChar
&& fInitialChar
<0x7e) {
794 printf("%c\n", fInitialChar
);
796 printf("%#x\n", fInitialChar
);
800 printf("\nIndex Binary Type Operand\n" \
801 "-------------------------------------------\n");
802 for (index
= 0; index
<fCompiledPat
->size(); index
++) {
// ICU run-time type identification boilerplate for RegexPattern, generated by macro.
811 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexPattern
)
814 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS