5 ***************************************************************************
6 * Copyright (C) 2002-2012 International Business Machines Corporation *
7 * and others. All rights reserved. *
8 ***************************************************************************
11 #include "unicode/utypes.h"
13 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
15 #include "unicode/regex.h"
16 #include "unicode/uclean.h"
27 //--------------------------------------------------------------------------
29 // RegexPattern Default Constructor
31 //--------------------------------------------------------------------------
32 RegexPattern::RegexPattern() {
33 // Init all of this instances data.
38 //--------------------------------------------------------------------------
40 // Copy Constructor Note: This is a rather inefficient implementation,
41 // but it probably doesn't matter.
43 //--------------------------------------------------------------------------
44 RegexPattern::RegexPattern(const RegexPattern
&other
) : UObject(other
) {
51 //--------------------------------------------------------------------------
53 // Assignment Operator
55 //--------------------------------------------------------------------------
56 RegexPattern
&RegexPattern::operator = (const RegexPattern
&other
) {
58 // Source and destination are the same. Don't do anything.
62 // Clean out any previous contents of object being assigned to.
65 // Give target object a default initialization
69 if ( other
.fPatternString
== NULL
) {
70 fPatternString
= NULL
;
71 fPattern
= utext_clone(fPattern
, other
.fPattern
, FALSE
, TRUE
, &fDeferredStatus
);
73 fPatternString
= new UnicodeString(*(other
.fPatternString
));
74 UErrorCode status
= U_ZERO_ERROR
;
75 fPattern
= utext_openConstUnicodeString(NULL
, fPatternString
, &status
);
76 if (U_FAILURE(status
)) {
77 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
81 fFlags
= other
.fFlags
;
82 fLiteralText
= other
.fLiteralText
;
83 fDeferredStatus
= other
.fDeferredStatus
;
84 fMinMatchLen
= other
.fMinMatchLen
;
85 fFrameSize
= other
.fFrameSize
;
86 fDataSize
= other
.fDataSize
;
87 fMaxCaptureDigits
= other
.fMaxCaptureDigits
;
88 fStaticSets
= other
.fStaticSets
;
89 fStaticSets8
= other
.fStaticSets8
;
91 fStartType
= other
.fStartType
;
92 fInitialStringIdx
= other
.fInitialStringIdx
;
93 fInitialStringLen
= other
.fInitialStringLen
;
94 *fInitialChars
= *other
.fInitialChars
;
95 fInitialChar
= other
.fInitialChar
;
96 *fInitialChars8
= *other
.fInitialChars8
;
97 fNeedsAltInput
= other
.fNeedsAltInput
;
99 // Copy the pattern. It's just values, nothing deep to copy.
100 fCompiledPat
->assign(*other
.fCompiledPat
, fDeferredStatus
);
101 fGroupMap
->assign(*other
.fGroupMap
, fDeferredStatus
);
103 // Copy the Unicode Sets.
104 // Could be made more efficient if the sets were reference counted and shared,
105 // but I doubt that pattern copying will be particularly common.
106 // Note: init() already added an empty element zero to fSets
108 int32_t numSets
= other
.fSets
->size();
109 fSets8
= new Regex8BitSet
[numSets
];
110 if (fSets8
== NULL
) {
111 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
114 for (i
=1; i
<numSets
; i
++) {
115 if (U_FAILURE(fDeferredStatus
)) {
118 UnicodeSet
*sourceSet
= (UnicodeSet
*)other
.fSets
->elementAt(i
);
119 UnicodeSet
*newSet
= new UnicodeSet(*sourceSet
);
120 if (newSet
== NULL
) {
121 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
124 fSets
->addElement(newSet
, fDeferredStatus
);
125 fSets8
[i
] = other
.fSets8
[i
];
132 //--------------------------------------------------------------------------
134 // init Shared initialization for use by constructors.
135 // Bring an uninitialized RegexPattern up to a default state.
137 //--------------------------------------------------------------------------
138 void RegexPattern::init() {
141 fLiteralText
.remove();
144 fDeferredStatus
= U_ZERO_ERROR
;
149 fMaxCaptureDigits
= 1;
152 fStartType
= START_NO_INFO
;
153 fInitialStringIdx
= 0;
154 fInitialStringLen
= 0;
155 fInitialChars
= NULL
;
157 fInitialChars8
= NULL
;
158 fNeedsAltInput
= FALSE
;
160 fPattern
= NULL
; // will be set later
161 fPatternString
= NULL
; // may be set later
162 fCompiledPat
= new UVector64(fDeferredStatus
);
163 fGroupMap
= new UVector32(fDeferredStatus
);
164 fSets
= new UVector(fDeferredStatus
);
165 fInitialChars
= new UnicodeSet
;
166 fInitialChars8
= new Regex8BitSet
;
167 if (U_FAILURE(fDeferredStatus
)) {
170 if (fCompiledPat
== NULL
|| fGroupMap
== NULL
|| fSets
== NULL
||
171 fInitialChars
== NULL
|| fInitialChars8
== NULL
) {
172 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
176 // Slot zero of the vector of sets is reserved. Fill it here.
177 fSets
->addElement((int32_t)0, fDeferredStatus
);
181 //--------------------------------------------------------------------------
183 // zap Delete everything owned by this RegexPattern.
185 //--------------------------------------------------------------------------
186 void RegexPattern::zap() {
190 for (i
=1; i
<fSets
->size(); i
++) {
192 s
= (UnicodeSet
*)fSets
->elementAt(i
);
203 delete fInitialChars
;
204 fInitialChars
= NULL
;
205 delete fInitialChars8
;
206 fInitialChars8
= NULL
;
207 if (fPattern
!= NULL
) {
208 utext_close(fPattern
);
211 if (fPatternString
!= NULL
) {
212 delete fPatternString
;
213 fPatternString
= NULL
;
218 //--------------------------------------------------------------------------
222 //--------------------------------------------------------------------------
223 RegexPattern::~RegexPattern() {
228 //--------------------------------------------------------------------------
232 //--------------------------------------------------------------------------
233 RegexPattern
*RegexPattern::clone() const {
234 RegexPattern
*copy
= new RegexPattern(*this);
239 //--------------------------------------------------------------------------
241 // operator == (comparison) Consider to patterns to be == if the
242 // pattern strings and the flags are the same.
243 // Note that pattern strings with the same
244 // characters can still be considered different.
246 //--------------------------------------------------------------------------
247 UBool
RegexPattern::operator ==(const RegexPattern
&other
) const {
248 if (this->fFlags
== other
.fFlags
&& this->fDeferredStatus
== other
.fDeferredStatus
) {
249 if (this->fPatternString
!= NULL
&& other
.fPatternString
!= NULL
) {
250 return *(this->fPatternString
) == *(other
.fPatternString
);
251 } else if (this->fPattern
== NULL
) {
252 if (other
.fPattern
== NULL
) {
255 } else if (other
.fPattern
!= NULL
) {
256 UTEXT_SETNATIVEINDEX(this->fPattern
, 0);
257 UTEXT_SETNATIVEINDEX(other
.fPattern
, 0);
258 return utext_equals(this->fPattern
, other
.fPattern
);
264 //---------------------------------------------------------------------
268 //---------------------------------------------------------------------
269 RegexPattern
* U_EXPORT2
270 RegexPattern::compile(const UnicodeString
®ex
,
275 if (U_FAILURE(status
)) {
279 const uint32_t allFlags
= UREGEX_CANON_EQ
| UREGEX_CASE_INSENSITIVE
| UREGEX_COMMENTS
|
280 UREGEX_DOTALL
| UREGEX_MULTILINE
| UREGEX_UWORD
|
281 UREGEX_ERROR_ON_UNKNOWN_ESCAPES
| UREGEX_UNIX_LINES
| UREGEX_LITERAL
;
283 if ((flags
& ~allFlags
) != 0) {
284 status
= U_REGEX_INVALID_FLAG
;
288 if ((flags
& UREGEX_CANON_EQ
) != 0) {
289 status
= U_REGEX_UNIMPLEMENTED
;
293 RegexPattern
*This
= new RegexPattern
;
295 status
= U_MEMORY_ALLOCATION_ERROR
;
298 if (U_FAILURE(This
->fDeferredStatus
)) {
299 status
= This
->fDeferredStatus
;
303 This
->fFlags
= flags
;
305 RegexCompile
compiler(This
, status
);
306 compiler
.compile(regex
, pe
, status
);
308 if (U_FAILURE(status
)) {
318 // compile, UText mode
320 RegexPattern
* U_EXPORT2
321 RegexPattern::compile(UText
*regex
,
326 if (U_FAILURE(status
)) {
330 const uint32_t allFlags
= UREGEX_CANON_EQ
| UREGEX_CASE_INSENSITIVE
| UREGEX_COMMENTS
|
331 UREGEX_DOTALL
| UREGEX_MULTILINE
| UREGEX_UWORD
|
332 UREGEX_ERROR_ON_UNKNOWN_ESCAPES
| UREGEX_UNIX_LINES
| UREGEX_LITERAL
;
334 if ((flags
& ~allFlags
) != 0) {
335 status
= U_REGEX_INVALID_FLAG
;
339 if ((flags
& UREGEX_CANON_EQ
) != 0) {
340 status
= U_REGEX_UNIMPLEMENTED
;
344 RegexPattern
*This
= new RegexPattern
;
346 status
= U_MEMORY_ALLOCATION_ERROR
;
349 if (U_FAILURE(This
->fDeferredStatus
)) {
350 status
= This
->fDeferredStatus
;
354 This
->fFlags
= flags
;
356 RegexCompile
compiler(This
, status
);
357 compiler
.compile(regex
, pe
, status
);
359 if (U_FAILURE(status
)) {
368 // compile with default flags.
370 RegexPattern
* U_EXPORT2
371 RegexPattern::compile(const UnicodeString
®ex
,
375 return compile(regex
, 0, pe
, err
);
380 // compile with default flags, UText mode
382 RegexPattern
* U_EXPORT2
383 RegexPattern::compile(UText
*regex
,
387 return compile(regex
, 0, pe
, err
);
392 // compile with no UParseErr parameter.
394 RegexPattern
* U_EXPORT2
395 RegexPattern::compile(const UnicodeString
®ex
,
400 return compile(regex
, flags
, pe
, err
);
405 // compile with no UParseErr parameter, UText mode
407 RegexPattern
* U_EXPORT2
408 RegexPattern::compile(UText
*regex
,
413 return compile(regex
, flags
, pe
, err
);
417 //---------------------------------------------------------------------
421 //---------------------------------------------------------------------
422 uint32_t RegexPattern::flags() const {
427 //---------------------------------------------------------------------
429 // matcher(UnicodeString, err)
431 //---------------------------------------------------------------------
432 RegexMatcher
*RegexPattern::matcher(const UnicodeString
&input
,
433 UErrorCode
&status
) const {
434 RegexMatcher
*retMatcher
= matcher(status
);
435 if (retMatcher
!= NULL
) {
436 retMatcher
->fDeferredStatus
= status
;
437 retMatcher
->reset(input
);
443 //---------------------------------------------------------------------
447 //---------------------------------------------------------------------
448 RegexMatcher
*RegexPattern::matcher(UErrorCode
&status
) const {
449 RegexMatcher
*retMatcher
= NULL
;
451 if (U_FAILURE(status
)) {
454 if (U_FAILURE(fDeferredStatus
)) {
455 status
= fDeferredStatus
;
459 retMatcher
= new RegexMatcher(this);
460 if (retMatcher
== NULL
) {
461 status
= U_MEMORY_ALLOCATION_ERROR
;
469 //---------------------------------------------------------------------
471 // matches Convenience function to test for a match, starting
472 // with a pattern string and a data string.
474 //---------------------------------------------------------------------
475 UBool U_EXPORT2
RegexPattern::matches(const UnicodeString
®ex
,
476 const UnicodeString
&input
,
478 UErrorCode
&status
) {
480 if (U_FAILURE(status
)) {return FALSE
;}
483 RegexPattern
*pat
= NULL
;
484 RegexMatcher
*matcher
= NULL
;
486 pat
= RegexPattern::compile(regex
, 0, pe
, status
);
487 matcher
= pat
->matcher(input
, status
);
488 retVal
= matcher
->matches(status
);
497 // matches, UText mode
499 UBool U_EXPORT2
RegexPattern::matches(UText
*regex
,
502 UErrorCode
&status
) {
504 if (U_FAILURE(status
)) {return FALSE
;}
506 UBool retVal
= FALSE
;
507 RegexPattern
*pat
= NULL
;
508 RegexMatcher
*matcher
= NULL
;
510 pat
= RegexPattern::compile(regex
, 0, pe
, status
);
511 matcher
= pat
->matcher(status
);
512 if (U_SUCCESS(status
)) {
513 matcher
->reset(input
);
514 retVal
= matcher
->matches(status
);
526 //---------------------------------------------------------------------
530 //---------------------------------------------------------------------
531 UnicodeString
RegexPattern::pattern() const {
532 if (fPatternString
!= NULL
) {
533 return *fPatternString
;
534 } else if (fPattern
== NULL
) {
535 return UnicodeString();
537 UErrorCode status
= U_ZERO_ERROR
;
538 int64_t nativeLen
= utext_nativeLength(fPattern
);
539 int32_t len16
= utext_extract(fPattern
, 0, nativeLen
, NULL
, 0, &status
); // buffer overflow error
540 UnicodeString result
;
542 status
= U_ZERO_ERROR
;
543 UChar
*resultChars
= result
.getBuffer(len16
);
544 utext_extract(fPattern
, 0, nativeLen
, resultChars
, len16
, &status
); // unterminated warning
545 result
.releaseBuffer(len16
);
554 //---------------------------------------------------------------------
558 //---------------------------------------------------------------------
559 UText
*RegexPattern::patternText(UErrorCode
&status
) const {
560 if (U_FAILURE(status
)) {return NULL
;}
561 status
= U_ZERO_ERROR
;
563 if (fPattern
!= NULL
) {
566 RegexStaticSets::initGlobals(&status
);
567 return RegexStaticSets::gStaticSets
->fEmptyText
;
573 //---------------------------------------------------------------------
577 //---------------------------------------------------------------------
578 int32_t RegexPattern::split(const UnicodeString
&input
,
579 UnicodeString dest
[],
580 int32_t destCapacity
,
581 UErrorCode
&status
) const
583 if (U_FAILURE(status
)) {
587 RegexMatcher
m(this);
589 // Check m's status to make sure all is ok.
590 if (U_SUCCESS(m
.fDeferredStatus
)) {
591 r
= m
.split(input
, dest
, destCapacity
, status
);
599 int32_t RegexPattern::split(UText
*input
,
601 int32_t destCapacity
,
602 UErrorCode
&status
) const
604 if (U_FAILURE(status
)) {
608 RegexMatcher
m(this);
610 // Check m's status to make sure all is ok.
611 if (U_SUCCESS(m
.fDeferredStatus
)) {
612 r
= m
.split(input
, dest
, destCapacity
, status
);
619 //---------------------------------------------------------------------
621 // dump Output the compiled form of the pattern.
622 // Debugging function only.
624 //---------------------------------------------------------------------
625 #if defined(REGEX_DEBUG)
// dumpOp: print one operation of the compiled pattern.  Only compiled
// when REGEX_DEBUG is defined.  All output goes through
// REGEX_DUMP_DEBUG_PRINTF: first the index, the raw op word in hex and the
// opcode name, then a rendering of the operand that depends on the op
// type, then a newline.
// NOTE(review): this listing is incomplete (several case labels and
// statements are missing), so the comments below describe only the lines
// that are actually present.
626 void RegexPattern::dumpOp(int32_t index
) const {
// Opcode names, indexed by op type.
627 static const char * const opNames
[] = {URX_OPCODE_NAMES
};
// Split the op word into its value and type fields.
628 int32_t op
= fCompiledPat
->elementAti(index
);
629 int32_t val
= URX_VAL(op
);
630 int32_t type
= URX_TYPE(op
);
// pinnedType is the index used for the opNames[] lookup below; this test
// detects a type value that lies beyond the end of the name table.
631 int32_t pinnedType
= type
;
632 if ((uint32_t)pinnedType
>= sizeof(opNames
)/sizeof(char *)) {
636 REGEX_DUMP_DEBUG_PRINTF(("%4d %08x %-15s ", index
, op
, opNames
[pinnedType
]));
644 case URX_BACKSLASH_G
:
645 case URX_BACKSLASH_X
:
649 // Types with no operand field of interest.
652 case URX_RESERVED_OP
:
653 case URX_START_CAPTURE
:
654 case URX_END_CAPTURE
:
659 case URX_BACKSLASH_B
:
660 case URX_BACKSLASH_BU
:
661 case URX_BACKSLASH_D
:
662 case URX_BACKSLASH_Z
:
665 case URX_CTR_INIT_NG
:
667 case URX_CTR_LOOP_NG
:
668 case URX_RELOC_OPRND
:
672 case URX_STO_INP_LOC
:
684 // types with an integer operand field.
685 REGEX_DUMP_DEBUG_PRINTF(("%d", val
));
// Operand printed as a single character; values of 256 and above are
// shown as '?'.
690 REGEX_DUMP_DEBUG_PRINTF(("%c", val
<256?val
:'?'));
// String operand: the op that follows carries the length, and val is the
// starting index of the string within fLiteralText.
696 int32_t lengthOp
= fCompiledPat
->elementAti(index
+1);
697 U_ASSERT(URX_TYPE(lengthOp
) == URX_STRING_LEN
);
698 int32_t length
= URX_VAL(lengthOp
);
700 for (i
=val
; i
<val
+length
; i
++) {
// Characters below 32 or at/above 256 are replaced by '.' before printing.
701 UChar c
= fLiteralText
[i
];
702 if (c
< 32 || c
>= 256) {c
= '.';}
703 REGEX_DUMP_DEBUG_PRINTF(("%c", c
));
// Set reference: val is an index into fSets; the set is printed in its
// pattern form, one character at a time.
712 UnicodeSet
*set
= (UnicodeSet
*)fSets
->elementAt(val
);
713 set
->toPattern(s
, TRUE
);
714 for (int32_t i
=0; i
<s
.length(); i
++) {
715 REGEX_DUMP_DEBUG_PRINTF(("%c", s
.charAt(i
)));
720 case URX_STATIC_SETREF
:
721 case URX_STAT_SETREF_N
:
// Static set reference: a value with the URX_NEG_SET bit set is prefixed
// with "NOT "; the set itself comes from fStaticSets.
724 if (val
& URX_NEG_SET
) {
725 REGEX_DUMP_DEBUG_PRINTF(("NOT "));
728 UnicodeSet
*set
= fStaticSets
[val
];
729 set
->toPattern(s
, TRUE
);
730 for (int32_t i
=0; i
<s
.length(); i
++) {
731 REGEX_DUMP_DEBUG_PRINTF(("%c", s
.charAt(i
)));
738 REGEX_DUMP_DEBUG_PRINTF(("??????"));
741 REGEX_DUMP_DEBUG_PRINTF(("\n"));
746 #if defined(REGEX_DEBUG)
// RegexPatternDump: debug dump of a whole RegexPattern.  Only compiled
// when REGEX_DEBUG is defined.  Prints, through REGEX_DUMP_DEBUG_PRINTF,
// the original pattern text, the minimum match length, the match start
// type with its type-specific details, and finally a table with the
// header "Index Binary Type Operand".
// NOTE(review): this listing is incomplete (several statements are
// missing), so the comments below describe only the lines that are
// actually present.
747 U_CAPI
void U_EXPORT2
748 RegexPatternDump(const RegexPattern
*This
) {
752 REGEX_DUMP_DEBUG_PRINTF(("Original Pattern: "));
// Walk the pattern text one code point at a time, from native index 0
// until U_SENTINEL is returned.
753 UChar32 c
= utext_next32From(This
->fPattern
, 0);
754 while (c
!= U_SENTINEL
) {
758 REGEX_DUMP_DEBUG_PRINTF(("%c", c
));
760 c
= UTEXT_NEXT32(This
->fPattern
);
762 REGEX_DUMP_DEBUG_PRINTF(("\n"));
763 REGEX_DUMP_DEBUG_PRINTF((" Min Match Length: %d\n", This
->fMinMatchLen
));
764 REGEX_DUMP_DEBUG_PRINTF((" Match Start Type: %s\n", START_OF_MATCH_STR(This
->fStartType
)));
// START_STRING: print the fInitialStringLen characters of fLiteralText
// that begin at fInitialStringIdx.
765 if (This
->fStartType
== START_STRING
) {
766 REGEX_DUMP_DEBUG_PRINTF((" Initial match string: \""));
767 for (i
=This
->fInitialStringIdx
; i
<This
->fInitialStringIdx
+This
->fInitialStringLen
; i
++) {
768 REGEX_DUMP_DEBUG_PRINTF(("%c", This
->fLiteralText
[i
])); // TODO: non-printables, surrogates.
770 REGEX_DUMP_DEBUG_PRINTF(("\"\n"));
// START_SET: list the characters of fInitialChars.  The count is tested
// against 20 here and against the full set size further down, where " ..."
// is printed if fewer characters were listed than the set contains.
772 } else if (This
->fStartType
== START_SET
) {
773 int32_t numSetChars
= This
->fInitialChars
->size();
774 if (numSetChars
> 20) {
777 REGEX_DUMP_DEBUG_PRINTF((" Match First Chars : "));
778 for (i
=0; i
<numSetChars
; i
++) {
// Characters strictly between 0x20 and 0x7e are printed as themselves;
// the other format shown prints the value in hex.
779 UChar32 c
= This
->fInitialChars
->charAt(i
);
780 if (0x20<c
&& c
<0x7e) {
781 REGEX_DUMP_DEBUG_PRINTF(("%c ", c
));
783 REGEX_DUMP_DEBUG_PRINTF(("%#x ", c
));
786 if (numSetChars
< This
->fInitialChars
->size()) {
787 REGEX_DUMP_DEBUG_PRINTF((" ..."));
789 REGEX_DUMP_DEBUG_PRINTF(("\n"));
// START_CHAR: print fInitialChar, using the same printable range test as
// above.
791 } else if (This
->fStartType
== START_CHAR
) {
792 REGEX_DUMP_DEBUG_PRINTF((" First char of Match : "));
793 if (0x20 < This
->fInitialChar
&& This
->fInitialChar
<0x7e) {
794 REGEX_DUMP_DEBUG_PRINTF(("%c\n", This
->fInitialChar
));
796 REGEX_DUMP_DEBUG_PRINTF(("%#x\n", This
->fInitialChar
));
// Table header, then a loop over every index of the compiled pattern.
800 REGEX_DUMP_DEBUG_PRINTF(("\nIndex Binary Type Operand\n" \
801 "-------------------------------------------\n"));
802 for (index
= 0; index
<This
->fCompiledPat
->size(); index
++) {
805 REGEX_DUMP_DEBUG_PRINTF(("\n\n"));
// Macro invocation supplying the RTTI implementation for RegexPattern.
// NOTE(review): presumably this defines the class ID accessors declared
// for RegexPattern -- confirm against the macro definition.
811 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexPattern
)
814 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS