5 ***************************************************************************
6 * Copyright (C) 2002-2010 International Business Machines Corporation *
7 * and others. All rights reserved. *
8 ***************************************************************************
11 #include "unicode/utypes.h"
13 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
15 #include "unicode/regex.h"
16 #include "unicode/uclean.h"
27 //--------------------------------------------------------------------------
29 // RegexPattern Default Constructor
31 //--------------------------------------------------------------------------
// Default constructor. In this excerpt only the declaration of a local
// UErrorCode, initialized to U_ZERO_ERROR, is visible.
32 RegexPattern::RegexPattern() {
33 UErrorCode status
= U_ZERO_ERROR
;
36 // Initialize all of this instance's data.
41 //--------------------------------------------------------------------------
43 // Copy Constructor Note: This is a rather inefficient implementation,
44 // but it probably doesn't matter.
46 //--------------------------------------------------------------------------
// Copy constructor. The UObject base sub-object is copy-constructed from
// `other` in the member-initializer list.
47 RegexPattern::RegexPattern(const RegexPattern
&other
) : UObject(other
) {
54 //--------------------------------------------------------------------------
56 // Assignment Operator
58 //--------------------------------------------------------------------------
// Assignment operator: copies the state of `other` into this object.
// The visible steps are: copy the pattern text (either as a cloned UText or
// as a new UnicodeString wrapped in a UText), copy the scalar fields, copy
// the compiled pattern and group map, then copy each UnicodeSet and its
// Regex8BitSet counterpart. Failures are recorded in fDeferredStatus.
59 RegexPattern
&RegexPattern::operator = (const RegexPattern
&other
) {
61 // Source and destination are the same. Don't do anything.
65 // Clean out any previous contents of object being assigned to.
68 // Give target object a default initialization
// When `other` has no UnicodeString form of the pattern, clone its UText
// (errors from utext_clone are reported into fDeferredStatus).
72 if ( other
.fPatternString
== NULL
) {
73 fPatternString
= NULL
;
74 fPattern
= utext_clone(fPattern
, other
.fPattern
, FALSE
, TRUE
, &fDeferredStatus
);
// Otherwise copy the UnicodeString and open a UText over the copy.
76 fPatternString
= new UnicodeString(*(other
.fPatternString
));
77 UErrorCode status
= U_ZERO_ERROR
;
78 fPattern
= utext_openConstUnicodeString(NULL
, fPatternString
, &status
);
// Any failure opening the UText is recorded as a memory allocation error.
79 if (U_FAILURE(status
)) {
80 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
// Copy the plain value fields.
84 fFlags
= other
.fFlags
;
85 fLiteralText
= other
.fLiteralText
;
// NOTE(review): this unconditionally replaces fDeferredStatus with other's
// value. Confirm that an error recorded by the pattern-copy step above
// (e.g. through utext_clone) cannot be silently overwritten here.
86 fDeferredStatus
= other
.fDeferredStatus
;
87 fMinMatchLen
= other
.fMinMatchLen
;
88 fFrameSize
= other
.fFrameSize
;
89 fDataSize
= other
.fDataSize
;
90 fMaxCaptureDigits
= other
.fMaxCaptureDigits
;
91 fStaticSets
= other
.fStaticSets
;
92 fStaticSets8
= other
.fStaticSets8
;
94 fStartType
= other
.fStartType
;
95 fInitialStringIdx
= other
.fInitialStringIdx
;
96 fInitialStringLen
= other
.fInitialStringLen
;
// The initial-character sets are copied by value through the pointers, so
// both this object's and other's pointers are dereferenced here.
97 *fInitialChars
= *other
.fInitialChars
;
98 fInitialChar
= other
.fInitialChar
;
99 *fInitialChars8
= *other
.fInitialChars8
;
100 fNeedsAltInput
= other
.fNeedsAltInput
;
102 // Copy the pattern. It's just values, nothing deep to copy.
103 fCompiledPat
->assign(*other
.fCompiledPat
, fDeferredStatus
);
104 fGroupMap
->assign(*other
.fGroupMap
, fDeferredStatus
);
106 // Copy the Unicode Sets.
107 // Could be made more efficient if the sets were reference counted and shared,
108 // but I doubt that pattern copying will be particularly common.
109 // Note: init() already added an empty element zero to fSets
111 int32_t numSets
= other
.fSets
->size();
// One Regex8BitSet per entry of other.fSets.
112 fSets8
= new Regex8BitSet
[numSets
];
113 if (fSets8
== NULL
) {
114 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
// Index 0 is skipped; see the note above about element zero.
117 for (i
=1; i
<numSets
; i
++) {
118 if (U_FAILURE(fDeferredStatus
)) {
// Deep-copy the i-th set and append the copy to fSets.
121 UnicodeSet
*sourceSet
= (UnicodeSet
*)other
.fSets
->elementAt(i
);
122 UnicodeSet
*newSet
= new UnicodeSet(*sourceSet
);
123 if (newSet
== NULL
) {
124 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
127 fSets
->addElement(newSet
, fDeferredStatus
);
128 fSets8
[i
] = other
.fSets8
[i
];
135 //--------------------------------------------------------------------------
137 // init Shared initialization for use by constructors.
138 // Bring an uninitialized RegexPattern up to a default state.
140 //--------------------------------------------------------------------------
// Puts the object into its default state: clears the literal text, resets
// the scalar fields, nulls the pattern pointers, and allocates the compiled
// pattern vector, group map, set vector and initial-character sets.
// Allocation problems are recorded in fDeferredStatus.
141 void RegexPattern::init() {
144 fLiteralText
.remove();
147 fDeferredStatus
= U_ZERO_ERROR
;
152 fMaxCaptureDigits
= 1;
155 fStartType
= START_NO_INFO
;
156 fInitialStringIdx
= 0;
157 fInitialStringLen
= 0;
158 fInitialChars
= NULL
;
160 fInitialChars8
= NULL
;
161 fNeedsAltInput
= FALSE
;
163 fPattern
= NULL
; // will be set later
164 fPatternString
= NULL
; // may be set later
// The three vectors receive fDeferredStatus so their constructors can report
// errors into it.
165 fCompiledPat
= new UVector64(fDeferredStatus
);
166 fGroupMap
= new UVector32(fDeferredStatus
);
167 fSets
= new UVector(fDeferredStatus
);
168 fInitialChars
= new UnicodeSet
;
169 fInitialChars8
= new Regex8BitSet
;
170 if (U_FAILURE(fDeferredStatus
)) {
// A NULL result from any of the allocations above is reported as a memory
// allocation error.
173 if (fCompiledPat
== NULL
|| fGroupMap
== NULL
|| fSets
== NULL
||
174 fInitialChars
== NULL
|| fInitialChars8
== NULL
) {
175 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
179 // Slot zero of the vector of sets is reserved. Fill it here.
// NOTE(review): fSets is dereferenced here; confirm the failure checks above
// leave the function before this point when fSets is NULL.
180 fSets
->addElement((int32_t)0, fDeferredStatus
);
184 //--------------------------------------------------------------------------
186 // zap Delete everything owned by this RegexPattern.
188 //--------------------------------------------------------------------------
// Releases what this RegexPattern owns. Visible here: a walk over the
// entries of fSets starting at index 1, deletion of the two initial-character
// sets, closing of the pattern UText, and deletion of the pattern string.
189 void RegexPattern::zap() {
// Starts at index 1, so element zero of fSets is not visited by this loop.
193 for (i
=1; i
<fSets
->size(); i
++) {
195 s
= (UnicodeSet
*)fSets
->elementAt(i
);
// Pointers are reset to NULL after deletion.
206 delete fInitialChars
;
207 fInitialChars
= NULL
;
208 delete fInitialChars8
;
209 fInitialChars8
= NULL
;
210 if (fPattern
!= NULL
) {
211 utext_close(fPattern
);
214 if (fPatternString
!= NULL
) {
215 delete fPatternString
;
216 fPatternString
= NULL
;
221 //--------------------------------------------------------------------------
225 //--------------------------------------------------------------------------
// Destructor. Its body is not visible in this excerpt.
226 RegexPattern::~RegexPattern() {
231 //--------------------------------------------------------------------------
235 //--------------------------------------------------------------------------
// Creates a heap-allocated copy of this pattern using the copy constructor.
236 RegexPattern
*RegexPattern::clone() const {
237 RegexPattern
*copy
= new RegexPattern(*this);
242 //--------------------------------------------------------------------------
244 // operator == (comparison) Consider two patterns to be == if the
245 // pattern strings and the flags are the same.
246 // Note that pattern strings with the same
247 // characters can still be considered different.
249 //--------------------------------------------------------------------------
// Equality comparison. Flags and deferred status must match first. After
// that, when both objects hold a UnicodeString form of the pattern the
// strings are compared; otherwise the comparison falls back to the UText
// forms (fPattern), handling NULL on either side.
250 UBool
RegexPattern::operator ==(const RegexPattern
&other
) const {
251 if (this->fFlags
== other
.fFlags
&& this->fDeferredStatus
== other
.fDeferredStatus
) {
252 if (this->fPatternString
!= NULL
&& other
.fPatternString
!= NULL
) {
253 return *(this->fPatternString
) == *(other
.fPatternString
);
254 } else if (this->fPattern
== NULL
) {
255 if (other
.fPattern
== NULL
) {
258 } else if (other
.fPattern
!= NULL
) {
// Both UTexts are moved to native index 0 before utext_equals is called.
259 UTEXT_SETNATIVEINDEX(this->fPattern
, 0);
260 UTEXT_SETNATIVEINDEX(other
.fPattern
, 0);
261 return utext_equals(this->fPattern
, other
.fPattern
);
267 //---------------------------------------------------------------------
271 //---------------------------------------------------------------------
// compile, UnicodeString mode.
// Validates `flags`, allocates a new RegexPattern, stores the flags in it and
// runs a RegexCompile over `regex`. Errors are reported through `status`:
//   - U_REGEX_INVALID_FLAG      if `flags` has bits outside `allFlags`;
//   - U_REGEX_UNIMPLEMENTED     if UREGEX_CANON_EQ or UREGEX_LITERAL is set;
//   - U_MEMORY_ALLOCATION_ERROR on allocation failure;
//   - the new object's fDeferredStatus if its construction failed.
// The parameter is spelled `&regex` (it had been mis-decoded as the HTML
// entity for the registered-trademark sign); the body refers to it as `regex`.
272 RegexPattern
* U_EXPORT2
273 RegexPattern::compile(const UnicodeString
&regex
,
278 if (U_FAILURE(status
)) {
// Every flag bit this function accepts.
282 const uint32_t allFlags
= UREGEX_CANON_EQ
| UREGEX_CASE_INSENSITIVE
| UREGEX_COMMENTS
|
283 UREGEX_DOTALL
| UREGEX_MULTILINE
| UREGEX_UWORD
|
284 UREGEX_ERROR_ON_UNKNOWN_ESCAPES
| UREGEX_UNIX_LINES
| UREGEX_LITERAL
;
286 if ((flags
& ~allFlags
) != 0) {
287 status
= U_REGEX_INVALID_FLAG
;
// These two flags are accepted as valid bits but reported as unimplemented.
291 if ((flags
& (UREGEX_CANON_EQ
| UREGEX_LITERAL
)) != 0) {
292 status
= U_REGEX_UNIMPLEMENTED
;
296 RegexPattern
*This
= new RegexPattern
;
298 status
= U_MEMORY_ALLOCATION_ERROR
;
// Surface any error recorded while the new object was being constructed.
301 if (U_FAILURE(This
->fDeferredStatus
)) {
302 status
= This
->fDeferredStatus
;
306 This
->fFlags
= flags
;
308 RegexCompile
compiler(This
, status
);
309 compiler
.compile(regex
, pe
, status
);
311 if (U_FAILURE(status
)) {
321 // compile, UText mode
// Validates `flags`, allocates a new RegexPattern, stores the flags in it and
// runs a RegexCompile over the UText `regex`. Errors are reported through
// `status`: U_REGEX_INVALID_FLAG for bits outside `allFlags`,
// U_REGEX_UNIMPLEMENTED when UREGEX_CANON_EQ or UREGEX_LITERAL is set,
// U_MEMORY_ALLOCATION_ERROR on allocation failure, or the new object's
// fDeferredStatus if its construction failed.
323 RegexPattern
* U_EXPORT2
324 RegexPattern::compile(UText
*regex
,
329 if (U_FAILURE(status
)) {
// Every flag bit this function accepts.
333 const uint32_t allFlags
= UREGEX_CANON_EQ
| UREGEX_CASE_INSENSITIVE
| UREGEX_COMMENTS
|
334 UREGEX_DOTALL
| UREGEX_MULTILINE
| UREGEX_UWORD
|
335 UREGEX_ERROR_ON_UNKNOWN_ESCAPES
| UREGEX_UNIX_LINES
| UREGEX_LITERAL
;
337 if ((flags
& ~allFlags
) != 0) {
338 status
= U_REGEX_INVALID_FLAG
;
// These two flags are accepted as valid bits but reported as unimplemented.
342 if ((flags
& (UREGEX_CANON_EQ
| UREGEX_LITERAL
)) != 0) {
343 status
= U_REGEX_UNIMPLEMENTED
;
347 RegexPattern
*This
= new RegexPattern
;
349 status
= U_MEMORY_ALLOCATION_ERROR
;
// Surface any error recorded while the new object was being constructed.
352 if (U_FAILURE(This
->fDeferredStatus
)) {
353 status
= This
->fDeferredStatus
;
357 This
->fFlags
= flags
;
359 RegexCompile
compiler(This
, status
);
360 compiler
.compile(regex
, pe
, status
);
362 if (U_FAILURE(status
)) {
371 // compile with default flags.
// Convenience overload: forwards to compile(regex, 0, pe, err), i.e. passes 0
// for the flags argument. The parameter is spelled `&regex` (it had been
// mis-decoded as the HTML entity for the registered-trademark sign); the
// body refers to it as `regex`.
373 RegexPattern
* U_EXPORT2
374 RegexPattern::compile(const UnicodeString
&regex
,
378 return compile(regex
, 0, pe
, err
);
383 // compile with default flags, UText mode
// Convenience overload: forwards to compile(regex, 0, pe, err), i.e. passes 0
// for the flags argument.
385 RegexPattern
* U_EXPORT2
386 RegexPattern::compile(UText
*regex
,
390 return compile(regex
, 0, pe
, err
);
395 // compile with no UParseErr parameter.
// Convenience overload: forwards to compile(regex, flags, pe, err). `pe` is
// not visible as a parameter in this excerpt. The parameter is spelled
// `&regex` (it had been mis-decoded as the HTML entity for the
// registered-trademark sign); the body refers to it as `regex`.
397 RegexPattern
* U_EXPORT2
398 RegexPattern::compile(const UnicodeString
&regex
,
403 return compile(regex
, flags
, pe
, err
);
408 // compile with no UParseErr parameter, UText mode
// Convenience overload: forwards to compile(regex, flags, pe, err). `pe` is
// not visible as a parameter in this excerpt.
410 RegexPattern
* U_EXPORT2
411 RegexPattern::compile(UText
*regex
,
416 return compile(regex
, flags
, pe
, err
);
420 //---------------------------------------------------------------------
424 //---------------------------------------------------------------------
// Const accessor returning a uint32_t. Its body is not visible in this excerpt.
425 uint32_t RegexPattern::flags() const {
430 //---------------------------------------------------------------------
432 // matcher(UnicodeString, err)
434 //---------------------------------------------------------------------
// Creates a matcher through matcher(status). When one is returned, the
// current `status` is stored in its fDeferredStatus and the matcher is reset
// onto `input`.
435 RegexMatcher
*RegexPattern::matcher(const UnicodeString
&input
,
436 UErrorCode
&status
) const {
437 RegexMatcher
*retMatcher
= matcher(status
);
438 if (retMatcher
!= NULL
) {
439 retMatcher
->fDeferredStatus
= status
;
440 retMatcher
->reset(input
);
446 // matcher, UText mode
// UText variant: creates a matcher through matcher(status). When one is
// returned, the current `status` is stored in its fDeferredStatus and the
// matcher is reset onto `input`. The PatternIsUTextFlag parameter is unnamed
// and therefore unused in the body.
448 RegexMatcher
*RegexPattern::matcher(UText
*input
,
449 PatternIsUTextFlag
/*flag*/,
450 UErrorCode
&status
) const {
451 RegexMatcher
*retMatcher
= matcher(status
);
452 if (retMatcher
!= NULL
) {
453 retMatcher
->fDeferredStatus
= status
;
454 retMatcher
->reset(input
);
// Overload taking a raw UChar pointer. The input parameter is unnamed and
// unused; if `status` is currently a success code it is set to
// U_UNSUPPORTED_ERROR.
460 RegexMatcher
*RegexPattern::matcher(const UChar
* /*input*/,
461 UErrorCode
&status
) const
463 /* This should never get called. The API with UnicodeString should be called instead. */
464 if (U_SUCCESS(status
)) {
465 status
= U_UNSUPPORTED_ERROR
;
471 //---------------------------------------------------------------------
475 //---------------------------------------------------------------------
// Creates a RegexMatcher for this pattern with no input attached.
// An incoming failure in `status` and a failure stored in this pattern's
// fDeferredStatus are both checked before allocating; the latter is copied
// into `status`. A NULL allocation result sets U_MEMORY_ALLOCATION_ERROR.
476 RegexMatcher
*RegexPattern::matcher(UErrorCode
&status
) const {
477 RegexMatcher
*retMatcher
= NULL
;
479 if (U_FAILURE(status
)) {
482 if (U_FAILURE(fDeferredStatus
)) {
483 status
= fDeferredStatus
;
487 retMatcher
= new RegexMatcher(this);
488 if (retMatcher
== NULL
) {
489 status
= U_MEMORY_ALLOCATION_ERROR
;
497 //---------------------------------------------------------------------
499 // matches Convenience function to test for a match, starting
500 // with a pattern string and a data string.
502 //---------------------------------------------------------------------
// Convenience function: compiles `regex` with flags 0, creates a matcher
// over `input`, and stores the result of matcher->matches(status) in retVal.
// Returns FALSE immediately when `status` already holds a failure.
// The parameter is spelled `&regex` (it had been mis-decoded as the HTML
// entity for the registered-trademark sign); the body refers to it as `regex`.
503 UBool U_EXPORT2
RegexPattern::matches(const UnicodeString
&regex
,
504 const UnicodeString
&input
,
506 UErrorCode
&status
) {
508 if (U_FAILURE(status
)) {return FALSE
;}
511 RegexPattern
*pat
= NULL
;
512 RegexMatcher
*matcher
= NULL
;
// NOTE(review): `pat` and `matcher` are dereferenced on the following lines
// with no NULL check in between. Confirm that compile() and matcher() cannot
// return NULL here, or that the callees tolerate it.
514 pat
= RegexPattern::compile(regex
, 0, pe
, status
);
515 matcher
= pat
->matcher(input
, status
);
516 retVal
= matcher
->matches(status
);
525 // matches, UText mode
// UText variant of the convenience function: compiles `regex` with flags 0,
// creates a matcher over `input` (passing PATTERN_IS_UTEXT), and stores the
// result of matcher->matches(status) in retVal. Returns FALSE immediately
// when `status` already holds a failure.
527 UBool U_EXPORT2
RegexPattern::matches(UText
*regex
,
530 UErrorCode
&status
) {
532 if (U_FAILURE(status
)) {return FALSE
;}
535 RegexPattern
*pat
= NULL
;
536 RegexMatcher
*matcher
= NULL
;
// NOTE(review): `pat` and `matcher` are dereferenced on the following lines
// with no NULL check in between. Confirm that compile() and matcher() cannot
// return NULL here, or that the callees tolerate it.
538 pat
= RegexPattern::compile(regex
, 0, pe
, status
);
539 matcher
= pat
->matcher(input
, PATTERN_IS_UTEXT
, status
);
540 retVal
= matcher
->matches(status
);
551 //---------------------------------------------------------------------
555 //---------------------------------------------------------------------
// Returns the pattern as a UnicodeString.
// If a UnicodeString form is stored, a copy of it is returned; if there is
// no UText either, an empty string is returned. Otherwise the text is
// extracted from the UText in two passes: a first utext_extract call with a
// NULL buffer to obtain the UTF-16 length, then a second into the result's
// own buffer.
556 UnicodeString
RegexPattern::pattern() const {
557 if (fPatternString
!= NULL
) {
558 return *fPatternString
;
559 } else if (fPattern
== NULL
) {
560 return UnicodeString();
562 UErrorCode status
= U_ZERO_ERROR
;
563 int64_t nativeLen
= utext_nativeLength(fPattern
);
// First pass: zero-capacity extract, used only for the returned length.
564 int32_t len16
= utext_extract(fPattern
, 0, nativeLen
, NULL
, 0, &status
); // buffer overflow error
565 UnicodeString result
;
// Clear the status left by the first pass before extracting for real.
567 status
= U_ZERO_ERROR
;
568 UChar
*resultChars
= result
.getBuffer(len16
);
569 utext_extract(fPattern
, 0, nativeLen
, resultChars
, len16
, &status
); // unterminated warning
570 result
.releaseBuffer(len16
);
579 //---------------------------------------------------------------------
583 //---------------------------------------------------------------------
// Returns the pattern as a UText. Returns NULL when `status` already holds a
// failure. The last visible statements initialize the RegexStaticSets
// globals and return their shared fEmptyText.
584 UText
*RegexPattern::patternText(UErrorCode
&status
) const {
585 if (U_FAILURE(status
)) {return NULL
;}
// Reached only when U_FAILURE(status) was false; status is then overwritten
// with U_ZERO_ERROR.
586 status
= U_ZERO_ERROR
;
588 if (fPattern
!= NULL
) {
591 RegexStaticSets::initGlobals(&status
);
592 return RegexStaticSets::gStaticSets
->fEmptyText
;
598 //---------------------------------------------------------------------
602 //---------------------------------------------------------------------
// Splits `input` by delegating to RegexMatcher::split on a matcher built
// locally from this pattern. The delegate call is made only when the
// matcher's fDeferredStatus is a success code.
603 int32_t RegexPattern::split(const UnicodeString
&input
,
604 UnicodeString dest
[],
605 int32_t destCapacity
,
606 UErrorCode
&status
) const
608 if (U_FAILURE(status
)) {
612 RegexMatcher
m(this);
614 // Check m's status to make sure all is ok.
615 if (U_SUCCESS(m
.fDeferredStatus
)) {
616 r
= m
.split(input
, dest
, destCapacity
, status
);
// UText variant: splits `input` by delegating to RegexMatcher::split on a
// matcher built locally from this pattern. The delegate call is made only
// when the matcher's fDeferredStatus is a success code.
624 int32_t RegexPattern::split(UText
*input
,
626 int32_t destCapacity
,
627 UErrorCode
&status
) const
629 if (U_FAILURE(status
)) {
633 RegexMatcher
m(this);
635 // Check m's status to make sure all is ok.
636 if (U_SUCCESS(m
.fDeferredStatus
)) {
637 r
= m
.split(input
, dest
, destCapacity
, status
);
644 //---------------------------------------------------------------------
646 // dump Output the compiled form of the pattern.
647 // Debugging function only.
649 //---------------------------------------------------------------------
650 #if defined(REGEX_DEBUG)
// Debug helper: prints one compiled-pattern operation (the entry of
// fCompiledPat at `index`). The op word is split into its value and type
// with URX_VAL / URX_TYPE, the index, raw op and opcode name are printed,
// and the operand is then printed in a form that depends on the opcode type.
651 void RegexPattern::dumpOp(int32_t index
) const {
652 static const char * const opNames
[] = {URX_OPCODE_NAMES
};
653 int32_t op
= fCompiledPat
->elementAti(index
);
654 int32_t val
= URX_VAL(op
);
655 int32_t type
= URX_TYPE(op
);
656 int32_t pinnedType
= type
;
// Range check on the type before it is used to index opNames below.
657 if ((uint32_t)pinnedType
>= sizeof(opNames
)/sizeof(char *)) {
661 REGEX_DUMP_DEBUG_PRINTF(("%4d %08x %-15s ", index
, op
, opNames
[pinnedType
]));
669 case URX_BACKSLASH_G
:
670 case URX_BACKSLASH_X
:
674 // Types with no operand field of interest.
677 case URX_RESERVED_OP
:
678 case URX_START_CAPTURE
:
679 case URX_END_CAPTURE
:
684 case URX_BACKSLASH_B
:
685 case URX_BACKSLASH_BU
:
686 case URX_BACKSLASH_D
:
687 case URX_BACKSLASH_Z
:
690 case URX_CTR_INIT_NG
:
692 case URX_CTR_LOOP_NG
:
693 case URX_RELOC_OPRND
:
697 case URX_STO_INP_LOC
:
709 // types with an integer operand field.
710 REGEX_DUMP_DEBUG_PRINTF(("%d", val
));
// Values of 256 and above are printed as '?'.
715 REGEX_DUMP_DEBUG_PRINTF(("%c", val
<256?val
:'?'));
// The string length is read from the op that follows this one.
721 int32_t lengthOp
= fCompiledPat
->elementAti(index
+1);
722 U_ASSERT(URX_TYPE(lengthOp
) == URX_STRING_LEN
);
723 int32_t length
= URX_VAL(lengthOp
);
725 for (i
=val
; i
<val
+length
; i
++) {
726 UChar c
= fLiteralText
[i
];
// Characters below 32 or at/above 256 are shown as '.'.
727 if (c
< 32 || c
>= 256) {c
= '.';}
728 REGEX_DUMP_DEBUG_PRINTF(("%c", c
));
// Set reference: print the pattern form of the set stored at fSets[val].
737 UnicodeSet
*set
= (UnicodeSet
*)fSets
->elementAt(val
);
738 set
->toPattern(s
, TRUE
);
739 for (int32_t i
=0; i
<s
.length(); i
++) {
740 REGEX_DUMP_DEBUG_PRINTF(("%c", s
.charAt(i
)));
745 case URX_STATIC_SETREF
:
746 case URX_STAT_SETREF_N
:
// The URX_NEG_SET bit in the operand is reported as "NOT ".
749 if (val
& URX_NEG_SET
) {
750 REGEX_DUMP_DEBUG_PRINTF(("NOT "));
753 UnicodeSet
*set
= fStaticSets
[val
];
754 set
->toPattern(s
, TRUE
);
755 for (int32_t i
=0; i
<s
.length(); i
++) {
756 REGEX_DUMP_DEBUG_PRINTF(("%c", s
.charAt(i
)));
763 REGEX_DUMP_DEBUG_PRINTF(("??????"));
766 REGEX_DUMP_DEBUG_PRINTF(("\n"));
771 #if defined(REGEX_DEBUG)
// Debug helper: prints a description of the compiled pattern `This`.
// Output covers the original pattern text, the minimum match length, the
// match start type with its associated string, set or character, and
// finally a table with a header row followed by a loop over every index of
// fCompiledPat.
772 U_CAPI
void U_EXPORT2
773 RegexPatternDump(const RegexPattern
*This
) {
777 REGEX_DUMP_DEBUG_PRINTF(("Original Pattern: "));
// Walk the pattern UText from native index 0 until U_SENTINEL is returned.
778 UChar32 c
= utext_next32From(This
->fPattern
, 0);
779 while (c
!= U_SENTINEL
) {
783 REGEX_DUMP_DEBUG_PRINTF(("%c", c
));
785 c
= UTEXT_NEXT32(This
->fPattern
);
787 REGEX_DUMP_DEBUG_PRINTF(("\n"));
788 REGEX_DUMP_DEBUG_PRINTF((" Min Match Length: %d\n", This
->fMinMatchLen
));
789 REGEX_DUMP_DEBUG_PRINTF((" Match Start Type: %s\n", START_OF_MATCH_STR(This
->fStartType
)));
// START_STRING: print the initial match string taken from fLiteralText.
790 if (This
->fStartType
== START_STRING
) {
791 REGEX_DUMP_DEBUG_PRINTF((" Initial match string: \""));
792 for (i
=This
->fInitialStringIdx
; i
<This
->fInitialStringIdx
+This
->fInitialStringLen
; i
++) {
793 REGEX_DUMP_DEBUG_PRINTF(("%c", This
->fLiteralText
[i
])); // TODO: non-printables, surrogates.
795 REGEX_DUMP_DEBUG_PRINTF(("\"\n"));
// START_SET: print the characters of the initial-character set.
797 } else if (This
->fStartType
== START_SET
) {
798 int32_t numSetChars
= This
->fInitialChars
->size();
799 if (numSetChars
> 20) {
802 REGEX_DUMP_DEBUG_PRINTF((" Match First Chars : "));
803 for (i
=0; i
<numSetChars
; i
++) {
804 UChar32 c
= This
->fInitialChars
->charAt(i
);
// Characters strictly between 0x20 and 0x7e are printed as-is, others in hex.
805 if (0x20<c
&& c
<0x7e) {
806 REGEX_DUMP_DEBUG_PRINTF(("%c ", c
));
808 REGEX_DUMP_DEBUG_PRINTF(("%#x ", c
));
// An ellipsis is printed when fewer characters were listed than the set holds.
811 if (numSetChars
< This
->fInitialChars
->size()) {
812 REGEX_DUMP_DEBUG_PRINTF((" ..."));
814 REGEX_DUMP_DEBUG_PRINTF(("\n"));
// START_CHAR: print the single initial character.
816 } else if (This
->fStartType
== START_CHAR
) {
817 REGEX_DUMP_DEBUG_PRINTF((" First char of Match : "));
818 if (0x20 < This
->fInitialChar
&& This
->fInitialChar
<0x7e) {
819 REGEX_DUMP_DEBUG_PRINTF(("%c\n", This
->fInitialChar
));
821 REGEX_DUMP_DEBUG_PRINTF(("%#x\n", This
->fInitialChar
));
// Table header, then one iteration per index of the compiled pattern.
825 REGEX_DUMP_DEBUG_PRINTF(("\nIndex Binary Type Operand\n" \
826 "-------------------------------------------\n"));
827 for (index
= 0; index
<This
->fCompiledPat
->size(); index
++) {
830 REGEX_DUMP_DEBUG_PRINTF(("\n\n"));
// Invokes the UOBJECT_DEFINE_RTTI_IMPLEMENTATION macro for RegexPattern; the
// macro itself is defined outside this file.
836 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexPattern
)
839 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS