5 ***************************************************************************
6 * Copyright (C) 2002-2003 International Business Machines Corporation *
7 * and others. All rights reserved. *
8 ***************************************************************************
11 #include "unicode/utypes.h"
13 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
15 #include "unicode/regex.h"
25 //--------------------------------------------------------------------------
27 // RegexPattern Default Constructor
29 //--------------------------------------------------------------------------
// Default constructor.
// The visible body passes the address of fDeferredStatus to
// RegexStaticSets::initGlobals(), so any error from the lazy set-up of the
// shared global sets is presumably recorded in the pattern's deferred status
// instead of being reported to the caller directly.
// NOTE(review): this listing omits some source lines (the embedded numbers
// jump from 31 to 34); the first comment suggests a per-instance
// initialization call sits in that gap -- confirm against the full source.
30 RegexPattern::RegexPattern() {
31 // Init all of this instances data.
34 // Lazy init of all shared global sets.
35 RegexStaticSets::initGlobals(&fDeferredStatus
);
39 //--------------------------------------------------------------------------
41 // Copy Constructor Note: This is a rather inefficient implementation,
42 // but it probably doesn't matter.
44 //--------------------------------------------------------------------------
// Copy constructor.
// Only the signature and the UObject(other) base-class initialization are
// visible in this listing; the body lines are not shown.
// NOTE(review): presumably the body default-initializes the object and then
// delegates to the assignment operator -- confirm against the full source.
45 RegexPattern::RegexPattern(const RegexPattern
&other
) : UObject(other
) {
52 //--------------------------------------------------------------------------
54 // Assignment Operator
56 //--------------------------------------------------------------------------
57 RegexPattern
&RegexPattern::operator = (const RegexPattern
&other
) {
59 // Source and destination are the same. Don't do anything.
63 // Clean out any previous contents of object being assigned to.
66 // Give target object a default initialization
70 fPattern
= other
.fPattern
;
71 fFlags
= other
.fFlags
;
72 fLiteralText
= other
.fLiteralText
;
73 fDeferredStatus
= other
.fDeferredStatus
;
74 fMinMatchLen
= other
.fMinMatchLen
;
75 fMaxCaptureDigits
= other
.fMaxCaptureDigits
;
76 fStaticSets
= other
.fStaticSets
;
78 fStartType
= other
.fStartType
;
79 fInitialStringIdx
= other
.fInitialStringIdx
;
80 fInitialStringLen
= other
.fInitialStringLen
;
81 *fInitialChars
= *other
.fInitialChars
;
82 *fInitialChars8
= *other
.fInitialChars8
;
83 fInitialChar
= other
.fInitialChar
;
85 // Copy the pattern. It's just values, nothing deep to copy.
86 fCompiledPat
->assign(*other
.fCompiledPat
, fDeferredStatus
);
87 fGroupMap
->assign(*other
.fGroupMap
, fDeferredStatus
);
89 // Copy the Unicode Sets.
90 // Could be made more efficient if the sets were reference counted and shared,
91 // but I doubt that pattern copying will be particularly common.
92 // Note: init() already added an empty element zero to fSets
94 int32_t numSets
= other
.fSets
->size();
95 fSets8
= new Regex8BitSet
[numSets
];
96 for (i
=1; i
<numSets
; i
++) {
97 if (U_FAILURE(fDeferredStatus
)) {
100 UnicodeSet
*sourceSet
= (UnicodeSet
*)other
.fSets
->elementAt(i
);
101 UnicodeSet
*newSet
= new UnicodeSet(*sourceSet
);
102 if (newSet
== NULL
) {
103 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
106 fSets
->addElement(newSet
, fDeferredStatus
);
107 fSets8
[i
] = other
.fSets8
[i
];
114 //--------------------------------------------------------------------------
116 // init Shared initialization for use by constructors.
117 // Bring an uninitialized RegexPattern up to a default state.
119 //--------------------------------------------------------------------------
// init() -- bring an uninitialized RegexPattern up to a default state.
//
// Visible steps: reset fDeferredStatus to U_ZERO_ERROR, give the scalar
// fields their defaults, allocate the owned containers (fCompiledPat,
// fGroupMap, fSets) and the two initial-character sets, then reserve slot
// zero of fSets.  Failures are recorded in fDeferredStatus; nothing is
// returned.
// NOTE(review): the embedded line numbers show gaps (121, 123, 125-127,
// 133-135, ...), so further field initializations exist that this listing
// does not show.
120 void RegexPattern::init() {
122 fDeferredStatus
= U_ZERO_ERROR
;
124 fMaxCaptureDigits
= 1;
128 fStartType
= START_NO_INFO
;
129 fInitialStringIdx
= 0;
130 fInitialStringLen
= 0;
131 fInitialChars
= NULL
;
132 fInitialChars8
= NULL
;
// Allocate everything the pattern owns.  The UVector/UVector32 constructors
// are handed fDeferredStatus and so can report their own errors through it.
136 fCompiledPat
= new UVector32(fDeferredStatus
);
137 fGroupMap
= new UVector32(fDeferredStatus
);
138 fSets
= new UVector(fDeferredStatus
);
139 fInitialChars
= new UnicodeSet
;
140 fInitialChars8
= new Regex8BitSet
;
// A container constructor reported an error (the body of this branch is not
// visible here).
141 if (U_FAILURE(fDeferredStatus
)) {
// Any allocation that came back NULL is reported as a memory error.  Note
// that the pointers themselves are left NULL in that case.
144 if (fCompiledPat
== NULL
|| fGroupMap
== NULL
|| fSets
== NULL
||
145 fInitialChars
== NULL
|| fInitialChars8
== NULL
) {
146 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
150 // Slot zero of the vector of sets is reserved. Fill it here.
151 fSets
->addElement((int32_t)0, fDeferredStatus
);
155 //--------------------------------------------------------------------------
157 // zap Delete everything owned by this RegexPattern.
159 //--------------------------------------------------------------------------
// zap() -- delete everything owned by this RegexPattern.
//
// Visible steps: walk fSets from index 1 (slot zero is reserved and holds no
// set), fetching each element as a UnicodeSet*, then delete the two
// initial-character sets and reset their pointers to NULL.
// NOTE(review): most of the body is missing from this listing (numbers jump
// 160 -> 164 -> 166 -> 175), including the declarations of i and s and
// whatever is done with each set, so the full clean-up cannot be confirmed
// from here.
// NOTE(review): the loop condition dereferences fSets; after a failed init()
// fSets may be NULL -- confirm this path cannot be reached in that state.
160 void RegexPattern::zap() {
164 for (i
=1; i
<fSets
->size(); i
++) {
166 s
= (UnicodeSet
*)fSets
->elementAt(i
);
175 delete fInitialChars
;
176 fInitialChars
= NULL
;
177 delete fInitialChars8
;
178 fInitialChars8
= NULL
;
184 //--------------------------------------------------------------------------
188 //--------------------------------------------------------------------------
// Destructor.
// NOTE(review): the body is not visible in this listing; presumably it
// releases the owned objects via zap() -- confirm against the full source.
189 RegexPattern::~RegexPattern() {
194 //--------------------------------------------------------------------------
198 //--------------------------------------------------------------------------
// clone() -- create a heap-allocated copy of this pattern using the copy
// constructor.
// NOTE(review): the return statement is not visible in this listing;
// presumably the new object is returned and the caller takes ownership.
// No NULL check on the allocation is visible here.
199 RegexPattern
*RegexPattern::clone() const {
200 RegexPattern
*copy
= new RegexPattern(*this);
205 //--------------------------------------------------------------------------
207 // operator == (comparison) Consider two patterns to be == if the
208 // pattern strings, the flags and the deferred status are the same.
210 //--------------------------------------------------------------------------
// Equality comparison.
// Computes r as TRUE only when the flags, the pattern string and the
// deferred status of the two patterns are all equal.  The compiled form is
// not compared in the visible code.
// NOTE(review): the return statement is not visible in this listing;
// presumably r is returned.
211 UBool
RegexPattern::operator ==(const RegexPattern
&other
) const {
212 UBool r
= this->fFlags
== other
.fFlags
&&
213 this->fPattern
== other
.fPattern
&&
214 this->fDeferredStatus
== other
.fDeferredStatus
;
218 //---------------------------------------------------------------------
222 //---------------------------------------------------------------------
// compile() -- build a RegexPattern from a regular expression string.
//
// Visible behavior, in order:
//   - checks status on entry (the early-exit body is not visible);
//   - reports U_REGEX_INVALID_FLAG when flags has any bit outside allFlags;
//   - reports U_REGEX_UNIMPLEMENTED when UREGEX_CANON_EQ is requested;
//   - allocates a new RegexPattern (This), reporting
//     U_MEMORY_ALLOCATION_ERROR, or the new object's fDeferredStatus when
//     its construction recorded an error;
//   - stores flags in the new pattern and runs a RegexCompile over regex,
//     passing pe and status through.
//
// The parameters flags and pe are used below but are declared on source
// lines that this listing omits.  "®ex" is a mis-encoded "&regex".
//
// NOTE(review): the statement following each status assignment is not
// visible, so the value returned on each path cannot be confirmed from here.
// In particular, check that the fDeferredStatus failure path, and a failed
// compiler.compile(), release This instead of leaking it.
223 RegexPattern
*RegexPattern::compile(
224 const UnicodeString
®ex
,
227 UErrorCode
&status
) {
229 if (U_FAILURE(status
)) {
// Every flag bit this implementation recognizes.
233 const uint32_t allFlags
= UREGEX_CANON_EQ
| UREGEX_CASE_INSENSITIVE
| UREGEX_COMMENTS
|
234 UREGEX_DOTALL
| UREGEX_MULTILINE
;
236 if ((flags
& ~allFlags
) != 0) {
237 status
= U_REGEX_INVALID_FLAG
;
// Canonical-equivalence matching is accepted as a flag value but rejected
// as unimplemented.
241 if ((flags
& UREGEX_CANON_EQ
) != 0) {
242 status
= U_REGEX_UNIMPLEMENTED
;
246 RegexPattern
*This
= new RegexPattern
;
248 status
= U_MEMORY_ALLOCATION_ERROR
;
// The constructor cannot report errors directly, so surface whatever it
// left in fDeferredStatus.
251 if (U_FAILURE(This
->fDeferredStatus
)) {
252 status
= This
->fDeferredStatus
;
255 This
->fFlags
= flags
;
257 RegexCompile
compiler(This
, status
);
258 compiler
.compile(regex
, pe
, status
);
264 // compile with default flags.
// compile() overload that uses default flags: forwards to the main
// compile() with a flags value of 0.
// The pe and err parameters are declared on source lines that this listing
// omits.  "®ex" is a mis-encoded "&regex".
266 RegexPattern
*RegexPattern::compile( const UnicodeString
®ex
,
270 return compile(regex
, 0, pe
, err
);
276 // compile with no UParseErr parameter.
// compile() overload without a parse-error parameter: forwards to the main
// compile() with the caller's flags.
// NOTE(review): pe is passed on but its declaration is not visible in this
// listing; presumably it is a local parse-error object whose contents are
// discarded -- confirm.  The flags and err parameters are likewise declared
// on omitted lines.  "®ex" is a mis-encoded "&regex".
278 RegexPattern
*RegexPattern::compile( const UnicodeString
®ex
,
283 return compile(regex
, flags
, pe
, err
);
288 //---------------------------------------------------------------------
292 //---------------------------------------------------------------------
// flags() -- accessor returning a uint32_t.
// NOTE(review): the body is not visible in this listing; presumably it
// returns the fFlags value stored by compile() -- confirm.
293 uint32_t RegexPattern::flags() const {
298 //---------------------------------------------------------------------
300 // matcher(UnicodeString, err)
302 //---------------------------------------------------------------------
// matcher(input, status) -- create a RegexMatcher for this pattern and point
// it at the given input text.
// Delegates creation to matcher(status); reset(input) is only called when
// that call produced a matcher, so a NULL result is passed through safely.
// NOTE(review): the return statement is not visible in this listing;
// presumably retMatcher is returned and owned by the caller.
303 RegexMatcher
*RegexPattern::matcher(const UnicodeString
&input
,
304 UErrorCode
&status
) const {
305 RegexMatcher
*retMatcher
= matcher(status
);
306 if (retMatcher
!= NULL
) {
307 retMatcher
->reset(input
);
314 //---------------------------------------------------------------------
318 //---------------------------------------------------------------------
// matcher(status) -- create a RegexMatcher for this pattern with no input
// text attached.
// Visible behavior: checks the incoming status; copies the pattern's
// fDeferredStatus into status when the pattern itself is in a failed state;
// otherwise allocates a RegexMatcher for this pattern and reports
// U_MEMORY_ALLOCATION_ERROR when the allocation returns NULL.
// NOTE(review): the bodies of the early exits and the final return are not
// visible in this listing; retMatcher starts as NULL, so presumably NULL is
// what the failure paths return.
319 RegexMatcher
*RegexPattern::matcher(UErrorCode
&status
) const {
320 RegexMatcher
*retMatcher
= NULL
;
322 if (U_FAILURE(status
)) {
// An error recorded while the pattern was being built is reported now.
325 if (U_FAILURE(fDeferredStatus
)) {
326 status
= fDeferredStatus
;
330 retMatcher
= new RegexMatcher(this);
331 if (retMatcher
== NULL
) {
332 status
= U_MEMORY_ALLOCATION_ERROR
;
340 //---------------------------------------------------------------------
342 // matches Convenience function to test for a match, starting
343 // with a pattern string and a data string.
345 //---------------------------------------------------------------------
346 UBool
RegexPattern::matches(const UnicodeString
®ex
,
347 const UnicodeString
&input
,
349 UErrorCode
&status
) {
351 if (U_FAILURE(status
)) {return FALSE
;}
354 RegexPattern
*pat
= NULL
;
355 RegexMatcher
*matcher
= NULL
;
357 pat
= RegexPattern::compile(regex
, 0, pe
, status
);
358 matcher
= pat
->matcher(input
, status
);
359 retVal
= matcher
->matches(status
);
369 //---------------------------------------------------------------------
373 //---------------------------------------------------------------------
// pattern() -- accessor returning a UnicodeString by value.
// NOTE(review): the body is not visible in this listing; presumably it
// returns the original pattern string held in fPattern -- confirm.
374 UnicodeString
RegexPattern::pattern() const {
381 //---------------------------------------------------------------------
385 //---------------------------------------------------------------------
// split() -- split input into the caller-supplied dest array.
// Checks status on entry, then builds a temporary RegexMatcher on the stack
// for this pattern and delegates to RegexMatcher::split() with the same
// input, dest, destCapacity and status.
// NOTE(review): the early-exit body and the final return are not visible in
// this listing; presumably r, the matcher's result, is returned.
386 int32_t RegexPattern::split(const UnicodeString
&input
,
387 UnicodeString dest
[],
388 int32_t destCapacity
,
389 UErrorCode
&status
) const
391 if (U_FAILURE(status
)) {
395 RegexMatcher
m(this);
396 int32_t r
= m
.split(input
, dest
, destCapacity
, status
);
402 //---------------------------------------------------------------------
404 // dump Output the compiled form of the pattern.
405 // Debugging function only.
407 //---------------------------------------------------------------------
// dumpOp() -- print one operation of the compiled pattern.  Debugging only:
// the body is compiled only when REGEX_DEBUG is defined.
//
// index selects the operation in fCompiledPat.  The op word is split into
// its value and type with URX_VAL / URX_TYPE, then a line is printed holding
// the index, the raw op word in hex and the op name taken from opNames,
// followed by an operand rendering that depends on the op type.
//
// NOTE(review): many source lines are missing from this listing, including
// the switch statement itself, several case labels, and the declarations of
// i and s, so the grouping of cases below is only partly visible.
408 void RegexPattern::dumpOp(int32_t index
) const {
409 #if defined(REGEX_DEBUG)
410 static const char * const opNames
[] = {URX_OPCODE_NAMES
};
411 int32_t op
= fCompiledPat
->elementAti(index
);
412 int32_t val
= URX_VAL(op
);
413 int32_t type
= URX_TYPE(op
);
// pinnedType is range-checked against the opNames table before it is used as
// an index (the body of the check is not visible here).
414 int32_t pinnedType
= type
;
415 if (pinnedType
>= sizeof(opNames
)/sizeof(char *)) {
419 REGEX_DUMP_DEBUG_PRINTF("%4d %08x %-15s ", index
, op
, opNames
[pinnedType
]);
425 case URX_DOTANY_ALL_PL
:
429 case URX_BACKSLASH_G
:
430 case URX_BACKSLASH_X
:
434 // Types with no operand field of interest.
437 case URX_RESERVED_OP
:
438 case URX_START_CAPTURE
:
439 case URX_END_CAPTURE
:
444 case URX_BACKSLASH_B
:
445 case URX_BACKSLASH_D
:
446 case URX_BACKSLASH_Z
:
449 case URX_CTR_INIT_NG
:
451 case URX_CTR_LOOP_NG
:
452 case URX_RELOC_OPRND
:
456 case URX_STO_INP_LOC
:
468 // types with an integer operand field.
469 REGEX_DUMP_DEBUG_PRINTF("%d", val
);
// Single-character operand: values of 256 and above are shown as '?'.
474 REGEX_DUMP_DEBUG_PRINTF("%c", val
<256?val
:'?');
// String operand: val is the start index into fLiteralText and the next op
// in the compiled pattern carries the length.
480 int32_t lengthOp
= fCompiledPat
->elementAti(index
+1);
481 U_ASSERT(URX_TYPE(lengthOp
) == URX_STRING_LEN
);
482 int32_t length
= URX_VAL(lengthOp
);
484 for (i
=val
; i
<val
+length
; i
++) {
485 UChar c
= fLiteralText
[i
];
// Code units below 32 or at or above 256 are shown as '.'.
486 if (c
< 32 || c
>= 256) {c
= '.';}
487 REGEX_DUMP_DEBUG_PRINTF("%c", c
);
// Set reference: val indexes fSets; the set is printed in pattern form.
496 UnicodeSet
*set
= (UnicodeSet
*)fSets
->elementAt(val
);
497 set
->toPattern(s
, TRUE
);
498 for (int32_t i
=0; i
<s
.length(); i
++) {
499 REGEX_DUMP_DEBUG_PRINTF("%c", s
.charAt(i
));
504 case URX_STATIC_SETREF
:
505 case URX_STAT_SETREF_N
:
// Static set reference: a set URX_NEG_SET bit marks a negated set.
// NOTE(review): val is used to index fStaticSets just below; confirm that
// the omitted lines clear the URX_NEG_SET bit from val first.
508 if (val
& URX_NEG_SET
) {
509 REGEX_DUMP_DEBUG_PRINTF("NOT ");
512 UnicodeSet
*set
= fStaticSets
[val
];
513 set
->toPattern(s
, TRUE
);
514 for (int32_t i
=0; i
<s
.length(); i
++) {
515 REGEX_DUMP_DEBUG_PRINTF("%c", s
.charAt(i
));
522 REGEX_DUMP_DEBUG_PRINTF("??????");
525 REGEX_DUMP_DEBUG_PRINTF("\n");
// dump() -- print the whole compiled pattern.  Debugging only: the body is
// compiled only when REGEX_DEBUG is defined.
//
// Output, in order: the original pattern string, the minimum match length,
// the match start type, the start-of-match detail that belongs to that type
// (initial string, initial character set or initial character), and finally
// a table header followed by a loop over every op in fCompiledPat.
//
// NOTE(review): the declarations of i and index and the body of the final
// loop are not visible in this listing; presumably the loop calls
// dumpOp(index) for each operation.
531 void RegexPattern::dump() const {
532 #if defined(REGEX_DEBUG)
536 REGEX_DUMP_DEBUG_PRINTF("Original Pattern: ");
537 for (i
=0; i
<fPattern
.length(); i
++) {
538 REGEX_DUMP_DEBUG_PRINTF("%c", fPattern
.charAt(i
));
540 REGEX_DUMP_DEBUG_PRINTF("\n");
541 REGEX_DUMP_DEBUG_PRINTF(" Min Match Length: %d\n", fMinMatchLen
);
542 REGEX_DUMP_DEBUG_PRINTF(" Match Start Type: %s\n", START_OF_MATCH_STR(fStartType
));
// Matches must begin with a literal string held in fLiteralText.
543 if (fStartType
== START_STRING
) {
544 REGEX_DUMP_DEBUG_PRINTF(" Initial match sting: \"");
545 for (i
=fInitialStringIdx
; i
<fInitialStringIdx
+fInitialStringLen
; i
++) {
546 REGEX_DUMP_DEBUG_PRINTF("%c", fLiteralText
[i
]); // TODO: non-printables, surrogates.
// Matches must begin with a character from fInitialChars.  Sets larger than
// 20 get special handling (body not visible; the later " ..." output
// suggests the printed count is capped).
549 } else if (fStartType
== START_SET
) {
550 int32_t numSetChars
= fInitialChars
->size();
551 if (numSetChars
> 20) {
554 REGEX_DUMP_DEBUG_PRINTF(" Match First Chars : ");
555 for (i
=0; i
<numSetChars
; i
++) {
556 UChar32 c
= fInitialChars
->charAt(i
);
// Characters strictly between 0x20 and 0x7e are printed as themselves; the
// "%#x" form is also present for the remaining case.
557 if (0x20<c
&& c
<0x7e) {
558 REGEX_DUMP_DEBUG_PRINTF("%c ", c
);
560 REGEX_DUMP_DEBUG_PRINTF("%#x ", c
);
563 if (numSetChars
< fInitialChars
->size()) {
564 REGEX_DUMP_DEBUG_PRINTF(" ...");
566 REGEX_DUMP_DEBUG_PRINTF("\n");
// Matches must begin with the single character fInitialChar.
568 } else if (fStartType
== START_CHAR
) {
569 REGEX_DUMP_DEBUG_PRINTF(" First char of Match : ");
570 if (0x20 < fInitialChar
&& fInitialChar
<0x7e) {
571 REGEX_DUMP_DEBUG_PRINTF("%c\n", fInitialChar
);
573 REGEX_DUMP_DEBUG_PRINTF("%#x\n", fInitialChar
);
577 REGEX_DUMP_DEBUG_PRINTF("\nIndex Binary Type Operand\n"
578 "-------------------------------------------\n");
579 for (index
= 0; index
<fCompiledPat
->size(); index
++) {
582 REGEX_DUMP_DEBUG_PRINTF("\n\n");
// Definition of the static class member fgClassID, a const char with the
// value 0.
// NOTE(review): presumably only its address matters, serving as the unique
// class ID for ICU's run-time type identification -- confirm.
588 const char RegexPattern::fgClassID
= 0;
590 //----------------------------------------------------------------------------------
592 // regex_cleanup Memory cleanup function, free/delete all
593 // cached memory. Called by ICU's u_cleanup() function.
595 //----------------------------------------------------------------------------------
// regex_cleanup() -- library clean-up hook for the regular expression code.
// The visible body calls RegexCompile::cleanup().
// NOTE(review): the return type, any linkage specifiers, and the rest of the
// body fall on source lines that this listing omits.
597 regex_cleanup(void) {
598 RegexCompile::cleanup();
603 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS