5 ***************************************************************************
6 * Copyright (C) 2002-2004 International Business Machines Corporation *
7 * and others. All rights reserved. *
8 ***************************************************************************
11 #include "unicode/utypes.h"
13 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
15 #include "unicode/regex.h"
16 #include "unicode/uclean.h"
26 //--------------------------------------------------------------------------
28 // RegexPattern Default Constructor
//
//    Visible work: declares a local UErrorCode seeded with U_ZERO_ERROR and
//    calls RegexStaticSets::initGlobals(), passing &fDeferredStatus as its
//    status argument.
//    NOTE(review): the embedded line numbers skip (32 -> 34 -> 37 -> 38), so
//    some statements of this constructor are probably missing from this view.
30 //--------------------------------------------------------------------------
31 RegexPattern::RegexPattern() {
32 UErrorCode status
= U_ZERO_ERROR
;
34 // Init all of this instance's data.
37 // Lazy init of all shared global sets.
38 RegexStaticSets::initGlobals(&fDeferredStatus
);
42 //--------------------------------------------------------------------------
44 // Copy Constructor Note: This is a rather inefficient implementation,
45 // but it probably doesn't matter.
//
//    The initializer list copy-constructs the UObject base from `other`.
//    The constructor body itself is not visible in this view.
47 //--------------------------------------------------------------------------
48 RegexPattern::RegexPattern(const RegexPattern
&other
) : UObject(other
) {
55 //--------------------------------------------------------------------------
57 // Assignment Operator
//
//    Makes this pattern a copy of `other`:
//      - plain members (pattern text, flags, literal text, sizes, start-of-match
//        info, static set references) are assigned directly;
//      - the objects behind fInitialChars / fInitialChars8 are copied by value
//        through the pointers;
//      - fCompiledPat and fGroupMap are copied with assign();
//      - every UnicodeSet in other.fSets from index 1 upward is cloned, and
//        the matching fSets8 entry is copied.
//    Failures are recorded in fDeferredStatus rather than reported directly.
//    NOTE(review): the embedded line numbers skip in several places, so some
//    statements (e.g. the ones the 62/66/69 comments refer to, loop exits and
//    the return) are not visible in this view.
59 //--------------------------------------------------------------------------
60 RegexPattern
&RegexPattern::operator = (const RegexPattern
&other
) {
62 // Source and destination are the same. Don't do anything.
66 // Clean out any previous contents of object being assigned to.
69 // Give target object a default initialization
// Member-by-member copy of the simple fields.
73 fPattern
= other
.fPattern
;
74 fFlags
= other
.fFlags
;
75 fLiteralText
= other
.fLiteralText
;
76 fDeferredStatus
= other
.fDeferredStatus
;
77 fMinMatchLen
= other
.fMinMatchLen
;
78 fFrameSize
= other
.fFrameSize
;
79 fDataSize
= other
.fDataSize
;
80 fMaxCaptureDigits
= other
.fMaxCaptureDigits
;
// The static sets are assigned directly, not cloned.
81 fStaticSets
= other
.fStaticSets
;
82 fStaticSets8
= other
.fStaticSets8
;
84 fStartType
= other
.fStartType
;
85 fInitialStringIdx
= other
.fInitialStringIdx
;
86 fInitialStringLen
= other
.fInitialStringLen
;
// Copy the pointed-to set objects, not the pointers.
// NOTE(review): both sides are dereferenced with no NULL check visible here.
87 *fInitialChars
= *other
.fInitialChars
;
88 fInitialChar
= other
.fInitialChar
;
89 *fInitialChars8
= *other
.fInitialChars8
;
91 // Copy the pattern. It's just values, nothing deep to copy.
92 fCompiledPat
->assign(*other
.fCompiledPat
, fDeferredStatus
);
93 fGroupMap
->assign(*other
.fGroupMap
, fDeferredStatus
);
95 // Copy the Unicode Sets.
96 // Could be made more efficient if the sets were reference counted and shared,
97 // but I doubt that pattern copying will be particularly common.
98 // Note: init() already added an empty element zero to fSets
100 int32_t numSets
= other
.fSets
->size();
// NOTE(review): no NULL check on this array allocation is visible before
// fSets8[i] is written in the loop below.
101 fSets8
= new Regex8BitSet
[numSets
];
// Index 0 is the reserved slot, so cloning starts at 1.
102 for (i
=1; i
<numSets
; i
++) {
103 if (U_FAILURE(fDeferredStatus
)) {
106 UnicodeSet
*sourceSet
= (UnicodeSet
*)other
.fSets
->elementAt(i
);
107 UnicodeSet
*newSet
= new UnicodeSet(*sourceSet
);
108 if (newSet
== NULL
) {
109 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
112 fSets
->addElement(newSet
, fDeferredStatus
);
113 fSets8
[i
] = other
.fSets8
[i
];
120 //--------------------------------------------------------------------------
122 // init Shared initialization for use by constructors.
123 // Bring an uninitialized RegexPattern up to a default state.
//
//    Resets the visible scalar members, clears fLiteralText, allocates the
//    owned containers (fCompiledPat, fGroupMap, fSets, fInitialChars,
//    fInitialChars8) and fills the reserved slot zero of fSets.
//    Any failure is left in fDeferredStatus.
//    NOTE(review): the embedded line numbers skip, so further member resets
//    and the early-return statements are not visible in this view.
125 //--------------------------------------------------------------------------
126 void RegexPattern::init() {
130 fLiteralText
.remove();
133 fDeferredStatus
= U_ZERO_ERROR
;
138 fMaxCaptureDigits
= 1;
141 fStartType
= START_NO_INFO
;
142 fInitialStringIdx
= 0;
143 fInitialStringLen
= 0;
144 fInitialChars
= NULL
;
146 fInitialChars8
= NULL
;
// Allocate the owned containers; the vector constructors take
// fDeferredStatus as their status argument.
148 fCompiledPat
= new UVector32(fDeferredStatus
);
149 fGroupMap
= new UVector32(fDeferredStatus
);
150 fSets
= new UVector(fDeferredStatus
);
151 fInitialChars
= new UnicodeSet
;
152 fInitialChars8
= new Regex8BitSet
;
153 if (U_FAILURE(fDeferredStatus
)) {
// A NULL from any of the allocations above is reported as a memory error.
156 if (fCompiledPat
== NULL
|| fGroupMap
== NULL
|| fSets
== NULL
||
157 fInitialChars
== NULL
|| fInitialChars8
== NULL
) {
158 fDeferredStatus
= U_MEMORY_ALLOCATION_ERROR
;
162 // Slot zero of the vector of sets is reserved. Fill it here.
163 fSets
->addElement((int32_t)0, fDeferredStatus
);
167 //--------------------------------------------------------------------------
169 // zap Delete everything owned by this RegexPattern.
//
//    Visible work: walks fSets from index 1 upward, fetching each element as
//    a UnicodeSet*, then deletes fInitialChars and fInitialChars8 and resets
//    both pointers to NULL.
//    NOTE(review): the embedded line numbers skip (172 -> 176 -> 178 -> 189),
//    so the per-element handling and the release of the other owned members
//    are not visible in this view.
171 //--------------------------------------------------------------------------
172 void RegexPattern::zap() {
// Starts at 1; index 0 is skipped.
176 for (i
=1; i
<fSets
->size(); i
++) {
178 s
= (UnicodeSet
*)fSets
->elementAt(i
);
// Null the pointers after deleting them.
189 delete fInitialChars
;
190 fInitialChars
= NULL
;
191 delete fInitialChars8
;
192 fInitialChars8
= NULL
;
196 //--------------------------------------------------------------------------
//    Destructor. Only the signature is visible in this view; the body and the
//    original banner text are missing.
200 //--------------------------------------------------------------------------
201 RegexPattern::~RegexPattern() {
206 //--------------------------------------------------------------------------
//    clone    Heap-allocates a new RegexPattern copy-constructed from *this.
//    NOTE(review): the return statement is not visible in this view.
210 //--------------------------------------------------------------------------
211 RegexPattern
*RegexPattern::clone() const {
212 RegexPattern
*copy
= new RegexPattern(*this);
217 //--------------------------------------------------------------------------
219 // operator == (comparison) Consider two patterns to be == if the
220 // pattern strings and the flags are the same.
//
//    The expression below additionally requires fDeferredStatus to be equal
//    on both sides.
222 //--------------------------------------------------------------------------
223 UBool
RegexPattern::operator ==(const RegexPattern
&other
) const {
224 UBool r
= this->fFlags
== other
.fFlags
&&
225 this->fPattern
== other
.fPattern
&&
226 this->fDeferredStatus
== other
.fDeferredStatus
;
230 //---------------------------------------------------------------------
//    compile    Build a RegexPattern from a pattern string and flags.
//
//    regex    the pattern source text (passed by const reference).
//    flags    checked against the supported UREGEX_* bits below.
//    status   in/out error code.
//
//    Visible checks, in order:
//      - an incoming failure in status is tested first;
//      - any flag bit outside allFlags sets U_REGEX_INVALID_FLAG;
//      - UREGEX_CANON_EQ sets U_REGEX_UNIMPLEMENTED;
//      - a new RegexPattern is allocated (U_MEMORY_ALLOCATION_ERROR is set on
//        a path whose condition is not visible here);
//      - a failure left in the new object's fDeferredStatus is copied to status;
//      - otherwise flags are stored and a RegexCompile instance compiles
//        `regex` into the new pattern.
//    NOTE(review): the embedded line numbers skip, so the remaining
//    parameters, the return statements and any cleanup are not visible.
234 //---------------------------------------------------------------------
235 RegexPattern
* U_EXPORT2
236 RegexPattern::compile(const UnicodeString
&regex
,
242 if (U_FAILURE(status
)) {
// The complete set of flag bits this function accepts.
246 const uint32_t allFlags
= UREGEX_CANON_EQ
| UREGEX_CASE_INSENSITIVE
| UREGEX_COMMENTS
|
247 UREGEX_DOTALL
| UREGEX_MULTILINE
| UREGEX_UWORD
;
249 if ((flags
& ~allFlags
) != 0) {
250 status
= U_REGEX_INVALID_FLAG
;
// Canonical equivalence is accepted as a flag bit but reported as unimplemented.
254 if ((flags
& UREGEX_CANON_EQ
) != 0) {
255 status
= U_REGEX_UNIMPLEMENTED
;
259 RegexPattern
*This
= new RegexPattern
;
261 status
= U_MEMORY_ALLOCATION_ERROR
;
// Surface any error the constructor stored in the new object.
264 if (U_FAILURE(This
->fDeferredStatus
)) {
265 status
= This
->fDeferredStatus
;
268 This
->fFlags
= flags
;
270 RegexCompile
compiler(This
, status
);
271 compiler
.compile(regex
, pe
, status
);
277 // compile with default flags.
//    Forwards to the four-argument compile() with 0 as the flags value.
//    NOTE(review): the remaining parameters (pe, err) are declared on lines
//    not visible in this view.
279 RegexPattern
* U_EXPORT2
280 RegexPattern::compile(const UnicodeString
&regex
,
284 return compile(regex
, 0, pe
, err
);
290 // compile with no UParseErr parameter.
//    Forwards to the four-argument compile(), passing `pe` in the parse-error
//    position.
//    NOTE(review): the declarations of flags, pe and err are on lines not
//    visible in this view.
292 RegexPattern
* U_EXPORT2
293 RegexPattern::compile( const UnicodeString
&regex
,
298 return compile(regex
, flags
, pe
, err
);
303 //---------------------------------------------------------------------
//    flags    Const accessor returning a uint32_t.
//    Only the signature is visible in this view; the body is missing.
307 //---------------------------------------------------------------------
308 uint32_t RegexPattern::flags() const {
313 //---------------------------------------------------------------------
315 // matcher(UnicodeString, err)
//
//    Creates a matcher through matcher(status) and, when that returned a
//    non-NULL object, resets it onto `input`.
//    NOTE(review): the return statement is not visible in this view.
317 //---------------------------------------------------------------------
318 RegexMatcher
*RegexPattern::matcher(const UnicodeString
&input
,
319 UErrorCode
&status
) const {
320 RegexMatcher
*retMatcher
= matcher(status
);
// Only touch the matcher if creation produced one.
321 if (retMatcher
!= NULL
) {
322 retMatcher
->reset(input
);
// matcher(const UChar *, err)
//    Overload whose input parameter is deliberately unnamed. If status holds
//    no error on entry it is set to U_UNSUPPORTED_ERROR.
//    NOTE(review): the opening brace and the return statement are on lines
//    not visible in this view.
327 RegexMatcher
*RegexPattern::matcher(const UChar
* /*input*/,
328 UErrorCode
&status
) const
330 /* This should never get called. The API with UnicodeString should be called instead. */
331 if (U_SUCCESS(status
)) {
332 status
= U_UNSUPPORTED_ERROR
;
338 //---------------------------------------------------------------------
//    matcher(err)    Create a RegexMatcher bound to this pattern.
//
//    Visible checks, in order: an incoming failure in status; a failure
//    stored in fDeferredStatus, which is copied to status; then the matcher
//    is allocated, and a NULL result sets U_MEMORY_ALLOCATION_ERROR.
//    NOTE(review): the return statements are not visible in this view.
342 //---------------------------------------------------------------------
343 RegexMatcher
*RegexPattern::matcher(UErrorCode
&status
) const {
344 RegexMatcher
*retMatcher
= NULL
;
346 if (U_FAILURE(status
)) {
// Report an error stored earlier in this pattern's fDeferredStatus.
349 if (U_FAILURE(fDeferredStatus
)) {
350 status
= fDeferredStatus
;
354 retMatcher
= new RegexMatcher(this);
355 if (retMatcher
== NULL
) {
356 status
= U_MEMORY_ALLOCATION_ERROR
;
364 //---------------------------------------------------------------------
366 // matches Convenience function to test for a match, starting
367 // with a pattern string and a data string.
//
//    regex    pattern source text.
//    input    text to be matched.
//    status   in/out error code; FALSE is returned at once if it already
//             holds a failure.
//
//    Compiles `regex` with flags 0, creates a matcher over `input`, and
//    stores the result of matcher->matches(status) in retVal.
//    NOTE(review): the embedded line numbers skip, so the pe parameter, the
//    retVal declaration, any cleanup and the final return are not visible.
369 //---------------------------------------------------------------------
370 UBool U_EXPORT2
RegexPattern::matches(const UnicodeString
&regex
,
371 const UnicodeString
&input
,
373 UErrorCode
&status
) {
375 if (U_FAILURE(status
)) {return FALSE
;}
378 RegexPattern
*pat
= NULL
;
379 RegexMatcher
*matcher
= NULL
;
381 pat
= RegexPattern::compile(regex
, 0, pe
, status
);
// NOTE(review): pat and matcher are dereferenced with no NULL check visible
// here; confirm the callees tolerate a failed compile / failed allocation.
382 matcher
= pat
->matcher(input
, status
);
383 retVal
= matcher
->matches(status
);
393 //---------------------------------------------------------------------
//    pattern    Const accessor returning a UnicodeString by value.
//    Only the signature is visible in this view; the body is missing.
397 //---------------------------------------------------------------------
398 UnicodeString
RegexPattern::pattern() const {
405 //---------------------------------------------------------------------
//    split    Split `input` using this pattern.
//
//    input          text to split.
//    dest           caller-supplied array of UnicodeString.
//    destCapacity   passed through to the matcher together with dest.
//    status         in/out error code; tested for failure on entry.
//
//    Builds a temporary RegexMatcher on this pattern and delegates to its
//    split(), keeping the int32_t result in r.
//    NOTE(review): the opening brace and the return statements are on lines
//    not visible in this view.
409 //---------------------------------------------------------------------
410 int32_t RegexPattern::split(const UnicodeString
&input
,
411 UnicodeString dest
[],
412 int32_t destCapacity
,
413 UErrorCode
&status
) const
415 if (U_FAILURE(status
)) {
419 RegexMatcher
m(this);
420 int32_t r
= m
.split(input
, dest
, destCapacity
, status
);
426 //---------------------------------------------------------------------
428 // dump Output the compiled form of the pattern.
429 // Debugging function only.
//
//    dumpOp(index) prints one compiled operation: its index, the raw op word
//    in hex and the opcode name, followed by an operand rendering chosen by
//    the op type. Compiled only when REGEX_DEBUG is defined.
//    NOTE(review): the embedded line numbers skip throughout the switch, so
//    the switch header, several case labels and the break statements are not
//    visible in this view.
431 //---------------------------------------------------------------------
432 #if defined(REGEX_DEBUG)
433 void RegexPattern::dumpOp(int32_t index
) const {
434 static const char * const opNames
[] = {URX_OPCODE_NAMES
};
// Split the op word into its value and type fields.
435 int32_t op
= fCompiledPat
->elementAti(index
);
436 int32_t val
= URX_VAL(op
);
437 int32_t type
= URX_TYPE(op
);
// pinnedType is the value used to index opNames; it is range-checked against
// the table size first (the action taken when out of range is not visible).
438 int32_t pinnedType
= type
;
439 if (pinnedType
>= sizeof(opNames
)/sizeof(char *)) {
443 REGEX_DUMP_DEBUG_PRINTF(("%4d %08x %-15s ", index
, op
, opNames
[pinnedType
]));
449 case URX_DOTANY_ALL_PL
:
453 case URX_BACKSLASH_G
:
454 case URX_BACKSLASH_X
:
458 // Types with no operand field of interest.
461 case URX_RESERVED_OP
:
462 case URX_START_CAPTURE
:
463 case URX_END_CAPTURE
:
468 case URX_BACKSLASH_B
:
469 case URX_BACKSLASH_BU
:
470 case URX_BACKSLASH_D
:
471 case URX_BACKSLASH_Z
:
474 case URX_CTR_INIT_NG
:
476 case URX_CTR_LOOP_NG
:
477 case URX_RELOC_OPRND
:
481 case URX_STO_INP_LOC
:
493 // types with an integer operand field.
494 REGEX_DUMP_DEBUG_PRINTF(("%d", val
));
// Operand printed as a character; values of 256 and above are shown as '?'.
499 REGEX_DUMP_DEBUG_PRINTF(("%c", val
<256?val
:'?'));
// String operand: val is the start index into fLiteralText and the next op
// (asserted to be URX_STRING_LEN) carries the length.
505 int32_t lengthOp
= fCompiledPat
->elementAti(index
+1);
506 U_ASSERT(URX_TYPE(lengthOp
) == URX_STRING_LEN
);
507 int32_t length
= URX_VAL(lengthOp
);
509 for (i
=val
; i
<val
+length
; i
++) {
510 UChar c
= fLiteralText
[i
];
// Characters below 32 or at/above 256 are printed as '.'.
511 if (c
< 32 || c
>= 256) {c
= '.';}
512 REGEX_DUMP_DEBUG_PRINTF(("%c", c
));
// Set operand: val indexes fSets; the set is printed in pattern form.
521 UnicodeSet
*set
= (UnicodeSet
*)fSets
->elementAt(val
);
522 set
->toPattern(s
, TRUE
);
523 for (int32_t i
=0; i
<s
.length(); i
++) {
524 REGEX_DUMP_DEBUG_PRINTF(("%c", s
.charAt(i
)));
529 case URX_STATIC_SETREF
:
530 case URX_STAT_SETREF_N
:
// Static set operand: "NOT " is printed when the URX_NEG_SET bit is set in
// val, then the set is looked up in fStaticSets and printed in pattern form.
533 if (val
& URX_NEG_SET
) {
534 REGEX_DUMP_DEBUG_PRINTF(("NOT "));
537 UnicodeSet
*set
= fStaticSets
[val
];
538 set
->toPattern(s
, TRUE
);
539 for (int32_t i
=0; i
<s
.length(); i
++) {
540 REGEX_DUMP_DEBUG_PRINTF(("%c", s
.charAt(i
)));
547 REGEX_DUMP_DEBUG_PRINTF(("??????"));
550 REGEX_DUMP_DEBUG_PRINTF(("\n"));
// RegexPatternDump   Debug-only dump of a whole compiled pattern.
//    Prints the original pattern text, the minimum match length, the
//    start-of-match type with its associated data (initial string, initial
//    character set, or single initial character), and then iterates over
//    every index of fCompiledPat under a column header.
//    Compiled only when REGEX_DEBUG is defined.
//    NOTE(review): the embedded line numbers skip, so local declarations, the
//    loop bodies' closing lines and the per-op call are not visible here.
555 #if defined(REGEX_DEBUG)
556 U_CAPI
void U_EXPORT2
557 RegexPatternDump(const RegexPattern
*This
) {
561 REGEX_DUMP_DEBUG_PRINTF(("Original Pattern: "));
562 for (i
=0; i
<This
->fPattern
.length(); i
++) {
563 REGEX_DUMP_DEBUG_PRINTF(("%c", This
->fPattern
.charAt(i
)));
565 REGEX_DUMP_DEBUG_PRINTF(("\n"));
566 REGEX_DUMP_DEBUG_PRINTF((" Min Match Length: %d\n", This
->fMinMatchLen
));
567 REGEX_DUMP_DEBUG_PRINTF((" Match Start Type: %s\n", START_OF_MATCH_STR(This
->fStartType
)));
// START_STRING: print the slice of fLiteralText that every match begins with.
// NOTE(review): "sting" in the message below looks like a typo for "string".
568 if (This
->fStartType
== START_STRING
) {
569 REGEX_DUMP_DEBUG_PRINTF((" Initial match sting: \""));
570 for (i
=This
->fInitialStringIdx
; i
<This
->fInitialStringIdx
+This
->fInitialStringLen
; i
++) {
571 REGEX_DUMP_DEBUG_PRINTF(("%c", This
->fLiteralText
[i
])); // TODO: non-printables, surrogates.
// START_SET: print the members of fInitialChars. The count is compared with
// 20 (the action taken is not visible), and " ..." is appended when fewer
// characters were printed than the set contains.
574 } else if (This
->fStartType
== START_SET
) {
575 int32_t numSetChars
= This
->fInitialChars
->size();
576 if (numSetChars
> 20) {
579 REGEX_DUMP_DEBUG_PRINTF((" Match First Chars : "));
580 for (i
=0; i
<numSetChars
; i
++) {
581 UChar32 c
= This
->fInitialChars
->charAt(i
);
// Code points strictly between 0x20 and 0x7e print as characters; the hex
// form below covers the remaining case.
582 if (0x20<c
&& c
<0x7e) {
583 REGEX_DUMP_DEBUG_PRINTF(("%c ", c
));
585 REGEX_DUMP_DEBUG_PRINTF(("%#x ", c
));
588 if (numSetChars
< This
->fInitialChars
->size()) {
589 REGEX_DUMP_DEBUG_PRINTF((" ..."));
591 REGEX_DUMP_DEBUG_PRINTF(("\n"));
// START_CHAR: print the single initial character, using the same
// character-or-hex formats as above.
593 } else if (This
->fStartType
== START_CHAR
) {
594 REGEX_DUMP_DEBUG_PRINTF((" First char of Match : "));
595 if (0x20 < This
->fInitialChar
&& This
->fInitialChar
<0x7e) {
596 REGEX_DUMP_DEBUG_PRINTF(("%c\n", This
->fInitialChar
));
598 REGEX_DUMP_DEBUG_PRINTF(("%#x\n", This
->fInitialChar
));
// Column header, then one iteration per compiled operation.
602 REGEX_DUMP_DEBUG_PRINTF(("\nIndex Binary Type Operand\n" \
603 "-------------------------------------------\n"));
604 for (index
= 0; index
<This
->fCompiledPat
->size(); index
++) {
607 REGEX_DUMP_DEBUG_PRINTF(("\n\n"));
613 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexPattern
)
616 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS