]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/repattrn.cpp
ICU-400.37.tar.gz
[apple/icu.git] / icuSources / i18n / repattrn.cpp
CommitLineData
b75a7d8f 1//
46f4442e 2// file: repattrn.cpp
b75a7d8f
A
3//
4/*
5***************************************************************************
46f4442e 6* Copyright (C) 2002-2008 International Business Machines Corporation *
b75a7d8f
A
7* and others. All rights reserved. *
8***************************************************************************
9*/
10
11#include "unicode/utypes.h"
12
13#if !UCONFIG_NO_REGULAR_EXPRESSIONS
14
15#include "unicode/regex.h"
374ca955 16#include "unicode/uclean.h"
b75a7d8f
A
17#include "uassert.h"
18#include "uvector.h"
19#include "uvectr32.h"
20#include "regexcmp.h"
21#include "regeximp.h"
22#include "regexst.h"
23
24U_NAMESPACE_BEGIN
25
26//--------------------------------------------------------------------------
27//
28// RegexPattern Default Constructor
29//
30//--------------------------------------------------------------------------
31RegexPattern::RegexPattern() {
374ca955
A
32 UErrorCode status = U_ZERO_ERROR;
33 u_init(&status);
b75a7d8f
A
34 // Init all of this instances data.
35 init();
36
37 // Lazy init of all shared global sets.
38 RegexStaticSets::initGlobals(&fDeferredStatus);
73c04bcf 39}
b75a7d8f
A
40
41
42//--------------------------------------------------------------------------
43//
44// Copy Constructor Note: This is a rather inefficient implementation,
45// but it probably doesn't matter.
46//
47//--------------------------------------------------------------------------
48RegexPattern::RegexPattern(const RegexPattern &other) : UObject(other) {
46f4442e 49 init();
b75a7d8f
A
50 *this = other;
51}
52
53
54
55//--------------------------------------------------------------------------
56//
57// Assignment Operator
58//
59//--------------------------------------------------------------------------
60RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
61 if (this == &other) {
62 // Source and destination are the same. Don't do anything.
63 return *this;
64 }
65
66 // Clean out any previous contents of object being assigned to.
67 zap();
68
69 // Give target object a default initialization
70 init();
71
72 // Copy simple fields
73 fPattern = other.fPattern;
74 fFlags = other.fFlags;
75 fLiteralText = other.fLiteralText;
76 fDeferredStatus = other.fDeferredStatus;
77 fMinMatchLen = other.fMinMatchLen;
374ca955
A
78 fFrameSize = other.fFrameSize;
79 fDataSize = other.fDataSize;
b75a7d8f 80 fMaxCaptureDigits = other.fMaxCaptureDigits;
46f4442e 81 fStaticSets = other.fStaticSets;
374ca955 82 fStaticSets8 = other.fStaticSets8;
46f4442e 83
b75a7d8f
A
84 fStartType = other.fStartType;
85 fInitialStringIdx = other.fInitialStringIdx;
86 fInitialStringLen = other.fInitialStringLen;
87 *fInitialChars = *other.fInitialChars;
b75a7d8f 88 fInitialChar = other.fInitialChar;
374ca955 89 *fInitialChars8 = *other.fInitialChars8;
b75a7d8f
A
90
91 // Copy the pattern. It's just values, nothing deep to copy.
92 fCompiledPat->assign(*other.fCompiledPat, fDeferredStatus);
93 fGroupMap->assign(*other.fGroupMap, fDeferredStatus);
94
46f4442e 95 // Copy the Unicode Sets.
b75a7d8f 96 // Could be made more efficient if the sets were reference counted and shared,
46f4442e 97 // but I doubt that pattern copying will be particularly common.
b75a7d8f
A
98 // Note: init() already added an empty element zero to fSets
99 int32_t i;
100 int32_t numSets = other.fSets->size();
101 fSets8 = new Regex8BitSet[numSets];
46f4442e
A
102 if (fSets8 == NULL) {
103 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
104 return *this;
105 }
b75a7d8f
A
106 for (i=1; i<numSets; i++) {
107 if (U_FAILURE(fDeferredStatus)) {
108 return *this;
109 }
110 UnicodeSet *sourceSet = (UnicodeSet *)other.fSets->elementAt(i);
111 UnicodeSet *newSet = new UnicodeSet(*sourceSet);
112 if (newSet == NULL) {
113 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
114 break;
115 }
116 fSets->addElement(newSet, fDeferredStatus);
117 fSets8[i] = other.fSets8[i];
118 }
119
120 return *this;
121}
122
123
124//--------------------------------------------------------------------------
125//
126// init Shared initialization for use by constructors.
127// Bring an uninitialized RegexPattern up to a default state.
128//
129//--------------------------------------------------------------------------
130void RegexPattern::init() {
374ca955 131 fPattern.remove();
b75a7d8f 132 fFlags = 0;
374ca955
A
133 fCompiledPat = 0;
134 fLiteralText.remove();
135 fSets = NULL;
136 fSets8 = NULL;
b75a7d8f
A
137 fDeferredStatus = U_ZERO_ERROR;
138 fMinMatchLen = 0;
b75a7d8f
A
139 fFrameSize = 0;
140 fDataSize = 0;
374ca955 141 fGroupMap = NULL;
46f4442e 142 fMaxCaptureDigits = 1;
374ca955
A
143 fStaticSets = NULL;
144 fStaticSets8 = NULL;
b75a7d8f
A
145 fStartType = START_NO_INFO;
146 fInitialStringIdx = 0;
147 fInitialStringLen = 0;
148 fInitialChars = NULL;
b75a7d8f 149 fInitialChar = 0;
374ca955 150 fInitialChars8 = NULL;
46f4442e 151
b75a7d8f
A
152 fCompiledPat = new UVector32(fDeferredStatus);
153 fGroupMap = new UVector32(fDeferredStatus);
154 fSets = new UVector(fDeferredStatus);
155 fInitialChars = new UnicodeSet;
156 fInitialChars8 = new Regex8BitSet;
157 if (U_FAILURE(fDeferredStatus)) {
158 return;
159 }
160 if (fCompiledPat == NULL || fGroupMap == NULL || fSets == NULL ||
161 fInitialChars == NULL || fInitialChars8 == NULL) {
162 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
163 return;
164 }
165
166 // Slot zero of the vector of sets is reserved. Fill it here.
167 fSets->addElement((int32_t)0, fDeferredStatus);
168}
169
170
171//--------------------------------------------------------------------------
172//
46f4442e 173// zap Delete everything owned by this RegexPattern.
b75a7d8f
A
174//
175//--------------------------------------------------------------------------
176void RegexPattern::zap() {
177 delete fCompiledPat;
178 fCompiledPat = NULL;
179 int i;
180 for (i=1; i<fSets->size(); i++) {
181 UnicodeSet *s;
182 s = (UnicodeSet *)fSets->elementAt(i);
183 if (s != NULL) {
184 delete s;
185 }
186 }
187 delete fSets;
188 fSets = NULL;
374ca955
A
189 delete[] fSets8;
190 fSets8 = NULL;
b75a7d8f
A
191 delete fGroupMap;
192 fGroupMap = NULL;
193 delete fInitialChars;
194 fInitialChars = NULL;
195 delete fInitialChars8;
196 fInitialChars8 = NULL;
b75a7d8f
A
197}
198
199
200//--------------------------------------------------------------------------
201//
202// Destructor
203//
204//--------------------------------------------------------------------------
205RegexPattern::~RegexPattern() {
206 zap();
73c04bcf 207}
b75a7d8f
A
208
209
210//--------------------------------------------------------------------------
211//
212// Clone
213//
214//--------------------------------------------------------------------------
46f4442e 215RegexPattern *RegexPattern::clone() const {
b75a7d8f
A
216 RegexPattern *copy = new RegexPattern(*this);
217 return copy;
73c04bcf 218}
b75a7d8f
A
219
220
221//--------------------------------------------------------------------------
222//
223// operator == (comparison) Consider two patterns to be == if the
224// pattern strings and the flags are the same.
225//
226//--------------------------------------------------------------------------
227UBool RegexPattern::operator ==(const RegexPattern &other) const {
228 UBool r = this->fFlags == other.fFlags &&
229 this->fPattern == other.fPattern &&
230 this->fDeferredStatus == other.fDeferredStatus;
231 return r;
232}
233
234//---------------------------------------------------------------------
235//
46f4442e 236// compile
b75a7d8f
A
237//
238//---------------------------------------------------------------------
374ca955
A
239RegexPattern * U_EXPORT2
240RegexPattern::compile(const UnicodeString &regex,
241 uint32_t flags,
242 UParseError &pe,
243 UErrorCode &status)
244{
b75a7d8f
A
245
246 if (U_FAILURE(status)) {
247 return NULL;
248 }
249
250 const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS |
46f4442e
A
251 UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD |
252 UREGEX_ERROR_ON_UNKNOWN_ESCAPES | UREGEX_UNIX_LINES;
b75a7d8f
A
253
254 if ((flags & ~allFlags) != 0) {
255 status = U_REGEX_INVALID_FLAG;
256 return NULL;
257 }
258
259 if ((flags & UREGEX_CANON_EQ) != 0) {
260 status = U_REGEX_UNIMPLEMENTED;
261 return NULL;
262 }
263
264 RegexPattern *This = new RegexPattern;
265 if (This == NULL) {
266 status = U_MEMORY_ALLOCATION_ERROR;
267 return NULL;
268 }
269 if (U_FAILURE(This->fDeferredStatus)) {
270 status = This->fDeferredStatus;
46f4442e 271 delete This;
b75a7d8f
A
272 return NULL;
273 }
274 This->fFlags = flags;
275
276 RegexCompile compiler(This, status);
277 compiler.compile(regex, pe, status);
46f4442e
A
278
279 if (U_FAILURE(status)) {
280 delete This;
281 This = NULL;
282 }
b75a7d8f
A
283
284 return This;
73c04bcf 285}
46f4442e 286
b75a7d8f
A
287//
288// compile with default flags.
289//
374ca955
A
290RegexPattern * U_EXPORT2
291RegexPattern::compile(const UnicodeString &regex,
292 UParseError &pe,
46f4442e 293 UErrorCode &err)
b75a7d8f 294{
46f4442e 295 return compile(regex, 0, pe, err);
b75a7d8f
A
296}
297
298
299
300//
301// compile with no UParseErr parameter.
302//
374ca955
A
303RegexPattern * U_EXPORT2
304RegexPattern::compile( const UnicodeString &regex,
b75a7d8f 305 uint32_t flags,
46f4442e 306 UErrorCode &err)
b75a7d8f
A
307{
308 UParseError pe;
46f4442e 309 return compile(regex, flags, pe, err);
b75a7d8f
A
310}
311
312
313
314//---------------------------------------------------------------------
315//
316// flags
317//
318//---------------------------------------------------------------------
319uint32_t RegexPattern::flags() const {
320 return fFlags;
321}
322
323
324//---------------------------------------------------------------------
325//
326// matcher(UnicodeString, err)
327//
328//---------------------------------------------------------------------
329RegexMatcher *RegexPattern::matcher(const UnicodeString &input,
330 UErrorCode &status) const {
331 RegexMatcher *retMatcher = matcher(status);
46f4442e 332 retMatcher->fDeferredStatus = status;
b75a7d8f
A
333 if (retMatcher != NULL) {
334 retMatcher->reset(input);
335 }
336 return retMatcher;
73c04bcf 337}
b75a7d8f 338
#if 0
RegexMatcher *RegexPattern::matcher(const UChar * /*input*/,
                                    UErrorCode  &status) const
{
    /* This should never get called. The API with UnicodeString should be called instead. */
    // An error that is already pending is left as it is.
    if (!U_FAILURE(status)) {
        status = U_UNSUPPORTED_ERROR;
    }
    return NULL;
}
#endif
b75a7d8f
A
350
351//---------------------------------------------------------------------
352//
353// matcher(status)
354//
355//---------------------------------------------------------------------
356RegexMatcher *RegexPattern::matcher(UErrorCode &status) const {
357 RegexMatcher *retMatcher = NULL;
358
359 if (U_FAILURE(status)) {
360 return NULL;
361 }
362 if (U_FAILURE(fDeferredStatus)) {
363 status = fDeferredStatus;
364 return NULL;
365 }
366
46f4442e 367 retMatcher = new RegexMatcher(this);
b75a7d8f
A
368 if (retMatcher == NULL) {
369 status = U_MEMORY_ALLOCATION_ERROR;
370 return NULL;
371 }
372 return retMatcher;
73c04bcf 373}
b75a7d8f
A
374
375
376
377//---------------------------------------------------------------------
378//
379// matches Convenience function to test for a match, starting
380// with a pattern string and a data string.
381//
382//---------------------------------------------------------------------
374ca955 383UBool U_EXPORT2 RegexPattern::matches(const UnicodeString &regex,
b75a7d8f
A
384 const UnicodeString &input,
385 UParseError &pe,
386 UErrorCode &status) {
387
388 if (U_FAILURE(status)) {return FALSE;}
389
390 UBool retVal;
391 RegexPattern *pat = NULL;
392 RegexMatcher *matcher = NULL;
393
394 pat = RegexPattern::compile(regex, 0, pe, status);
395 matcher = pat->matcher(input, status);
396 retVal = matcher->matches(status);
397
398 delete matcher;
399 delete pat;
400 return retVal;
401}
402
403
404
405
406//---------------------------------------------------------------------
407//
408// pattern
409//
410//---------------------------------------------------------------------
411UnicodeString RegexPattern::pattern() const {
412 return fPattern;
413}
414
415
416
417
418//---------------------------------------------------------------------
419//
420// split
421//
422//---------------------------------------------------------------------
423int32_t RegexPattern::split(const UnicodeString &input,
424 UnicodeString dest[],
425 int32_t destCapacity,
426 UErrorCode &status) const
427{
428 if (U_FAILURE(status)) {
429 return 0;
430 };
431
432 RegexMatcher m(this);
46f4442e
A
433 int32_t r = 0;
434 // Check m's status to make sure all is ok.
435 if (U_SUCCESS(m.fDeferredStatus)) {
436 r = m.split(input, dest, destCapacity, status);
437 }
b75a7d8f
A
438 return r;
439}
440
441
442
443//---------------------------------------------------------------------
444//
445// dump Output the compiled form of the pattern.
446// Debugging function only.
447//
448//---------------------------------------------------------------------
#if defined(REGEX_DEBUG)
// Print a single operation of the compiled pattern: its index, raw value,
// opcode name, and an operand rendering that depends on the opcode type.
void RegexPattern::dumpOp(int32_t index) const {
    static const char * const opNames[] = {URX_OPCODE_NAMES};
    const int32_t op   = fCompiledPat->elementAti(index);
    int32_t       val  = URX_VAL(op);
    const int32_t type = URX_TYPE(op);

    // A type with no entry in the name table is shown with the name in slot zero.
    const UBool   typeIsNamed = (uint32_t)type < sizeof(opNames)/sizeof(char *);
    const int32_t pinnedType  = typeIsNamed ? type : 0;

    REGEX_DUMP_DEBUG_PRINTF(("%4d %08x %-15s ", index, op, opNames[pinnedType]));
    switch (type) {
    case URX_NOP:
    case URX_DOTANY:
    case URX_DOTANY_ALL:
    case URX_FAIL:
    case URX_CARET:
    case URX_DOLLAR:
    case URX_BACKSLASH_G:
    case URX_BACKSLASH_X:
    case URX_END:
    case URX_DOLLAR_M:
    case URX_CARET_M:
        // Nothing of interest in the operand field for these.
        break;

    case URX_RESERVED_OP:
    case URX_START_CAPTURE:
    case URX_END_CAPTURE:
    case URX_STATE_SAVE:
    case URX_JMP:
    case URX_JMP_SAV:
    case URX_JMP_SAV_X:
    case URX_BACKSLASH_B:
    case URX_BACKSLASH_BU:
    case URX_BACKSLASH_D:
    case URX_BACKSLASH_Z:
    case URX_STRING_LEN:
    case URX_CTR_INIT:
    case URX_CTR_INIT_NG:
    case URX_CTR_LOOP:
    case URX_CTR_LOOP_NG:
    case URX_RELOC_OPRND:
    case URX_STO_SP:
    case URX_LD_SP:
    case URX_BACKREF:
    case URX_STO_INP_LOC:
    case URX_JMPX:
    case URX_LA_START:
    case URX_LA_END:
    case URX_BACKREF_I:
    case URX_LB_START:
    case URX_LB_CONT:
    case URX_LB_END:
    case URX_LBN_CONT:
    case URX_LBN_END:
    case URX_LOOP_C:
    case URX_LOOP_DOT_I:
        // The operand is shown as a plain integer.
        REGEX_DUMP_DEBUG_PRINTF(("%d", val));
        break;

    case URX_ONECHAR:
    case URX_ONECHAR_I:
        // Operand values of 256 and above are shown as '?'.
        REGEX_DUMP_DEBUG_PRINTF(("%c", val<256?val:'?'));
        break;

    case URX_STRING:
    case URX_STRING_I:
        {
            // The operand is a start index into fLiteralText; the length
            // comes from the URX_STRING_LEN op that follows this one.
            const int32_t lengthOp = fCompiledPat->elementAti(index+1);
            U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN);
            const int32_t limit = val + URX_VAL(lengthOp);
            for (int32_t textIdx = val; textIdx < limit; textIdx++) {
                UChar c = fLiteralText[textIdx];
                if (c < 32 || c >= 256) {c = '.';}
                REGEX_DUMP_DEBUG_PRINTF(("%c", c));
            }
        }
        break;

    case URX_SETREF:
    case URX_LOOP_SR_I:
        {
            // The operand indexes this pattern's own vector of sets.
            UnicodeString setText;
            UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val);
            set->toPattern(setText, TRUE);
            for (int32_t k = 0; k < setText.length(); k++) {
                REGEX_DUMP_DEBUG_PRINTF(("%c", setText.charAt(k)));
            }
        }
        break;

    case URX_STATIC_SETREF:
    case URX_STAT_SETREF_N:
        {
            // The operand indexes fStaticSets; URX_NEG_SET is reported as
            // "NOT " and cleared before the lookup.
            UnicodeString setText;
            if (val & URX_NEG_SET) {
                REGEX_DUMP_DEBUG_PRINTF(("NOT "));
                val &= ~URX_NEG_SET;
            }
            UnicodeSet *set = fStaticSets[val];
            set->toPattern(setText, TRUE);
            for (int32_t k = 0; k < setText.length(); k++) {
                REGEX_DUMP_DEBUG_PRINTF(("%c", setText.charAt(k)));
            }
        }
        break;

    default:
        REGEX_DUMP_DEBUG_PRINTF(("??????"));
        break;
    }
    REGEX_DUMP_DEBUG_PRINTF(("\n"));
}
#endif
b75a7d8f
A
568
569
#if defined(REGEX_DEBUG)
// Print a description of a compiled pattern: the original pattern text, the
// minimum match length, the start-of-match information, and then every
// compiled operation via dumpOp().
U_CAPI void U_EXPORT2
RegexPatternDump(const RegexPattern *This) {
    int pos;

    REGEX_DUMP_DEBUG_PRINTF(("Original Pattern: "));
    for (pos = 0; pos < This->fPattern.length(); pos++) {
        REGEX_DUMP_DEBUG_PRINTF(("%c", This->fPattern.charAt(pos)));
    }
    REGEX_DUMP_DEBUG_PRINTF(("\n"));
    REGEX_DUMP_DEBUG_PRINTF((" Min Match Length: %d\n", This->fMinMatchLen));
    REGEX_DUMP_DEBUG_PRINTF((" Match Start Type: %s\n", START_OF_MATCH_STR(This->fStartType)));

    if (This->fStartType == START_STRING) {
        // The initial string is a slice of the literal text.
        const int32_t sliceLimit = This->fInitialStringIdx + This->fInitialStringLen;
        REGEX_DUMP_DEBUG_PRINTF((" Initial match sting: \""));
        for (pos = This->fInitialStringIdx; pos < sliceLimit; pos++) {
            REGEX_DUMP_DEBUG_PRINTF(("%c", This->fLiteralText[pos])); // TODO: non-printables, surrogates.
        }

    } else if (This->fStartType == START_SET) {
        // Show at most the first 20 members of the initial character set.
        const int32_t setSize = This->fInitialChars->size();
        const int32_t numSetChars = (setSize > 20) ? 20 : setSize;
        REGEX_DUMP_DEBUG_PRINTF((" Match First Chars : "));
        for (pos = 0; pos < numSetChars; pos++) {
            UChar32 c = This->fInitialChars->charAt(pos);
            if (0x20<c && c <0x7e) {
                REGEX_DUMP_DEBUG_PRINTF(("%c ", c));
            } else {
                REGEX_DUMP_DEBUG_PRINTF(("%#x ", c));
            }
        }
        if (numSetChars < This->fInitialChars->size()) {
            REGEX_DUMP_DEBUG_PRINTF((" ..."));
        }
        REGEX_DUMP_DEBUG_PRINTF(("\n"));

    } else if (This->fStartType == START_CHAR) {
        REGEX_DUMP_DEBUG_PRINTF((" First char of Match : "));
        if (0x20 < This->fInitialChar && This->fInitialChar<0x7e) {
            REGEX_DUMP_DEBUG_PRINTF(("%c\n", This->fInitialChar));
        } else {
            REGEX_DUMP_DEBUG_PRINTF(("%#x\n", This->fInitialChar));
        }
    }

    // One line per compiled operation.
    REGEX_DUMP_DEBUG_PRINTF(("\nIndex Binary Type Operand\n" \
        "-------------------------------------------\n"));
    for (int opIndex = 0; opIndex < This->fCompiledPat->size(); opIndex++) {
        This->dumpOp(opIndex);
    }
    REGEX_DUMP_DEBUG_PRINTF(("\n\n"));
}
#endif
b75a7d8f
A
625
626
627
374ca955 628UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexPattern)
b75a7d8f
A
629
630U_NAMESPACE_END
631#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS