]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/repattrn.cpp
ICU-8.11.4.tar.gz
[apple/icu.git] / icuSources / i18n / repattrn.cpp
CommitLineData
b75a7d8f
A
1//
2// file: repattrn.cpp
3//
4/*
5***************************************************************************
73c04bcf 6* Copyright (C) 2002-2006 International Business Machines Corporation *
b75a7d8f
A
7* and others. All rights reserved. *
8***************************************************************************
9*/
10
11#include "unicode/utypes.h"
12
13#if !UCONFIG_NO_REGULAR_EXPRESSIONS
14
15#include "unicode/regex.h"
374ca955 16#include "unicode/uclean.h"
b75a7d8f
A
17#include "uassert.h"
18#include "uvector.h"
19#include "uvectr32.h"
20#include "regexcmp.h"
21#include "regeximp.h"
22#include "regexst.h"
23
24U_NAMESPACE_BEGIN
25
26//--------------------------------------------------------------------------
27//
28// RegexPattern Default Constructor
29//
30//--------------------------------------------------------------------------
31RegexPattern::RegexPattern() {
374ca955
A
32 UErrorCode status = U_ZERO_ERROR;
33 u_init(&status);
b75a7d8f
A
34 // Init all of this instances data.
35 init();
36
37 // Lazy init of all shared global sets.
38 RegexStaticSets::initGlobals(&fDeferredStatus);
73c04bcf 39}
b75a7d8f
A
40
41
42//--------------------------------------------------------------------------
43//
44// Copy Constructor Note: This is a rather inefficient implementation,
45// but it probably doesn't matter.
46//
47//--------------------------------------------------------------------------
48RegexPattern::RegexPattern(const RegexPattern &other) : UObject(other) {
49 init();
50 *this = other;
51}
52
53
54
55//--------------------------------------------------------------------------
56//
57// Assignmenet Operator
58//
59//--------------------------------------------------------------------------
60RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
61 if (this == &other) {
62 // Source and destination are the same. Don't do anything.
63 return *this;
64 }
65
66 // Clean out any previous contents of object being assigned to.
67 zap();
68
69 // Give target object a default initialization
70 init();
71
72 // Copy simple fields
73 fPattern = other.fPattern;
74 fFlags = other.fFlags;
75 fLiteralText = other.fLiteralText;
76 fDeferredStatus = other.fDeferredStatus;
77 fMinMatchLen = other.fMinMatchLen;
374ca955
A
78 fFrameSize = other.fFrameSize;
79 fDataSize = other.fDataSize;
b75a7d8f
A
80 fMaxCaptureDigits = other.fMaxCaptureDigits;
81 fStaticSets = other.fStaticSets;
374ca955 82 fStaticSets8 = other.fStaticSets8;
b75a7d8f
A
83
84 fStartType = other.fStartType;
85 fInitialStringIdx = other.fInitialStringIdx;
86 fInitialStringLen = other.fInitialStringLen;
87 *fInitialChars = *other.fInitialChars;
b75a7d8f 88 fInitialChar = other.fInitialChar;
374ca955 89 *fInitialChars8 = *other.fInitialChars8;
b75a7d8f
A
90
91 // Copy the pattern. It's just values, nothing deep to copy.
92 fCompiledPat->assign(*other.fCompiledPat, fDeferredStatus);
93 fGroupMap->assign(*other.fGroupMap, fDeferredStatus);
94
95 // Copy the Unicode Sets.
96 // Could be made more efficient if the sets were reference counted and shared,
97 // but I doubt that pattern copying will be particularly common.
98 // Note: init() already added an empty element zero to fSets
99 int32_t i;
100 int32_t numSets = other.fSets->size();
101 fSets8 = new Regex8BitSet[numSets];
102 for (i=1; i<numSets; i++) {
103 if (U_FAILURE(fDeferredStatus)) {
104 return *this;
105 }
106 UnicodeSet *sourceSet = (UnicodeSet *)other.fSets->elementAt(i);
107 UnicodeSet *newSet = new UnicodeSet(*sourceSet);
108 if (newSet == NULL) {
109 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
110 break;
111 }
112 fSets->addElement(newSet, fDeferredStatus);
113 fSets8[i] = other.fSets8[i];
114 }
115
116 return *this;
117}
118
119
120//--------------------------------------------------------------------------
121//
122// init Shared initialization for use by constructors.
123// Bring an uninitialized RegexPattern up to a default state.
124//
125//--------------------------------------------------------------------------
126void RegexPattern::init() {
374ca955 127 fPattern.remove();
b75a7d8f 128 fFlags = 0;
374ca955
A
129 fCompiledPat = 0;
130 fLiteralText.remove();
131 fSets = NULL;
132 fSets8 = NULL;
b75a7d8f
A
133 fDeferredStatus = U_ZERO_ERROR;
134 fMinMatchLen = 0;
b75a7d8f
A
135 fFrameSize = 0;
136 fDataSize = 0;
374ca955
A
137 fGroupMap = NULL;
138 fMaxCaptureDigits = 1;
139 fStaticSets = NULL;
140 fStaticSets8 = NULL;
b75a7d8f
A
141 fStartType = START_NO_INFO;
142 fInitialStringIdx = 0;
143 fInitialStringLen = 0;
144 fInitialChars = NULL;
b75a7d8f 145 fInitialChar = 0;
374ca955 146 fInitialChars8 = NULL;
b75a7d8f
A
147
148 fCompiledPat = new UVector32(fDeferredStatus);
149 fGroupMap = new UVector32(fDeferredStatus);
150 fSets = new UVector(fDeferredStatus);
151 fInitialChars = new UnicodeSet;
152 fInitialChars8 = new Regex8BitSet;
153 if (U_FAILURE(fDeferredStatus)) {
154 return;
155 }
156 if (fCompiledPat == NULL || fGroupMap == NULL || fSets == NULL ||
157 fInitialChars == NULL || fInitialChars8 == NULL) {
158 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
159 return;
160 }
161
162 // Slot zero of the vector of sets is reserved. Fill it here.
163 fSets->addElement((int32_t)0, fDeferredStatus);
164}
165
166
167//--------------------------------------------------------------------------
168//
169// zap Delete everything owned by this RegexPattern.
170//
171//--------------------------------------------------------------------------
172void RegexPattern::zap() {
173 delete fCompiledPat;
174 fCompiledPat = NULL;
175 int i;
176 for (i=1; i<fSets->size(); i++) {
177 UnicodeSet *s;
178 s = (UnicodeSet *)fSets->elementAt(i);
179 if (s != NULL) {
180 delete s;
181 }
182 }
183 delete fSets;
184 fSets = NULL;
374ca955
A
185 delete[] fSets8;
186 fSets8 = NULL;
b75a7d8f
A
187 delete fGroupMap;
188 fGroupMap = NULL;
189 delete fInitialChars;
190 fInitialChars = NULL;
191 delete fInitialChars8;
192 fInitialChars8 = NULL;
b75a7d8f
A
193}
194
195
196//--------------------------------------------------------------------------
197//
198// Destructor
199//
200//--------------------------------------------------------------------------
201RegexPattern::~RegexPattern() {
202 zap();
73c04bcf 203}
b75a7d8f
A
204
205
206//--------------------------------------------------------------------------
207//
208// Clone
209//
210//--------------------------------------------------------------------------
211RegexPattern *RegexPattern::clone() const {
212 RegexPattern *copy = new RegexPattern(*this);
213 return copy;
73c04bcf 214}
b75a7d8f
A
215
216
217//--------------------------------------------------------------------------
218//
219// operator == (comparison) Consider to patterns to be == if the
220// pattern strings and the flags are the same.
221//
222//--------------------------------------------------------------------------
223UBool RegexPattern::operator ==(const RegexPattern &other) const {
224 UBool r = this->fFlags == other.fFlags &&
225 this->fPattern == other.fPattern &&
226 this->fDeferredStatus == other.fDeferredStatus;
227 return r;
228}
229
230//---------------------------------------------------------------------
231//
232// compile
233//
234//---------------------------------------------------------------------
374ca955
A
235RegexPattern * U_EXPORT2
236RegexPattern::compile(const UnicodeString &regex,
237 uint32_t flags,
238 UParseError &pe,
239 UErrorCode &status)
240{
b75a7d8f
A
241
242 if (U_FAILURE(status)) {
243 return NULL;
244 }
245
246 const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS |
374ca955 247 UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD;
b75a7d8f
A
248
249 if ((flags & ~allFlags) != 0) {
250 status = U_REGEX_INVALID_FLAG;
251 return NULL;
252 }
253
254 if ((flags & UREGEX_CANON_EQ) != 0) {
255 status = U_REGEX_UNIMPLEMENTED;
256 return NULL;
257 }
258
259 RegexPattern *This = new RegexPattern;
260 if (This == NULL) {
261 status = U_MEMORY_ALLOCATION_ERROR;
262 return NULL;
263 }
264 if (U_FAILURE(This->fDeferredStatus)) {
265 status = This->fDeferredStatus;
266 return NULL;
267 }
268 This->fFlags = flags;
269
270 RegexCompile compiler(This, status);
271 compiler.compile(regex, pe, status);
272
273 return This;
73c04bcf 274}
b75a7d8f
A
275
276//
277// compile with default flags.
278//
374ca955
A
279RegexPattern * U_EXPORT2
280RegexPattern::compile(const UnicodeString &regex,
281 UParseError &pe,
282 UErrorCode &err)
b75a7d8f
A
283{
284 return compile(regex, 0, pe, err);
285}
286
287
288
289//
290// compile with no UParseErr parameter.
291//
374ca955
A
292RegexPattern * U_EXPORT2
293RegexPattern::compile( const UnicodeString &regex,
b75a7d8f
A
294 uint32_t flags,
295 UErrorCode &err)
296{
297 UParseError pe;
298 return compile(regex, flags, pe, err);
299}
300
301
302
303//---------------------------------------------------------------------
304//
305// flags
306//
307//---------------------------------------------------------------------
308uint32_t RegexPattern::flags() const {
309 return fFlags;
310}
311
312
313//---------------------------------------------------------------------
314//
315// matcher(UnicodeString, err)
316//
317//---------------------------------------------------------------------
318RegexMatcher *RegexPattern::matcher(const UnicodeString &input,
319 UErrorCode &status) const {
320 RegexMatcher *retMatcher = matcher(status);
321 if (retMatcher != NULL) {
322 retMatcher->reset(input);
323 }
324 return retMatcher;
73c04bcf 325}
b75a7d8f 326
#if 0
// Disabled overload taking a UChar pointer.  It is not meant to be called;
// the UnicodeString API should be used instead.  If reached, it reports
// U_UNSUPPORTED_ERROR (unless status already holds a failure) and
// returns NULL.
RegexMatcher *RegexPattern::matcher(const UChar * /*input*/,
                                    UErrorCode          &status)  const
{
    if (U_SUCCESS(status)) {
        status = U_UNSUPPORTED_ERROR;
    }
    RegexMatcher *noMatcher = NULL;
    return noMatcher;
}
#endif
b75a7d8f
A
338
339//---------------------------------------------------------------------
340//
341// matcher(status)
342//
343//---------------------------------------------------------------------
344RegexMatcher *RegexPattern::matcher(UErrorCode &status) const {
345 RegexMatcher *retMatcher = NULL;
346
347 if (U_FAILURE(status)) {
348 return NULL;
349 }
350 if (U_FAILURE(fDeferredStatus)) {
351 status = fDeferredStatus;
352 return NULL;
353 }
354
355 retMatcher = new RegexMatcher(this);
356 if (retMatcher == NULL) {
357 status = U_MEMORY_ALLOCATION_ERROR;
358 return NULL;
359 }
360 return retMatcher;
73c04bcf 361}
b75a7d8f
A
362
363
364
365//---------------------------------------------------------------------
366//
367// matches Convenience function to test for a match, starting
368// with a pattern string and a data string.
369//
370//---------------------------------------------------------------------
374ca955 371UBool U_EXPORT2 RegexPattern::matches(const UnicodeString &regex,
b75a7d8f
A
372 const UnicodeString &input,
373 UParseError &pe,
374 UErrorCode &status) {
375
376 if (U_FAILURE(status)) {return FALSE;}
377
378 UBool retVal;
379 RegexPattern *pat = NULL;
380 RegexMatcher *matcher = NULL;
381
382 pat = RegexPattern::compile(regex, 0, pe, status);
383 matcher = pat->matcher(input, status);
384 retVal = matcher->matches(status);
385
386 delete matcher;
387 delete pat;
388 return retVal;
389}
390
391
392
393
394//---------------------------------------------------------------------
395//
396// pattern
397//
398//---------------------------------------------------------------------
399UnicodeString RegexPattern::pattern() const {
400 return fPattern;
401}
402
403
404
405
406//---------------------------------------------------------------------
407//
408// split
409//
410//---------------------------------------------------------------------
411int32_t RegexPattern::split(const UnicodeString &input,
412 UnicodeString dest[],
413 int32_t destCapacity,
414 UErrorCode &status) const
415{
416 if (U_FAILURE(status)) {
417 return 0;
418 };
419
420 RegexMatcher m(this);
421 int32_t r = m.split(input, dest, destCapacity, status);
422 return r;
423}
424
425
426
//---------------------------------------------------------------------
//
//   dumpOp      Print one operation of the compiled pattern: its index,
//               its raw value in hex, the opcode name, and the operand
//               in a form that depends on the opcode type.
//               Debugging function only.
//
//---------------------------------------------------------------------
#if defined(REGEX_DEBUG)
void RegexPattern::dumpOp(int32_t index) const {
    static const char * const opNames[] = {URX_OPCODE_NAMES};

    int32_t op      = fCompiledPat->elementAti(index);
    int32_t val     = URX_VAL(op);
    int32_t type    = URX_TYPE(op);

    // Types without an entry in the name table are shown with name zero.
    int32_t nameIdx = type;
    if (nameIdx >= sizeof(opNames)/sizeof(char *)) {
        nameIdx = 0;
    }

    REGEX_DUMP_DEBUG_PRINTF(("%4d %08x %-15s ", index, op, opNames[nameIdx]));

    switch (type) {
    // Types with no operand field of interest.
    case URX_NOP:
    case URX_DOTANY:
    case URX_DOTANY_ALL:
    case URX_DOTANY_PL:
    case URX_DOTANY_ALL_PL:
    case URX_FAIL:
    case URX_CARET:
    case URX_DOLLAR:
    case URX_BACKSLASH_G:
    case URX_BACKSLASH_X:
    case URX_END:
    case URX_DOLLAR_M:
    case URX_CARET_M:
        break;

    // Types with an integer operand field.
    case URX_RESERVED_OP:
    case URX_START_CAPTURE:
    case URX_END_CAPTURE:
    case URX_STATE_SAVE:
    case URX_JMP:
    case URX_JMP_SAV:
    case URX_JMP_SAV_X:
    case URX_BACKSLASH_B:
    case URX_BACKSLASH_BU:
    case URX_BACKSLASH_D:
    case URX_BACKSLASH_Z:
    case URX_STRING_LEN:
    case URX_CTR_INIT:
    case URX_CTR_INIT_NG:
    case URX_CTR_LOOP:
    case URX_CTR_LOOP_NG:
    case URX_RELOC_OPRND:
    case URX_STO_SP:
    case URX_LD_SP:
    case URX_BACKREF:
    case URX_STO_INP_LOC:
    case URX_JMPX:
    case URX_LA_START:
    case URX_LA_END:
    case URX_BACKREF_I:
    case URX_LB_START:
    case URX_LB_CONT:
    case URX_LB_END:
    case URX_LBN_CONT:
    case URX_LBN_END:
    case URX_LOOP_C:
    case URX_LOOP_DOT_I:
        REGEX_DUMP_DEBUG_PRINTF(("%d", val));
        break;

    // Single character operand; values of 256 and up are shown as '?'.
    case URX_ONECHAR:
    case URX_ONECHAR_I:
        {
            int32_t shown = '?';
            if (val < 256) {
                shown = val;
            }
            REGEX_DUMP_DEBUG_PRINTF(("%c", shown));
        }
        break;

    // Literal string: the operand is a start index into fLiteralText and
    // the op that follows carries the length.
    case URX_STRING:
    case URX_STRING_I:
        {
            int32_t lengthOp       = fCompiledPat->elementAti(index+1);
            U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN);
            int32_t length = URX_VAL(lengthOp);
            const int32_t limit = val+length;
            int32_t pos = val;
            while (pos < limit) {
                UChar c = fLiteralText[pos];
                if (c < 32 || c >= 256) {c = '.';}
                REGEX_DUMP_DEBUG_PRINTF(("%c", c));
                pos++;
            }
        }
        break;

    // Reference to one of this pattern's own sets; print its pattern form.
    case URX_SETREF:
    case URX_LOOP_SR_I:
        {
            UnicodeString setText;
            UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val);
            set->toPattern(setText, TRUE);
            for (int32_t k=0; k<setText.length(); k++) {
                REGEX_DUMP_DEBUG_PRINTF(("%c", setText.charAt(k)));
            }
        }
        break;

    // Reference to a static set, optionally flagged as negated.
    case URX_STATIC_SETREF:
    case URX_STAT_SETREF_N:
        {
            UnicodeString setText;
            if (val & URX_NEG_SET) {
                REGEX_DUMP_DEBUG_PRINTF(("NOT "));
                val &= ~URX_NEG_SET;
            }
            UnicodeSet *set = fStaticSets[val];
            set->toPattern(setText, TRUE);
            for (int32_t k=0; k<setText.length(); k++) {
                REGEX_DUMP_DEBUG_PRINTF(("%c", setText.charAt(k)));
            }
        }
        break;


    default:
        REGEX_DUMP_DEBUG_PRINTF(("??????"));
        break;
    }
    REGEX_DUMP_DEBUG_PRINTF(("\n"));
}
#endif
b75a7d8f
A
554
555
#if defined(REGEX_DEBUG)
//
//   RegexPatternDump    Print a description of a compiled pattern: the
//                       original pattern text, the minimum match length,
//                       the start-of-match information, and then every
//                       compiled operation (via dumpOp).
//                       Debugging function only.
//
U_CAPI void U_EXPORT2
RegexPatternDump(const RegexPattern *This) {
    int      index;
    int      i;

    REGEX_DUMP_DEBUG_PRINTF(("Original Pattern: "));
    for (i=0; i<This->fPattern.length(); i++) {
        REGEX_DUMP_DEBUG_PRINTF(("%c", This->fPattern.charAt(i)));
    }
    REGEX_DUMP_DEBUG_PRINTF(("\n"));
    REGEX_DUMP_DEBUG_PRINTF((" Min Match Length: %d\n", This->fMinMatchLen));
    REGEX_DUMP_DEBUG_PRINTF((" Match Start Type: %s\n", START_OF_MATCH_STR(This->fStartType)));
    if (This->fStartType == START_STRING) {
        REGEX_DUMP_DEBUG_PRINTF((" Initial match string: \""));
        for (i=This->fInitialStringIdx; i<This->fInitialStringIdx+This->fInitialStringLen; i++) {
            REGEX_DUMP_DEBUG_PRINTF(("%c", This->fLiteralText[i]));   // TODO:  non-printables, surrogates.
        }
        // Close the quote opened above and finish the line.
        REGEX_DUMP_DEBUG_PRINTF(("\"\n"));

    } else if (This->fStartType == START_SET) {
        // Show at most the first 20 characters of the set.
        int32_t numSetChars = This->fInitialChars->size();
        if (numSetChars > 20) {
            numSetChars = 20;
        }
        REGEX_DUMP_DEBUG_PRINTF((" Match First Chars : "));
        for (i=0; i<numSetChars; i++) {
            UChar32 c = This->fInitialChars->charAt(i);
            if (0x20<c && c <0x7e) {
                REGEX_DUMP_DEBUG_PRINTF(("%c ", c));
            } else {
                REGEX_DUMP_DEBUG_PRINTF(("%#x ", c));
            }
        }
        if (numSetChars < This->fInitialChars->size()) {
            REGEX_DUMP_DEBUG_PRINTF((" ..."));
        }
        REGEX_DUMP_DEBUG_PRINTF(("\n"));

    } else if (This->fStartType == START_CHAR) {
        REGEX_DUMP_DEBUG_PRINTF((" First char of Match : "));
        if (0x20 < This->fInitialChar && This->fInitialChar<0x7e) {
                REGEX_DUMP_DEBUG_PRINTF(("%c\n", This->fInitialChar));
            } else {
                REGEX_DUMP_DEBUG_PRINTF(("%#x\n", This->fInitialChar));
            }
    }

    REGEX_DUMP_DEBUG_PRINTF(("\nIndex Binary Type Operand\n" \
           "-------------------------------------------\n"));
    for (index = 0; index<This->fCompiledPat->size(); index++) {
        This->dumpOp(index);
    }
    REGEX_DUMP_DEBUG_PRINTF(("\n\n"));
}
#endif
b75a7d8f
A
611
612
613
374ca955 614UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexPattern)
b75a7d8f
A
615
616U_NAMESPACE_END
617#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS