]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/repattrn.cpp
ICU-6.2.9.tar.gz
[apple/icu.git] / icuSources / i18n / repattrn.cpp
CommitLineData
b75a7d8f
A
1//
2// file: repattrn.cpp
3//
4/*
5***************************************************************************
374ca955 6* Copyright (C) 2002-2004 International Business Machines Corporation *
b75a7d8f
A
7* and others. All rights reserved. *
8***************************************************************************
9*/
10
11#include "unicode/utypes.h"
12
13#if !UCONFIG_NO_REGULAR_EXPRESSIONS
14
15#include "unicode/regex.h"
374ca955 16#include "unicode/uclean.h"
b75a7d8f
A
17#include "uassert.h"
18#include "uvector.h"
19#include "uvectr32.h"
20#include "regexcmp.h"
21#include "regeximp.h"
22#include "regexst.h"
23
24U_NAMESPACE_BEGIN
25
26//--------------------------------------------------------------------------
27//
28// RegexPattern Default Constructor
29//
30//--------------------------------------------------------------------------
31RegexPattern::RegexPattern() {
374ca955
A
32 UErrorCode status = U_ZERO_ERROR;
33 u_init(&status);
b75a7d8f
A
34 // Init all of this instances data.
35 init();
36
37 // Lazy init of all shared global sets.
38 RegexStaticSets::initGlobals(&fDeferredStatus);
39};
40
41
42//--------------------------------------------------------------------------
43//
44// Copy Constructor Note: This is a rather inefficient implementation,
45// but it probably doesn't matter.
46//
47//--------------------------------------------------------------------------
48RegexPattern::RegexPattern(const RegexPattern &other) : UObject(other) {
49 init();
50 *this = other;
51}
52
53
54
55//--------------------------------------------------------------------------
56//
57// Assignmenet Operator
58//
59//--------------------------------------------------------------------------
60RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
61 if (this == &other) {
62 // Source and destination are the same. Don't do anything.
63 return *this;
64 }
65
66 // Clean out any previous contents of object being assigned to.
67 zap();
68
69 // Give target object a default initialization
70 init();
71
72 // Copy simple fields
73 fPattern = other.fPattern;
74 fFlags = other.fFlags;
75 fLiteralText = other.fLiteralText;
76 fDeferredStatus = other.fDeferredStatus;
77 fMinMatchLen = other.fMinMatchLen;
374ca955
A
78 fFrameSize = other.fFrameSize;
79 fDataSize = other.fDataSize;
b75a7d8f
A
80 fMaxCaptureDigits = other.fMaxCaptureDigits;
81 fStaticSets = other.fStaticSets;
374ca955 82 fStaticSets8 = other.fStaticSets8;
b75a7d8f
A
83
84 fStartType = other.fStartType;
85 fInitialStringIdx = other.fInitialStringIdx;
86 fInitialStringLen = other.fInitialStringLen;
87 *fInitialChars = *other.fInitialChars;
b75a7d8f 88 fInitialChar = other.fInitialChar;
374ca955 89 *fInitialChars8 = *other.fInitialChars8;
b75a7d8f
A
90
91 // Copy the pattern. It's just values, nothing deep to copy.
92 fCompiledPat->assign(*other.fCompiledPat, fDeferredStatus);
93 fGroupMap->assign(*other.fGroupMap, fDeferredStatus);
94
95 // Copy the Unicode Sets.
96 // Could be made more efficient if the sets were reference counted and shared,
97 // but I doubt that pattern copying will be particularly common.
98 // Note: init() already added an empty element zero to fSets
99 int32_t i;
100 int32_t numSets = other.fSets->size();
101 fSets8 = new Regex8BitSet[numSets];
102 for (i=1; i<numSets; i++) {
103 if (U_FAILURE(fDeferredStatus)) {
104 return *this;
105 }
106 UnicodeSet *sourceSet = (UnicodeSet *)other.fSets->elementAt(i);
107 UnicodeSet *newSet = new UnicodeSet(*sourceSet);
108 if (newSet == NULL) {
109 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
110 break;
111 }
112 fSets->addElement(newSet, fDeferredStatus);
113 fSets8[i] = other.fSets8[i];
114 }
115
116 return *this;
117}
118
119
120//--------------------------------------------------------------------------
121//
122// init Shared initialization for use by constructors.
123// Bring an uninitialized RegexPattern up to a default state.
124//
125//--------------------------------------------------------------------------
126void RegexPattern::init() {
374ca955 127 fPattern.remove();
b75a7d8f 128 fFlags = 0;
374ca955
A
129 fCompiledPat = 0;
130 fLiteralText.remove();
131 fSets = NULL;
132 fSets8 = NULL;
b75a7d8f
A
133 fDeferredStatus = U_ZERO_ERROR;
134 fMinMatchLen = 0;
b75a7d8f
A
135 fFrameSize = 0;
136 fDataSize = 0;
374ca955
A
137 fGroupMap = NULL;
138 fMaxCaptureDigits = 1;
139 fStaticSets = NULL;
140 fStaticSets8 = NULL;
b75a7d8f
A
141 fStartType = START_NO_INFO;
142 fInitialStringIdx = 0;
143 fInitialStringLen = 0;
144 fInitialChars = NULL;
b75a7d8f 145 fInitialChar = 0;
374ca955 146 fInitialChars8 = NULL;
b75a7d8f
A
147
148 fCompiledPat = new UVector32(fDeferredStatus);
149 fGroupMap = new UVector32(fDeferredStatus);
150 fSets = new UVector(fDeferredStatus);
151 fInitialChars = new UnicodeSet;
152 fInitialChars8 = new Regex8BitSet;
153 if (U_FAILURE(fDeferredStatus)) {
154 return;
155 }
156 if (fCompiledPat == NULL || fGroupMap == NULL || fSets == NULL ||
157 fInitialChars == NULL || fInitialChars8 == NULL) {
158 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
159 return;
160 }
161
162 // Slot zero of the vector of sets is reserved. Fill it here.
163 fSets->addElement((int32_t)0, fDeferredStatus);
164}
165
166
167//--------------------------------------------------------------------------
168//
169// zap Delete everything owned by this RegexPattern.
170//
171//--------------------------------------------------------------------------
172void RegexPattern::zap() {
173 delete fCompiledPat;
174 fCompiledPat = NULL;
175 int i;
176 for (i=1; i<fSets->size(); i++) {
177 UnicodeSet *s;
178 s = (UnicodeSet *)fSets->elementAt(i);
179 if (s != NULL) {
180 delete s;
181 }
182 }
183 delete fSets;
184 fSets = NULL;
374ca955
A
185 delete[] fSets8;
186 fSets8 = NULL;
b75a7d8f
A
187 delete fGroupMap;
188 fGroupMap = NULL;
189 delete fInitialChars;
190 fInitialChars = NULL;
191 delete fInitialChars8;
192 fInitialChars8 = NULL;
b75a7d8f
A
193}
194
195
196//--------------------------------------------------------------------------
197//
198// Destructor
199//
200//--------------------------------------------------------------------------
201RegexPattern::~RegexPattern() {
202 zap();
203};
204
205
206//--------------------------------------------------------------------------
207//
208// Clone
209//
210//--------------------------------------------------------------------------
211RegexPattern *RegexPattern::clone() const {
212 RegexPattern *copy = new RegexPattern(*this);
213 return copy;
214};
215
216
217//--------------------------------------------------------------------------
218//
219// operator == (comparison) Consider to patterns to be == if the
220// pattern strings and the flags are the same.
221//
222//--------------------------------------------------------------------------
223UBool RegexPattern::operator ==(const RegexPattern &other) const {
224 UBool r = this->fFlags == other.fFlags &&
225 this->fPattern == other.fPattern &&
226 this->fDeferredStatus == other.fDeferredStatus;
227 return r;
228}
229
230//---------------------------------------------------------------------
231//
232// compile
233//
234//---------------------------------------------------------------------
374ca955
A
235RegexPattern * U_EXPORT2
236RegexPattern::compile(const UnicodeString &regex,
237 uint32_t flags,
238 UParseError &pe,
239 UErrorCode &status)
240{
b75a7d8f
A
241
242 if (U_FAILURE(status)) {
243 return NULL;
244 }
245
246 const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS |
374ca955 247 UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD;
b75a7d8f
A
248
249 if ((flags & ~allFlags) != 0) {
250 status = U_REGEX_INVALID_FLAG;
251 return NULL;
252 }
253
254 if ((flags & UREGEX_CANON_EQ) != 0) {
255 status = U_REGEX_UNIMPLEMENTED;
256 return NULL;
257 }
258
259 RegexPattern *This = new RegexPattern;
260 if (This == NULL) {
261 status = U_MEMORY_ALLOCATION_ERROR;
262 return NULL;
263 }
264 if (U_FAILURE(This->fDeferredStatus)) {
265 status = This->fDeferredStatus;
266 return NULL;
267 }
268 This->fFlags = flags;
269
270 RegexCompile compiler(This, status);
271 compiler.compile(regex, pe, status);
272
273 return This;
274};
275
276//
277// compile with default flags.
278//
374ca955
A
279RegexPattern * U_EXPORT2
280RegexPattern::compile(const UnicodeString &regex,
281 UParseError &pe,
282 UErrorCode &err)
b75a7d8f
A
283{
284 return compile(regex, 0, pe, err);
285}
286
287
288
289//
290// compile with no UParseErr parameter.
291//
374ca955
A
292RegexPattern * U_EXPORT2
293RegexPattern::compile( const UnicodeString &regex,
b75a7d8f
A
294 uint32_t flags,
295 UErrorCode &err)
296{
297 UParseError pe;
298 return compile(regex, flags, pe, err);
299}
300
301
302
303//---------------------------------------------------------------------
304//
305// flags
306//
307//---------------------------------------------------------------------
308uint32_t RegexPattern::flags() const {
309 return fFlags;
310}
311
312
313//---------------------------------------------------------------------
314//
315// matcher(UnicodeString, err)
316//
317//---------------------------------------------------------------------
318RegexMatcher *RegexPattern::matcher(const UnicodeString &input,
319 UErrorCode &status) const {
320 RegexMatcher *retMatcher = matcher(status);
321 if (retMatcher != NULL) {
322 retMatcher->reset(input);
323 }
324 return retMatcher;
325};
326
374ca955
A
327RegexMatcher *RegexPattern::matcher(const UChar * /*input*/,
328 UErrorCode &status) const
329{
330 /* This should never get called. The API with UnicodeString should be called instead. */
331 if (U_SUCCESS(status)) {
332 status = U_UNSUPPORTED_ERROR;
333 }
334 return NULL;
335}
b75a7d8f
A
336
337
338//---------------------------------------------------------------------
339//
340// matcher(status)
341//
342//---------------------------------------------------------------------
343RegexMatcher *RegexPattern::matcher(UErrorCode &status) const {
344 RegexMatcher *retMatcher = NULL;
345
346 if (U_FAILURE(status)) {
347 return NULL;
348 }
349 if (U_FAILURE(fDeferredStatus)) {
350 status = fDeferredStatus;
351 return NULL;
352 }
353
354 retMatcher = new RegexMatcher(this);
355 if (retMatcher == NULL) {
356 status = U_MEMORY_ALLOCATION_ERROR;
357 return NULL;
358 }
359 return retMatcher;
360};
361
362
363
364//---------------------------------------------------------------------
365//
366// matches Convenience function to test for a match, starting
367// with a pattern string and a data string.
368//
369//---------------------------------------------------------------------
374ca955 370UBool U_EXPORT2 RegexPattern::matches(const UnicodeString &regex,
b75a7d8f
A
371 const UnicodeString &input,
372 UParseError &pe,
373 UErrorCode &status) {
374
375 if (U_FAILURE(status)) {return FALSE;}
376
377 UBool retVal;
378 RegexPattern *pat = NULL;
379 RegexMatcher *matcher = NULL;
380
381 pat = RegexPattern::compile(regex, 0, pe, status);
382 matcher = pat->matcher(input, status);
383 retVal = matcher->matches(status);
384
385 delete matcher;
386 delete pat;
387 return retVal;
388}
389
390
391
392
393//---------------------------------------------------------------------
394//
395// pattern
396//
397//---------------------------------------------------------------------
398UnicodeString RegexPattern::pattern() const {
399 return fPattern;
400}
401
402
403
404
405//---------------------------------------------------------------------
406//
407// split
408//
409//---------------------------------------------------------------------
410int32_t RegexPattern::split(const UnicodeString &input,
411 UnicodeString dest[],
412 int32_t destCapacity,
413 UErrorCode &status) const
414{
415 if (U_FAILURE(status)) {
416 return 0;
417 };
418
419 RegexMatcher m(this);
420 int32_t r = m.split(input, dest, destCapacity, status);
421 return r;
422}
423
424
425
//---------------------------------------------------------------------
//
//   dumpOp      Print one operation of the compiled pattern: its index,
//               the raw 32 bit value, the opcode name and, depending on
//               the opcode type, a rendering of the operand.
//               Debugging function only (compiled with REGEX_DEBUG).
//
//---------------------------------------------------------------------
#if defined(REGEX_DEBUG)
void   RegexPattern::dumpOp(int32_t index) const {
    static const char * const opNames[] = {URX_OPCODE_NAMES};

    const int32_t instruction = fCompiledPat->elementAti(index);
    int32_t       operand     = URX_VAL(instruction);
    const int32_t opType      = URX_TYPE(instruction);

    // Use name zero for any type that lies outside the table of names.
    int32_t nameIndex = opType;
    if (nameIndex >= sizeof(opNames)/sizeof(char *)) {
        nameIndex = 0;
    }

    REGEX_DUMP_DEBUG_PRINTF(("%4d %08x %-15s ", index, instruction, opNames[nameIndex]));

    switch (opType) {
    // Types with no operand field of interest.
    case URX_NOP:
    case URX_DOTANY:
    case URX_DOTANY_ALL:
    case URX_DOTANY_PL:
    case URX_DOTANY_ALL_PL:
    case URX_FAIL:
    case URX_CARET:
    case URX_DOLLAR:
    case URX_BACKSLASH_G:
    case URX_BACKSLASH_X:
    case URX_END:
    case URX_DOLLAR_M:
    case URX_CARET_M:
        break;

    // Types with an integer operand field.
    case URX_RESERVED_OP:
    case URX_START_CAPTURE:
    case URX_END_CAPTURE:
    case URX_STATE_SAVE:
    case URX_JMP:
    case URX_JMP_SAV:
    case URX_JMP_SAV_X:
    case URX_BACKSLASH_B:
    case URX_BACKSLASH_BU:
    case URX_BACKSLASH_D:
    case URX_BACKSLASH_Z:
    case URX_STRING_LEN:
    case URX_CTR_INIT:
    case URX_CTR_INIT_NG:
    case URX_CTR_LOOP:
    case URX_CTR_LOOP_NG:
    case URX_RELOC_OPRND:
    case URX_STO_SP:
    case URX_LD_SP:
    case URX_BACKREF:
    case URX_STO_INP_LOC:
    case URX_JMPX:
    case URX_LA_START:
    case URX_LA_END:
    case URX_BACKREF_I:
    case URX_LB_START:
    case URX_LB_CONT:
    case URX_LB_END:
    case URX_LBN_CONT:
    case URX_LBN_END:
    case URX_LOOP_C:
    case URX_LOOP_DOT_I:
        REGEX_DUMP_DEBUG_PRINTF(("%d", operand));
        break;

    // The operand is a single character; values from 256 up print as '?'.
    case URX_ONECHAR:
    case URX_ONECHAR_I:
        REGEX_DUMP_DEBUG_PRINTF(("%c", operand<256?operand:'?'));
        break;

    // The operand is an offset into fLiteralText; the length is carried by
    //   the URX_STRING_LEN operation that follows.
    case URX_STRING:
    case URX_STRING_I:
        {
            const int32_t lengthOp = fCompiledPat->elementAti(index+1);
            U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN);
            const int32_t textEnd = operand + URX_VAL(lengthOp);
            for (int32_t pos=operand; pos<textEnd; pos++) {
                UChar ch = fLiteralText[pos];
                if (ch < 32 || ch >= 256) {ch = '.';}
                REGEX_DUMP_DEBUG_PRINTF(("%c", ch));
            }
        }
        break;

    // The operand is an index into this pattern's own vector of sets.
    case URX_SETREF:
    case URX_LOOP_SR_I:
        {
            UnicodeString setText;
            UnicodeSet   *patternSet = (UnicodeSet *)fSets->elementAt(operand);
            patternSet->toPattern(setText, TRUE);
            for (int32_t pos=0; pos<setText.length(); pos++) {
                REGEX_DUMP_DEBUG_PRINTF(("%c", setText.charAt(pos)));
            }
        }
        break;

    // The operand is an index into the static sets, optionally carrying
    //   the URX_NEG_SET bit.
    case URX_STATIC_SETREF:
    case URX_STAT_SETREF_N:
        {
            UnicodeString setText;
            if (operand & URX_NEG_SET) {
                REGEX_DUMP_DEBUG_PRINTF(("NOT "));
                operand &= ~URX_NEG_SET;
            }
            UnicodeSet *staticSet = fStaticSets[operand];
            staticSet->toPattern(setText, TRUE);
            for (int32_t pos=0; pos<setText.length(); pos++) {
                REGEX_DUMP_DEBUG_PRINTF(("%c", setText.charAt(pos)));
            }
        }
        break;


    default:
        REGEX_DUMP_DEBUG_PRINTF(("??????"));
        break;
    }
    REGEX_DUMP_DEBUG_PRINTF(("\n"));
}
#endif
b75a7d8f
A
553
554
#if defined(REGEX_DEBUG)
//
//   RegexPatternDump    Print a description of a compiled pattern: the
//                       original pattern text, the minimum match length,
//                       the information recorded about how a match can
//                       start, and finally every compiled operation (one
//                       line each, via dumpOp()).
//                       Debugging function only (compiled with REGEX_DEBUG).
//
U_CAPI void U_EXPORT2
RegexPatternDump(const RegexPattern *This) {
    int32_t i;

    REGEX_DUMP_DEBUG_PRINTF(("Original Pattern: "));
    for (i=0; i<This->fPattern.length(); i++) {
        REGEX_DUMP_DEBUG_PRINTF(("%c", This->fPattern.charAt(i)));
    }
    REGEX_DUMP_DEBUG_PRINTF(("\n"));
    REGEX_DUMP_DEBUG_PRINTF((" Min Match Length: %d\n", This->fMinMatchLen));
    REGEX_DUMP_DEBUG_PRINTF((" Match Start Type: %s\n", START_OF_MATCH_STR(This->fStartType)));
    if (This->fStartType == START_STRING) {
        REGEX_DUMP_DEBUG_PRINTF((" Initial match string: \""));
        const int32_t stringEnd = This->fInitialStringIdx + This->fInitialStringLen;
        for (i=This->fInitialStringIdx; i<stringEnd; i++) {
            REGEX_DUMP_DEBUG_PRINTF(("%c", This->fLiteralText[i]));   // TODO: non-printables, surrogates.
        }
        // Close the quote and finish the line, so that the table header
        //   printed below starts on a line of its own.
        REGEX_DUMP_DEBUG_PRINTF(("\"\n"));

    } else if (This->fStartType == START_SET) {
        // Show at most the first 20 characters of the set.
        int32_t numSetChars = This->fInitialChars->size();
        if (numSetChars > 20) {
            numSetChars = 20;
        }
        REGEX_DUMP_DEBUG_PRINTF((" Match First Chars : "));
        for (i=0; i<numSetChars; i++) {
            UChar32 c = This->fInitialChars->charAt(i);
            if (0x20<c && c <0x7e) {
                REGEX_DUMP_DEBUG_PRINTF(("%c ", c));
            } else {
                REGEX_DUMP_DEBUG_PRINTF(("%#x ", c));
            }
        }
        if (numSetChars < This->fInitialChars->size()) {
            REGEX_DUMP_DEBUG_PRINTF((" ..."));
        }
        REGEX_DUMP_DEBUG_PRINTF(("\n"));

    } else if (This->fStartType == START_CHAR) {
        REGEX_DUMP_DEBUG_PRINTF((" First char of Match : "));
        if (0x20 < This->fInitialChar && This->fInitialChar<0x7e) {
                REGEX_DUMP_DEBUG_PRINTF(("%c\n", This->fInitialChar));
            } else {
                REGEX_DUMP_DEBUG_PRINTF(("%#x\n", This->fInitialChar));
            }
    }

    REGEX_DUMP_DEBUG_PRINTF(("\nIndex Binary Type Operand\n" \
        "-------------------------------------------\n"));
    for (int32_t index = 0; index<This->fCompiledPat->size(); index++) {
        This->dumpOp(index);
    }
    REGEX_DUMP_DEBUG_PRINTF(("\n\n"));
}
#endif
b75a7d8f
A
610
611
612
// Expands the UOBJECT_DEFINE_RTTI_IMPLEMENTATION macro for RegexPattern.
// NOTE(review): presumably this supplies the class-ID members that the class
//   declaration in unicode/regex.h announces - confirm against the macro's
//   definition; it is not visible in this file.
374ca955 613UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexPattern)
b75a7d8f
A
614
615U_NAMESPACE_END
616#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS