]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/repattrn.cpp
ICU-6.2.22.tar.gz
[apple/icu.git] / icuSources / i18n / repattrn.cpp
1 //
2 // file: repattrn.cpp
3 //
4 /*
5 ***************************************************************************
6 * Copyright (C) 2002-2004 International Business Machines Corporation *
7 * and others. All rights reserved. *
8 ***************************************************************************
9 */
10
11 #include "unicode/utypes.h"
12
13 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
14
15 #include "unicode/regex.h"
16 #include "unicode/uclean.h"
17 #include "uassert.h"
18 #include "uvector.h"
19 #include "uvectr32.h"
20 #include "regexcmp.h"
21 #include "regeximp.h"
22 #include "regexst.h"
23
24 U_NAMESPACE_BEGIN
25
26 //--------------------------------------------------------------------------
27 //
28 // RegexPattern Default Constructor
29 //
30 //--------------------------------------------------------------------------
31 RegexPattern::RegexPattern() {
32 UErrorCode status = U_ZERO_ERROR;
33 u_init(&status);
34 // Init all of this instances data.
35 init();
36
37 // Lazy init of all shared global sets.
38 RegexStaticSets::initGlobals(&fDeferredStatus);
39 };
40
41
42 //--------------------------------------------------------------------------
43 //
44 // Copy Constructor Note: This is a rather inefficient implementation,
45 // but it probably doesn't matter.
46 //
47 //--------------------------------------------------------------------------
48 RegexPattern::RegexPattern(const RegexPattern &other) : UObject(other) {
49 init();
50 *this = other;
51 }
52
53
54
55 //--------------------------------------------------------------------------
56 //
57 // Assignmenet Operator
58 //
59 //--------------------------------------------------------------------------
60 RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
61 if (this == &other) {
62 // Source and destination are the same. Don't do anything.
63 return *this;
64 }
65
66 // Clean out any previous contents of object being assigned to.
67 zap();
68
69 // Give target object a default initialization
70 init();
71
72 // Copy simple fields
73 fPattern = other.fPattern;
74 fFlags = other.fFlags;
75 fLiteralText = other.fLiteralText;
76 fDeferredStatus = other.fDeferredStatus;
77 fMinMatchLen = other.fMinMatchLen;
78 fFrameSize = other.fFrameSize;
79 fDataSize = other.fDataSize;
80 fMaxCaptureDigits = other.fMaxCaptureDigits;
81 fStaticSets = other.fStaticSets;
82 fStaticSets8 = other.fStaticSets8;
83
84 fStartType = other.fStartType;
85 fInitialStringIdx = other.fInitialStringIdx;
86 fInitialStringLen = other.fInitialStringLen;
87 *fInitialChars = *other.fInitialChars;
88 fInitialChar = other.fInitialChar;
89 *fInitialChars8 = *other.fInitialChars8;
90
91 // Copy the pattern. It's just values, nothing deep to copy.
92 fCompiledPat->assign(*other.fCompiledPat, fDeferredStatus);
93 fGroupMap->assign(*other.fGroupMap, fDeferredStatus);
94
95 // Copy the Unicode Sets.
96 // Could be made more efficient if the sets were reference counted and shared,
97 // but I doubt that pattern copying will be particularly common.
98 // Note: init() already added an empty element zero to fSets
99 int32_t i;
100 int32_t numSets = other.fSets->size();
101 fSets8 = new Regex8BitSet[numSets];
102 for (i=1; i<numSets; i++) {
103 if (U_FAILURE(fDeferredStatus)) {
104 return *this;
105 }
106 UnicodeSet *sourceSet = (UnicodeSet *)other.fSets->elementAt(i);
107 UnicodeSet *newSet = new UnicodeSet(*sourceSet);
108 if (newSet == NULL) {
109 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
110 break;
111 }
112 fSets->addElement(newSet, fDeferredStatus);
113 fSets8[i] = other.fSets8[i];
114 }
115
116 return *this;
117 }
118
119
120 //--------------------------------------------------------------------------
121 //
122 // init Shared initialization for use by constructors.
123 // Bring an uninitialized RegexPattern up to a default state.
124 //
125 //--------------------------------------------------------------------------
126 void RegexPattern::init() {
127 fPattern.remove();
128 fFlags = 0;
129 fCompiledPat = 0;
130 fLiteralText.remove();
131 fSets = NULL;
132 fSets8 = NULL;
133 fDeferredStatus = U_ZERO_ERROR;
134 fMinMatchLen = 0;
135 fFrameSize = 0;
136 fDataSize = 0;
137 fGroupMap = NULL;
138 fMaxCaptureDigits = 1;
139 fStaticSets = NULL;
140 fStaticSets8 = NULL;
141 fStartType = START_NO_INFO;
142 fInitialStringIdx = 0;
143 fInitialStringLen = 0;
144 fInitialChars = NULL;
145 fInitialChar = 0;
146 fInitialChars8 = NULL;
147
148 fCompiledPat = new UVector32(fDeferredStatus);
149 fGroupMap = new UVector32(fDeferredStatus);
150 fSets = new UVector(fDeferredStatus);
151 fInitialChars = new UnicodeSet;
152 fInitialChars8 = new Regex8BitSet;
153 if (U_FAILURE(fDeferredStatus)) {
154 return;
155 }
156 if (fCompiledPat == NULL || fGroupMap == NULL || fSets == NULL ||
157 fInitialChars == NULL || fInitialChars8 == NULL) {
158 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
159 return;
160 }
161
162 // Slot zero of the vector of sets is reserved. Fill it here.
163 fSets->addElement((int32_t)0, fDeferredStatus);
164 }
165
166
167 //--------------------------------------------------------------------------
168 //
169 // zap Delete everything owned by this RegexPattern.
170 //
171 //--------------------------------------------------------------------------
172 void RegexPattern::zap() {
173 delete fCompiledPat;
174 fCompiledPat = NULL;
175 int i;
176 for (i=1; i<fSets->size(); i++) {
177 UnicodeSet *s;
178 s = (UnicodeSet *)fSets->elementAt(i);
179 if (s != NULL) {
180 delete s;
181 }
182 }
183 delete fSets;
184 fSets = NULL;
185 delete[] fSets8;
186 fSets8 = NULL;
187 delete fGroupMap;
188 fGroupMap = NULL;
189 delete fInitialChars;
190 fInitialChars = NULL;
191 delete fInitialChars8;
192 fInitialChars8 = NULL;
193 }
194
195
196 //--------------------------------------------------------------------------
197 //
198 // Destructor
199 //
200 //--------------------------------------------------------------------------
201 RegexPattern::~RegexPattern() {
202 zap();
203 };
204
205
206 //--------------------------------------------------------------------------
207 //
208 // Clone
209 //
210 //--------------------------------------------------------------------------
211 RegexPattern *RegexPattern::clone() const {
212 RegexPattern *copy = new RegexPattern(*this);
213 return copy;
214 };
215
216
217 //--------------------------------------------------------------------------
218 //
219 // operator == (comparison) Consider to patterns to be == if the
220 // pattern strings and the flags are the same.
221 //
222 //--------------------------------------------------------------------------
223 UBool RegexPattern::operator ==(const RegexPattern &other) const {
224 UBool r = this->fFlags == other.fFlags &&
225 this->fPattern == other.fPattern &&
226 this->fDeferredStatus == other.fDeferredStatus;
227 return r;
228 }
229
230 //---------------------------------------------------------------------
231 //
232 // compile
233 //
234 //---------------------------------------------------------------------
235 RegexPattern * U_EXPORT2
236 RegexPattern::compile(const UnicodeString &regex,
237 uint32_t flags,
238 UParseError &pe,
239 UErrorCode &status)
240 {
241
242 if (U_FAILURE(status)) {
243 return NULL;
244 }
245
246 const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS |
247 UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD;
248
249 if ((flags & ~allFlags) != 0) {
250 status = U_REGEX_INVALID_FLAG;
251 return NULL;
252 }
253
254 if ((flags & UREGEX_CANON_EQ) != 0) {
255 status = U_REGEX_UNIMPLEMENTED;
256 return NULL;
257 }
258
259 RegexPattern *This = new RegexPattern;
260 if (This == NULL) {
261 status = U_MEMORY_ALLOCATION_ERROR;
262 return NULL;
263 }
264 if (U_FAILURE(This->fDeferredStatus)) {
265 status = This->fDeferredStatus;
266 return NULL;
267 }
268 This->fFlags = flags;
269
270 RegexCompile compiler(This, status);
271 compiler.compile(regex, pe, status);
272
273 return This;
274 };
275
276 //
277 // compile with default flags.
278 //
279 RegexPattern * U_EXPORT2
280 RegexPattern::compile(const UnicodeString &regex,
281 UParseError &pe,
282 UErrorCode &err)
283 {
284 return compile(regex, 0, pe, err);
285 }
286
287
288
289 //
290 // compile with no UParseErr parameter.
291 //
292 RegexPattern * U_EXPORT2
293 RegexPattern::compile( const UnicodeString &regex,
294 uint32_t flags,
295 UErrorCode &err)
296 {
297 UParseError pe;
298 return compile(regex, flags, pe, err);
299 }
300
301
302
303 //---------------------------------------------------------------------
304 //
305 // flags
306 //
307 //---------------------------------------------------------------------
308 uint32_t RegexPattern::flags() const {
309 return fFlags;
310 }
311
312
313 //---------------------------------------------------------------------
314 //
315 // matcher(UnicodeString, err)
316 //
317 //---------------------------------------------------------------------
318 RegexMatcher *RegexPattern::matcher(const UnicodeString &input,
319 UErrorCode &status) const {
320 RegexMatcher *retMatcher = matcher(status);
321 if (retMatcher != NULL) {
322 retMatcher->reset(input);
323 }
324 return retMatcher;
325 };
326
327 RegexMatcher *RegexPattern::matcher(const UChar * /*input*/,
328 UErrorCode &status) const
329 {
330 /* This should never get called. The API with UnicodeString should be called instead. */
331 if (U_SUCCESS(status)) {
332 status = U_UNSUPPORTED_ERROR;
333 }
334 return NULL;
335 }
336
337
338 //---------------------------------------------------------------------
339 //
340 // matcher(status)
341 //
342 //---------------------------------------------------------------------
343 RegexMatcher *RegexPattern::matcher(UErrorCode &status) const {
344 RegexMatcher *retMatcher = NULL;
345
346 if (U_FAILURE(status)) {
347 return NULL;
348 }
349 if (U_FAILURE(fDeferredStatus)) {
350 status = fDeferredStatus;
351 return NULL;
352 }
353
354 retMatcher = new RegexMatcher(this);
355 if (retMatcher == NULL) {
356 status = U_MEMORY_ALLOCATION_ERROR;
357 return NULL;
358 }
359 return retMatcher;
360 };
361
362
363
364 //---------------------------------------------------------------------
365 //
366 // matches Convenience function to test for a match, starting
367 // with a pattern string and a data string.
368 //
369 //---------------------------------------------------------------------
370 UBool U_EXPORT2 RegexPattern::matches(const UnicodeString &regex,
371 const UnicodeString &input,
372 UParseError &pe,
373 UErrorCode &status) {
374
375 if (U_FAILURE(status)) {return FALSE;}
376
377 UBool retVal;
378 RegexPattern *pat = NULL;
379 RegexMatcher *matcher = NULL;
380
381 pat = RegexPattern::compile(regex, 0, pe, status);
382 matcher = pat->matcher(input, status);
383 retVal = matcher->matches(status);
384
385 delete matcher;
386 delete pat;
387 return retVal;
388 }
389
390
391
392
393 //---------------------------------------------------------------------
394 //
395 // pattern
396 //
397 //---------------------------------------------------------------------
398 UnicodeString RegexPattern::pattern() const {
399 return fPattern;
400 }
401
402
403
404
405 //---------------------------------------------------------------------
406 //
407 // split
408 //
409 //---------------------------------------------------------------------
410 int32_t RegexPattern::split(const UnicodeString &input,
411 UnicodeString dest[],
412 int32_t destCapacity,
413 UErrorCode &status) const
414 {
415 if (U_FAILURE(status)) {
416 return 0;
417 };
418
419 RegexMatcher m(this);
420 int32_t r = m.split(input, dest, destCapacity, status);
421 return r;
422 }
423
424
425
426 //---------------------------------------------------------------------
427 //
428 // dump Output the compiled form of the pattern.
429 // Debugging function only.
430 //
431 //---------------------------------------------------------------------
432 #if defined(REGEX_DEBUG)
433 void RegexPattern::dumpOp(int32_t index) const {
434 static const char * const opNames[] = {URX_OPCODE_NAMES};
435 int32_t op = fCompiledPat->elementAti(index);
436 int32_t val = URX_VAL(op);
437 int32_t type = URX_TYPE(op);
438 int32_t pinnedType = type;
439 if (pinnedType >= sizeof(opNames)/sizeof(char *)) {
440 pinnedType = 0;
441 }
442
443 REGEX_DUMP_DEBUG_PRINTF(("%4d %08x %-15s ", index, op, opNames[pinnedType]));
444 switch (type) {
445 case URX_NOP:
446 case URX_DOTANY:
447 case URX_DOTANY_ALL:
448 case URX_DOTANY_PL:
449 case URX_DOTANY_ALL_PL:
450 case URX_FAIL:
451 case URX_CARET:
452 case URX_DOLLAR:
453 case URX_BACKSLASH_G:
454 case URX_BACKSLASH_X:
455 case URX_END:
456 case URX_DOLLAR_M:
457 case URX_CARET_M:
458 // Types with no operand field of interest.
459 break;
460
461 case URX_RESERVED_OP:
462 case URX_START_CAPTURE:
463 case URX_END_CAPTURE:
464 case URX_STATE_SAVE:
465 case URX_JMP:
466 case URX_JMP_SAV:
467 case URX_JMP_SAV_X:
468 case URX_BACKSLASH_B:
469 case URX_BACKSLASH_BU:
470 case URX_BACKSLASH_D:
471 case URX_BACKSLASH_Z:
472 case URX_STRING_LEN:
473 case URX_CTR_INIT:
474 case URX_CTR_INIT_NG:
475 case URX_CTR_LOOP:
476 case URX_CTR_LOOP_NG:
477 case URX_RELOC_OPRND:
478 case URX_STO_SP:
479 case URX_LD_SP:
480 case URX_BACKREF:
481 case URX_STO_INP_LOC:
482 case URX_JMPX:
483 case URX_LA_START:
484 case URX_LA_END:
485 case URX_BACKREF_I:
486 case URX_LB_START:
487 case URX_LB_CONT:
488 case URX_LB_END:
489 case URX_LBN_CONT:
490 case URX_LBN_END:
491 case URX_LOOP_C:
492 case URX_LOOP_DOT_I:
493 // types with an integer operand field.
494 REGEX_DUMP_DEBUG_PRINTF(("%d", val));
495 break;
496
497 case URX_ONECHAR:
498 case URX_ONECHAR_I:
499 REGEX_DUMP_DEBUG_PRINTF(("%c", val<256?val:'?'));
500 break;
501
502 case URX_STRING:
503 case URX_STRING_I:
504 {
505 int32_t lengthOp = fCompiledPat->elementAti(index+1);
506 U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN);
507 int32_t length = URX_VAL(lengthOp);
508 int32_t i;
509 for (i=val; i<val+length; i++) {
510 UChar c = fLiteralText[i];
511 if (c < 32 || c >= 256) {c = '.';}
512 REGEX_DUMP_DEBUG_PRINTF(("%c", c));
513 }
514 }
515 break;
516
517 case URX_SETREF:
518 case URX_LOOP_SR_I:
519 {
520 UnicodeString s;
521 UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val);
522 set->toPattern(s, TRUE);
523 for (int32_t i=0; i<s.length(); i++) {
524 REGEX_DUMP_DEBUG_PRINTF(("%c", s.charAt(i)));
525 }
526 }
527 break;
528
529 case URX_STATIC_SETREF:
530 case URX_STAT_SETREF_N:
531 {
532 UnicodeString s;
533 if (val & URX_NEG_SET) {
534 REGEX_DUMP_DEBUG_PRINTF(("NOT "));
535 val &= ~URX_NEG_SET;
536 }
537 UnicodeSet *set = fStaticSets[val];
538 set->toPattern(s, TRUE);
539 for (int32_t i=0; i<s.length(); i++) {
540 REGEX_DUMP_DEBUG_PRINTF(("%c", s.charAt(i)));
541 }
542 }
543 break;
544
545
546 default:
547 REGEX_DUMP_DEBUG_PRINTF(("??????"));
548 break;
549 }
550 REGEX_DUMP_DEBUG_PRINTF(("\n"));
551 }
552 #endif
553
554
555 #if defined(REGEX_DEBUG)
556 U_CAPI void U_EXPORT2
557 RegexPatternDump(const RegexPattern *This) {
558 int index;
559 int i;
560
561 REGEX_DUMP_DEBUG_PRINTF(("Original Pattern: "));
562 for (i=0; i<This->fPattern.length(); i++) {
563 REGEX_DUMP_DEBUG_PRINTF(("%c", This->fPattern.charAt(i)));
564 }
565 REGEX_DUMP_DEBUG_PRINTF(("\n"));
566 REGEX_DUMP_DEBUG_PRINTF((" Min Match Length: %d\n", This->fMinMatchLen));
567 REGEX_DUMP_DEBUG_PRINTF((" Match Start Type: %s\n", START_OF_MATCH_STR(This->fStartType)));
568 if (This->fStartType == START_STRING) {
569 REGEX_DUMP_DEBUG_PRINTF((" Initial match sting: \""));
570 for (i=This->fInitialStringIdx; i<This->fInitialStringIdx+This->fInitialStringLen; i++) {
571 REGEX_DUMP_DEBUG_PRINTF(("%c", This->fLiteralText[i])); // TODO: non-printables, surrogates.
572 }
573
574 } else if (This->fStartType == START_SET) {
575 int32_t numSetChars = This->fInitialChars->size();
576 if (numSetChars > 20) {
577 numSetChars = 20;
578 }
579 REGEX_DUMP_DEBUG_PRINTF((" Match First Chars : "));
580 for (i=0; i<numSetChars; i++) {
581 UChar32 c = This->fInitialChars->charAt(i);
582 if (0x20<c && c <0x7e) {
583 REGEX_DUMP_DEBUG_PRINTF(("%c ", c));
584 } else {
585 REGEX_DUMP_DEBUG_PRINTF(("%#x ", c));
586 }
587 }
588 if (numSetChars < This->fInitialChars->size()) {
589 REGEX_DUMP_DEBUG_PRINTF((" ..."));
590 }
591 REGEX_DUMP_DEBUG_PRINTF(("\n"));
592
593 } else if (This->fStartType == START_CHAR) {
594 REGEX_DUMP_DEBUG_PRINTF((" First char of Match : "));
595 if (0x20 < This->fInitialChar && This->fInitialChar<0x7e) {
596 REGEX_DUMP_DEBUG_PRINTF(("%c\n", This->fInitialChar));
597 } else {
598 REGEX_DUMP_DEBUG_PRINTF(("%#x\n", This->fInitialChar));
599 }
600 }
601
602 REGEX_DUMP_DEBUG_PRINTF(("\nIndex Binary Type Operand\n" \
603 "-------------------------------------------\n"));
604 for (index = 0; index<This->fCompiledPat->size(); index++) {
605 This->dumpOp(index);
606 }
607 REGEX_DUMP_DEBUG_PRINTF(("\n\n"));
608 };
609 #endif
610
611
612
613 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexPattern)
614
615 U_NAMESPACE_END
616 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS