]>
Commit | Line | Data |
---|---|---|
b75a7d8f | 1 | // |
46f4442e | 2 | // file: repattrn.cpp |
b75a7d8f A |
3 | // |
4 | /* | |
5 | *************************************************************************** | |
729e4ab9 | 6 | * Copyright (C) 2002-2010 International Business Machines Corporation * |
b75a7d8f A |
7 | * and others. All rights reserved. * |
8 | *************************************************************************** | |
9 | */ | |
10 | ||
11 | #include "unicode/utypes.h" | |
12 | ||
13 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS | |
14 | ||
15 | #include "unicode/regex.h" | |
374ca955 | 16 | #include "unicode/uclean.h" |
b75a7d8f A |
17 | #include "uassert.h" |
18 | #include "uvector.h" | |
19 | #include "uvectr32.h" | |
729e4ab9 | 20 | #include "uvectr64.h" |
b75a7d8f A |
21 | #include "regexcmp.h" |
22 | #include "regeximp.h" | |
23 | #include "regexst.h" | |
24 | ||
25 | U_NAMESPACE_BEGIN | |
26 | ||
27 | //-------------------------------------------------------------------------- | |
28 | // | |
29 | // RegexPattern Default Constructor | |
30 | // | |
31 | //-------------------------------------------------------------------------- | |
32 | RegexPattern::RegexPattern() { | |
374ca955 A |
33 | UErrorCode status = U_ZERO_ERROR; |
34 | u_init(&status); | |
729e4ab9 | 35 | |
b75a7d8f A |
36 | // Init all of this instances data. |
37 | init(); | |
73c04bcf | 38 | } |
b75a7d8f A |
39 | |
40 | ||
41 | //-------------------------------------------------------------------------- | |
42 | // | |
43 | // Copy Constructor Note: This is a rather inefficient implementation, | |
44 | // but it probably doesn't matter. | |
45 | // | |
46 | //-------------------------------------------------------------------------- | |
47 | RegexPattern::RegexPattern(const RegexPattern &other) : UObject(other) { | |
46f4442e | 48 | init(); |
b75a7d8f A |
49 | *this = other; |
50 | } | |
51 | ||
52 | ||
53 | ||
54 | //-------------------------------------------------------------------------- | |
55 | // | |
729e4ab9 | 56 | // Assignment Operator |
b75a7d8f A |
57 | // |
58 | //-------------------------------------------------------------------------- | |
59 | RegexPattern &RegexPattern::operator = (const RegexPattern &other) { | |
60 | if (this == &other) { | |
61 | // Source and destination are the same. Don't do anything. | |
62 | return *this; | |
63 | } | |
64 | ||
65 | // Clean out any previous contents of object being assigned to. | |
66 | zap(); | |
67 | ||
68 | // Give target object a default initialization | |
69 | init(); | |
70 | ||
71 | // Copy simple fields | |
729e4ab9 A |
72 | if ( other.fPatternString == NULL ) { |
73 | fPatternString = NULL; | |
74 | fPattern = utext_clone(fPattern, other.fPattern, FALSE, TRUE, &fDeferredStatus); | |
75 | } else { | |
76 | fPatternString = new UnicodeString(*(other.fPatternString)); | |
77 | UErrorCode status = U_ZERO_ERROR; | |
78 | fPattern = utext_openConstUnicodeString(NULL, fPatternString, &status); | |
79 | if (U_FAILURE(status)) { | |
80 | fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; | |
81 | return *this; | |
82 | } | |
83 | } | |
b75a7d8f A |
84 | fFlags = other.fFlags; |
85 | fLiteralText = other.fLiteralText; | |
86 | fDeferredStatus = other.fDeferredStatus; | |
87 | fMinMatchLen = other.fMinMatchLen; | |
374ca955 A |
88 | fFrameSize = other.fFrameSize; |
89 | fDataSize = other.fDataSize; | |
b75a7d8f | 90 | fMaxCaptureDigits = other.fMaxCaptureDigits; |
46f4442e | 91 | fStaticSets = other.fStaticSets; |
374ca955 | 92 | fStaticSets8 = other.fStaticSets8; |
46f4442e | 93 | |
b75a7d8f A |
94 | fStartType = other.fStartType; |
95 | fInitialStringIdx = other.fInitialStringIdx; | |
96 | fInitialStringLen = other.fInitialStringLen; | |
97 | *fInitialChars = *other.fInitialChars; | |
b75a7d8f | 98 | fInitialChar = other.fInitialChar; |
374ca955 | 99 | *fInitialChars8 = *other.fInitialChars8; |
729e4ab9 | 100 | fNeedsAltInput = other.fNeedsAltInput; |
b75a7d8f A |
101 | |
102 | // Copy the pattern. It's just values, nothing deep to copy. | |
103 | fCompiledPat->assign(*other.fCompiledPat, fDeferredStatus); | |
104 | fGroupMap->assign(*other.fGroupMap, fDeferredStatus); | |
105 | ||
46f4442e | 106 | // Copy the Unicode Sets. |
b75a7d8f | 107 | // Could be made more efficient if the sets were reference counted and shared, |
46f4442e | 108 | // but I doubt that pattern copying will be particularly common. |
b75a7d8f A |
109 | // Note: init() already added an empty element zero to fSets |
110 | int32_t i; | |
111 | int32_t numSets = other.fSets->size(); | |
112 | fSets8 = new Regex8BitSet[numSets]; | |
46f4442e A |
113 | if (fSets8 == NULL) { |
114 | fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; | |
115 | return *this; | |
116 | } | |
b75a7d8f A |
117 | for (i=1; i<numSets; i++) { |
118 | if (U_FAILURE(fDeferredStatus)) { | |
119 | return *this; | |
120 | } | |
121 | UnicodeSet *sourceSet = (UnicodeSet *)other.fSets->elementAt(i); | |
122 | UnicodeSet *newSet = new UnicodeSet(*sourceSet); | |
123 | if (newSet == NULL) { | |
124 | fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; | |
125 | break; | |
126 | } | |
127 | fSets->addElement(newSet, fDeferredStatus); | |
128 | fSets8[i] = other.fSets8[i]; | |
129 | } | |
130 | ||
131 | return *this; | |
132 | } | |
133 | ||
134 | ||
135 | //-------------------------------------------------------------------------- | |
136 | // | |
137 | // init Shared initialization for use by constructors. | |
138 | // Bring an uninitialized RegexPattern up to a default state. | |
139 | // | |
140 | //-------------------------------------------------------------------------- | |
141 | void RegexPattern::init() { | |
142 | fFlags = 0; | |
374ca955 A |
143 | fCompiledPat = 0; |
144 | fLiteralText.remove(); | |
145 | fSets = NULL; | |
146 | fSets8 = NULL; | |
b75a7d8f A |
147 | fDeferredStatus = U_ZERO_ERROR; |
148 | fMinMatchLen = 0; | |
b75a7d8f A |
149 | fFrameSize = 0; |
150 | fDataSize = 0; | |
374ca955 | 151 | fGroupMap = NULL; |
46f4442e | 152 | fMaxCaptureDigits = 1; |
374ca955 A |
153 | fStaticSets = NULL; |
154 | fStaticSets8 = NULL; | |
b75a7d8f A |
155 | fStartType = START_NO_INFO; |
156 | fInitialStringIdx = 0; | |
157 | fInitialStringLen = 0; | |
158 | fInitialChars = NULL; | |
b75a7d8f | 159 | fInitialChar = 0; |
374ca955 | 160 | fInitialChars8 = NULL; |
729e4ab9 | 161 | fNeedsAltInput = FALSE; |
46f4442e | 162 | |
729e4ab9 A |
163 | fPattern = NULL; // will be set later |
164 | fPatternString = NULL; // may be set later | |
165 | fCompiledPat = new UVector64(fDeferredStatus); | |
b75a7d8f A |
166 | fGroupMap = new UVector32(fDeferredStatus); |
167 | fSets = new UVector(fDeferredStatus); | |
168 | fInitialChars = new UnicodeSet; | |
169 | fInitialChars8 = new Regex8BitSet; | |
170 | if (U_FAILURE(fDeferredStatus)) { | |
171 | return; | |
172 | } | |
173 | if (fCompiledPat == NULL || fGroupMap == NULL || fSets == NULL || | |
174 | fInitialChars == NULL || fInitialChars8 == NULL) { | |
175 | fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; | |
176 | return; | |
177 | } | |
178 | ||
179 | // Slot zero of the vector of sets is reserved. Fill it here. | |
180 | fSets->addElement((int32_t)0, fDeferredStatus); | |
181 | } | |
182 | ||
183 | ||
184 | //-------------------------------------------------------------------------- | |
185 | // | |
46f4442e | 186 | // zap Delete everything owned by this RegexPattern. |
b75a7d8f A |
187 | // |
188 | //-------------------------------------------------------------------------- | |
189 | void RegexPattern::zap() { | |
190 | delete fCompiledPat; | |
191 | fCompiledPat = NULL; | |
192 | int i; | |
193 | for (i=1; i<fSets->size(); i++) { | |
194 | UnicodeSet *s; | |
195 | s = (UnicodeSet *)fSets->elementAt(i); | |
196 | if (s != NULL) { | |
197 | delete s; | |
198 | } | |
199 | } | |
200 | delete fSets; | |
201 | fSets = NULL; | |
374ca955 A |
202 | delete[] fSets8; |
203 | fSets8 = NULL; | |
b75a7d8f A |
204 | delete fGroupMap; |
205 | fGroupMap = NULL; | |
206 | delete fInitialChars; | |
207 | fInitialChars = NULL; | |
208 | delete fInitialChars8; | |
209 | fInitialChars8 = NULL; | |
729e4ab9 A |
210 | if (fPattern != NULL) { |
211 | utext_close(fPattern); | |
212 | fPattern = NULL; | |
213 | } | |
214 | if (fPatternString != NULL) { | |
215 | delete fPatternString; | |
216 | fPatternString = NULL; | |
217 | } | |
b75a7d8f A |
218 | } |
219 | ||
220 | ||
221 | //-------------------------------------------------------------------------- | |
222 | // | |
223 | // Destructor | |
224 | // | |
225 | //-------------------------------------------------------------------------- | |
226 | RegexPattern::~RegexPattern() { | |
227 | zap(); | |
73c04bcf | 228 | } |
b75a7d8f A |
229 | |
230 | ||
231 | //-------------------------------------------------------------------------- | |
232 | // | |
233 | // Clone | |
234 | // | |
235 | //-------------------------------------------------------------------------- | |
46f4442e | 236 | RegexPattern *RegexPattern::clone() const { |
b75a7d8f A |
237 | RegexPattern *copy = new RegexPattern(*this); |
238 | return copy; | |
73c04bcf | 239 | } |
b75a7d8f A |
240 | |
241 | ||
242 | //-------------------------------------------------------------------------- | |
243 | // | |
244 | // operator == (comparison) Consider two patterns to be == if the | |
245 | // pattern strings and the flags are the same. | |
729e4ab9 A |
246 | // Note that pattern strings with the same |
247 | // characters can still be considered different. | |
b75a7d8f A |
248 | // |
249 | //-------------------------------------------------------------------------- | |
250 | UBool RegexPattern::operator ==(const RegexPattern &other) const { | |
729e4ab9 A |
251 | if (this->fFlags == other.fFlags && this->fDeferredStatus == other.fDeferredStatus) { |
252 | if (this->fPatternString != NULL && other.fPatternString != NULL) { | |
253 | return *(this->fPatternString) == *(other.fPatternString); | |
254 | } else if (this->fPattern == NULL) { | |
255 | if (other.fPattern == NULL) { | |
256 | return TRUE; | |
257 | } | |
258 | } else if (other.fPattern != NULL) { | |
259 | UTEXT_SETNATIVEINDEX(this->fPattern, 0); | |
260 | UTEXT_SETNATIVEINDEX(other.fPattern, 0); | |
261 | return utext_equals(this->fPattern, other.fPattern); | |
262 | } | |
263 | } | |
264 | return FALSE; | |
b75a7d8f A |
265 | } |
266 | ||
267 | //--------------------------------------------------------------------- | |
268 | // | |
46f4442e | 269 | // compile |
b75a7d8f A |
270 | // |
271 | //--------------------------------------------------------------------- | |
374ca955 A |
272 | RegexPattern * U_EXPORT2 |
273 | RegexPattern::compile(const UnicodeString ®ex, | |
274 | uint32_t flags, | |
275 | UParseError &pe, | |
276 | UErrorCode &status) | |
277 | { | |
729e4ab9 A |
278 | if (U_FAILURE(status)) { |
279 | return NULL; | |
280 | } | |
281 | ||
282 | const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS | | |
283 | UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD | | |
284 | UREGEX_ERROR_ON_UNKNOWN_ESCAPES | UREGEX_UNIX_LINES | UREGEX_LITERAL; | |
285 | ||
286 | if ((flags & ~allFlags) != 0) { | |
287 | status = U_REGEX_INVALID_FLAG; | |
288 | return NULL; | |
289 | } | |
290 | ||
291 | if ((flags & (UREGEX_CANON_EQ | UREGEX_LITERAL)) != 0) { | |
292 | status = U_REGEX_UNIMPLEMENTED; | |
293 | return NULL; | |
294 | } | |
295 | ||
296 | RegexPattern *This = new RegexPattern; | |
297 | if (This == NULL) { | |
298 | status = U_MEMORY_ALLOCATION_ERROR; | |
299 | return NULL; | |
300 | } | |
301 | if (U_FAILURE(This->fDeferredStatus)) { | |
302 | status = This->fDeferredStatus; | |
303 | delete This; | |
304 | return NULL; | |
305 | } | |
306 | This->fFlags = flags; | |
307 | ||
308 | RegexCompile compiler(This, status); | |
309 | compiler.compile(regex, pe, status); | |
310 | ||
311 | if (U_FAILURE(status)) { | |
312 | delete This; | |
313 | This = NULL; | |
314 | } | |
315 | ||
316 | return This; | |
317 | } | |
318 | ||
b75a7d8f | 319 | |
729e4ab9 A |
320 | // |
321 | // compile, UText mode | |
322 | // | |
323 | RegexPattern * U_EXPORT2 | |
324 | RegexPattern::compile(UText *regex, | |
325 | uint32_t flags, | |
326 | UParseError &pe, | |
327 | UErrorCode &status) | |
328 | { | |
b75a7d8f A |
329 | if (U_FAILURE(status)) { |
330 | return NULL; | |
331 | } | |
332 | ||
333 | const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS | | |
46f4442e | 334 | UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD | |
729e4ab9 | 335 | UREGEX_ERROR_ON_UNKNOWN_ESCAPES | UREGEX_UNIX_LINES | UREGEX_LITERAL; |
b75a7d8f A |
336 | |
337 | if ((flags & ~allFlags) != 0) { | |
338 | status = U_REGEX_INVALID_FLAG; | |
339 | return NULL; | |
340 | } | |
341 | ||
729e4ab9 | 342 | if ((flags & (UREGEX_CANON_EQ | UREGEX_LITERAL)) != 0) { |
b75a7d8f A |
343 | status = U_REGEX_UNIMPLEMENTED; |
344 | return NULL; | |
345 | } | |
346 | ||
347 | RegexPattern *This = new RegexPattern; | |
348 | if (This == NULL) { | |
349 | status = U_MEMORY_ALLOCATION_ERROR; | |
350 | return NULL; | |
351 | } | |
352 | if (U_FAILURE(This->fDeferredStatus)) { | |
353 | status = This->fDeferredStatus; | |
46f4442e | 354 | delete This; |
b75a7d8f A |
355 | return NULL; |
356 | } | |
357 | This->fFlags = flags; | |
358 | ||
359 | RegexCompile compiler(This, status); | |
360 | compiler.compile(regex, pe, status); | |
46f4442e A |
361 | |
362 | if (U_FAILURE(status)) { | |
363 | delete This; | |
364 | This = NULL; | |
365 | } | |
b75a7d8f A |
366 | |
367 | return This; | |
73c04bcf | 368 | } |
46f4442e | 369 | |
b75a7d8f A |
370 | // |
371 | // compile with default flags. | |
372 | // | |
374ca955 A |
373 | RegexPattern * U_EXPORT2 |
374 | RegexPattern::compile(const UnicodeString ®ex, | |
375 | UParseError &pe, | |
46f4442e | 376 | UErrorCode &err) |
b75a7d8f | 377 | { |
46f4442e | 378 | return compile(regex, 0, pe, err); |
b75a7d8f A |
379 | } |
380 | ||
381 | ||
729e4ab9 A |
382 | // |
383 | // compile with default flags, UText mode | |
384 | // | |
385 | RegexPattern * U_EXPORT2 | |
386 | RegexPattern::compile(UText *regex, | |
387 | UParseError &pe, | |
388 | UErrorCode &err) | |
389 | { | |
390 | return compile(regex, 0, pe, err); | |
391 | } | |
392 | ||
b75a7d8f A |
393 | |
394 | // | |
395 | // compile with no UParseErr parameter. | |
396 | // | |
374ca955 | 397 | RegexPattern * U_EXPORT2 |
729e4ab9 A |
398 | RegexPattern::compile(const UnicodeString ®ex, |
399 | uint32_t flags, | |
400 | UErrorCode &err) | |
b75a7d8f A |
401 | { |
402 | UParseError pe; | |
46f4442e | 403 | return compile(regex, flags, pe, err); |
b75a7d8f A |
404 | } |
405 | ||
406 | ||
729e4ab9 A |
407 | // |
408 | // compile with no UParseErr parameter, UText mode | |
409 | // | |
410 | RegexPattern * U_EXPORT2 | |
411 | RegexPattern::compile(UText *regex, | |
412 | uint32_t flags, | |
413 | UErrorCode &err) | |
414 | { | |
415 | UParseError pe; | |
416 | return compile(regex, flags, pe, err); | |
417 | } | |
418 | ||
b75a7d8f A |
419 | |
420 | //--------------------------------------------------------------------- | |
421 | // | |
422 | // flags | |
423 | // | |
424 | //--------------------------------------------------------------------- | |
425 | uint32_t RegexPattern::flags() const { | |
426 | return fFlags; | |
427 | } | |
428 | ||
429 | ||
430 | //--------------------------------------------------------------------- | |
431 | // | |
432 | // matcher(UnicodeString, err) | |
433 | // | |
434 | //--------------------------------------------------------------------- | |
435 | RegexMatcher *RegexPattern::matcher(const UnicodeString &input, | |
436 | UErrorCode &status) const { | |
437 | RegexMatcher *retMatcher = matcher(status); | |
438 | if (retMatcher != NULL) { | |
729e4ab9 A |
439 | retMatcher->fDeferredStatus = status; |
440 | retMatcher->reset(input); | |
441 | } | |
442 | return retMatcher; | |
443 | } | |
444 | ||
445 | // | |
446 | // matcher, UText mode | |
447 | // | |
448 | RegexMatcher *RegexPattern::matcher(UText *input, | |
449 | PatternIsUTextFlag /*flag*/, | |
450 | UErrorCode &status) const { | |
451 | RegexMatcher *retMatcher = matcher(status); | |
452 | if (retMatcher != NULL) { | |
453 | retMatcher->fDeferredStatus = status; | |
b75a7d8f A |
454 | retMatcher->reset(input); |
455 | } | |
456 | return retMatcher; | |
73c04bcf | 457 | } |
b75a7d8f | 458 | |
#if 0
// Disabled overload.  Kept only as a stub that reports "unsupported".
RegexMatcher *RegexPattern::matcher(const UChar * /*input*/,
                                    UErrorCode          &status)  const
{
    /* This should never get called. The API with UnicodeString should be called instead. */
    if (!U_FAILURE(status)) {
        status = U_UNSUPPORTED_ERROR;
    }
    return NULL;
}
#endif
b75a7d8f A |
470 | |
471 | //--------------------------------------------------------------------- | |
472 | // | |
473 | // matcher(status) | |
474 | // | |
475 | //--------------------------------------------------------------------- | |
476 | RegexMatcher *RegexPattern::matcher(UErrorCode &status) const { | |
477 | RegexMatcher *retMatcher = NULL; | |
478 | ||
479 | if (U_FAILURE(status)) { | |
480 | return NULL; | |
481 | } | |
482 | if (U_FAILURE(fDeferredStatus)) { | |
483 | status = fDeferredStatus; | |
484 | return NULL; | |
485 | } | |
486 | ||
46f4442e | 487 | retMatcher = new RegexMatcher(this); |
b75a7d8f A |
488 | if (retMatcher == NULL) { |
489 | status = U_MEMORY_ALLOCATION_ERROR; | |
490 | return NULL; | |
491 | } | |
492 | return retMatcher; | |
73c04bcf | 493 | } |
b75a7d8f A |
494 | |
495 | ||
496 | ||
497 | //--------------------------------------------------------------------- | |
498 | // | |
499 | // matches Convenience function to test for a match, starting | |
500 | // with a pattern string and a data string. | |
501 | // | |
502 | //--------------------------------------------------------------------- | |
374ca955 | 503 | UBool U_EXPORT2 RegexPattern::matches(const UnicodeString ®ex, |
b75a7d8f A |
504 | const UnicodeString &input, |
505 | UParseError &pe, | |
506 | UErrorCode &status) { | |
507 | ||
508 | if (U_FAILURE(status)) {return FALSE;} | |
509 | ||
510 | UBool retVal; | |
511 | RegexPattern *pat = NULL; | |
512 | RegexMatcher *matcher = NULL; | |
513 | ||
514 | pat = RegexPattern::compile(regex, 0, pe, status); | |
515 | matcher = pat->matcher(input, status); | |
516 | retVal = matcher->matches(status); | |
517 | ||
518 | delete matcher; | |
519 | delete pat; | |
520 | return retVal; | |
521 | } | |
522 | ||
523 | ||
729e4ab9 A |
524 | // |
525 | // matches, UText mode | |
526 | // | |
527 | UBool U_EXPORT2 RegexPattern::matches(UText *regex, | |
528 | UText *input, | |
529 | UParseError &pe, | |
530 | UErrorCode &status) { | |
531 | ||
532 | if (U_FAILURE(status)) {return FALSE;} | |
533 | ||
534 | UBool retVal; | |
535 | RegexPattern *pat = NULL; | |
536 | RegexMatcher *matcher = NULL; | |
537 | ||
538 | pat = RegexPattern::compile(regex, 0, pe, status); | |
539 | matcher = pat->matcher(input, PATTERN_IS_UTEXT, status); | |
540 | retVal = matcher->matches(status); | |
541 | ||
542 | delete matcher; | |
543 | delete pat; | |
544 | return retVal; | |
545 | } | |
546 | ||
547 | ||
548 | ||
b75a7d8f A |
549 | |
550 | ||
551 | //--------------------------------------------------------------------- | |
552 | // | |
553 | // pattern | |
554 | // | |
555 | //--------------------------------------------------------------------- | |
556 | UnicodeString RegexPattern::pattern() const { | |
729e4ab9 A |
557 | if (fPatternString != NULL) { |
558 | return *fPatternString; | |
559 | } else if (fPattern == NULL) { | |
560 | return UnicodeString(); | |
561 | } else { | |
562 | UErrorCode status = U_ZERO_ERROR; | |
563 | int64_t nativeLen = utext_nativeLength(fPattern); | |
564 | int32_t len16 = utext_extract(fPattern, 0, nativeLen, NULL, 0, &status); // buffer overflow error | |
565 | UnicodeString result; | |
566 | ||
567 | status = U_ZERO_ERROR; | |
568 | UChar *resultChars = result.getBuffer(len16); | |
569 | utext_extract(fPattern, 0, nativeLen, resultChars, len16, &status); // unterminated warning | |
570 | result.releaseBuffer(len16); | |
571 | ||
572 | return result; | |
573 | } | |
b75a7d8f A |
574 | } |
575 | ||
576 | ||
577 | ||
578 | ||
729e4ab9 A |
579 | //--------------------------------------------------------------------- |
580 | // | |
581 | // patternText | |
582 | // | |
583 | //--------------------------------------------------------------------- | |
584 | UText *RegexPattern::patternText(UErrorCode &status) const { | |
585 | if (U_FAILURE(status)) {return NULL;} | |
586 | status = U_ZERO_ERROR; | |
587 | ||
588 | if (fPattern != NULL) { | |
589 | return fPattern; | |
590 | } else { | |
591 | RegexStaticSets::initGlobals(&status); | |
592 | return RegexStaticSets::gStaticSets->fEmptyText; | |
593 | } | |
594 | } | |
595 | ||
596 | ||
597 | ||
b75a7d8f A |
598 | //--------------------------------------------------------------------- |
599 | // | |
600 | // split | |
601 | // | |
602 | //--------------------------------------------------------------------- | |
603 | int32_t RegexPattern::split(const UnicodeString &input, | |
604 | UnicodeString dest[], | |
605 | int32_t destCapacity, | |
729e4ab9 A |
606 | UErrorCode &status) const |
607 | { | |
608 | if (U_FAILURE(status)) { | |
609 | return 0; | |
610 | }; | |
611 | ||
612 | RegexMatcher m(this); | |
613 | int32_t r = 0; | |
614 | // Check m's status to make sure all is ok. | |
615 | if (U_SUCCESS(m.fDeferredStatus)) { | |
616 | r = m.split(input, dest, destCapacity, status); | |
617 | } | |
618 | return r; | |
619 | } | |
620 | ||
621 | // | |
622 | // split, UText mode | |
623 | // | |
624 | int32_t RegexPattern::split(UText *input, | |
625 | UText *dest[], | |
626 | int32_t destCapacity, | |
627 | UErrorCode &status) const | |
b75a7d8f A |
628 | { |
629 | if (U_FAILURE(status)) { | |
630 | return 0; | |
631 | }; | |
632 | ||
633 | RegexMatcher m(this); | |
46f4442e A |
634 | int32_t r = 0; |
635 | // Check m's status to make sure all is ok. | |
636 | if (U_SUCCESS(m.fDeferredStatus)) { | |
637 | r = m.split(input, dest, destCapacity, status); | |
638 | } | |
b75a7d8f A |
639 | return r; |
640 | } | |
641 | ||
642 | ||
643 | ||
644 | //--------------------------------------------------------------------- | |
645 | // | |
646 | // dump Output the compiled form of the pattern. | |
647 | // Debugging function only. | |
648 | // | |
649 | //--------------------------------------------------------------------- | |
#if defined(REGEX_DEBUG)
// Debugging aid: print one line describing the compiled-pattern operation
// stored at position 'index': its index, raw value, opcode name, and the
// operand rendered according to the opcode type.
void RegexPattern::dumpOp(int32_t index) const {
    static const char * const opNames[] = {URX_OPCODE_NAMES};

    int32_t op      = fCompiledPat->elementAti(index);
    int32_t operand = URX_VAL(op);
    int32_t opType  = URX_TYPE(op);

    // Opcode types with no entry in the name table are shown with name zero.
    int32_t nameIndex = ((uint32_t)opType >= sizeof(opNames)/sizeof(char *)) ? 0 : opType;

    REGEX_DUMP_DEBUG_PRINTF(("%4d %08x %-15s ", index, op, opNames[nameIndex]));
    switch (opType) {
    case URX_NOP:
    case URX_DOTANY:
    case URX_DOTANY_ALL:
    case URX_FAIL:
    case URX_CARET:
    case URX_DOLLAR:
    case URX_BACKSLASH_G:
    case URX_BACKSLASH_X:
    case URX_END:
    case URX_DOLLAR_M:
    case URX_CARET_M:
        // Types with no operand field of interest.
        break;

    case URX_RESERVED_OP:
    case URX_START_CAPTURE:
    case URX_END_CAPTURE:
    case URX_STATE_SAVE:
    case URX_JMP:
    case URX_JMP_SAV:
    case URX_JMP_SAV_X:
    case URX_BACKSLASH_B:
    case URX_BACKSLASH_BU:
    case URX_BACKSLASH_D:
    case URX_BACKSLASH_Z:
    case URX_STRING_LEN:
    case URX_CTR_INIT:
    case URX_CTR_INIT_NG:
    case URX_CTR_LOOP:
    case URX_CTR_LOOP_NG:
    case URX_RELOC_OPRND:
    case URX_STO_SP:
    case URX_LD_SP:
    case URX_BACKREF:
    case URX_STO_INP_LOC:
    case URX_JMPX:
    case URX_LA_START:
    case URX_LA_END:
    case URX_BACKREF_I:
    case URX_LB_START:
    case URX_LB_CONT:
    case URX_LB_END:
    case URX_LBN_CONT:
    case URX_LBN_END:
    case URX_LOOP_C:
    case URX_LOOP_DOT_I:
        // types with an integer operand field.
        REGEX_DUMP_DEBUG_PRINTF(("%d", operand));
        break;

    case URX_ONECHAR:
    case URX_ONECHAR_I:
        REGEX_DUMP_DEBUG_PRINTF(("%c", operand<256?operand:'?'));
        break;

    case URX_STRING:
    case URX_STRING_I:
        {
            // The following op carries the length of the literal string.
            int32_t lengthOp = fCompiledPat->elementAti(index+1);
            U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN);
            int32_t length = URX_VAL(lengthOp);
            int32_t limit  = operand + length;
            for (int32_t pos = operand; pos < limit; pos++) {
                UChar ch = fLiteralText[pos];
                if (ch < 32 || ch >= 256) {ch = '.';}
                REGEX_DUMP_DEBUG_PRINTF(("%c", ch));
            }
        }
        break;

    case URX_SETREF:
    case URX_LOOP_SR_I:
        {
            UnicodeString setPattern;
            UnicodeSet *set = (UnicodeSet *)fSets->elementAt(operand);
            set->toPattern(setPattern, TRUE);
            for (int32_t pos = 0; pos < setPattern.length(); pos++) {
                REGEX_DUMP_DEBUG_PRINTF(("%c", setPattern.charAt(pos)));
            }
        }
        break;

    case URX_STATIC_SETREF:
    case URX_STAT_SETREF_N:
        {
            UnicodeString setPattern;
            if (operand & URX_NEG_SET) {
                REGEX_DUMP_DEBUG_PRINTF(("NOT "));
                operand &= ~URX_NEG_SET;
            }
            UnicodeSet *set = fStaticSets[operand];
            set->toPattern(setPattern, TRUE);
            for (int32_t pos = 0; pos < setPattern.length(); pos++) {
                REGEX_DUMP_DEBUG_PRINTF(("%c", setPattern.charAt(pos)));
            }
        }
        break;

    default:
        REGEX_DUMP_DEBUG_PRINTF(("??????"));
        break;
    }
    REGEX_DUMP_DEBUG_PRINTF(("\n"));
}
#endif
b75a7d8f A |
769 | |
770 | ||
#if defined(REGEX_DEBUG)
// Debugging aid: print the original pattern text, the minimum match length,
// the match start type with its associated data, and then every compiled
// operation (one line each, via dumpOp()).
U_CAPI void U_EXPORT2
RegexPatternDump(const RegexPattern *This) {
    int index;
    int i;

    REGEX_DUMP_DEBUG_PRINTF(("Original Pattern: "));
    // A pattern object may have no pattern text attached; skip the text then.
    if (This->fPattern != NULL) {
        UChar32 c = utext_next32From(This->fPattern, 0);
        while (c != U_SENTINEL) {
            // Same printable range as dumpOp(): anything outside 32..255 prints as '.'.
            if (c<32 || c>=256) {
                c = '.';
            }
            REGEX_DUMP_DEBUG_PRINTF(("%c", c));

            c = UTEXT_NEXT32(This->fPattern);
        }
    }
    REGEX_DUMP_DEBUG_PRINTF(("\n"));
    REGEX_DUMP_DEBUG_PRINTF((" Min Match Length: %d\n", This->fMinMatchLen));
    REGEX_DUMP_DEBUG_PRINTF((" Match Start Type: %s\n", START_OF_MATCH_STR(This->fStartType)));
    if (This->fStartType == START_STRING) {
        REGEX_DUMP_DEBUG_PRINTF((" Initial match string: \""));
        for (i=This->fInitialStringIdx; i<This->fInitialStringIdx+This->fInitialStringLen; i++) {
            REGEX_DUMP_DEBUG_PRINTF(("%c", This->fLiteralText[i])); // TODO: non-printables, surrogates.
        }
        REGEX_DUMP_DEBUG_PRINTF(("\"\n"));

    } else if (This->fStartType == START_SET) {
        // Show at most the first 20 characters of the set.
        int32_t numSetChars = This->fInitialChars->size();
        if (numSetChars > 20) {
            numSetChars = 20;
        }
        REGEX_DUMP_DEBUG_PRINTF((" Match First Chars : "));
        for (i=0; i<numSetChars; i++) {
            UChar32 setChar = This->fInitialChars->charAt(i);
            if (0x20<setChar && setChar <0x7e) {
                REGEX_DUMP_DEBUG_PRINTF(("%c ", setChar));
            } else {
                REGEX_DUMP_DEBUG_PRINTF(("%#x ", setChar));
            }
        }
        if (numSetChars < This->fInitialChars->size()) {
            REGEX_DUMP_DEBUG_PRINTF((" ..."));
        }
        REGEX_DUMP_DEBUG_PRINTF(("\n"));

    } else if (This->fStartType == START_CHAR) {
        REGEX_DUMP_DEBUG_PRINTF((" First char of Match : "));
        if (0x20 < This->fInitialChar && This->fInitialChar<0x7e) {
            REGEX_DUMP_DEBUG_PRINTF(("%c\n", This->fInitialChar));
        } else {
            REGEX_DUMP_DEBUG_PRINTF(("%#x\n", This->fInitialChar));
        }
    }

    REGEX_DUMP_DEBUG_PRINTF(("\nIndex Binary Type Operand\n" \
                             "-------------------------------------------\n"));
    for (index = 0; index<This->fCompiledPat->size(); index++) {
        This->dumpOp(index);
    }
    REGEX_DUMP_DEBUG_PRINTF(("\n\n"));
}
#endif
b75a7d8f A |
833 | |
834 | ||
835 | ||
374ca955 | 836 | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexPattern) |
b75a7d8f A |
837 | |
838 | U_NAMESPACE_END | |
839 | #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS |