]>
Commit | Line | Data |
---|---|---|
b75a7d8f | 1 | // |
46f4442e | 2 | // file: repattrn.cpp |
b75a7d8f A |
3 | // |
4 | /* | |
5 | *************************************************************************** | |
2ca993e8 A |
6 | * Copyright (C) 2002-2016 International Business Machines Corporation |
7 | * and others. All rights reserved. | |
b75a7d8f A |
8 | *************************************************************************** |
9 | */ | |
10 | ||
11 | #include "unicode/utypes.h" | |
12 | ||
13 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS | |
14 | ||
15 | #include "unicode/regex.h" | |
374ca955 | 16 | #include "unicode/uclean.h" |
2ca993e8 A |
17 | #include "cmemory.h" |
18 | #include "cstr.h" | |
b75a7d8f | 19 | #include "uassert.h" |
b331163b | 20 | #include "uhash.h" |
b75a7d8f A |
21 | #include "uvector.h" |
22 | #include "uvectr32.h" | |
729e4ab9 | 23 | #include "uvectr64.h" |
b75a7d8f A |
24 | #include "regexcmp.h" |
25 | #include "regeximp.h" | |
26 | #include "regexst.h" | |
27 | ||
28 | U_NAMESPACE_BEGIN | |
29 | ||
30 | //-------------------------------------------------------------------------- | |
31 | // | |
32 | // RegexPattern Default Constructor | |
33 | // | |
34 | //-------------------------------------------------------------------------- | |
35 | RegexPattern::RegexPattern() { | |
36 | // Init all of this instances data. | |
37 | init(); | |
73c04bcf | 38 | } |
b75a7d8f A |
39 | |
40 | ||
41 | //-------------------------------------------------------------------------- | |
42 | // | |
43 | // Copy Constructor Note: This is a rather inefficient implementation, | |
44 | // but it probably doesn't matter. | |
45 | // | |
46 | //-------------------------------------------------------------------------- | |
47 | RegexPattern::RegexPattern(const RegexPattern &other) : UObject(other) { | |
46f4442e | 48 | init(); |
b75a7d8f A |
49 | *this = other; |
50 | } | |
51 | ||
52 | ||
53 | ||
54 | //-------------------------------------------------------------------------- | |
55 | // | |
729e4ab9 | 56 | // Assignment Operator |
b75a7d8f A |
57 | // |
58 | //-------------------------------------------------------------------------- | |
59 | RegexPattern &RegexPattern::operator = (const RegexPattern &other) { | |
60 | if (this == &other) { | |
61 | // Source and destination are the same. Don't do anything. | |
62 | return *this; | |
63 | } | |
64 | ||
65 | // Clean out any previous contents of object being assigned to. | |
66 | zap(); | |
67 | ||
68 | // Give target object a default initialization | |
69 | init(); | |
70 | ||
71 | // Copy simple fields | |
b331163b A |
72 | fDeferredStatus = other.fDeferredStatus; |
73 | ||
74 | if (U_FAILURE(fDeferredStatus)) { | |
75 | return *this; | |
76 | } | |
77 | ||
78 | if (other.fPatternString == NULL) { | |
729e4ab9 | 79 | fPatternString = NULL; |
b331163b | 80 | fPattern = utext_clone(fPattern, other.fPattern, FALSE, TRUE, &fDeferredStatus); |
729e4ab9 A |
81 | } else { |
82 | fPatternString = new UnicodeString(*(other.fPatternString)); | |
b331163b | 83 | if (fPatternString == NULL) { |
729e4ab9 | 84 | fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; |
b331163b A |
85 | } else { |
86 | fPattern = utext_openConstUnicodeString(NULL, fPatternString, &fDeferredStatus); | |
729e4ab9 A |
87 | } |
88 | } | |
b331163b A |
89 | if (U_FAILURE(fDeferredStatus)) { |
90 | return *this; | |
91 | } | |
92 | ||
b75a7d8f A |
93 | fFlags = other.fFlags; |
94 | fLiteralText = other.fLiteralText; | |
b75a7d8f | 95 | fMinMatchLen = other.fMinMatchLen; |
374ca955 A |
96 | fFrameSize = other.fFrameSize; |
97 | fDataSize = other.fDataSize; | |
46f4442e | 98 | fStaticSets = other.fStaticSets; |
374ca955 | 99 | fStaticSets8 = other.fStaticSets8; |
46f4442e | 100 | |
b75a7d8f A |
101 | fStartType = other.fStartType; |
102 | fInitialStringIdx = other.fInitialStringIdx; | |
103 | fInitialStringLen = other.fInitialStringLen; | |
104 | *fInitialChars = *other.fInitialChars; | |
b75a7d8f | 105 | fInitialChar = other.fInitialChar; |
374ca955 | 106 | *fInitialChars8 = *other.fInitialChars8; |
729e4ab9 | 107 | fNeedsAltInput = other.fNeedsAltInput; |
b75a7d8f A |
108 | |
109 | // Copy the pattern. It's just values, nothing deep to copy. | |
110 | fCompiledPat->assign(*other.fCompiledPat, fDeferredStatus); | |
111 | fGroupMap->assign(*other.fGroupMap, fDeferredStatus); | |
112 | ||
46f4442e | 113 | // Copy the Unicode Sets. |
b75a7d8f | 114 | // Could be made more efficient if the sets were reference counted and shared, |
46f4442e | 115 | // but I doubt that pattern copying will be particularly common. |
b75a7d8f A |
116 | // Note: init() already added an empty element zero to fSets |
117 | int32_t i; | |
118 | int32_t numSets = other.fSets->size(); | |
119 | fSets8 = new Regex8BitSet[numSets]; | |
46f4442e A |
120 | if (fSets8 == NULL) { |
121 | fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; | |
122 | return *this; | |
123 | } | |
b75a7d8f A |
124 | for (i=1; i<numSets; i++) { |
125 | if (U_FAILURE(fDeferredStatus)) { | |
126 | return *this; | |
127 | } | |
128 | UnicodeSet *sourceSet = (UnicodeSet *)other.fSets->elementAt(i); | |
129 | UnicodeSet *newSet = new UnicodeSet(*sourceSet); | |
130 | if (newSet == NULL) { | |
131 | fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; | |
132 | break; | |
133 | } | |
134 | fSets->addElement(newSet, fDeferredStatus); | |
135 | fSets8[i] = other.fSets8[i]; | |
136 | } | |
137 | ||
b331163b A |
138 | // Copy the named capture group hash map. |
139 | int32_t hashPos = UHASH_FIRST; | |
140 | while (const UHashElement *hashEl = uhash_nextElement(other.fNamedCaptureMap, &hashPos)) { | |
141 | if (U_FAILURE(fDeferredStatus)) { | |
142 | break; | |
143 | } | |
144 | const UnicodeString *name = (const UnicodeString *)hashEl->key.pointer; | |
145 | UnicodeString *key = new UnicodeString(*name); | |
146 | int32_t val = hashEl->value.integer; | |
147 | if (key == NULL) { | |
148 | fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; | |
149 | } else { | |
150 | uhash_puti(fNamedCaptureMap, key, val, &fDeferredStatus); | |
151 | } | |
152 | } | |
b75a7d8f A |
153 | return *this; |
154 | } | |
155 | ||
156 | ||
157 | //-------------------------------------------------------------------------- | |
158 | // | |
159 | // init Shared initialization for use by constructors. | |
160 | // Bring an uninitialized RegexPattern up to a default state. | |
161 | // | |
162 | //-------------------------------------------------------------------------- | |
163 | void RegexPattern::init() { | |
164 | fFlags = 0; | |
374ca955 A |
165 | fCompiledPat = 0; |
166 | fLiteralText.remove(); | |
167 | fSets = NULL; | |
168 | fSets8 = NULL; | |
b75a7d8f A |
169 | fDeferredStatus = U_ZERO_ERROR; |
170 | fMinMatchLen = 0; | |
b75a7d8f A |
171 | fFrameSize = 0; |
172 | fDataSize = 0; | |
374ca955 | 173 | fGroupMap = NULL; |
374ca955 A |
174 | fStaticSets = NULL; |
175 | fStaticSets8 = NULL; | |
b75a7d8f A |
176 | fStartType = START_NO_INFO; |
177 | fInitialStringIdx = 0; | |
178 | fInitialStringLen = 0; | |
179 | fInitialChars = NULL; | |
b75a7d8f | 180 | fInitialChar = 0; |
374ca955 | 181 | fInitialChars8 = NULL; |
729e4ab9 | 182 | fNeedsAltInput = FALSE; |
b331163b | 183 | fNamedCaptureMap = NULL; |
46f4442e | 184 | |
729e4ab9 A |
185 | fPattern = NULL; // will be set later |
186 | fPatternString = NULL; // may be set later | |
187 | fCompiledPat = new UVector64(fDeferredStatus); | |
b75a7d8f A |
188 | fGroupMap = new UVector32(fDeferredStatus); |
189 | fSets = new UVector(fDeferredStatus); | |
190 | fInitialChars = new UnicodeSet; | |
191 | fInitialChars8 = new Regex8BitSet; | |
b331163b A |
192 | fNamedCaptureMap = uhash_open(uhash_hashUnicodeString, // Key hash function |
193 | uhash_compareUnicodeString, // Key comparator function | |
194 | uhash_compareLong, // Value comparator function | |
195 | &fDeferredStatus); | |
b75a7d8f A |
196 | if (U_FAILURE(fDeferredStatus)) { |
197 | return; | |
198 | } | |
199 | if (fCompiledPat == NULL || fGroupMap == NULL || fSets == NULL || | |
b331163b | 200 | fInitialChars == NULL || fInitialChars8 == NULL || fNamedCaptureMap == NULL) { |
b75a7d8f A |
201 | fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; |
202 | return; | |
203 | } | |
204 | ||
205 | // Slot zero of the vector of sets is reserved. Fill it here. | |
206 | fSets->addElement((int32_t)0, fDeferredStatus); | |
b331163b A |
207 | |
208 | // fNamedCaptureMap owns its key strings, type (UnicodeString *) | |
209 | uhash_setKeyDeleter(fNamedCaptureMap, uprv_deleteUObject); | |
b75a7d8f A |
210 | } |
211 | ||
212 | ||
213 | //-------------------------------------------------------------------------- | |
214 | // | |
46f4442e | 215 | // zap Delete everything owned by this RegexPattern. |
b75a7d8f A |
216 | // |
217 | //-------------------------------------------------------------------------- | |
218 | void RegexPattern::zap() { | |
219 | delete fCompiledPat; | |
220 | fCompiledPat = NULL; | |
221 | int i; | |
222 | for (i=1; i<fSets->size(); i++) { | |
223 | UnicodeSet *s; | |
224 | s = (UnicodeSet *)fSets->elementAt(i); | |
225 | if (s != NULL) { | |
226 | delete s; | |
227 | } | |
228 | } | |
229 | delete fSets; | |
230 | fSets = NULL; | |
374ca955 A |
231 | delete[] fSets8; |
232 | fSets8 = NULL; | |
b75a7d8f A |
233 | delete fGroupMap; |
234 | fGroupMap = NULL; | |
235 | delete fInitialChars; | |
236 | fInitialChars = NULL; | |
237 | delete fInitialChars8; | |
238 | fInitialChars8 = NULL; | |
729e4ab9 A |
239 | if (fPattern != NULL) { |
240 | utext_close(fPattern); | |
241 | fPattern = NULL; | |
242 | } | |
243 | if (fPatternString != NULL) { | |
244 | delete fPatternString; | |
245 | fPatternString = NULL; | |
246 | } | |
b331163b A |
247 | uhash_close(fNamedCaptureMap); |
248 | fNamedCaptureMap = NULL; | |
b75a7d8f A |
249 | } |
250 | ||
251 | ||
252 | //-------------------------------------------------------------------------- | |
253 | // | |
254 | // Destructor | |
255 | // | |
256 | //-------------------------------------------------------------------------- | |
257 | RegexPattern::~RegexPattern() { | |
258 | zap(); | |
73c04bcf | 259 | } |
b75a7d8f A |
260 | |
261 | ||
262 | //-------------------------------------------------------------------------- | |
263 | // | |
264 | // Clone | |
265 | // | |
266 | //-------------------------------------------------------------------------- | |
46f4442e | 267 | RegexPattern *RegexPattern::clone() const { |
b75a7d8f A |
268 | RegexPattern *copy = new RegexPattern(*this); |
269 | return copy; | |
73c04bcf | 270 | } |
b75a7d8f A |
271 | |
272 | ||
273 | //-------------------------------------------------------------------------- | |
274 | // | |
275 | // operator == (comparison) Consider two patterns to be == if the |
276 | // pattern strings and the flags are the same. | |
729e4ab9 A |
277 | // Note that pattern strings with the same |
278 | // characters can still be considered different. | |
b75a7d8f A |
279 | // |
280 | //-------------------------------------------------------------------------- | |
281 | UBool RegexPattern::operator ==(const RegexPattern &other) const { | |
729e4ab9 A |
282 | if (this->fFlags == other.fFlags && this->fDeferredStatus == other.fDeferredStatus) { |
283 | if (this->fPatternString != NULL && other.fPatternString != NULL) { | |
284 | return *(this->fPatternString) == *(other.fPatternString); | |
285 | } else if (this->fPattern == NULL) { | |
286 | if (other.fPattern == NULL) { | |
287 | return TRUE; | |
288 | } | |
289 | } else if (other.fPattern != NULL) { | |
290 | UTEXT_SETNATIVEINDEX(this->fPattern, 0); | |
291 | UTEXT_SETNATIVEINDEX(other.fPattern, 0); | |
292 | return utext_equals(this->fPattern, other.fPattern); | |
293 | } | |
294 | } | |
295 | return FALSE; | |
b75a7d8f A |
296 | } |
297 | ||
298 | //--------------------------------------------------------------------- | |
299 | // | |
46f4442e | 300 | // compile |
b75a7d8f A |
301 | // |
302 | //--------------------------------------------------------------------- | |
374ca955 A |
303 | RegexPattern * U_EXPORT2 |
304 | RegexPattern::compile(const UnicodeString ®ex, | |
305 | uint32_t flags, | |
306 | UParseError &pe, | |
307 | UErrorCode &status) | |
308 | { | |
729e4ab9 A |
309 | if (U_FAILURE(status)) { |
310 | return NULL; | |
311 | } | |
57a6839d | 312 | |
729e4ab9 A |
313 | const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS | |
314 | UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD | | |
315 | UREGEX_ERROR_ON_UNKNOWN_ESCAPES | UREGEX_UNIX_LINES | UREGEX_LITERAL; | |
57a6839d | 316 | |
729e4ab9 A |
317 | if ((flags & ~allFlags) != 0) { |
318 | status = U_REGEX_INVALID_FLAG; | |
319 | return NULL; | |
320 | } | |
57a6839d | 321 | |
4388f060 | 322 | if ((flags & UREGEX_CANON_EQ) != 0) { |
729e4ab9 A |
323 | status = U_REGEX_UNIMPLEMENTED; |
324 | return NULL; | |
325 | } | |
57a6839d | 326 | |
729e4ab9 A |
327 | RegexPattern *This = new RegexPattern; |
328 | if (This == NULL) { | |
329 | status = U_MEMORY_ALLOCATION_ERROR; | |
330 | return NULL; | |
331 | } | |
332 | if (U_FAILURE(This->fDeferredStatus)) { | |
333 | status = This->fDeferredStatus; | |
334 | delete This; | |
335 | return NULL; | |
336 | } | |
337 | This->fFlags = flags; | |
57a6839d | 338 | |
729e4ab9 A |
339 | RegexCompile compiler(This, status); |
340 | compiler.compile(regex, pe, status); | |
57a6839d | 341 | |
729e4ab9 A |
342 | if (U_FAILURE(status)) { |
343 | delete This; | |
344 | This = NULL; | |
345 | } | |
57a6839d | 346 | |
729e4ab9 A |
347 | return This; |
348 | } | |
349 | ||
b75a7d8f | 350 | |
729e4ab9 A |
351 | // |
352 | // compile, UText mode | |
353 | // | |
354 | RegexPattern * U_EXPORT2 | |
355 | RegexPattern::compile(UText *regex, | |
356 | uint32_t flags, | |
357 | UParseError &pe, | |
358 | UErrorCode &status) | |
359 | { | |
b75a7d8f A |
360 | if (U_FAILURE(status)) { |
361 | return NULL; | |
362 | } | |
363 | ||
364 | const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS | | |
46f4442e | 365 | UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD | |
729e4ab9 | 366 | UREGEX_ERROR_ON_UNKNOWN_ESCAPES | UREGEX_UNIX_LINES | UREGEX_LITERAL; |
b75a7d8f A |
367 | |
368 | if ((flags & ~allFlags) != 0) { | |
369 | status = U_REGEX_INVALID_FLAG; | |
370 | return NULL; | |
371 | } | |
372 | ||
4388f060 | 373 | if ((flags & UREGEX_CANON_EQ) != 0) { |
b75a7d8f A |
374 | status = U_REGEX_UNIMPLEMENTED; |
375 | return NULL; | |
376 | } | |
377 | ||
378 | RegexPattern *This = new RegexPattern; | |
379 | if (This == NULL) { | |
380 | status = U_MEMORY_ALLOCATION_ERROR; | |
381 | return NULL; | |
382 | } | |
383 | if (U_FAILURE(This->fDeferredStatus)) { | |
384 | status = This->fDeferredStatus; | |
46f4442e | 385 | delete This; |
b75a7d8f A |
386 | return NULL; |
387 | } | |
388 | This->fFlags = flags; | |
389 | ||
390 | RegexCompile compiler(This, status); | |
391 | compiler.compile(regex, pe, status); | |
57a6839d | 392 | |
46f4442e A |
393 | if (U_FAILURE(status)) { |
394 | delete This; | |
395 | This = NULL; | |
396 | } | |
b75a7d8f A |
397 | |
398 | return This; | |
73c04bcf | 399 | } |
46f4442e | 400 | |
b75a7d8f A |
401 | // |
402 | // compile with default flags. | |
403 | // | |
374ca955 A |
404 | RegexPattern * U_EXPORT2 |
405 | RegexPattern::compile(const UnicodeString ®ex, | |
406 | UParseError &pe, | |
46f4442e | 407 | UErrorCode &err) |
b75a7d8f | 408 | { |
46f4442e | 409 | return compile(regex, 0, pe, err); |
b75a7d8f A |
410 | } |
411 | ||
412 | ||
729e4ab9 A |
413 | // |
414 | // compile with default flags, UText mode | |
415 | // | |
416 | RegexPattern * U_EXPORT2 | |
417 | RegexPattern::compile(UText *regex, | |
418 | UParseError &pe, | |
419 | UErrorCode &err) | |
420 | { | |
421 | return compile(regex, 0, pe, err); | |
422 | } | |
423 | ||
b75a7d8f A |
424 | |
425 | // | |
426 | // compile with no UParseErr parameter. | |
427 | // | |
374ca955 | 428 | RegexPattern * U_EXPORT2 |
729e4ab9 A |
429 | RegexPattern::compile(const UnicodeString ®ex, |
430 | uint32_t flags, | |
431 | UErrorCode &err) | |
b75a7d8f A |
432 | { |
433 | UParseError pe; | |
46f4442e | 434 | return compile(regex, flags, pe, err); |
b75a7d8f A |
435 | } |
436 | ||
437 | ||
729e4ab9 A |
438 | // |
439 | // compile with no UParseErr parameter, UText mode | |
440 | // | |
441 | RegexPattern * U_EXPORT2 | |
442 | RegexPattern::compile(UText *regex, | |
443 | uint32_t flags, | |
444 | UErrorCode &err) | |
445 | { | |
446 | UParseError pe; | |
447 | return compile(regex, flags, pe, err); | |
448 | } | |
449 | ||
b75a7d8f A |
450 | |
451 | //--------------------------------------------------------------------- | |
452 | // | |
453 | // flags | |
454 | // | |
455 | //--------------------------------------------------------------------- | |
456 | uint32_t RegexPattern::flags() const { | |
457 | return fFlags; | |
458 | } | |
459 | ||
460 | ||
461 | //--------------------------------------------------------------------- | |
462 | // | |
463 | // matcher(UnicodeString, err) | |
464 | // | |
465 | //--------------------------------------------------------------------- | |
466 | RegexMatcher *RegexPattern::matcher(const UnicodeString &input, | |
467 | UErrorCode &status) const { | |
468 | RegexMatcher *retMatcher = matcher(status); | |
469 | if (retMatcher != NULL) { | |
729e4ab9 A |
470 | retMatcher->fDeferredStatus = status; |
471 | retMatcher->reset(input); | |
472 | } | |
473 | return retMatcher; | |
474 | } | |
475 | ||
b75a7d8f A |
476 | |
477 | //--------------------------------------------------------------------- | |
478 | // | |
479 | // matcher(status) | |
480 | // | |
481 | //--------------------------------------------------------------------- | |
482 | RegexMatcher *RegexPattern::matcher(UErrorCode &status) const { | |
483 | RegexMatcher *retMatcher = NULL; | |
484 | ||
485 | if (U_FAILURE(status)) { | |
486 | return NULL; | |
487 | } | |
488 | if (U_FAILURE(fDeferredStatus)) { | |
489 | status = fDeferredStatus; | |
490 | return NULL; | |
491 | } | |
492 | ||
46f4442e | 493 | retMatcher = new RegexMatcher(this); |
b75a7d8f A |
494 | if (retMatcher == NULL) { |
495 | status = U_MEMORY_ALLOCATION_ERROR; | |
496 | return NULL; | |
497 | } | |
498 | return retMatcher; | |
73c04bcf | 499 | } |
b75a7d8f A |
500 | |
501 | ||
502 | ||
503 | //--------------------------------------------------------------------- | |
504 | // | |
505 | // matches Convenience function to test for a match, starting | |
506 | // with a pattern string and a data string. | |
507 | // | |
508 | //--------------------------------------------------------------------- | |
374ca955 | 509 | UBool U_EXPORT2 RegexPattern::matches(const UnicodeString ®ex, |
b75a7d8f A |
510 | const UnicodeString &input, |
511 | UParseError &pe, | |
512 | UErrorCode &status) { | |
513 | ||
514 | if (U_FAILURE(status)) {return FALSE;} | |
515 | ||
516 | UBool retVal; | |
517 | RegexPattern *pat = NULL; | |
518 | RegexMatcher *matcher = NULL; | |
519 | ||
520 | pat = RegexPattern::compile(regex, 0, pe, status); | |
521 | matcher = pat->matcher(input, status); | |
522 | retVal = matcher->matches(status); | |
523 | ||
524 | delete matcher; | |
525 | delete pat; | |
526 | return retVal; | |
527 | } | |
528 | ||
529 | ||
729e4ab9 A |
530 | // |
531 | // matches, UText mode | |
532 | // | |
533 | UBool U_EXPORT2 RegexPattern::matches(UText *regex, | |
534 | UText *input, | |
535 | UParseError &pe, | |
536 | UErrorCode &status) { | |
537 | ||
538 | if (U_FAILURE(status)) {return FALSE;} | |
539 | ||
4388f060 | 540 | UBool retVal = FALSE; |
729e4ab9 A |
541 | RegexPattern *pat = NULL; |
542 | RegexMatcher *matcher = NULL; | |
543 | ||
544 | pat = RegexPattern::compile(regex, 0, pe, status); | |
4388f060 A |
545 | matcher = pat->matcher(status); |
546 | if (U_SUCCESS(status)) { | |
547 | matcher->reset(input); | |
548 | retVal = matcher->matches(status); | |
549 | } | |
729e4ab9 A |
550 | |
551 | delete matcher; | |
552 | delete pat; | |
553 | return retVal; | |
554 | } | |
555 | ||
556 | ||
557 | ||
b75a7d8f A |
558 | |
559 | ||
560 | //--------------------------------------------------------------------- | |
561 | // | |
562 | // pattern | |
563 | // | |
564 | //--------------------------------------------------------------------- | |
565 | UnicodeString RegexPattern::pattern() const { | |
729e4ab9 A |
566 | if (fPatternString != NULL) { |
567 | return *fPatternString; | |
568 | } else if (fPattern == NULL) { | |
569 | return UnicodeString(); | |
570 | } else { | |
571 | UErrorCode status = U_ZERO_ERROR; | |
572 | int64_t nativeLen = utext_nativeLength(fPattern); | |
573 | int32_t len16 = utext_extract(fPattern, 0, nativeLen, NULL, 0, &status); // buffer overflow error | |
574 | UnicodeString result; | |
57a6839d | 575 | |
729e4ab9 A |
576 | status = U_ZERO_ERROR; |
577 | UChar *resultChars = result.getBuffer(len16); | |
578 | utext_extract(fPattern, 0, nativeLen, resultChars, len16, &status); // unterminated warning | |
579 | result.releaseBuffer(len16); | |
57a6839d | 580 | |
729e4ab9 A |
581 | return result; |
582 | } | |
b75a7d8f A |
583 | } |
584 | ||
585 | ||
586 | ||
587 | ||
729e4ab9 A |
588 | //--------------------------------------------------------------------- |
589 | // | |
590 | // patternText | |
591 | // | |
592 | //--------------------------------------------------------------------- | |
593 | UText *RegexPattern::patternText(UErrorCode &status) const { | |
594 | if (U_FAILURE(status)) {return NULL;} | |
595 | status = U_ZERO_ERROR; | |
596 | ||
597 | if (fPattern != NULL) { | |
598 | return fPattern; | |
599 | } else { | |
600 | RegexStaticSets::initGlobals(&status); | |
601 | return RegexStaticSets::gStaticSets->fEmptyText; | |
602 | } | |
603 | } | |
604 | ||
605 | ||
b331163b A |
606 | //-------------------------------------------------------------------------------- |
607 | // | |
608 | // groupNumberFromName() | |
609 | // | |
610 | //-------------------------------------------------------------------------------- | |
611 | int32_t RegexPattern::groupNumberFromName(const UnicodeString &groupName, UErrorCode &status) const { | |
612 | if (U_FAILURE(status)) { | |
613 | return 0; | |
614 | } | |
615 | ||
616 | // No need to explicitly check for syntactically valid names. | |
617 | // Invalid ones will never be in the map, and the lookup will fail. | |
618 | ||
619 | int32_t number = uhash_geti(fNamedCaptureMap, &groupName); | |
620 | if (number == 0) { | |
621 | status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; | |
622 | } | |
623 | return number; | |
624 | } | |
625 | ||
626 | int32_t RegexPattern::groupNumberFromName(const char *groupName, int32_t nameLength, UErrorCode &status) const { | |
627 | if (U_FAILURE(status)) { | |
628 | return 0; | |
629 | } | |
630 | UnicodeString name(groupName, nameLength, US_INV); | |
631 | return groupNumberFromName(name, status); | |
632 | } | |
633 | ||
729e4ab9 | 634 | |
b75a7d8f A |
635 | //--------------------------------------------------------------------- |
636 | // | |
637 | // split | |
638 | // | |
639 | //--------------------------------------------------------------------- | |
640 | int32_t RegexPattern::split(const UnicodeString &input, | |
641 | UnicodeString dest[], | |
642 | int32_t destCapacity, | |
729e4ab9 A |
643 | UErrorCode &status) const |
644 | { | |
645 | if (U_FAILURE(status)) { | |
646 | return 0; | |
647 | }; | |
648 | ||
649 | RegexMatcher m(this); | |
650 | int32_t r = 0; | |
651 | // Check m's status to make sure all is ok. | |
652 | if (U_SUCCESS(m.fDeferredStatus)) { | |
653 | r = m.split(input, dest, destCapacity, status); | |
654 | } | |
655 | return r; | |
656 | } | |
657 | ||
658 | // | |
659 | // split, UText mode | |
660 | // | |
661 | int32_t RegexPattern::split(UText *input, | |
662 | UText *dest[], | |
663 | int32_t destCapacity, | |
664 | UErrorCode &status) const | |
b75a7d8f A |
665 | { |
666 | if (U_FAILURE(status)) { | |
667 | return 0; | |
668 | }; | |
669 | ||
670 | RegexMatcher m(this); | |
46f4442e A |
671 | int32_t r = 0; |
672 | // Check m's status to make sure all is ok. | |
673 | if (U_SUCCESS(m.fDeferredStatus)) { | |
674 | r = m.split(input, dest, destCapacity, status); | |
675 | } | |
b75a7d8f A |
676 | return r; |
677 | } | |
678 | ||
679 | ||
b75a7d8f A |
680 | //--------------------------------------------------------------------- |
681 | // | |
682 | // dump Output the compiled form of the pattern. | |
683 | // Debugging function only. | |
684 | // | |
685 | //--------------------------------------------------------------------- | |
374ca955 | 686 | void RegexPattern::dumpOp(int32_t index) const { |
57a6839d A |
687 | (void)index; // Suppress warnings in non-debug build. |
688 | #if defined(REGEX_DEBUG) | |
b75a7d8f A |
689 | static const char * const opNames[] = {URX_OPCODE_NAMES}; |
690 | int32_t op = fCompiledPat->elementAti(index); | |
691 | int32_t val = URX_VAL(op); | |
692 | int32_t type = URX_TYPE(op); | |
693 | int32_t pinnedType = type; | |
2ca993e8 | 694 | if ((uint32_t)pinnedType >= UPRV_LENGTHOF(opNames)) { |
b75a7d8f A |
695 | pinnedType = 0; |
696 | } | |
46f4442e | 697 | |
57a6839d | 698 | printf("%4d %08x %-15s ", index, op, opNames[pinnedType]); |
b75a7d8f A |
699 | switch (type) { |
700 | case URX_NOP: | |
701 | case URX_DOTANY: | |
702 | case URX_DOTANY_ALL: | |
b75a7d8f A |
703 | case URX_FAIL: |
704 | case URX_CARET: | |
705 | case URX_DOLLAR: | |
706 | case URX_BACKSLASH_G: | |
707 | case URX_BACKSLASH_X: | |
708 | case URX_END: | |
709 | case URX_DOLLAR_M: | |
710 | case URX_CARET_M: | |
711 | // Types with no operand field of interest. | |
712 | break; | |
46f4442e | 713 | |
b75a7d8f A |
714 | case URX_RESERVED_OP: |
715 | case URX_START_CAPTURE: | |
716 | case URX_END_CAPTURE: | |
717 | case URX_STATE_SAVE: | |
718 | case URX_JMP: | |
719 | case URX_JMP_SAV: | |
720 | case URX_JMP_SAV_X: | |
721 | case URX_BACKSLASH_B: | |
374ca955 | 722 | case URX_BACKSLASH_BU: |
b75a7d8f A |
723 | case URX_BACKSLASH_D: |
724 | case URX_BACKSLASH_Z: | |
725 | case URX_STRING_LEN: | |
726 | case URX_CTR_INIT: | |
727 | case URX_CTR_INIT_NG: | |
728 | case URX_CTR_LOOP: | |
729 | case URX_CTR_LOOP_NG: | |
730 | case URX_RELOC_OPRND: | |
731 | case URX_STO_SP: | |
732 | case URX_LD_SP: | |
733 | case URX_BACKREF: | |
734 | case URX_STO_INP_LOC: | |
735 | case URX_JMPX: | |
736 | case URX_LA_START: | |
737 | case URX_LA_END: | |
738 | case URX_BACKREF_I: | |
739 | case URX_LB_START: | |
740 | case URX_LB_CONT: | |
741 | case URX_LB_END: | |
742 | case URX_LBN_CONT: | |
743 | case URX_LBN_END: | |
744 | case URX_LOOP_C: | |
745 | case URX_LOOP_DOT_I: | |
b331163b A |
746 | case URX_BACKSLASH_H: |
747 | case URX_BACKSLASH_R: | |
748 | case URX_BACKSLASH_V: | |
b75a7d8f | 749 | // types with an integer operand field. |
57a6839d | 750 | printf("%d", val); |
b75a7d8f | 751 | break; |
46f4442e | 752 | |
b75a7d8f A |
753 | case URX_ONECHAR: |
754 | case URX_ONECHAR_I: | |
2ca993e8 A |
755 | if (val < 0x20) { |
756 | printf("%#x", val); | |
757 | } else { | |
758 | printf("'%s'", CStr(UnicodeString(val))()); | |
759 | } | |
b75a7d8f | 760 | break; |
46f4442e | 761 | |
b75a7d8f A |
762 | case URX_STRING: |
763 | case URX_STRING_I: | |
764 | { | |
765 | int32_t lengthOp = fCompiledPat->elementAti(index+1); | |
766 | U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN); | |
767 | int32_t length = URX_VAL(lengthOp); | |
2ca993e8 A |
768 | UnicodeString str(fLiteralText, val, length); |
769 | printf("%s", CStr(str)()); | |
b75a7d8f A |
770 | } |
771 | break; | |
772 | ||
773 | case URX_SETREF: | |
774 | case URX_LOOP_SR_I: | |
775 | { | |
776 | UnicodeString s; | |
777 | UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val); | |
778 | set->toPattern(s, TRUE); | |
2ca993e8 | 779 | printf("%s", CStr(s)()); |
b75a7d8f A |
780 | } |
781 | break; | |
782 | ||
783 | case URX_STATIC_SETREF: | |
784 | case URX_STAT_SETREF_N: | |
785 | { | |
786 | UnicodeString s; | |
787 | if (val & URX_NEG_SET) { | |
57a6839d | 788 | printf("NOT "); |
b75a7d8f A |
789 | val &= ~URX_NEG_SET; |
790 | } | |
791 | UnicodeSet *set = fStaticSets[val]; | |
792 | set->toPattern(s, TRUE); | |
2ca993e8 | 793 | printf("%s", CStr(s)()); |
b75a7d8f A |
794 | } |
795 | break; | |
796 | ||
46f4442e | 797 | |
b75a7d8f | 798 | default: |
57a6839d | 799 | printf("??????"); |
b75a7d8f A |
800 | break; |
801 | } | |
57a6839d | 802 | printf("\n"); |
374ca955 | 803 | #endif |
57a6839d | 804 | } |
b75a7d8f A |
805 | |
806 | ||
57a6839d | 807 | void RegexPattern::dumpPattern() const { |
b75a7d8f A |
808 | #if defined(REGEX_DEBUG) |
809 | int index; | |
57a6839d | 810 | |
2ca993e8 A |
811 | UnicodeString patStr; |
812 | for (UChar32 c = utext_next32From(fPattern, 0); c != U_SENTINEL; c = utext_next32(fPattern)) { | |
813 | patStr.append(c); | |
57a6839d | 814 | } |
2ca993e8 | 815 | printf("Original Pattern: \"%s\"\n", CStr(patStr)()); |
57a6839d A |
816 | printf(" Min Match Length: %d\n", fMinMatchLen); |
817 | printf(" Match Start Type: %s\n", START_OF_MATCH_STR(fStartType)); | |
818 | if (fStartType == START_STRING) { | |
2ca993e8 A |
819 | UnicodeString initialString(fLiteralText,fInitialStringIdx, fInitialStringLen); |
820 | printf(" Initial match string: \"%s\"\n", CStr(initialString)()); | |
57a6839d | 821 | } else if (fStartType == START_SET) { |
2ca993e8 A |
822 | UnicodeString s; |
823 | fInitialChars->toPattern(s, TRUE); | |
824 | printf(" Match First Chars: %s\n", CStr(s)()); | |
b75a7d8f | 825 | |
57a6839d | 826 | } else if (fStartType == START_CHAR) { |
2ca993e8 A |
827 | printf(" First char of Match: "); |
828 | if (fInitialChar > 0x20) { | |
829 | printf("'%s'\n", CStr(UnicodeString(fInitialChar))()); | |
b75a7d8f | 830 | } else { |
57a6839d | 831 | printf("%#x\n", fInitialChar); |
b75a7d8f A |
832 | } |
833 | } | |
834 | ||
b331163b A |
835 | printf("Named Capture Groups:\n"); |
836 | if (uhash_count(fNamedCaptureMap) == 0) { | |
837 | printf(" None\n"); | |
838 | } else { | |
839 | int32_t pos = UHASH_FIRST; | |
840 | const UHashElement *el = NULL; | |
841 | while ((el = uhash_nextElement(fNamedCaptureMap, &pos))) { | |
842 | const UnicodeString *name = (const UnicodeString *)el->key.pointer; | |
b331163b | 843 | int32_t number = el->value.integer; |
2ca993e8 | 844 | printf(" %d\t%s\n", number, CStr(*name)()); |
b331163b A |
845 | } |
846 | } | |
847 | ||
57a6839d A |
848 | printf("\nIndex Binary Type Operand\n" \ |
849 | "-------------------------------------------\n"); | |
850 | for (index = 0; index<fCompiledPat->size(); index++) { | |
851 | dumpOp(index); | |
b75a7d8f | 852 | } |
57a6839d | 853 | printf("\n\n"); |
374ca955 | 854 | #endif |
57a6839d | 855 | } |
b75a7d8f A |
856 | |
857 | ||
858 | ||
374ca955 | 859 | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexPattern) |
b75a7d8f A |
860 | |
861 | U_NAMESPACE_END | |
862 | #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS |