]>
Commit | Line | Data |
---|---|---|
374ca955 A |
1 | /* |
2 | ******************************************************************************* | |
46f4442e | 3 | * Copyright (C) 2004-2008, International Business Machines |
374ca955 A |
4 | * Corporation and others. All Rights Reserved. |
5 | ******************************************************************************* | |
6 | * file name: regex.cpp | |
7 | */ | |
8 | ||
9 | #include "unicode/utypes.h" | |
10 | ||
11 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS | |
12 | ||
13 | #include "unicode/regex.h" | |
14 | #include "unicode/uregex.h" | |
15 | #include "unicode/unistr.h" | |
16 | #include "unicode/ustring.h" | |
17 | #include "unicode/uchar.h" | |
18 | #include "unicode/uobject.h" | |
19 | #include "umutex.h" | |
20 | #include "uassert.h" | |
21 | #include "cmemory.h" | |
22 | ||
46f4442e A |
23 | U_NAMESPACE_USE |
24 | ||
374ca955 A |
25 | struct URegularExpression: public UMemory { |
26 | public: | |
27 | URegularExpression(); | |
28 | ~URegularExpression(); | |
29 | int32_t fMagic; | |
30 | RegexPattern *fPat; | |
31 | int32_t *fPatRefCount; | |
32 | UChar *fPatString; | |
33 | int32_t fPatStringLen; | |
34 | RegexMatcher *fMatcher; | |
35 | const UChar *fText; // Text from setText() | |
36 | int32_t fTextLength; // Length provided by user with setText(), which | |
37 | // may be -1. | |
38 | ||
39 | UnicodeString fTextString; // The setText(text) is wrapped into a UnicodeString. | |
40 | // TODO: regexp engine should not depend on UnicodeString. | |
41 | }; | |
42 | ||
43 | static const int32_t REXP_MAGIC = 0x72657870; // "rexp" in ASCII | |
44 | ||
374ca955 A |
45 | URegularExpression::URegularExpression() { |
46 | fMagic = REXP_MAGIC; | |
47 | fPat = NULL; | |
48 | fPatRefCount = NULL; | |
49 | fPatString = NULL; | |
50 | fPatStringLen = 0; | |
51 | fMatcher = NULL; | |
52 | fText = NULL; | |
53 | fTextLength = 0; | |
54 | } | |
55 | ||
56 | URegularExpression::~URegularExpression() { | |
57 | delete fMatcher; | |
58 | fMatcher = NULL; | |
59 | if (fPatRefCount!=NULL && umtx_atomic_dec(fPatRefCount)==0) { | |
60 | delete fPat; | |
61 | uprv_free(fPatString); | |
62 | uprv_free(fPatRefCount); | |
63 | } | |
64 | fMagic = 0; | |
65 | } | |
66 | ||
67 | //---------------------------------------------------------------------------------------- | |
68 | // | |
69 | // validateRE Do boilerplate style checks on API function parameters. | |
70 | // Return TRUE if they look OK. | |
71 | //---------------------------------------------------------------------------------------- | |
72 | static UBool validateRE(const URegularExpression *re, UErrorCode *status, UBool requiresText = TRUE) { | |
73 | if (U_FAILURE(*status)) { | |
74 | return FALSE; | |
75 | } | |
76 | if (re == NULL || re->fMagic != REXP_MAGIC) { | |
374ca955 A |
77 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
78 | return FALSE; | |
79 | } | |
80 | if (requiresText && re->fText == NULL) { | |
81 | *status = U_REGEX_INVALID_STATE; | |
82 | return FALSE; | |
83 | } | |
84 | return TRUE; | |
85 | } | |
86 | ||
87 | //---------------------------------------------------------------------------------------- | |
88 | // | |
89 | // uregex_open | |
90 | // | |
91 | //---------------------------------------------------------------------------------------- | |
92 | U_CAPI URegularExpression * U_EXPORT2 | |
93 | uregex_open( const UChar *pattern, | |
94 | int32_t patternLength, | |
95 | uint32_t flags, | |
96 | UParseError *pe, | |
97 | UErrorCode *status) { | |
98 | ||
99 | if (U_FAILURE(*status)) { | |
100 | return NULL; | |
101 | } | |
102 | if (pattern == NULL || patternLength < -1 || patternLength == 0) { | |
103 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
104 | return NULL; | |
105 | } | |
106 | int32_t actualPatLen = patternLength; | |
107 | if (actualPatLen == -1) { | |
108 | actualPatLen = u_strlen(pattern); | |
109 | } | |
110 | ||
111 | URegularExpression *re = new URegularExpression; | |
112 | int32_t *refC = (int32_t *)uprv_malloc(sizeof(int32_t)); | |
113 | UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1)); | |
114 | if (re == NULL || refC == NULL || patBuf == NULL) { | |
115 | *status = U_MEMORY_ALLOCATION_ERROR; | |
116 | delete re; | |
117 | uprv_free(refC); | |
118 | uprv_free(patBuf); | |
119 | return NULL; | |
120 | } | |
121 | re->fPatRefCount = refC; | |
122 | *re->fPatRefCount = 1; | |
123 | ||
124 | // | |
125 | // Make a copy of the pattern string, so we can return it later if asked. | |
126 | // For compiling the pattern, we will use a read-only-aliased UnicodeString | |
127 | // of this local copy, to avoid making even more copies. | |
128 | // | |
129 | re->fPatString = patBuf; | |
130 | re->fPatStringLen = patternLength; | |
131 | u_memcpy(patBuf, pattern, actualPatLen); | |
132 | patBuf[actualPatLen] = 0; | |
133 | UnicodeString patString(patternLength==-1, patBuf, patternLength); | |
134 | ||
135 | // | |
136 | // Compile the pattern | |
137 | // | |
138 | if (pe != NULL) { | |
139 | re->fPat = RegexPattern::compile(patString, flags, *pe, *status); | |
140 | } else { | |
141 | re->fPat = RegexPattern::compile(patString, flags, *status); | |
142 | } | |
143 | if (U_FAILURE(*status)) { | |
144 | goto ErrorExit; | |
145 | } | |
146 | ||
147 | // | |
148 | // Create the matcher object | |
149 | // | |
150 | re->fMatcher = re->fPat->matcher(*status); | |
151 | if (U_SUCCESS(*status)) { | |
152 | return re; | |
153 | } | |
154 | ||
155 | ErrorExit: | |
156 | delete re; | |
157 | return NULL; | |
158 | ||
159 | } | |
160 | ||
374ca955 A |
161 | //---------------------------------------------------------------------------------------- |
162 | // | |
163 | // uregex_close | |
164 | // | |
165 | //---------------------------------------------------------------------------------------- | |
166 | U_CAPI void U_EXPORT2 | |
167 | uregex_close(URegularExpression *re) { | |
168 | UErrorCode status = U_ZERO_ERROR; | |
169 | if (validateRE(re, &status, FALSE) == FALSE) { | |
170 | return; | |
171 | } | |
172 | delete re; | |
173 | } | |
174 | ||
175 | ||
176 | //---------------------------------------------------------------------------------------- | |
177 | // | |
178 | // uregex_clone | |
179 | // | |
180 | //---------------------------------------------------------------------------------------- | |
181 | U_CAPI URegularExpression * U_EXPORT2 | |
182 | uregex_clone(const URegularExpression *source, UErrorCode *status) { | |
183 | if (validateRE(source, status, FALSE) == FALSE) { | |
184 | return NULL; | |
185 | } | |
186 | ||
187 | URegularExpression *clone = new URegularExpression; | |
188 | if (clone == NULL) { | |
189 | *status = U_MEMORY_ALLOCATION_ERROR; | |
190 | return NULL; | |
191 | } | |
192 | ||
193 | clone->fMatcher = source->fPat->matcher(*status); | |
194 | if (U_FAILURE(*status)) { | |
195 | delete clone; | |
196 | return NULL; | |
197 | } | |
374ca955 A |
198 | |
199 | clone->fPat = source->fPat; | |
200 | clone->fPatRefCount = source->fPatRefCount; | |
201 | clone->fPatString = source->fPatString; | |
202 | clone->fPatStringLen = source->fPatStringLen; | |
203 | umtx_atomic_inc(source->fPatRefCount); | |
204 | // Note: fText is not cloned. | |
205 | ||
206 | return clone; | |
73c04bcf | 207 | } |
374ca955 A |
208 | |
209 | ||
210 | ||
211 | ||
73c04bcf | 212 | //------------------------------------------------------------------------------ |
374ca955 A |
213 | // |
214 | // uregex_pattern | |
215 | // | |
73c04bcf | 216 | //------------------------------------------------------------------------------ |
374ca955 A |
217 | U_CAPI const UChar * U_EXPORT2 |
218 | uregex_pattern(const URegularExpression *regexp, | |
219 | int32_t *patLength, | |
220 | UErrorCode *status) { | |
221 | ||
222 | if (validateRE(regexp, status, FALSE) == FALSE) { | |
223 | return NULL; | |
224 | } | |
225 | if (patLength != NULL) { | |
226 | *patLength = regexp->fPatStringLen; | |
227 | } | |
228 | return regexp->fPatString; | |
73c04bcf | 229 | } |
374ca955 A |
230 | |
231 | ||
73c04bcf | 232 | //------------------------------------------------------------------------------ |
374ca955 A |
233 | // |
234 | // uregex_flags | |
235 | // | |
73c04bcf | 236 | //------------------------------------------------------------------------------ |
374ca955 A |
237 | U_CAPI int32_t U_EXPORT2 |
238 | uregex_flags(const URegularExpression *regexp, UErrorCode *status) { | |
239 | if (validateRE(regexp, status, FALSE) == FALSE) { | |
240 | return 0; | |
241 | } | |
242 | int32_t flags = regexp->fPat->flags(); | |
243 | return flags; | |
73c04bcf | 244 | } |
374ca955 A |
245 | |
246 | ||
73c04bcf | 247 | //------------------------------------------------------------------------------ |
374ca955 A |
248 | // |
249 | // uregex_setText | |
250 | // | |
73c04bcf | 251 | //------------------------------------------------------------------------------ |
374ca955 A |
252 | U_CAPI void U_EXPORT2 |
253 | uregex_setText(URegularExpression *regexp, | |
254 | const UChar *text, | |
255 | int32_t textLength, | |
256 | UErrorCode *status) { | |
257 | if (validateRE(regexp, status, FALSE) == FALSE) { | |
258 | return; | |
259 | } | |
260 | if (text == NULL || textLength < -1) { | |
261 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
262 | return; | |
263 | } | |
264 | regexp->fText = text; | |
265 | regexp->fTextLength = textLength; | |
266 | UBool isTerminated = (textLength == -1); | |
267 | ||
268 | regexp->fTextString.setTo(isTerminated, text, textLength); | |
269 | regexp->fMatcher->reset(regexp->fTextString); | |
73c04bcf | 270 | } |
374ca955 A |
271 | |
272 | ||
273 | ||
73c04bcf | 274 | //------------------------------------------------------------------------------ |
374ca955 A |
275 | // |
276 | // uregex_getText | |
277 | // | |
73c04bcf | 278 | //------------------------------------------------------------------------------ |
374ca955 A |
279 | U_CAPI const UChar * U_EXPORT2 |
280 | uregex_getText(URegularExpression *regexp, | |
281 | int32_t *textLength, | |
282 | UErrorCode *status) { | |
283 | if (validateRE(regexp, status, FALSE) == FALSE) { | |
284 | return NULL; | |
285 | } | |
286 | if (textLength != NULL) { | |
287 | *textLength = regexp->fTextLength; | |
288 | } | |
289 | return regexp->fText; | |
73c04bcf | 290 | } |
374ca955 A |
291 | |
292 | ||
73c04bcf | 293 | //------------------------------------------------------------------------------ |
374ca955 A |
294 | // |
295 | // uregex_matches | |
296 | // | |
73c04bcf | 297 | //------------------------------------------------------------------------------ |
374ca955 A |
298 | U_CAPI UBool U_EXPORT2 |
299 | uregex_matches(URegularExpression *regexp, | |
300 | int32_t startIndex, | |
301 | UErrorCode *status) { | |
46f4442e | 302 | UBool result = FALSE; |
374ca955 | 303 | if (validateRE(regexp, status) == FALSE) { |
46f4442e A |
304 | return result; |
305 | } | |
306 | if (startIndex == -1) { | |
307 | result = regexp->fMatcher->matches(*status); | |
308 | } else { | |
309 | result = regexp->fMatcher->matches(startIndex, *status); | |
374ca955 | 310 | } |
374ca955 | 311 | return result; |
73c04bcf | 312 | } |
374ca955 A |
313 | |
314 | ||
315 | ||
73c04bcf | 316 | //------------------------------------------------------------------------------ |
374ca955 A |
317 | // |
318 | // uregex_lookingAt | |
319 | // | |
73c04bcf | 320 | //------------------------------------------------------------------------------ |
374ca955 A |
321 | U_CAPI UBool U_EXPORT2 |
322 | uregex_lookingAt(URegularExpression *regexp, | |
323 | int32_t startIndex, | |
324 | UErrorCode *status) { | |
46f4442e | 325 | UBool result = FALSE; |
374ca955 | 326 | if (validateRE(regexp, status) == FALSE) { |
46f4442e A |
327 | return result; |
328 | } | |
329 | if (startIndex == -1) { | |
330 | result = regexp->fMatcher->lookingAt(*status); | |
331 | } else { | |
332 | result = regexp->fMatcher->lookingAt(startIndex, *status); | |
374ca955 | 333 | } |
374ca955 | 334 | return result; |
73c04bcf | 335 | } |
374ca955 A |
336 | |
337 | ||
338 | ||
73c04bcf | 339 | //------------------------------------------------------------------------------ |
374ca955 A |
340 | // |
341 | // uregex_find | |
342 | // | |
73c04bcf | 343 | //------------------------------------------------------------------------------ |
374ca955 A |
344 | U_CAPI UBool U_EXPORT2 |
345 | uregex_find(URegularExpression *regexp, | |
346 | int32_t startIndex, | |
347 | UErrorCode *status) { | |
46f4442e | 348 | UBool result = FALSE; |
374ca955 | 349 | if (validateRE(regexp, status) == FALSE) { |
46f4442e A |
350 | return result; |
351 | } | |
352 | if (startIndex == -1) { | |
353 | regexp->fMatcher->resetPreserveRegion(); | |
354 | result = regexp->fMatcher->find(); | |
355 | } else { | |
356 | result = regexp->fMatcher->find(startIndex, *status); | |
374ca955 | 357 | } |
374ca955 | 358 | return result; |
73c04bcf | 359 | } |
374ca955 | 360 | |
73c04bcf | 361 | //------------------------------------------------------------------------------ |
374ca955 A |
362 | // |
363 | // uregex_findNext | |
364 | // | |
73c04bcf | 365 | //------------------------------------------------------------------------------ |
374ca955 A |
366 | U_CAPI UBool U_EXPORT2 |
367 | uregex_findNext(URegularExpression *regexp, | |
368 | UErrorCode *status) { | |
369 | if (validateRE(regexp, status) == FALSE) { | |
370 | return FALSE; | |
371 | } | |
372 | UBool result = regexp->fMatcher->find(); | |
373 | return result; | |
73c04bcf | 374 | } |
374ca955 | 375 | |
73c04bcf | 376 | //------------------------------------------------------------------------------ |
374ca955 A |
377 | // |
378 | // uregex_groupCount | |
379 | // | |
73c04bcf | 380 | //------------------------------------------------------------------------------ |
374ca955 A |
381 | U_CAPI int32_t U_EXPORT2 |
382 | uregex_groupCount(URegularExpression *regexp, | |
383 | UErrorCode *status) { | |
384 | if (validateRE(regexp, status, FALSE) == FALSE) { | |
385 | return 0; | |
386 | } | |
387 | int32_t result = regexp->fMatcher->groupCount(); | |
388 | return result; | |
73c04bcf | 389 | } |
374ca955 A |
390 | |
391 | ||
73c04bcf | 392 | //------------------------------------------------------------------------------ |
374ca955 A |
393 | // |
394 | // uregex_group | |
395 | // | |
73c04bcf | 396 | //------------------------------------------------------------------------------ |
374ca955 A |
397 | U_CAPI int32_t U_EXPORT2 |
398 | uregex_group(URegularExpression *regexp, | |
399 | int32_t groupNum, | |
400 | UChar *dest, | |
401 | int32_t destCapacity, | |
402 | UErrorCode *status) { | |
403 | if (validateRE(regexp, status) == FALSE) { | |
404 | return 0; | |
405 | } | |
406 | if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) { | |
407 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
408 | return 0; | |
409 | } | |
410 | ||
411 | // | |
412 | // Pick up the range of characters from the matcher | |
413 | // | |
414 | int32_t startIx = regexp->fMatcher->start(groupNum, *status); | |
415 | int32_t endIx = regexp->fMatcher->end (groupNum, *status); | |
416 | if (U_FAILURE(*status)) { | |
417 | return 0; | |
418 | } | |
419 | ||
420 | // | |
421 | // Trim length based on buffer capacity | |
422 | // | |
423 | int32_t fullLength = endIx - startIx; | |
424 | int32_t copyLength = fullLength; | |
425 | if (copyLength < destCapacity) { | |
426 | dest[copyLength] = 0; | |
427 | } else if (copyLength == destCapacity) { | |
428 | *status = U_STRING_NOT_TERMINATED_WARNING; | |
429 | } else { | |
430 | copyLength = destCapacity; | |
431 | *status = U_BUFFER_OVERFLOW_ERROR; | |
432 | } | |
433 | ||
434 | // | |
435 | // Copy capture group to user's buffer | |
436 | // | |
437 | if (copyLength > 0) { | |
438 | u_memcpy(dest, ®exp->fText[startIx], copyLength); | |
439 | } | |
440 | return fullLength; | |
73c04bcf | 441 | } |
374ca955 A |
442 | |
443 | ||
73c04bcf | 444 | //------------------------------------------------------------------------------ |
374ca955 A |
445 | // |
446 | // uregex_start | |
447 | // | |
73c04bcf | 448 | //------------------------------------------------------------------------------ |
374ca955 A |
449 | U_CAPI int32_t U_EXPORT2 |
450 | uregex_start(URegularExpression *regexp, | |
451 | int32_t groupNum, | |
452 | UErrorCode *status) { | |
453 | if (validateRE(regexp, status) == FALSE) { | |
454 | return 0; | |
455 | } | |
456 | int32_t result = regexp->fMatcher->start(groupNum, *status); | |
457 | return result; | |
73c04bcf | 458 | } |
374ca955 A |
459 | |
460 | ||
73c04bcf | 461 | //------------------------------------------------------------------------------ |
374ca955 A |
462 | // |
463 | // uregex_end | |
464 | // | |
73c04bcf | 465 | //------------------------------------------------------------------------------ |
374ca955 A |
466 | U_CAPI int32_t U_EXPORT2 |
467 | uregex_end(URegularExpression *regexp, | |
468 | int32_t groupNum, | |
469 | UErrorCode *status) { | |
470 | if (validateRE(regexp, status) == FALSE) { | |
471 | return 0; | |
472 | } | |
473 | int32_t result = regexp->fMatcher->end(groupNum, *status); | |
474 | return result; | |
73c04bcf | 475 | } |
374ca955 | 476 | |
73c04bcf | 477 | //------------------------------------------------------------------------------ |
374ca955 A |
478 | // |
479 | // uregex_reset | |
480 | // | |
73c04bcf | 481 | //------------------------------------------------------------------------------ |
374ca955 A |
482 | U_CAPI void U_EXPORT2 |
483 | uregex_reset(URegularExpression *regexp, | |
484 | int32_t index, | |
485 | UErrorCode *status) { | |
486 | if (validateRE(regexp, status) == FALSE) { | |
487 | return; | |
488 | } | |
489 | regexp->fMatcher->reset(index, *status); | |
73c04bcf | 490 | } |
374ca955 A |
491 | |
492 | ||
46f4442e A |
493 | //------------------------------------------------------------------------------ |
494 | // | |
495 | // uregex_setRegion | |
496 | // | |
497 | //------------------------------------------------------------------------------ | |
498 | U_CAPI void U_EXPORT2 | |
499 | uregex_setRegion(URegularExpression *regexp, | |
500 | int32_t regionStart, | |
501 | int32_t regionLimit, | |
502 | UErrorCode *status) { | |
503 | if (validateRE(regexp, status) == FALSE) { | |
504 | return; | |
505 | } | |
506 | regexp->fMatcher->region(regionStart, regionLimit, *status); | |
507 | } | |
508 | ||
509 | ||
510 | //------------------------------------------------------------------------------ | |
511 | // | |
512 | // uregex_regionStart | |
513 | // | |
514 | //------------------------------------------------------------------------------ | |
515 | U_CAPI int32_t U_EXPORT2 | |
516 | uregex_regionStart(const URegularExpression *regexp, | |
517 | UErrorCode *status) { | |
518 | if (validateRE(regexp, status) == FALSE) { | |
519 | return 0; | |
520 | } | |
521 | return regexp->fMatcher->regionStart(); | |
522 | } | |
523 | ||
524 | ||
525 | //------------------------------------------------------------------------------ | |
526 | // | |
527 | // uregex_regionEnd | |
528 | // | |
529 | //------------------------------------------------------------------------------ | |
530 | U_CAPI int32_t U_EXPORT2 | |
531 | uregex_regionEnd(const URegularExpression *regexp, | |
532 | UErrorCode *status) { | |
533 | if (validateRE(regexp, status) == FALSE) { | |
534 | return 0; | |
535 | } | |
536 | return regexp->fMatcher->regionEnd(); | |
537 | } | |
538 | ||
539 | ||
540 | //------------------------------------------------------------------------------ | |
541 | // | |
542 | // uregex_hasTransparentBounds | |
543 | // | |
544 | //------------------------------------------------------------------------------ | |
545 | U_CAPI UBool U_EXPORT2 | |
546 | uregex_hasTransparentBounds(const URegularExpression *regexp, | |
547 | UErrorCode *status) { | |
548 | if (validateRE(regexp, status) == FALSE) { | |
549 | return FALSE; | |
550 | } | |
551 | return regexp->fMatcher->hasTransparentBounds(); | |
552 | } | |
553 | ||
554 | ||
555 | //------------------------------------------------------------------------------ | |
556 | // | |
557 | // uregex_useTransparentBounds | |
558 | // | |
559 | //------------------------------------------------------------------------------ | |
560 | U_CAPI void U_EXPORT2 | |
561 | uregex_useTransparentBounds(URegularExpression *regexp, | |
562 | UBool b, | |
563 | UErrorCode *status) { | |
564 | if (validateRE(regexp, status) == FALSE) { | |
565 | return; | |
566 | } | |
567 | regexp->fMatcher->useTransparentBounds(b); | |
568 | } | |
569 | ||
570 | ||
571 | //------------------------------------------------------------------------------ | |
572 | // | |
573 | // uregex_hasAnchoringBounds | |
574 | // | |
575 | //------------------------------------------------------------------------------ | |
576 | U_CAPI UBool U_EXPORT2 | |
577 | uregex_hasAnchoringBounds(const URegularExpression *regexp, | |
578 | UErrorCode *status) { | |
579 | if (validateRE(regexp, status) == FALSE) { | |
580 | return FALSE; | |
581 | } | |
582 | return regexp->fMatcher->hasAnchoringBounds(); | |
583 | } | |
584 | ||
585 | ||
586 | //------------------------------------------------------------------------------ | |
587 | // | |
588 | // uregex_useAnchoringBounds | |
589 | // | |
590 | //------------------------------------------------------------------------------ | |
591 | U_CAPI void U_EXPORT2 | |
592 | uregex_useAnchoringBounds(URegularExpression *regexp, | |
593 | UBool b, | |
594 | UErrorCode *status) { | |
595 | if (validateRE(regexp, status) == FALSE) { | |
596 | return; | |
597 | } | |
598 | regexp->fMatcher->useAnchoringBounds(b); | |
599 | } | |
600 | ||
601 | ||
602 | //------------------------------------------------------------------------------ | |
603 | // | |
604 | // uregex_hitEnd | |
605 | // | |
606 | //------------------------------------------------------------------------------ | |
607 | U_CAPI UBool U_EXPORT2 | |
608 | uregex_hitEnd(const URegularExpression *regexp, | |
609 | UErrorCode *status) { | |
610 | if (validateRE(regexp, status) == FALSE) { | |
611 | return FALSE; | |
612 | } | |
613 | return regexp->fMatcher->hitEnd(); | |
614 | } | |
615 | ||
616 | ||
617 | //------------------------------------------------------------------------------ | |
618 | // | |
619 | // uregex_requireEnd | |
620 | // | |
621 | //------------------------------------------------------------------------------ | |
622 | U_CAPI UBool U_EXPORT2 | |
623 | uregex_requireEnd(const URegularExpression *regexp, | |
624 | UErrorCode *status) { | |
625 | if (validateRE(regexp, status) == FALSE) { | |
626 | return FALSE; | |
627 | } | |
628 | return regexp->fMatcher->requireEnd(); | |
629 | } | |
630 | ||
631 | ||
632 | //------------------------------------------------------------------------------ | |
633 | // | |
634 | // uregex_setTimeLimit | |
635 | // | |
636 | //------------------------------------------------------------------------------ | |
637 | U_CAPI void U_EXPORT2 | |
638 | uregex_setTimeLimit(URegularExpression *regexp, | |
639 | int32_t limit, | |
640 | UErrorCode *status) { | |
641 | if (validateRE(regexp, status)) { | |
642 | regexp->fMatcher->setTimeLimit(limit, *status); | |
643 | } | |
644 | } | |
645 | ||
646 | ||
647 | ||
648 | //------------------------------------------------------------------------------ | |
649 | // | |
650 | // uregex_getTimeLimit | |
651 | // | |
652 | //------------------------------------------------------------------------------ | |
653 | U_CAPI int32_t U_EXPORT2 | |
654 | uregex_getTimeLimit(const URegularExpression *regexp, | |
655 | UErrorCode *status) { | |
656 | int32_t retVal = 0; | |
657 | if (validateRE(regexp, status)) { | |
658 | retVal = regexp->fMatcher->getTimeLimit(); | |
659 | } | |
660 | return retVal; | |
661 | } | |
662 | ||
663 | ||
664 | ||
665 | //------------------------------------------------------------------------------ | |
666 | // | |
667 | // uregex_setStackLimit | |
668 | // | |
669 | //------------------------------------------------------------------------------ | |
670 | U_CAPI void U_EXPORT2 | |
671 | uregex_setStackLimit(URegularExpression *regexp, | |
672 | int32_t limit, | |
673 | UErrorCode *status) { | |
674 | if (validateRE(regexp, status)) { | |
675 | regexp->fMatcher->setStackLimit(limit, *status); | |
676 | } | |
677 | } | |
678 | ||
679 | ||
680 | ||
681 | //------------------------------------------------------------------------------ | |
682 | // | |
683 | // uregex_getStackLimit | |
684 | // | |
685 | //------------------------------------------------------------------------------ | |
686 | U_CAPI int32_t U_EXPORT2 | |
687 | uregex_getStackLimit(const URegularExpression *regexp, | |
688 | UErrorCode *status) { | |
689 | int32_t retVal = 0; | |
690 | if (validateRE(regexp, status)) { | |
691 | retVal = regexp->fMatcher->getStackLimit(); | |
692 | } | |
693 | return retVal; | |
694 | } | |
695 | ||
696 | ||
697 | //------------------------------------------------------------------------------ | |
698 | // | |
699 | // uregex_setMatchCallback | |
700 | // | |
701 | //------------------------------------------------------------------------------ | |
702 | U_CAPI void U_EXPORT2 | |
703 | uregex_setMatchCallback(URegularExpression *regexp, | |
704 | URegexMatchCallback *callback, | |
705 | const void *context, | |
706 | UErrorCode *status) { | |
707 | if (validateRE(regexp, status)) { | |
708 | regexp->fMatcher->setMatchCallback(callback, context, *status); | |
709 | } | |
710 | } | |
711 | ||
712 | ||
713 | //------------------------------------------------------------------------------ | |
714 | // | |
715 | // uregex_getMatchCallback | |
716 | // | |
717 | //------------------------------------------------------------------------------ | |
718 | U_CAPI void U_EXPORT2 | |
719 | uregex_getMatchCallback(const URegularExpression *regexp, | |
720 | URegexMatchCallback **callback, | |
721 | const void **context, | |
722 | UErrorCode *status) { | |
723 | if (validateRE(regexp, status)) { | |
724 | regexp->fMatcher->getMatchCallback(*callback, *context, *status); | |
725 | } | |
726 | } | |
727 | ||
728 | ||
73c04bcf | 729 | //------------------------------------------------------------------------------ |
374ca955 A |
730 | // |
731 | // uregex_replaceAll | |
732 | // | |
73c04bcf | 733 | //------------------------------------------------------------------------------ |
374ca955 A |
734 | U_CAPI int32_t U_EXPORT2 |
735 | uregex_replaceAll(URegularExpression *regexp, | |
73c04bcf | 736 | const UChar *replacementText, |
374ca955 A |
737 | int32_t replacementLength, |
738 | UChar *destBuf, | |
739 | int32_t destCapacity, | |
740 | UErrorCode *status) { | |
741 | if (validateRE(regexp, status) == FALSE) { | |
742 | return 0; | |
743 | } | |
744 | if (replacementText == NULL || replacementLength < -1 || | |
745 | destBuf == NULL && destCapacity > 0 || | |
746 | destCapacity < 0) { | |
747 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
748 | return 0; | |
749 | } | |
750 | ||
751 | int32_t len = 0; | |
752 | uregex_reset(regexp, 0, status); | |
753 | while (uregex_findNext(regexp, status)) { | |
754 | len += uregex_appendReplacement(regexp, replacementText, replacementLength, | |
755 | &destBuf, &destCapacity, status); | |
756 | } | |
757 | len += uregex_appendTail(regexp, &destBuf, &destCapacity, status); | |
758 | ||
759 | return len; | |
73c04bcf | 760 | } |
374ca955 A |
761 | |
762 | ||
73c04bcf | 763 | //------------------------------------------------------------------------------ |
374ca955 A |
764 | // |
765 | // uregex_replaceFirst | |
766 | // | |
73c04bcf | 767 | //------------------------------------------------------------------------------ |
374ca955 A |
768 | U_CAPI int32_t U_EXPORT2 |
769 | uregex_replaceFirst(URegularExpression *regexp, | |
73c04bcf | 770 | const UChar *replacementText, |
374ca955 A |
771 | int32_t replacementLength, |
772 | UChar *destBuf, | |
773 | int32_t destCapacity, | |
774 | UErrorCode *status) { | |
775 | if (validateRE(regexp, status) == FALSE) { | |
776 | return 0; | |
777 | } | |
778 | if (replacementText == NULL || replacementLength < -1 || | |
779 | destBuf == NULL && destCapacity > 0 || | |
780 | destCapacity < 0) { | |
781 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
782 | return 0; | |
783 | } | |
784 | ||
785 | int32_t len = 0; | |
786 | UBool findSucceeded; | |
787 | uregex_reset(regexp, 0, status); | |
788 | findSucceeded = uregex_find(regexp, 0, status); | |
789 | if (findSucceeded) { | |
790 | len = uregex_appendReplacement(regexp, replacementText, replacementLength, | |
791 | &destBuf, &destCapacity, status); | |
792 | } | |
793 | len += uregex_appendTail(regexp, &destBuf, &destCapacity, status); | |
794 | ||
795 | return len; | |
73c04bcf | 796 | } |
374ca955 A |
797 | |
798 | ||
73c04bcf | 799 | //------------------------------------------------------------------------------ |
374ca955 A |
800 | // |
801 | // uregex_appendReplacement | |
802 | // | |
73c04bcf | 803 | //------------------------------------------------------------------------------ |
374ca955 A |
804 | |
805 | ||
806 | // | |
807 | // Dummy class, because these functions need to be friends of class RegexMatcher, | |
808 | // and stand-alone C functions don't work as friends | |
809 | // | |
810 | U_NAMESPACE_BEGIN | |
811 | class RegexCImpl { | |
812 | public: | |
813 | inline static int32_t appendReplacement(URegularExpression *regexp, | |
73c04bcf | 814 | const UChar *replacementText, |
374ca955 A |
815 | int32_t replacementLength, |
816 | UChar **destBuf, | |
817 | int32_t *destCapacity, | |
818 | UErrorCode *status); | |
819 | ||
820 | inline static int32_t appendTail(URegularExpression *regexp, | |
821 | UChar **destBuf, | |
822 | int32_t *destCapacity, | |
823 | UErrorCode *status); | |
824 | }; | |
825 | U_NAMESPACE_END | |
826 | ||
827 | ||
828 | // | |
829 | // Call-back function for u_unescapeAt(), used when we encounter | |
830 | // \uxxxx or \Uxxxxxxxxx escapes in the replacement text. | |
831 | // | |
832 | U_CDECL_BEGIN | |
833 | static UChar U_CALLCONV | |
834 | unescape_charAt(int32_t offset, void *context) { | |
835 | UChar c16 = ((UChar *)context)[offset]; | |
836 | return c16; | |
837 | } | |
838 | U_CDECL_END | |
839 | ||
840 | ||
841 | static const UChar BACKSLASH = 0x5c; | |
842 | static const UChar DOLLARSIGN = 0x24; | |
843 | ||
844 | // | |
845 | // Move a character to an output buffer, with bounds checking on the index. | |
846 | // Index advances even if capacity is exceeded, for preflight size computations. | |
847 | // This little sequence is used a LOT. | |
848 | // | |
849 | static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) { | |
850 | if (*idx < bufCapacity) { | |
851 | buf[*idx] = c; | |
852 | } | |
853 | (*idx)++; | |
854 | } | |
855 | ||
856 | ||
857 | // | |
858 | // appendReplacement, the actual implementation. | |
859 | // | |
860 | int32_t RegexCImpl::appendReplacement(URegularExpression *regexp, | |
73c04bcf | 861 | const UChar *replacementText, |
374ca955 A |
862 | int32_t replacementLength, |
863 | UChar **destBuf, | |
864 | int32_t *destCapacity, | |
865 | UErrorCode *status) { | |
866 | ||
867 | // If we come in with a buffer overflow error, don't suppress the operation. | |
868 | // A series of appendReplacements, appendTail need to correctly preflight | |
869 | // the buffer size when an overflow happens somewhere in the middle. | |
870 | UBool pendingBufferOverflow = FALSE; | |
871 | if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity == 0) { | |
872 | pendingBufferOverflow = TRUE; | |
873 | *status = U_ZERO_ERROR; | |
874 | } | |
875 | ||
876 | // | |
877 | // Validate all parameters | |
878 | // | |
879 | if (validateRE(regexp, status) == FALSE) { | |
880 | return 0; | |
881 | } | |
882 | if (replacementText == NULL || replacementLength < -1 || | |
883 | destCapacity == NULL || destBuf == NULL || | |
884 | *destBuf == NULL && *destCapacity > 0 || | |
885 | *destCapacity < 0) { | |
886 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
887 | return 0; | |
888 | } | |
889 | ||
890 | RegexMatcher *m = regexp->fMatcher; | |
891 | if (m->fMatch == FALSE) { | |
892 | *status = U_REGEX_INVALID_STATE; | |
893 | return 0; | |
894 | } | |
895 | ||
896 | UChar *dest = *destBuf; | |
897 | int32_t capacity = *destCapacity; | |
898 | int32_t destIdx = 0; | |
899 | int32_t i; | |
900 | ||
901 | // If it wasn't supplied by the caller, get the length of the replacement text. | |
902 | // TODO: slightly smarter logic in the copy loop could watch for the NUL on | |
903 | // the fly and avoid this step. | |
904 | if (replacementLength == -1) { | |
905 | replacementLength = u_strlen(replacementText); | |
906 | } | |
907 | ||
908 | // Copy input string from the end of previous match to start of current match | |
909 | for (i=m->fLastMatchEnd; i<m->fMatchStart; i++) { | |
910 | appendToBuf(regexp->fText[i], &destIdx, dest, capacity); | |
911 | } | |
912 | ||
913 | ||
914 | ||
915 | // scan the replacement text, looking for substitutions ($n) and \escapes. | |
916 | int32_t replIdx = 0; | |
917 | while (replIdx < replacementLength) { | |
918 | UChar c = replacementText[replIdx]; | |
919 | replIdx++; | |
920 | if (c != DOLLARSIGN && c != BACKSLASH) { | |
921 | // Common case, no substitution, no escaping, | |
922 | // just copy the char to the dest buf. | |
923 | appendToBuf(c, &destIdx, dest, capacity); | |
924 | continue; | |
925 | } | |
926 | ||
927 | if (c == BACKSLASH) { | |
928 | // Backslash Escape. Copy the following char out without further checks. | |
929 | // Note: Surrogate pairs don't need any special handling | |
930 | // The second half wont be a '$' or a '\', and | |
931 | // will move to the dest normally on the next | |
932 | // loop iteration. | |
933 | if (replIdx >= replacementLength) { | |
934 | break; | |
935 | } | |
936 | c = replacementText[replIdx]; | |
937 | ||
938 | if (c==0x55/*U*/ || c==0x75/*u*/) { | |
939 | // We have a \udddd or \Udddddddd escape sequence. | |
940 | UChar32 escapedChar = | |
941 | u_unescapeAt(unescape_charAt, | |
942 | &replIdx, // Index is updated by unescapeAt | |
943 | replacementLength, // Length of replacement text | |
73c04bcf | 944 | (void *)replacementText); |
374ca955 A |
945 | |
946 | if (escapedChar != (UChar32)0xFFFFFFFF) { | |
947 | if (escapedChar <= 0xffff) { | |
948 | appendToBuf((UChar)escapedChar, &destIdx, dest, capacity); | |
949 | } else { | |
950 | appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity); | |
951 | appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity); | |
952 | } | |
953 | continue; | |
954 | } | |
955 | // Note: if the \u escape was invalid, just fall through and | |
956 | // treat it as a plain \<anything> escape. | |
957 | } | |
958 | ||
959 | // Plain backslash escape. Just put out the escaped character. | |
960 | appendToBuf(c, &destIdx, dest, capacity); | |
961 | ||
962 | replIdx++; | |
963 | continue; | |
964 | } | |
965 | ||
966 | ||
967 | ||
968 | // We've got a $. Pick up a capture group number if one follows. | |
969 | // Consume at most the number of digits necessary for the largest capture | |
970 | // number that is valid for this pattern. | |
971 | ||
972 | int32_t numDigits = 0; | |
973 | int32_t groupNum = 0; | |
974 | UChar32 digitC; | |
975 | for (;;) { | |
976 | if (replIdx >= replacementLength) { | |
977 | break; | |
978 | } | |
979 | U16_GET(replacementText, 0, replIdx, replacementLength, digitC); | |
980 | if (u_isdigit(digitC) == FALSE) { | |
981 | break; | |
982 | } | |
983 | ||
984 | U16_FWD_1(replacementText, replIdx, replacementLength); | |
985 | groupNum=groupNum*10 + u_charDigitValue(digitC); | |
986 | numDigits++; | |
987 | if (numDigits >= m->fPattern->fMaxCaptureDigits) { | |
988 | break; | |
989 | } | |
990 | } | |
991 | ||
992 | ||
993 | if (numDigits == 0) { | |
994 | // The $ didn't introduce a group number at all. | |
995 | // Treat it as just part of the substitution text. | |
996 | appendToBuf(DOLLARSIGN, &destIdx, dest, capacity); | |
997 | continue; | |
998 | } | |
999 | ||
1000 | // Finally, append the capture group data to the destination. | |
1001 | int32_t capacityRemaining = capacity - destIdx; | |
1002 | if (capacityRemaining < 0) { | |
1003 | capacityRemaining = 0; | |
1004 | } | |
1005 | destIdx += uregex_group(regexp, groupNum, dest+destIdx, capacityRemaining, status); | |
1006 | if (*status == U_BUFFER_OVERFLOW_ERROR) { | |
1007 | // Ignore buffer overflow when extracting the group. We need to | |
1008 | // continue on to get full size of the untruncated result. We will | |
1009 | // raise our own buffer overflow error at the end. | |
1010 | *status = U_ZERO_ERROR; | |
1011 | } | |
1012 | ||
1013 | if (U_FAILURE(*status)) { | |
1014 | // Can fail if group number is out of range. | |
1015 | break; | |
1016 | } | |
1017 | ||
1018 | } | |
1019 | ||
1020 | // | |
1021 | // Nul Terminate the dest buffer if possible. | |
1022 | // Set the appropriate buffer overflow or not terminated error, if needed. | |
1023 | // | |
1024 | if (destIdx < capacity) { | |
1025 | dest[destIdx] = 0; | |
1026 | } else if (destIdx == *destCapacity) { | |
1027 | *status = U_STRING_NOT_TERMINATED_WARNING; | |
1028 | } else { | |
1029 | *status = U_BUFFER_OVERFLOW_ERROR; | |
1030 | } | |
1031 | ||
1032 | // | |
1033 | // Return an updated dest buffer and capacity to the caller. | |
1034 | // | |
1035 | if (destIdx > 0 && *destCapacity > 0) { | |
1036 | if (destIdx < capacity) { | |
1037 | *destBuf += destIdx; | |
1038 | *destCapacity -= destIdx; | |
1039 | } else { | |
1040 | *destBuf += capacity; | |
1041 | *destCapacity = 0; | |
1042 | } | |
1043 | } | |
1044 | ||
1045 | // If we came in with a buffer overflow, make sure we go out with one also. | |
1046 | // (A zero length match right at the end of the previous match could | |
1047 | // make this function succeed even though a previous call had overflowed the buf) | |
1048 | if (pendingBufferOverflow && U_SUCCESS(*status)) { | |
1049 | *status = U_BUFFER_OVERFLOW_ERROR; | |
1050 | } | |
1051 | ||
1052 | return destIdx; | |
1053 | } | |
1054 | ||
1055 | // | |
1056 | // appendReplacement the acutal API function, | |
1057 | // | |
1058 | U_CAPI int32_t U_EXPORT2 | |
1059 | uregex_appendReplacement(URegularExpression *regexp, | |
73c04bcf | 1060 | const UChar *replacementText, |
374ca955 A |
1061 | int32_t replacementLength, |
1062 | UChar **destBuf, | |
1063 | int32_t *destCapacity, | |
1064 | UErrorCode *status) { | |
1065 | return RegexCImpl::appendReplacement( | |
1066 | regexp, replacementText, replacementLength,destBuf, destCapacity, status); | |
1067 | } | |
1068 | ||
1069 | ||
73c04bcf | 1070 | //------------------------------------------------------------------------------ |
374ca955 A |
1071 | // |
1072 | // uregex_appendTail | |
1073 | // | |
73c04bcf | 1074 | //------------------------------------------------------------------------------ |
374ca955 A |
1075 | int32_t RegexCImpl::appendTail(URegularExpression *regexp, |
1076 | UChar **destBuf, | |
1077 | int32_t *destCapacity, | |
46f4442e A |
1078 | UErrorCode *status) |
1079 | { | |
374ca955 | 1080 | |
46f4442e A |
1081 | if (destCapacity == NULL || destBuf == NULL || |
1082 | *destBuf == NULL && *destCapacity > 0 || | |
1083 | *destCapacity < 0) | |
1084 | { | |
1085 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
1086 | return 0; | |
1087 | } | |
1088 | ||
374ca955 A |
1089 | // If we come in with a buffer overflow error, don't suppress the operation. |
1090 | // A series of appendReplacements, appendTail need to correctly preflight | |
1091 | // the buffer size when an overflow happens somewhere in the middle. | |
1092 | UBool pendingBufferOverflow = FALSE; | |
1093 | if (*status == U_BUFFER_OVERFLOW_ERROR && *destCapacity == 0) { | |
1094 | pendingBufferOverflow = TRUE; | |
1095 | *status = U_ZERO_ERROR; | |
1096 | } | |
1097 | ||
1098 | if (validateRE(regexp, status) == FALSE) { | |
1099 | return 0; | |
1100 | } | |
374ca955 A |
1101 | RegexMatcher *m = regexp->fMatcher; |
1102 | ||
1103 | int32_t srcIdx; | |
1104 | if (m->fMatch) { | |
1105 | // The most recent call to find() succeeded. | |
1106 | srcIdx = m->fMatchEnd; | |
1107 | } else { | |
1108 | // The last call to find() on this matcher failed(). | |
1109 | // Look back to the end of the last find() that succeeded for src index. | |
1110 | srcIdx = m->fLastMatchEnd; | |
1111 | if (srcIdx == -1) { | |
1112 | // There has been no successful match with this matcher. | |
1113 | // We want to copy the whole string. | |
1114 | srcIdx = 0; | |
1115 | } | |
1116 | } | |
1117 | ||
1118 | int32_t destIdx = 0; | |
1119 | int32_t destCap = *destCapacity; | |
1120 | UChar *dest = *destBuf; | |
1121 | ||
1122 | for (;;) { | |
1123 | if (srcIdx == regexp->fTextLength) { | |
1124 | break; | |
1125 | } | |
1126 | UChar c = regexp->fText[srcIdx]; | |
1127 | if (c == 0 && regexp->fTextLength == -1) { | |
1128 | break; | |
1129 | } | |
1130 | if (destIdx < destCap) { | |
1131 | dest[destIdx] = c; | |
1132 | } else { | |
1133 | // We've overflowed the dest buffer. | |
1134 | // If the total input string length is known, we can | |
1135 | // compute the total buffer size needed without scanning through the string. | |
1136 | if (regexp->fTextLength > 0) { | |
1137 | destIdx += (regexp->fTextLength - srcIdx); | |
1138 | break; | |
1139 | } | |
1140 | } | |
1141 | srcIdx++; | |
1142 | destIdx++; | |
1143 | } | |
1144 | ||
1145 | // | |
1146 | // NUL terminate the output string, if possible, otherwise issue the | |
1147 | // appropriate error or warning. | |
1148 | // | |
1149 | if (destIdx < destCap) { | |
1150 | dest[destIdx] = 0; | |
1151 | } else if (destIdx == destCap) { | |
1152 | *status = U_STRING_NOT_TERMINATED_WARNING; | |
1153 | } else { | |
1154 | *status = U_BUFFER_OVERFLOW_ERROR; | |
1155 | } | |
1156 | ||
1157 | // | |
1158 | // Update the user's buffer ptr and capacity vars to reflect the | |
1159 | // amount used. | |
1160 | // | |
1161 | if (destIdx < destCap) { | |
1162 | *destBuf += destIdx; | |
1163 | *destCapacity -= destIdx; | |
1164 | } else { | |
1165 | *destBuf += destCap; | |
1166 | *destCapacity = 0; | |
1167 | } | |
1168 | ||
1169 | if (pendingBufferOverflow && U_SUCCESS(*status)) { | |
1170 | *status = U_BUFFER_OVERFLOW_ERROR; | |
1171 | } | |
1172 | ||
1173 | return destIdx; | |
73c04bcf | 1174 | } |
374ca955 A |
1175 | |
1176 | ||
1177 | U_CAPI int32_t U_EXPORT2 | |
1178 | uregex_appendTail(URegularExpression *regexp, | |
1179 | UChar **destBuf, | |
1180 | int32_t *destCapacity, | |
1181 | UErrorCode *status) { | |
1182 | return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status); | |
1183 | } | |
1184 | ||
1185 | ||
73c04bcf | 1186 | //------------------------------------------------------------------------------ |
374ca955 A |
1187 | // |
1188 | // copyString Internal utility to copy a string to an output buffer, | |
1189 | // while managing buffer overflow and preflight size | |
1190 | // computation. NUL termination is added to destination, | |
1191 | // and the NUL is counted in the output size. | |
1192 | // | |
73c04bcf | 1193 | //------------------------------------------------------------------------------ |
374ca955 A |
1194 | static void copyString(UChar *destBuffer, // Destination buffer. |
1195 | int32_t destCapacity, // Total capacity of dest buffer | |
1196 | int32_t *destIndex, // Index into dest buffer. Updated on return. | |
1197 | // Update not clipped to destCapacity. | |
1198 | const UChar *srcPtr, // Pointer to source string | |
1199 | int32_t srcLen) // Source string len. | |
1200 | { | |
1201 | int32_t si; | |
1202 | int32_t di = *destIndex; | |
1203 | UChar c; | |
1204 | ||
1205 | for (si=0; si<srcLen; si++) { | |
1206 | c = srcPtr[si]; | |
1207 | if (di < destCapacity) { | |
1208 | destBuffer[di] = c; | |
1209 | di++; | |
1210 | } else { | |
1211 | di += srcLen - si; | |
1212 | break; | |
1213 | } | |
1214 | } | |
73c04bcf A |
1215 | if (di<destCapacity) { |
1216 | destBuffer[di] = 0; | |
1217 | } | |
1218 | di++; | |
374ca955 A |
1219 | *destIndex = di; |
1220 | } | |
1221 | ||
1222 | ||
73c04bcf | 1223 | //------------------------------------------------------------------------------ |
374ca955 A |
1224 | // |
1225 | // uregex_split | |
1226 | // | |
73c04bcf | 1227 | //------------------------------------------------------------------------------ |
374ca955 A |
1228 | U_CAPI int32_t U_EXPORT2 |
1229 | uregex_split( URegularExpression *regexp, | |
1230 | UChar *destBuf, | |
1231 | int32_t destCapacity, | |
1232 | int32_t *requiredCapacity, | |
1233 | UChar *destFields[], | |
1234 | int32_t destFieldsCapacity, | |
1235 | UErrorCode *status) { | |
1236 | if (validateRE(regexp, status) == FALSE) { | |
1237 | return 0; | |
1238 | } | |
1239 | if (destBuf == NULL && destCapacity > 0 || | |
1240 | destCapacity < 0 || | |
1241 | destFields == NULL || | |
1242 | destFieldsCapacity < 1 ) { | |
1243 | *status = U_ILLEGAL_ARGUMENT_ERROR; | |
1244 | return 0; | |
1245 | } | |
1246 | ||
1247 | // | |
1248 | // Reset for the input text | |
1249 | // | |
1250 | regexp->fMatcher->reset(); | |
1251 | int32_t inputLen = regexp->fTextString.length(); | |
1252 | int32_t nextOutputStringStart = 0; | |
1253 | if (inputLen == 0) { | |
1254 | return 0; | |
1255 | } | |
1256 | ||
1257 | ||
1258 | // | |
1259 | // Loop through the input text, searching for the delimiter pattern | |
1260 | // | |
1261 | int32_t i; // Index of the field being processed. | |
1262 | int32_t destIdx = 0; // Next available position in destBuf; | |
1263 | int32_t numCaptureGroups = regexp->fMatcher->groupCount(); | |
1264 | for (i=0; ; i++) { | |
1265 | if (i>=destFieldsCapacity-1) { | |
1266 | // There are one or zero output string left. | |
1267 | // Fill the last output string with whatever is left from the input, then exit the loop. | |
1268 | // ( i will be == destFieldsCapacity if we filled the output array while processing | |
1269 | // capture groups of the delimiter expression, in which case we will discard the | |
1270 | // last capture group saved in favor of the unprocessed remainder of the | |
1271 | // input string.) | |
1272 | int32_t remainingLength = inputLen-nextOutputStringStart; | |
1273 | if (remainingLength > 0) { | |
1274 | } | |
1275 | if (i >= destFieldsCapacity) { | |
1276 | // No fields are left. Recycle the last one for holding the trailing part of | |
1277 | // the input string. | |
1278 | i = destFieldsCapacity-1; | |
1279 | destIdx = (int32_t)(destFields[i] - destFields[0]); | |
1280 | } | |
1281 | ||
1282 | destFields[i] = &destBuf[destIdx]; | |
1283 | copyString(destBuf, destCapacity, &destIdx, | |
1284 | ®exp->fText[nextOutputStringStart], remainingLength); | |
1285 | break; | |
1286 | } | |
1287 | ||
1288 | if (regexp->fMatcher->find()) { | |
1289 | // We found another delimiter. Move everything from where we started looking | |
1290 | // up until the start of the delimiter into the next output string. | |
1291 | int32_t fieldLen = regexp->fMatcher->start(*status) - nextOutputStringStart; | |
1292 | destFields[i] = &destBuf[destIdx]; | |
1293 | copyString(destBuf, destCapacity, &destIdx, | |
1294 | ®exp->fText[nextOutputStringStart], fieldLen); | |
1295 | nextOutputStringStart = regexp->fMatcher->end(*status); | |
1296 | ||
1297 | // If the delimiter pattern has capturing parentheses, the captured | |
1298 | // text goes out into the next n destination strings. | |
1299 | int32_t groupNum; | |
1300 | for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { | |
1301 | // If we've run out of output string slots, bail out. | |
1302 | if (i==destFieldsCapacity-1) { | |
1303 | break; | |
1304 | } | |
1305 | i++; | |
1306 | ||
1307 | // Set up to extract the capture group contents into the dest buffer. | |
1308 | UErrorCode tStatus = U_ZERO_ERROR; // Want to ignore any buffer overflow | |
1309 | // error while extracting this group. | |
1310 | int32_t remainingCapacity = destCapacity - destIdx; | |
1311 | if (remainingCapacity < 0) { | |
1312 | remainingCapacity = 0; | |
1313 | } | |
1314 | destFields[i] = &destBuf[destIdx]; | |
1315 | int32_t t = uregex_group(regexp, groupNum, destFields[i], remainingCapacity, &tStatus); | |
1316 | destIdx += t + 1; // Record the space used in the output string buffer. | |
1317 | // +1 for the NUL that terminates the string. | |
1318 | } | |
1319 | ||
1320 | if (nextOutputStringStart == inputLen) { | |
1321 | // The delimiter was at the end of the string. We're done. | |
1322 | break; | |
1323 | } | |
1324 | ||
1325 | } | |
1326 | else | |
1327 | { | |
1328 | // We ran off the end of the input while looking for the next delimiter. | |
1329 | // All the remaining text goes into the current output string. | |
1330 | destFields[i] = &destBuf[destIdx]; | |
1331 | copyString(destBuf, destCapacity, &destIdx, | |
1332 | ®exp->fText[nextOutputStringStart], inputLen-nextOutputStringStart); | |
1333 | break; | |
1334 | } | |
1335 | } | |
1336 | ||
1337 | // Zero out any unused portion of the destFields array | |
1338 | int j; | |
1339 | for (j=i+1; j<destFieldsCapacity; j++) { | |
1340 | destFields[j] = NULL; | |
1341 | } | |
1342 | ||
1343 | if (requiredCapacity != NULL) { | |
1344 | *requiredCapacity = destIdx; | |
1345 | } | |
73c04bcf | 1346 | if (destIdx > destCapacity) { |
374ca955 A |
1347 | *status = U_BUFFER_OVERFLOW_ERROR; |
1348 | } | |
1349 | return i+1; | |
1350 | } | |
1351 | ||
1352 | ||
374ca955 | 1353 | #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS |
73c04bcf | 1354 |