]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/uregex.cpp
ICU-6.2.14.tar.gz
[apple/icu.git] / icuSources / i18n / uregex.cpp
1 /*
2 *******************************************************************************
3 * Copyright (C) 1996-2004, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
 *   file name:  uregex.cpp
7 */
8
9 #include "unicode/utypes.h"
10
11 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
12
13 #include "unicode/regex.h"
14 #include "unicode/uregex.h"
15 #include "unicode/unistr.h"
16 #include "unicode/ustring.h"
17 #include "unicode/uchar.h"
18 #include "unicode/uobject.h"
19 #include "umutex.h"
20 #include "uassert.h"
21 #include "cmemory.h"
22
23 struct URegularExpression: public UMemory {
24 public:
25 URegularExpression();
26 ~URegularExpression();
27 int32_t fMagic;
28 RegexPattern *fPat;
29 int32_t *fPatRefCount;
30 UChar *fPatString;
31 int32_t fPatStringLen;
32 RegexMatcher *fMatcher;
33 const UChar *fText; // Text from setText()
34 int32_t fTextLength; // Length provided by user with setText(), which
35 // may be -1.
36
37 UnicodeString fTextString; // The setText(text) is wrapped into a UnicodeString.
38 // TODO: regexp engine should not depend on UnicodeString.
39 };
40
41 static const int32_t REXP_MAGIC = 0x72657870; // "rexp" in ASCII
42
43 U_NAMESPACE_USE
44
45 URegularExpression::URegularExpression() {
46 fMagic = REXP_MAGIC;
47 fPat = NULL;
48 fPatRefCount = NULL;
49 fPatString = NULL;
50 fPatStringLen = 0;
51 fMatcher = NULL;
52 fText = NULL;
53 fTextLength = 0;
54 }
55
56 URegularExpression::~URegularExpression() {
57 delete fMatcher;
58 fMatcher = NULL;
59 if (fPatRefCount!=NULL && umtx_atomic_dec(fPatRefCount)==0) {
60 delete fPat;
61 uprv_free(fPatString);
62 uprv_free(fPatRefCount);
63 }
64 fMagic = 0;
65 }
66
67 //----------------------------------------------------------------------------------------
68 //
69 // validateRE Do boilerplate style checks on API function parameters.
70 // Return TRUE if they look OK.
71 //----------------------------------------------------------------------------------------
72 static UBool validateRE(const URegularExpression *re, UErrorCode *status, UBool requiresText = TRUE) {
73 if (U_FAILURE(*status)) {
74 return FALSE;
75 }
76 if (re == NULL || re->fMagic != REXP_MAGIC) {
77 // U_ASSERT(FALSE);
78 *status = U_ILLEGAL_ARGUMENT_ERROR;
79 return FALSE;
80 }
81 if (requiresText && re->fText == NULL) {
82 *status = U_REGEX_INVALID_STATE;
83 return FALSE;
84 }
85 return TRUE;
86 }
87
88 //----------------------------------------------------------------------------------------
89 //
90 // uregex_open
91 //
92 //----------------------------------------------------------------------------------------
93 U_CAPI URegularExpression * U_EXPORT2
94 uregex_open( const UChar *pattern,
95 int32_t patternLength,
96 uint32_t flags,
97 UParseError *pe,
98 UErrorCode *status) {
99
100 if (U_FAILURE(*status)) {
101 return NULL;
102 }
103 if (pattern == NULL || patternLength < -1 || patternLength == 0) {
104 *status = U_ILLEGAL_ARGUMENT_ERROR;
105 return NULL;
106 }
107 int32_t actualPatLen = patternLength;
108 if (actualPatLen == -1) {
109 actualPatLen = u_strlen(pattern);
110 }
111
112 URegularExpression *re = new URegularExpression;
113 int32_t *refC = (int32_t *)uprv_malloc(sizeof(int32_t));
114 UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1));
115 if (re == NULL || refC == NULL || patBuf == NULL) {
116 *status = U_MEMORY_ALLOCATION_ERROR;
117 delete re;
118 uprv_free(refC);
119 uprv_free(patBuf);
120 return NULL;
121 }
122 re->fPatRefCount = refC;
123 *re->fPatRefCount = 1;
124
125 //
126 // Make a copy of the pattern string, so we can return it later if asked.
127 // For compiling the pattern, we will use a read-only-aliased UnicodeString
128 // of this local copy, to avoid making even more copies.
129 //
130 re->fPatString = patBuf;
131 re->fPatStringLen = patternLength;
132 u_memcpy(patBuf, pattern, actualPatLen);
133 patBuf[actualPatLen] = 0;
134 UnicodeString patString(patternLength==-1, patBuf, patternLength);
135
136 //
137 // Compile the pattern
138 //
139 if (pe != NULL) {
140 re->fPat = RegexPattern::compile(patString, flags, *pe, *status);
141 } else {
142 re->fPat = RegexPattern::compile(patString, flags, *status);
143 }
144 if (U_FAILURE(*status)) {
145 goto ErrorExit;
146 }
147
148 //
149 // Create the matcher object
150 //
151 re->fMatcher = re->fPat->matcher(*status);
152 if (U_SUCCESS(*status)) {
153 return re;
154 }
155
156 ErrorExit:
157 delete re;
158 return NULL;
159
160 }
161
162
163
164
165 //----------------------------------------------------------------------------------------
166 //
167 // uregex_openC
168 //
169 //----------------------------------------------------------------------------------------
170 U_CAPI URegularExpression * U_EXPORT2
171 uregex_openC( const char *pattern,
172 uint32_t flags,
173 UParseError *pe,
174 UErrorCode *status) {
175 if (U_FAILURE(*status)) {
176 return NULL;
177 }
178 if (pattern == NULL) {
179 *status = U_ILLEGAL_ARGUMENT_ERROR;
180 return NULL;
181 }
182
183 UnicodeString patString(pattern);
184 URegularExpression *re = uregex_open(patString.getBuffer(), patString.length(), flags, pe, status);
185 return re;
186 }
187
188 //----------------------------------------------------------------------------------------
189 //
190 // uregex_close
191 //
192 //----------------------------------------------------------------------------------------
193 U_CAPI void U_EXPORT2
194 uregex_close(URegularExpression *re) {
195 UErrorCode status = U_ZERO_ERROR;
196 if (validateRE(re, &status, FALSE) == FALSE) {
197 return;
198 }
199 delete re;
200 }
201
202
203 //----------------------------------------------------------------------------------------
204 //
205 // uregex_clone
206 //
207 //----------------------------------------------------------------------------------------
208 U_CAPI URegularExpression * U_EXPORT2
209 uregex_clone(const URegularExpression *source, UErrorCode *status) {
210 if (validateRE(source, status, FALSE) == FALSE) {
211 return NULL;
212 }
213
214 URegularExpression *clone = new URegularExpression;
215 if (clone == NULL) {
216 *status = U_MEMORY_ALLOCATION_ERROR;
217 return NULL;
218 }
219
220 clone->fMatcher = source->fPat->matcher(*status);
221 if (U_FAILURE(*status)) {
222 delete clone;
223 return NULL;
224 }
225 if (clone == NULL) {
226 *status = U_MEMORY_ALLOCATION_ERROR;
227 return NULL;
228 }
229
230 clone->fPat = source->fPat;
231 clone->fPatRefCount = source->fPatRefCount;
232 clone->fPatString = source->fPatString;
233 clone->fPatStringLen = source->fPatStringLen;
234 umtx_atomic_inc(source->fPatRefCount);
235 // Note: fText is not cloned.
236
237 return clone;
238 };
239
240
241
242
243 //----------------------------------------------------------------------------------------
244 //
245 // uregex_pattern
246 //
247 //----------------------------------------------------------------------------------------
248 U_CAPI const UChar * U_EXPORT2
249 uregex_pattern(const URegularExpression *regexp,
250 int32_t *patLength,
251 UErrorCode *status) {
252
253 if (validateRE(regexp, status, FALSE) == FALSE) {
254 return NULL;
255 }
256 if (patLength != NULL) {
257 *patLength = regexp->fPatStringLen;
258 }
259 return regexp->fPatString;
260 };
261
262
263 //----------------------------------------------------------------------------------------
264 //
265 // uregex_flags
266 //
267 //----------------------------------------------------------------------------------------
268 U_CAPI int32_t U_EXPORT2
269 uregex_flags(const URegularExpression *regexp, UErrorCode *status) {
270 if (validateRE(regexp, status, FALSE) == FALSE) {
271 return 0;
272 }
273 int32_t flags = regexp->fPat->flags();
274 return flags;
275 };
276
277
278 //----------------------------------------------------------------------------------------
279 //
280 // uregex_setText
281 //
282 //----------------------------------------------------------------------------------------
283 U_CAPI void U_EXPORT2
284 uregex_setText(URegularExpression *regexp,
285 const UChar *text,
286 int32_t textLength,
287 UErrorCode *status) {
288 if (validateRE(regexp, status, FALSE) == FALSE) {
289 return;
290 }
291 if (text == NULL || textLength < -1) {
292 *status = U_ILLEGAL_ARGUMENT_ERROR;
293 return;
294 }
295 regexp->fText = text;
296 regexp->fTextLength = textLength;
297 UBool isTerminated = (textLength == -1);
298
299 regexp->fTextString.setTo(isTerminated, text, textLength);
300 regexp->fMatcher->reset(regexp->fTextString);
301 };
302
303
304
305 //----------------------------------------------------------------------------------------
306 //
307 // uregex_getText
308 //
309 //----------------------------------------------------------------------------------------
310 U_CAPI const UChar * U_EXPORT2
311 uregex_getText(URegularExpression *regexp,
312 int32_t *textLength,
313 UErrorCode *status) {
314 if (validateRE(regexp, status, FALSE) == FALSE) {
315 return NULL;
316 }
317 if (textLength != NULL) {
318 *textLength = regexp->fTextLength;
319 }
320 return regexp->fText;
321 };
322
323
324 //----------------------------------------------------------------------------------------
325 //
326 // uregex_matches
327 //
328 //----------------------------------------------------------------------------------------
329 U_CAPI UBool U_EXPORT2
330 uregex_matches(URegularExpression *regexp,
331 int32_t startIndex,
332 UErrorCode *status) {
333 if (validateRE(regexp, status) == FALSE) {
334 return FALSE;
335 }
336 UBool result = regexp->fMatcher->matches(startIndex, *status);
337 return result;
338 };
339
340
341
342 //----------------------------------------------------------------------------------------
343 //
344 // uregex_lookingAt
345 //
346 //----------------------------------------------------------------------------------------
347 U_CAPI UBool U_EXPORT2
348 uregex_lookingAt(URegularExpression *regexp,
349 int32_t startIndex,
350 UErrorCode *status) {
351 if (validateRE(regexp, status) == FALSE) {
352 return FALSE;
353 }
354 UBool result = regexp->fMatcher->lookingAt(startIndex, *status);
355 return result;
356 };
357
358
359
360 //----------------------------------------------------------------------------------------
361 //
362 // uregex_find
363 //
364 //----------------------------------------------------------------------------------------
365 U_CAPI UBool U_EXPORT2
366 uregex_find(URegularExpression *regexp,
367 int32_t startIndex,
368 UErrorCode *status) {
369 if (validateRE(regexp, status) == FALSE) {
370 return FALSE;
371 }
372 UBool result = regexp->fMatcher->find(startIndex, *status);
373 return result;
374 };
375
376 //----------------------------------------------------------------------------------------
377 //
378 // uregex_findNext
379 //
380 //----------------------------------------------------------------------------------------
381 U_CAPI UBool U_EXPORT2
382 uregex_findNext(URegularExpression *regexp,
383 UErrorCode *status) {
384 if (validateRE(regexp, status) == FALSE) {
385 return FALSE;
386 }
387 UBool result = regexp->fMatcher->find();
388 return result;
389 };
390
391 //----------------------------------------------------------------------------------------
392 //
393 // uregex_groupCount
394 //
395 //----------------------------------------------------------------------------------------
396 U_CAPI int32_t U_EXPORT2
397 uregex_groupCount(URegularExpression *regexp,
398 UErrorCode *status) {
399 if (validateRE(regexp, status, FALSE) == FALSE) {
400 return 0;
401 }
402 int32_t result = regexp->fMatcher->groupCount();
403 return result;
404 };
405
406
407 //----------------------------------------------------------------------------------------
408 //
409 // uregex_group
410 //
411 //----------------------------------------------------------------------------------------
412 U_CAPI int32_t U_EXPORT2
413 uregex_group(URegularExpression *regexp,
414 int32_t groupNum,
415 UChar *dest,
416 int32_t destCapacity,
417 UErrorCode *status) {
418 if (validateRE(regexp, status) == FALSE) {
419 return 0;
420 }
421 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
422 *status = U_ILLEGAL_ARGUMENT_ERROR;
423 return 0;
424 }
425
426 //
427 // Pick up the range of characters from the matcher
428 //
429 int32_t startIx = regexp->fMatcher->start(groupNum, *status);
430 int32_t endIx = regexp->fMatcher->end (groupNum, *status);
431 if (U_FAILURE(*status)) {
432 return 0;
433 }
434
435 //
436 // Trim length based on buffer capacity
437 //
438 int32_t fullLength = endIx - startIx;
439 int32_t copyLength = fullLength;
440 if (copyLength < destCapacity) {
441 dest[copyLength] = 0;
442 } else if (copyLength == destCapacity) {
443 *status = U_STRING_NOT_TERMINATED_WARNING;
444 } else {
445 copyLength = destCapacity;
446 *status = U_BUFFER_OVERFLOW_ERROR;
447 }
448
449 //
450 // Copy capture group to user's buffer
451 //
452 if (copyLength > 0) {
453 u_memcpy(dest, &regexp->fText[startIx], copyLength);
454 }
455 return fullLength;
456 };
457
458
459 //----------------------------------------------------------------------------------------
460 //
461 // uregex_start
462 //
463 //----------------------------------------------------------------------------------------
464 U_CAPI int32_t U_EXPORT2
465 uregex_start(URegularExpression *regexp,
466 int32_t groupNum,
467 UErrorCode *status) {
468 if (validateRE(regexp, status) == FALSE) {
469 return 0;
470 }
471 int32_t result = regexp->fMatcher->start(groupNum, *status);
472 return result;
473 };
474
475
476 //----------------------------------------------------------------------------------------
477 //
478 // uregex_end
479 //
480 //----------------------------------------------------------------------------------------
481 U_CAPI int32_t U_EXPORT2
482 uregex_end(URegularExpression *regexp,
483 int32_t groupNum,
484 UErrorCode *status) {
485 if (validateRE(regexp, status) == FALSE) {
486 return 0;
487 }
488 int32_t result = regexp->fMatcher->end(groupNum, *status);
489 return result;
490 };
491
492 //----------------------------------------------------------------------------------------
493 //
494 // uregex_reset
495 //
496 //----------------------------------------------------------------------------------------
497 U_CAPI void U_EXPORT2
498 uregex_reset(URegularExpression *regexp,
499 int32_t index,
500 UErrorCode *status) {
501 if (validateRE(regexp, status) == FALSE) {
502 return;
503 }
504 regexp->fMatcher->reset(index, *status);
505 };
506
507
508 //----------------------------------------------------------------------------------------
509 //
510 // uregex_replaceAll
511 //
512 //----------------------------------------------------------------------------------------
513 U_CAPI int32_t U_EXPORT2
514 uregex_replaceAll(URegularExpression *regexp,
515 UChar *replacementText,
516 int32_t replacementLength,
517 UChar *destBuf,
518 int32_t destCapacity,
519 UErrorCode *status) {
520 if (validateRE(regexp, status) == FALSE) {
521 return 0;
522 }
523 if (replacementText == NULL || replacementLength < -1 ||
524 destBuf == NULL && destCapacity > 0 ||
525 destCapacity < 0) {
526 *status = U_ILLEGAL_ARGUMENT_ERROR;
527 return 0;
528 }
529
530 int32_t len = 0;
531 uregex_reset(regexp, 0, status);
532 while (uregex_findNext(regexp, status)) {
533 len += uregex_appendReplacement(regexp, replacementText, replacementLength,
534 &destBuf, &destCapacity, status);
535 }
536 len += uregex_appendTail(regexp, &destBuf, &destCapacity, status);
537
538 return len;
539 };
540
541
542 //----------------------------------------------------------------------------------------
543 //
544 // uregex_replaceFirst
545 //
546 //----------------------------------------------------------------------------------------
547 U_CAPI int32_t U_EXPORT2
548 uregex_replaceFirst(URegularExpression *regexp,
549 UChar *replacementText,
550 int32_t replacementLength,
551 UChar *destBuf,
552 int32_t destCapacity,
553 UErrorCode *status) {
554 if (validateRE(regexp, status) == FALSE) {
555 return 0;
556 }
557 if (replacementText == NULL || replacementLength < -1 ||
558 destBuf == NULL && destCapacity > 0 ||
559 destCapacity < 0) {
560 *status = U_ILLEGAL_ARGUMENT_ERROR;
561 return 0;
562 }
563
564 int32_t len = 0;
565 UBool findSucceeded;
566 uregex_reset(regexp, 0, status);
567 findSucceeded = uregex_find(regexp, 0, status);
568 if (findSucceeded) {
569 len = uregex_appendReplacement(regexp, replacementText, replacementLength,
570 &destBuf, &destCapacity, status);
571 }
572 len += uregex_appendTail(regexp, &destBuf, &destCapacity, status);
573
574 return len;
575 };
576
577
578 //----------------------------------------------------------------------------------------
579 //
580 // uregex_appendReplacement
581 //
582 //----------------------------------------------------------------------------------------
583
584
585 //
586 // Dummy class, because these functions need to be friends of class RegexMatcher,
587 // and stand-alone C functions don't work as friends
588 //
589 U_NAMESPACE_BEGIN
590 class RegexCImpl {
591 public:
592 inline static int32_t appendReplacement(URegularExpression *regexp,
593 UChar *replacementText,
594 int32_t replacementLength,
595 UChar **destBuf,
596 int32_t *destCapacity,
597 UErrorCode *status);
598
599 inline static int32_t appendTail(URegularExpression *regexp,
600 UChar **destBuf,
601 int32_t *destCapacity,
602 UErrorCode *status);
603 };
604 U_NAMESPACE_END
605
606
607 //
608 // Call-back function for u_unescapeAt(), used when we encounter
609 // \uxxxx or \Uxxxxxxxxx escapes in the replacement text.
610 //
611 U_CDECL_BEGIN
612 static UChar U_CALLCONV
613 unescape_charAt(int32_t offset, void *context) {
614 UChar c16 = ((UChar *)context)[offset];
615 return c16;
616 }
617 U_CDECL_END
618
619
620 static const UChar BACKSLASH = 0x5c;
621 static const UChar DOLLARSIGN = 0x24;
622
623 //
624 // Move a character to an output buffer, with bounds checking on the index.
625 // Index advances even if capacity is exceeded, for preflight size computations.
626 // This little sequence is used a LOT.
627 //
628 static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) {
629 if (*idx < bufCapacity) {
630 buf[*idx] = c;
631 }
632 (*idx)++;
633 }
634
635
636 //
637 // appendReplacement, the actual implementation.
638 //
639 int32_t RegexCImpl::appendReplacement(URegularExpression *regexp,
640 UChar *replacementText,
641 int32_t replacementLength,
642 UChar **destBuf,
643 int32_t *destCapacity,
644 UErrorCode *status) {
645
646 // If we come in with a buffer overflow error, don't suppress the operation.
647 // A series of appendReplacements, appendTail need to correctly preflight
648 // the buffer size when an overflow happens somewhere in the middle.
649 UBool pendingBufferOverflow = FALSE;
650 if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity == 0) {
651 pendingBufferOverflow = TRUE;
652 *status = U_ZERO_ERROR;
653 }
654
655 //
656 // Validate all paramters
657 //
658 if (validateRE(regexp, status) == FALSE) {
659 return 0;
660 }
661 if (replacementText == NULL || replacementLength < -1 ||
662 destCapacity == NULL || destBuf == NULL ||
663 *destBuf == NULL && *destCapacity > 0 ||
664 *destCapacity < 0) {
665 *status = U_ILLEGAL_ARGUMENT_ERROR;
666 return 0;
667 }
668
669 RegexMatcher *m = regexp->fMatcher;
670 if (m->fMatch == FALSE) {
671 *status = U_REGEX_INVALID_STATE;
672 return 0;
673 }
674
675 UChar *dest = *destBuf;
676 int32_t capacity = *destCapacity;
677 int32_t destIdx = 0;
678 int32_t i;
679
680 // If it wasn't supplied by the caller, get the length of the replacement text.
681 // TODO: slightly smarter logic in the copy loop could watch for the NUL on
682 // the fly and avoid this step.
683 if (replacementLength == -1) {
684 replacementLength = u_strlen(replacementText);
685 }
686
687 // Copy input string from the end of previous match to start of current match
688 for (i=m->fLastMatchEnd; i<m->fMatchStart; i++) {
689 appendToBuf(regexp->fText[i], &destIdx, dest, capacity);
690 }
691
692
693
694 // scan the replacement text, looking for substitutions ($n) and \escapes.
695 int32_t replIdx = 0;
696 while (replIdx < replacementLength) {
697 UChar c = replacementText[replIdx];
698 replIdx++;
699 if (c != DOLLARSIGN && c != BACKSLASH) {
700 // Common case, no substitution, no escaping,
701 // just copy the char to the dest buf.
702 appendToBuf(c, &destIdx, dest, capacity);
703 continue;
704 }
705
706 if (c == BACKSLASH) {
707 // Backslash Escape. Copy the following char out without further checks.
708 // Note: Surrogate pairs don't need any special handling
709 // The second half wont be a '$' or a '\', and
710 // will move to the dest normally on the next
711 // loop iteration.
712 if (replIdx >= replacementLength) {
713 break;
714 }
715 c = replacementText[replIdx];
716
717 if (c==0x55/*U*/ || c==0x75/*u*/) {
718 // We have a \udddd or \Udddddddd escape sequence.
719 UChar32 escapedChar =
720 u_unescapeAt(unescape_charAt,
721 &replIdx, // Index is updated by unescapeAt
722 replacementLength, // Length of replacement text
723 replacementText);
724
725 if (escapedChar != (UChar32)0xFFFFFFFF) {
726 if (escapedChar <= 0xffff) {
727 appendToBuf((UChar)escapedChar, &destIdx, dest, capacity);
728 } else {
729 appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity);
730 appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity);
731 }
732 continue;
733 }
734 // Note: if the \u escape was invalid, just fall through and
735 // treat it as a plain \<anything> escape.
736 }
737
738 // Plain backslash escape. Just put out the escaped character.
739 appendToBuf(c, &destIdx, dest, capacity);
740
741 replIdx++;
742 continue;
743 }
744
745
746
747 // We've got a $. Pick up a capture group number if one follows.
748 // Consume at most the number of digits necessary for the largest capture
749 // number that is valid for this pattern.
750
751 int32_t numDigits = 0;
752 int32_t groupNum = 0;
753 UChar32 digitC;
754 for (;;) {
755 if (replIdx >= replacementLength) {
756 break;
757 }
758 U16_GET(replacementText, 0, replIdx, replacementLength, digitC);
759 if (u_isdigit(digitC) == FALSE) {
760 break;
761 }
762
763 U16_FWD_1(replacementText, replIdx, replacementLength);
764 groupNum=groupNum*10 + u_charDigitValue(digitC);
765 numDigits++;
766 if (numDigits >= m->fPattern->fMaxCaptureDigits) {
767 break;
768 }
769 }
770
771
772 if (numDigits == 0) {
773 // The $ didn't introduce a group number at all.
774 // Treat it as just part of the substitution text.
775 appendToBuf(DOLLARSIGN, &destIdx, dest, capacity);
776 continue;
777 }
778
779 // Finally, append the capture group data to the destination.
780 int32_t capacityRemaining = capacity - destIdx;
781 if (capacityRemaining < 0) {
782 capacityRemaining = 0;
783 }
784 destIdx += uregex_group(regexp, groupNum, dest+destIdx, capacityRemaining, status);
785 if (*status == U_BUFFER_OVERFLOW_ERROR) {
786 // Ignore buffer overflow when extracting the group. We need to
787 // continue on to get full size of the untruncated result. We will
788 // raise our own buffer overflow error at the end.
789 *status = U_ZERO_ERROR;
790 }
791
792 if (U_FAILURE(*status)) {
793 // Can fail if group number is out of range.
794 break;
795 }
796
797 }
798
799 //
800 // Nul Terminate the dest buffer if possible.
801 // Set the appropriate buffer overflow or not terminated error, if needed.
802 //
803 if (destIdx < capacity) {
804 dest[destIdx] = 0;
805 } else if (destIdx == *destCapacity) {
806 *status = U_STRING_NOT_TERMINATED_WARNING;
807 } else {
808 *status = U_BUFFER_OVERFLOW_ERROR;
809 }
810
811 //
812 // Return an updated dest buffer and capacity to the caller.
813 //
814 if (destIdx > 0 && *destCapacity > 0) {
815 if (destIdx < capacity) {
816 *destBuf += destIdx;
817 *destCapacity -= destIdx;
818 } else {
819 *destBuf += capacity;
820 *destCapacity = 0;
821 }
822 }
823
824 // If we came in with a buffer overflow, make sure we go out with one also.
825 // (A zero length match right at the end of the previous match could
826 // make this function succeed even though a previous call had overflowed the buf)
827 if (pendingBufferOverflow && U_SUCCESS(*status)) {
828 *status = U_BUFFER_OVERFLOW_ERROR;
829 }
830
831 return destIdx;
832 }
833
834 //
//   appendReplacement, the actual API function.
836 //
837 U_CAPI int32_t U_EXPORT2
838 uregex_appendReplacement(URegularExpression *regexp,
839 UChar *replacementText,
840 int32_t replacementLength,
841 UChar **destBuf,
842 int32_t *destCapacity,
843 UErrorCode *status) {
844 return RegexCImpl::appendReplacement(
845 regexp, replacementText, replacementLength,destBuf, destCapacity, status);
846 }
847
848
849 //----------------------------------------------------------------------------------------
850 //
851 // uregex_appendTail
852 //
853 //----------------------------------------------------------------------------------------
854 int32_t RegexCImpl::appendTail(URegularExpression *regexp,
855 UChar **destBuf,
856 int32_t *destCapacity,
857 UErrorCode *status) {
858
859 // If we come in with a buffer overflow error, don't suppress the operation.
860 // A series of appendReplacements, appendTail need to correctly preflight
861 // the buffer size when an overflow happens somewhere in the middle.
862 UBool pendingBufferOverflow = FALSE;
863 if (*status == U_BUFFER_OVERFLOW_ERROR && *destCapacity == 0) {
864 pendingBufferOverflow = TRUE;
865 *status = U_ZERO_ERROR;
866 }
867
868 if (validateRE(regexp, status) == FALSE) {
869 return 0;
870 }
871 if (destCapacity == NULL || destBuf == NULL ||
872 *destBuf == NULL && *destCapacity > 0 ||
873 *destCapacity < 0) {
874 *status = U_ILLEGAL_ARGUMENT_ERROR;
875 return 0;
876 }
877
878 RegexMatcher *m = regexp->fMatcher;
879
880 int32_t srcIdx;
881 if (m->fMatch) {
882 // The most recent call to find() succeeded.
883 srcIdx = m->fMatchEnd;
884 } else {
885 // The last call to find() on this matcher failed().
886 // Look back to the end of the last find() that succeeded for src index.
887 srcIdx = m->fLastMatchEnd;
888 if (srcIdx == -1) {
889 // There has been no successful match with this matcher.
890 // We want to copy the whole string.
891 srcIdx = 0;
892 }
893 }
894
895 int32_t destIdx = 0;
896 int32_t destCap = *destCapacity;
897 UChar *dest = *destBuf;
898
899 for (;;) {
900 if (srcIdx == regexp->fTextLength) {
901 break;
902 }
903 UChar c = regexp->fText[srcIdx];
904 if (c == 0 && regexp->fTextLength == -1) {
905 break;
906 }
907 if (destIdx < destCap) {
908 dest[destIdx] = c;
909 } else {
910 // We've overflowed the dest buffer.
911 // If the total input string length is known, we can
912 // compute the total buffer size needed without scanning through the string.
913 if (regexp->fTextLength > 0) {
914 destIdx += (regexp->fTextLength - srcIdx);
915 break;
916 }
917 }
918 srcIdx++;
919 destIdx++;
920 }
921
922 //
923 // NUL terminate the output string, if possible, otherwise issue the
924 // appropriate error or warning.
925 //
926 if (destIdx < destCap) {
927 dest[destIdx] = 0;
928 } else if (destIdx == destCap) {
929 *status = U_STRING_NOT_TERMINATED_WARNING;
930 } else {
931 *status = U_BUFFER_OVERFLOW_ERROR;
932 }
933
934 //
935 // Update the user's buffer ptr and capacity vars to reflect the
936 // amount used.
937 //
938 if (destIdx < destCap) {
939 *destBuf += destIdx;
940 *destCapacity -= destIdx;
941 } else {
942 *destBuf += destCap;
943 *destCapacity = 0;
944 }
945
946 if (pendingBufferOverflow && U_SUCCESS(*status)) {
947 *status = U_BUFFER_OVERFLOW_ERROR;
948 }
949
950 return destIdx;
951 };
952
953
954 U_CAPI int32_t U_EXPORT2
955 uregex_appendTail(URegularExpression *regexp,
956 UChar **destBuf,
957 int32_t *destCapacity,
958 UErrorCode *status) {
959 return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status);
960 }
961
962
963 //----------------------------------------------------------------------------------------
964 //
965 // copyString Internal utility to copy a string to an output buffer,
966 // while managing buffer overflow and preflight size
967 // computation. NUL termination is added to destination,
968 // and the NUL is counted in the output size.
969 //
970 //----------------------------------------------------------------------------------------
971 static void copyString(UChar *destBuffer, // Destination buffer.
972 int32_t destCapacity, // Total capacity of dest buffer
973 int32_t *destIndex, // Index into dest buffer. Updated on return.
974 // Update not clipped to destCapacity.
975 const UChar *srcPtr, // Pointer to source string
976 int32_t srcLen) // Source string len.
977 {
978 int32_t si;
979 int32_t di = *destIndex;
980 UChar c;
981
982 for (si=0; si<srcLen; si++) {
983 c = srcPtr[si];
984 if (di < destCapacity) {
985 destBuffer[di] = c;
986 di++;
987 } else {
988 di += srcLen - si;
989 break;
990 }
991 }
992 destBuffer[di++] = 0;
993 *destIndex = di;
994 }
995
996
997 //----------------------------------------------------------------------------------------
998 //
999 // uregex_split
1000 //
1001 //----------------------------------------------------------------------------------------
1002 U_CAPI int32_t U_EXPORT2
1003 uregex_split( URegularExpression *regexp,
1004 UChar *destBuf,
1005 int32_t destCapacity,
1006 int32_t *requiredCapacity,
1007 UChar *destFields[],
1008 int32_t destFieldsCapacity,
1009 UErrorCode *status) {
1010 if (validateRE(regexp, status) == FALSE) {
1011 return 0;
1012 }
1013 if (destBuf == NULL && destCapacity > 0 ||
1014 destCapacity < 0 ||
1015 destFields == NULL ||
1016 destFieldsCapacity < 1 ) {
1017 *status = U_ILLEGAL_ARGUMENT_ERROR;
1018 return 0;
1019 }
1020
1021 //
1022 // Reset for the input text
1023 //
1024 regexp->fMatcher->reset();
1025 int32_t inputLen = regexp->fTextString.length();
1026 int32_t nextOutputStringStart = 0;
1027 if (inputLen == 0) {
1028 return 0;
1029 }
1030
1031
1032 //
1033 // Loop through the input text, searching for the delimiter pattern
1034 //
1035 int32_t i; // Index of the field being processed.
1036 int32_t destIdx = 0; // Next available position in destBuf;
1037 int32_t numCaptureGroups = regexp->fMatcher->groupCount();
1038 for (i=0; ; i++) {
1039 if (i>=destFieldsCapacity-1) {
1040 // There are one or zero output string left.
1041 // Fill the last output string with whatever is left from the input, then exit the loop.
1042 // ( i will be == destFieldsCapacity if we filled the output array while processing
1043 // capture groups of the delimiter expression, in which case we will discard the
1044 // last capture group saved in favor of the unprocessed remainder of the
1045 // input string.)
1046 int32_t remainingLength = inputLen-nextOutputStringStart;
1047 if (remainingLength > 0) {
1048 }
1049 if (i >= destFieldsCapacity) {
1050 // No fields are left. Recycle the last one for holding the trailing part of
1051 // the input string.
1052 i = destFieldsCapacity-1;
1053 destIdx = (int32_t)(destFields[i] - destFields[0]);
1054 }
1055
1056 destFields[i] = &destBuf[destIdx];
1057 copyString(destBuf, destCapacity, &destIdx,
1058 &regexp->fText[nextOutputStringStart], remainingLength);
1059 break;
1060 }
1061
1062 if (regexp->fMatcher->find()) {
1063 // We found another delimiter. Move everything from where we started looking
1064 // up until the start of the delimiter into the next output string.
1065 int32_t fieldLen = regexp->fMatcher->start(*status) - nextOutputStringStart;
1066 destFields[i] = &destBuf[destIdx];
1067 copyString(destBuf, destCapacity, &destIdx,
1068 &regexp->fText[nextOutputStringStart], fieldLen);
1069 nextOutputStringStart = regexp->fMatcher->end(*status);
1070
1071 // If the delimiter pattern has capturing parentheses, the captured
1072 // text goes out into the next n destination strings.
1073 int32_t groupNum;
1074 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
1075 // If we've run out of output string slots, bail out.
1076 if (i==destFieldsCapacity-1) {
1077 break;
1078 }
1079 i++;
1080
1081 // Set up to extract the capture group contents into the dest buffer.
1082 UErrorCode tStatus = U_ZERO_ERROR; // Want to ignore any buffer overflow
1083 // error while extracting this group.
1084 int32_t remainingCapacity = destCapacity - destIdx;
1085 if (remainingCapacity < 0) {
1086 remainingCapacity = 0;
1087 }
1088 destFields[i] = &destBuf[destIdx];
1089 int32_t t = uregex_group(regexp, groupNum, destFields[i], remainingCapacity, &tStatus);
1090 destIdx += t + 1; // Record the space used in the output string buffer.
1091 // +1 for the NUL that terminates the string.
1092 }
1093
1094 if (nextOutputStringStart == inputLen) {
1095 // The delimiter was at the end of the string. We're done.
1096 break;
1097 }
1098
1099 }
1100 else
1101 {
1102 // We ran off the end of the input while looking for the next delimiter.
1103 // All the remaining text goes into the current output string.
1104 destFields[i] = &destBuf[destIdx];
1105 copyString(destBuf, destCapacity, &destIdx,
1106 &regexp->fText[nextOutputStringStart], inputLen-nextOutputStringStart);
1107 break;
1108 }
1109 }
1110
1111 // Zero out any unused portion of the destFields array
1112 int j;
1113 for (j=i+1; j<destFieldsCapacity; j++) {
1114 destFields[j] = NULL;
1115 }
1116
1117 if (requiredCapacity != NULL) {
1118 *requiredCapacity = destIdx;
1119 }
1120 if (*requiredCapacity > destCapacity) {
1121 *status = U_BUFFER_OVERFLOW_ERROR;
1122 }
1123 return i+1;
1124 }
1125
1126
1127
1128
1129
1130
1131 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS