]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/uregex.cpp
ICU-400.42.tar.gz
[apple/icu.git] / icuSources / i18n / uregex.cpp
1 /*
2 *******************************************************************************
3 * Copyright (C) 2004-2008, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * file name: uregex.cpp
7 */
8
9 #include "unicode/utypes.h"
10
11 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
12
13 #include "unicode/regex.h"
14 #include "unicode/uregex.h"
15 #include "unicode/unistr.h"
16 #include "unicode/ustring.h"
17 #include "unicode/uchar.h"
18 #include "unicode/uobject.h"
19 #include "umutex.h"
20 #include "uassert.h"
21 #include "cmemory.h"
22
23 U_NAMESPACE_USE
24
25 struct URegularExpression: public UMemory {
26 public:
27 URegularExpression();
28 ~URegularExpression();
29 int32_t fMagic;
30 RegexPattern *fPat;
31 int32_t *fPatRefCount;
32 UChar *fPatString;
33 int32_t fPatStringLen;
34 RegexMatcher *fMatcher;
35 const UChar *fText; // Text from setText()
36 int32_t fTextLength; // Length provided by user with setText(), which
37 // may be -1.
38
39 UnicodeString fTextString; // The setText(text) is wrapped into a UnicodeString.
40 // TODO: regexp engine should not depend on UnicodeString.
41 };
42
43 static const int32_t REXP_MAGIC = 0x72657870; // "rexp" in ASCII
44
45 URegularExpression::URegularExpression() {
46 fMagic = REXP_MAGIC;
47 fPat = NULL;
48 fPatRefCount = NULL;
49 fPatString = NULL;
50 fPatStringLen = 0;
51 fMatcher = NULL;
52 fText = NULL;
53 fTextLength = 0;
54 }
55
56 URegularExpression::~URegularExpression() {
57 delete fMatcher;
58 fMatcher = NULL;
59 if (fPatRefCount!=NULL && umtx_atomic_dec(fPatRefCount)==0) {
60 delete fPat;
61 uprv_free(fPatString);
62 uprv_free(fPatRefCount);
63 }
64 fMagic = 0;
65 }
66
67 //----------------------------------------------------------------------------------------
68 //
69 // validateRE Do boilerplate style checks on API function parameters.
70 // Return TRUE if they look OK.
71 //----------------------------------------------------------------------------------------
72 static UBool validateRE(const URegularExpression *re, UErrorCode *status, UBool requiresText = TRUE) {
73 if (U_FAILURE(*status)) {
74 return FALSE;
75 }
76 if (re == NULL || re->fMagic != REXP_MAGIC) {
77 *status = U_ILLEGAL_ARGUMENT_ERROR;
78 return FALSE;
79 }
80 if (requiresText && re->fText == NULL) {
81 *status = U_REGEX_INVALID_STATE;
82 return FALSE;
83 }
84 return TRUE;
85 }
86
87 //----------------------------------------------------------------------------------------
88 //
89 // uregex_open
90 //
91 //----------------------------------------------------------------------------------------
92 U_CAPI URegularExpression * U_EXPORT2
93 uregex_open( const UChar *pattern,
94 int32_t patternLength,
95 uint32_t flags,
96 UParseError *pe,
97 UErrorCode *status) {
98
99 if (U_FAILURE(*status)) {
100 return NULL;
101 }
102 if (pattern == NULL || patternLength < -1 || patternLength == 0) {
103 *status = U_ILLEGAL_ARGUMENT_ERROR;
104 return NULL;
105 }
106 int32_t actualPatLen = patternLength;
107 if (actualPatLen == -1) {
108 actualPatLen = u_strlen(pattern);
109 }
110
111 URegularExpression *re = new URegularExpression;
112 int32_t *refC = (int32_t *)uprv_malloc(sizeof(int32_t));
113 UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1));
114 if (re == NULL || refC == NULL || patBuf == NULL) {
115 *status = U_MEMORY_ALLOCATION_ERROR;
116 delete re;
117 uprv_free(refC);
118 uprv_free(patBuf);
119 return NULL;
120 }
121 re->fPatRefCount = refC;
122 *re->fPatRefCount = 1;
123
124 //
125 // Make a copy of the pattern string, so we can return it later if asked.
126 // For compiling the pattern, we will use a read-only-aliased UnicodeString
127 // of this local copy, to avoid making even more copies.
128 //
129 re->fPatString = patBuf;
130 re->fPatStringLen = patternLength;
131 u_memcpy(patBuf, pattern, actualPatLen);
132 patBuf[actualPatLen] = 0;
133 UnicodeString patString(patternLength==-1, patBuf, patternLength);
134
135 //
136 // Compile the pattern
137 //
138 if (pe != NULL) {
139 re->fPat = RegexPattern::compile(patString, flags, *pe, *status);
140 } else {
141 re->fPat = RegexPattern::compile(patString, flags, *status);
142 }
143 if (U_FAILURE(*status)) {
144 goto ErrorExit;
145 }
146
147 //
148 // Create the matcher object
149 //
150 re->fMatcher = re->fPat->matcher(*status);
151 if (U_SUCCESS(*status)) {
152 return re;
153 }
154
155 ErrorExit:
156 delete re;
157 return NULL;
158
159 }
160
161 //----------------------------------------------------------------------------------------
162 //
163 // uregex_close
164 //
165 //----------------------------------------------------------------------------------------
166 U_CAPI void U_EXPORT2
167 uregex_close(URegularExpression *re) {
168 UErrorCode status = U_ZERO_ERROR;
169 if (validateRE(re, &status, FALSE) == FALSE) {
170 return;
171 }
172 delete re;
173 }
174
175
176 //----------------------------------------------------------------------------------------
177 //
178 // uregex_clone
179 //
180 //----------------------------------------------------------------------------------------
181 U_CAPI URegularExpression * U_EXPORT2
182 uregex_clone(const URegularExpression *source, UErrorCode *status) {
183 if (validateRE(source, status, FALSE) == FALSE) {
184 return NULL;
185 }
186
187 URegularExpression *clone = new URegularExpression;
188 if (clone == NULL) {
189 *status = U_MEMORY_ALLOCATION_ERROR;
190 return NULL;
191 }
192
193 clone->fMatcher = source->fPat->matcher(*status);
194 if (U_FAILURE(*status)) {
195 delete clone;
196 return NULL;
197 }
198
199 clone->fPat = source->fPat;
200 clone->fPatRefCount = source->fPatRefCount;
201 clone->fPatString = source->fPatString;
202 clone->fPatStringLen = source->fPatStringLen;
203 umtx_atomic_inc(source->fPatRefCount);
204 // Note: fText is not cloned.
205
206 return clone;
207 }
208
209
210
211
212 //------------------------------------------------------------------------------
213 //
214 // uregex_pattern
215 //
216 //------------------------------------------------------------------------------
217 U_CAPI const UChar * U_EXPORT2
218 uregex_pattern(const URegularExpression *regexp,
219 int32_t *patLength,
220 UErrorCode *status) {
221
222 if (validateRE(regexp, status, FALSE) == FALSE) {
223 return NULL;
224 }
225 if (patLength != NULL) {
226 *patLength = regexp->fPatStringLen;
227 }
228 return regexp->fPatString;
229 }
230
231
232 //------------------------------------------------------------------------------
233 //
234 // uregex_flags
235 //
236 //------------------------------------------------------------------------------
237 U_CAPI int32_t U_EXPORT2
238 uregex_flags(const URegularExpression *regexp, UErrorCode *status) {
239 if (validateRE(regexp, status, FALSE) == FALSE) {
240 return 0;
241 }
242 int32_t flags = regexp->fPat->flags();
243 return flags;
244 }
245
246
247 //------------------------------------------------------------------------------
248 //
249 // uregex_setText
250 //
251 //------------------------------------------------------------------------------
252 U_CAPI void U_EXPORT2
253 uregex_setText(URegularExpression *regexp,
254 const UChar *text,
255 int32_t textLength,
256 UErrorCode *status) {
257 if (validateRE(regexp, status, FALSE) == FALSE) {
258 return;
259 }
260 if (text == NULL || textLength < -1) {
261 *status = U_ILLEGAL_ARGUMENT_ERROR;
262 return;
263 }
264 regexp->fText = text;
265 regexp->fTextLength = textLength;
266 UBool isTerminated = (textLength == -1);
267
268 regexp->fTextString.setTo(isTerminated, text, textLength);
269 regexp->fMatcher->reset(regexp->fTextString);
270 }
271
272
273
274 //------------------------------------------------------------------------------
275 //
276 // uregex_getText
277 //
278 //------------------------------------------------------------------------------
279 U_CAPI const UChar * U_EXPORT2
280 uregex_getText(URegularExpression *regexp,
281 int32_t *textLength,
282 UErrorCode *status) {
283 if (validateRE(regexp, status, FALSE) == FALSE) {
284 return NULL;
285 }
286 if (textLength != NULL) {
287 *textLength = regexp->fTextLength;
288 }
289 return regexp->fText;
290 }
291
292
293 //------------------------------------------------------------------------------
294 //
295 // uregex_matches
296 //
297 //------------------------------------------------------------------------------
298 U_CAPI UBool U_EXPORT2
299 uregex_matches(URegularExpression *regexp,
300 int32_t startIndex,
301 UErrorCode *status) {
302 UBool result = FALSE;
303 if (validateRE(regexp, status) == FALSE) {
304 return result;
305 }
306 if (startIndex == -1) {
307 result = regexp->fMatcher->matches(*status);
308 } else {
309 result = regexp->fMatcher->matches(startIndex, *status);
310 }
311 return result;
312 }
313
314
315
316 //------------------------------------------------------------------------------
317 //
318 // uregex_lookingAt
319 //
320 //------------------------------------------------------------------------------
321 U_CAPI UBool U_EXPORT2
322 uregex_lookingAt(URegularExpression *regexp,
323 int32_t startIndex,
324 UErrorCode *status) {
325 UBool result = FALSE;
326 if (validateRE(regexp, status) == FALSE) {
327 return result;
328 }
329 if (startIndex == -1) {
330 result = regexp->fMatcher->lookingAt(*status);
331 } else {
332 result = regexp->fMatcher->lookingAt(startIndex, *status);
333 }
334 return result;
335 }
336
337
338
339 //------------------------------------------------------------------------------
340 //
341 // uregex_find
342 //
343 //------------------------------------------------------------------------------
344 U_CAPI UBool U_EXPORT2
345 uregex_find(URegularExpression *regexp,
346 int32_t startIndex,
347 UErrorCode *status) {
348 UBool result = FALSE;
349 if (validateRE(regexp, status) == FALSE) {
350 return result;
351 }
352 if (startIndex == -1) {
353 regexp->fMatcher->resetPreserveRegion();
354 result = regexp->fMatcher->find();
355 } else {
356 result = regexp->fMatcher->find(startIndex, *status);
357 }
358 return result;
359 }
360
361 //------------------------------------------------------------------------------
362 //
363 // uregex_findNext
364 //
365 //------------------------------------------------------------------------------
366 U_CAPI UBool U_EXPORT2
367 uregex_findNext(URegularExpression *regexp,
368 UErrorCode *status) {
369 if (validateRE(regexp, status) == FALSE) {
370 return FALSE;
371 }
372 UBool result = regexp->fMatcher->find();
373 return result;
374 }
375
376 //------------------------------------------------------------------------------
377 //
378 // uregex_groupCount
379 //
380 //------------------------------------------------------------------------------
381 U_CAPI int32_t U_EXPORT2
382 uregex_groupCount(URegularExpression *regexp,
383 UErrorCode *status) {
384 if (validateRE(regexp, status, FALSE) == FALSE) {
385 return 0;
386 }
387 int32_t result = regexp->fMatcher->groupCount();
388 return result;
389 }
390
391
392 //------------------------------------------------------------------------------
393 //
394 // uregex_group
395 //
396 //------------------------------------------------------------------------------
397 U_CAPI int32_t U_EXPORT2
398 uregex_group(URegularExpression *regexp,
399 int32_t groupNum,
400 UChar *dest,
401 int32_t destCapacity,
402 UErrorCode *status) {
403 if (validateRE(regexp, status) == FALSE) {
404 return 0;
405 }
406 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
407 *status = U_ILLEGAL_ARGUMENT_ERROR;
408 return 0;
409 }
410
411 //
412 // Pick up the range of characters from the matcher
413 //
414 int32_t startIx = regexp->fMatcher->start(groupNum, *status);
415 int32_t endIx = regexp->fMatcher->end (groupNum, *status);
416 if (U_FAILURE(*status)) {
417 return 0;
418 }
419
420 //
421 // Trim length based on buffer capacity
422 //
423 int32_t fullLength = endIx - startIx;
424 int32_t copyLength = fullLength;
425 if (copyLength < destCapacity) {
426 dest[copyLength] = 0;
427 } else if (copyLength == destCapacity) {
428 *status = U_STRING_NOT_TERMINATED_WARNING;
429 } else {
430 copyLength = destCapacity;
431 *status = U_BUFFER_OVERFLOW_ERROR;
432 }
433
434 //
435 // Copy capture group to user's buffer
436 //
437 if (copyLength > 0) {
438 u_memcpy(dest, &regexp->fText[startIx], copyLength);
439 }
440 return fullLength;
441 }
442
443
444 //------------------------------------------------------------------------------
445 //
446 // uregex_start
447 //
448 //------------------------------------------------------------------------------
449 U_CAPI int32_t U_EXPORT2
450 uregex_start(URegularExpression *regexp,
451 int32_t groupNum,
452 UErrorCode *status) {
453 if (validateRE(regexp, status) == FALSE) {
454 return 0;
455 }
456 int32_t result = regexp->fMatcher->start(groupNum, *status);
457 return result;
458 }
459
460
461 //------------------------------------------------------------------------------
462 //
463 // uregex_end
464 //
465 //------------------------------------------------------------------------------
466 U_CAPI int32_t U_EXPORT2
467 uregex_end(URegularExpression *regexp,
468 int32_t groupNum,
469 UErrorCode *status) {
470 if (validateRE(regexp, status) == FALSE) {
471 return 0;
472 }
473 int32_t result = regexp->fMatcher->end(groupNum, *status);
474 return result;
475 }
476
477 //------------------------------------------------------------------------------
478 //
479 // uregex_reset
480 //
481 //------------------------------------------------------------------------------
482 U_CAPI void U_EXPORT2
483 uregex_reset(URegularExpression *regexp,
484 int32_t index,
485 UErrorCode *status) {
486 if (validateRE(regexp, status) == FALSE) {
487 return;
488 }
489 regexp->fMatcher->reset(index, *status);
490 }
491
492
493 //------------------------------------------------------------------------------
494 //
495 // uregex_setRegion
496 //
497 //------------------------------------------------------------------------------
498 U_CAPI void U_EXPORT2
499 uregex_setRegion(URegularExpression *regexp,
500 int32_t regionStart,
501 int32_t regionLimit,
502 UErrorCode *status) {
503 if (validateRE(regexp, status) == FALSE) {
504 return;
505 }
506 regexp->fMatcher->region(regionStart, regionLimit, *status);
507 }
508
509
510 //------------------------------------------------------------------------------
511 //
512 // uregex_regionStart
513 //
514 //------------------------------------------------------------------------------
515 U_CAPI int32_t U_EXPORT2
516 uregex_regionStart(const URegularExpression *regexp,
517 UErrorCode *status) {
518 if (validateRE(regexp, status) == FALSE) {
519 return 0;
520 }
521 return regexp->fMatcher->regionStart();
522 }
523
524
525 //------------------------------------------------------------------------------
526 //
527 // uregex_regionEnd
528 //
529 //------------------------------------------------------------------------------
530 U_CAPI int32_t U_EXPORT2
531 uregex_regionEnd(const URegularExpression *regexp,
532 UErrorCode *status) {
533 if (validateRE(regexp, status) == FALSE) {
534 return 0;
535 }
536 return regexp->fMatcher->regionEnd();
537 }
538
539
540 //------------------------------------------------------------------------------
541 //
542 // uregex_hasTransparentBounds
543 //
544 //------------------------------------------------------------------------------
545 U_CAPI UBool U_EXPORT2
546 uregex_hasTransparentBounds(const URegularExpression *regexp,
547 UErrorCode *status) {
548 if (validateRE(regexp, status) == FALSE) {
549 return FALSE;
550 }
551 return regexp->fMatcher->hasTransparentBounds();
552 }
553
554
555 //------------------------------------------------------------------------------
556 //
557 // uregex_useTransparentBounds
558 //
559 //------------------------------------------------------------------------------
560 U_CAPI void U_EXPORT2
561 uregex_useTransparentBounds(URegularExpression *regexp,
562 UBool b,
563 UErrorCode *status) {
564 if (validateRE(regexp, status) == FALSE) {
565 return;
566 }
567 regexp->fMatcher->useTransparentBounds(b);
568 }
569
570
571 //------------------------------------------------------------------------------
572 //
573 // uregex_hasAnchoringBounds
574 //
575 //------------------------------------------------------------------------------
576 U_CAPI UBool U_EXPORT2
577 uregex_hasAnchoringBounds(const URegularExpression *regexp,
578 UErrorCode *status) {
579 if (validateRE(regexp, status) == FALSE) {
580 return FALSE;
581 }
582 return regexp->fMatcher->hasAnchoringBounds();
583 }
584
585
586 //------------------------------------------------------------------------------
587 //
588 // uregex_useAnchoringBounds
589 //
590 //------------------------------------------------------------------------------
591 U_CAPI void U_EXPORT2
592 uregex_useAnchoringBounds(URegularExpression *regexp,
593 UBool b,
594 UErrorCode *status) {
595 if (validateRE(regexp, status) == FALSE) {
596 return;
597 }
598 regexp->fMatcher->useAnchoringBounds(b);
599 }
600
601
602 //------------------------------------------------------------------------------
603 //
604 // uregex_hitEnd
605 //
606 //------------------------------------------------------------------------------
607 U_CAPI UBool U_EXPORT2
608 uregex_hitEnd(const URegularExpression *regexp,
609 UErrorCode *status) {
610 if (validateRE(regexp, status) == FALSE) {
611 return FALSE;
612 }
613 return regexp->fMatcher->hitEnd();
614 }
615
616
617 //------------------------------------------------------------------------------
618 //
619 // uregex_requireEnd
620 //
621 //------------------------------------------------------------------------------
622 U_CAPI UBool U_EXPORT2
623 uregex_requireEnd(const URegularExpression *regexp,
624 UErrorCode *status) {
625 if (validateRE(regexp, status) == FALSE) {
626 return FALSE;
627 }
628 return regexp->fMatcher->requireEnd();
629 }
630
631
632 //------------------------------------------------------------------------------
633 //
634 // uregex_setTimeLimit
635 //
636 //------------------------------------------------------------------------------
637 U_CAPI void U_EXPORT2
638 uregex_setTimeLimit(URegularExpression *regexp,
639 int32_t limit,
640 UErrorCode *status) {
641 if (validateRE(regexp, status)) {
642 regexp->fMatcher->setTimeLimit(limit, *status);
643 }
644 }
645
646
647
648 //------------------------------------------------------------------------------
649 //
650 // uregex_getTimeLimit
651 //
652 //------------------------------------------------------------------------------
653 U_CAPI int32_t U_EXPORT2
654 uregex_getTimeLimit(const URegularExpression *regexp,
655 UErrorCode *status) {
656 int32_t retVal = 0;
657 if (validateRE(regexp, status)) {
658 retVal = regexp->fMatcher->getTimeLimit();
659 }
660 return retVal;
661 }
662
663
664
665 //------------------------------------------------------------------------------
666 //
667 // uregex_setStackLimit
668 //
669 //------------------------------------------------------------------------------
670 U_CAPI void U_EXPORT2
671 uregex_setStackLimit(URegularExpression *regexp,
672 int32_t limit,
673 UErrorCode *status) {
674 if (validateRE(regexp, status)) {
675 regexp->fMatcher->setStackLimit(limit, *status);
676 }
677 }
678
679
680
681 //------------------------------------------------------------------------------
682 //
683 // uregex_getStackLimit
684 //
685 //------------------------------------------------------------------------------
686 U_CAPI int32_t U_EXPORT2
687 uregex_getStackLimit(const URegularExpression *regexp,
688 UErrorCode *status) {
689 int32_t retVal = 0;
690 if (validateRE(regexp, status)) {
691 retVal = regexp->fMatcher->getStackLimit();
692 }
693 return retVal;
694 }
695
696
697 //------------------------------------------------------------------------------
698 //
699 // uregex_setMatchCallback
700 //
701 //------------------------------------------------------------------------------
702 U_CAPI void U_EXPORT2
703 uregex_setMatchCallback(URegularExpression *regexp,
704 URegexMatchCallback *callback,
705 const void *context,
706 UErrorCode *status) {
707 if (validateRE(regexp, status)) {
708 regexp->fMatcher->setMatchCallback(callback, context, *status);
709 }
710 }
711
712
713 //------------------------------------------------------------------------------
714 //
715 // uregex_getMatchCallback
716 //
717 //------------------------------------------------------------------------------
718 U_CAPI void U_EXPORT2
719 uregex_getMatchCallback(const URegularExpression *regexp,
720 URegexMatchCallback **callback,
721 const void **context,
722 UErrorCode *status) {
723 if (validateRE(regexp, status)) {
724 regexp->fMatcher->getMatchCallback(*callback, *context, *status);
725 }
726 }
727
728
729 //------------------------------------------------------------------------------
730 //
731 // uregex_replaceAll
732 //
733 //------------------------------------------------------------------------------
734 U_CAPI int32_t U_EXPORT2
735 uregex_replaceAll(URegularExpression *regexp,
736 const UChar *replacementText,
737 int32_t replacementLength,
738 UChar *destBuf,
739 int32_t destCapacity,
740 UErrorCode *status) {
741 if (validateRE(regexp, status) == FALSE) {
742 return 0;
743 }
744 if (replacementText == NULL || replacementLength < -1 ||
745 destBuf == NULL && destCapacity > 0 ||
746 destCapacity < 0) {
747 *status = U_ILLEGAL_ARGUMENT_ERROR;
748 return 0;
749 }
750
751 int32_t len = 0;
752 uregex_reset(regexp, 0, status);
753 while (uregex_findNext(regexp, status)) {
754 len += uregex_appendReplacement(regexp, replacementText, replacementLength,
755 &destBuf, &destCapacity, status);
756 }
757 len += uregex_appendTail(regexp, &destBuf, &destCapacity, status);
758
759 return len;
760 }
761
762
763 //------------------------------------------------------------------------------
764 //
765 // uregex_replaceFirst
766 //
767 //------------------------------------------------------------------------------
768 U_CAPI int32_t U_EXPORT2
769 uregex_replaceFirst(URegularExpression *regexp,
770 const UChar *replacementText,
771 int32_t replacementLength,
772 UChar *destBuf,
773 int32_t destCapacity,
774 UErrorCode *status) {
775 if (validateRE(regexp, status) == FALSE) {
776 return 0;
777 }
778 if (replacementText == NULL || replacementLength < -1 ||
779 destBuf == NULL && destCapacity > 0 ||
780 destCapacity < 0) {
781 *status = U_ILLEGAL_ARGUMENT_ERROR;
782 return 0;
783 }
784
785 int32_t len = 0;
786 UBool findSucceeded;
787 uregex_reset(regexp, 0, status);
788 findSucceeded = uregex_find(regexp, 0, status);
789 if (findSucceeded) {
790 len = uregex_appendReplacement(regexp, replacementText, replacementLength,
791 &destBuf, &destCapacity, status);
792 }
793 len += uregex_appendTail(regexp, &destBuf, &destCapacity, status);
794
795 return len;
796 }
797
798
799 //------------------------------------------------------------------------------
800 //
801 // uregex_appendReplacement
802 //
803 //------------------------------------------------------------------------------
804
805
806 //
807 // Dummy class, because these functions need to be friends of class RegexMatcher,
808 // and stand-alone C functions don't work as friends
809 //
810 U_NAMESPACE_BEGIN
811 class RegexCImpl {
812 public:
813 inline static int32_t appendReplacement(URegularExpression *regexp,
814 const UChar *replacementText,
815 int32_t replacementLength,
816 UChar **destBuf,
817 int32_t *destCapacity,
818 UErrorCode *status);
819
820 inline static int32_t appendTail(URegularExpression *regexp,
821 UChar **destBuf,
822 int32_t *destCapacity,
823 UErrorCode *status);
824 };
825 U_NAMESPACE_END
826
827
828 //
829 // Call-back function for u_unescapeAt(), used when we encounter
830 // \uxxxx or \Uxxxxxxxxx escapes in the replacement text.
831 //
832 U_CDECL_BEGIN
833 static UChar U_CALLCONV
834 unescape_charAt(int32_t offset, void *context) {
835 UChar c16 = ((UChar *)context)[offset];
836 return c16;
837 }
838 U_CDECL_END
839
840
841 static const UChar BACKSLASH = 0x5c;
842 static const UChar DOLLARSIGN = 0x24;
843
844 //
845 // Move a character to an output buffer, with bounds checking on the index.
846 // Index advances even if capacity is exceeded, for preflight size computations.
847 // This little sequence is used a LOT.
848 //
849 static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) {
850 if (*idx < bufCapacity) {
851 buf[*idx] = c;
852 }
853 (*idx)++;
854 }
855
856
857 //
858 // appendReplacement, the actual implementation.
859 //
860 int32_t RegexCImpl::appendReplacement(URegularExpression *regexp,
861 const UChar *replacementText,
862 int32_t replacementLength,
863 UChar **destBuf,
864 int32_t *destCapacity,
865 UErrorCode *status) {
866
867 // If we come in with a buffer overflow error, don't suppress the operation.
868 // A series of appendReplacements, appendTail need to correctly preflight
869 // the buffer size when an overflow happens somewhere in the middle.
870 UBool pendingBufferOverflow = FALSE;
871 if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity == 0) {
872 pendingBufferOverflow = TRUE;
873 *status = U_ZERO_ERROR;
874 }
875
876 //
877 // Validate all paramters
878 //
879 if (validateRE(regexp, status) == FALSE) {
880 return 0;
881 }
882 if (replacementText == NULL || replacementLength < -1 ||
883 destCapacity == NULL || destBuf == NULL ||
884 *destBuf == NULL && *destCapacity > 0 ||
885 *destCapacity < 0) {
886 *status = U_ILLEGAL_ARGUMENT_ERROR;
887 return 0;
888 }
889
890 RegexMatcher *m = regexp->fMatcher;
891 if (m->fMatch == FALSE) {
892 *status = U_REGEX_INVALID_STATE;
893 return 0;
894 }
895
896 UChar *dest = *destBuf;
897 int32_t capacity = *destCapacity;
898 int32_t destIdx = 0;
899 int32_t i;
900
901 // If it wasn't supplied by the caller, get the length of the replacement text.
902 // TODO: slightly smarter logic in the copy loop could watch for the NUL on
903 // the fly and avoid this step.
904 if (replacementLength == -1) {
905 replacementLength = u_strlen(replacementText);
906 }
907
908 // Copy input string from the end of previous match to start of current match
909 for (i=m->fLastMatchEnd; i<m->fMatchStart; i++) {
910 appendToBuf(regexp->fText[i], &destIdx, dest, capacity);
911 }
912
913
914
915 // scan the replacement text, looking for substitutions ($n) and \escapes.
916 int32_t replIdx = 0;
917 while (replIdx < replacementLength) {
918 UChar c = replacementText[replIdx];
919 replIdx++;
920 if (c != DOLLARSIGN && c != BACKSLASH) {
921 // Common case, no substitution, no escaping,
922 // just copy the char to the dest buf.
923 appendToBuf(c, &destIdx, dest, capacity);
924 continue;
925 }
926
927 if (c == BACKSLASH) {
928 // Backslash Escape. Copy the following char out without further checks.
929 // Note: Surrogate pairs don't need any special handling
930 // The second half wont be a '$' or a '\', and
931 // will move to the dest normally on the next
932 // loop iteration.
933 if (replIdx >= replacementLength) {
934 break;
935 }
936 c = replacementText[replIdx];
937
938 if (c==0x55/*U*/ || c==0x75/*u*/) {
939 // We have a \udddd or \Udddddddd escape sequence.
940 UChar32 escapedChar =
941 u_unescapeAt(unescape_charAt,
942 &replIdx, // Index is updated by unescapeAt
943 replacementLength, // Length of replacement text
944 (void *)replacementText);
945
946 if (escapedChar != (UChar32)0xFFFFFFFF) {
947 if (escapedChar <= 0xffff) {
948 appendToBuf((UChar)escapedChar, &destIdx, dest, capacity);
949 } else {
950 appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity);
951 appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity);
952 }
953 continue;
954 }
955 // Note: if the \u escape was invalid, just fall through and
956 // treat it as a plain \<anything> escape.
957 }
958
959 // Plain backslash escape. Just put out the escaped character.
960 appendToBuf(c, &destIdx, dest, capacity);
961
962 replIdx++;
963 continue;
964 }
965
966
967
968 // We've got a $. Pick up a capture group number if one follows.
969 // Consume at most the number of digits necessary for the largest capture
970 // number that is valid for this pattern.
971
972 int32_t numDigits = 0;
973 int32_t groupNum = 0;
974 UChar32 digitC;
975 for (;;) {
976 if (replIdx >= replacementLength) {
977 break;
978 }
979 U16_GET(replacementText, 0, replIdx, replacementLength, digitC);
980 if (u_isdigit(digitC) == FALSE) {
981 break;
982 }
983
984 U16_FWD_1(replacementText, replIdx, replacementLength);
985 groupNum=groupNum*10 + u_charDigitValue(digitC);
986 numDigits++;
987 if (numDigits >= m->fPattern->fMaxCaptureDigits) {
988 break;
989 }
990 }
991
992
993 if (numDigits == 0) {
994 // The $ didn't introduce a group number at all.
995 // Treat it as just part of the substitution text.
996 appendToBuf(DOLLARSIGN, &destIdx, dest, capacity);
997 continue;
998 }
999
1000 // Finally, append the capture group data to the destination.
1001 int32_t capacityRemaining = capacity - destIdx;
1002 if (capacityRemaining < 0) {
1003 capacityRemaining = 0;
1004 }
1005 destIdx += uregex_group(regexp, groupNum, dest+destIdx, capacityRemaining, status);
1006 if (*status == U_BUFFER_OVERFLOW_ERROR) {
1007 // Ignore buffer overflow when extracting the group. We need to
1008 // continue on to get full size of the untruncated result. We will
1009 // raise our own buffer overflow error at the end.
1010 *status = U_ZERO_ERROR;
1011 }
1012
1013 if (U_FAILURE(*status)) {
1014 // Can fail if group number is out of range.
1015 break;
1016 }
1017
1018 }
1019
1020 //
1021 // Nul Terminate the dest buffer if possible.
1022 // Set the appropriate buffer overflow or not terminated error, if needed.
1023 //
1024 if (destIdx < capacity) {
1025 dest[destIdx] = 0;
1026 } else if (destIdx == *destCapacity) {
1027 *status = U_STRING_NOT_TERMINATED_WARNING;
1028 } else {
1029 *status = U_BUFFER_OVERFLOW_ERROR;
1030 }
1031
1032 //
1033 // Return an updated dest buffer and capacity to the caller.
1034 //
1035 if (destIdx > 0 && *destCapacity > 0) {
1036 if (destIdx < capacity) {
1037 *destBuf += destIdx;
1038 *destCapacity -= destIdx;
1039 } else {
1040 *destBuf += capacity;
1041 *destCapacity = 0;
1042 }
1043 }
1044
1045 // If we came in with a buffer overflow, make sure we go out with one also.
1046 // (A zero length match right at the end of the previous match could
1047 // make this function succeed even though a previous call had overflowed the buf)
1048 if (pendingBufferOverflow && U_SUCCESS(*status)) {
1049 *status = U_BUFFER_OVERFLOW_ERROR;
1050 }
1051
1052 return destIdx;
1053 }
1054
1055 //
1056 // appendReplacement the acutal API function,
1057 //
1058 U_CAPI int32_t U_EXPORT2
1059 uregex_appendReplacement(URegularExpression *regexp,
1060 const UChar *replacementText,
1061 int32_t replacementLength,
1062 UChar **destBuf,
1063 int32_t *destCapacity,
1064 UErrorCode *status) {
1065 return RegexCImpl::appendReplacement(
1066 regexp, replacementText, replacementLength,destBuf, destCapacity, status);
1067 }
1068
1069
1070 //------------------------------------------------------------------------------
1071 //
1072 // uregex_appendTail
1073 //
1074 //------------------------------------------------------------------------------
1075 int32_t RegexCImpl::appendTail(URegularExpression *regexp,
1076 UChar **destBuf,
1077 int32_t *destCapacity,
1078 UErrorCode *status)
1079 {
1080
1081 if (destCapacity == NULL || destBuf == NULL ||
1082 *destBuf == NULL && *destCapacity > 0 ||
1083 *destCapacity < 0)
1084 {
1085 *status = U_ILLEGAL_ARGUMENT_ERROR;
1086 return 0;
1087 }
1088
1089 // If we come in with a buffer overflow error, don't suppress the operation.
1090 // A series of appendReplacements, appendTail need to correctly preflight
1091 // the buffer size when an overflow happens somewhere in the middle.
1092 UBool pendingBufferOverflow = FALSE;
1093 if (*status == U_BUFFER_OVERFLOW_ERROR && *destCapacity == 0) {
1094 pendingBufferOverflow = TRUE;
1095 *status = U_ZERO_ERROR;
1096 }
1097
1098 if (validateRE(regexp, status) == FALSE) {
1099 return 0;
1100 }
1101 RegexMatcher *m = regexp->fMatcher;
1102
1103 int32_t srcIdx;
1104 if (m->fMatch) {
1105 // The most recent call to find() succeeded.
1106 srcIdx = m->fMatchEnd;
1107 } else {
1108 // The last call to find() on this matcher failed().
1109 // Look back to the end of the last find() that succeeded for src index.
1110 srcIdx = m->fLastMatchEnd;
1111 if (srcIdx == -1) {
1112 // There has been no successful match with this matcher.
1113 // We want to copy the whole string.
1114 srcIdx = 0;
1115 }
1116 }
1117
1118 int32_t destIdx = 0;
1119 int32_t destCap = *destCapacity;
1120 UChar *dest = *destBuf;
1121
1122 for (;;) {
1123 if (srcIdx == regexp->fTextLength) {
1124 break;
1125 }
1126 UChar c = regexp->fText[srcIdx];
1127 if (c == 0 && regexp->fTextLength == -1) {
1128 break;
1129 }
1130 if (destIdx < destCap) {
1131 dest[destIdx] = c;
1132 } else {
1133 // We've overflowed the dest buffer.
1134 // If the total input string length is known, we can
1135 // compute the total buffer size needed without scanning through the string.
1136 if (regexp->fTextLength > 0) {
1137 destIdx += (regexp->fTextLength - srcIdx);
1138 break;
1139 }
1140 }
1141 srcIdx++;
1142 destIdx++;
1143 }
1144
1145 //
1146 // NUL terminate the output string, if possible, otherwise issue the
1147 // appropriate error or warning.
1148 //
1149 if (destIdx < destCap) {
1150 dest[destIdx] = 0;
1151 } else if (destIdx == destCap) {
1152 *status = U_STRING_NOT_TERMINATED_WARNING;
1153 } else {
1154 *status = U_BUFFER_OVERFLOW_ERROR;
1155 }
1156
1157 //
1158 // Update the user's buffer ptr and capacity vars to reflect the
1159 // amount used.
1160 //
1161 if (destIdx < destCap) {
1162 *destBuf += destIdx;
1163 *destCapacity -= destIdx;
1164 } else {
1165 *destBuf += destCap;
1166 *destCapacity = 0;
1167 }
1168
1169 if (pendingBufferOverflow && U_SUCCESS(*status)) {
1170 *status = U_BUFFER_OVERFLOW_ERROR;
1171 }
1172
1173 return destIdx;
1174 }
1175
1176
1177 U_CAPI int32_t U_EXPORT2
1178 uregex_appendTail(URegularExpression *regexp,
1179 UChar **destBuf,
1180 int32_t *destCapacity,
1181 UErrorCode *status) {
1182 return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status);
1183 }
1184
1185
1186 //------------------------------------------------------------------------------
1187 //
1188 // copyString Internal utility to copy a string to an output buffer,
1189 // while managing buffer overflow and preflight size
1190 // computation. NUL termination is added to destination,
1191 // and the NUL is counted in the output size.
1192 //
1193 //------------------------------------------------------------------------------
1194 static void copyString(UChar *destBuffer, // Destination buffer.
1195 int32_t destCapacity, // Total capacity of dest buffer
1196 int32_t *destIndex, // Index into dest buffer. Updated on return.
1197 // Update not clipped to destCapacity.
1198 const UChar *srcPtr, // Pointer to source string
1199 int32_t srcLen) // Source string len.
1200 {
1201 int32_t si;
1202 int32_t di = *destIndex;
1203 UChar c;
1204
1205 for (si=0; si<srcLen; si++) {
1206 c = srcPtr[si];
1207 if (di < destCapacity) {
1208 destBuffer[di] = c;
1209 di++;
1210 } else {
1211 di += srcLen - si;
1212 break;
1213 }
1214 }
1215 if (di<destCapacity) {
1216 destBuffer[di] = 0;
1217 }
1218 di++;
1219 *destIndex = di;
1220 }
1221
1222
1223 //------------------------------------------------------------------------------
1224 //
1225 // uregex_split
1226 //
1227 //------------------------------------------------------------------------------
1228 U_CAPI int32_t U_EXPORT2
1229 uregex_split( URegularExpression *regexp,
1230 UChar *destBuf,
1231 int32_t destCapacity,
1232 int32_t *requiredCapacity,
1233 UChar *destFields[],
1234 int32_t destFieldsCapacity,
1235 UErrorCode *status) {
1236 if (validateRE(regexp, status) == FALSE) {
1237 return 0;
1238 }
1239 if (destBuf == NULL && destCapacity > 0 ||
1240 destCapacity < 0 ||
1241 destFields == NULL ||
1242 destFieldsCapacity < 1 ) {
1243 *status = U_ILLEGAL_ARGUMENT_ERROR;
1244 return 0;
1245 }
1246
1247 //
1248 // Reset for the input text
1249 //
1250 regexp->fMatcher->reset();
1251 int32_t inputLen = regexp->fTextString.length();
1252 int32_t nextOutputStringStart = 0;
1253 if (inputLen == 0) {
1254 return 0;
1255 }
1256
1257
1258 //
1259 // Loop through the input text, searching for the delimiter pattern
1260 //
1261 int32_t i; // Index of the field being processed.
1262 int32_t destIdx = 0; // Next available position in destBuf;
1263 int32_t numCaptureGroups = regexp->fMatcher->groupCount();
1264 for (i=0; ; i++) {
1265 if (i>=destFieldsCapacity-1) {
1266 // There are one or zero output string left.
1267 // Fill the last output string with whatever is left from the input, then exit the loop.
1268 // ( i will be == destFieldsCapacity if we filled the output array while processing
1269 // capture groups of the delimiter expression, in which case we will discard the
1270 // last capture group saved in favor of the unprocessed remainder of the
1271 // input string.)
1272 int32_t remainingLength = inputLen-nextOutputStringStart;
1273 if (remainingLength > 0) {
1274 }
1275 if (i >= destFieldsCapacity) {
1276 // No fields are left. Recycle the last one for holding the trailing part of
1277 // the input string.
1278 i = destFieldsCapacity-1;
1279 destIdx = (int32_t)(destFields[i] - destFields[0]);
1280 }
1281
1282 destFields[i] = &destBuf[destIdx];
1283 copyString(destBuf, destCapacity, &destIdx,
1284 &regexp->fText[nextOutputStringStart], remainingLength);
1285 break;
1286 }
1287
1288 if (regexp->fMatcher->find()) {
1289 // We found another delimiter. Move everything from where we started looking
1290 // up until the start of the delimiter into the next output string.
1291 int32_t fieldLen = regexp->fMatcher->start(*status) - nextOutputStringStart;
1292 destFields[i] = &destBuf[destIdx];
1293 copyString(destBuf, destCapacity, &destIdx,
1294 &regexp->fText[nextOutputStringStart], fieldLen);
1295 nextOutputStringStart = regexp->fMatcher->end(*status);
1296
1297 // If the delimiter pattern has capturing parentheses, the captured
1298 // text goes out into the next n destination strings.
1299 int32_t groupNum;
1300 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
1301 // If we've run out of output string slots, bail out.
1302 if (i==destFieldsCapacity-1) {
1303 break;
1304 }
1305 i++;
1306
1307 // Set up to extract the capture group contents into the dest buffer.
1308 UErrorCode tStatus = U_ZERO_ERROR; // Want to ignore any buffer overflow
1309 // error while extracting this group.
1310 int32_t remainingCapacity = destCapacity - destIdx;
1311 if (remainingCapacity < 0) {
1312 remainingCapacity = 0;
1313 }
1314 destFields[i] = &destBuf[destIdx];
1315 int32_t t = uregex_group(regexp, groupNum, destFields[i], remainingCapacity, &tStatus);
1316 destIdx += t + 1; // Record the space used in the output string buffer.
1317 // +1 for the NUL that terminates the string.
1318 }
1319
1320 if (nextOutputStringStart == inputLen) {
1321 // The delimiter was at the end of the string. We're done.
1322 break;
1323 }
1324
1325 }
1326 else
1327 {
1328 // We ran off the end of the input while looking for the next delimiter.
1329 // All the remaining text goes into the current output string.
1330 destFields[i] = &destBuf[destIdx];
1331 copyString(destBuf, destCapacity, &destIdx,
1332 &regexp->fText[nextOutputStringStart], inputLen-nextOutputStringStart);
1333 break;
1334 }
1335 }
1336
1337 // Zero out any unused portion of the destFields array
1338 int j;
1339 for (j=i+1; j<destFieldsCapacity; j++) {
1340 destFields[j] = NULL;
1341 }
1342
1343 if (requiredCapacity != NULL) {
1344 *requiredCapacity = destIdx;
1345 }
1346 if (destIdx > destCapacity) {
1347 *status = U_BUFFER_OVERFLOW_ERROR;
1348 }
1349 return i+1;
1350 }
1351
1352
1353 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1354