]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/uregex.cpp
ICU-400.42.tar.gz
[apple/icu.git] / icuSources / i18n / uregex.cpp
CommitLineData
374ca955
A
/*
*******************************************************************************
*   Copyright (C) 2004-2008, International Business Machines
*   Corporation and others.  All Rights Reserved.
*******************************************************************************
*   file name:  regex.cpp
*/
8
9#include "unicode/utypes.h"
10
11#if !UCONFIG_NO_REGULAR_EXPRESSIONS
12
13#include "unicode/regex.h"
14#include "unicode/uregex.h"
15#include "unicode/unistr.h"
16#include "unicode/ustring.h"
17#include "unicode/uchar.h"
18#include "unicode/uobject.h"
19#include "umutex.h"
20#include "uassert.h"
21#include "cmemory.h"
22
46f4442e
A
23U_NAMESPACE_USE
24
374ca955
A
25struct URegularExpression: public UMemory {
26public:
27 URegularExpression();
28 ~URegularExpression();
29 int32_t fMagic;
30 RegexPattern *fPat;
31 int32_t *fPatRefCount;
32 UChar *fPatString;
33 int32_t fPatStringLen;
34 RegexMatcher *fMatcher;
35 const UChar *fText; // Text from setText()
36 int32_t fTextLength; // Length provided by user with setText(), which
37 // may be -1.
38
39 UnicodeString fTextString; // The setText(text) is wrapped into a UnicodeString.
40 // TODO: regexp engine should not depend on UnicodeString.
41};
42
43static const int32_t REXP_MAGIC = 0x72657870; // "rexp" in ASCII
44
374ca955
A
45URegularExpression::URegularExpression() {
46 fMagic = REXP_MAGIC;
47 fPat = NULL;
48 fPatRefCount = NULL;
49 fPatString = NULL;
50 fPatStringLen = 0;
51 fMatcher = NULL;
52 fText = NULL;
53 fTextLength = 0;
54}
55
56URegularExpression::~URegularExpression() {
57 delete fMatcher;
58 fMatcher = NULL;
59 if (fPatRefCount!=NULL && umtx_atomic_dec(fPatRefCount)==0) {
60 delete fPat;
61 uprv_free(fPatString);
62 uprv_free(fPatRefCount);
63 }
64 fMagic = 0;
65}
66
67//----------------------------------------------------------------------------------------
68//
69// validateRE Do boilerplate style checks on API function parameters.
70// Return TRUE if they look OK.
71//----------------------------------------------------------------------------------------
72static UBool validateRE(const URegularExpression *re, UErrorCode *status, UBool requiresText = TRUE) {
73 if (U_FAILURE(*status)) {
74 return FALSE;
75 }
76 if (re == NULL || re->fMagic != REXP_MAGIC) {
374ca955
A
77 *status = U_ILLEGAL_ARGUMENT_ERROR;
78 return FALSE;
79 }
80 if (requiresText && re->fText == NULL) {
81 *status = U_REGEX_INVALID_STATE;
82 return FALSE;
83 }
84 return TRUE;
85}
86
87//----------------------------------------------------------------------------------------
88//
89// uregex_open
90//
91//----------------------------------------------------------------------------------------
92U_CAPI URegularExpression * U_EXPORT2
93uregex_open( const UChar *pattern,
94 int32_t patternLength,
95 uint32_t flags,
96 UParseError *pe,
97 UErrorCode *status) {
98
99 if (U_FAILURE(*status)) {
100 return NULL;
101 }
102 if (pattern == NULL || patternLength < -1 || patternLength == 0) {
103 *status = U_ILLEGAL_ARGUMENT_ERROR;
104 return NULL;
105 }
106 int32_t actualPatLen = patternLength;
107 if (actualPatLen == -1) {
108 actualPatLen = u_strlen(pattern);
109 }
110
111 URegularExpression *re = new URegularExpression;
112 int32_t *refC = (int32_t *)uprv_malloc(sizeof(int32_t));
113 UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1));
114 if (re == NULL || refC == NULL || patBuf == NULL) {
115 *status = U_MEMORY_ALLOCATION_ERROR;
116 delete re;
117 uprv_free(refC);
118 uprv_free(patBuf);
119 return NULL;
120 }
121 re->fPatRefCount = refC;
122 *re->fPatRefCount = 1;
123
124 //
125 // Make a copy of the pattern string, so we can return it later if asked.
126 // For compiling the pattern, we will use a read-only-aliased UnicodeString
127 // of this local copy, to avoid making even more copies.
128 //
129 re->fPatString = patBuf;
130 re->fPatStringLen = patternLength;
131 u_memcpy(patBuf, pattern, actualPatLen);
132 patBuf[actualPatLen] = 0;
133 UnicodeString patString(patternLength==-1, patBuf, patternLength);
134
135 //
136 // Compile the pattern
137 //
138 if (pe != NULL) {
139 re->fPat = RegexPattern::compile(patString, flags, *pe, *status);
140 } else {
141 re->fPat = RegexPattern::compile(patString, flags, *status);
142 }
143 if (U_FAILURE(*status)) {
144 goto ErrorExit;
145 }
146
147 //
148 // Create the matcher object
149 //
150 re->fMatcher = re->fPat->matcher(*status);
151 if (U_SUCCESS(*status)) {
152 return re;
153 }
154
155ErrorExit:
156 delete re;
157 return NULL;
158
159}
160
374ca955
A
161//----------------------------------------------------------------------------------------
162//
163// uregex_close
164//
165//----------------------------------------------------------------------------------------
166U_CAPI void U_EXPORT2
167uregex_close(URegularExpression *re) {
168 UErrorCode status = U_ZERO_ERROR;
169 if (validateRE(re, &status, FALSE) == FALSE) {
170 return;
171 }
172 delete re;
173}
174
175
176//----------------------------------------------------------------------------------------
177//
178// uregex_clone
179//
180//----------------------------------------------------------------------------------------
181U_CAPI URegularExpression * U_EXPORT2
182uregex_clone(const URegularExpression *source, UErrorCode *status) {
183 if (validateRE(source, status, FALSE) == FALSE) {
184 return NULL;
185 }
186
187 URegularExpression *clone = new URegularExpression;
188 if (clone == NULL) {
189 *status = U_MEMORY_ALLOCATION_ERROR;
190 return NULL;
191 }
192
193 clone->fMatcher = source->fPat->matcher(*status);
194 if (U_FAILURE(*status)) {
195 delete clone;
196 return NULL;
197 }
374ca955
A
198
199 clone->fPat = source->fPat;
200 clone->fPatRefCount = source->fPatRefCount;
201 clone->fPatString = source->fPatString;
202 clone->fPatStringLen = source->fPatStringLen;
203 umtx_atomic_inc(source->fPatRefCount);
204 // Note: fText is not cloned.
205
206 return clone;
73c04bcf 207}
374ca955
A
208
209
210
211
73c04bcf 212//------------------------------------------------------------------------------
374ca955
A
213//
214// uregex_pattern
215//
73c04bcf 216//------------------------------------------------------------------------------
374ca955
A
217U_CAPI const UChar * U_EXPORT2
218uregex_pattern(const URegularExpression *regexp,
219 int32_t *patLength,
220 UErrorCode *status) {
221
222 if (validateRE(regexp, status, FALSE) == FALSE) {
223 return NULL;
224 }
225 if (patLength != NULL) {
226 *patLength = regexp->fPatStringLen;
227 }
228 return regexp->fPatString;
73c04bcf 229}
374ca955
A
230
231
73c04bcf 232//------------------------------------------------------------------------------
374ca955
A
233//
234// uregex_flags
235//
73c04bcf 236//------------------------------------------------------------------------------
374ca955
A
237U_CAPI int32_t U_EXPORT2
238uregex_flags(const URegularExpression *regexp, UErrorCode *status) {
239 if (validateRE(regexp, status, FALSE) == FALSE) {
240 return 0;
241 }
242 int32_t flags = regexp->fPat->flags();
243 return flags;
73c04bcf 244}
374ca955
A
245
246
73c04bcf 247//------------------------------------------------------------------------------
374ca955
A
248//
249// uregex_setText
250//
73c04bcf 251//------------------------------------------------------------------------------
374ca955
A
252U_CAPI void U_EXPORT2
253uregex_setText(URegularExpression *regexp,
254 const UChar *text,
255 int32_t textLength,
256 UErrorCode *status) {
257 if (validateRE(regexp, status, FALSE) == FALSE) {
258 return;
259 }
260 if (text == NULL || textLength < -1) {
261 *status = U_ILLEGAL_ARGUMENT_ERROR;
262 return;
263 }
264 regexp->fText = text;
265 regexp->fTextLength = textLength;
266 UBool isTerminated = (textLength == -1);
267
268 regexp->fTextString.setTo(isTerminated, text, textLength);
269 regexp->fMatcher->reset(regexp->fTextString);
73c04bcf 270}
374ca955
A
271
272
273
73c04bcf 274//------------------------------------------------------------------------------
374ca955
A
275//
276// uregex_getText
277//
73c04bcf 278//------------------------------------------------------------------------------
374ca955
A
279U_CAPI const UChar * U_EXPORT2
280uregex_getText(URegularExpression *regexp,
281 int32_t *textLength,
282 UErrorCode *status) {
283 if (validateRE(regexp, status, FALSE) == FALSE) {
284 return NULL;
285 }
286 if (textLength != NULL) {
287 *textLength = regexp->fTextLength;
288 }
289 return regexp->fText;
73c04bcf 290}
374ca955
A
291
292
73c04bcf 293//------------------------------------------------------------------------------
374ca955
A
294//
295// uregex_matches
296//
73c04bcf 297//------------------------------------------------------------------------------
374ca955
A
298U_CAPI UBool U_EXPORT2
299uregex_matches(URegularExpression *regexp,
300 int32_t startIndex,
301 UErrorCode *status) {
46f4442e 302 UBool result = FALSE;
374ca955 303 if (validateRE(regexp, status) == FALSE) {
46f4442e
A
304 return result;
305 }
306 if (startIndex == -1) {
307 result = regexp->fMatcher->matches(*status);
308 } else {
309 result = regexp->fMatcher->matches(startIndex, *status);
374ca955 310 }
374ca955 311 return result;
73c04bcf 312}
374ca955
A
313
314
315
73c04bcf 316//------------------------------------------------------------------------------
374ca955
A
317//
318// uregex_lookingAt
319//
73c04bcf 320//------------------------------------------------------------------------------
374ca955
A
321U_CAPI UBool U_EXPORT2
322uregex_lookingAt(URegularExpression *regexp,
323 int32_t startIndex,
324 UErrorCode *status) {
46f4442e 325 UBool result = FALSE;
374ca955 326 if (validateRE(regexp, status) == FALSE) {
46f4442e
A
327 return result;
328 }
329 if (startIndex == -1) {
330 result = regexp->fMatcher->lookingAt(*status);
331 } else {
332 result = regexp->fMatcher->lookingAt(startIndex, *status);
374ca955 333 }
374ca955 334 return result;
73c04bcf 335}
374ca955
A
336
337
338
73c04bcf 339//------------------------------------------------------------------------------
374ca955
A
340//
341// uregex_find
342//
73c04bcf 343//------------------------------------------------------------------------------
374ca955
A
344U_CAPI UBool U_EXPORT2
345uregex_find(URegularExpression *regexp,
346 int32_t startIndex,
347 UErrorCode *status) {
46f4442e 348 UBool result = FALSE;
374ca955 349 if (validateRE(regexp, status) == FALSE) {
46f4442e
A
350 return result;
351 }
352 if (startIndex == -1) {
353 regexp->fMatcher->resetPreserveRegion();
354 result = regexp->fMatcher->find();
355 } else {
356 result = regexp->fMatcher->find(startIndex, *status);
374ca955 357 }
374ca955 358 return result;
73c04bcf 359}
374ca955 360
73c04bcf 361//------------------------------------------------------------------------------
374ca955
A
362//
363// uregex_findNext
364//
73c04bcf 365//------------------------------------------------------------------------------
374ca955
A
366U_CAPI UBool U_EXPORT2
367uregex_findNext(URegularExpression *regexp,
368 UErrorCode *status) {
369 if (validateRE(regexp, status) == FALSE) {
370 return FALSE;
371 }
372 UBool result = regexp->fMatcher->find();
373 return result;
73c04bcf 374}
374ca955 375
73c04bcf 376//------------------------------------------------------------------------------
374ca955
A
377//
378// uregex_groupCount
379//
73c04bcf 380//------------------------------------------------------------------------------
374ca955
A
381U_CAPI int32_t U_EXPORT2
382uregex_groupCount(URegularExpression *regexp,
383 UErrorCode *status) {
384 if (validateRE(regexp, status, FALSE) == FALSE) {
385 return 0;
386 }
387 int32_t result = regexp->fMatcher->groupCount();
388 return result;
73c04bcf 389}
374ca955
A
390
391
73c04bcf 392//------------------------------------------------------------------------------
374ca955
A
393//
394// uregex_group
395//
73c04bcf 396//------------------------------------------------------------------------------
374ca955
A
397U_CAPI int32_t U_EXPORT2
398uregex_group(URegularExpression *regexp,
399 int32_t groupNum,
400 UChar *dest,
401 int32_t destCapacity,
402 UErrorCode *status) {
403 if (validateRE(regexp, status) == FALSE) {
404 return 0;
405 }
406 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
407 *status = U_ILLEGAL_ARGUMENT_ERROR;
408 return 0;
409 }
410
411 //
412 // Pick up the range of characters from the matcher
413 //
414 int32_t startIx = regexp->fMatcher->start(groupNum, *status);
415 int32_t endIx = regexp->fMatcher->end (groupNum, *status);
416 if (U_FAILURE(*status)) {
417 return 0;
418 }
419
420 //
421 // Trim length based on buffer capacity
422 //
423 int32_t fullLength = endIx - startIx;
424 int32_t copyLength = fullLength;
425 if (copyLength < destCapacity) {
426 dest[copyLength] = 0;
427 } else if (copyLength == destCapacity) {
428 *status = U_STRING_NOT_TERMINATED_WARNING;
429 } else {
430 copyLength = destCapacity;
431 *status = U_BUFFER_OVERFLOW_ERROR;
432 }
433
434 //
435 // Copy capture group to user's buffer
436 //
437 if (copyLength > 0) {
438 u_memcpy(dest, &regexp->fText[startIx], copyLength);
439 }
440 return fullLength;
73c04bcf 441}
374ca955
A
442
443
73c04bcf 444//------------------------------------------------------------------------------
374ca955
A
445//
446// uregex_start
447//
73c04bcf 448//------------------------------------------------------------------------------
374ca955
A
449U_CAPI int32_t U_EXPORT2
450uregex_start(URegularExpression *regexp,
451 int32_t groupNum,
452 UErrorCode *status) {
453 if (validateRE(regexp, status) == FALSE) {
454 return 0;
455 }
456 int32_t result = regexp->fMatcher->start(groupNum, *status);
457 return result;
73c04bcf 458}
374ca955
A
459
460
73c04bcf 461//------------------------------------------------------------------------------
374ca955
A
462//
463// uregex_end
464//
73c04bcf 465//------------------------------------------------------------------------------
374ca955
A
466U_CAPI int32_t U_EXPORT2
467uregex_end(URegularExpression *regexp,
468 int32_t groupNum,
469 UErrorCode *status) {
470 if (validateRE(regexp, status) == FALSE) {
471 return 0;
472 }
473 int32_t result = regexp->fMatcher->end(groupNum, *status);
474 return result;
73c04bcf 475}
374ca955 476
73c04bcf 477//------------------------------------------------------------------------------
374ca955
A
478//
479// uregex_reset
480//
73c04bcf 481//------------------------------------------------------------------------------
374ca955
A
482U_CAPI void U_EXPORT2
483uregex_reset(URegularExpression *regexp,
484 int32_t index,
485 UErrorCode *status) {
486 if (validateRE(regexp, status) == FALSE) {
487 return;
488 }
489 regexp->fMatcher->reset(index, *status);
73c04bcf 490}
374ca955
A
491
492
46f4442e
A
493//------------------------------------------------------------------------------
494//
495// uregex_setRegion
496//
497//------------------------------------------------------------------------------
498U_CAPI void U_EXPORT2
499uregex_setRegion(URegularExpression *regexp,
500 int32_t regionStart,
501 int32_t regionLimit,
502 UErrorCode *status) {
503 if (validateRE(regexp, status) == FALSE) {
504 return;
505 }
506 regexp->fMatcher->region(regionStart, regionLimit, *status);
507}
508
509
510//------------------------------------------------------------------------------
511//
512// uregex_regionStart
513//
514//------------------------------------------------------------------------------
515U_CAPI int32_t U_EXPORT2
516uregex_regionStart(const URegularExpression *regexp,
517 UErrorCode *status) {
518 if (validateRE(regexp, status) == FALSE) {
519 return 0;
520 }
521 return regexp->fMatcher->regionStart();
522}
523
524
525//------------------------------------------------------------------------------
526//
527// uregex_regionEnd
528//
529//------------------------------------------------------------------------------
530U_CAPI int32_t U_EXPORT2
531uregex_regionEnd(const URegularExpression *regexp,
532 UErrorCode *status) {
533 if (validateRE(regexp, status) == FALSE) {
534 return 0;
535 }
536 return regexp->fMatcher->regionEnd();
537}
538
539
540//------------------------------------------------------------------------------
541//
542// uregex_hasTransparentBounds
543//
544//------------------------------------------------------------------------------
545U_CAPI UBool U_EXPORT2
546uregex_hasTransparentBounds(const URegularExpression *regexp,
547 UErrorCode *status) {
548 if (validateRE(regexp, status) == FALSE) {
549 return FALSE;
550 }
551 return regexp->fMatcher->hasTransparentBounds();
552}
553
554
555//------------------------------------------------------------------------------
556//
557// uregex_useTransparentBounds
558//
559//------------------------------------------------------------------------------
560U_CAPI void U_EXPORT2
561uregex_useTransparentBounds(URegularExpression *regexp,
562 UBool b,
563 UErrorCode *status) {
564 if (validateRE(regexp, status) == FALSE) {
565 return;
566 }
567 regexp->fMatcher->useTransparentBounds(b);
568}
569
570
571//------------------------------------------------------------------------------
572//
573// uregex_hasAnchoringBounds
574//
575//------------------------------------------------------------------------------
576U_CAPI UBool U_EXPORT2
577uregex_hasAnchoringBounds(const URegularExpression *regexp,
578 UErrorCode *status) {
579 if (validateRE(regexp, status) == FALSE) {
580 return FALSE;
581 }
582 return regexp->fMatcher->hasAnchoringBounds();
583}
584
585
586//------------------------------------------------------------------------------
587//
588// uregex_useAnchoringBounds
589//
590//------------------------------------------------------------------------------
591U_CAPI void U_EXPORT2
592uregex_useAnchoringBounds(URegularExpression *regexp,
593 UBool b,
594 UErrorCode *status) {
595 if (validateRE(regexp, status) == FALSE) {
596 return;
597 }
598 regexp->fMatcher->useAnchoringBounds(b);
599}
600
601
602//------------------------------------------------------------------------------
603//
604// uregex_hitEnd
605//
606//------------------------------------------------------------------------------
607U_CAPI UBool U_EXPORT2
608uregex_hitEnd(const URegularExpression *regexp,
609 UErrorCode *status) {
610 if (validateRE(regexp, status) == FALSE) {
611 return FALSE;
612 }
613 return regexp->fMatcher->hitEnd();
614}
615
616
617//------------------------------------------------------------------------------
618//
619// uregex_requireEnd
620//
621//------------------------------------------------------------------------------
622U_CAPI UBool U_EXPORT2
623uregex_requireEnd(const URegularExpression *regexp,
624 UErrorCode *status) {
625 if (validateRE(regexp, status) == FALSE) {
626 return FALSE;
627 }
628 return regexp->fMatcher->requireEnd();
629}
630
631
632//------------------------------------------------------------------------------
633//
634// uregex_setTimeLimit
635//
636//------------------------------------------------------------------------------
637U_CAPI void U_EXPORT2
638uregex_setTimeLimit(URegularExpression *regexp,
639 int32_t limit,
640 UErrorCode *status) {
641 if (validateRE(regexp, status)) {
642 regexp->fMatcher->setTimeLimit(limit, *status);
643 }
644}
645
646
647
648//------------------------------------------------------------------------------
649//
650// uregex_getTimeLimit
651//
652//------------------------------------------------------------------------------
653U_CAPI int32_t U_EXPORT2
654uregex_getTimeLimit(const URegularExpression *regexp,
655 UErrorCode *status) {
656 int32_t retVal = 0;
657 if (validateRE(regexp, status)) {
658 retVal = regexp->fMatcher->getTimeLimit();
659 }
660 return retVal;
661}
662
663
664
665//------------------------------------------------------------------------------
666//
667// uregex_setStackLimit
668//
669//------------------------------------------------------------------------------
670U_CAPI void U_EXPORT2
671uregex_setStackLimit(URegularExpression *regexp,
672 int32_t limit,
673 UErrorCode *status) {
674 if (validateRE(regexp, status)) {
675 regexp->fMatcher->setStackLimit(limit, *status);
676 }
677}
678
679
680
681//------------------------------------------------------------------------------
682//
683// uregex_getStackLimit
684//
685//------------------------------------------------------------------------------
686U_CAPI int32_t U_EXPORT2
687uregex_getStackLimit(const URegularExpression *regexp,
688 UErrorCode *status) {
689 int32_t retVal = 0;
690 if (validateRE(regexp, status)) {
691 retVal = regexp->fMatcher->getStackLimit();
692 }
693 return retVal;
694}
695
696
697//------------------------------------------------------------------------------
698//
699// uregex_setMatchCallback
700//
701//------------------------------------------------------------------------------
702U_CAPI void U_EXPORT2
703uregex_setMatchCallback(URegularExpression *regexp,
704 URegexMatchCallback *callback,
705 const void *context,
706 UErrorCode *status) {
707 if (validateRE(regexp, status)) {
708 regexp->fMatcher->setMatchCallback(callback, context, *status);
709 }
710}
711
712
713//------------------------------------------------------------------------------
714//
715// uregex_getMatchCallback
716//
717//------------------------------------------------------------------------------
718U_CAPI void U_EXPORT2
719uregex_getMatchCallback(const URegularExpression *regexp,
720 URegexMatchCallback **callback,
721 const void **context,
722 UErrorCode *status) {
723 if (validateRE(regexp, status)) {
724 regexp->fMatcher->getMatchCallback(*callback, *context, *status);
725 }
726}
727
728
73c04bcf 729//------------------------------------------------------------------------------
374ca955
A
730//
731// uregex_replaceAll
732//
73c04bcf 733//------------------------------------------------------------------------------
374ca955
A
734U_CAPI int32_t U_EXPORT2
735uregex_replaceAll(URegularExpression *regexp,
73c04bcf 736 const UChar *replacementText,
374ca955
A
737 int32_t replacementLength,
738 UChar *destBuf,
739 int32_t destCapacity,
740 UErrorCode *status) {
741 if (validateRE(regexp, status) == FALSE) {
742 return 0;
743 }
744 if (replacementText == NULL || replacementLength < -1 ||
745 destBuf == NULL && destCapacity > 0 ||
746 destCapacity < 0) {
747 *status = U_ILLEGAL_ARGUMENT_ERROR;
748 return 0;
749 }
750
751 int32_t len = 0;
752 uregex_reset(regexp, 0, status);
753 while (uregex_findNext(regexp, status)) {
754 len += uregex_appendReplacement(regexp, replacementText, replacementLength,
755 &destBuf, &destCapacity, status);
756 }
757 len += uregex_appendTail(regexp, &destBuf, &destCapacity, status);
758
759 return len;
73c04bcf 760}
374ca955
A
761
762
73c04bcf 763//------------------------------------------------------------------------------
374ca955
A
764//
765// uregex_replaceFirst
766//
73c04bcf 767//------------------------------------------------------------------------------
374ca955
A
768U_CAPI int32_t U_EXPORT2
769uregex_replaceFirst(URegularExpression *regexp,
73c04bcf 770 const UChar *replacementText,
374ca955
A
771 int32_t replacementLength,
772 UChar *destBuf,
773 int32_t destCapacity,
774 UErrorCode *status) {
775 if (validateRE(regexp, status) == FALSE) {
776 return 0;
777 }
778 if (replacementText == NULL || replacementLength < -1 ||
779 destBuf == NULL && destCapacity > 0 ||
780 destCapacity < 0) {
781 *status = U_ILLEGAL_ARGUMENT_ERROR;
782 return 0;
783 }
784
785 int32_t len = 0;
786 UBool findSucceeded;
787 uregex_reset(regexp, 0, status);
788 findSucceeded = uregex_find(regexp, 0, status);
789 if (findSucceeded) {
790 len = uregex_appendReplacement(regexp, replacementText, replacementLength,
791 &destBuf, &destCapacity, status);
792 }
793 len += uregex_appendTail(regexp, &destBuf, &destCapacity, status);
794
795 return len;
73c04bcf 796}
374ca955
A
797
798
73c04bcf 799//------------------------------------------------------------------------------
374ca955
A
800//
801// uregex_appendReplacement
802//
73c04bcf 803//------------------------------------------------------------------------------
374ca955
A
804
805
806//
807// Dummy class, because these functions need to be friends of class RegexMatcher,
808// and stand-alone C functions don't work as friends
809//
810U_NAMESPACE_BEGIN
811class RegexCImpl {
812 public:
813 inline static int32_t appendReplacement(URegularExpression *regexp,
73c04bcf 814 const UChar *replacementText,
374ca955
A
815 int32_t replacementLength,
816 UChar **destBuf,
817 int32_t *destCapacity,
818 UErrorCode *status);
819
820 inline static int32_t appendTail(URegularExpression *regexp,
821 UChar **destBuf,
822 int32_t *destCapacity,
823 UErrorCode *status);
824};
825U_NAMESPACE_END
826
827
828//
829// Call-back function for u_unescapeAt(), used when we encounter
830// \uxxxx or \Uxxxxxxxxx escapes in the replacement text.
831//
832U_CDECL_BEGIN
833static UChar U_CALLCONV
834unescape_charAt(int32_t offset, void *context) {
835 UChar c16 = ((UChar *)context)[offset];
836 return c16;
837}
838U_CDECL_END
839
840
841static const UChar BACKSLASH = 0x5c;
842static const UChar DOLLARSIGN = 0x24;
843
844//
845// Move a character to an output buffer, with bounds checking on the index.
846// Index advances even if capacity is exceeded, for preflight size computations.
847// This little sequence is used a LOT.
848//
849static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) {
850 if (*idx < bufCapacity) {
851 buf[*idx] = c;
852 }
853 (*idx)++;
854}
855
856
857//
858// appendReplacement, the actual implementation.
859//
860int32_t RegexCImpl::appendReplacement(URegularExpression *regexp,
73c04bcf 861 const UChar *replacementText,
374ca955
A
862 int32_t replacementLength,
863 UChar **destBuf,
864 int32_t *destCapacity,
865 UErrorCode *status) {
866
867 // If we come in with a buffer overflow error, don't suppress the operation.
868 // A series of appendReplacements, appendTail need to correctly preflight
869 // the buffer size when an overflow happens somewhere in the middle.
870 UBool pendingBufferOverflow = FALSE;
871 if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity == 0) {
872 pendingBufferOverflow = TRUE;
873 *status = U_ZERO_ERROR;
874 }
875
876 //
877 // Validate all paramters
878 //
879 if (validateRE(regexp, status) == FALSE) {
880 return 0;
881 }
882 if (replacementText == NULL || replacementLength < -1 ||
883 destCapacity == NULL || destBuf == NULL ||
884 *destBuf == NULL && *destCapacity > 0 ||
885 *destCapacity < 0) {
886 *status = U_ILLEGAL_ARGUMENT_ERROR;
887 return 0;
888 }
889
890 RegexMatcher *m = regexp->fMatcher;
891 if (m->fMatch == FALSE) {
892 *status = U_REGEX_INVALID_STATE;
893 return 0;
894 }
895
896 UChar *dest = *destBuf;
897 int32_t capacity = *destCapacity;
898 int32_t destIdx = 0;
899 int32_t i;
900
901 // If it wasn't supplied by the caller, get the length of the replacement text.
902 // TODO: slightly smarter logic in the copy loop could watch for the NUL on
903 // the fly and avoid this step.
904 if (replacementLength == -1) {
905 replacementLength = u_strlen(replacementText);
906 }
907
908 // Copy input string from the end of previous match to start of current match
909 for (i=m->fLastMatchEnd; i<m->fMatchStart; i++) {
910 appendToBuf(regexp->fText[i], &destIdx, dest, capacity);
911 }
912
913
914
915 // scan the replacement text, looking for substitutions ($n) and \escapes.
916 int32_t replIdx = 0;
917 while (replIdx < replacementLength) {
918 UChar c = replacementText[replIdx];
919 replIdx++;
920 if (c != DOLLARSIGN && c != BACKSLASH) {
921 // Common case, no substitution, no escaping,
922 // just copy the char to the dest buf.
923 appendToBuf(c, &destIdx, dest, capacity);
924 continue;
925 }
926
927 if (c == BACKSLASH) {
928 // Backslash Escape. Copy the following char out without further checks.
929 // Note: Surrogate pairs don't need any special handling
930 // The second half wont be a '$' or a '\', and
931 // will move to the dest normally on the next
932 // loop iteration.
933 if (replIdx >= replacementLength) {
934 break;
935 }
936 c = replacementText[replIdx];
937
938 if (c==0x55/*U*/ || c==0x75/*u*/) {
939 // We have a \udddd or \Udddddddd escape sequence.
940 UChar32 escapedChar =
941 u_unescapeAt(unescape_charAt,
942 &replIdx, // Index is updated by unescapeAt
943 replacementLength, // Length of replacement text
73c04bcf 944 (void *)replacementText);
374ca955
A
945
946 if (escapedChar != (UChar32)0xFFFFFFFF) {
947 if (escapedChar <= 0xffff) {
948 appendToBuf((UChar)escapedChar, &destIdx, dest, capacity);
949 } else {
950 appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity);
951 appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity);
952 }
953 continue;
954 }
955 // Note: if the \u escape was invalid, just fall through and
956 // treat it as a plain \<anything> escape.
957 }
958
959 // Plain backslash escape. Just put out the escaped character.
960 appendToBuf(c, &destIdx, dest, capacity);
961
962 replIdx++;
963 continue;
964 }
965
966
967
968 // We've got a $. Pick up a capture group number if one follows.
969 // Consume at most the number of digits necessary for the largest capture
970 // number that is valid for this pattern.
971
972 int32_t numDigits = 0;
973 int32_t groupNum = 0;
974 UChar32 digitC;
975 for (;;) {
976 if (replIdx >= replacementLength) {
977 break;
978 }
979 U16_GET(replacementText, 0, replIdx, replacementLength, digitC);
980 if (u_isdigit(digitC) == FALSE) {
981 break;
982 }
983
984 U16_FWD_1(replacementText, replIdx, replacementLength);
985 groupNum=groupNum*10 + u_charDigitValue(digitC);
986 numDigits++;
987 if (numDigits >= m->fPattern->fMaxCaptureDigits) {
988 break;
989 }
990 }
991
992
993 if (numDigits == 0) {
994 // The $ didn't introduce a group number at all.
995 // Treat it as just part of the substitution text.
996 appendToBuf(DOLLARSIGN, &destIdx, dest, capacity);
997 continue;
998 }
999
1000 // Finally, append the capture group data to the destination.
1001 int32_t capacityRemaining = capacity - destIdx;
1002 if (capacityRemaining < 0) {
1003 capacityRemaining = 0;
1004 }
1005 destIdx += uregex_group(regexp, groupNum, dest+destIdx, capacityRemaining, status);
1006 if (*status == U_BUFFER_OVERFLOW_ERROR) {
1007 // Ignore buffer overflow when extracting the group. We need to
1008 // continue on to get full size of the untruncated result. We will
1009 // raise our own buffer overflow error at the end.
1010 *status = U_ZERO_ERROR;
1011 }
1012
1013 if (U_FAILURE(*status)) {
1014 // Can fail if group number is out of range.
1015 break;
1016 }
1017
1018 }
1019
1020 //
1021 // Nul Terminate the dest buffer if possible.
1022 // Set the appropriate buffer overflow or not terminated error, if needed.
1023 //
1024 if (destIdx < capacity) {
1025 dest[destIdx] = 0;
1026 } else if (destIdx == *destCapacity) {
1027 *status = U_STRING_NOT_TERMINATED_WARNING;
1028 } else {
1029 *status = U_BUFFER_OVERFLOW_ERROR;
1030 }
1031
1032 //
1033 // Return an updated dest buffer and capacity to the caller.
1034 //
1035 if (destIdx > 0 && *destCapacity > 0) {
1036 if (destIdx < capacity) {
1037 *destBuf += destIdx;
1038 *destCapacity -= destIdx;
1039 } else {
1040 *destBuf += capacity;
1041 *destCapacity = 0;
1042 }
1043 }
1044
1045 // If we came in with a buffer overflow, make sure we go out with one also.
1046 // (A zero length match right at the end of the previous match could
1047 // make this function succeed even though a previous call had overflowed the buf)
1048 if (pendingBufferOverflow && U_SUCCESS(*status)) {
1049 *status = U_BUFFER_OVERFLOW_ERROR;
1050 }
1051
1052 return destIdx;
1053}
1054
//
//   appendReplacement   the actual API function,
//
1058U_CAPI int32_t U_EXPORT2
1059uregex_appendReplacement(URegularExpression *regexp,
73c04bcf 1060 const UChar *replacementText,
374ca955
A
1061 int32_t replacementLength,
1062 UChar **destBuf,
1063 int32_t *destCapacity,
1064 UErrorCode *status) {
1065 return RegexCImpl::appendReplacement(
1066 regexp, replacementText, replacementLength,destBuf, destCapacity, status);
1067}
1068
1069
73c04bcf 1070//------------------------------------------------------------------------------
374ca955
A
1071//
1072// uregex_appendTail
1073//
73c04bcf 1074//------------------------------------------------------------------------------
374ca955
A
1075int32_t RegexCImpl::appendTail(URegularExpression *regexp,
1076 UChar **destBuf,
1077 int32_t *destCapacity,
46f4442e
A
1078 UErrorCode *status)
1079{
374ca955 1080
46f4442e
A
1081 if (destCapacity == NULL || destBuf == NULL ||
1082 *destBuf == NULL && *destCapacity > 0 ||
1083 *destCapacity < 0)
1084 {
1085 *status = U_ILLEGAL_ARGUMENT_ERROR;
1086 return 0;
1087 }
1088
374ca955
A
1089 // If we come in with a buffer overflow error, don't suppress the operation.
1090 // A series of appendReplacements, appendTail need to correctly preflight
1091 // the buffer size when an overflow happens somewhere in the middle.
1092 UBool pendingBufferOverflow = FALSE;
1093 if (*status == U_BUFFER_OVERFLOW_ERROR && *destCapacity == 0) {
1094 pendingBufferOverflow = TRUE;
1095 *status = U_ZERO_ERROR;
1096 }
1097
1098 if (validateRE(regexp, status) == FALSE) {
1099 return 0;
1100 }
374ca955
A
1101 RegexMatcher *m = regexp->fMatcher;
1102
1103 int32_t srcIdx;
1104 if (m->fMatch) {
1105 // The most recent call to find() succeeded.
1106 srcIdx = m->fMatchEnd;
1107 } else {
1108 // The last call to find() on this matcher failed().
1109 // Look back to the end of the last find() that succeeded for src index.
1110 srcIdx = m->fLastMatchEnd;
1111 if (srcIdx == -1) {
1112 // There has been no successful match with this matcher.
1113 // We want to copy the whole string.
1114 srcIdx = 0;
1115 }
1116 }
1117
1118 int32_t destIdx = 0;
1119 int32_t destCap = *destCapacity;
1120 UChar *dest = *destBuf;
1121
1122 for (;;) {
1123 if (srcIdx == regexp->fTextLength) {
1124 break;
1125 }
1126 UChar c = regexp->fText[srcIdx];
1127 if (c == 0 && regexp->fTextLength == -1) {
1128 break;
1129 }
1130 if (destIdx < destCap) {
1131 dest[destIdx] = c;
1132 } else {
1133 // We've overflowed the dest buffer.
1134 // If the total input string length is known, we can
1135 // compute the total buffer size needed without scanning through the string.
1136 if (regexp->fTextLength > 0) {
1137 destIdx += (regexp->fTextLength - srcIdx);
1138 break;
1139 }
1140 }
1141 srcIdx++;
1142 destIdx++;
1143 }
1144
1145 //
1146 // NUL terminate the output string, if possible, otherwise issue the
1147 // appropriate error or warning.
1148 //
1149 if (destIdx < destCap) {
1150 dest[destIdx] = 0;
1151 } else if (destIdx == destCap) {
1152 *status = U_STRING_NOT_TERMINATED_WARNING;
1153 } else {
1154 *status = U_BUFFER_OVERFLOW_ERROR;
1155 }
1156
1157 //
1158 // Update the user's buffer ptr and capacity vars to reflect the
1159 // amount used.
1160 //
1161 if (destIdx < destCap) {
1162 *destBuf += destIdx;
1163 *destCapacity -= destIdx;
1164 } else {
1165 *destBuf += destCap;
1166 *destCapacity = 0;
1167 }
1168
1169 if (pendingBufferOverflow && U_SUCCESS(*status)) {
1170 *status = U_BUFFER_OVERFLOW_ERROR;
1171 }
1172
1173 return destIdx;
73c04bcf 1174}
374ca955
A
1175
1176
1177U_CAPI int32_t U_EXPORT2
1178uregex_appendTail(URegularExpression *regexp,
1179 UChar **destBuf,
1180 int32_t *destCapacity,
1181 UErrorCode *status) {
1182 return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status);
1183}
1184
1185
73c04bcf 1186//------------------------------------------------------------------------------
374ca955
A
1187//
1188// copyString Internal utility to copy a string to an output buffer,
1189// while managing buffer overflow and preflight size
1190// computation. NUL termination is added to destination,
1191// and the NUL is counted in the output size.
1192//
73c04bcf 1193//------------------------------------------------------------------------------
374ca955
A
1194static void copyString(UChar *destBuffer, // Destination buffer.
1195 int32_t destCapacity, // Total capacity of dest buffer
1196 int32_t *destIndex, // Index into dest buffer. Updated on return.
1197 // Update not clipped to destCapacity.
1198 const UChar *srcPtr, // Pointer to source string
1199 int32_t srcLen) // Source string len.
1200{
1201 int32_t si;
1202 int32_t di = *destIndex;
1203 UChar c;
1204
1205 for (si=0; si<srcLen; si++) {
1206 c = srcPtr[si];
1207 if (di < destCapacity) {
1208 destBuffer[di] = c;
1209 di++;
1210 } else {
1211 di += srcLen - si;
1212 break;
1213 }
1214 }
73c04bcf
A
1215 if (di<destCapacity) {
1216 destBuffer[di] = 0;
1217 }
1218 di++;
374ca955
A
1219 *destIndex = di;
1220}
1221
1222
73c04bcf 1223//------------------------------------------------------------------------------
374ca955
A
1224//
1225// uregex_split
1226//
73c04bcf 1227//------------------------------------------------------------------------------
374ca955
A
1228U_CAPI int32_t U_EXPORT2
1229uregex_split( URegularExpression *regexp,
1230 UChar *destBuf,
1231 int32_t destCapacity,
1232 int32_t *requiredCapacity,
1233 UChar *destFields[],
1234 int32_t destFieldsCapacity,
1235 UErrorCode *status) {
1236 if (validateRE(regexp, status) == FALSE) {
1237 return 0;
1238 }
1239 if (destBuf == NULL && destCapacity > 0 ||
1240 destCapacity < 0 ||
1241 destFields == NULL ||
1242 destFieldsCapacity < 1 ) {
1243 *status = U_ILLEGAL_ARGUMENT_ERROR;
1244 return 0;
1245 }
1246
1247 //
1248 // Reset for the input text
1249 //
1250 regexp->fMatcher->reset();
1251 int32_t inputLen = regexp->fTextString.length();
1252 int32_t nextOutputStringStart = 0;
1253 if (inputLen == 0) {
1254 return 0;
1255 }
1256
1257
1258 //
1259 // Loop through the input text, searching for the delimiter pattern
1260 //
1261 int32_t i; // Index of the field being processed.
1262 int32_t destIdx = 0; // Next available position in destBuf;
1263 int32_t numCaptureGroups = regexp->fMatcher->groupCount();
1264 for (i=0; ; i++) {
1265 if (i>=destFieldsCapacity-1) {
1266 // There are one or zero output string left.
1267 // Fill the last output string with whatever is left from the input, then exit the loop.
1268 // ( i will be == destFieldsCapacity if we filled the output array while processing
1269 // capture groups of the delimiter expression, in which case we will discard the
1270 // last capture group saved in favor of the unprocessed remainder of the
1271 // input string.)
1272 int32_t remainingLength = inputLen-nextOutputStringStart;
1273 if (remainingLength > 0) {
1274 }
1275 if (i >= destFieldsCapacity) {
1276 // No fields are left. Recycle the last one for holding the trailing part of
1277 // the input string.
1278 i = destFieldsCapacity-1;
1279 destIdx = (int32_t)(destFields[i] - destFields[0]);
1280 }
1281
1282 destFields[i] = &destBuf[destIdx];
1283 copyString(destBuf, destCapacity, &destIdx,
1284 &regexp->fText[nextOutputStringStart], remainingLength);
1285 break;
1286 }
1287
1288 if (regexp->fMatcher->find()) {
1289 // We found another delimiter. Move everything from where we started looking
1290 // up until the start of the delimiter into the next output string.
1291 int32_t fieldLen = regexp->fMatcher->start(*status) - nextOutputStringStart;
1292 destFields[i] = &destBuf[destIdx];
1293 copyString(destBuf, destCapacity, &destIdx,
1294 &regexp->fText[nextOutputStringStart], fieldLen);
1295 nextOutputStringStart = regexp->fMatcher->end(*status);
1296
1297 // If the delimiter pattern has capturing parentheses, the captured
1298 // text goes out into the next n destination strings.
1299 int32_t groupNum;
1300 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
1301 // If we've run out of output string slots, bail out.
1302 if (i==destFieldsCapacity-1) {
1303 break;
1304 }
1305 i++;
1306
1307 // Set up to extract the capture group contents into the dest buffer.
1308 UErrorCode tStatus = U_ZERO_ERROR; // Want to ignore any buffer overflow
1309 // error while extracting this group.
1310 int32_t remainingCapacity = destCapacity - destIdx;
1311 if (remainingCapacity < 0) {
1312 remainingCapacity = 0;
1313 }
1314 destFields[i] = &destBuf[destIdx];
1315 int32_t t = uregex_group(regexp, groupNum, destFields[i], remainingCapacity, &tStatus);
1316 destIdx += t + 1; // Record the space used in the output string buffer.
1317 // +1 for the NUL that terminates the string.
1318 }
1319
1320 if (nextOutputStringStart == inputLen) {
1321 // The delimiter was at the end of the string. We're done.
1322 break;
1323 }
1324
1325 }
1326 else
1327 {
1328 // We ran off the end of the input while looking for the next delimiter.
1329 // All the remaining text goes into the current output string.
1330 destFields[i] = &destBuf[destIdx];
1331 copyString(destBuf, destCapacity, &destIdx,
1332 &regexp->fText[nextOutputStringStart], inputLen-nextOutputStringStart);
1333 break;
1334 }
1335 }
1336
1337 // Zero out any unused portion of the destFields array
1338 int j;
1339 for (j=i+1; j<destFieldsCapacity; j++) {
1340 destFields[j] = NULL;
1341 }
1342
1343 if (requiredCapacity != NULL) {
1344 *requiredCapacity = destIdx;
1345 }
73c04bcf 1346 if (destIdx > destCapacity) {
374ca955
A
1347 *status = U_BUFFER_OVERFLOW_ERROR;
1348 }
1349 return i+1;
1350}
1351
1352
374ca955 1353#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
73c04bcf 1354