]> git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/uregex.cpp
ICU-8.11.4.tar.gz
[apple/icu.git] / icuSources / i18n / uregex.cpp
CommitLineData
374ca955
A
1/*
2*******************************************************************************
73c04bcf 3* Copyright (C) 2004-2005, International Business Machines
374ca955
A
4* Corporation and others. All Rights Reserved.
5*******************************************************************************
6* file name: uregex.cpp
7*/
8
9#include "unicode/utypes.h"
10
11#if !UCONFIG_NO_REGULAR_EXPRESSIONS
12
13#include "unicode/regex.h"
14#include "unicode/uregex.h"
15#include "unicode/unistr.h"
16#include "unicode/ustring.h"
17#include "unicode/uchar.h"
18#include "unicode/uobject.h"
19#include "umutex.h"
20#include "uassert.h"
21#include "cmemory.h"
22
23struct URegularExpression: public UMemory {
24public:
25 URegularExpression();
26 ~URegularExpression();
27 int32_t fMagic;
28 RegexPattern *fPat;
29 int32_t *fPatRefCount;
30 UChar *fPatString;
31 int32_t fPatStringLen;
32 RegexMatcher *fMatcher;
33 const UChar *fText; // Text from setText()
34 int32_t fTextLength; // Length provided by user with setText(), which
35 // may be -1.
36
37 UnicodeString fTextString; // The setText(text) is wrapped into a UnicodeString.
38 // TODO: regexp engine should not depend on UnicodeString.
39};
40
// Signature stored in URegularExpression::fMagic; the bytes spell "rexp" in ASCII.
static const int32_t REXP_MAGIC = 0x72657870;
42
43U_NAMESPACE_USE
44
45URegularExpression::URegularExpression() {
46 fMagic = REXP_MAGIC;
47 fPat = NULL;
48 fPatRefCount = NULL;
49 fPatString = NULL;
50 fPatStringLen = 0;
51 fMatcher = NULL;
52 fText = NULL;
53 fTextLength = 0;
54}
55
56URegularExpression::~URegularExpression() {
57 delete fMatcher;
58 fMatcher = NULL;
59 if (fPatRefCount!=NULL && umtx_atomic_dec(fPatRefCount)==0) {
60 delete fPat;
61 uprv_free(fPatString);
62 uprv_free(fPatRefCount);
63 }
64 fMagic = 0;
65}
66
67//----------------------------------------------------------------------------------------
68//
69// validateRE Do boilerplate style checks on API function parameters.
70// Return TRUE if they look OK.
71//----------------------------------------------------------------------------------------
72static UBool validateRE(const URegularExpression *re, UErrorCode *status, UBool requiresText = TRUE) {
73 if (U_FAILURE(*status)) {
74 return FALSE;
75 }
76 if (re == NULL || re->fMagic != REXP_MAGIC) {
77 // U_ASSERT(FALSE);
78 *status = U_ILLEGAL_ARGUMENT_ERROR;
79 return FALSE;
80 }
81 if (requiresText && re->fText == NULL) {
82 *status = U_REGEX_INVALID_STATE;
83 return FALSE;
84 }
85 return TRUE;
86}
87
88//----------------------------------------------------------------------------------------
89//
90// uregex_open
91//
92//----------------------------------------------------------------------------------------
93U_CAPI URegularExpression * U_EXPORT2
94uregex_open( const UChar *pattern,
95 int32_t patternLength,
96 uint32_t flags,
97 UParseError *pe,
98 UErrorCode *status) {
99
100 if (U_FAILURE(*status)) {
101 return NULL;
102 }
103 if (pattern == NULL || patternLength < -1 || patternLength == 0) {
104 *status = U_ILLEGAL_ARGUMENT_ERROR;
105 return NULL;
106 }
107 int32_t actualPatLen = patternLength;
108 if (actualPatLen == -1) {
109 actualPatLen = u_strlen(pattern);
110 }
111
112 URegularExpression *re = new URegularExpression;
113 int32_t *refC = (int32_t *)uprv_malloc(sizeof(int32_t));
114 UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1));
115 if (re == NULL || refC == NULL || patBuf == NULL) {
116 *status = U_MEMORY_ALLOCATION_ERROR;
117 delete re;
118 uprv_free(refC);
119 uprv_free(patBuf);
120 return NULL;
121 }
122 re->fPatRefCount = refC;
123 *re->fPatRefCount = 1;
124
125 //
126 // Make a copy of the pattern string, so we can return it later if asked.
127 // For compiling the pattern, we will use a read-only-aliased UnicodeString
128 // of this local copy, to avoid making even more copies.
129 //
130 re->fPatString = patBuf;
131 re->fPatStringLen = patternLength;
132 u_memcpy(patBuf, pattern, actualPatLen);
133 patBuf[actualPatLen] = 0;
134 UnicodeString patString(patternLength==-1, patBuf, patternLength);
135
136 //
137 // Compile the pattern
138 //
139 if (pe != NULL) {
140 re->fPat = RegexPattern::compile(patString, flags, *pe, *status);
141 } else {
142 re->fPat = RegexPattern::compile(patString, flags, *status);
143 }
144 if (U_FAILURE(*status)) {
145 goto ErrorExit;
146 }
147
148 //
149 // Create the matcher object
150 //
151 re->fMatcher = re->fPat->matcher(*status);
152 if (U_SUCCESS(*status)) {
153 return re;
154 }
155
156ErrorExit:
157 delete re;
158 return NULL;
159
160}
161
374ca955
A
162//----------------------------------------------------------------------------------------
163//
164// uregex_close
165//
166//----------------------------------------------------------------------------------------
167U_CAPI void U_EXPORT2
168uregex_close(URegularExpression *re) {
169 UErrorCode status = U_ZERO_ERROR;
170 if (validateRE(re, &status, FALSE) == FALSE) {
171 return;
172 }
173 delete re;
174}
175
176
177//----------------------------------------------------------------------------------------
178//
179// uregex_clone
180//
181//----------------------------------------------------------------------------------------
182U_CAPI URegularExpression * U_EXPORT2
183uregex_clone(const URegularExpression *source, UErrorCode *status) {
184 if (validateRE(source, status, FALSE) == FALSE) {
185 return NULL;
186 }
187
188 URegularExpression *clone = new URegularExpression;
189 if (clone == NULL) {
190 *status = U_MEMORY_ALLOCATION_ERROR;
191 return NULL;
192 }
193
194 clone->fMatcher = source->fPat->matcher(*status);
195 if (U_FAILURE(*status)) {
196 delete clone;
197 return NULL;
198 }
199 if (clone == NULL) {
200 *status = U_MEMORY_ALLOCATION_ERROR;
201 return NULL;
202 }
203
204 clone->fPat = source->fPat;
205 clone->fPatRefCount = source->fPatRefCount;
206 clone->fPatString = source->fPatString;
207 clone->fPatStringLen = source->fPatStringLen;
208 umtx_atomic_inc(source->fPatRefCount);
209 // Note: fText is not cloned.
210
211 return clone;
73c04bcf 212}
374ca955
A
213
214
215
216
73c04bcf 217//------------------------------------------------------------------------------
374ca955
A
218//
219// uregex_pattern
220//
73c04bcf 221//------------------------------------------------------------------------------
374ca955
A
222U_CAPI const UChar * U_EXPORT2
223uregex_pattern(const URegularExpression *regexp,
224 int32_t *patLength,
225 UErrorCode *status) {
226
227 if (validateRE(regexp, status, FALSE) == FALSE) {
228 return NULL;
229 }
230 if (patLength != NULL) {
231 *patLength = regexp->fPatStringLen;
232 }
233 return regexp->fPatString;
73c04bcf 234}
374ca955
A
235
236
73c04bcf 237//------------------------------------------------------------------------------
374ca955
A
238//
239// uregex_flags
240//
73c04bcf 241//------------------------------------------------------------------------------
374ca955
A
242U_CAPI int32_t U_EXPORT2
243uregex_flags(const URegularExpression *regexp, UErrorCode *status) {
244 if (validateRE(regexp, status, FALSE) == FALSE) {
245 return 0;
246 }
247 int32_t flags = regexp->fPat->flags();
248 return flags;
73c04bcf 249}
374ca955
A
250
251
73c04bcf 252//------------------------------------------------------------------------------
374ca955
A
253//
254// uregex_setText
255//
73c04bcf 256//------------------------------------------------------------------------------
374ca955
A
257U_CAPI void U_EXPORT2
258uregex_setText(URegularExpression *regexp,
259 const UChar *text,
260 int32_t textLength,
261 UErrorCode *status) {
262 if (validateRE(regexp, status, FALSE) == FALSE) {
263 return;
264 }
265 if (text == NULL || textLength < -1) {
266 *status = U_ILLEGAL_ARGUMENT_ERROR;
267 return;
268 }
269 regexp->fText = text;
270 regexp->fTextLength = textLength;
271 UBool isTerminated = (textLength == -1);
272
273 regexp->fTextString.setTo(isTerminated, text, textLength);
274 regexp->fMatcher->reset(regexp->fTextString);
73c04bcf 275}
374ca955
A
276
277
278
73c04bcf 279//------------------------------------------------------------------------------
374ca955
A
280//
281// uregex_getText
282//
73c04bcf 283//------------------------------------------------------------------------------
374ca955
A
284U_CAPI const UChar * U_EXPORT2
285uregex_getText(URegularExpression *regexp,
286 int32_t *textLength,
287 UErrorCode *status) {
288 if (validateRE(regexp, status, FALSE) == FALSE) {
289 return NULL;
290 }
291 if (textLength != NULL) {
292 *textLength = regexp->fTextLength;
293 }
294 return regexp->fText;
73c04bcf 295}
374ca955
A
296
297
73c04bcf 298//------------------------------------------------------------------------------
374ca955
A
299//
300// uregex_matches
301//
73c04bcf 302//------------------------------------------------------------------------------
374ca955
A
303U_CAPI UBool U_EXPORT2
304uregex_matches(URegularExpression *regexp,
305 int32_t startIndex,
306 UErrorCode *status) {
307 if (validateRE(regexp, status) == FALSE) {
308 return FALSE;
309 }
310 UBool result = regexp->fMatcher->matches(startIndex, *status);
311 return result;
73c04bcf 312}
374ca955
A
313
314
315
73c04bcf 316//------------------------------------------------------------------------------
374ca955
A
317//
318// uregex_lookingAt
319//
73c04bcf 320//------------------------------------------------------------------------------
374ca955
A
321U_CAPI UBool U_EXPORT2
322uregex_lookingAt(URegularExpression *regexp,
323 int32_t startIndex,
324 UErrorCode *status) {
325 if (validateRE(regexp, status) == FALSE) {
326 return FALSE;
327 }
328 UBool result = regexp->fMatcher->lookingAt(startIndex, *status);
329 return result;
73c04bcf 330}
374ca955
A
331
332
333
73c04bcf 334//------------------------------------------------------------------------------
374ca955
A
335//
336// uregex_find
337//
73c04bcf 338//------------------------------------------------------------------------------
374ca955
A
339U_CAPI UBool U_EXPORT2
340uregex_find(URegularExpression *regexp,
341 int32_t startIndex,
342 UErrorCode *status) {
343 if (validateRE(regexp, status) == FALSE) {
344 return FALSE;
345 }
346 UBool result = regexp->fMatcher->find(startIndex, *status);
347 return result;
73c04bcf 348}
374ca955 349
73c04bcf 350//------------------------------------------------------------------------------
374ca955
A
351//
352// uregex_findNext
353//
73c04bcf 354//------------------------------------------------------------------------------
374ca955
A
355U_CAPI UBool U_EXPORT2
356uregex_findNext(URegularExpression *regexp,
357 UErrorCode *status) {
358 if (validateRE(regexp, status) == FALSE) {
359 return FALSE;
360 }
361 UBool result = regexp->fMatcher->find();
362 return result;
73c04bcf 363}
374ca955 364
73c04bcf 365//------------------------------------------------------------------------------
374ca955
A
366//
367// uregex_groupCount
368//
73c04bcf 369//------------------------------------------------------------------------------
374ca955
A
370U_CAPI int32_t U_EXPORT2
371uregex_groupCount(URegularExpression *regexp,
372 UErrorCode *status) {
373 if (validateRE(regexp, status, FALSE) == FALSE) {
374 return 0;
375 }
376 int32_t result = regexp->fMatcher->groupCount();
377 return result;
73c04bcf 378}
374ca955
A
379
380
73c04bcf 381//------------------------------------------------------------------------------
374ca955
A
382//
383// uregex_group
384//
73c04bcf 385//------------------------------------------------------------------------------
374ca955
A
386U_CAPI int32_t U_EXPORT2
387uregex_group(URegularExpression *regexp,
388 int32_t groupNum,
389 UChar *dest,
390 int32_t destCapacity,
391 UErrorCode *status) {
392 if (validateRE(regexp, status) == FALSE) {
393 return 0;
394 }
395 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
396 *status = U_ILLEGAL_ARGUMENT_ERROR;
397 return 0;
398 }
399
400 //
401 // Pick up the range of characters from the matcher
402 //
403 int32_t startIx = regexp->fMatcher->start(groupNum, *status);
404 int32_t endIx = regexp->fMatcher->end (groupNum, *status);
405 if (U_FAILURE(*status)) {
406 return 0;
407 }
408
409 //
410 // Trim length based on buffer capacity
411 //
412 int32_t fullLength = endIx - startIx;
413 int32_t copyLength = fullLength;
414 if (copyLength < destCapacity) {
415 dest[copyLength] = 0;
416 } else if (copyLength == destCapacity) {
417 *status = U_STRING_NOT_TERMINATED_WARNING;
418 } else {
419 copyLength = destCapacity;
420 *status = U_BUFFER_OVERFLOW_ERROR;
421 }
422
423 //
424 // Copy capture group to user's buffer
425 //
426 if (copyLength > 0) {
427 u_memcpy(dest, &regexp->fText[startIx], copyLength);
428 }
429 return fullLength;
73c04bcf 430}
374ca955
A
431
432
73c04bcf 433//------------------------------------------------------------------------------
374ca955
A
434//
435// uregex_start
436//
73c04bcf 437//------------------------------------------------------------------------------
374ca955
A
438U_CAPI int32_t U_EXPORT2
439uregex_start(URegularExpression *regexp,
440 int32_t groupNum,
441 UErrorCode *status) {
442 if (validateRE(regexp, status) == FALSE) {
443 return 0;
444 }
445 int32_t result = regexp->fMatcher->start(groupNum, *status);
446 return result;
73c04bcf 447}
374ca955
A
448
449
73c04bcf 450//------------------------------------------------------------------------------
374ca955
A
451//
452// uregex_end
453//
73c04bcf 454//------------------------------------------------------------------------------
374ca955
A
455U_CAPI int32_t U_EXPORT2
456uregex_end(URegularExpression *regexp,
457 int32_t groupNum,
458 UErrorCode *status) {
459 if (validateRE(regexp, status) == FALSE) {
460 return 0;
461 }
462 int32_t result = regexp->fMatcher->end(groupNum, *status);
463 return result;
73c04bcf 464}
374ca955 465
73c04bcf 466//------------------------------------------------------------------------------
374ca955
A
467//
468// uregex_reset
469//
73c04bcf 470//------------------------------------------------------------------------------
374ca955
A
471U_CAPI void U_EXPORT2
472uregex_reset(URegularExpression *regexp,
473 int32_t index,
474 UErrorCode *status) {
475 if (validateRE(regexp, status) == FALSE) {
476 return;
477 }
478 regexp->fMatcher->reset(index, *status);
73c04bcf 479}
374ca955
A
480
481
73c04bcf 482//------------------------------------------------------------------------------
374ca955
A
483//
484// uregex_replaceAll
485//
73c04bcf 486//------------------------------------------------------------------------------
374ca955
A
487U_CAPI int32_t U_EXPORT2
488uregex_replaceAll(URegularExpression *regexp,
73c04bcf 489 const UChar *replacementText,
374ca955
A
490 int32_t replacementLength,
491 UChar *destBuf,
492 int32_t destCapacity,
493 UErrorCode *status) {
494 if (validateRE(regexp, status) == FALSE) {
495 return 0;
496 }
497 if (replacementText == NULL || replacementLength < -1 ||
498 destBuf == NULL && destCapacity > 0 ||
499 destCapacity < 0) {
500 *status = U_ILLEGAL_ARGUMENT_ERROR;
501 return 0;
502 }
503
504 int32_t len = 0;
505 uregex_reset(regexp, 0, status);
506 while (uregex_findNext(regexp, status)) {
507 len += uregex_appendReplacement(regexp, replacementText, replacementLength,
508 &destBuf, &destCapacity, status);
509 }
510 len += uregex_appendTail(regexp, &destBuf, &destCapacity, status);
511
512 return len;
73c04bcf 513}
374ca955
A
514
515
73c04bcf 516//------------------------------------------------------------------------------
374ca955
A
517//
518// uregex_replaceFirst
519//
73c04bcf 520//------------------------------------------------------------------------------
374ca955
A
521U_CAPI int32_t U_EXPORT2
522uregex_replaceFirst(URegularExpression *regexp,
73c04bcf 523 const UChar *replacementText,
374ca955
A
524 int32_t replacementLength,
525 UChar *destBuf,
526 int32_t destCapacity,
527 UErrorCode *status) {
528 if (validateRE(regexp, status) == FALSE) {
529 return 0;
530 }
531 if (replacementText == NULL || replacementLength < -1 ||
532 destBuf == NULL && destCapacity > 0 ||
533 destCapacity < 0) {
534 *status = U_ILLEGAL_ARGUMENT_ERROR;
535 return 0;
536 }
537
538 int32_t len = 0;
539 UBool findSucceeded;
540 uregex_reset(regexp, 0, status);
541 findSucceeded = uregex_find(regexp, 0, status);
542 if (findSucceeded) {
543 len = uregex_appendReplacement(regexp, replacementText, replacementLength,
544 &destBuf, &destCapacity, status);
545 }
546 len += uregex_appendTail(regexp, &destBuf, &destCapacity, status);
547
548 return len;
73c04bcf 549}
374ca955
A
550
551
73c04bcf 552//------------------------------------------------------------------------------
374ca955
A
553//
554// uregex_appendReplacement
555//
73c04bcf 556//------------------------------------------------------------------------------
374ca955
A
557
558
559//
560// Dummy class, because these functions need to be friends of class RegexMatcher,
561// and stand-alone C functions don't work as friends
562//
563U_NAMESPACE_BEGIN
564class RegexCImpl {
565 public:
566 inline static int32_t appendReplacement(URegularExpression *regexp,
73c04bcf 567 const UChar *replacementText,
374ca955
A
568 int32_t replacementLength,
569 UChar **destBuf,
570 int32_t *destCapacity,
571 UErrorCode *status);
572
573 inline static int32_t appendTail(URegularExpression *regexp,
574 UChar **destBuf,
575 int32_t *destCapacity,
576 UErrorCode *status);
577};
578U_NAMESPACE_END
579
580
581//
582// Call-back function for u_unescapeAt(), used when we encounter
583// \uxxxx or \Uxxxxxxxxx escapes in the replacement text.
584//
585U_CDECL_BEGIN
586static UChar U_CALLCONV
587unescape_charAt(int32_t offset, void *context) {
588 UChar c16 = ((UChar *)context)[offset];
589 return c16;
590}
591U_CDECL_END
592
593
594static const UChar BACKSLASH = 0x5c;
595static const UChar DOLLARSIGN = 0x24;
596
597//
598// Move a character to an output buffer, with bounds checking on the index.
599// Index advances even if capacity is exceeded, for preflight size computations.
600// This little sequence is used a LOT.
601//
602static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) {
603 if (*idx < bufCapacity) {
604 buf[*idx] = c;
605 }
606 (*idx)++;
607}
608
609
610//
611// appendReplacement, the actual implementation.
612//
613int32_t RegexCImpl::appendReplacement(URegularExpression *regexp,
73c04bcf 614 const UChar *replacementText,
374ca955
A
615 int32_t replacementLength,
616 UChar **destBuf,
617 int32_t *destCapacity,
618 UErrorCode *status) {
619
620 // If we come in with a buffer overflow error, don't suppress the operation.
621 // A series of appendReplacements, appendTail need to correctly preflight
622 // the buffer size when an overflow happens somewhere in the middle.
623 UBool pendingBufferOverflow = FALSE;
624 if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity == 0) {
625 pendingBufferOverflow = TRUE;
626 *status = U_ZERO_ERROR;
627 }
628
629 //
630 // Validate all paramters
631 //
632 if (validateRE(regexp, status) == FALSE) {
633 return 0;
634 }
635 if (replacementText == NULL || replacementLength < -1 ||
636 destCapacity == NULL || destBuf == NULL ||
637 *destBuf == NULL && *destCapacity > 0 ||
638 *destCapacity < 0) {
639 *status = U_ILLEGAL_ARGUMENT_ERROR;
640 return 0;
641 }
642
643 RegexMatcher *m = regexp->fMatcher;
644 if (m->fMatch == FALSE) {
645 *status = U_REGEX_INVALID_STATE;
646 return 0;
647 }
648
649 UChar *dest = *destBuf;
650 int32_t capacity = *destCapacity;
651 int32_t destIdx = 0;
652 int32_t i;
653
654 // If it wasn't supplied by the caller, get the length of the replacement text.
655 // TODO: slightly smarter logic in the copy loop could watch for the NUL on
656 // the fly and avoid this step.
657 if (replacementLength == -1) {
658 replacementLength = u_strlen(replacementText);
659 }
660
661 // Copy input string from the end of previous match to start of current match
662 for (i=m->fLastMatchEnd; i<m->fMatchStart; i++) {
663 appendToBuf(regexp->fText[i], &destIdx, dest, capacity);
664 }
665
666
667
668 // scan the replacement text, looking for substitutions ($n) and \escapes.
669 int32_t replIdx = 0;
670 while (replIdx < replacementLength) {
671 UChar c = replacementText[replIdx];
672 replIdx++;
673 if (c != DOLLARSIGN && c != BACKSLASH) {
674 // Common case, no substitution, no escaping,
675 // just copy the char to the dest buf.
676 appendToBuf(c, &destIdx, dest, capacity);
677 continue;
678 }
679
680 if (c == BACKSLASH) {
681 // Backslash Escape. Copy the following char out without further checks.
682 // Note: Surrogate pairs don't need any special handling
683 // The second half wont be a '$' or a '\', and
684 // will move to the dest normally on the next
685 // loop iteration.
686 if (replIdx >= replacementLength) {
687 break;
688 }
689 c = replacementText[replIdx];
690
691 if (c==0x55/*U*/ || c==0x75/*u*/) {
692 // We have a \udddd or \Udddddddd escape sequence.
693 UChar32 escapedChar =
694 u_unescapeAt(unescape_charAt,
695 &replIdx, // Index is updated by unescapeAt
696 replacementLength, // Length of replacement text
73c04bcf 697 (void *)replacementText);
374ca955
A
698
699 if (escapedChar != (UChar32)0xFFFFFFFF) {
700 if (escapedChar <= 0xffff) {
701 appendToBuf((UChar)escapedChar, &destIdx, dest, capacity);
702 } else {
703 appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity);
704 appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity);
705 }
706 continue;
707 }
708 // Note: if the \u escape was invalid, just fall through and
709 // treat it as a plain \<anything> escape.
710 }
711
712 // Plain backslash escape. Just put out the escaped character.
713 appendToBuf(c, &destIdx, dest, capacity);
714
715 replIdx++;
716 continue;
717 }
718
719
720
721 // We've got a $. Pick up a capture group number if one follows.
722 // Consume at most the number of digits necessary for the largest capture
723 // number that is valid for this pattern.
724
725 int32_t numDigits = 0;
726 int32_t groupNum = 0;
727 UChar32 digitC;
728 for (;;) {
729 if (replIdx >= replacementLength) {
730 break;
731 }
732 U16_GET(replacementText, 0, replIdx, replacementLength, digitC);
733 if (u_isdigit(digitC) == FALSE) {
734 break;
735 }
736
737 U16_FWD_1(replacementText, replIdx, replacementLength);
738 groupNum=groupNum*10 + u_charDigitValue(digitC);
739 numDigits++;
740 if (numDigits >= m->fPattern->fMaxCaptureDigits) {
741 break;
742 }
743 }
744
745
746 if (numDigits == 0) {
747 // The $ didn't introduce a group number at all.
748 // Treat it as just part of the substitution text.
749 appendToBuf(DOLLARSIGN, &destIdx, dest, capacity);
750 continue;
751 }
752
753 // Finally, append the capture group data to the destination.
754 int32_t capacityRemaining = capacity - destIdx;
755 if (capacityRemaining < 0) {
756 capacityRemaining = 0;
757 }
758 destIdx += uregex_group(regexp, groupNum, dest+destIdx, capacityRemaining, status);
759 if (*status == U_BUFFER_OVERFLOW_ERROR) {
760 // Ignore buffer overflow when extracting the group. We need to
761 // continue on to get full size of the untruncated result. We will
762 // raise our own buffer overflow error at the end.
763 *status = U_ZERO_ERROR;
764 }
765
766 if (U_FAILURE(*status)) {
767 // Can fail if group number is out of range.
768 break;
769 }
770
771 }
772
773 //
774 // Nul Terminate the dest buffer if possible.
775 // Set the appropriate buffer overflow or not terminated error, if needed.
776 //
777 if (destIdx < capacity) {
778 dest[destIdx] = 0;
779 } else if (destIdx == *destCapacity) {
780 *status = U_STRING_NOT_TERMINATED_WARNING;
781 } else {
782 *status = U_BUFFER_OVERFLOW_ERROR;
783 }
784
785 //
786 // Return an updated dest buffer and capacity to the caller.
787 //
788 if (destIdx > 0 && *destCapacity > 0) {
789 if (destIdx < capacity) {
790 *destBuf += destIdx;
791 *destCapacity -= destIdx;
792 } else {
793 *destBuf += capacity;
794 *destCapacity = 0;
795 }
796 }
797
798 // If we came in with a buffer overflow, make sure we go out with one also.
799 // (A zero length match right at the end of the previous match could
800 // make this function succeed even though a previous call had overflowed the buf)
801 if (pendingBufferOverflow && U_SUCCESS(*status)) {
802 *status = U_BUFFER_OVERFLOW_ERROR;
803 }
804
805 return destIdx;
806}
807
808//
809// appendReplacement the acutal API function,
810//
811U_CAPI int32_t U_EXPORT2
812uregex_appendReplacement(URegularExpression *regexp,
73c04bcf 813 const UChar *replacementText,
374ca955
A
814 int32_t replacementLength,
815 UChar **destBuf,
816 int32_t *destCapacity,
817 UErrorCode *status) {
818 return RegexCImpl::appendReplacement(
819 regexp, replacementText, replacementLength,destBuf, destCapacity, status);
820}
821
822
73c04bcf 823//------------------------------------------------------------------------------
374ca955
A
824//
825// uregex_appendTail
826//
73c04bcf 827//------------------------------------------------------------------------------
374ca955
A
828int32_t RegexCImpl::appendTail(URegularExpression *regexp,
829 UChar **destBuf,
830 int32_t *destCapacity,
831 UErrorCode *status) {
832
833 // If we come in with a buffer overflow error, don't suppress the operation.
834 // A series of appendReplacements, appendTail need to correctly preflight
835 // the buffer size when an overflow happens somewhere in the middle.
836 UBool pendingBufferOverflow = FALSE;
837 if (*status == U_BUFFER_OVERFLOW_ERROR && *destCapacity == 0) {
838 pendingBufferOverflow = TRUE;
839 *status = U_ZERO_ERROR;
840 }
841
842 if (validateRE(regexp, status) == FALSE) {
843 return 0;
844 }
845 if (destCapacity == NULL || destBuf == NULL ||
846 *destBuf == NULL && *destCapacity > 0 ||
847 *destCapacity < 0) {
848 *status = U_ILLEGAL_ARGUMENT_ERROR;
849 return 0;
850 }
851
852 RegexMatcher *m = regexp->fMatcher;
853
854 int32_t srcIdx;
855 if (m->fMatch) {
856 // The most recent call to find() succeeded.
857 srcIdx = m->fMatchEnd;
858 } else {
859 // The last call to find() on this matcher failed().
860 // Look back to the end of the last find() that succeeded for src index.
861 srcIdx = m->fLastMatchEnd;
862 if (srcIdx == -1) {
863 // There has been no successful match with this matcher.
864 // We want to copy the whole string.
865 srcIdx = 0;
866 }
867 }
868
869 int32_t destIdx = 0;
870 int32_t destCap = *destCapacity;
871 UChar *dest = *destBuf;
872
873 for (;;) {
874 if (srcIdx == regexp->fTextLength) {
875 break;
876 }
877 UChar c = regexp->fText[srcIdx];
878 if (c == 0 && regexp->fTextLength == -1) {
879 break;
880 }
881 if (destIdx < destCap) {
882 dest[destIdx] = c;
883 } else {
884 // We've overflowed the dest buffer.
885 // If the total input string length is known, we can
886 // compute the total buffer size needed without scanning through the string.
887 if (regexp->fTextLength > 0) {
888 destIdx += (regexp->fTextLength - srcIdx);
889 break;
890 }
891 }
892 srcIdx++;
893 destIdx++;
894 }
895
896 //
897 // NUL terminate the output string, if possible, otherwise issue the
898 // appropriate error or warning.
899 //
900 if (destIdx < destCap) {
901 dest[destIdx] = 0;
902 } else if (destIdx == destCap) {
903 *status = U_STRING_NOT_TERMINATED_WARNING;
904 } else {
905 *status = U_BUFFER_OVERFLOW_ERROR;
906 }
907
908 //
909 // Update the user's buffer ptr and capacity vars to reflect the
910 // amount used.
911 //
912 if (destIdx < destCap) {
913 *destBuf += destIdx;
914 *destCapacity -= destIdx;
915 } else {
916 *destBuf += destCap;
917 *destCapacity = 0;
918 }
919
920 if (pendingBufferOverflow && U_SUCCESS(*status)) {
921 *status = U_BUFFER_OVERFLOW_ERROR;
922 }
923
924 return destIdx;
73c04bcf 925}
374ca955
A
926
927
928U_CAPI int32_t U_EXPORT2
929uregex_appendTail(URegularExpression *regexp,
930 UChar **destBuf,
931 int32_t *destCapacity,
932 UErrorCode *status) {
933 return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status);
934}
935
936
73c04bcf 937//------------------------------------------------------------------------------
374ca955
A
938//
939// copyString Internal utility to copy a string to an output buffer,
940// while managing buffer overflow and preflight size
941// computation. NUL termination is added to destination,
942// and the NUL is counted in the output size.
943//
73c04bcf 944//------------------------------------------------------------------------------
374ca955
A
945static void copyString(UChar *destBuffer, // Destination buffer.
946 int32_t destCapacity, // Total capacity of dest buffer
947 int32_t *destIndex, // Index into dest buffer. Updated on return.
948 // Update not clipped to destCapacity.
949 const UChar *srcPtr, // Pointer to source string
950 int32_t srcLen) // Source string len.
951{
952 int32_t si;
953 int32_t di = *destIndex;
954 UChar c;
955
956 for (si=0; si<srcLen; si++) {
957 c = srcPtr[si];
958 if (di < destCapacity) {
959 destBuffer[di] = c;
960 di++;
961 } else {
962 di += srcLen - si;
963 break;
964 }
965 }
73c04bcf
A
966 if (di<destCapacity) {
967 destBuffer[di] = 0;
968 }
969 di++;
374ca955
A
970 *destIndex = di;
971}
972
973
73c04bcf 974//------------------------------------------------------------------------------
374ca955
A
975//
976// uregex_split
977//
73c04bcf 978//------------------------------------------------------------------------------
374ca955
A
979U_CAPI int32_t U_EXPORT2
980uregex_split( URegularExpression *regexp,
981 UChar *destBuf,
982 int32_t destCapacity,
983 int32_t *requiredCapacity,
984 UChar *destFields[],
985 int32_t destFieldsCapacity,
986 UErrorCode *status) {
987 if (validateRE(regexp, status) == FALSE) {
988 return 0;
989 }
990 if (destBuf == NULL && destCapacity > 0 ||
991 destCapacity < 0 ||
992 destFields == NULL ||
993 destFieldsCapacity < 1 ) {
994 *status = U_ILLEGAL_ARGUMENT_ERROR;
995 return 0;
996 }
997
998 //
999 // Reset for the input text
1000 //
1001 regexp->fMatcher->reset();
1002 int32_t inputLen = regexp->fTextString.length();
1003 int32_t nextOutputStringStart = 0;
1004 if (inputLen == 0) {
1005 return 0;
1006 }
1007
1008
1009 //
1010 // Loop through the input text, searching for the delimiter pattern
1011 //
1012 int32_t i; // Index of the field being processed.
1013 int32_t destIdx = 0; // Next available position in destBuf;
1014 int32_t numCaptureGroups = regexp->fMatcher->groupCount();
1015 for (i=0; ; i++) {
1016 if (i>=destFieldsCapacity-1) {
1017 // There are one or zero output string left.
1018 // Fill the last output string with whatever is left from the input, then exit the loop.
1019 // ( i will be == destFieldsCapacity if we filled the output array while processing
1020 // capture groups of the delimiter expression, in which case we will discard the
1021 // last capture group saved in favor of the unprocessed remainder of the
1022 // input string.)
1023 int32_t remainingLength = inputLen-nextOutputStringStart;
1024 if (remainingLength > 0) {
1025 }
1026 if (i >= destFieldsCapacity) {
1027 // No fields are left. Recycle the last one for holding the trailing part of
1028 // the input string.
1029 i = destFieldsCapacity-1;
1030 destIdx = (int32_t)(destFields[i] - destFields[0]);
1031 }
1032
1033 destFields[i] = &destBuf[destIdx];
1034 copyString(destBuf, destCapacity, &destIdx,
1035 &regexp->fText[nextOutputStringStart], remainingLength);
1036 break;
1037 }
1038
1039 if (regexp->fMatcher->find()) {
1040 // We found another delimiter. Move everything from where we started looking
1041 // up until the start of the delimiter into the next output string.
1042 int32_t fieldLen = regexp->fMatcher->start(*status) - nextOutputStringStart;
1043 destFields[i] = &destBuf[destIdx];
1044 copyString(destBuf, destCapacity, &destIdx,
1045 &regexp->fText[nextOutputStringStart], fieldLen);
1046 nextOutputStringStart = regexp->fMatcher->end(*status);
1047
1048 // If the delimiter pattern has capturing parentheses, the captured
1049 // text goes out into the next n destination strings.
1050 int32_t groupNum;
1051 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
1052 // If we've run out of output string slots, bail out.
1053 if (i==destFieldsCapacity-1) {
1054 break;
1055 }
1056 i++;
1057
1058 // Set up to extract the capture group contents into the dest buffer.
1059 UErrorCode tStatus = U_ZERO_ERROR; // Want to ignore any buffer overflow
1060 // error while extracting this group.
1061 int32_t remainingCapacity = destCapacity - destIdx;
1062 if (remainingCapacity < 0) {
1063 remainingCapacity = 0;
1064 }
1065 destFields[i] = &destBuf[destIdx];
1066 int32_t t = uregex_group(regexp, groupNum, destFields[i], remainingCapacity, &tStatus);
1067 destIdx += t + 1; // Record the space used in the output string buffer.
1068 // +1 for the NUL that terminates the string.
1069 }
1070
1071 if (nextOutputStringStart == inputLen) {
1072 // The delimiter was at the end of the string. We're done.
1073 break;
1074 }
1075
1076 }
1077 else
1078 {
1079 // We ran off the end of the input while looking for the next delimiter.
1080 // All the remaining text goes into the current output string.
1081 destFields[i] = &destBuf[destIdx];
1082 copyString(destBuf, destCapacity, &destIdx,
1083 &regexp->fText[nextOutputStringStart], inputLen-nextOutputStringStart);
1084 break;
1085 }
1086 }
1087
1088 // Zero out any unused portion of the destFields array
1089 int j;
1090 for (j=i+1; j<destFieldsCapacity; j++) {
1091 destFields[j] = NULL;
1092 }
1093
1094 if (requiredCapacity != NULL) {
1095 *requiredCapacity = destIdx;
1096 }
73c04bcf 1097 if (destIdx > destCapacity) {
374ca955
A
1098 *status = U_BUFFER_OVERFLOW_ERROR;
1099 }
1100 return i+1;
1101}
1102
1103
374ca955 1104#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
73c04bcf 1105