]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/ustrcase.cpp
ICU-59152.0.1.tar.gz
[apple/icu.git] / icuSources / common / ustrcase.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2001-2015, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: ustrcase.cpp
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2002feb20
16 * created by: Markus W. Scherer
17 *
18 * Implementation file for string casing C API functions.
19 * Uses functions from uchar.c for basic functionality that requires access
20 * to the Unicode Character Database (uprops.dat).
21 */
22
23 #include "unicode/utypes.h"
24 #include "unicode/brkiter.h"
25 #include "unicode/casemap.h"
26 #include "unicode/edits.h"
27 #include "unicode/ustring.h"
28 #include "unicode/ucasemap.h"
29 #include "unicode/ubrk.h"
30 #include "unicode/utf.h"
31 #include "unicode/utf16.h"
32 #include "cmemory.h"
33 #include "ucase.h"
34 #include "ucasemap_imp.h"
35 #include "ustr_imp.h"
36 #include "uassert.h"
37
38 U_NAMESPACE_BEGIN
39
40 namespace {
41
42 int32_t checkOverflowAndEditsError(int32_t destIndex, int32_t destCapacity,
43 Edits *edits, UErrorCode &errorCode) {
44 if (U_SUCCESS(errorCode)) {
45 if (destIndex > destCapacity) {
46 errorCode = U_BUFFER_OVERFLOW_ERROR;
47 } else if (edits != NULL) {
48 edits->copyErrorTo(errorCode);
49 }
50 }
51 return destIndex;
52 }
53
54 } // namespace
55
56 U_NAMESPACE_END
57
58 U_NAMESPACE_USE
59
60 /* string casing ------------------------------------------------------------ */
61
62 /* Appends a full case mapping result, see UCASE_MAX_STRING_LENGTH. */
63 static inline int32_t
64 appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity,
65 int32_t result, const UChar *s,
66 int32_t cpLength, uint32_t options, icu::Edits *edits) {
67 UChar32 c;
68 int32_t length;
69
70 /* decode the result */
71 if(result<0) {
72 /* (not) original code point */
73 if(edits!=NULL) {
74 edits->addUnchanged(cpLength);
75 if(options & UCASEMAP_OMIT_UNCHANGED_TEXT) {
76 return destIndex;
77 }
78 }
79 c=~result;
80 if(destIndex<destCapacity && c<=0xffff) { // BMP slightly-fastpath
81 dest[destIndex++]=(UChar)c;
82 return destIndex;
83 }
84 length=cpLength;
85 } else {
86 if(result<=UCASE_MAX_STRING_LENGTH) {
87 c=U_SENTINEL;
88 length=result;
89 } else if(destIndex<destCapacity && result<=0xffff) { // BMP slightly-fastpath
90 dest[destIndex++]=(UChar)result;
91 if(edits!=NULL) {
92 edits->addReplace(cpLength, 1);
93 }
94 return destIndex;
95 } else {
96 c=result;
97 length=U16_LENGTH(c);
98 }
99 if(edits!=NULL) {
100 edits->addReplace(cpLength, length);
101 }
102 }
103 if(length>(INT32_MAX-destIndex)) {
104 return -1; // integer overflow
105 }
106
107 if(destIndex<destCapacity) {
108 /* append the result */
109 if(c>=0) {
110 /* code point */
111 UBool isError=FALSE;
112 U16_APPEND(dest, destIndex, destCapacity, c, isError);
113 if(isError) {
114 /* overflow, nothing written */
115 destIndex+=length;
116 }
117 } else {
118 /* string */
119 if((destIndex+length)<=destCapacity) {
120 while(length>0) {
121 dest[destIndex++]=*s++;
122 --length;
123 }
124 } else {
125 /* overflow */
126 destIndex+=length;
127 }
128 }
129 } else {
130 /* preflight */
131 destIndex+=length;
132 }
133 return destIndex;
134 }
135
136 static inline int32_t
137 appendUChar(UChar *dest, int32_t destIndex, int32_t destCapacity, UChar c) {
138 if(destIndex<destCapacity) {
139 dest[destIndex]=c;
140 } else if(destIndex==INT32_MAX) {
141 return -1; // integer overflow
142 }
143 return destIndex+1;
144 }
145
146 static inline int32_t
147 appendUnchanged(UChar *dest, int32_t destIndex, int32_t destCapacity,
148 const UChar *s, int32_t length, uint32_t options, icu::Edits *edits) {
149 if(length>0) {
150 if(edits!=NULL) {
151 edits->addUnchanged(length);
152 if(options & UCASEMAP_OMIT_UNCHANGED_TEXT) {
153 return destIndex;
154 }
155 }
156 if(length>(INT32_MAX-destIndex)) {
157 return -1; // integer overflow
158 }
159 if((destIndex+length)<=destCapacity) {
160 u_memcpy(dest+destIndex, s, length);
161 }
162 destIndex+=length;
163 }
164 return destIndex;
165 }
166
167 static UChar32 U_CALLCONV
168 utf16_caseContextIterator(void *context, int8_t dir) {
169 UCaseContext *csc=(UCaseContext *)context;
170 UChar32 c;
171
172 if(dir<0) {
173 /* reset for backward iteration */
174 csc->index=csc->cpStart;
175 csc->dir=dir;
176 } else if(dir>0) {
177 /* reset for forward iteration */
178 csc->index=csc->cpLimit;
179 csc->dir=dir;
180 } else {
181 /* continue current iteration direction */
182 dir=csc->dir;
183 }
184
185 if(dir<0) {
186 if(csc->start<csc->index) {
187 U16_PREV((const UChar *)csc->p, csc->start, csc->index, c);
188 return c;
189 }
190 } else {
191 if(csc->index<csc->limit) {
192 U16_NEXT((const UChar *)csc->p, csc->index, csc->limit, c);
193 return c;
194 }
195 }
196 return U_SENTINEL;
197 }
198
199 /*
200 * Case-maps [srcStart..srcLimit[ but takes
201 * context [0..srcLength[ into account.
202 */
203 static int32_t
204 _caseMap(int32_t caseLocale, uint32_t options, UCaseMapFull *map,
205 UChar *dest, int32_t destCapacity,
206 const UChar *src, UCaseContext *csc,
207 int32_t srcStart, int32_t srcLimit,
208 icu::Edits *edits,
209 UErrorCode &errorCode) {
210 /* case mapping loop */
211 int32_t srcIndex=srcStart;
212 int32_t destIndex=0;
213 while(srcIndex<srcLimit) {
214 int32_t cpStart;
215 csc->cpStart=cpStart=srcIndex;
216 UChar32 c;
217 U16_NEXT(src, srcIndex, srcLimit, c);
218 csc->cpLimit=srcIndex;
219 const UChar *s;
220 c=map(c, utf16_caseContextIterator, csc, &s, caseLocale);
221 destIndex = appendResult(dest, destIndex, destCapacity, c, s,
222 srcIndex - cpStart, options, edits);
223 if (destIndex < 0) {
224 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
225 return 0;
226 }
227 }
228
229 return destIndex;
230 }
231
232 #if !UCONFIG_NO_BREAK_ITERATION
233
234 U_CFUNC int32_t U_CALLCONV
235 ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *iter,
236 UChar *dest, int32_t destCapacity,
237 const UChar *src, int32_t srcLength,
238 icu::Edits *edits,
239 UErrorCode &errorCode) {
240 if(U_FAILURE(errorCode)) {
241 return 0;
242 }
243
244 /* set up local variables */
245 UCaseContext csc=UCASECONTEXT_INITIALIZER;
246 csc.p=(void *)src;
247 csc.limit=srcLength;
248 int32_t destIndex=0;
249 int32_t prev=0;
250 UBool isFirstIndex=TRUE;
251
252 /* titlecasing loop */
253 while(prev<srcLength) {
254 /* find next index where to titlecase */
255 int32_t index;
256 if(isFirstIndex) {
257 isFirstIndex=FALSE;
258 index=iter->first();
259 } else {
260 index=iter->next();
261 }
262 if(index==UBRK_DONE || index>srcLength) {
263 index=srcLength;
264 }
265
266 /*
267 * Unicode 4 & 5 section 3.13 Default Case Operations:
268 *
269 * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
270 * #29, "Text Boundaries." Between each pair of word boundaries, find the first
271 * cased character F. If F exists, map F to default_title(F); then map each
272 * subsequent character C to default_lower(C).
273 *
274 * In this implementation, segment [prev..index[ into 3 parts:
275 * a) uncased characters (copy as-is) [prev..titleStart[
276 * b) first case letter (titlecase) [titleStart..titleLimit[
277 * c) subsequent characters (lowercase) [titleLimit..index[
278 */
279 if(prev<index) {
280 /* find and copy uncased characters [prev..titleStart[ */
281 int32_t titleStart=prev;
282 int32_t titleLimit=prev;
283 UChar32 c;
284 U16_NEXT(src, titleLimit, index, c);
285 if((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(c)) {
286 /* Adjust the titlecasing index (titleStart) to the next cased character. */
287 for(;;) {
288 titleStart=titleLimit;
289 if(titleLimit==index) {
290 /*
291 * only uncased characters in [prev..index[
292 * stop with titleStart==titleLimit==index
293 */
294 break;
295 }
296 U16_NEXT(src, titleLimit, index, c);
297 if(UCASE_NONE!=ucase_getType(c)) {
298 break; /* cased letter at [titleStart..titleLimit[ */
299 }
300 }
301 destIndex=appendUnchanged(dest, destIndex, destCapacity,
302 src+prev, titleStart-prev, options, edits);
303 if(destIndex<0) {
304 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
305 return 0;
306 }
307 }
308
309 if(titleStart<titleLimit) {
310 /* titlecase c which is from [titleStart..titleLimit[ */
311 csc.cpStart=titleStart;
312 csc.cpLimit=titleLimit;
313 const UChar *s;
314 c=ucase_toFullTitle(c, utf16_caseContextIterator, &csc, &s, caseLocale);
315 destIndex=appendResult(dest, destIndex, destCapacity, c, s,
316 titleLimit-titleStart, options, edits);
317 if(destIndex<0) {
318 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
319 return 0;
320 }
321
322 /* Special case Dutch IJ titlecasing */
323 if (titleStart+1 < index &&
324 caseLocale == UCASE_LOC_DUTCH &&
325 (src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) {
326 if (src[titleStart+1] == 0x006A) {
327 destIndex=appendUChar(dest, destIndex, destCapacity, 0x004A);
328 if(destIndex<0) {
329 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
330 return 0;
331 }
332 if(edits!=NULL) {
333 edits->addReplace(1, 1);
334 }
335 titleLimit++;
336 } else if (src[titleStart+1] == 0x004A) {
337 // Keep the capital J from getting lowercased.
338 destIndex=appendUnchanged(dest, destIndex, destCapacity,
339 src+titleStart+1, 1, options, edits);
340 if(destIndex<0) {
341 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
342 return 0;
343 }
344 titleLimit++;
345 }
346 }
347
348 /* lowercase [titleLimit..index[ */
349 if(titleLimit<index) {
350 if((options&U_TITLECASE_NO_LOWERCASE)==0) {
351 /* Normal operation: Lowercase the rest of the word. */
352 destIndex+=
353 _caseMap(
354 caseLocale, options, ucase_toFullLower,
355 dest+destIndex, destCapacity-destIndex,
356 src, &csc,
357 titleLimit, index,
358 edits, errorCode);
359 if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
360 errorCode=U_ZERO_ERROR;
361 }
362 if(U_FAILURE(errorCode)) {
363 return destIndex;
364 }
365 } else {
366 /* Optionally just copy the rest of the word unchanged. */
367 destIndex=appendUnchanged(dest, destIndex, destCapacity,
368 src+titleLimit, index-titleLimit, options, edits);
369 if(destIndex<0) {
370 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
371 return 0;
372 }
373 }
374 }
375 }
376 }
377
378 prev=index;
379 }
380
381 return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
382 }
383
384 #endif // !UCONFIG_NO_BREAK_ITERATION
385
386 U_NAMESPACE_BEGIN
387 namespace GreekUpper {
388
389 // Data generated by prototype code, see
390 // http://site.icu-project.org/design/case/greek-upper
391 // TODO: Move this data into ucase.icu.
// Greek uppercasing data for U+0370..U+03FF, one entry per code point;
// getLetterData() reads entry [c - 0x370].
// An entry of 0 means "no Greek-specific data": toUpper() then falls back to
// ucase_toFullUpper(). Any other entry is an uppercase base letter (extracted
// in toUpper() with UPPER_MASK) combined with HAS_... property flags.
// NOTE(review): UPPER_MASK and the HAS_... flags are not defined in this part of
// the file (presumably in ucasemap_imp.h) -- confirm their bit layout there.
392 static const uint16_t data0370[] = {
393 // U+0370..03FF
394 0x0370,
395 0x0370,
396 0x0372,
397 0x0372,
398 0,
399 0,
400 0x0376,
401 0x0376,
402 0,
403 0,
404 0x037A,
405 0x03FD,
406 0x03FE,
407 0x03FF,
408 0,
409 0x037F,
410 0,
411 0,
412 0,
413 0,
414 0,
415 0,
416 0x0391 | HAS_VOWEL | HAS_ACCENT,
417 0,
418 0x0395 | HAS_VOWEL | HAS_ACCENT,
419 0x0397 | HAS_VOWEL | HAS_ACCENT,
420 0x0399 | HAS_VOWEL | HAS_ACCENT,
421 0,
422 0x039F | HAS_VOWEL | HAS_ACCENT,
423 0,
424 0x03A5 | HAS_VOWEL | HAS_ACCENT,
425 0x03A9 | HAS_VOWEL | HAS_ACCENT,
426 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
427 0x0391 | HAS_VOWEL,
428 0x0392,
429 0x0393,
430 0x0394,
431 0x0395 | HAS_VOWEL,
432 0x0396,
433 0x0397 | HAS_VOWEL,
434 0x0398,
435 0x0399 | HAS_VOWEL,
436 0x039A,
437 0x039B,
438 0x039C,
439 0x039D,
440 0x039E,
441 0x039F | HAS_VOWEL,
442 0x03A0,
443 0x03A1,
444 0,
445 0x03A3,
446 0x03A4,
447 0x03A5 | HAS_VOWEL,
448 0x03A6,
449 0x03A7,
450 0x03A8,
451 0x03A9 | HAS_VOWEL,
452 0x0399 | HAS_VOWEL | HAS_DIALYTIKA,
453 0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,
454 0x0391 | HAS_VOWEL | HAS_ACCENT,
455 0x0395 | HAS_VOWEL | HAS_ACCENT,
456 0x0397 | HAS_VOWEL | HAS_ACCENT,
457 0x0399 | HAS_VOWEL | HAS_ACCENT,
458 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
459 0x0391 | HAS_VOWEL,
460 0x0392,
461 0x0393,
462 0x0394,
463 0x0395 | HAS_VOWEL,
464 0x0396,
465 0x0397 | HAS_VOWEL,
466 0x0398,
467 0x0399 | HAS_VOWEL,
468 0x039A,
469 0x039B,
470 0x039C,
471 0x039D,
472 0x039E,
473 0x039F | HAS_VOWEL,
474 0x03A0,
475 0x03A1,
476 0x03A3,
477 0x03A3,
478 0x03A4,
479 0x03A5 | HAS_VOWEL,
480 0x03A6,
481 0x03A7,
482 0x03A8,
483 0x03A9 | HAS_VOWEL,
484 0x0399 | HAS_VOWEL | HAS_DIALYTIKA,
485 0x03A5 | HAS_VOWEL | HAS_DIALYTIKA,
486 0x039F | HAS_VOWEL | HAS_ACCENT,
487 0x03A5 | HAS_VOWEL | HAS_ACCENT,
488 0x03A9 | HAS_VOWEL | HAS_ACCENT,
489 0x03CF,
490 0x0392,
491 0x0398,
492 0x03D2,
493 0x03D2 | HAS_ACCENT,
494 0x03D2 | HAS_DIALYTIKA,
495 0x03A6,
496 0x03A0,
497 0x03CF,
498 0x03D8,
499 0x03D8,
500 0x03DA,
501 0x03DA,
502 0x03DC,
503 0x03DC,
504 0x03DE,
505 0x03DE,
506 0x03E0,
507 0x03E0,
508 0,
509 0,
510 0,
511 0,
512 0,
513 0,
514 0,
515 0,
516 0,
517 0,
518 0,
519 0,
520 0,
521 0,
522 0x039A,
523 0x03A1,
524 0x03F9,
525 0x037F,
526 0x03F4,
527 0x0395 | HAS_VOWEL,
528 0,
529 0x03F7,
530 0x03F7,
531 0x03F9,
532 0x03FA,
533 0x03FA,
534 0x03FC,
535 0x03FD,
536 0x03FE,
537 0x03FF,
538 };
539
// Greek uppercasing data for U+1F00..U+1FFF, one entry per code point;
// getLetterData() reads entry [c - 0x1f00].
// An entry of 0 means "no Greek-specific data": toUpper() then falls back to
// ucase_toFullUpper(). Any other entry is an uppercase base letter (extracted
// in toUpper() with UPPER_MASK) combined with HAS_... property flags.
// NOTE(review): UPPER_MASK and the HAS_... flags are not defined in this part of
// the file (presumably in ucasemap_imp.h) -- confirm their bit layout there.
540 static const uint16_t data1F00[] = {
541 // U+1F00..1FFF
542 0x0391 | HAS_VOWEL,
543 0x0391 | HAS_VOWEL,
544 0x0391 | HAS_VOWEL | HAS_ACCENT,
545 0x0391 | HAS_VOWEL | HAS_ACCENT,
546 0x0391 | HAS_VOWEL | HAS_ACCENT,
547 0x0391 | HAS_VOWEL | HAS_ACCENT,
548 0x0391 | HAS_VOWEL | HAS_ACCENT,
549 0x0391 | HAS_VOWEL | HAS_ACCENT,
550 0x0391 | HAS_VOWEL,
551 0x0391 | HAS_VOWEL,
552 0x0391 | HAS_VOWEL | HAS_ACCENT,
553 0x0391 | HAS_VOWEL | HAS_ACCENT,
554 0x0391 | HAS_VOWEL | HAS_ACCENT,
555 0x0391 | HAS_VOWEL | HAS_ACCENT,
556 0x0391 | HAS_VOWEL | HAS_ACCENT,
557 0x0391 | HAS_VOWEL | HAS_ACCENT,
558 0x0395 | HAS_VOWEL,
559 0x0395 | HAS_VOWEL,
560 0x0395 | HAS_VOWEL | HAS_ACCENT,
561 0x0395 | HAS_VOWEL | HAS_ACCENT,
562 0x0395 | HAS_VOWEL | HAS_ACCENT,
563 0x0395 | HAS_VOWEL | HAS_ACCENT,
564 0,
565 0,
566 0x0395 | HAS_VOWEL,
567 0x0395 | HAS_VOWEL,
568 0x0395 | HAS_VOWEL | HAS_ACCENT,
569 0x0395 | HAS_VOWEL | HAS_ACCENT,
570 0x0395 | HAS_VOWEL | HAS_ACCENT,
571 0x0395 | HAS_VOWEL | HAS_ACCENT,
572 0,
573 0,
574 0x0397 | HAS_VOWEL,
575 0x0397 | HAS_VOWEL,
576 0x0397 | HAS_VOWEL | HAS_ACCENT,
577 0x0397 | HAS_VOWEL | HAS_ACCENT,
578 0x0397 | HAS_VOWEL | HAS_ACCENT,
579 0x0397 | HAS_VOWEL | HAS_ACCENT,
580 0x0397 | HAS_VOWEL | HAS_ACCENT,
581 0x0397 | HAS_VOWEL | HAS_ACCENT,
582 0x0397 | HAS_VOWEL,
583 0x0397 | HAS_VOWEL,
584 0x0397 | HAS_VOWEL | HAS_ACCENT,
585 0x0397 | HAS_VOWEL | HAS_ACCENT,
586 0x0397 | HAS_VOWEL | HAS_ACCENT,
587 0x0397 | HAS_VOWEL | HAS_ACCENT,
588 0x0397 | HAS_VOWEL | HAS_ACCENT,
589 0x0397 | HAS_VOWEL | HAS_ACCENT,
590 0x0399 | HAS_VOWEL,
591 0x0399 | HAS_VOWEL,
592 0x0399 | HAS_VOWEL | HAS_ACCENT,
593 0x0399 | HAS_VOWEL | HAS_ACCENT,
594 0x0399 | HAS_VOWEL | HAS_ACCENT,
595 0x0399 | HAS_VOWEL | HAS_ACCENT,
596 0x0399 | HAS_VOWEL | HAS_ACCENT,
597 0x0399 | HAS_VOWEL | HAS_ACCENT,
598 0x0399 | HAS_VOWEL,
599 0x0399 | HAS_VOWEL,
600 0x0399 | HAS_VOWEL | HAS_ACCENT,
601 0x0399 | HAS_VOWEL | HAS_ACCENT,
602 0x0399 | HAS_VOWEL | HAS_ACCENT,
603 0x0399 | HAS_VOWEL | HAS_ACCENT,
604 0x0399 | HAS_VOWEL | HAS_ACCENT,
605 0x0399 | HAS_VOWEL | HAS_ACCENT,
606 0x039F | HAS_VOWEL,
607 0x039F | HAS_VOWEL,
608 0x039F | HAS_VOWEL | HAS_ACCENT,
609 0x039F | HAS_VOWEL | HAS_ACCENT,
610 0x039F | HAS_VOWEL | HAS_ACCENT,
611 0x039F | HAS_VOWEL | HAS_ACCENT,
612 0,
613 0,
614 0x039F | HAS_VOWEL,
615 0x039F | HAS_VOWEL,
616 0x039F | HAS_VOWEL | HAS_ACCENT,
617 0x039F | HAS_VOWEL | HAS_ACCENT,
618 0x039F | HAS_VOWEL | HAS_ACCENT,
619 0x039F | HAS_VOWEL | HAS_ACCENT,
620 0,
621 0,
622 0x03A5 | HAS_VOWEL,
623 0x03A5 | HAS_VOWEL,
624 0x03A5 | HAS_VOWEL | HAS_ACCENT,
625 0x03A5 | HAS_VOWEL | HAS_ACCENT,
626 0x03A5 | HAS_VOWEL | HAS_ACCENT,
627 0x03A5 | HAS_VOWEL | HAS_ACCENT,
628 0x03A5 | HAS_VOWEL | HAS_ACCENT,
629 0x03A5 | HAS_VOWEL | HAS_ACCENT,
630 0,
631 0x03A5 | HAS_VOWEL,
632 0,
633 0x03A5 | HAS_VOWEL | HAS_ACCENT,
634 0,
635 0x03A5 | HAS_VOWEL | HAS_ACCENT,
636 0,
637 0x03A5 | HAS_VOWEL | HAS_ACCENT,
638 0x03A9 | HAS_VOWEL,
639 0x03A9 | HAS_VOWEL,
640 0x03A9 | HAS_VOWEL | HAS_ACCENT,
641 0x03A9 | HAS_VOWEL | HAS_ACCENT,
642 0x03A9 | HAS_VOWEL | HAS_ACCENT,
643 0x03A9 | HAS_VOWEL | HAS_ACCENT,
644 0x03A9 | HAS_VOWEL | HAS_ACCENT,
645 0x03A9 | HAS_VOWEL | HAS_ACCENT,
646 0x03A9 | HAS_VOWEL,
647 0x03A9 | HAS_VOWEL,
648 0x03A9 | HAS_VOWEL | HAS_ACCENT,
649 0x03A9 | HAS_VOWEL | HAS_ACCENT,
650 0x03A9 | HAS_VOWEL | HAS_ACCENT,
651 0x03A9 | HAS_VOWEL | HAS_ACCENT,
652 0x03A9 | HAS_VOWEL | HAS_ACCENT,
653 0x03A9 | HAS_VOWEL | HAS_ACCENT,
654 0x0391 | HAS_VOWEL | HAS_ACCENT,
655 0x0391 | HAS_VOWEL | HAS_ACCENT,
656 0x0395 | HAS_VOWEL | HAS_ACCENT,
657 0x0395 | HAS_VOWEL | HAS_ACCENT,
658 0x0397 | HAS_VOWEL | HAS_ACCENT,
659 0x0397 | HAS_VOWEL | HAS_ACCENT,
660 0x0399 | HAS_VOWEL | HAS_ACCENT,
661 0x0399 | HAS_VOWEL | HAS_ACCENT,
662 0x039F | HAS_VOWEL | HAS_ACCENT,
663 0x039F | HAS_VOWEL | HAS_ACCENT,
664 0x03A5 | HAS_VOWEL | HAS_ACCENT,
665 0x03A5 | HAS_VOWEL | HAS_ACCENT,
666 0x03A9 | HAS_VOWEL | HAS_ACCENT,
667 0x03A9 | HAS_VOWEL | HAS_ACCENT,
668 0,
669 0,
670 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
671 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
672 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
673 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
674 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
675 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
676 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
677 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
678 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
679 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
680 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
681 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
682 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
683 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
684 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
685 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
686 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
687 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
688 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
689 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
690 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
691 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
692 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
693 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
694 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
695 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
696 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
697 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
698 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
699 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
700 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
701 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
702 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
703 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
704 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
705 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
706 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
707 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
708 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
709 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
710 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
711 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
712 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
713 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
714 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
715 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
716 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
717 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
718 0x0391 | HAS_VOWEL,
719 0x0391 | HAS_VOWEL,
720 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
721 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
722 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
723 0,
724 0x0391 | HAS_VOWEL | HAS_ACCENT,
725 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
726 0x0391 | HAS_VOWEL,
727 0x0391 | HAS_VOWEL,
728 0x0391 | HAS_VOWEL | HAS_ACCENT,
729 0x0391 | HAS_VOWEL | HAS_ACCENT,
730 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
731 0,
732 0x0399 | HAS_VOWEL,
733 0,
734 0,
735 0,
736 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
737 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
738 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
739 0,
740 0x0397 | HAS_VOWEL | HAS_ACCENT,
741 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
742 0x0395 | HAS_VOWEL | HAS_ACCENT,
743 0x0395 | HAS_VOWEL | HAS_ACCENT,
744 0x0397 | HAS_VOWEL | HAS_ACCENT,
745 0x0397 | HAS_VOWEL | HAS_ACCENT,
746 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
747 0,
748 0,
749 0,
750 0x0399 | HAS_VOWEL,
751 0x0399 | HAS_VOWEL,
752 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
753 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
754 0,
755 0,
756 0x0399 | HAS_VOWEL | HAS_ACCENT,
757 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
758 0x0399 | HAS_VOWEL,
759 0x0399 | HAS_VOWEL,
760 0x0399 | HAS_VOWEL | HAS_ACCENT,
761 0x0399 | HAS_VOWEL | HAS_ACCENT,
762 0,
763 0,
764 0,
765 0,
766 0x03A5 | HAS_VOWEL,
767 0x03A5 | HAS_VOWEL,
768 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
769 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
770 0x03A1,
771 0x03A1,
772 0x03A5 | HAS_VOWEL | HAS_ACCENT,
773 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA,
774 0x03A5 | HAS_VOWEL,
775 0x03A5 | HAS_VOWEL,
776 0x03A5 | HAS_VOWEL | HAS_ACCENT,
777 0x03A5 | HAS_VOWEL | HAS_ACCENT,
778 0x03A1,
779 0,
780 0,
781 0,
782 0,
783 0,
784 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
785 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
786 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
787 0,
788 0x03A9 | HAS_VOWEL | HAS_ACCENT,
789 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT,
790 0x039F | HAS_VOWEL | HAS_ACCENT,
791 0x039F | HAS_VOWEL | HAS_ACCENT,
792 0x03A9 | HAS_VOWEL | HAS_ACCENT,
793 0x03A9 | HAS_VOWEL | HAS_ACCENT,
794 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI,
795 0,
796 0,
797 0,
798 };
799
// Greek uppercasing data for the single code point U+2126, in the same
// "uppercase base letter combined with HAS_... flags" form as the table entries;
// getLetterData() returns it for c == 0x2126.
800 // U+2126 Ohm sign
801 static const uint16_t data2126 = 0x03A9 | HAS_VOWEL;
802
803 uint32_t getLetterData(UChar32 c) {
804 if (c < 0x370 || 0x2126 < c || (0x3ff < c && c < 0x1f00)) {
805 return 0;
806 } else if (c <= 0x3ff) {
807 return data0370[c - 0x370];
808 } else if (c <= 0x1fff) {
809 return data1F00[c - 0x1f00];
810 } else if (c == 0x2126) {
811 return data2126;
812 } else {
813 return 0;
814 }
815 }
816
817 uint32_t getDiacriticData(UChar32 c) {
818 switch (c) {
819 case 0x0300: // varia
820 case 0x0301: // tonos = oxia
821 case 0x0342: // perispomeni
822 case 0x0302: // circumflex can look like perispomeni
823 case 0x0303: // tilde can look like perispomeni
824 case 0x0311: // inverted breve can look like perispomeni
825 return HAS_ACCENT;
826 case 0x0308: // dialytika = diaeresis
827 return HAS_COMBINING_DIALYTIKA;
828 case 0x0344: // dialytika tonos
829 return HAS_COMBINING_DIALYTIKA | HAS_ACCENT;
830 case 0x0345: // ypogegrammeni = iota subscript
831 return HAS_YPOGEGRAMMENI;
832 case 0x0304: // macron
833 case 0x0306: // breve
834 case 0x0313: // comma above
835 case 0x0314: // reversed comma above
836 case 0x0343: // koronis
837 return HAS_OTHER_GREEK_DIACRITIC;
838 default:
839 return 0;
840 }
841 }
842
843 UBool isFollowedByCasedLetter(const UChar *s, int32_t i, int32_t length) {
844 while (i < length) {
845 UChar32 c;
846 U16_NEXT(s, i, length, c);
847 int32_t type = ucase_getTypeOrIgnorable(c);
848 if ((type & UCASE_IGNORABLE) != 0) {
849 // Case-ignorable, continue with the loop.
850 } else if (type != UCASE_NONE) {
851 return TRUE; // Followed by cased letter.
852 } else {
853 return FALSE; // Uncased and not case-ignorable.
854 }
855 }
856 return FALSE; // Not followed by cased letter.
857 }
858
859 /**
860 * Greek string uppercasing with a state machine.
861 * Probably simpler than a stateless function that has to figure out complex context-before
862 * for each character.
863 * TODO: Try to re-consolidate one way or another with the non-Greek function.
864 */
865 int32_t toUpper(uint32_t options,
866 UChar *dest, int32_t destCapacity,
867 const UChar *src, int32_t srcLength,
868 Edits *edits,
869 UErrorCode &errorCode) {
870 int32_t destIndex=0;
871 uint32_t state = 0;
872 for (int32_t i = 0; i < srcLength;) {
873 int32_t nextIndex = i;
874 UChar32 c;
875 U16_NEXT(src, nextIndex, srcLength, c);
876 uint32_t nextState = 0;
877 int32_t type = ucase_getTypeOrIgnorable(c);
878 if ((type & UCASE_IGNORABLE) != 0) {
879 // c is case-ignorable
880 nextState |= (state & AFTER_CASED);
881 } else if (type != UCASE_NONE) {
882 // c is cased
883 nextState |= AFTER_CASED;
884 }
885 uint32_t data = getLetterData(c);
886 if (data > 0) {
887 uint32_t upper = data & UPPER_MASK;
888 // Add a dialytika to this iota or ypsilon vowel
889 // if we removed a tonos from the previous vowel,
890 // and that previous vowel did not also have (or gain) a dialytika.
891 // Adding one only to the final vowel in a longer sequence
892 // (which does not occur in normal writing) would require lookahead.
893 // Set the same flag as for preserving an existing dialytika.
894 if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
895 (upper == 0x399 || upper == 0x3A5)) {
896 data |= HAS_DIALYTIKA;
897 }
898 int32_t numYpogegrammeni = 0; // Map each one to a trailing, spacing, capital iota.
899 if ((data & HAS_YPOGEGRAMMENI) != 0) {
900 numYpogegrammeni = 1;
901 }
902 // Skip combining diacritics after this Greek letter.
903 while (nextIndex < srcLength) {
904 uint32_t diacriticData = getDiacriticData(src[nextIndex]);
905 if (diacriticData != 0) {
906 data |= diacriticData;
907 if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
908 ++numYpogegrammeni;
909 }
910 ++nextIndex;
911 } else {
912 break; // not a Greek diacritic
913 }
914 }
915 if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
916 nextState |= AFTER_VOWEL_WITH_ACCENT;
917 }
918 // Map according to Greek rules.
919 UBool addTonos = FALSE;
920 if (upper == 0x397 &&
921 (data & HAS_ACCENT) != 0 &&
922 numYpogegrammeni == 0 &&
923 (state & AFTER_CASED) == 0 &&
924 !isFollowedByCasedLetter(src, nextIndex, srcLength)) {
925 // Keep disjunctive "or" with (only) a tonos.
926 // We use the same "word boundary" conditions as for the Final_Sigma test.
927 if (i == nextIndex) {
928 upper = 0x389; // Preserve the precomposed form.
929 } else {
930 addTonos = TRUE;
931 }
932 } else if ((data & HAS_DIALYTIKA) != 0) {
933 // Preserve a vowel with dialytika in precomposed form if it exists.
934 if (upper == 0x399) {
935 upper = 0x3AA;
936 data &= ~HAS_EITHER_DIALYTIKA;
937 } else if (upper == 0x3A5) {
938 upper = 0x3AB;
939 data &= ~HAS_EITHER_DIALYTIKA;
940 }
941 }
942
943 UBool change = TRUE;
944 if (edits != NULL) {
945 // Find out first whether we are changing the text.
946 change = src[i] != upper || numYpogegrammeni > 0;
947 int32_t i2 = i + 1;
948 if ((data & HAS_EITHER_DIALYTIKA) != 0) {
949 change |= i2 >= nextIndex || src[i2] != 0x308;
950 ++i2;
951 }
952 if (addTonos) {
953 change |= i2 >= nextIndex || src[i2] != 0x301;
954 ++i2;
955 }
956 int32_t oldLength = nextIndex - i;
957 int32_t newLength = (i2 - i) + numYpogegrammeni;
958 change |= oldLength != newLength;
959 if (change) {
960 if (edits != NULL) {
961 edits->addReplace(oldLength, newLength);
962 }
963 } else {
964 if (edits != NULL) {
965 edits->addUnchanged(oldLength);
966 }
967 // Write unchanged text?
968 change = (options & UCASEMAP_OMIT_UNCHANGED_TEXT) == 0;
969 }
970 }
971
972 if (change) {
973 destIndex=appendUChar(dest, destIndex, destCapacity, (UChar)upper);
974 if (destIndex >= 0 && (data & HAS_EITHER_DIALYTIKA) != 0) {
975 destIndex=appendUChar(dest, destIndex, destCapacity, 0x308); // restore or add a dialytika
976 }
977 if (destIndex >= 0 && addTonos) {
978 destIndex=appendUChar(dest, destIndex, destCapacity, 0x301);
979 }
980 while (destIndex >= 0 && numYpogegrammeni > 0) {
981 destIndex=appendUChar(dest, destIndex, destCapacity, 0x399);
982 --numYpogegrammeni;
983 }
984 if(destIndex<0) {
985 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
986 return 0;
987 }
988 }
989 } else {
990 const UChar *s;
991 c=ucase_toFullUpper(c, NULL, NULL, &s, UCASE_LOC_GREEK);
992 destIndex = appendResult(dest, destIndex, destCapacity, c, s,
993 nextIndex - i, options, edits);
994 if (destIndex < 0) {
995 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
996 return 0;
997 }
998 }
999 i = nextIndex;
1000 state = nextState;
1001 }
1002
1003 return destIndex;
1004 }
1005
1006 } // namespace GreekUpper
1007 U_NAMESPACE_END
1008
1009 /* functions available in the common library (for unistr_case.cpp) */
1010
1011 U_CFUNC int32_t U_CALLCONV
1012 ustrcase_internalToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
1013 UChar *dest, int32_t destCapacity,
1014 const UChar *src, int32_t srcLength,
1015 icu::Edits *edits,
1016 UErrorCode &errorCode) {
1017 UCaseContext csc=UCASECONTEXT_INITIALIZER;
1018 csc.p=(void *)src;
1019 csc.limit=srcLength;
1020 int32_t destIndex = _caseMap(
1021 caseLocale, options, ucase_toFullLower,
1022 dest, destCapacity,
1023 src, &csc, 0, srcLength,
1024 edits, errorCode);
1025 return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
1026 }
1027
1028 U_CFUNC int32_t U_CALLCONV
1029 ustrcase_internalToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
1030 UChar *dest, int32_t destCapacity,
1031 const UChar *src, int32_t srcLength,
1032 icu::Edits *edits,
1033 UErrorCode &errorCode) {
1034 int32_t destIndex;
1035 if (caseLocale == UCASE_LOC_GREEK) {
1036 destIndex = GreekUpper::toUpper(options, dest, destCapacity,
1037 src, srcLength, edits, errorCode);
1038 } else {
1039 UCaseContext csc=UCASECONTEXT_INITIALIZER;
1040 csc.p=(void *)src;
1041 csc.limit=srcLength;
1042 destIndex = _caseMap(
1043 caseLocale, options, ucase_toFullUpper,
1044 dest, destCapacity,
1045 src, &csc, 0, srcLength,
1046 edits, errorCode);
1047 }
1048 return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
1049 }
1050
1051 U_CFUNC int32_t U_CALLCONV
1052 ustrcase_internalFold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
1053 UChar *dest, int32_t destCapacity,
1054 const UChar *src, int32_t srcLength,
1055 icu::Edits *edits,
1056 UErrorCode &errorCode) {
1057 /* case mapping loop */
1058 int32_t srcIndex = 0;
1059 int32_t destIndex = 0;
1060 while (srcIndex < srcLength) {
1061 int32_t cpStart = srcIndex;
1062 UChar32 c;
1063 U16_NEXT(src, srcIndex, srcLength, c);
1064 const UChar *s;
1065 c = ucase_toFullFolding(c, &s, options);
1066 destIndex = appendResult(dest, destIndex, destCapacity, c, s,
1067 srcIndex - cpStart, options, edits);
1068 if (destIndex < 0) {
1069 errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
1070 return 0;
1071 }
1072 }
1073
1074 return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
1075 }
1076
1077 U_CFUNC int32_t
1078 ustrcase_map(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
1079 UChar *dest, int32_t destCapacity,
1080 const UChar *src, int32_t srcLength,
1081 UStringCaseMapper *stringCaseMapper,
1082 icu::Edits *edits,
1083 UErrorCode &errorCode) {
1084 int32_t destLength;
1085
1086 /* check argument values */
1087 if(U_FAILURE(errorCode)) {
1088 return 0;
1089 }
1090 if( destCapacity<0 ||
1091 (dest==NULL && destCapacity>0) ||
1092 src==NULL ||
1093 srcLength<-1
1094 ) {
1095 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
1096 return 0;
1097 }
1098
1099 /* get the string length */
1100 if(srcLength==-1) {
1101 srcLength=u_strlen(src);
1102 }
1103
1104 /* check for overlapping source and destination */
1105 if( dest!=NULL &&
1106 ((src>=dest && src<(dest+destCapacity)) ||
1107 (dest>=src && dest<(src+srcLength)))
1108 ) {
1109 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
1110 return 0;
1111 }
1112
1113 if(edits!=NULL) {
1114 edits->reset();
1115 }
1116 destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
1117 dest, destCapacity, src, srcLength, edits, errorCode);
1118 return u_terminateUChars(dest, destCapacity, destLength, &errorCode);
1119 }
1120
1121 U_CFUNC int32_t
1122 ustrcase_mapWithOverlap(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
1123 UChar *dest, int32_t destCapacity,
1124 const UChar *src, int32_t srcLength,
1125 UStringCaseMapper *stringCaseMapper,
1126 UErrorCode &errorCode) {
1127 UChar buffer[300];
1128 UChar *temp;
1129
1130 int32_t destLength;
1131
1132 /* check argument values */
1133 if(U_FAILURE(errorCode)) {
1134 return 0;
1135 }
1136 if( destCapacity<0 ||
1137 (dest==NULL && destCapacity>0) ||
1138 src==NULL ||
1139 srcLength<-1
1140 ) {
1141 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
1142 return 0;
1143 }
1144
1145 /* get the string length */
1146 if(srcLength==-1) {
1147 srcLength=u_strlen(src);
1148 }
1149
1150 /* check for overlapping source and destination */
1151 if( dest!=NULL &&
1152 ((src>=dest && src<(dest+destCapacity)) ||
1153 (dest>=src && dest<(src+srcLength)))
1154 ) {
1155 /* overlap: provide a temporary destination buffer and later copy the result */
1156 if(destCapacity<=UPRV_LENGTHOF(buffer)) {
1157 /* the stack buffer is large enough */
1158 temp=buffer;
1159 } else {
1160 /* allocate a buffer */
1161 temp=(UChar *)uprv_malloc(destCapacity*U_SIZEOF_UCHAR);
1162 if(temp==NULL) {
1163 errorCode=U_MEMORY_ALLOCATION_ERROR;
1164 return 0;
1165 }
1166 }
1167 } else {
1168 temp=dest;
1169 }
1170
1171 destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
1172 temp, destCapacity, src, srcLength, NULL, errorCode);
1173 if(temp!=dest) {
1174 /* copy the result string to the destination buffer */
1175 if (U_SUCCESS(errorCode) && 0 < destLength && destLength <= destCapacity) {
1176 u_memmove(dest, temp, destLength);
1177 }
1178 if(temp!=buffer) {
1179 uprv_free(temp);
1180 }
1181 }
1182
1183 return u_terminateUChars(dest, destCapacity, destLength, &errorCode);
1184 }
1185
1186 /* public API functions */
1187
1188 U_CAPI int32_t U_EXPORT2
1189 u_strFoldCase(UChar *dest, int32_t destCapacity,
1190 const UChar *src, int32_t srcLength,
1191 uint32_t options,
1192 UErrorCode *pErrorCode) {
1193 return ustrcase_mapWithOverlap(
1194 UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
1195 dest, destCapacity,
1196 src, srcLength,
1197 ustrcase_internalFold, *pErrorCode);
1198 }
1199
1200 U_NAMESPACE_BEGIN
1201
1202 int32_t CaseMap::fold(
1203 uint32_t options,
1204 const UChar *src, int32_t srcLength,
1205 UChar *dest, int32_t destCapacity, Edits *edits,
1206 UErrorCode &errorCode) {
1207 return ustrcase_map(
1208 UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
1209 dest, destCapacity,
1210 src, srcLength,
1211 ustrcase_internalFold, edits, errorCode);
1212 }
1213
1214 U_NAMESPACE_END
1215
1216 /* case-insensitive string comparisons -------------------------------------- */
1217
1218 /*
1219 * This function is a copy of unorm_cmpEquivFold() minus the parts for
1220 * canonical equivalence.
1221 * Keep the functions in sync, and see there for how this works.
1222 * The duplication is for modularization:
1223 * It makes caseless (but not canonical caseless) matches independent of
1224 * the normalization code.
1225 */
1226
1227 /* stack element for previous-level source/decomposition pointers */
1228 struct CmpEquivLevel {
1229 const UChar *start, *s, *limit;
1230 };
1231 typedef struct CmpEquivLevel CmpEquivLevel;
1232
1233 /**
1234 * Internal implementation code comparing string with case fold.
1235 * This function is called from u_strcmpFold() and u_caseInsensitivePrefixMatch().
 *
 * Both strings are read one code unit at a time. When two code units differ,
 * the code point each one belongs to is passed to ucase_toFullFolding(); if a
 * folding exists, it is copied into a local buffer (fold1/fold2), the current
 * string pointers are pushed onto a one-element stack, and reading continues
 * from the buffer until it is exhausted, after which the saved pointers are
 * restored. If neither side can be folded any further, the difference of the
 * two code units is the result.
1236 *
1237 * @param s1 input string 1
1238 * @param length1 length of string 1, or -1 (NULL terminated)
1239 * @param s2 input string 2
1240 * @param length2 length of string 2, or -1 (NULL terminated)
1241 * @param options compare options
 *                (passed on to ucase_toFullFolding(); _STRNCMP_STYLE makes a
 *                NUL code unit end a string even when a length was given;
 *                U_COMPARE_CODE_POINT_ORDER enables the surrogate fix-up
 *                before the final subtraction)
1242 * @param matchLen1 (output) length of partial prefix match in s1
 *                  (may be NULL; if it is not NULL then matchLen2 is asserted
 *                  to be non-NULL as well)
1243 * @param matchLen2 (output) length of partial prefix match in s2
1244 * @param pErrorCode receives error status
 *                   (this function only reads it: if it already holds a
 *                   failure code, 0 is returned immediately and
 *                   matchLen1/matchLen2 are left untouched)
1245 * @return The result of comparison
 *         (0 if both strings end at the same time, -1 if string 1 ends first,
 *         1 if string 2 ends first, otherwise c1-c2 for the first pair of
 *         code units that differ after folding)
1246 */
1247 static int32_t _cmpFold(
1248 const UChar *s1, int32_t length1,
1249 const UChar *s2, int32_t length2,
1250 uint32_t options,
1251 int32_t *matchLen1, int32_t *matchLen2,
1252 UErrorCode *pErrorCode) {
1253 int32_t cmpRes = 0;
1254
1255 /* current-level start/limit - s1/s2 as current */
1256 const UChar *start1, *start2, *limit1, *limit2;
1257
1258 /* points to the original start address */
1259 const UChar *org1, *org2;
1260
1261 /* points to the end of match + 1 */
1262 const UChar *m1, *m2;
1263
1264 /* case folding variables */
1265 const UChar *p;
1266 int32_t length;
1267
1268 /* stacks of previous-level start/current/limit */
1269 CmpEquivLevel stack1[2], stack2[2];
1270
1271 /* case folding buffers, only use current-level start/limit */
1272 UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];
1273
1274 /* track which is the current level per string */
1275 int32_t level1, level2;
1276
1277 /* current code units, and code points for lookups */
1278 UChar32 c1, c2, cp1, cp2;
1279
1280 /* no argument error checking because this itself is not an API */
1281
1282 /*
1283 * assume that at least the option U_COMPARE_IGNORE_CASE is set
1284 * otherwise this function would have to behave exactly as uprv_strCompare()
1285 */
1286 if(U_FAILURE(*pErrorCode)) {
1287 return 0;
1288 }
1289
1290 /* initialize */
1291 if(matchLen1) {
1292 U_ASSERT(matchLen2 !=NULL);
1293 *matchLen1=0;
1294 *matchLen2=0;
1295 }
1296
/* a NULL limit marks a NUL-terminated string whose end is found while reading */
1297 start1=m1=org1=s1;
1298 if(length1==-1) {
1299 limit1=NULL;
1300 } else {
1301 limit1=s1+length1;
1302 }
1303
1304 start2=m2=org2=s2;
1305 if(length2==-1) {
1306 limit2=NULL;
1307 } else {
1308 limit2=s2+length2;
1309 }
1310
1311 level1=level2=0;
1312 c1=c2=-1;
1313
1314 /* comparison loop */
1315 for(;;) {
1316 /*
1317 * here a code unit value of -1 means "get another code unit"
1318 * below it will mean "this source is finished"
1319 */
1320
1321 if(c1<0) {
1322 /* get next code unit from string 1, post-increment */
1323 for(;;) {
/* end of the current level: pointer reached the limit, or a NUL was read while NUL terminates (no limit, or strncmp style) */
1324 if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) {
1325 if(level1==0) {
1326 c1=-1;
1327 break;
1328 }
1329 } else {
1330 ++s1;
1331 break;
1332 }
1333
1334 /* reached end of level buffer, pop one level */
1335 do {
1336 --level1;
1337 start1=stack1[level1].start; /*Not uninitialized*/
1338 } while(start1==NULL);
1339 s1=stack1[level1].s; /*Not uninitialized*/
1340 limit1=stack1[level1].limit; /*Not uninitialized*/
1341 }
1342 }
1343
1344 if(c2<0) {
1345 /* get next code unit from string 2, post-increment */
1346 for(;;) {
/* same end-of-level test as for string 1 */
1347 if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) {
1348 if(level2==0) {
1349 c2=-1;
1350 break;
1351 }
1352 } else {
1353 ++s2;
1354 break;
1355 }
1356
1357 /* reached end of level buffer, pop one level */
1358 do {
1359 --level2;
1360 start2=stack2[level2].start; /*Not uninitialized*/
1361 } while(start2==NULL);
1362 s2=stack2[level2].s; /*Not uninitialized*/
1363 limit2=stack2[level2].limit; /*Not uninitialized*/
1364 }
1365 }
1366
1367 /*
1368 * compare c1 and c2
1369 * either variable c1, c2 is -1 only if the corresponding string is finished
1370 */
1371 if(c1==c2) {
1372 const UChar *next1, *next2;
1373
1374 if(c1<0) {
1375 cmpRes=0; /* c1==c2==-1 indicating end of strings */
1376 break;
1377 }
1378
1379 /*
1380 * Note: Move the match positions in both strings at the same time
1381 * only when corresponding code point(s) in the original strings
1382 * are fully consumed. For example, when comparing s1="Fust" and
1383 * s2="Fu\u00dfball", s2[2] is folded into "ss", and s1[2] matches
1384 * the first code point in the case-folded data. But the second "s"
1385 * has no matching code point in s1, so this implementation returns
1386 * 2 as the prefix match length ("Fu").
1387 */
1388 next1=next2=NULL;
1389 if(level1==0) {
1390 next1=s1;
1391 } else if(s1==limit1) {
1392 /* Note: This implementation only uses a single level of stack.
1393 * If this code needs to be changed to use multiple levels
1394 * of stacks, the code above should check if the current
1395 * code is at the end of all stacks.
1396 */
1397 U_ASSERT(level1==1);
1398
1399 /* is s1 at the end of the current stack? */
1400 next1=stack1[0].s;
1401 }
1402
1403 if (next1!=NULL) {
1404 if(level2==0) {
1405 next2=s2;
1406 } else if(s2==limit2) {
1407 U_ASSERT(level2==1);
1408
1409 /* is s2 at the end of the current stack? */
1410 next2=stack2[0].s;
1411 }
/* m1/m2 advance only together, when both sides sit on an original-string boundary */
1412 if(next2!=NULL) {
1413 m1=next1;
1414 m2=next2;
1415 }
1416 }
1417 c1=c2=-1; /* make us fetch new code units */
1418 continue;
1419 } else if(c1<0) {
1420 cmpRes=-1; /* string 1 ends before string 2 */
1421 break;
1422 } else if(c2<0) {
1423 cmpRes=1; /* string 2 ends before string 1 */
1424 break;
1425 }
1426 /* c1!=c2 && c1>=0 && c2>=0 */
1427
1428 /* get complete code points for c1, c2 for lookups if either is a surrogate */
1429 cp1=c1;
1430 if(U_IS_SURROGATE(c1)) {
1431 UChar c;
1432
1433 if(U_IS_SURROGATE_LEAD(c1)) {
1434 if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) {
1435 /* advance ++s1; only below if cp1 decomposes/case-folds */
1436 cp1=U16_GET_SUPPLEMENTARY(c1, c);
1437 }
1438 } else /* isTrail(c1) */ {
1439 if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) {
1440 cp1=U16_GET_SUPPLEMENTARY(c, c1);
1441 }
1442 }
1443 }
1444
1445 cp2=c2;
1446 if(U_IS_SURROGATE(c2)) {
1447 UChar c;
1448
1449 if(U_IS_SURROGATE_LEAD(c2)) {
1450 if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) {
1451 /* advance ++s2; only below if cp2 decomposes/case-folds */
1452 cp2=U16_GET_SUPPLEMENTARY(c2, c);
1453 }
1454 } else /* isTrail(c2) */ {
1455 if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) {
1456 cp2=U16_GET_SUPPLEMENTARY(c, c2);
1457 }
1458 }
1459 }
1460
1461 /*
1462 * go down one level for each string
1463 * continue with the main loop as soon as there is a real change
1464 */
1465
1466 if( level1==0 &&
1467 (length=ucase_toFullFolding((UChar32)cp1, &p, options))>=0
1468 ) {
1469 /* cp1 case-folds to the code point "length" or to p[length] */
1470 if(U_IS_SURROGATE(c1)) {
1471 if(U_IS_SURROGATE_LEAD(c1)) {
1472 /* advance beyond source surrogate pair if it case-folds */
1473 ++s1;
1474 } else /* isTrail(c1) */ {
1475 /*
1476 * we got a supplementary code point when hitting its trail surrogate,
1477 * therefore the lead surrogate must have been the same as in the other string;
1478 * compare this decomposition with the lead surrogate in the other string
1479 * remember that this simulates bulk text replacement:
1480 * the decomposition would replace the entire code point
1481 */
1482 --s2;
1483 --m2;
1484 c2=*(s2-1);
1485 }
1486 }
1487
1488 /* push current level pointers */
1489 stack1[0].start=start1;
1490 stack1[0].s=s1;
1491 stack1[0].limit=limit1;
1492 ++level1;
1493
1494 /* copy the folding result to fold1[] */
/* a value up to UCASE_MAX_STRING_LENGTH is the length of the string at p; a larger value is itself the folded code point */
1495 if(length<=UCASE_MAX_STRING_LENGTH) {
1496 u_memcpy(fold1, p, length);
1497 } else {
1498 int32_t i=0;
1499 U16_APPEND_UNSAFE(fold1, i, length);
1500 length=i;
1501 }
1502
1503 /* set next level pointers to case folding */
1504 start1=s1=fold1;
1505 limit1=fold1+length;
1506
1507 /* get ready to read from decomposition, continue with loop */
1508 c1=-1;
1509 continue;
1510 }
1511
1512 if( level2==0 &&
1513 (length=ucase_toFullFolding((UChar32)cp2, &p, options))>=0
1514 ) {
1515 /* cp2 case-folds to the code point "length" or to p[length] */
1516 if(U_IS_SURROGATE(c2)) {
1517 if(U_IS_SURROGATE_LEAD(c2)) {
1518 /* advance beyond source surrogate pair if it case-folds */
1519 ++s2;
1520 } else /* isTrail(c2) */ {
1521 /*
1522 * we got a supplementary code point when hitting its trail surrogate,
1523 * therefore the lead surrogate must have been the same as in the other string;
1524 * compare this decomposition with the lead surrogate in the other string
1525 * remember that this simulates bulk text replacement:
1526 * the decomposition would replace the entire code point
1527 */
1528 --s1;
1529 --m2; /* NOTE(review): same adjustment as in the string-1 branch above although s1 is the pointer backed up here; confirm whether m1 was intended */
1530 c1=*(s1-1);
1531 }
1532 }
1533
1534 /* push current level pointers */
1535 stack2[0].start=start2;
1536 stack2[0].s=s2;
1537 stack2[0].limit=limit2;
1538 ++level2;
1539
1540 /* copy the folding result to fold2[] */
/* same string-versus-code-point distinction as for fold1[] */
1541 if(length<=UCASE_MAX_STRING_LENGTH) {
1542 u_memcpy(fold2, p, length);
1543 } else {
1544 int32_t i=0;
1545 U16_APPEND_UNSAFE(fold2, i, length);
1546 length=i;
1547 }
1548
1549 /* set next level pointers to case folding */
1550 start2=s2=fold2;
1551 limit2=fold2+length;
1552
1553 /* get ready to read from decomposition, continue with loop */
1554 c2=-1;
1555 continue;
1556 }
1557
1558 /*
1559 * no decomposition/case folding, max level for both sides:
1560 * return difference result
1561 *
1562 * code point order comparison must not just return cp1-cp2
1563 * because when single surrogates are present then the surrogate pairs
1564 * that formed cp1 and cp2 may be from different string indexes
1565 *
1566 * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
1567 * c1=d800 cp1=10001 c2=dc00 cp2=10000
1568 * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
1569 *
1570 * therefore, use same fix-up as in ustring.c/uprv_strCompare()
1571 * except: uprv_strCompare() fetches c=*s while this function fetches c=*s++
1572 * so we have slightly different pointer/start/limit comparisons here
1573 */
1574
1575 if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) {
1576 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
1577 if(
1578 (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) ||
1579 (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2)))
1580 ) {
1581 /* part of a surrogate pair, leave >=d800 */
1582 } else {
1583 /* BMP code point - may be surrogate code point - make <d800 */
1584 c1-=0x2800;
1585 }
1586
1587 if(
1588 (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) ||
1589 (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2)))
1590 ) {
1591 /* part of a surrogate pair, leave >=d800 */
1592 } else {
1593 /* BMP code point - may be surrogate code point - make <d800 */
1594 c2-=0x2800;
1595 }
1596 }
1597
1598 cmpRes=c1-c2;
1599 break;
1600 }
1601
/* report how far the last recorded match positions are from the original string starts */
1602 if(matchLen1) {
1603 *matchLen1=m1-org1;
1604 *matchLen2=m2-org2;
1605 }
1606 return cmpRes;
1607 }
1608
1609 /* internal function */
1610 U_CFUNC int32_t
1611 u_strcmpFold(const UChar *s1, int32_t length1,
1612 const UChar *s2, int32_t length2,
1613 uint32_t options,
1614 UErrorCode *pErrorCode) {
1615 return _cmpFold(s1, length1, s2, length2, options, NULL, NULL, pErrorCode);
1616 }
1617
1618 /* public API functions */
1619
1620 U_CAPI int32_t U_EXPORT2
1621 u_strCaseCompare(const UChar *s1, int32_t length1,
1622 const UChar *s2, int32_t length2,
1623 uint32_t options,
1624 UErrorCode *pErrorCode) {
1625 /* argument checking */
1626 if(pErrorCode==0 || U_FAILURE(*pErrorCode)) {
1627 return 0;
1628 }
1629 if(s1==NULL || length1<-1 || s2==NULL || length2<-1) {
1630 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1631 return 0;
1632 }
1633 return u_strcmpFold(s1, length1, s2, length2,
1634 options|U_COMPARE_IGNORE_CASE,
1635 pErrorCode);
1636 }
1637
1638 U_CAPI int32_t U_EXPORT2
1639 u_strcasecmp(const UChar *s1, const UChar *s2, uint32_t options) {
1640 UErrorCode errorCode=U_ZERO_ERROR;
1641 return u_strcmpFold(s1, -1, s2, -1,
1642 options|U_COMPARE_IGNORE_CASE,
1643 &errorCode);
1644 }
1645
1646 U_CAPI int32_t U_EXPORT2
1647 u_memcasecmp(const UChar *s1, const UChar *s2, int32_t length, uint32_t options) {
1648 UErrorCode errorCode=U_ZERO_ERROR;
1649 return u_strcmpFold(s1, length, s2, length,
1650 options|U_COMPARE_IGNORE_CASE,
1651 &errorCode);
1652 }
1653
1654 U_CAPI int32_t U_EXPORT2
1655 u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options) {
1656 UErrorCode errorCode=U_ZERO_ERROR;
1657 return u_strcmpFold(s1, n, s2, n,
1658 options|(U_COMPARE_IGNORE_CASE|_STRNCMP_STYLE),
1659 &errorCode);
1660 }
1661
1662 /* internal API - detect length of shared prefix */
1663 U_CAPI void
1664 u_caseInsensitivePrefixMatch(const UChar *s1, int32_t length1,
1665 const UChar *s2, int32_t length2,
1666 uint32_t options,
1667 int32_t *matchLen1, int32_t *matchLen2,
1668 UErrorCode *pErrorCode) {
1669 _cmpFold(s1, length1, s2, length2, options,
1670 matchLen1, matchLen2, pErrorCode);
1671 }