]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
b75a7d8f A |
3 | /* |
4 | ******************************************************************************* | |
5 | * | |
b331163b | 6 | * Copyright (C) 2001-2015, International Business Machines |
b75a7d8f A |
7 | * Corporation and others. All Rights Reserved. |
8 | * | |
9 | ******************************************************************************* | |
4388f060 | 10 | * file name: ustrcase.cpp |
f3c0d7a5 | 11 | * encoding: UTF-8 |
b75a7d8f A |
12 | * tab size: 8 (not used) |
13 | * indentation:4 | |
14 | * | |
15 | * created on: 2002feb20 | |
16 | * created by: Markus W. Scherer | |
17 | * | |
18 | * Implementation file for string casing C API functions. | |
19 | * Uses functions from uchar.c for basic functionality that requires access | |
20 | * to the Unicode Character Database (uprops.dat). | |
21 | */ | |
22 | ||
23 | #include "unicode/utypes.h" | |
4388f060 | 24 | #include "unicode/brkiter.h" |
f3c0d7a5 A |
25 | #include "unicode/casemap.h" |
26 | #include "unicode/edits.h" | |
0f5d89e8 | 27 | #include "unicode/stringoptions.h" |
b75a7d8f | 28 | #include "unicode/ustring.h" |
46f4442e | 29 | #include "unicode/ucasemap.h" |
b75a7d8f | 30 | #include "unicode/ubrk.h" |
4388f060 A |
31 | #include "unicode/utf.h" |
32 | #include "unicode/utf16.h" | |
b75a7d8f | 33 | #include "cmemory.h" |
374ca955 | 34 | #include "ucase.h" |
f3c0d7a5 | 35 | #include "ucasemap_imp.h" |
b75a7d8f | 36 | #include "ustr_imp.h" |
b331163b | 37 | #include "uassert.h" |
4388f060 | 38 | |
f3c0d7a5 A |
39 | U_NAMESPACE_BEGIN |
40 | ||
41 | namespace { | |
42 | ||
43 | int32_t checkOverflowAndEditsError(int32_t destIndex, int32_t destCapacity, | |
44 | Edits *edits, UErrorCode &errorCode) { | |
45 | if (U_SUCCESS(errorCode)) { | |
46 | if (destIndex > destCapacity) { | |
47 | errorCode = U_BUFFER_OVERFLOW_ERROR; | |
48 | } else if (edits != NULL) { | |
49 | edits->copyErrorTo(errorCode); | |
50 | } | |
51 | } | |
52 | return destIndex; | |
53 | } | |
54 | ||
4388f060 | 55 | /* Appends a full case mapping result, see UCASE_MAX_STRING_LENGTH. */ |
0f5d89e8 | 56 | inline int32_t |
374ca955 | 57 | appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity, |
f3c0d7a5 A |
58 | int32_t result, const UChar *s, |
59 | int32_t cpLength, uint32_t options, icu::Edits *edits) { | |
374ca955 A |
60 | UChar32 c; |
61 | int32_t length; | |
62 | ||
63 | /* decode the result */ | |
64 | if(result<0) { | |
65 | /* (not) original code point */ | |
f3c0d7a5 A |
66 | if(edits!=NULL) { |
67 | edits->addUnchanged(cpLength); | |
0f5d89e8 A |
68 | } |
69 | if(options & U_OMIT_UNCHANGED_TEXT) { | |
70 | return destIndex; | |
f3c0d7a5 | 71 | } |
374ca955 | 72 | c=~result; |
f3c0d7a5 A |
73 | if(destIndex<destCapacity && c<=0xffff) { // BMP slightly-fastpath |
74 | dest[destIndex++]=(UChar)c; | |
75 | return destIndex; | |
76 | } | |
77 | length=cpLength; | |
374ca955 | 78 | } else { |
f3c0d7a5 A |
79 | if(result<=UCASE_MAX_STRING_LENGTH) { |
80 | c=U_SENTINEL; | |
81 | length=result; | |
82 | } else if(destIndex<destCapacity && result<=0xffff) { // BMP slightly-fastpath | |
83 | dest[destIndex++]=(UChar)result; | |
84 | if(edits!=NULL) { | |
85 | edits->addReplace(cpLength, 1); | |
86 | } | |
87 | return destIndex; | |
88 | } else { | |
89 | c=result; | |
90 | length=U16_LENGTH(c); | |
91 | } | |
92 | if(edits!=NULL) { | |
93 | edits->addReplace(cpLength, length); | |
94 | } | |
a62d09fc A |
95 | } |
96 | if(length>(INT32_MAX-destIndex)) { | |
97 | return -1; // integer overflow | |
374ca955 A |
98 | } |
99 | ||
100 | if(destIndex<destCapacity) { | |
101 | /* append the result */ | |
a62d09fc | 102 | if(c>=0) { |
374ca955 A |
103 | /* code point */ |
104 | UBool isError=FALSE; | |
105 | U16_APPEND(dest, destIndex, destCapacity, c, isError); | |
106 | if(isError) { | |
107 | /* overflow, nothing written */ | |
a62d09fc | 108 | destIndex+=length; |
374ca955 A |
109 | } |
110 | } else { | |
111 | /* string */ | |
112 | if((destIndex+length)<=destCapacity) { | |
113 | while(length>0) { | |
114 | dest[destIndex++]=*s++; | |
115 | --length; | |
116 | } | |
117 | } else { | |
118 | /* overflow */ | |
119 | destIndex+=length; | |
120 | } | |
121 | } | |
122 | } else { | |
123 | /* preflight */ | |
a62d09fc | 124 | destIndex+=length; |
374ca955 A |
125 | } |
126 | return destIndex; | |
127 | } | |
128 | ||
0f5d89e8 | 129 | inline int32_t |
a62d09fc A |
130 | appendUChar(UChar *dest, int32_t destIndex, int32_t destCapacity, UChar c) { |
131 | if(destIndex<destCapacity) { | |
132 | dest[destIndex]=c; | |
133 | } else if(destIndex==INT32_MAX) { | |
134 | return -1; // integer overflow | |
135 | } | |
136 | return destIndex+1; | |
137 | } | |
138 | ||
0f5d89e8 A |
139 | int32_t |
140 | appendNonEmptyUnchanged(UChar *dest, int32_t destIndex, int32_t destCapacity, | |
141 | const UChar *s, int32_t length, uint32_t options, icu::Edits *edits) { | |
142 | if(edits!=NULL) { | |
143 | edits->addUnchanged(length); | |
144 | } | |
145 | if(options & U_OMIT_UNCHANGED_TEXT) { | |
146 | return destIndex; | |
147 | } | |
148 | if(length>(INT32_MAX-destIndex)) { | |
149 | return -1; // integer overflow | |
150 | } | |
151 | if((destIndex+length)<=destCapacity) { | |
152 | u_memcpy(dest+destIndex, s, length); | |
153 | } | |
154 | return destIndex + length; | |
155 | } | |
156 | ||
157 | inline int32_t | |
f3c0d7a5 A |
158 | appendUnchanged(UChar *dest, int32_t destIndex, int32_t destCapacity, |
159 | const UChar *s, int32_t length, uint32_t options, icu::Edits *edits) { | |
0f5d89e8 A |
160 | if (length <= 0) { |
161 | return destIndex; | |
f3c0d7a5 | 162 | } |
0f5d89e8 | 163 | return appendNonEmptyUnchanged(dest, destIndex, destCapacity, s, length, options, edits); |
f3c0d7a5 A |
164 | } |
165 | ||
0f5d89e8 | 166 | UChar32 U_CALLCONV |
374ca955 A |
167 | utf16_caseContextIterator(void *context, int8_t dir) { |
168 | UCaseContext *csc=(UCaseContext *)context; | |
169 | UChar32 c; | |
170 | ||
171 | if(dir<0) { | |
172 | /* reset for backward iteration */ | |
173 | csc->index=csc->cpStart; | |
174 | csc->dir=dir; | |
175 | } else if(dir>0) { | |
176 | /* reset for forward iteration */ | |
177 | csc->index=csc->cpLimit; | |
178 | csc->dir=dir; | |
179 | } else { | |
180 | /* continue current iteration direction */ | |
181 | dir=csc->dir; | |
182 | } | |
183 | ||
184 | if(dir<0) { | |
185 | if(csc->start<csc->index) { | |
186 | U16_PREV((const UChar *)csc->p, csc->start, csc->index, c); | |
187 | return c; | |
188 | } | |
189 | } else { | |
190 | if(csc->index<csc->limit) { | |
191 | U16_NEXT((const UChar *)csc->p, csc->index, csc->limit, c); | |
192 | return c; | |
193 | } | |
194 | } | |
195 | return U_SENTINEL; | |
196 | } | |
197 | ||
0f5d89e8 A |
198 | /** |
199 | * caseLocale >= 0: Lowercases [srcStart..srcLimit[ but takes context [0..srcLength[ into account. | |
200 | * caseLocale < 0: Case-folds [srcStart..srcLimit[. | |
374ca955 | 201 | */ |
0f5d89e8 A |
202 | int32_t toLower(int32_t caseLocale, uint32_t options, |
203 | UChar *dest, int32_t destCapacity, | |
204 | const UChar *src, UCaseContext *csc, int32_t srcStart, int32_t srcLimit, | |
205 | icu::Edits *edits, UErrorCode &errorCode) { | |
206 | const int8_t *latinToLower; | |
207 | if (caseLocale == UCASE_LOC_ROOT || | |
208 | (caseLocale >= 0 ? | |
209 | !(caseLocale == UCASE_LOC_TURKISH || caseLocale == UCASE_LOC_LITHUANIAN) : | |
210 | (options & _FOLD_CASE_OPTIONS_MASK) == U_FOLD_CASE_DEFAULT)) { | |
211 | latinToLower = LatinCase::TO_LOWER_NORMAL; | |
212 | } else { | |
213 | latinToLower = LatinCase::TO_LOWER_TR_LT; | |
214 | } | |
215 | const UTrie2 *trie = ucase_getTrie(); | |
216 | int32_t destIndex = 0; | |
217 | int32_t prev = srcStart; | |
218 | int32_t srcIndex = srcStart; | |
219 | for (;;) { | |
220 | // fast path for simple cases | |
221 | UChar lead; | |
222 | while (srcIndex < srcLimit) { | |
223 | lead = src[srcIndex]; | |
224 | int32_t delta; | |
225 | if (lead < LatinCase::LONG_S) { | |
226 | int8_t d = latinToLower[lead]; | |
227 | if (d == LatinCase::EXC) { break; } | |
228 | ++srcIndex; | |
229 | if (d == 0) { continue; } | |
230 | delta = d; | |
231 | } else if (lead >= 0xd800) { | |
232 | break; // surrogate or higher | |
233 | } else { | |
234 | uint16_t props = UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, lead); | |
235 | if (UCASE_HAS_EXCEPTION(props)) { break; } | |
236 | ++srcIndex; | |
237 | if (!UCASE_IS_UPPER_OR_TITLE(props) || (delta = UCASE_GET_DELTA(props)) == 0) { | |
238 | continue; | |
239 | } | |
240 | } | |
241 | lead += delta; | |
242 | destIndex = appendUnchanged(dest, destIndex, destCapacity, | |
243 | src + prev, srcIndex - 1 - prev, options, edits); | |
244 | if (destIndex >= 0) { | |
245 | destIndex = appendUChar(dest, destIndex, destCapacity, lead); | |
246 | if (edits != nullptr) { | |
247 | edits->addReplace(1, 1); | |
248 | } | |
249 | } | |
250 | if (destIndex < 0) { | |
251 | errorCode = U_INDEX_OUTOFBOUNDS_ERROR; | |
252 | return 0; | |
253 | } | |
254 | prev = srcIndex; | |
255 | } | |
256 | if (srcIndex >= srcLimit) { | |
257 | break; | |
258 | } | |
259 | // slow path | |
260 | int32_t cpStart = srcIndex++; | |
261 | UChar trail; | |
f3c0d7a5 | 262 | UChar32 c; |
0f5d89e8 A |
263 | if (U16_IS_LEAD(lead) && srcIndex < srcLimit && U16_IS_TRAIL(trail = src[srcIndex])) { |
264 | c = U16_GET_SUPPLEMENTARY(lead, trail); | |
265 | ++srcIndex; | |
266 | } else { | |
267 | c = lead; | |
268 | } | |
f3c0d7a5 | 269 | const UChar *s; |
0f5d89e8 A |
270 | if (caseLocale >= 0) { |
271 | csc->cpStart = cpStart; | |
272 | csc->cpLimit = srcIndex; | |
273 | c = ucase_toFullLower(c, utf16_caseContextIterator, csc, &s, caseLocale); | |
274 | } else { | |
275 | c = ucase_toFullFolding(c, &s, options); | |
276 | } | |
277 | if (c >= 0) { | |
278 | destIndex = appendUnchanged(dest, destIndex, destCapacity, | |
279 | src + prev, cpStart - prev, options, edits); | |
280 | if (destIndex >= 0) { | |
281 | destIndex = appendResult(dest, destIndex, destCapacity, c, s, | |
282 | srcIndex - cpStart, options, edits); | |
283 | } | |
284 | if (destIndex < 0) { | |
285 | errorCode = U_INDEX_OUTOFBOUNDS_ERROR; | |
286 | return 0; | |
287 | } | |
288 | prev = srcIndex; | |
73c04bcf | 289 | } |
374ca955 | 290 | } |
0f5d89e8 A |
291 | destIndex = appendUnchanged(dest, destIndex, destCapacity, |
292 | src + prev, srcIndex - prev, options, edits); | |
293 | if (destIndex < 0) { | |
294 | errorCode = U_INDEX_OUTOFBOUNDS_ERROR; | |
295 | return 0; | |
296 | } | |
297 | return destIndex; | |
298 | } | |
374ca955 | 299 | |
0f5d89e8 A |
300 | int32_t toUpper(int32_t caseLocale, uint32_t options, |
301 | UChar *dest, int32_t destCapacity, | |
302 | const UChar *src, UCaseContext *csc, int32_t srcLength, | |
303 | icu::Edits *edits, UErrorCode &errorCode) { | |
304 | const int8_t *latinToUpper; | |
305 | if (caseLocale == UCASE_LOC_TURKISH) { | |
306 | latinToUpper = LatinCase::TO_UPPER_TR; | |
307 | } else { | |
308 | latinToUpper = LatinCase::TO_UPPER_NORMAL; | |
309 | } | |
310 | const UTrie2 *trie = ucase_getTrie(); | |
311 | int32_t destIndex = 0; | |
312 | int32_t prev = 0; | |
313 | int32_t srcIndex = 0; | |
314 | for (;;) { | |
315 | // fast path for simple cases | |
316 | UChar lead; | |
317 | while (srcIndex < srcLength) { | |
318 | lead = src[srcIndex]; | |
319 | int32_t delta; | |
320 | if (lead < LatinCase::LONG_S) { | |
321 | int8_t d = latinToUpper[lead]; | |
322 | if (d == LatinCase::EXC) { break; } | |
323 | ++srcIndex; | |
324 | if (d == 0) { continue; } | |
325 | delta = d; | |
326 | } else if (lead >= 0xd800) { | |
327 | break; // surrogate or higher | |
328 | } else { | |
329 | uint16_t props = UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie, lead); | |
330 | if (UCASE_HAS_EXCEPTION(props)) { break; } | |
331 | ++srcIndex; | |
332 | if (UCASE_GET_TYPE(props) != UCASE_LOWER || (delta = UCASE_GET_DELTA(props)) == 0) { | |
333 | continue; | |
334 | } | |
335 | } | |
336 | lead += delta; | |
337 | destIndex = appendUnchanged(dest, destIndex, destCapacity, | |
338 | src + prev, srcIndex - 1 - prev, options, edits); | |
339 | if (destIndex >= 0) { | |
340 | destIndex = appendUChar(dest, destIndex, destCapacity, lead); | |
341 | if (edits != nullptr) { | |
342 | edits->addReplace(1, 1); | |
343 | } | |
344 | } | |
345 | if (destIndex < 0) { | |
346 | errorCode = U_INDEX_OUTOFBOUNDS_ERROR; | |
347 | return 0; | |
348 | } | |
349 | prev = srcIndex; | |
350 | } | |
351 | if (srcIndex >= srcLength) { | |
352 | break; | |
353 | } | |
354 | // slow path | |
355 | int32_t cpStart; | |
356 | csc->cpStart = cpStart = srcIndex++; | |
357 | UChar trail; | |
358 | UChar32 c; | |
359 | if (U16_IS_LEAD(lead) && srcIndex < srcLength && U16_IS_TRAIL(trail = src[srcIndex])) { | |
360 | c = U16_GET_SUPPLEMENTARY(lead, trail); | |
361 | ++srcIndex; | |
362 | } else { | |
363 | c = lead; | |
364 | } | |
365 | csc->cpLimit = srcIndex; | |
366 | const UChar *s; | |
367 | c = ucase_toFullUpper(c, utf16_caseContextIterator, csc, &s, caseLocale); | |
368 | if (c >= 0) { | |
369 | destIndex = appendUnchanged(dest, destIndex, destCapacity, | |
370 | src + prev, cpStart - prev, options, edits); | |
371 | if (destIndex >= 0) { | |
372 | destIndex = appendResult(dest, destIndex, destCapacity, c, s, | |
373 | srcIndex - cpStart, options, edits); | |
374 | } | |
375 | if (destIndex < 0) { | |
376 | errorCode = U_INDEX_OUTOFBOUNDS_ERROR; | |
377 | return 0; | |
378 | } | |
379 | prev = srcIndex; | |
380 | } | |
381 | } | |
382 | destIndex = appendUnchanged(dest, destIndex, destCapacity, | |
383 | src + prev, srcIndex - prev, options, edits); | |
384 | if (destIndex < 0) { | |
385 | errorCode = U_INDEX_OUTOFBOUNDS_ERROR; | |
386 | return 0; | |
387 | } | |
374ca955 A |
388 | return destIndex; |
389 | } | |
390 | ||
0f5d89e8 A |
391 | } // namespace |
392 | ||
393 | U_NAMESPACE_END | |
394 | ||
395 | U_NAMESPACE_USE | |
396 | ||
b75a7d8f A |
397 | #if !UCONFIG_NO_BREAK_ITERATION |
398 | ||
4388f060 | 399 | U_CFUNC int32_t U_CALLCONV |
f3c0d7a5 | 400 | ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *iter, |
4388f060 A |
401 | UChar *dest, int32_t destCapacity, |
402 | const UChar *src, int32_t srcLength, | |
f3c0d7a5 A |
403 | icu::Edits *edits, |
404 | UErrorCode &errorCode) { | |
0f5d89e8 | 405 | if (!ustrcase_checkTitleAdjustmentOptions(options, errorCode)) { |
46f4442e A |
406 | return 0; |
407 | } | |
408 | ||
b75a7d8f | 409 | /* set up local variables */ |
4388f060 A |
410 | UCaseContext csc=UCASECONTEXT_INITIALIZER; |
411 | csc.p=(void *)src; | |
412 | csc.limit=srcLength; | |
f3c0d7a5 A |
413 | int32_t destIndex=0; |
414 | int32_t prev=0; | |
415 | UBool isFirstIndex=TRUE; | |
b75a7d8f A |
416 | |
417 | /* titlecasing loop */ | |
418 | while(prev<srcLength) { | |
419 | /* find next index where to titlecase */ | |
f3c0d7a5 | 420 | int32_t index; |
b75a7d8f A |
421 | if(isFirstIndex) { |
422 | isFirstIndex=FALSE; | |
f3c0d7a5 | 423 | index=iter->first(); |
b75a7d8f | 424 | } else { |
f3c0d7a5 | 425 | index=iter->next(); |
b75a7d8f | 426 | } |
f3c0d7a5 A |
427 | if(index==UBRK_DONE || index>srcLength) { |
428 | index=srcLength; | |
b75a7d8f A |
429 | } |
430 | ||
73c04bcf | 431 | /* |
0f5d89e8 A |
432 | * Segment [prev..index[ into 3 parts: |
433 | * a) skipped characters (copy as-is) [prev..titleStart[ | |
434 | * b) first letter (titlecase) [titleStart..titleLimit[ | |
73c04bcf A |
435 | * c) subsequent characters (lowercase) [titleLimit..index[ |
436 | */ | |
f3c0d7a5 | 437 | if(prev<index) { |
0f5d89e8 | 438 | // Find and copy skipped characters [prev..titleStart[ |
f3c0d7a5 A |
439 | int32_t titleStart=prev; |
440 | int32_t titleLimit=prev; | |
441 | UChar32 c; | |
442 | U16_NEXT(src, titleLimit, index, c); | |
0f5d89e8 A |
443 | if ((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0) { |
444 | // Adjust the titlecasing index to the next cased character, | |
445 | // or to the next letter/number/symbol/private use. | |
446 | // Stop with titleStart<titleLimit<=index | |
447 | // if there is a character to be titlecased, | |
448 | // or else stop with titleStart==titleLimit==index. | |
449 | UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0; | |
450 | while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) { | |
46f4442e | 451 | titleStart=titleLimit; |
f3c0d7a5 | 452 | if(titleLimit==index) { |
46f4442e A |
453 | break; |
454 | } | |
f3c0d7a5 | 455 | U16_NEXT(src, titleLimit, index, c); |
73c04bcf | 456 | } |
0f5d89e8 A |
457 | if (prev < titleStart) { |
458 | destIndex=appendUnchanged(dest, destIndex, destCapacity, | |
459 | src+prev, titleStart-prev, options, edits); | |
460 | if(destIndex<0) { | |
461 | errorCode=U_INDEX_OUTOFBOUNDS_ERROR; | |
462 | return 0; | |
463 | } | |
73c04bcf A |
464 | } |
465 | } | |
b75a7d8f | 466 | |
73c04bcf A |
467 | if(titleStart<titleLimit) { |
468 | /* titlecase c which is from [titleStart..titleLimit[ */ | |
4388f060 A |
469 | csc.cpStart=titleStart; |
470 | csc.cpLimit=titleLimit; | |
f3c0d7a5 A |
471 | const UChar *s; |
472 | c=ucase_toFullTitle(c, utf16_caseContextIterator, &csc, &s, caseLocale); | |
473 | destIndex=appendResult(dest, destIndex, destCapacity, c, s, | |
474 | titleLimit-titleStart, options, edits); | |
a62d09fc | 475 | if(destIndex<0) { |
f3c0d7a5 | 476 | errorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
a62d09fc A |
477 | return 0; |
478 | } | |
46f4442e A |
479 | |
480 | /* Special case Dutch IJ titlecasing */ | |
f3c0d7a5 A |
481 | if (titleStart+1 < index && |
482 | caseLocale == UCASE_LOC_DUTCH && | |
483 | (src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) { | |
484 | if (src[titleStart+1] == 0x006A) { | |
485 | destIndex=appendUChar(dest, destIndex, destCapacity, 0x004A); | |
486 | if(destIndex<0) { | |
487 | errorCode=U_INDEX_OUTOFBOUNDS_ERROR; | |
488 | return 0; | |
489 | } | |
490 | if(edits!=NULL) { | |
491 | edits->addReplace(1, 1); | |
492 | } | |
493 | titleLimit++; | |
494 | } else if (src[titleStart+1] == 0x004A) { | |
495 | // Keep the capital J from getting lowercased. | |
496 | destIndex=appendUnchanged(dest, destIndex, destCapacity, | |
497 | src+titleStart+1, 1, options, edits); | |
498 | if(destIndex<0) { | |
499 | errorCode=U_INDEX_OUTOFBOUNDS_ERROR; | |
500 | return 0; | |
501 | } | |
502 | titleLimit++; | |
a62d09fc | 503 | } |
46f4442e | 504 | } |
73c04bcf A |
505 | |
506 | /* lowercase [titleLimit..index[ */ | |
f3c0d7a5 A |
507 | if(titleLimit<index) { |
508 | if((options&U_TITLECASE_NO_LOWERCASE)==0) { | |
46f4442e A |
509 | /* Normal operation: Lowercase the rest of the word. */ |
510 | destIndex+= | |
0f5d89e8 A |
511 | toLower( |
512 | caseLocale, options, | |
46f4442e | 513 | dest+destIndex, destCapacity-destIndex, |
0f5d89e8 | 514 | src, &csc, titleLimit, index, |
f3c0d7a5 A |
515 | edits, errorCode); |
516 | if(errorCode==U_BUFFER_OVERFLOW_ERROR) { | |
517 | errorCode=U_ZERO_ERROR; | |
518 | } | |
519 | if(U_FAILURE(errorCode)) { | |
a62d09fc A |
520 | return destIndex; |
521 | } | |
46f4442e A |
522 | } else { |
523 | /* Optionally just copy the rest of the word unchanged. */ | |
f3c0d7a5 A |
524 | destIndex=appendUnchanged(dest, destIndex, destCapacity, |
525 | src+titleLimit, index-titleLimit, options, edits); | |
526 | if(destIndex<0) { | |
527 | errorCode=U_INDEX_OUTOFBOUNDS_ERROR; | |
a62d09fc A |
528 | return 0; |
529 | } | |
46f4442e | 530 | } |
73c04bcf A |
531 | } |
532 | } | |
b75a7d8f A |
533 | } |
534 | ||
f3c0d7a5 A |
535 | prev=index; |
536 | } | |
537 | ||
538 | return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode); | |
539 | } | |
540 | ||
541 | #endif // !UCONFIG_NO_BREAK_ITERATION | |
542 | ||
543 | U_NAMESPACE_BEGIN | |
544 | namespace GreekUpper { | |
545 | ||
546 | // Data generated by prototype code, see | |
547 | // http://site.icu-project.org/design/case/greek-upper | |
548 | // TODO: Move this data into ucase.icu. | |
549 | static const uint16_t data0370[] = { | |
550 | // U+0370..03FF | |
551 | 0x0370, | |
552 | 0x0370, | |
553 | 0x0372, | |
554 | 0x0372, | |
555 | 0, | |
556 | 0, | |
557 | 0x0376, | |
558 | 0x0376, | |
559 | 0, | |
560 | 0, | |
561 | 0x037A, | |
562 | 0x03FD, | |
563 | 0x03FE, | |
564 | 0x03FF, | |
565 | 0, | |
566 | 0x037F, | |
567 | 0, | |
568 | 0, | |
569 | 0, | |
570 | 0, | |
571 | 0, | |
572 | 0, | |
573 | 0x0391 | HAS_VOWEL | HAS_ACCENT, | |
574 | 0, | |
575 | 0x0395 | HAS_VOWEL | HAS_ACCENT, | |
576 | 0x0397 | HAS_VOWEL | HAS_ACCENT, | |
577 | 0x0399 | HAS_VOWEL | HAS_ACCENT, | |
578 | 0, | |
579 | 0x039F | HAS_VOWEL | HAS_ACCENT, | |
580 | 0, | |
581 | 0x03A5 | HAS_VOWEL | HAS_ACCENT, | |
582 | 0x03A9 | HAS_VOWEL | HAS_ACCENT, | |
583 | 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, | |
584 | 0x0391 | HAS_VOWEL, | |
585 | 0x0392, | |
586 | 0x0393, | |
587 | 0x0394, | |
588 | 0x0395 | HAS_VOWEL, | |
589 | 0x0396, | |
590 | 0x0397 | HAS_VOWEL, | |
591 | 0x0398, | |
592 | 0x0399 | HAS_VOWEL, | |
593 | 0x039A, | |
594 | 0x039B, | |
595 | 0x039C, | |
596 | 0x039D, | |
597 | 0x039E, | |
598 | 0x039F | HAS_VOWEL, | |
599 | 0x03A0, | |
600 | 0x03A1, | |
601 | 0, | |
602 | 0x03A3, | |
603 | 0x03A4, | |
604 | 0x03A5 | HAS_VOWEL, | |
605 | 0x03A6, | |
606 | 0x03A7, | |
607 | 0x03A8, | |
608 | 0x03A9 | HAS_VOWEL, | |
609 | 0x0399 | HAS_VOWEL | HAS_DIALYTIKA, | |
610 | 0x03A5 | HAS_VOWEL | HAS_DIALYTIKA, | |
611 | 0x0391 | HAS_VOWEL | HAS_ACCENT, | |
612 | 0x0395 | HAS_VOWEL | HAS_ACCENT, | |
613 | 0x0397 | HAS_VOWEL | HAS_ACCENT, | |
614 | 0x0399 | HAS_VOWEL | HAS_ACCENT, | |
615 | 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, | |
616 | 0x0391 | HAS_VOWEL, | |
617 | 0x0392, | |
618 | 0x0393, | |
619 | 0x0394, | |
620 | 0x0395 | HAS_VOWEL, | |
621 | 0x0396, | |
622 | 0x0397 | HAS_VOWEL, | |
623 | 0x0398, | |
624 | 0x0399 | HAS_VOWEL, | |
625 | 0x039A, | |
626 | 0x039B, | |
627 | 0x039C, | |
628 | 0x039D, | |
629 | 0x039E, | |
630 | 0x039F | HAS_VOWEL, | |
631 | 0x03A0, | |
632 | 0x03A1, | |
633 | 0x03A3, | |
634 | 0x03A3, | |
635 | 0x03A4, | |
636 | 0x03A5 | HAS_VOWEL, | |
637 | 0x03A6, | |
638 | 0x03A7, | |
639 | 0x03A8, | |
640 | 0x03A9 | HAS_VOWEL, | |
641 | 0x0399 | HAS_VOWEL | HAS_DIALYTIKA, | |
642 | 0x03A5 | HAS_VOWEL | HAS_DIALYTIKA, | |
643 | 0x039F | HAS_VOWEL | HAS_ACCENT, | |
644 | 0x03A5 | HAS_VOWEL | HAS_ACCENT, | |
645 | 0x03A9 | HAS_VOWEL | HAS_ACCENT, | |
646 | 0x03CF, | |
647 | 0x0392, | |
648 | 0x0398, | |
649 | 0x03D2, | |
650 | 0x03D2 | HAS_ACCENT, | |
651 | 0x03D2 | HAS_DIALYTIKA, | |
652 | 0x03A6, | |
653 | 0x03A0, | |
654 | 0x03CF, | |
655 | 0x03D8, | |
656 | 0x03D8, | |
657 | 0x03DA, | |
658 | 0x03DA, | |
659 | 0x03DC, | |
660 | 0x03DC, | |
661 | 0x03DE, | |
662 | 0x03DE, | |
663 | 0x03E0, | |
664 | 0x03E0, | |
665 | 0, | |
666 | 0, | |
667 | 0, | |
668 | 0, | |
669 | 0, | |
670 | 0, | |
671 | 0, | |
672 | 0, | |
673 | 0, | |
674 | 0, | |
675 | 0, | |
676 | 0, | |
677 | 0, | |
678 | 0, | |
679 | 0x039A, | |
680 | 0x03A1, | |
681 | 0x03F9, | |
682 | 0x037F, | |
683 | 0x03F4, | |
684 | 0x0395 | HAS_VOWEL, | |
685 | 0, | |
686 | 0x03F7, | |
687 | 0x03F7, | |
688 | 0x03F9, | |
689 | 0x03FA, | |
690 | 0x03FA, | |
691 | 0x03FC, | |
692 | 0x03FD, | |
693 | 0x03FE, | |
694 | 0x03FF, | |
695 | }; | |
696 | ||
697 | static const uint16_t data1F00[] = { | |
698 | // U+1F00..1FFF | |
699 | 0x0391 | HAS_VOWEL, | |
700 | 0x0391 | HAS_VOWEL, | |
701 | 0x0391 | HAS_VOWEL | HAS_ACCENT, | |
702 | 0x0391 | HAS_VOWEL | HAS_ACCENT, | |
703 | 0x0391 | HAS_VOWEL | HAS_ACCENT, | |
704 | 0x0391 | HAS_VOWEL | HAS_ACCENT, | |
705 | 0x0391 | HAS_VOWEL | HAS_ACCENT, | |
706 | 0x0391 | HAS_VOWEL | HAS_ACCENT, | |
707 | 0x0391 | HAS_VOWEL, | |
708 | 0x0391 | HAS_VOWEL, | |
709 | 0x0391 | HAS_VOWEL | HAS_ACCENT, | |
710 | 0x0391 | HAS_VOWEL | HAS_ACCENT, | |
711 | 0x0391 | HAS_VOWEL | HAS_ACCENT, | |
712 | 0x0391 | HAS_VOWEL | HAS_ACCENT, | |
713 | 0x0391 | HAS_VOWEL | HAS_ACCENT, | |
714 | 0x0391 | HAS_VOWEL | HAS_ACCENT, | |
715 | 0x0395 | HAS_VOWEL, | |
716 | 0x0395 | HAS_VOWEL, | |
717 | 0x0395 | HAS_VOWEL | HAS_ACCENT, | |
718 | 0x0395 | HAS_VOWEL | HAS_ACCENT, | |
719 | 0x0395 | HAS_VOWEL | HAS_ACCENT, | |
720 | 0x0395 | HAS_VOWEL | HAS_ACCENT, | |
721 | 0, | |
722 | 0, | |
723 | 0x0395 | HAS_VOWEL, | |
724 | 0x0395 | HAS_VOWEL, | |
725 | 0x0395 | HAS_VOWEL | HAS_ACCENT, | |
726 | 0x0395 | HAS_VOWEL | HAS_ACCENT, | |
727 | 0x0395 | HAS_VOWEL | HAS_ACCENT, | |
728 | 0x0395 | HAS_VOWEL | HAS_ACCENT, | |
729 | 0, | |
730 | 0, | |
731 | 0x0397 | HAS_VOWEL, | |
732 | 0x0397 | HAS_VOWEL, | |
733 | 0x0397 | HAS_VOWEL | HAS_ACCENT, | |
734 | 0x0397 | HAS_VOWEL | HAS_ACCENT, | |
735 | 0x0397 | HAS_VOWEL | HAS_ACCENT, | |
736 | 0x0397 | HAS_VOWEL | HAS_ACCENT, | |
737 | 0x0397 | HAS_VOWEL | HAS_ACCENT, | |
738 | 0x0397 | HAS_VOWEL | HAS_ACCENT, | |
739 | 0x0397 | HAS_VOWEL, | |
740 | 0x0397 | HAS_VOWEL, | |
741 | 0x0397 | HAS_VOWEL | HAS_ACCENT, | |
742 | 0x0397 | HAS_VOWEL | HAS_ACCENT, | |
743 | 0x0397 | HAS_VOWEL | HAS_ACCENT, | |
744 | 0x0397 | HAS_VOWEL | HAS_ACCENT, | |
745 | 0x0397 | HAS_VOWEL | HAS_ACCENT, | |
746 | 0x0397 | HAS_VOWEL | HAS_ACCENT, | |
747 | 0x0399 | HAS_VOWEL, | |
748 | 0x0399 | HAS_VOWEL, | |
749 | 0x0399 | HAS_VOWEL | HAS_ACCENT, | |
750 | 0x0399 | HAS_VOWEL | HAS_ACCENT, | |
751 | 0x0399 | HAS_VOWEL | HAS_ACCENT, | |
752 | 0x0399 | HAS_VOWEL | HAS_ACCENT, | |
753 | 0x0399 | HAS_VOWEL | HAS_ACCENT, | |
754 | 0x0399 | HAS_VOWEL | HAS_ACCENT, | |
755 | 0x0399 | HAS_VOWEL, | |
756 | 0x0399 | HAS_VOWEL, | |
757 | 0x0399 | HAS_VOWEL | HAS_ACCENT, | |
758 | 0x0399 | HAS_VOWEL | HAS_ACCENT, | |
759 | 0x0399 | HAS_VOWEL | HAS_ACCENT, | |
760 | 0x0399 | HAS_VOWEL | HAS_ACCENT, | |
761 | 0x0399 | HAS_VOWEL | HAS_ACCENT, | |
762 | 0x0399 | HAS_VOWEL | HAS_ACCENT, | |
763 | 0x039F | HAS_VOWEL, | |
764 | 0x039F | HAS_VOWEL, | |
765 | 0x039F | HAS_VOWEL | HAS_ACCENT, | |
766 | 0x039F | HAS_VOWEL | HAS_ACCENT, | |
767 | 0x039F | HAS_VOWEL | HAS_ACCENT, | |
768 | 0x039F | HAS_VOWEL | HAS_ACCENT, | |
769 | 0, | |
770 | 0, | |
771 | 0x039F | HAS_VOWEL, | |
772 | 0x039F | HAS_VOWEL, | |
773 | 0x039F | HAS_VOWEL | HAS_ACCENT, | |
774 | 0x039F | HAS_VOWEL | HAS_ACCENT, | |
775 | 0x039F | HAS_VOWEL | HAS_ACCENT, | |
776 | 0x039F | HAS_VOWEL | HAS_ACCENT, | |
777 | 0, | |
778 | 0, | |
779 | 0x03A5 | HAS_VOWEL, | |
780 | 0x03A5 | HAS_VOWEL, | |
781 | 0x03A5 | HAS_VOWEL | HAS_ACCENT, | |
782 | 0x03A5 | HAS_VOWEL | HAS_ACCENT, | |
783 | 0x03A5 | HAS_VOWEL | HAS_ACCENT, | |
784 | 0x03A5 | HAS_VOWEL | HAS_ACCENT, | |
785 | 0x03A5 | HAS_VOWEL | HAS_ACCENT, | |
786 | 0x03A5 | HAS_VOWEL | HAS_ACCENT, | |
787 | 0, | |
788 | 0x03A5 | HAS_VOWEL, | |
789 | 0, | |
790 | 0x03A5 | HAS_VOWEL | HAS_ACCENT, | |
791 | 0, | |
792 | 0x03A5 | HAS_VOWEL | HAS_ACCENT, | |
793 | 0, | |
794 | 0x03A5 | HAS_VOWEL | HAS_ACCENT, | |
795 | 0x03A9 | HAS_VOWEL, | |
796 | 0x03A9 | HAS_VOWEL, | |
797 | 0x03A9 | HAS_VOWEL | HAS_ACCENT, | |
798 | 0x03A9 | HAS_VOWEL | HAS_ACCENT, | |
799 | 0x03A9 | HAS_VOWEL | HAS_ACCENT, | |
800 | 0x03A9 | HAS_VOWEL | HAS_ACCENT, | |
801 | 0x03A9 | HAS_VOWEL | HAS_ACCENT, | |
802 | 0x03A9 | HAS_VOWEL | HAS_ACCENT, | |
803 | 0x03A9 | HAS_VOWEL, | |
804 | 0x03A9 | HAS_VOWEL, | |
805 | 0x03A9 | HAS_VOWEL | HAS_ACCENT, | |
806 | 0x03A9 | HAS_VOWEL | HAS_ACCENT, | |
807 | 0x03A9 | HAS_VOWEL | HAS_ACCENT, | |
808 | 0x03A9 | HAS_VOWEL | HAS_ACCENT, | |
809 | 0x03A9 | HAS_VOWEL | HAS_ACCENT, | |
810 | 0x03A9 | HAS_VOWEL | HAS_ACCENT, | |
811 | 0x0391 | HAS_VOWEL | HAS_ACCENT, | |
812 | 0x0391 | HAS_VOWEL | HAS_ACCENT, | |
813 | 0x0395 | HAS_VOWEL | HAS_ACCENT, | |
814 | 0x0395 | HAS_VOWEL | HAS_ACCENT, | |
815 | 0x0397 | HAS_VOWEL | HAS_ACCENT, | |
816 | 0x0397 | HAS_VOWEL | HAS_ACCENT, | |
817 | 0x0399 | HAS_VOWEL | HAS_ACCENT, | |
818 | 0x0399 | HAS_VOWEL | HAS_ACCENT, | |
819 | 0x039F | HAS_VOWEL | HAS_ACCENT, | |
820 | 0x039F | HAS_VOWEL | HAS_ACCENT, | |
821 | 0x03A5 | HAS_VOWEL | HAS_ACCENT, | |
822 | 0x03A5 | HAS_VOWEL | HAS_ACCENT, | |
823 | 0x03A9 | HAS_VOWEL | HAS_ACCENT, | |
824 | 0x03A9 | HAS_VOWEL | HAS_ACCENT, | |
825 | 0, | |
826 | 0, | |
827 | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, | |
828 | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, | |
829 | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
830 | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
831 | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
832 | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
833 | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
834 | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
835 | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, | |
836 | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, | |
837 | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
838 | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
839 | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
840 | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
841 | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
842 | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
843 | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, | |
844 | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, | |
845 | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
846 | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
847 | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
848 | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
849 | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
850 | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
851 | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, | |
852 | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, | |
853 | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
854 | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
855 | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
856 | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
857 | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
858 | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
859 | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, | |
860 | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, | |
861 | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
862 | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
863 | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
864 | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
865 | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
866 | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
867 | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, | |
868 | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, | |
869 | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
870 | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
871 | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
872 | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
873 | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
874 | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
875 | 0x0391 | HAS_VOWEL, | |
876 | 0x0391 | HAS_VOWEL, | |
877 | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
878 | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, | |
879 | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
880 | 0, | |
881 | 0x0391 | HAS_VOWEL | HAS_ACCENT, | |
882 | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
883 | 0x0391 | HAS_VOWEL, | |
884 | 0x0391 | HAS_VOWEL, | |
885 | 0x0391 | HAS_VOWEL | HAS_ACCENT, | |
886 | 0x0391 | HAS_VOWEL | HAS_ACCENT, | |
887 | 0x0391 | HAS_VOWEL | HAS_YPOGEGRAMMENI, | |
888 | 0, | |
889 | 0x0399 | HAS_VOWEL, | |
890 | 0, | |
891 | 0, | |
892 | 0, | |
893 | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
894 | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, | |
895 | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
896 | 0, | |
897 | 0x0397 | HAS_VOWEL | HAS_ACCENT, | |
898 | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
899 | 0x0395 | HAS_VOWEL | HAS_ACCENT, | |
900 | 0x0395 | HAS_VOWEL | HAS_ACCENT, | |
901 | 0x0397 | HAS_VOWEL | HAS_ACCENT, | |
902 | 0x0397 | HAS_VOWEL | HAS_ACCENT, | |
903 | 0x0397 | HAS_VOWEL | HAS_YPOGEGRAMMENI, | |
904 | 0, | |
905 | 0, | |
906 | 0, | |
907 | 0x0399 | HAS_VOWEL, | |
908 | 0x0399 | HAS_VOWEL, | |
909 | 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, | |
910 | 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, | |
911 | 0, | |
912 | 0, | |
913 | 0x0399 | HAS_VOWEL | HAS_ACCENT, | |
914 | 0x0399 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, | |
915 | 0x0399 | HAS_VOWEL, | |
916 | 0x0399 | HAS_VOWEL, | |
917 | 0x0399 | HAS_VOWEL | HAS_ACCENT, | |
918 | 0x0399 | HAS_VOWEL | HAS_ACCENT, | |
919 | 0, | |
920 | 0, | |
921 | 0, | |
922 | 0, | |
923 | 0x03A5 | HAS_VOWEL, | |
924 | 0x03A5 | HAS_VOWEL, | |
925 | 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, | |
926 | 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, | |
927 | 0x03A1, | |
928 | 0x03A1, | |
929 | 0x03A5 | HAS_VOWEL | HAS_ACCENT, | |
930 | 0x03A5 | HAS_VOWEL | HAS_ACCENT | HAS_DIALYTIKA, | |
931 | 0x03A5 | HAS_VOWEL, | |
932 | 0x03A5 | HAS_VOWEL, | |
933 | 0x03A5 | HAS_VOWEL | HAS_ACCENT, | |
934 | 0x03A5 | HAS_VOWEL | HAS_ACCENT, | |
935 | 0x03A1, | |
936 | 0, | |
937 | 0, | |
938 | 0, | |
939 | 0, | |
940 | 0, | |
941 | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
942 | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, | |
943 | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
944 | 0, | |
945 | 0x03A9 | HAS_VOWEL | HAS_ACCENT, | |
946 | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI | HAS_ACCENT, | |
947 | 0x039F | HAS_VOWEL | HAS_ACCENT, | |
948 | 0x039F | HAS_VOWEL | HAS_ACCENT, | |
949 | 0x03A9 | HAS_VOWEL | HAS_ACCENT, | |
950 | 0x03A9 | HAS_VOWEL | HAS_ACCENT, | |
951 | 0x03A9 | HAS_VOWEL | HAS_YPOGEGRAMMENI, | |
952 | 0, | |
953 | 0, | |
954 | 0, | |
955 | }; | |
956 | ||
957 | // U+2126 Ohm sign: the value 0x03A9 combined with the HAS_VOWEL flag. | |
// This is the value getLetterData() returns for c == 0x2126, the only code point
// above U+1FFF that has letter data.
958 | static const uint16_t data2126 = 0x03A9 | HAS_VOWEL; | |
959 | ||
960 | uint32_t getLetterData(UChar32 c) { | |
961 | if (c < 0x370 || 0x2126 < c || (0x3ff < c && c < 0x1f00)) { | |
962 | return 0; | |
963 | } else if (c <= 0x3ff) { | |
964 | return data0370[c - 0x370]; | |
965 | } else if (c <= 0x1fff) { | |
966 | return data1F00[c - 0x1f00]; | |
967 | } else if (c == 0x2126) { | |
968 | return data2126; | |
969 | } else { | |
970 | return 0; | |
b75a7d8f | 971 | } |
f3c0d7a5 | 972 | } |
b75a7d8f | 973 | |
f3c0d7a5 A |
974 | uint32_t getDiacriticData(UChar32 c) { |
975 | switch (c) { | |
976 | case 0x0300: // varia | |
977 | case 0x0301: // tonos = oxia | |
978 | case 0x0342: // perispomeni | |
979 | case 0x0302: // circumflex can look like perispomeni | |
980 | case 0x0303: // tilde can look like perispomeni | |
981 | case 0x0311: // inverted breve can look like perispomeni | |
982 | return HAS_ACCENT; | |
983 | case 0x0308: // dialytika = diaeresis | |
984 | return HAS_COMBINING_DIALYTIKA; | |
985 | case 0x0344: // dialytika tonos | |
986 | return HAS_COMBINING_DIALYTIKA | HAS_ACCENT; | |
987 | case 0x0345: // ypogegrammeni = iota subscript | |
988 | return HAS_YPOGEGRAMMENI; | |
989 | case 0x0304: // macron | |
990 | case 0x0306: // breve | |
991 | case 0x0313: // comma above | |
992 | case 0x0314: // reversed comma above | |
993 | case 0x0343: // koronis | |
994 | return HAS_OTHER_GREEK_DIACRITIC; | |
995 | default: | |
996 | return 0; | |
374ca955 | 997 | } |
f3c0d7a5 A |
998 | } |
999 | ||
1000 | UBool isFollowedByCasedLetter(const UChar *s, int32_t i, int32_t length) { | |
1001 | while (i < length) { | |
1002 | UChar32 c; | |
1003 | U16_NEXT(s, i, length, c); | |
1004 | int32_t type = ucase_getTypeOrIgnorable(c); | |
1005 | if ((type & UCASE_IGNORABLE) != 0) { | |
1006 | // Case-ignorable, continue with the loop. | |
1007 | } else if (type != UCASE_NONE) { | |
1008 | return TRUE; // Followed by cased letter. | |
1009 | } else { | |
1010 | return FALSE; // Uncased and not case-ignorable. | |
1011 | } | |
1012 | } | |
1013 | return FALSE; // Not followed by cased letter. | |
1014 | } | |
1015 | ||
1016 | /** | |
1017 | * Greek string uppercasing with a state machine. | |
1018 | * Probably simpler than a stateless function that has to figure out complex context-before | |
1019 | * for each character. | |
1020 | * TODO: Try to re-consolidate one way or another with the non-Greek function. | |
*
* Each iteration handles one code point. If getLetterData() has data for it, the
* Greek diacritics recognized by getDiacriticData() that directly follow it are
* consumed together with it. Any other code point goes through ucase_toFullUpper()
* with UCASE_LOC_GREEK.
*
* @param options      bit set; U_OMIT_UNCHANGED_TEXT is tested here, and the whole set
*                     is passed on to appendResult() for code points without letter data
* @param dest         output buffer
* @param destCapacity capacity of dest, passed to the append helpers
* @param src          input text
* @param srcLength    number of UChars in src (used directly as the loop limit)
* @param edits        may be nullptr; otherwise receives addReplace()/addUnchanged() records
* @param errorCode    set to U_INDEX_OUTOFBOUNDS_ERROR when an append helper returns a
*                     negative index
* @return the final destination index, or 0 after an append failure
1021 | */ | |
1022 | int32_t toUpper(uint32_t options, | |
1023 | UChar *dest, int32_t destCapacity, | |
1024 | const UChar *src, int32_t srcLength, | |
1025 | Edits *edits, | |
1026 | UErrorCode &errorCode) { | |
1027 | int32_t destIndex=0; | |
// state carries the AFTER_CASED / AFTER_VOWEL_WITH_ACCENT flags from the previous iteration.
1028 | uint32_t state = 0; | |
1029 | for (int32_t i = 0; i < srcLength;) { | |
1030 | int32_t nextIndex = i; | |
1031 | UChar32 c; | |
1032 | U16_NEXT(src, nextIndex, srcLength, c); | |
1033 | uint32_t nextState = 0; | |
1034 | int32_t type = ucase_getTypeOrIgnorable(c); | |
1035 | if ((type & UCASE_IGNORABLE) != 0) { | |
1036 | // c is case-ignorable | |
1037 | nextState |= (state & AFTER_CASED); | |
1038 | } else if (type != UCASE_NONE) { | |
1039 | // c is cased | |
1040 | nextState |= AFTER_CASED; | |
1041 | } | |
1042 | uint32_t data = getLetterData(c); | |
1043 | if (data > 0) { | |
1044 | uint32_t upper = data & UPPER_MASK; | |
1045 | // Add a dialytika to this iota or ypsilon vowel | |
1046 | // if we removed a tonos from the previous vowel, | |
1047 | // and that previous vowel did not also have (or gain) a dialytika. | |
1048 | // Adding one only to the final vowel in a longer sequence | |
1049 | // (which does not occur in normal writing) would require lookahead. | |
1050 | // Set the same flag as for preserving an existing dialytika. | |
1051 | if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 && | |
1052 | (upper == 0x399 || upper == 0x3A5)) { | |
1053 | data |= HAS_DIALYTIKA; | |
1054 | } | |
1055 | int32_t numYpogegrammeni = 0; // Map each one to a trailing, spacing, capital iota. | |
1056 | if ((data & HAS_YPOGEGRAMMENI) != 0) { | |
1057 | numYpogegrammeni = 1; | |
1058 | } | |
1059 | // Skip combining diacritics after this Greek letter. | |
// The flags of every consumed diacritic are merged into data.
1060 | while (nextIndex < srcLength) { | |
1061 | uint32_t diacriticData = getDiacriticData(src[nextIndex]); | |
1062 | if (diacriticData != 0) { | |
1063 | data |= diacriticData; | |
1064 | if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) { | |
1065 | ++numYpogegrammeni; | |
1066 | } | |
1067 | ++nextIndex; | |
1068 | } else { | |
1069 | break; // not a Greek diacritic | |
1070 | } | |
1071 | } | |
// Remember an accented vowel without dialytika so the next iteration can add one (see above).
1072 | if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) { | |
1073 | nextState |= AFTER_VOWEL_WITH_ACCENT; | |
1074 | } | |
1075 | // Map according to Greek rules. | |
1076 | UBool addTonos = FALSE; | |
1077 | if (upper == 0x397 && | |
1078 | (data & HAS_ACCENT) != 0 && | |
1079 | numYpogegrammeni == 0 && | |
1080 | (state & AFTER_CASED) == 0 && | |
1081 | !isFollowedByCasedLetter(src, nextIndex, srcLength)) { | |
1082 | // Keep disjunctive "or" with (only) a tonos. | |
1083 | // We use the same "word boundary" conditions as for the Final_Sigma test. | |
// NOTE(review): nextIndex was already advanced past c by U16_NEXT above, so
// i == nextIndex looks like it can never be true here and the U+0389 branch
// appears unreachable (the tonos is always appended as U+0301) - confirm intent.
1084 | if (i == nextIndex) { | |
1085 | upper = 0x389; // Preserve the precomposed form. | |
1086 | } else { | |
1087 | addTonos = TRUE; | |
1088 | } | |
1089 | } else if ((data & HAS_DIALYTIKA) != 0) { | |
1090 | // Preserve a vowel with dialytika in precomposed form if it exists. | |
1091 | if (upper == 0x399) { | |
1092 | upper = 0x3AA; | |
1093 | data &= ~HAS_EITHER_DIALYTIKA; | |
1094 | } else if (upper == 0x3A5) { | |
1095 | upper = 0x3AB; | |
1096 | data &= ~HAS_EITHER_DIALYTIKA; | |
1097 | } | |
1098 | } | |
1099 | ||
0f5d89e8 A |
1100 | UBool change; |
1101 | if (edits == nullptr && (options & U_OMIT_UNCHANGED_TEXT) == 0) { | |
1102 | change = TRUE; // common, simple usage | |
1103 | } else { | |
f3c0d7a5 A |
1104 | // Find out first whether we are changing the text. |
// i2 walks the source positions where the output's U+0308 / U+0301 would have to
// sit already for the text to count as unchanged.
1105 | change = src[i] != upper || numYpogegrammeni > 0; | |
1106 | int32_t i2 = i + 1; | |
1107 | if ((data & HAS_EITHER_DIALYTIKA) != 0) { | |
1108 | change |= i2 >= nextIndex || src[i2] != 0x308; | |
1109 | ++i2; | |
1110 | } | |
1111 | if (addTonos) { | |
1112 | change |= i2 >= nextIndex || src[i2] != 0x301; | |
1113 | ++i2; | |
1114 | } | |
1115 | int32_t oldLength = nextIndex - i; | |
1116 | int32_t newLength = (i2 - i) + numYpogegrammeni; | |
1117 | change |= oldLength != newLength; | |
1118 | if (change) { | |
1119 | if (edits != NULL) { | |
1120 | edits->addReplace(oldLength, newLength); | |
1121 | } | |
1122 | } else { | |
1123 | if (edits != NULL) { | |
1124 | edits->addUnchanged(oldLength); | |
1125 | } | |
1126 | // Write unchanged text? | |
0f5d89e8 | 1127 | change = (options & U_OMIT_UNCHANGED_TEXT) == 0; |
f3c0d7a5 A |
1128 | } |
1129 | } | |
1130 | ||
// From here on, "change" means "write this letter to dest".
1131 | if (change) { | |
1132 | destIndex=appendUChar(dest, destIndex, destCapacity, (UChar)upper); | |
1133 | if (destIndex >= 0 && (data & HAS_EITHER_DIALYTIKA) != 0) { | |
1134 | destIndex=appendUChar(dest, destIndex, destCapacity, 0x308); // restore or add a dialytika | |
1135 | } | |
1136 | if (destIndex >= 0 && addTonos) { | |
1137 | destIndex=appendUChar(dest, destIndex, destCapacity, 0x301); | |
1138 | } | |
1139 | while (destIndex >= 0 && numYpogegrammeni > 0) { | |
1140 | destIndex=appendUChar(dest, destIndex, destCapacity, 0x399); | |
1141 | --numYpogegrammeni; | |
1142 | } | |
1143 | if(destIndex<0) { | |
1144 | errorCode=U_INDEX_OUTOFBOUNDS_ERROR; | |
1145 | return 0; | |
1146 | } | |
1147 | } | |
1148 | } else { | |
// No letter data for c: use the generic full uppercase mapping with the Greek locale.
1149 | const UChar *s; | |
1150 | c=ucase_toFullUpper(c, NULL, NULL, &s, UCASE_LOC_GREEK); | |
1151 | destIndex = appendResult(dest, destIndex, destCapacity, c, s, | |
1152 | nextIndex - i, options, edits); | |
1153 | if (destIndex < 0) { | |
1154 | errorCode = U_INDEX_OUTOFBOUNDS_ERROR; | |
1155 | return 0; | |
1156 | } | |
1157 | } | |
1158 | i = nextIndex; | |
1159 | state = nextState; | |
1160 | } | |
1161 | ||
b75a7d8f A |
1162 | return destIndex; |
1163 | } | |
1164 | ||
f3c0d7a5 A |
1165 | } // namespace GreekUpper |
1166 | U_NAMESPACE_END | |
46f4442e A |
1167 | |
1168 | /* functions available in the common library (for unistr_case.cpp) */ | |
1169 | ||
4388f060 | 1170 | U_CFUNC int32_t U_CALLCONV |
f3c0d7a5 | 1171 | ustrcase_internalToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED |
4388f060 A |
1172 | UChar *dest, int32_t destCapacity, |
1173 | const UChar *src, int32_t srcLength, | |
f3c0d7a5 A |
1174 | icu::Edits *edits, |
1175 | UErrorCode &errorCode) { | |
4388f060 | 1176 | UCaseContext csc=UCASECONTEXT_INITIALIZER; |
374ca955 A |
1177 | csc.p=(void *)src; |
1178 | csc.limit=srcLength; | |
0f5d89e8 A |
1179 | int32_t destIndex = toLower( |
1180 | caseLocale, options, | |
4388f060 A |
1181 | dest, destCapacity, |
1182 | src, &csc, 0, srcLength, | |
f3c0d7a5 A |
1183 | edits, errorCode); |
1184 | return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode); | |
374ca955 A |
1185 | } |
1186 | ||
4388f060 | 1187 | U_CFUNC int32_t U_CALLCONV |
f3c0d7a5 | 1188 | ustrcase_internalToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED |
4388f060 A |
1189 | UChar *dest, int32_t destCapacity, |
1190 | const UChar *src, int32_t srcLength, | |
f3c0d7a5 A |
1191 | icu::Edits *edits, |
1192 | UErrorCode &errorCode) { | |
1193 | int32_t destIndex; | |
1194 | if (caseLocale == UCASE_LOC_GREEK) { | |
1195 | destIndex = GreekUpper::toUpper(options, dest, destCapacity, | |
1196 | src, srcLength, edits, errorCode); | |
1197 | } else { | |
1198 | UCaseContext csc=UCASECONTEXT_INITIALIZER; | |
1199 | csc.p=(void *)src; | |
1200 | csc.limit=srcLength; | |
0f5d89e8 A |
1201 | destIndex = toUpper( |
1202 | caseLocale, options, | |
f3c0d7a5 | 1203 | dest, destCapacity, |
0f5d89e8 | 1204 | src, &csc, srcLength, |
f3c0d7a5 A |
1205 | edits, errorCode); |
1206 | } | |
1207 | return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode); | |
374ca955 A |
1208 | } |
1209 | ||
f3c0d7a5 A |
1210 | U_CFUNC int32_t U_CALLCONV |
1211 | ustrcase_internalFold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED | |
1212 | UChar *dest, int32_t destCapacity, | |
1213 | const UChar *src, int32_t srcLength, | |
1214 | icu::Edits *edits, | |
1215 | UErrorCode &errorCode) { | |
0f5d89e8 A |
1216 | int32_t destIndex = toLower( |
1217 | -1, options, | |
1218 | dest, destCapacity, | |
1219 | src, nullptr, 0, srcLength, | |
1220 | edits, errorCode); | |
f3c0d7a5 | 1221 | return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode); |
4388f060 | 1222 | } |
374ca955 | 1223 | |
4388f060 | 1224 | U_CFUNC int32_t |
f3c0d7a5 | 1225 | ustrcase_map(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM |
4388f060 A |
1226 | UChar *dest, int32_t destCapacity, |
1227 | const UChar *src, int32_t srcLength, | |
1228 | UStringCaseMapper *stringCaseMapper, | |
f3c0d7a5 A |
1229 | icu::Edits *edits, |
1230 | UErrorCode &errorCode) { | |
1231 | int32_t destLength; | |
1232 | ||
1233 | /* check argument values */ | |
1234 | if(U_FAILURE(errorCode)) { | |
1235 | return 0; | |
1236 | } | |
1237 | if( destCapacity<0 || | |
1238 | (dest==NULL && destCapacity>0) || | |
1239 | src==NULL || | |
1240 | srcLength<-1 | |
1241 | ) { | |
1242 | errorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
1243 | return 0; | |
1244 | } | |
1245 | ||
1246 | /* get the string length */ | |
1247 | if(srcLength==-1) { | |
1248 | srcLength=u_strlen(src); | |
1249 | } | |
1250 | ||
1251 | /* check for overlapping source and destination */ | |
1252 | if( dest!=NULL && | |
1253 | ((src>=dest && src<(dest+destCapacity)) || | |
1254 | (dest>=src && dest<(src+srcLength))) | |
1255 | ) { | |
1256 | errorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
1257 | return 0; | |
1258 | } | |
1259 | ||
0f5d89e8 | 1260 | if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) { |
f3c0d7a5 A |
1261 | edits->reset(); |
1262 | } | |
1263 | destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR | |
1264 | dest, destCapacity, src, srcLength, edits, errorCode); | |
1265 | return u_terminateUChars(dest, destCapacity, destLength, &errorCode); | |
1266 | } | |
1267 | ||
1268 | U_CFUNC int32_t | |
1269 | ustrcase_mapWithOverlap(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM | |
1270 | UChar *dest, int32_t destCapacity, | |
1271 | const UChar *src, int32_t srcLength, | |
1272 | UStringCaseMapper *stringCaseMapper, | |
1273 | UErrorCode &errorCode) { | |
b75a7d8f A |
1274 | UChar buffer[300]; |
1275 | UChar *temp; | |
374ca955 | 1276 | |
b75a7d8f | 1277 | int32_t destLength; |
b75a7d8f A |
1278 | |
1279 | /* check argument values */ | |
f3c0d7a5 | 1280 | if(U_FAILURE(errorCode)) { |
b75a7d8f A |
1281 | return 0; |
1282 | } | |
1283 | if( destCapacity<0 || | |
1284 | (dest==NULL && destCapacity>0) || | |
1285 | src==NULL || | |
1286 | srcLength<-1 | |
1287 | ) { | |
f3c0d7a5 | 1288 | errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
b75a7d8f A |
1289 | return 0; |
1290 | } | |
1291 | ||
1292 | /* get the string length */ | |
1293 | if(srcLength==-1) { | |
1294 | srcLength=u_strlen(src); | |
1295 | } | |
1296 | ||
1297 | /* check for overlapping source and destination */ | |
1298 | if( dest!=NULL && | |
1299 | ((src>=dest && src<(dest+destCapacity)) || | |
1300 | (dest>=src && dest<(src+srcLength))) | |
1301 | ) { | |
1302 | /* overlap: provide a temporary destination buffer and later copy the result */ | |
b331163b | 1303 | if(destCapacity<=UPRV_LENGTHOF(buffer)) { |
b75a7d8f A |
1304 | /* the stack buffer is large enough */ |
1305 | temp=buffer; | |
1306 | } else { | |
1307 | /* allocate a buffer */ | |
1308 | temp=(UChar *)uprv_malloc(destCapacity*U_SIZEOF_UCHAR); | |
1309 | if(temp==NULL) { | |
f3c0d7a5 | 1310 | errorCode=U_MEMORY_ALLOCATION_ERROR; |
b75a7d8f A |
1311 | return 0; |
1312 | } | |
1313 | } | |
1314 | } else { | |
1315 | temp=dest; | |
1316 | } | |
1317 | ||
f3c0d7a5 A |
1318 | destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR |
1319 | temp, destCapacity, src, srcLength, NULL, errorCode); | |
b75a7d8f A |
1320 | if(temp!=dest) { |
1321 | /* copy the result string to the destination buffer */ | |
f3c0d7a5 A |
1322 | if (U_SUCCESS(errorCode) && 0 < destLength && destLength <= destCapacity) { |
1323 | u_memmove(dest, temp, destLength); | |
b75a7d8f A |
1324 | } |
1325 | if(temp!=buffer) { | |
1326 | uprv_free(temp); | |
1327 | } | |
1328 | } | |
1329 | ||
f3c0d7a5 | 1330 | return u_terminateUChars(dest, destCapacity, destLength, &errorCode); |
b75a7d8f A |
1331 | } |
1332 | ||
374ca955 A |
1333 | /* public API functions */ |
1334 | ||
b75a7d8f A |
1335 | U_CAPI int32_t U_EXPORT2 |
1336 | u_strFoldCase(UChar *dest, int32_t destCapacity, | |
1337 | const UChar *src, int32_t srcLength, | |
1338 | uint32_t options, | |
1339 | UErrorCode *pErrorCode) { | |
f3c0d7a5 A |
1340 | return ustrcase_mapWithOverlap( |
1341 | UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL | |
1342 | dest, destCapacity, | |
1343 | src, srcLength, | |
1344 | ustrcase_internalFold, *pErrorCode); | |
1345 | } | |
1346 | ||
1347 | U_NAMESPACE_BEGIN | |
1348 | ||
1349 | int32_t CaseMap::fold( | |
1350 | uint32_t options, | |
1351 | const UChar *src, int32_t srcLength, | |
1352 | UChar *dest, int32_t destCapacity, Edits *edits, | |
1353 | UErrorCode &errorCode) { | |
4388f060 | 1354 | return ustrcase_map( |
f3c0d7a5 | 1355 | UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL |
4388f060 A |
1356 | dest, destCapacity, |
1357 | src, srcLength, | |
f3c0d7a5 | 1358 | ustrcase_internalFold, edits, errorCode); |
374ca955 A |
1359 | } |
1360 | ||
f3c0d7a5 A |
1361 | U_NAMESPACE_END |
1362 | ||
374ca955 A |
1363 | /* case-insensitive string comparisons -------------------------------------- */ |
1364 | ||
1365 | /* | |
1366 | * This function is a copy of unorm_cmpEquivFold() minus the parts for | |
1367 | * canonical equivalence. | |
1368 | * Keep the functions in sync, and see there for how this works. | |
1369 | * The duplication is for modularization: | |
1370 | * It makes caseless (but not canonical caseless) matches independent of | |
1371 | * the normalization code. | |
1372 | */ | |
1373 | ||
1374 | /* stack element for previous-level source/decomposition pointers */ | |
1375 | struct CmpEquivLevel { | |
1376 | const UChar *start, *s, *limit; | |
1377 | }; | |
1378 | typedef struct CmpEquivLevel CmpEquivLevel; | |
1379 | ||
b331163b A |
1380 | /** | |
1381 | * Internal implementation code comparing string with case fold. | |
1382 | * This function is called from u_strcmpFold() and u_caseInsensitivePrefixMatch(). | |
*
* Both strings are read one code unit at a time. When the units differ, the code
* point on either side is replaced by its full case folding (one level deep, using
* the fold1/fold2 buffers) and the comparison continues from the folded text.
1383 | * | |
1384 | * @param s1 input string 1 | |
1385 | * @param length1 length of string 1, or -1 (NUL-terminated) | |
1386 | * @param s2 input string 2 | |
1387 | * @param length2 length of string 2, or -1 (NUL-terminated) | |
1388 | * @param options compare options | |
1389 | * @param matchLen1 (output) length of partial prefix match in s1; may be NULL | |
1390 | * @param matchLen2 (output) length of partial prefix match in s2; must be non-NULL | |
*                  whenever matchLen1 is non-NULL (asserted below) | |
1391 | * @param pErrorCode receives error status; if it already indicates failure, 0 is returned | |
1392 | * @return The result of comparison: 0 if both strings end together, -1 if string 1 | |
*         ends first, 1 if string 2 ends first, otherwise the difference c1-c2 | |
*         of the first differing code units (after the optional surrogate fix-up) | |
1393 | */ | |
1394 | static int32_t _cmpFold( | |
1395 | const UChar *s1, int32_t length1, | |
1396 | const UChar *s2, int32_t length2, | |
1397 | uint32_t options, | |
1398 | int32_t *matchLen1, int32_t *matchLen2, | |
1399 | UErrorCode *pErrorCode) { | |
1400 | int32_t cmpRes = 0; | |
1401 | ||
374ca955 A |
1402 | /* current-level start/limit - s1/s2 as current */ |
1403 | const UChar *start1, *start2, *limit1, *limit2; | |
1404 | ||
b331163b A |
1405 | /* points to the original start address */ |
1406 | const UChar *org1, *org2; | |
1407 | ||
1408 | /* points to the end of match + 1 */ | |
1409 | const UChar *m1, *m2; | |
1410 | ||
374ca955 A |
1411 | /* case folding variables */ |
1412 | const UChar *p; | |
1413 | int32_t length; | |
1414 | ||
1415 | /* stacks of previous-level start/current/limit */ | |
1416 | CmpEquivLevel stack1[2], stack2[2]; | |
1417 | ||
1418 | /* case folding buffers, only use current-level start/limit */ | |
1419 | UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1]; | |
1420 | ||
1421 | /* track which is the current level per string */ | |
1422 | int32_t level1, level2; | |
1423 | ||
1424 | /* current code units, and code points for lookups */ | |
1425 | UChar32 c1, c2, cp1, cp2; | |
1426 | ||
1427 | /* no argument error checking because this itself is not an API */ | |
1428 | ||
1429 | /* | |
1430 | * assume that at least the option U_COMPARE_IGNORE_CASE is set | |
1431 | * otherwise this function would have to behave exactly as uprv_strCompare() | |
1432 | */ | |
374ca955 A |
1433 | if(U_FAILURE(*pErrorCode)) { |
1434 | return 0; | |
1435 | } | |
1436 | ||
1437 | /* initialize */ | |
b331163b A |
1438 | if(matchLen1) { |
1439 | U_ASSERT(matchLen2 !=NULL); | |
1440 | *matchLen1=0; | |
1441 | *matchLen2=0; | |
1442 | } | |
1443 | ||
1444 | start1=m1=org1=s1; | |
374ca955 A |
1445 | if(length1==-1) { |
1446 | limit1=NULL; | |
1447 | } else { | |
1448 | limit1=s1+length1; | |
1449 | } | |
1450 | ||
b331163b | 1451 | start2=m2=org2=s2; |
374ca955 A |
1452 | if(length2==-1) { |
1453 | limit2=NULL; | |
1454 | } else { | |
1455 | limit2=s2+length2; | |
1456 | } | |
1457 | ||
1458 | level1=level2=0; | |
1459 | c1=c2=-1; | |
1460 | ||
1461 | /* comparison loop */ | |
1462 | for(;;) { | |
1463 | /* | |
1464 | * here a code unit value of -1 means "get another code unit" | |
1465 | * below it will mean "this source is finished" | |
1466 | */ | |
1467 | ||
1468 | if(c1<0) { | |
1469 | /* get next code unit from string 1, post-increment */ | |
1470 | for(;;) { | |
1471 | if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) { | |
1472 | if(level1==0) { | |
1473 | c1=-1; | |
1474 | break; | |
1475 | } | |
1476 | } else { | |
1477 | ++s1; | |
1478 | break; | |
1479 | } | |
1480 | ||
1481 | /* reached end of level buffer, pop one level */ | |
1482 | do { | |
1483 | --level1; | |
4388f060 | 1484 | start1=stack1[level1].start; /*Not uninitialized*/ |
374ca955 | 1485 | } while(start1==NULL); |
4388f060 A |
1486 | s1=stack1[level1].s; /*Not uninitialized*/ |
1487 | limit1=stack1[level1].limit; /*Not uninitialized*/ | |
374ca955 A |
1488 | } |
1489 | } | |
1490 | ||
1491 | if(c2<0) { | |
1492 | /* get next code unit from string 2, post-increment */ | |
1493 | for(;;) { | |
1494 | if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) { | |
1495 | if(level2==0) { | |
1496 | c2=-1; | |
1497 | break; | |
1498 | } | |
1499 | } else { | |
1500 | ++s2; | |
1501 | break; | |
1502 | } | |
1503 | ||
1504 | /* reached end of level buffer, pop one level */ | |
1505 | do { | |
1506 | --level2; | |
4388f060 | 1507 | start2=stack2[level2].start; /*Not uninitialized*/ |
374ca955 | 1508 | } while(start2==NULL); |
4388f060 A |
1509 | s2=stack2[level2].s; /*Not uninitialized*/ |
1510 | limit2=stack2[level2].limit; /*Not uninitialized*/ | |
374ca955 A |
1511 | } |
1512 | } | |
1513 | ||
1514 | /* | |
1515 | * compare c1 and c2 | |
1516 | * either variable c1, c2 is -1 only if the corresponding string is finished | |
1517 | */ | |
1518 | if(c1==c2) { | |
b331163b A |
1519 | const UChar *next1, *next2; |
1520 | ||
374ca955 | 1521 | if(c1<0) { |
b331163b A |
1522 | cmpRes=0; /* c1==c2==-1 indicating end of strings */ |
1523 | break; | |
1524 | } | |
1525 | ||
1526 | /* | |
1527 | * Note: Move the match positions in both strings at the same time | |
1528 | * only when corresponding code point(s) in the original strings | |
1529 | * are fully consumed. For example, when comparing s1="Fust" and | |
1530 | * s2="Fu\u00dfball", s2[2] is folded into "ss", and s1[2] matches | |
1531 | * the first code point in the case-folded data. But the second "s" | |
1532 | * has no matching code point in s1, so this implementation returns | |
1533 | * 2 as the prefix match length ("Fu"). | |
1534 | */ | |
1535 | next1=next2=NULL; | |
1536 | if(level1==0) { | |
1537 | next1=s1; | |
1538 | } else if(s1==limit1) { | |
1539 | /* Note: This implementation only use a single level of stack. | |
1540 | * If this code needs to be changed to use multiple levels | |
1541 | * of stacks, the code above should check if the current | |
1542 | * code is at the end of all stacks. | |
1543 | */ | |
1544 | U_ASSERT(level1==1); | |
1545 | ||
1546 | /* is s1 at the end of the current stack? */ | |
1547 | next1=stack1[0].s; | |
1548 | } | |
1549 | ||
1550 | if (next1!=NULL) { | |
1551 | if(level2==0) { | |
1552 | next2=s2; | |
1553 | } else if(s2==limit2) { | |
1554 | U_ASSERT(level2==1); | |
1555 | ||
1556 | /* is s2 at the end of the current stack? */ | |
1557 | next2=stack2[0].s; | |
1558 | } | |
1559 | if(next2!=NULL) { | |
1560 | m1=next1; | |
1561 | m2=next2; | |
1562 | } | |
374ca955 A |
1563 | } |
1564 | c1=c2=-1; /* make us fetch new code units */ | |
1565 | continue; | |
1566 | } else if(c1<0) { | |
b331163b A |
1567 | cmpRes=-1; /* string 1 ends before string 2 */ |
1568 | break; | |
374ca955 | 1569 | } else if(c2<0) { |
b331163b A |
1570 | cmpRes=1; /* string 2 ends before string 1 */ |
1571 | break; | |
374ca955 A |
1572 | } |
1573 | /* c1!=c2 && c1>=0 && c2>=0 */ | |
1574 | ||
1575 | /* get complete code points for c1, c2 for lookups if either is a surrogate */ | |
1576 | cp1=c1; | |
1577 | if(U_IS_SURROGATE(c1)) { | |
1578 | UChar c; | |
1579 | ||
1580 | if(U_IS_SURROGATE_LEAD(c1)) { | |
1581 | if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) { | |
1582 | /* advance ++s1; only below if cp1 decomposes/case-folds */ | |
1583 | cp1=U16_GET_SUPPLEMENTARY(c1, c); | |
1584 | } | |
1585 | } else /* isTrail(c1) */ { | |
1586 | if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) { | |
1587 | cp1=U16_GET_SUPPLEMENTARY(c, c1); | |
1588 | } | |
1589 | } | |
1590 | } | |
1591 | ||
1592 | cp2=c2; | |
1593 | if(U_IS_SURROGATE(c2)) { | |
1594 | UChar c; | |
1595 | ||
1596 | if(U_IS_SURROGATE_LEAD(c2)) { | |
1597 | if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) { | |
1598 | /* advance ++s2; only below if cp2 decomposes/case-folds */ | |
1599 | cp2=U16_GET_SUPPLEMENTARY(c2, c); | |
1600 | } | |
1601 | } else /* isTrail(c2) */ { | |
1602 | if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) { | |
1603 | cp2=U16_GET_SUPPLEMENTARY(c, c2); | |
1604 | } | |
1605 | } | |
1606 | } | |
1607 | ||
1608 | /* | |
1609 | * go down one level for each string | |
1610 | * continue with the main loop as soon as there is a real change | |
1611 | */ | |
1612 | ||
1613 | if( level1==0 && | |
f3c0d7a5 | 1614 | (length=ucase_toFullFolding((UChar32)cp1, &p, options))>=0 |
374ca955 A |
1615 | ) { |
1616 | /* cp1 case-folds to the code point "length" or to p[length] */ | |
1617 | if(U_IS_SURROGATE(c1)) { | |
1618 | if(U_IS_SURROGATE_LEAD(c1)) { | |
1619 | /* advance beyond source surrogate pair if it case-folds */ | |
1620 | ++s1; | |
1621 | } else /* isTrail(c1) */ { | |
1622 | /* | |
1623 | * we got a supplementary code point when hitting its trail surrogate, | |
1624 | * therefore the lead surrogate must have been the same as in the other string; | |
1625 | * compare this decomposition with the lead surrogate in the other string | |
1626 | * remember that this simulates bulk text replacement: | |
1627 | * the decomposition would replace the entire code point | |
1628 | */ | |
1629 | --s2; | |
b331163b | 1630 | --m2; |
374ca955 A |
1631 | c2=*(s2-1); |
1632 | } | |
1633 | } | |
1634 | ||
1635 | /* push current level pointers */ | |
1636 | stack1[0].start=start1; | |
1637 | stack1[0].s=s1; | |
1638 | stack1[0].limit=limit1; | |
1639 | ++level1; | |
1640 | ||
1641 | /* copy the folding result to fold1[] */ | |
1642 | if(length<=UCASE_MAX_STRING_LENGTH) { | |
1643 | u_memcpy(fold1, p, length); | |
1644 | } else { | |
1645 | int32_t i=0; | |
1646 | U16_APPEND_UNSAFE(fold1, i, length); | |
1647 | length=i; | |
1648 | } | |
1649 | ||
1650 | /* set next level pointers to case folding */ | |
1651 | start1=s1=fold1; | |
1652 | limit1=fold1+length; | |
1653 | ||
1654 | /* get ready to read from decomposition, continue with loop */ | |
1655 | c1=-1; | |
1656 | continue; | |
1657 | } | |
1658 | ||
1659 | if( level2==0 && | |
f3c0d7a5 | 1660 | (length=ucase_toFullFolding((UChar32)cp2, &p, options))>=0 |
374ca955 A |
1661 | ) { |
1662 | /* cp2 case-folds to the code point "length" or to p[length] */ | |
1663 | if(U_IS_SURROGATE(c2)) { | |
1664 | if(U_IS_SURROGATE_LEAD(c2)) { | |
1665 | /* advance beyond source surrogate pair if it case-folds */ | |
1666 | ++s2; | |
1667 | } else /* isTrail(c2) */ { | |
1668 | /* | |
1669 | * we got a supplementary code point when hitting its trail surrogate, | |
1670 | * therefore the lead surrogate must have been the same as in the other string; | |
1671 | * compare this decomposition with the lead surrogate in the other string | |
1672 | * remember that this simulates bulk text replacement: | |
1673 | * the decomposition would replace the entire code point | |
1674 | */ | |
1675 | --s1; | |
// NOTE(review): this branch backs up s1 but decrements m2, exactly as the
// string-1 branch above does (which backs up s2). Confirm whether m1 was
// intended here; it affects only the matchLen1/matchLen2 outputs.
b331163b | 1676 | --m2; |
374ca955 A |
1677 | c1=*(s1-1); |
1678 | } | |
1679 | } | |
1680 | ||
1681 | /* push current level pointers */ | |
1682 | stack2[0].start=start2; | |
1683 | stack2[0].s=s2; | |
1684 | stack2[0].limit=limit2; | |
1685 | ++level2; | |
1686 | ||
1687 | /* copy the folding result to fold2[] */ | |
1688 | if(length<=UCASE_MAX_STRING_LENGTH) { | |
1689 | u_memcpy(fold2, p, length); | |
1690 | } else { | |
1691 | int32_t i=0; | |
1692 | U16_APPEND_UNSAFE(fold2, i, length); | |
1693 | length=i; | |
1694 | } | |
1695 | ||
1696 | /* set next level pointers to case folding */ | |
1697 | start2=s2=fold2; | |
1698 | limit2=fold2+length; | |
1699 | ||
1700 | /* get ready to read from decomposition, continue with loop */ | |
1701 | c2=-1; | |
1702 | continue; | |
1703 | } | |
1704 | ||
1705 | /* | |
1706 | * no decomposition/case folding, max level for both sides: | |
1707 | * return difference result | |
1708 | * | |
1709 | * code point order comparison must not just return cp1-cp2 | |
1710 | * because when single surrogates are present then the surrogate pairs | |
1711 | * that formed cp1 and cp2 may be from different string indexes | |
1712 | * | |
1713 | * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units | |
1714 | * c1=d800 cp1=10001 c2=dc00 cp2=10000 | |
1715 | * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 } | |
1716 | * | |
1717 | * therefore, use same fix-up as in ustring.c/uprv_strCompare() | |
1718 | * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++ | |
1719 | * so we have slightly different pointer/start/limit comparisons here | |
1720 | */ | |
1721 | ||
1722 | if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) { | |
1723 | /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ | |
1724 | if( | |
1725 | (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) || | |
1726 | (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2))) | |
1727 | ) { | |
1728 | /* part of a surrogate pair, leave >=d800 */ | |
1729 | } else { | |
1730 | /* BMP code point - may be surrogate code point - make <d800 */ | |
1731 | c1-=0x2800; | |
1732 | } | |
1733 | ||
1734 | if( | |
1735 | (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) || | |
1736 | (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2))) | |
1737 | ) { | |
1738 | /* part of a surrogate pair, leave >=d800 */ | |
1739 | } else { | |
1740 | /* BMP code point - may be surrogate code point - make <d800 */ | |
1741 | c2-=0x2800; | |
1742 | } | |
1743 | } | |
1744 | ||
b331163b A |
1745 | cmpRes=c1-c2; |
1746 | break; | |
374ca955 | 1747 | } |
b331163b A |
1748 | |
// Report how far the two original strings matched, as offsets from their start addresses.
1749 | if(matchLen1) { | |
1750 | *matchLen1=m1-org1; | |
1751 | *matchLen2=m2-org2; | |
1752 | } | |
1753 | return cmpRes; | |
1754 | } | |
1755 | ||
1756 | /* internal function */ | |
1757 | U_CFUNC int32_t | |
1758 | u_strcmpFold(const UChar *s1, int32_t length1, | |
1759 | const UChar *s2, int32_t length2, | |
1760 | uint32_t options, | |
1761 | UErrorCode *pErrorCode) { | |
1762 | return _cmpFold(s1, length1, s2, length2, options, NULL, NULL, pErrorCode); | |
b75a7d8f A |
1763 | } |
1764 | ||
374ca955 | 1765 | /* public API functions */ |
b75a7d8f A |
1766 | |
1767 | U_CAPI int32_t U_EXPORT2 | |
1768 | u_strCaseCompare(const UChar *s1, int32_t length1, | |
1769 | const UChar *s2, int32_t length2, | |
1770 | uint32_t options, | |
1771 | UErrorCode *pErrorCode) { | |
1772 | /* argument checking */ | |
1773 | if(pErrorCode==0 || U_FAILURE(*pErrorCode)) { | |
1774 | return 0; | |
1775 | } | |
1776 | if(s1==NULL || length1<-1 || s2==NULL || length2<-1) { | |
1777 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
1778 | return 0; | |
1779 | } | |
374ca955 A |
1780 | return u_strcmpFold(s1, length1, s2, length2, |
1781 | options|U_COMPARE_IGNORE_CASE, | |
1782 | pErrorCode); | |
b75a7d8f A |
1783 | } |
1784 | ||
1785 | U_CAPI int32_t U_EXPORT2 | |
1786 | u_strcasecmp(const UChar *s1, const UChar *s2, uint32_t options) { | |
1787 | UErrorCode errorCode=U_ZERO_ERROR; | |
374ca955 A |
1788 | return u_strcmpFold(s1, -1, s2, -1, |
1789 | options|U_COMPARE_IGNORE_CASE, | |
1790 | &errorCode); | |
b75a7d8f A |
1791 | } |
1792 | ||
1793 | U_CAPI int32_t U_EXPORT2 | |
1794 | u_memcasecmp(const UChar *s1, const UChar *s2, int32_t length, uint32_t options) { | |
1795 | UErrorCode errorCode=U_ZERO_ERROR; | |
374ca955 A |
1796 | return u_strcmpFold(s1, length, s2, length, |
1797 | options|U_COMPARE_IGNORE_CASE, | |
1798 | &errorCode); | |
b75a7d8f A |
1799 | } |
1800 | ||
1801 | U_CAPI int32_t U_EXPORT2 | |
1802 | u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options) { | |
1803 | UErrorCode errorCode=U_ZERO_ERROR; | |
374ca955 A |
1804 | return u_strcmpFold(s1, n, s2, n, |
1805 | options|(U_COMPARE_IGNORE_CASE|_STRNCMP_STYLE), | |
1806 | &errorCode); | |
b75a7d8f | 1807 | } |
b331163b A |
1808 | |
1809 | /* internal API - detect length of shared prefix */ | |
1810 | U_CAPI void | |
1811 | u_caseInsensitivePrefixMatch(const UChar *s1, int32_t length1, | |
1812 | const UChar *s2, int32_t length2, | |
1813 | uint32_t options, | |
1814 | int32_t *matchLen1, int32_t *matchLen2, | |
1815 | UErrorCode *pErrorCode) { | |
1816 | _cmpFold(s1, length1, s2, length2, options, | |
1817 | matchLen1, matchLen2, pErrorCode); | |
1818 | } |