]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ******************************************************************************* | |
3 | * | |
b331163b | 4 | * Copyright (C) 2001-2015, International Business Machines |
b75a7d8f A |
5 | * Corporation and others. All Rights Reserved. |
6 | * | |
7 | ******************************************************************************* | |
4388f060 | 8 | * file name: ustrcase.cpp |
b75a7d8f A |
9 | * encoding: US-ASCII |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 2002feb20 | |
14 | * created by: Markus W. Scherer | |
15 | * | |
16 | * Implementation file for string casing C API functions. | |
17 | * Uses functions from uchar.c for basic functionality that requires access | |
18 | * to the Unicode Character Database (uprops.dat). | |
19 | */ | |
20 | ||
21 | #include "unicode/utypes.h" | |
4388f060 | 22 | #include "unicode/brkiter.h" |
b75a7d8f | 23 | #include "unicode/ustring.h" |
46f4442e | 24 | #include "unicode/ucasemap.h" |
b75a7d8f | 25 | #include "unicode/ubrk.h" |
4388f060 A |
26 | #include "unicode/utf.h" |
27 | #include "unicode/utf16.h" | |
b75a7d8f | 28 | #include "cmemory.h" |
374ca955 | 29 | #include "ucase.h" |
b75a7d8f | 30 | #include "ustr_imp.h" |
b331163b | 31 | #include "uassert.h" |
4388f060 A |
32 | |
33 | U_NAMESPACE_USE | |
34 | ||
b75a7d8f A |
35 | /* string casing ------------------------------------------------------------ */ |
36 | ||
4388f060 A |
37 | /* Appends a full case mapping result, see UCASE_MAX_STRING_LENGTH. */ |
38 | static inline int32_t | |
374ca955 A |
39 | appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity, |
40 | int32_t result, const UChar *s) { | |
41 | UChar32 c; | |
42 | int32_t length; | |
43 | ||
44 | /* decode the result */ | |
45 | if(result<0) { | |
46 | /* (not) original code point */ | |
47 | c=~result; | |
a62d09fc | 48 | length=U16_LENGTH(c); |
374ca955 A |
49 | } else if(result<=UCASE_MAX_STRING_LENGTH) { |
50 | c=U_SENTINEL; | |
51 | length=result; | |
52 | } else { | |
53 | c=result; | |
a62d09fc A |
54 | length=U16_LENGTH(c); |
55 | } | |
56 | if(length>(INT32_MAX-destIndex)) { | |
57 | return -1; // integer overflow | |
374ca955 A |
58 | } |
59 | ||
60 | if(destIndex<destCapacity) { | |
61 | /* append the result */ | |
a62d09fc | 62 | if(c>=0) { |
374ca955 A |
63 | /* code point */ |
64 | UBool isError=FALSE; | |
65 | U16_APPEND(dest, destIndex, destCapacity, c, isError); | |
66 | if(isError) { | |
67 | /* overflow, nothing written */ | |
a62d09fc | 68 | destIndex+=length; |
374ca955 A |
69 | } |
70 | } else { | |
71 | /* string */ | |
72 | if((destIndex+length)<=destCapacity) { | |
73 | while(length>0) { | |
74 | dest[destIndex++]=*s++; | |
75 | --length; | |
76 | } | |
77 | } else { | |
78 | /* overflow */ | |
79 | destIndex+=length; | |
80 | } | |
81 | } | |
82 | } else { | |
83 | /* preflight */ | |
a62d09fc | 84 | destIndex+=length; |
374ca955 A |
85 | } |
86 | return destIndex; | |
87 | } | |
88 | ||
a62d09fc A |
89 | static inline int32_t |
90 | appendUChar(UChar *dest, int32_t destIndex, int32_t destCapacity, UChar c) { | |
91 | if(destIndex<destCapacity) { | |
92 | dest[destIndex]=c; | |
93 | } else if(destIndex==INT32_MAX) { | |
94 | return -1; // integer overflow | |
95 | } | |
96 | return destIndex+1; | |
97 | } | |
98 | ||
374ca955 A |
99 | static UChar32 U_CALLCONV |
100 | utf16_caseContextIterator(void *context, int8_t dir) { | |
101 | UCaseContext *csc=(UCaseContext *)context; | |
102 | UChar32 c; | |
103 | ||
104 | if(dir<0) { | |
105 | /* reset for backward iteration */ | |
106 | csc->index=csc->cpStart; | |
107 | csc->dir=dir; | |
108 | } else if(dir>0) { | |
109 | /* reset for forward iteration */ | |
110 | csc->index=csc->cpLimit; | |
111 | csc->dir=dir; | |
112 | } else { | |
113 | /* continue current iteration direction */ | |
114 | dir=csc->dir; | |
115 | } | |
116 | ||
117 | if(dir<0) { | |
118 | if(csc->start<csc->index) { | |
119 | U16_PREV((const UChar *)csc->p, csc->start, csc->index, c); | |
120 | return c; | |
121 | } | |
122 | } else { | |
123 | if(csc->index<csc->limit) { | |
124 | U16_NEXT((const UChar *)csc->p, csc->index, csc->limit, c); | |
125 | return c; | |
126 | } | |
127 | } | |
128 | return U_SENTINEL; | |
129 | } | |
130 | ||
374ca955 | 131 | /* |
73c04bcf | 132 | * Case-maps [srcStart..srcLimit[ but takes |
374ca955 A |
133 | * context [0..srcLength[ into account. |
134 | */ | |
135 | static int32_t | |
46f4442e | 136 | _caseMap(const UCaseMap *csm, UCaseMapFull *map, |
374ca955 A |
137 | UChar *dest, int32_t destCapacity, |
138 | const UChar *src, UCaseContext *csc, | |
139 | int32_t srcStart, int32_t srcLimit, | |
374ca955 A |
140 | UErrorCode *pErrorCode) { |
141 | const UChar *s; | |
729e4ab9 | 142 | UChar32 c, c2 = 0; |
374ca955 | 143 | int32_t srcIndex, destIndex; |
46f4442e A |
144 | int32_t locCache; |
145 | ||
146 | locCache=csm->locCache; | |
374ca955 A |
147 | |
148 | /* case mapping loop */ | |
149 | srcIndex=srcStart; | |
150 | destIndex=0; | |
151 | while(srcIndex<srcLimit) { | |
152 | csc->cpStart=srcIndex; | |
153 | U16_NEXT(src, srcIndex, srcLimit, c); | |
154 | csc->cpLimit=srcIndex; | |
46f4442e | 155 | c=map(csm->csp, c, utf16_caseContextIterator, csc, &s, csm->locale, &locCache); |
73c04bcf A |
156 | if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) { |
157 | /* fast path version of appendResult() for BMP results */ | |
158 | dest[destIndex++]=(UChar)c2; | |
159 | } else { | |
160 | destIndex=appendResult(dest, destIndex, destCapacity, c, s); | |
a62d09fc A |
161 | if(destIndex<0) { |
162 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; | |
163 | return 0; | |
164 | } | |
73c04bcf | 165 | } |
374ca955 A |
166 | } |
167 | ||
168 | if(destIndex>destCapacity) { | |
169 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
170 | } | |
171 | return destIndex; | |
172 | } | |
173 | ||
b75a7d8f A |
174 | #if !UCONFIG_NO_BREAK_ITERATION |
175 | ||
4388f060 A |
176 | U_CFUNC int32_t U_CALLCONV |
177 | ustrcase_internalToTitle(const UCaseMap *csm, | |
178 | UChar *dest, int32_t destCapacity, | |
179 | const UChar *src, int32_t srcLength, | |
180 | UErrorCode *pErrorCode) { | |
374ca955 | 181 | const UChar *s; |
b75a7d8f | 182 | UChar32 c; |
729e4ab9 | 183 | int32_t prev, titleStart, titleLimit, idx, destIndex, length; |
b75a7d8f A |
184 | UBool isFirstIndex; |
185 | ||
46f4442e A |
186 | if(U_FAILURE(*pErrorCode)) { |
187 | return 0; | |
188 | } | |
189 | ||
4388f060 A |
190 | // Use the C++ abstract base class to minimize dependencies. |
191 | // TODO: Change UCaseMap.iter to store a BreakIterator directly. | |
192 | BreakIterator *bi=reinterpret_cast<BreakIterator *>(csm->iter); | |
193 | ||
b75a7d8f | 194 | /* set up local variables */ |
4388f060 A |
195 | int32_t locCache=csm->locCache; |
196 | UCaseContext csc=UCASECONTEXT_INITIALIZER; | |
197 | csc.p=(void *)src; | |
198 | csc.limit=srcLength; | |
b75a7d8f A |
199 | destIndex=0; |
200 | prev=0; | |
201 | isFirstIndex=TRUE; | |
202 | ||
203 | /* titlecasing loop */ | |
204 | while(prev<srcLength) { | |
205 | /* find next index where to titlecase */ | |
206 | if(isFirstIndex) { | |
207 | isFirstIndex=FALSE; | |
4388f060 | 208 | idx=bi->first(); |
b75a7d8f | 209 | } else { |
4388f060 | 210 | idx=bi->next(); |
b75a7d8f | 211 | } |
729e4ab9 A |
212 | if(idx==UBRK_DONE || idx>srcLength) { |
213 | idx=srcLength; | |
b75a7d8f A |
214 | } |
215 | ||
73c04bcf A |
216 | /* |
217 | * Unicode 4 & 5 section 3.13 Default Case Operations: | |
218 | * | |
219 | * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex | |
220 | * #29, "Text Boundaries." Between each pair of word boundaries, find the first | |
221 | * cased character F. If F exists, map F to default_title(F); then map each | |
222 | * subsequent character C to default_lower(C). | |
223 | * | |
224 | * In this implementation, segment [prev..index[ into 3 parts: | |
225 | * a) uncased characters (copy as-is) [prev..titleStart[ | |
226 | * b) first case letter (titlecase) [titleStart..titleLimit[ | |
227 | * c) subsequent characters (lowercase) [titleLimit..index[ | |
228 | */ | |
729e4ab9 | 229 | if(prev<idx) { |
73c04bcf A |
230 | /* find and copy uncased characters [prev..titleStart[ */ |
231 | titleStart=titleLimit=prev; | |
729e4ab9 | 232 | U16_NEXT(src, titleLimit, idx, c); |
46f4442e A |
233 | if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) { |
234 | /* Adjust the titlecasing index (titleStart) to the next cased character. */ | |
235 | for(;;) { | |
236 | titleStart=titleLimit; | |
729e4ab9 | 237 | if(titleLimit==idx) { |
46f4442e A |
238 | /* |
239 | * only uncased characters in [prev..index[ | |
240 | * stop with titleStart==titleLimit==index | |
241 | */ | |
242 | break; | |
243 | } | |
729e4ab9 | 244 | U16_NEXT(src, titleLimit, idx, c); |
46f4442e A |
245 | if(UCASE_NONE!=ucase_getType(csm->csp, c)) { |
246 | break; /* cased letter at [titleStart..titleLimit[ */ | |
247 | } | |
73c04bcf | 248 | } |
46f4442e A |
249 | length=titleStart-prev; |
250 | if(length>0) { | |
251 | if((destIndex+length)<=destCapacity) { | |
a62d09fc | 252 | u_memcpy(dest+destIndex, src+prev, length); |
46f4442e A |
253 | } |
254 | destIndex+=length; | |
73c04bcf A |
255 | } |
256 | } | |
b75a7d8f | 257 | |
73c04bcf A |
258 | if(titleStart<titleLimit) { |
259 | /* titlecase c which is from [titleStart..titleLimit[ */ | |
4388f060 A |
260 | csc.cpStart=titleStart; |
261 | csc.cpLimit=titleLimit; | |
262 | c=ucase_toFullTitle(csm->csp, c, utf16_caseContextIterator, &csc, &s, csm->locale, &locCache); | |
46f4442e | 263 | destIndex=appendResult(dest, destIndex, destCapacity, c, s); |
a62d09fc A |
264 | if(destIndex<0) { |
265 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; | |
266 | return 0; | |
267 | } | |
46f4442e A |
268 | |
269 | /* Special case Dutch IJ titlecasing */ | |
a62d09fc A |
270 | if (titleStart+1 < idx && |
271 | ucase_getCaseLocale(csm->locale,&locCache) == UCASE_LOC_DUTCH && | |
272 | (src[titleStart] == 0x0049 || src[titleStart] == 0x0069) && | |
273 | (src[titleStart+1] == 0x004A || src[titleStart+1] == 0x006A)) { | |
274 | destIndex=appendUChar(dest, destIndex, destCapacity, 0x004A); | |
275 | if(destIndex<0) { | |
276 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; | |
277 | return 0; | |
278 | } | |
279 | titleLimit++; | |
46f4442e | 280 | } |
73c04bcf A |
281 | |
282 | /* lowercase [titleLimit..index[ */ | |
729e4ab9 | 283 | if(titleLimit<idx) { |
46f4442e A |
284 | if((csm->options&U_TITLECASE_NO_LOWERCASE)==0) { |
285 | /* Normal operation: Lowercase the rest of the word. */ | |
286 | destIndex+= | |
287 | _caseMap( | |
288 | csm, ucase_toFullLower, | |
289 | dest+destIndex, destCapacity-destIndex, | |
4388f060 | 290 | src, &csc, |
729e4ab9 | 291 | titleLimit, idx, |
46f4442e | 292 | pErrorCode); |
a62d09fc A |
293 | if(U_FAILURE(*pErrorCode)) { |
294 | return destIndex; | |
295 | } | |
46f4442e A |
296 | } else { |
297 | /* Optionally just copy the rest of the word unchanged. */ | |
729e4ab9 | 298 | length=idx-titleLimit; |
a62d09fc A |
299 | if(length>(INT32_MAX-destIndex)) { |
300 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; | |
301 | return 0; | |
302 | } | |
46f4442e | 303 | if((destIndex+length)<=destCapacity) { |
a62d09fc | 304 | u_memcpy(dest+destIndex, src+titleLimit, length); |
46f4442e A |
305 | } |
306 | destIndex+=length; | |
307 | } | |
73c04bcf A |
308 | } |
309 | } | |
b75a7d8f A |
310 | } |
311 | ||
729e4ab9 | 312 | prev=idx; |
b75a7d8f A |
313 | } |
314 | ||
374ca955 A |
315 | if(destIndex>destCapacity) { |
316 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
317 | } | |
b75a7d8f A |
318 | return destIndex; |
319 | } | |
320 | ||
4388f060 | 321 | #endif // !UCONFIG_NO_BREAK_ITERATION |
46f4442e A |
322 | |
323 | /* functions available in the common library (for unistr_case.cpp) */ | |
324 | ||
4388f060 A |
325 | U_CFUNC int32_t U_CALLCONV |
326 | ustrcase_internalToLower(const UCaseMap *csm, | |
327 | UChar *dest, int32_t destCapacity, | |
328 | const UChar *src, int32_t srcLength, | |
329 | UErrorCode *pErrorCode) { | |
330 | UCaseContext csc=UCASECONTEXT_INITIALIZER; | |
374ca955 A |
331 | csc.p=(void *)src; |
332 | csc.limit=srcLength; | |
4388f060 A |
333 | return _caseMap( |
334 | csm, ucase_toFullLower, | |
335 | dest, destCapacity, | |
336 | src, &csc, 0, srcLength, | |
337 | pErrorCode); | |
374ca955 A |
338 | } |
339 | ||
4388f060 A |
340 | U_CFUNC int32_t U_CALLCONV |
341 | ustrcase_internalToUpper(const UCaseMap *csm, | |
342 | UChar *dest, int32_t destCapacity, | |
343 | const UChar *src, int32_t srcLength, | |
344 | UErrorCode *pErrorCode) { | |
345 | UCaseContext csc=UCASECONTEXT_INITIALIZER; | |
374ca955 A |
346 | csc.p=(void *)src; |
347 | csc.limit=srcLength; | |
4388f060 A |
348 | return _caseMap( |
349 | csm, ucase_toFullUpper, | |
350 | dest, destCapacity, | |
351 | src, &csc, 0, srcLength, | |
352 | pErrorCode); | |
374ca955 A |
353 | } |
354 | ||
4388f060 | 355 | static int32_t |
73c04bcf | 356 | ustr_foldCase(const UCaseProps *csp, |
374ca955 A |
357 | UChar *dest, int32_t destCapacity, |
358 | const UChar *src, int32_t srcLength, | |
359 | uint32_t options, | |
360 | UErrorCode *pErrorCode) { | |
361 | int32_t srcIndex, destIndex; | |
362 | ||
363 | const UChar *s; | |
729e4ab9 | 364 | UChar32 c, c2 = 0; |
374ca955 A |
365 | |
366 | /* case mapping loop */ | |
367 | srcIndex=destIndex=0; | |
368 | while(srcIndex<srcLength) { | |
369 | U16_NEXT(src, srcIndex, srcLength, c); | |
370 | c=ucase_toFullFolding(csp, c, &s, options); | |
73c04bcf A |
371 | if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) { |
372 | /* fast path version of appendResult() for BMP results */ | |
373 | dest[destIndex++]=(UChar)c2; | |
374 | } else { | |
375 | destIndex=appendResult(dest, destIndex, destCapacity, c, s); | |
a62d09fc A |
376 | if(destIndex<0) { |
377 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; | |
378 | return 0; | |
379 | } | |
73c04bcf | 380 | } |
374ca955 A |
381 | } |
382 | ||
383 | if(destIndex>destCapacity) { | |
384 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; | |
385 | } | |
386 | return destIndex; | |
387 | } | |
388 | ||
4388f060 A |
389 | U_CFUNC int32_t U_CALLCONV |
390 | ustrcase_internalFold(const UCaseMap *csm, | |
391 | UChar *dest, int32_t destCapacity, | |
392 | const UChar *src, int32_t srcLength, | |
393 | UErrorCode *pErrorCode) { | |
394 | return ustr_foldCase(csm->csp, dest, destCapacity, src, srcLength, csm->options, pErrorCode); | |
395 | } | |
374ca955 | 396 | |
4388f060 A |
397 | U_CFUNC int32_t |
398 | ustrcase_map(const UCaseMap *csm, | |
399 | UChar *dest, int32_t destCapacity, | |
400 | const UChar *src, int32_t srcLength, | |
401 | UStringCaseMapper *stringCaseMapper, | |
402 | UErrorCode *pErrorCode) { | |
b75a7d8f A |
403 | UChar buffer[300]; |
404 | UChar *temp; | |
374ca955 | 405 | |
b75a7d8f | 406 | int32_t destLength; |
b75a7d8f A |
407 | |
408 | /* check argument values */ | |
4388f060 | 409 | if(U_FAILURE(*pErrorCode)) { |
b75a7d8f A |
410 | return 0; |
411 | } | |
412 | if( destCapacity<0 || | |
413 | (dest==NULL && destCapacity>0) || | |
414 | src==NULL || | |
415 | srcLength<-1 | |
416 | ) { | |
417 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
418 | return 0; | |
419 | } | |
420 | ||
421 | /* get the string length */ | |
422 | if(srcLength==-1) { | |
423 | srcLength=u_strlen(src); | |
424 | } | |
425 | ||
426 | /* check for overlapping source and destination */ | |
427 | if( dest!=NULL && | |
428 | ((src>=dest && src<(dest+destCapacity)) || | |
429 | (dest>=src && dest<(src+srcLength))) | |
430 | ) { | |
431 | /* overlap: provide a temporary destination buffer and later copy the result */ | |
b331163b | 432 | if(destCapacity<=UPRV_LENGTHOF(buffer)) { |
b75a7d8f A |
433 | /* the stack buffer is large enough */ |
434 | temp=buffer; | |
435 | } else { | |
436 | /* allocate a buffer */ | |
437 | temp=(UChar *)uprv_malloc(destCapacity*U_SIZEOF_UCHAR); | |
438 | if(temp==NULL) { | |
439 | *pErrorCode=U_MEMORY_ALLOCATION_ERROR; | |
440 | return 0; | |
441 | } | |
442 | } | |
443 | } else { | |
444 | temp=dest; | |
445 | } | |
446 | ||
4388f060 | 447 | destLength=stringCaseMapper(csm, temp, destCapacity, src, srcLength, pErrorCode); |
b75a7d8f A |
448 | if(temp!=dest) { |
449 | /* copy the result string to the destination buffer */ | |
450 | if(destLength>0) { | |
451 | int32_t copyLength= destLength<=destCapacity ? destLength : destCapacity; | |
452 | if(copyLength>0) { | |
a62d09fc | 453 | u_memmove(dest, temp, copyLength); |
b75a7d8f A |
454 | } |
455 | } | |
456 | if(temp!=buffer) { | |
457 | uprv_free(temp); | |
458 | } | |
459 | } | |
460 | ||
b75a7d8f A |
461 | return u_terminateUChars(dest, destCapacity, destLength, pErrorCode); |
462 | } | |
463 | ||
374ca955 A |
464 | /* public API functions */ |
465 | ||
b75a7d8f A |
466 | U_CAPI int32_t U_EXPORT2 |
467 | u_strFoldCase(UChar *dest, int32_t destCapacity, | |
468 | const UChar *src, int32_t srcLength, | |
469 | uint32_t options, | |
470 | UErrorCode *pErrorCode) { | |
4388f060 | 471 | UCaseMap csm=UCASEMAP_INITIALIZER; |
729e4ab9 | 472 | csm.csp=ucase_getSingleton(); |
46f4442e | 473 | csm.options=options; |
4388f060 A |
474 | return ustrcase_map( |
475 | &csm, | |
476 | dest, destCapacity, | |
477 | src, srcLength, | |
478 | ustrcase_internalFold, pErrorCode); | |
374ca955 A |
479 | } |
480 | ||
481 | /* case-insensitive string comparisons -------------------------------------- */ | |
482 | ||
483 | /* | |
484 | * This function is a copy of unorm_cmpEquivFold() minus the parts for | |
485 | * canonical equivalence. | |
486 | * Keep the functions in sync, and see there for how this works. | |
487 | * The duplication is for modularization: | |
488 | * It makes caseless (but not canonical caseless) matches independent of | |
489 | * the normalization code. | |
490 | */ | |
491 | ||
492 | /* stack element for previous-level source/decomposition pointers */ | |
493 | struct CmpEquivLevel { | |
494 | const UChar *start, *s, *limit; | |
495 | }; | |
496 | typedef struct CmpEquivLevel CmpEquivLevel; | |
497 | ||
b331163b A |
498 | /** |
499 | * Internal implementation code comparing string with case fold. | |
500 | * This function is called from u_strcmpFold() and u_caseInsensitivePrefixMatch(). | |
501 | * | |
502 | * @param s1 input string 1 | |
503 | * @param length1 length of string 1, or -1 (NULL terminated) | |
504 | * @param s2 input string 2 | |
505 | * @param length2 length of string 2, or -1 (NULL terminated) | |
506 | * @param options compare options | |
507 | * @param matchLen1 (output) length of partial prefix match in s1 | |
508 | * @param matchLen2 (output) length of partial prefix match in s2 | |
509 | * @param pErrorCode receives error status | |
510 | * @return The result of comparison | |
511 | */ | |
512 | static int32_t _cmpFold( | |
513 | const UChar *s1, int32_t length1, | |
514 | const UChar *s2, int32_t length2, | |
515 | uint32_t options, | |
516 | int32_t *matchLen1, int32_t *matchLen2, | |
517 | UErrorCode *pErrorCode) { | |
518 | int32_t cmpRes = 0; | |
519 | ||
73c04bcf | 520 | const UCaseProps *csp; |
374ca955 A |
521 | |
522 | /* current-level start/limit - s1/s2 as current */ | |
523 | const UChar *start1, *start2, *limit1, *limit2; | |
524 | ||
b331163b A |
525 | /* points to the original start address */ |
526 | const UChar *org1, *org2; | |
527 | ||
528 | /* points to the end of match + 1 */ | |
529 | const UChar *m1, *m2; | |
530 | ||
374ca955 A |
531 | /* case folding variables */ |
532 | const UChar *p; | |
533 | int32_t length; | |
534 | ||
535 | /* stacks of previous-level start/current/limit */ | |
536 | CmpEquivLevel stack1[2], stack2[2]; | |
537 | ||
538 | /* case folding buffers, only use current-level start/limit */ | |
539 | UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1]; | |
540 | ||
541 | /* track which is the current level per string */ | |
542 | int32_t level1, level2; | |
543 | ||
544 | /* current code units, and code points for lookups */ | |
545 | UChar32 c1, c2, cp1, cp2; | |
546 | ||
547 | /* no argument error checking because this itself is not an API */ | |
548 | ||
549 | /* | |
550 | * assume that at least the option U_COMPARE_IGNORE_CASE is set | |
551 | * otherwise this function would have to behave exactly as uprv_strCompare() | |
552 | */ | |
729e4ab9 | 553 | csp=ucase_getSingleton(); |
374ca955 A |
554 | if(U_FAILURE(*pErrorCode)) { |
555 | return 0; | |
556 | } | |
557 | ||
558 | /* initialize */ | |
b331163b A |
559 | if(matchLen1) { |
560 | U_ASSERT(matchLen2 !=NULL); | |
561 | *matchLen1=0; | |
562 | *matchLen2=0; | |
563 | } | |
564 | ||
565 | start1=m1=org1=s1; | |
374ca955 A |
566 | if(length1==-1) { |
567 | limit1=NULL; | |
568 | } else { | |
569 | limit1=s1+length1; | |
570 | } | |
571 | ||
b331163b | 572 | start2=m2=org2=s2; |
374ca955 A |
573 | if(length2==-1) { |
574 | limit2=NULL; | |
575 | } else { | |
576 | limit2=s2+length2; | |
577 | } | |
578 | ||
579 | level1=level2=0; | |
580 | c1=c2=-1; | |
581 | ||
582 | /* comparison loop */ | |
583 | for(;;) { | |
584 | /* | |
585 | * here a code unit value of -1 means "get another code unit" | |
586 | * below it will mean "this source is finished" | |
587 | */ | |
588 | ||
589 | if(c1<0) { | |
590 | /* get next code unit from string 1, post-increment */ | |
591 | for(;;) { | |
592 | if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) { | |
593 | if(level1==0) { | |
594 | c1=-1; | |
595 | break; | |
596 | } | |
597 | } else { | |
598 | ++s1; | |
599 | break; | |
600 | } | |
601 | ||
602 | /* reached end of level buffer, pop one level */ | |
603 | do { | |
604 | --level1; | |
4388f060 | 605 | start1=stack1[level1].start; /*Not uninitialized*/ |
374ca955 | 606 | } while(start1==NULL); |
4388f060 A |
607 | s1=stack1[level1].s; /*Not uninitialized*/ |
608 | limit1=stack1[level1].limit; /*Not uninitialized*/ | |
374ca955 A |
609 | } |
610 | } | |
611 | ||
612 | if(c2<0) { | |
613 | /* get next code unit from string 2, post-increment */ | |
614 | for(;;) { | |
615 | if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) { | |
616 | if(level2==0) { | |
617 | c2=-1; | |
618 | break; | |
619 | } | |
620 | } else { | |
621 | ++s2; | |
622 | break; | |
623 | } | |
624 | ||
625 | /* reached end of level buffer, pop one level */ | |
626 | do { | |
627 | --level2; | |
4388f060 | 628 | start2=stack2[level2].start; /*Not uninitialized*/ |
374ca955 | 629 | } while(start2==NULL); |
4388f060 A |
630 | s2=stack2[level2].s; /*Not uninitialized*/ |
631 | limit2=stack2[level2].limit; /*Not uninitialized*/ | |
374ca955 A |
632 | } |
633 | } | |
634 | ||
635 | /* | |
636 | * compare c1 and c2 | |
637 | * either variable c1, c2 is -1 only if the corresponding string is finished | |
638 | */ | |
639 | if(c1==c2) { | |
b331163b A |
640 | const UChar *next1, *next2; |
641 | ||
374ca955 | 642 | if(c1<0) { |
b331163b A |
643 | cmpRes=0; /* c1==c2==-1 indicating end of strings */ |
644 | break; | |
645 | } | |
646 | ||
647 | /* | |
648 | * Note: Move the match positions in both strings at the same time | |
649 | * only when corresponding code point(s) in the original strings | |
650 | * are fully consumed. For example, when comparing s1="Fust" and | |
651 | * s2="Fu\u00dfball", s2[2] is folded into "ss", and s1[2] matches | |
652 | * the first code point in the case-folded data. But the second "s" | |
653 | * has no matching code point in s1, so this implementation returns | |
654 | * 2 as the prefix match length ("Fu"). | |
655 | */ | |
656 | next1=next2=NULL; | |
657 | if(level1==0) { | |
658 | next1=s1; | |
659 | } else if(s1==limit1) { | |
660 | /* Note: This implementation only use a single level of stack. | |
661 | * If this code needs to be changed to use multiple levels | |
662 | * of stacks, the code above should check if the current | |
663 | * code is at the end of all stacks. | |
664 | */ | |
665 | U_ASSERT(level1==1); | |
666 | ||
667 | /* is s1 at the end of the current stack? */ | |
668 | next1=stack1[0].s; | |
669 | } | |
670 | ||
671 | if (next1!=NULL) { | |
672 | if(level2==0) { | |
673 | next2=s2; | |
674 | } else if(s2==limit2) { | |
675 | U_ASSERT(level2==1); | |
676 | ||
677 | /* is s2 at the end of the current stack? */ | |
678 | next2=stack2[0].s; | |
679 | } | |
680 | if(next2!=NULL) { | |
681 | m1=next1; | |
682 | m2=next2; | |
683 | } | |
374ca955 A |
684 | } |
685 | c1=c2=-1; /* make us fetch new code units */ | |
686 | continue; | |
687 | } else if(c1<0) { | |
b331163b A |
688 | cmpRes=-1; /* string 1 ends before string 2 */ |
689 | break; | |
374ca955 | 690 | } else if(c2<0) { |
b331163b A |
691 | cmpRes=1; /* string 2 ends before string 1 */ |
692 | break; | |
374ca955 A |
693 | } |
694 | /* c1!=c2 && c1>=0 && c2>=0 */ | |
695 | ||
696 | /* get complete code points for c1, c2 for lookups if either is a surrogate */ | |
697 | cp1=c1; | |
698 | if(U_IS_SURROGATE(c1)) { | |
699 | UChar c; | |
700 | ||
701 | if(U_IS_SURROGATE_LEAD(c1)) { | |
702 | if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) { | |
703 | /* advance ++s1; only below if cp1 decomposes/case-folds */ | |
704 | cp1=U16_GET_SUPPLEMENTARY(c1, c); | |
705 | } | |
706 | } else /* isTrail(c1) */ { | |
707 | if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) { | |
708 | cp1=U16_GET_SUPPLEMENTARY(c, c1); | |
709 | } | |
710 | } | |
711 | } | |
712 | ||
713 | cp2=c2; | |
714 | if(U_IS_SURROGATE(c2)) { | |
715 | UChar c; | |
716 | ||
717 | if(U_IS_SURROGATE_LEAD(c2)) { | |
718 | if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) { | |
719 | /* advance ++s2; only below if cp2 decomposes/case-folds */ | |
720 | cp2=U16_GET_SUPPLEMENTARY(c2, c); | |
721 | } | |
722 | } else /* isTrail(c2) */ { | |
723 | if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) { | |
724 | cp2=U16_GET_SUPPLEMENTARY(c, c2); | |
725 | } | |
726 | } | |
727 | } | |
728 | ||
729 | /* | |
730 | * go down one level for each string | |
731 | * continue with the main loop as soon as there is a real change | |
732 | */ | |
733 | ||
734 | if( level1==0 && | |
735 | (length=ucase_toFullFolding(csp, (UChar32)cp1, &p, options))>=0 | |
736 | ) { | |
737 | /* cp1 case-folds to the code point "length" or to p[length] */ | |
738 | if(U_IS_SURROGATE(c1)) { | |
739 | if(U_IS_SURROGATE_LEAD(c1)) { | |
740 | /* advance beyond source surrogate pair if it case-folds */ | |
741 | ++s1; | |
742 | } else /* isTrail(c1) */ { | |
743 | /* | |
744 | * we got a supplementary code point when hitting its trail surrogate, | |
745 | * therefore the lead surrogate must have been the same as in the other string; | |
746 | * compare this decomposition with the lead surrogate in the other string | |
747 | * remember that this simulates bulk text replacement: | |
748 | * the decomposition would replace the entire code point | |
749 | */ | |
750 | --s2; | |
b331163b | 751 | --m2; |
374ca955 A |
752 | c2=*(s2-1); |
753 | } | |
754 | } | |
755 | ||
756 | /* push current level pointers */ | |
757 | stack1[0].start=start1; | |
758 | stack1[0].s=s1; | |
759 | stack1[0].limit=limit1; | |
760 | ++level1; | |
761 | ||
762 | /* copy the folding result to fold1[] */ | |
763 | if(length<=UCASE_MAX_STRING_LENGTH) { | |
764 | u_memcpy(fold1, p, length); | |
765 | } else { | |
766 | int32_t i=0; | |
767 | U16_APPEND_UNSAFE(fold1, i, length); | |
768 | length=i; | |
769 | } | |
770 | ||
771 | /* set next level pointers to case folding */ | |
772 | start1=s1=fold1; | |
773 | limit1=fold1+length; | |
774 | ||
775 | /* get ready to read from decomposition, continue with loop */ | |
776 | c1=-1; | |
777 | continue; | |
778 | } | |
779 | ||
780 | if( level2==0 && | |
781 | (length=ucase_toFullFolding(csp, (UChar32)cp2, &p, options))>=0 | |
782 | ) { | |
783 | /* cp2 case-folds to the code point "length" or to p[length] */ | |
784 | if(U_IS_SURROGATE(c2)) { | |
785 | if(U_IS_SURROGATE_LEAD(c2)) { | |
786 | /* advance beyond source surrogate pair if it case-folds */ | |
787 | ++s2; | |
788 | } else /* isTrail(c2) */ { | |
789 | /* | |
790 | * we got a supplementary code point when hitting its trail surrogate, | |
791 | * therefore the lead surrogate must have been the same as in the other string; | |
792 | * compare this decomposition with the lead surrogate in the other string | |
793 | * remember that this simulates bulk text replacement: | |
794 | * the decomposition would replace the entire code point | |
795 | */ | |
796 | --s1; | |
b331163b | 797 | --m2; |
374ca955 A |
798 | c1=*(s1-1); |
799 | } | |
800 | } | |
801 | ||
802 | /* push current level pointers */ | |
803 | stack2[0].start=start2; | |
804 | stack2[0].s=s2; | |
805 | stack2[0].limit=limit2; | |
806 | ++level2; | |
807 | ||
808 | /* copy the folding result to fold2[] */ | |
809 | if(length<=UCASE_MAX_STRING_LENGTH) { | |
810 | u_memcpy(fold2, p, length); | |
811 | } else { | |
812 | int32_t i=0; | |
813 | U16_APPEND_UNSAFE(fold2, i, length); | |
814 | length=i; | |
815 | } | |
816 | ||
817 | /* set next level pointers to case folding */ | |
818 | start2=s2=fold2; | |
819 | limit2=fold2+length; | |
820 | ||
821 | /* get ready to read from decomposition, continue with loop */ | |
822 | c2=-1; | |
823 | continue; | |
824 | } | |
825 | ||
826 | /* | |
827 | * no decomposition/case folding, max level for both sides: | |
828 | * return difference result | |
829 | * | |
830 | * code point order comparison must not just return cp1-cp2 | |
831 | * because when single surrogates are present then the surrogate pairs | |
832 | * that formed cp1 and cp2 may be from different string indexes | |
833 | * | |
834 | * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units | |
835 | * c1=d800 cp1=10001 c2=dc00 cp2=10000 | |
836 | * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 } | |
837 | * | |
838 | * therefore, use same fix-up as in ustring.c/uprv_strCompare() | |
839 | * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++ | |
840 | * so we have slightly different pointer/start/limit comparisons here | |
841 | */ | |
842 | ||
843 | if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) { | |
844 | /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ | |
845 | if( | |
846 | (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) || | |
847 | (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2))) | |
848 | ) { | |
849 | /* part of a surrogate pair, leave >=d800 */ | |
850 | } else { | |
851 | /* BMP code point - may be surrogate code point - make <d800 */ | |
852 | c1-=0x2800; | |
853 | } | |
854 | ||
855 | if( | |
856 | (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) || | |
857 | (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2))) | |
858 | ) { | |
859 | /* part of a surrogate pair, leave >=d800 */ | |
860 | } else { | |
861 | /* BMP code point - may be surrogate code point - make <d800 */ | |
862 | c2-=0x2800; | |
863 | } | |
864 | } | |
865 | ||
b331163b A |
866 | cmpRes=c1-c2; |
867 | break; | |
374ca955 | 868 | } |
b331163b A |
869 | |
870 | if(matchLen1) { | |
871 | *matchLen1=m1-org1; | |
872 | *matchLen2=m2-org2; | |
873 | } | |
874 | return cmpRes; | |
875 | } | |
876 | ||
877 | /* internal function */ | |
878 | U_CFUNC int32_t | |
879 | u_strcmpFold(const UChar *s1, int32_t length1, | |
880 | const UChar *s2, int32_t length2, | |
881 | uint32_t options, | |
882 | UErrorCode *pErrorCode) { | |
883 | return _cmpFold(s1, length1, s2, length2, options, NULL, NULL, pErrorCode); | |
b75a7d8f A |
884 | } |
885 | ||
374ca955 | 886 | /* public API functions */ |
b75a7d8f A |
887 | |
888 | U_CAPI int32_t U_EXPORT2 | |
889 | u_strCaseCompare(const UChar *s1, int32_t length1, | |
890 | const UChar *s2, int32_t length2, | |
891 | uint32_t options, | |
892 | UErrorCode *pErrorCode) { | |
893 | /* argument checking */ | |
894 | if(pErrorCode==0 || U_FAILURE(*pErrorCode)) { | |
895 | return 0; | |
896 | } | |
897 | if(s1==NULL || length1<-1 || s2==NULL || length2<-1) { | |
898 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
899 | return 0; | |
900 | } | |
374ca955 A |
901 | return u_strcmpFold(s1, length1, s2, length2, |
902 | options|U_COMPARE_IGNORE_CASE, | |
903 | pErrorCode); | |
b75a7d8f A |
904 | } |
905 | ||
906 | U_CAPI int32_t U_EXPORT2 | |
907 | u_strcasecmp(const UChar *s1, const UChar *s2, uint32_t options) { | |
908 | UErrorCode errorCode=U_ZERO_ERROR; | |
374ca955 A |
909 | return u_strcmpFold(s1, -1, s2, -1, |
910 | options|U_COMPARE_IGNORE_CASE, | |
911 | &errorCode); | |
b75a7d8f A |
912 | } |
913 | ||
914 | U_CAPI int32_t U_EXPORT2 | |
915 | u_memcasecmp(const UChar *s1, const UChar *s2, int32_t length, uint32_t options) { | |
916 | UErrorCode errorCode=U_ZERO_ERROR; | |
374ca955 A |
917 | return u_strcmpFold(s1, length, s2, length, |
918 | options|U_COMPARE_IGNORE_CASE, | |
919 | &errorCode); | |
b75a7d8f A |
920 | } |
921 | ||
922 | U_CAPI int32_t U_EXPORT2 | |
923 | u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options) { | |
924 | UErrorCode errorCode=U_ZERO_ERROR; | |
374ca955 A |
925 | return u_strcmpFold(s1, n, s2, n, |
926 | options|(U_COMPARE_IGNORE_CASE|_STRNCMP_STYLE), | |
927 | &errorCode); | |
b75a7d8f | 928 | } |
b331163b A |
929 | |
930 | /* internal API - detect length of shared prefix */ | |
931 | U_CAPI void | |
932 | u_caseInsensitivePrefixMatch(const UChar *s1, int32_t length1, | |
933 | const UChar *s2, int32_t length2, | |
934 | uint32_t options, | |
935 | int32_t *matchLen1, int32_t *matchLen2, | |
936 | UErrorCode *pErrorCode) { | |
937 | _cmpFold(s1, length1, s2, length2, options, | |
938 | matchLen1, matchLen2, pErrorCode); | |
939 | } |