]>
Commit | Line | Data |
---|---|---|
374ca955 A |
1 | /* |
2 | ******************************************************************************* | |
3 | * | |
46f4442e | 4 | * Copyright (C) 2001-2006, International Business Machines |
374ca955 A |
5 | * Corporation and others. All Rights Reserved. |
6 | * | |
7 | ******************************************************************************* | |
8 | * file name: unormcmp.cpp | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 2004sep13 | |
14 | * created by: Markus W. Scherer | |
15 | * | |
16 | * unorm_compare() function moved here from unorm.cpp for better modularization. | |
17 | * Depends on both normalization and case folding. | |
18 | * Allows unorm.cpp to not depend on any character properties code. | |
19 | */ | |
20 | ||
21 | #include "unicode/utypes.h" | |
22 | ||
23 | #if !UCONFIG_NO_NORMALIZATION | |
24 | ||
25 | #include "unicode/ustring.h" | |
26 | #include "unicode/unorm.h" | |
27 | #include "unicode/uniset.h" | |
28 | #include "unormimp.h" | |
29 | #include "ucase.h" | |
30 | #include "cmemory.h" | |
31 | ||
46f4442e A |
32 | U_NAMESPACE_USE |
33 | ||
374ca955 A |
34 | #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) |
35 | ||
36 | /* compare canonically equivalent ------------------------------------------- */ | |
37 | ||
38 | /* | |
39 | * Compare two strings for canonical equivalence. | |
40 | * Further options include case-insensitive comparison and | |
41 | * code point order (as opposed to code unit order). | |
42 | * | |
43 | * In this function, canonical equivalence is optional as well. | |
44 | * If canonical equivalence is tested, then both strings must fulfill | |
45 | * the FCD check. | |
46 | * | |
47 | * Semantically, this is equivalent to | |
48 | * strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2))) | |
49 | * where code point order, NFD and foldCase are all optional. | |
50 | * | |
51 | * String comparisons almost always yield results before processing both strings | |
52 | * completely. | |
53 | * They are generally more efficient working incrementally instead of | |
54 | * performing the sub-processing (strlen, normalization, case-folding) | |
55 | * on the entire strings first. | |
56 | * | |
57 | * It is also unnecessary to not normalize identical characters. | |
58 | * | |
59 | * This function works in principle as follows: | |
60 | * | |
61 | * loop { | |
62 | * get one code unit c1 from s1 (-1 if end of source) | |
63 | * get one code unit c2 from s2 (-1 if end of source) | |
64 | * | |
65 | * if(either string finished) { | |
66 | * return result; | |
67 | * } | |
68 | * if(c1==c2) { | |
69 | * continue; | |
70 | * } | |
71 | * | |
72 | * // c1!=c2 | |
73 | * try to decompose/case-fold c1/c2, and continue if one does; | |
74 | * | |
75 | * // still c1!=c2 and neither decomposes/case-folds, return result | |
76 | * return c1-c2; | |
77 | * } | |
78 | * | |
79 | * When a character decomposes, then the pointer for that source changes to | |
80 | * the decomposition, pushing the previous pointer onto a stack. | |
81 | * When the end of the decomposition is reached, then the code unit reader | |
82 | * pops the previous source from the stack. | |
83 | * (Same for case-folding.) | |
84 | * | |
85 | * This is complicated further by operating on variable-width UTF-16. | |
86 | * The top part of the loop works on code units, while lookups for decomposition | |
87 | * and case-folding need code points. | |
88 | * Code points are assembled after the equality/end-of-source part. | |
89 | * The source pointer is only advanced beyond all code units when the code point | |
90 | * actually decomposes/case-folds. | |
91 | * | |
92 | * If we were on a trail surrogate unit when assembling a code point, | |
93 | * and the code point decomposes/case-folds, then the decomposition/folding | |
94 | * result must be compared with the part of the other string that corresponds to | |
95 | * this string's lead surrogate. | |
96 | * Since we only assemble a code point when hitting a trail unit when the | |
97 | * preceding lead units were identical, we back up the other string by one unit | |
98 | * in such a case. | |
99 | * | |
100 | * The optional code point order comparison at the end works with | |
101 | * the same fix-up as the other code point order comparison functions. | |
102 | * See ustring.c and the comment near the end of this function. | |
103 | * | |
104 | * Assumption: A decomposition or case-folding result string never contains | |
105 | * a single surrogate. This is a safe assumption in the Unicode Standard. | |
106 | * Therefore, we do not need to check for surrogate pairs across | |
107 | * decomposition/case-folding boundaries. | |
108 | * | |
109 | * Further assumptions (see verifications tstnorm.cpp): | |
110 | * The API function checks for FCD first, while the core function | |
111 | * first case-folds and then decomposes. This requires that case-folding does not | |
112 | * un-FCD any strings. | |
113 | * | |
114 | * The API function may also NFD the input and turn off decomposition. | |
115 | * This requires that case-folding does not un-NFD strings either. | |
116 | * | |
117 | * TODO If any of the above two assumptions is violated, | |
118 | * then this entire code must be re-thought. | |
119 | * If this happens, then a simple solution is to case-fold both strings up front | |
120 | * and to turn off UNORM_INPUT_IS_FCD. | |
121 | * We already do this when not both strings are in FCD because makeFCD | |
122 | * would be a partial NFD before the case folding, which does not work. | |
123 | * Note that all of this is only a problem when case-folding _and_ | |
124 | * canonical equivalence come together. | |
125 | * (Comments in unorm_compare() are more up to date than this TODO.) | |
126 | * | |
127 | * This function could be moved to a different source file, at increased cost | |
128 | * for calling the decomposition access function. | |
129 | */ | |
130 | ||
131 | /* stack element for previous-level source/decomposition pointers */ | |
132 | struct CmpEquivLevel { | |
133 | const UChar *start, *s, *limit; | |
134 | }; | |
135 | typedef struct CmpEquivLevel CmpEquivLevel; | |
136 | ||
137 | /* internal function */ | |
138 | static int32_t | |
139 | unorm_cmpEquivFold(const UChar *s1, int32_t length1, | |
140 | const UChar *s2, int32_t length2, | |
141 | uint32_t options, | |
142 | UErrorCode *pErrorCode) { | |
73c04bcf | 143 | const UCaseProps *csp; |
374ca955 A |
144 | |
145 | /* current-level start/limit - s1/s2 as current */ | |
146 | const UChar *start1, *start2, *limit1, *limit2; | |
147 | ||
148 | /* decomposition and case folding variables */ | |
149 | const UChar *p; | |
150 | int32_t length; | |
151 | ||
152 | /* stacks of previous-level start/current/limit */ | |
153 | CmpEquivLevel stack1[2], stack2[2]; | |
154 | ||
155 | /* decomposition buffers for Hangul */ | |
156 | UChar decomp1[4], decomp2[4]; | |
157 | ||
158 | /* case folding buffers, only use current-level start/limit */ | |
159 | UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1]; | |
160 | ||
161 | /* track which is the current level per string */ | |
162 | int32_t level1, level2; | |
163 | ||
164 | /* current code units, and code points for lookups */ | |
165 | UChar32 c1, c2, cp1, cp2; | |
166 | ||
167 | /* no argument error checking because this itself is not an API */ | |
168 | ||
169 | /* | |
170 | * assume that at least one of the options _COMPARE_EQUIV and U_COMPARE_IGNORE_CASE is set | |
171 | * otherwise this function must behave exactly as uprv_strCompare() | |
172 | * not checking for that here makes testing this function easier | |
173 | */ | |
174 | ||
175 | /* normalization/properties data loaded? */ | |
176 | if( ((options&_COMPARE_EQUIV)!=0 && !unorm_haveData(pErrorCode)) || | |
177 | U_FAILURE(*pErrorCode) | |
178 | ) { | |
179 | return 0; | |
180 | } | |
181 | if((options&U_COMPARE_IGNORE_CASE)!=0) { | |
182 | csp=ucase_getSingleton(pErrorCode); | |
183 | if(U_FAILURE(*pErrorCode)) { | |
184 | return 0; | |
185 | } | |
186 | } else { | |
187 | csp=NULL; | |
188 | } | |
189 | ||
190 | /* initialize */ | |
191 | start1=s1; | |
192 | if(length1==-1) { | |
193 | limit1=NULL; | |
194 | } else { | |
195 | limit1=s1+length1; | |
196 | } | |
197 | ||
198 | start2=s2; | |
199 | if(length2==-1) { | |
200 | limit2=NULL; | |
201 | } else { | |
202 | limit2=s2+length2; | |
203 | } | |
204 | ||
205 | level1=level2=0; | |
206 | c1=c2=-1; | |
207 | ||
208 | /* comparison loop */ | |
209 | for(;;) { | |
210 | /* | |
211 | * here a code unit value of -1 means "get another code unit" | |
212 | * below it will mean "this source is finished" | |
213 | */ | |
214 | ||
215 | if(c1<0) { | |
216 | /* get next code unit from string 1, post-increment */ | |
217 | for(;;) { | |
218 | if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) { | |
219 | if(level1==0) { | |
220 | c1=-1; | |
221 | break; | |
222 | } | |
223 | } else { | |
224 | ++s1; | |
225 | break; | |
226 | } | |
227 | ||
228 | /* reached end of level buffer, pop one level */ | |
229 | do { | |
230 | --level1; | |
231 | start1=stack1[level1].start; | |
232 | } while(start1==NULL); | |
233 | s1=stack1[level1].s; | |
234 | limit1=stack1[level1].limit; | |
235 | } | |
236 | } | |
237 | ||
238 | if(c2<0) { | |
239 | /* get next code unit from string 2, post-increment */ | |
240 | for(;;) { | |
241 | if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) { | |
242 | if(level2==0) { | |
243 | c2=-1; | |
244 | break; | |
245 | } | |
246 | } else { | |
247 | ++s2; | |
248 | break; | |
249 | } | |
250 | ||
251 | /* reached end of level buffer, pop one level */ | |
252 | do { | |
253 | --level2; | |
254 | start2=stack2[level2].start; | |
255 | } while(start2==NULL); | |
256 | s2=stack2[level2].s; | |
257 | limit2=stack2[level2].limit; | |
258 | } | |
259 | } | |
260 | ||
261 | /* | |
262 | * compare c1 and c2 | |
263 | * either variable c1, c2 is -1 only if the corresponding string is finished | |
264 | */ | |
265 | if(c1==c2) { | |
266 | if(c1<0) { | |
267 | return 0; /* c1==c2==-1 indicating end of strings */ | |
268 | } | |
269 | c1=c2=-1; /* make us fetch new code units */ | |
270 | continue; | |
271 | } else if(c1<0) { | |
272 | return -1; /* string 1 ends before string 2 */ | |
273 | } else if(c2<0) { | |
274 | return 1; /* string 2 ends before string 1 */ | |
275 | } | |
276 | /* c1!=c2 && c1>=0 && c2>=0 */ | |
277 | ||
278 | /* get complete code points for c1, c2 for lookups if either is a surrogate */ | |
279 | cp1=c1; | |
280 | if(U_IS_SURROGATE(c1)) { | |
281 | UChar c; | |
282 | ||
283 | if(U_IS_SURROGATE_LEAD(c1)) { | |
284 | if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) { | |
285 | /* advance ++s1; only below if cp1 decomposes/case-folds */ | |
286 | cp1=U16_GET_SUPPLEMENTARY(c1, c); | |
287 | } | |
288 | } else /* isTrail(c1) */ { | |
289 | if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) { | |
290 | cp1=U16_GET_SUPPLEMENTARY(c, c1); | |
291 | } | |
292 | } | |
293 | } | |
294 | ||
295 | cp2=c2; | |
296 | if(U_IS_SURROGATE(c2)) { | |
297 | UChar c; | |
298 | ||
299 | if(U_IS_SURROGATE_LEAD(c2)) { | |
300 | if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) { | |
301 | /* advance ++s2; only below if cp2 decomposes/case-folds */ | |
302 | cp2=U16_GET_SUPPLEMENTARY(c2, c); | |
303 | } | |
304 | } else /* isTrail(c2) */ { | |
305 | if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) { | |
306 | cp2=U16_GET_SUPPLEMENTARY(c, c2); | |
307 | } | |
308 | } | |
309 | } | |
310 | ||
311 | /* | |
312 | * go down one level for each string | |
313 | * continue with the main loop as soon as there is a real change | |
314 | */ | |
315 | ||
316 | if( level1==0 && (options&U_COMPARE_IGNORE_CASE) && | |
317 | (length=ucase_toFullFolding(csp, (UChar32)cp1, &p, options))>=0 | |
318 | ) { | |
319 | /* cp1 case-folds to the code point "length" or to p[length] */ | |
320 | if(U_IS_SURROGATE(c1)) { | |
321 | if(U_IS_SURROGATE_LEAD(c1)) { | |
322 | /* advance beyond source surrogate pair if it case-folds */ | |
323 | ++s1; | |
324 | } else /* isTrail(c1) */ { | |
325 | /* | |
326 | * we got a supplementary code point when hitting its trail surrogate, | |
327 | * therefore the lead surrogate must have been the same as in the other string; | |
328 | * compare this decomposition with the lead surrogate in the other string | |
329 | * remember that this simulates bulk text replacement: | |
330 | * the decomposition would replace the entire code point | |
331 | */ | |
332 | --s2; | |
333 | c2=*(s2-1); | |
334 | } | |
335 | } | |
336 | ||
337 | /* push current level pointers */ | |
338 | stack1[0].start=start1; | |
339 | stack1[0].s=s1; | |
340 | stack1[0].limit=limit1; | |
341 | ++level1; | |
342 | ||
343 | /* copy the folding result to fold1[] */ | |
344 | if(length<=UCASE_MAX_STRING_LENGTH) { | |
345 | u_memcpy(fold1, p, length); | |
346 | } else { | |
347 | int32_t i=0; | |
348 | U16_APPEND_UNSAFE(fold1, i, length); | |
349 | length=i; | |
350 | } | |
351 | ||
352 | /* set next level pointers to case folding */ | |
353 | start1=s1=fold1; | |
354 | limit1=fold1+length; | |
355 | ||
356 | /* get ready to read from decomposition, continue with loop */ | |
357 | c1=-1; | |
358 | continue; | |
359 | } | |
360 | ||
361 | if( level2==0 && (options&U_COMPARE_IGNORE_CASE) && | |
362 | (length=ucase_toFullFolding(csp, (UChar32)cp2, &p, options))>=0 | |
363 | ) { | |
364 | /* cp2 case-folds to the code point "length" or to p[length] */ | |
365 | if(U_IS_SURROGATE(c2)) { | |
366 | if(U_IS_SURROGATE_LEAD(c2)) { | |
367 | /* advance beyond source surrogate pair if it case-folds */ | |
368 | ++s2; | |
369 | } else /* isTrail(c2) */ { | |
370 | /* | |
371 | * we got a supplementary code point when hitting its trail surrogate, | |
372 | * therefore the lead surrogate must have been the same as in the other string; | |
373 | * compare this decomposition with the lead surrogate in the other string | |
374 | * remember that this simulates bulk text replacement: | |
375 | * the decomposition would replace the entire code point | |
376 | */ | |
377 | --s1; | |
378 | c1=*(s1-1); | |
379 | } | |
380 | } | |
381 | ||
382 | /* push current level pointers */ | |
383 | stack2[0].start=start2; | |
384 | stack2[0].s=s2; | |
385 | stack2[0].limit=limit2; | |
386 | ++level2; | |
387 | ||
388 | /* copy the folding result to fold2[] */ | |
389 | if(length<=UCASE_MAX_STRING_LENGTH) { | |
390 | u_memcpy(fold2, p, length); | |
391 | } else { | |
392 | int32_t i=0; | |
393 | U16_APPEND_UNSAFE(fold2, i, length); | |
394 | length=i; | |
395 | } | |
396 | ||
397 | /* set next level pointers to case folding */ | |
398 | start2=s2=fold2; | |
399 | limit2=fold2+length; | |
400 | ||
401 | /* get ready to read from decomposition, continue with loop */ | |
402 | c2=-1; | |
403 | continue; | |
404 | } | |
405 | ||
406 | if( level1<2 && (options&_COMPARE_EQUIV) && | |
407 | 0!=(p=unorm_getCanonicalDecomposition((UChar32)cp1, decomp1, &length)) | |
408 | ) { | |
409 | /* cp1 decomposes into p[length] */ | |
410 | if(U_IS_SURROGATE(c1)) { | |
411 | if(U_IS_SURROGATE_LEAD(c1)) { | |
412 | /* advance beyond source surrogate pair if it decomposes */ | |
413 | ++s1; | |
414 | } else /* isTrail(c1) */ { | |
415 | /* | |
416 | * we got a supplementary code point when hitting its trail surrogate, | |
417 | * therefore the lead surrogate must have been the same as in the other string; | |
418 | * compare this decomposition with the lead surrogate in the other string | |
419 | * remember that this simulates bulk text replacement: | |
420 | * the decomposition would replace the entire code point | |
421 | */ | |
422 | --s2; | |
423 | c2=*(s2-1); | |
424 | } | |
425 | } | |
426 | ||
427 | /* push current level pointers */ | |
428 | stack1[level1].start=start1; | |
429 | stack1[level1].s=s1; | |
430 | stack1[level1].limit=limit1; | |
431 | ++level1; | |
432 | ||
433 | /* set empty intermediate level if skipped */ | |
434 | if(level1<2) { | |
435 | stack1[level1++].start=NULL; | |
436 | } | |
437 | ||
438 | /* set next level pointers to decomposition */ | |
439 | start1=s1=p; | |
440 | limit1=p+length; | |
441 | ||
442 | /* get ready to read from decomposition, continue with loop */ | |
443 | c1=-1; | |
444 | continue; | |
445 | } | |
446 | ||
447 | if( level2<2 && (options&_COMPARE_EQUIV) && | |
448 | 0!=(p=unorm_getCanonicalDecomposition((UChar32)cp2, decomp2, &length)) | |
449 | ) { | |
450 | /* cp2 decomposes into p[length] */ | |
451 | if(U_IS_SURROGATE(c2)) { | |
452 | if(U_IS_SURROGATE_LEAD(c2)) { | |
453 | /* advance beyond source surrogate pair if it decomposes */ | |
454 | ++s2; | |
455 | } else /* isTrail(c2) */ { | |
456 | /* | |
457 | * we got a supplementary code point when hitting its trail surrogate, | |
458 | * therefore the lead surrogate must have been the same as in the other string; | |
459 | * compare this decomposition with the lead surrogate in the other string | |
460 | * remember that this simulates bulk text replacement: | |
461 | * the decomposition would replace the entire code point | |
462 | */ | |
463 | --s1; | |
464 | c1=*(s1-1); | |
465 | } | |
466 | } | |
467 | ||
468 | /* push current level pointers */ | |
469 | stack2[level2].start=start2; | |
470 | stack2[level2].s=s2; | |
471 | stack2[level2].limit=limit2; | |
472 | ++level2; | |
473 | ||
474 | /* set empty intermediate level if skipped */ | |
475 | if(level2<2) { | |
476 | stack2[level2++].start=NULL; | |
477 | } | |
478 | ||
479 | /* set next level pointers to decomposition */ | |
480 | start2=s2=p; | |
481 | limit2=p+length; | |
482 | ||
483 | /* get ready to read from decomposition, continue with loop */ | |
484 | c2=-1; | |
485 | continue; | |
486 | } | |
487 | ||
488 | /* | |
489 | * no decomposition/case folding, max level for both sides: | |
490 | * return difference result | |
491 | * | |
492 | * code point order comparison must not just return cp1-cp2 | |
493 | * because when single surrogates are present then the surrogate pairs | |
494 | * that formed cp1 and cp2 may be from different string indexes | |
495 | * | |
496 | * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units | |
497 | * c1=d800 cp1=10001 c2=dc00 cp2=10000 | |
498 | * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 } | |
499 | * | |
500 | * therefore, use same fix-up as in ustring.c/uprv_strCompare() | |
501 | * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++ | |
502 | * so we have slightly different pointer/start/limit comparisons here | |
503 | */ | |
504 | ||
505 | if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) { | |
506 | /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ | |
507 | if( | |
508 | (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) || | |
509 | (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2))) | |
510 | ) { | |
511 | /* part of a surrogate pair, leave >=d800 */ | |
512 | } else { | |
513 | /* BMP code point - may be surrogate code point - make <d800 */ | |
514 | c1-=0x2800; | |
515 | } | |
516 | ||
517 | if( | |
518 | (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) || | |
519 | (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2))) | |
520 | ) { | |
521 | /* part of a surrogate pair, leave >=d800 */ | |
522 | } else { | |
523 | /* BMP code point - may be surrogate code point - make <d800 */ | |
524 | c2-=0x2800; | |
525 | } | |
526 | } | |
527 | ||
528 | return c1-c2; | |
529 | } | |
530 | } | |
531 | ||
532 | U_CAPI int32_t U_EXPORT2 | |
533 | unorm_compare(const UChar *s1, int32_t length1, | |
534 | const UChar *s2, int32_t length2, | |
535 | uint32_t options, | |
536 | UErrorCode *pErrorCode) { | |
537 | UChar fcd1[300], fcd2[300]; | |
538 | UChar *d1, *d2; | |
539 | const UnicodeSet *nx; | |
540 | UNormalizationMode mode; | |
541 | int32_t normOptions; | |
542 | int32_t result; | |
543 | ||
544 | /* argument checking */ | |
545 | if(pErrorCode==0 || U_FAILURE(*pErrorCode)) { | |
546 | return 0; | |
547 | } | |
548 | if(s1==0 || length1<-1 || s2==0 || length2<-1) { | |
549 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
550 | return 0; | |
551 | } | |
552 | ||
553 | if(!unorm_haveData(pErrorCode)) { | |
554 | return 0; | |
555 | } | |
556 | if(!uprv_haveProperties(pErrorCode)) { | |
557 | return 0; | |
558 | } | |
559 | ||
560 | normOptions=(int32_t)(options>>UNORM_COMPARE_NORM_OPTIONS_SHIFT); | |
561 | nx=unorm_getNX(normOptions, pErrorCode); | |
562 | if(U_FAILURE(*pErrorCode)) { | |
563 | return 0; | |
564 | } | |
565 | ||
566 | d1=d2=0; | |
567 | options|=_COMPARE_EQUIV; | |
568 | result=0; | |
569 | ||
570 | /* | |
571 | * UAX #21 Case Mappings, as fixed for Unicode version 4 | |
572 | * (see Jitterbug 2021), defines a canonical caseless match as | |
573 | * | |
574 | * A string X is a canonical caseless match | |
575 | * for a string Y if and only if | |
576 | * NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y))) | |
577 | * | |
578 | * For better performance, we check for FCD (or let the caller tell us that | |
579 | * both strings are in FCD) for the inner normalization. | |
580 | * BasicNormalizerTest::FindFoldFCDExceptions() makes sure that | |
581 | * case-folding preserves the FCD-ness of a string. | |
582 | * The outer normalization is then only performed by unorm_cmpEquivFold() | |
583 | * when there is a difference. | |
584 | * | |
585 | * Exception: When using the Turkic case-folding option, we do perform | |
586 | * full NFD first. This is because in the Turkic case precomposed characters | |
587 | * with 0049 capital I or 0069 small i fold differently whether they | |
588 | * are first decomposed or not, so an FCD check - a check only for | |
589 | * canonical order - is not sufficient. | |
590 | */ | |
591 | if(options&U_FOLD_CASE_EXCLUDE_SPECIAL_I) { | |
592 | mode=UNORM_NFD; | |
593 | options&=~UNORM_INPUT_IS_FCD; | |
594 | } else { | |
595 | mode=UNORM_FCD; | |
596 | } | |
597 | ||
598 | if(!(options&UNORM_INPUT_IS_FCD)) { | |
599 | int32_t _len1, _len2; | |
600 | UBool isFCD1, isFCD2; | |
601 | ||
602 | // check if s1 and/or s2 fulfill the FCD conditions | |
603 | isFCD1= UNORM_YES==unorm_internalQuickCheck(s1, length1, mode, TRUE, nx, pErrorCode); | |
604 | isFCD2= UNORM_YES==unorm_internalQuickCheck(s2, length2, mode, TRUE, nx, pErrorCode); | |
605 | if(U_FAILURE(*pErrorCode)) { | |
606 | return 0; | |
607 | } | |
608 | ||
609 | /* | |
610 | * ICU 2.4 had a further optimization: | |
611 | * If both strings were not in FCD, then they were both NFD'ed, | |
612 | * and the _COMPARE_EQUIV option was turned off. | |
613 | * It is not entirely clear that this is valid with the current | |
614 | * definition of the canonical caseless match. | |
615 | * Therefore, ICU 2.6 removes that optimization. | |
616 | */ | |
617 | ||
618 | if(!isFCD1) { | |
619 | _len1=unorm_internalNormalizeWithNX(fcd1, LENGTHOF(fcd1), | |
620 | s1, length1, | |
621 | mode, normOptions, nx, | |
622 | pErrorCode); | |
623 | if(*pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { | |
624 | s1=fcd1; | |
625 | } else { | |
626 | d1=(UChar *)uprv_malloc(_len1*U_SIZEOF_UCHAR); | |
627 | if(d1==0) { | |
628 | *pErrorCode=U_MEMORY_ALLOCATION_ERROR; | |
629 | goto cleanup; | |
630 | } | |
631 | ||
632 | *pErrorCode=U_ZERO_ERROR; | |
633 | _len1=unorm_internalNormalizeWithNX(d1, _len1, | |
634 | s1, length1, | |
635 | mode, normOptions, nx, | |
636 | pErrorCode); | |
637 | if(U_FAILURE(*pErrorCode)) { | |
638 | goto cleanup; | |
639 | } | |
640 | ||
641 | s1=d1; | |
642 | } | |
643 | length1=_len1; | |
644 | } | |
645 | ||
646 | if(!isFCD2) { | |
647 | _len2=unorm_internalNormalizeWithNX(fcd2, LENGTHOF(fcd2), | |
648 | s2, length2, | |
649 | mode, normOptions, nx, | |
650 | pErrorCode); | |
651 | if(*pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { | |
652 | s2=fcd2; | |
653 | } else { | |
654 | d2=(UChar *)uprv_malloc(_len2*U_SIZEOF_UCHAR); | |
655 | if(d2==0) { | |
656 | *pErrorCode=U_MEMORY_ALLOCATION_ERROR; | |
657 | goto cleanup; | |
658 | } | |
659 | ||
660 | *pErrorCode=U_ZERO_ERROR; | |
661 | _len2=unorm_internalNormalizeWithNX(d2, _len2, | |
662 | s2, length2, | |
663 | mode, normOptions, nx, | |
664 | pErrorCode); | |
665 | if(U_FAILURE(*pErrorCode)) { | |
666 | goto cleanup; | |
667 | } | |
668 | ||
669 | s2=d2; | |
670 | } | |
671 | length2=_len2; | |
672 | } | |
673 | } | |
674 | ||
675 | if(U_SUCCESS(*pErrorCode)) { | |
676 | result=unorm_cmpEquivFold(s1, length1, s2, length2, options, pErrorCode); | |
677 | } | |
678 | ||
679 | cleanup: | |
680 | if(d1!=0) { | |
681 | uprv_free(d1); | |
682 | } | |
683 | if(d2!=0) { | |
684 | uprv_free(d2); | |
685 | } | |
686 | ||
687 | return result; | |
688 | } | |
689 | ||
690 | #endif /* #if !UCONFIG_NO_NORMALIZATION */ |