]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/unormcmp.cpp
ICU-400.42.tar.gz
[apple/icu.git] / icuSources / common / unormcmp.cpp
CommitLineData
374ca955
A
1/*
2*******************************************************************************
3*
46f4442e 4* Copyright (C) 2001-2006, International Business Machines
374ca955
A
5* Corporation and others. All Rights Reserved.
6*
7*******************************************************************************
8* file name: unormcmp.cpp
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 2004sep13
14* created by: Markus W. Scherer
15*
16* unorm_compare() function moved here from unorm.cpp for better modularization.
17* Depends on both normalization and case folding.
18* Allows unorm.cpp to not depend on any character properties code.
19*/
20
21#include "unicode/utypes.h"
22
23#if !UCONFIG_NO_NORMALIZATION
24
25#include "unicode/ustring.h"
26#include "unicode/unorm.h"
27#include "unicode/uniset.h"
28#include "unormimp.h"
29#include "ucase.h"
30#include "cmemory.h"
31
46f4442e
A
32U_NAMESPACE_USE
33
374ca955
A
34#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
35
36/* compare canonically equivalent ------------------------------------------- */
37
38/*
39 * Compare two strings for canonical equivalence.
40 * Further options include case-insensitive comparison and
41 * code point order (as opposed to code unit order).
42 *
43 * In this function, canonical equivalence is optional as well.
44 * If canonical equivalence is tested, then both strings must fulfill
45 * the FCD check.
46 *
47 * Semantically, this is equivalent to
48 * strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2)))
49 * where code point order, NFD and foldCase are all optional.
50 *
51 * String comparisons almost always yield results before processing both strings
52 * completely.
53 * They are generally more efficient working incrementally instead of
54 * performing the sub-processing (strlen, normalization, case-folding)
55 * on the entire strings first.
56 *
57 * It is also unnecessary to not normalize identical characters.
58 *
59 * This function works in principle as follows:
60 *
61 * loop {
62 * get one code unit c1 from s1 (-1 if end of source)
63 * get one code unit c2 from s2 (-1 if end of source)
64 *
65 * if(either string finished) {
66 * return result;
67 * }
68 * if(c1==c2) {
69 * continue;
70 * }
71 *
72 * // c1!=c2
73 * try to decompose/case-fold c1/c2, and continue if one does;
74 *
75 * // still c1!=c2 and neither decomposes/case-folds, return result
76 * return c1-c2;
77 * }
78 *
79 * When a character decomposes, then the pointer for that source changes to
80 * the decomposition, pushing the previous pointer onto a stack.
81 * When the end of the decomposition is reached, then the code unit reader
82 * pops the previous source from the stack.
83 * (Same for case-folding.)
84 *
85 * This is complicated further by operating on variable-width UTF-16.
86 * The top part of the loop works on code units, while lookups for decomposition
87 * and case-folding need code points.
88 * Code points are assembled after the equality/end-of-source part.
89 * The source pointer is only advanced beyond all code units when the code point
90 * actually decomposes/case-folds.
91 *
92 * If we were on a trail surrogate unit when assembling a code point,
93 * and the code point decomposes/case-folds, then the decomposition/folding
94 * result must be compared with the part of the other string that corresponds to
95 * this string's lead surrogate.
96 * Since we only assemble a code point when hitting a trail unit when the
97 * preceding lead units were identical, we back up the other string by one unit
98 * in such a case.
99 *
100 * The optional code point order comparison at the end works with
101 * the same fix-up as the other code point order comparison functions.
102 * See ustring.c and the comment near the end of this function.
103 *
104 * Assumption: A decomposition or case-folding result string never contains
105 * a single surrogate. This is a safe assumption in the Unicode Standard.
106 * Therefore, we do not need to check for surrogate pairs across
107 * decomposition/case-folding boundaries.
108 *
109 * Further assumptions (see verifications tstnorm.cpp):
110 * The API function checks for FCD first, while the core function
111 * first case-folds and then decomposes. This requires that case-folding does not
112 * un-FCD any strings.
113 *
114 * The API function may also NFD the input and turn off decomposition.
115 * This requires that case-folding does not un-NFD strings either.
116 *
117 * TODO If any of the above two assumptions is violated,
118 * then this entire code must be re-thought.
119 * If this happens, then a simple solution is to case-fold both strings up front
120 * and to turn off UNORM_INPUT_IS_FCD.
121 * We already do this when not both strings are in FCD because makeFCD
122 * would be a partial NFD before the case folding, which does not work.
123 * Note that all of this is only a problem when case-folding _and_
124 * canonical equivalence come together.
125 * (Comments in unorm_compare() are more up to date than this TODO.)
126 *
127 * This function could be moved to a different source file, at increased cost
128 * for calling the decomposition access function.
129 */
130
131/* stack element for previous-level source/decomposition pointers */
132struct CmpEquivLevel {
133 const UChar *start, *s, *limit;
134};
135typedef struct CmpEquivLevel CmpEquivLevel;
136
137/* internal function */
138static int32_t
139unorm_cmpEquivFold(const UChar *s1, int32_t length1,
140 const UChar *s2, int32_t length2,
141 uint32_t options,
142 UErrorCode *pErrorCode) {
73c04bcf 143 const UCaseProps *csp;
374ca955
A
144
145 /* current-level start/limit - s1/s2 as current */
146 const UChar *start1, *start2, *limit1, *limit2;
147
148 /* decomposition and case folding variables */
149 const UChar *p;
150 int32_t length;
151
152 /* stacks of previous-level start/current/limit */
153 CmpEquivLevel stack1[2], stack2[2];
154
155 /* decomposition buffers for Hangul */
156 UChar decomp1[4], decomp2[4];
157
158 /* case folding buffers, only use current-level start/limit */
159 UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1];
160
161 /* track which is the current level per string */
162 int32_t level1, level2;
163
164 /* current code units, and code points for lookups */
165 UChar32 c1, c2, cp1, cp2;
166
167 /* no argument error checking because this itself is not an API */
168
169 /*
170 * assume that at least one of the options _COMPARE_EQUIV and U_COMPARE_IGNORE_CASE is set
171 * otherwise this function must behave exactly as uprv_strCompare()
172 * not checking for that here makes testing this function easier
173 */
174
175 /* normalization/properties data loaded? */
176 if( ((options&_COMPARE_EQUIV)!=0 && !unorm_haveData(pErrorCode)) ||
177 U_FAILURE(*pErrorCode)
178 ) {
179 return 0;
180 }
181 if((options&U_COMPARE_IGNORE_CASE)!=0) {
182 csp=ucase_getSingleton(pErrorCode);
183 if(U_FAILURE(*pErrorCode)) {
184 return 0;
185 }
186 } else {
187 csp=NULL;
188 }
189
190 /* initialize */
191 start1=s1;
192 if(length1==-1) {
193 limit1=NULL;
194 } else {
195 limit1=s1+length1;
196 }
197
198 start2=s2;
199 if(length2==-1) {
200 limit2=NULL;
201 } else {
202 limit2=s2+length2;
203 }
204
205 level1=level2=0;
206 c1=c2=-1;
207
208 /* comparison loop */
209 for(;;) {
210 /*
211 * here a code unit value of -1 means "get another code unit"
212 * below it will mean "this source is finished"
213 */
214
215 if(c1<0) {
216 /* get next code unit from string 1, post-increment */
217 for(;;) {
218 if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) {
219 if(level1==0) {
220 c1=-1;
221 break;
222 }
223 } else {
224 ++s1;
225 break;
226 }
227
228 /* reached end of level buffer, pop one level */
229 do {
230 --level1;
231 start1=stack1[level1].start;
232 } while(start1==NULL);
233 s1=stack1[level1].s;
234 limit1=stack1[level1].limit;
235 }
236 }
237
238 if(c2<0) {
239 /* get next code unit from string 2, post-increment */
240 for(;;) {
241 if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) {
242 if(level2==0) {
243 c2=-1;
244 break;
245 }
246 } else {
247 ++s2;
248 break;
249 }
250
251 /* reached end of level buffer, pop one level */
252 do {
253 --level2;
254 start2=stack2[level2].start;
255 } while(start2==NULL);
256 s2=stack2[level2].s;
257 limit2=stack2[level2].limit;
258 }
259 }
260
261 /*
262 * compare c1 and c2
263 * either variable c1, c2 is -1 only if the corresponding string is finished
264 */
265 if(c1==c2) {
266 if(c1<0) {
267 return 0; /* c1==c2==-1 indicating end of strings */
268 }
269 c1=c2=-1; /* make us fetch new code units */
270 continue;
271 } else if(c1<0) {
272 return -1; /* string 1 ends before string 2 */
273 } else if(c2<0) {
274 return 1; /* string 2 ends before string 1 */
275 }
276 /* c1!=c2 && c1>=0 && c2>=0 */
277
278 /* get complete code points for c1, c2 for lookups if either is a surrogate */
279 cp1=c1;
280 if(U_IS_SURROGATE(c1)) {
281 UChar c;
282
283 if(U_IS_SURROGATE_LEAD(c1)) {
284 if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) {
285 /* advance ++s1; only below if cp1 decomposes/case-folds */
286 cp1=U16_GET_SUPPLEMENTARY(c1, c);
287 }
288 } else /* isTrail(c1) */ {
289 if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) {
290 cp1=U16_GET_SUPPLEMENTARY(c, c1);
291 }
292 }
293 }
294
295 cp2=c2;
296 if(U_IS_SURROGATE(c2)) {
297 UChar c;
298
299 if(U_IS_SURROGATE_LEAD(c2)) {
300 if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) {
301 /* advance ++s2; only below if cp2 decomposes/case-folds */
302 cp2=U16_GET_SUPPLEMENTARY(c2, c);
303 }
304 } else /* isTrail(c2) */ {
305 if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) {
306 cp2=U16_GET_SUPPLEMENTARY(c, c2);
307 }
308 }
309 }
310
311 /*
312 * go down one level for each string
313 * continue with the main loop as soon as there is a real change
314 */
315
316 if( level1==0 && (options&U_COMPARE_IGNORE_CASE) &&
317 (length=ucase_toFullFolding(csp, (UChar32)cp1, &p, options))>=0
318 ) {
319 /* cp1 case-folds to the code point "length" or to p[length] */
320 if(U_IS_SURROGATE(c1)) {
321 if(U_IS_SURROGATE_LEAD(c1)) {
322 /* advance beyond source surrogate pair if it case-folds */
323 ++s1;
324 } else /* isTrail(c1) */ {
325 /*
326 * we got a supplementary code point when hitting its trail surrogate,
327 * therefore the lead surrogate must have been the same as in the other string;
328 * compare this decomposition with the lead surrogate in the other string
329 * remember that this simulates bulk text replacement:
330 * the decomposition would replace the entire code point
331 */
332 --s2;
333 c2=*(s2-1);
334 }
335 }
336
337 /* push current level pointers */
338 stack1[0].start=start1;
339 stack1[0].s=s1;
340 stack1[0].limit=limit1;
341 ++level1;
342
343 /* copy the folding result to fold1[] */
344 if(length<=UCASE_MAX_STRING_LENGTH) {
345 u_memcpy(fold1, p, length);
346 } else {
347 int32_t i=0;
348 U16_APPEND_UNSAFE(fold1, i, length);
349 length=i;
350 }
351
352 /* set next level pointers to case folding */
353 start1=s1=fold1;
354 limit1=fold1+length;
355
356 /* get ready to read from decomposition, continue with loop */
357 c1=-1;
358 continue;
359 }
360
361 if( level2==0 && (options&U_COMPARE_IGNORE_CASE) &&
362 (length=ucase_toFullFolding(csp, (UChar32)cp2, &p, options))>=0
363 ) {
364 /* cp2 case-folds to the code point "length" or to p[length] */
365 if(U_IS_SURROGATE(c2)) {
366 if(U_IS_SURROGATE_LEAD(c2)) {
367 /* advance beyond source surrogate pair if it case-folds */
368 ++s2;
369 } else /* isTrail(c2) */ {
370 /*
371 * we got a supplementary code point when hitting its trail surrogate,
372 * therefore the lead surrogate must have been the same as in the other string;
373 * compare this decomposition with the lead surrogate in the other string
374 * remember that this simulates bulk text replacement:
375 * the decomposition would replace the entire code point
376 */
377 --s1;
378 c1=*(s1-1);
379 }
380 }
381
382 /* push current level pointers */
383 stack2[0].start=start2;
384 stack2[0].s=s2;
385 stack2[0].limit=limit2;
386 ++level2;
387
388 /* copy the folding result to fold2[] */
389 if(length<=UCASE_MAX_STRING_LENGTH) {
390 u_memcpy(fold2, p, length);
391 } else {
392 int32_t i=0;
393 U16_APPEND_UNSAFE(fold2, i, length);
394 length=i;
395 }
396
397 /* set next level pointers to case folding */
398 start2=s2=fold2;
399 limit2=fold2+length;
400
401 /* get ready to read from decomposition, continue with loop */
402 c2=-1;
403 continue;
404 }
405
406 if( level1<2 && (options&_COMPARE_EQUIV) &&
407 0!=(p=unorm_getCanonicalDecomposition((UChar32)cp1, decomp1, &length))
408 ) {
409 /* cp1 decomposes into p[length] */
410 if(U_IS_SURROGATE(c1)) {
411 if(U_IS_SURROGATE_LEAD(c1)) {
412 /* advance beyond source surrogate pair if it decomposes */
413 ++s1;
414 } else /* isTrail(c1) */ {
415 /*
416 * we got a supplementary code point when hitting its trail surrogate,
417 * therefore the lead surrogate must have been the same as in the other string;
418 * compare this decomposition with the lead surrogate in the other string
419 * remember that this simulates bulk text replacement:
420 * the decomposition would replace the entire code point
421 */
422 --s2;
423 c2=*(s2-1);
424 }
425 }
426
427 /* push current level pointers */
428 stack1[level1].start=start1;
429 stack1[level1].s=s1;
430 stack1[level1].limit=limit1;
431 ++level1;
432
433 /* set empty intermediate level if skipped */
434 if(level1<2) {
435 stack1[level1++].start=NULL;
436 }
437
438 /* set next level pointers to decomposition */
439 start1=s1=p;
440 limit1=p+length;
441
442 /* get ready to read from decomposition, continue with loop */
443 c1=-1;
444 continue;
445 }
446
447 if( level2<2 && (options&_COMPARE_EQUIV) &&
448 0!=(p=unorm_getCanonicalDecomposition((UChar32)cp2, decomp2, &length))
449 ) {
450 /* cp2 decomposes into p[length] */
451 if(U_IS_SURROGATE(c2)) {
452 if(U_IS_SURROGATE_LEAD(c2)) {
453 /* advance beyond source surrogate pair if it decomposes */
454 ++s2;
455 } else /* isTrail(c2) */ {
456 /*
457 * we got a supplementary code point when hitting its trail surrogate,
458 * therefore the lead surrogate must have been the same as in the other string;
459 * compare this decomposition with the lead surrogate in the other string
460 * remember that this simulates bulk text replacement:
461 * the decomposition would replace the entire code point
462 */
463 --s1;
464 c1=*(s1-1);
465 }
466 }
467
468 /* push current level pointers */
469 stack2[level2].start=start2;
470 stack2[level2].s=s2;
471 stack2[level2].limit=limit2;
472 ++level2;
473
474 /* set empty intermediate level if skipped */
475 if(level2<2) {
476 stack2[level2++].start=NULL;
477 }
478
479 /* set next level pointers to decomposition */
480 start2=s2=p;
481 limit2=p+length;
482
483 /* get ready to read from decomposition, continue with loop */
484 c2=-1;
485 continue;
486 }
487
488 /*
489 * no decomposition/case folding, max level for both sides:
490 * return difference result
491 *
492 * code point order comparison must not just return cp1-cp2
493 * because when single surrogates are present then the surrogate pairs
494 * that formed cp1 and cp2 may be from different string indexes
495 *
496 * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
497 * c1=d800 cp1=10001 c2=dc00 cp2=10000
498 * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
499 *
500 * therefore, use same fix-up as in ustring.c/uprv_strCompare()
501 * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++
502 * so we have slightly different pointer/start/limit comparisons here
503 */
504
505 if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) {
506 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
507 if(
508 (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) ||
509 (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2)))
510 ) {
511 /* part of a surrogate pair, leave >=d800 */
512 } else {
513 /* BMP code point - may be surrogate code point - make <d800 */
514 c1-=0x2800;
515 }
516
517 if(
518 (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) ||
519 (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2)))
520 ) {
521 /* part of a surrogate pair, leave >=d800 */
522 } else {
523 /* BMP code point - may be surrogate code point - make <d800 */
524 c2-=0x2800;
525 }
526 }
527
528 return c1-c2;
529 }
530}
531
532U_CAPI int32_t U_EXPORT2
533unorm_compare(const UChar *s1, int32_t length1,
534 const UChar *s2, int32_t length2,
535 uint32_t options,
536 UErrorCode *pErrorCode) {
537 UChar fcd1[300], fcd2[300];
538 UChar *d1, *d2;
539 const UnicodeSet *nx;
540 UNormalizationMode mode;
541 int32_t normOptions;
542 int32_t result;
543
544 /* argument checking */
545 if(pErrorCode==0 || U_FAILURE(*pErrorCode)) {
546 return 0;
547 }
548 if(s1==0 || length1<-1 || s2==0 || length2<-1) {
549 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
550 return 0;
551 }
552
553 if(!unorm_haveData(pErrorCode)) {
554 return 0;
555 }
556 if(!uprv_haveProperties(pErrorCode)) {
557 return 0;
558 }
559
560 normOptions=(int32_t)(options>>UNORM_COMPARE_NORM_OPTIONS_SHIFT);
561 nx=unorm_getNX(normOptions, pErrorCode);
562 if(U_FAILURE(*pErrorCode)) {
563 return 0;
564 }
565
566 d1=d2=0;
567 options|=_COMPARE_EQUIV;
568 result=0;
569
570 /*
571 * UAX #21 Case Mappings, as fixed for Unicode version 4
572 * (see Jitterbug 2021), defines a canonical caseless match as
573 *
574 * A string X is a canonical caseless match
575 * for a string Y if and only if
576 * NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y)))
577 *
578 * For better performance, we check for FCD (or let the caller tell us that
579 * both strings are in FCD) for the inner normalization.
580 * BasicNormalizerTest::FindFoldFCDExceptions() makes sure that
581 * case-folding preserves the FCD-ness of a string.
582 * The outer normalization is then only performed by unorm_cmpEquivFold()
583 * when there is a difference.
584 *
585 * Exception: When using the Turkic case-folding option, we do perform
586 * full NFD first. This is because in the Turkic case precomposed characters
587 * with 0049 capital I or 0069 small i fold differently whether they
588 * are first decomposed or not, so an FCD check - a check only for
589 * canonical order - is not sufficient.
590 */
591 if(options&U_FOLD_CASE_EXCLUDE_SPECIAL_I) {
592 mode=UNORM_NFD;
593 options&=~UNORM_INPUT_IS_FCD;
594 } else {
595 mode=UNORM_FCD;
596 }
597
598 if(!(options&UNORM_INPUT_IS_FCD)) {
599 int32_t _len1, _len2;
600 UBool isFCD1, isFCD2;
601
602 // check if s1 and/or s2 fulfill the FCD conditions
603 isFCD1= UNORM_YES==unorm_internalQuickCheck(s1, length1, mode, TRUE, nx, pErrorCode);
604 isFCD2= UNORM_YES==unorm_internalQuickCheck(s2, length2, mode, TRUE, nx, pErrorCode);
605 if(U_FAILURE(*pErrorCode)) {
606 return 0;
607 }
608
609 /*
610 * ICU 2.4 had a further optimization:
611 * If both strings were not in FCD, then they were both NFD'ed,
612 * and the _COMPARE_EQUIV option was turned off.
613 * It is not entirely clear that this is valid with the current
614 * definition of the canonical caseless match.
615 * Therefore, ICU 2.6 removes that optimization.
616 */
617
618 if(!isFCD1) {
619 _len1=unorm_internalNormalizeWithNX(fcd1, LENGTHOF(fcd1),
620 s1, length1,
621 mode, normOptions, nx,
622 pErrorCode);
623 if(*pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
624 s1=fcd1;
625 } else {
626 d1=(UChar *)uprv_malloc(_len1*U_SIZEOF_UCHAR);
627 if(d1==0) {
628 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
629 goto cleanup;
630 }
631
632 *pErrorCode=U_ZERO_ERROR;
633 _len1=unorm_internalNormalizeWithNX(d1, _len1,
634 s1, length1,
635 mode, normOptions, nx,
636 pErrorCode);
637 if(U_FAILURE(*pErrorCode)) {
638 goto cleanup;
639 }
640
641 s1=d1;
642 }
643 length1=_len1;
644 }
645
646 if(!isFCD2) {
647 _len2=unorm_internalNormalizeWithNX(fcd2, LENGTHOF(fcd2),
648 s2, length2,
649 mode, normOptions, nx,
650 pErrorCode);
651 if(*pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
652 s2=fcd2;
653 } else {
654 d2=(UChar *)uprv_malloc(_len2*U_SIZEOF_UCHAR);
655 if(d2==0) {
656 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
657 goto cleanup;
658 }
659
660 *pErrorCode=U_ZERO_ERROR;
661 _len2=unorm_internalNormalizeWithNX(d2, _len2,
662 s2, length2,
663 mode, normOptions, nx,
664 pErrorCode);
665 if(U_FAILURE(*pErrorCode)) {
666 goto cleanup;
667 }
668
669 s2=d2;
670 }
671 length2=_len2;
672 }
673 }
674
675 if(U_SUCCESS(*pErrorCode)) {
676 result=unorm_cmpEquivFold(s1, length1, s2, length2, options, pErrorCode);
677 }
678
679cleanup:
680 if(d1!=0) {
681 uprv_free(d1);
682 }
683 if(d2!=0) {
684 uprv_free(d2);
685 }
686
687 return result;
688}
689
690#endif /* #if !UCONFIG_NO_NORMALIZATION */