2 *******************************************************************************
4 * Copyright (C) 2001-2014, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: unormcmp.cpp
10 * tab size: 8 (not used)
13 * created on: 2004sep13
14 * created by: Markus W. Scherer
16 * unorm_compare() function moved here from unorm.cpp for better modularization.
17 * Depends on both normalization and case folding.
18 * Allows unorm.cpp to not depend on any character properties code.
21 #include "unicode/utypes.h"
23 #if !UCONFIG_NO_NORMALIZATION
25 #include "unicode/unorm.h"
26 #include "unicode/ustring.h"
28 #include "normalizer2impl.h"
35 /* compare canonically equivalent ------------------------------------------- */
38 * Compare two strings for canonical equivalence.
39 * Further options include case-insensitive comparison and
40 * code point order (as opposed to code unit order).
42 * In this function, canonical equivalence is optional as well.
43 * If canonical equivalence is tested, then both strings must fulfill
46 * Semantically, this is equivalent to
47 * strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2)))
48 * where code point order, NFD and foldCase are all optional.
50 * String comparisons almost always yield results before processing both strings
52 * They are generally more efficient working incrementally instead of
53 * performing the sub-processing (strlen, normalization, case-folding)
54 * on the entire strings first.
56 * It is also unnecessary to not normalize identical characters.
58 * This function works in principle as follows:
61 * get one code unit c1 from s1 (-1 if end of source)
62 * get one code unit c2 from s2 (-1 if end of source)
64 * if(either string finished) {
72 * try to decompose/case-fold c1/c2, and continue if one does;
74 * // still c1!=c2 and neither decomposes/case-folds, return result
78 * When a character decomposes, then the pointer for that source changes to
79 * the decomposition, pushing the previous pointer onto a stack.
80 * When the end of the decomposition is reached, then the code unit reader
81 * pops the previous source from the stack.
82 * (Same for case-folding.)
84 * This is complicated further by operating on variable-width UTF-16.
85 * The top part of the loop works on code units, while lookups for decomposition
86 * and case-folding need code points.
87 * Code points are assembled after the equality/end-of-source part.
88 * The source pointer is only advanced beyond all code units when the code point
89 * actually decomposes/case-folds.
91 * If we were on a trail surrogate unit when assembling a code point,
92 * and the code point decomposes/case-folds, then the decomposition/folding
93 * result must be compared with the part of the other string that corresponds to
94 * this string's lead surrogate.
95 * Since we only assemble a code point when hitting a trail unit when the
96 * preceding lead units were identical, we back up the other string by one unit
99 * The optional code point order comparison at the end works with
100 * the same fix-up as the other code point order comparison functions.
101 * See ustring.c and the comment near the end of this function.
103 * Assumption: A decomposition or case-folding result string never contains
104 * a single surrogate. This is a safe assumption in the Unicode Standard.
105 * Therefore, we do not need to check for surrogate pairs across
106 * decomposition/case-folding boundaries.
108 * Further assumptions (see verifications tstnorm.cpp):
109 * The API function checks for FCD first, while the core function
110 * first case-folds and then decomposes. This requires that case-folding does not
111 * un-FCD any strings.
113 * The API function may also NFD the input and turn off decomposition.
114 * This requires that case-folding does not un-NFD strings either.
116 * TODO If any of the above two assumptions is violated,
117 * then this entire code must be re-thought.
118 * If this happens, then a simple solution is to case-fold both strings up front
119 * and to turn off UNORM_INPUT_IS_FCD.
120 * We already do this when not both strings are in FCD because makeFCD
121 * would be a partial NFD before the case folding, which does not work.
122 * Note that all of this is only a problem when case-folding _and_
123 * canonical equivalence come together.
124 * (Comments in unorm_compare() are more up to date than this TODO.)
127 /* stack element for previous-level source/decomposition pointers */
128 struct CmpEquivLevel
{
129 const UChar
*start
, *s
, *limit
;
131 typedef struct CmpEquivLevel CmpEquivLevel
;
134 * Internal option for unorm_cmpEquivFold() for decomposing.
135 * If not set, just do strcasecmp().
137 #define _COMPARE_EQUIV 0x80000
139 /* internal function */
141 unorm_cmpEquivFold(const UChar
*s1
, int32_t length1
,
142 const UChar
*s2
, int32_t length2
,
144 UErrorCode
*pErrorCode
) {
145 const Normalizer2Impl
*nfcImpl
;
146 const UCaseProps
*csp
;
148 /* current-level start/limit - s1/s2 as current */
149 const UChar
*start1
, *start2
, *limit1
, *limit2
;
151 /* decomposition and case folding variables */
155 /* stacks of previous-level start/current/limit */
156 CmpEquivLevel stack1
[2], stack2
[2];
158 /* buffers for algorithmic decompositions */
159 UChar decomp1
[4], decomp2
[4];
161 /* case folding buffers, only use current-level start/limit */
162 UChar fold1
[UCASE_MAX_STRING_LENGTH
+1], fold2
[UCASE_MAX_STRING_LENGTH
+1];
164 /* track which is the current level per string */
165 int32_t level1
, level2
;
167 /* current code units, and code points for lookups */
168 UChar32 c1
, c2
, cp1
, cp2
;
170 /* no argument error checking because this itself is not an API */
173 * assume that at least one of the options _COMPARE_EQUIV and U_COMPARE_IGNORE_CASE is set
174 * otherwise this function must behave exactly as uprv_strCompare()
175 * not checking for that here makes testing this function easier
178 /* normalization/properties data loaded? */
179 if((options
&_COMPARE_EQUIV
)!=0) {
180 nfcImpl
=Normalizer2Factory::getNFCImpl(*pErrorCode
);
184 if((options
&U_COMPARE_IGNORE_CASE
)!=0) {
185 csp
=ucase_getSingleton();
189 if(U_FAILURE(*pErrorCode
)) {
211 /* comparison loop */
214 * here a code unit value of -1 means "get another code unit"
215 * below it will mean "this source is finished"
219 /* get next code unit from string 1, post-increment */
221 if(s1
==limit1
|| ((c1
=*s1
)==0 && (limit1
==NULL
|| (options
&_STRNCMP_STYLE
)))) {
231 /* reached end of level buffer, pop one level */
234 start1
=stack1
[level1
].start
; /*Not uninitialized*/
235 } while(start1
==NULL
);
236 s1
=stack1
[level1
].s
; /*Not uninitialized*/
237 limit1
=stack1
[level1
].limit
; /*Not uninitialized*/
242 /* get next code unit from string 2, post-increment */
244 if(s2
==limit2
|| ((c2
=*s2
)==0 && (limit2
==NULL
|| (options
&_STRNCMP_STYLE
)))) {
254 /* reached end of level buffer, pop one level */
257 start2
=stack2
[level2
].start
; /*Not uninitialized*/
258 } while(start2
==NULL
);
259 s2
=stack2
[level2
].s
; /*Not uninitialized*/
260 limit2
=stack2
[level2
].limit
; /*Not uninitialized*/
266 * either variable c1, c2 is -1 only if the corresponding string is finished
270 return 0; /* c1==c2==-1 indicating end of strings */
272 c1
=c2
=-1; /* make us fetch new code units */
275 return -1; /* string 1 ends before string 2 */
277 return 1; /* string 2 ends before string 1 */
279 /* c1!=c2 && c1>=0 && c2>=0 */
281 /* get complete code points for c1, c2 for lookups if either is a surrogate */
283 if(U_IS_SURROGATE(c1
)) {
286 if(U_IS_SURROGATE_LEAD(c1
)) {
287 if(s1
!=limit1
&& U16_IS_TRAIL(c
=*s1
)) {
288 /* advance ++s1; only below if cp1 decomposes/case-folds */
289 cp1
=U16_GET_SUPPLEMENTARY(c1
, c
);
291 } else /* isTrail(c1) */ {
292 if(start1
<=(s1
-2) && U16_IS_LEAD(c
=*(s1
-2))) {
293 cp1
=U16_GET_SUPPLEMENTARY(c
, c1
);
299 if(U_IS_SURROGATE(c2
)) {
302 if(U_IS_SURROGATE_LEAD(c2
)) {
303 if(s2
!=limit2
&& U16_IS_TRAIL(c
=*s2
)) {
304 /* advance ++s2; only below if cp2 decomposes/case-folds */
305 cp2
=U16_GET_SUPPLEMENTARY(c2
, c
);
307 } else /* isTrail(c2) */ {
308 if(start2
<=(s2
-2) && U16_IS_LEAD(c
=*(s2
-2))) {
309 cp2
=U16_GET_SUPPLEMENTARY(c
, c2
);
315 * go down one level for each string
316 * continue with the main loop as soon as there is a real change
319 if( level1
==0 && (options
&U_COMPARE_IGNORE_CASE
) &&
320 (length
=ucase_toFullFolding(csp
, (UChar32
)cp1
, &p
, options
))>=0
322 /* cp1 case-folds to the code point "length" or to p[length] */
323 if(U_IS_SURROGATE(c1
)) {
324 if(U_IS_SURROGATE_LEAD(c1
)) {
325 /* advance beyond source surrogate pair if it case-folds */
327 } else /* isTrail(c1) */ {
329 * we got a supplementary code point when hitting its trail surrogate,
330 * therefore the lead surrogate must have been the same as in the other string;
331 * compare this decomposition with the lead surrogate in the other string
332 * remember that this simulates bulk text replacement:
333 * the decomposition would replace the entire code point
340 /* push current level pointers */
341 stack1
[0].start
=start1
;
343 stack1
[0].limit
=limit1
;
346 /* copy the folding result to fold1[] */
347 if(length
<=UCASE_MAX_STRING_LENGTH
) {
348 u_memcpy(fold1
, p
, length
);
351 U16_APPEND_UNSAFE(fold1
, i
, length
);
355 /* set next level pointers to case folding */
359 /* get ready to read from decomposition, continue with loop */
364 if( level2
==0 && (options
&U_COMPARE_IGNORE_CASE
) &&
365 (length
=ucase_toFullFolding(csp
, (UChar32
)cp2
, &p
, options
))>=0
367 /* cp2 case-folds to the code point "length" or to p[length] */
368 if(U_IS_SURROGATE(c2
)) {
369 if(U_IS_SURROGATE_LEAD(c2
)) {
370 /* advance beyond source surrogate pair if it case-folds */
372 } else /* isTrail(c2) */ {
374 * we got a supplementary code point when hitting its trail surrogate,
375 * therefore the lead surrogate must have been the same as in the other string;
376 * compare this decomposition with the lead surrogate in the other string
377 * remember that this simulates bulk text replacement:
378 * the decomposition would replace the entire code point
385 /* push current level pointers */
386 stack2
[0].start
=start2
;
388 stack2
[0].limit
=limit2
;
391 /* copy the folding result to fold2[] */
392 if(length
<=UCASE_MAX_STRING_LENGTH
) {
393 u_memcpy(fold2
, p
, length
);
396 U16_APPEND_UNSAFE(fold2
, i
, length
);
400 /* set next level pointers to case folding */
404 /* get ready to read from decomposition, continue with loop */
409 if( level1
<2 && (options
&_COMPARE_EQUIV
) &&
410 0!=(p
=nfcImpl
->getDecomposition((UChar32
)cp1
, decomp1
, length
))
412 /* cp1 decomposes into p[length] */
413 if(U_IS_SURROGATE(c1
)) {
414 if(U_IS_SURROGATE_LEAD(c1
)) {
415 /* advance beyond source surrogate pair if it decomposes */
417 } else /* isTrail(c1) */ {
419 * we got a supplementary code point when hitting its trail surrogate,
420 * therefore the lead surrogate must have been the same as in the other string;
421 * compare this decomposition with the lead surrogate in the other string
422 * remember that this simulates bulk text replacement:
423 * the decomposition would replace the entire code point
430 /* push current level pointers */
431 stack1
[level1
].start
=start1
;
433 stack1
[level1
].limit
=limit1
;
436 /* set empty intermediate level if skipped */
438 stack1
[level1
++].start
=NULL
;
441 /* set next level pointers to decomposition */
445 /* get ready to read from decomposition, continue with loop */
450 if( level2
<2 && (options
&_COMPARE_EQUIV
) &&
451 0!=(p
=nfcImpl
->getDecomposition((UChar32
)cp2
, decomp2
, length
))
453 /* cp2 decomposes into p[length] */
454 if(U_IS_SURROGATE(c2
)) {
455 if(U_IS_SURROGATE_LEAD(c2
)) {
456 /* advance beyond source surrogate pair if it decomposes */
458 } else /* isTrail(c2) */ {
460 * we got a supplementary code point when hitting its trail surrogate,
461 * therefore the lead surrogate must have been the same as in the other string;
462 * compare this decomposition with the lead surrogate in the other string
463 * remember that this simulates bulk text replacement:
464 * the decomposition would replace the entire code point
471 /* push current level pointers */
472 stack2
[level2
].start
=start2
;
474 stack2
[level2
].limit
=limit2
;
477 /* set empty intermediate level if skipped */
479 stack2
[level2
++].start
=NULL
;
482 /* set next level pointers to decomposition */
486 /* get ready to read from decomposition, continue with loop */
492 * no decomposition/case folding, max level for both sides:
493 * return difference result
495 * code point order comparison must not just return cp1-cp2
496 * because when single surrogates are present then the surrogate pairs
497 * that formed cp1 and cp2 may be from different string indexes
499 * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
500 * c1=d800 cp1=10001 c2=dc00 cp2=10000
501 * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
503 * therefore, use same fix-up as in ustring.c/uprv_strCompare()
504 * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++
505 * so we have slightly different pointer/start/limit comparisons here
508 if(c1
>=0xd800 && c2
>=0xd800 && (options
&U_COMPARE_CODE_POINT_ORDER
)) {
509 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
511 (c1
<=0xdbff && s1
!=limit1
&& U16_IS_TRAIL(*s1
)) ||
512 (U16_IS_TRAIL(c1
) && start1
!=(s1
-1) && U16_IS_LEAD(*(s1
-2)))
514 /* part of a surrogate pair, leave >=d800 */
516 /* BMP code point - may be surrogate code point - make <d800 */
521 (c2
<=0xdbff && s2
!=limit2
&& U16_IS_TRAIL(*s2
)) ||
522 (U16_IS_TRAIL(c2
) && start2
!=(s2
-1) && U16_IS_LEAD(*(s2
-2)))
524 /* part of a surrogate pair, leave >=d800 */
526 /* BMP code point - may be surrogate code point - make <d800 */
536 UBool
_normalize(const Normalizer2
*n2
, const UChar
*s
, int32_t length
,
537 UnicodeString
&normalized
, UErrorCode
*pErrorCode
) {
538 UnicodeString
str(length
<0, s
, length
);
540 // check if s fulfill the conditions
541 int32_t spanQCYes
=n2
->spanQuickCheckYes(str
, *pErrorCode
);
542 if (U_FAILURE(*pErrorCode
)) {
546 * ICU 2.4 had a further optimization:
547 * If both strings were not in FCD, then they were both NFD'ed,
548 * and the _COMPARE_EQUIV option was turned off.
549 * It is not entirely clear that this is valid with the current
550 * definition of the canonical caseless match.
551 * Therefore, ICU 2.6 removes that optimization.
553 if(spanQCYes
<str
.length()) {
554 UnicodeString unnormalized
=str
.tempSubString(spanQCYes
);
555 normalized
.setTo(FALSE
, str
.getBuffer(), spanQCYes
);
556 n2
->normalizeSecondAndAppend(normalized
, unnormalized
, *pErrorCode
);
557 if (U_SUCCESS(*pErrorCode
)) {
564 U_CAPI
int32_t U_EXPORT2
565 unorm_compare(const UChar
*s1
, int32_t length1
,
566 const UChar
*s2
, int32_t length2
,
568 UErrorCode
*pErrorCode
) {
569 /* argument checking */
570 if(U_FAILURE(*pErrorCode
)) {
573 if(s1
==0 || length1
<-1 || s2
==0 || length2
<-1) {
574 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
578 UnicodeString fcd1
, fcd2
;
579 int32_t normOptions
=(int32_t)(options
>>UNORM_COMPARE_NORM_OPTIONS_SHIFT
);
580 options
|=_COMPARE_EQUIV
;
583 * UAX #21 Case Mappings, as fixed for Unicode version 4
584 * (see Jitterbug 2021), defines a canonical caseless match as
586 * A string X is a canonical caseless match
587 * for a string Y if and only if
588 * NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y)))
590 * For better performance, we check for FCD (or let the caller tell us that
591 * both strings are in FCD) for the inner normalization.
592 * BasicNormalizerTest::FindFoldFCDExceptions() makes sure that
593 * case-folding preserves the FCD-ness of a string.
594 * The outer normalization is then only performed by unorm_cmpEquivFold()
595 * when there is a difference.
597 * Exception: When using the Turkic case-folding option, we do perform
598 * full NFD first. This is because in the Turkic case precomposed characters
599 * with 0049 capital I or 0069 small i fold differently whether they
600 * are first decomposed or not, so an FCD check - a check only for
601 * canonical order - is not sufficient.
603 if(!(options
&UNORM_INPUT_IS_FCD
) || (options
&U_FOLD_CASE_EXCLUDE_SPECIAL_I
)) {
604 const Normalizer2
*n2
;
605 if(options
&U_FOLD_CASE_EXCLUDE_SPECIAL_I
) {
606 n2
=Normalizer2::getNFDInstance(*pErrorCode
);
608 n2
=Normalizer2Factory::getFCDInstance(*pErrorCode
);
610 if (U_FAILURE(*pErrorCode
)) {
614 if(normOptions
&UNORM_UNICODE_3_2
) {
615 const UnicodeSet
*uni32
=uniset_getUnicode32Instance(*pErrorCode
);
616 FilteredNormalizer2
fn2(*n2
, *uni32
);
617 if(_normalize(&fn2
, s1
, length1
, fcd1
, pErrorCode
)) {
619 length1
=fcd1
.length();
621 if(_normalize(&fn2
, s2
, length2
, fcd2
, pErrorCode
)) {
623 length2
=fcd2
.length();
626 if(_normalize(n2
, s1
, length1
, fcd1
, pErrorCode
)) {
628 length1
=fcd1
.length();
630 if(_normalize(n2
, s2
, length2
, fcd2
, pErrorCode
)) {
632 length2
=fcd2
.length();
637 if(U_SUCCESS(*pErrorCode
)) {
638 return unorm_cmpEquivFold(s1
, length1
, s2
, length2
, options
, pErrorCode
);
644 #endif /* #if !UCONFIG_NO_NORMALIZATION */