-/* compare canonically equivalent ------------------------------------------- */
-
-#else
-
-/*
- * Normalization is not built into the ICU library, but case-insensitive
- * comparisons are possible using unorm_cmpEquivFold().
- * The following simply disables the decomposition part.
- */
-
-static inline UBool
-_haveData(UErrorCode &errorCode) {
- if(U_SUCCESS(errorCode)) {
- errorCode=U_INTERNAL_PROGRAM_ERROR;
- }
- return FALSE;
-}
-
-static inline const UChar *
-_decompose(UChar32 /*c*/, UChar /*buffer*/[4], int32_t &/*length*/) {
- return NULL;
-}
-
-#endif /* #if !UCONFIG_NO_NORMALIZATION */
-
-/*
- * Compare two strings for canonical equivalence.
- * Further options include case-insensitive comparison and
- * code point order (as opposed to code unit order).
- *
- * In this function, canonical equivalence is optional as well.
- * If canonical equivalence is tested, then both strings must fulfill
- * the FCD check.
- *
- * Semantically, this is equivalent to
- * strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2)))
- * where code point order, NFD and foldCase are all optional.
- *
- * String comparisons almost always yield results before processing both strings
- * completely.
- * They are generally more efficient working incrementally instead of
- * performing the sub-processing (strlen, normalization, case-folding)
- * on the entire strings first.
- *
- * It is also unnecessary to not normalize identical characters.
- *
- * This function works in principle as follows:
- *
- * loop {
- * get one code unit c1 from s1 (-1 if end of source)
- * get one code unit c2 from s2 (-1 if end of source)
- *
- * if(either string finished) {
- * return result;
- * }
- * if(c1==c2) {
- * continue;
- * }
- *
- * // c1!=c2
- * try to decompose/case-fold c1/c2, and continue if one does;
- *
- * // still c1!=c2 and neither decomposes/case-folds, return result
- * return c1-c2;
- * }
- *
- * When a character decomposes, then the pointer for that source changes to
- * the decomposition, pushing the previous pointer onto a stack.
- * When the end of the decomposition is reached, then the code unit reader
- * pops the previous source from the stack.
- * (Same for case-folding.)
- *
- * This is complicated further by operating on variable-width UTF-16.
- * The top part of the loop works on code units, while lookups for decomposition
- * and case-folding need code points.
- * Code points are assembled after the equality/end-of-source part.
- * The source pointer is only advanced beyond all code units when the code point
- * actually decomposes/case-folds.
- *
- * If we were on a trail surrogate unit when assembling a code point,
- * and the code point decomposes/case-folds, then the decomposition/folding
- * result must be compared with the part of the other string that corresponds to
- * this string's lead surrogate.
- * Since we only assemble a code point when hitting a trail unit when the
- * preceding lead units were identical, we back up the other string by one unit
- * in such a case.
- *
- * The optional code point order comparison at the end works with
- * the same fix-up as the other code point order comparison functions.
- * See ustring.c and the comment near the end of this function.
- *
- * Assumption: A decomposition or case-folding result string never contains
- * a single surrogate. This is a safe assumption in the Unicode Standard.
- * Therefore, we do not need to check for surrogate pairs across
- * decomposition/case-folding boundaries.
- *
- * Further assumptions (see verifications tstnorm.cpp):
- * The API function checks for FCD first, while the core function
- * first case-folds and then decomposes. This requires that case-folding does not
- * un-FCD any strings.
- *
- * The API function may also NFD the input and turn off decomposition.
- * This requires that case-folding does not un-NFD strings either.
- *
- * TODO If any of the above two assumptions is violated,
- * then this entire code must be re-thought.
- * If this happens, then a simple solution is to case-fold both strings up front
- * and to turn off UNORM_INPUT_IS_FCD.
- * We already do this when not both strings are in FCD because makeFCD
- * would be a partial NFD before the case folding, which does not work.
- * Note that all of this is only a problem when case-folding _and_
- * canonical equivalence come together.
- *
- * This function could be moved to a different source file, at increased cost
- * for calling the decomposition access function.
- */
-
-// stack element for previous-level source/decomposition pointers
-struct CmpEquivLevel {
- const UChar *start, *s, *limit;
-};
-typedef struct CmpEquivLevel CmpEquivLevel;
-
-// internal function
-U_CAPI int32_t U_EXPORT2
-unorm_cmpEquivFold(const UChar *s1, int32_t length1,
- const UChar *s2, int32_t length2,
- uint32_t options,
- UErrorCode *pErrorCode) {
- // current-level start/limit - s1/s2 as current
- const UChar *start1, *start2, *limit1, *limit2;
-
- // decomposition variables
- const UChar *p;
- int32_t length;
-
- // stacks of previous-level start/current/limit
- CmpEquivLevel stack1[2], stack2[2];
-
- // decomposition buffers for Hangul
- UChar decomp1[4], decomp2[4];
-
- // case folding buffers, only use current-level start/limit
- UChar fold1[32], fold2[32];
-
- // track which is the current level per string
- int32_t level1, level2;
-
- // current code units, and code points for lookups
- int32_t c1, c2, cp1, cp2;
-
- // no argument error checking because this itself is not an API
-
- // assume that at least one of the options _COMPARE_EQUIV and U_COMPARE_IGNORE_CASE is set
- // otherwise this function must behave exactly as uprv_strCompare()
- // not checking for that here makes testing this function easier
-
- // normalization/properties data loaded?
- if( ((options&_COMPARE_EQUIV)!=0 && !_haveData(*pErrorCode)) ||
- ((options&U_COMPARE_IGNORE_CASE)!=0 && !uprv_haveProperties(pErrorCode))
- ) {
- return 0;
- }
-
- // initialize
- start1=s1;
- if(length1==-1) {
- limit1=NULL;
- } else {
- limit1=s1+length1;
- }
-
- start2=s2;
- if(length2==-1) {
- limit2=NULL;
- } else {
- limit2=s2+length2;
- }
-
- level1=level2=0;
- c1=c2=-1;
-
- // comparison loop
- for(;;) {
- // here a code unit value of -1 means "get another code unit"
- // below it will mean "this source is finished"
-
- if(c1<0) {
- // get next code unit from string 1, post-increment
- for(;;) {
- if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) {
- if(level1==0) {
- c1=-1;
- break;
- }
- } else {
- ++s1;
- break;
- }
-
- // reached end of level buffer, pop one level
- do {
- --level1;
- start1=stack1[level1].start;
- } while(start1==NULL);
- s1=stack1[level1].s;
- limit1=stack1[level1].limit;
- }
- }
-
- if(c2<0) {
- // get next code unit from string 2, post-increment
- for(;;) {
- if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) {
- if(level2==0) {
- c2=-1;
- break;
- }
- } else {
- ++s2;
- break;
- }
-
- // reached end of level buffer, pop one level
- do {
- --level2;
- start2=stack2[level2].start;
- } while(start2==NULL);
- s2=stack2[level2].s;
- limit2=stack2[level2].limit;
- }
- }
-
- // compare c1 and c2
- // either variable c1, c2 is -1 only if the corresponding string is finished
- if(c1==c2) {
- if(c1<0) {
- return 0; // c1==c2==-1 indicating end of strings
- }
- c1=c2=-1; // make us fetch new code units
- continue;
- } else if(c1<0) {
- return -1; // string 1 ends before string 2
- } else if(c2<0) {
- return 1; // string 2 ends before string 1
- }
- // c1!=c2 && c1>=0 && c2>=0
-
- // get complete code points for c1, c2 for lookups if either is a surrogate
- cp1=c1;
- if(UTF_IS_SURROGATE(c1)) {
- UChar c;
-
- if(UTF_IS_SURROGATE_FIRST(c1)) {
- if(s1!=limit1 && UTF_IS_TRAIL(c=*s1)) {
- // advance ++s1; only below if cp1 decomposes/case-folds
- cp1=UTF16_GET_PAIR_VALUE(c1, c);
- }
- } else /* isTrail(c1) */ {
- if(start1<=(s1-2) && UTF_IS_LEAD(c=*(s1-2))) {
- cp1=UTF16_GET_PAIR_VALUE(c, c1);
- }
- }
- }
-
- cp2=c2;
- if(UTF_IS_SURROGATE(c2)) {
- UChar c;
-
- if(UTF_IS_SURROGATE_FIRST(c2)) {
- if(s2!=limit2 && UTF_IS_TRAIL(c=*s2)) {
- // advance ++s2; only below if cp2 decomposes/case-folds
- cp2=UTF16_GET_PAIR_VALUE(c2, c);
- }
- } else /* isTrail(c2) */ {
- if(start2<=(s2-2) && UTF_IS_LEAD(c=*(s2-2))) {
- cp2=UTF16_GET_PAIR_VALUE(c, c2);
- }
- }
- }
-
- // go down one level for each string
- // continue with the main loop as soon as there is a real change
-
- if( level1==0 && (options&U_COMPARE_IGNORE_CASE) &&
- (length=u_internalFoldCase((UChar32)cp1, fold1, 32, options))>=0
- ) {
- // cp1 case-folds to fold1[length]
- if(UTF_IS_SURROGATE(c1)) {
- if(UTF_IS_SURROGATE_FIRST(c1)) {
- // advance beyond source surrogate pair if it case-folds
- ++s1;
- } else /* isTrail(c1) */ {
- // we got a supplementary code point when hitting its trail surrogate,
- // therefore the lead surrogate must have been the same as in the other string;
- // compare this decomposition with the lead surrogate in the other string
- // remember that this simulates bulk text replacement:
- // the decomposition would replace the entire code point
- --s2;
- c2=*(s2-1);
- }
- }
-
- // push current level pointers
- stack1[0].start=start1;
- stack1[0].s=s1;
- stack1[0].limit=limit1;
- ++level1;
-
- // set next level pointers to case folding
- start1=s1=fold1;
- limit1=fold1+length;
-
- // get ready to read from decomposition, continue with loop
- c1=-1;
- continue;
- }
-
- if( level2==0 && (options&U_COMPARE_IGNORE_CASE) &&
- (length=u_internalFoldCase((UChar32)cp2, fold2, 32, options))>=0
- ) {
- // cp2 case-folds to fold2[length]
- if(UTF_IS_SURROGATE(c2)) {
- if(UTF_IS_SURROGATE_FIRST(c2)) {
- // advance beyond source surrogate pair if it case-folds
- ++s2;
- } else /* isTrail(c2) */ {
- // we got a supplementary code point when hitting its trail surrogate,
- // therefore the lead surrogate must have been the same as in the other string;
- // compare this decomposition with the lead surrogate in the other string
- // remember that this simulates bulk text replacement:
- // the decomposition would replace the entire code point
- --s1;
- c1=*(s1-1);
- }
- }
-
- // push current level pointers
- stack2[0].start=start2;
- stack2[0].s=s2;
- stack2[0].limit=limit2;
- ++level2;
-
- // set next level pointers to case folding
- start2=s2=fold2;
- limit2=fold2+length;
-
- // get ready to read from decomposition, continue with loop
- c2=-1;
- continue;
- }
-
- if( level1<2 && (options&_COMPARE_EQUIV) &&
- 0!=(p=_decompose((UChar32)cp1, decomp1, length))
- ) {
- // cp1 decomposes into p[length]
- if(UTF_IS_SURROGATE(c1)) {
- if(UTF_IS_SURROGATE_FIRST(c1)) {
- // advance beyond source surrogate pair if it decomposes
- ++s1;
- } else /* isTrail(c1) */ {
- // we got a supplementary code point when hitting its trail surrogate,
- // therefore the lead surrogate must have been the same as in the other string;
- // compare this decomposition with the lead surrogate in the other string
- // remember that this simulates bulk text replacement:
- // the decomposition would replace the entire code point
- --s2;
- c2=*(s2-1);
- }
- }
-
- // push current level pointers
- stack1[level1].start=start1;
- stack1[level1].s=s1;
- stack1[level1].limit=limit1;
- ++level1;
-
- // set empty intermediate level if skipped
- if(level1<2) {
- stack1[level1++].start=NULL;
- }
-
- // set next level pointers to decomposition
- start1=s1=p;
- limit1=p+length;
-
- // get ready to read from decomposition, continue with loop
- c1=-1;
- continue;
- }
-
- if( level2<2 && (options&_COMPARE_EQUIV) &&
- 0!=(p=_decompose((UChar32)cp2, decomp2, length))
- ) {
- // cp2 decomposes into p[length]
- if(UTF_IS_SURROGATE(c2)) {
- if(UTF_IS_SURROGATE_FIRST(c2)) {
- // advance beyond source surrogate pair if it decomposes
- ++s2;
- } else /* isTrail(c2) */ {
- // we got a supplementary code point when hitting its trail surrogate,
- // therefore the lead surrogate must have been the same as in the other string;
- // compare this decomposition with the lead surrogate in the other string
- // remember that this simulates bulk text replacement:
- // the decomposition would replace the entire code point
- --s1;
- c1=*(s1-1);
- }
- }
-
- // push current level pointers
- stack2[level2].start=start2;
- stack2[level2].s=s2;
- stack2[level2].limit=limit2;
- ++level2;
-
- // set empty intermediate level if skipped
- if(level2<2) {
- stack2[level2++].start=NULL;
- }
-
- // set next level pointers to decomposition
- start2=s2=p;
- limit2=p+length;
-
- // get ready to read from decomposition, continue with loop
- c2=-1;
- continue;
- }
-
- // no decomposition/case folding, max level for both sides:
- // return difference result
-
- // code point order comparison must not just return cp1-cp2
- // because when single surrogates are present then the surrogate pairs
- // that formed cp1 and cp2 may be from different string indexes
-
- // example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
- // c1=d800 cp1=10001 c2=dc00 cp2=10000
- // cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
-
- // therefore, use same fix-up as in ustring.c/uprv_strCompare()
- // except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++
- // so we have slightly different pointer/start/limit comparisons here
-
- if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) {
- /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
- if(
- (c1<=0xdbff && s1!=limit1 && UTF_IS_TRAIL(*s1)) ||
- (UTF_IS_TRAIL(c1) && start1!=(s1-1) && UTF_IS_LEAD(*(s1-2)))
- ) {
- /* part of a surrogate pair, leave >=d800 */
- } else {
- /* BMP code point - may be surrogate code point - make <d800 */
- c1-=0x2800;
- }
-
- if(
- (c2<=0xdbff && s2!=limit2 && UTF_IS_TRAIL(*s2)) ||
- (UTF_IS_TRAIL(c2) && start2!=(s2-1) && UTF_IS_LEAD(*(s2-2)))
- ) {
- /* part of a surrogate pair, leave >=d800 */
- } else {
- /* BMP code point - may be surrogate code point - make <d800 */
- c2-=0x2800;
- }
- }
-
- return c1-c2;
- }
-}
-
-#if !UCONFIG_NO_NORMALIZATION
-
-U_CAPI int32_t U_EXPORT2
-unorm_compare(const UChar *s1, int32_t length1,
- const UChar *s2, int32_t length2,
- uint32_t options,
- UErrorCode *pErrorCode) {
- UChar fcd1[300], fcd2[300];
- UChar *d1, *d2;
- const UnicodeSet *nx;
- UNormalizationMode mode;
- int32_t result;
-
- /* argument checking */
- if(pErrorCode==0 || U_FAILURE(*pErrorCode)) {
- return 0;
- }
- if(s1==0 || length1<-1 || s2==0 || length2<-1) {
- *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
- return 0;
- }
-
- if(!_haveData(*pErrorCode)) {
- return 0;
- }
- if(!uprv_haveProperties(pErrorCode)) {
- return 0;
- }
-
- nx=getNX((int32_t)(options>>UNORM_COMPARE_NORM_OPTIONS_SHIFT), *pErrorCode);
- if(U_FAILURE(*pErrorCode)) {
- return 0;
- }
-
- d1=d2=0;
- options|=_COMPARE_EQUIV;
- result=0;
-
- /*
- * UAX #21 Case Mappings, as fixed for Unicode version 4
- * (see Jitterbug 2021), defines a canonical caseless match as
- *
- * A string X is a canonical caseless match
- * for a string Y if and only if
- * NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y)))
- *
- * For better performance, we check for FCD (or let the caller tell us that
- * both strings are in FCD) for the inner normalization.
- * BasicNormalizerTest::FindFoldFCDExceptions() makes sure that
- * case-folding preserves the FCD-ness of a string.
- * The outer normalization is then only performed by unorm_cmpEquivFold()
- * when there is a difference.
- *
- * Exception: When using the Turkic case-folding option, we do perform
- * full NFD first. This is because in the Turkic case precomposed characters
- * with 0049 capital I or 0069 small i fold differently whether they
- * are first decomposed or not, so an FCD check - a check only for
- * canonical order - is not sufficient.
- */
- if(options&U_FOLD_CASE_EXCLUDE_SPECIAL_I) {
- mode=UNORM_NFD;
- options&=~UNORM_INPUT_IS_FCD;
- } else {
- mode=UNORM_FCD;
- }
-
- if(!(options&UNORM_INPUT_IS_FCD)) {
- int32_t _len1, _len2;
- UBool isFCD1, isFCD2;
-
- // check if s1 and/or s2 fulfill the FCD conditions
- isFCD1= UNORM_YES==_quickCheck(s1, length1, mode, TRUE, nx, pErrorCode);
- isFCD2= UNORM_YES==_quickCheck(s2, length2, mode, TRUE, nx, pErrorCode);
- if(U_FAILURE(*pErrorCode)) {
- return 0;
- }
-
- /*
- * ICU 2.4 had a further optimization:
- * If both strings were not in FCD, then they were both NFD'ed,
- * and the _COMPARE_EQUIV option was turned off.
- * It is not entirely clear that this is valid with the current
- * definition of the canonical caseless match.
- * Therefore, ICU 2.6 removes that optimization.
- */
-
- if(!isFCD1) {
- _len1=unorm_internalNormalize(fcd1, LENGTHOF(fcd1),
- s1, length1,
- mode, nx,
- pErrorCode);
- if(*pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
- s1=fcd1;
- } else {
- d1=(UChar *)uprv_malloc(_len1*U_SIZEOF_UCHAR);
- if(d1==0) {
- *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
- goto cleanup;
- }
-
- *pErrorCode=U_ZERO_ERROR;
- _len1=unorm_internalNormalize(d1, _len1,
- s1, length1,
- mode, nx,
- pErrorCode);
- if(U_FAILURE(*pErrorCode)) {
- goto cleanup;
- }
-
- s1=d1;
- }
- length1=_len1;
- }
-
- if(!isFCD2) {
- _len2=unorm_internalNormalize(fcd2, LENGTHOF(fcd2),
- s2, length2,
- mode, nx,
- pErrorCode);
- if(*pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
- s2=fcd2;
- } else {
- d2=(UChar *)uprv_malloc(_len2*U_SIZEOF_UCHAR);
- if(d2==0) {
- *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
- goto cleanup;
- }
-
- *pErrorCode=U_ZERO_ERROR;
- _len2=unorm_internalNormalize(d2, _len2,
- s2, length2,
- mode, nx,
- pErrorCode);
- if(U_FAILURE(*pErrorCode)) {
- goto cleanup;
- }
-
- s2=d2;
- }
- length2=_len2;
- }
- }
-
- if(U_SUCCESS(*pErrorCode)) {
- result=unorm_cmpEquivFold(s1, length1, s2, length2, options, pErrorCode);
- }
-
-cleanup:
- if(d1!=0) {
- uprv_free(d1);
- }
- if(d2!=0) {
- uprv_free(d2);
- }
-
- return result;
-}
-