2  ***************************************************************************** 
   3  * Copyright (C) 1996-2004, International Business Machines Corporation and  * 
   4  * others. All Rights Reserved.                                              * 
   5  ***************************************************************************** 
   8 #include "unicode/utypes.h" 
  10 #if !UCONFIG_NO_NORMALIZATION 
  12 #include "unicode/uset.h" 
  13 #include "unicode/ustring.h" 
  16 #include "unicode/caniter.h" 
  17 #include "unicode/normlzr.h" 
  18 #include "unicode/uchar.h" 
  22  * This class allows one to iterate through all the strings that are canonically equivalent to a given 
  23  * string. For example, here are some sample results: 
  24 Results for: {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} 
  25 1: \u0041\u030A\u0064\u0307\u0327 
  26  = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} 
  27 2: \u0041\u030A\u0064\u0327\u0307 
  28  = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE} 
  29 3: \u0041\u030A\u1E0B\u0327 
  30  = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA} 
  31 4: \u0041\u030A\u1E11\u0307 
  32  = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE} 
  33 5: \u00C5\u0064\u0307\u0327 
  34  = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} 
  35 6: \u00C5\u0064\u0327\u0307 
  36  = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE} 
  38  = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA} 
  40  = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE} 
  41 9: \u212B\u0064\u0307\u0327 
  42  = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA} 
  43 10: \u212B\u0064\u0327\u0307 
  44  = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE} 
  45 11: \u212B\u1E0B\u0327 
  46  = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA} 
  47 12: \u212B\u1E11\u0307 
  48  = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE} 
  49  *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones, 
  50  * since it has not been optimized for that situation. 
  55 static UBool PROGRESS 
= FALSE
; 
  58 #include "unicode/translit.h" 
  60 UErrorCode status 
= U_ZERO_ERROR
; 
  62 // Just for testing - remove, not thread safe. 
  63 static const char* UToS(const UnicodeString 
&source
) { 
  64   static char buffer
[256]; 
  65   buffer
[source
.extract(0, source
.length(), buffer
)] = 0; 
  69 static const UnicodeString 
&Tr(const UnicodeString 
&source
) { 
  70   static Transliterator 
*NAME 
= Transliterator::createInstance("name", UTRANS_FORWARD
, status
); 
  71   static UnicodeString result
; 
  73   NAME
->transliterate(result
); 
  81 // TODO: add boilerplate methods. 
  83 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CanonicalIterator
) 
  86  *@param source string to get results for 
  88 CanonicalIterator::CanonicalIterator(const UnicodeString 
&sourceStr
, UErrorCode 
&status
) : 
  95     if(U_SUCCESS(status
)) { 
  96       setSource(sourceStr
, status
); 
 100 CanonicalIterator::~CanonicalIterator() { 
 104 void CanonicalIterator::cleanPieces() { 
 107     for(i 
= 0; i 
< pieces_length
; i
++) { 
 108       if(pieces
[i
] != NULL
) { 
 114     if(pieces_lengths 
!= NULL
) { 
 115       uprv_free(pieces_lengths
); 
 117     pieces_lengths 
= NULL
; 
 118     if(current 
!= NULL
) { 
 126  *@return gets the source: NOTE: it is the NFD form of source 
 128 UnicodeString 
CanonicalIterator::getSource() { 
 133  * Resets the iterator so that one can start again from the beginning. 
 135 void CanonicalIterator::reset() { 
 137     for (int i 
= 0; i 
< current_length
; ++i
) { 
 143  *@return the next string that is canonically equivalent. The value null is returned when 
 144  * the iteration is done. 
 146 UnicodeString 
CanonicalIterator::next() { 
 154     // delete old contents 
 157     // construct return value 
 159     for (i 
= 0; i 
< pieces_length
; ++i
) { 
 160         buffer
.append(pieces
[i
][current
[i
]]); 
 162     //String result = buffer.toString(); // not needed 
 164     // find next value for next time 
 166     for (i 
= current_length 
- 1; ; --i
) { 
 172         if (current
[i
] < pieces_lengths
[i
]) break; // got sequence 
 179  *@param set the source string to iterate against. This allows the same iterator to be used 
 180  * while changing the source string, saving object creation. 
 182 void CanonicalIterator::setSource(const UnicodeString 
&newSource
, UErrorCode 
&status
) { 
 183     Normalizer::normalize(newSource
, UNORM_NFD
, 0, source
, status
); 
 184     if(U_FAILURE(status
)) { 
 191     // catch degenerate case 
 192     if (newSource
.length() == 0) { 
 194         pieces 
= (UnicodeString 
**)uprv_malloc(sizeof(UnicodeString 
*)); 
 196         if (pieces 
== NULL
) { 
 197             status 
= U_MEMORY_ALLOCATION_ERROR
; 
 201         current 
= (int32_t*)uprv_malloc(1 * sizeof(int32_t)); 
 203         if (current 
== NULL
) { 
 204             status 
= U_MEMORY_ALLOCATION_ERROR
; 
 210         pieces
[0] = new UnicodeString
[1]; 
 212         if (pieces
[0] == 0) { 
 213             status 
= U_MEMORY_ALLOCATION_ERROR
; 
 219         pieces
[0][0] = UnicodeString(); 
 220         pieces_lengths 
= (int32_t*)uprv_malloc(1 * sizeof(int32_t)); 
 222         if (pieces_lengths 
== 0) { 
 223             status 
= U_MEMORY_ALLOCATION_ERROR
; 
 229         pieces_lengths
[0] = 1; 
 234     UnicodeString 
*list 
= new UnicodeString
[source
.length()]; 
 237         status 
= U_MEMORY_ALLOCATION_ERROR
; 
 241     int32_t list_length 
= 0; 
 244     // i should initialy be the number of code units at the  
 245     // start of the string 
 246     int32_t i 
= UTF16_CHAR_LENGTH(source
.char32At(0)); 
 249     // This code iterates through the source string and  
 250     // extracts segments that end up on a codepoint that 
 251     // doesn't start any decompositions. (Analysis is done 
 252     // on the NFD form - see above). 
 253     for (; i 
< source
.length(); i 
+= UTF16_CHAR_LENGTH(cp
)) { 
 254         cp 
= source
.char32At(i
); 
 255         if (unorm_isCanonSafeStart(cp
)) { 
 256             source
.extract(start
, i
-start
, list
[list_length
++]); // add up to i 
 260     source
.extract(start
, i
-start
, list
[list_length
++]); // add last one 
 263     // allocate the arrays, and find the strings that are CE to each segment 
 264     pieces 
= (UnicodeString 
**)uprv_malloc(list_length 
* sizeof(UnicodeString 
*)); 
 266     if (pieces 
== NULL
) { 
 267         status 
= U_MEMORY_ALLOCATION_ERROR
; 
 271     pieces_length 
= list_length
; 
 272     pieces_lengths 
= (int32_t*)uprv_malloc(list_length 
* sizeof(int32_t)); 
 274     if (pieces_lengths 
== 0) { 
 275         status 
= U_MEMORY_ALLOCATION_ERROR
; 
 282     current_length 
= list_length
; 
 283     current 
= (int32_t*)uprv_malloc(list_length 
* sizeof(int32_t)); 
 286         status 
= U_MEMORY_ALLOCATION_ERROR
; 
 290         uprv_free(pieces_lengths
); 
 293     for (i 
= 0; i 
< current_length
; i
++) { 
 296     // for each segment, get all the combinations that can produce  
 297     // it after NFD normalization 
 298     for (i 
= 0; i 
< pieces_length
; ++i
) { 
 299         //if (PROGRESS) printf("SEGMENT\n"); 
 300         pieces
[i
] = getEquivalents(list
[i
], pieces_lengths
[i
], status
); 
 307  * Dumb recursive implementation of permutation. 
 309  * @param source the string to find permutations for 
 310  * @return the results in a set. 
 312 void U_EXPORT2 
CanonicalIterator::permute(UnicodeString 
&source
, UBool skipZeros
, Hashtable 
*result
, UErrorCode 
&status
) { 
 313     if(U_FAILURE(status
)) { 
 316     //if (PROGRESS) printf("Permute: %s\n", UToS(Tr(source))); 
 320     // if zero or one character, just return a set with it 
 321     // we check for length < 2 to keep from counting code points all the time 
 322     if (source
.length() <= 2 && source
.countChar32() <= 1) { 
 323       UnicodeString 
*toPut 
= new UnicodeString(source
); 
 326           status 
= U_MEMORY_ALLOCATION_ERROR
; 
 329       result
->put(source
, toPut
, status
); 
 333     // otherwise iterate through the string, and recursively permute all the other characters 
 335     Hashtable 
*subpermute 
= new Hashtable(status
); 
 337     if (subpermute 
== 0) { 
 338         status 
= U_MEMORY_ALLOCATION_ERROR
; 
 341     if (U_SUCCESS(status
)) { 
 342         subpermute
->setValueDeleter(uhash_deleteUnicodeString
); 
 345     for (i 
= 0; i 
< source
.length(); i 
+= UTF16_CHAR_LENGTH(cp
)) { 
 346         cp 
= source
.char32At(i
); 
 347         const UHashElement 
*ne 
= NULL
; 
 349         UnicodeString subPermuteString 
= source
; 
 352         // if the character is canonical combining class zero, 
 354         if (skipZeros 
&& i 
!= 0 && u_getCombiningClass(cp
) == 0) { 
 355             //System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i))); 
 359         subpermute
->removeAll(); 
 361         // see what the permutations of the characters before and after this one are 
 362         //Hashtable *subpermute = permute(source.substring(0,i) + source.substring(i + UTF16.getCharCount(cp))); 
 363         permute(subPermuteString
.replace(i
, UTF16_CHAR_LENGTH(cp
), NULL
, 0), skipZeros
, subpermute
, status
); 
 364         /* Test for buffer overflows */ 
 365         if(U_FAILURE(status
)) { 
 369         // The upper replace is destructive. The question is do we have to make a copy, or we don't care about the contents  
 370         // of source at this point. 
 372         // prefix this character to all of them 
 373         ne 
= subpermute
->nextElement(el
); 
 375           UnicodeString 
*permRes 
= (UnicodeString 
*)(ne
->value
.pointer
); 
 376           UnicodeString 
*chStr 
= new UnicodeString(cp
); 
 379               status 
= U_MEMORY_ALLOCATION_ERROR
; 
 383             chStr
->append(*permRes
); //*((UnicodeString *)(ne->value.pointer)); 
 384             //if (PROGRESS) printf("  Piece: %s\n", UToS(*chStr)); 
 385             result
->put(*chStr
, chStr
, status
); 
 386             ne 
= subpermute
->nextElement(el
); 
 395 // we have a segment, in NFD. Find all the strings that are canonically equivalent to it. 
 396 UnicodeString
* CanonicalIterator::getEquivalents(const UnicodeString 
&segment
, int32_t &result_len
, UErrorCode 
&status
) { 
 397     //private String[] getEquivalents(String segment) 
 399     Hashtable 
*result 
= new Hashtable(status
); 
 402         status 
= U_MEMORY_ALLOCATION_ERROR
; 
 405     if (U_SUCCESS(status
)) { 
 406         result
->setValueDeleter(uhash_deleteUnicodeString
); 
 409     int32_t segLen 
= segment
.extract(USeg
, 256, status
); 
 410     Hashtable 
*basic 
= getEquivalents2(USeg
, segLen
, status
); 
 411     //Hashtable *basic = getEquivalents2(segment, segLen, status); 
 413     // now get all the permutations 
 414     // add only the ones that are canonically equivalent 
 415     // TODO: optimize by not permuting any class zero. 
 417     Hashtable 
*permutations 
= new Hashtable(status
); 
 419     if (permutations 
== 0) { 
 420         status 
= U_MEMORY_ALLOCATION_ERROR
; 
 425     if (U_SUCCESS(status
)) { 
 426         permutations
->setValueDeleter(uhash_deleteUnicodeString
); 
 429     const UHashElement 
*ne 
= NULL
; 
 431     //Iterator it = basic.iterator(); 
 432     ne 
= basic
->nextElement(el
); 
 433     //while (it.hasNext()) 
 435         //String item = (String) it.next(); 
 436         UnicodeString item 
= *((UnicodeString 
*)(ne
->value
.pointer
)); 
 438         permutations
->removeAll(); 
 439         permute(item
, CANITER_SKIP_ZEROES
, permutations
, status
); 
 440         const UHashElement 
*ne2 
= NULL
; 
 442         //Iterator it2 = permutations.iterator(); 
 443         ne2 
= permutations
->nextElement(el2
); 
 444         //while (it2.hasNext()) 
 445         while (ne2 
!= NULL
) { 
 446             //String possible = (String) it2.next(); 
 447             //UnicodeString *possible = new UnicodeString(*((UnicodeString *)(ne2->value.pointer))); 
 448             UnicodeString 
possible(*((UnicodeString 
*)(ne2
->value
.pointer
))); 
 449             UnicodeString attempt
; 
 450             Normalizer::normalize(possible
, UNORM_NFD
, 0, attempt
, status
); 
 452             // TODO: check if operator == is semanticaly the same as attempt.equals(segment) 
 453             if (attempt
==segment
) { 
 454                 //if (PROGRESS) printf("Adding Permutation: %s\n", UToS(Tr(*possible))); 
 455                 // TODO: use the hashtable just to catch duplicates - store strings directly (somehow). 
 456                 result
->put(possible
, new UnicodeString(possible
), status
); //add(possible); 
 458                 //if (PROGRESS) printf("-Skipping Permutation: %s\n", UToS(Tr(*possible))); 
 461           ne2 
= permutations
->nextElement(el2
); 
 463         ne 
= basic
->nextElement(el
); 
 466     /* Test for buffer overflows */ 
 467     if(U_FAILURE(status
)) { 
 473     // convert into a String[] to clean up storage 
 474     //String[] finalResult = new String[result.size()]; 
 475     UnicodeString 
*finalResult 
= NULL
; 
 477     if((resultCount 
= result
->count())) { 
 478       finalResult 
= new UnicodeString
[resultCount
]; 
 480       status 
= U_ILLEGAL_ARGUMENT_ERROR
; 
 483     if (finalResult 
== 0) { 
 484       if(U_SUCCESS(status
)) { 
 485         status 
= U_MEMORY_ALLOCATION_ERROR
; 
 492     //result.toArray(finalResult); 
 495     ne 
= result
->nextElement(el
); 
 497       UnicodeString finResult 
= *((UnicodeString 
*)(ne
->value
.pointer
)); 
 498       finalResult
[result_len
++] = finResult
; 
 499       ne 
= result
->nextElement(el
); 
 509 Hashtable 
*CanonicalIterator::getEquivalents2(const UChar 
*segment
, int32_t segLen
, UErrorCode 
&status
) { 
 510 //Hashtable *CanonicalIterator::getEquivalents2(const UnicodeString &segment, int32_t segLen, UErrorCode &status) { 
 512     Hashtable 
*result 
= new Hashtable(status
); 
 515         status 
= U_MEMORY_ALLOCATION_ERROR
; 
 518     if (U_SUCCESS(status
)) { 
 519         result
->setValueDeleter(uhash_deleteUnicodeString
); 
 522     //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(segment))); 
 524     UnicodeString 
toPut(segment
, segLen
); 
 526     result
->put(toPut
, new UnicodeString(toPut
), status
); 
 528     USerializedSet starts
; 
 530     // cycle through all the characters 
 533     for (i 
= 0; i 
< segLen
; i 
+= UTF16_CHAR_LENGTH(cp
)) { 
 534         // see if any character is at the start of some decomposition 
 535         UTF_GET_CHAR(segment
, 0, i
, segLen
, cp
); 
 536         if (!unorm_getCanonStartSet(cp
, &starts
)) { 
 539         // if so, see which decompositions match  
 540         for(j 
= 0, cp 
= end
+1; cp 
<= end 
|| uset_getSerializedRange(&starts
, j
++, &cp
, &end
); ++cp
) { 
 541             //Hashtable *remainder = extract(cp, segment, segLen, i, status); 
 542             Hashtable 
*remainder 
= extract(cp
, segment
, segLen
, i
, status
); 
 543             if (remainder 
== NULL
) continue; 
 545             // there were some matches, so add all the possibilities to the set. 
 546             UnicodeString 
prefix(segment
, i
); 
 549             const UHashElement 
*ne 
= NULL
; 
 551             ne 
= remainder
->nextElement(el
); 
 553                 UnicodeString item 
= *((UnicodeString 
*)(ne
->value
.pointer
)); 
 554                 UnicodeString 
*toAdd 
= new UnicodeString(prefix
); 
 557                     status 
= U_MEMORY_ALLOCATION_ERROR
; 
 563                 result
->put(*toAdd
, toAdd
, status
); 
 565                 //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(*toAdd))); 
 567                 ne 
= remainder
->nextElement(el
); 
 574     /* Test for buffer overflows */ 
 575     if(U_FAILURE(status
)) { 
 582  * See if the decomposition of cp2 is at segment starting at segmentPos  
 583  * (with canonical rearrangment!) 
 584  * If so, take the remainder, and return the equivalents  
 586 Hashtable 
*CanonicalIterator::extract(UChar32 comp
, const UChar 
*segment
, int32_t segLen
, int32_t segmentPos
, UErrorCode 
&status
) { 
 587 //Hashtable *CanonicalIterator::extract(UChar32 comp, const UnicodeString &segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) { 
 588     //if (PROGRESS) printf(" extract: %s, ", UToS(Tr(UnicodeString(comp)))); 
 589     //if (PROGRESS) printf("%s, %i\n", UToS(Tr(segment)), segmentPos); 
 591     const int32_t bufSize 
= 256; 
 595     int32_t inputLen 
= 0, decompLen
; 
 596     UChar stackBuffer
[4]; 
 599     U16_APPEND_UNSAFE(temp
, inputLen
, comp
); 
 600     decomp 
= unorm_getCanonicalDecomposition(comp
, stackBuffer
, &decompLen
); 
 603         stackBuffer
[0] = temp
[0]; 
 605             stackBuffer
[1] = temp
[1]; 
 607         decomp 
= stackBuffer
; 
 608         decompLen 
= inputLen
; 
 611     UChar 
*buff 
= temp
+inputLen
; 
 613     // See if it matches the start of segment (at segmentPos) 
 616     int32_t decompPos 
= 0; 
 618     UTF_NEXT_CHAR(decomp
, decompPos
, decompLen
, decompCp
); 
 621     UBool overflow 
= FALSE
; 
 625       UTF_NEXT_CHAR(segment
, i
, segLen
, cp
); 
 627         if (cp 
== decompCp
) { // if equal, eat another cp from decomp 
 629             //if (PROGRESS) printf("  matches: %s\n", UToS(Tr(UnicodeString(cp)))); 
 631             if (decompPos 
== decompLen
) { // done, have all decomp characters! 
 632                 //u_strcat(buff+bufLen, segment+i); 
 633                 uprv_memcpy(buff
+bufLen
, segment
+i
, (segLen
-i
)*sizeof(UChar
)); 
 639             UTF_NEXT_CHAR(decomp
, decompPos
, decompLen
, decompCp
); 
 641             //if (PROGRESS) printf("  buffer: %s\n", UToS(Tr(UnicodeString(cp)))); 
 643             // brute force approach 
 645             U16_APPEND(buff
, bufLen
, bufSize
, cp
, overflow
); 
 649                  * ### TODO handle buffer overflow 
 650                  * The buffer is large, but an overflow may still happen with 
 651                  * unusual input (many combining marks?). 
 652                  * Reallocate buffer and continue. 
 660             // since we know that the classes are monotonically increasing, after zero 
 662             // we can do an optimization 
 663             // there are only a few cases that work: zero, less, same, greater 
 664             // if both classes are the same, we fail 
 665             // if the decomp class < the segment class, we fail 
 667             segClass = getClass(cp); 
 668             if (decompClass <= segClass) return null; 
 672     if (!ok
) return NULL
; // we failed, characters left over 
 674     //if (PROGRESS) printf("Matches\n"); 
 677       Hashtable 
*result 
= new Hashtable(status
); 
 680           status 
= U_MEMORY_ALLOCATION_ERROR
; 
 683       result
->setValueDeleter(uhash_deleteUnicodeString
); 
 684       result
->put(UnicodeString(), new UnicodeString(), status
); 
 685       return result
; // succeed, but no remainder 
 688     // brute force approach 
 689     // check to make sure result is canonically equivalent 
 690     int32_t tempLen 
= inputLen 
+ bufLen
; 
 692     UChar trial
[bufSize
]; 
 693     unorm_decompose(trial
, bufSize
, temp
, tempLen
, FALSE
, 0, &status
); 
 695     /* Test for buffer overflows */ 
 696     if(U_FAILURE(status
)) { 
 700     if(uprv_memcmp(segment
+segmentPos
, trial
, (segLen 
- segmentPos
)*sizeof(UChar
)) != 0) { 
 704     return getEquivalents2(buff
, bufLen
, status
); 
 709 #endif /* #if !UCONFIG_NO_NORMALIZATION */