icuSources/common/caniter.cpp

   1 /*
   2  *****************************************************************************
   3  * Copyright (C) 1996-2004, International Business Machines Corporation and  *
   4  * others. All Rights Reserved.                                              *
   5  *****************************************************************************
   6  */
   7
   8 #include "unicode/utypes.h"
   9
  10 #if !UCONFIG_NO_NORMALIZATION
  11
  12 #include "unicode/uset.h"
  13 #include "unicode/ustring.h"
  14 #include "hash.h"
  15 #include "unormimp.h"
  16 #include "unicode/caniter.h"
  17 #include "unicode/normlzr.h"
  18 #include "unicode/uchar.h"
  19 #include "cmemory.h"
  20
  21 /**
  22  * This class allows one to iterate through all the strings that are canonically equivalent to a given
  23  * string. For example, here are some sample results:
  24 Results for: {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
  25 1: \u0041\u030A\u0064\u0307\u0327
  26  = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
  27 2: \u0041\u030A\u0064\u0327\u0307
  28  = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
  29 3: \u0041\u030A\u1E0B\u0327
  30  = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
  31 4: \u0041\u030A\u1E11\u0307
  32  = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
  33 5: \u00C5\u0064\u0307\u0327
  34  = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
  35 6: \u00C5\u0064\u0327\u0307
  36  = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
  37 7: \u00C5\u1E0B\u0327
  38  = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
  39 8: \u00C5\u1E11\u0307
  40  = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
  41 9: \u212B\u0064\u0307\u0327
  42  = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
  43 10: \u212B\u0064\u0327\u0307
  44  = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
  45 11: \u212B\u1E0B\u0327
  46  = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
  47 12: \u212B\u1E11\u0307
  48  = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
  49  *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones,
  50  * since it has not been optimized for that situation.
  51  *@author M. Davis
  52  *@draft
  53  */
  54 #if 0
  55 static UBool PROGRESS = FALSE;
  56
  57 #include <stdio.h>
  58 #include "unicode/translit.h"
  59
  60 UErrorCode status = U_ZERO_ERROR;
  61
  62 // Just for testing - remove, not thread safe.
  63 static const char* UToS(const UnicodeString &source) {
  64   static char buffer[256];
  65   buffer[source.extract(0, source.length(), buffer)] = 0;
  66   return buffer;
  67 }
  68
  69 static const UnicodeString &Tr(const UnicodeString &source) {
  70   static Transliterator *NAME = Transliterator::createInstance("name", UTRANS_FORWARD, status);
  71   static UnicodeString result;
  72   result = source;
  73   NAME->transliterate(result);
  74   return result;
  75 }
  76 #endif
  77 // public
  78
  79 U_NAMESPACE_BEGIN
  80
  81 // TODO: add boilerplate methods.
  82
  83 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CanonicalIterator)
  84
  85 /**
  86  *@param source string to get results for
  87  */
  88 CanonicalIterator::CanonicalIterator(const UnicodeString &sourceStr, UErrorCode &status) :
  89     pieces(NULL),
  90     pieces_length(0),
  91     pieces_lengths(NULL),
  92     current(NULL),
  93     current_length(0)
  94 {
  95     if(U_SUCCESS(status)) {
  96       setSource(sourceStr, status);
  97     }
  98 }
  99
 100 CanonicalIterator::~CanonicalIterator() {
 101   cleanPieces();
 102 }
 103
 104 void CanonicalIterator::cleanPieces() {
 105   int32_t i = 0;
 106   if(pieces != NULL) {
 107     for(i = 0; i < pieces_length; i++) {
 108       if(pieces[i] != NULL) {
 109         delete[] pieces[i];
 110       }
 111     }
 112     uprv_free(pieces);
 113     pieces = NULL;
 114     if(pieces_lengths != NULL) {
 115       uprv_free(pieces_lengths);
 116     }
 117     pieces_lengths = NULL;
 118     if(current != NULL) {
 119       uprv_free(current);
 120     }
 121     current = NULL;
 122   }
 123 }
 124
 125 /**
 126  *@return gets the source: NOTE: it is the NFD form of source
 127  */
 128 UnicodeString CanonicalIterator::getSource() {
 129   return source;
 130 }
 131
 132 /**
 133  * Resets the iterator so that one can start again from the beginning.
 134  */
 135 void CanonicalIterator::reset() {
 136     done = FALSE;
 137     for (int i = 0; i < current_length; ++i) {
 138         current[i] = 0;
 139     }
 140 }
 141
 142 /**
 143  *@return the next string that is canonically equivalent. The value null is returned when
 144  * the iteration is done.
 145  */
 146 UnicodeString CanonicalIterator::next() {
 147     int32_t i = 0;
 148
 149     if (done) {
 150       buffer.setToBogus();
 151       return buffer;
 152     }
 153
 154     // delete old contents
 155     buffer.remove();
 156
 157     // construct return value
 158
 159     for (i = 0; i < pieces_length; ++i) {
 160         buffer.append(pieces[i][current[i]]);
 161     }
 162     //String result = buffer.toString(); // not needed
 163
 164     // find next value for next time
 165
 166     for (i = current_length - 1; ; --i) {
 167         if (i < 0) {
 168             done = TRUE;
 169             break;
 170         }
 171         current[i]++;
 172         if (current[i] < pieces_lengths[i]) break; // got sequence
 173         current[i] = 0;
 174     }
 175     return buffer;
 176 }
 177
 178 /**
 179  *@param set the source string to iterate against. This allows the same iterator to be used
 180  * while changing the source string, saving object creation.
 181  */
 182 void CanonicalIterator::setSource(const UnicodeString &newSource, UErrorCode &status) {
 183     Normalizer::normalize(newSource, UNORM_NFD, 0, source, status);
 184     if(U_FAILURE(status)) {
 185       return;
 186     }
 187     done = FALSE;
 188
 189     cleanPieces();
 190
 191     // catch degenerate case
 192     if (newSource.length() == 0) {
 193         pieces_length = 1;
 194         pieces = (UnicodeString **)uprv_malloc(sizeof(UnicodeString *));
 195         /* test for NULL */
 196         if (pieces == NULL) {
 197             status = U_MEMORY_ALLOCATION_ERROR;
 198             return;
 199         }
 200         current_length = 1;
 201         current = (int32_t*)uprv_malloc(1 * sizeof(int32_t));
 202         /* test for NULL */
 203         if (current == NULL) {
 204             status = U_MEMORY_ALLOCATION_ERROR;
 205             uprv_free(pieces);
 206             pieces = NULL;
 207             return;
 208         }
 209         current[0] = 0;
 210         pieces[0] = new UnicodeString[1];
 211         /* test for NULL */
 212         if (pieces[0] == 0) {
 213             status = U_MEMORY_ALLOCATION_ERROR;
 214             uprv_free(pieces);
 215             pieces = NULL;
 216             uprv_free(current);
 217             return;
 218         }
 219         pieces[0][0] = UnicodeString();
 220         pieces_lengths = (int32_t*)uprv_malloc(1 * sizeof(int32_t));
 221         /* test for NULL */
 222         if (pieces_lengths == 0) {
 223             status = U_MEMORY_ALLOCATION_ERROR;
 224             uprv_free(pieces);
 225             pieces = NULL;
 226             uprv_free(current);
 227             return;
 228         }
 229         pieces_lengths[0] = 1;
 230         return;
 231     }
 232
 233
 234     UnicodeString *list = new UnicodeString[source.length()];
 235     /* test for NULL */
 236     if (list == 0) {
 237         status = U_MEMORY_ALLOCATION_ERROR;
 238         return;
 239     }
 240
 241     int32_t list_length = 0;
 242     UChar32 cp = 0;
 243     int32_t start = 0;
 244     // i should initialy be the number of code units at the
 245     // start of the string
 246     int32_t i = UTF16_CHAR_LENGTH(source.char32At(0));
 247     //int32_t i = 1;
 248     // find the segments
 249     // This code iterates through the source string and
 250     // extracts segments that end up on a codepoint that
 251     // doesn't start any decompositions. (Analysis is done
 252     // on the NFD form - see above).
 253     for (; i < source.length(); i += UTF16_CHAR_LENGTH(cp)) {
 254         cp = source.char32At(i);
 255         if (unorm_isCanonSafeStart(cp)) {
 256             source.extract(start, i-start, list[list_length++]); // add up to i
 257             start = i;
 258         }
 259     }
 260     source.extract(start, i-start, list[list_length++]); // add last one
 261
 262
 263     // allocate the arrays, and find the strings that are CE to each segment
 264     pieces = (UnicodeString **)uprv_malloc(list_length * sizeof(UnicodeString *));
 265     /* test for NULL */
 266     if (pieces == NULL) {
 267         status = U_MEMORY_ALLOCATION_ERROR;
 268         delete[] list;
 269         return;
 270     }
 271     pieces_length = list_length;
 272     pieces_lengths = (int32_t*)uprv_malloc(list_length * sizeof(int32_t));
 273     /* test for NULL */
 274     if (pieces_lengths == 0) {
 275         status = U_MEMORY_ALLOCATION_ERROR;
 276         delete[] list;
 277         uprv_free(pieces);
 278         pieces = NULL;
 279         return;
 280     }
 281
 282     current_length = list_length;
 283     current = (int32_t*)uprv_malloc(list_length * sizeof(int32_t));
 284     /* test for NULL */
 285     if (current == 0) {
 286         status = U_MEMORY_ALLOCATION_ERROR;
 287         delete[] list;
 288         uprv_free(pieces);
 289         pieces = NULL;
 290         uprv_free(pieces_lengths);
 291         return;
 292     }
 293     for (i = 0; i < current_length; i++) {
 294       current[i] = 0;
 295     }
 296     // for each segment, get all the combinations that can produce
 297     // it after NFD normalization
 298     for (i = 0; i < pieces_length; ++i) {
 299         //if (PROGRESS) printf("SEGMENT\n");
 300         pieces[i] = getEquivalents(list[i], pieces_lengths[i], status);
 301     }
 302
 303     delete[] list;
 304 }
 305
 306 /**
 307  * Dumb recursive implementation of permutation.
 308  * TODO: optimize
 309  * @param source the string to find permutations for
 310  * @return the results in a set.
 311  */
 312 void U_EXPORT2 CanonicalIterator::permute(UnicodeString &source, UBool skipZeros, Hashtable *result, UErrorCode &status) {
 313     if(U_FAILURE(status)) {
 314       return;
 315     }
 316     //if (PROGRESS) printf("Permute: %s\n", UToS(Tr(source)));
 317     int32_t i = 0;
 318
 319     // optimization:
 320     // if zero or one character, just return a set with it
 321     // we check for length < 2 to keep from counting code points all the time
 322     if (source.length() <= 2 && source.countChar32() <= 1) {
 323       UnicodeString *toPut = new UnicodeString(source);
 324       /* test for NULL */
 325       if (toPut == 0) {
 326           status = U_MEMORY_ALLOCATION_ERROR;
 327           return;
 328       }
 329       result->put(source, toPut, status);
 330       return;
 331     }
 332
 333     // otherwise iterate through the string, and recursively permute all the other characters
 334     UChar32 cp;
 335     Hashtable *subpermute = new Hashtable(status);
 336     /* test for NULL */
 337     if (subpermute == 0) {
 338         status = U_MEMORY_ALLOCATION_ERROR;
 339         return;
 340     }
 341     if (U_SUCCESS(status)) {
 342         subpermute->setValueDeleter(uhash_deleteUnicodeString);
 343     }
 344
 345     for (i = 0; i < source.length(); i += UTF16_CHAR_LENGTH(cp)) {
 346         cp = source.char32At(i);
 347         const UHashElement *ne = NULL;
 348         int32_t el = -1;
 349         UnicodeString subPermuteString = source;
 350
 351         // optimization:
 352         // if the character is canonical combining class zero,
 353         // don't permute it
 354         if (skipZeros && i != 0 && u_getCombiningClass(cp) == 0) {
 355             //System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i)));
 356             continue;
 357         }
 358
 359         subpermute->removeAll();
 360
 361         // see what the permutations of the characters before and after this one are
 362         //Hashtable *subpermute = permute(source.substring(0,i) + source.substring(i + UTF16.getCharCount(cp)));
 363         permute(subPermuteString.replace(i, UTF16_CHAR_LENGTH(cp), NULL, 0), skipZeros, subpermute, status);
 364         /* Test for buffer overflows */
 365         if(U_FAILURE(status)) {
 366             delete subpermute;
 367             return;
 368         }
 369         // The upper replace is destructive. The question is do we have to make a copy, or we don't care about the contents
 370         // of source at this point.
 371
 372         // prefix this character to all of them
 373         ne = subpermute->nextElement(el);
 374         while (ne != NULL) {
 375           UnicodeString *permRes = (UnicodeString *)(ne->value.pointer);
 376           UnicodeString *chStr = new UnicodeString(cp);
 377           //test for  NULL
 378           if (chStr == NULL) {
 379               status = U_MEMORY_ALLOCATION_ERROR;
 380               delete subpermute;
 381               return;
 382           }
 383             chStr->append(*permRes); //*((UnicodeString *)(ne->value.pointer));
 384             //if (PROGRESS) printf("  Piece: %s\n", UToS(*chStr));
 385             result->put(*chStr, chStr, status);
 386             ne = subpermute->nextElement(el);
 387         }
 388     }
 389     delete subpermute;
 390     //return result;
 391 }
 392
 393 // privates
 394
 395 // we have a segment, in NFD. Find all the strings that are canonically equivalent to it.
 396 UnicodeString* CanonicalIterator::getEquivalents(const UnicodeString &segment, int32_t &result_len, UErrorCode &status) {
 397     //private String[] getEquivalents(String segment)
 398
 399     Hashtable *result = new Hashtable(status);
 400     /* test for NULL */
 401     if (result == 0) {
 402         status = U_MEMORY_ALLOCATION_ERROR;
 403         return 0;
 404     }
 405     if (U_SUCCESS(status)) {
 406         result->setValueDeleter(uhash_deleteUnicodeString);
 407     }
 408     UChar USeg[256];
 409     int32_t segLen = segment.extract(USeg, 256, status);
 410     Hashtable *basic = getEquivalents2(USeg, segLen, status);
 411     //Hashtable *basic = getEquivalents2(segment, segLen, status);
 412
 413     // now get all the permutations
 414     // add only the ones that are canonically equivalent
 415     // TODO: optimize by not permuting any class zero.
 416
 417     Hashtable *permutations = new Hashtable(status);
 418     /* test for NULL */
 419     if (permutations == 0) {
 420         status = U_MEMORY_ALLOCATION_ERROR;
 421         delete result;
 422         delete basic;
 423         return 0;
 424     }
 425     if (U_SUCCESS(status)) {
 426         permutations->setValueDeleter(uhash_deleteUnicodeString);
 427     }
 428
 429     const UHashElement *ne = NULL;
 430     int32_t el = -1;
 431     //Iterator it = basic.iterator();
 432     ne = basic->nextElement(el);
 433     //while (it.hasNext())
 434     while (ne != NULL) {
 435         //String item = (String) it.next();
 436         UnicodeString item = *((UnicodeString *)(ne->value.pointer));
 437
 438         permutations->removeAll();
 439         permute(item, CANITER_SKIP_ZEROES, permutations, status);
 440         const UHashElement *ne2 = NULL;
 441         int32_t el2 = -1;
 442         //Iterator it2 = permutations.iterator();
 443         ne2 = permutations->nextElement(el2);
 444         //while (it2.hasNext())
 445         while (ne2 != NULL) {
 446             //String possible = (String) it2.next();
 447             //UnicodeString *possible = new UnicodeString(*((UnicodeString *)(ne2->value.pointer)));
 448             UnicodeString possible(*((UnicodeString *)(ne2->value.pointer)));
 449             UnicodeString attempt;
 450             Normalizer::normalize(possible, UNORM_NFD, 0, attempt, status);
 451
 452             // TODO: check if operator == is semanticaly the same as attempt.equals(segment)
 453             if (attempt==segment) {
 454                 //if (PROGRESS) printf("Adding Permutation: %s\n", UToS(Tr(*possible)));
 455                 // TODO: use the hashtable just to catch duplicates - store strings directly (somehow).
 456                 result->put(possible, new UnicodeString(possible), status); //add(possible);
 457             } else {
 458                 //if (PROGRESS) printf("-Skipping Permutation: %s\n", UToS(Tr(*possible)));
 459             }
 460
 461           ne2 = permutations->nextElement(el2);
 462         }
 463         ne = basic->nextElement(el);
 464     }
 465
 466     /* Test for buffer overflows */
 467     if(U_FAILURE(status)) {
 468         delete result;
 469         delete permutations;
 470         delete basic;
 471         return 0;
 472     }
 473     // convert into a String[] to clean up storage
 474     //String[] finalResult = new String[result.size()];
 475     UnicodeString *finalResult = NULL;
 476     int32_t resultCount;
 477     if((resultCount = result->count())) {
 478       finalResult = new UnicodeString[resultCount];
 479     } else {
 480       status = U_ILLEGAL_ARGUMENT_ERROR;
 481     }
 482     /* test for NULL */
 483     if (finalResult == 0) {
 484       if(U_SUCCESS(status)) {
 485         status = U_MEMORY_ALLOCATION_ERROR;
 486       }
 487       delete result;
 488       delete permutations;
 489       delete basic;
 490       return 0;
 491     }
 492     //result.toArray(finalResult);
 493     result_len = 0;
 494     el = -1;
 495     ne = result->nextElement(el);
 496     while(ne != NULL) {
 497       UnicodeString finResult = *((UnicodeString *)(ne->value.pointer));
 498       finalResult[result_len++] = finResult;
 499       ne = result->nextElement(el);
 500     }
 501
 502
 503     delete permutations;
 504     delete basic;
 505     delete result;
 506     return finalResult;
 507 }
 508
 509 Hashtable *CanonicalIterator::getEquivalents2(const UChar *segment, int32_t segLen, UErrorCode &status) {
 510 //Hashtable *CanonicalIterator::getEquivalents2(const UnicodeString &segment, int32_t segLen, UErrorCode &status) {
 511
 512     Hashtable *result = new Hashtable(status);
 513     /* test for NULL */
 514     if (result == 0) {
 515         status = U_MEMORY_ALLOCATION_ERROR;
 516         return 0;
 517     }
 518     if (U_SUCCESS(status)) {
 519         result->setValueDeleter(uhash_deleteUnicodeString);
 520     }
 521
 522     //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(segment)));
 523
 524     UnicodeString toPut(segment, segLen);
 525
 526     result->put(toPut, new UnicodeString(toPut), status);
 527
 528     USerializedSet starts;
 529
 530     // cycle through all the characters
 531     UChar32 cp, end = 0;
 532     int32_t i = 0, j;
 533     for (i = 0; i < segLen; i += UTF16_CHAR_LENGTH(cp)) {
 534         // see if any character is at the start of some decomposition
 535         UTF_GET_CHAR(segment, 0, i, segLen, cp);
 536         if (!unorm_getCanonStartSet(cp, &starts)) {
 537           continue;
 538         }
 539         // if so, see which decompositions match
 540         for(j = 0, cp = end+1; cp <= end || uset_getSerializedRange(&starts, j++, &cp, &end); ++cp) {
 541             //Hashtable *remainder = extract(cp, segment, segLen, i, status);
 542             Hashtable *remainder = extract(cp, segment, segLen, i, status);
 543             if (remainder == NULL) continue;
 544
 545             // there were some matches, so add all the possibilities to the set.
 546             UnicodeString prefix(segment, i);
 547             prefix += cp;
 548
 549             const UHashElement *ne = NULL;
 550             int32_t el = -1;
 551             ne = remainder->nextElement(el);
 552             while (ne != NULL) {
 553                 UnicodeString item = *((UnicodeString *)(ne->value.pointer));
 554                 UnicodeString *toAdd = new UnicodeString(prefix);
 555                 /* test for NULL */
 556                 if (toAdd == 0) {
 557                     status = U_MEMORY_ALLOCATION_ERROR;
 558                     delete result;
 559                     delete remainder;
 560                     return 0;
 561                 }
 562                 *toAdd += item;
 563                 result->put(*toAdd, toAdd, status);
 564
 565                 //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(*toAdd)));
 566
 567                 ne = remainder->nextElement(el);
 568             }
 569
 570             delete remainder;
 571         }
 572     }
 573
 574     /* Test for buffer overflows */
 575     if(U_FAILURE(status)) {
 576         return 0;
 577     }
 578     return result;
 579 }
 580
 581 /**
 582  * See if the decomposition of cp2 is at segment starting at segmentPos
 583  * (with canonical rearrangment!)
 584  * If so, take the remainder, and return the equivalents
 585  */
 586 Hashtable *CanonicalIterator::extract(UChar32 comp, const UChar *segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) {
 587 //Hashtable *CanonicalIterator::extract(UChar32 comp, const UnicodeString &segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) {
 588     //if (PROGRESS) printf(" extract: %s, ", UToS(Tr(UnicodeString(comp))));
 589     //if (PROGRESS) printf("%s, %i\n", UToS(Tr(segment)), segmentPos);
 590
 591     const int32_t bufSize = 256;
 592     int32_t bufLen = 0;
 593     UChar temp[bufSize];
 594
 595     int32_t inputLen = 0, decompLen;
 596     UChar stackBuffer[4];
 597     const UChar *decomp;
 598
 599     U16_APPEND_UNSAFE(temp, inputLen, comp);
 600     decomp = unorm_getCanonicalDecomposition(comp, stackBuffer, &decompLen);
 601     if(decomp == NULL) {
 602         /* copy temp */
 603         stackBuffer[0] = temp[0];
 604         if(inputLen > 1) {
 605             stackBuffer[1] = temp[1];
 606         }
 607         decomp = stackBuffer;
 608         decompLen = inputLen;
 609     }
 610
 611     UChar *buff = temp+inputLen;
 612
 613     // See if it matches the start of segment (at segmentPos)
 614     UBool ok = FALSE;
 615     UChar32 cp;
 616     int32_t decompPos = 0;
 617     UChar32 decompCp;
 618     UTF_NEXT_CHAR(decomp, decompPos, decompLen, decompCp);
 619
 620     int32_t i;
 621     UBool overflow = FALSE;
 622
 623     i = segmentPos;
 624     while(i < segLen) {
 625       UTF_NEXT_CHAR(segment, i, segLen, cp);
 626
 627         if (cp == decompCp) { // if equal, eat another cp from decomp
 628
 629             //if (PROGRESS) printf("  matches: %s\n", UToS(Tr(UnicodeString(cp))));
 630
 631             if (decompPos == decompLen) { // done, have all decomp characters!
 632                 //u_strcat(buff+bufLen, segment+i);
 633                 uprv_memcpy(buff+bufLen, segment+i, (segLen-i)*sizeof(UChar));
 634                 bufLen+=segLen-i;
 635
 636                 ok = TRUE;
 637                 break;
 638             }
 639             UTF_NEXT_CHAR(decomp, decompPos, decompLen, decompCp);
 640         } else {
 641             //if (PROGRESS) printf("  buffer: %s\n", UToS(Tr(UnicodeString(cp))));
 642
 643             // brute force approach
 644
 645             U16_APPEND(buff, bufLen, bufSize, cp, overflow);
 646
 647             if(overflow) {
 648                 /*
 649                  * ### TODO handle buffer overflow
 650                  * The buffer is large, but an overflow may still happen with
 651                  * unusual input (many combining marks?).
 652                  * Reallocate buffer and continue.
 653                  * markus 20020929
 654                  */
 655
 656                 overflow = FALSE;
 657             }
 658
 659             /* TODO: optimize
 660             // since we know that the classes are monotonically increasing, after zero
 661             // e.g. 0 5 7 9 0 3
 662             // we can do an optimization
 663             // there are only a few cases that work: zero, less, same, greater
 664             // if both classes are the same, we fail
 665             // if the decomp class < the segment class, we fail
 666
 667             segClass = getClass(cp);
 668             if (decompClass <= segClass) return null;
 669             */
 670         }
 671     }
 672     if (!ok) return NULL; // we failed, characters left over
 673
 674     //if (PROGRESS) printf("Matches\n");
 675
 676     if (bufLen == 0) {
 677       Hashtable *result = new Hashtable(status);
 678       /* test for NULL */
 679       if (result == 0) {
 680           status = U_MEMORY_ALLOCATION_ERROR;
 681           return 0;
 682       }
 683       result->setValueDeleter(uhash_deleteUnicodeString);
 684       result->put(UnicodeString(), new UnicodeString(), status);
 685       return result; // succeed, but no remainder
 686     }
 687
 688     // brute force approach
 689     // check to make sure result is canonically equivalent
 690     int32_t tempLen = inputLen + bufLen;
 691
 692     UChar trial[bufSize];
 693     unorm_decompose(trial, bufSize, temp, tempLen, FALSE, 0, &status);
 694
 695     /* Test for buffer overflows */
 696     if(U_FAILURE(status)) {
 697         return 0;
 698     }
 699
 700     if(uprv_memcmp(segment+segmentPos, trial, (segLen - segmentPos)*sizeof(UChar)) != 0) {
 701       return NULL;
 702     }
 703
 704     return getEquivalents2(buff, bufLen, status);
 705 }
 706
 707 U_NAMESPACE_END
 708
 709 #endif /* #if !UCONFIG_NO_NORMALIZATION */