2 *****************************************************************************
3 * Copyright (C) 1996-2004, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 *****************************************************************************
8 #include "unicode/utypes.h"
10 #if !UCONFIG_NO_NORMALIZATION
12 #include "unicode/uset.h"
13 #include "unicode/ustring.h"
16 #include "unicode/caniter.h"
17 #include "unicode/normlzr.h"
18 #include "unicode/uchar.h"
22 * This class allows one to iterate through all the strings that are canonically equivalent to a given
23 * string. For example, here are some sample results:
24 Results for: {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
25 1: \u0041\u030A\u0064\u0307\u0327
26 = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
27 2: \u0041\u030A\u0064\u0327\u0307
28 = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
29 3: \u0041\u030A\u1E0B\u0327
30 = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
31 4: \u0041\u030A\u1E11\u0307
32 = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
33 5: \u00C5\u0064\u0307\u0327
34 = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
35 6: \u00C5\u0064\u0327\u0307
36 = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
38 = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
40 = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
41 9: \u212B\u0064\u0307\u0327
42 = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
43 10: \u212B\u0064\u0327\u0307
44 = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
45 11: \u212B\u1E0B\u0327
46 = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
47 12: \u212B\u1E11\u0307
48 = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
49 *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones,
50 * since it has not been optimized for that situation.
55 static UBool PROGRESS
= FALSE
;
58 #include "unicode/translit.h"
60 UErrorCode status
= U_ZERO_ERROR
;
62 // Just for testing - remove, not thread safe.
63 static const char* UToS(const UnicodeString
&source
) {
64 static char buffer
[256];
65 buffer
[source
.extract(0, source
.length(), buffer
)] = 0;
69 static const UnicodeString
&Tr(const UnicodeString
&source
) {
70 static Transliterator
*NAME
= Transliterator::createInstance("name", UTRANS_FORWARD
, status
);
71 static UnicodeString result
;
73 NAME
->transliterate(result
);
81 // TODO: add boilerplate methods.
83 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CanonicalIterator
)
86 *@param source string to get results for
88 CanonicalIterator::CanonicalIterator(const UnicodeString
&sourceStr
, UErrorCode
&status
) :
95 if(U_SUCCESS(status
)) {
96 setSource(sourceStr
, status
);
100 CanonicalIterator::~CanonicalIterator() {
104 void CanonicalIterator::cleanPieces() {
107 for(i
= 0; i
< pieces_length
; i
++) {
108 if(pieces
[i
] != NULL
) {
114 if(pieces_lengths
!= NULL
) {
115 uprv_free(pieces_lengths
);
117 pieces_lengths
= NULL
;
118 if(current
!= NULL
) {
126 *@return gets the source: NOTE: it is the NFD form of source
128 UnicodeString
CanonicalIterator::getSource() {
133 * Resets the iterator so that one can start again from the beginning.
135 void CanonicalIterator::reset() {
137 for (int i
= 0; i
< current_length
; ++i
) {
143 *@return the next string that is canonically equivalent. The value null is returned when
144 * the iteration is done.
146 UnicodeString
CanonicalIterator::next() {
154 // delete old contents
157 // construct return value
159 for (i
= 0; i
< pieces_length
; ++i
) {
160 buffer
.append(pieces
[i
][current
[i
]]);
162 //String result = buffer.toString(); // not needed
164 // find next value for next time
166 for (i
= current_length
- 1; ; --i
) {
172 if (current
[i
] < pieces_lengths
[i
]) break; // got sequence
179 *@param set the source string to iterate against. This allows the same iterator to be used
180 * while changing the source string, saving object creation.
182 void CanonicalIterator::setSource(const UnicodeString
&newSource
, UErrorCode
&status
) {
183 Normalizer::normalize(newSource
, UNORM_NFD
, 0, source
, status
);
184 if(U_FAILURE(status
)) {
191 // catch degenerate case
192 if (newSource
.length() == 0) {
194 pieces
= (UnicodeString
**)uprv_malloc(sizeof(UnicodeString
*));
196 if (pieces
== NULL
) {
197 status
= U_MEMORY_ALLOCATION_ERROR
;
201 current
= (int32_t*)uprv_malloc(1 * sizeof(int32_t));
203 if (current
== NULL
) {
204 status
= U_MEMORY_ALLOCATION_ERROR
;
210 pieces
[0] = new UnicodeString
[1];
212 if (pieces
[0] == 0) {
213 status
= U_MEMORY_ALLOCATION_ERROR
;
219 pieces
[0][0] = UnicodeString();
220 pieces_lengths
= (int32_t*)uprv_malloc(1 * sizeof(int32_t));
222 if (pieces_lengths
== 0) {
223 status
= U_MEMORY_ALLOCATION_ERROR
;
229 pieces_lengths
[0] = 1;
234 UnicodeString
*list
= new UnicodeString
[source
.length()];
237 status
= U_MEMORY_ALLOCATION_ERROR
;
241 int32_t list_length
= 0;
244 // i should initialy be the number of code units at the
245 // start of the string
246 int32_t i
= UTF16_CHAR_LENGTH(source
.char32At(0));
249 // This code iterates through the source string and
250 // extracts segments that end up on a codepoint that
251 // doesn't start any decompositions. (Analysis is done
252 // on the NFD form - see above).
253 for (; i
< source
.length(); i
+= UTF16_CHAR_LENGTH(cp
)) {
254 cp
= source
.char32At(i
);
255 if (unorm_isCanonSafeStart(cp
)) {
256 source
.extract(start
, i
-start
, list
[list_length
++]); // add up to i
260 source
.extract(start
, i
-start
, list
[list_length
++]); // add last one
263 // allocate the arrays, and find the strings that are CE to each segment
264 pieces
= (UnicodeString
**)uprv_malloc(list_length
* sizeof(UnicodeString
*));
266 if (pieces
== NULL
) {
267 status
= U_MEMORY_ALLOCATION_ERROR
;
271 pieces_length
= list_length
;
272 pieces_lengths
= (int32_t*)uprv_malloc(list_length
* sizeof(int32_t));
274 if (pieces_lengths
== 0) {
275 status
= U_MEMORY_ALLOCATION_ERROR
;
282 current_length
= list_length
;
283 current
= (int32_t*)uprv_malloc(list_length
* sizeof(int32_t));
286 status
= U_MEMORY_ALLOCATION_ERROR
;
290 uprv_free(pieces_lengths
);
293 for (i
= 0; i
< current_length
; i
++) {
296 // for each segment, get all the combinations that can produce
297 // it after NFD normalization
298 for (i
= 0; i
< pieces_length
; ++i
) {
299 //if (PROGRESS) printf("SEGMENT\n");
300 pieces
[i
] = getEquivalents(list
[i
], pieces_lengths
[i
], status
);
307 * Dumb recursive implementation of permutation.
309 * @param source the string to find permutations for
310 * @return the results in a set.
312 void U_EXPORT2
CanonicalIterator::permute(UnicodeString
&source
, UBool skipZeros
, Hashtable
*result
, UErrorCode
&status
) {
313 if(U_FAILURE(status
)) {
316 //if (PROGRESS) printf("Permute: %s\n", UToS(Tr(source)));
320 // if zero or one character, just return a set with it
321 // we check for length < 2 to keep from counting code points all the time
322 if (source
.length() <= 2 && source
.countChar32() <= 1) {
323 UnicodeString
*toPut
= new UnicodeString(source
);
326 status
= U_MEMORY_ALLOCATION_ERROR
;
329 result
->put(source
, toPut
, status
);
333 // otherwise iterate through the string, and recursively permute all the other characters
335 Hashtable
*subpermute
= new Hashtable(status
);
337 if (subpermute
== 0) {
338 status
= U_MEMORY_ALLOCATION_ERROR
;
341 if (U_SUCCESS(status
)) {
342 subpermute
->setValueDeleter(uhash_deleteUnicodeString
);
345 for (i
= 0; i
< source
.length(); i
+= UTF16_CHAR_LENGTH(cp
)) {
346 cp
= source
.char32At(i
);
347 const UHashElement
*ne
= NULL
;
349 UnicodeString subPermuteString
= source
;
352 // if the character is canonical combining class zero,
354 if (skipZeros
&& i
!= 0 && u_getCombiningClass(cp
) == 0) {
355 //System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i)));
359 subpermute
->removeAll();
361 // see what the permutations of the characters before and after this one are
362 //Hashtable *subpermute = permute(source.substring(0,i) + source.substring(i + UTF16.getCharCount(cp)));
363 permute(subPermuteString
.replace(i
, UTF16_CHAR_LENGTH(cp
), NULL
, 0), skipZeros
, subpermute
, status
);
364 /* Test for buffer overflows */
365 if(U_FAILURE(status
)) {
369 // The upper replace is destructive. The question is do we have to make a copy, or we don't care about the contents
370 // of source at this point.
372 // prefix this character to all of them
373 ne
= subpermute
->nextElement(el
);
375 UnicodeString
*permRes
= (UnicodeString
*)(ne
->value
.pointer
);
376 UnicodeString
*chStr
= new UnicodeString(cp
);
379 status
= U_MEMORY_ALLOCATION_ERROR
;
383 chStr
->append(*permRes
); //*((UnicodeString *)(ne->value.pointer));
384 //if (PROGRESS) printf(" Piece: %s\n", UToS(*chStr));
385 result
->put(*chStr
, chStr
, status
);
386 ne
= subpermute
->nextElement(el
);
395 // we have a segment, in NFD. Find all the strings that are canonically equivalent to it.
396 UnicodeString
* CanonicalIterator::getEquivalents(const UnicodeString
&segment
, int32_t &result_len
, UErrorCode
&status
) {
397 //private String[] getEquivalents(String segment)
399 Hashtable
*result
= new Hashtable(status
);
402 status
= U_MEMORY_ALLOCATION_ERROR
;
405 if (U_SUCCESS(status
)) {
406 result
->setValueDeleter(uhash_deleteUnicodeString
);
409 int32_t segLen
= segment
.extract(USeg
, 256, status
);
410 Hashtable
*basic
= getEquivalents2(USeg
, segLen
, status
);
411 //Hashtable *basic = getEquivalents2(segment, segLen, status);
413 // now get all the permutations
414 // add only the ones that are canonically equivalent
415 // TODO: optimize by not permuting any class zero.
417 Hashtable
*permutations
= new Hashtable(status
);
419 if (permutations
== 0) {
420 status
= U_MEMORY_ALLOCATION_ERROR
;
425 if (U_SUCCESS(status
)) {
426 permutations
->setValueDeleter(uhash_deleteUnicodeString
);
429 const UHashElement
*ne
= NULL
;
431 //Iterator it = basic.iterator();
432 ne
= basic
->nextElement(el
);
433 //while (it.hasNext())
435 //String item = (String) it.next();
436 UnicodeString item
= *((UnicodeString
*)(ne
->value
.pointer
));
438 permutations
->removeAll();
439 permute(item
, CANITER_SKIP_ZEROES
, permutations
, status
);
440 const UHashElement
*ne2
= NULL
;
442 //Iterator it2 = permutations.iterator();
443 ne2
= permutations
->nextElement(el2
);
444 //while (it2.hasNext())
445 while (ne2
!= NULL
) {
446 //String possible = (String) it2.next();
447 //UnicodeString *possible = new UnicodeString(*((UnicodeString *)(ne2->value.pointer)));
448 UnicodeString
possible(*((UnicodeString
*)(ne2
->value
.pointer
)));
449 UnicodeString attempt
;
450 Normalizer::normalize(possible
, UNORM_NFD
, 0, attempt
, status
);
452 // TODO: check if operator == is semanticaly the same as attempt.equals(segment)
453 if (attempt
==segment
) {
454 //if (PROGRESS) printf("Adding Permutation: %s\n", UToS(Tr(*possible)));
455 // TODO: use the hashtable just to catch duplicates - store strings directly (somehow).
456 result
->put(possible
, new UnicodeString(possible
), status
); //add(possible);
458 //if (PROGRESS) printf("-Skipping Permutation: %s\n", UToS(Tr(*possible)));
461 ne2
= permutations
->nextElement(el2
);
463 ne
= basic
->nextElement(el
);
466 /* Test for buffer overflows */
467 if(U_FAILURE(status
)) {
473 // convert into a String[] to clean up storage
474 //String[] finalResult = new String[result.size()];
475 UnicodeString
*finalResult
= NULL
;
477 if((resultCount
= result
->count())) {
478 finalResult
= new UnicodeString
[resultCount
];
480 status
= U_ILLEGAL_ARGUMENT_ERROR
;
483 if (finalResult
== 0) {
484 if(U_SUCCESS(status
)) {
485 status
= U_MEMORY_ALLOCATION_ERROR
;
492 //result.toArray(finalResult);
495 ne
= result
->nextElement(el
);
497 UnicodeString finResult
= *((UnicodeString
*)(ne
->value
.pointer
));
498 finalResult
[result_len
++] = finResult
;
499 ne
= result
->nextElement(el
);
509 Hashtable
*CanonicalIterator::getEquivalents2(const UChar
*segment
, int32_t segLen
, UErrorCode
&status
) {
510 //Hashtable *CanonicalIterator::getEquivalents2(const UnicodeString &segment, int32_t segLen, UErrorCode &status) {
512 Hashtable
*result
= new Hashtable(status
);
515 status
= U_MEMORY_ALLOCATION_ERROR
;
518 if (U_SUCCESS(status
)) {
519 result
->setValueDeleter(uhash_deleteUnicodeString
);
522 //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(segment)));
524 UnicodeString
toPut(segment
, segLen
);
526 result
->put(toPut
, new UnicodeString(toPut
), status
);
528 USerializedSet starts
;
530 // cycle through all the characters
533 for (i
= 0; i
< segLen
; i
+= UTF16_CHAR_LENGTH(cp
)) {
534 // see if any character is at the start of some decomposition
535 UTF_GET_CHAR(segment
, 0, i
, segLen
, cp
);
536 if (!unorm_getCanonStartSet(cp
, &starts
)) {
539 // if so, see which decompositions match
540 for(j
= 0, cp
= end
+1; cp
<= end
|| uset_getSerializedRange(&starts
, j
++, &cp
, &end
); ++cp
) {
541 //Hashtable *remainder = extract(cp, segment, segLen, i, status);
542 Hashtable
*remainder
= extract(cp
, segment
, segLen
, i
, status
);
543 if (remainder
== NULL
) continue;
545 // there were some matches, so add all the possibilities to the set.
546 UnicodeString
prefix(segment
, i
);
549 const UHashElement
*ne
= NULL
;
551 ne
= remainder
->nextElement(el
);
553 UnicodeString item
= *((UnicodeString
*)(ne
->value
.pointer
));
554 UnicodeString
*toAdd
= new UnicodeString(prefix
);
557 status
= U_MEMORY_ALLOCATION_ERROR
;
563 result
->put(*toAdd
, toAdd
, status
);
565 //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(*toAdd)));
567 ne
= remainder
->nextElement(el
);
574 /* Test for buffer overflows */
575 if(U_FAILURE(status
)) {
582 * See if the decomposition of cp2 is at segment starting at segmentPos
583 * (with canonical rearrangment!)
584 * If so, take the remainder, and return the equivalents
586 Hashtable
*CanonicalIterator::extract(UChar32 comp
, const UChar
*segment
, int32_t segLen
, int32_t segmentPos
, UErrorCode
&status
) {
587 //Hashtable *CanonicalIterator::extract(UChar32 comp, const UnicodeString &segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) {
588 //if (PROGRESS) printf(" extract: %s, ", UToS(Tr(UnicodeString(comp))));
589 //if (PROGRESS) printf("%s, %i\n", UToS(Tr(segment)), segmentPos);
591 const int32_t bufSize
= 256;
595 int32_t inputLen
= 0, decompLen
;
596 UChar stackBuffer
[4];
599 U16_APPEND_UNSAFE(temp
, inputLen
, comp
);
600 decomp
= unorm_getCanonicalDecomposition(comp
, stackBuffer
, &decompLen
);
603 stackBuffer
[0] = temp
[0];
605 stackBuffer
[1] = temp
[1];
607 decomp
= stackBuffer
;
608 decompLen
= inputLen
;
611 UChar
*buff
= temp
+inputLen
;
613 // See if it matches the start of segment (at segmentPos)
616 int32_t decompPos
= 0;
618 UTF_NEXT_CHAR(decomp
, decompPos
, decompLen
, decompCp
);
621 UBool overflow
= FALSE
;
625 UTF_NEXT_CHAR(segment
, i
, segLen
, cp
);
627 if (cp
== decompCp
) { // if equal, eat another cp from decomp
629 //if (PROGRESS) printf(" matches: %s\n", UToS(Tr(UnicodeString(cp))));
631 if (decompPos
== decompLen
) { // done, have all decomp characters!
632 //u_strcat(buff+bufLen, segment+i);
633 uprv_memcpy(buff
+bufLen
, segment
+i
, (segLen
-i
)*sizeof(UChar
));
639 UTF_NEXT_CHAR(decomp
, decompPos
, decompLen
, decompCp
);
641 //if (PROGRESS) printf(" buffer: %s\n", UToS(Tr(UnicodeString(cp))));
643 // brute force approach
645 U16_APPEND(buff
, bufLen
, bufSize
, cp
, overflow
);
649 * ### TODO handle buffer overflow
650 * The buffer is large, but an overflow may still happen with
651 * unusual input (many combining marks?).
652 * Reallocate buffer and continue.
660 // since we know that the classes are monotonically increasing, after zero
662 // we can do an optimization
663 // there are only a few cases that work: zero, less, same, greater
664 // if both classes are the same, we fail
665 // if the decomp class < the segment class, we fail
667 segClass = getClass(cp);
668 if (decompClass <= segClass) return null;
672 if (!ok
) return NULL
; // we failed, characters left over
674 //if (PROGRESS) printf("Matches\n");
677 Hashtable
*result
= new Hashtable(status
);
680 status
= U_MEMORY_ALLOCATION_ERROR
;
683 result
->setValueDeleter(uhash_deleteUnicodeString
);
684 result
->put(UnicodeString(), new UnicodeString(), status
);
685 return result
; // succeed, but no remainder
688 // brute force approach
689 // check to make sure result is canonically equivalent
690 int32_t tempLen
= inputLen
+ bufLen
;
692 UChar trial
[bufSize
];
693 unorm_decompose(trial
, bufSize
, temp
, tempLen
, FALSE
, 0, &status
);
695 /* Test for buffer overflows */
696 if(U_FAILURE(status
)) {
700 if(uprv_memcmp(segment
+segmentPos
, trial
, (segLen
- segmentPos
)*sizeof(UChar
)) != 0) {
704 return getEquivalents2(buff
, bufLen
, status
);
709 #endif /* #if !UCONFIG_NO_NORMALIZATION */