2 *****************************************************************************
3 * Copyright (C) 1996-2006, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 *****************************************************************************
8 #include "unicode/utypes.h"
10 #if !UCONFIG_NO_NORMALIZATION
12 #include "unicode/uset.h"
13 #include "unicode/ustring.h"
16 #include "unicode/caniter.h"
17 #include "unicode/normlzr.h"
18 #include "unicode/uchar.h"
22 * This class allows one to iterate through all the strings that are canonically equivalent to a given
23 * string. For example, here are some sample results:
24 Results for: {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
25 1: \u0041\u030A\u0064\u0307\u0327
26 = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
27 2: \u0041\u030A\u0064\u0327\u0307
28 = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
29 3: \u0041\u030A\u1E0B\u0327
30 = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
31 4: \u0041\u030A\u1E11\u0307
32 = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
33 5: \u00C5\u0064\u0307\u0327
34 = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
35 6: \u00C5\u0064\u0327\u0307
36 = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
38 = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
40 = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
41 9: \u212B\u0064\u0307\u0327
42 = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
43 10: \u212B\u0064\u0327\u0307
44 = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
45 11: \u212B\u1E0B\u0327
46 = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
47 12: \u212B\u1E11\u0307
48 = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
49 *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones,
50 * since it has not been optimized for that situation.
59 // TODO: add boilerplate methods.
61 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CanonicalIterator
)
64 *@param source string to get results for
66 CanonicalIterator::CanonicalIterator(const UnicodeString
&sourceStr
, UErrorCode
&status
) :
73 if(U_SUCCESS(status
)) {
74 setSource(sourceStr
, status
);
78 CanonicalIterator::~CanonicalIterator() {
82 void CanonicalIterator::cleanPieces() {
85 for(i
= 0; i
< pieces_length
; i
++) {
86 if(pieces
[i
] != NULL
) {
94 if(pieces_lengths
!= NULL
) {
95 uprv_free(pieces_lengths
);
96 pieces_lengths
= NULL
;
106 *@return gets the source: NOTE: it is the NFD form of source
108 UnicodeString
CanonicalIterator::getSource() {
113 * Resets the iterator so that one can start again from the beginning.
115 void CanonicalIterator::reset() {
117 for (int i
= 0; i
< current_length
; ++i
) {
123 *@return the next string that is canonically equivalent. The value null is returned when
124 * the iteration is done.
126 UnicodeString
CanonicalIterator::next() {
134 // delete old contents
137 // construct return value
139 for (i
= 0; i
< pieces_length
; ++i
) {
140 buffer
.append(pieces
[i
][current
[i
]]);
142 //String result = buffer.toString(); // not needed
144 // find next value for next time
146 for (i
= current_length
- 1; ; --i
) {
152 if (current
[i
] < pieces_lengths
[i
]) break; // got sequence
159 *@param set the source string to iterate against. This allows the same iterator to be used
160 * while changing the source string, saving object creation.
162 void CanonicalIterator::setSource(const UnicodeString
&newSource
, UErrorCode
&status
) {
163 int32_t list_length
= 0;
167 UnicodeString
*list
= NULL
;
169 Normalizer::normalize(newSource
, UNORM_NFD
, 0, source
, status
);
170 if(U_FAILURE(status
)) {
177 // catch degenerate case
178 if (newSource
.length() == 0) {
179 pieces
= (UnicodeString
**)uprv_malloc(sizeof(UnicodeString
*));
180 pieces_lengths
= (int32_t*)uprv_malloc(1 * sizeof(int32_t));
182 current
= (int32_t*)uprv_malloc(1 * sizeof(int32_t));
184 if (pieces
== NULL
|| pieces_lengths
== NULL
|| current
== NULL
) {
185 status
= U_MEMORY_ALLOCATION_ERROR
;
186 goto CleanPartialInitialization
;
189 pieces
[0] = new UnicodeString
[1];
190 pieces_lengths
[0] = 1;
191 if (pieces
[0] == 0) {
192 status
= U_MEMORY_ALLOCATION_ERROR
;
193 goto CleanPartialInitialization
;
199 list
= new UnicodeString
[source
.length()];
201 status
= U_MEMORY_ALLOCATION_ERROR
;
202 goto CleanPartialInitialization
;
205 // i should initialy be the number of code units at the
206 // start of the string
207 i
= UTF16_CHAR_LENGTH(source
.char32At(0));
210 // This code iterates through the source string and
211 // extracts segments that end up on a codepoint that
212 // doesn't start any decompositions. (Analysis is done
213 // on the NFD form - see above).
214 for (; i
< source
.length(); i
+= UTF16_CHAR_LENGTH(cp
)) {
215 cp
= source
.char32At(i
);
216 if (unorm_isCanonSafeStart(cp
)) {
217 source
.extract(start
, i
-start
, list
[list_length
++]); // add up to i
221 source
.extract(start
, i
-start
, list
[list_length
++]); // add last one
224 // allocate the arrays, and find the strings that are CE to each segment
225 pieces
= (UnicodeString
**)uprv_malloc(list_length
* sizeof(UnicodeString
*));
226 pieces_length
= list_length
;
227 pieces_lengths
= (int32_t*)uprv_malloc(list_length
* sizeof(int32_t));
228 current
= (int32_t*)uprv_malloc(list_length
* sizeof(int32_t));
229 current_length
= list_length
;
230 if (pieces
== NULL
|| pieces_lengths
== NULL
|| current
== NULL
) {
231 status
= U_MEMORY_ALLOCATION_ERROR
;
232 goto CleanPartialInitialization
;
235 for (i
= 0; i
< current_length
; i
++) {
238 // for each segment, get all the combinations that can produce
239 // it after NFD normalization
240 for (i
= 0; i
< pieces_length
; ++i
) {
241 //if (PROGRESS) printf("SEGMENT\n");
242 pieces
[i
] = getEquivalents(list
[i
], pieces_lengths
[i
], status
);
247 // Common section to cleanup all local variables and reset object variables.
248 CleanPartialInitialization
:
256 * Dumb recursive implementation of permutation.
258 * @param source the string to find permutations for
259 * @return the results in a set.
261 void U_EXPORT2
CanonicalIterator::permute(UnicodeString
&source
, UBool skipZeros
, Hashtable
*result
, UErrorCode
&status
) {
262 if(U_FAILURE(status
)) {
265 //if (PROGRESS) printf("Permute: %s\n", UToS(Tr(source)));
269 // if zero or one character, just return a set with it
270 // we check for length < 2 to keep from counting code points all the time
271 if (source
.length() <= 2 && source
.countChar32() <= 1) {
272 UnicodeString
*toPut
= new UnicodeString(source
);
275 status
= U_MEMORY_ALLOCATION_ERROR
;
278 result
->put(source
, toPut
, status
);
282 // otherwise iterate through the string, and recursively permute all the other characters
284 Hashtable
subpermute(status
);
285 if(U_FAILURE(status
)) {
288 subpermute
.setValueDeleter(uhash_deleteUnicodeString
);
290 for (i
= 0; i
< source
.length(); i
+= UTF16_CHAR_LENGTH(cp
)) {
291 cp
= source
.char32At(i
);
292 const UHashElement
*ne
= NULL
;
294 UnicodeString subPermuteString
= source
;
297 // if the character is canonical combining class zero,
299 if (skipZeros
&& i
!= 0 && u_getCombiningClass(cp
) == 0) {
300 //System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i)));
304 subpermute
.removeAll();
306 // see what the permutations of the characters before and after this one are
307 //Hashtable *subpermute = permute(source.substring(0,i) + source.substring(i + UTF16.getCharCount(cp)));
308 permute(subPermuteString
.replace(i
, UTF16_CHAR_LENGTH(cp
), NULL
, 0), skipZeros
, &subpermute
, status
);
309 /* Test for buffer overflows */
310 if(U_FAILURE(status
)) {
313 // The upper replace is destructive. The question is do we have to make a copy, or we don't care about the contents
314 // of source at this point.
316 // prefix this character to all of them
317 ne
= subpermute
.nextElement(el
);
319 UnicodeString
*permRes
= (UnicodeString
*)(ne
->value
.pointer
);
320 UnicodeString
*chStr
= new UnicodeString(cp
);
323 status
= U_MEMORY_ALLOCATION_ERROR
;
326 chStr
->append(*permRes
); //*((UnicodeString *)(ne->value.pointer));
327 //if (PROGRESS) printf(" Piece: %s\n", UToS(*chStr));
328 result
->put(*chStr
, chStr
, status
);
329 ne
= subpermute
.nextElement(el
);
337 // we have a segment, in NFD. Find all the strings that are canonically equivalent to it.
338 UnicodeString
* CanonicalIterator::getEquivalents(const UnicodeString
&segment
, int32_t &result_len
, UErrorCode
&status
) {
339 Hashtable
result(status
);
340 Hashtable
permutations(status
);
341 Hashtable
basic(status
);
342 if (U_FAILURE(status
)) {
345 result
.setValueDeleter(uhash_deleteUnicodeString
);
346 permutations
.setValueDeleter(uhash_deleteUnicodeString
);
347 basic
.setValueDeleter(uhash_deleteUnicodeString
);
350 int32_t segLen
= segment
.extract(USeg
, 256, status
);
351 getEquivalents2(&basic
, USeg
, segLen
, status
);
353 // now get all the permutations
354 // add only the ones that are canonically equivalent
355 // TODO: optimize by not permuting any class zero.
357 const UHashElement
*ne
= NULL
;
359 //Iterator it = basic.iterator();
360 ne
= basic
.nextElement(el
);
361 //while (it.hasNext())
363 //String item = (String) it.next();
364 UnicodeString item
= *((UnicodeString
*)(ne
->value
.pointer
));
366 permutations
.removeAll();
367 permute(item
, CANITER_SKIP_ZEROES
, &permutations
, status
);
368 const UHashElement
*ne2
= NULL
;
370 //Iterator it2 = permutations.iterator();
371 ne2
= permutations
.nextElement(el2
);
372 //while (it2.hasNext())
373 while (ne2
!= NULL
) {
374 //String possible = (String) it2.next();
375 //UnicodeString *possible = new UnicodeString(*((UnicodeString *)(ne2->value.pointer)));
376 UnicodeString
possible(*((UnicodeString
*)(ne2
->value
.pointer
)));
377 UnicodeString attempt
;
378 Normalizer::normalize(possible
, UNORM_NFD
, 0, attempt
, status
);
380 // TODO: check if operator == is semanticaly the same as attempt.equals(segment)
381 if (attempt
==segment
) {
382 //if (PROGRESS) printf("Adding Permutation: %s\n", UToS(Tr(*possible)));
383 // TODO: use the hashtable just to catch duplicates - store strings directly (somehow).
384 result
.put(possible
, new UnicodeString(possible
), status
); //add(possible);
386 //if (PROGRESS) printf("-Skipping Permutation: %s\n", UToS(Tr(*possible)));
389 ne2
= permutations
.nextElement(el2
);
391 ne
= basic
.nextElement(el
);
394 /* Test for buffer overflows */
395 if(U_FAILURE(status
)) {
398 // convert into a String[] to clean up storage
399 //String[] finalResult = new String[result.size()];
400 UnicodeString
*finalResult
= NULL
;
402 if((resultCount
= result
.count())) {
403 finalResult
= new UnicodeString
[resultCount
];
404 if (finalResult
== 0) {
405 status
= U_MEMORY_ALLOCATION_ERROR
;
410 status
= U_ILLEGAL_ARGUMENT_ERROR
;
413 //result.toArray(finalResult);
416 ne
= result
.nextElement(el
);
418 finalResult
[result_len
++] = *((UnicodeString
*)(ne
->value
.pointer
));
419 ne
= result
.nextElement(el
);
426 Hashtable
*CanonicalIterator::getEquivalents2(Hashtable
*fillinResult
, const UChar
*segment
, int32_t segLen
, UErrorCode
&status
) {
428 if (U_FAILURE(status
)) {
432 //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(segment)));
434 UnicodeString
toPut(segment
, segLen
);
436 fillinResult
->put(toPut
, new UnicodeString(toPut
), status
);
438 USerializedSet starts
;
440 // cycle through all the characters
443 for (i
= 0; i
< segLen
; i
+= UTF16_CHAR_LENGTH(cp
)) {
444 // see if any character is at the start of some decomposition
445 UTF_GET_CHAR(segment
, 0, i
, segLen
, cp
);
446 if (!unorm_getCanonStartSet(cp
, &starts
)) {
449 // if so, see which decompositions match
450 for(j
= 0, cp
= end
+1; cp
<= end
|| uset_getSerializedRange(&starts
, j
++, &cp
, &end
); ++cp
) {
451 Hashtable
remainder(status
);
452 remainder
.setValueDeleter(uhash_deleteUnicodeString
);
453 if (extract(&remainder
, cp
, segment
, segLen
, i
, status
) == NULL
) {
457 // there were some matches, so add all the possibilities to the set.
458 UnicodeString
prefix(segment
, i
);
462 const UHashElement
*ne
= remainder
.nextElement(el
);
464 UnicodeString item
= *((UnicodeString
*)(ne
->value
.pointer
));
465 UnicodeString
*toAdd
= new UnicodeString(prefix
);
468 status
= U_MEMORY_ALLOCATION_ERROR
;
472 fillinResult
->put(*toAdd
, toAdd
, status
);
474 //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(*toAdd)));
476 ne
= remainder
.nextElement(el
);
481 /* Test for buffer overflows */
482 if(U_FAILURE(status
)) {
489 * See if the decomposition of cp2 is at segment starting at segmentPos
490 * (with canonical rearrangment!)
491 * If so, take the remainder, and return the equivalents
493 Hashtable
*CanonicalIterator::extract(Hashtable
*fillinResult
, UChar32 comp
, const UChar
*segment
, int32_t segLen
, int32_t segmentPos
, UErrorCode
&status
) {
494 //Hashtable *CanonicalIterator::extract(UChar32 comp, const UnicodeString &segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) {
495 //if (PROGRESS) printf(" extract: %s, ", UToS(Tr(UnicodeString(comp))));
496 //if (PROGRESS) printf("%s, %i\n", UToS(Tr(segment)), segmentPos);
498 if (U_FAILURE(status
)) {
502 const int32_t bufSize
= 256;
506 int32_t inputLen
= 0, decompLen
;
507 UChar stackBuffer
[4];
510 U16_APPEND_UNSAFE(temp
, inputLen
, comp
);
511 decomp
= unorm_getCanonicalDecomposition(comp
, stackBuffer
, &decompLen
);
514 stackBuffer
[0] = temp
[0];
516 stackBuffer
[1] = temp
[1];
518 decomp
= stackBuffer
;
519 decompLen
= inputLen
;
522 UChar
*buff
= temp
+inputLen
;
524 // See if it matches the start of segment (at segmentPos)
527 int32_t decompPos
= 0;
529 UTF_NEXT_CHAR(decomp
, decompPos
, decompLen
, decompCp
);
532 UBool overflow
= FALSE
;
536 UTF_NEXT_CHAR(segment
, i
, segLen
, cp
);
538 if (cp
== decompCp
) { // if equal, eat another cp from decomp
540 //if (PROGRESS) printf(" matches: %s\n", UToS(Tr(UnicodeString(cp))));
542 if (decompPos
== decompLen
) { // done, have all decomp characters!
543 //u_strcat(buff+bufLen, segment+i);
544 uprv_memcpy(buff
+bufLen
, segment
+i
, (segLen
-i
)*sizeof(UChar
));
550 UTF_NEXT_CHAR(decomp
, decompPos
, decompLen
, decompCp
);
552 //if (PROGRESS) printf(" buffer: %s\n", UToS(Tr(UnicodeString(cp))));
554 // brute force approach
556 U16_APPEND(buff
, bufLen
, bufSize
, cp
, overflow
);
560 * ### TODO handle buffer overflow
561 * The buffer is large, but an overflow may still happen with
562 * unusual input (many combining marks?).
563 * Reallocate buffer and continue.
571 // since we know that the classes are monotonically increasing, after zero
573 // we can do an optimization
574 // there are only a few cases that work: zero, less, same, greater
575 // if both classes are the same, we fail
576 // if the decomp class < the segment class, we fail
578 segClass = getClass(cp);
579 if (decompClass <= segClass) return null;
584 return NULL
; // we failed, characters left over
586 //if (PROGRESS) printf("Matches\n");
589 fillinResult
->put(UnicodeString(), new UnicodeString(), status
);
590 return fillinResult
; // succeed, but no remainder
593 // brute force approach
594 // check to make sure result is canonically equivalent
595 int32_t tempLen
= inputLen
+ bufLen
;
597 UChar trial
[bufSize
];
598 unorm_decompose(trial
, bufSize
, temp
, tempLen
, FALSE
, 0, &status
);
601 || uprv_memcmp(segment
+segmentPos
, trial
, (segLen
- segmentPos
)*sizeof(UChar
)) != 0)
606 return getEquivalents2(fillinResult
, buff
, bufLen
, status
);
611 #endif /* #if !UCONFIG_NO_NORMALIZATION */