]> git.saurik.com Git - apple/icu.git/blame - icuSources/tools/gencase/store.c
ICU-8.11.1.tar.gz
[apple/icu.git] / icuSources / tools / gencase / store.c
CommitLineData
374ca955
A
1/*
2*******************************************************************************
3*
73c04bcf 4* Copyright (C) 2004-2005, International Business Machines
374ca955
A
5* Corporation and others. All Rights Reserved.
6*
7*******************************************************************************
8* file name: store.c
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 2004aug28
14* created by: Markus W. Scherer
15*
16* Store Unicode case mapping properties efficiently for
17* random access.
18*/
19
20#include <stdio.h>
21#include <stdlib.h>
22#include "unicode/utypes.h"
23#include "unicode/uchar.h"
24#include "unicode/ustring.h"
25#include "cmemory.h"
26#include "cstring.h"
27#include "filestrm.h"
28#include "utrie.h"
73c04bcf 29#include "uarrsort.h"
374ca955
A
30#include "unicode/udata.h"
31#include "unewdata.h"
32#include "propsvec.h"
73c04bcf 33#include "writesrc.h"
374ca955
A
34#include "gencase.h"
35
73c04bcf
A
36#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
37
374ca955
A
38/* Unicode case mapping properties file format ---------------------------------
39
40The file format prepared and written here contains several data
41structures that store indexes or data.
42
43Before the data contents described below, there are the headers required by
44the udata API for loading ICU data. Especially, a UDataInfo structure
45precedes the actual data. It contains platform properties values and the
46file format version.
47
73c04bcf
A
48The following is a description of format version 1.1 .
49
50Format version 1.1 adds data for case closure.
374ca955
A
51
52The file contains the following structures:
53
54 const int32_t indexes[i0] with values i0, i1, ...:
55 (see UCASE_IX_... constants for names of indexes)
56
57 i0 indexLength; -- length of indexes[] (UCASE_IX_TOP)
58 i1 dataLength; -- length in bytes of the post-header data (incl. indexes[])
59 i2 trieSize; -- size in bytes of the case mapping properties trie
60 i3 exceptionsLength; -- length in uint16_t of the exceptions array
73c04bcf 61 i4 unfoldLength; -- length in uint16_t of the reverse-folding array (new in format version 1.1)
374ca955 62
73c04bcf 63 i5..i14 reservedIndexes; -- reserved values; 0 for now
374ca955
A
64
65 i15 maxFullLength; -- maximum length of a full case mapping/folding string
66
67
73c04bcf 68 Serialized trie, see utrie.h;
374ca955
A
69
70 const uint16_t exceptions[exceptionsLength];
71
73c04bcf
A
72 const UChar unfold[unfoldLength];
73
374ca955
A
74
75Trie data word:
76Bits
77if(exception) {
78 15..4 unsigned exception index
79} else {
80 if(not uncased) {
81 15..6 signed delta to simple case mapping code point
82 (add delta to input code point)
83 } else {
84 6 the code point is case-ignorable
85 (U+0307 is also case-ignorable but has an exception)
86 }
87 5..4 0 normal character with cc=0
88 1 soft-dotted character
89 2 cc=230
90 3 other cc
91}
92 3 exception
93 2 case sensitive
94 1..0 0 uncased
95 1 lowercase
96 2 uppercase
97 3 titlecase
98
99
100Exceptions:
101A sub-array of the exceptions array is indexed by the exception index in a
102trie word.
103The sub-array consists of the following fields:
104 uint16_t excWord;
105 uint16_t optional values [];
106 UTF-16 strings for full (string) mappings for lowercase, case folding, uppercase, titlecase
107
108excWord: (see UCASE_EXC_...)
109Bits
110 15 conditional case folding
111 14 conditional special casing
11213..12 same as non-exception trie data bits 5..4
113 moved here because the exception index needs more bits than the delta
114 0 normal character with cc=0
115 1 soft-dotted character
116 2 cc=230
117 3 other cc
11811.. 9 reserved
119 8 if set, then for each optional-value slot there are 2 uint16_t values
120 (high and low parts of 32-bit values)
121 instead of single ones
122 7.. 0 bits for which optional value is present
123
124Optional-value slots:
1250 lowercase mapping (code point)
1261 case folding (code point)
1272 uppercase mapping (code point)
1283 titlecase mapping (code point)
73c04bcf
A
1294 reserved
1305 reserved
1316 closure mappings (new in format version 1.1)
374ca955
A
1327 there is at least one full (string) case mapping
133 the length of each is encoded in a nibble of this optional value,
134 and the strings follow this optional value in the same order:
135 lower/fold/upper/title
136
73c04bcf
A
137The optional closure mappings value is used as follows:
138Bits 0..3 contain the length of a string of code points for case closure.
139The string immediately follows the full case mappings, or the closure value
140slot if there are no full case mappings.
141Bits 4..15 are reserved and could be used in the future to indicate the
142number of strings for case closure.
143Complete case closure for a code point is given by the union of all simple
144and full case mappings and foldings, plus the case closure code points
145(and potentially, in the future, case closure strings).
146
374ca955
A
147For space saving, some values are not stored. Lookups are as follows:
148- If special casing is conditional, then no full lower/upper/title mapping
149 strings are stored.
150- If case folding is conditional, then no simple or full case foldings are
151 stored.
152- Fall back in this order:
153 full (string) mapping -- if full mappings are used
154 simple (code point) mapping of the same type
155 simple fold->simple lower
156 simple title->simple upper
157 finally, the original code point (no mapping)
158
73c04bcf
A
159This fallback order is strict:
160In particular, the fallback from full case folding is to simple case folding,
161not to full lowercase mapping.
162
163Reverse case folding data ("unfold") array: (new in format version 1.1)
164
165This array stores some miscellaneous values followed by a table. The data maps
166back from multi-character strings to their original code points, for use
167in case closure.
168
169The table contains two columns of strings.
170The string in the first column is the case folding of each of the code points
171in the second column. The strings are terminated with NUL or by the end of the
172column, whichever comes first.
173
174The miscellaneous data takes up one pseudo-row and includes:
175- number of rows
176- number of UChars per row
177- number of UChars in the left (folding string) column
178
179The table is sorted by its first column. Values in the first column are unique.
180
374ca955
A
181----------------------------------------------------------------------------- */
182
183/* UDataInfo cf. udata.h */
184static UDataInfo dataInfo={
185 sizeof(UDataInfo),
186 0,
187
188 U_IS_BIG_ENDIAN,
189 U_CHARSET_FAMILY,
190 U_SIZEOF_UCHAR,
191 0,
192
193 /* dataFormat="cAsE" */
194 { UCASE_FMT_0, UCASE_FMT_1, UCASE_FMT_2, UCASE_FMT_3 },
73c04bcf 195 { 1, 1, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */
374ca955
A
196 { 4, 0, 1, 0 } /* dataVersion */
197};
198
199enum {
200 /* maximum number of exceptions expected */
201 MAX_EXC_COUNT=1000
202};
203
204/* exceptions values */
205static uint16_t exceptions[UCASE_MAX_EXCEPTIONS+100];
206static uint16_t exceptionsTop=0;
207static Props excProps[MAX_EXC_COUNT];
208static uint16_t exceptionsCount=0;
209
210/* becomes indexes[UCASE_IX_MAX_FULL_LENGTH] */
211static int32_t maxFullLength=U16_MAX_LENGTH;
212
73c04bcf
A
213/* reverse case folding ("unfold") data */
214static UChar unfold[UGENCASE_UNFOLD_MAX_ROWS*UGENCASE_UNFOLD_WIDTH]={
215 0, UGENCASE_UNFOLD_WIDTH, UGENCASE_UNFOLD_STRING_WIDTH, 0, 0
216};
217static uint16_t unfoldRows=0;
218static uint16_t unfoldTop=UGENCASE_UNFOLD_WIDTH;
219
220/* Unicode versions --------------------------------------------------------- */
221
222static const UVersionInfo
223unicodeVersions[]={
224 { 1, 0, 0, 0 },
225 { 1, 1, 0, 0 },
226 { 2, 0, 0, 0 },
227 { 3, 0, 0, 0 },
228 { 3, 1, 0, 0 },
229 { 3, 2, 0, 0 },
230 { 4, 0, 0, 0 },
231 { 4, 0, 1, 0 },
232 { 4, 1, 0, 0 }
233};
234
235int32_t ucdVersion=UNI_4_1;
236
237static int32_t
238findUnicodeVersion(const UVersionInfo version) {
239 int32_t i;
240
241 for(i=0; /* while(version>unicodeVersions[i]) {} */
242 i<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[i], 4)>0;
243 ++i) {}
244 if(0<i && i<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[i], 4)<0) {
245 --i; /* fix 4.0.2 to land before 4.1, for valid x>=ucdVersion comparisons */
246 }
247 return i; /* version>=unicodeVersions[i] && version<unicodeVersions[i+1]; possible: i==UNI_VER_COUNT */
248}
374ca955
A
249
250extern void
251setUnicodeVersion(const char *v) {
252 UVersionInfo version;
253 u_versionFromString(version, v);
254 uprv_memcpy(dataInfo.dataVersion, version, 4);
73c04bcf
A
255 ucdVersion=findUnicodeVersion(version);
256}
257
258/* -------------------------------------------------------------------------- */
259
260static void
261addUnfolding(UChar32 c, const UChar *s, int32_t length) {
262 int32_t i;
263
264 if(length>UGENCASE_UNFOLD_STRING_WIDTH) {
265 fprintf(stderr, "gencase error: case folding too long (length=%ld>%d=UGENCASE_UNFOLD_STRING_WIDTH)\n",
266 (long)length, UGENCASE_UNFOLD_STRING_WIDTH);
267 exit(U_INTERNAL_PROGRAM_ERROR);
268 }
269 if(unfoldTop>=LENGTHOF(unfold)) {
270 fprintf(stderr, "gencase error: too many multi-character case foldings\n");
271 exit(U_BUFFER_OVERFLOW_ERROR);
272 }
273 u_memset(unfold+unfoldTop, 0, UGENCASE_UNFOLD_WIDTH);
274 u_memcpy(unfold+unfoldTop, s, length);
275
276 i=unfoldTop+UGENCASE_UNFOLD_STRING_WIDTH;
277 U16_APPEND_UNSAFE(unfold, i, c);
278
279 ++unfoldRows;
280 unfoldTop+=UGENCASE_UNFOLD_WIDTH;
374ca955
A
281}
282
283/* store a character's properties ------------------------------------------- */
284
285extern void
286setProps(Props *p) {
287 UErrorCode errorCode;
288 uint32_t value, oldValue;
289 int32_t delta;
73c04bcf 290 UBool isCaseIgnorable;
374ca955
A
291
292 /* get the non-UnicodeData.txt properties */
293 value=oldValue=upvec_getValue(pv, p->code, 0);
294
295 /* default: map to self */
296 delta=0;
297
298 if(p->gc==U_TITLECASE_LETTER) {
299 /* the Titlecase property is read late, from UnicodeData.txt */
300 value|=UCASE_TITLE;
301 }
302
303 if(p->upperCase!=0) {
304 /* uppercase mapping as delta if the character is lowercase */
305 if((value&UCASE_TYPE_MASK)==UCASE_LOWER) {
306 delta=p->upperCase-p->code;
307 } else {
308 value|=UCASE_EXCEPTION;
309 }
310 }
311 if(p->lowerCase!=0) {
312 /* lowercase mapping as delta if the character is uppercase or titlecase */
313 if((value&UCASE_TYPE_MASK)>=UCASE_UPPER) {
314 delta=p->lowerCase-p->code;
315 } else {
316 value|=UCASE_EXCEPTION;
317 }
318 }
319 if(p->upperCase!=p->titleCase) {
320 value|=UCASE_EXCEPTION;
321 }
73c04bcf
A
322 if(p->closure[0]!=0) {
323 value|=UCASE_EXCEPTION;
324 }
374ca955
A
325 if(p->specialCasing!=NULL) {
326 value|=UCASE_EXCEPTION;
327 }
328 if(p->caseFolding!=NULL) {
329 value|=UCASE_EXCEPTION;
330 }
331
332 if(delta<UCASE_MIN_DELTA || UCASE_MAX_DELTA<delta) {
333 value|=UCASE_EXCEPTION;
334 }
335
336 if(p->cc!=0) {
337 if(value&UCASE_DOT_MASK) {
338 fprintf(stderr, "gencase: a soft-dotted character has cc!=0\n");
339 exit(U_INTERNAL_PROGRAM_ERROR);
340 }
341 if(p->cc==230) {
342 value|=UCASE_ABOVE;
343 } else {
344 value|=UCASE_OTHER_ACCENT;
345 }
346 }
347
348 /* encode case-ignorable as delta==1 on uncased characters */
73c04bcf
A
349 isCaseIgnorable=FALSE;
350 if((value&UCASE_TYPE_MASK)==UCASE_NONE) {
351 if(ucdVersion>=UNI_4_1) {
352 /* Unicode 4.1 and up: (D47a) Word_Break=MidLetter or Mn, Me, Cf, Lm, Sk */
353 if(
354 (U_MASK(p->gc)&(U_GC_MN_MASK|U_GC_ME_MASK|U_GC_CF_MASK|U_GC_LM_MASK|U_GC_SK_MASK))!=0 ||
355 ((upvec_getValue(pv, p->code, 1)>>UGENCASE_IS_MID_LETTER_SHIFT)&1)!=0
356 ) {
357 isCaseIgnorable=TRUE;
358 }
359 } else {
360 /* before Unicode 4.1: Mn, Me, Cf, Lm, Sk or 0027 or 00AD or 2019 */
361 if(
362 (U_MASK(p->gc)&(U_GC_MN_MASK|U_GC_ME_MASK|U_GC_CF_MASK|U_GC_LM_MASK|U_GC_SK_MASK))!=0 ||
363 p->code==0x27 || p->code==0xad || p->code==0x2019
364 ) {
365 isCaseIgnorable=TRUE;
366 }
367 }
368 }
369
370 if(isCaseIgnorable && p->code!=0x307) {
374ca955
A
371 /*
372 * We use one of the delta/exception bits, which works because we only
373 * store the case-ignorable flag for uncased characters.
374 * There is no delta for uncased characters (see checks above).
375 * If there is an exception for an uncased, case-ignorable character
376 * (although there should not be any case mappings if it's uncased)
377 * then we have a problem.
378 * There is one character which is case-ignorable but has an exception:
379 * U+0307 is uncased, Mn, has conditional special casing and
380 * is therefore handled in code instead.
381 */
382 if(value&UCASE_EXCEPTION) {
383 fprintf(stderr, "gencase error: unable to encode case-ignorable for U+%04lx with exceptions\n",
384 (unsigned long)p->code);
385 exit(U_INTERNAL_PROGRAM_ERROR);
386 }
387
388 delta=1;
389 }
390
391 /* handle exceptions */
392 if(value&UCASE_EXCEPTION) {
393 /* simply store exceptions for later processing and encoding */
394 value|=(uint32_t)exceptionsCount<<UGENCASE_EXC_SHIFT;
395 uprv_memcpy(excProps+exceptionsCount, p, sizeof(*p));
396 if(++exceptionsCount==MAX_EXC_COUNT) {
397 fprintf(stderr, "gencase: too many exceptions\n");
398 exit(U_INDEX_OUTOFBOUNDS_ERROR);
399 }
400 } else {
401 /* store the simple case mapping delta */
402 value|=((uint32_t)delta<<UCASE_DELTA_SHIFT)&UCASE_DELTA_MASK;
403 }
404
405 errorCode=U_ZERO_ERROR;
406 if( value!=oldValue &&
407 !upvec_setValue(pv, p->code, p->code+1, 0, value, 0xffffffff, &errorCode)
408 ) {
409 fprintf(stderr, "gencase error: unable to set case mapping values, code: %s\n",
410 u_errorName(errorCode));
411 exit(errorCode);
412 }
73c04bcf
A
413
414 /* add the multi-character case folding to the "unfold" data */
415 if(p->caseFolding!=NULL) {
416 int32_t length=p->caseFolding->full[0];
417 if(length>1 && u_strHasMoreChar32Than(p->caseFolding->full+1, length, 1)) {
418 addUnfolding(p->code, p->caseFolding->full+1, length);
419 }
420 }
374ca955
A
421}
422
423extern void
424addCaseSensitive(UChar32 first, UChar32 last) {
425 UErrorCode errorCode=U_ZERO_ERROR;
426 if(!upvec_setValue(pv, first, last+1, 0, UCASE_SENSITIVE, UCASE_SENSITIVE, &errorCode)) {
427 fprintf(stderr, "gencase error: unable to set UCASE_SENSITIVE, code: %s\n",
428 u_errorName(errorCode));
429 exit(errorCode);
430 }
431}
432
73c04bcf
A
433/* finalize reverse case folding ("unfold") data ---------------------------- */
434
435static int32_t U_CALLCONV
436compareUnfold(const void *context, const void *left, const void *right) {
437 return u_memcmp((const UChar *)left, (const UChar *)right, UGENCASE_UNFOLD_WIDTH);
438}
439
440static void
441makeUnfoldData() {
442 static const UChar
443 iDot[2]= { 0x69, 0x307 };
444
445 UChar *p, *q;
446 int32_t i, j, k;
447 UErrorCode errorCode;
448
449 /*
450 * add a case folding that we missed because it's conditional:
451 * 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
452 */
453 addUnfolding(0x130, iDot, 2);
454
455 /* sort the data */
456 errorCode=U_ZERO_ERROR;
457 uprv_sortArray(unfold+UGENCASE_UNFOLD_WIDTH, unfoldRows, UGENCASE_UNFOLD_WIDTH*2,
458 compareUnfold, NULL, FALSE, &errorCode);
459
460 /* make unique-string rows by merging adjacent ones' code point columns */
461
462 /* make p point to row i-1 */
463 p=(UChar *)unfold+UGENCASE_UNFOLD_WIDTH;
464
465 for(i=1; i<unfoldRows;) {
466 if(0==u_memcmp(p, p+UGENCASE_UNFOLD_WIDTH, UGENCASE_UNFOLD_STRING_WIDTH)) {
467 /* concatenate code point columns */
468 q=p+UGENCASE_UNFOLD_STRING_WIDTH;
469 for(j=1; j<UGENCASE_UNFOLD_CP_WIDTH && q[j]!=0; ++j) {}
470 for(k=0; k<UGENCASE_UNFOLD_CP_WIDTH && q[UGENCASE_UNFOLD_WIDTH+k]!=0; ++j, ++k) {
471 q[j]=q[UGENCASE_UNFOLD_WIDTH+k];
472 }
473 if(j>UGENCASE_UNFOLD_CP_WIDTH) {
474 fprintf(stderr, "gencase error: too many code points in unfold[]: %ld>%d=UGENCASE_UNFOLD_CP_WIDTH\n",
475 (long)j, UGENCASE_UNFOLD_CP_WIDTH);
476 exit(U_BUFFER_OVERFLOW_ERROR);
477 }
478
479 /* move following rows up one */
480 --unfoldRows;
481 unfoldTop-=UGENCASE_UNFOLD_WIDTH;
482 u_memmove(p+UGENCASE_UNFOLD_WIDTH, p+UGENCASE_UNFOLD_WIDTH*2, (unfoldRows-i)*UGENCASE_UNFOLD_WIDTH);
483 } else {
484 p+=UGENCASE_UNFOLD_WIDTH;
485 ++i;
486 }
487 }
488
489 unfold[UCASE_UNFOLD_ROWS]=(UChar)unfoldRows;
490
491 if(beVerbose) {
492 puts("unfold data:");
493
494 p=(UChar *)unfold;
495 for(i=0; i<unfoldRows; ++i) {
496 p+=UGENCASE_UNFOLD_WIDTH;
497 printf("[%2d] %04x %04x %04x <- %04x %04x\n",
498 (int)i, p[0], p[1], p[2], p[3], p[4]);
499 }
500 }
501}
502
503/* case closure ------------------------------------------------------------- */
504
505static void
506addClosureMapping(UChar32 src, UChar32 dest) {
507 uint32_t value;
508
509 if(beVerbose) {
510 printf("add closure mapping U+%04lx->U+%04lx\n",
511 (unsigned long)src, (unsigned long)dest);
512 }
513
514 value=upvec_getValue(pv, src, 0);
515 if(value&UCASE_EXCEPTION) {
516 Props *p=excProps+(value>>UGENCASE_EXC_SHIFT);
517 int32_t i;
518
519 /* append dest to src's closure array */
520 for(i=0;; ++i) {
521 if(i==LENGTHOF(p->closure)) {
522 fprintf(stderr, "closure[] overflow for U+%04lx->U+%04lx\n",
523 (unsigned long)src, (unsigned long)dest);
524 exit(U_BUFFER_OVERFLOW_ERROR);
525 } else if(p->closure[i]==dest) {
526 break; /* do not store duplicates */
527 } else if(p->closure[i]==0) {
528 p->closure[i]=dest;
529 break;
530 }
531 }
532 } else {
533 Props p2={ 0 };
534 UChar32 next;
535 UErrorCode errorCode;
536
537 /*
538 * decode value into p2 (enough for makeException() to work properly),
539 * add the closure mapping,
540 * and set the new exception for src
541 */
542 p2.code=src;
543 p2.closure[0]=dest;
544
545 if((value&UCASE_TYPE_MASK)>UCASE_NONE) {
546 /* one simple case mapping, don't care which one */
547 next=src+((int16_t)value>>UCASE_DELTA_SHIFT);
548 if(next!=src) {
549 if((value&UCASE_TYPE_MASK)==UCASE_LOWER) {
550 p2.upperCase=p2.titleCase=next;
551 } else {
552 p2.lowerCase=next;
553 }
554 }
555 } else if(value&UCASE_DELTA_MASK) {
556 fprintf(stderr, "gencase error: unable to add case closure exception to case-ignorable U+%04lx\n",
557 (unsigned long)src);
558 exit(U_INTERNAL_PROGRAM_ERROR);
559 }
560
561 value&=~(UGENCASE_EXC_MASK|UCASE_DELTA_MASK); /* remove previous simple mapping */
562 value|=(uint32_t)exceptionsCount<<UGENCASE_EXC_SHIFT;
563 value|=UCASE_EXCEPTION;
564 uprv_memcpy(excProps+exceptionsCount, &p2, sizeof(p2));
565 if(++exceptionsCount==MAX_EXC_COUNT) {
566 fprintf(stderr, "gencase: too many exceptions\n");
567 exit(U_INDEX_OUTOFBOUNDS_ERROR);
568 }
569
570 errorCode=U_ZERO_ERROR;
571 if(!upvec_setValue(pv, src, src+1, 0, value, 0xffffffff, &errorCode)) {
572 fprintf(stderr, "gencase error: unable to set case mapping values, code: %s\n",
573 u_errorName(errorCode));
574 exit(errorCode);
575 }
576 }
577}
578
579/*
580 * Find missing case mapping relationships and add mappings for case closure.
581 * This function starts from an "original" code point and recursively
582 * finds its case mappings and the case mappings of where it maps to.
583 *
584 * The recursion depth is capped at 3 nested calls of this function.
585 * In each call, the current code point is c, and the function enumerates
586 * all of c's simple (single-code point) case mappings.
587 * prev is the code point that case-mapped to c.
588 * prev2 is the code point that case-mapped to prev.
589 *
590 * The initial function call has prev2<0, prev<0, and c==orig
591 * (marking no code points).
592 * It enumerates c's case mappings and recurses without further action.
593 *
594 * The second-level function call has prev2<0, prev==orig, and c is
595 * the destination code point of one of prev's case mappings.
596 * The function checks if any of c's case mappings go back to orig
597 * and adds a closure mapping if not.
598 * In other words, it turns a case mapping relationship of
599 * orig->c
600 * into
601 * orig<->c
602 *
603 * The third-level function call has prev2==orig, prev>=0, and c is
604 * the destination code point of one of prev's case mappings.
605 * (And prev is the destination of one of prev2's case mappings.)
606 * The function checks if any of c's case mappings go back to orig
607 * and adds a closure mapping if not.
608 * In other words, it turns case mapping relationships of
609 * orig->prev->c or orig->prev<->c
610 * into
611 * orig->prev->c->orig or orig->prev<->c->orig
612 * etc.
613 * (Graphically, this closes a triangle.)
614 *
615 * With repeated application on all code points until no more closure mappings
616 * are added, all case equivalence groups get complete mappings.
617 * That is, in each group of code points with case relationships
618 * each code point will in the end have some mapping to each other
619 * code point in the group.
620 *
621 * @return TRUE if a closure mapping was added
622 */
623static UBool
624addClosure(UChar32 orig, UChar32 prev2, UChar32 prev, UChar32 c, uint32_t value) {
625 UChar32 next;
626 UBool someMappingsAdded=FALSE;
627
628 if(c!=orig) {
629 /* get the properties for c */
630 value=upvec_getValue(pv, c, 0);
631 }
632 /* else if c==orig then c's value was passed in */
633
634 if(value&UCASE_EXCEPTION) {
635 UChar32 set[32];
636 int32_t i, count=0;
637
638 Props *p=excProps+(value>>UGENCASE_EXC_SHIFT);
639
640 /*
641 * marker for whether any of c's mappings goes to orig
642 * c==orig: prevent adding a closure mapping when getting orig's own, direct mappings
643 */
644 UBool mapsToOrig=(UBool)(c==orig);
645
646 /* collect c's case mapping destinations in set[] */
647 if((next=p->upperCase)!=0 && next!=c) {
648 set[count++]=next;
649 }
650 if((next=p->lowerCase)!=0 && next!=c) {
651 set[count++]=next;
652 }
653 if(p->upperCase!=(next=p->titleCase) && next!=c) {
654 set[count++]=next;
655 }
656 if(p->caseFolding!=NULL && (next=p->caseFolding->simple)!=0 && next!=c) {
657 set[count++]=next;
658 }
659
660 /* append c's current closure mappings to set[] */
661 for(i=0; i<LENGTHOF(p->closure) && (next=p->closure[i])!=0; ++i) {
662 set[count++]=next;
663 }
664
665 /* process all code points to which c case-maps */
666 for(i=0; i<count; ++i) {
667 next=set[i]; /* next!=c */
668
669 if(next==orig) {
670 mapsToOrig=TRUE; /* remember that we map to orig */
671 } else if(prev2<0 && next!=prev) {
672 /*
673 * recurse unless
674 * we have reached maximum depth (prev2>=0) or
675 * this is a mapping to one of the previous code points (orig, prev, c)
676 */
677 someMappingsAdded|=addClosure(orig, prev, c, next, 0);
678 }
679 }
680
681 if(!mapsToOrig) {
682 addClosureMapping(c, orig);
683 return TRUE;
684 }
685 } else {
686 if((value&UCASE_TYPE_MASK)>UCASE_NONE) {
687 /* one simple case mapping, don't care which one */
688 next=c+((int16_t)value>>UCASE_DELTA_SHIFT);
689 if(next!=c) {
690 /*
691 * recurse unless
692 * we have reached maximum depth (prev2>=0) or
693 * this is a mapping to one of the previous code points (orig, prev, c)
694 */
695 if(prev2<0 && next!=orig && next!=prev) {
696 someMappingsAdded|=addClosure(orig, prev, c, next, 0);
697 }
698
699 if(c!=orig && next!=orig) {
700 /* c does not map to orig, add a closure mapping c->orig */
701 addClosureMapping(c, orig);
702 return TRUE;
703 }
704 }
705 }
706 }
707
708 return someMappingsAdded;
709}
710
374ca955
A
711extern void
712makeCaseClosure() {
73c04bcf
A
713 UChar *p;
714 uint32_t *row;
715 uint32_t value;
716 UChar32 start, limit, c, c2;
717 int32_t i, j;
718 UBool someMappingsAdded;
719
720 /*
721 * finalize the "unfold" data because we need to use it to add closure mappings
722 * for situations like FB05->"st"<-FB06
723 * where we would otherwise miss the FB05<->FB06 relationship
724 */
725 makeUnfoldData();
726
727 /* use the "unfold" data to add mappings */
728
729 /* p always points to the code points; this loop ignores the strings completely */
730 p=unfold+UGENCASE_UNFOLD_WIDTH+UGENCASE_UNFOLD_STRING_WIDTH;
731
732 for(i=0; i<unfoldRows; p+=UGENCASE_UNFOLD_WIDTH, ++i) {
733 j=0;
734 U16_NEXT_UNSAFE(p, j, c);
735 while(j<UGENCASE_UNFOLD_CP_WIDTH && p[j]!=0) {
736 U16_NEXT_UNSAFE(p, j, c2);
737 addClosure(c, U_SENTINEL, c, c2, 0);
738 }
739 }
740
741 if(beVerbose) {
742 puts("---- ---- ---- ---- (done with closures from unfolding)");
743 }
744
745 /* add further closure mappings from analyzing simple mappings */
746 do {
747 someMappingsAdded=FALSE;
748
749 i=0;
750 while((row=upvec_getRow(pv, i, &start, &limit))!=NULL) {
751 value=*row;
752 if(value!=0) {
753 while(start<limit) {
754 if(addClosure(start, U_SENTINEL, U_SENTINEL, start, value)) {
755 someMappingsAdded=TRUE;
756
757 /*
758 * stop this loop because pv was changed and row is not valid any more
759 * skip all rows below the current start
760 */
761 while((row=upvec_getRow(pv, i, NULL, &limit))!=NULL && start>=limit) {
762 ++i;
763 }
764 row=NULL; /* signal to continue with outer loop, without further ++i */
765 break;
766 }
767 ++start;
768 }
769 if(row==NULL) {
770 continue; /* see row=NULL above */
771 }
772 }
773 ++i;
774 }
775
776 if(beVerbose && someMappingsAdded) {
777 puts("---- ---- ---- ----");
778 }
779 } while(someMappingsAdded);
374ca955
A
780}
781
782/* exceptions --------------------------------------------------------------- */
783
73c04bcf
A
784/* get the string length from zero-terminated code points in a limited-length array */
785static int32_t
786getLengthOfCodePoints(const UChar32 *s, int32_t maxLength) {
787 int32_t i, length;
788
789 for(i=length=0; i<maxLength && s[i]!=0; ++i) {
790 length+=U16_LENGTH(s[i]);
791 }
792 return length;
793}
794
374ca955
A
795static UBool
796fullMappingEqualsSimple(const UChar *s, UChar32 simple, UChar32 c) {
797 int32_t i, length;
798 UChar32 full;
799
800 length=*s++;
801 if(length==0 || length>U16_MAX_LENGTH) {
802 return FALSE;
803 }
804 i=0;
805 U16_NEXT(s, i, length, full);
806
807 if(simple==0) {
808 simple=c; /* UCD has no simple mapping if it's the same as the code point itself */
809 }
810 return (UBool)(i==length && full==simple);
811}
812
813static uint16_t
814makeException(uint32_t value, Props *p) {
815 uint32_t slots[8];
816 uint32_t slotBits;
817 uint16_t excWord, excIndex, excTop, i, count, length, fullLengths;
818 UBool doubleSlots;
819
820 /* excIndex will be returned for storing in the trie word */
821 excIndex=exceptionsTop;
822 if(excIndex>=UCASE_MAX_EXCEPTIONS) {
823 fprintf(stderr, "gencase error: too many exceptions words\n");
824 exit(U_BUFFER_OVERFLOW_ERROR);
825 }
826
827 excTop=excIndex+1; /* +1 for excWord which will be stored at excIndex */
828
829 /* copy and shift the soft-dotted bits */
830 excWord=((uint16_t)value&UCASE_DOT_MASK)<<UCASE_EXC_DOT_SHIFT;
831
832 /* update maxFullLength */
833 if(p->specialCasing!=NULL) {
834 length=p->specialCasing->lowerCase[0];
835 if(length>maxFullLength) {
836 maxFullLength=length;
837 }
838 length=p->specialCasing->upperCase[0];
839 if(length>maxFullLength) {
840 maxFullLength=length;
841 }
842 length=p->specialCasing->titleCase[0];
843 if(length>maxFullLength) {
844 maxFullLength=length;
845 }
846 }
847 if(p->caseFolding!=NULL) {
848 length=p->caseFolding->full[0];
849 if(length>maxFullLength) {
850 maxFullLength=length;
851 }
852 }
853
854 /* set the bits for conditional mappings */
855 if(p->specialCasing!=NULL && p->specialCasing->isComplex) {
856 excWord|=UCASE_EXC_CONDITIONAL_SPECIAL;
857 p->specialCasing=NULL;
858 }
859 if(p->caseFolding!=NULL && p->caseFolding->simple==0 && p->caseFolding->full[0]==0) {
860 excWord|=UCASE_EXC_CONDITIONAL_FOLD;
861 p->caseFolding=NULL;
862 }
863
864 /*
865 * Note:
866 * UCD stores no simple mappings when they are the same as the code point itself.
867 * SpecialCasing and CaseFolding do store simple mappings even if they are
868 * the same as the code point itself.
869 * Comparisons between simple regular mappings and simple special/folding
870 * mappings need to compensate for the difference by comparing with the
871 * original code point if a simple UCD mapping is missing (0).
872 */
873
874 /* remove redundant data */
875 if(p->specialCasing!=NULL) {
876 /* do not store full mappings if they are the same as the simple ones */
877 if(fullMappingEqualsSimple(p->specialCasing->lowerCase, p->lowerCase, p->code)) {
878 p->specialCasing->lowerCase[0]=0;
879 }
880 if(fullMappingEqualsSimple(p->specialCasing->upperCase, p->upperCase, p->code)) {
881 p->specialCasing->upperCase[0]=0;
882 }
883 if(fullMappingEqualsSimple(p->specialCasing->titleCase, p->titleCase, p->code)) {
884 p->specialCasing->titleCase[0]=0;
885 }
886 }
887 if( p->caseFolding!=NULL &&
888 fullMappingEqualsSimple(p->caseFolding->full, p->caseFolding->simple, p->code)
889 ) {
890 p->caseFolding->full[0]=0;
891 }
892
893 /* write the optional slots */
894 slotBits=0;
895 count=0;
896
897 if(p->lowerCase!=0) {
898 slots[count]=(uint32_t)p->lowerCase;
899 slotBits|=slots[count];
900 ++count;
901 excWord|=U_MASK(UCASE_EXC_LOWER);
902 }
903 if( p->caseFolding!=NULL &&
904 p->caseFolding->simple!=0 &&
905 (p->lowerCase!=0 ?
906 p->caseFolding->simple!=p->lowerCase :
907 p->caseFolding->simple!=p->code)
908 ) {
909 slots[count]=(uint32_t)p->caseFolding->simple;
910 slotBits|=slots[count];
911 ++count;
912 excWord|=U_MASK(UCASE_EXC_FOLD);
913 }
914 if(p->upperCase!=0) {
915 slots[count]=(uint32_t)p->upperCase;
916 slotBits|=slots[count];
917 ++count;
918 excWord|=U_MASK(UCASE_EXC_UPPER);
919 }
920 if(p->upperCase!=p->titleCase) {
921 if(p->titleCase!=0) {
922 slots[count]=(uint32_t)p->titleCase;
923 } else {
924 slots[count]=(uint32_t)p->code;
925 }
926 slotBits|=slots[count];
927 ++count;
928 excWord|=U_MASK(UCASE_EXC_TITLE);
929 }
930
73c04bcf
A
931 /* length of case closure */
932 if(p->closure[0]!=0) {
933 length=getLengthOfCodePoints(p->closure, LENGTHOF(p->closure));
934 slots[count]=(uint32_t)length; /* must be 1..UCASE_CLOSURE_MAX_LENGTH */
935 slotBits|=slots[count];
936 ++count;
937 excWord|=U_MASK(UCASE_EXC_CLOSURE);
938 }
939
374ca955
A
940 /* lengths of full case mapping strings, stored in the last slot */
941 fullLengths=0;
942 if(p->specialCasing!=NULL) {
943 fullLengths=p->specialCasing->lowerCase[0];
944 fullLengths|=p->specialCasing->upperCase[0]<<8;
945 fullLengths|=p->specialCasing->titleCase[0]<<12;
946 }
947 if(p->caseFolding!=NULL) {
948 fullLengths|=p->caseFolding->full[0]<<4;
949 }
950 if(fullLengths!=0) {
951 slots[count]=fullLengths;
952 slotBits|=slots[count];
953 ++count;
954 excWord|=U_MASK(UCASE_EXC_FULL_MAPPINGS);
955 }
956
957 /* write slots */
958 doubleSlots=(UBool)(slotBits>0xffff);
959 if(!doubleSlots) {
960 for(i=0; i<count; ++i) {
961 exceptions[excTop++]=(uint16_t)slots[i];
962 }
963 } else {
964 excWord|=UCASE_EXC_DOUBLE_SLOTS;
965 for(i=0; i<count; ++i) {
966 exceptions[excTop++]=(uint16_t)(slots[i]>>16);
967 exceptions[excTop++]=(uint16_t)slots[i];
968 }
969 }
970
971 /* write the full case mapping strings */
972 if(p->specialCasing!=NULL) {
973 length=(uint16_t)p->specialCasing->lowerCase[0];
974 u_memcpy((UChar *)exceptions+excTop, p->specialCasing->lowerCase+1, length);
975 excTop+=length;
976 }
977 if(p->caseFolding!=NULL) {
978 length=(uint16_t)p->caseFolding->full[0];
979 u_memcpy((UChar *)exceptions+excTop, p->caseFolding->full+1, length);
980 excTop+=length;
981 }
982 if(p->specialCasing!=NULL) {
983 length=(uint16_t)p->specialCasing->upperCase[0];
984 u_memcpy((UChar *)exceptions+excTop, p->specialCasing->upperCase+1, length);
985 excTop+=length;
986
987 length=(uint16_t)p->specialCasing->titleCase[0];
988 u_memcpy((UChar *)exceptions+excTop, p->specialCasing->titleCase+1, length);
989 excTop+=length;
990 }
991
73c04bcf
A
992 /* write the closure data */
993 if(p->closure[0]!=0) {
994 UChar32 c;
995
996 for(i=0; i<LENGTHOF(p->closure) && (c=p->closure[i])!=0; ++i) {
997 U16_APPEND_UNSAFE((UChar *)exceptions, excTop, c);
998 }
999 }
1000
374ca955
A
1001 exceptionsTop=excTop;
1002
1003 /* write the main exceptions word */
1004 exceptions[excIndex]=excWord;
1005
1006 return excIndex;
1007}
1008
1009extern void
1010makeExceptions() {
1011 uint32_t *row;
1012 uint32_t value;
1013 int32_t i;
1014 uint16_t excIndex;
1015
1016 i=0;
1017 while((row=upvec_getRow(pv, i, NULL, NULL))!=NULL) {
1018 value=*row;
1019 if(value&UCASE_EXCEPTION) {
1020 excIndex=makeException(value, excProps+(value>>UGENCASE_EXC_SHIFT));
1021 *row=(value&~(UGENCASE_EXC_MASK|UCASE_EXC_MASK))|(excIndex<<UCASE_EXC_SHIFT);
1022 }
1023 ++i;
1024 }
1025}
1026
1027/* generate output data ----------------------------------------------------- */
1028
1029extern void
73c04bcf 1030generateData(const char *dataDir, UBool csource) {
374ca955
A
1031 static int32_t indexes[UCASE_IX_TOP]={
1032 UCASE_IX_TOP
1033 };
1034 static uint8_t trieBlock[40000];
1035
1036 const uint32_t *row;
1037 UChar32 start, limit;
1038 int32_t i;
1039
1040 UNewDataMemory *pData;
1041 UNewTrie *pTrie;
1042 UErrorCode errorCode=U_ZERO_ERROR;
1043 int32_t trieSize;
1044 long dataLength;
1045
1046 pTrie=utrie_open(NULL, NULL, 20000, 0, 0, TRUE);
1047 if(pTrie==NULL) {
1048 fprintf(stderr, "gencase error: unable to create a UNewTrie\n");
1049 exit(U_MEMORY_ALLOCATION_ERROR);
1050 }
1051
1052 for(i=0; (row=upvec_getRow(pv, i, &start, &limit))!=NULL; ++i) {
1053 if(!utrie_setRange32(pTrie, start, limit, *row, TRUE)) {
1054 fprintf(stderr, "gencase error: unable to set trie value (overflow)\n");
1055 exit(U_BUFFER_OVERFLOW_ERROR);
1056 }
1057 }
1058
1059 trieSize=utrie_serialize(pTrie, trieBlock, sizeof(trieBlock), NULL, TRUE, &errorCode);
1060 if(U_FAILURE(errorCode)) {
1061 fprintf(stderr, "error: utrie_serialize failed: %s (length %ld)\n", u_errorName(errorCode), (long)trieSize);
1062 exit(errorCode);
1063 }
1064
1065 indexes[UCASE_IX_EXC_LENGTH]=exceptionsTop;
1066 indexes[UCASE_IX_TRIE_SIZE]=trieSize;
73c04bcf
A
1067 indexes[UCASE_IX_UNFOLD_LENGTH]=unfoldTop;
1068 indexes[UCASE_IX_LENGTH]=(int32_t)sizeof(indexes)+trieSize+2*exceptionsTop+2*unfoldTop;
374ca955
A
1069
1070 indexes[UCASE_IX_MAX_FULL_LENGTH]=maxFullLength;
1071
1072 if(beVerbose) {
1073 printf("trie size in bytes: %5d\n", (int)trieSize);
1074 printf("number of code points with exceptions: %5d\n", exceptionsCount);
1075 printf("size in bytes of exceptions: %5d\n", 2*exceptionsTop);
73c04bcf 1076 printf("size in bytes of reverse foldings: %5d\n", 2*unfoldTop);
374ca955
A
1077 printf("data size: %5d\n", (int)indexes[UCASE_IX_LENGTH]);
1078 }
1079
73c04bcf
A
1080 if(csource) {
1081 /* write .c file for hardcoded data */
1082 UTrie trie={ NULL };
1083 FILE *f;
1084
1085 utrie_unserialize(&trie, trieBlock, trieSize, &errorCode);
1086 if(U_FAILURE(errorCode)) {
1087 fprintf(
1088 stderr,
1089 "gencase error: failed to utrie_unserialize(ucase.icu trie) - %s\n",
1090 u_errorName(errorCode));
1091 return;
1092 }
374ca955 1093
73c04bcf
A
1094 f=usrc_create(dataDir, "ucase_props_data.c");
1095 if(f!=NULL) {
1096 usrc_writeArray(f,
1097 "static const UVersionInfo ucase_props_dataVersion={",
1098 dataInfo.dataVersion, 8, 4,
1099 "};\n\n");
1100 usrc_writeArray(f,
1101 "static const int32_t ucase_props_indexes[UCASE_IX_TOP]={",
1102 indexes, 32, UCASE_IX_TOP,
1103 "};\n\n");
1104 usrc_writeUTrieArrays(f,
1105 "static const uint16_t ucase_props_trieIndex[%ld]={\n", NULL,
1106 &trie,
1107 "\n};\n\n");
1108 usrc_writeArray(f,
1109 "static const uint16_t ucase_props_exceptions[%ld]={\n",
1110 exceptions, 16, exceptionsTop,
1111 "\n};\n\n");
1112 usrc_writeArray(f,
1113 "static const uint16_t ucase_props_unfold[%ld]={\n",
1114 unfold, 16, unfoldTop,
1115 "\n};\n\n");
1116 fputs(
1117 "static const UCaseProps ucase_props_singleton={\n"
1118 " NULL,\n"
1119 " ucase_props_indexes,\n"
1120 " ucase_props_exceptions,\n"
1121 " ucase_props_unfold,\n",
1122 f);
1123 usrc_writeUTrieStruct(f,
1124 " {\n",
1125 &trie, "ucase_props_trieIndex", NULL, NULL,
1126 " },\n");
1127 usrc_writeArray(f, " { ", dataInfo.formatVersion, 8, 4, " }\n");
1128 fputs("};\n", f);
1129 fclose(f);
1130 }
1131 } else {
1132 /* write the data */
1133 pData=udata_create(dataDir, UCASE_DATA_TYPE, UCASE_DATA_NAME, &dataInfo,
1134 haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
1135 if(U_FAILURE(errorCode)) {
1136 fprintf(stderr, "gencase: unable to create data memory, %s\n", u_errorName(errorCode));
1137 exit(errorCode);
1138 }
374ca955 1139
73c04bcf
A
1140 udata_writeBlock(pData, indexes, sizeof(indexes));
1141 udata_writeBlock(pData, trieBlock, trieSize);
1142 udata_writeBlock(pData, exceptions, 2*exceptionsTop);
1143 udata_writeBlock(pData, unfold, 2*unfoldTop);
374ca955 1144
73c04bcf
A
1145 /* finish up */
1146 dataLength=udata_finish(pData, &errorCode);
1147 if(U_FAILURE(errorCode)) {
1148 fprintf(stderr, "gencase: error %d writing the output file\n", errorCode);
1149 exit(errorCode);
1150 }
1151
1152 if(dataLength!=indexes[UCASE_IX_LENGTH]) {
1153 fprintf(stderr, "gencase: data length %ld != calculated size %d\n",
1154 dataLength, (int)indexes[UCASE_IX_LENGTH]);
1155 exit(U_INTERNAL_PROGRAM_ERROR);
1156 }
374ca955
A
1157 }
1158
1159 utrie_close(pTrie);
1160}
1161
1162/*
1163 * Hey, Emacs, please set the following:
1164 *
1165 * Local Variables:
1166 * indent-tabs-mode: nil
1167 * End:
1168 *
1169 */