]>
Commit | Line | Data |
---|---|---|
374ca955 A |
1 | /* |
2 | ******************************************************************************* | |
3 | * | |
73c04bcf | 4 | * Copyright (C) 2004-2005, International Business Machines |
374ca955 A |
5 | * Corporation and others. All Rights Reserved. |
6 | * | |
7 | ******************************************************************************* | |
8 | * file name: store.c | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 2004aug28 | |
14 | * created by: Markus W. Scherer | |
15 | * | |
16 | * Store Unicode case mapping properties efficiently for | |
17 | * random access. | |
18 | */ | |
19 | ||
20 | #include <stdio.h> | |
21 | #include <stdlib.h> | |
22 | #include "unicode/utypes.h" | |
23 | #include "unicode/uchar.h" | |
24 | #include "unicode/ustring.h" | |
25 | #include "cmemory.h" | |
26 | #include "cstring.h" | |
27 | #include "filestrm.h" | |
28 | #include "utrie.h" | |
73c04bcf | 29 | #include "uarrsort.h" |
374ca955 A |
30 | #include "unicode/udata.h" |
31 | #include "unewdata.h" | |
32 | #include "propsvec.h" | |
73c04bcf | 33 | #include "writesrc.h" |
374ca955 A |
34 | #include "gencase.h" |
35 | ||
73c04bcf A |
36 | #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0])) |
37 | ||
374ca955 A |
38 | /* Unicode case mapping properties file format --------------------------------- |
39 | ||
40 | The file format prepared and written here contains several data | |
41 | structures that store indexes or data. | |
42 | ||
43 | Before the data contents described below, there are the headers required by | |
44 | the udata API for loading ICU data. Especially, a UDataInfo structure | |
45 | precedes the actual data. It contains platform properties values and the | |
46 | file format version. | |
47 | ||
73c04bcf A |
48 | The following is a description of format version 1.1 . |
49 | ||
50 | Format version 1.1 adds data for case closure. | |
374ca955 A |
51 | |
52 | The file contains the following structures: | |
53 | ||
54 | const int32_t indexes[i0] with values i0, i1, ...: | |
55 | (see UCASE_IX_... constants for names of indexes) | |
56 | ||
57 | i0 indexLength; -- length of indexes[] (UCASE_IX_TOP) | |
58 | i1 dataLength; -- length in bytes of the post-header data (incl. indexes[]) | |
59 | i2 trieSize; -- size in bytes of the case mapping properties trie | |
60 | i3 exceptionsLength; -- length in uint16_t of the exceptions array | |
73c04bcf | 61 | i4 unfoldLength; -- length in uint16_t of the reverse-folding array (new in format version 1.1) |
374ca955 | 62 | |
73c04bcf | 63 | i5..i14 reservedIndexes; -- reserved values; 0 for now |
374ca955 A |
64 | |
65 | i15 maxFullLength; -- maximum length of a full case mapping/folding string | |
66 | ||
67 | ||
73c04bcf | 68 | Serialized trie, see utrie.h; |
374ca955 A |
69 | |
70 | const uint16_t exceptions[exceptionsLength]; | |
71 | ||
73c04bcf A |
72 | const UChar unfold[unfoldLength]; |
73 | ||
374ca955 A |
74 | |
75 | Trie data word: | |
76 | Bits | |
77 | if(exception) { | |
78 | 15..4 unsigned exception index | |
79 | } else { | |
80 | if(not uncased) { | |
81 | 15..6 signed delta to simple case mapping code point | |
82 | (add delta to input code point) | |
83 | } else { | |
84 | 6 the code point is case-ignorable | |
85 | (U+0307 is also case-ignorable but has an exception) | |
86 | } | |
87 | 5..4 0 normal character with cc=0 | |
88 | 1 soft-dotted character | |
89 | 2 cc=230 | |
90 | 3 other cc | |
91 | } | |
92 | 3 exception | |
93 | 2 case sensitive | |
94 | 1..0 0 uncased | |
95 | 1 lowercase | |
96 | 2 uppercase | |
97 | 3 titlecase | |
98 | ||
99 | ||
100 | Exceptions: | |
101 | A sub-array of the exceptions array is indexed by the exception index in a | |
102 | trie word. | |
103 | The sub-array consists of the following fields: | |
104 | uint16_t excWord; | |
105 | uint16_t optional values []; | |
106 | UTF-16 strings for full (string) mappings for lowercase, case folding, uppercase, titlecase | |
107 | ||
108 | excWord: (see UCASE_EXC_...) | |
109 | Bits | |
110 | 15 conditional case folding | |
111 | 14 conditional special casing | |
112 | 13..12 same as non-exception trie data bits 5..4 | |
113 | moved here because the exception index needs more bits than the delta | |
114 | 0 normal character with cc=0 | |
115 | 1 soft-dotted character | |
116 | 2 cc=230 | |
117 | 3 other cc | |
118 | 11.. 9 reserved | |
119 | 8 if set, then for each optional-value slot there are 2 uint16_t values | |
120 | (high and low parts of 32-bit values) | |
121 | instead of single ones | |
122 | 7.. 0 bits for which optional value is present | |
123 | ||
124 | Optional-value slots: | |
125 | 0 lowercase mapping (code point) | |
126 | 1 case folding (code point) | |
127 | 2 uppercase mapping (code point) | |
128 | 3 titlecase mapping (code point) | |
73c04bcf A |
129 | 4 reserved |
130 | 5 reserved | |
131 | 6 closure mappings (new in format version 1.1) | |
374ca955 A |
132 | 7 there is at least one full (string) case mapping |
133 | the length of each is encoded in a nibble of this optional value, | |
134 | and the strings follow this optional value in the same order: | |
135 | lower/fold/upper/title | |
136 | ||
73c04bcf A |
137 | The optional closure mappings value is used as follows: |
138 | Bits 0..3 contain the length of a string of code points for case closure. | |
139 | The string immediately follows the full case mappings, or the closure value | |
140 | slot if there are no full case mappings. | |
141 | Bits 4..15 are reserved and could be used in the future to indicate the | |
142 | number of strings for case closure. | |
143 | Complete case closure for a code point is given by the union of all simple | |
144 | and full case mappings and foldings, plus the case closure code points | |
145 | (and potentially, in the future, case closure strings). | |
146 | ||
374ca955 A |
147 | For space saving, some values are not stored. Lookups are as follows: |
148 | - If special casing is conditional, then no full lower/upper/title mapping | |
149 | strings are stored. | |
150 | - If case folding is conditional, then no simple or full case foldings are | |
151 | stored. | |
152 | - Fall back in this order: | |
153 | full (string) mapping -- if full mappings are used | |
154 | simple (code point) mapping of the same type | |
155 | simple fold->simple lower | |
156 | simple title->simple upper | |
157 | finally, the original code point (no mapping) | |
158 | ||
73c04bcf A |
159 | This fallback order is strict: |
160 | In particular, the fallback from full case folding is to simple case folding, | |
161 | not to full lowercase mapping. | |
162 | ||
163 | Reverse case folding data ("unfold") array: (new in format version 1.1) | |
164 | ||
165 | This array stores some miscellaneous values followed by a table. The data maps | |
166 | back from multi-character strings to their original code points, for use | |
167 | in case closure. | |
168 | ||
169 | The table contains two columns of strings. | |
170 | The string in the first column is the case folding of each of the code points | |
171 | in the second column. The strings are terminated with NUL or by the end of the | |
172 | column, whichever comes first. | |
173 | ||
174 | The miscellaneous data takes up one pseudo-row and includes: | |
175 | - number of rows | |
176 | - number of UChars per row | |
177 | - number of UChars in the left (folding string) column | |
178 | ||
179 | The table is sorted by its first column. Values in the first column are unique. | |
180 | ||
374ca955 A |
181 | ----------------------------------------------------------------------------- */ |
182 | ||
183 | /* UDataInfo cf. udata.h */ | |
184 | static UDataInfo dataInfo={ | |
185 | sizeof(UDataInfo), | |
186 | 0, | |
187 | ||
188 | U_IS_BIG_ENDIAN, | |
189 | U_CHARSET_FAMILY, | |
190 | U_SIZEOF_UCHAR, | |
191 | 0, | |
192 | ||
193 | /* dataFormat="cAsE" */ | |
194 | { UCASE_FMT_0, UCASE_FMT_1, UCASE_FMT_2, UCASE_FMT_3 }, | |
73c04bcf | 195 | { 1, 1, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */ |
374ca955 A |
196 | { 4, 0, 1, 0 } /* dataVersion */ |
197 | }; | |
198 | ||
199 | enum { | |
200 | /* maximum number of exceptions expected */ | |
201 | MAX_EXC_COUNT=1000 | |
202 | }; | |
203 | ||
204 | /* exceptions values */ | |
205 | static uint16_t exceptions[UCASE_MAX_EXCEPTIONS+100]; | |
206 | static uint16_t exceptionsTop=0; | |
207 | static Props excProps[MAX_EXC_COUNT]; | |
208 | static uint16_t exceptionsCount=0; | |
209 | ||
210 | /* becomes indexes[UCASE_IX_MAX_FULL_LENGTH] */ | |
211 | static int32_t maxFullLength=U16_MAX_LENGTH; | |
212 | ||
73c04bcf A |
213 | /* reverse case folding ("unfold") data */ |
214 | static UChar unfold[UGENCASE_UNFOLD_MAX_ROWS*UGENCASE_UNFOLD_WIDTH]={ | |
215 | 0, UGENCASE_UNFOLD_WIDTH, UGENCASE_UNFOLD_STRING_WIDTH, 0, 0 | |
216 | }; | |
217 | static uint16_t unfoldRows=0; | |
218 | static uint16_t unfoldTop=UGENCASE_UNFOLD_WIDTH; | |
219 | ||
220 | /* Unicode versions --------------------------------------------------------- */ | |
221 | ||
222 | static const UVersionInfo | |
223 | unicodeVersions[]={ | |
224 | { 1, 0, 0, 0 }, | |
225 | { 1, 1, 0, 0 }, | |
226 | { 2, 0, 0, 0 }, | |
227 | { 3, 0, 0, 0 }, | |
228 | { 3, 1, 0, 0 }, | |
229 | { 3, 2, 0, 0 }, | |
230 | { 4, 0, 0, 0 }, | |
231 | { 4, 0, 1, 0 }, | |
232 | { 4, 1, 0, 0 } | |
233 | }; | |
234 | ||
235 | int32_t ucdVersion=UNI_4_1; | |
236 | ||
237 | static int32_t | |
238 | findUnicodeVersion(const UVersionInfo version) { | |
239 | int32_t i; | |
240 | ||
241 | for(i=0; /* while(version>unicodeVersions[i]) {} */ | |
242 | i<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[i], 4)>0; | |
243 | ++i) {} | |
244 | if(0<i && i<UNI_VER_COUNT && uprv_memcmp(version, unicodeVersions[i], 4)<0) { | |
245 | --i; /* fix 4.0.2 to land before 4.1, for valid x>=ucdVersion comparisons */ | |
246 | } | |
247 | return i; /* version>=unicodeVersions[i] && version<unicodeVersions[i+1]; possible: i==UNI_VER_COUNT */ | |
248 | } | |
374ca955 A |
249 | |
250 | extern void | |
251 | setUnicodeVersion(const char *v) { | |
252 | UVersionInfo version; | |
253 | u_versionFromString(version, v); | |
254 | uprv_memcpy(dataInfo.dataVersion, version, 4); | |
73c04bcf A |
255 | ucdVersion=findUnicodeVersion(version); |
256 | } | |
257 | ||
258 | /* -------------------------------------------------------------------------- */ | |
259 | ||
260 | static void | |
261 | addUnfolding(UChar32 c, const UChar *s, int32_t length) { | |
262 | int32_t i; | |
263 | ||
264 | if(length>UGENCASE_UNFOLD_STRING_WIDTH) { | |
265 | fprintf(stderr, "gencase error: case folding too long (length=%ld>%d=UGENCASE_UNFOLD_STRING_WIDTH)\n", | |
266 | (long)length, UGENCASE_UNFOLD_STRING_WIDTH); | |
267 | exit(U_INTERNAL_PROGRAM_ERROR); | |
268 | } | |
269 | if(unfoldTop>=LENGTHOF(unfold)) { | |
270 | fprintf(stderr, "gencase error: too many multi-character case foldings\n"); | |
271 | exit(U_BUFFER_OVERFLOW_ERROR); | |
272 | } | |
273 | u_memset(unfold+unfoldTop, 0, UGENCASE_UNFOLD_WIDTH); | |
274 | u_memcpy(unfold+unfoldTop, s, length); | |
275 | ||
276 | i=unfoldTop+UGENCASE_UNFOLD_STRING_WIDTH; | |
277 | U16_APPEND_UNSAFE(unfold, i, c); | |
278 | ||
279 | ++unfoldRows; | |
280 | unfoldTop+=UGENCASE_UNFOLD_WIDTH; | |
374ca955 A |
281 | } |
282 | ||
283 | /* store a character's properties ------------------------------------------- */ | |
284 | ||
285 | extern void | |
286 | setProps(Props *p) { | |
287 | UErrorCode errorCode; | |
288 | uint32_t value, oldValue; | |
289 | int32_t delta; | |
73c04bcf | 290 | UBool isCaseIgnorable; |
374ca955 A |
291 | |
292 | /* get the non-UnicodeData.txt properties */ | |
293 | value=oldValue=upvec_getValue(pv, p->code, 0); | |
294 | ||
295 | /* default: map to self */ | |
296 | delta=0; | |
297 | ||
298 | if(p->gc==U_TITLECASE_LETTER) { | |
299 | /* the Titlecase property is read late, from UnicodeData.txt */ | |
300 | value|=UCASE_TITLE; | |
301 | } | |
302 | ||
303 | if(p->upperCase!=0) { | |
304 | /* uppercase mapping as delta if the character is lowercase */ | |
305 | if((value&UCASE_TYPE_MASK)==UCASE_LOWER) { | |
306 | delta=p->upperCase-p->code; | |
307 | } else { | |
308 | value|=UCASE_EXCEPTION; | |
309 | } | |
310 | } | |
311 | if(p->lowerCase!=0) { | |
312 | /* lowercase mapping as delta if the character is uppercase or titlecase */ | |
313 | if((value&UCASE_TYPE_MASK)>=UCASE_UPPER) { | |
314 | delta=p->lowerCase-p->code; | |
315 | } else { | |
316 | value|=UCASE_EXCEPTION; | |
317 | } | |
318 | } | |
319 | if(p->upperCase!=p->titleCase) { | |
320 | value|=UCASE_EXCEPTION; | |
321 | } | |
73c04bcf A |
322 | if(p->closure[0]!=0) { |
323 | value|=UCASE_EXCEPTION; | |
324 | } | |
374ca955 A |
325 | if(p->specialCasing!=NULL) { |
326 | value|=UCASE_EXCEPTION; | |
327 | } | |
328 | if(p->caseFolding!=NULL) { | |
329 | value|=UCASE_EXCEPTION; | |
330 | } | |
331 | ||
332 | if(delta<UCASE_MIN_DELTA || UCASE_MAX_DELTA<delta) { | |
333 | value|=UCASE_EXCEPTION; | |
334 | } | |
335 | ||
336 | if(p->cc!=0) { | |
337 | if(value&UCASE_DOT_MASK) { | |
338 | fprintf(stderr, "gencase: a soft-dotted character has cc!=0\n"); | |
339 | exit(U_INTERNAL_PROGRAM_ERROR); | |
340 | } | |
341 | if(p->cc==230) { | |
342 | value|=UCASE_ABOVE; | |
343 | } else { | |
344 | value|=UCASE_OTHER_ACCENT; | |
345 | } | |
346 | } | |
347 | ||
348 | /* encode case-ignorable as delta==1 on uncased characters */ | |
73c04bcf A |
349 | isCaseIgnorable=FALSE; |
350 | if((value&UCASE_TYPE_MASK)==UCASE_NONE) { | |
351 | if(ucdVersion>=UNI_4_1) { | |
352 | /* Unicode 4.1 and up: (D47a) Word_Break=MidLetter or Mn, Me, Cf, Lm, Sk */ | |
353 | if( | |
354 | (U_MASK(p->gc)&(U_GC_MN_MASK|U_GC_ME_MASK|U_GC_CF_MASK|U_GC_LM_MASK|U_GC_SK_MASK))!=0 || | |
355 | ((upvec_getValue(pv, p->code, 1)>>UGENCASE_IS_MID_LETTER_SHIFT)&1)!=0 | |
356 | ) { | |
357 | isCaseIgnorable=TRUE; | |
358 | } | |
359 | } else { | |
360 | /* before Unicode 4.1: Mn, Me, Cf, Lm, Sk or 0027 or 00AD or 2019 */ | |
361 | if( | |
362 | (U_MASK(p->gc)&(U_GC_MN_MASK|U_GC_ME_MASK|U_GC_CF_MASK|U_GC_LM_MASK|U_GC_SK_MASK))!=0 || | |
363 | p->code==0x27 || p->code==0xad || p->code==0x2019 | |
364 | ) { | |
365 | isCaseIgnorable=TRUE; | |
366 | } | |
367 | } | |
368 | } | |
369 | ||
370 | if(isCaseIgnorable && p->code!=0x307) { | |
374ca955 A |
371 | /* |
372 | * We use one of the delta/exception bits, which works because we only | |
373 | * store the case-ignorable flag for uncased characters. | |
374 | * There is no delta for uncased characters (see checks above). | |
375 | * If there is an exception for an uncased, case-ignorable character | |
376 | * (although there should not be any case mappings if it's uncased) | |
377 | * then we have a problem. | |
378 | * There is one character which is case-ignorable but has an exception: | |
379 | * U+0307 is uncased, Mn, has conditional special casing and | |
380 | * is therefore handled in code instead. | |
381 | */ | |
382 | if(value&UCASE_EXCEPTION) { | |
383 | fprintf(stderr, "gencase error: unable to encode case-ignorable for U+%04lx with exceptions\n", | |
384 | (unsigned long)p->code); | |
385 | exit(U_INTERNAL_PROGRAM_ERROR); | |
386 | } | |
387 | ||
388 | delta=1; | |
389 | } | |
390 | ||
391 | /* handle exceptions */ | |
392 | if(value&UCASE_EXCEPTION) { | |
393 | /* simply store exceptions for later processing and encoding */ | |
394 | value|=(uint32_t)exceptionsCount<<UGENCASE_EXC_SHIFT; | |
395 | uprv_memcpy(excProps+exceptionsCount, p, sizeof(*p)); | |
396 | if(++exceptionsCount==MAX_EXC_COUNT) { | |
397 | fprintf(stderr, "gencase: too many exceptions\n"); | |
398 | exit(U_INDEX_OUTOFBOUNDS_ERROR); | |
399 | } | |
400 | } else { | |
401 | /* store the simple case mapping delta */ | |
402 | value|=((uint32_t)delta<<UCASE_DELTA_SHIFT)&UCASE_DELTA_MASK; | |
403 | } | |
404 | ||
405 | errorCode=U_ZERO_ERROR; | |
406 | if( value!=oldValue && | |
407 | !upvec_setValue(pv, p->code, p->code+1, 0, value, 0xffffffff, &errorCode) | |
408 | ) { | |
409 | fprintf(stderr, "gencase error: unable to set case mapping values, code: %s\n", | |
410 | u_errorName(errorCode)); | |
411 | exit(errorCode); | |
412 | } | |
73c04bcf A |
413 | |
414 | /* add the multi-character case folding to the "unfold" data */ | |
415 | if(p->caseFolding!=NULL) { | |
416 | int32_t length=p->caseFolding->full[0]; | |
417 | if(length>1 && u_strHasMoreChar32Than(p->caseFolding->full+1, length, 1)) { | |
418 | addUnfolding(p->code, p->caseFolding->full+1, length); | |
419 | } | |
420 | } | |
374ca955 A |
421 | } |
422 | ||
423 | extern void | |
424 | addCaseSensitive(UChar32 first, UChar32 last) { | |
425 | UErrorCode errorCode=U_ZERO_ERROR; | |
426 | if(!upvec_setValue(pv, first, last+1, 0, UCASE_SENSITIVE, UCASE_SENSITIVE, &errorCode)) { | |
427 | fprintf(stderr, "gencase error: unable to set UCASE_SENSITIVE, code: %s\n", | |
428 | u_errorName(errorCode)); | |
429 | exit(errorCode); | |
430 | } | |
431 | } | |
432 | ||
73c04bcf A |
433 | /* finalize reverse case folding ("unfold") data ---------------------------- */ |
434 | ||
435 | static int32_t U_CALLCONV | |
436 | compareUnfold(const void *context, const void *left, const void *right) { | |
437 | return u_memcmp((const UChar *)left, (const UChar *)right, UGENCASE_UNFOLD_WIDTH); | |
438 | } | |
439 | ||
440 | static void | |
441 | makeUnfoldData() { | |
442 | static const UChar | |
443 | iDot[2]= { 0x69, 0x307 }; | |
444 | ||
445 | UChar *p, *q; | |
446 | int32_t i, j, k; | |
447 | UErrorCode errorCode; | |
448 | ||
449 | /* | |
450 | * add a case folding that we missed because it's conditional: | |
451 | * 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE | |
452 | */ | |
453 | addUnfolding(0x130, iDot, 2); | |
454 | ||
455 | /* sort the data */ | |
456 | errorCode=U_ZERO_ERROR; | |
457 | uprv_sortArray(unfold+UGENCASE_UNFOLD_WIDTH, unfoldRows, UGENCASE_UNFOLD_WIDTH*2, | |
458 | compareUnfold, NULL, FALSE, &errorCode); | |
459 | ||
460 | /* make unique-string rows by merging adjacent ones' code point columns */ | |
461 | ||
462 | /* make p point to row i-1 */ | |
463 | p=(UChar *)unfold+UGENCASE_UNFOLD_WIDTH; | |
464 | ||
465 | for(i=1; i<unfoldRows;) { | |
466 | if(0==u_memcmp(p, p+UGENCASE_UNFOLD_WIDTH, UGENCASE_UNFOLD_STRING_WIDTH)) { | |
467 | /* concatenate code point columns */ | |
468 | q=p+UGENCASE_UNFOLD_STRING_WIDTH; | |
469 | for(j=1; j<UGENCASE_UNFOLD_CP_WIDTH && q[j]!=0; ++j) {} | |
470 | for(k=0; k<UGENCASE_UNFOLD_CP_WIDTH && q[UGENCASE_UNFOLD_WIDTH+k]!=0; ++j, ++k) { | |
471 | q[j]=q[UGENCASE_UNFOLD_WIDTH+k]; | |
472 | } | |
473 | if(j>UGENCASE_UNFOLD_CP_WIDTH) { | |
474 | fprintf(stderr, "gencase error: too many code points in unfold[]: %ld>%d=UGENCASE_UNFOLD_CP_WIDTH\n", | |
475 | (long)j, UGENCASE_UNFOLD_CP_WIDTH); | |
476 | exit(U_BUFFER_OVERFLOW_ERROR); | |
477 | } | |
478 | ||
479 | /* move following rows up one */ | |
480 | --unfoldRows; | |
481 | unfoldTop-=UGENCASE_UNFOLD_WIDTH; | |
482 | u_memmove(p+UGENCASE_UNFOLD_WIDTH, p+UGENCASE_UNFOLD_WIDTH*2, (unfoldRows-i)*UGENCASE_UNFOLD_WIDTH); | |
483 | } else { | |
484 | p+=UGENCASE_UNFOLD_WIDTH; | |
485 | ++i; | |
486 | } | |
487 | } | |
488 | ||
489 | unfold[UCASE_UNFOLD_ROWS]=(UChar)unfoldRows; | |
490 | ||
491 | if(beVerbose) { | |
492 | puts("unfold data:"); | |
493 | ||
494 | p=(UChar *)unfold; | |
495 | for(i=0; i<unfoldRows; ++i) { | |
496 | p+=UGENCASE_UNFOLD_WIDTH; | |
497 | printf("[%2d] %04x %04x %04x <- %04x %04x\n", | |
498 | (int)i, p[0], p[1], p[2], p[3], p[4]); | |
499 | } | |
500 | } | |
501 | } | |
502 | ||
503 | /* case closure ------------------------------------------------------------- */ | |
504 | ||
505 | static void | |
506 | addClosureMapping(UChar32 src, UChar32 dest) { | |
507 | uint32_t value; | |
508 | ||
509 | if(beVerbose) { | |
510 | printf("add closure mapping U+%04lx->U+%04lx\n", | |
511 | (unsigned long)src, (unsigned long)dest); | |
512 | } | |
513 | ||
514 | value=upvec_getValue(pv, src, 0); | |
515 | if(value&UCASE_EXCEPTION) { | |
516 | Props *p=excProps+(value>>UGENCASE_EXC_SHIFT); | |
517 | int32_t i; | |
518 | ||
519 | /* append dest to src's closure array */ | |
520 | for(i=0;; ++i) { | |
521 | if(i==LENGTHOF(p->closure)) { | |
522 | fprintf(stderr, "closure[] overflow for U+%04lx->U+%04lx\n", | |
523 | (unsigned long)src, (unsigned long)dest); | |
524 | exit(U_BUFFER_OVERFLOW_ERROR); | |
525 | } else if(p->closure[i]==dest) { | |
526 | break; /* do not store duplicates */ | |
527 | } else if(p->closure[i]==0) { | |
528 | p->closure[i]=dest; | |
529 | break; | |
530 | } | |
531 | } | |
532 | } else { | |
533 | Props p2={ 0 }; | |
534 | UChar32 next; | |
535 | UErrorCode errorCode; | |
536 | ||
537 | /* | |
538 | * decode value into p2 (enough for makeException() to work properly), | |
539 | * add the closure mapping, | |
540 | * and set the new exception for src | |
541 | */ | |
542 | p2.code=src; | |
543 | p2.closure[0]=dest; | |
544 | ||
545 | if((value&UCASE_TYPE_MASK)>UCASE_NONE) { | |
546 | /* one simple case mapping, don't care which one */ | |
547 | next=src+((int16_t)value>>UCASE_DELTA_SHIFT); | |
548 | if(next!=src) { | |
549 | if((value&UCASE_TYPE_MASK)==UCASE_LOWER) { | |
550 | p2.upperCase=p2.titleCase=next; | |
551 | } else { | |
552 | p2.lowerCase=next; | |
553 | } | |
554 | } | |
555 | } else if(value&UCASE_DELTA_MASK) { | |
556 | fprintf(stderr, "gencase error: unable to add case closure exception to case-ignorable U+%04lx\n", | |
557 | (unsigned long)src); | |
558 | exit(U_INTERNAL_PROGRAM_ERROR); | |
559 | } | |
560 | ||
561 | value&=~(UGENCASE_EXC_MASK|UCASE_DELTA_MASK); /* remove previous simple mapping */ | |
562 | value|=(uint32_t)exceptionsCount<<UGENCASE_EXC_SHIFT; | |
563 | value|=UCASE_EXCEPTION; | |
564 | uprv_memcpy(excProps+exceptionsCount, &p2, sizeof(p2)); | |
565 | if(++exceptionsCount==MAX_EXC_COUNT) { | |
566 | fprintf(stderr, "gencase: too many exceptions\n"); | |
567 | exit(U_INDEX_OUTOFBOUNDS_ERROR); | |
568 | } | |
569 | ||
570 | errorCode=U_ZERO_ERROR; | |
571 | if(!upvec_setValue(pv, src, src+1, 0, value, 0xffffffff, &errorCode)) { | |
572 | fprintf(stderr, "gencase error: unable to set case mapping values, code: %s\n", | |
573 | u_errorName(errorCode)); | |
574 | exit(errorCode); | |
575 | } | |
576 | } | |
577 | } | |
578 | ||
579 | /* | |
580 | * Find missing case mapping relationships and add mappings for case closure. | |
581 | * This function starts from an "original" code point and recursively | |
582 | * finds its case mappings and the case mappings of where it maps to. | |
583 | * | |
584 | * The recursion depth is capped at 3 nested calls of this function. | |
585 | * In each call, the current code point is c, and the function enumerates | |
586 | * all of c's simple (single-code point) case mappings. | |
587 | * prev is the code point that case-mapped to c. | |
588 | * prev2 is the code point that case-mapped to prev. | |
589 | * | |
590 | * The initial function call has prev2<0, prev<0, and c==orig | |
591 | * (marking no code points). | |
592 | * It enumerates c's case mappings and recurses without further action. | |
593 | * | |
594 | * The second-level function call has prev2<0, prev==orig, and c is | |
595 | * the destination code point of one of prev's case mappings. | |
596 | * The function checks if any of c's case mappings go back to orig | |
597 | * and adds a closure mapping if not. | |
598 | * In other words, it turns a case mapping relationship of | |
599 | * orig->c | |
600 | * into | |
601 | * orig<->c | |
602 | * | |
603 | * The third-level function call has prev2==orig, prev>=0, and c is | |
604 | * the destination code point of one of prev's case mappings. | |
605 | * (And prev is the destination of one of prev2's case mappings.) | |
606 | * The function checks if any of c's case mappings go back to orig | |
607 | * and adds a closure mapping if not. | |
608 | * In other words, it turns case mapping relationships of | |
609 | * orig->prev->c or orig->prev<->c | |
610 | * into | |
611 | * orig->prev->c->orig or orig->prev<->c->orig | |
612 | * etc. | |
613 | * (Graphically, this closes a triangle.) | |
614 | * | |
615 | * With repeated application on all code points until no more closure mappings | |
616 | * are added, all case equivalence groups get complete mappings. | |
617 | * That is, in each group of code points with case relationships | |
618 | * each code point will in the end have some mapping to each other | |
619 | * code point in the group. | |
620 | * | |
621 | * @return TRUE if a closure mapping was added | |
622 | */ | |
623 | static UBool | |
624 | addClosure(UChar32 orig, UChar32 prev2, UChar32 prev, UChar32 c, uint32_t value) { | |
625 | UChar32 next; | |
626 | UBool someMappingsAdded=FALSE; | |
627 | ||
628 | if(c!=orig) { | |
629 | /* get the properties for c */ | |
630 | value=upvec_getValue(pv, c, 0); | |
631 | } | |
632 | /* else if c==orig then c's value was passed in */ | |
633 | ||
634 | if(value&UCASE_EXCEPTION) { | |
635 | UChar32 set[32]; | |
636 | int32_t i, count=0; | |
637 | ||
638 | Props *p=excProps+(value>>UGENCASE_EXC_SHIFT); | |
639 | ||
640 | /* | |
641 | * marker for whether any of c's mappings goes to orig | |
642 | * c==orig: prevent adding a closure mapping when getting orig's own, direct mappings | |
643 | */ | |
644 | UBool mapsToOrig=(UBool)(c==orig); | |
645 | ||
646 | /* collect c's case mapping destinations in set[] */ | |
647 | if((next=p->upperCase)!=0 && next!=c) { | |
648 | set[count++]=next; | |
649 | } | |
650 | if((next=p->lowerCase)!=0 && next!=c) { | |
651 | set[count++]=next; | |
652 | } | |
653 | if(p->upperCase!=(next=p->titleCase) && next!=c) { | |
654 | set[count++]=next; | |
655 | } | |
656 | if(p->caseFolding!=NULL && (next=p->caseFolding->simple)!=0 && next!=c) { | |
657 | set[count++]=next; | |
658 | } | |
659 | ||
660 | /* append c's current closure mappings to set[] */ | |
661 | for(i=0; i<LENGTHOF(p->closure) && (next=p->closure[i])!=0; ++i) { | |
662 | set[count++]=next; | |
663 | } | |
664 | ||
665 | /* process all code points to which c case-maps */ | |
666 | for(i=0; i<count; ++i) { | |
667 | next=set[i]; /* next!=c */ | |
668 | ||
669 | if(next==orig) { | |
670 | mapsToOrig=TRUE; /* remember that we map to orig */ | |
671 | } else if(prev2<0 && next!=prev) { | |
672 | /* | |
673 | * recurse unless | |
674 | * we have reached maximum depth (prev2>=0) or | |
675 | * this is a mapping to one of the previous code points (orig, prev, c) | |
676 | */ | |
677 | someMappingsAdded|=addClosure(orig, prev, c, next, 0); | |
678 | } | |
679 | } | |
680 | ||
681 | if(!mapsToOrig) { | |
682 | addClosureMapping(c, orig); | |
683 | return TRUE; | |
684 | } | |
685 | } else { | |
686 | if((value&UCASE_TYPE_MASK)>UCASE_NONE) { | |
687 | /* one simple case mapping, don't care which one */ | |
688 | next=c+((int16_t)value>>UCASE_DELTA_SHIFT); | |
689 | if(next!=c) { | |
690 | /* | |
691 | * recurse unless | |
692 | * we have reached maximum depth (prev2>=0) or | |
693 | * this is a mapping to one of the previous code points (orig, prev, c) | |
694 | */ | |
695 | if(prev2<0 && next!=orig && next!=prev) { | |
696 | someMappingsAdded|=addClosure(orig, prev, c, next, 0); | |
697 | } | |
698 | ||
699 | if(c!=orig && next!=orig) { | |
700 | /* c does not map to orig, add a closure mapping c->orig */ | |
701 | addClosureMapping(c, orig); | |
702 | return TRUE; | |
703 | } | |
704 | } | |
705 | } | |
706 | } | |
707 | ||
708 | return someMappingsAdded; | |
709 | } | |
710 | ||
374ca955 A |
711 | extern void |
712 | makeCaseClosure() { | |
73c04bcf A |
713 | UChar *p; |
714 | uint32_t *row; | |
715 | uint32_t value; | |
716 | UChar32 start, limit, c, c2; | |
717 | int32_t i, j; | |
718 | UBool someMappingsAdded; | |
719 | ||
720 | /* | |
721 | * finalize the "unfold" data because we need to use it to add closure mappings | |
722 | * for situations like FB05->"st"<-FB06 | |
723 | * where we would otherwise miss the FB05<->FB06 relationship | |
724 | */ | |
725 | makeUnfoldData(); | |
726 | ||
727 | /* use the "unfold" data to add mappings */ | |
728 | ||
729 | /* p always points to the code points; this loop ignores the strings completely */ | |
730 | p=unfold+UGENCASE_UNFOLD_WIDTH+UGENCASE_UNFOLD_STRING_WIDTH; | |
731 | ||
732 | for(i=0; i<unfoldRows; p+=UGENCASE_UNFOLD_WIDTH, ++i) { | |
733 | j=0; | |
734 | U16_NEXT_UNSAFE(p, j, c); | |
735 | while(j<UGENCASE_UNFOLD_CP_WIDTH && p[j]!=0) { | |
736 | U16_NEXT_UNSAFE(p, j, c2); | |
737 | addClosure(c, U_SENTINEL, c, c2, 0); | |
738 | } | |
739 | } | |
740 | ||
741 | if(beVerbose) { | |
742 | puts("---- ---- ---- ---- (done with closures from unfolding)"); | |
743 | } | |
744 | ||
745 | /* add further closure mappings from analyzing simple mappings */ | |
746 | do { | |
747 | someMappingsAdded=FALSE; | |
748 | ||
749 | i=0; | |
750 | while((row=upvec_getRow(pv, i, &start, &limit))!=NULL) { | |
751 | value=*row; | |
752 | if(value!=0) { | |
753 | while(start<limit) { | |
754 | if(addClosure(start, U_SENTINEL, U_SENTINEL, start, value)) { | |
755 | someMappingsAdded=TRUE; | |
756 | ||
757 | /* | |
758 | * stop this loop because pv was changed and row is not valid any more | |
759 | * skip all rows below the current start | |
760 | */ | |
761 | while((row=upvec_getRow(pv, i, NULL, &limit))!=NULL && start>=limit) { | |
762 | ++i; | |
763 | } | |
764 | row=NULL; /* signal to continue with outer loop, without further ++i */ | |
765 | break; | |
766 | } | |
767 | ++start; | |
768 | } | |
769 | if(row==NULL) { | |
770 | continue; /* see row=NULL above */ | |
771 | } | |
772 | } | |
773 | ++i; | |
774 | } | |
775 | ||
776 | if(beVerbose && someMappingsAdded) { | |
777 | puts("---- ---- ---- ----"); | |
778 | } | |
779 | } while(someMappingsAdded); | |
374ca955 A |
780 | } |
781 | ||
782 | /* exceptions --------------------------------------------------------------- */ | |
783 | ||
73c04bcf A |
784 | /* get the string length from zero-terminated code points in a limited-length array */ |
785 | static int32_t | |
786 | getLengthOfCodePoints(const UChar32 *s, int32_t maxLength) { | |
787 | int32_t i, length; | |
788 | ||
789 | for(i=length=0; i<maxLength && s[i]!=0; ++i) { | |
790 | length+=U16_LENGTH(s[i]); | |
791 | } | |
792 | return length; | |
793 | } | |
794 | ||
374ca955 A |
795 | static UBool |
796 | fullMappingEqualsSimple(const UChar *s, UChar32 simple, UChar32 c) { | |
797 | int32_t i, length; | |
798 | UChar32 full; | |
799 | ||
800 | length=*s++; | |
801 | if(length==0 || length>U16_MAX_LENGTH) { | |
802 | return FALSE; | |
803 | } | |
804 | i=0; | |
805 | U16_NEXT(s, i, length, full); | |
806 | ||
807 | if(simple==0) { | |
808 | simple=c; /* UCD has no simple mapping if it's the same as the code point itself */ | |
809 | } | |
810 | return (UBool)(i==length && full==simple); | |
811 | } | |
812 | ||
813 | static uint16_t | |
814 | makeException(uint32_t value, Props *p) { | |
815 | uint32_t slots[8]; | |
816 | uint32_t slotBits; | |
817 | uint16_t excWord, excIndex, excTop, i, count, length, fullLengths; | |
818 | UBool doubleSlots; | |
819 | ||
820 | /* excIndex will be returned for storing in the trie word */ | |
821 | excIndex=exceptionsTop; | |
822 | if(excIndex>=UCASE_MAX_EXCEPTIONS) { | |
823 | fprintf(stderr, "gencase error: too many exceptions words\n"); | |
824 | exit(U_BUFFER_OVERFLOW_ERROR); | |
825 | } | |
826 | ||
827 | excTop=excIndex+1; /* +1 for excWord which will be stored at excIndex */ | |
828 | ||
829 | /* copy and shift the soft-dotted bits */ | |
830 | excWord=((uint16_t)value&UCASE_DOT_MASK)<<UCASE_EXC_DOT_SHIFT; | |
831 | ||
832 | /* update maxFullLength */ | |
833 | if(p->specialCasing!=NULL) { | |
834 | length=p->specialCasing->lowerCase[0]; | |
835 | if(length>maxFullLength) { | |
836 | maxFullLength=length; | |
837 | } | |
838 | length=p->specialCasing->upperCase[0]; | |
839 | if(length>maxFullLength) { | |
840 | maxFullLength=length; | |
841 | } | |
842 | length=p->specialCasing->titleCase[0]; | |
843 | if(length>maxFullLength) { | |
844 | maxFullLength=length; | |
845 | } | |
846 | } | |
847 | if(p->caseFolding!=NULL) { | |
848 | length=p->caseFolding->full[0]; | |
849 | if(length>maxFullLength) { | |
850 | maxFullLength=length; | |
851 | } | |
852 | } | |
853 | ||
854 | /* set the bits for conditional mappings */ | |
855 | if(p->specialCasing!=NULL && p->specialCasing->isComplex) { | |
856 | excWord|=UCASE_EXC_CONDITIONAL_SPECIAL; | |
857 | p->specialCasing=NULL; | |
858 | } | |
859 | if(p->caseFolding!=NULL && p->caseFolding->simple==0 && p->caseFolding->full[0]==0) { | |
860 | excWord|=UCASE_EXC_CONDITIONAL_FOLD; | |
861 | p->caseFolding=NULL; | |
862 | } | |
863 | ||
864 | /* | |
865 | * Note: | |
866 | * UCD stores no simple mappings when they are the same as the code point itself. | |
867 | * SpecialCasing and CaseFolding do store simple mappings even if they are | |
868 | * the same as the code point itself. | |
869 | * Comparisons between simple regular mappings and simple special/folding | |
870 | * mappings need to compensate for the difference by comparing with the | |
871 | * original code point if a simple UCD mapping is missing (0). | |
872 | */ | |
873 | ||
874 | /* remove redundant data */ | |
875 | if(p->specialCasing!=NULL) { | |
876 | /* do not store full mappings if they are the same as the simple ones */ | |
877 | if(fullMappingEqualsSimple(p->specialCasing->lowerCase, p->lowerCase, p->code)) { | |
878 | p->specialCasing->lowerCase[0]=0; | |
879 | } | |
880 | if(fullMappingEqualsSimple(p->specialCasing->upperCase, p->upperCase, p->code)) { | |
881 | p->specialCasing->upperCase[0]=0; | |
882 | } | |
883 | if(fullMappingEqualsSimple(p->specialCasing->titleCase, p->titleCase, p->code)) { | |
884 | p->specialCasing->titleCase[0]=0; | |
885 | } | |
886 | } | |
887 | if( p->caseFolding!=NULL && | |
888 | fullMappingEqualsSimple(p->caseFolding->full, p->caseFolding->simple, p->code) | |
889 | ) { | |
890 | p->caseFolding->full[0]=0; | |
891 | } | |
892 | ||
893 | /* write the optional slots */ | |
894 | slotBits=0; | |
895 | count=0; | |
896 | ||
897 | if(p->lowerCase!=0) { | |
898 | slots[count]=(uint32_t)p->lowerCase; | |
899 | slotBits|=slots[count]; | |
900 | ++count; | |
901 | excWord|=U_MASK(UCASE_EXC_LOWER); | |
902 | } | |
903 | if( p->caseFolding!=NULL && | |
904 | p->caseFolding->simple!=0 && | |
905 | (p->lowerCase!=0 ? | |
906 | p->caseFolding->simple!=p->lowerCase : | |
907 | p->caseFolding->simple!=p->code) | |
908 | ) { | |
909 | slots[count]=(uint32_t)p->caseFolding->simple; | |
910 | slotBits|=slots[count]; | |
911 | ++count; | |
912 | excWord|=U_MASK(UCASE_EXC_FOLD); | |
913 | } | |
914 | if(p->upperCase!=0) { | |
915 | slots[count]=(uint32_t)p->upperCase; | |
916 | slotBits|=slots[count]; | |
917 | ++count; | |
918 | excWord|=U_MASK(UCASE_EXC_UPPER); | |
919 | } | |
920 | if(p->upperCase!=p->titleCase) { | |
921 | if(p->titleCase!=0) { | |
922 | slots[count]=(uint32_t)p->titleCase; | |
923 | } else { | |
924 | slots[count]=(uint32_t)p->code; | |
925 | } | |
926 | slotBits|=slots[count]; | |
927 | ++count; | |
928 | excWord|=U_MASK(UCASE_EXC_TITLE); | |
929 | } | |
930 | ||
73c04bcf A |
931 | /* length of case closure */ |
932 | if(p->closure[0]!=0) { | |
933 | length=getLengthOfCodePoints(p->closure, LENGTHOF(p->closure)); | |
934 | slots[count]=(uint32_t)length; /* must be 1..UCASE_CLOSURE_MAX_LENGTH */ | |
935 | slotBits|=slots[count]; | |
936 | ++count; | |
937 | excWord|=U_MASK(UCASE_EXC_CLOSURE); | |
938 | } | |
939 | ||
374ca955 A |
940 | /* lengths of full case mapping strings, stored in the last slot */ |
941 | fullLengths=0; | |
942 | if(p->specialCasing!=NULL) { | |
943 | fullLengths=p->specialCasing->lowerCase[0]; | |
944 | fullLengths|=p->specialCasing->upperCase[0]<<8; | |
945 | fullLengths|=p->specialCasing->titleCase[0]<<12; | |
946 | } | |
947 | if(p->caseFolding!=NULL) { | |
948 | fullLengths|=p->caseFolding->full[0]<<4; | |
949 | } | |
950 | if(fullLengths!=0) { | |
951 | slots[count]=fullLengths; | |
952 | slotBits|=slots[count]; | |
953 | ++count; | |
954 | excWord|=U_MASK(UCASE_EXC_FULL_MAPPINGS); | |
955 | } | |
956 | ||
957 | /* write slots */ | |
958 | doubleSlots=(UBool)(slotBits>0xffff); | |
959 | if(!doubleSlots) { | |
960 | for(i=0; i<count; ++i) { | |
961 | exceptions[excTop++]=(uint16_t)slots[i]; | |
962 | } | |
963 | } else { | |
964 | excWord|=UCASE_EXC_DOUBLE_SLOTS; | |
965 | for(i=0; i<count; ++i) { | |
966 | exceptions[excTop++]=(uint16_t)(slots[i]>>16); | |
967 | exceptions[excTop++]=(uint16_t)slots[i]; | |
968 | } | |
969 | } | |
970 | ||
971 | /* write the full case mapping strings */ | |
972 | if(p->specialCasing!=NULL) { | |
973 | length=(uint16_t)p->specialCasing->lowerCase[0]; | |
974 | u_memcpy((UChar *)exceptions+excTop, p->specialCasing->lowerCase+1, length); | |
975 | excTop+=length; | |
976 | } | |
977 | if(p->caseFolding!=NULL) { | |
978 | length=(uint16_t)p->caseFolding->full[0]; | |
979 | u_memcpy((UChar *)exceptions+excTop, p->caseFolding->full+1, length); | |
980 | excTop+=length; | |
981 | } | |
982 | if(p->specialCasing!=NULL) { | |
983 | length=(uint16_t)p->specialCasing->upperCase[0]; | |
984 | u_memcpy((UChar *)exceptions+excTop, p->specialCasing->upperCase+1, length); | |
985 | excTop+=length; | |
986 | ||
987 | length=(uint16_t)p->specialCasing->titleCase[0]; | |
988 | u_memcpy((UChar *)exceptions+excTop, p->specialCasing->titleCase+1, length); | |
989 | excTop+=length; | |
990 | } | |
991 | ||
73c04bcf A |
992 | /* write the closure data */ |
993 | if(p->closure[0]!=0) { | |
994 | UChar32 c; | |
995 | ||
996 | for(i=0; i<LENGTHOF(p->closure) && (c=p->closure[i])!=0; ++i) { | |
997 | U16_APPEND_UNSAFE((UChar *)exceptions, excTop, c); | |
998 | } | |
999 | } | |
1000 | ||
374ca955 A |
1001 | exceptionsTop=excTop; |
1002 | ||
1003 | /* write the main exceptions word */ | |
1004 | exceptions[excIndex]=excWord; | |
1005 | ||
1006 | return excIndex; | |
1007 | } | |
1008 | ||
1009 | extern void | |
1010 | makeExceptions() { | |
1011 | uint32_t *row; | |
1012 | uint32_t value; | |
1013 | int32_t i; | |
1014 | uint16_t excIndex; | |
1015 | ||
1016 | i=0; | |
1017 | while((row=upvec_getRow(pv, i, NULL, NULL))!=NULL) { | |
1018 | value=*row; | |
1019 | if(value&UCASE_EXCEPTION) { | |
1020 | excIndex=makeException(value, excProps+(value>>UGENCASE_EXC_SHIFT)); | |
1021 | *row=(value&~(UGENCASE_EXC_MASK|UCASE_EXC_MASK))|(excIndex<<UCASE_EXC_SHIFT); | |
1022 | } | |
1023 | ++i; | |
1024 | } | |
1025 | } | |
1026 | ||
1027 | /* generate output data ----------------------------------------------------- */ | |
1028 | ||
1029 | extern void | |
73c04bcf | 1030 | generateData(const char *dataDir, UBool csource) { |
374ca955 A |
1031 | static int32_t indexes[UCASE_IX_TOP]={ |
1032 | UCASE_IX_TOP | |
1033 | }; | |
1034 | static uint8_t trieBlock[40000]; | |
1035 | ||
1036 | const uint32_t *row; | |
1037 | UChar32 start, limit; | |
1038 | int32_t i; | |
1039 | ||
1040 | UNewDataMemory *pData; | |
1041 | UNewTrie *pTrie; | |
1042 | UErrorCode errorCode=U_ZERO_ERROR; | |
1043 | int32_t trieSize; | |
1044 | long dataLength; | |
1045 | ||
1046 | pTrie=utrie_open(NULL, NULL, 20000, 0, 0, TRUE); | |
1047 | if(pTrie==NULL) { | |
1048 | fprintf(stderr, "gencase error: unable to create a UNewTrie\n"); | |
1049 | exit(U_MEMORY_ALLOCATION_ERROR); | |
1050 | } | |
1051 | ||
1052 | for(i=0; (row=upvec_getRow(pv, i, &start, &limit))!=NULL; ++i) { | |
1053 | if(!utrie_setRange32(pTrie, start, limit, *row, TRUE)) { | |
1054 | fprintf(stderr, "gencase error: unable to set trie value (overflow)\n"); | |
1055 | exit(U_BUFFER_OVERFLOW_ERROR); | |
1056 | } | |
1057 | } | |
1058 | ||
1059 | trieSize=utrie_serialize(pTrie, trieBlock, sizeof(trieBlock), NULL, TRUE, &errorCode); | |
1060 | if(U_FAILURE(errorCode)) { | |
1061 | fprintf(stderr, "error: utrie_serialize failed: %s (length %ld)\n", u_errorName(errorCode), (long)trieSize); | |
1062 | exit(errorCode); | |
1063 | } | |
1064 | ||
1065 | indexes[UCASE_IX_EXC_LENGTH]=exceptionsTop; | |
1066 | indexes[UCASE_IX_TRIE_SIZE]=trieSize; | |
73c04bcf A |
1067 | indexes[UCASE_IX_UNFOLD_LENGTH]=unfoldTop; |
1068 | indexes[UCASE_IX_LENGTH]=(int32_t)sizeof(indexes)+trieSize+2*exceptionsTop+2*unfoldTop; | |
374ca955 A |
1069 | |
1070 | indexes[UCASE_IX_MAX_FULL_LENGTH]=maxFullLength; | |
1071 | ||
1072 | if(beVerbose) { | |
1073 | printf("trie size in bytes: %5d\n", (int)trieSize); | |
1074 | printf("number of code points with exceptions: %5d\n", exceptionsCount); | |
1075 | printf("size in bytes of exceptions: %5d\n", 2*exceptionsTop); | |
73c04bcf | 1076 | printf("size in bytes of reverse foldings: %5d\n", 2*unfoldTop); |
374ca955 A |
1077 | printf("data size: %5d\n", (int)indexes[UCASE_IX_LENGTH]); |
1078 | } | |
1079 | ||
73c04bcf A |
1080 | if(csource) { |
1081 | /* write .c file for hardcoded data */ | |
1082 | UTrie trie={ NULL }; | |
1083 | FILE *f; | |
1084 | ||
1085 | utrie_unserialize(&trie, trieBlock, trieSize, &errorCode); | |
1086 | if(U_FAILURE(errorCode)) { | |
1087 | fprintf( | |
1088 | stderr, | |
1089 | "gencase error: failed to utrie_unserialize(ucase.icu trie) - %s\n", | |
1090 | u_errorName(errorCode)); | |
1091 | return; | |
1092 | } | |
374ca955 | 1093 | |
73c04bcf A |
1094 | f=usrc_create(dataDir, "ucase_props_data.c"); |
1095 | if(f!=NULL) { | |
1096 | usrc_writeArray(f, | |
1097 | "static const UVersionInfo ucase_props_dataVersion={", | |
1098 | dataInfo.dataVersion, 8, 4, | |
1099 | "};\n\n"); | |
1100 | usrc_writeArray(f, | |
1101 | "static const int32_t ucase_props_indexes[UCASE_IX_TOP]={", | |
1102 | indexes, 32, UCASE_IX_TOP, | |
1103 | "};\n\n"); | |
1104 | usrc_writeUTrieArrays(f, | |
1105 | "static const uint16_t ucase_props_trieIndex[%ld]={\n", NULL, | |
1106 | &trie, | |
1107 | "\n};\n\n"); | |
1108 | usrc_writeArray(f, | |
1109 | "static const uint16_t ucase_props_exceptions[%ld]={\n", | |
1110 | exceptions, 16, exceptionsTop, | |
1111 | "\n};\n\n"); | |
1112 | usrc_writeArray(f, | |
1113 | "static const uint16_t ucase_props_unfold[%ld]={\n", | |
1114 | unfold, 16, unfoldTop, | |
1115 | "\n};\n\n"); | |
1116 | fputs( | |
1117 | "static const UCaseProps ucase_props_singleton={\n" | |
1118 | " NULL,\n" | |
1119 | " ucase_props_indexes,\n" | |
1120 | " ucase_props_exceptions,\n" | |
1121 | " ucase_props_unfold,\n", | |
1122 | f); | |
1123 | usrc_writeUTrieStruct(f, | |
1124 | " {\n", | |
1125 | &trie, "ucase_props_trieIndex", NULL, NULL, | |
1126 | " },\n"); | |
1127 | usrc_writeArray(f, " { ", dataInfo.formatVersion, 8, 4, " }\n"); | |
1128 | fputs("};\n", f); | |
1129 | fclose(f); | |
1130 | } | |
1131 | } else { | |
1132 | /* write the data */ | |
1133 | pData=udata_create(dataDir, UCASE_DATA_TYPE, UCASE_DATA_NAME, &dataInfo, | |
1134 | haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode); | |
1135 | if(U_FAILURE(errorCode)) { | |
1136 | fprintf(stderr, "gencase: unable to create data memory, %s\n", u_errorName(errorCode)); | |
1137 | exit(errorCode); | |
1138 | } | |
374ca955 | 1139 | |
73c04bcf A |
1140 | udata_writeBlock(pData, indexes, sizeof(indexes)); |
1141 | udata_writeBlock(pData, trieBlock, trieSize); | |
1142 | udata_writeBlock(pData, exceptions, 2*exceptionsTop); | |
1143 | udata_writeBlock(pData, unfold, 2*unfoldTop); | |
374ca955 | 1144 | |
73c04bcf A |
1145 | /* finish up */ |
1146 | dataLength=udata_finish(pData, &errorCode); | |
1147 | if(U_FAILURE(errorCode)) { | |
1148 | fprintf(stderr, "gencase: error %d writing the output file\n", errorCode); | |
1149 | exit(errorCode); | |
1150 | } | |
1151 | ||
1152 | if(dataLength!=indexes[UCASE_IX_LENGTH]) { | |
1153 | fprintf(stderr, "gencase: data length %ld != calculated size %d\n", | |
1154 | dataLength, (int)indexes[UCASE_IX_LENGTH]); | |
1155 | exit(U_INTERNAL_PROGRAM_ERROR); | |
1156 | } | |
374ca955 A |
1157 | } |
1158 | ||
1159 | utrie_close(pTrie); | |
1160 | } | |
1161 | ||
1162 | /* | |
1163 | * Hey, Emacs, please set the following: | |
1164 | * | |
1165 | * Local Variables: | |
1166 | * indent-tabs-mode: nil | |
1167 | * End: | |
1168 | * | |
1169 | */ |