]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ******************************************************************************* | |
3 | * | |
73c04bcf | 4 | * Copyright (C) 1999-2006, International Business Machines |
b75a7d8f A |
5 | * Corporation and others. All Rights Reserved. |
6 | * | |
7 | ******************************************************************************* | |
8 | * file name: store.c | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 1999dec11 | |
14 | * created by: Markus W. Scherer | |
15 | * | |
16 | * Store Unicode character properties efficiently for | |
17 | * random access. | |
18 | */ | |
19 | ||
20 | #include <stdio.h> | |
b75a7d8f A |
21 | #include "unicode/utypes.h" |
22 | #include "unicode/uchar.h" | |
23 | #include "cmemory.h" | |
24 | #include "cstring.h" | |
b75a7d8f A |
25 | #include "utrie.h" |
26 | #include "unicode/udata.h" | |
27 | #include "unewdata.h" | |
73c04bcf | 28 | #include "writesrc.h" |
b75a7d8f A |
29 | #include "uprops.h" |
30 | #include "genprops.h" | |
31 | ||
32 | #define DO_DEBUG_OUT 0 | |
33 | ||
34 | /* Unicode character properties file format ------------------------------------ | |
35 | ||
36 | The file format prepared and written here contains several data | |
37 | structures that store indexes or data. | |
38 | ||
39 | Before the data contents described below, there are the headers required by | |
40 | the udata API for loading ICU data. Especially, a UDataInfo structure | |
41 | precedes the actual data. It contains platform properties values and the | |
42 | file format version. | |
43 | ||
73c04bcf A |
44 | The following is a description of format version 4 . |
45 | ||
46 | The format changes between version 3 and 4 because the properties related to | |
47 | case mappings and bidi/shaping are pulled out into separate files | |
48 | for modularization. | |
49 | In order to reduce the need for code changes, some of the previous data | |
50 | structures are omitted, rather than rearranging everything. | |
51 | ||
52 | For details see "Changes in format version 4" below. | |
b75a7d8f A |
53 | |
54 | Data contents: | |
55 | ||
56 | The contents is a parsed, binary form of several Unicode character | |
57 | database files, most prominently UnicodeData.txt. | |
58 | ||
59 | Any Unicode code point from 0 to 0x10ffff can be looked up to get | |
60 | the properties, if any, for that code point. This means that the input | |
61 | to the lookup are 21-bit unsigned integers, with not all of the | |
62 | 21-bit range used. | |
63 | ||
64 | It is assumed that client code keeps a uint32_t pointer | |
65 | to the beginning of the data: | |
66 | ||
67 | const uint32_t *p32; | |
68 | ||
69 | Formally, the file contains the following structures: | |
70 | ||
71 | const int32_t indexes[16] with values i0..i15: | |
72 | ||
73c04bcf A |
73 | i0 indicates the length of the main trie. |
74 | i0..i3 all have the same value in format version 4.0; | |
75 | the related props32[] and exceptions[] and uchars[] were used in format version 3 | |
76 | ||
b75a7d8f A |
77 | i0 propsIndex; -- 32-bit unit index to the table of 32-bit properties words |
78 | i1 exceptionsIndex; -- 32-bit unit index to the table of 32-bit exception words | |
79 | i2 exceptionsTopIndex; -- 32-bit unit index to the array of UChars for special mappings | |
80 | ||
81 | i3 additionalTrieIndex; -- 32-bit unit index to the additional trie for more properties | |
82 | i4 additionalVectorsIndex; -- 32-bit unit index to the table of properties vectors | |
83 | i5 additionalVectorsColumns; -- number of 32-bit words per properties vector | |
84 | ||
85 | i6 reservedItemIndex; -- 32-bit unit index to the top of the properties vectors table | |
86 | i7..i9 reservedIndexes; -- reserved values; 0 for now | |
87 | ||
73c04bcf A |
88 | i10 maxValues; -- maximum code values for vector word 0, see uprops.h (new in format version 3.1+) |
89 | i11 maxValues2; -- maximum code values for vector word 2, see uprops.h (new in format version 3.2) | |
b75a7d8f A |
90 | i12..i15 reservedIndexes; -- reserved values; 0 for now |
91 | ||
92 | PT serialized properties trie, see utrie.h (byte size: 4*(i0-16)) | |
93 | ||
73c04bcf A |
94 | P, E, and U are not used (empty) in format version 4 |
95 | ||
b75a7d8f A |
96 | P const uint32_t props32[i1-i0]; |
97 | E const uint32_t exceptions[i2-i1]; | |
98 | U const UChar uchars[2*(i3-i2)]; | |
99 | ||
100 | AT serialized trie for additional properties (byte size: 4*(i4-i3)) | |
101 | PV const uint32_t propsVectors[(i6-i4)/i5][i5]==uint32_t propsVectors[i6-i4]; | |
102 | ||
103 | Trie lookup and properties: | |
104 | ||
105 | In order to condense the data for the 21-bit code space, several properties of | |
106 | the Unicode code assignment are exploited: | |
107 | - The code space is sparse. | |
108 | - There are several 10k of consecutive codes with the same properties. | |
109 | - Characters and scripts are allocated in groups of 16 code points. | |
110 | - Inside blocks for scripts the properties are often repetitive. | |
111 | - The 21-bit space is not fully used for Unicode. | |
112 | ||
113 | The lookup of properties for a given code point is done with a trie lookup, | |
114 | using the UTrie implementation. | |
73c04bcf | 115 | The trie lookup result is a 16-bit properties word. |
b75a7d8f A |
116 | |
117 | With a given Unicode code point | |
118 | ||
119 | UChar32 c; | |
120 | ||
121 | and 0<=c<0x110000, the lookup is done like this: | |
122 | ||
73c04bcf A |
123 | uint16_t props; |
124 | UTRIE_GET16(trie, c, props); | |
b75a7d8f | 125 | |
73c04bcf | 126 | Each 16-bit properties word contains: |
b75a7d8f A |
127 | |
128 | 0.. 4 general category | |
73c04bcf A |
129 | 5.. 7 numeric type |
130 | non-digit numbers are stored with multiple types and pseudo-types | |
131 | in order to facilitate compact encoding: | |
132 | 0 no numeric value (0) | |
133 | 1 decimal digit value (0..9) | |
134 | 2 digit value (0..9) | |
135 | 3 (U_NT_NUMERIC) normal non-digit numeric value 0..0xff | |
136 | 4 (internal type UPROPS_NT_FRACTION) fraction | |
137 | 5 (internal type UPROPS_NT_LARGE) large number >0xff | |
138 | 6..7 reserved | |
139 | ||
140 | when returning the numeric type from a public API, | |
141 | internal types must be turned into U_NT_NUMERIC | |
142 | ||
143 | 8..15 numeric value | |
144 | encoding of fractions and large numbers see below | |
145 | ||
146 | Fractions: | |
147 | // n is the 8-bit numeric value from bits 8..15 of the trie word (shifted down) | |
148 | int32_t num, den; | |
149 | num=n>>3; // num=0..31 | |
150 | den=(n&7)+2; // den=2..9 | |
151 | if(num==0) { | |
152 | num=-1; // num=-1 or 1..31 | |
153 | } | |
154 | double result=(double)num/(double)den; | |
155 | ||
156 | Large numbers: | |
157 | // n is the 8-bit numeric value from bits 8..15 of the trie word (shifted down) | |
158 | int32_t m, e; | |
159 | m=n>>4; // m=0..15 | |
160 | e=(n&0xf); | |
161 | if(m==0) { | |
162 | m=1; // for large powers of 10 | |
163 | e+=18; // e=18..33 | |
164 | } else { | |
165 | e+=2; // e=2..17 | |
166 | } // m==10..15 are reserved | |
167 | double result=(double)m*10^e; | |
b75a7d8f A |
168 | |
169 | --- Additional properties (new in format version 2.1) --- | |
170 | ||
171 | The second trie for additional properties (AT) is also a UTrie with 16-bit data. | |
172 | The data words consist of 32-bit unit indexes (not row indexes!) into the | |
173 | table of unique properties vectors (PV). | |
174 | Each vector contains a set of properties. | |
175 | The width of a vector (number of uint32_t per row) may change | |
176 | with the formatVersion, it is stored in i5. | |
177 | ||
178 | Current properties: see icu/source/common/uprops.h | |
179 | ||
180 | --- Changes in format version 3.1 --- | |
181 | ||
182 | See i10 maxValues above, contains only UBLOCK_COUNT and USCRIPT_CODE_LIMIT. | |
183 | ||
184 | --- Changes in format version 3.2 --- | |
185 | ||
186 | - The tries use linear Latin-1 ranges. | |
187 | - The additional properties bits store full properties XYZ instead | |
188 | of partial Other_XYZ, so that changes in the derivation formulas | |
189 | need not be tracked in runtime library code. | |
190 | - Joining Type and Line Break are also stored completely, so that uprops.c | |
191 | needs no runtime formulas for enumerated properties either. | |
192 | - Store the case-sensitive flag in the main properties word. | |
193 | - i10 also contains U_LB_COUNT and U_EA_COUNT. | |
194 | - i11 contains maxValues2 for vector word 2. | |
195 | ||
73c04bcf A |
196 | --- Changes in format version 4 --- |
197 | ||
198 | The format changes between version 3 and 4 because the properties related to | |
199 | case mappings and bidi/shaping are pulled out into separate files | |
200 | for modularization. | |
201 | In order to reduce the need for code changes, some of the previous data | |
202 | structures are omitted, rather than rearranging everything. | |
203 | ||
204 | (The change to format version 4 is for ICU 3.4. The last CVS revision of | |
205 | genprops/store.c for format version 3.2 is 1.48.) | |
206 | ||
207 | The main trie's data is significantly simplified: | |
208 | - The trie's 16-bit data word is used directly instead of as an index | |
209 | into props32[]. | |
210 | - The trie uses the default trie folding functions instead of custom ones. | |
211 | - Numeric values are stored directly in the trie data word, with special | |
212 | encodings. | |
213 | - No more exception data (the data that needed it was pulled out, or, in the | |
214 | case of numeric values, encoded differently). | |
215 | - No more string data (pulled out - was for case mappings). | |
216 | ||
217 | Also, some of the previously used properties vector bits are reserved again. | |
218 | ||
219 | The indexes[] values for the omitted structures are still filled in | |
220 | (indicating zero-length arrays) so that the swapper code remains unchanged. | |
221 | ||
b75a7d8f A |
222 | ----------------------------------------------------------------------------- */ |
223 | ||
224 | /* UDataInfo cf. udata.h */ | |
225 | static UDataInfo dataInfo={ | |
226 | sizeof(UDataInfo), | |
227 | 0, | |
228 | ||
229 | U_IS_BIG_ENDIAN, | |
230 | U_CHARSET_FAMILY, | |
231 | U_SIZEOF_UCHAR, | |
232 | 0, | |
233 | ||
234 | { 0x55, 0x50, 0x72, 0x6f }, /* dataFormat="UPro" */ | |
73c04bcf | 235 | { 4, 0, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */ |
374ca955 | 236 | { 4, 0, 1, 0 } /* dataVersion */ |
b75a7d8f A |
237 | }; |
238 | ||
b75a7d8f A |
239 | static UNewTrie *pTrie=NULL; |
240 | ||
b75a7d8f A |
241 | /* -------------------------------------------------------------------------- */ |
242 | ||
243 | extern void | |
244 | setUnicodeVersion(const char *v) { | |
245 | UVersionInfo version; | |
246 | u_versionFromString(version, v); | |
247 | uprv_memcpy(dataInfo.dataVersion, version, 4); | |
248 | } | |
249 | ||
250 | extern void | |
251 | initStore() { | |
73c04bcf | 252 | pTrie=utrie_open(NULL, NULL, 40000, 0, 0, TRUE); |
b75a7d8f A |
253 | if(pTrie==NULL) { |
254 | fprintf(stderr, "error: unable to create a UNewTrie\n"); | |
255 | exit(U_MEMORY_ALLOCATION_ERROR); | |
256 | } | |
257 | ||
b75a7d8f A |
258 | initAdditionalProperties(); |
259 | } | |
260 | ||
73c04bcf A |
261 | extern void |
262 | exitStore() { | |
263 | utrie_close(pTrie); | |
264 | exitAdditionalProperties(); | |
265 | } | |
266 | ||
267 | static uint32_t printNumericTypeValueError(Props *p) { | |
268 | fprintf(stderr, "genprops error: unable to encode numeric type & value %d %ld/%lu E%d\n", | |
269 | (int)p->numericType, (long)p->numericValue, (unsigned long)p->denominator, p->exponent); | |
270 | exit(U_ILLEGAL_ARGUMENT_ERROR); | |
271 | return 0; | |
272 | } | |
273 | ||
b75a7d8f A |
274 | /* store a character's properties ------------------------------------------- */ |
275 | ||
276 | extern uint32_t | |
277 | makeProps(Props *p) { | |
73c04bcf A |
278 | uint32_t den; |
279 | int32_t type, value, exp; | |
280 | ||
281 | /* encode numeric type & value */ | |
282 | type=p->numericType; | |
283 | value=p->numericValue; | |
284 | den=p->denominator; | |
285 | exp=p->exponent; | |
286 | ||
287 | if(den!=0) { | |
288 | /* fraction */ | |
289 | if( type!=U_NT_NUMERIC || | |
290 | value<-1 || value==0 || value>UPROPS_FRACTION_MAX_NUM || | |
291 | den<UPROPS_FRACTION_MIN_DEN || UPROPS_FRACTION_MAX_DEN<den || | |
292 | exp!=0 | |
293 | ) { | |
294 | return printNumericTypeValueError(p); | |
b75a7d8f | 295 | } |
73c04bcf A |
296 | type=UPROPS_NT_FRACTION; |
297 | ||
298 | if(value==-1) { | |
299 | value=0; | |
b75a7d8f | 300 | } |
73c04bcf A |
301 | den-=UPROPS_FRACTION_DEN_OFFSET; |
302 | value=(value<<UPROPS_FRACTION_NUM_SHIFT)|den; | |
303 | } else if(exp!=0) { | |
304 | /* very large value */ | |
305 | if( type!=U_NT_NUMERIC || | |
306 | value<1 || 9<value || | |
307 | exp<UPROPS_LARGE_MIN_EXP || UPROPS_LARGE_MAX_EXP_EXTRA<exp | |
308 | ) { | |
309 | return printNumericTypeValueError(p); | |
b75a7d8f | 310 | } |
73c04bcf | 311 | type=UPROPS_NT_LARGE; |
b75a7d8f | 312 | |
73c04bcf A |
313 | if(exp<=UPROPS_LARGE_MAX_EXP) { |
314 | /* 1..9 * 10^(2..17) */ | |
315 | exp-=UPROPS_LARGE_EXP_OFFSET; | |
316 | } else { | |
317 | /* 1 * 10^(18..33) */ | |
318 | if(value!=1) { | |
319 | return printNumericTypeValueError(p); | |
b75a7d8f | 320 | } |
73c04bcf A |
321 | value=0; |
322 | exp-=UPROPS_LARGE_EXP_OFFSET_EXTRA; | |
b75a7d8f | 323 | } |
73c04bcf A |
324 | value=(value<<UPROPS_LARGE_MANT_SHIFT)|exp; |
325 | } else if(value>UPROPS_MAX_SMALL_NUMBER) { | |
326 | /* large value */ | |
327 | if(type!=U_NT_NUMERIC) { | |
328 | return printNumericTypeValueError(p); | |
329 | } | |
330 | type=UPROPS_NT_LARGE; | |
b75a7d8f | 331 | |
73c04bcf A |
332 | /* split the value into mantissa and exponent, base 10 */ |
333 | while((value%10)==0) { | |
334 | value/=10; | |
335 | ++exp; | |
336 | } | |
337 | if(value>9) { | |
338 | return printNumericTypeValueError(p); | |
339 | } | |
b75a7d8f | 340 | |
73c04bcf A |
341 | exp-=UPROPS_LARGE_EXP_OFFSET; |
342 | value=(value<<UPROPS_LARGE_MANT_SHIFT)|exp; | |
343 | } else if(value<0) { | |
344 | /* unable to encode negative values, other than fractions -1/x */ | |
345 | return printNumericTypeValueError(p); | |
b75a7d8f | 346 | |
73c04bcf | 347 | /* } else normal value=0..0xff { */ |
b75a7d8f A |
348 | } |
349 | ||
73c04bcf A |
350 | /* encode the properties */ |
351 | return | |
b75a7d8f | 352 | (uint32_t)p->generalCategory | |
73c04bcf A |
353 | ((uint32_t)type<<UPROPS_NUMERIC_TYPE_SHIFT) | |
354 | ((uint32_t)value<<UPROPS_NUMERIC_VALUE_SHIFT); | |
b75a7d8f A |
355 | } |
356 | ||
357 | extern void | |
358 | addProps(uint32_t c, uint32_t x) { | |
359 | if(!utrie_set32(pTrie, (UChar32)c, x)) { | |
360 | fprintf(stderr, "error: too many entries for the properties trie\n"); | |
361 | exit(U_BUFFER_OVERFLOW_ERROR); | |
362 | } | |
363 | } | |
364 | ||
b75a7d8f A |
365 | extern uint32_t |
366 | getProps(uint32_t c) { | |
367 | return utrie_get32(pTrie, (UChar32)c, NULL); | |
368 | } | |
369 | ||
370 | /* areas of same properties ------------------------------------------------- */ | |
371 | ||
372 | extern void | |
373 | repeatProps(uint32_t first, uint32_t last, uint32_t x) { | |
374 | if(!utrie_setRange32(pTrie, (UChar32)first, (UChar32)(last+1), x, FALSE)) { | |
375 | fprintf(stderr, "error: too many entries for the properties trie\n"); | |
376 | exit(U_BUFFER_OVERFLOW_ERROR); | |
377 | } | |
378 | } | |
379 | ||
b75a7d8f A |
380 | /* generate output data ----------------------------------------------------- */ |
381 | ||
b75a7d8f | 382 | extern void |
73c04bcf | 383 | generateData(const char *dataDir, UBool csource) { |
b75a7d8f A |
384 | static int32_t indexes[UPROPS_INDEX_COUNT]={ |
385 | 0, 0, 0, 0, | |
386 | 0, 0, 0, 0, | |
387 | 0, 0, 0, 0, | |
388 | 0, 0, 0, 0 | |
389 | }; | |
390 | static uint8_t trieBlock[40000]; | |
391 | static uint8_t additionalProps[120000]; | |
392 | ||
393 | UNewDataMemory *pData; | |
394 | UErrorCode errorCode=U_ZERO_ERROR; | |
73c04bcf | 395 | uint32_t size = 0; |
b75a7d8f A |
396 | int32_t trieSize, additionalPropsSize, offset; |
397 | long dataLength; | |
398 | ||
73c04bcf | 399 | trieSize=utrie_serialize(pTrie, trieBlock, sizeof(trieBlock), NULL, TRUE, &errorCode); |
b75a7d8f A |
400 | if(U_FAILURE(errorCode)) { |
401 | fprintf(stderr, "error: utrie_serialize failed: %s (length %ld)\n", u_errorName(errorCode), (long)trieSize); | |
402 | exit(errorCode); | |
403 | } | |
404 | ||
405 | offset=sizeof(indexes)/4; /* uint32_t offset to the properties trie */ | |
406 | ||
73c04bcf | 407 | /* round up trie size to 4-alignment */ |
b75a7d8f A |
408 | trieSize=(trieSize+3)&~3; |
409 | offset+=trieSize>>2; | |
73c04bcf A |
410 | indexes[UPROPS_PROPS32_INDEX]= /* set indexes to the same offsets for empty */ |
411 | indexes[UPROPS_EXCEPTIONS_INDEX]= /* structures from the old format version 3 */ | |
412 | indexes[UPROPS_EXCEPTIONS_TOP_INDEX]= /* so that less runtime code has to be changed */ | |
b75a7d8f A |
413 | indexes[UPROPS_ADDITIONAL_TRIE_INDEX]=offset; |
414 | ||
415 | if(beVerbose) { | |
374ca955 | 416 | printf("trie size in bytes: %5u\n", (int)trieSize); |
b75a7d8f A |
417 | } |
418 | ||
73c04bcf A |
419 | if(csource) { |
420 | /* write .c file for hardcoded data */ | |
421 | UTrie trie={ NULL }; | |
422 | FILE *f; | |
423 | ||
424 | utrie_unserialize(&trie, trieBlock, trieSize, &errorCode); | |
425 | if(U_FAILURE(errorCode)) { | |
426 | fprintf( | |
427 | stderr, | |
428 | "genprops error: failed to utrie_unserialize(uprops.icu main trie) - %s\n", | |
429 | u_errorName(errorCode)); | |
430 | return; | |
431 | } | |
b75a7d8f | 432 | |
73c04bcf A |
433 | f=usrc_create(dataDir, "uchar_props_data.c"); |
434 | if(f!=NULL) { | |
435 | usrc_writeArray(f, | |
436 | "static const UVersionInfo formatVersion={", | |
437 | dataInfo.formatVersion, 8, 4, | |
438 | "};\n\n"); | |
439 | usrc_writeArray(f, | |
440 | "static const UVersionInfo dataVersion={", | |
441 | dataInfo.dataVersion, 8, 4, | |
442 | "};\n\n"); | |
443 | usrc_writeUTrieArrays(f, | |
444 | "static const uint16_t propsTrie_index[%ld]={\n", NULL, | |
445 | &trie, | |
446 | "\n};\n\n"); | |
447 | usrc_writeUTrieStruct(f, | |
448 | "static const UTrie propsTrie={\n", | |
449 | &trie, "propsTrie_index", NULL, NULL, | |
450 | "};\n\n"); | |
451 | ||
452 | additionalPropsSize=writeAdditionalData(f, additionalProps, sizeof(additionalProps), indexes); | |
453 | size=4*offset+additionalPropsSize; /* total size of data */ | |
454 | ||
455 | usrc_writeArray(f, | |
456 | "static const int32_t indexes[UPROPS_INDEX_COUNT]={", | |
457 | indexes, 32, UPROPS_INDEX_COUNT, | |
458 | "};\n\n"); | |
459 | fclose(f); | |
460 | } | |
461 | } else { | |
462 | /* write the data */ | |
463 | pData=udata_create(dataDir, DATA_TYPE, DATA_NAME, &dataInfo, | |
464 | haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode); | |
465 | if(U_FAILURE(errorCode)) { | |
466 | fprintf(stderr, "genprops: unable to create data memory, %s\n", u_errorName(errorCode)); | |
467 | exit(errorCode); | |
468 | } | |
b75a7d8f | 469 | |
73c04bcf A |
470 | additionalPropsSize=writeAdditionalData(NULL, additionalProps, sizeof(additionalProps), indexes); |
471 | size=4*offset+additionalPropsSize; /* total size of data */ | |
b75a7d8f | 472 | |
73c04bcf A |
473 | udata_writeBlock(pData, indexes, sizeof(indexes)); |
474 | udata_writeBlock(pData, trieBlock, trieSize); | |
475 | udata_writeBlock(pData, additionalProps, additionalPropsSize); | |
b75a7d8f | 476 | |
73c04bcf A |
477 | /* finish up */ |
478 | dataLength=udata_finish(pData, &errorCode); | |
479 | if(U_FAILURE(errorCode)) { | |
480 | fprintf(stderr, "genprops: error %d writing the output file\n", errorCode); | |
481 | exit(errorCode); | |
482 | } | |
b75a7d8f | 483 | |
73c04bcf A |
484 | if(dataLength!=(long)size) { |
485 | fprintf(stderr, "genprops: data length %ld != calculated size %lu\n", | |
486 | dataLength, (unsigned long)size); | |
487 | exit(U_INTERNAL_PROGRAM_ERROR); | |
488 | } | |
b75a7d8f A |
489 | } |
490 | ||
73c04bcf A |
491 | if(beVerbose) { |
492 | printf("data size: %6lu\n", (unsigned long)size); | |
b75a7d8f | 493 | } |
b75a7d8f A |
494 | } |
495 | ||
496 | /* | |
497 | * Hey, Emacs, please set the following: | |
498 | * | |
499 | * Local Variables: | |
500 | * indent-tabs-mode: nil | |
501 | * End: | |
502 | * | |
503 | */ |