]> git.saurik.com Git - apple/icu.git/blame - icuSources/tools/genprops/store.c
ICU-8.11.1.tar.gz
[apple/icu.git] / icuSources / tools / genprops / store.c
CommitLineData
b75a7d8f
A
1/*
2*******************************************************************************
3*
73c04bcf 4* Copyright (C) 1999-2006, International Business Machines
b75a7d8f
A
5* Corporation and others. All Rights Reserved.
6*
7*******************************************************************************
8* file name: store.c
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 1999dec11
14* created by: Markus W. Scherer
15*
16* Store Unicode character properties efficiently for
17* random access.
18*/
19
20#include <stdio.h>
b75a7d8f
A
21#include "unicode/utypes.h"
22#include "unicode/uchar.h"
23#include "cmemory.h"
24#include "cstring.h"
b75a7d8f
A
25#include "utrie.h"
26#include "unicode/udata.h"
27#include "unewdata.h"
73c04bcf 28#include "writesrc.h"
b75a7d8f
A
29#include "uprops.h"
30#include "genprops.h"
31
32#define DO_DEBUG_OUT 0
33
34/* Unicode character properties file format ------------------------------------
35
36The file format prepared and written here contains several data
37structures that store indexes or data.
38
39Before the data contents described below, there are the headers required by
40the udata API for loading ICU data. Especially, a UDataInfo structure
41precedes the actual data. It contains platform properties values and the
42file format version.
43
73c04bcf
A
44The following is a description of format version 4 .
45
46The format changes between version 3 and 4 because the properties related to
47case mappings and bidi/shaping are pulled out into separate files
48for modularization.
49In order to reduce the need for code changes, some of the previous data
50structures are omitted, rather than rearranging everything.
51
52For details see "Changes in format version 4" below.
b75a7d8f
A
53
54Data contents:
55
56The contents is a parsed, binary form of several Unicode character
57database files, most prominently UnicodeData.txt.
58
59Any Unicode code point from 0 to 0x10ffff can be looked up to get
60the properties, if any, for that code point. This means that the input
61to the lookup are 21-bit unsigned integers, with not all of the
6221-bit range used.
63
64It is assumed that client code keeps a uint32_t pointer
65to the beginning of the data:
66
67 const uint32_t *p32;
68
69Formally, the file contains the following structures:
70
71 const int32_t indexes[16] with values i0..i15:
72
73c04bcf
A
73 i0 indicates the length of the main trie.
74 i0..i3 all have the same value in format version 4.0;
75 the related props32[] and exceptions[] and uchars[] were used in format version 3
76
b75a7d8f
A
77 i0 propsIndex; -- 32-bit unit index to the table of 32-bit properties words
78 i1 exceptionsIndex; -- 32-bit unit index to the table of 32-bit exception words
79 i2 exceptionsTopIndex; -- 32-bit unit index to the array of UChars for special mappings
80
81 i3 additionalTrieIndex; -- 32-bit unit index to the additional trie for more properties
82 i4 additionalVectorsIndex; -- 32-bit unit index to the table of properties vectors
83 i5 additionalVectorsColumns; -- number of 32-bit words per properties vector
84
85 i6 reservedItemIndex; -- 32-bit unit index to the top of the properties vectors table
86 i7..i9 reservedIndexes; -- reserved values; 0 for now
87
73c04bcf
A
88 i10 maxValues; -- maximum code values for vector word 0, see uprops.h (new in format version 3.1+)
89 i11 maxValues2; -- maximum code values for vector word 2, see uprops.h (new in format version 3.2)
b75a7d8f
A
90 i12..i15 reservedIndexes; -- reserved values; 0 for now
91
92 PT serialized properties trie, see utrie.h (byte size: 4*(i0-16))
93
73c04bcf
A
94 P, E, and U are not used (empty) in format version 4
95
b75a7d8f
A
96 P const uint32_t props32[i1-i0];
97 E const uint32_t exceptions[i2-i1];
98 U const UChar uchars[2*(i3-i2)];
99
100 AT serialized trie for additional properties (byte size: 4*(i4-i3))
101 PV const uint32_t propsVectors[(i6-i4)/i5][i5]==uint32_t propsVectors[i6-i4];
102
103Trie lookup and properties:
104
105In order to condense the data for the 21-bit code space, several properties of
106the Unicode code assignment are exploited:
107- The code space is sparse.
108- There are several 10k of consecutive codes with the same properties.
109- Characters and scripts are allocated in groups of 16 code points.
110- Inside blocks for scripts the properties are often repetitive.
111- The 21-bit space is not fully used for Unicode.
112
113The lookup of properties for a given code point is done with a trie lookup,
114using the UTrie implementation.
73c04bcf 115The trie lookup result is a 16-bit properties word.
b75a7d8f
A
116
117With a given Unicode code point
118
119 UChar32 c;
120
121and 0<=c<0x110000, the lookup is done like this:
122
73c04bcf
A
123 uint16_t props;
124 UTRIE_GET16(trie, c, props);
b75a7d8f 125
73c04bcf 126Each 16-bit properties word contains:
b75a7d8f
A
127
128 0.. 4 general category
73c04bcf
A
129 5.. 7 numeric type
130 non-digit numbers are stored with multiple types and pseudo-types
131 in order to facilitate compact encoding:
132 0 no numeric value (0)
133 1 decimal digit value (0..9)
134 2 digit value (0..9)
135 3 (U_NT_NUMERIC) normal non-digit numeric value 0..0xff
136 4 (internal type UPROPS_NT_FRACTION) fraction
137 5 (internal type UPROPS_NT_LARGE) large number >0xff
138 6..7 reserved
139
140 when returning the numeric type from a public API,
141 internal types must be turned into U_NT_NUMERIC
142
143 8..15 numeric value
144 encoding of fractions and large numbers see below
145
146Fractions:
147 // n is the 8-bit numeric value from bits 8..15 of the trie word (shifted down)
148 int32_t num, den;
149 num=n>>3; // num=0..31
150 den=(n&7)+2; // den=2..9
151 if(num==0) {
152 num=-1; // num=-1 or 1..31
153 }
154 double result=(double)num/(double)den;
155
156Large numbers:
157 // n is the 8-bit numeric value from bits 8..15 of the trie word (shifted down)
158 int32_t m, e;
159 m=n>>4; // m=0..15
160 e=(n&0xf);
161 if(m==0) {
162 m=1; // for large powers of 10
163 e+=18; // e=18..33
164 } else {
165 e+=2; // e=2..17
166 } // m==10..15 are reserved
167 double result=(double)m*10^e;
b75a7d8f
A
168
169--- Additional properties (new in format version 2.1) ---
170
171The second trie for additional properties (AT) is also a UTrie with 16-bit data.
172The data words consist of 32-bit unit indexes (not row indexes!) into the
173table of unique properties vectors (PV).
174Each vector contains a set of properties.
175The width of a vector (number of uint32_t per row) may change
176with the formatVersion, it is stored in i5.
177
178Current properties: see icu/source/common/uprops.h
179
180--- Changes in format version 3.1 ---
181
182See i10 maxValues above, contains only UBLOCK_COUNT and USCRIPT_CODE_LIMIT.
183
184--- Changes in format version 3.2 ---
185
186- The tries use linear Latin-1 ranges.
187- The additional properties bits store full properties XYZ instead
188 of partial Other_XYZ, so that changes in the derivation formulas
189 need not be tracked in runtime library code.
190- Joining Type and Line Break are also stored completely, so that uprops.c
191 needs no runtime formulas for enumerated properties either.
192- Store the case-sensitive flag in the main properties word.
193- i10 also contains U_LB_COUNT and U_EA_COUNT.
194- i11 contains maxValues2 for vector word 2.
195
73c04bcf
A
196--- Changes in format version 4 ---
197
198The format changes between version 3 and 4 because the properties related to
199case mappings and bidi/shaping are pulled out into separate files
200for modularization.
201In order to reduce the need for code changes, some of the previous data
202structures are omitted, rather than rearranging everything.
203
204(The change to format version 4 is for ICU 3.4. The last CVS revision of
205genprops/store.c for format version 3.2 is 1.48.)
206
207The main trie's data is significantly simplified:
208- The trie's 16-bit data word is used directly instead of as an index
209 into props32[].
210- The trie uses the default trie folding functions instead of custom ones.
211- Numeric values are stored directly in the trie data word, with special
212 encodings.
213- No more exception data (the data that needed it was pulled out, or, in the
214 case of numeric values, encoded differently).
215- No more string data (pulled out - was for case mappings).
216
217Also, some of the previously used properties vector bits are reserved again.
218
219The indexes[] values for the omitted structures are still filled in
220(indicating zero-length arrays) so that the swapper code remains unchanged.
221
b75a7d8f
A
222----------------------------------------------------------------------------- */
223
224/* UDataInfo cf. udata.h */
225static UDataInfo dataInfo={
226 sizeof(UDataInfo),
227 0,
228
229 U_IS_BIG_ENDIAN,
230 U_CHARSET_FAMILY,
231 U_SIZEOF_UCHAR,
232 0,
233
234 { 0x55, 0x50, 0x72, 0x6f }, /* dataFormat="UPro" */
73c04bcf 235 { 4, 0, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */
374ca955 236 { 4, 0, 1, 0 } /* dataVersion */
b75a7d8f
A
237};
238
b75a7d8f
A
239static UNewTrie *pTrie=NULL;
240
b75a7d8f
A
241/* -------------------------------------------------------------------------- */
242
243extern void
244setUnicodeVersion(const char *v) {
245 UVersionInfo version;
246 u_versionFromString(version, v);
247 uprv_memcpy(dataInfo.dataVersion, version, 4);
248}
249
250extern void
251initStore() {
73c04bcf 252 pTrie=utrie_open(NULL, NULL, 40000, 0, 0, TRUE);
b75a7d8f
A
253 if(pTrie==NULL) {
254 fprintf(stderr, "error: unable to create a UNewTrie\n");
255 exit(U_MEMORY_ALLOCATION_ERROR);
256 }
257
b75a7d8f
A
258 initAdditionalProperties();
259}
260
73c04bcf
A
261extern void
262exitStore() {
263 utrie_close(pTrie);
264 exitAdditionalProperties();
265}
266
267static uint32_t printNumericTypeValueError(Props *p) {
268 fprintf(stderr, "genprops error: unable to encode numeric type & value %d %ld/%lu E%d\n",
269 (int)p->numericType, (long)p->numericValue, (unsigned long)p->denominator, p->exponent);
270 exit(U_ILLEGAL_ARGUMENT_ERROR);
271 return 0;
272}
273
b75a7d8f
A
274/* store a character's properties ------------------------------------------- */
275
276extern uint32_t
277makeProps(Props *p) {
73c04bcf
A
278 uint32_t den;
279 int32_t type, value, exp;
280
281 /* encode numeric type & value */
282 type=p->numericType;
283 value=p->numericValue;
284 den=p->denominator;
285 exp=p->exponent;
286
287 if(den!=0) {
288 /* fraction */
289 if( type!=U_NT_NUMERIC ||
290 value<-1 || value==0 || value>UPROPS_FRACTION_MAX_NUM ||
291 den<UPROPS_FRACTION_MIN_DEN || UPROPS_FRACTION_MAX_DEN<den ||
292 exp!=0
293 ) {
294 return printNumericTypeValueError(p);
b75a7d8f 295 }
73c04bcf
A
296 type=UPROPS_NT_FRACTION;
297
298 if(value==-1) {
299 value=0;
b75a7d8f 300 }
73c04bcf
A
301 den-=UPROPS_FRACTION_DEN_OFFSET;
302 value=(value<<UPROPS_FRACTION_NUM_SHIFT)|den;
303 } else if(exp!=0) {
304 /* very large value */
305 if( type!=U_NT_NUMERIC ||
306 value<1 || 9<value ||
307 exp<UPROPS_LARGE_MIN_EXP || UPROPS_LARGE_MAX_EXP_EXTRA<exp
308 ) {
309 return printNumericTypeValueError(p);
b75a7d8f 310 }
73c04bcf 311 type=UPROPS_NT_LARGE;
b75a7d8f 312
73c04bcf
A
313 if(exp<=UPROPS_LARGE_MAX_EXP) {
314 /* 1..9 * 10^(2..17) */
315 exp-=UPROPS_LARGE_EXP_OFFSET;
316 } else {
317 /* 1 * 10^(18..33) */
318 if(value!=1) {
319 return printNumericTypeValueError(p);
b75a7d8f 320 }
73c04bcf
A
321 value=0;
322 exp-=UPROPS_LARGE_EXP_OFFSET_EXTRA;
b75a7d8f 323 }
73c04bcf
A
324 value=(value<<UPROPS_LARGE_MANT_SHIFT)|exp;
325 } else if(value>UPROPS_MAX_SMALL_NUMBER) {
326 /* large value */
327 if(type!=U_NT_NUMERIC) {
328 return printNumericTypeValueError(p);
329 }
330 type=UPROPS_NT_LARGE;
b75a7d8f 331
73c04bcf
A
332 /* split the value into mantissa and exponent, base 10 */
333 while((value%10)==0) {
334 value/=10;
335 ++exp;
336 }
337 if(value>9) {
338 return printNumericTypeValueError(p);
339 }
b75a7d8f 340
73c04bcf
A
341 exp-=UPROPS_LARGE_EXP_OFFSET;
342 value=(value<<UPROPS_LARGE_MANT_SHIFT)|exp;
343 } else if(value<0) {
344 /* unable to encode negative values, other than fractions -1/x */
345 return printNumericTypeValueError(p);
b75a7d8f 346
73c04bcf 347 /* } else normal value=0..0xff { */
b75a7d8f
A
348 }
349
73c04bcf
A
350 /* encode the properties */
351 return
b75a7d8f 352 (uint32_t)p->generalCategory |
73c04bcf
A
353 ((uint32_t)type<<UPROPS_NUMERIC_TYPE_SHIFT) |
354 ((uint32_t)value<<UPROPS_NUMERIC_VALUE_SHIFT);
b75a7d8f
A
355}
356
357extern void
358addProps(uint32_t c, uint32_t x) {
359 if(!utrie_set32(pTrie, (UChar32)c, x)) {
360 fprintf(stderr, "error: too many entries for the properties trie\n");
361 exit(U_BUFFER_OVERFLOW_ERROR);
362 }
363}
364
b75a7d8f
A
365extern uint32_t
366getProps(uint32_t c) {
367 return utrie_get32(pTrie, (UChar32)c, NULL);
368}
369
370/* areas of same properties ------------------------------------------------- */
371
372extern void
373repeatProps(uint32_t first, uint32_t last, uint32_t x) {
374 if(!utrie_setRange32(pTrie, (UChar32)first, (UChar32)(last+1), x, FALSE)) {
375 fprintf(stderr, "error: too many entries for the properties trie\n");
376 exit(U_BUFFER_OVERFLOW_ERROR);
377 }
378}
379
b75a7d8f
A
380/* generate output data ----------------------------------------------------- */
381
b75a7d8f 382extern void
73c04bcf 383generateData(const char *dataDir, UBool csource) {
b75a7d8f
A
384 static int32_t indexes[UPROPS_INDEX_COUNT]={
385 0, 0, 0, 0,
386 0, 0, 0, 0,
387 0, 0, 0, 0,
388 0, 0, 0, 0
389 };
390 static uint8_t trieBlock[40000];
391 static uint8_t additionalProps[120000];
392
393 UNewDataMemory *pData;
394 UErrorCode errorCode=U_ZERO_ERROR;
73c04bcf 395 uint32_t size = 0;
b75a7d8f
A
396 int32_t trieSize, additionalPropsSize, offset;
397 long dataLength;
398
73c04bcf 399 trieSize=utrie_serialize(pTrie, trieBlock, sizeof(trieBlock), NULL, TRUE, &errorCode);
b75a7d8f
A
400 if(U_FAILURE(errorCode)) {
401 fprintf(stderr, "error: utrie_serialize failed: %s (length %ld)\n", u_errorName(errorCode), (long)trieSize);
402 exit(errorCode);
403 }
404
405 offset=sizeof(indexes)/4; /* uint32_t offset to the properties trie */
406
73c04bcf 407 /* round up trie size to 4-alignment */
b75a7d8f
A
408 trieSize=(trieSize+3)&~3;
409 offset+=trieSize>>2;
73c04bcf
A
410 indexes[UPROPS_PROPS32_INDEX]= /* set indexes to the same offsets for empty */
411 indexes[UPROPS_EXCEPTIONS_INDEX]= /* structures from the old format version 3 */
412 indexes[UPROPS_EXCEPTIONS_TOP_INDEX]= /* so that less runtime code has to be changed */
b75a7d8f
A
413 indexes[UPROPS_ADDITIONAL_TRIE_INDEX]=offset;
414
415 if(beVerbose) {
374ca955 416 printf("trie size in bytes: %5u\n", (int)trieSize);
b75a7d8f
A
417 }
418
73c04bcf
A
419 if(csource) {
420 /* write .c file for hardcoded data */
421 UTrie trie={ NULL };
422 FILE *f;
423
424 utrie_unserialize(&trie, trieBlock, trieSize, &errorCode);
425 if(U_FAILURE(errorCode)) {
426 fprintf(
427 stderr,
428 "genprops error: failed to utrie_unserialize(uprops.icu main trie) - %s\n",
429 u_errorName(errorCode));
430 return;
431 }
b75a7d8f 432
73c04bcf
A
433 f=usrc_create(dataDir, "uchar_props_data.c");
434 if(f!=NULL) {
435 usrc_writeArray(f,
436 "static const UVersionInfo formatVersion={",
437 dataInfo.formatVersion, 8, 4,
438 "};\n\n");
439 usrc_writeArray(f,
440 "static const UVersionInfo dataVersion={",
441 dataInfo.dataVersion, 8, 4,
442 "};\n\n");
443 usrc_writeUTrieArrays(f,
444 "static const uint16_t propsTrie_index[%ld]={\n", NULL,
445 &trie,
446 "\n};\n\n");
447 usrc_writeUTrieStruct(f,
448 "static const UTrie propsTrie={\n",
449 &trie, "propsTrie_index", NULL, NULL,
450 "};\n\n");
451
452 additionalPropsSize=writeAdditionalData(f, additionalProps, sizeof(additionalProps), indexes);
453 size=4*offset+additionalPropsSize; /* total size of data */
454
455 usrc_writeArray(f,
456 "static const int32_t indexes[UPROPS_INDEX_COUNT]={",
457 indexes, 32, UPROPS_INDEX_COUNT,
458 "};\n\n");
459 fclose(f);
460 }
461 } else {
462 /* write the data */
463 pData=udata_create(dataDir, DATA_TYPE, DATA_NAME, &dataInfo,
464 haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
465 if(U_FAILURE(errorCode)) {
466 fprintf(stderr, "genprops: unable to create data memory, %s\n", u_errorName(errorCode));
467 exit(errorCode);
468 }
b75a7d8f 469
73c04bcf
A
470 additionalPropsSize=writeAdditionalData(NULL, additionalProps, sizeof(additionalProps), indexes);
471 size=4*offset+additionalPropsSize; /* total size of data */
b75a7d8f 472
73c04bcf
A
473 udata_writeBlock(pData, indexes, sizeof(indexes));
474 udata_writeBlock(pData, trieBlock, trieSize);
475 udata_writeBlock(pData, additionalProps, additionalPropsSize);
b75a7d8f 476
73c04bcf
A
477 /* finish up */
478 dataLength=udata_finish(pData, &errorCode);
479 if(U_FAILURE(errorCode)) {
480 fprintf(stderr, "genprops: error %d writing the output file\n", errorCode);
481 exit(errorCode);
482 }
b75a7d8f 483
73c04bcf
A
484 if(dataLength!=(long)size) {
485 fprintf(stderr, "genprops: data length %ld != calculated size %lu\n",
486 dataLength, (unsigned long)size);
487 exit(U_INTERNAL_PROGRAM_ERROR);
488 }
b75a7d8f
A
489 }
490
73c04bcf
A
491 if(beVerbose) {
492 printf("data size: %6lu\n", (unsigned long)size);
b75a7d8f 493 }
b75a7d8f
A
494}
495
496/*
497 * Hey, Emacs, please set the following:
498 *
499 * Local Variables:
500 * indent-tabs-mode: nil
501 * End:
502 *
503 */