]>
Commit | Line | Data |
---|---|---|
73c04bcf A |
1 | /* |
2 | ******************************************************************************* | |
3 | * | |
4 | * Copyright (C) 2004-2005, International Business Machines | |
5 | * Corporation and others. All Rights Reserved. | |
6 | * | |
7 | ******************************************************************************* | |
8 | * file name: store.c | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 2004dec30 | |
14 | * created by: Markus W. Scherer | |
15 | * | |
16 | * Store Unicode bidi/shaping properties efficiently for | |
17 | * random access. | |
18 | */ | |
19 | ||
20 | #include <stdio.h> | |
21 | #include <stdlib.h> | |
22 | #include "unicode/utypes.h" | |
23 | #include "unicode/uchar.h" | |
24 | #include "cmemory.h" | |
25 | #include "cstring.h" | |
26 | #include "utrie.h" | |
27 | #include "uarrsort.h" | |
28 | #include "unicode/udata.h" | |
29 | #include "unewdata.h" | |
30 | #include "propsvec.h" | |
31 | #include "writesrc.h" | |
32 | #include "ubidi_props.h" | |
33 | #include "genbidi.h" | |
34 | ||
35 | #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0])) | |
36 | ||
37 | /* Unicode bidi/shaping properties file format --------------------------------- | |
38 | ||
39 | The file format prepared and written here contains several data | |
40 | structures that store indexes or data. | |
41 | ||
42 | Before the data contents described below, there are the headers required by | |
43 | the udata API for loading ICU data. Especially, a UDataInfo structure | |
44 | precedes the actual data. It contains platform properties values and the | |
45 | file format version. | |
46 | ||
47 | The following is a description of format version 1.0 . | |
48 | ||
49 | The file contains the following structures: | |
50 | ||
51 | const int32_t indexes[i0] with values i0, i1, ...: | |
52 | (see UBIDI_IX_... constants for names of indexes) | |
53 | ||
54 | i0 indexLength; -- length of indexes[] (UBIDI_IX_TOP) | |
55 | i1 dataLength; -- length in bytes of the post-header data (incl. indexes[]) | |
56 | i2 trieSize; -- size in bytes of the bidi/shaping properties trie | |
57 | i3 mirrorLength; -- length in uint32_t of the bidi mirroring array | |
58 | ||
59 | i4 jgStart; -- first code point with Joining_Group data | |
60 | i5 jgLimit; -- limit code point for Joining_Group data | |
61 | ||
62 | i6..i14 reservedIndexes; -- reserved values; 0 for now | |
63 | ||
64 | i15 maxValues; -- maximum code values for enumerated properties | |
65 | bits 23..16 contain the max value for Joining_Group, | |
66 | otherwise the bits are used like enum fields in the trie word | |
67 | ||
68 | Serialized trie, see utrie.h; | |
69 | ||
70 | const uint32_t mirrors[mirrorLength]; | |
71 | ||
72 | const uint8_t jgArray[i5-i4]; -- (i5-i4) is always a multiple of 4 | |
73 | ||
74 | Trie data word: | |
75 | Bits | |
76 | 15..13 signed delta to bidi mirroring code point | |
77 | (add delta to input code point) | |
78 | 0 no such code point (source maps to itself) | |
79 | -3..-1, 1..3 delta | |
80 | -4 look in mirrors table | |
81 | 12 is mirrored | |
82 | 11 Bidi_Control | |
83 | 10 Join_Control | |
84 | 9.. 8 reserved (set to 0) | |
85 | 7.. 5 Joining_Type | |
86 | 4.. 0 BiDi category | |
87 | ||
88 | ||
89 | Mirrors: | |
90 | Stores some of the bidi mirroring data, where each code point maps to | |
91 | at most one other. | |
92 | Most code points do not have a mirroring code point; most that do have a signed | |
93 | delta stored in the trie data value. Only those where the delta does not fit | |
94 | into the trie data are stored in this table. | |
95 | ||
96 | Logically, this is a two-column table with source and mirror code points. | |
97 | ||
98 | Physically, the table is compressed by taking advantage of the fact that each | |
99 | mirror code point is also a source code point | |
100 | (each of them is a mirror of the other). | |
101 | Therefore, both logical columns contain the same set of code points, which needs | |
102 | to be stored only once. | |
103 | ||
104 | The table stores source code points, and also for each the index of its mirror | |
105 | code point in the same table, in a simple array of uint32_t. | |
106 | Bits | |
107 | 31..21 index to mirror code point (unsigned) | |
108 | 20.. 0 source code point | |
109 | ||
110 | The table is sorted by source code points. | |
111 | ||
112 | ||
113 | Joining_Group array: | |
114 | The Joining_Group values do not fit into the 16-bit trie, but the data is also | |
115 | limited to a small range of code points (Arabic and Syriac) and not | |
116 | well compressible. | |
117 | ||
118 | The start and limit code points for the range are stored in the indexes[] | |
119 | array, and the jgArray[] stores a byte for each of these code points, | |
120 | containing the Joining_Group value. | |
121 | ||
122 | All code points outside of this range have No_Joining_Group (0). | |
123 | ||
124 | ----------------------------------------------------------------------------- */ | |
125 | ||
126 | /* UDataInfo cf. udata.h */ | |
127 | static UDataInfo dataInfo={ | |
128 | sizeof(UDataInfo), | |
129 | 0, | |
130 | ||
131 | U_IS_BIG_ENDIAN, | |
132 | U_CHARSET_FAMILY, | |
133 | U_SIZEOF_UCHAR, | |
134 | 0, | |
135 | ||
136 | /* dataFormat="BiDi" */ | |
137 | { UBIDI_FMT_0, UBIDI_FMT_1, UBIDI_FMT_2, UBIDI_FMT_3 }, | |
138 | { 1, 0, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */ | |
139 | { 4, 0, 1, 0 } /* dataVersion */ | |
140 | }; | |
141 | ||
142 | /* exceptions values */ | |
143 | static uint32_t mirrors[UBIDI_MAX_MIRROR_INDEX+1][2]; | |
144 | static uint16_t mirrorTop=0; | |
145 | ||
146 | /* -------------------------------------------------------------------------- */ | |
147 | ||
148 | extern void | |
149 | setUnicodeVersion(const char *v) { | |
150 | UVersionInfo version; | |
151 | u_versionFromString(version, v); | |
152 | uprv_memcpy(dataInfo.dataVersion, version, 4); | |
153 | } | |
154 | ||
155 | /* bidi mirroring table ----------------------------------------------------- */ | |
156 | ||
157 | extern void | |
158 | addMirror(UChar32 src, UChar32 mirror) { | |
159 | UErrorCode errorCode; | |
160 | int32_t delta; | |
161 | ||
162 | delta=mirror-src; | |
163 | if(delta==0) { | |
164 | return; /* mapping to self=no mapping */ | |
165 | } | |
166 | ||
167 | if(delta<UBIDI_MIN_MIRROR_DELTA || UBIDI_MAX_MIRROR_DELTA<delta) { | |
168 | /* delta does not fit into the trie properties value, store in the mirrors[] table */ | |
169 | if(mirrorTop==LENGTHOF(mirrors)) { | |
170 | fprintf(stderr, "genbidi error: too many long-distance mirroring mappings\n"); | |
171 | exit(U_BUFFER_OVERFLOW_ERROR); | |
172 | } | |
173 | ||
174 | /* possible: search the table so far and see if src is already listed */ | |
175 | ||
176 | mirrors[mirrorTop][0]=(uint32_t)src; | |
177 | mirrors[mirrorTop][1]=(uint32_t)mirror; | |
178 | ++mirrorTop; | |
179 | ||
180 | /* set an escape marker in src's properties */ | |
181 | delta=UBIDI_ESC_MIRROR_DELTA; | |
182 | } | |
183 | ||
184 | errorCode=U_ZERO_ERROR; | |
185 | if( | |
186 | !upvec_setValue( | |
187 | pv, src, src+1, 0, | |
188 | (uint32_t)delta<<UBIDI_MIRROR_DELTA_SHIFT, (uint32_t)(-1)<<UBIDI_MIRROR_DELTA_SHIFT, | |
189 | &errorCode) | |
190 | ) { | |
191 | fprintf(stderr, "genbidi error: unable to set mirroring delta, code: %s\n", | |
192 | u_errorName(errorCode)); | |
193 | exit(errorCode); | |
194 | } | |
195 | } | |
196 | ||
197 | static int32_t U_CALLCONV | |
198 | compareMirror(const void *context, const void *left, const void *right) { | |
199 | UChar32 l, r; | |
200 | ||
201 | l=UBIDI_GET_MIRROR_CODE_POINT(((const uint32_t *)left)[0]); | |
202 | r=UBIDI_GET_MIRROR_CODE_POINT(((const uint32_t *)right)[0]); | |
203 | return l-r; | |
204 | } | |
205 | ||
206 | static void | |
207 | makeMirror() { | |
208 | uint32_t *reducedMirror; | |
209 | UErrorCode errorCode; | |
210 | int32_t i, j, start, limit, step; | |
211 | uint32_t c; | |
212 | ||
213 | /* sort the mirroring table by source code points */ | |
214 | errorCode=U_ZERO_ERROR; | |
215 | uprv_sortArray(mirrors, mirrorTop, 8, | |
216 | compareMirror, NULL, FALSE, &errorCode); | |
217 | ||
218 | /* | |
219 | * reduce the 2-column table to a single column | |
220 | * by putting the index to the mirror entry into the source entry | |
221 | * | |
222 | * first: | |
223 | * find each mirror code point in the source column and set each other's indexes | |
224 | * | |
225 | * second: | |
226 | * reduce the table, combine the source code points with their indexes | |
227 | * and store as a simple array of uint32_t | |
228 | */ | |
229 | for(i=0; i<mirrorTop; ++i) { | |
230 | c=mirrors[i][1]; /* mirror code point */ | |
231 | if(c>0x1fffff) { | |
232 | continue; /* this entry already has an index */ | |
233 | } | |
234 | ||
235 | /* search for the mirror code point in the source column */ | |
236 | if(c<mirrors[i][0]) { | |
237 | /* search before i */ | |
238 | start=i-1; | |
239 | limit=-1; | |
240 | step=-1; | |
241 | } else { | |
242 | start=i+1; | |
243 | limit=mirrorTop; | |
244 | step=1; | |
245 | } | |
246 | ||
247 | for(j=start;; j+=step) { | |
248 | if(j==limit) { | |
249 | fprintf(stderr, | |
250 | "genbidi error: bidi mirror does not roundtrip - %04lx->%04lx->?\n", | |
251 | (long)mirrors[i][0], (long)mirrors[i][1]); | |
252 | errorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
253 | } | |
254 | if(c==mirrors[j][0]) { | |
255 | /* | |
256 | * found the mirror code point c in the source column, | |
257 | * set both entries' indexes to each other | |
258 | */ | |
259 | if(UBIDI_GET_MIRROR_CODE_POINT(mirrors[i][0])!=UBIDI_GET_MIRROR_CODE_POINT(mirrors[j][1])) { | |
260 | /* roundtrip check fails */ | |
261 | fprintf(stderr, | |
262 | "genbidi error: bidi mirrors do not roundtrip - %04lx->%04lx->%04lx\n", | |
263 | (long)mirrors[i][0], (long)mirrors[i][1], (long)mirrors[j][1]); | |
264 | errorCode=U_ILLEGAL_ARGUMENT_ERROR; | |
265 | } else { | |
266 | mirrors[i][1]|=(uint32_t)j<<UBIDI_MIRROR_INDEX_SHIFT; | |
267 | mirrors[j][1]|=(uint32_t)i<<UBIDI_MIRROR_INDEX_SHIFT; | |
268 | } | |
269 | break; | |
270 | } | |
271 | } | |
272 | } | |
273 | ||
274 | /* now the second step, the actual reduction of the table */ | |
275 | reducedMirror=mirrors[0]; | |
276 | for(i=0; i<mirrorTop; ++i) { | |
277 | reducedMirror[i]=mirrors[i][0]|(mirrors[i][1]&~0x1fffff); | |
278 | } | |
279 | ||
280 | if(U_FAILURE(errorCode)) { | |
281 | exit(errorCode); | |
282 | } | |
283 | } | |
284 | ||
285 | /* generate output data ----------------------------------------------------- */ | |
286 | ||
287 | extern void | |
288 | generateData(const char *dataDir, UBool csource) { | |
289 | static int32_t indexes[UBIDI_IX_TOP]={ | |
290 | UBIDI_IX_TOP | |
291 | }; | |
292 | static uint8_t trieBlock[40000]; | |
293 | static uint8_t jgArray[0x300]; /* at most for U+0600..U+08FF */ | |
294 | ||
295 | const uint32_t *row; | |
296 | UChar32 start, limit, prev, jgStart; | |
297 | int32_t i; | |
298 | ||
299 | UNewDataMemory *pData; | |
300 | UNewTrie *pTrie; | |
301 | UErrorCode errorCode=U_ZERO_ERROR; | |
302 | int32_t trieSize; | |
303 | long dataLength; | |
304 | ||
305 | makeMirror(); | |
306 | ||
307 | pTrie=utrie_open(NULL, NULL, 20000, 0, 0, TRUE); | |
308 | if(pTrie==NULL) { | |
309 | fprintf(stderr, "genbidi error: unable to create a UNewTrie\n"); | |
310 | exit(U_MEMORY_ALLOCATION_ERROR); | |
311 | } | |
312 | ||
313 | prev=jgStart=0; | |
314 | for(i=0; (row=upvec_getRow(pv, i, &start, &limit))!=NULL; ++i) { | |
315 | /* store most values from vector column 0 in the trie */ | |
316 | if(!utrie_setRange32(pTrie, start, limit, *row, TRUE)) { | |
317 | fprintf(stderr, "genbidi error: unable to set trie value (overflow)\n"); | |
318 | exit(U_BUFFER_OVERFLOW_ERROR); | |
319 | } | |
320 | ||
321 | /* store Joining_Group values from vector column 1 in a simple byte array */ | |
322 | if(row[1]!=0) { | |
323 | if(start<0x600 || 0x900<=limit) { | |
324 | fprintf(stderr, "genbidi error: Joining_Group for out-of-range code points U+%04lx..U+%04lx\n", | |
325 | (long)start, (long)limit); | |
326 | exit(U_ILLEGAL_ARGUMENT_ERROR); | |
327 | } | |
328 | ||
329 | if(prev==0) { | |
330 | /* first code point with any value */ | |
331 | prev=jgStart=start; | |
332 | } else { | |
333 | /* add No_Joining_Group for code points between prev and start */ | |
334 | while(prev<start) { | |
335 | jgArray[prev++ -jgStart]=0; | |
336 | } | |
337 | } | |
338 | ||
339 | /* set Joining_Group value for start..limit */ | |
340 | while(prev<limit) { | |
341 | jgArray[prev++ -jgStart]=(uint8_t)row[1]; | |
342 | } | |
343 | } | |
344 | } | |
345 | ||
346 | /* finish jgArray, pad to multiple of 4 */ | |
347 | while((prev-jgStart)&3) { | |
348 | jgArray[prev++ -jgStart]=0; | |
349 | } | |
350 | indexes[UBIDI_IX_JG_START]=jgStart; | |
351 | indexes[UBIDI_IX_JG_LIMIT]=prev; | |
352 | ||
353 | trieSize=utrie_serialize(pTrie, trieBlock, sizeof(trieBlock), NULL, TRUE, &errorCode); | |
354 | if(U_FAILURE(errorCode)) { | |
355 | fprintf(stderr, "genbidi error: utrie_serialize failed: %s (length %ld)\n", u_errorName(errorCode), (long)trieSize); | |
356 | exit(errorCode); | |
357 | } | |
358 | ||
359 | indexes[UBIDI_IX_TRIE_SIZE]=trieSize; | |
360 | indexes[UBIDI_IX_MIRROR_LENGTH]=mirrorTop; | |
361 | indexes[UBIDI_IX_LENGTH]= | |
362 | (int32_t)sizeof(indexes)+ | |
363 | trieSize+ | |
364 | 4*mirrorTop+ | |
365 | (prev-jgStart); | |
366 | ||
367 | if(beVerbose) { | |
368 | printf("trie size in bytes: %5d\n", (int)trieSize); | |
369 | printf("size in bytes of mirroring table: %5d\n", (int)(4*mirrorTop)); | |
370 | printf("length of Joining_Group array: %5d (U+%04x..U+%04x)\n", (int)(prev-jgStart), (int)jgStart, (int)(prev-1)); | |
371 | printf("data size: %5d\n", (int)indexes[UBIDI_IX_LENGTH]); | |
372 | } | |
373 | ||
374 | indexes[UBIDI_MAX_VALUES_INDEX]= | |
375 | ((int32_t)U_CHAR_DIRECTION_COUNT-1)| | |
376 | (((int32_t)U_JT_COUNT-1)<<UBIDI_JT_SHIFT)| | |
377 | (((int32_t)U_JG_COUNT-1)<<UBIDI_MAX_JG_SHIFT); | |
378 | ||
379 | if(csource) { | |
380 | /* write .c file for hardcoded data */ | |
381 | UTrie trie={ NULL }; | |
382 | FILE *f; | |
383 | ||
384 | utrie_unserialize(&trie, trieBlock, trieSize, &errorCode); | |
385 | if(U_FAILURE(errorCode)) { | |
386 | fprintf( | |
387 | stderr, | |
388 | "genbidi error: failed to utrie_unserialize(ubidi.icu trie) - %s\n", | |
389 | u_errorName(errorCode)); | |
390 | return; | |
391 | } | |
392 | ||
393 | f=usrc_create(dataDir, "ubidi_props_data.c"); | |
394 | if(f!=NULL) { | |
395 | usrc_writeArray(f, | |
396 | "static const UVersionInfo ubidi_props_dataVersion={", | |
397 | dataInfo.dataVersion, 8, 4, | |
398 | "};\n\n"); | |
399 | usrc_writeArray(f, | |
400 | "static const int32_t ubidi_props_indexes[UBIDI_IX_TOP]={", | |
401 | indexes, 32, UBIDI_IX_TOP, | |
402 | "};\n\n"); | |
403 | usrc_writeUTrieArrays(f, | |
404 | "static const uint16_t ubidi_props_trieIndex[%ld]={\n", NULL, | |
405 | &trie, | |
406 | "\n};\n\n"); | |
407 | usrc_writeArray(f, | |
408 | "static const uint32_t ubidi_props_mirrors[%ld]={\n", | |
409 | mirrors, 32, mirrorTop, | |
410 | "\n};\n\n"); | |
411 | usrc_writeArray(f, | |
412 | "static const uint8_t ubidi_props_jgArray[%ld]={\n", | |
413 | jgArray, 8, prev-jgStart, | |
414 | "\n};\n\n"); | |
415 | fputs( | |
416 | "static const UBiDiProps ubidi_props_singleton={\n" | |
417 | " NULL,\n" | |
418 | " ubidi_props_indexes,\n" | |
419 | " ubidi_props_mirrors,\n" | |
420 | " ubidi_props_jgArray,\n", | |
421 | f); | |
422 | usrc_writeUTrieStruct(f, | |
423 | " {\n", | |
424 | &trie, "ubidi_props_trieIndex", NULL, NULL, | |
425 | " },\n"); | |
426 | usrc_writeArray(f, " { ", dataInfo.formatVersion, 8, 4, " }\n"); | |
427 | fputs("};\n", f); | |
428 | fclose(f); | |
429 | } | |
430 | } else { | |
431 | /* write the data */ | |
432 | pData=udata_create(dataDir, UBIDI_DATA_TYPE, UBIDI_DATA_NAME, &dataInfo, | |
433 | haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode); | |
434 | if(U_FAILURE(errorCode)) { | |
435 | fprintf(stderr, "genbidi: unable to create data memory, %s\n", u_errorName(errorCode)); | |
436 | exit(errorCode); | |
437 | } | |
438 | ||
439 | udata_writeBlock(pData, indexes, sizeof(indexes)); | |
440 | udata_writeBlock(pData, trieBlock, trieSize); | |
441 | udata_writeBlock(pData, mirrors, 4*mirrorTop); | |
442 | udata_writeBlock(pData, jgArray, prev-jgStart); | |
443 | ||
444 | /* finish up */ | |
445 | dataLength=udata_finish(pData, &errorCode); | |
446 | if(U_FAILURE(errorCode)) { | |
447 | fprintf(stderr, "genbidi: error %d writing the output file\n", errorCode); | |
448 | exit(errorCode); | |
449 | } | |
450 | ||
451 | if(dataLength!=indexes[UBIDI_IX_LENGTH]) { | |
452 | fprintf(stderr, "genbidi: data length %ld != calculated size %d\n", | |
453 | dataLength, (int)indexes[UBIDI_IX_LENGTH]); | |
454 | exit(U_INTERNAL_PROGRAM_ERROR); | |
455 | } | |
456 | } | |
457 | ||
458 | utrie_close(pTrie); | |
459 | upvec_close(pv); | |
460 | } | |
461 | ||
462 | /* | |
463 | * Hey, Emacs, please set the following: | |
464 | * | |
465 | * Local Variables: | |
466 | * indent-tabs-mode: nil | |
467 | * End: | |
468 | * | |
469 | */ |