]>
Commit | Line | Data |
---|---|---|
1 | /* | |
2 | ******************************************************************************* | |
3 | * | |
4 | * Copyright (C) 1999-2004, International Business Machines | |
5 | * Corporation and others. All Rights Reserved. | |
6 | * | |
7 | ******************************************************************************* | |
8 | * file name: store.c | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 2003-02-06 | |
14 | * created by: Ram Viswanadha | |
15 | * | |
16 | */ | |
17 | ||
18 | #include <stdio.h> | |
19 | #include <stdlib.h> | |
20 | #include "unicode/utypes.h" | |
21 | #include "cmemory.h" | |
22 | #include "cstring.h" | |
23 | #include "filestrm.h" | |
24 | #include "unicode/udata.h" | |
25 | #include "utrie.h" | |
26 | #include "unewdata.h" | |
27 | #include "gensprep.h" | |
28 | #include "uhash.h" | |
29 | ||
30 | ||
31 | #ifdef WIN32 | |
32 | # pragma warning(disable: 4100) | |
33 | #endif | |
34 | ||
35 | #define DO_DEBUG_OUT 0 | |
36 | ||
37 | ||
38 | /* | |
39 | * StringPrep profile file format ------------------------------------ | |
40 | * | |
41 | * The file format prepared and written here contains a 16-bit trie and a mapping table. | |
42 | * | |
43 | * Before the data contents described below, there are the headers required by | |
44 | * the udata API for loading ICU data. Especially, a UDataInfo structure | |
45 | * precedes the actual data. It contains platform properties values and the | |
46 | * file format version. | |
47 | * | |
48 | * The following is a description of format version 2. | |
49 | * | |
50 | * Data contents: | |
51 | * | |
52 | * The contents is a parsed, binary form of RFC3454 and possibly | |
53 | * NormalizationCorrections.txt depending on the options specified on the profile. | |
54 | * | |
55 | * Any Unicode code point from 0 to 0x10ffff can be looked up to get | |
56 | * the trie-word, if any, for that code point. This means that the input | |
57 | * to the lookup are 21-bit unsigned integers, with not all of the | |
58 | * 21-bit range used. | |
59 | * | |
60 | * *.spp files customarily begin with a UDataInfo structure, see udata.h and .c. | |
61 | * After that there are the following structures: | |
62 | * | |
63 | * int32_t indexes[_SPREP_INDEX_TOP]; -- _SPREP_INDEX_TOP=16, see enum in sprpimpl.h file | |
64 | * | |
65 | * UTrie stringPrepTrie; -- size in bytes=indexes[_SPREP_INDEX_TRIE_SIZE] | |
66 | * | |
67 | * uint16_t mappingTable[]; -- Contains the sequence of code units that the code point maps to | |
68 | * size in bytes = indexes[_SPREP_INDEX_MAPPING_DATA_SIZE] | |
69 | * | |
70 | * The indexes array contains the following values: | |
71 | * indexes[_SPREP_INDEX_TRIE_SIZE] -- The size of the StringPrep trie in bytes | |
72 | * indexes[_SPREP_INDEX_MAPPING_DATA_SIZE] -- The size of the mappingTable in bytes | |
73 | * indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION] -- The index of Unicode version of last entry in NormalizationCorrections.txt | |
74 | * indexes[_SPREP_ONE_UCHAR_MAPPING_INDEX_START] -- The starting index of 1 UChar mapping index in the mapping table | |
75 | * indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START] -- The starting index of 2 UChars mapping index in the mapping table | |
76 | * indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START] -- The starting index of 3 UChars mapping index in the mapping table | |
77 | * indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START] -- The starting index of 4 UChars mapping index in the mapping table | |
78 | * indexes[_SPREP_OPTIONS] -- Bit set of options to turn on in the profile, e.g: USPREP_NORMALIZATION_ON, USPREP_CHECK_BIDI_ON | |
79 | * | |
80 | * | |
81 | * StringPrep Trie : | |
82 | * | |
83 | * The StringPrep trie is a 16-bit trie that contains data for the profile. | |
84 | * Each code point is associated with a value (trie-word) in the trie. | |
85 | * | |
86 | * - structure of data words from the trie | |
87 | * | |
88 | * i) A value greater than or equal to _SPREP_TYPE_THRESHOLD (0xFFF0) | |
89 | * represents the type associated with the code point | |
90 | * if(trieWord >= _SPREP_TYPE_THRESHOLD){ | |
91 | * type = trieWord - 0xFFF0; | |
92 | * } | |
93 | * The type can be : | |
94 | * USPREP_UNASSIGNED | |
95 | * USPREP_PROHIBITED | |
96 | * USPREP_DELETE | |
97 | * | |
98 | * ii) A value less than _SPREP_TYPE_THRESHOLD means the type is USPREP_MAP and | |
99 | * contains distribution described below | |
100 | * | |
101 | * 0 - ON : The code point is prohibited (USPREP_PROHIBITED). This is to allow for code points that are both prohibited and mapped. | |
102 | * 1 - ON : The value in the next 14 bits is an index into the mapping table | |
103 | * OFF: The value in the next 14 bits is a delta value from the code point | |
104 | * 2..15 - Contains data as described by bit 1. If all bits are set | |
105 | * (value = _SPREP_MAX_INDEX_VALUE) then the type is USPREP_DELETE | |
106 | * | |
107 | * | |
108 | * Mapping Table: | |
109 | * The data in mapping table is sorted according to the length of the mapping sequence. | |
110 | * If the type of the code point is USPREP_MAP and value in trie word is an index, the index | |
111 | * is compared with start indexes of sequence length start to figure out the length according to | |
112 | * the following algorithm: | |
113 | * | |
114 | * if( index >= indexes[_SPREP_ONE_UCHAR_MAPPING_INDEX_START] && | |
115 | * index < indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START]){ | |
116 | * length = 1; | |
117 | * }else if(index >= indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START] && | |
118 | * index < indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START]){ | |
119 | * length = 2; | |
120 | * }else if(index >= indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START] && | |
121 | * index < indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START]){ | |
122 | * length = 3; | |
123 | * }else{ | |
124 | * // The first position in the mapping table contains the length | |
125 | * // of the sequence | |
126 | * length = mappingTable[index++]; | |
127 | * | |
128 | * } | |
129 | * | |
130 | */ | |
131 | ||
132 | /* file data ---------------------------------------------------------------- */ | |
133 | /* indexes[] value names */ | |
134 | ||
135 | #if UCONFIG_NO_IDNA | |
136 | ||
137 | /* dummy UDataInfo cf. udata.h */ | |
138 | static UDataInfo dataInfo = { | |
139 | sizeof(UDataInfo), | |
140 | 0, | |
141 | ||
142 | U_IS_BIG_ENDIAN, | |
143 | U_CHARSET_FAMILY, | |
144 | U_SIZEOF_UCHAR, | |
145 | 0, | |
146 | ||
147 | { 0, 0, 0, 0 }, /* dummy dataFormat */ | |
148 | { 0, 0, 0, 0 }, /* dummy formatVersion */ | |
149 | { 0, 0, 0, 0 } /* dummy dataVersion */ | |
150 | }; | |
151 | ||
152 | #else | |
153 | ||
154 | static int32_t indexes[_SPREP_INDEX_TOP]={ 0 }; | |
155 | ||
156 | static uint16_t* mappingData= NULL; | |
157 | static int32_t mappingDataCapacity = 0; /* we skip the first index in mapping data */ | |
158 | static int16_t currentIndex = 0; /* the current index into the data trie */ | |
159 | static int32_t maxLength = 0; /* maximum length of mapping string */ | |
160 | ||
161 | ||
162 | /* UDataInfo cf. udata.h */ | |
163 | static UDataInfo dataInfo={ | |
164 | sizeof(UDataInfo), | |
165 | 0, | |
166 | ||
167 | U_IS_BIG_ENDIAN, | |
168 | U_CHARSET_FAMILY, | |
169 | U_SIZEOF_UCHAR, | |
170 | 0, | |
171 | ||
172 | { 0x53, 0x50, 0x52, 0x50 }, /* dataFormat="SPRP" */ | |
173 | { 3, 2, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */ | |
174 | { 3, 2, 0, 0 } /* dataVersion (Unicode version) */ | |
175 | }; | |
176 | void | |
177 | setUnicodeVersion(const char *v) { | |
178 | UVersionInfo version; | |
179 | u_versionFromString(version, v); | |
180 | uprv_memcpy(dataInfo.dataVersion, version, 4); | |
181 | } | |
182 | ||
183 | void | |
184 | setUnicodeVersionNC(UVersionInfo version){ | |
185 | uint32_t univer = version[0] << 24; | |
186 | univer += version[1] << 16; | |
187 | univer += version[2] << 8; | |
188 | univer += version[3]; | |
189 | indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION] = univer; | |
190 | } | |
191 | static UNewTrie *sprepTrie; | |
192 | ||
193 | #define MAX_DATA_LENGTH 11500 | |
194 | ||
195 | ||
196 | #define SPREP_DELTA_RANGE_POSITIVE_LIMIT 8191 | |
197 | #define SPREP_DELTA_RANGE_NEGATIVE_LIMIT -8192 | |
198 | ||
199 | ||
200 | extern void | |
201 | init() { | |
202 | ||
203 | sprepTrie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie)); | |
204 | uprv_memset(sprepTrie, 0, sizeof(UNewTrie)); | |
205 | ||
206 | /* initialize the two tries */ | |
207 | if(NULL==utrie_open(sprepTrie, NULL, MAX_DATA_LENGTH, 0, 0, FALSE)) { | |
208 | fprintf(stderr, "error: failed to initialize tries\n"); | |
209 | exit(U_MEMORY_ALLOCATION_ERROR); | |
210 | } | |
211 | } | |
212 | ||
213 | static UHashtable* hashTable = NULL; | |
214 | ||
215 | ||
216 | typedef struct ValueStruct { | |
217 | UChar* mapping; | |
218 | int16_t length; | |
219 | UStringPrepType type; | |
220 | } ValueStruct; | |
221 | ||
222 | /* Callback for deleting the value from the hashtable */ | |
223 | static void U_CALLCONV valueDeleter(void* obj){ | |
224 | ValueStruct* value = (ValueStruct*) obj; | |
225 | uprv_free(value->mapping); | |
226 | uprv_free(value); | |
227 | } | |
228 | ||
229 | /* Callback for hashing the entry */ | |
230 | static int32_t U_CALLCONV hashEntry(const UHashTok parm) { | |
231 | return parm.integer; | |
232 | } | |
233 | ||
234 | /* Callback for comparing two entries */ | |
235 | static UBool U_CALLCONV compareEntries(const UHashTok p1, const UHashTok p2) { | |
236 | return (UBool)(p1.integer != p2.integer); | |
237 | } | |
238 | ||
239 | ||
240 | static void | |
241 | storeMappingData(){ | |
242 | ||
243 | int32_t pos = -1; | |
244 | const UHashElement* element = NULL; | |
245 | ValueStruct* value = NULL; | |
246 | int32_t codepoint = 0; | |
247 | int32_t elementCount = uhash_count(hashTable); | |
248 | int32_t writtenElementCount = 0; | |
249 | int32_t mappingLength = 1; /* minimum mapping length */ | |
250 | int32_t oldMappingLength = 0; | |
251 | uint16_t trieWord =0; | |
252 | int32_t limitIndex = 0; | |
253 | ||
254 | /*initialize the mapping data */ | |
255 | mappingData = (uint16_t*) uprv_malloc(U_SIZEOF_UCHAR * (mappingDataCapacity)); | |
256 | ||
257 | uprv_memset(mappingData,0,U_SIZEOF_UCHAR * mappingDataCapacity); | |
258 | ||
259 | while(writtenElementCount < elementCount){ | |
260 | ||
261 | while( (element = uhash_nextElement(hashTable, &pos))!=NULL){ | |
262 | ||
263 | codepoint = element->key.integer; | |
264 | value = (ValueStruct*)element->value.pointer; | |
265 | ||
266 | /* store the start of indexes */ | |
267 | if(oldMappingLength != mappingLength){ | |
268 | /* Assume that index[] is used according to the enums defined */ | |
269 | if(oldMappingLength <=_SPREP_MAX_INDEX_TOP_LENGTH){ | |
270 | indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION+mappingLength] = currentIndex; | |
271 | } | |
272 | if(oldMappingLength <= _SPREP_MAX_INDEX_TOP_LENGTH && | |
273 | mappingLength == _SPREP_MAX_INDEX_TOP_LENGTH +1){ | |
274 | ||
275 | limitIndex = currentIndex; | |
276 | ||
277 | } | |
278 | oldMappingLength = mappingLength; | |
279 | } | |
280 | ||
281 | if(value->length == mappingLength){ | |
282 | uint32_t savedTrieWord = 0; | |
283 | trieWord = currentIndex << 2; | |
284 | /* turn on the 2nd bit to signal that the following bits contain an index */ | |
285 | trieWord += 0x02; | |
286 | ||
287 | if(trieWord > _SPREP_TYPE_THRESHOLD){ | |
288 | fprintf(stderr,"trieWord cannot contain value greater than 0x%04X.\n",_SPREP_TYPE_THRESHOLD); | |
289 | exit(U_ILLEGAL_CHAR_FOUND); | |
290 | } | |
291 | /* figure out if the code point has type already stored */ | |
292 | savedTrieWord= utrie_get32(sprepTrie,codepoint,NULL); | |
293 | if(savedTrieWord!=0){ | |
294 | if((savedTrieWord- _SPREP_TYPE_THRESHOLD) == USPREP_PROHIBITED){ | |
295 | /* turn on the first bit in trie word */ | |
296 | trieWord += 0x01; | |
297 | }else{ | |
298 | /* | |
299 | * the codepoint has value something other than prohibited | |
300 | * and a mapping .. error! | |
301 | */ | |
302 | fprintf(stderr,"Type for codepoint \\U%08X already set!.\n", (int)codepoint); | |
303 | exit(U_ILLEGAL_ARGUMENT_ERROR); | |
304 | } | |
305 | } | |
306 | ||
307 | /* now set the value in the trie */ | |
308 | if(!utrie_set32(sprepTrie,codepoint,trieWord)){ | |
309 | fprintf(stderr,"Could not set the value for code point.\n"); | |
310 | exit(U_ILLEGAL_ARGUMENT_ERROR); | |
311 | } | |
312 | ||
313 | /* written the trie word for the codepoint... increment the count*/ | |
314 | writtenElementCount++; | |
315 | ||
316 | /* sanity check are we exceeding the max number allowed */ | |
317 | if(currentIndex+value->length+1 > _SPREP_MAX_INDEX_VALUE){ | |
318 | fprintf(stderr, "Too many entries in the mapping table %i. Maximum allowed is %i\n", currentIndex+value->length, _SPREP_MAX_INDEX_VALUE); | |
319 | exit(U_INDEX_OUTOFBOUNDS_ERROR); | |
320 | } | |
321 | ||
322 | /* copy the mapping data */ | |
323 | if(currentIndex+value->length+1 <= mappingDataCapacity){ | |
324 | /* write the length */ | |
325 | if(mappingLength > _SPREP_MAX_INDEX_TOP_LENGTH ){ | |
326 | /* the cast here is safe since we donot expect the length to be > 65535 */ | |
327 | mappingData[currentIndex++] = (uint16_t) mappingLength; | |
328 | } | |
329 | /* copy the contents to mappindData array */ | |
330 | uprv_memmove(mappingData+currentIndex, value->mapping, value->length*U_SIZEOF_UCHAR); | |
331 | currentIndex += value->length; | |
332 | ||
333 | }else{ | |
334 | /* realloc */ | |
335 | UChar* newMappingData = (uint16_t*) uprv_malloc(U_SIZEOF_UCHAR * mappingDataCapacity*2); | |
336 | if(newMappingData == NULL){ | |
337 | fprintf(stderr, "Could not realloc the mapping data!\n"); | |
338 | exit(U_MEMORY_ALLOCATION_ERROR); | |
339 | } | |
340 | uprv_memmove(newMappingData, mappingData, U_SIZEOF_UCHAR * mappingDataCapacity); | |
341 | mappingDataCapacity *= 2; | |
342 | uprv_free(mappingData); | |
343 | mappingData = newMappingData; | |
344 | /* write the length */ | |
345 | if(mappingLength > _SPREP_MAX_INDEX_TOP_LENGTH ){ | |
346 | /* the cast here is safe since we donot expect the length to be > 65535 */ | |
347 | mappingData[currentIndex++] = (uint16_t) mappingLength; | |
348 | } | |
349 | /* continue copying */ | |
350 | uprv_memmove(mappingData+currentIndex, value->mapping, value->length*U_SIZEOF_UCHAR); | |
351 | currentIndex += value->length; | |
352 | } | |
353 | ||
354 | } | |
355 | } | |
356 | mappingLength++; | |
357 | pos = -1; | |
358 | } | |
359 | /* set the last length for range check */ | |
360 | if(mappingLength <= _SPREP_MAX_INDEX_TOP_LENGTH){ | |
361 | indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION+mappingLength] = currentIndex+1; | |
362 | }else{ | |
363 | indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START] = limitIndex; | |
364 | } | |
365 | ||
366 | } | |
367 | ||
368 | extern void setOptions(int32_t options){ | |
369 | indexes[_SPREP_OPTIONS] = options; | |
370 | } | |
371 | extern void | |
372 | storeMapping(uint32_t codepoint, uint32_t* mapping,int32_t length, | |
373 | UStringPrepType type, UErrorCode* status){ | |
374 | ||
375 | ||
376 | UChar* map = NULL; | |
377 | int16_t adjustedLen=0, i; | |
378 | uint16_t trieWord = 0; | |
379 | ValueStruct *value = NULL; | |
380 | uint32_t savedTrieWord = 0; | |
381 | ||
382 | /* initialize the hashtable */ | |
383 | if(hashTable==NULL){ | |
384 | hashTable = uhash_open(hashEntry, compareEntries, status); | |
385 | uhash_setValueDeleter(hashTable, valueDeleter); | |
386 | } | |
387 | ||
388 | /* figure out if the code point has type already stored */ | |
389 | savedTrieWord= utrie_get32(sprepTrie,codepoint,NULL); | |
390 | if(savedTrieWord!=0){ | |
391 | if((savedTrieWord- _SPREP_TYPE_THRESHOLD) == USPREP_PROHIBITED){ | |
392 | /* turn on the first bit in trie word */ | |
393 | trieWord += 0x01; | |
394 | }else{ | |
395 | /* | |
396 | * the codepoint has value something other than prohibited | |
397 | * and a mapping .. error! | |
398 | */ | |
399 | fprintf(stderr,"Type for codepoint \\U%08X already set!.\n", (int)codepoint); | |
400 | exit(U_ILLEGAL_ARGUMENT_ERROR); | |
401 | } | |
402 | } | |
403 | ||
404 | /* figure out the real length */ | |
405 | for(i=0; i<length; i++){ | |
406 | if(mapping[i] > 0xFFFF){ | |
407 | adjustedLen +=2; | |
408 | }else{ | |
409 | adjustedLen++; | |
410 | } | |
411 | } | |
412 | ||
413 | if(adjustedLen == 0){ | |
414 | trieWord = (uint16_t)(_SPREP_MAX_INDEX_VALUE << 2); | |
415 | /* make sure that the value of trieWord is less than the threshold */ | |
416 | if(trieWord < _SPREP_TYPE_THRESHOLD){ | |
417 | /* now set the value in the trie */ | |
418 | if(!utrie_set32(sprepTrie,codepoint,trieWord)){ | |
419 | fprintf(stderr,"Could not set the value for code point.\n"); | |
420 | exit(U_ILLEGAL_ARGUMENT_ERROR); | |
421 | } | |
422 | /* value is set so just return */ | |
423 | return; | |
424 | }else{ | |
425 | fprintf(stderr,"trieWord cannot contain value greater than threshold 0x%04X.\n",_SPREP_TYPE_THRESHOLD); | |
426 | exit(U_ILLEGAL_CHAR_FOUND); | |
427 | } | |
428 | } | |
429 | ||
430 | if(adjustedLen == 1){ | |
431 | /* calculate the delta */ | |
432 | int16_t delta = (int16_t)((int32_t)codepoint - (int16_t) mapping[0]); | |
433 | if(delta >= SPREP_DELTA_RANGE_NEGATIVE_LIMIT && delta <= SPREP_DELTA_RANGE_POSITIVE_LIMIT){ | |
434 | ||
435 | trieWord = delta << 2; | |
436 | ||
437 | ||
438 | /* make sure that the second bit is OFF */ | |
439 | if((trieWord & 0x02) != 0 ){ | |
440 | fprintf(stderr,"The second bit in the trie word is not zero while storing a delta.\n"); | |
441 | exit(U_INTERNAL_PROGRAM_ERROR); | |
442 | } | |
443 | /* make sure that the value of trieWord is less than the threshold */ | |
444 | if(trieWord < _SPREP_TYPE_THRESHOLD){ | |
445 | /* now set the value in the trie */ | |
446 | if(!utrie_set32(sprepTrie,codepoint,trieWord)){ | |
447 | fprintf(stderr,"Could not set the value for code point.\n"); | |
448 | exit(U_ILLEGAL_ARGUMENT_ERROR); | |
449 | } | |
450 | /* value is set so just return */ | |
451 | return; | |
452 | } | |
453 | } | |
454 | /* | |
455 | * if the delta is not in the given range or if the trieWord is larger than the threshold | |
456 | * just fall through for storing the mapping in the mapping table | |
457 | */ | |
458 | } | |
459 | ||
460 | map = (UChar*) uprv_malloc(U_SIZEOF_UCHAR * (adjustedLen+1)); | |
461 | uprv_memset(map,0,U_SIZEOF_UCHAR * (adjustedLen+1)); | |
462 | ||
463 | i=0; | |
464 | ||
465 | while(i<length){ | |
466 | if(mapping[i] <= 0xFFFF){ | |
467 | map[i] = (uint16_t)mapping[i]; | |
468 | }else{ | |
469 | map[i] = UTF16_LEAD(mapping[i]); | |
470 | map[i+1] = UTF16_TRAIL(mapping[i]); | |
471 | } | |
472 | i++; | |
473 | } | |
474 | ||
475 | value = (ValueStruct*) uprv_malloc(sizeof(ValueStruct)); | |
476 | value->mapping = map; | |
477 | value->type = type; | |
478 | value->length = adjustedLen; | |
479 | if(value->length > _SPREP_MAX_INDEX_TOP_LENGTH){ | |
480 | mappingDataCapacity++; | |
481 | } | |
482 | if(maxLength < value->length){ | |
483 | maxLength = value->length; | |
484 | } | |
485 | uhash_iput(hashTable,codepoint,value,status); | |
486 | mappingDataCapacity += adjustedLen; | |
487 | ||
488 | if(U_FAILURE(*status)){ | |
489 | fprintf(stderr, "Failed to put entries into the hastable. Error: %s\n", u_errorName(*status)); | |
490 | exit(*status); | |
491 | } | |
492 | } | |
493 | ||
494 | ||
495 | extern void | |
496 | storeRange(uint32_t start, uint32_t end, UStringPrepType type,UErrorCode* status){ | |
497 | uint16_t trieWord = 0; | |
498 | ||
499 | trieWord += (_SPREP_TYPE_THRESHOLD + type); /* the top 4 bits contain the value */ | |
500 | if(trieWord > 0xFFFF){ | |
501 | fprintf(stderr,"trieWord cannot contain value greater than 0xFFFF.\n"); | |
502 | exit(U_ILLEGAL_CHAR_FOUND); | |
503 | } | |
504 | if(start == end){ | |
505 | uint32_t savedTrieWord = utrie_get32(sprepTrie, start, NULL); | |
506 | if(savedTrieWord>0){ | |
507 | if(savedTrieWord < _SPREP_TYPE_THRESHOLD && type == USPREP_PROHIBITED){ | |
508 | /* | |
509 | * A mapping is stored in the trie word | |
510 | * and the only other possible type that a | |
511 | * code point can have is USPREP_PROHIBITED | |
512 | * | |
513 | */ | |
514 | ||
515 | /* turn on the 0th bit in the savedTrieWord */ | |
516 | savedTrieWord += 0x01; | |
517 | ||
518 | /* the downcast is safe since we only save 16 bit values */ | |
519 | trieWord = (uint16_t)savedTrieWord; | |
520 | ||
521 | /* make sure that the value of trieWord is less than the threshold */ | |
522 | if(trieWord < _SPREP_TYPE_THRESHOLD){ | |
523 | /* now set the value in the trie */ | |
524 | if(!utrie_set32(sprepTrie,start,trieWord)){ | |
525 | fprintf(stderr,"Could not set the value for code point.\n"); | |
526 | exit(U_ILLEGAL_ARGUMENT_ERROR); | |
527 | } | |
528 | /* value is set so just return */ | |
529 | return; | |
530 | }else{ | |
531 | fprintf(stderr,"trieWord cannot contain value greater than threshold 0x%04X.\n",_SPREP_TYPE_THRESHOLD); | |
532 | exit(U_ILLEGAL_CHAR_FOUND); | |
533 | } | |
534 | ||
535 | }else if(savedTrieWord != trieWord){ | |
536 | fprintf(stderr,"Value for codepoint \\U%08X already set!.\n", (int)start); | |
537 | exit(U_ILLEGAL_ARGUMENT_ERROR); | |
538 | } | |
539 | /* if savedTrieWord == trieWord .. fall through and set the value */ | |
540 | } | |
541 | if(!utrie_set32(sprepTrie,start,trieWord)){ | |
542 | fprintf(stderr,"Could not set the value for code point \\U%08X.\n", (int)start); | |
543 | exit(U_ILLEGAL_ARGUMENT_ERROR); | |
544 | } | |
545 | }else{ | |
546 | if(!utrie_setRange32(sprepTrie, start, end+1, trieWord, FALSE)){ | |
547 | fprintf(stderr,"Value for certain codepoint already set.\n"); | |
548 | exit(U_ILLEGAL_CHAR_FOUND); | |
549 | } | |
550 | } | |
551 | ||
552 | } | |
553 | ||
554 | /* folding value: just store the offset (16 bits) if there is any non-0 entry */ | |
555 | static uint32_t U_CALLCONV | |
556 | getFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset) { | |
557 | uint32_t foldedValue, value; | |
558 | UChar32 limit=0; | |
559 | UBool inBlockZero; | |
560 | ||
561 | foldedValue=0; | |
562 | ||
563 | limit=start+0x400; | |
564 | while(start<limit) { | |
565 | value=utrie_get32(trie, start, &inBlockZero); | |
566 | if(inBlockZero) { | |
567 | start+=UTRIE_DATA_BLOCK_LENGTH; | |
568 | } else if(value!=0) { | |
569 | return (uint32_t)offset; | |
570 | } else { | |
571 | ++start; | |
572 | } | |
573 | } | |
574 | return 0; | |
575 | ||
576 | } | |
577 | ||
578 | #endif /* #if !UCONFIG_NO_IDNA */ | |
579 | ||
580 | extern void | |
581 | generateData(const char *dataDir, const char *packageName, const char* bundleName) { | |
582 | static uint8_t sprepTrieBlock[100000]; | |
583 | ||
584 | UNewDataMemory *pData; | |
585 | UErrorCode errorCode=U_ZERO_ERROR; | |
586 | int32_t size, dataLength; | |
587 | char* fileName = (char*) uprv_malloc(uprv_strlen(bundleName) +100); | |
588 | ||
589 | #if UCONFIG_NO_IDNA | |
590 | ||
591 | size=0; | |
592 | ||
593 | #else | |
594 | ||
595 | int32_t sprepTrieSize; | |
596 | ||
597 | /* sort and add mapping data */ | |
598 | storeMappingData(); | |
599 | ||
600 | sprepTrieSize=utrie_serialize(sprepTrie, sprepTrieBlock, sizeof(sprepTrieBlock), getFoldedValue, TRUE, &errorCode); | |
601 | if(U_FAILURE(errorCode)) { | |
602 | fprintf(stderr, "error: utrie_serialize(sprep trie) failed, %s\n", u_errorName(errorCode)); | |
603 | exit(errorCode); | |
604 | } | |
605 | ||
606 | size = sprepTrieSize + mappingDataCapacity*U_SIZEOF_UCHAR + sizeof(indexes); | |
607 | if(beVerbose) { | |
608 | printf("size of sprep trie %5u bytes\n", (int)sprepTrieSize); | |
609 | printf("size of " U_ICUDATA_NAME "_%s." DATA_TYPE " contents: %ld bytes\n", bundleName,(long)size); | |
610 | printf("size of mapping data array %5u bytes\n",(int)mappingDataCapacity * U_SIZEOF_UCHAR); | |
611 | printf("Number of code units in mappingData (currentIndex) are: %i \n", currentIndex); | |
612 | printf("Maximum length of the mapping string is : %i \n", (int)maxLength); | |
613 | } | |
614 | ||
615 | #endif | |
616 | ||
617 | if(packageName != NULL) { | |
618 | uprv_strcpy(fileName,packageName); | |
619 | uprv_strcat(fileName,"_"); | |
620 | } else { | |
621 | fileName[0]=0; | |
622 | } | |
623 | uprv_strcat(fileName,bundleName); | |
624 | /* write the data */ | |
625 | pData=udata_create(dataDir, DATA_TYPE, fileName, &dataInfo, | |
626 | haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode); | |
627 | if(U_FAILURE(errorCode)) { | |
628 | fprintf(stderr, "gensprep: unable to create the output file, error %d\n", errorCode); | |
629 | exit(errorCode); | |
630 | } | |
631 | ||
632 | #if !UCONFIG_NO_IDNA | |
633 | ||
634 | indexes[_SPREP_INDEX_TRIE_SIZE]=sprepTrieSize; | |
635 | indexes[_SPREP_INDEX_MAPPING_DATA_SIZE]=mappingDataCapacity*U_SIZEOF_UCHAR; | |
636 | ||
637 | udata_writeBlock(pData, indexes, sizeof(indexes)); | |
638 | udata_writeBlock(pData, sprepTrieBlock, sprepTrieSize); | |
639 | udata_writeBlock(pData, mappingData, indexes[_SPREP_INDEX_MAPPING_DATA_SIZE]); | |
640 | ||
641 | ||
642 | #endif | |
643 | ||
644 | /* finish up */ | |
645 | dataLength=udata_finish(pData, &errorCode); | |
646 | if(U_FAILURE(errorCode)) { | |
647 | fprintf(stderr, "gensprep: error %d writing the output file\n", errorCode); | |
648 | exit(errorCode); | |
649 | } | |
650 | ||
651 | if(dataLength!=size) { | |
652 | fprintf(stderr, "gensprep error: data length %ld != calculated size %ld\n", | |
653 | (long)dataLength, (long)size); | |
654 | exit(U_INTERNAL_PROGRAM_ERROR); | |
655 | } | |
656 | ||
657 | #if !UCONFIG_NO_IDNA | |
658 | /* done with writing the data .. close the hashtable */ | |
659 | uhash_close(hashTable); | |
660 | #endif | |
661 | } | |
662 | ||
663 | #if !UCONFIG_NO_IDNA | |
664 | ||
665 | extern void | |
666 | cleanUpData(void) { | |
667 | ||
668 | utrie_close(sprepTrie); | |
669 | uprv_free(sprepTrie); | |
670 | } | |
671 | ||
672 | #endif /* #if !UCONFIG_NO_IDNA */ | |
673 | ||
674 | /* | |
675 | * Hey, Emacs, please set the following: | |
676 | * | |
677 | * Local Variables: | |
678 | * indent-tabs-mode: nil | |
679 | * End: | |
680 | * | |
681 | */ |