]>
Commit | Line | Data |
---|---|---|
374ca955 A |
1 | /* |
2 | ******************************************************************************* | |
3 | * | |
73c04bcf | 4 | * Copyright (C) 1999-2006, International Business Machines |
374ca955 A |
5 | * Corporation and others. All Rights Reserved. |
6 | * | |
7 | ******************************************************************************* | |
8 | * file name: store.c | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 2003-02-06 | |
14 | * created by: Ram Viswanadha | |
15 | * | |
16 | */ | |
17 | ||
18 | #include <stdio.h> | |
19 | #include <stdlib.h> | |
20 | #include "unicode/utypes.h" | |
21 | #include "cmemory.h" | |
22 | #include "cstring.h" | |
23 | #include "filestrm.h" | |
24 | #include "unicode/udata.h" | |
25 | #include "utrie.h" | |
26 | #include "unewdata.h" | |
27 | #include "gensprep.h" | |
28 | #include "uhash.h" | |
29 | ||
30 | ||
374ca955 A |
31 | #define DO_DEBUG_OUT 0 |
32 | ||
33 | ||
34 | /* | |
35 | * StringPrep profile file format ------------------------------------ | |
36 | * | |
37 | * The file format prepared and written here contains a 16-bit trie and a mapping table. | |
38 | * | |
39 | * Before the data contents described below, there are the headers required by | |
40 | * the udata API for loading ICU data. Especially, a UDataInfo structure | |
41 | * precedes the actual data. It contains platform properties values and the | |
42 | * file format version. | |
43 | * | |
44 | * The following is a description of format version 2. | |
45 | * | |
46 | * Data contents: | |
47 | * | |
48 | * The contents is a parsed, binary form of RFC3454 and possibly | |
49 | * NormalizationCorrections.txt depending on the options specified on the profile. | |
50 | * | |
51 | * Any Unicode code point from 0 to 0x10ffff can be looked up to get | |
52 | * the trie-word, if any, for that code point. This means that the input | |
53 | * to the lookup are 21-bit unsigned integers, with not all of the | |
54 | * 21-bit range used. | |
55 | * | |
56 | * *.spp files customarily begin with a UDataInfo structure, see udata.h and .c. | |
57 | * After that there are the following structures: | |
58 | * | |
59 | * int32_t indexes[_SPREP_INDEX_TOP]; -- _SPREP_INDEX_TOP=16, see enum in sprpimpl.h file | |
60 | * | |
61 | * UTrie stringPrepTrie; -- size in bytes=indexes[_SPREP_INDEX_TRIE_SIZE] | |
62 | * | |
63 | * uint16_t mappingTable[]; -- Contains the sequence of code units that the code point maps to | |
64 | * size in bytes = indexes[_SPREP_INDEX_MAPPING_DATA_SIZE] | |
65 | * | |
66 | * The indexes array contains the following values: | |
67 | * indexes[_SPREP_INDEX_TRIE_SIZE] -- The size of the StringPrep trie in bytes | |
68 | * indexes[_SPREP_INDEX_MAPPING_DATA_SIZE] -- The size of the mappingTable in bytes | |
69 | * indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION] -- The index of Unicode version of last entry in NormalizationCorrections.txt | |
70 | * indexes[_SPREP_ONE_UCHAR_MAPPING_INDEX_START] -- The starting index of 1 UChar mapping index in the mapping table | |
71 | * indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START] -- The starting index of 2 UChars mapping index in the mapping table | |
72 | * indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START] -- The starting index of 3 UChars mapping index in the mapping table | |
73 | * indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START] -- The starting index of 4 UChars mapping index in the mapping table | |
74 | * indexes[_SPREP_OPTIONS] -- Bit set of options to turn on in the profile, e.g: USPREP_NORMALIZATION_ON, USPREP_CHECK_BIDI_ON | |
75 | * | |
76 | * | |
77 | * StringPrep Trie : | |
78 | * | |
79 | * The StringPrep trie is a 16-bit trie that contains data for the profile. | |
80 | * Each code point is associated with a value (trie-word) in the trie. | |
81 | * | |
82 | * - structure of data words from the trie | |
83 | * | |
84 | * i) A value greater than or equal to _SPREP_TYPE_THRESHOLD (0xFFF0) | |
85 | * represents the type associated with the code point | |
86 | * if(trieWord >= _SPREP_TYPE_THRESHOLD){ | |
87 | * type = trieWord - 0xFFF0; | |
88 | * } | |
89 | * The type can be : | |
90 | * USPREP_UNASSIGNED | |
91 | * USPREP_PROHIBITED | |
92 | * USPREP_DELETE | |
93 | * | |
94 | * ii) A value less than _SPREP_TYPE_THRESHOLD means the type is USPREP_MAP and | |
95 | * contains distribution described below | |
96 | * | |
97 | * 0 - ON : The code point is prohibited (USPREP_PROHIBITED). This is to allow for code points that are both prohibited and mapped. | |
98 | * 1 - ON : The value in the next 14 bits is an index into the mapping table | |
99 | * OFF: The value in the next 14 bits is a delta value from the code point | |
100 | * 2..15 - Contains data as described by bit 1. If all bits are set | |
101 | * (value = _SPREP_MAX_INDEX_VALUE) then the type is USPREP_DELETE | |
102 | * | |
103 | * | |
104 | * Mapping Table: | |
105 | * The data in mapping table is sorted according to the length of the mapping sequence. | |
106 | * If the type of the code point is USPREP_MAP and the value in the trie word is an index, the index | |
107 | * is compared with the start indexes of the sequence-length groups to figure out the length, according to | |
108 | * the following algorithm: | |
109 | * | |
110 | * if( index >= indexes[_SPREP_ONE_UCHAR_MAPPING_INDEX_START] && | |
111 | * index < indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START]){ | |
112 | * length = 1; | |
113 | * }else if(index >= indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START] && | |
114 | * index < indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START]){ | |
115 | * length = 2; | |
116 | * }else if(index >= indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START] && | |
117 | * index < indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START]){ | |
118 | * length = 3; | |
119 | * }else{ | |
120 | * // The first position in the mapping table contains the length | |
121 | * // of the sequence | |
122 | * length = mappingTable[index++]; | |
123 | * | |
124 | * } | |
125 | * | |
126 | */ | |
127 | ||
128 | /* file data ---------------------------------------------------------------- */ | |
129 | /* indexes[] value names */ | |
130 | ||
131 | #if UCONFIG_NO_IDNA | |
132 | ||
133 | /* dummy UDataInfo cf. udata.h */ | |
134 | static UDataInfo dataInfo = { | |
135 | sizeof(UDataInfo), | |
136 | 0, | |
137 | ||
138 | U_IS_BIG_ENDIAN, | |
139 | U_CHARSET_FAMILY, | |
140 | U_SIZEOF_UCHAR, | |
141 | 0, | |
142 | ||
143 | { 0, 0, 0, 0 }, /* dummy dataFormat */ | |
144 | { 0, 0, 0, 0 }, /* dummy formatVersion */ | |
145 | { 0, 0, 0, 0 } /* dummy dataVersion */ | |
146 | }; | |
147 | ||
148 | #else | |
149 | ||
150 | static int32_t indexes[_SPREP_INDEX_TOP]={ 0 }; | |
151 | ||
152 | static uint16_t* mappingData= NULL; | |
153 | static int32_t mappingDataCapacity = 0; /* we skip the first index in mapping data */ | |
154 | static int16_t currentIndex = 0; /* the current index into the data trie */ | |
155 | static int32_t maxLength = 0; /* maximum length of mapping string */ | |
156 | ||
157 | ||
158 | /* UDataInfo cf. udata.h */ | |
159 | static UDataInfo dataInfo={ | |
160 | sizeof(UDataInfo), | |
161 | 0, | |
162 | ||
163 | U_IS_BIG_ENDIAN, | |
164 | U_CHARSET_FAMILY, | |
165 | U_SIZEOF_UCHAR, | |
166 | 0, | |
167 | ||
168 | { 0x53, 0x50, 0x52, 0x50 }, /* dataFormat="SPRP" */ | |
169 | { 3, 2, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */ | |
170 | { 3, 2, 0, 0 } /* dataVersion (Unicode version) */ | |
171 | }; | |
172 | void | |
173 | setUnicodeVersion(const char *v) { | |
174 | UVersionInfo version; | |
175 | u_versionFromString(version, v); | |
176 | uprv_memcpy(dataInfo.dataVersion, version, 4); | |
177 | } | |
178 | ||
179 | void | |
180 | setUnicodeVersionNC(UVersionInfo version){ | |
181 | uint32_t univer = version[0] << 24; | |
182 | univer += version[1] << 16; | |
183 | univer += version[2] << 8; | |
184 | univer += version[3]; | |
185 | indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION] = univer; | |
186 | } | |
187 | static UNewTrie *sprepTrie; | |
188 | ||
/* Data length passed to utrie_open() when the build-time trie is created. */
#define MAX_DATA_LENGTH 11500

/*
 * Inclusive bounds for a delta that can be stored directly in a trie word
 * (14 signed bits). The negative value is parenthesized so the macro stays
 * a single operand wherever it is expanded.
 */
#define SPREP_DELTA_RANGE_POSITIVE_LIMIT 8191
#define SPREP_DELTA_RANGE_NEGATIVE_LIMIT (-8192)
194 | ||
195 | ||
196 | extern void | |
197 | init() { | |
198 | ||
199 | sprepTrie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie)); | |
200 | uprv_memset(sprepTrie, 0, sizeof(UNewTrie)); | |
201 | ||
202 | /* initialize the two tries */ | |
203 | if(NULL==utrie_open(sprepTrie, NULL, MAX_DATA_LENGTH, 0, 0, FALSE)) { | |
204 | fprintf(stderr, "error: failed to initialize tries\n"); | |
205 | exit(U_MEMORY_ALLOCATION_ERROR); | |
206 | } | |
207 | } | |
208 | ||
209 | static UHashtable* hashTable = NULL; | |
210 | ||
211 | ||
212 | typedef struct ValueStruct { | |
213 | UChar* mapping; | |
214 | int16_t length; | |
215 | UStringPrepType type; | |
216 | } ValueStruct; | |
217 | ||
218 | /* Callback for deleting the value from the hashtable */ | |
219 | static void U_CALLCONV valueDeleter(void* obj){ | |
220 | ValueStruct* value = (ValueStruct*) obj; | |
221 | uprv_free(value->mapping); | |
222 | uprv_free(value); | |
223 | } | |
224 | ||
225 | /* Callback for hashing the entry */ | |
226 | static int32_t U_CALLCONV hashEntry(const UHashTok parm) { | |
227 | return parm.integer; | |
228 | } | |
229 | ||
230 | /* Callback for comparing two entries */ | |
231 | static UBool U_CALLCONV compareEntries(const UHashTok p1, const UHashTok p2) { | |
232 | return (UBool)(p1.integer != p2.integer); | |
233 | } | |
234 | ||
235 | ||
236 | static void | |
237 | storeMappingData(){ | |
238 | ||
239 | int32_t pos = -1; | |
240 | const UHashElement* element = NULL; | |
241 | ValueStruct* value = NULL; | |
242 | int32_t codepoint = 0; | |
243 | int32_t elementCount = uhash_count(hashTable); | |
244 | int32_t writtenElementCount = 0; | |
245 | int32_t mappingLength = 1; /* minimum mapping length */ | |
246 | int32_t oldMappingLength = 0; | |
247 | uint16_t trieWord =0; | |
248 | int32_t limitIndex = 0; | |
249 | ||
250 | /*initialize the mapping data */ | |
251 | mappingData = (uint16_t*) uprv_malloc(U_SIZEOF_UCHAR * (mappingDataCapacity)); | |
252 | ||
253 | uprv_memset(mappingData,0,U_SIZEOF_UCHAR * mappingDataCapacity); | |
254 | ||
255 | while(writtenElementCount < elementCount){ | |
256 | ||
257 | while( (element = uhash_nextElement(hashTable, &pos))!=NULL){ | |
258 | ||
259 | codepoint = element->key.integer; | |
260 | value = (ValueStruct*)element->value.pointer; | |
261 | ||
262 | /* store the start of indexes */ | |
263 | if(oldMappingLength != mappingLength){ | |
264 | /* Assume that index[] is used according to the enums defined */ | |
265 | if(oldMappingLength <=_SPREP_MAX_INDEX_TOP_LENGTH){ | |
266 | indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION+mappingLength] = currentIndex; | |
267 | } | |
268 | if(oldMappingLength <= _SPREP_MAX_INDEX_TOP_LENGTH && | |
269 | mappingLength == _SPREP_MAX_INDEX_TOP_LENGTH +1){ | |
270 | ||
271 | limitIndex = currentIndex; | |
272 | ||
273 | } | |
274 | oldMappingLength = mappingLength; | |
275 | } | |
276 | ||
277 | if(value->length == mappingLength){ | |
278 | uint32_t savedTrieWord = 0; | |
279 | trieWord = currentIndex << 2; | |
280 | /* turn on the 2nd bit to signal that the following bits contain an index */ | |
281 | trieWord += 0x02; | |
282 | ||
283 | if(trieWord > _SPREP_TYPE_THRESHOLD){ | |
284 | fprintf(stderr,"trieWord cannot contain value greater than 0x%04X.\n",_SPREP_TYPE_THRESHOLD); | |
285 | exit(U_ILLEGAL_CHAR_FOUND); | |
286 | } | |
287 | /* figure out if the code point has type already stored */ | |
288 | savedTrieWord= utrie_get32(sprepTrie,codepoint,NULL); | |
289 | if(savedTrieWord!=0){ | |
290 | if((savedTrieWord- _SPREP_TYPE_THRESHOLD) == USPREP_PROHIBITED){ | |
291 | /* turn on the first bit in trie word */ | |
292 | trieWord += 0x01; | |
293 | }else{ | |
294 | /* | |
295 | * the codepoint has value something other than prohibited | |
296 | * and a mapping .. error! | |
297 | */ | |
298 | fprintf(stderr,"Type for codepoint \\U%08X already set!.\n", (int)codepoint); | |
299 | exit(U_ILLEGAL_ARGUMENT_ERROR); | |
300 | } | |
301 | } | |
302 | ||
303 | /* now set the value in the trie */ | |
304 | if(!utrie_set32(sprepTrie,codepoint,trieWord)){ | |
305 | fprintf(stderr,"Could not set the value for code point.\n"); | |
306 | exit(U_ILLEGAL_ARGUMENT_ERROR); | |
307 | } | |
308 | ||
309 | /* written the trie word for the codepoint... increment the count*/ | |
310 | writtenElementCount++; | |
311 | ||
312 | /* sanity check are we exceeding the max number allowed */ | |
313 | if(currentIndex+value->length+1 > _SPREP_MAX_INDEX_VALUE){ | |
314 | fprintf(stderr, "Too many entries in the mapping table %i. Maximum allowed is %i\n", currentIndex+value->length, _SPREP_MAX_INDEX_VALUE); | |
315 | exit(U_INDEX_OUTOFBOUNDS_ERROR); | |
316 | } | |
317 | ||
318 | /* copy the mapping data */ | |
319 | if(currentIndex+value->length+1 <= mappingDataCapacity){ | |
320 | /* write the length */ | |
321 | if(mappingLength > _SPREP_MAX_INDEX_TOP_LENGTH ){ | |
322 | /* the cast here is safe since we donot expect the length to be > 65535 */ | |
323 | mappingData[currentIndex++] = (uint16_t) mappingLength; | |
324 | } | |
325 | /* copy the contents to mappindData array */ | |
326 | uprv_memmove(mappingData+currentIndex, value->mapping, value->length*U_SIZEOF_UCHAR); | |
327 | currentIndex += value->length; | |
328 | ||
329 | }else{ | |
330 | /* realloc */ | |
331 | UChar* newMappingData = (uint16_t*) uprv_malloc(U_SIZEOF_UCHAR * mappingDataCapacity*2); | |
332 | if(newMappingData == NULL){ | |
333 | fprintf(stderr, "Could not realloc the mapping data!\n"); | |
334 | exit(U_MEMORY_ALLOCATION_ERROR); | |
335 | } | |
336 | uprv_memmove(newMappingData, mappingData, U_SIZEOF_UCHAR * mappingDataCapacity); | |
337 | mappingDataCapacity *= 2; | |
338 | uprv_free(mappingData); | |
339 | mappingData = newMappingData; | |
340 | /* write the length */ | |
341 | if(mappingLength > _SPREP_MAX_INDEX_TOP_LENGTH ){ | |
342 | /* the cast here is safe since we donot expect the length to be > 65535 */ | |
343 | mappingData[currentIndex++] = (uint16_t) mappingLength; | |
344 | } | |
345 | /* continue copying */ | |
346 | uprv_memmove(mappingData+currentIndex, value->mapping, value->length*U_SIZEOF_UCHAR); | |
347 | currentIndex += value->length; | |
348 | } | |
349 | ||
350 | } | |
351 | } | |
352 | mappingLength++; | |
353 | pos = -1; | |
354 | } | |
355 | /* set the last length for range check */ | |
356 | if(mappingLength <= _SPREP_MAX_INDEX_TOP_LENGTH){ | |
357 | indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION+mappingLength] = currentIndex+1; | |
358 | }else{ | |
359 | indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START] = limitIndex; | |
360 | } | |
361 | ||
362 | } | |
363 | ||
364 | extern void setOptions(int32_t options){ | |
365 | indexes[_SPREP_OPTIONS] = options; | |
366 | } | |
367 | extern void | |
368 | storeMapping(uint32_t codepoint, uint32_t* mapping,int32_t length, | |
369 | UStringPrepType type, UErrorCode* status){ | |
370 | ||
371 | ||
372 | UChar* map = NULL; | |
373 | int16_t adjustedLen=0, i; | |
374 | uint16_t trieWord = 0; | |
375 | ValueStruct *value = NULL; | |
376 | uint32_t savedTrieWord = 0; | |
377 | ||
378 | /* initialize the hashtable */ | |
379 | if(hashTable==NULL){ | |
73c04bcf | 380 | hashTable = uhash_open(hashEntry, compareEntries, NULL, status); |
374ca955 A |
381 | uhash_setValueDeleter(hashTable, valueDeleter); |
382 | } | |
383 | ||
384 | /* figure out if the code point has type already stored */ | |
385 | savedTrieWord= utrie_get32(sprepTrie,codepoint,NULL); | |
386 | if(savedTrieWord!=0){ | |
387 | if((savedTrieWord- _SPREP_TYPE_THRESHOLD) == USPREP_PROHIBITED){ | |
388 | /* turn on the first bit in trie word */ | |
389 | trieWord += 0x01; | |
390 | }else{ | |
391 | /* | |
392 | * the codepoint has value something other than prohibited | |
393 | * and a mapping .. error! | |
394 | */ | |
395 | fprintf(stderr,"Type for codepoint \\U%08X already set!.\n", (int)codepoint); | |
396 | exit(U_ILLEGAL_ARGUMENT_ERROR); | |
397 | } | |
398 | } | |
399 | ||
400 | /* figure out the real length */ | |
401 | for(i=0; i<length; i++){ | |
402 | if(mapping[i] > 0xFFFF){ | |
403 | adjustedLen +=2; | |
404 | }else{ | |
405 | adjustedLen++; | |
406 | } | |
407 | } | |
408 | ||
409 | if(adjustedLen == 0){ | |
410 | trieWord = (uint16_t)(_SPREP_MAX_INDEX_VALUE << 2); | |
411 | /* make sure that the value of trieWord is less than the threshold */ | |
412 | if(trieWord < _SPREP_TYPE_THRESHOLD){ | |
413 | /* now set the value in the trie */ | |
414 | if(!utrie_set32(sprepTrie,codepoint,trieWord)){ | |
415 | fprintf(stderr,"Could not set the value for code point.\n"); | |
416 | exit(U_ILLEGAL_ARGUMENT_ERROR); | |
417 | } | |
418 | /* value is set so just return */ | |
419 | return; | |
420 | }else{ | |
421 | fprintf(stderr,"trieWord cannot contain value greater than threshold 0x%04X.\n",_SPREP_TYPE_THRESHOLD); | |
422 | exit(U_ILLEGAL_CHAR_FOUND); | |
423 | } | |
424 | } | |
425 | ||
426 | if(adjustedLen == 1){ | |
427 | /* calculate the delta */ | |
428 | int16_t delta = (int16_t)((int32_t)codepoint - (int16_t) mapping[0]); | |
429 | if(delta >= SPREP_DELTA_RANGE_NEGATIVE_LIMIT && delta <= SPREP_DELTA_RANGE_POSITIVE_LIMIT){ | |
430 | ||
431 | trieWord = delta << 2; | |
432 | ||
433 | ||
434 | /* make sure that the second bit is OFF */ | |
435 | if((trieWord & 0x02) != 0 ){ | |
436 | fprintf(stderr,"The second bit in the trie word is not zero while storing a delta.\n"); | |
437 | exit(U_INTERNAL_PROGRAM_ERROR); | |
438 | } | |
439 | /* make sure that the value of trieWord is less than the threshold */ | |
440 | if(trieWord < _SPREP_TYPE_THRESHOLD){ | |
441 | /* now set the value in the trie */ | |
442 | if(!utrie_set32(sprepTrie,codepoint,trieWord)){ | |
443 | fprintf(stderr,"Could not set the value for code point.\n"); | |
444 | exit(U_ILLEGAL_ARGUMENT_ERROR); | |
445 | } | |
446 | /* value is set so just return */ | |
447 | return; | |
448 | } | |
449 | } | |
450 | /* | |
451 | * if the delta is not in the given range or if the trieWord is larger than the threshold | |
452 | * just fall through for storing the mapping in the mapping table | |
453 | */ | |
454 | } | |
455 | ||
456 | map = (UChar*) uprv_malloc(U_SIZEOF_UCHAR * (adjustedLen+1)); | |
457 | uprv_memset(map,0,U_SIZEOF_UCHAR * (adjustedLen+1)); | |
458 | ||
459 | i=0; | |
460 | ||
461 | while(i<length){ | |
462 | if(mapping[i] <= 0xFFFF){ | |
463 | map[i] = (uint16_t)mapping[i]; | |
464 | }else{ | |
465 | map[i] = UTF16_LEAD(mapping[i]); | |
466 | map[i+1] = UTF16_TRAIL(mapping[i]); | |
467 | } | |
468 | i++; | |
469 | } | |
470 | ||
471 | value = (ValueStruct*) uprv_malloc(sizeof(ValueStruct)); | |
472 | value->mapping = map; | |
473 | value->type = type; | |
474 | value->length = adjustedLen; | |
475 | if(value->length > _SPREP_MAX_INDEX_TOP_LENGTH){ | |
476 | mappingDataCapacity++; | |
477 | } | |
478 | if(maxLength < value->length){ | |
479 | maxLength = value->length; | |
480 | } | |
481 | uhash_iput(hashTable,codepoint,value,status); | |
482 | mappingDataCapacity += adjustedLen; | |
483 | ||
484 | if(U_FAILURE(*status)){ | |
485 | fprintf(stderr, "Failed to put entries into the hastable. Error: %s\n", u_errorName(*status)); | |
486 | exit(*status); | |
487 | } | |
488 | } | |
489 | ||
490 | ||
491 | extern void | |
492 | storeRange(uint32_t start, uint32_t end, UStringPrepType type,UErrorCode* status){ | |
493 | uint16_t trieWord = 0; | |
494 | ||
73c04bcf | 495 | if((int)(_SPREP_TYPE_THRESHOLD + type) > 0xFFFF){ |
374ca955 A |
496 | fprintf(stderr,"trieWord cannot contain value greater than 0xFFFF.\n"); |
497 | exit(U_ILLEGAL_CHAR_FOUND); | |
498 | } | |
73c04bcf | 499 | trieWord = (_SPREP_TYPE_THRESHOLD + type); /* the top 4 bits contain the value */ |
374ca955 A |
500 | if(start == end){ |
501 | uint32_t savedTrieWord = utrie_get32(sprepTrie, start, NULL); | |
502 | if(savedTrieWord>0){ | |
503 | if(savedTrieWord < _SPREP_TYPE_THRESHOLD && type == USPREP_PROHIBITED){ | |
504 | /* | |
505 | * A mapping is stored in the trie word | |
506 | * and the only other possible type that a | |
507 | * code point can have is USPREP_PROHIBITED | |
508 | * | |
509 | */ | |
510 | ||
511 | /* turn on the 0th bit in the savedTrieWord */ | |
512 | savedTrieWord += 0x01; | |
513 | ||
514 | /* the downcast is safe since we only save 16 bit values */ | |
515 | trieWord = (uint16_t)savedTrieWord; | |
516 | ||
517 | /* make sure that the value of trieWord is less than the threshold */ | |
518 | if(trieWord < _SPREP_TYPE_THRESHOLD){ | |
519 | /* now set the value in the trie */ | |
520 | if(!utrie_set32(sprepTrie,start,trieWord)){ | |
521 | fprintf(stderr,"Could not set the value for code point.\n"); | |
522 | exit(U_ILLEGAL_ARGUMENT_ERROR); | |
523 | } | |
524 | /* value is set so just return */ | |
525 | return; | |
526 | }else{ | |
527 | fprintf(stderr,"trieWord cannot contain value greater than threshold 0x%04X.\n",_SPREP_TYPE_THRESHOLD); | |
528 | exit(U_ILLEGAL_CHAR_FOUND); | |
529 | } | |
530 | ||
531 | }else if(savedTrieWord != trieWord){ | |
532 | fprintf(stderr,"Value for codepoint \\U%08X already set!.\n", (int)start); | |
533 | exit(U_ILLEGAL_ARGUMENT_ERROR); | |
534 | } | |
535 | /* if savedTrieWord == trieWord .. fall through and set the value */ | |
536 | } | |
537 | if(!utrie_set32(sprepTrie,start,trieWord)){ | |
538 | fprintf(stderr,"Could not set the value for code point \\U%08X.\n", (int)start); | |
539 | exit(U_ILLEGAL_ARGUMENT_ERROR); | |
540 | } | |
541 | }else{ | |
542 | if(!utrie_setRange32(sprepTrie, start, end+1, trieWord, FALSE)){ | |
543 | fprintf(stderr,"Value for certain codepoint already set.\n"); | |
544 | exit(U_ILLEGAL_CHAR_FOUND); | |
545 | } | |
546 | } | |
547 | ||
548 | } | |
549 | ||
550 | /* folding value: just store the offset (16 bits) if there is any non-0 entry */ | |
551 | static uint32_t U_CALLCONV | |
552 | getFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset) { | |
553 | uint32_t foldedValue, value; | |
554 | UChar32 limit=0; | |
555 | UBool inBlockZero; | |
556 | ||
557 | foldedValue=0; | |
558 | ||
559 | limit=start+0x400; | |
560 | while(start<limit) { | |
561 | value=utrie_get32(trie, start, &inBlockZero); | |
562 | if(inBlockZero) { | |
563 | start+=UTRIE_DATA_BLOCK_LENGTH; | |
564 | } else if(value!=0) { | |
565 | return (uint32_t)offset; | |
566 | } else { | |
567 | ++start; | |
568 | } | |
569 | } | |
570 | return 0; | |
571 | ||
572 | } | |
573 | ||
574 | #endif /* #if !UCONFIG_NO_IDNA */ | |
575 | ||
576 | extern void | |
73c04bcf | 577 | generateData(const char *dataDir, const char* bundleName) { |
374ca955 A |
578 | static uint8_t sprepTrieBlock[100000]; |
579 | ||
580 | UNewDataMemory *pData; | |
581 | UErrorCode errorCode=U_ZERO_ERROR; | |
582 | int32_t size, dataLength; | |
583 | char* fileName = (char*) uprv_malloc(uprv_strlen(bundleName) +100); | |
584 | ||
585 | #if UCONFIG_NO_IDNA | |
586 | ||
587 | size=0; | |
588 | ||
589 | #else | |
590 | ||
591 | int32_t sprepTrieSize; | |
592 | ||
593 | /* sort and add mapping data */ | |
594 | storeMappingData(); | |
595 | ||
596 | sprepTrieSize=utrie_serialize(sprepTrie, sprepTrieBlock, sizeof(sprepTrieBlock), getFoldedValue, TRUE, &errorCode); | |
597 | if(U_FAILURE(errorCode)) { | |
598 | fprintf(stderr, "error: utrie_serialize(sprep trie) failed, %s\n", u_errorName(errorCode)); | |
599 | exit(errorCode); | |
600 | } | |
601 | ||
602 | size = sprepTrieSize + mappingDataCapacity*U_SIZEOF_UCHAR + sizeof(indexes); | |
603 | if(beVerbose) { | |
604 | printf("size of sprep trie %5u bytes\n", (int)sprepTrieSize); | |
605 | printf("size of " U_ICUDATA_NAME "_%s." DATA_TYPE " contents: %ld bytes\n", bundleName,(long)size); | |
606 | printf("size of mapping data array %5u bytes\n",(int)mappingDataCapacity * U_SIZEOF_UCHAR); | |
607 | printf("Number of code units in mappingData (currentIndex) are: %i \n", currentIndex); | |
608 | printf("Maximum length of the mapping string is : %i \n", (int)maxLength); | |
609 | } | |
610 | ||
611 | #endif | |
612 | ||
73c04bcf | 613 | fileName[0]=0; |
374ca955 A |
614 | uprv_strcat(fileName,bundleName); |
615 | /* write the data */ | |
616 | pData=udata_create(dataDir, DATA_TYPE, fileName, &dataInfo, | |
617 | haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode); | |
618 | if(U_FAILURE(errorCode)) { | |
619 | fprintf(stderr, "gensprep: unable to create the output file, error %d\n", errorCode); | |
620 | exit(errorCode); | |
621 | } | |
622 | ||
623 | #if !UCONFIG_NO_IDNA | |
624 | ||
625 | indexes[_SPREP_INDEX_TRIE_SIZE]=sprepTrieSize; | |
626 | indexes[_SPREP_INDEX_MAPPING_DATA_SIZE]=mappingDataCapacity*U_SIZEOF_UCHAR; | |
627 | ||
628 | udata_writeBlock(pData, indexes, sizeof(indexes)); | |
629 | udata_writeBlock(pData, sprepTrieBlock, sprepTrieSize); | |
630 | udata_writeBlock(pData, mappingData, indexes[_SPREP_INDEX_MAPPING_DATA_SIZE]); | |
631 | ||
632 | ||
633 | #endif | |
634 | ||
635 | /* finish up */ | |
636 | dataLength=udata_finish(pData, &errorCode); | |
637 | if(U_FAILURE(errorCode)) { | |
638 | fprintf(stderr, "gensprep: error %d writing the output file\n", errorCode); | |
639 | exit(errorCode); | |
640 | } | |
641 | ||
642 | if(dataLength!=size) { | |
643 | fprintf(stderr, "gensprep error: data length %ld != calculated size %ld\n", | |
644 | (long)dataLength, (long)size); | |
645 | exit(U_INTERNAL_PROGRAM_ERROR); | |
646 | } | |
647 | ||
648 | #if !UCONFIG_NO_IDNA | |
649 | /* done with writing the data .. close the hashtable */ | |
650 | uhash_close(hashTable); | |
651 | #endif | |
652 | } | |
653 | ||
654 | #if !UCONFIG_NO_IDNA | |
655 | ||
656 | extern void | |
657 | cleanUpData(void) { | |
658 | ||
659 | utrie_close(sprepTrie); | |
660 | uprv_free(sprepTrie); | |
661 | } | |
662 | ||
663 | #endif /* #if !UCONFIG_NO_IDNA */ | |
664 | ||
665 | /* | |
666 | * Hey, Emacs, please set the following: | |
667 | * | |
668 | * Local Variables: | |
669 | * indent-tabs-mode: nil | |
670 | * End: | |
671 | * | |
672 | */ |