2 *******************************************************************************
4 * Copyright (C) 1999-2004, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 2003-02-06
14 * created by: Ram Viswanadha
20 #include "unicode/utypes.h"
24 #include "unicode/udata.h"
32 # pragma warning(disable: 4100)
35 #define DO_DEBUG_OUT 0
39 * StringPrep profile file format ------------------------------------
41 * The file format prepared and written here contains a 16-bit trie and a mapping table.
43 * Before the data contents described below, there are the headers required by
44 * the udata API for loading ICU data. Especially, a UDataInfo structure
45 * precedes the actual data. It contains platform properties values and the
46 * file format version.
48 * The following is a description of format version 2.
52 * The content is a parsed, binary form of RFC3454 and possibly
53 * NormalizationCorrections.txt depending on the options specified on the profile.
55 * Any Unicode code point from 0 to 0x10ffff can be looked up to get
56 * the trie-word, if any, for that code point. This means that the input
57 * to the lookup are 21-bit unsigned integers, with not all of the
60 * *.spp files customarily begin with a UDataInfo structure, see udata.h and .c.
61 * After that there are the following structures:
63 * int32_t indexes[_SPREP_INDEX_TOP]; -- _SPREP_INDEX_TOP=16, see enum in sprpimpl.h file
65 * UTrie stringPrepTrie; -- size in bytes=indexes[_SPREP_INDEX_TRIE_SIZE]
67 * uint16_t mappingTable[]; -- Contains the sequence of code units that the code point maps to
68 * size in bytes = indexes[_SPREP_INDEX_MAPPING_DATA_SIZE]
70 * The indexes array contains the following values:
71 * indexes[_SPREP_INDEX_TRIE_SIZE] -- The size of the StringPrep trie in bytes
72 * indexes[_SPREP_INDEX_MAPPING_DATA_SIZE] -- The size of the mappingTable in bytes
73 * indexes[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION] -- The index of Unicode version of last entry in NormalizationCorrections.txt
74 * indexes[_SPREP_ONE_UCHAR_MAPPING_INDEX_START] -- The starting index of 1 UChar mapping index in the mapping table
75 * indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START] -- The starting index of 2 UChars mapping index in the mapping table
76 * indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START] -- The starting index of 3 UChars mapping index in the mapping table
77 * indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START] -- The starting index of 4 UChars mapping index in the mapping table
78 * indexes[_SPREP_OPTIONS] -- Bit set of options to turn on in the profile, e.g: USPREP_NORMALIZATION_ON, USPREP_CHECK_BIDI_ON
83 * The StringPrep trie is a 16-bit trie that contains data for the profile.
84 * Each code point is associated with a value (trie-word) in the trie.
86 * - structure of data words from the trie
88 * i) A value greater than or equal to _SPREP_TYPE_THRESHOLD (0xFFF0)
89 * represents the type associated with the code point
90 * if(trieWord >= _SPREP_TYPE_THRESHOLD){
91 * type = trieWord - 0xFFF0;
98 * ii) A value less than _SPREP_TYPE_THRESHOLD means the type is USPREP_MAP and
99 * contains distribution described below
101 * 0 - ON : The code point is prohibited (USPREP_PROHIBITED). This is to allow for code points that are both prohibited and mapped.
102 * 1 - ON : The value in the next 14 bits is an index into the mapping table
103 * OFF: The value in the next 14 bits is a delta value from the code point
104 * 2..15 - Contains data as described by bit 1. If all bits are set
105 * (value = _SPREP_MAX_INDEX_VALUE) then the type is USPREP_DELETE
109 * The data in mapping table is sorted according to the length of the mapping sequence.
110 * If the type of the code point is USPREP_MAP and value in trie word is an index, the index
111 * is compared with start indexes of sequence length start to figure out the length according to
112 * the following algorithm:
114 * if( index >= indexes[_SPREP_ONE_UCHAR_MAPPING_INDEX_START] &&
115 * index < indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START]){
117 * }else if(index >= indexes[_SPREP_TWO_UCHARS_MAPPING_INDEX_START] &&
118 * index < indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START]){
120 * }else if(index >= indexes[_SPREP_THREE_UCHARS_MAPPING_INDEX_START] &&
121 * index < indexes[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START]){
124 * // The first position in the mapping table contains the length
126 * length = mappingTable[index++];
132 /* file data ---------------------------------------------------------------- */
133 /* indexes[] value names */
137 /* dummy UDataInfo cf. udata.h */
138 static UDataInfo dataInfo
= {
147 { 0, 0, 0, 0 }, /* dummy dataFormat */
148 { 0, 0, 0, 0 }, /* dummy formatVersion */
149 { 0, 0, 0, 0 } /* dummy dataVersion */
/* Header values written in front of the trie and the mapping table; slots are named by the _SPREP_* index constants. */
154 static int32_t indexes
[_SPREP_INDEX_TOP
]={ 0 };
/* Mapping table of 16-bit code units; allocated and filled when the queued mappings are written out. */
156 static uint16_t* mappingData
= NULL
;
/* Number of 16-bit units reserved for mappingData; storeMapping() grows it for every mapping it queues. */
157 static int32_t mappingDataCapacity
= 0; /* we skip the first index in mapping data */
158 static int16_t currentIndex
= 0; /* the current write index into mappingData (also the index value encoded into trie words) */
159 static int32_t maxLength
= 0; /* length of the longest mapping queued by storeMapping(); reported by generateData() */
162 /* UDataInfo cf. udata.h */
163 static UDataInfo dataInfo
={
172 { 0x53, 0x50, 0x52, 0x50 }, /* dataFormat="SPRP" */
173 { 3, 2, UTRIE_SHIFT
, UTRIE_INDEX_SHIFT
}, /* formatVersion */
174 { 3, 2, 0, 0 } /* dataVersion (Unicode version) */
177 setUnicodeVersion(const char *v
) {
178 UVersionInfo version
;
179 u_versionFromString(version
, v
);
180 uprv_memcpy(dataInfo
.dataVersion
, version
, 4);
184 setUnicodeVersionNC(UVersionInfo version
){
185 uint32_t univer
= version
[0] << 24;
186 univer
+= version
[1] << 16;
187 univer
+= version
[2] << 8;
188 univer
+= version
[3];
189 indexes
[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION
] = univer
;
191 static UNewTrie
*sprepTrie
;
193 #define MAX_DATA_LENGTH 11500
196 #define SPREP_DELTA_RANGE_POSITIVE_LIMIT 8191
197 #define SPREP_DELTA_RANGE_NEGATIVE_LIMIT -8192
203 sprepTrie
= (UNewTrie
*)uprv_malloc(sizeof(UNewTrie
));
204 uprv_memset(sprepTrie
, 0, sizeof(UNewTrie
));
206 /* initialize the trie */
207 if(NULL
==utrie_open(sprepTrie
, NULL
, MAX_DATA_LENGTH
, 0, 0, FALSE
)) {
208 fprintf(stderr
, "error: failed to initialize tries\n");
209 exit(U_MEMORY_ALLOCATION_ERROR
);
213 static UHashtable
* hashTable
= NULL
;
216 typedef struct ValueStruct
{
219 UStringPrepType type
;
222 /* Callback for deleting the value from the hashtable */
223 static void U_CALLCONV
valueDeleter(void* obj
){
224 ValueStruct
* value
= (ValueStruct
*) obj
;
225 uprv_free(value
->mapping
);
229 /* Callback for hashing the entry */
230 static int32_t U_CALLCONV
hashEntry(const UHashTok parm
) {
234 /* Callback for comparing two entries */
235 static UBool U_CALLCONV
compareEntries(const UHashTok p1
, const UHashTok p2
) {
236 return (UBool
)(p1
.integer
!= p2
.integer
);
244 const UHashElement
* element
= NULL
;
245 ValueStruct
* value
= NULL
;
246 int32_t codepoint
= 0;
247 int32_t elementCount
= uhash_count(hashTable
);
248 int32_t writtenElementCount
= 0;
249 int32_t mappingLength
= 1; /* minimum mapping length */
250 int32_t oldMappingLength
= 0;
251 uint16_t trieWord
=0;
252 int32_t limitIndex
= 0;
254 /*initialize the mapping data */
255 mappingData
= (uint16_t*) uprv_malloc(U_SIZEOF_UCHAR
* (mappingDataCapacity
));
257 uprv_memset(mappingData
,0,U_SIZEOF_UCHAR
* mappingDataCapacity
);
259 while(writtenElementCount
< elementCount
){
261 while( (element
= uhash_nextElement(hashTable
, &pos
))!=NULL
){
263 codepoint
= element
->key
.integer
;
264 value
= (ValueStruct
*)element
->value
.pointer
;
266 /* store the start of indexes */
267 if(oldMappingLength
!= mappingLength
){
268 /* Assume that index[] is used according to the enums defined */
269 if(oldMappingLength
<=_SPREP_MAX_INDEX_TOP_LENGTH
){
270 indexes
[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION
+mappingLength
] = currentIndex
;
272 if(oldMappingLength
<= _SPREP_MAX_INDEX_TOP_LENGTH
&&
273 mappingLength
== _SPREP_MAX_INDEX_TOP_LENGTH
+1){
275 limitIndex
= currentIndex
;
278 oldMappingLength
= mappingLength
;
281 if(value
->length
== mappingLength
){
282 uint32_t savedTrieWord
= 0;
283 trieWord
= currentIndex
<< 2;
284 /* turn on the 2nd bit to signal that the following bits contain an index */
287 if(trieWord
> _SPREP_TYPE_THRESHOLD
){
288 fprintf(stderr
,"trieWord cannot contain value greater than 0x%04X.\n",_SPREP_TYPE_THRESHOLD
);
289 exit(U_ILLEGAL_CHAR_FOUND
);
291 /* figure out if the code point has type already stored */
292 savedTrieWord
= utrie_get32(sprepTrie
,codepoint
,NULL
);
293 if(savedTrieWord
!=0){
294 if((savedTrieWord
- _SPREP_TYPE_THRESHOLD
) == USPREP_PROHIBITED
){
295 /* turn on the first bit in trie word */
299 * the codepoint has value something other than prohibited
300 * and a mapping .. error!
302 fprintf(stderr
,"Type for codepoint \\U%08X already set!.\n", (int)codepoint
);
303 exit(U_ILLEGAL_ARGUMENT_ERROR
);
307 /* now set the value in the trie */
308 if(!utrie_set32(sprepTrie
,codepoint
,trieWord
)){
309 fprintf(stderr
,"Could not set the value for code point.\n");
310 exit(U_ILLEGAL_ARGUMENT_ERROR
);
313 /* written the trie word for the codepoint... increment the count*/
314 writtenElementCount
++;
316 /* sanity check are we exceeding the max number allowed */
317 if(currentIndex
+value
->length
+1 > _SPREP_MAX_INDEX_VALUE
){
318 fprintf(stderr
, "Too many entries in the mapping table %i. Maximum allowed is %i\n", currentIndex
+value
->length
, _SPREP_MAX_INDEX_VALUE
);
319 exit(U_INDEX_OUTOFBOUNDS_ERROR
);
322 /* copy the mapping data */
323 if(currentIndex
+value
->length
+1 <= mappingDataCapacity
){
324 /* write the length */
325 if(mappingLength
> _SPREP_MAX_INDEX_TOP_LENGTH
){
326 /* the cast here is safe since we do not expect the length to be > 65535 */
327 mappingData
[currentIndex
++] = (uint16_t) mappingLength
;
329 /* copy the contents to mappingData array */
330 uprv_memmove(mappingData
+currentIndex
, value
->mapping
, value
->length
*U_SIZEOF_UCHAR
);
331 currentIndex
+= value
->length
;
335 UChar
* newMappingData
= (uint16_t*) uprv_malloc(U_SIZEOF_UCHAR
* mappingDataCapacity
*2);
336 if(newMappingData
== NULL
){
337 fprintf(stderr
, "Could not realloc the mapping data!\n");
338 exit(U_MEMORY_ALLOCATION_ERROR
);
340 uprv_memmove(newMappingData
, mappingData
, U_SIZEOF_UCHAR
* mappingDataCapacity
);
341 mappingDataCapacity
*= 2;
342 uprv_free(mappingData
);
343 mappingData
= newMappingData
;
344 /* write the length */
345 if(mappingLength
> _SPREP_MAX_INDEX_TOP_LENGTH
){
346 /* the cast here is safe since we do not expect the length to be > 65535 */
347 mappingData
[currentIndex
++] = (uint16_t) mappingLength
;
349 /* continue copying */
350 uprv_memmove(mappingData
+currentIndex
, value
->mapping
, value
->length
*U_SIZEOF_UCHAR
);
351 currentIndex
+= value
->length
;
359 /* set the last length for range check */
360 if(mappingLength
<= _SPREP_MAX_INDEX_TOP_LENGTH
){
361 indexes
[_SPREP_NORM_CORRECTNS_LAST_UNI_VERSION
+mappingLength
] = currentIndex
+1;
363 indexes
[_SPREP_FOUR_UCHARS_MAPPING_INDEX_START
] = limitIndex
;
368 extern void setOptions(int32_t options
){
369 indexes
[_SPREP_OPTIONS
] = options
;
372 storeMapping(uint32_t codepoint
, uint32_t* mapping
,int32_t length
,
373 UStringPrepType type
, UErrorCode
* status
){
377 int16_t adjustedLen
=0, i
;
378 uint16_t trieWord
= 0;
379 ValueStruct
*value
= NULL
;
380 uint32_t savedTrieWord
= 0;
382 /* initialize the hashtable */
384 hashTable
= uhash_open(hashEntry
, compareEntries
, status
);
385 uhash_setValueDeleter(hashTable
, valueDeleter
);
388 /* figure out if the code point has type already stored */
389 savedTrieWord
= utrie_get32(sprepTrie
,codepoint
,NULL
);
390 if(savedTrieWord
!=0){
391 if((savedTrieWord
- _SPREP_TYPE_THRESHOLD
) == USPREP_PROHIBITED
){
392 /* turn on the first bit in trie word */
396 * the codepoint has value something other than prohibited
397 * and a mapping .. error!
399 fprintf(stderr
,"Type for codepoint \\U%08X already set!.\n", (int)codepoint
);
400 exit(U_ILLEGAL_ARGUMENT_ERROR
);
404 /* figure out the real length */
405 for(i
=0; i
<length
; i
++){
406 if(mapping
[i
] > 0xFFFF){
413 if(adjustedLen
== 0){
414 trieWord
= (uint16_t)(_SPREP_MAX_INDEX_VALUE
<< 2);
415 /* make sure that the value of trieWord is less than the threshold */
416 if(trieWord
< _SPREP_TYPE_THRESHOLD
){
417 /* now set the value in the trie */
418 if(!utrie_set32(sprepTrie
,codepoint
,trieWord
)){
419 fprintf(stderr
,"Could not set the value for code point.\n");
420 exit(U_ILLEGAL_ARGUMENT_ERROR
);
422 /* value is set so just return */
425 fprintf(stderr
,"trieWord cannot contain value greater than threshold 0x%04X.\n",_SPREP_TYPE_THRESHOLD
);
426 exit(U_ILLEGAL_CHAR_FOUND
);
430 if(adjustedLen
== 1){
431 /* calculate the delta */
432 int16_t delta
= (int16_t)((int32_t)codepoint
- (int16_t) mapping
[0]);
433 if(delta
>= SPREP_DELTA_RANGE_NEGATIVE_LIMIT
&& delta
<= SPREP_DELTA_RANGE_POSITIVE_LIMIT
){
435 trieWord
= delta
<< 2;
438 /* make sure that the second bit is OFF */
439 if((trieWord
& 0x02) != 0 ){
440 fprintf(stderr
,"The second bit in the trie word is not zero while storing a delta.\n");
441 exit(U_INTERNAL_PROGRAM_ERROR
);
443 /* make sure that the value of trieWord is less than the threshold */
444 if(trieWord
< _SPREP_TYPE_THRESHOLD
){
445 /* now set the value in the trie */
446 if(!utrie_set32(sprepTrie
,codepoint
,trieWord
)){
447 fprintf(stderr
,"Could not set the value for code point.\n");
448 exit(U_ILLEGAL_ARGUMENT_ERROR
);
450 /* value is set so just return */
455 * if the delta is not in the given range or if the trieWord is larger than the threshold
456 * just fall through for storing the mapping in the mapping table
460 map
= (UChar
*) uprv_malloc(U_SIZEOF_UCHAR
* (adjustedLen
+1));
461 uprv_memset(map
,0,U_SIZEOF_UCHAR
* (adjustedLen
+1));
466 if(mapping
[i
] <= 0xFFFF){
467 map
[i
] = (uint16_t)mapping
[i
];
469 map
[i
] = UTF16_LEAD(mapping
[i
]);
470 map
[i
+1] = UTF16_TRAIL(mapping
[i
]);
475 value
= (ValueStruct
*) uprv_malloc(sizeof(ValueStruct
));
476 value
->mapping
= map
;
478 value
->length
= adjustedLen
;
479 if(value
->length
> _SPREP_MAX_INDEX_TOP_LENGTH
){
480 mappingDataCapacity
++;
482 if(maxLength
< value
->length
){
483 maxLength
= value
->length
;
485 uhash_iput(hashTable
,codepoint
,value
,status
);
486 mappingDataCapacity
+= adjustedLen
;
488 if(U_FAILURE(*status
)){
489 fprintf(stderr
, "Failed to put entries into the hastable. Error: %s\n", u_errorName(*status
));
496 storeRange(uint32_t start
, uint32_t end
, UStringPrepType type
,UErrorCode
* status
){
497 uint16_t trieWord
= 0;
499 trieWord
+= (_SPREP_TYPE_THRESHOLD
+ type
); /* the top 4 bits contain the value */
500 if(trieWord
> 0xFFFF){
501 fprintf(stderr
,"trieWord cannot contain value greater than 0xFFFF.\n");
502 exit(U_ILLEGAL_CHAR_FOUND
);
505 uint32_t savedTrieWord
= utrie_get32(sprepTrie
, start
, NULL
);
507 if(savedTrieWord
< _SPREP_TYPE_THRESHOLD
&& type
== USPREP_PROHIBITED
){
509 * A mapping is stored in the trie word
510 * and the only other possible type that a
511 * code point can have is USPREP_PROHIBITED
515 /* turn on the 0th bit in the savedTrieWord */
516 savedTrieWord
+= 0x01;
518 /* the downcast is safe since we only save 16 bit values */
519 trieWord
= (uint16_t)savedTrieWord
;
521 /* make sure that the value of trieWord is less than the threshold */
522 if(trieWord
< _SPREP_TYPE_THRESHOLD
){
523 /* now set the value in the trie */
524 if(!utrie_set32(sprepTrie
,start
,trieWord
)){
525 fprintf(stderr
,"Could not set the value for code point.\n");
526 exit(U_ILLEGAL_ARGUMENT_ERROR
);
528 /* value is set so just return */
531 fprintf(stderr
,"trieWord cannot contain value greater than threshold 0x%04X.\n",_SPREP_TYPE_THRESHOLD
);
532 exit(U_ILLEGAL_CHAR_FOUND
);
535 }else if(savedTrieWord
!= trieWord
){
536 fprintf(stderr
,"Value for codepoint \\U%08X already set!.\n", (int)start
);
537 exit(U_ILLEGAL_ARGUMENT_ERROR
);
539 /* if savedTrieWord == trieWord .. fall through and set the value */
541 if(!utrie_set32(sprepTrie
,start
,trieWord
)){
542 fprintf(stderr
,"Could not set the value for code point \\U%08X.\n", (int)start
);
543 exit(U_ILLEGAL_ARGUMENT_ERROR
);
546 if(!utrie_setRange32(sprepTrie
, start
, end
+1, trieWord
, FALSE
)){
547 fprintf(stderr
,"Value for certain codepoint already set.\n");
548 exit(U_ILLEGAL_CHAR_FOUND
);
554 /* folding value: just store the offset (16 bits) if there is any non-0 entry */
555 static uint32_t U_CALLCONV
556 getFoldedValue(UNewTrie
*trie
, UChar32 start
, int32_t offset
) {
557 uint32_t foldedValue
, value
;
565 value
=utrie_get32(trie
, start
, &inBlockZero
);
567 start
+=UTRIE_DATA_BLOCK_LENGTH
;
568 } else if(value
!=0) {
569 return (uint32_t)offset
;
578 #endif /* #if !UCONFIG_NO_IDNA */
581 generateData(const char *dataDir
, const char *packageName
, const char* bundleName
) {
582 static uint8_t sprepTrieBlock
[100000];
584 UNewDataMemory
*pData
;
585 UErrorCode errorCode
=U_ZERO_ERROR
;
586 int32_t size
, dataLength
;
587 char* fileName
= (char*) uprv_malloc(uprv_strlen(bundleName
) +100);
595 int32_t sprepTrieSize
;
597 /* sort and add mapping data */
600 sprepTrieSize
=utrie_serialize(sprepTrie
, sprepTrieBlock
, sizeof(sprepTrieBlock
), getFoldedValue
, TRUE
, &errorCode
);
601 if(U_FAILURE(errorCode
)) {
602 fprintf(stderr
, "error: utrie_serialize(sprep trie) failed, %s\n", u_errorName(errorCode
));
606 size
= sprepTrieSize
+ mappingDataCapacity
*U_SIZEOF_UCHAR
+ sizeof(indexes
);
608 printf("size of sprep trie %5u bytes\n", (int)sprepTrieSize
);
609 printf("size of " U_ICUDATA_NAME
"_%s." DATA_TYPE
" contents: %ld bytes\n", bundleName
,(long)size
);
610 printf("size of mapping data array %5u bytes\n",(int)mappingDataCapacity
* U_SIZEOF_UCHAR
);
611 printf("Number of code units in mappingData (currentIndex) are: %i \n", currentIndex
);
612 printf("Maximum length of the mapping string is : %i \n", (int)maxLength
);
617 if(packageName
!= NULL
) {
618 uprv_strcpy(fileName
,packageName
);
619 uprv_strcat(fileName
,"_");
623 uprv_strcat(fileName
,bundleName
);
625 pData
=udata_create(dataDir
, DATA_TYPE
, fileName
, &dataInfo
,
626 haveCopyright
? U_COPYRIGHT_STRING
: NULL
, &errorCode
);
627 if(U_FAILURE(errorCode
)) {
628 fprintf(stderr
, "gensprep: unable to create the output file, error %d\n", errorCode
);
634 indexes
[_SPREP_INDEX_TRIE_SIZE
]=sprepTrieSize
;
635 indexes
[_SPREP_INDEX_MAPPING_DATA_SIZE
]=mappingDataCapacity
*U_SIZEOF_UCHAR
;
637 udata_writeBlock(pData
, indexes
, sizeof(indexes
));
638 udata_writeBlock(pData
, sprepTrieBlock
, sprepTrieSize
);
639 udata_writeBlock(pData
, mappingData
, indexes
[_SPREP_INDEX_MAPPING_DATA_SIZE
]);
645 dataLength
=udata_finish(pData
, &errorCode
);
646 if(U_FAILURE(errorCode
)) {
647 fprintf(stderr
, "gensprep: error %d writing the output file\n", errorCode
);
651 if(dataLength
!=size
) {
652 fprintf(stderr
, "gensprep error: data length %ld != calculated size %ld\n",
653 (long)dataLength
, (long)size
);
654 exit(U_INTERNAL_PROGRAM_ERROR
);
658 /* done with writing the data .. close the hashtable */
659 uhash_close(hashTable
);
668 utrie_close(sprepTrie
);
669 uprv_free(sprepTrie
);
672 #endif /* #if !UCONFIG_NO_IDNA */
675 * Hey, Emacs, please set the following:
678 * indent-tabs-mode: nil