2 ******************************************************************************
4 * Copyright (C) 1999-2006, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 1999oct04
14 * created by: Markus W. Scherer
17 #include "unicode/utypes.h"
18 #include "unicode/putil.h"
19 #include "unicode/uchar.h"
20 #include "unicode/udata.h"
29 /* prototypes ------------------------------------------------------------- */
/* Element count of a true array (not a pointer): total size over the size of one element. */
#define LENGTHOF(array) (sizeof(array) / sizeof(*(array)))
/* Name and type strings handed to udata_openChoice() when the names data is loaded. */
static const char DATA_NAME[] = "unames";
static const char DATA_TYPE[] = "icu";
/*
 * LINES_PER_GROUP is the number of name lines stored per group;
 * GROUP_MASK extracts a code point's line index within its group.
 */
#define LINES_PER_GROUP (1UL << GROUP_SHIFT)
#define GROUP_MASK (LINES_PER_GROUP - 1)
42 offsetHigh
, offsetLow
; /* avoid padding */
47 uint8_t type
, variant
;
52 uint32_t tokenStringOffset
, groupsOffset
, groupStringOffset
, algNamesOffset
;
56 const char *otherName
;
60 #define DO_FIND_NAME NULL
62 static UDataMemory
*uCharNamesData
=NULL
;
63 static UCharNames
*uCharNames
=NULL
;
64 static UErrorCode gLoadErrorCode
=U_ZERO_ERROR
;
/*
 * Maximum length of character names (regular & 1.0).
 * Zero means the value has not been calculated yet.
 */
static int32_t gMaxNameLength = 0;
/*
 * Set of chars used in character names (regular & 1.0).
 * Chars are platform-dependent (can be EBCDIC).
 * Stored as a bit set: 8 words of 32 bits, one bit per byte value.
 */
static uint32_t gNameSet[8] = { 0 };
/*
 * Extended category values that follow the standard general categories.
 * getCharCat() returns them for noncharacters and for lead/trail surrogates,
 * and they index the extra entries at the end of charCatNames[].
 *
 * Every replacement list is fully parenthesized so the macros keep their
 * value when used inside a larger expression (e.g. multiplied or shifted).
 */
#define U_NONCHARACTER_CODE_POINT (U_CHAR_CATEGORY_COUNT)
#define U_LEAD_SURROGATE (U_CHAR_CATEGORY_COUNT + 1)
#define U_TRAIL_SURROGATE (U_CHAR_CATEGORY_COUNT + 2)

/* Total number of categories, standard plus the three extended ones above. */
#define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3)
83 static const char * const charCatNames
[U_CHAR_EXTENDED_CATEGORY_COUNT
] = {
92 "combining spacing mark",
93 "decimal digit number",
98 "paragraph separator",
106 "connector punctuation",
112 "initial punctuation",
119 /* implementation ----------------------------------------------------------- */
121 static UBool U_CALLCONV
unames_cleanup(void)
124 udata_close(uCharNamesData
);
125 uCharNamesData
= NULL
;
134 static UBool U_CALLCONV
135 isAcceptable(void *context
,
136 const char *type
, const char *name
,
137 const UDataInfo
*pInfo
) {
140 pInfo
->isBigEndian
==U_IS_BIG_ENDIAN
&&
141 pInfo
->charsetFamily
==U_CHARSET_FAMILY
&&
142 pInfo
->dataFormat
[0]==0x75 && /* dataFormat="unam" */
143 pInfo
->dataFormat
[1]==0x6e &&
144 pInfo
->dataFormat
[2]==0x61 &&
145 pInfo
->dataFormat
[3]==0x6d &&
146 pInfo
->formatVersion
[0]==1);
150 isDataLoaded(UErrorCode
*pErrorCode
) {
151 /* load UCharNames from file if necessary */
154 /* do this because double-checked locking is broken */
155 UMTX_CHECK(NULL
, (uCharNames
!=NULL
), isCached
);
161 /* check error code from previous attempt */
162 if(U_FAILURE(gLoadErrorCode
)) {
163 *pErrorCode
=gLoadErrorCode
;
167 /* open the data outside the mutex block */
168 data
=udata_openChoice(NULL
, DATA_TYPE
, DATA_NAME
, isAcceptable
, NULL
, pErrorCode
);
169 if(U_FAILURE(*pErrorCode
)) {
170 gLoadErrorCode
=*pErrorCode
;
174 names
=(UCharNames
*)udata_getMemory(data
);
176 /* in the mutex block, set the data for this process */
179 if(uCharNames
==NULL
) {
184 ucln_common_registerCleanup(UCLN_COMMON_UNAMES
, unames_cleanup
);
189 /* if a different thread set it first, then close the extra data */
191 udata_close(data
); /* NULL if it was set correctly */
197 #define WRITE_CHAR(buffer, bufferLength, bufferPos, c) { \
198 if((bufferLength)>0) { \
/*
 * Internal name choice for the ISO comment field. It equals
 * U_CHAR_NAME_CHOICE_COUNT, a value the public entry points reject as a
 * name choice; u_getISOComment() passes it to getName().
 */
#define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT
208 * Important: expandName() and compareName() are almost the same -
209 * apply fixes to both.
211 * UnicodeData.txt uses ';' as a field separator, so no
212 * field can contain ';' as part of its contents.
213 * In unames.dat, it is marked as token[';']==-1 only if the
214 * semicolon is used in the data file - which is iff we
215 * have Unicode 1.0 names or ISO comments.
216 * So, it will be token[';']==-1 if we store U1.0 names/ISO comments
217 * although we know that it will never be part of a name.
220 expandName(UCharNames
*names
,
221 const uint8_t *name
, uint16_t nameLength
, UCharNameChoice nameChoice
,
222 char *buffer
, uint16_t bufferLength
) {
223 uint16_t *tokens
=(uint16_t *)names
+8;
224 uint16_t token
, tokenCount
=*tokens
++, bufferPos
=0;
225 uint8_t *tokenStrings
=(uint8_t *)names
+names
->tokenStringOffset
;
228 if(nameChoice
==U_UNICODE_10_CHAR_NAME
|| nameChoice
==U_ISO_COMMENT
) {
230 * skip the modern name if it is not requested _and_
231 * if the semicolon byte value is a character, not a token number
233 if((uint8_t)';'>=tokenCount
|| tokens
[(uint8_t)';']==(uint16_t)(-1)) {
234 while(nameLength
>0) {
240 if(nameChoice
==U_ISO_COMMENT
) {
241 /* skip the Unicode 1.0 name as well to get the ISO comment */
242 while(nameLength
>0) {
251 * the semicolon byte value is a token number, therefore
252 * only modern names are stored in unames.dat and there is no
253 * such requested Unicode 1.0 name here
259 /* write each letter directly, and write a token word per token */
260 while(nameLength
>0) {
266 /* implicit letter */
267 WRITE_CHAR(buffer
, bufferLength
, bufferPos
, c
);
274 if(token
==(uint16_t)(-2)) {
275 /* this is a lead byte for a double-byte token */
276 token
=tokens
[c
<<8|*name
++];
279 if(token
==(uint16_t)(-1)) {
281 /* explicit letter */
282 WRITE_CHAR(buffer
, bufferLength
, bufferPos
, c
);
284 /* stop, but skip the semicolon if we are seeking
285 extended names and there was no 2.0 name but there
287 if(!bufferPos
&& nameChoice
== U_EXTENDED_CHAR_NAME
) {
288 if ((uint8_t)';'>=tokenCount
|| tokens
[(uint8_t)';']==(uint16_t)(-1)) {
296 /* write token word */
297 uint8_t *tokenString
=tokenStrings
+token
;
298 while((c
=*tokenString
++)!=0) {
299 WRITE_CHAR(buffer
, bufferLength
, bufferPos
, c
);
314 * compareName() is almost the same as expandName() except that it compares
315 * the currently expanded name to an input name.
316 * It returns the match/no match result as soon as possible.
319 compareName(UCharNames
*names
,
320 const uint8_t *name
, uint16_t nameLength
, UCharNameChoice nameChoice
,
321 const char *otherName
) {
322 uint16_t *tokens
=(uint16_t *)names
+8;
323 uint16_t token
, tokenCount
=*tokens
++;
324 uint8_t *tokenStrings
=(uint8_t *)names
+names
->tokenStringOffset
;
326 const char *origOtherName
= otherName
;
328 if(nameChoice
==U_UNICODE_10_CHAR_NAME
) {
330 * skip the modern name if it is not requested _and_
331 * if the semicolon byte value is a character, not a token number
333 if((uint8_t)';'>=tokenCount
|| tokens
[(uint8_t)';']==(uint16_t)(-1)) {
334 while(nameLength
>0) {
342 * the semicolon byte value is a token number, therefore
343 * only modern names are stored in unames.dat and there is no
344 * such requested Unicode 1.0 name here
350 /* compare each letter directly, and compare a token word per token */
351 while(nameLength
>0) {
357 /* implicit letter */
358 if((char)c
!=*otherName
++) {
367 if(token
==(uint16_t)(-2)) {
368 /* this is a lead byte for a double-byte token */
369 token
=tokens
[c
<<8|*name
++];
372 if(token
==(uint16_t)(-1)) {
374 /* explicit letter */
375 if((char)c
!=*otherName
++) {
379 /* stop, but skip the semicolon if we are seeking
380 extended names and there was no 2.0 name but there
382 if(otherName
== origOtherName
&& nameChoice
== U_EXTENDED_CHAR_NAME
) {
383 if ((uint8_t)';'>=tokenCount
|| tokens
[(uint8_t)';']==(uint16_t)(-1)) {
391 /* write token word */
392 uint8_t *tokenString
=tokenStrings
+token
;
393 while((c
=*tokenString
++)!=0) {
394 if((char)c
!=*otherName
++) {
402 /* complete match? */
403 return (UBool
)(*otherName
==0);
406 static uint8_t getCharCat(UChar32 cp
) {
409 if (UTF_IS_UNICODE_NONCHAR(cp
)) {
410 return U_NONCHARACTER_CODE_POINT
;
413 if ((cat
= u_charType(cp
)) == U_SURROGATE
) {
414 cat
= UTF_IS_LEAD(cp
) ? U_LEAD_SURROGATE
: U_TRAIL_SURROGATE
;
420 static const char *getCharCatName(UChar32 cp
) {
421 uint8_t cat
= getCharCat(cp
);
423 /* Return unknown if the table of names above is not up to
426 if (cat
>= LENGTHOF(charCatNames
)) {
429 return charCatNames
[cat
];
433 static uint16_t getExtName(uint32_t code
, char *buffer
, uint16_t bufferLength
) {
434 const char *catname
= getCharCatName(code
);
440 WRITE_CHAR(buffer
, bufferLength
, length
, '<');
441 while (catname
[length
- 1]) {
442 WRITE_CHAR(buffer
, bufferLength
, length
, catname
[length
- 1]);
444 WRITE_CHAR(buffer
, bufferLength
, length
, '-');
445 for (cp
= code
, ndigits
= 0; cp
; ++ndigits
, cp
>>= 4)
449 for (cp
= code
, i
= ndigits
; (cp
|| i
> 0) && bufferLength
; cp
>>= 4, bufferLength
--) {
450 uint8_t v
= (uint8_t)(cp
& 0xf);
451 buffer
[--i
] = (v
< 10 ? '0' + v
: 'A' + v
- 10);
455 WRITE_CHAR(buffer
, bufferLength
, length
, '>');
461 * getGroup() does a binary search for the group that contains the
462 * Unicode code point "code".
463 * The return value is always a valid Group* that may contain "code"
464 * or else is the highest group before "code".
465 * If the lowest group is after "code", then that one is returned.
468 getGroup(UCharNames
*names
, uint32_t code
) {
469 uint16_t groupMSB
=(uint16_t)(code
>>GROUP_SHIFT
),
471 limit
=*(uint16_t *)((char *)names
+names
->groupsOffset
),
473 Group
*groups
=(Group
*)((char *)names
+names
->groupsOffset
+2);
475 /* binary search for the group of names that contains the one for code */
476 while(start
<limit
-1) {
477 number
=(uint16_t)((start
+limit
)/2);
478 if(groupMSB
<groups
[number
].groupMSB
) {
485 /* return this regardless of whether it is an exact match */
490 * expandGroupLengths() reads a block of compressed lengths of 32 strings and
491 * expands them into offsets and lengths for each string.
492 * Lengths are stored with a variable-width encoding in consecutive nibbles:
493 * If a nibble<0xc, then it is the length itself (0=empty string).
494 * If a nibble>=0xc, then it forms a length value with the following nibble.
495 * Calculation see below.
496 * The offsets and lengths arrays must be at least 33 (one more) long because
497 * there is no check here at the end if the last nibble is still used.
499 static const uint8_t *
500 expandGroupLengths(const uint8_t *s
,
501 uint16_t offsets
[LINES_PER_GROUP
+1], uint16_t lengths
[LINES_PER_GROUP
+1]) {
502 /* read the lengths of the 32 strings in this group and get each string's offset */
503 uint16_t i
=0, offset
=0, length
=0;
506 /* all 32 lengths must be read to get the offset of the first group string */
507 while(i
<LINES_PER_GROUP
) {
510 /* read even nibble - MSBs of lengthByte */
512 /* double-nibble length spread across two bytes */
513 length
=(uint16_t)(((length
&0x3)<<4|lengthByte
>>4)+12);
515 } else if((lengthByte
/* &0xf0 */)>=0xc0) {
516 /* double-nibble length spread across this one byte */
517 length
=(uint16_t)((lengthByte
&0x3f)+12);
519 /* single-nibble length in MSBs */
520 length
=(uint16_t)(lengthByte
>>4);
530 /* read odd nibble - LSBs of lengthByte */
531 if((lengthByte
&0xf0)==0) {
532 /* this nibble was not consumed for a double-nibble length above */
535 /* single-nibble length in LSBs */
543 length
=0; /* prevent double-nibble detection in the next iteration */
547 /* now, s is at the first group string */
552 expandGroupName(UCharNames
*names
, Group
*group
,
553 uint16_t lineNumber
, UCharNameChoice nameChoice
,
554 char *buffer
, uint16_t bufferLength
) {
555 uint16_t offsets
[LINES_PER_GROUP
+2], lengths
[LINES_PER_GROUP
+2];
556 const uint8_t *s
=(uint8_t *)names
+names
->groupStringOffset
+
557 (group
->offsetHigh
<<16|group
->offsetLow
);
558 s
=expandGroupLengths(s
, offsets
, lengths
);
559 return expandName(names
, s
+offsets
[lineNumber
], lengths
[lineNumber
], nameChoice
,
560 buffer
, bufferLength
);
564 getName(UCharNames
*names
, uint32_t code
, UCharNameChoice nameChoice
,
565 char *buffer
, uint16_t bufferLength
) {
566 Group
*group
=getGroup(names
, code
);
567 if((uint16_t)(code
>>GROUP_SHIFT
)==group
->groupMSB
) {
568 return expandGroupName(names
, group
, (uint16_t)(code
&GROUP_MASK
), nameChoice
,
569 buffer
, bufferLength
);
571 /* group not found */
581 * enumGroupNames() enumerates all the names in a 32-group
582 * and either calls the enumerator function or finds a given input name.
585 enumGroupNames(UCharNames
*names
, Group
*group
,
586 UChar32 start
, UChar32 end
,
587 UEnumCharNamesFn
*fn
, void *context
,
588 UCharNameChoice nameChoice
) {
589 uint16_t offsets
[LINES_PER_GROUP
+2], lengths
[LINES_PER_GROUP
+2];
590 const uint8_t *s
=(uint8_t *)names
+names
->groupStringOffset
+
591 (group
->offsetHigh
<<16|group
->offsetLow
);
593 s
=expandGroupLengths(s
, offsets
, lengths
);
594 if(fn
!=DO_FIND_NAME
) {
599 length
=expandName(names
, s
+offsets
[start
&GROUP_MASK
], lengths
[start
&GROUP_MASK
], nameChoice
, buffer
, sizeof(buffer
));
600 if (!length
&& nameChoice
== U_EXTENDED_CHAR_NAME
) {
601 buffer
[length
= getExtName(start
, buffer
, sizeof(buffer
))] = 0;
603 /* here, we assume that the buffer is large enough */
605 if(!fn(context
, start
, nameChoice
, buffer
, length
)) {
612 const char *otherName
=((FindName
*)context
)->otherName
;
614 if(compareName(names
, s
+offsets
[start
&GROUP_MASK
], lengths
[start
&GROUP_MASK
], nameChoice
, otherName
)) {
615 ((FindName
*)context
)->code
=start
;
625 * enumExtNames enumerate extended names.
626 * It only needs to do it if it is called with a real function and not
627 * with the dummy DO_FIND_NAME, because u_charFromName() does a check
628 * for extended names by itself.
631 enumExtNames(UChar32 start
, UChar32 end
,
632 UEnumCharNamesFn
*fn
, void *context
)
634 if(fn
!=DO_FIND_NAME
) {
639 buffer
[length
= getExtName(start
, buffer
, sizeof(buffer
))] = 0;
640 /* here, we assume that the buffer is large enough */
642 if(!fn(context
, start
, U_EXTENDED_CHAR_NAME
, buffer
, length
)) {
654 enumNames(UCharNames
*names
,
655 UChar32 start
, UChar32 limit
,
656 UEnumCharNamesFn
*fn
, void *context
,
657 UCharNameChoice nameChoice
) {
658 uint16_t startGroupMSB
, endGroupMSB
, groupCount
;
659 Group
*group
, *groupLimit
;
661 startGroupMSB
=(uint16_t)(start
>>GROUP_SHIFT
);
662 endGroupMSB
=(uint16_t)((limit
-1)>>GROUP_SHIFT
);
664 /* find the group that contains start, or the highest before it */
665 group
=getGroup(names
, start
);
667 if(startGroupMSB
==endGroupMSB
) {
668 if(startGroupMSB
==group
->groupMSB
) {
669 /* if start and limit-1 are in the same group, then enumerate only in that one */
670 return enumGroupNames(names
, group
, start
, limit
-1, fn
, context
, nameChoice
);
673 groupCount
=*(uint16_t *)((char *)names
+names
->groupsOffset
);
674 groupLimit
=(Group
*)((char *)names
+names
->groupsOffset
+2)+groupCount
;
676 if(startGroupMSB
==group
->groupMSB
) {
677 /* enumerate characters in the partial start group */
678 if((start
&GROUP_MASK
)!=0) {
679 if(!enumGroupNames(names
, group
,
680 start
, ((UChar32
)startGroupMSB
<<GROUP_SHIFT
)+LINES_PER_GROUP
-1,
681 fn
, context
, nameChoice
)) {
684 ++group
; /* continue with the next group */
686 } else if(startGroupMSB
>group
->groupMSB
) {
687 /* make sure that we start enumerating with the first group after start */
688 if (group
+ 1 < groupLimit
&& (group
+ 1)->groupMSB
> startGroupMSB
&& nameChoice
== U_EXTENDED_CHAR_NAME
) {
689 UChar32 end
= (group
+ 1)->groupMSB
<< GROUP_SHIFT
;
693 if (!enumExtNames(start
, end
- 1, fn
, context
)) {
700 /* enumerate entire groups between the start- and end-groups */
701 while(group
<groupLimit
&& group
->groupMSB
<endGroupMSB
) {
702 start
=(UChar32
)group
->groupMSB
<<GROUP_SHIFT
;
703 if(!enumGroupNames(names
, group
, start
, start
+LINES_PER_GROUP
-1, fn
, context
, nameChoice
)) {
706 if (group
+ 1 < groupLimit
&& (group
+ 1)->groupMSB
> group
->groupMSB
+ 1 && nameChoice
== U_EXTENDED_CHAR_NAME
) {
707 UChar32 end
= (group
+ 1)->groupMSB
<< GROUP_SHIFT
;
711 if (!enumExtNames((group
->groupMSB
+ 1) << GROUP_SHIFT
, end
- 1, fn
, context
)) {
718 /* enumerate within the end group (group->groupMSB==endGroupMSB) */
719 if(group
<groupLimit
&& group
->groupMSB
==endGroupMSB
) {
720 return enumGroupNames(names
, group
, (limit
-1)&~GROUP_MASK
, limit
-1, fn
, context
, nameChoice
);
721 } else if (nameChoice
== U_EXTENDED_CHAR_NAME
&& group
== groupLimit
) {
722 UChar32 next
= ((group
- 1)->groupMSB
+ 1) << GROUP_SHIFT
;
731 /* we have not found a group, which means everything is made of
733 if (nameChoice
== U_EXTENDED_CHAR_NAME
) {
734 if (limit
> UCHAR_MAX_VALUE
+ 1) {
735 limit
= UCHAR_MAX_VALUE
+ 1;
737 return enumExtNames(start
, limit
- 1, fn
, context
);
744 writeFactorSuffix(const uint16_t *factors
, uint16_t count
,
745 const char *s
, /* suffix elements */
747 uint16_t indexes
[8], /* output fields from here */
748 const char *elementBases
[8], const char *elements
[8],
749 char *buffer
, uint16_t bufferLength
) {
750 uint16_t i
, factor
, bufferPos
=0;
753 /* write elements according to the factors */
756 * the factorized elements are determined by modulo arithmetic
757 * with the factors of this algorithm
759 * note that for fewer operations, count is decremented here
762 for(i
=count
; i
>0; --i
) {
764 indexes
[i
]=(uint16_t)(code%factor
);
768 * we don't need to calculate the last modulus because start<=code<=end
769 * guarantees here that code<=factors[0]
771 indexes
[0]=(uint16_t)code
;
773 /* write each element */
775 if(elementBases
!=NULL
) {
779 /* skip indexes[i] strings */
791 WRITE_CHAR(buffer
, bufferLength
, bufferPos
, c
);
794 /* we do not need to perform the rest of this loop for i==count - break here */
799 /* skip the rest of the strings for this factors[i] */
800 factor
=(uint16_t)(factors
[i
]-indexes
[i
]-1);
819 * Parts of findAlgName() are almost the same as some of getAlgName().
820 * Fixes must be applied to both.
823 getAlgName(AlgorithmicRange
*range
, uint32_t code
, UCharNameChoice nameChoice
,
824 char *buffer
, uint16_t bufferLength
) {
825 uint16_t bufferPos
=0;
828 * Do not write algorithmic Unicode 1.0 names because
829 * Unihan names are the same as the modern ones,
830 * extension A was only introduced with Unicode 3.0, and
831 * the Hangul syllable block was moved and changed around Unicode 1.1.5.
833 if(nameChoice
==U_UNICODE_10_CHAR_NAME
) {
841 switch(range
->type
) {
843 /* name = prefix hex-digits */
844 const char *s
=(const char *)(range
+1);
851 WRITE_CHAR(buffer
, bufferLength
, bufferPos
, c
);
854 /* write hexadecimal code point value */
855 count
=range
->variant
;
858 if(count
<bufferLength
) {
863 if(--i
<bufferLength
) {
879 /* name = prefix factorized-elements */
881 const uint16_t *factors
=(const uint16_t *)(range
+1);
882 uint16_t count
=range
->variant
;
883 const char *s
=(const char *)(factors
+count
);
888 WRITE_CHAR(buffer
, bufferLength
, bufferPos
, c
);
891 bufferPos
+=writeFactorSuffix(factors
, count
,
892 s
, code
-range
->start
, indexes
, NULL
, NULL
, buffer
, bufferLength
);
908 * Important: enumAlgNames() and findAlgName() are almost the same.
909 * Any fix must be applied to both.
912 enumAlgNames(AlgorithmicRange
*range
,
913 UChar32 start
, UChar32 limit
,
914 UEnumCharNamesFn
*fn
, void *context
,
915 UCharNameChoice nameChoice
) {
919 if(nameChoice
==U_UNICODE_10_CHAR_NAME
) {
923 switch(range
->type
) {
928 /* get the full name of the start character */
929 length
=getAlgName(range
, (uint32_t)start
, nameChoice
, buffer
, sizeof(buffer
));
934 /* call the enumerator function with this first character */
935 if(!fn(context
, start
, nameChoice
, buffer
, length
)) {
939 /* go to the end of the name; all these names have the same length */
945 /* enumerate the rest of the names */
946 while(++start
<limit
) {
947 /* increment the hexadecimal number on a character-basis */
951 if(('0'<=c
&& c
<'9') || ('A'<=c
&& c
<'F')) {
962 if(!fn(context
, start
, nameChoice
, buffer
, length
)) {
970 const char *elementBases
[8], *elements
[8];
971 const uint16_t *factors
=(const uint16_t *)(range
+1);
972 uint16_t count
=range
->variant
;
973 const char *s
=(const char *)(factors
+count
);
975 uint16_t prefixLength
, i
, index
;
979 /* name = prefix factorized-elements */
989 /* append the suffix of the start character */
990 length
=(uint16_t)(prefixLength
+writeFactorSuffix(factors
, count
,
991 s
, (uint32_t)start
-range
->start
,
992 indexes
, elementBases
, elements
,
993 suffix
, (uint16_t)(sizeof(buffer
)-prefixLength
)));
995 /* call the enumerator function with this first character */
996 if(!fn(context
, start
, nameChoice
, buffer
, length
)) {
1000 /* enumerate the rest of the names */
1001 while(++start
<limit
) {
1002 /* increment the indexes in lexical order bound by the factors */
1005 index
=(uint16_t)(indexes
[--i
]+1);
1006 if(index
<factors
[i
]) {
1007 /* skip one index and its element string */
1015 /* reset this index to 0 and its element string to the first one */
1017 elements
[i
]=elementBases
[i
];
1021 /* to make matters a little easier, just append all elements to the suffix */
1023 length
=prefixLength
;
1024 for(i
=0; i
<count
; ++i
) {
1026 while((c
=*s
++)!=0) {
1031 /* zero-terminate */
1034 if(!fn(context
, start
, nameChoice
, buffer
, length
)) {
1041 /* undefined type */
1049 * findAlgName() is almost the same as enumAlgNames() except that it
1050 * returns the code point for a name if it fits into the range.
1051 * It returns 0xffff otherwise.
1054 findAlgName(AlgorithmicRange
*range
, UCharNameChoice nameChoice
, const char *otherName
) {
1057 if(nameChoice
==U_UNICODE_10_CHAR_NAME
) {
1061 switch(range
->type
) {
1063 /* name = prefix hex-digits */
1064 const char *s
=(const char *)(range
+1);
1069 /* compare prefix */
1070 while((c
=*s
++)!=0) {
1071 if((char)c
!=*otherName
++) {
1076 /* read hexadecimal code point value */
1077 count
=range
->variant
;
1079 for(i
=0; i
<count
; ++i
) {
1081 if('0'<=c
&& c
<='9') {
1082 code
=(code
<<4)|(c
-'0');
1083 } else if('A'<=c
&& c
<='F') {
1084 code
=(code
<<4)|(c
-'A'+10);
1090 /* does it fit into the range? */
1091 if(*otherName
==0 && range
->start
<=(uint32_t)code
&& (uint32_t)code
<=range
->end
) {
1098 uint16_t indexes
[8];
1099 const char *elementBases
[8], *elements
[8];
1100 const uint16_t *factors
=(const uint16_t *)(range
+1);
1101 uint16_t count
=range
->variant
;
1102 const char *s
=(const char *)(factors
+count
), *t
;
1103 UChar32 start
, limit
;
1108 /* name = prefix factorized-elements */
1110 /* compare prefix */
1111 while((c
=*s
++)!=0) {
1112 if((char)c
!=*otherName
++) {
1117 start
=(UChar32
)range
->start
;
1118 limit
=(UChar32
)(range
->end
+1);
1120 /* initialize the suffix elements for enumeration; indexes should all be set to 0 */
1121 writeFactorSuffix(factors
, count
, s
, 0,
1122 indexes
, elementBases
, elements
, buffer
, sizeof(buffer
));
1124 /* compare the first suffix */
1125 if(0==uprv_strcmp(otherName
, buffer
)) {
1129 /* enumerate and compare the rest of the suffixes */
1130 while(++start
<limit
) {
1131 /* increment the indexes in lexical order bound by the factors */
1134 index
=(uint16_t)(indexes
[--i
]+1);
1135 if(index
<factors
[i
]) {
1136 /* skip one index and its element string */
1143 /* reset this index to 0 and its element string to the first one */
1145 elements
[i
]=elementBases
[i
];
1149 /* to make matters a little easier, just compare all elements of the suffix */
1151 for(i
=0; i
<count
; ++i
) {
1153 while((c
=*s
++)!=0) {
1155 s
=""; /* does not match */
1167 /* undefined type */
1174 /* sets of name characters, maximum name lengths ---------------------------- */
/*
 * Bit-set helpers for a set of byte values stored in eight uint32_t words:
 * word index = byte>>5, bit index = byte&0x1f.
 *
 * The argument c is parenthesized before the uint8_t cast so that a compound
 * expression such as base+offset is narrowed as a whole; without that, the
 * cast binds only to the first operand and the word index can fall outside
 * the array.
 * Note: SET_ADD and SET_CONTAINS evaluate c twice; do not pass arguments
 * with side effects.
 */
#define SET_ADD(set, c) ((set)[(uint8_t)(c)>>5]|=((uint32_t)1<<((uint8_t)(c)&0x1f)))
#define SET_CONTAINS(set, c) (((set)[(uint8_t)(c)>>5]&((uint32_t)1<<((uint8_t)(c)&0x1f)))!=0)
1180 calcStringSetLength(uint32_t set
[8], const char *s
) {
1184 while((c
=*s
++)!=0) {
1192 calcAlgNameSetsLengths(int32_t maxNameLength
) {
1193 AlgorithmicRange
*range
;
1195 uint32_t rangeCount
;
1198 /* enumerate algorithmic ranges */
1199 p
=(uint32_t *)((uint8_t *)uCharNames
+uCharNames
->algNamesOffset
);
1201 range
=(AlgorithmicRange
*)(p
+1);
1202 while(rangeCount
>0) {
1203 switch(range
->type
) {
1205 /* name = prefix + (range->variant times) hex-digits */
1207 length
=calcStringSetLength(gNameSet
, (const char *)(range
+1))+range
->variant
;
1208 if(length
>maxNameLength
) {
1209 maxNameLength
=length
;
1213 /* name = prefix factorized-elements */
1214 const uint16_t *factors
=(const uint16_t *)(range
+1);
1216 int32_t i
, count
=range
->variant
, factor
, factorLength
, maxFactorLength
;
1219 s
=(const char *)(factors
+count
);
1220 length
=calcStringSetLength(gNameSet
, s
);
1221 s
+=length
+1; /* start of factor suffixes */
1223 /* get the set and maximum factor suffix length for each factor */
1224 for(i
=0; i
<count
; ++i
) {
1226 for(factor
=factors
[i
]; factor
>0; --factor
) {
1227 factorLength
=calcStringSetLength(gNameSet
, s
);
1229 if(factorLength
>maxFactorLength
) {
1230 maxFactorLength
=factorLength
;
1233 length
+=maxFactorLength
;
1236 if(length
>maxNameLength
) {
1237 maxNameLength
=length
;
1246 range
=(AlgorithmicRange
*)((uint8_t *)range
+range
->size
);
1249 return maxNameLength
;
1253 calcExtNameSetsLengths(int32_t maxNameLength
) {
1256 for(i
=0; i
<LENGTHOF(charCatNames
); ++i
) {
1258 * for each category, count the length of the category name
1262 * 6 for most hex digits per code point
1264 length
=9+calcStringSetLength(gNameSet
, charCatNames
[i
]);
1265 if(length
>maxNameLength
) {
1266 maxNameLength
=length
;
1269 return maxNameLength
;
1273 calcNameSetLength(const uint16_t *tokens
, uint16_t tokenCount
, const uint8_t *tokenStrings
, int8_t *tokenLengths
,
1275 const uint8_t **pLine
, const uint8_t *lineLimit
) {
1276 const uint8_t *line
=*pLine
;
1277 int32_t length
=0, tokenLength
;
1280 while(line
!=lineLimit
&& (c
=*line
++)!=(uint8_t)';') {
1282 /* implicit letter */
1287 if(token
==(uint16_t)(-2)) {
1288 /* this is a lead byte for a double-byte token */
1292 if(token
==(uint16_t)(-1)) {
1293 /* explicit letter */
1297 /* count token word */
1298 if(tokenLengths
!=NULL
) {
1299 /* use cached token length */
1300 tokenLength
=tokenLengths
[c
];
1301 if(tokenLength
==0) {
1302 tokenLength
=calcStringSetLength(set
, (const char *)tokenStrings
+token
);
1303 tokenLengths
[c
]=(int8_t)tokenLength
;
1306 tokenLength
=calcStringSetLength(set
, (const char *)tokenStrings
+token
);
1308 length
+=tokenLength
;
1318 calcGroupNameSetsLengths(int32_t maxNameLength
) {
1319 uint16_t offsets
[LINES_PER_GROUP
+2], lengths
[LINES_PER_GROUP
+2];
1321 uint16_t *tokens
=(uint16_t *)uCharNames
+8;
1322 uint16_t tokenCount
=*tokens
++;
1323 uint8_t *tokenStrings
=(uint8_t *)uCharNames
+uCharNames
->tokenStringOffset
;
1325 int8_t *tokenLengths
;
1329 const uint8_t *s
, *line
, *lineLimit
;
1331 int32_t groupCount
, lineNumber
, length
;
1333 tokenLengths
=(int8_t *)uprv_malloc(tokenCount
);
1334 if(tokenLengths
!=NULL
) {
1335 uprv_memset(tokenLengths
, 0, tokenCount
);
1338 groups
=(uint16_t *)((char *)uCharNames
+uCharNames
->groupsOffset
);
1339 groupCount
=*groups
++;
1340 group
=(Group
*)groups
;
1342 /* enumerate all groups */
1343 while(groupCount
>0) {
1344 s
=(uint8_t *)uCharNames
+uCharNames
->groupStringOffset
+
1345 ((int32_t)group
->offsetHigh
<<16|group
->offsetLow
);
1346 s
=expandGroupLengths(s
, offsets
, lengths
);
1348 /* enumerate all lines in each group */
1349 for(lineNumber
=0; lineNumber
<LINES_PER_GROUP
; ++lineNumber
) {
1350 line
=s
+offsets
[lineNumber
];
1351 length
=lengths
[lineNumber
];
1356 lineLimit
=line
+length
;
1358 /* read regular name */
1359 length
=calcNameSetLength(tokens
, tokenCount
, tokenStrings
, tokenLengths
, gNameSet
, &line
, lineLimit
);
1360 if(length
>maxNameLength
) {
1361 maxNameLength
=length
;
1363 if(line
==lineLimit
) {
1367 /* read Unicode 1.0 name */
1368 length
=calcNameSetLength(tokens
, tokenCount
, tokenStrings
, tokenLengths
, gNameSet
, &line
, lineLimit
);
1369 if(length
>maxNameLength
) {
1370 maxNameLength
=length
;
1372 if(line
==lineLimit
) {
1376 /* read ISO comment */
1377 /*length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit);*/
1384 if(tokenLengths
!=NULL
) {
1385 uprv_free(tokenLengths
);
1388 /* set gMax... - name length last for threading */
1389 gMaxNameLength
=maxNameLength
;
1393 calcNameSetsLengths(UErrorCode
*pErrorCode
) {
1394 static const char extChars
[]="0123456789ABCDEF<>-";
1395 int32_t i
, maxNameLength
;
1397 if(gMaxNameLength
!=0) {
1401 if(!isDataLoaded(pErrorCode
)) {
1405 /* set hex digits, used in various names, and <>-, used in extended names */
1406 for(i
=0; i
<sizeof(extChars
)-1; ++i
) {
1407 SET_ADD(gNameSet
, extChars
[i
]);
1410 /* set sets and lengths from algorithmic names */
1411 maxNameLength
=calcAlgNameSetsLengths(0);
1413 /* set sets and lengths from extended names */
1414 maxNameLength
=calcExtNameSetsLengths(maxNameLength
);
1416 /* set sets and lengths from group names, set global maximum values */
1417 calcGroupNameSetsLengths(maxNameLength
);
1422 /* public API --------------------------------------------------------------- */
1424 U_CAPI
int32_t U_EXPORT2
1425 u_charName(UChar32 code
, UCharNameChoice nameChoice
,
1426 char *buffer
, int32_t bufferLength
,
1427 UErrorCode
*pErrorCode
) {
1428 AlgorithmicRange
*algRange
;
1433 /* check the argument values */
1434 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
1436 } else if(nameChoice
>=U_CHAR_NAME_CHOICE_COUNT
||
1437 bufferLength
<0 || (bufferLength
>0 && buffer
==NULL
)
1439 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1443 if((uint32_t)code
>UCHAR_MAX_VALUE
|| !isDataLoaded(pErrorCode
)) {
1444 return u_terminateChars(buffer
, bufferLength
, 0, pErrorCode
);
1449 /* try algorithmic names first */
1450 p
=(uint32_t *)((uint8_t *)uCharNames
+uCharNames
->algNamesOffset
);
1452 algRange
=(AlgorithmicRange
*)(p
+1);
1454 if(algRange
->start
<=(uint32_t)code
&& (uint32_t)code
<=algRange
->end
) {
1455 length
=getAlgName(algRange
, (uint32_t)code
, nameChoice
, buffer
, (uint16_t)bufferLength
);
1458 algRange
=(AlgorithmicRange
*)((uint8_t *)algRange
+algRange
->size
);
1463 if (nameChoice
== U_EXTENDED_CHAR_NAME
) {
1464 length
= getName(uCharNames
, (uint32_t )code
, U_EXTENDED_CHAR_NAME
, buffer
, (uint16_t) bufferLength
);
1466 /* extended character name */
1467 length
= getExtName((uint32_t) code
, buffer
, (uint16_t) bufferLength
);
1470 /* normal character name */
1471 length
=getName(uCharNames
, (uint32_t)code
, nameChoice
, buffer
, (uint16_t)bufferLength
);
1475 return u_terminateChars(buffer
, bufferLength
, length
, pErrorCode
);
1478 U_CAPI
int32_t U_EXPORT2
1479 u_getISOComment(UChar32 c
,
1480 char *dest
, int32_t destCapacity
,
1481 UErrorCode
*pErrorCode
) {
1484 /* check the argument values */
1485 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
1487 } else if(destCapacity
<0 || (destCapacity
>0 && dest
==NULL
)) {
1488 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1492 if((uint32_t)c
>UCHAR_MAX_VALUE
|| !isDataLoaded(pErrorCode
)) {
1493 return u_terminateChars(dest
, destCapacity
, 0, pErrorCode
);
1496 /* the ISO comment is stored like a normal character name */
1497 length
=getName(uCharNames
, (uint32_t)c
, U_ISO_COMMENT
, dest
, (uint16_t)destCapacity
);
1498 return u_terminateChars(dest
, destCapacity
, length
, pErrorCode
);
1501 U_CAPI UChar32 U_EXPORT2
1502 u_charFromName(UCharNameChoice nameChoice
,
1504 UErrorCode
*pErrorCode
) {
1505 char upper
[120], lower
[120];
1507 AlgorithmicRange
*algRange
;
1512 UChar32 error
= 0xffff; /* Undefined, but use this for backwards compatibility. */
1514 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
1518 if(nameChoice
>=U_CHAR_NAME_CHOICE_COUNT
|| name
==NULL
|| *name
==0) {
1519 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1523 if(!isDataLoaded(pErrorCode
)) {
1527 /* construct the uppercase and lowercase of the name first */
1528 for(i
=0; i
<sizeof(upper
); ++i
) {
1529 if((c0
=*name
++)!=0) {
1530 upper
[i
]=uprv_toupper(c0
);
1531 lower
[i
]=uprv_tolower(c0
);
1533 upper
[i
]=lower
[i
]=0;
1537 if(i
==sizeof(upper
)) {
1538 /* name too long, there is no such character */
1539 *pErrorCode
= U_ILLEGAL_CHAR_FOUND
;
1543 /* try extended names first */
1544 if (lower
[0] == '<') {
1545 if (nameChoice
== U_EXTENDED_CHAR_NAME
) {
1546 if (lower
[--i
] == '>') {
1547 for (--i
; lower
[i
] && lower
[i
] != '-'; --i
) {
1550 if (lower
[i
] == '-') { /* We've got a category. */
1555 for (++i
; lower
[i
] != '>'; ++i
) {
1556 if (lower
[i
] >= '0' && lower
[i
] <= '9') {
1557 cp
= (cp
<< 4) + lower
[i
] - '0';
1558 } else if (lower
[i
] >= 'a' && lower
[i
] <= 'f') {
1559 cp
= (cp
<< 4) + lower
[i
] - 'a' + 10;
1561 *pErrorCode
= U_ILLEGAL_CHAR_FOUND
;
1566 /* Now validate the category name.
1567 We could use a binary search, or a trie, if
1568 we really wanted to. */
1570 for (lower
[i
] = 0, cIdx
= 0; cIdx
< LENGTHOF(charCatNames
); ++cIdx
) {
1572 if (!uprv_strcmp(lower
+ 1, charCatNames
[cIdx
])) {
1573 if (getCharCat(cp
) == cIdx
) {
1583 *pErrorCode
= U_ILLEGAL_CHAR_FOUND
;
1587 /* try algorithmic names now */
1588 p
=(uint32_t *)((uint8_t *)uCharNames
+uCharNames
->algNamesOffset
);
1590 algRange
=(AlgorithmicRange
*)(p
+1);
1592 if((cp
=findAlgName(algRange
, nameChoice
, upper
))!=0xffff) {
1595 algRange
=(AlgorithmicRange
*)((uint8_t *)algRange
+algRange
->size
);
1599 /* normal character name */
1600 findName
.otherName
=upper
;
1601 findName
.code
=error
;
1602 enumNames(uCharNames
, 0, UCHAR_MAX_VALUE
+ 1, DO_FIND_NAME
, &findName
, nameChoice
);
1603 if (findName
.code
== error
) {
1604 *pErrorCode
= U_ILLEGAL_CHAR_FOUND
;
1606 return findName
.code
;
/*
 * u_enumCharNames: enumerates character names for code points in the
 * half-open range start..limit. Names stored in the data are enumerated with
 * enumNames(), names inside an algorithmic range with enumAlgNames(); fn,
 * context and nameChoice are passed through to both helpers.
 *
 * Errors: sets *pErrorCode to U_ILLEGAL_ARGUMENT_ERROR when nameChoice is
 * out of range or fn is NULL.
 *
 * NOTE(review): this listing omits several source lines (the embedded
 * original line numbers skip), including the loop header over the
 * algorithmic ranges and the statements inside the guard checks, so the
 * control flow shown here is incomplete.
 */
1609 U_CAPI
void U_EXPORT2
1610 u_enumCharNames(UChar32 start
, UChar32 limit
,
1611 UEnumCharNamesFn
*fn
,
1613 UCharNameChoice nameChoice
,
1614 UErrorCode
*pErrorCode
) {
1615 AlgorithmicRange
*algRange
;
/* guard: no error code supplied, or it already indicates a failure */
1619 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
/* guard: reject an out-of-range nameChoice or a missing callback */
1623 if(nameChoice
>=U_CHAR_NAME_CHOICE_COUNT
|| fn
==NULL
) {
1624 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
/*
 * clamp limit to UCHAR_MAX_VALUE + 1; the comparison is done on the value
 * cast to uint32_t (if UChar32 is signed, a negative limit then compares as
 * too large and is clamped as well -- TODO confirm the type of UChar32)
 */
1628 if((uint32_t) limit
> UCHAR_MAX_VALUE
+ 1) {
1629 limit
= UCHAR_MAX_VALUE
+ 1;
/* guard: empty range after clamping */
1631 if((uint32_t)start
>=(uint32_t)limit
) {
1635 if(!isDataLoaded(pErrorCode
)) {
1639 /* interleave the data-driven ones with the algorithmic ones */
1640 /* iterate over all algorithmic ranges; assume that they are in ascending order */
/* p addresses the section located algNamesOffset bytes from the start of uCharNames */
1641 p
=(uint32_t *)((uint8_t *)uCharNames
+uCharNames
->algNamesOffset
);
/* the first AlgorithmicRange starts one 32-bit word after p */
1643 algRange
=(AlgorithmicRange
*)(p
+1);
1645 /* enumerate the character names before the current algorithmic range */
1646 /* here: start<limit */
1647 if((uint32_t)start
<algRange
->start
) {
1648 if((uint32_t)limit
<=algRange
->start
) {
1649 enumNames(uCharNames
, start
, limit
, fn
, context
, nameChoice
);
1652 if(!enumNames(uCharNames
, start
, (UChar32
)algRange
->start
, fn
, context
, nameChoice
)) {
1655 start
=(UChar32
)algRange
->start
;
1657 /* enumerate the character names in the current algorithmic range */
1658 /* here: algRange->start<=start<limit */
1659 if((uint32_t)start
<=algRange
->end
) {
1660 if((uint32_t)limit
<=(algRange
->end
+1)) {
1661 enumAlgNames(algRange
, start
, limit
, fn
, context
, nameChoice
);
1664 if(!enumAlgNames(algRange
, start
, (UChar32
)algRange
->end
+1, fn
, context
, nameChoice
)) {
1667 start
=(UChar32
)algRange
->end
+1;
1669 /* continue to the next algorithmic range (here: start<limit) */
/* the next range is found by adding the byte count stored in the size field */
1670 algRange
=(AlgorithmicRange
*)((uint8_t *)algRange
+algRange
->size
);
1673 /* enumerate the character names after the last algorithmic range */
1674 enumNames(uCharNames
, start
, limit
, fn
, context
, nameChoice
);
/*
 * uprv_getMaxCharNameLength: returns gMaxNameLength (the maximum length of
 * character names, regular and 1.0) when calcNameSetsLengths() reports
 * success for a fresh local error code.
 * NOTE(review): the value returned when calcNameSetsLengths() fails is not
 * visible in this listing.
 */
1677 U_CAPI
int32_t U_EXPORT2
1678 uprv_getMaxCharNameLength() {
/* a local error code is used; nothing is reported back to the caller */
1679 UErrorCode errorCode
=U_ZERO_ERROR
;
1680 if(calcNameSetsLengths(&errorCode
)) {
1681 return gMaxNameLength
;
1688 * Adds the characters of the char set cset to the set behind the USetAdder sa.
1689 * @param cset Set of 256 bit flags corresponding to a set of chars.
1690 * @param sa USetAdder that receives the characters through its add() callback.
/*
 * charSetToUSet: collects every char value 0..255 flagged in cset, converts
 * the collected chars to UChars with u_charsToUChars(), and hands each
 * usable result to sa->add().
 * NOTE(review): the local declarations (us, cs, i, length) and the branch
 * taken when calcNameSetsLengths() fails are not visible in this listing.
 */
1693 charSetToUSet(uint32_t cset
[8], const USetAdder
*sa
) {
1698 UErrorCode errorCode
;
1700 errorCode
=U_ZERO_ERROR
;
/* calcNameSetsLengths() is checked first, with a local error code */
1702 if(!calcNameSetsLengths(&errorCode
)) {
1706 /* build a char string with all chars that are used in character names */
1708 for(i
=0; i
<256; ++i
) {
1709 if(SET_CONTAINS(cset
, i
)) {
1710 cs
[length
++]=(char)i
;
1714 /* convert the char string to a UChar string */
1715 u_charsToUChars(cs
, us
, length
);
1717 /* add each UChar to the USet */
1718 for(i
=0; i
<length
; ++i
) {
/*
 * a zero result is skipped unless the source char itself was 0, in which
 * case the zero is a real value rather than a failed conversion
 */
1719 if(us
[i
]!=0 || cs
[i
]==0) { /* non-invariant chars become (UChar)0 */
1720 sa
->add(sa
->set
, us
[i
]);
1726 * Adds the characters that are used in Unicode character names to the set behind sa.
1727 * @param sa USetAdder that receives the characters.
/*
 * uprv_getCharNameCharacters: thin wrapper that passes the file-level
 * gNameSet (the set of chars used in character names) and sa on to
 * charSetToUSet().
 */
1729 U_CAPI
void U_EXPORT2
1730 uprv_getCharNameCharacters(const USetAdder
*sa
) {
1731 charSetToUSet(gNameSet
, sa
);
1734 /* data swapping ------------------------------------------------------------ */
1737 * The token table contains non-negative entries for token bytes,
1738 * and -1 for bytes that represent themselves in the data file's charset.
1739 * -2 entries are used for lead bytes.
1741 * Direct bytes (-1 entries) must be translated from the input charset family
1742 * to the output charset family.
1743 * makeTokenMap() writes a permutation mapping for this.
1744 * Use it once for single-/lead-byte tokens and once more for all trail byte
1745 * tokens. (';' is an unused trail byte marked with -1.)
/*
 * makeTokenMap: fills map with a byte permutation for translating token
 * bytes from the input charset family of ds to its output charset family.
 * When both families are equal the permutation is the identity. Otherwise
 * direct bytes are converted with ds->swapInvChars() and the remaining
 * entries are assigned output values that were not already used.
 *
 * NOTE(review): the parameter line declaring map, the loop bodies that test
 * tokens[] and the return statements are not visible in this listing.
 */
1748 makeTokenMap(const UDataSwapper
*ds
,
1749 int16_t tokens
[], uint16_t tokenCount
,
1751 UErrorCode
*pErrorCode
) {
/* one flag per possible output byte value */
1752 UBool usedOutChar
[256];
1756 if(U_FAILURE(*pErrorCode
)) {
1760 if(ds
->inCharset
==ds
->outCharset
) {
1761 /* Same charset family: identity permutation */
1762 for(i
=0; i
<256; ++i
) {
/*
 * clear both tables; each memset uses a byte count of 256, which assumes
 * one-byte elements -- TODO confirm that UBool is one byte wide
 */
1766 uprv_memset(map
, 0, 256);
1767 uprv_memset(usedOutChar
, 0, 256);
/* only the first 256 entries are considered (the body of this check is not visible here) */
1769 if(tokenCount
>256) {
1773 /* set the direct bytes (byte 0 always maps to itself) */
1774 for(i
=1; i
<tokenCount
; ++i
) {
1776 /* convert the direct byte character */
/* c1 is converted into c2; a failure is reported through pErrorCode */
1778 ds
->swapInvChars(ds
, &c1
, 1, &c2
, pErrorCode
);
1779 if(U_FAILURE(*pErrorCode
)) {
1780 udata_printError(ds
, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n",
1785 /* enter the converted character into the map and mark it used */
1787 usedOutChar
[c2
]=TRUE
;
1791 /* set the mappings for the rest of the permutation */
/* j walks the output byte values in ascending order, starting at 1 like i */
1792 for(i
=j
=1; i
<tokenCount
; ++i
) {
1793 /* set mappings that were not set for direct bytes */
1795 /* set an output byte value that was not used as an output byte above */
1796 while(usedOutChar
[j
]) {
1799 map
[i
]=(uint8_t)j
++;
1804 * leave mappings at tokenCount and above unset if tokenCount<256
1805 * because they won't be used
/*
 * uchar_swapNames: swaps a unames.icu data block using the UDataSwapper ds.
 *
 * Visible steps:
 *  - udata_swapDataHeader() swaps the header and yields headerSize
 *  - the dataFormat bytes must be 0x75 0x6e 0x61 0x6d with formatVersion[0]==1,
 *    otherwise *pErrorCode is set to U_UNSUPPORTED_ERROR
 *  - preflighting walks the algorithmic ranges to compute the total size
 *  - swapping handles, in order: the four initial offsets, the token table
 *    (permuted through map and trailMap), the token strings, the group table,
 *    the group strings (only when the charset families differ), and the
 *    algorithmic ranges
 *
 * Returns headerSize plus the offset reached after the last algorithmic range.
 * Other visible error codes: U_INDEX_OUTOFBOUNDS_ERROR (too few bytes) and
 * U_MEMORY_ALLOCATION_ERROR (temporary token array).
 *
 * NOTE(review): this listing omits many source lines (the embedded original
 * line numbers skip): branch conditions, returns, several declarations and
 * closing braces are missing, so the control flow shown is incomplete.
 */
1810 U_CAPI
int32_t U_EXPORT2
1811 uchar_swapNames(const UDataSwapper
*ds
,
1812 const void *inData
, int32_t length
, void *outData
,
1813 UErrorCode
*pErrorCode
) {
1814 const UDataInfo
*pInfo
;
1817 const uint8_t *inBytes
;
1820 uint32_t tokenStringOffset
, groupsOffset
, groupStringOffset
, algNamesOffset
,
1821 offset
, i
, count
, stringsCount
;
1823 const AlgorithmicRange
*inRange
;
1824 AlgorithmicRange
*outRange
;
1826 /* udata_swapDataHeader checks the arguments */
1827 headerSize
=udata_swapDataHeader(ds
, inData
, length
, outData
, pErrorCode
);
1828 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
1832 /* check data format and format version */
/* the UDataInfo is read at a fixed offset of 4 bytes into inData */
1833 pInfo
=(const UDataInfo
*)((const char *)inData
+4);
1835 pInfo
->dataFormat
[0]==0x75 && /* dataFormat="unam" */
1836 pInfo
->dataFormat
[1]==0x6e &&
1837 pInfo
->dataFormat
[2]==0x61 &&
1838 pInfo
->dataFormat
[3]==0x6d &&
1839 pInfo
->formatVersion
[0]==1
1841 udata_printError(ds
, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n",
1842 pInfo
->dataFormat
[0], pInfo
->dataFormat
[1],
1843 pInfo
->dataFormat
[2], pInfo
->dataFormat
[3],
1844 pInfo
->formatVersion
[0]);
1845 *pErrorCode
=U_UNSUPPORTED_ERROR
;
/* the payload starts right after the swapped header, in both buffers */
1849 inBytes
=(const uint8_t *)inData
+headerSize
;
1850 outBytes
=(uint8_t *)outData
+headerSize
;
/*
 * NOTE(review): algNamesOffset (the fourth 32-bit word of the payload) is
 * read here and again inside the length check below; the conditions that
 * select between the two reads are not visible in this listing
 */
1852 algNamesOffset
=ds
->readUInt32(((const uint32_t *)inBytes
)[3]);
1856 (uint32_t)length
<(algNamesOffset
=ds
->readUInt32(((const uint32_t *)inBytes
)[3]))
1858 udata_printError(ds
, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n",
1860 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
1866 /* preflighting: iterate through algorithmic ranges */
1867 offset
=algNamesOffset
;
/* the section begins with a 32-bit count of ranges */
1868 count
=ds
->readUInt32(*((const uint32_t *)(inBytes
+offset
)));
1871 for(i
=0; i
<count
; ++i
) {
1872 inRange
=(const AlgorithmicRange
*)(inBytes
+offset
);
/* each range stores its own byte size, which advances offset to the next range */
1873 offset
+=ds
->readUInt16(inRange
->size
);
/*
 * tokens[] holds up to 512 entries: the first 256 are passed to
 * makeTokenMap() for map, the second 256 (tokens+256) for trailMap
 */
1880 int16_t tokens
[512];
1881 uint16_t tokenCount
;
1883 uint8_t map
[256], trailMap
[256];
1885 /* copy the data for inaccessible bytes */
1886 if(inBytes
!=outBytes
) {
1887 uprv_memcpy(outBytes
, inBytes
, length
);
1890 /* the initial 4 offsets first */
1891 tokenStringOffset
=ds
->readUInt32(((const uint32_t *)inBytes
)[0]);
1892 groupsOffset
=ds
->readUInt32(((const uint32_t *)inBytes
)[1]);
1893 groupStringOffset
=ds
->readUInt32(((const uint32_t *)inBytes
)[2]);
/* four 32-bit offsets occupy the first 16 bytes */
1894 ds
->swapArray32(ds
, inBytes
, 16, outBytes
, pErrorCode
);
1897 * now the tokens table
1898 * it needs to be permutated along with the compressed name strings
/* the token table starts right after the 16 bytes of offsets */
1900 p
=(const uint16_t *)(inBytes
+16);
1901 q
=(uint16_t *)(outBytes
+16);
1903 /* read and swap the tokenCount */
1904 tokenCount
=ds
->readUInt16(*p
);
1905 ds
->swapArray16(ds
, p
, 2, q
, pErrorCode
);
1909 /* read the first 512 tokens and make the token maps */
1910 if(tokenCount
<=512) {
1915 for(i
=0; i
<count
; ++i
) {
1916 tokens
[i
]=udata_readInt16(ds
, p
[i
]);
1919 tokens
[i
]=0; /* fill the rest of the tokens array if tokenCount<512 */
1921 makeTokenMap(ds
, tokens
, tokenCount
, map
, pErrorCode
);
/* the trail-byte map only sees the tokens beyond the first 256, if any */
1922 makeTokenMap(ds
, tokens
+256, (uint16_t)(tokenCount
>256 ? tokenCount
-256 : 0), trailMap
, pErrorCode
);
1923 if(U_FAILURE(*pErrorCode
)) {
1928 * swap and permutate the tokens
1929 * go through a temporary array to support in-place swapping
/* tokenCount entries of 16 bits each, hence tokenCount*2 bytes */
1931 temp
=(uint16_t *)uprv_malloc(tokenCount
*2);
1933 udata_printError(ds
, "out of memory swapping %u unames.icu tokens\n",
1935 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
1939 /* swap and permutate single-/lead-byte tokens */
/* entry i is written to position map[i] of the temporary array */
1940 for(i
=0; i
<tokenCount
&& i
<256; ++i
) {
1941 ds
->swapArray16(ds
, p
+i
, 2, temp
+map
[i
], pErrorCode
);
1944 /* swap and permutate trail-byte tokens */
/*
 * (i & 0xffffff00) keeps the 256-entry page of i, and trailMap permutes
 * only the low byte within that page
 */
1945 for(; i
<tokenCount
; ++i
) {
1946 ds
->swapArray16(ds
, p
+i
, 2, temp
+(i
&0xffffff00)+trailMap
[i
&0xff], pErrorCode
);
1949 /* copy the result into the output and free the temporary array */
1950 uprv_memcpy(q
, temp
, tokenCount
*2);
1954 * swap the token strings but not a possible padding byte after
1955 * the terminating NUL of the last string
/* the token strings span from tokenStringOffset up to groupsOffset */
1957 udata_swapInvStringBlock(ds
, inBytes
+tokenStringOffset
, (int32_t)(groupsOffset
-tokenStringOffset
),
1958 outBytes
+tokenStringOffset
, pErrorCode
);
1959 if(U_FAILURE(*pErrorCode
)) {
1960 udata_printError(ds
, "uchar_swapNames(token strings) failed\n");
1964 /* swap the group table */
/*
 * the swapped length (1+count*3)*2 bytes covers one 16-bit count followed
 * by three 16-bit values for each of the count entries
 */
1965 count
=ds
->readUInt16(*((const uint16_t *)(inBytes
+groupsOffset
)));
1966 ds
->swapArray16(ds
, inBytes
+groupsOffset
, (int32_t)((1+count
*3)*2),
1967 outBytes
+groupsOffset
, pErrorCode
);
1970 * swap the group strings
1971 * swap the string bytes but not the nibble-encoded string lengths
/* string bytes only change when the charset family changes */
1973 if(ds
->inCharset
!=ds
->outCharset
) {
1974 uint16_t offsets
[LINES_PER_GROUP
+1], lengths
[LINES_PER_GROUP
+1];
1976 const uint8_t *inStrings
, *nextInStrings
;
1977 uint8_t *outStrings
;
1981 inStrings
=inBytes
+groupStringOffset
;
1982 outStrings
=outBytes
+groupStringOffset
;
/* the group strings span from groupStringOffset up to algNamesOffset */
1984 stringsCount
=algNamesOffset
-groupStringOffset
;
1986 /* iterate through string groups until only a few padding bytes are left */
1987 while(stringsCount
>32) {
/* expandGroupLengths() fills offsets and lengths and returns the position after the length bytes */
1988 nextInStrings
=expandGroupLengths(inStrings
, offsets
, lengths
);
1990 /* move past the length bytes */
1991 stringsCount
-=(uint32_t)(nextInStrings
-inStrings
);
1992 outStrings
+=nextInStrings
-inStrings
;
1993 inStrings
=nextInStrings
;
/*
 * NOTE(review): index 31 is hard-coded here; presumably it equals
 * LINES_PER_GROUP-1 -- confirm against the value of GROUP_SHIFT
 */
1995 count
=offsets
[31]+lengths
[31]; /* total number of string bytes in this group */
1996 stringsCount
-=count
;
1998 /* swap the string bytes using map[] and trailMap[] */
2001 *outStrings
++=map
[c
];
2005 /* token lead byte: swap the trail byte, too */
2006 *outStrings
++=trailMap
[*inStrings
++];
2013 /* swap the algorithmic ranges */
2014 offset
=algNamesOffset
;
2015 count
=ds
->readUInt32(*((const uint32_t *)(inBytes
+offset
)));
/* swap the 32-bit range count itself */
2016 ds
->swapArray32(ds
, inBytes
+offset
, 4, outBytes
+offset
, pErrorCode
);
2019 for(i
=0; i
<count
; ++i
) {
/*
 * NOTE(review): this check only verifies that offset itself does not
 * exceed length; it does not by itself show that a whole AlgorithmicRange
 * starting at offset fits inside the data
 */
2020 if(offset
>(uint32_t)length
) {
2021 udata_printError(ds
, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n",
2023 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
2027 inRange
=(const AlgorithmicRange
*)(inBytes
+offset
);
2028 outRange
=(AlgorithmicRange
*)(outBytes
+offset
);
/* offset now addresses the following range; inRange and outRange keep the current one */
2029 offset
+=ds
->readUInt16(inRange
->size
);
/* the first 8 bytes of the range are swapped as 32-bit values, then the 16-bit size field */
2031 ds
->swapArray32(ds
, inRange
, 8, outRange
, pErrorCode
);
2032 ds
->swapArray16(ds
, &inRange
->size
, 2, &outRange
->size
, pErrorCode
);
/* the remaining layout depends on the range type (case labels are not visible in this listing) */
2033 switch(inRange
->type
) {
2035 /* swap prefix string */
/* the prefix is the NUL-terminated string that directly follows the range header */
2036 ds
->swapInvChars(ds
, inRange
+1, (int32_t)uprv_strlen((const char *)(inRange
+1)),
2037 outRange
+1, pErrorCode
);
2038 if(U_FAILURE(*pErrorCode
)) {
2039 udata_printError(ds
, "uchar_swapNames(prefix string of algorithmic range %u) failed\n",
2046 /* swap factors and the prefix and factor strings */
2047 uint32_t factorsCount
;
/* the variant byte gives the number of 16-bit factors that follow the range header */
2049 factorsCount
=inRange
->variant
;
2050 p
=(const uint16_t *)(inRange
+1);
2051 q
=(uint16_t *)(outRange
+1);
2052 ds
->swapArray16(ds
, p
, (int32_t)(factorsCount
*2), q
, pErrorCode
);
2054 /* swap the strings, up to the last terminating NUL */
/* byte count from p to the start of the next range */
2057 stringsCount
=(uint32_t)((inBytes
+offset
)-(const uint8_t *)p
);
/* trailing bytes after the last NUL are excluded (the loop body is not visible in this listing) */
2058 while(stringsCount
>0 && ((const uint8_t *)p
)[stringsCount
-1]!=0) {
2061 ds
->swapInvChars(ds
, p
, (int32_t)stringsCount
, q
, pErrorCode
);
2065 udata_printError(ds
, "uchar_swapNames(): unknown type %u of algorithmic range %u\n",
2067 *pErrorCode
=U_UNSUPPORTED_ERROR
;
/* total size: swapped header plus the payload bytes up to the end of the last range */
2073 return headerSize
+(int32_t)offset
;
2077 * Hey, Emacs, please set the following:
2080 * indent-tabs-mode: nil