2 ******************************************************************************
4 * Copyright (C) 1999-2009, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 1999oct04
14 * created by: Markus W. Scherer
17 #include "unicode/utypes.h"
18 #include "unicode/putil.h"
19 #include "unicode/uchar.h"
20 #include "unicode/udata.h"
29 /* prototypes ------------------------------------------------------------- */
31 #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
33 static const char DATA_NAME
[] = "unames";
34 static const char DATA_TYPE
[] = "icu";
37 #define LINES_PER_GROUP (1UL<<GROUP_SHIFT)
38 #define GROUP_MASK (LINES_PER_GROUP-1)
41 * This struct was replaced by explicitly accessing equivalent
42 * fields from triples of uint16_t.
43 * The Group struct was padded to 8 bytes on compilers for early ARM CPUs,
44 * which broke the assumption that sizeof(Group)==6 and that the ++ operator
45 * would advance by 6 bytes (3 uint16_t).
47 * We can't just change the data structure because it's loaded from a data file,
48 * and we don't want to make it less compact, so we changed the access code.
50 * For details see ICU tickets 6331 and 6008.
53 offsetHigh, offsetLow; / * avoid padding * /
64 * Get the 32-bit group offset.
65 * @param group (const uint16_t *) pointer to a Group triple of uint16_t
66 * @return group offset (int32_t)
68 #define GET_GROUP_OFFSET(group) ((int32_t)(group)[GROUP_OFFSET_HIGH]<<16|(group)[GROUP_OFFSET_LOW])
70 #define NEXT_GROUP(group) ((group)+GROUP_LENGTH)
71 #define PREV_GROUP(group) ((group)-GROUP_LENGTH)
75 uint8_t type
, variant
;
80 uint32_t tokenStringOffset
, groupsOffset
, groupStringOffset
, algNamesOffset
;
84 * Get the groups table from a UCharNames struct.
85 * The groups table consists of one uint16_t groupCount followed by
86 * groupCount groups. Each group is a triple of uint16_t, see GROUP_LENGTH
87 * and the comment for the old struct Group above.
89 * @param names (const UCharNames *) pointer to the UCharNames indexes
90 * @return (const uint16_t *) pointer to the groups table
92 #define GET_GROUPS(names) (const uint16_t *)((const char *)names+names->groupsOffset)
95 const char *otherName
;
99 #define DO_FIND_NAME NULL
101 static UDataMemory
*uCharNamesData
=NULL
;
102 static UCharNames
*uCharNames
=NULL
;
103 static UErrorCode gLoadErrorCode
=U_ZERO_ERROR
;
106 * Maximum length of character names (regular & 1.0).
108 static int32_t gMaxNameLength
=0;
111 * Set of chars used in character names (regular & 1.0).
112 * Chars are platform-dependent (can be EBCDIC).
114 static uint32_t gNameSet
[8]={ 0 };
116 #define U_NONCHARACTER_CODE_POINT U_CHAR_CATEGORY_COUNT
117 #define U_LEAD_SURROGATE U_CHAR_CATEGORY_COUNT + 1
118 #define U_TRAIL_SURROGATE U_CHAR_CATEGORY_COUNT + 2
120 #define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3)
122 static const char * const charCatNames
[U_CHAR_EXTENDED_CATEGORY_COUNT
] = {
131 "combining spacing mark",
132 "decimal digit number",
137 "paragraph separator",
145 "connector punctuation",
151 "initial punctuation",
158 /* implementation ----------------------------------------------------------- */
160 static UBool U_CALLCONV
unames_cleanup(void)
163 udata_close(uCharNamesData
);
164 uCharNamesData
= NULL
;
173 static UBool U_CALLCONV
174 isAcceptable(void *context
,
175 const char *type
, const char *name
,
176 const UDataInfo
*pInfo
) {
179 pInfo
->isBigEndian
==U_IS_BIG_ENDIAN
&&
180 pInfo
->charsetFamily
==U_CHARSET_FAMILY
&&
181 pInfo
->dataFormat
[0]==0x75 && /* dataFormat="unam" */
182 pInfo
->dataFormat
[1]==0x6e &&
183 pInfo
->dataFormat
[2]==0x61 &&
184 pInfo
->dataFormat
[3]==0x6d &&
185 pInfo
->formatVersion
[0]==1);
189 isDataLoaded(UErrorCode
*pErrorCode
) {
190 /* load UCharNames from file if necessary */
193 /* do this because double-checked locking is broken */
194 UMTX_CHECK(NULL
, (uCharNames
!=NULL
), isCached
);
200 /* check error code from previous attempt */
201 if(U_FAILURE(gLoadErrorCode
)) {
202 *pErrorCode
=gLoadErrorCode
;
206 /* open the data outside the mutex block */
207 data
=udata_openChoice(NULL
, DATA_TYPE
, DATA_NAME
, isAcceptable
, NULL
, pErrorCode
);
208 if(U_FAILURE(*pErrorCode
)) {
209 gLoadErrorCode
=*pErrorCode
;
213 names
=(UCharNames
*)udata_getMemory(data
);
215 /* in the mutex block, set the data for this process */
218 if(uCharNames
==NULL
) {
223 ucln_common_registerCleanup(UCLN_COMMON_UNAMES
, unames_cleanup
);
228 /* if a different thread set it first, then close the extra data */
230 udata_close(data
); /* NULL if it was set correctly */
236 #define WRITE_CHAR(buffer, bufferLength, bufferPos, c) { \
237 if((bufferLength)>0) { \
244 #define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT
247 * Important: expandName() and compareName() are almost the same -
248 * apply fixes to both.
250 * UnicodeData.txt uses ';' as a field separator, so no
251 * field can contain ';' as part of its contents.
252 * In unames.dat, it is marked as token[';']==-1 only if the
253 * semicolon is used in the data file - which is iff we
254 * have Unicode 1.0 names or ISO comments or aliases.
255 * So, it will be token[';']==-1 if we store U1.0 names/ISO comments/aliases
256 * although we know that it will never be part of a name.
259 expandName(UCharNames
*names
,
260 const uint8_t *name
, uint16_t nameLength
, UCharNameChoice nameChoice
,
261 char *buffer
, uint16_t bufferLength
) {
262 uint16_t *tokens
=(uint16_t *)names
+8;
263 uint16_t token
, tokenCount
=*tokens
++, bufferPos
=0;
264 uint8_t *tokenStrings
=(uint8_t *)names
+names
->tokenStringOffset
;
267 if(nameChoice
!=U_UNICODE_CHAR_NAME
&& nameChoice
!=U_EXTENDED_CHAR_NAME
) {
269 * skip the modern name if it is not requested _and_
270 * if the semicolon byte value is a character, not a token number
272 if((uint8_t)';'>=tokenCount
|| tokens
[(uint8_t)';']==(uint16_t)(-1)) {
273 int fieldIndex
= nameChoice
==U_ISO_COMMENT
? 2 : nameChoice
;
275 while(nameLength
>0) {
281 } while(--fieldIndex
>0);
284 * the semicolon byte value is a token number, therefore
285 * only modern names are stored in unames.dat and there is no
286 * such requested alternate name here
292 /* write each letter directly, and write a token word per token */
293 while(nameLength
>0) {
299 /* implicit letter */
300 WRITE_CHAR(buffer
, bufferLength
, bufferPos
, c
);
307 if(token
==(uint16_t)(-2)) {
308 /* this is a lead byte for a double-byte token */
309 token
=tokens
[c
<<8|*name
++];
312 if(token
==(uint16_t)(-1)) {
314 /* explicit letter */
315 WRITE_CHAR(buffer
, bufferLength
, bufferPos
, c
);
317 /* stop, but skip the semicolon if we are seeking
318 extended names and there was no 2.0 name but there
320 if(!bufferPos
&& nameChoice
== U_EXTENDED_CHAR_NAME
) {
321 if ((uint8_t)';'>=tokenCount
|| tokens
[(uint8_t)';']==(uint16_t)(-1)) {
329 /* write token word */
330 uint8_t *tokenString
=tokenStrings
+token
;
331 while((c
=*tokenString
++)!=0) {
332 WRITE_CHAR(buffer
, bufferLength
, bufferPos
, c
);
347 * compareName() is almost the same as expandName() except that it compares
348 * the currently expanded name to an input name.
349 * It returns the match/no match result as soon as possible.
352 compareName(UCharNames
*names
,
353 const uint8_t *name
, uint16_t nameLength
, UCharNameChoice nameChoice
,
354 const char *otherName
) {
355 uint16_t *tokens
=(uint16_t *)names
+8;
356 uint16_t token
, tokenCount
=*tokens
++;
357 uint8_t *tokenStrings
=(uint8_t *)names
+names
->tokenStringOffset
;
359 const char *origOtherName
= otherName
;
361 if(nameChoice
!=U_UNICODE_CHAR_NAME
&& nameChoice
!=U_EXTENDED_CHAR_NAME
) {
363 * skip the modern name if it is not requested _and_
364 * if the semicolon byte value is a character, not a token number
366 if((uint8_t)';'>=tokenCount
|| tokens
[(uint8_t)';']==(uint16_t)(-1)) {
367 int fieldIndex
= nameChoice
==U_ISO_COMMENT
? 2 : nameChoice
;
369 while(nameLength
>0) {
375 } while(--fieldIndex
>0);
378 * the semicolon byte value is a token number, therefore
379 * only modern names are stored in unames.dat and there is no
380 * such requested alternate name here
386 /* compare each letter directly, and compare a token word per token */
387 while(nameLength
>0) {
393 /* implicit letter */
394 if((char)c
!=*otherName
++) {
403 if(token
==(uint16_t)(-2)) {
404 /* this is a lead byte for a double-byte token */
405 token
=tokens
[c
<<8|*name
++];
408 if(token
==(uint16_t)(-1)) {
410 /* explicit letter */
411 if((char)c
!=*otherName
++) {
415 /* stop, but skip the semicolon if we are seeking
416 extended names and there was no 2.0 name but there
418 if(otherName
== origOtherName
&& nameChoice
== U_EXTENDED_CHAR_NAME
) {
419 if ((uint8_t)';'>=tokenCount
|| tokens
[(uint8_t)';']==(uint16_t)(-1)) {
427 /* write token word */
428 uint8_t *tokenString
=tokenStrings
+token
;
429 while((c
=*tokenString
++)!=0) {
430 if((char)c
!=*otherName
++) {
438 /* complete match? */
439 return (UBool
)(*otherName
==0);
442 static uint8_t getCharCat(UChar32 cp
) {
445 if (UTF_IS_UNICODE_NONCHAR(cp
)) {
446 return U_NONCHARACTER_CODE_POINT
;
449 if ((cat
= u_charType(cp
)) == U_SURROGATE
) {
450 cat
= UTF_IS_LEAD(cp
) ? U_LEAD_SURROGATE
: U_TRAIL_SURROGATE
;
456 static const char *getCharCatName(UChar32 cp
) {
457 uint8_t cat
= getCharCat(cp
);
459 /* Return unknown if the table of names above is not up to
462 if (cat
>= LENGTHOF(charCatNames
)) {
465 return charCatNames
[cat
];
469 static uint16_t getExtName(uint32_t code
, char *buffer
, uint16_t bufferLength
) {
470 const char *catname
= getCharCatName(code
);
476 WRITE_CHAR(buffer
, bufferLength
, length
, '<');
477 while (catname
[length
- 1]) {
478 WRITE_CHAR(buffer
, bufferLength
, length
, catname
[length
- 1]);
480 WRITE_CHAR(buffer
, bufferLength
, length
, '-');
481 for (cp
= code
, ndigits
= 0; cp
; ++ndigits
, cp
>>= 4)
485 for (cp
= code
, i
= ndigits
; (cp
|| i
> 0) && bufferLength
; cp
>>= 4, bufferLength
--) {
486 uint8_t v
= (uint8_t)(cp
& 0xf);
487 buffer
[--i
] = (v
< 10 ? '0' + v
: 'A' + v
- 10);
491 WRITE_CHAR(buffer
, bufferLength
, length
, '>');
497 * getGroup() does a binary search for the group that contains the
498 * Unicode code point "code".
499 * The return value is always a valid pointer to a group (a triple of uint16_t) that may contain "code"
500 * or else is the highest group before "code".
501 * If the lowest group is after "code", then that one is returned.
503 static const uint16_t *
504 getGroup(UCharNames
*names
, uint32_t code
) {
505 const uint16_t *groups
=GET_GROUPS(names
);
506 uint16_t groupMSB
=(uint16_t)(code
>>GROUP_SHIFT
),
511 /* binary search for the group of names that contains the one for code */
512 while(start
<limit
-1) {
513 number
=(uint16_t)((start
+limit
)/2);
514 if(groupMSB
<groups
[number
*GROUP_LENGTH
+GROUP_MSB
]) {
521 /* return this regardless of whether it is an exact match */
522 return groups
+start
*GROUP_LENGTH
;
526 * expandGroupLengths() reads a block of compressed lengths of 32 strings and
527 * expands them into offsets and lengths for each string.
528 * Lengths are stored with a variable-width encoding in consecutive nibbles:
529 * If a nibble<0xc, then it is the length itself (0=empty string).
530 * If a nibble>=0xc, then it forms a length value with the following nibble.
531 * See the calculation in the code below.
532 * The offsets and lengths arrays must be at least 33 (one more) long because
533 * there is no check here at the end if the last nibble is still used.
535 static const uint8_t *
536 expandGroupLengths(const uint8_t *s
,
537 uint16_t offsets
[LINES_PER_GROUP
+1], uint16_t lengths
[LINES_PER_GROUP
+1]) {
538 /* read the lengths of the 32 strings in this group and get each string's offset */
539 uint16_t i
=0, offset
=0, length
=0;
542 /* all 32 lengths must be read to get the offset of the first group string */
543 while(i
<LINES_PER_GROUP
) {
546 /* read even nibble - MSBs of lengthByte */
548 /* double-nibble length spread across two bytes */
549 length
=(uint16_t)(((length
&0x3)<<4|lengthByte
>>4)+12);
551 } else if((lengthByte
/* &0xf0 */)>=0xc0) {
552 /* double-nibble length spread across this one byte */
553 length
=(uint16_t)((lengthByte
&0x3f)+12);
555 /* single-nibble length in MSBs */
556 length
=(uint16_t)(lengthByte
>>4);
566 /* read odd nibble - LSBs of lengthByte */
567 if((lengthByte
&0xf0)==0) {
568 /* this nibble was not consumed for a double-nibble length above */
571 /* single-nibble length in LSBs */
579 length
=0; /* prevent double-nibble detection in the next iteration */
583 /* now, s is at the first group string */
588 expandGroupName(UCharNames
*names
, const uint16_t *group
,
589 uint16_t lineNumber
, UCharNameChoice nameChoice
,
590 char *buffer
, uint16_t bufferLength
) {
591 uint16_t offsets
[LINES_PER_GROUP
+2], lengths
[LINES_PER_GROUP
+2];
592 const uint8_t *s
=(uint8_t *)names
+names
->groupStringOffset
+GET_GROUP_OFFSET(group
);
593 s
=expandGroupLengths(s
, offsets
, lengths
);
594 return expandName(names
, s
+offsets
[lineNumber
], lengths
[lineNumber
], nameChoice
,
595 buffer
, bufferLength
);
599 getName(UCharNames
*names
, uint32_t code
, UCharNameChoice nameChoice
,
600 char *buffer
, uint16_t bufferLength
) {
601 const uint16_t *group
=getGroup(names
, code
);
602 if((uint16_t)(code
>>GROUP_SHIFT
)==group
[GROUP_MSB
]) {
603 return expandGroupName(names
, group
, (uint16_t)(code
&GROUP_MASK
), nameChoice
,
604 buffer
, bufferLength
);
606 /* group not found */
616 * enumGroupNames() enumerates all the names in a group of 32 names
617 * and either calls the enumerator function or finds a given input name.
620 enumGroupNames(UCharNames
*names
, const uint16_t *group
,
621 UChar32 start
, UChar32 end
,
622 UEnumCharNamesFn
*fn
, void *context
,
623 UCharNameChoice nameChoice
) {
624 uint16_t offsets
[LINES_PER_GROUP
+2], lengths
[LINES_PER_GROUP
+2];
625 const uint8_t *s
=(uint8_t *)names
+names
->groupStringOffset
+GET_GROUP_OFFSET(group
);
627 s
=expandGroupLengths(s
, offsets
, lengths
);
628 if(fn
!=DO_FIND_NAME
) {
633 length
=expandName(names
, s
+offsets
[start
&GROUP_MASK
], lengths
[start
&GROUP_MASK
], nameChoice
, buffer
, sizeof(buffer
));
634 if (!length
&& nameChoice
== U_EXTENDED_CHAR_NAME
) {
635 buffer
[length
= getExtName(start
, buffer
, sizeof(buffer
))] = 0;
637 /* here, we assume that the buffer is large enough */
639 if(!fn(context
, start
, nameChoice
, buffer
, length
)) {
646 const char *otherName
=((FindName
*)context
)->otherName
;
648 if(compareName(names
, s
+offsets
[start
&GROUP_MASK
], lengths
[start
&GROUP_MASK
], nameChoice
, otherName
)) {
649 ((FindName
*)context
)->code
=start
;
659 * enumExtNames() enumerates extended names.
660 * It only needs to do it if it is called with a real function and not
661 * with the dummy DO_FIND_NAME, because u_charFromName() does a check
662 * for extended names by itself.
665 enumExtNames(UChar32 start
, UChar32 end
,
666 UEnumCharNamesFn
*fn
, void *context
)
668 if(fn
!=DO_FIND_NAME
) {
673 buffer
[length
= getExtName(start
, buffer
, sizeof(buffer
))] = 0;
674 /* here, we assume that the buffer is large enough */
676 if(!fn(context
, start
, U_EXTENDED_CHAR_NAME
, buffer
, length
)) {
688 enumNames(UCharNames
*names
,
689 UChar32 start
, UChar32 limit
,
690 UEnumCharNamesFn
*fn
, void *context
,
691 UCharNameChoice nameChoice
) {
692 uint16_t startGroupMSB
, endGroupMSB
, groupCount
;
693 const uint16_t *group
, *groupLimit
;
695 startGroupMSB
=(uint16_t)(start
>>GROUP_SHIFT
);
696 endGroupMSB
=(uint16_t)((limit
-1)>>GROUP_SHIFT
);
698 /* find the group that contains start, or the highest before it */
699 group
=getGroup(names
, start
);
701 if(startGroupMSB
==endGroupMSB
) {
702 if(startGroupMSB
==group
[GROUP_MSB
]) {
703 /* if start and limit-1 are in the same group, then enumerate only in that one */
704 return enumGroupNames(names
, group
, start
, limit
-1, fn
, context
, nameChoice
);
707 const uint16_t *groups
=GET_GROUPS(names
);
708 groupCount
=*groups
++;
709 groupLimit
=groups
+groupCount
*GROUP_LENGTH
;
711 if(startGroupMSB
==group
[GROUP_MSB
]) {
712 /* enumerate characters in the partial start group */
713 if((start
&GROUP_MASK
)!=0) {
714 if(!enumGroupNames(names
, group
,
715 start
, ((UChar32
)startGroupMSB
<<GROUP_SHIFT
)+LINES_PER_GROUP
-1,
716 fn
, context
, nameChoice
)) {
719 group
=NEXT_GROUP(group
); /* continue with the next group */
721 } else if(startGroupMSB
>group
[GROUP_MSB
]) {
722 /* make sure that we start enumerating with the first group after start */
723 const uint16_t *nextGroup
=NEXT_GROUP(group
);
724 if (nextGroup
< groupLimit
&& nextGroup
[GROUP_MSB
] > startGroupMSB
&& nameChoice
== U_EXTENDED_CHAR_NAME
) {
725 UChar32 end
= nextGroup
[GROUP_MSB
] << GROUP_SHIFT
;
729 if (!enumExtNames(start
, end
- 1, fn
, context
)) {
736 /* enumerate entire groups between the start- and end-groups */
737 while(group
<groupLimit
&& group
[GROUP_MSB
]<endGroupMSB
) {
738 const uint16_t *nextGroup
;
739 start
=(UChar32
)group
[GROUP_MSB
]<<GROUP_SHIFT
;
740 if(!enumGroupNames(names
, group
, start
, start
+LINES_PER_GROUP
-1, fn
, context
, nameChoice
)) {
743 nextGroup
=NEXT_GROUP(group
);
744 if (nextGroup
< groupLimit
&& nextGroup
[GROUP_MSB
] > group
[GROUP_MSB
] + 1 && nameChoice
== U_EXTENDED_CHAR_NAME
) {
745 UChar32 end
= nextGroup
[GROUP_MSB
] << GROUP_SHIFT
;
749 if (!enumExtNames((group
[GROUP_MSB
] + 1) << GROUP_SHIFT
, end
- 1, fn
, context
)) {
756 /* enumerate within the end group (group[GROUP_MSB]==endGroupMSB) */
757 if(group
<groupLimit
&& group
[GROUP_MSB
]==endGroupMSB
) {
758 return enumGroupNames(names
, group
, (limit
-1)&~GROUP_MASK
, limit
-1, fn
, context
, nameChoice
);
759 } else if (nameChoice
== U_EXTENDED_CHAR_NAME
&& group
== groupLimit
) {
760 UChar32 next
= (PREV_GROUP(group
)[GROUP_MSB
] + 1) << GROUP_SHIFT
;
769 /* we have not found a group, which means everything is made of
771 if (nameChoice
== U_EXTENDED_CHAR_NAME
) {
772 if (limit
> UCHAR_MAX_VALUE
+ 1) {
773 limit
= UCHAR_MAX_VALUE
+ 1;
775 return enumExtNames(start
, limit
- 1, fn
, context
);
782 writeFactorSuffix(const uint16_t *factors
, uint16_t count
,
783 const char *s
, /* suffix elements */
785 uint16_t indexes
[8], /* output fields from here */
786 const char *elementBases
[8], const char *elements
[8],
787 char *buffer
, uint16_t bufferLength
) {
788 uint16_t i
, factor
, bufferPos
=0;
791 /* write elements according to the factors */
794 * the factorized elements are determined by modulo arithmetic
795 * with the factors of this algorithm
797 * note that for fewer operations, count is decremented here
800 for(i
=count
; i
>0; --i
) {
802 indexes
[i
]=(uint16_t)(code%factor
);
806 * we don't need to calculate the last modulus because start<=code<=end
807 * guarantees here that code<=factors[0]
809 indexes
[0]=(uint16_t)code
;
811 /* write each element */
813 if(elementBases
!=NULL
) {
817 /* skip indexes[i] strings */
829 WRITE_CHAR(buffer
, bufferLength
, bufferPos
, c
);
832 /* we do not need to perform the rest of this loop for i==count - break here */
837 /* skip the rest of the strings for this factors[i] */
838 factor
=(uint16_t)(factors
[i
]-indexes
[i
]-1);
857 * Parts of findAlgName() are almost the same as some of getAlgName().
858 * Fixes must be applied to both.
861 getAlgName(AlgorithmicRange
*range
, uint32_t code
, UCharNameChoice nameChoice
,
862 char *buffer
, uint16_t bufferLength
) {
863 uint16_t bufferPos
=0;
865 /* Only the normative character name can be algorithmic. */
866 if(nameChoice
!=U_UNICODE_CHAR_NAME
&& nameChoice
!=U_EXTENDED_CHAR_NAME
) {
874 switch(range
->type
) {
876 /* name = prefix hex-digits */
877 const char *s
=(const char *)(range
+1);
884 WRITE_CHAR(buffer
, bufferLength
, bufferPos
, c
);
887 /* write hexadecimal code point value */
888 count
=range
->variant
;
891 if(count
<bufferLength
) {
896 if(--i
<bufferLength
) {
912 /* name = prefix factorized-elements */
914 const uint16_t *factors
=(const uint16_t *)(range
+1);
915 uint16_t count
=range
->variant
;
916 const char *s
=(const char *)(factors
+count
);
921 WRITE_CHAR(buffer
, bufferLength
, bufferPos
, c
);
924 bufferPos
+=writeFactorSuffix(factors
, count
,
925 s
, code
-range
->start
, indexes
, NULL
, NULL
, buffer
, bufferLength
);
941 * Important: enumAlgNames() and findAlgName() are almost the same.
942 * Any fix must be applied to both.
945 enumAlgNames(AlgorithmicRange
*range
,
946 UChar32 start
, UChar32 limit
,
947 UEnumCharNamesFn
*fn
, void *context
,
948 UCharNameChoice nameChoice
) {
952 if(nameChoice
!=U_UNICODE_CHAR_NAME
&& nameChoice
!=U_EXTENDED_CHAR_NAME
) {
956 switch(range
->type
) {
961 /* get the full name of the start character */
962 length
=getAlgName(range
, (uint32_t)start
, nameChoice
, buffer
, sizeof(buffer
));
967 /* call the enumerator function with this first character */
968 if(!fn(context
, start
, nameChoice
, buffer
, length
)) {
972 /* go to the end of the name; all these names have the same length */
978 /* enumerate the rest of the names */
979 while(++start
<limit
) {
980 /* increment the hexadecimal number on a character-basis */
984 if(('0'<=c
&& c
<'9') || ('A'<=c
&& c
<'F')) {
995 if(!fn(context
, start
, nameChoice
, buffer
, length
)) {
1002 uint16_t indexes
[8];
1003 const char *elementBases
[8], *elements
[8];
1004 const uint16_t *factors
=(const uint16_t *)(range
+1);
1005 uint16_t count
=range
->variant
;
1006 const char *s
=(const char *)(factors
+count
);
1008 uint16_t prefixLength
, i
, idx
;
1012 /* name = prefix factorized-elements */
1017 while((c
=*s
++)!=0) {
1022 /* append the suffix of the start character */
1023 length
=(uint16_t)(prefixLength
+writeFactorSuffix(factors
, count
,
1024 s
, (uint32_t)start
-range
->start
,
1025 indexes
, elementBases
, elements
,
1026 suffix
, (uint16_t)(sizeof(buffer
)-prefixLength
)));
1028 /* call the enumerator function with this first character */
1029 if(!fn(context
, start
, nameChoice
, buffer
, length
)) {
1033 /* enumerate the rest of the names */
1034 while(++start
<limit
) {
1035 /* increment the indexes in lexical order bound by the factors */
1038 idx
=(uint16_t)(indexes
[--i
]+1);
1039 if(idx
<factors
[i
]) {
1040 /* skip one index and its element string */
1048 /* reset this index to 0 and its element string to the first one */
1050 elements
[i
]=elementBases
[i
];
1054 /* to make matters a little easier, just append all elements to the suffix */
1056 length
=prefixLength
;
1057 for(i
=0; i
<count
; ++i
) {
1059 while((c
=*s
++)!=0) {
1064 /* zero-terminate */
1067 if(!fn(context
, start
, nameChoice
, buffer
, length
)) {
1074 /* undefined type */
1082 * findAlgName() is almost the same as enumAlgNames() except that it
1083 * returns the code point for a name if it fits into the range.
1084 * It returns 0xffff otherwise.
1087 findAlgName(AlgorithmicRange
*range
, UCharNameChoice nameChoice
, const char *otherName
) {
1090 if(nameChoice
!=U_UNICODE_CHAR_NAME
&& nameChoice
!=U_EXTENDED_CHAR_NAME
) {
1094 switch(range
->type
) {
1096 /* name = prefix hex-digits */
1097 const char *s
=(const char *)(range
+1);
1102 /* compare prefix */
1103 while((c
=*s
++)!=0) {
1104 if((char)c
!=*otherName
++) {
1109 /* read hexadecimal code point value */
1110 count
=range
->variant
;
1112 for(i
=0; i
<count
; ++i
) {
1114 if('0'<=c
&& c
<='9') {
1115 code
=(code
<<4)|(c
-'0');
1116 } else if('A'<=c
&& c
<='F') {
1117 code
=(code
<<4)|(c
-'A'+10);
1123 /* does it fit into the range? */
1124 if(*otherName
==0 && range
->start
<=(uint32_t)code
&& (uint32_t)code
<=range
->end
) {
1131 uint16_t indexes
[8];
1132 const char *elementBases
[8], *elements
[8];
1133 const uint16_t *factors
=(const uint16_t *)(range
+1);
1134 uint16_t count
=range
->variant
;
1135 const char *s
=(const char *)(factors
+count
), *t
;
1136 UChar32 start
, limit
;
1141 /* name = prefix factorized-elements */
1143 /* compare prefix */
1144 while((c
=*s
++)!=0) {
1145 if((char)c
!=*otherName
++) {
1150 start
=(UChar32
)range
->start
;
1151 limit
=(UChar32
)(range
->end
+1);
1153 /* initialize the suffix elements for enumeration; indexes should all be set to 0 */
1154 writeFactorSuffix(factors
, count
, s
, 0,
1155 indexes
, elementBases
, elements
, buffer
, sizeof(buffer
));
1157 /* compare the first suffix */
1158 if(0==uprv_strcmp(otherName
, buffer
)) {
1162 /* enumerate and compare the rest of the suffixes */
1163 while(++start
<limit
) {
1164 /* increment the indexes in lexical order bound by the factors */
1167 idx
=(uint16_t)(indexes
[--i
]+1);
1168 if(idx
<factors
[i
]) {
1169 /* skip one index and its element string */
1176 /* reset this index to 0 and its element string to the first one */
1178 elements
[i
]=elementBases
[i
];
1182 /* to make matters a little easier, just compare all elements of the suffix */
1184 for(i
=0; i
<count
; ++i
) {
1186 while((c
=*s
++)!=0) {
1188 s
=""; /* does not match */
1200 /* undefined type */
1207 /* sets of name characters, maximum name lengths ---------------------------- */
1209 #define SET_ADD(set, c) ((set)[(uint8_t)c>>5]|=((uint32_t)1<<((uint8_t)c&0x1f)))
1210 #define SET_CONTAINS(set, c) (((set)[(uint8_t)c>>5]&((uint32_t)1<<((uint8_t)c&0x1f)))!=0)
1213 calcStringSetLength(uint32_t set
[8], const char *s
) {
1217 while((c
=*s
++)!=0) {
1225 calcAlgNameSetsLengths(int32_t maxNameLength
) {
1226 AlgorithmicRange
*range
;
1228 uint32_t rangeCount
;
1231 /* enumerate algorithmic ranges */
1232 p
=(uint32_t *)((uint8_t *)uCharNames
+uCharNames
->algNamesOffset
);
1234 range
=(AlgorithmicRange
*)(p
+1);
1235 while(rangeCount
>0) {
1236 switch(range
->type
) {
1238 /* name = prefix + (range->variant times) hex-digits */
1240 length
=calcStringSetLength(gNameSet
, (const char *)(range
+1))+range
->variant
;
1241 if(length
>maxNameLength
) {
1242 maxNameLength
=length
;
1246 /* name = prefix factorized-elements */
1247 const uint16_t *factors
=(const uint16_t *)(range
+1);
1249 int32_t i
, count
=range
->variant
, factor
, factorLength
, maxFactorLength
;
1252 s
=(const char *)(factors
+count
);
1253 length
=calcStringSetLength(gNameSet
, s
);
1254 s
+=length
+1; /* start of factor suffixes */
1256 /* get the set and maximum factor suffix length for each factor */
1257 for(i
=0; i
<count
; ++i
) {
1259 for(factor
=factors
[i
]; factor
>0; --factor
) {
1260 factorLength
=calcStringSetLength(gNameSet
, s
);
1262 if(factorLength
>maxFactorLength
) {
1263 maxFactorLength
=factorLength
;
1266 length
+=maxFactorLength
;
1269 if(length
>maxNameLength
) {
1270 maxNameLength
=length
;
1279 range
=(AlgorithmicRange
*)((uint8_t *)range
+range
->size
);
1282 return maxNameLength
;
1286 calcExtNameSetsLengths(int32_t maxNameLength
) {
1289 for(i
=0; i
<LENGTHOF(charCatNames
); ++i
) {
1291 * for each category, count the length of the category name
1295 * 6 for most hex digits per code point
1297 length
=9+calcStringSetLength(gNameSet
, charCatNames
[i
]);
1298 if(length
>maxNameLength
) {
1299 maxNameLength
=length
;
1302 return maxNameLength
;
1306 calcNameSetLength(const uint16_t *tokens
, uint16_t tokenCount
, const uint8_t *tokenStrings
, int8_t *tokenLengths
,
1308 const uint8_t **pLine
, const uint8_t *lineLimit
) {
1309 const uint8_t *line
=*pLine
;
1310 int32_t length
=0, tokenLength
;
1313 while(line
!=lineLimit
&& (c
=*line
++)!=(uint8_t)';') {
1315 /* implicit letter */
1320 if(token
==(uint16_t)(-2)) {
1321 /* this is a lead byte for a double-byte token */
1325 if(token
==(uint16_t)(-1)) {
1326 /* explicit letter */
1330 /* count token word */
1331 if(tokenLengths
!=NULL
) {
1332 /* use cached token length */
1333 tokenLength
=tokenLengths
[c
];
1334 if(tokenLength
==0) {
1335 tokenLength
=calcStringSetLength(set
, (const char *)tokenStrings
+token
);
1336 tokenLengths
[c
]=(int8_t)tokenLength
;
1339 tokenLength
=calcStringSetLength(set
, (const char *)tokenStrings
+token
);
1341 length
+=tokenLength
;
1351 calcGroupNameSetsLengths(int32_t maxNameLength
) {
1352 uint16_t offsets
[LINES_PER_GROUP
+2], lengths
[LINES_PER_GROUP
+2];
1354 uint16_t *tokens
=(uint16_t *)uCharNames
+8;
1355 uint16_t tokenCount
=*tokens
++;
1356 uint8_t *tokenStrings
=(uint8_t *)uCharNames
+uCharNames
->tokenStringOffset
;
1358 int8_t *tokenLengths
;
1360 const uint16_t *group
;
1361 const uint8_t *s
, *line
, *lineLimit
;
1363 int32_t groupCount
, lineNumber
, length
;
1365 tokenLengths
=(int8_t *)uprv_malloc(tokenCount
);
1366 if(tokenLengths
!=NULL
) {
1367 uprv_memset(tokenLengths
, 0, tokenCount
);
1370 group
=GET_GROUPS(uCharNames
);
1371 groupCount
=*group
++;
1373 /* enumerate all groups */
1374 while(groupCount
>0) {
1375 s
=(uint8_t *)uCharNames
+uCharNames
->groupStringOffset
+GET_GROUP_OFFSET(group
);
1376 s
=expandGroupLengths(s
, offsets
, lengths
);
1378 /* enumerate all lines in each group */
1379 for(lineNumber
=0; lineNumber
<LINES_PER_GROUP
; ++lineNumber
) {
1380 line
=s
+offsets
[lineNumber
];
1381 length
=lengths
[lineNumber
];
1386 lineLimit
=line
+length
;
1388 /* read regular name */
1389 length
=calcNameSetLength(tokens
, tokenCount
, tokenStrings
, tokenLengths
, gNameSet
, &line
, lineLimit
);
1390 if(length
>maxNameLength
) {
1391 maxNameLength
=length
;
1393 if(line
==lineLimit
) {
1397 /* read Unicode 1.0 name */
1398 length
=calcNameSetLength(tokens
, tokenCount
, tokenStrings
, tokenLengths
, gNameSet
, &line
, lineLimit
);
1399 if(length
>maxNameLength
) {
1400 maxNameLength
=length
;
1402 if(line
==lineLimit
) {
1406 /* read ISO comment */
1407 /*length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit);*/
1410 group
=NEXT_GROUP(group
);
1414 if(tokenLengths
!=NULL
) {
1415 uprv_free(tokenLengths
);
1418 /* set gMax... - name length last for threading */
1419 gMaxNameLength
=maxNameLength
;
1423 calcNameSetsLengths(UErrorCode
*pErrorCode
) {
1424 static const char extChars
[]="0123456789ABCDEF<>-";
1425 int32_t i
, maxNameLength
;
1427 if(gMaxNameLength
!=0) {
1431 if(!isDataLoaded(pErrorCode
)) {
1435 /* set hex digits, used in various names, and <>-, used in extended names */
1436 for(i
=0; i
<sizeof(extChars
)-1; ++i
) {
1437 SET_ADD(gNameSet
, extChars
[i
]);
1440 /* set sets and lengths from algorithmic names */
1441 maxNameLength
=calcAlgNameSetsLengths(0);
1443 /* set sets and lengths from extended names */
1444 maxNameLength
=calcExtNameSetsLengths(maxNameLength
);
1446 /* set sets and lengths from group names, set global maximum values */
1447 calcGroupNameSetsLengths(maxNameLength
);
1452 /* public API --------------------------------------------------------------- */
1454 U_CAPI
int32_t U_EXPORT2
1455 u_charName(UChar32 code
, UCharNameChoice nameChoice
,
1456 char *buffer
, int32_t bufferLength
,
1457 UErrorCode
*pErrorCode
) {
1458 AlgorithmicRange
*algRange
;
1463 /* check the argument values */
1464 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
1466 } else if(nameChoice
>=U_CHAR_NAME_CHOICE_COUNT
||
1467 bufferLength
<0 || (bufferLength
>0 && buffer
==NULL
)
1469 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1473 if((uint32_t)code
>UCHAR_MAX_VALUE
|| !isDataLoaded(pErrorCode
)) {
1474 return u_terminateChars(buffer
, bufferLength
, 0, pErrorCode
);
1479 /* try algorithmic names first */
1480 p
=(uint32_t *)((uint8_t *)uCharNames
+uCharNames
->algNamesOffset
);
1482 algRange
=(AlgorithmicRange
*)(p
+1);
1484 if(algRange
->start
<=(uint32_t)code
&& (uint32_t)code
<=algRange
->end
) {
1485 length
=getAlgName(algRange
, (uint32_t)code
, nameChoice
, buffer
, (uint16_t)bufferLength
);
1488 algRange
=(AlgorithmicRange
*)((uint8_t *)algRange
+algRange
->size
);
1493 if (nameChoice
== U_EXTENDED_CHAR_NAME
) {
1494 length
= getName(uCharNames
, (uint32_t )code
, U_EXTENDED_CHAR_NAME
, buffer
, (uint16_t) bufferLength
);
1496 /* extended character name */
1497 length
= getExtName((uint32_t) code
, buffer
, (uint16_t) bufferLength
);
1500 /* normal character name */
1501 length
=getName(uCharNames
, (uint32_t)code
, nameChoice
, buffer
, (uint16_t)bufferLength
);
1505 return u_terminateChars(buffer
, bufferLength
, length
, pErrorCode
);
1508 U_CAPI
int32_t U_EXPORT2
1509 u_getISOComment(UChar32 c
,
1510 char *dest
, int32_t destCapacity
,
1511 UErrorCode
*pErrorCode
) {
1514 /* check the argument values */
1515 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
1517 } else if(destCapacity
<0 || (destCapacity
>0 && dest
==NULL
)) {
1518 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1522 if((uint32_t)c
>UCHAR_MAX_VALUE
|| !isDataLoaded(pErrorCode
)) {
1523 return u_terminateChars(dest
, destCapacity
, 0, pErrorCode
);
1526 /* the ISO comment is stored like a normal character name */
1527 length
=getName(uCharNames
, (uint32_t)c
, U_ISO_COMMENT
, dest
, (uint16_t)destCapacity
);
1528 return u_terminateChars(dest
, destCapacity
, length
, pErrorCode
);
1531 U_CAPI UChar32 U_EXPORT2
1532 u_charFromName(UCharNameChoice nameChoice
,
1534 UErrorCode
*pErrorCode
) {
1535 char upper
[120], lower
[120];
1537 AlgorithmicRange
*algRange
;
1542 UChar32 error
= 0xffff; /* Undefined, but use this for backwards compatibility. */
1544 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
1548 if(nameChoice
>=U_CHAR_NAME_CHOICE_COUNT
|| name
==NULL
|| *name
==0) {
1549 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1553 if(!isDataLoaded(pErrorCode
)) {
1557 /* construct the uppercase and lowercase of the name first */
1558 for(i
=0; i
<sizeof(upper
); ++i
) {
1559 if((c0
=*name
++)!=0) {
1560 upper
[i
]=uprv_toupper(c0
);
1561 lower
[i
]=uprv_tolower(c0
);
1563 upper
[i
]=lower
[i
]=0;
1567 if(i
==sizeof(upper
)) {
1568 /* name too long, there is no such character */
1569 *pErrorCode
= U_ILLEGAL_CHAR_FOUND
;
1573 /* try extended names first */
1574 if (lower
[0] == '<') {
1575 if (nameChoice
== U_EXTENDED_CHAR_NAME
) {
1576 if (lower
[--i
] == '>') {
1577 for (--i
; lower
[i
] && lower
[i
] != '-'; --i
) {
1580 if (lower
[i
] == '-') { /* We've got a category. */
1585 for (++i
; lower
[i
] != '>'; ++i
) {
1586 if (lower
[i
] >= '0' && lower
[i
] <= '9') {
1587 cp
= (cp
<< 4) + lower
[i
] - '0';
1588 } else if (lower
[i
] >= 'a' && lower
[i
] <= 'f') {
1589 cp
= (cp
<< 4) + lower
[i
] - 'a' + 10;
1591 *pErrorCode
= U_ILLEGAL_CHAR_FOUND
;
1596 /* Now validate the category name.
1597 We could use a binary search, or a trie, if
1598 we really wanted to. */
1600 for (lower
[i
] = 0, cIdx
= 0; cIdx
< LENGTHOF(charCatNames
); ++cIdx
) {
1602 if (!uprv_strcmp(lower
+ 1, charCatNames
[cIdx
])) {
1603 if (getCharCat(cp
) == cIdx
) {
1613 *pErrorCode
= U_ILLEGAL_CHAR_FOUND
;
1617 /* try algorithmic names now */
1618 p
=(uint32_t *)((uint8_t *)uCharNames
+uCharNames
->algNamesOffset
);
1620 algRange
=(AlgorithmicRange
*)(p
+1);
1622 if((cp
=findAlgName(algRange
, nameChoice
, upper
))!=0xffff) {
1625 algRange
=(AlgorithmicRange
*)((uint8_t *)algRange
+algRange
->size
);
1629 /* normal character name */
1630 findName
.otherName
=upper
;
1631 findName
.code
=error
;
1632 enumNames(uCharNames
, 0, UCHAR_MAX_VALUE
+ 1, DO_FIND_NAME
, &findName
, nameChoice
);
1633 if (findName
.code
== error
) {
1634 *pErrorCode
= U_ILLEGAL_CHAR_FOUND
;
1636 return findName
.code
;
1639 U_CAPI
void U_EXPORT2
1640 u_enumCharNames(UChar32 start
, UChar32 limit
,
1641 UEnumCharNamesFn
*fn
,
1643 UCharNameChoice nameChoice
,
1644 UErrorCode
*pErrorCode
) {
1645 AlgorithmicRange
*algRange
;
1649 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
1653 if(nameChoice
>=U_CHAR_NAME_CHOICE_COUNT
|| fn
==NULL
) {
1654 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1658 if((uint32_t) limit
> UCHAR_MAX_VALUE
+ 1) {
1659 limit
= UCHAR_MAX_VALUE
+ 1;
1661 if((uint32_t)start
>=(uint32_t)limit
) {
1665 if(!isDataLoaded(pErrorCode
)) {
1669 /* interleave the data-driven ones with the algorithmic ones */
1670 /* iterate over all algorithmic ranges; assume that they are in ascending order */
1671 p
=(uint32_t *)((uint8_t *)uCharNames
+uCharNames
->algNamesOffset
);
1673 algRange
=(AlgorithmicRange
*)(p
+1);
1675 /* enumerate the character names before the current algorithmic range */
1676 /* here: start<limit */
1677 if((uint32_t)start
<algRange
->start
) {
1678 if((uint32_t)limit
<=algRange
->start
) {
1679 enumNames(uCharNames
, start
, limit
, fn
, context
, nameChoice
);
1682 if(!enumNames(uCharNames
, start
, (UChar32
)algRange
->start
, fn
, context
, nameChoice
)) {
1685 start
=(UChar32
)algRange
->start
;
1687 /* enumerate the character names in the current algorithmic range */
1688 /* here: algRange->start<=start<limit */
1689 if((uint32_t)start
<=algRange
->end
) {
1690 if((uint32_t)limit
<=(algRange
->end
+1)) {
1691 enumAlgNames(algRange
, start
, limit
, fn
, context
, nameChoice
);
1694 if(!enumAlgNames(algRange
, start
, (UChar32
)algRange
->end
+1, fn
, context
, nameChoice
)) {
1697 start
=(UChar32
)algRange
->end
+1;
1699 /* continue to the next algorithmic range (here: start<limit) */
1700 algRange
=(AlgorithmicRange
*)((uint8_t *)algRange
+algRange
->size
);
1703 /* enumerate the character names after the last algorithmic range */
1704 enumNames(uCharNames
, start
, limit
, fn
, context
, nameChoice
);
1707 U_CAPI
int32_t U_EXPORT2
1708 uprv_getMaxCharNameLength() {
1709 UErrorCode errorCode
=U_ZERO_ERROR
;
1710 if(calcNameSetsLengths(&errorCode
)) {
1711 return gMaxNameLength
;
1718 * Converts the char set cset into a Unicode set uset.
1719 * @param cset Set of 256 bit flags corresponding to a set of chars.
1720 * @param uset USet to receive characters. Existing contents are deleted.
1723 charSetToUSet(uint32_t cset
[8], const USetAdder
*sa
) {
1728 UErrorCode errorCode
;
1730 errorCode
=U_ZERO_ERROR
;
1732 if(!calcNameSetsLengths(&errorCode
)) {
1736 /* build a char string with all chars that are used in character names */
1738 for(i
=0; i
<256; ++i
) {
1739 if(SET_CONTAINS(cset
, i
)) {
1740 cs
[length
++]=(char)i
;
1744 /* convert the char string to a UChar string */
1745 u_charsToUChars(cs
, us
, length
);
1747 /* add each UChar to the USet */
1748 for(i
=0; i
<length
; ++i
) {
1749 if(us
[i
]!=0 || cs
[i
]==0) { /* non-invariant chars become (UChar)0 */
1750 sa
->add(sa
->set
, us
[i
]);
1756 * Fills set with characters that are used in Unicode character names.
1757 * @param set USet to receive characters.
1759 U_CAPI
void U_EXPORT2
1760 uprv_getCharNameCharacters(const USetAdder
*sa
) {
1761 charSetToUSet(gNameSet
, sa
);
/* data swapping ------------------------------------------------------------ */
1767 * The token table contains non-negative entries for token bytes,
1768 * and -1 for bytes that represent themselves in the data file's charset.
1769 * -2 entries are used for lead bytes.
1771 * Direct bytes (-1 entries) must be translated from the input charset family
1772 * to the output charset family.
1773 * makeTokenMap() writes a permutation mapping for this.
1774 * Use it once for single-/lead-byte tokens and once more for all trail byte
1775 * tokens. (';' is an unused trail byte marked with -1.)
1778 makeTokenMap(const UDataSwapper
*ds
,
1779 int16_t tokens
[], uint16_t tokenCount
,
1781 UErrorCode
*pErrorCode
) {
1782 UBool usedOutChar
[256];
1786 if(U_FAILURE(*pErrorCode
)) {
1790 if(ds
->inCharset
==ds
->outCharset
) {
1791 /* Same charset family: identity permutation */
1792 for(i
=0; i
<256; ++i
) {
1796 uprv_memset(map
, 0, 256);
1797 uprv_memset(usedOutChar
, 0, 256);
1799 if(tokenCount
>256) {
1803 /* set the direct bytes (byte 0 always maps to itself) */
1804 for(i
=1; i
<tokenCount
; ++i
) {
1806 /* convert the direct byte character */
1808 ds
->swapInvChars(ds
, &c1
, 1, &c2
, pErrorCode
);
1809 if(U_FAILURE(*pErrorCode
)) {
1810 udata_printError(ds
, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n",
1815 /* enter the converted character into the map and mark it used */
1817 usedOutChar
[c2
]=TRUE
;
1821 /* set the mappings for the rest of the permutation */
1822 for(i
=j
=1; i
<tokenCount
; ++i
) {
1823 /* set mappings that were not set for direct bytes */
1825 /* set an output byte value that was not used as an output byte above */
1826 while(usedOutChar
[j
]) {
1829 map
[i
]=(uint8_t)j
++;
1834 * leave mappings at tokenCount and above unset if tokenCount<256
1835 * because they won't be used
1840 U_CAPI
int32_t U_EXPORT2
1841 uchar_swapNames(const UDataSwapper
*ds
,
1842 const void *inData
, int32_t length
, void *outData
,
1843 UErrorCode
*pErrorCode
) {
1844 const UDataInfo
*pInfo
;
1847 const uint8_t *inBytes
;
1850 uint32_t tokenStringOffset
, groupsOffset
, groupStringOffset
, algNamesOffset
,
1851 offset
, i
, count
, stringsCount
;
1853 const AlgorithmicRange
*inRange
;
1854 AlgorithmicRange
*outRange
;
1856 /* udata_swapDataHeader checks the arguments */
1857 headerSize
=udata_swapDataHeader(ds
, inData
, length
, outData
, pErrorCode
);
1858 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
1862 /* check data format and format version */
1863 pInfo
=(const UDataInfo
*)((const char *)inData
+4);
1865 pInfo
->dataFormat
[0]==0x75 && /* dataFormat="unam" */
1866 pInfo
->dataFormat
[1]==0x6e &&
1867 pInfo
->dataFormat
[2]==0x61 &&
1868 pInfo
->dataFormat
[3]==0x6d &&
1869 pInfo
->formatVersion
[0]==1
1871 udata_printError(ds
, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n",
1872 pInfo
->dataFormat
[0], pInfo
->dataFormat
[1],
1873 pInfo
->dataFormat
[2], pInfo
->dataFormat
[3],
1874 pInfo
->formatVersion
[0]);
1875 *pErrorCode
=U_UNSUPPORTED_ERROR
;
1879 inBytes
=(const uint8_t *)inData
+headerSize
;
1880 outBytes
=(uint8_t *)outData
+headerSize
;
1882 algNamesOffset
=ds
->readUInt32(((const uint32_t *)inBytes
)[3]);
1886 (uint32_t)length
<(algNamesOffset
=ds
->readUInt32(((const uint32_t *)inBytes
)[3]))
1888 udata_printError(ds
, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n",
1890 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
1896 /* preflighting: iterate through algorithmic ranges */
1897 offset
=algNamesOffset
;
1898 count
=ds
->readUInt32(*((const uint32_t *)(inBytes
+offset
)));
1901 for(i
=0; i
<count
; ++i
) {
1902 inRange
=(const AlgorithmicRange
*)(inBytes
+offset
);
1903 offset
+=ds
->readUInt16(inRange
->size
);
1910 int16_t tokens
[512];
1911 uint16_t tokenCount
;
1913 uint8_t map
[256], trailMap
[256];
1915 /* copy the data for inaccessible bytes */
1916 if(inBytes
!=outBytes
) {
1917 uprv_memcpy(outBytes
, inBytes
, length
);
1920 /* the initial 4 offsets first */
1921 tokenStringOffset
=ds
->readUInt32(((const uint32_t *)inBytes
)[0]);
1922 groupsOffset
=ds
->readUInt32(((const uint32_t *)inBytes
)[1]);
1923 groupStringOffset
=ds
->readUInt32(((const uint32_t *)inBytes
)[2]);
1924 ds
->swapArray32(ds
, inBytes
, 16, outBytes
, pErrorCode
);
1927 * now the tokens table
1928 * it needs to be permutated along with the compressed name strings
1930 p
=(const uint16_t *)(inBytes
+16);
1931 q
=(uint16_t *)(outBytes
+16);
1933 /* read and swap the tokenCount */
1934 tokenCount
=ds
->readUInt16(*p
);
1935 ds
->swapArray16(ds
, p
, 2, q
, pErrorCode
);
1939 /* read the first 512 tokens and make the token maps */
1940 if(tokenCount
<=512) {
1945 for(i
=0; i
<count
; ++i
) {
1946 tokens
[i
]=udata_readInt16(ds
, p
[i
]);
1949 tokens
[i
]=0; /* fill the rest of the tokens array if tokenCount<512 */
1951 makeTokenMap(ds
, tokens
, tokenCount
, map
, pErrorCode
);
1952 makeTokenMap(ds
, tokens
+256, (uint16_t)(tokenCount
>256 ? tokenCount
-256 : 0), trailMap
, pErrorCode
);
1953 if(U_FAILURE(*pErrorCode
)) {
1958 * swap and permutate the tokens
1959 * go through a temporary array to support in-place swapping
1961 temp
=(uint16_t *)uprv_malloc(tokenCount
*2);
1963 udata_printError(ds
, "out of memory swapping %u unames.icu tokens\n",
1965 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
1969 /* swap and permutate single-/lead-byte tokens */
1970 for(i
=0; i
<tokenCount
&& i
<256; ++i
) {
1971 ds
->swapArray16(ds
, p
+i
, 2, temp
+map
[i
], pErrorCode
);
1974 /* swap and permutate trail-byte tokens */
1975 for(; i
<tokenCount
; ++i
) {
1976 ds
->swapArray16(ds
, p
+i
, 2, temp
+(i
&0xffffff00)+trailMap
[i
&0xff], pErrorCode
);
1979 /* copy the result into the output and free the temporary array */
1980 uprv_memcpy(q
, temp
, tokenCount
*2);
1984 * swap the token strings but not a possible padding byte after
1985 * the terminating NUL of the last string
1987 udata_swapInvStringBlock(ds
, inBytes
+tokenStringOffset
, (int32_t)(groupsOffset
-tokenStringOffset
),
1988 outBytes
+tokenStringOffset
, pErrorCode
);
1989 if(U_FAILURE(*pErrorCode
)) {
1990 udata_printError(ds
, "uchar_swapNames(token strings) failed\n");
1994 /* swap the group table */
1995 count
=ds
->readUInt16(*((const uint16_t *)(inBytes
+groupsOffset
)));
1996 ds
->swapArray16(ds
, inBytes
+groupsOffset
, (int32_t)((1+count
*3)*2),
1997 outBytes
+groupsOffset
, pErrorCode
);
2000 * swap the group strings
2001 * swap the string bytes but not the nibble-encoded string lengths
2003 if(ds
->inCharset
!=ds
->outCharset
) {
2004 uint16_t offsets
[LINES_PER_GROUP
+1], lengths
[LINES_PER_GROUP
+1];
2006 const uint8_t *inStrings
, *nextInStrings
;
2007 uint8_t *outStrings
;
2011 inStrings
=inBytes
+groupStringOffset
;
2012 outStrings
=outBytes
+groupStringOffset
;
2014 stringsCount
=algNamesOffset
-groupStringOffset
;
2016 /* iterate through string groups until only a few padding bytes are left */
2017 while(stringsCount
>32) {
2018 nextInStrings
=expandGroupLengths(inStrings
, offsets
, lengths
);
2020 /* move past the length bytes */
2021 stringsCount
-=(uint32_t)(nextInStrings
-inStrings
);
2022 outStrings
+=nextInStrings
-inStrings
;
2023 inStrings
=nextInStrings
;
2025 count
=offsets
[31]+lengths
[31]; /* total number of string bytes in this group */
2026 stringsCount
-=count
;
2028 /* swap the string bytes using map[] and trailMap[] */
2031 *outStrings
++=map
[c
];
2035 /* token lead byte: swap the trail byte, too */
2036 *outStrings
++=trailMap
[*inStrings
++];
2043 /* swap the algorithmic ranges */
2044 offset
=algNamesOffset
;
2045 count
=ds
->readUInt32(*((const uint32_t *)(inBytes
+offset
)));
2046 ds
->swapArray32(ds
, inBytes
+offset
, 4, outBytes
+offset
, pErrorCode
);
2049 for(i
=0; i
<count
; ++i
) {
2050 if(offset
>(uint32_t)length
) {
2051 udata_printError(ds
, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n",
2053 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
2057 inRange
=(const AlgorithmicRange
*)(inBytes
+offset
);
2058 outRange
=(AlgorithmicRange
*)(outBytes
+offset
);
2059 offset
+=ds
->readUInt16(inRange
->size
);
2061 ds
->swapArray32(ds
, inRange
, 8, outRange
, pErrorCode
);
2062 ds
->swapArray16(ds
, &inRange
->size
, 2, &outRange
->size
, pErrorCode
);
2063 switch(inRange
->type
) {
2065 /* swap prefix string */
2066 ds
->swapInvChars(ds
, inRange
+1, (int32_t)uprv_strlen((const char *)(inRange
+1)),
2067 outRange
+1, pErrorCode
);
2068 if(U_FAILURE(*pErrorCode
)) {
2069 udata_printError(ds
, "uchar_swapNames(prefix string of algorithmic range %u) failed\n",
2076 /* swap factors and the prefix and factor strings */
2077 uint32_t factorsCount
;
2079 factorsCount
=inRange
->variant
;
2080 p
=(const uint16_t *)(inRange
+1);
2081 q
=(uint16_t *)(outRange
+1);
2082 ds
->swapArray16(ds
, p
, (int32_t)(factorsCount
*2), q
, pErrorCode
);
2084 /* swap the strings, up to the last terminating NUL */
2087 stringsCount
=(uint32_t)((inBytes
+offset
)-(const uint8_t *)p
);
2088 while(stringsCount
>0 && ((const uint8_t *)p
)[stringsCount
-1]!=0) {
2091 ds
->swapInvChars(ds
, p
, (int32_t)stringsCount
, q
, pErrorCode
);
2095 udata_printError(ds
, "uchar_swapNames(): unknown type %u of algorithmic range %u\n",
2097 *pErrorCode
=U_UNSUPPORTED_ERROR
;
2103 return headerSize
+(int32_t)offset
;
2107 * Hey, Emacs, please set the following:
2110 * indent-tabs-mode: nil