1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 ******************************************************************************
6 * Copyright (C) 1999-2014, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 ******************************************************************************
12 * tab size: 8 (not used)
15 * created on: 1999oct04
16 * created by: Markus W. Scherer
19 #include "unicode/utypes.h"
20 #include "unicode/putil.h"
21 #include "unicode/uchar.h"
22 #include "unicode/udata.h"
23 #include "unicode/utf.h"
24 #include "unicode/utf16.h"
36 /* prototypes ------------------------------------------------------------- */
38 static const char DATA_NAME
[] = "unames";
39 static const char DATA_TYPE
[] = "icu";
42 #define LINES_PER_GROUP (1L<<GROUP_SHIFT)
43 #define GROUP_MASK (LINES_PER_GROUP-1)
46 * This struct was replaced by explicitly accessing equivalent
47 * fields from triples of uint16_t.
48 * The Group struct was padded to 8 bytes on compilers for early ARM CPUs,
49 * which broke the assumption that sizeof(Group)==6 and that the ++ operator
50 * would advance by 6 bytes (3 uint16_t).
52 * We can't just change the data structure because it's loaded from a data file,
53 * and we don't want to make it less compact, so we changed the access code.
55 * For details see ICU tickets 6331 and 6008.
58 offsetHigh, offsetLow; / * avoid padding * /
69 * Get the 32-bit group offset.
70 * @param group (const uint16_t *) pointer to a Group triple of uint16_t
71 * @return group offset (int32_t)
/*
 * Combine the high and low 16-bit halves of a group's offset into one
 * int32_t. The halves are joined as uint32_t and only then converted,
 * because left-shifting a signed value into the sign bit is undefined.
 * Note: the group argument is evaluated twice.
 */
#define GET_GROUP_OFFSET(group) ((int32_t)((uint32_t)(group)[GROUP_OFFSET_HIGH]<<16|(group)[GROUP_OFFSET_LOW]))
75 #define NEXT_GROUP(group) ((group)+GROUP_LENGTH)
76 #define PREV_GROUP(group) ((group)-GROUP_LENGTH)
80 uint8_t type
, variant
;
85 uint32_t tokenStringOffset
, groupsOffset
, groupStringOffset
, algNamesOffset
;
89 * Get the groups table from a UCharNames struct.
90 * The groups table consists of one uint16_t groupCount followed by
91 * groupCount groups. Each group is a triple of uint16_t, see GROUP_LENGTH
92 * and the comment for the old struct Group above.
94 * @param names (const UCharNames *) pointer to the UCharNames indexes
95 * @return (const uint16_t *) pointer to the groups table
/*
 * Locate the groups table: the address of names plus names->groupsOffset
 * bytes, viewed as const uint16_t *.
 * The argument and the whole expansion are parenthesized so that any pointer
 * expression (e.g. &obj, p + 0) can be passed and the result can be indexed
 * or dereferenced directly. Note: names is evaluated twice.
 */
#define GET_GROUPS(names) ((const uint16_t *)((const char *)(names)+(names)->groupsOffset))
100 const char *otherName
;
104 #define DO_FIND_NAME NULL
106 static UDataMemory
*uCharNamesData
=NULL
;
107 static UCharNames
*uCharNames
=NULL
;
108 static icu::UInitOnce gCharNamesInitOnce
= U_INITONCE_INITIALIZER
;
111 * Maximum length of character names (regular & 1.0).
113 static int32_t gMaxNameLength
=0;
116 * Set of chars used in character names (regular & 1.0).
117 * Chars are platform-dependent (can be EBCDIC).
119 static uint32_t gNameSet
[8]={ 0 };
/*
 * Extra category values used by this file in addition to the regular
 * general categories; they are numbered from U_CHAR_CATEGORY_COUNT upward.
 * Each replacement list is a single parenthesized expression so that the
 * macros stay correct inside larger expressions.
 */
#define U_NONCHARACTER_CODE_POINT (U_CHAR_CATEGORY_COUNT)
#define U_LEAD_SURROGATE (U_CHAR_CATEGORY_COUNT + 1)
#define U_TRAIL_SURROGATE (U_CHAR_CATEGORY_COUNT + 2)

/* Number of categories including the three extra values above. */
#define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3)
127 static const char * const charCatNames
[U_CHAR_EXTENDED_CATEGORY_COUNT
] = {
136 "combining spacing mark",
137 "decimal digit number",
142 "paragraph separator",
150 "connector punctuation",
156 "initial punctuation",
163 /* implementation ----------------------------------------------------------- */
165 static UBool U_CALLCONV
unames_cleanup(void)
168 udata_close(uCharNamesData
);
169 uCharNamesData
= NULL
;
174 gCharNamesInitOnce
.reset();
/*
 * Acceptance callback handed to udata_openChoice() by loadCharNames().
 * The context, type and name arguments are unnamed and unused.
 * The visible condition requires all of:
 *   - pInfo->isBigEndian equals U_IS_BIG_ENDIAN,
 *   - pInfo->charsetFamily equals U_CHARSET_FAMILY,
 *   - pInfo->dataFormat is the four bytes 0x75 0x6e 0x61 0x6d ("unam"),
 *   - pInfo->formatVersion[0] is 1.
 * NOTE(review): the start of the return statement is not visible here;
 * presumably the conjunction below is the returned UBool - confirm.
 */
179 static UBool U_CALLCONV
180 isAcceptable(void * /*context*/,
181 const char * /*type*/, const char * /*name*/,
182 const UDataInfo
*pInfo
) {
185 pInfo
->isBigEndian
==U_IS_BIG_ENDIAN
&&
186 pInfo
->charsetFamily
==U_CHARSET_FAMILY
&&
187 pInfo
->dataFormat
[0]==0x75 && /* dataFormat="unam" */
188 pInfo
->dataFormat
[1]==0x6e &&
189 pInfo
->dataFormat
[2]==0x61 &&
190 pInfo
->dataFormat
[3]==0x6d &&
191 pInfo
->formatVersion
[0]==1);
/*
 * Load the character-names data; invoked through umtx_initOnce() from
 * isDataLoaded().
 * Asserts that uCharNamesData and uCharNames are both still NULL, then
 * opens the data item DATA_NAME ("unames") of type DATA_TYPE ("icu") with
 * udata_openChoice(), using isAcceptable as the acceptance callback.
 * If status reports a failure, uCharNamesData is reset to NULL.
 * uCharNames is set to the pointer returned by udata_getMemory(), and
 * unames_cleanup is registered under UCLN_COMMON_UNAMES.
 * NOTE(review): the branch structure between the failure check and the
 * udata_getMemory() call is not visible here; presumably the latter runs
 * only on success - confirm.
 *
 * @param status in/out error code passed to udata_openChoice()
 */
194 static void U_CALLCONV
195 loadCharNames(UErrorCode
&status
) {
196 U_ASSERT(uCharNamesData
== NULL
);
197 U_ASSERT(uCharNames
== NULL
);
199 uCharNamesData
= udata_openChoice(NULL
, DATA_TYPE
, DATA_NAME
, isAcceptable
, NULL
, &status
);
200 if(U_FAILURE(status
)) {
201 uCharNamesData
= NULL
;
203 uCharNames
= (UCharNames
*)udata_getMemory(uCharNamesData
);
205 ucln_common_registerCleanup(UCLN_COMMON_UNAMES
, unames_cleanup
);
/*
 * Make sure the names data has been loaded: runs loadCharNames() through
 * umtx_initOnce() guarded by gCharNamesInitOnce, passing *pErrorCode.
 *
 * @param pErrorCode in/out error code; dereferenced unconditionally
 * @return U_SUCCESS(*pErrorCode) after the init-once call
 */
210 isDataLoaded(UErrorCode
*pErrorCode
) {
211 umtx_initOnce(gCharNamesInitOnce
, &loadCharNames
, *pErrorCode
);
212 return U_SUCCESS(*pErrorCode
);
215 #define WRITE_CHAR(buffer, bufferLength, bufferPos, c) { \
216 if((bufferLength)>0) { \
223 #define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT
226 * Important: expandName() and compareName() are almost the same -
227 * apply fixes to both.
229 * UnicodeData.txt uses ';' as a field separator, so no
230 * field can contain ';' as part of its contents.
231 * In unames.dat, it is marked as token[';']==-1 only if the
232 * semicolon is used in the data file - which is iff we
233 * have Unicode 1.0 names or ISO comments or aliases.
234 * So, it will be token[';']==-1 if we store U1.0 names/ISO comments/aliases
235 * although we know that it will never be part of a name.
238 expandName(UCharNames
*names
,
239 const uint8_t *name
, uint16_t nameLength
, UCharNameChoice nameChoice
,
240 char *buffer
, uint16_t bufferLength
) {
241 uint16_t *tokens
=(uint16_t *)names
+8;
242 uint16_t token
, tokenCount
=*tokens
++, bufferPos
=0;
243 uint8_t *tokenStrings
=(uint8_t *)names
+names
->tokenStringOffset
;
246 if(nameChoice
!=U_UNICODE_CHAR_NAME
&& nameChoice
!=U_EXTENDED_CHAR_NAME
) {
248 * skip the modern name if it is not requested _and_
249 * if the semicolon byte value is a character, not a token number
251 if((uint8_t)';'>=tokenCount
|| tokens
[(uint8_t)';']==(uint16_t)(-1)) {
252 int fieldIndex
= nameChoice
==U_ISO_COMMENT
? 2 : nameChoice
;
254 while(nameLength
>0) {
260 } while(--fieldIndex
>0);
263 * the semicolon byte value is a token number, therefore
264 * only modern names are stored in unames.dat and there is no
265 * such requested alternate name here
271 /* write each letter directly, and write a token word per token */
272 while(nameLength
>0) {
278 /* implicit letter */
279 WRITE_CHAR(buffer
, bufferLength
, bufferPos
, c
);
286 if(token
==(uint16_t)(-2)) {
287 /* this is a lead byte for a double-byte token */
288 token
=tokens
[c
<<8|*name
++];
291 if(token
==(uint16_t)(-1)) {
293 /* explicit letter */
294 WRITE_CHAR(buffer
, bufferLength
, bufferPos
, c
);
296 /* stop, but skip the semicolon if we are seeking
297 extended names and there was no 2.0 name but there
299 if(!bufferPos
&& nameChoice
== U_EXTENDED_CHAR_NAME
) {
300 if ((uint8_t)';'>=tokenCount
|| tokens
[(uint8_t)';']==(uint16_t)(-1)) {
308 /* write token word */
309 uint8_t *tokenString
=tokenStrings
+token
;
310 while((c
=*tokenString
++)!=0) {
311 WRITE_CHAR(buffer
, bufferLength
, bufferPos
, c
);
326 * compareName() is almost the same as expandName() except that it compares
327 * the currently expanded name to an input name.
328 * It returns the match/no match result as soon as possible.
331 compareName(UCharNames
*names
,
332 const uint8_t *name
, uint16_t nameLength
, UCharNameChoice nameChoice
,
333 const char *otherName
) {
334 uint16_t *tokens
=(uint16_t *)names
+8;
335 uint16_t token
, tokenCount
=*tokens
++;
336 uint8_t *tokenStrings
=(uint8_t *)names
+names
->tokenStringOffset
;
338 const char *origOtherName
= otherName
;
340 if(nameChoice
!=U_UNICODE_CHAR_NAME
&& nameChoice
!=U_EXTENDED_CHAR_NAME
) {
342 * skip the modern name if it is not requested _and_
343 * if the semicolon byte value is a character, not a token number
345 if((uint8_t)';'>=tokenCount
|| tokens
[(uint8_t)';']==(uint16_t)(-1)) {
346 int fieldIndex
= nameChoice
==U_ISO_COMMENT
? 2 : nameChoice
;
348 while(nameLength
>0) {
354 } while(--fieldIndex
>0);
357 * the semicolon byte value is a token number, therefore
358 * only modern names are stored in unames.dat and there is no
359 * such requested alternate name here
365 /* compare each letter directly, and compare a token word per token */
366 while(nameLength
>0) {
372 /* implicit letter */
373 if((char)c
!=*otherName
++) {
382 if(token
==(uint16_t)(-2)) {
383 /* this is a lead byte for a double-byte token */
384 token
=tokens
[c
<<8|*name
++];
387 if(token
==(uint16_t)(-1)) {
389 /* explicit letter */
390 if((char)c
!=*otherName
++) {
394 /* stop, but skip the semicolon if we are seeking
395 extended names and there was no 2.0 name but there
397 if(otherName
== origOtherName
&& nameChoice
== U_EXTENDED_CHAR_NAME
) {
398 if ((uint8_t)';'>=tokenCount
|| tokens
[(uint8_t)';']==(uint16_t)(-1)) {
406 /* compare token word */
407 uint8_t *tokenString
=tokenStrings
+token
;
408 while((c
=*tokenString
++)!=0) {
409 if((char)c
!=*otherName
++) {
417 /* complete match? */
418 return (UBool
)(*otherName
==0);
/*
 * Map a code point to the category value used to index charCatNames.
 * Noncharacters (U_IS_UNICODE_NONCHAR) yield U_NONCHARACTER_CODE_POINT.
 * Otherwise the u_charType() value is taken, except that U_SURROGATE is
 * replaced by U_LEAD_SURROGATE or U_TRAIL_SURROGATE depending on
 * U_IS_LEAD(cp).
 * NOTE(review): the final return is not visible here; presumably cat is
 * returned - confirm.
 */
421 static uint8_t getCharCat(UChar32 cp
) {
424 if (U_IS_UNICODE_NONCHAR(cp
)) {
425 return U_NONCHARACTER_CODE_POINT
;
428 if ((cat
= u_charType(cp
)) == U_SURROGATE
) {
429 cat
= U_IS_LEAD(cp
) ? U_LEAD_SURROGATE
: U_TRAIL_SURROGATE
;
435 static const char *getCharCatName(UChar32 cp
) {
436 uint8_t cat
= getCharCat(cp
);
438 /* Return unknown if the table of names above is not up to
441 if (cat
>= UPRV_LENGTHOF(charCatNames
)) {
444 return charCatNames
[cat
];
448 static uint16_t getExtName(uint32_t code
, char *buffer
, uint16_t bufferLength
) {
449 const char *catname
= getCharCatName(code
);
455 WRITE_CHAR(buffer
, bufferLength
, length
, '<');
456 while (catname
[length
- 1]) {
457 WRITE_CHAR(buffer
, bufferLength
, length
, catname
[length
- 1]);
459 WRITE_CHAR(buffer
, bufferLength
, length
, '-');
460 for (cp
= code
, ndigits
= 0; cp
; ++ndigits
, cp
>>= 4)
464 for (cp
= code
, i
= ndigits
; (cp
|| i
> 0) && bufferLength
; cp
>>= 4, bufferLength
--) {
465 uint8_t v
= (uint8_t)(cp
& 0xf);
466 buffer
[--i
] = (v
< 10 ? '0' + v
: 'A' + v
- 10);
469 length
+= static_cast<uint16_t>(ndigits
);
470 WRITE_CHAR(buffer
, bufferLength
, length
, '>');
476 * getGroup() does a binary search for the group that contains the
477 * Unicode code point "code".
478 * The return value is always a valid Group* that may contain "code"
479 * or else is the highest group before "code".
480 * If the lowest group is after "code", then that one is returned.
482 static const uint16_t *
483 getGroup(UCharNames
*names
, uint32_t code
) {
484 const uint16_t *groups
=GET_GROUPS(names
);
485 uint16_t groupMSB
=(uint16_t)(code
>>GROUP_SHIFT
),
490 /* binary search for the group of names that contains the one for code */
491 while(start
<limit
-1) {
492 number
=(uint16_t)((start
+limit
)/2);
493 if(groupMSB
<groups
[number
*GROUP_LENGTH
+GROUP_MSB
]) {
500 /* return this regardless of whether it is an exact match */
501 return groups
+start
*GROUP_LENGTH
;
505 * expandGroupLengths() reads a block of compressed lengths of 32 strings and
506 * expands them into offsets and lengths for each string.
507 * Lengths are stored with a variable-width encoding in consecutive nibbles:
508 * If a nibble<0xc, then it is the length itself (0=empty string).
509 * If a nibble>=0xc, then it forms a length value with the following nibble.
510 * For the calculation, see below.
511 * The offsets and lengths arrays must be at least 33 (one more) long because
512 * there is no check here at the end if the last nibble is still used.
514 static const uint8_t *
515 expandGroupLengths(const uint8_t *s
,
516 uint16_t offsets
[LINES_PER_GROUP
+1], uint16_t lengths
[LINES_PER_GROUP
+1]) {
517 /* read the lengths of the 32 strings in this group and get each string's offset */
518 uint16_t i
=0, offset
=0, length
=0;
521 /* all 32 lengths must be read to get the offset of the first group string */
522 while(i
<LINES_PER_GROUP
) {
525 /* read even nibble - MSBs of lengthByte */
527 /* double-nibble length spread across two bytes */
528 length
=(uint16_t)(((length
&0x3)<<4|lengthByte
>>4)+12);
530 } else if((lengthByte
/* &0xf0 */)>=0xc0) {
531 /* double-nibble length spread across this one byte */
532 length
=(uint16_t)((lengthByte
&0x3f)+12);
534 /* single-nibble length in MSBs */
535 length
=(uint16_t)(lengthByte
>>4);
545 /* read odd nibble - LSBs of lengthByte */
546 if((lengthByte
&0xf0)==0) {
547 /* this nibble was not consumed for a double-nibble length above */
550 /* single-nibble length in LSBs */
558 length
=0; /* prevent double-nibble detection in the next iteration */
562 /* now, s is at the first group string */
567 expandGroupName(UCharNames
*names
, const uint16_t *group
,
568 uint16_t lineNumber
, UCharNameChoice nameChoice
,
569 char *buffer
, uint16_t bufferLength
) {
570 uint16_t offsets
[LINES_PER_GROUP
+2], lengths
[LINES_PER_GROUP
+2];
571 const uint8_t *s
=(uint8_t *)names
+names
->groupStringOffset
+GET_GROUP_OFFSET(group
);
572 s
=expandGroupLengths(s
, offsets
, lengths
);
573 return expandName(names
, s
+offsets
[lineNumber
], lengths
[lineNumber
], nameChoice
,
574 buffer
, bufferLength
);
578 getName(UCharNames
*names
, uint32_t code
, UCharNameChoice nameChoice
,
579 char *buffer
, uint16_t bufferLength
) {
580 const uint16_t *group
=getGroup(names
, code
);
581 if((uint16_t)(code
>>GROUP_SHIFT
)==group
[GROUP_MSB
]) {
582 return expandGroupName(names
, group
, (uint16_t)(code
&GROUP_MASK
), nameChoice
,
583 buffer
, bufferLength
);
585 /* group not found */
595 * enumGroupNames() enumerates all the names in a group of 32 lines
596 * and either calls the enumerator function or finds a given input name.
599 enumGroupNames(UCharNames
*names
, const uint16_t *group
,
600 UChar32 start
, UChar32 end
,
601 UEnumCharNamesFn
*fn
, void *context
,
602 UCharNameChoice nameChoice
) {
603 uint16_t offsets
[LINES_PER_GROUP
+2], lengths
[LINES_PER_GROUP
+2];
604 const uint8_t *s
=(uint8_t *)names
+names
->groupStringOffset
+GET_GROUP_OFFSET(group
);
606 s
=expandGroupLengths(s
, offsets
, lengths
);
607 if(fn
!=DO_FIND_NAME
) {
612 length
=expandName(names
, s
+offsets
[start
&GROUP_MASK
], lengths
[start
&GROUP_MASK
], nameChoice
, buffer
, sizeof(buffer
));
613 if (!length
&& nameChoice
== U_EXTENDED_CHAR_NAME
) {
614 buffer
[length
= getExtName(start
, buffer
, sizeof(buffer
))] = 0;
616 /* here, we assume that the buffer is large enough */
618 if(!fn(context
, start
, nameChoice
, buffer
, length
)) {
625 const char *otherName
=((FindName
*)context
)->otherName
;
627 if(compareName(names
, s
+offsets
[start
&GROUP_MASK
], lengths
[start
&GROUP_MASK
], nameChoice
, otherName
)) {
628 ((FindName
*)context
)->code
=start
;
638 * enumExtNames() enumerates extended names.
639 * It only needs to do it if it is called with a real function and not
640 * with the dummy DO_FIND_NAME, because u_charFromName() does a check
641 * for extended names by itself.
644 enumExtNames(UChar32 start
, UChar32 end
,
645 UEnumCharNamesFn
*fn
, void *context
)
647 if(fn
!=DO_FIND_NAME
) {
652 buffer
[length
= getExtName(start
, buffer
, sizeof(buffer
))] = 0;
653 /* here, we assume that the buffer is large enough */
655 if(!fn(context
, start
, U_EXTENDED_CHAR_NAME
, buffer
, length
)) {
667 enumNames(UCharNames
*names
,
668 UChar32 start
, UChar32 limit
,
669 UEnumCharNamesFn
*fn
, void *context
,
670 UCharNameChoice nameChoice
) {
671 uint16_t startGroupMSB
, endGroupMSB
, groupCount
;
672 const uint16_t *group
, *groupLimit
;
674 startGroupMSB
=(uint16_t)(start
>>GROUP_SHIFT
);
675 endGroupMSB
=(uint16_t)((limit
-1)>>GROUP_SHIFT
);
677 /* find the group that contains start, or the highest before it */
678 group
=getGroup(names
, start
);
680 if(startGroupMSB
<group
[GROUP_MSB
] && nameChoice
==U_EXTENDED_CHAR_NAME
) {
681 /* enumerate synthetic names between start and the group start */
682 UChar32 extLimit
=((UChar32
)group
[GROUP_MSB
]<<GROUP_SHIFT
);
686 if(!enumExtNames(start
, extLimit
-1, fn
, context
)) {
692 if(startGroupMSB
==endGroupMSB
) {
693 if(startGroupMSB
==group
[GROUP_MSB
]) {
694 /* if start and limit-1 are in the same group, then enumerate only in that one */
695 return enumGroupNames(names
, group
, start
, limit
-1, fn
, context
, nameChoice
);
698 const uint16_t *groups
=GET_GROUPS(names
);
699 groupCount
=*groups
++;
700 groupLimit
=groups
+groupCount
*GROUP_LENGTH
;
702 if(startGroupMSB
==group
[GROUP_MSB
]) {
703 /* enumerate characters in the partial start group */
704 if((start
&GROUP_MASK
)!=0) {
705 if(!enumGroupNames(names
, group
,
706 start
, ((UChar32
)startGroupMSB
<<GROUP_SHIFT
)+LINES_PER_GROUP
-1,
707 fn
, context
, nameChoice
)) {
710 group
=NEXT_GROUP(group
); /* continue with the next group */
712 } else if(startGroupMSB
>group
[GROUP_MSB
]) {
713 /* make sure that we start enumerating with the first group after start */
714 const uint16_t *nextGroup
=NEXT_GROUP(group
);
715 if (nextGroup
< groupLimit
&& nextGroup
[GROUP_MSB
] > startGroupMSB
&& nameChoice
== U_EXTENDED_CHAR_NAME
) {
716 UChar32 end
= nextGroup
[GROUP_MSB
] << GROUP_SHIFT
;
720 if (!enumExtNames(start
, end
- 1, fn
, context
)) {
727 /* enumerate entire groups between the start- and end-groups */
728 while(group
<groupLimit
&& group
[GROUP_MSB
]<endGroupMSB
) {
729 const uint16_t *nextGroup
;
730 start
=(UChar32
)group
[GROUP_MSB
]<<GROUP_SHIFT
;
731 if(!enumGroupNames(names
, group
, start
, start
+LINES_PER_GROUP
-1, fn
, context
, nameChoice
)) {
734 nextGroup
=NEXT_GROUP(group
);
735 if (nextGroup
< groupLimit
&& nextGroup
[GROUP_MSB
] > group
[GROUP_MSB
] + 1 && nameChoice
== U_EXTENDED_CHAR_NAME
) {
736 UChar32 end
= nextGroup
[GROUP_MSB
] << GROUP_SHIFT
;
740 if (!enumExtNames((group
[GROUP_MSB
] + 1) << GROUP_SHIFT
, end
- 1, fn
, context
)) {
747 /* enumerate within the end group (group[GROUP_MSB]==endGroupMSB) */
748 if(group
<groupLimit
&& group
[GROUP_MSB
]==endGroupMSB
) {
749 return enumGroupNames(names
, group
, (limit
-1)&~GROUP_MASK
, limit
-1, fn
, context
, nameChoice
);
750 } else if (nameChoice
== U_EXTENDED_CHAR_NAME
&& group
== groupLimit
) {
751 UChar32 next
= (PREV_GROUP(group
)[GROUP_MSB
] + 1) << GROUP_SHIFT
;
760 /* we have not found a group, which means everything is made of
762 if (nameChoice
== U_EXTENDED_CHAR_NAME
) {
763 if (limit
> UCHAR_MAX_VALUE
+ 1) {
764 limit
= UCHAR_MAX_VALUE
+ 1;
766 return enumExtNames(start
, limit
- 1, fn
, context
);
773 writeFactorSuffix(const uint16_t *factors
, uint16_t count
,
774 const char *s
, /* suffix elements */
776 uint16_t indexes
[8], /* output fields from here */
777 const char *elementBases
[8], const char *elements
[8],
778 char *buffer
, uint16_t bufferLength
) {
779 uint16_t i
, factor
, bufferPos
=0;
782 /* write elements according to the factors */
785 * the factorized elements are determined by modulo arithmetic
786 * with the factors of this algorithm
788 * note that for fewer operations, count is decremented here
791 for(i
=count
; i
>0; --i
) {
793 indexes
[i
]=(uint16_t)(code%factor
);
797 * we don't need to calculate the last modulus because start<=code<=end
798 * guarantees here that code<=factors[0]
800 indexes
[0]=(uint16_t)code
;
802 /* write each element */
804 if(elementBases
!=NULL
) {
808 /* skip indexes[i] strings */
820 WRITE_CHAR(buffer
, bufferLength
, bufferPos
, c
);
823 /* we do not need to perform the rest of this loop for i==count - break here */
828 /* skip the rest of the strings for this factors[i] */
829 factor
=(uint16_t)(factors
[i
]-indexes
[i
]-1);
848 * Parts of findAlgName() are almost the same as some of getAlgName().
849 * Fixes must be applied to both.
852 getAlgName(AlgorithmicRange
*range
, uint32_t code
, UCharNameChoice nameChoice
,
853 char *buffer
, uint16_t bufferLength
) {
854 uint16_t bufferPos
=0;
856 /* Only the normative character name can be algorithmic. */
857 if(nameChoice
!=U_UNICODE_CHAR_NAME
&& nameChoice
!=U_EXTENDED_CHAR_NAME
) {
865 switch(range
->type
) {
867 /* name = prefix hex-digits */
868 const char *s
=(const char *)(range
+1);
875 WRITE_CHAR(buffer
, bufferLength
, bufferPos
, c
);
878 /* write hexadecimal code point value */
879 count
=range
->variant
;
882 if(count
<bufferLength
) {
887 if(--i
<bufferLength
) {
903 /* name = prefix factorized-elements */
905 const uint16_t *factors
=(const uint16_t *)(range
+1);
906 uint16_t count
=range
->variant
;
907 const char *s
=(const char *)(factors
+count
);
912 WRITE_CHAR(buffer
, bufferLength
, bufferPos
, c
);
915 bufferPos
+=writeFactorSuffix(factors
, count
,
916 s
, code
-range
->start
, indexes
, NULL
, NULL
, buffer
, bufferLength
);
932 * Important: enumAlgNames() and findAlgName() are almost the same.
933 * Any fix must be applied to both.
936 enumAlgNames(AlgorithmicRange
*range
,
937 UChar32 start
, UChar32 limit
,
938 UEnumCharNamesFn
*fn
, void *context
,
939 UCharNameChoice nameChoice
) {
943 if(nameChoice
!=U_UNICODE_CHAR_NAME
&& nameChoice
!=U_EXTENDED_CHAR_NAME
) {
947 switch(range
->type
) {
952 /* get the full name of the start character */
953 length
=getAlgName(range
, (uint32_t)start
, nameChoice
, buffer
, sizeof(buffer
));
958 /* call the enumerator function with this first character */
959 if(!fn(context
, start
, nameChoice
, buffer
, length
)) {
963 /* go to the end of the name; all these names have the same length */
969 /* enumerate the rest of the names */
970 while(++start
<limit
) {
971 /* increment the hexadecimal number on a character-basis */
975 if(('0'<=c
&& c
<'9') || ('A'<=c
&& c
<'F')) {
986 if(!fn(context
, start
, nameChoice
, buffer
, length
)) {
994 const char *elementBases
[8], *elements
[8];
995 const uint16_t *factors
=(const uint16_t *)(range
+1);
996 uint16_t count
=range
->variant
;
997 const char *s
=(const char *)(factors
+count
);
999 uint16_t prefixLength
, i
, idx
;
1003 /* name = prefix factorized-elements */
1008 while((c
=*s
++)!=0) {
1013 /* append the suffix of the start character */
1014 length
=(uint16_t)(prefixLength
+writeFactorSuffix(factors
, count
,
1015 s
, (uint32_t)start
-range
->start
,
1016 indexes
, elementBases
, elements
,
1017 suffix
, (uint16_t)(sizeof(buffer
)-prefixLength
)));
1019 /* call the enumerator function with this first character */
1020 if(!fn(context
, start
, nameChoice
, buffer
, length
)) {
1024 /* enumerate the rest of the names */
1025 while(++start
<limit
) {
1026 /* increment the indexes in lexical order bound by the factors */
1029 idx
=(uint16_t)(indexes
[--i
]+1);
1030 if(idx
<factors
[i
]) {
1031 /* skip one index and its element string */
1039 /* reset this index to 0 and its element string to the first one */
1041 elements
[i
]=elementBases
[i
];
1045 /* to make matters a little easier, just append all elements to the suffix */
1047 length
=prefixLength
;
1048 for(i
=0; i
<count
; ++i
) {
1050 while((c
=*s
++)!=0) {
1055 /* zero-terminate */
1058 if(!fn(context
, start
, nameChoice
, buffer
, length
)) {
1065 /* undefined type */
1073 * findAlgName() is almost the same as enumAlgNames() except that it
1074 * returns the code point for a name if it fits into the range.
1075 * It returns 0xffff otherwise.
1078 findAlgName(AlgorithmicRange
*range
, UCharNameChoice nameChoice
, const char *otherName
) {
1081 if(nameChoice
!=U_UNICODE_CHAR_NAME
&& nameChoice
!=U_EXTENDED_CHAR_NAME
) {
1085 switch(range
->type
) {
1087 /* name = prefix hex-digits */
1088 const char *s
=(const char *)(range
+1);
1093 /* compare prefix */
1094 while((c
=*s
++)!=0) {
1095 if((char)c
!=*otherName
++) {
1100 /* read hexadecimal code point value */
1101 count
=range
->variant
;
1103 for(i
=0; i
<count
; ++i
) {
1105 if('0'<=c
&& c
<='9') {
1106 code
=(code
<<4)|(c
-'0');
1107 } else if('A'<=c
&& c
<='F') {
1108 code
=(code
<<4)|(c
-'A'+10);
1114 /* does it fit into the range? */
1115 if(*otherName
==0 && range
->start
<=(uint32_t)code
&& (uint32_t)code
<=range
->end
) {
1122 uint16_t indexes
[8];
1123 const char *elementBases
[8], *elements
[8];
1124 const uint16_t *factors
=(const uint16_t *)(range
+1);
1125 uint16_t count
=range
->variant
;
1126 const char *s
=(const char *)(factors
+count
), *t
;
1127 UChar32 start
, limit
;
1132 /* name = prefix factorized-elements */
1134 /* compare prefix */
1135 while((c
=*s
++)!=0) {
1136 if((char)c
!=*otherName
++) {
1141 start
=(UChar32
)range
->start
;
1142 limit
=(UChar32
)(range
->end
+1);
1144 /* initialize the suffix elements for enumeration; indexes should all be set to 0 */
1145 writeFactorSuffix(factors
, count
, s
, 0,
1146 indexes
, elementBases
, elements
, buffer
, sizeof(buffer
));
1148 /* compare the first suffix */
1149 if(0==uprv_strcmp(otherName
, buffer
)) {
1153 /* enumerate and compare the rest of the suffixes */
1154 while(++start
<limit
) {
1155 /* increment the indexes in lexical order bound by the factors */
1158 idx
=(uint16_t)(indexes
[--i
]+1);
1159 if(idx
<factors
[i
]) {
1160 /* skip one index and its element string */
1167 /* reset this index to 0 and its element string to the first one */
1169 elements
[i
]=elementBases
[i
];
1173 /* to make matters a little easier, just compare all elements of the suffix */
1175 for(i
=0; i
<count
; ++i
) {
1177 while((c
=*s
++)!=0) {
1179 s
=""; /* does not match */
1191 /* undefined type */
1198 /* sets of name characters, maximum name lengths ---------------------------- */
/*
 * Bit-set helpers for a set of 256 byte values stored in uint32_t[8]:
 * the top three bits of the byte select the word, the low five the bit.
 * The c argument is parenthesized before the uint8_t cast so that any
 * expression (e.g. a | b) is converted as a whole.
 * Note: both macros evaluate c twice; do not pass expressions with side effects.
 */
#define SET_ADD(set, c) ((set)[(uint8_t)(c)>>5]|=((uint32_t)1<<((uint8_t)(c)&0x1f)))
#define SET_CONTAINS(set, c) (((set)[(uint8_t)(c)>>5]&((uint32_t)1<<((uint8_t)(c)&0x1f)))!=0)
1204 calcStringSetLength(uint32_t set
[8], const char *s
) {
1208 while((c
=*s
++)!=0) {
1216 calcAlgNameSetsLengths(int32_t maxNameLength
) {
1217 AlgorithmicRange
*range
;
1219 uint32_t rangeCount
;
1222 /* enumerate algorithmic ranges */
1223 p
=(uint32_t *)((uint8_t *)uCharNames
+uCharNames
->algNamesOffset
);
1225 range
=(AlgorithmicRange
*)(p
+1);
1226 while(rangeCount
>0) {
1227 switch(range
->type
) {
1229 /* name = prefix + (range->variant times) hex-digits */
1231 length
=calcStringSetLength(gNameSet
, (const char *)(range
+1))+range
->variant
;
1232 if(length
>maxNameLength
) {
1233 maxNameLength
=length
;
1237 /* name = prefix factorized-elements */
1238 const uint16_t *factors
=(const uint16_t *)(range
+1);
1240 int32_t i
, count
=range
->variant
, factor
, factorLength
, maxFactorLength
;
1243 s
=(const char *)(factors
+count
);
1244 length
=calcStringSetLength(gNameSet
, s
);
1245 s
+=length
+1; /* start of factor suffixes */
1247 /* get the set and maximum factor suffix length for each factor */
1248 for(i
=0; i
<count
; ++i
) {
1250 for(factor
=factors
[i
]; factor
>0; --factor
) {
1251 factorLength
=calcStringSetLength(gNameSet
, s
);
1253 if(factorLength
>maxFactorLength
) {
1254 maxFactorLength
=factorLength
;
1257 length
+=maxFactorLength
;
1260 if(length
>maxNameLength
) {
1261 maxNameLength
=length
;
1270 range
=(AlgorithmicRange
*)((uint8_t *)range
+range
->size
);
1273 return maxNameLength
;
1277 calcExtNameSetsLengths(int32_t maxNameLength
) {
1280 for(i
=0; i
<UPRV_LENGTHOF(charCatNames
); ++i
) {
1282 * for each category, count the length of the category name
1286 * 6 for most hex digits per code point
1288 length
=9+calcStringSetLength(gNameSet
, charCatNames
[i
]);
1289 if(length
>maxNameLength
) {
1290 maxNameLength
=length
;
1293 return maxNameLength
;
1297 calcNameSetLength(const uint16_t *tokens
, uint16_t tokenCount
, const uint8_t *tokenStrings
, int8_t *tokenLengths
,
1299 const uint8_t **pLine
, const uint8_t *lineLimit
) {
1300 const uint8_t *line
=*pLine
;
1301 int32_t length
=0, tokenLength
;
1304 while(line
!=lineLimit
&& (c
=*line
++)!=(uint8_t)';') {
1306 /* implicit letter */
1311 if(token
==(uint16_t)(-2)) {
1312 /* this is a lead byte for a double-byte token */
1316 if(token
==(uint16_t)(-1)) {
1317 /* explicit letter */
1321 /* count token word */
1322 if(tokenLengths
!=NULL
) {
1323 /* use cached token length */
1324 tokenLength
=tokenLengths
[c
];
1325 if(tokenLength
==0) {
1326 tokenLength
=calcStringSetLength(set
, (const char *)tokenStrings
+token
);
1327 tokenLengths
[c
]=(int8_t)tokenLength
;
1330 tokenLength
=calcStringSetLength(set
, (const char *)tokenStrings
+token
);
1332 length
+=tokenLength
;
1342 calcGroupNameSetsLengths(int32_t maxNameLength
) {
1343 uint16_t offsets
[LINES_PER_GROUP
+2], lengths
[LINES_PER_GROUP
+2];
1345 uint16_t *tokens
=(uint16_t *)uCharNames
+8;
1346 uint16_t tokenCount
=*tokens
++;
1347 uint8_t *tokenStrings
=(uint8_t *)uCharNames
+uCharNames
->tokenStringOffset
;
1349 int8_t *tokenLengths
;
1351 const uint16_t *group
;
1352 const uint8_t *s
, *line
, *lineLimit
;
1354 int32_t groupCount
, lineNumber
, length
;
1356 tokenLengths
=(int8_t *)uprv_malloc(tokenCount
);
1357 if(tokenLengths
!=NULL
) {
1358 uprv_memset(tokenLengths
, 0, tokenCount
);
1361 group
=GET_GROUPS(uCharNames
);
1362 groupCount
=*group
++;
1364 /* enumerate all groups */
1365 while(groupCount
>0) {
1366 s
=(uint8_t *)uCharNames
+uCharNames
->groupStringOffset
+GET_GROUP_OFFSET(group
);
1367 s
=expandGroupLengths(s
, offsets
, lengths
);
1369 /* enumerate all lines in each group */
1370 for(lineNumber
=0; lineNumber
<LINES_PER_GROUP
; ++lineNumber
) {
1371 line
=s
+offsets
[lineNumber
];
1372 length
=lengths
[lineNumber
];
1377 lineLimit
=line
+length
;
1379 /* read regular name */
1380 length
=calcNameSetLength(tokens
, tokenCount
, tokenStrings
, tokenLengths
, gNameSet
, &line
, lineLimit
);
1381 if(length
>maxNameLength
) {
1382 maxNameLength
=length
;
1384 if(line
==lineLimit
) {
1388 /* read Unicode 1.0 name */
1389 length
=calcNameSetLength(tokens
, tokenCount
, tokenStrings
, tokenLengths
, gNameSet
, &line
, lineLimit
);
1390 if(length
>maxNameLength
) {
1391 maxNameLength
=length
;
1393 if(line
==lineLimit
) {
1397 /* read ISO comment */
1398 /*length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit);*/
1401 group
=NEXT_GROUP(group
);
1405 if(tokenLengths
!=NULL
) {
1406 uprv_free(tokenLengths
);
1409 /* set gMax... - name length last for threading */
1410 gMaxNameLength
=maxNameLength
;
1414 calcNameSetsLengths(UErrorCode
*pErrorCode
) {
1415 static const char extChars
[]="0123456789ABCDEF<>-";
1416 int32_t i
, maxNameLength
;
1418 if(gMaxNameLength
!=0) {
1422 if(!isDataLoaded(pErrorCode
)) {
1426 /* set hex digits, used in various names, and <>-, used in extended names */
1427 for(i
=0; i
<(int32_t)sizeof(extChars
)-1; ++i
) {
1428 SET_ADD(gNameSet
, extChars
[i
]);
1431 /* set sets and lengths from algorithmic names */
1432 maxNameLength
=calcAlgNameSetsLengths(0);
1434 /* set sets and lengths from extended names */
1435 maxNameLength
=calcExtNameSetsLengths(maxNameLength
);
1437 /* set sets and lengths from group names, set global maximum values */
1438 calcGroupNameSetsLengths(maxNameLength
);
1445 /* public API --------------------------------------------------------------- */
1449 U_CAPI
int32_t U_EXPORT2
1450 u_charName(UChar32 code
, UCharNameChoice nameChoice
,
1451 char *buffer
, int32_t bufferLength
,
1452 UErrorCode
*pErrorCode
) {
1453 AlgorithmicRange
*algRange
;
1458 /* check the argument values */
1459 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
1461 } else if(nameChoice
>=U_CHAR_NAME_CHOICE_COUNT
||
1462 bufferLength
<0 || (bufferLength
>0 && buffer
==NULL
)
1464 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1468 if((uint32_t)code
>UCHAR_MAX_VALUE
|| !isDataLoaded(pErrorCode
)) {
1469 return u_terminateChars(buffer
, bufferLength
, 0, pErrorCode
);
1474 /* try algorithmic names first */
1475 p
=(uint32_t *)((uint8_t *)uCharNames
+uCharNames
->algNamesOffset
);
1477 algRange
=(AlgorithmicRange
*)(p
+1);
1479 if(algRange
->start
<=(uint32_t)code
&& (uint32_t)code
<=algRange
->end
) {
1480 length
=getAlgName(algRange
, (uint32_t)code
, nameChoice
, buffer
, (uint16_t)bufferLength
);
1483 algRange
=(AlgorithmicRange
*)((uint8_t *)algRange
+algRange
->size
);
1488 if (nameChoice
== U_EXTENDED_CHAR_NAME
) {
1489 length
= getName(uCharNames
, (uint32_t )code
, U_EXTENDED_CHAR_NAME
, buffer
, (uint16_t) bufferLength
);
1491 /* extended character name */
1492 length
= getExtName((uint32_t) code
, buffer
, (uint16_t) bufferLength
);
1495 /* normal character name */
1496 length
=getName(uCharNames
, (uint32_t)code
, nameChoice
, buffer
, (uint16_t)bufferLength
);
1500 return u_terminateChars(buffer
, bufferLength
, length
, pErrorCode
);
/*
 * Public API. The code point argument is unnamed and unused; the visible
 * code never writes a comment string and terminates dest with a length of 0
 * via u_terminateChars().
 * Argument checks: nothing more is checked when pErrorCode is NULL or
 * already indicates a failure; otherwise *pErrorCode is set to
 * U_ILLEGAL_ARGUMENT_ERROR if destCapacity<0, or if destCapacity>0 while
 * dest is NULL.
 * NOTE(review): the bodies of the error branches are not fully visible
 * here; presumably they return early - confirm.
 *
 * @param dest         output buffer
 * @param destCapacity capacity of dest
 * @param pErrorCode   in/out error code
 * @return the result of u_terminateChars(dest, destCapacity, 0, pErrorCode)
 *         on the path visible here
 */
1503 U_CAPI
int32_t U_EXPORT2
1504 u_getISOComment(UChar32
/*c*/,
1505 char *dest
, int32_t destCapacity
,
1506 UErrorCode
*pErrorCode
) {
1507 /* check the argument values */
1508 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
1510 } else if(destCapacity
<0 || (destCapacity
>0 && dest
==NULL
)) {
1511 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1515 return u_terminateChars(dest
, destCapacity
, 0, pErrorCode
);
1518 U_CAPI UChar32 U_EXPORT2
1519 u_charFromName(UCharNameChoice nameChoice
,
1521 UErrorCode
*pErrorCode
) {
1522 char upper
[120], lower
[120];
1524 AlgorithmicRange
*algRange
;
1529 static constexpr UChar32 error
= 0xffff; /* Undefined, but use this for backwards compatibility. */
1531 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
1535 if(nameChoice
>=U_CHAR_NAME_CHOICE_COUNT
|| name
==NULL
|| *name
==0) {
1536 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1540 if(!isDataLoaded(pErrorCode
)) {
1544 /* construct the uppercase and lowercase of the name first */
1545 for(i
=0; i
<sizeof(upper
); ++i
) {
1546 if((c0
=*name
++)!=0) {
1547 upper
[i
]=uprv_toupper(c0
);
1548 lower
[i
]=uprv_tolower(c0
);
1550 upper
[i
]=lower
[i
]=0;
1554 if(i
==sizeof(upper
)) {
1555 /* name too long, there is no such character */
1556 *pErrorCode
= U_ILLEGAL_CHAR_FOUND
;
1559 // i==strlen(name)==strlen(lower)==strlen(upper)
1561 /* try extended names first */
1562 if (lower
[0] == '<') {
1563 if (nameChoice
== U_EXTENDED_CHAR_NAME
&& lower
[--i
] == '>') {
1564 // Parse a string like "<category-HHHH>" where HHHH is a hex code point.
1566 while (i
>= 3 && lower
[--i
] != '-') {}
1568 // There should be 1 to 8 hex digits.
1569 int32_t hexLength
= limit
- (i
+ 1);
1570 if (i
>= 2 && lower
[i
] == '-' && 1 <= hexLength
&& hexLength
<= 8) {
1575 for (++i
; i
< limit
; ++i
) {
1576 if (lower
[i
] >= '0' && lower
[i
] <= '9') {
1577 cp
= (cp
<< 4) + lower
[i
] - '0';
1578 } else if (lower
[i
] >= 'a' && lower
[i
] <= 'f') {
1579 cp
= (cp
<< 4) + lower
[i
] - 'a' + 10;
1581 *pErrorCode
= U_ILLEGAL_CHAR_FOUND
;
1584 // Prevent signed-integer overflow and out-of-range code points.
1585 if (cp
> UCHAR_MAX_VALUE
) {
1586 *pErrorCode
= U_ILLEGAL_CHAR_FOUND
;
1591 /* Now validate the category name.
1592 We could use a binary search, or a trie, if
1593 we really wanted to. */
1594 uint8_t cat
= getCharCat(cp
);
1595 for (lower
[i
] = 0, cIdx
= 0; cIdx
< UPRV_LENGTHOF(charCatNames
); ++cIdx
) {
1597 if (!uprv_strcmp(lower
+ 1, charCatNames
[cIdx
])) {
1607 *pErrorCode
= U_ILLEGAL_CHAR_FOUND
;
1611 /* try algorithmic names now */
1612 p
=(uint32_t *)((uint8_t *)uCharNames
+uCharNames
->algNamesOffset
);
1614 algRange
=(AlgorithmicRange
*)(p
+1);
1616 if((cp
=findAlgName(algRange
, nameChoice
, upper
))!=0xffff) {
1619 algRange
=(AlgorithmicRange
*)((uint8_t *)algRange
+algRange
->size
);
1623 /* normal character name */
1624 findName
.otherName
=upper
;
1625 findName
.code
=error
;
1626 enumNames(uCharNames
, 0, UCHAR_MAX_VALUE
+ 1, DO_FIND_NAME
, &findName
, nameChoice
);
1627 if (findName
.code
== error
) {
1628 *pErrorCode
= U_ILLEGAL_CHAR_FOUND
;
1630 return findName
.code
;
/*
 * Enumerates character names for the code points from start up to limit by
 * alternating between enumNames() (data-driven names) and enumAlgNames()
 * (names of one AlgorithmicRange), forwarding fn, context and nameChoice.
 *
 * @param start      first code point to enumerate
 * @param limit      range limit; clamped to UCHAR_MAX_VALUE+1
 * @param fn         enumeration callback; must not be NULL
 * @param nameChoice must be less than U_CHAR_NAME_CHOICE_COUNT
 * @param pErrorCode in/out error code; tested for NULL and for a prior
 *                   failure, and set to U_ILLEGAL_ARGUMENT_ERROR when
 *                   nameChoice or fn is rejected
 *
 * NOTE(review): context is used below, but its declaration (like those of
 * p and of the loop that walks the algorithmic ranges) is not visible in
 * this excerpt.
 */
1633 U_CAPI
void U_EXPORT2
1634 u_enumCharNames(UChar32 start
, UChar32 limit
,
1635 UEnumCharNamesFn
*fn
,
1637 UCharNameChoice nameChoice
,
1638 UErrorCode
*pErrorCode
) {
1639 AlgorithmicRange
*algRange
;
/* guard: a usable, non-failed error code is required */
1643 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
/* guard: reject an out-of-range name choice or a missing callback */
1647 if(nameChoice
>=U_CHAR_NAME_CHOICE_COUNT
|| fn
==NULL
) {
1648 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
/* the unsigned cast makes a negative limit compare as huge, so it is clamped as well */
1652 if((uint32_t) limit
> UCHAR_MAX_VALUE
+ 1) {
1653 limit
= UCHAR_MAX_VALUE
+ 1;
/* unsigned comparison: a negative start also counts as >= limit (empty range) */
1655 if((uint32_t)start
>=(uint32_t)limit
) {
/* the names data must be available before uCharNames is dereferenced below */
1659 if(!isDataLoaded(pErrorCode
)) {
1663 /* interleave the data-driven ones with the algorithmic ones */
1664 /* iterate over all algorithmic ranges; assume that they are in ascending order */
/* p addresses the algorithmic-names area, algNamesOffset bytes into uCharNames */
1665 p
=(uint32_t *)((uint8_t *)uCharNames
+uCharNames
->algNamesOffset
);
/* the first AlgorithmicRange starts one uint32_t after p */
1667 algRange
=(AlgorithmicRange
*)(p
+1);
1669 /* enumerate the character names before the current algorithmic range */
1670 /* here: start<limit */
1671 if((uint32_t)start
<algRange
->start
) {
/* the whole remaining request lies before this algorithmic range */
1672 if((uint32_t)limit
<=algRange
->start
) {
1673 enumNames(uCharNames
, start
, limit
, fn
, context
, nameChoice
);
/* otherwise enumerate only up to the range start; the result of enumNames() is tested */
1676 if(!enumNames(uCharNames
, start
, (UChar32
)algRange
->start
, fn
, context
, nameChoice
)) {
/* continue from the first code point of the algorithmic range */
1679 start
=(UChar32
)algRange
->start
;
1681 /* enumerate the character names in the current algorithmic range */
1682 /* here: algRange->start<=start<limit */
1683 if((uint32_t)start
<=algRange
->end
) {
/* algRange->end is compared with +1 against limit, i.e. end is the last code point of the range */
1684 if((uint32_t)limit
<=(algRange
->end
+1)) {
1685 enumAlgNames(algRange
, start
, limit
, fn
, context
, nameChoice
);
/* otherwise enumerate through the end of this range; the result of enumAlgNames() is tested */
1688 if(!enumAlgNames(algRange
, start
, (UChar32
)algRange
->end
+1, fn
, context
, nameChoice
)) {
/* continue right after the algorithmic range */
1691 start
=(UChar32
)algRange
->end
+1;
1693 /* continue to the next algorithmic range (here: start<limit) */
/* ranges are variable-sized: advance by the byte size stored in the range itself */
1694 algRange
=(AlgorithmicRange
*)((uint8_t *)algRange
+algRange
->size
);
1697 /* enumerate the character names after the last algorithmic range */
1698 enumNames(uCharNames
, start
, limit
, fn
, context
, nameChoice
);
/*
 * Returns gMaxNameLength once calcNameSetsLengths() reports success.
 * The error code is a local variable and is not passed back to the caller.
 * NOTE(review): the path taken when calcNameSetsLengths() fails is not
 * visible in this excerpt.
 */
1701 U_CAPI
int32_t U_EXPORT2
1702 uprv_getMaxCharNameLength() {
1703 UErrorCode errorCode
=U_ZERO_ERROR
;
/* gMaxNameLength is only read after calcNameSetsLengths() has succeeded */
1704 if(calcNameSetsLengths(&errorCode
)) {
1705 return gMaxNameLength
;
1712 * Adds the chars that are flagged in the char set cset to a Unicode set via sa.
1713 * @param cset Set of 256 bit flags corresponding to a set of chars.
1714 * @param sa USetAdder whose add() function receives each character for sa->set.
/*
 * Collects the byte values flagged in cset, converts them with
 * u_charsToUChars() and hands each resulting UChar to sa->add().
 * cset is tested with SET_CONTAINS() for every value 0..255.
 * NOTE(review): the declarations of cs, us, i and length are not visible
 * in this excerpt.
 */
1717 charSetToUSet(uint32_t cset
[8], const USetAdder
*sa
) {
1722 UErrorCode errorCode
;
1724 errorCode
=U_ZERO_ERROR
;
/* nothing is collected unless calcNameSetsLengths() succeeds */
1726 if(!calcNameSetsLengths(&errorCode
)) {
1730 /* build a char string with all chars that are used in character names */
1732 for(i
=0; i
<256; ++i
) {
1733 if(SET_CONTAINS(cset
, i
)) {
/* append the flagged byte value; length counts the collected chars */
1734 cs
[length
++]=(char)i
;
1738 /* convert the char string to a UChar string */
1739 u_charsToUChars(cs
, us
, length
);
1741 /* add each UChar to the USet */
1742 for(i
=0; i
<length
; ++i
) {
/* a 0 result is only added when the source char itself was 0 */
1743 if(us
[i
]!=0 || cs
[i
]==0) { /* non-invariant chars become (UChar)0 */
1744 sa
->add(sa
->set
, us
[i
]);
1750 * Adds the characters that are used in Unicode character names to the set behind sa.
1751 * @param sa USetAdder that receives the characters.
/*
 * Thin wrapper: passes gNameSet (defined outside this function) together
 * with the caller's sa to charSetToUSet().
 */
1753 U_CAPI
void U_EXPORT2
1754 uprv_getCharNameCharacters(const USetAdder
*sa
) {
1755 charSetToUSet(gNameSet
, sa
);
1758 /* data swapping ------------------------------------------------------------ */
1761 * The token table contains non-negative entries for token bytes,
1762 * and -1 for bytes that represent themselves in the data file's charset.
1763 * -2 entries are used for lead bytes.
1765 * Direct bytes (-1 entries) must be translated from the input charset family
1766 * to the output charset family.
1767 * makeTokenMap() writes a permutation mapping for this.
1768 * Use it once for single-/lead-byte tokens and once more for all trail byte
1769 * tokens. (';' is an unused trail byte marked with -1.)
/*
 * Fills map[] with a byte permutation for up to 256 token table entries.
 *
 * @param ds         swapper; its inCharset/outCharset decide whether the
 *                   identity permutation is used, and its swapInvChars()
 *                   converts single bytes
 * @param tokens     token table entries that are examined
 * @param tokenCount number of entries in tokens[]; compared against 256 below
 * @param pErrorCode in/out error code; nothing is done if it already
 *                   indicates a failure, and swapInvChars() may set it
 *
 * NOTE(review): the declarations of map, i, j, c1 and c2 and several
 * statements of the body are not visible in this excerpt.
 */
1772 makeTokenMap(const UDataSwapper
*ds
,
1773 int16_t tokens
[], uint16_t tokenCount
,
1775 UErrorCode
*pErrorCode
) {
/* usedOutChar[b] records that output byte value b is already taken */
1776 UBool usedOutChar
[256];
1780 if(U_FAILURE(*pErrorCode
)) {
1784 if(ds
->inCharset
==ds
->outCharset
) {
1785 /* Same charset family: identity permutation */
1786 for(i
=0; i
<256; ++i
) {
/* different charset families: start from an all-zero map with no output byte marked used */
1790 uprv_memset(map
, 0, 256);
/* NOTE(review): the length 256 equals sizeof(usedOutChar) only if sizeof(UBool)==1 - confirm */
1791 uprv_memset(usedOutChar
, 0, 256);
/* map[] and usedOutChar[] have 256 entries, hence the comparison with 256 */
1793 if(tokenCount
>256) {
1797 /* set the direct bytes (byte 0 always maps to itself) */
1798 for(i
=1; i
<tokenCount
; ++i
) {
1800 /* convert the direct byte character */
/* converts the single byte c1 into c2 using the swapper's charset conversion */
1802 ds
->swapInvChars(ds
, &c1
, 1, &c2
, pErrorCode
);
1803 if(U_FAILURE(*pErrorCode
)) {
1804 udata_printError(ds
, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n",
1809 /* enter the converted character into the map and mark it used */
1811 usedOutChar
[c2
]=TRUE
;
1815 /* set the mappings for the rest of the permutation */
/* j scans the output byte values in ascending order, starting at 1 */
1816 for(i
=j
=1; i
<tokenCount
; ++i
) {
1817 /* set mappings that were not set for direct bytes */
1819 /* set an output byte value that was not used as an output byte above */
1820 while(usedOutChar
[j
]) {
/* assign the free output value j to input byte i and move on to the next candidate */
1823 map
[i
]=(uint8_t)j
++;
1828 * leave mappings at tokenCount and above unset if tokenCount<256
1829 * because they won't be used
/*
 * Swaps unames.icu data (dataFormat "unam", format version 1) between the
 * input and output platform properties described by ds.
 *
 * The visible steps are: swap the data header, verify the format, then swap
 * the four initial offsets, the token table (permuted through map[] and
 * trailMap[]), the token strings, the group table, the group strings (only
 * when the charset families differ) and finally the algorithmic ranges.
 *
 * @param ds         swapper providing readUInt16/readUInt32, swapArray16,
 *                   swapArray32, swapInvChars and inCharset/outCharset
 * @param inData     input data, beginning with the data header
 * @param length     input length that is also passed to udata_swapDataHeader();
 *                   NOTE(review): the test that separates preflighting from
 *                   real swapping is not visible in this excerpt
 * @param outData    output buffer; may be the same memory as inData (the code
 *                   compares inBytes with outBytes and swaps tokens through a
 *                   temporary array)
 * @param pErrorCode in/out error code; set here to U_UNSUPPORTED_ERROR,
 *                   U_INDEX_OUTOFBOUNDS_ERROR or U_MEMORY_ALLOCATION_ERROR
 * @return headerSize plus the offset reached after walking the algorithmic
 *         ranges
 */
1834 U_CAPI
int32_t U_EXPORT2
1835 uchar_swapNames(const UDataSwapper
*ds
,
1836 const void *inData
, int32_t length
, void *outData
,
1837 UErrorCode
*pErrorCode
) {
1838 const UDataInfo
*pInfo
;
1841 const uint8_t *inBytes
;
1844 uint32_t tokenStringOffset
, groupsOffset
, groupStringOffset
, algNamesOffset
,
1845 offset
, i
, count
, stringsCount
;
1847 const AlgorithmicRange
*inRange
;
1848 AlgorithmicRange
*outRange
;
1850 /* udata_swapDataHeader checks the arguments */
1851 headerSize
=udata_swapDataHeader(ds
, inData
, length
, outData
, pErrorCode
);
1852 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
1856 /* check data format and format version */
/* the UDataInfo is read at byte offset 4 of the input data */
1857 pInfo
=(const UDataInfo
*)((const char *)inData
+4);
1859 pInfo
->dataFormat
[0]==0x75 && /* dataFormat="unam" */
1860 pInfo
->dataFormat
[1]==0x6e &&
1861 pInfo
->dataFormat
[2]==0x61 &&
1862 pInfo
->dataFormat
[3]==0x6d &&
1863 pInfo
->formatVersion
[0]==1
1865 udata_printError(ds
, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n",
1866 pInfo
->dataFormat
[0], pInfo
->dataFormat
[1],
1867 pInfo
->dataFormat
[2], pInfo
->dataFormat
[3],
1868 pInfo
->formatVersion
[0]);
1869 *pErrorCode
=U_UNSUPPORTED_ERROR
;
/* inBytes/outBytes address the data that follows the header */
1873 inBytes
=(const uint8_t *)inData
+headerSize
;
1874 outBytes
=(uint8_t *)outData
+headerSize
;
/* algNamesOffset is the fourth uint32_t (index 3) of the data */
1876 algNamesOffset
=ds
->readUInt32(((const uint32_t *)inBytes
)[3]);
/* length is compared (as unsigned) with algNamesOffset; too short an input is reported below */
1880 (uint32_t)length
<(algNamesOffset
=ds
->readUInt32(((const uint32_t *)inBytes
)[3]))
1882 udata_printError(ds
, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n",
1884 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
1890 /* preflighting: iterate through algorithmic ranges */
1891 offset
=algNamesOffset
;
/* the algorithmic-names area starts with a uint32_t that is used as the range count */
1892 count
=ds
->readUInt32(*((const uint32_t *)(inBytes
+offset
)));
1895 for(i
=0; i
<count
; ++i
) {
1896 inRange
=(const AlgorithmicRange
*)(inBytes
+offset
);
/* each range stores its own byte size, which leads to the next range */
1897 offset
+=ds
->readUInt16(inRange
->size
);
/* tokens[] holds up to 512 entries: 256 single-/lead-byte plus 256 trail-byte tokens (see makeTokenMap calls) */
1904 int16_t tokens
[512];
1905 uint16_t tokenCount
;
/* map[] permutes single-/lead-byte tokens, trailMap[] permutes trail-byte tokens */
1907 uint8_t map
[256], trailMap
[256];
1909 /* copy the data for inaccessible bytes */
1910 if(inBytes
!=outBytes
) {
1911 uprv_memcpy(outBytes
, inBytes
, length
);
1914 /* the initial 4 offsets first */
1915 tokenStringOffset
=ds
->readUInt32(((const uint32_t *)inBytes
)[0]);
1916 groupsOffset
=ds
->readUInt32(((const uint32_t *)inBytes
)[1]);
1917 groupStringOffset
=ds
->readUInt32(((const uint32_t *)inBytes
)[2]);
/* 16 bytes = the four uint32_t offsets read above */
1918 ds
->swapArray32(ds
, inBytes
, 16, outBytes
, pErrorCode
);
1921 * now the tokens table
1922 * it needs to be permutated along with the compressed name strings
/* the token table follows directly after the 16 bytes of offsets */
1924 p
=(const uint16_t *)(inBytes
+16);
1925 q
=(uint16_t *)(outBytes
+16);
1927 /* read and swap the tokenCount */
1928 tokenCount
=ds
->readUInt16(*p
);
1929 ds
->swapArray16(ds
, p
, 2, q
, pErrorCode
);
1933 /* read the first 512 tokens and make the token maps */
1934 if(tokenCount
<=512) {
1939 for(i
=0; i
<count
; ++i
) {
1940 tokens
[i
]=udata_readInt16(ds
, p
[i
]);
1943 tokens
[i
]=0; /* fill the rest of the tokens array if tokenCount<512 */
/* first map: single-/lead-byte tokens (entries 0..255) */
1945 makeTokenMap(ds
, tokens
, tokenCount
, map
, pErrorCode
);
/* second map: trail-byte tokens (entries from 256 on), with the count reduced by 256 or 0 */
1946 makeTokenMap(ds
, tokens
+256, (uint16_t)(tokenCount
>256 ? tokenCount
-256 : 0), trailMap
, pErrorCode
);
1947 if(U_FAILURE(*pErrorCode
)) {
1952 * swap and permutate the tokens
1953 * go through a temporary array to support in-place swapping
/* tokenCount*2 bytes: one 16-bit unit per token */
1955 temp
=(uint16_t *)uprv_malloc(tokenCount
*2);
1957 udata_printError(ds
, "out of memory swapping %u unames.icu tokens\n",
1959 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
1963 /* swap and permutate single-/lead-byte tokens */
1964 for(i
=0; i
<tokenCount
&& i
<256; ++i
) {
/* token i is written to its permuted position map[i] */
1965 ds
->swapArray16(ds
, p
+i
, 2, temp
+map
[i
], pErrorCode
);
1968 /* swap and permutate trail-byte tokens */
1969 for(; i
<tokenCount
; ++i
) {
/* i&0xffffff00 keeps the 256-aligned base of i; only the low byte is permuted via trailMap[] */
1970 ds
->swapArray16(ds
, p
+i
, 2, temp
+(i
&0xffffff00)+trailMap
[i
&0xff], pErrorCode
);
1973 /* copy the result into the output and free the temporary array */
1974 uprv_memcpy(q
, temp
, tokenCount
*2);
1978 * swap the token strings but not a possible padding byte after
1979 * the terminating NUL of the last string
/* the token strings occupy the bytes from tokenStringOffset up to groupsOffset */
1981 udata_swapInvStringBlock(ds
, inBytes
+tokenStringOffset
, (int32_t)(groupsOffset
-tokenStringOffset
),
1982 outBytes
+tokenStringOffset
, pErrorCode
);
1983 if(U_FAILURE(*pErrorCode
)) {
1984 udata_printError(ds
, "uchar_swapNames(token strings) failed\n");
1988 /* swap the group table */
1989 count
=ds
->readUInt16(*((const uint16_t *)(inBytes
+groupsOffset
)));
/* (1+count*3)*2 bytes: one uint16_t count followed by count triples of uint16_t */
1990 ds
->swapArray16(ds
, inBytes
+groupsOffset
, (int32_t)((1+count
*3)*2),
1991 outBytes
+groupsOffset
, pErrorCode
);
1994 * swap the group strings
1995 * swap the string bytes but not the nibble-encoded string lengths
/* string bytes only need remapping when the charset family changes */
1997 if(ds
->inCharset
!=ds
->outCharset
) {
1998 uint16_t offsets
[LINES_PER_GROUP
+1], lengths
[LINES_PER_GROUP
+1];
2000 const uint8_t *inStrings
, *nextInStrings
;
2001 uint8_t *outStrings
;
2005 inStrings
=inBytes
+groupStringOffset
;
2006 outStrings
=outBytes
+groupStringOffset
;
/* the group strings extend from groupStringOffset up to algNamesOffset */
2008 stringsCount
=algNamesOffset
-groupStringOffset
;
2010 /* iterate through string groups until only a few padding bytes are left */
2011 while(stringsCount
>32) {
/* expandGroupLengths() fills offsets[]/lengths[] and returns the position after the length bytes */
2012 nextInStrings
=expandGroupLengths(inStrings
, offsets
, lengths
);
2014 /* move past the length bytes */
2015 stringsCount
-=(uint32_t)(nextInStrings
-inStrings
);
2016 outStrings
+=nextInStrings
-inStrings
;
2017 inStrings
=nextInStrings
;
/* NOTE(review): index 31 is presumably LINES_PER_GROUP-1 (last line of the group) - confirm against GROUP_SHIFT */
2019 count
=offsets
[31]+lengths
[31]; /* total number of string bytes in this group */
2020 stringsCount
-=count
;
2022 /* swap the string bytes using map[] and trailMap[] */
2025 *outStrings
++=map
[c
];
2029 /* token lead byte: swap the trail byte, too */
2030 *outStrings
++=trailMap
[*inStrings
++];
2037 /* swap the algorithmic ranges */
2038 offset
=algNamesOffset
;
2039 count
=ds
->readUInt32(*((const uint32_t *)(inBytes
+offset
)));
/* 4 bytes: the uint32_t range count itself */
2040 ds
->swapArray32(ds
, inBytes
+offset
, 4, outBytes
+offset
, pErrorCode
);
2043 for(i
=0; i
<count
; ++i
) {
/* stop with an error once a range would start beyond the available input */
2044 if(offset
>(uint32_t)length
) {
2045 udata_printError(ds
, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n",
2047 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
2051 inRange
=(const AlgorithmicRange
*)(inBytes
+offset
);
2052 outRange
=(AlgorithmicRange
*)(outBytes
+offset
);
/* offset now refers to the range after this one */
2053 offset
+=ds
->readUInt16(inRange
->size
);
/* the first 8 bytes are swapped as 32-bit units (presumably start and end - confirm), then the 16-bit size */
2055 ds
->swapArray32(ds
, inRange
, 8, outRange
, pErrorCode
);
2056 ds
->swapArray16(ds
, &inRange
->size
, 2, &outRange
->size
, pErrorCode
);
/* type is a single byte and is read without swapping */
2057 switch(inRange
->type
) {
2059 /* swap prefix string */
/* the prefix string starts directly after the AlgorithmicRange struct (inRange+1) */
2060 ds
->swapInvChars(ds
, inRange
+1, (int32_t)uprv_strlen((const char *)(inRange
+1)),
2061 outRange
+1, pErrorCode
);
2062 if(U_FAILURE(*pErrorCode
)) {
2063 udata_printError(ds
, "uchar_swapNames(prefix string of algorithmic range %u) failed\n",
2070 /* swap factors and the prefix and factor strings */
2071 uint32_t factorsCount
;
/* for this range type the variant byte is used as the number of factors */
2073 factorsCount
=inRange
->variant
;
2074 p
=(const uint16_t *)(inRange
+1);
2075 q
=(uint16_t *)(outRange
+1);
/* the factors are 16-bit units stored right after the struct */
2076 ds
->swapArray16(ds
, p
, (int32_t)(factorsCount
*2), q
, pErrorCode
);
2078 /* swap the strings, up to the last terminating NUL */
/* number of bytes between p and the start of the next range */
2081 stringsCount
=(uint32_t)((inBytes
+offset
)-(const uint8_t *)p
);
/* trailing bytes after the last NUL are excluded from the charset swap */
2082 while(stringsCount
>0 && ((const uint8_t *)p
)[stringsCount
-1]!=0) {
2085 ds
->swapInvChars(ds
, p
, (int32_t)stringsCount
, q
, pErrorCode
);
2089 udata_printError(ds
, "uchar_swapNames(): unknown type %u of algorithmic range %u\n",
2091 *pErrorCode
=U_UNSUPPORTED_ERROR
;
/* total size: header plus the offset reached after the last algorithmic range */
2097 return headerSize
+(int32_t)offset
;
2101 * Hey, Emacs, please set the following:
2104 * indent-tabs-mode: nil