2 ******************************************************************************
4 * Copyright (C) 1999-2011, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 1999oct04
14 * created by: Markus W. Scherer
17 #include "unicode/utypes.h"
18 #include "unicode/putil.h"
19 #include "unicode/uchar.h"
20 #include "unicode/udata.h"
21 #include "unicode/utf.h"
22 #include "unicode/utf16.h"
31 /* prototypes ------------------------------------------------------------- */
33 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
35 static const char DATA_NAME
[] = "unames";
36 static const char DATA_TYPE
[] = "icu";
39 #define LINES_PER_GROUP (1L<<GROUP_SHIFT)
40 #define GROUP_MASK (LINES_PER_GROUP-1)
43 * This struct was replaced by explicitly accessing equivalent
44 * fields from triples of uint16_t.
45 * The Group struct was padded to 8 bytes on compilers for early ARM CPUs,
46 * which broke the assumption that sizeof(Group)==6 and that the ++ operator
47 * would advance by 6 bytes (3 uint16_t).
49 * We can't just change the data structure because it's loaded from a data file,
50 * and we don't want to make it less compact, so we changed the access code.
52 * For details see ICU tickets 6331 and 6008.
55 offsetHigh, offsetLow; / * avoid padding * /
66 * Get the 32-bit group offset.
67 * @param group (const uint16_t *) pointer to a Group triple of uint16_t
68 * @return group offset (int32_t)
70 #define GET_GROUP_OFFSET(group) ((int32_t)(group)[GROUP_OFFSET_HIGH]<<16|(group)[GROUP_OFFSET_LOW])
72 #define NEXT_GROUP(group) ((group)+GROUP_LENGTH)
73 #define PREV_GROUP(group) ((group)-GROUP_LENGTH)
77 uint8_t type
, variant
;
82 uint32_t tokenStringOffset
, groupsOffset
, groupStringOffset
, algNamesOffset
;
86 * Get the groups table from a UCharNames struct.
87 * The groups table consists of one uint16_t groupCount followed by
88 * groupCount groups. Each group is a triple of uint16_t, see GROUP_LENGTH
89 * and the comment for the old struct Group above.
91 * @param names (const UCharNames *) pointer to the UCharNames indexes
92 * @return (const uint16_t *) pointer to the groups table
94 #define GET_GROUPS(names) (const uint16_t *)((const char *)names+names->groupsOffset)
97 const char *otherName
;
101 #define DO_FIND_NAME NULL
103 static UDataMemory
*uCharNamesData
=NULL
;
104 static UCharNames
*uCharNames
=NULL
;
105 static UErrorCode gLoadErrorCode
=U_ZERO_ERROR
;
108 * Maximum length of character names (regular & 1.0).
110 static int32_t gMaxNameLength
=0;
113 * Set of chars used in character names (regular & 1.0).
114 * Chars are platform-dependent (can be EBCDIC).
116 static uint32_t gNameSet
[8]={ 0 };
118 #define U_NONCHARACTER_CODE_POINT U_CHAR_CATEGORY_COUNT
119 #define U_LEAD_SURROGATE U_CHAR_CATEGORY_COUNT + 1
120 #define U_TRAIL_SURROGATE U_CHAR_CATEGORY_COUNT + 2
122 #define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3)
124 static const char * const charCatNames
[U_CHAR_EXTENDED_CATEGORY_COUNT
] = {
133 "combining spacing mark",
134 "decimal digit number",
139 "paragraph separator",
147 "connector punctuation",
153 "initial punctuation",
160 /* implementation ----------------------------------------------------------- */
162 static UBool U_CALLCONV
unames_cleanup(void)
165 udata_close(uCharNamesData
);
166 uCharNamesData
= NULL
;
175 static UBool U_CALLCONV
176 isAcceptable(void * /*context*/,
177 const char * /*type*/, const char * /*name*/,
178 const UDataInfo
*pInfo
) {
181 pInfo
->isBigEndian
==U_IS_BIG_ENDIAN
&&
182 pInfo
->charsetFamily
==U_CHARSET_FAMILY
&&
183 pInfo
->dataFormat
[0]==0x75 && /* dataFormat="unam" */
184 pInfo
->dataFormat
[1]==0x6e &&
185 pInfo
->dataFormat
[2]==0x61 &&
186 pInfo
->dataFormat
[3]==0x6d &&
187 pInfo
->formatVersion
[0]==1);
191 isDataLoaded(UErrorCode
*pErrorCode
) {
192 /* load UCharNames from file if necessary */
195 /* do this because double-checked locking is broken */
196 UMTX_CHECK(NULL
, (uCharNames
!=NULL
), isCached
);
202 /* check error code from previous attempt */
203 if(U_FAILURE(gLoadErrorCode
)) {
204 *pErrorCode
=gLoadErrorCode
;
208 /* open the data outside the mutex block */
209 data
=udata_openChoice(NULL
, DATA_TYPE
, DATA_NAME
, isAcceptable
, NULL
, pErrorCode
);
210 if(U_FAILURE(*pErrorCode
)) {
211 gLoadErrorCode
=*pErrorCode
;
215 names
=(UCharNames
*)udata_getMemory(data
);
217 /* in the mutex block, set the data for this process */
220 if(uCharNames
==NULL
) {
225 ucln_common_registerCleanup(UCLN_COMMON_UNAMES
, unames_cleanup
);
230 /* if a different thread set it first, then close the extra data */
232 udata_close(data
); /* NULL if it was set correctly */
238 #define WRITE_CHAR(buffer, bufferLength, bufferPos, c) { \
239 if((bufferLength)>0) { \
246 #define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT
249 * Important: expandName() and compareName() are almost the same -
250 * apply fixes to both.
252 * UnicodeData.txt uses ';' as a field separator, so no
253 * field can contain ';' as part of its contents.
254 * In unames.dat, it is marked as token[';']==-1 only if the
255 * semicolon is used in the data file - which is iff we
256 * have Unicode 1.0 names or ISO comments or aliases.
257 * So, it will be token[';']==-1 if we store U1.0 names/ISO comments/aliases
258 * although we know that it will never be part of a name.
261 expandName(UCharNames
*names
,
262 const uint8_t *name
, uint16_t nameLength
, UCharNameChoice nameChoice
,
263 char *buffer
, uint16_t bufferLength
) {
264 uint16_t *tokens
=(uint16_t *)names
+8;
265 uint16_t token
, tokenCount
=*tokens
++, bufferPos
=0;
266 uint8_t *tokenStrings
=(uint8_t *)names
+names
->tokenStringOffset
;
269 if(nameChoice
!=U_UNICODE_CHAR_NAME
&& nameChoice
!=U_EXTENDED_CHAR_NAME
) {
271 * skip the modern name if it is not requested _and_
272 * if the semicolon byte value is a character, not a token number
274 if((uint8_t)';'>=tokenCount
|| tokens
[(uint8_t)';']==(uint16_t)(-1)) {
275 int fieldIndex
= nameChoice
==U_ISO_COMMENT
? 2 : nameChoice
;
277 while(nameLength
>0) {
283 } while(--fieldIndex
>0);
286 * the semicolon byte value is a token number, therefore
287 * only modern names are stored in unames.dat and there is no
288 * such requested alternate name here
294 /* write each letter directly, and write a token word per token */
295 while(nameLength
>0) {
301 /* implicit letter */
302 WRITE_CHAR(buffer
, bufferLength
, bufferPos
, c
);
309 if(token
==(uint16_t)(-2)) {
310 /* this is a lead byte for a double-byte token */
311 token
=tokens
[c
<<8|*name
++];
314 if(token
==(uint16_t)(-1)) {
316 /* explicit letter */
317 WRITE_CHAR(buffer
, bufferLength
, bufferPos
, c
);
319 /* stop, but skip the semicolon if we are seeking
320 extended names and there was no 2.0 name but there
322 if(!bufferPos
&& nameChoice
== U_EXTENDED_CHAR_NAME
) {
323 if ((uint8_t)';'>=tokenCount
|| tokens
[(uint8_t)';']==(uint16_t)(-1)) {
331 /* write token word */
332 uint8_t *tokenString
=tokenStrings
+token
;
333 while((c
=*tokenString
++)!=0) {
334 WRITE_CHAR(buffer
, bufferLength
, bufferPos
, c
);
349 * compareName() is almost the same as expandName() except that it compares
350 * the currently expanded name to an input name.
351 * It returns the match/no match result as soon as possible.
354 compareName(UCharNames
*names
,
355 const uint8_t *name
, uint16_t nameLength
, UCharNameChoice nameChoice
,
356 const char *otherName
) {
357 uint16_t *tokens
=(uint16_t *)names
+8;
358 uint16_t token
, tokenCount
=*tokens
++;
359 uint8_t *tokenStrings
=(uint8_t *)names
+names
->tokenStringOffset
;
361 const char *origOtherName
= otherName
;
363 if(nameChoice
!=U_UNICODE_CHAR_NAME
&& nameChoice
!=U_EXTENDED_CHAR_NAME
) {
365 * skip the modern name if it is not requested _and_
366 * if the semicolon byte value is a character, not a token number
368 if((uint8_t)';'>=tokenCount
|| tokens
[(uint8_t)';']==(uint16_t)(-1)) {
369 int fieldIndex
= nameChoice
==U_ISO_COMMENT
? 2 : nameChoice
;
371 while(nameLength
>0) {
377 } while(--fieldIndex
>0);
380 * the semicolon byte value is a token number, therefore
381 * only modern names are stored in unames.dat and there is no
382 * such requested alternate name here
388 /* compare each letter directly, and compare a token word per token */
389 while(nameLength
>0) {
395 /* implicit letter */
396 if((char)c
!=*otherName
++) {
405 if(token
==(uint16_t)(-2)) {
406 /* this is a lead byte for a double-byte token */
407 token
=tokens
[c
<<8|*name
++];
410 if(token
==(uint16_t)(-1)) {
412 /* explicit letter */
413 if((char)c
!=*otherName
++) {
417 /* stop, but skip the semicolon if we are seeking
418 extended names and there was no 2.0 name but there
420 if(otherName
== origOtherName
&& nameChoice
== U_EXTENDED_CHAR_NAME
) {
421 if ((uint8_t)';'>=tokenCount
|| tokens
[(uint8_t)';']==(uint16_t)(-1)) {
429 /* compare token word */
430 uint8_t *tokenString
=tokenStrings
+token
;
431 while((c
=*tokenString
++)!=0) {
432 if((char)c
!=*otherName
++) {
440 /* complete match? */
441 return (UBool
)(*otherName
==0);
444 static uint8_t getCharCat(UChar32 cp
) {
447 if (U_IS_UNICODE_NONCHAR(cp
)) {
448 return U_NONCHARACTER_CODE_POINT
;
451 if ((cat
= u_charType(cp
)) == U_SURROGATE
) {
452 cat
= U_IS_LEAD(cp
) ? U_LEAD_SURROGATE
: U_TRAIL_SURROGATE
;
458 static const char *getCharCatName(UChar32 cp
) {
459 uint8_t cat
= getCharCat(cp
);
461 /* Return unknown if the table of names above is not up to
464 if (cat
>= LENGTHOF(charCatNames
)) {
467 return charCatNames
[cat
];
471 static uint16_t getExtName(uint32_t code
, char *buffer
, uint16_t bufferLength
) {
472 const char *catname
= getCharCatName(code
);
478 WRITE_CHAR(buffer
, bufferLength
, length
, '<');
479 while (catname
[length
- 1]) {
480 WRITE_CHAR(buffer
, bufferLength
, length
, catname
[length
- 1]);
482 WRITE_CHAR(buffer
, bufferLength
, length
, '-');
483 for (cp
= code
, ndigits
= 0; cp
; ++ndigits
, cp
>>= 4)
487 for (cp
= code
, i
= ndigits
; (cp
|| i
> 0) && bufferLength
; cp
>>= 4, bufferLength
--) {
488 uint8_t v
= (uint8_t)(cp
& 0xf);
489 buffer
[--i
] = (v
< 10 ? '0' + v
: 'A' + v
- 10);
493 WRITE_CHAR(buffer
, bufferLength
, length
, '>');
499 * getGroup() does a binary search for the group that contains the
500 * Unicode code point "code".
501 * The return value is always a valid Group* that may contain "code"
502 * or else is the highest group before "code".
503 * If the lowest group is after "code", then that one is returned.
505 static const uint16_t *
506 getGroup(UCharNames
*names
, uint32_t code
) {
507 const uint16_t *groups
=GET_GROUPS(names
);
508 uint16_t groupMSB
=(uint16_t)(code
>>GROUP_SHIFT
),
513 /* binary search for the group of names that contains the one for code */
514 while(start
<limit
-1) {
515 number
=(uint16_t)((start
+limit
)/2);
516 if(groupMSB
<groups
[number
*GROUP_LENGTH
+GROUP_MSB
]) {
523 /* return this regardless of whether it is an exact match */
524 return groups
+start
*GROUP_LENGTH
;
528 * expandGroupLengths() reads a block of compressed lengths of 32 strings and
529 * expands them into offsets and lengths for each string.
530 * Lengths are stored with a variable-width encoding in consecutive nibbles:
531 * If a nibble<0xc, then it is the length itself (0=empty string).
532 * If a nibble>=0xc, then it forms a length value with the following nibble.
533 * Calculation see below.
534 * The offsets and lengths arrays must be at least 33 (one more) long because
535 * there is no check here at the end if the last nibble is still used.
537 static const uint8_t *
538 expandGroupLengths(const uint8_t *s
,
539 uint16_t offsets
[LINES_PER_GROUP
+1], uint16_t lengths
[LINES_PER_GROUP
+1]) {
540 /* read the lengths of the 32 strings in this group and get each string's offset */
541 uint16_t i
=0, offset
=0, length
=0;
544 /* all 32 lengths must be read to get the offset of the first group string */
545 while(i
<LINES_PER_GROUP
) {
548 /* read even nibble - MSBs of lengthByte */
550 /* double-nibble length spread across two bytes */
551 length
=(uint16_t)(((length
&0x3)<<4|lengthByte
>>4)+12);
553 } else if((lengthByte
/* &0xf0 */)>=0xc0) {
554 /* double-nibble length spread across this one byte */
555 length
=(uint16_t)((lengthByte
&0x3f)+12);
557 /* single-nibble length in MSBs */
558 length
=(uint16_t)(lengthByte
>>4);
568 /* read odd nibble - LSBs of lengthByte */
569 if((lengthByte
&0xf0)==0) {
570 /* this nibble was not consumed for a double-nibble length above */
573 /* single-nibble length in LSBs */
581 length
=0; /* prevent double-nibble detection in the next iteration */
585 /* now, s is at the first group string */
590 expandGroupName(UCharNames
*names
, const uint16_t *group
,
591 uint16_t lineNumber
, UCharNameChoice nameChoice
,
592 char *buffer
, uint16_t bufferLength
) {
593 uint16_t offsets
[LINES_PER_GROUP
+2], lengths
[LINES_PER_GROUP
+2];
594 const uint8_t *s
=(uint8_t *)names
+names
->groupStringOffset
+GET_GROUP_OFFSET(group
);
595 s
=expandGroupLengths(s
, offsets
, lengths
);
596 return expandName(names
, s
+offsets
[lineNumber
], lengths
[lineNumber
], nameChoice
,
597 buffer
, bufferLength
);
601 getName(UCharNames
*names
, uint32_t code
, UCharNameChoice nameChoice
,
602 char *buffer
, uint16_t bufferLength
) {
603 const uint16_t *group
=getGroup(names
, code
);
604 if((uint16_t)(code
>>GROUP_SHIFT
)==group
[GROUP_MSB
]) {
605 return expandGroupName(names
, group
, (uint16_t)(code
&GROUP_MASK
), nameChoice
,
606 buffer
, bufferLength
);
608 /* group not found */
618 * enumGroupNames() enumerates all the names in a 32-group
619 * and either calls the enumerator function or finds a given input name.
622 enumGroupNames(UCharNames
*names
, const uint16_t *group
,
623 UChar32 start
, UChar32 end
,
624 UEnumCharNamesFn
*fn
, void *context
,
625 UCharNameChoice nameChoice
) {
626 uint16_t offsets
[LINES_PER_GROUP
+2], lengths
[LINES_PER_GROUP
+2];
627 const uint8_t *s
=(uint8_t *)names
+names
->groupStringOffset
+GET_GROUP_OFFSET(group
);
629 s
=expandGroupLengths(s
, offsets
, lengths
);
630 if(fn
!=DO_FIND_NAME
) {
635 length
=expandName(names
, s
+offsets
[start
&GROUP_MASK
], lengths
[start
&GROUP_MASK
], nameChoice
, buffer
, sizeof(buffer
));
636 if (!length
&& nameChoice
== U_EXTENDED_CHAR_NAME
) {
637 buffer
[length
= getExtName(start
, buffer
, sizeof(buffer
))] = 0;
639 /* here, we assume that the buffer is large enough */
641 if(!fn(context
, start
, nameChoice
, buffer
, length
)) {
648 const char *otherName
=((FindName
*)context
)->otherName
;
650 if(compareName(names
, s
+offsets
[start
&GROUP_MASK
], lengths
[start
&GROUP_MASK
], nameChoice
, otherName
)) {
651 ((FindName
*)context
)->code
=start
;
661 * enumExtNames() enumerates extended names.
662 * It only needs to do it if it is called with a real function and not
663 * with the dummy DO_FIND_NAME, because u_charFromName() does a check
664 * for extended names by itself.
667 enumExtNames(UChar32 start
, UChar32 end
,
668 UEnumCharNamesFn
*fn
, void *context
)
670 if(fn
!=DO_FIND_NAME
) {
675 buffer
[length
= getExtName(start
, buffer
, sizeof(buffer
))] = 0;
676 /* here, we assume that the buffer is large enough */
678 if(!fn(context
, start
, U_EXTENDED_CHAR_NAME
, buffer
, length
)) {
690 enumNames(UCharNames
*names
,
691 UChar32 start
, UChar32 limit
,
692 UEnumCharNamesFn
*fn
, void *context
,
693 UCharNameChoice nameChoice
) {
694 uint16_t startGroupMSB
, endGroupMSB
, groupCount
;
695 const uint16_t *group
, *groupLimit
;
697 startGroupMSB
=(uint16_t)(start
>>GROUP_SHIFT
);
698 endGroupMSB
=(uint16_t)((limit
-1)>>GROUP_SHIFT
);
700 /* find the group that contains start, or the highest before it */
701 group
=getGroup(names
, start
);
703 if(startGroupMSB
<group
[GROUP_MSB
] && nameChoice
==U_EXTENDED_CHAR_NAME
) {
704 /* enumerate synthetic names between start and the group start */
705 UChar32 extLimit
=((UChar32
)group
[GROUP_MSB
]<<GROUP_SHIFT
);
709 if(!enumExtNames(start
, extLimit
-1, fn
, context
)) {
715 if(startGroupMSB
==endGroupMSB
) {
716 if(startGroupMSB
==group
[GROUP_MSB
]) {
717 /* if start and limit-1 are in the same group, then enumerate only in that one */
718 return enumGroupNames(names
, group
, start
, limit
-1, fn
, context
, nameChoice
);
721 const uint16_t *groups
=GET_GROUPS(names
);
722 groupCount
=*groups
++;
723 groupLimit
=groups
+groupCount
*GROUP_LENGTH
;
725 if(startGroupMSB
==group
[GROUP_MSB
]) {
726 /* enumerate characters in the partial start group */
727 if((start
&GROUP_MASK
)!=0) {
728 if(!enumGroupNames(names
, group
,
729 start
, ((UChar32
)startGroupMSB
<<GROUP_SHIFT
)+LINES_PER_GROUP
-1,
730 fn
, context
, nameChoice
)) {
733 group
=NEXT_GROUP(group
); /* continue with the next group */
735 } else if(startGroupMSB
>group
[GROUP_MSB
]) {
736 /* make sure that we start enumerating with the first group after start */
737 const uint16_t *nextGroup
=NEXT_GROUP(group
);
738 if (nextGroup
< groupLimit
&& nextGroup
[GROUP_MSB
] > startGroupMSB
&& nameChoice
== U_EXTENDED_CHAR_NAME
) {
739 UChar32 end
= nextGroup
[GROUP_MSB
] << GROUP_SHIFT
;
743 if (!enumExtNames(start
, end
- 1, fn
, context
)) {
750 /* enumerate entire groups between the start- and end-groups */
751 while(group
<groupLimit
&& group
[GROUP_MSB
]<endGroupMSB
) {
752 const uint16_t *nextGroup
;
753 start
=(UChar32
)group
[GROUP_MSB
]<<GROUP_SHIFT
;
754 if(!enumGroupNames(names
, group
, start
, start
+LINES_PER_GROUP
-1, fn
, context
, nameChoice
)) {
757 nextGroup
=NEXT_GROUP(group
);
758 if (nextGroup
< groupLimit
&& nextGroup
[GROUP_MSB
] > group
[GROUP_MSB
] + 1 && nameChoice
== U_EXTENDED_CHAR_NAME
) {
759 UChar32 end
= nextGroup
[GROUP_MSB
] << GROUP_SHIFT
;
763 if (!enumExtNames((group
[GROUP_MSB
] + 1) << GROUP_SHIFT
, end
- 1, fn
, context
)) {
770 /* enumerate within the end group (group[GROUP_MSB]==endGroupMSB) */
771 if(group
<groupLimit
&& group
[GROUP_MSB
]==endGroupMSB
) {
772 return enumGroupNames(names
, group
, (limit
-1)&~GROUP_MASK
, limit
-1, fn
, context
, nameChoice
);
773 } else if (nameChoice
== U_EXTENDED_CHAR_NAME
&& group
== groupLimit
) {
774 UChar32 next
= (PREV_GROUP(group
)[GROUP_MSB
] + 1) << GROUP_SHIFT
;
783 /* we have not found a group, which means everything is made of
785 if (nameChoice
== U_EXTENDED_CHAR_NAME
) {
786 if (limit
> UCHAR_MAX_VALUE
+ 1) {
787 limit
= UCHAR_MAX_VALUE
+ 1;
789 return enumExtNames(start
, limit
- 1, fn
, context
);
796 writeFactorSuffix(const uint16_t *factors
, uint16_t count
,
797 const char *s
, /* suffix elements */
799 uint16_t indexes
[8], /* output fields from here */
800 const char *elementBases
[8], const char *elements
[8],
801 char *buffer
, uint16_t bufferLength
) {
802 uint16_t i
, factor
, bufferPos
=0;
805 /* write elements according to the factors */
808 * the factorized elements are determined by modulo arithmetic
809 * with the factors of this algorithm
811 * note that for fewer operations, count is decremented here
814 for(i
=count
; i
>0; --i
) {
816 indexes
[i
]=(uint16_t)(code%factor
);
820 * we don't need to calculate the last modulus because start<=code<=end
821 * guarantees here that code<=factors[0]
823 indexes
[0]=(uint16_t)code
;
825 /* write each element */
827 if(elementBases
!=NULL
) {
831 /* skip indexes[i] strings */
843 WRITE_CHAR(buffer
, bufferLength
, bufferPos
, c
);
846 /* we do not need to perform the rest of this loop for i==count - break here */
851 /* skip the rest of the strings for this factors[i] */
852 factor
=(uint16_t)(factors
[i
]-indexes
[i
]-1);
871 * Parts of findAlgName() are almost the same as some of getAlgName().
872 * Fixes must be applied to both.
875 getAlgName(AlgorithmicRange
*range
, uint32_t code
, UCharNameChoice nameChoice
,
876 char *buffer
, uint16_t bufferLength
) {
877 uint16_t bufferPos
=0;
879 /* Only the normative character name can be algorithmic. */
880 if(nameChoice
!=U_UNICODE_CHAR_NAME
&& nameChoice
!=U_EXTENDED_CHAR_NAME
) {
888 switch(range
->type
) {
890 /* name = prefix hex-digits */
891 const char *s
=(const char *)(range
+1);
898 WRITE_CHAR(buffer
, bufferLength
, bufferPos
, c
);
901 /* write hexadecimal code point value */
902 count
=range
->variant
;
905 if(count
<bufferLength
) {
910 if(--i
<bufferLength
) {
926 /* name = prefix factorized-elements */
928 const uint16_t *factors
=(const uint16_t *)(range
+1);
929 uint16_t count
=range
->variant
;
930 const char *s
=(const char *)(factors
+count
);
935 WRITE_CHAR(buffer
, bufferLength
, bufferPos
, c
);
938 bufferPos
+=writeFactorSuffix(factors
, count
,
939 s
, code
-range
->start
, indexes
, NULL
, NULL
, buffer
, bufferLength
);
955 * Important: enumAlgNames() and findAlgName() are almost the same.
956 * Any fix must be applied to both.
959 enumAlgNames(AlgorithmicRange
*range
,
960 UChar32 start
, UChar32 limit
,
961 UEnumCharNamesFn
*fn
, void *context
,
962 UCharNameChoice nameChoice
) {
966 if(nameChoice
!=U_UNICODE_CHAR_NAME
&& nameChoice
!=U_EXTENDED_CHAR_NAME
) {
970 switch(range
->type
) {
975 /* get the full name of the start character */
976 length
=getAlgName(range
, (uint32_t)start
, nameChoice
, buffer
, sizeof(buffer
));
981 /* call the enumerator function with this first character */
982 if(!fn(context
, start
, nameChoice
, buffer
, length
)) {
986 /* go to the end of the name; all these names have the same length */
992 /* enumerate the rest of the names */
993 while(++start
<limit
) {
994 /* increment the hexadecimal number on a character-basis */
998 if(('0'<=c
&& c
<'9') || ('A'<=c
&& c
<'F')) {
1009 if(!fn(context
, start
, nameChoice
, buffer
, length
)) {
1016 uint16_t indexes
[8];
1017 const char *elementBases
[8], *elements
[8];
1018 const uint16_t *factors
=(const uint16_t *)(range
+1);
1019 uint16_t count
=range
->variant
;
1020 const char *s
=(const char *)(factors
+count
);
1022 uint16_t prefixLength
, i
, idx
;
1026 /* name = prefix factorized-elements */
1031 while((c
=*s
++)!=0) {
1036 /* append the suffix of the start character */
1037 length
=(uint16_t)(prefixLength
+writeFactorSuffix(factors
, count
,
1038 s
, (uint32_t)start
-range
->start
,
1039 indexes
, elementBases
, elements
,
1040 suffix
, (uint16_t)(sizeof(buffer
)-prefixLength
)));
1042 /* call the enumerator function with this first character */
1043 if(!fn(context
, start
, nameChoice
, buffer
, length
)) {
1047 /* enumerate the rest of the names */
1048 while(++start
<limit
) {
1049 /* increment the indexes in lexical order bound by the factors */
1052 idx
=(uint16_t)(indexes
[--i
]+1);
1053 if(idx
<factors
[i
]) {
1054 /* skip one index and its element string */
1062 /* reset this index to 0 and its element string to the first one */
1064 elements
[i
]=elementBases
[i
];
1068 /* to make matters a little easier, just append all elements to the suffix */
1070 length
=prefixLength
;
1071 for(i
=0; i
<count
; ++i
) {
1073 while((c
=*s
++)!=0) {
1078 /* zero-terminate */
1081 if(!fn(context
, start
, nameChoice
, buffer
, length
)) {
1088 /* undefined type */
1096 * findAlgName() is almost the same as enumAlgNames() except that it
1097 * returns the code point for a name if it fits into the range.
1098 * It returns 0xffff otherwise.
1101 findAlgName(AlgorithmicRange
*range
, UCharNameChoice nameChoice
, const char *otherName
) {
1104 if(nameChoice
!=U_UNICODE_CHAR_NAME
&& nameChoice
!=U_EXTENDED_CHAR_NAME
) {
1108 switch(range
->type
) {
1110 /* name = prefix hex-digits */
1111 const char *s
=(const char *)(range
+1);
1116 /* compare prefix */
1117 while((c
=*s
++)!=0) {
1118 if((char)c
!=*otherName
++) {
1123 /* read hexadecimal code point value */
1124 count
=range
->variant
;
1126 for(i
=0; i
<count
; ++i
) {
1128 if('0'<=c
&& c
<='9') {
1129 code
=(code
<<4)|(c
-'0');
1130 } else if('A'<=c
&& c
<='F') {
1131 code
=(code
<<4)|(c
-'A'+10);
1137 /* does it fit into the range? */
1138 if(*otherName
==0 && range
->start
<=(uint32_t)code
&& (uint32_t)code
<=range
->end
) {
1145 uint16_t indexes
[8];
1146 const char *elementBases
[8], *elements
[8];
1147 const uint16_t *factors
=(const uint16_t *)(range
+1);
1148 uint16_t count
=range
->variant
;
1149 const char *s
=(const char *)(factors
+count
), *t
;
1150 UChar32 start
, limit
;
1155 /* name = prefix factorized-elements */
1157 /* compare prefix */
1158 while((c
=*s
++)!=0) {
1159 if((char)c
!=*otherName
++) {
1164 start
=(UChar32
)range
->start
;
1165 limit
=(UChar32
)(range
->end
+1);
1167 /* initialize the suffix elements for enumeration; indexes should all be set to 0 */
1168 writeFactorSuffix(factors
, count
, s
, 0,
1169 indexes
, elementBases
, elements
, buffer
, sizeof(buffer
));
1171 /* compare the first suffix */
1172 if(0==uprv_strcmp(otherName
, buffer
)) {
1176 /* enumerate and compare the rest of the suffixes */
1177 while(++start
<limit
) {
1178 /* increment the indexes in lexical order bound by the factors */
1181 idx
=(uint16_t)(indexes
[--i
]+1);
1182 if(idx
<factors
[i
]) {
1183 /* skip one index and its element string */
1190 /* reset this index to 0 and its element string to the first one */
1192 elements
[i
]=elementBases
[i
];
1196 /* to make matters a little easier, just compare all elements of the suffix */
1198 for(i
=0; i
<count
; ++i
) {
1200 while((c
=*s
++)!=0) {
1202 s
=""; /* does not match */
1214 /* undefined type */
1221 /* sets of name characters, maximum name lengths ---------------------------- */
1223 #define SET_ADD(set, c) ((set)[(uint8_t)c>>5]|=((uint32_t)1<<((uint8_t)c&0x1f)))
1224 #define SET_CONTAINS(set, c) (((set)[(uint8_t)c>>5]&((uint32_t)1<<((uint8_t)c&0x1f)))!=0)
1227 calcStringSetLength(uint32_t set
[8], const char *s
) {
1231 while((c
=*s
++)!=0) {
1239 calcAlgNameSetsLengths(int32_t maxNameLength
) {
1240 AlgorithmicRange
*range
;
1242 uint32_t rangeCount
;
1245 /* enumerate algorithmic ranges */
1246 p
=(uint32_t *)((uint8_t *)uCharNames
+uCharNames
->algNamesOffset
);
1248 range
=(AlgorithmicRange
*)(p
+1);
1249 while(rangeCount
>0) {
1250 switch(range
->type
) {
1252 /* name = prefix + (range->variant times) hex-digits */
1254 length
=calcStringSetLength(gNameSet
, (const char *)(range
+1))+range
->variant
;
1255 if(length
>maxNameLength
) {
1256 maxNameLength
=length
;
1260 /* name = prefix factorized-elements */
1261 const uint16_t *factors
=(const uint16_t *)(range
+1);
1263 int32_t i
, count
=range
->variant
, factor
, factorLength
, maxFactorLength
;
1266 s
=(const char *)(factors
+count
);
1267 length
=calcStringSetLength(gNameSet
, s
);
1268 s
+=length
+1; /* start of factor suffixes */
1270 /* get the set and maximum factor suffix length for each factor */
1271 for(i
=0; i
<count
; ++i
) {
1273 for(factor
=factors
[i
]; factor
>0; --factor
) {
1274 factorLength
=calcStringSetLength(gNameSet
, s
);
1276 if(factorLength
>maxFactorLength
) {
1277 maxFactorLength
=factorLength
;
1280 length
+=maxFactorLength
;
1283 if(length
>maxNameLength
) {
1284 maxNameLength
=length
;
1293 range
=(AlgorithmicRange
*)((uint8_t *)range
+range
->size
);
1296 return maxNameLength
;
1300 calcExtNameSetsLengths(int32_t maxNameLength
) {
1303 for(i
=0; i
<LENGTHOF(charCatNames
); ++i
) {
1305 * for each category, count the length of the category name
1309 * 6 for most hex digits per code point
1311 length
=9+calcStringSetLength(gNameSet
, charCatNames
[i
]);
1312 if(length
>maxNameLength
) {
1313 maxNameLength
=length
;
1316 return maxNameLength
;
1320 calcNameSetLength(const uint16_t *tokens
, uint16_t tokenCount
, const uint8_t *tokenStrings
, int8_t *tokenLengths
,
1322 const uint8_t **pLine
, const uint8_t *lineLimit
) {
1323 const uint8_t *line
=*pLine
;
1324 int32_t length
=0, tokenLength
;
1327 while(line
!=lineLimit
&& (c
=*line
++)!=(uint8_t)';') {
1329 /* implicit letter */
1334 if(token
==(uint16_t)(-2)) {
1335 /* this is a lead byte for a double-byte token */
1339 if(token
==(uint16_t)(-1)) {
1340 /* explicit letter */
1344 /* count token word */
1345 if(tokenLengths
!=NULL
) {
1346 /* use cached token length */
1347 tokenLength
=tokenLengths
[c
];
1348 if(tokenLength
==0) {
1349 tokenLength
=calcStringSetLength(set
, (const char *)tokenStrings
+token
);
1350 tokenLengths
[c
]=(int8_t)tokenLength
;
1353 tokenLength
=calcStringSetLength(set
, (const char *)tokenStrings
+token
);
1355 length
+=tokenLength
;
1365 calcGroupNameSetsLengths(int32_t maxNameLength
) {
1366 uint16_t offsets
[LINES_PER_GROUP
+2], lengths
[LINES_PER_GROUP
+2];
1368 uint16_t *tokens
=(uint16_t *)uCharNames
+8;
1369 uint16_t tokenCount
=*tokens
++;
1370 uint8_t *tokenStrings
=(uint8_t *)uCharNames
+uCharNames
->tokenStringOffset
;
1372 int8_t *tokenLengths
;
1374 const uint16_t *group
;
1375 const uint8_t *s
, *line
, *lineLimit
;
1377 int32_t groupCount
, lineNumber
, length
;
1379 tokenLengths
=(int8_t *)uprv_malloc(tokenCount
);
1380 if(tokenLengths
!=NULL
) {
1381 uprv_memset(tokenLengths
, 0, tokenCount
);
1384 group
=GET_GROUPS(uCharNames
);
1385 groupCount
=*group
++;
1387 /* enumerate all groups */
1388 while(groupCount
>0) {
1389 s
=(uint8_t *)uCharNames
+uCharNames
->groupStringOffset
+GET_GROUP_OFFSET(group
);
1390 s
=expandGroupLengths(s
, offsets
, lengths
);
1392 /* enumerate all lines in each group */
1393 for(lineNumber
=0; lineNumber
<LINES_PER_GROUP
; ++lineNumber
) {
1394 line
=s
+offsets
[lineNumber
];
1395 length
=lengths
[lineNumber
];
1400 lineLimit
=line
+length
;
1402 /* read regular name */
1403 length
=calcNameSetLength(tokens
, tokenCount
, tokenStrings
, tokenLengths
, gNameSet
, &line
, lineLimit
);
1404 if(length
>maxNameLength
) {
1405 maxNameLength
=length
;
1407 if(line
==lineLimit
) {
1411 /* read Unicode 1.0 name */
1412 length
=calcNameSetLength(tokens
, tokenCount
, tokenStrings
, tokenLengths
, gNameSet
, &line
, lineLimit
);
1413 if(length
>maxNameLength
) {
1414 maxNameLength
=length
;
1416 if(line
==lineLimit
) {
1420 /* read ISO comment */
1421 /*length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit);*/
1424 group
=NEXT_GROUP(group
);
1428 if(tokenLengths
!=NULL
) {
1429 uprv_free(tokenLengths
);
1432 /* set gMax... - name length last for threading */
1433 gMaxNameLength
=maxNameLength
;
1437 calcNameSetsLengths(UErrorCode
*pErrorCode
) {
1438 static const char extChars
[]="0123456789ABCDEF<>-";
1439 int32_t i
, maxNameLength
;
1441 if(gMaxNameLength
!=0) {
1445 if(!isDataLoaded(pErrorCode
)) {
1449 /* set hex digits, used in various names, and <>-, used in extended names */
1450 for(i
=0; i
<(int32_t)sizeof(extChars
)-1; ++i
) {
1451 SET_ADD(gNameSet
, extChars
[i
]);
1454 /* set sets and lengths from algorithmic names */
1455 maxNameLength
=calcAlgNameSetsLengths(0);
1457 /* set sets and lengths from extended names */
1458 maxNameLength
=calcExtNameSetsLengths(maxNameLength
);
1460 /* set sets and lengths from group names, set global maximum values */
1461 calcGroupNameSetsLengths(maxNameLength
);
1466 /* public API --------------------------------------------------------------- */
1468 U_CAPI
int32_t U_EXPORT2
1469 u_charName(UChar32 code
, UCharNameChoice nameChoice
,
1470 char *buffer
, int32_t bufferLength
,
1471 UErrorCode
*pErrorCode
) {
1472 AlgorithmicRange
*algRange
;
1477 /* check the argument values */
1478 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
1480 } else if(nameChoice
>=U_CHAR_NAME_CHOICE_COUNT
||
1481 bufferLength
<0 || (bufferLength
>0 && buffer
==NULL
)
1483 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1487 if((uint32_t)code
>UCHAR_MAX_VALUE
|| !isDataLoaded(pErrorCode
)) {
1488 return u_terminateChars(buffer
, bufferLength
, 0, pErrorCode
);
1493 /* try algorithmic names first */
1494 p
=(uint32_t *)((uint8_t *)uCharNames
+uCharNames
->algNamesOffset
);
1496 algRange
=(AlgorithmicRange
*)(p
+1);
1498 if(algRange
->start
<=(uint32_t)code
&& (uint32_t)code
<=algRange
->end
) {
1499 length
=getAlgName(algRange
, (uint32_t)code
, nameChoice
, buffer
, (uint16_t)bufferLength
);
1502 algRange
=(AlgorithmicRange
*)((uint8_t *)algRange
+algRange
->size
);
1507 if (nameChoice
== U_EXTENDED_CHAR_NAME
) {
1508 length
= getName(uCharNames
, (uint32_t )code
, U_EXTENDED_CHAR_NAME
, buffer
, (uint16_t) bufferLength
);
1510 /* extended character name */
1511 length
= getExtName((uint32_t) code
, buffer
, (uint16_t) bufferLength
);
1514 /* normal character name */
1515 length
=getName(uCharNames
, (uint32_t)code
, nameChoice
, buffer
, (uint16_t)bufferLength
);
1519 return u_terminateChars(buffer
, bufferLength
, length
, pErrorCode
);
1522 U_CAPI
int32_t U_EXPORT2
1523 u_getISOComment(UChar32
/*c*/,
1524 char *dest
, int32_t destCapacity
,
1525 UErrorCode
*pErrorCode
) {
1526 /* check the argument values */
1527 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
1529 } else if(destCapacity
<0 || (destCapacity
>0 && dest
==NULL
)) {
1530 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1534 return u_terminateChars(dest
, destCapacity
, 0, pErrorCode
);
1537 U_CAPI UChar32 U_EXPORT2
1538 u_charFromName(UCharNameChoice nameChoice
,
1540 UErrorCode
*pErrorCode
) {
1541 char upper
[120], lower
[120];
1543 AlgorithmicRange
*algRange
;
1548 UChar32 error
= 0xffff; /* Undefined, but use this for backwards compatibility. */
1550 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
1554 if(nameChoice
>=U_CHAR_NAME_CHOICE_COUNT
|| name
==NULL
|| *name
==0) {
1555 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1559 if(!isDataLoaded(pErrorCode
)) {
1563 /* construct the uppercase and lowercase of the name first */
1564 for(i
=0; i
<sizeof(upper
); ++i
) {
1565 if((c0
=*name
++)!=0) {
1566 upper
[i
]=uprv_toupper(c0
);
1567 lower
[i
]=uprv_tolower(c0
);
1569 upper
[i
]=lower
[i
]=0;
1573 if(i
==sizeof(upper
)) {
1574 /* name too long, there is no such character */
1575 *pErrorCode
= U_ILLEGAL_CHAR_FOUND
;
1579 /* try extended names first */
1580 if (lower
[0] == '<') {
1581 if (nameChoice
== U_EXTENDED_CHAR_NAME
) {
1582 if (lower
[--i
] == '>') {
1583 for (--i
; lower
[i
] && lower
[i
] != '-'; --i
) {
1586 if (lower
[i
] == '-') { /* We've got a category. */
1591 for (++i
; lower
[i
] != '>'; ++i
) {
1592 if (lower
[i
] >= '0' && lower
[i
] <= '9') {
1593 cp
= (cp
<< 4) + lower
[i
] - '0';
1594 } else if (lower
[i
] >= 'a' && lower
[i
] <= 'f') {
1595 cp
= (cp
<< 4) + lower
[i
] - 'a' + 10;
1597 *pErrorCode
= U_ILLEGAL_CHAR_FOUND
;
1602 /* Now validate the category name.
1603 We could use a binary search, or a trie, if
1604 we really wanted to. */
1606 for (lower
[i
] = 0, cIdx
= 0; cIdx
< LENGTHOF(charCatNames
); ++cIdx
) {
1608 if (!uprv_strcmp(lower
+ 1, charCatNames
[cIdx
])) {
1609 if (getCharCat(cp
) == cIdx
) {
1619 *pErrorCode
= U_ILLEGAL_CHAR_FOUND
;
1623 /* try algorithmic names now */
1624 p
=(uint32_t *)((uint8_t *)uCharNames
+uCharNames
->algNamesOffset
);
1626 algRange
=(AlgorithmicRange
*)(p
+1);
1628 if((cp
=findAlgName(algRange
, nameChoice
, upper
))!=0xffff) {
1631 algRange
=(AlgorithmicRange
*)((uint8_t *)algRange
+algRange
->size
);
1635 /* normal character name */
1636 findName
.otherName
=upper
;
1637 findName
.code
=error
;
1638 enumNames(uCharNames
, 0, UCHAR_MAX_VALUE
+ 1, DO_FIND_NAME
, &findName
, nameChoice
);
1639 if (findName
.code
== error
) {
1640 *pErrorCode
= U_ILLEGAL_CHAR_FOUND
;
1642 return findName
.code
;
1645 U_CAPI
void U_EXPORT2
1646 u_enumCharNames(UChar32 start
, UChar32 limit
,
1647 UEnumCharNamesFn
*fn
,
1649 UCharNameChoice nameChoice
,
1650 UErrorCode
*pErrorCode
) {
1651 AlgorithmicRange
*algRange
;
1655 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
1659 if(nameChoice
>=U_CHAR_NAME_CHOICE_COUNT
|| fn
==NULL
) {
1660 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1664 if((uint32_t) limit
> UCHAR_MAX_VALUE
+ 1) {
1665 limit
= UCHAR_MAX_VALUE
+ 1;
1667 if((uint32_t)start
>=(uint32_t)limit
) {
1671 if(!isDataLoaded(pErrorCode
)) {
1675 /* interleave the data-driven ones with the algorithmic ones */
1676 /* iterate over all algorithmic ranges; assume that they are in ascending order */
1677 p
=(uint32_t *)((uint8_t *)uCharNames
+uCharNames
->algNamesOffset
);
1679 algRange
=(AlgorithmicRange
*)(p
+1);
1681 /* enumerate the character names before the current algorithmic range */
1682 /* here: start<limit */
1683 if((uint32_t)start
<algRange
->start
) {
1684 if((uint32_t)limit
<=algRange
->start
) {
1685 enumNames(uCharNames
, start
, limit
, fn
, context
, nameChoice
);
1688 if(!enumNames(uCharNames
, start
, (UChar32
)algRange
->start
, fn
, context
, nameChoice
)) {
1691 start
=(UChar32
)algRange
->start
;
1693 /* enumerate the character names in the current algorithmic range */
1694 /* here: algRange->start<=start<limit */
1695 if((uint32_t)start
<=algRange
->end
) {
1696 if((uint32_t)limit
<=(algRange
->end
+1)) {
1697 enumAlgNames(algRange
, start
, limit
, fn
, context
, nameChoice
);
1700 if(!enumAlgNames(algRange
, start
, (UChar32
)algRange
->end
+1, fn
, context
, nameChoice
)) {
1703 start
=(UChar32
)algRange
->end
+1;
1705 /* continue to the next algorithmic range (here: start<limit) */
1706 algRange
=(AlgorithmicRange
*)((uint8_t *)algRange
+algRange
->size
);
1709 /* enumerate the character names after the last algorithmic range */
1710 enumNames(uCharNames
, start
, limit
, fn
, context
, nameChoice
);
1713 U_CAPI
int32_t U_EXPORT2
1714 uprv_getMaxCharNameLength() {
1715 UErrorCode errorCode
=U_ZERO_ERROR
;
1716 if(calcNameSetsLengths(&errorCode
)) {
1717 return gMaxNameLength
;
1724 * Converts the char set cset into a Unicode set uset.
1725 * @param cset Set of 256 bit flags corresponding to a set of chars.
1726 * @param uset USet to receive characters. Existing contents are deleted.
1729 charSetToUSet(uint32_t cset
[8], const USetAdder
*sa
) {
1734 UErrorCode errorCode
;
1736 errorCode
=U_ZERO_ERROR
;
1738 if(!calcNameSetsLengths(&errorCode
)) {
1742 /* build a char string with all chars that are used in character names */
1744 for(i
=0; i
<256; ++i
) {
1745 if(SET_CONTAINS(cset
, i
)) {
1746 cs
[length
++]=(char)i
;
1750 /* convert the char string to a UChar string */
1751 u_charsToUChars(cs
, us
, length
);
1753 /* add each UChar to the USet */
1754 for(i
=0; i
<length
; ++i
) {
1755 if(us
[i
]!=0 || cs
[i
]==0) { /* non-invariant chars become (UChar)0 */
1756 sa
->add(sa
->set
, us
[i
]);
1762 * Fills set with characters that are used in Unicode character names.
1763 * @param set USet to receive characters.
1765 U_CAPI
void U_EXPORT2
1766 uprv_getCharNameCharacters(const USetAdder
*sa
) {
1767 charSetToUSet(gNameSet
, sa
);
1770 /* data swapping ------------------------------------------------------------ */
1773 * The token table contains non-negative entries for token bytes,
1774 * and -1 for bytes that represent themselves in the data file's charset.
1775 * -2 entries are used for lead bytes.
1777 * Direct bytes (-1 entries) must be translated from the input charset family
1778 * to the output charset family.
1779 * makeTokenMap() writes a permutation mapping for this.
1780 * Use it once for single-/lead-byte tokens and once more for all trail byte
1781 * tokens. (';' is an unused trail byte marked with -1.)
1784 makeTokenMap(const UDataSwapper
*ds
,
1785 int16_t tokens
[], uint16_t tokenCount
,
1787 UErrorCode
*pErrorCode
) {
1788 UBool usedOutChar
[256];
1792 if(U_FAILURE(*pErrorCode
)) {
1796 if(ds
->inCharset
==ds
->outCharset
) {
1797 /* Same charset family: identity permutation */
1798 for(i
=0; i
<256; ++i
) {
1802 uprv_memset(map
, 0, 256);
1803 uprv_memset(usedOutChar
, 0, 256);
1805 if(tokenCount
>256) {
1809 /* set the direct bytes (byte 0 always maps to itself) */
1810 for(i
=1; i
<tokenCount
; ++i
) {
1812 /* convert the direct byte character */
1814 ds
->swapInvChars(ds
, &c1
, 1, &c2
, pErrorCode
);
1815 if(U_FAILURE(*pErrorCode
)) {
1816 udata_printError(ds
, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n",
1821 /* enter the converted character into the map and mark it used */
1823 usedOutChar
[c2
]=TRUE
;
1827 /* set the mappings for the rest of the permutation */
1828 for(i
=j
=1; i
<tokenCount
; ++i
) {
1829 /* set mappings that were not set for direct bytes */
1831 /* set an output byte value that was not used as an output byte above */
1832 while(usedOutChar
[j
]) {
1835 map
[i
]=(uint8_t)j
++;
1840 * leave mappings at tokenCount and above unset if tokenCount<256
1841 * because they won't be used
1846 U_CAPI
int32_t U_EXPORT2
1847 uchar_swapNames(const UDataSwapper
*ds
,
1848 const void *inData
, int32_t length
, void *outData
,
1849 UErrorCode
*pErrorCode
) {
1850 const UDataInfo
*pInfo
;
1853 const uint8_t *inBytes
;
1856 uint32_t tokenStringOffset
, groupsOffset
, groupStringOffset
, algNamesOffset
,
1857 offset
, i
, count
, stringsCount
;
1859 const AlgorithmicRange
*inRange
;
1860 AlgorithmicRange
*outRange
;
1862 /* udata_swapDataHeader checks the arguments */
1863 headerSize
=udata_swapDataHeader(ds
, inData
, length
, outData
, pErrorCode
);
1864 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
1868 /* check data format and format version */
1869 pInfo
=(const UDataInfo
*)((const char *)inData
+4);
1871 pInfo
->dataFormat
[0]==0x75 && /* dataFormat="unam" */
1872 pInfo
->dataFormat
[1]==0x6e &&
1873 pInfo
->dataFormat
[2]==0x61 &&
1874 pInfo
->dataFormat
[3]==0x6d &&
1875 pInfo
->formatVersion
[0]==1
1877 udata_printError(ds
, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n",
1878 pInfo
->dataFormat
[0], pInfo
->dataFormat
[1],
1879 pInfo
->dataFormat
[2], pInfo
->dataFormat
[3],
1880 pInfo
->formatVersion
[0]);
1881 *pErrorCode
=U_UNSUPPORTED_ERROR
;
1885 inBytes
=(const uint8_t *)inData
+headerSize
;
1886 outBytes
=(uint8_t *)outData
+headerSize
;
1888 algNamesOffset
=ds
->readUInt32(((const uint32_t *)inBytes
)[3]);
1892 (uint32_t)length
<(algNamesOffset
=ds
->readUInt32(((const uint32_t *)inBytes
)[3]))
1894 udata_printError(ds
, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n",
1896 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
1902 /* preflighting: iterate through algorithmic ranges */
1903 offset
=algNamesOffset
;
1904 count
=ds
->readUInt32(*((const uint32_t *)(inBytes
+offset
)));
1907 for(i
=0; i
<count
; ++i
) {
1908 inRange
=(const AlgorithmicRange
*)(inBytes
+offset
);
1909 offset
+=ds
->readUInt16(inRange
->size
);
1916 int16_t tokens
[512];
1917 uint16_t tokenCount
;
1919 uint8_t map
[256], trailMap
[256];
1921 /* copy the data for inaccessible bytes */
1922 if(inBytes
!=outBytes
) {
1923 uprv_memcpy(outBytes
, inBytes
, length
);
1926 /* the initial 4 offsets first */
1927 tokenStringOffset
=ds
->readUInt32(((const uint32_t *)inBytes
)[0]);
1928 groupsOffset
=ds
->readUInt32(((const uint32_t *)inBytes
)[1]);
1929 groupStringOffset
=ds
->readUInt32(((const uint32_t *)inBytes
)[2]);
1930 ds
->swapArray32(ds
, inBytes
, 16, outBytes
, pErrorCode
);
1933 * now the tokens table
1934 * it needs to be permutated along with the compressed name strings
1936 p
=(const uint16_t *)(inBytes
+16);
1937 q
=(uint16_t *)(outBytes
+16);
1939 /* read and swap the tokenCount */
1940 tokenCount
=ds
->readUInt16(*p
);
1941 ds
->swapArray16(ds
, p
, 2, q
, pErrorCode
);
1945 /* read the first 512 tokens and make the token maps */
1946 if(tokenCount
<=512) {
1951 for(i
=0; i
<count
; ++i
) {
1952 tokens
[i
]=udata_readInt16(ds
, p
[i
]);
1955 tokens
[i
]=0; /* fill the rest of the tokens array if tokenCount<512 */
1957 makeTokenMap(ds
, tokens
, tokenCount
, map
, pErrorCode
);
1958 makeTokenMap(ds
, tokens
+256, (uint16_t)(tokenCount
>256 ? tokenCount
-256 : 0), trailMap
, pErrorCode
);
1959 if(U_FAILURE(*pErrorCode
)) {
1964 * swap and permutate the tokens
1965 * go through a temporary array to support in-place swapping
1967 temp
=(uint16_t *)uprv_malloc(tokenCount
*2);
1969 udata_printError(ds
, "out of memory swapping %u unames.icu tokens\n",
1971 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
1975 /* swap and permutate single-/lead-byte tokens */
1976 for(i
=0; i
<tokenCount
&& i
<256; ++i
) {
1977 ds
->swapArray16(ds
, p
+i
, 2, temp
+map
[i
], pErrorCode
);
1980 /* swap and permutate trail-byte tokens */
1981 for(; i
<tokenCount
; ++i
) {
1982 ds
->swapArray16(ds
, p
+i
, 2, temp
+(i
&0xffffff00)+trailMap
[i
&0xff], pErrorCode
);
1985 /* copy the result into the output and free the temporary array */
1986 uprv_memcpy(q
, temp
, tokenCount
*2);
1990 * swap the token strings but not a possible padding byte after
1991 * the terminating NUL of the last string
1993 udata_swapInvStringBlock(ds
, inBytes
+tokenStringOffset
, (int32_t)(groupsOffset
-tokenStringOffset
),
1994 outBytes
+tokenStringOffset
, pErrorCode
);
1995 if(U_FAILURE(*pErrorCode
)) {
1996 udata_printError(ds
, "uchar_swapNames(token strings) failed\n");
2000 /* swap the group table */
2001 count
=ds
->readUInt16(*((const uint16_t *)(inBytes
+groupsOffset
)));
2002 ds
->swapArray16(ds
, inBytes
+groupsOffset
, (int32_t)((1+count
*3)*2),
2003 outBytes
+groupsOffset
, pErrorCode
);
2006 * swap the group strings
2007 * swap the string bytes but not the nibble-encoded string lengths
2009 if(ds
->inCharset
!=ds
->outCharset
) {
2010 uint16_t offsets
[LINES_PER_GROUP
+1], lengths
[LINES_PER_GROUP
+1];
2012 const uint8_t *inStrings
, *nextInStrings
;
2013 uint8_t *outStrings
;
2017 inStrings
=inBytes
+groupStringOffset
;
2018 outStrings
=outBytes
+groupStringOffset
;
2020 stringsCount
=algNamesOffset
-groupStringOffset
;
2022 /* iterate through string groups until only a few padding bytes are left */
2023 while(stringsCount
>32) {
2024 nextInStrings
=expandGroupLengths(inStrings
, offsets
, lengths
);
2026 /* move past the length bytes */
2027 stringsCount
-=(uint32_t)(nextInStrings
-inStrings
);
2028 outStrings
+=nextInStrings
-inStrings
;
2029 inStrings
=nextInStrings
;
2031 count
=offsets
[31]+lengths
[31]; /* total number of string bytes in this group */
2032 stringsCount
-=count
;
2034 /* swap the string bytes using map[] and trailMap[] */
2037 *outStrings
++=map
[c
];
2041 /* token lead byte: swap the trail byte, too */
2042 *outStrings
++=trailMap
[*inStrings
++];
2049 /* swap the algorithmic ranges */
2050 offset
=algNamesOffset
;
2051 count
=ds
->readUInt32(*((const uint32_t *)(inBytes
+offset
)));
2052 ds
->swapArray32(ds
, inBytes
+offset
, 4, outBytes
+offset
, pErrorCode
);
2055 for(i
=0; i
<count
; ++i
) {
2056 if(offset
>(uint32_t)length
) {
2057 udata_printError(ds
, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n",
2059 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
2063 inRange
=(const AlgorithmicRange
*)(inBytes
+offset
);
2064 outRange
=(AlgorithmicRange
*)(outBytes
+offset
);
2065 offset
+=ds
->readUInt16(inRange
->size
);
2067 ds
->swapArray32(ds
, inRange
, 8, outRange
, pErrorCode
);
2068 ds
->swapArray16(ds
, &inRange
->size
, 2, &outRange
->size
, pErrorCode
);
2069 switch(inRange
->type
) {
2071 /* swap prefix string */
2072 ds
->swapInvChars(ds
, inRange
+1, (int32_t)uprv_strlen((const char *)(inRange
+1)),
2073 outRange
+1, pErrorCode
);
2074 if(U_FAILURE(*pErrorCode
)) {
2075 udata_printError(ds
, "uchar_swapNames(prefix string of algorithmic range %u) failed\n",
2082 /* swap factors and the prefix and factor strings */
2083 uint32_t factorsCount
;
2085 factorsCount
=inRange
->variant
;
2086 p
=(const uint16_t *)(inRange
+1);
2087 q
=(uint16_t *)(outRange
+1);
2088 ds
->swapArray16(ds
, p
, (int32_t)(factorsCount
*2), q
, pErrorCode
);
2090 /* swap the strings, up to the last terminating NUL */
2093 stringsCount
=(uint32_t)((inBytes
+offset
)-(const uint8_t *)p
);
2094 while(stringsCount
>0 && ((const uint8_t *)p
)[stringsCount
-1]!=0) {
2097 ds
->swapInvChars(ds
, p
, (int32_t)stringsCount
, q
, pErrorCode
);
2101 udata_printError(ds
, "uchar_swapNames(): unknown type %u of algorithmic range %u\n",
2103 *pErrorCode
=U_UNSUPPORTED_ERROR
;
2109 return headerSize
+(int32_t)offset
;
/*
 * Hey, Emacs, please set the following:
 *
 * Local Variables:
 * indent-tabs-mode: nil
 * End:
 *
 */