2 ******************************************************************************
4 * Copyright (C) 1999-2004, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 1999oct04
14 * created by: Markus W. Scherer
17 #include "unicode/utypes.h"
18 #include "unicode/putil.h"
19 #include "unicode/uchar.h"
20 #include "unicode/udata.h"
29 /* prototypes ------------------------------------------------------------- */
31 #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
33 static const char DATA_NAME
[] = "unames";
34 static const char DATA_TYPE
[] = "icu";
37 #define LINES_PER_GROUP (1UL<<GROUP_SHIFT)
38 #define GROUP_MASK (LINES_PER_GROUP-1)
42 offsetHigh
, offsetLow
; /* avoid padding */
47 uint8_t type
, variant
;
52 uint32_t tokenStringOffset
, groupsOffset
, groupStringOffset
, algNamesOffset
;
56 const char *otherName
;
60 #define DO_FIND_NAME NULL
62 static UDataMemory
*uCharNamesData
=NULL
;
63 static UCharNames
*uCharNames
=NULL
;
64 static UErrorCode gLoadErrorCode
=U_ZERO_ERROR
;
67 * Maximum length of character names (regular & 1.0).
68 * Maximum length of ISO comments.
70 static int32_t gMaxNameLength
=0, gMaxISOCommentLength
=0;
73 * Set of chars used in character names (regular & 1.0).
74 * Set of chars used in ISO comments.
75 * Chars are platform-dependent (can be EBCDIC).
77 static uint32_t gNameSet
[8]={ 0 }, gISOCommentSet
[8]={ 0 };
/*
 * Extended general-category values used when building extended character
 * names. They are numbered directly after U_CHAR_CATEGORY_COUNT: one value
 * for noncharacter code points, one for lead surrogates and one for trail
 * surrogates.
 *
 * Every expansion is fully parenthesized so that each macro behaves like a
 * single value when it appears inside a larger expression (multiplication,
 * casts, comparisons, ...).
 */
#define U_NONCHARACTER_CODE_POINT (U_CHAR_CATEGORY_COUNT)
#define U_LEAD_SURROGATE (U_CHAR_CATEGORY_COUNT + 1)
#define U_TRAIL_SURROGATE (U_CHAR_CATEGORY_COUNT + 2)

/* Total number of category values, including the three extended ones above. */
#define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3)
85 static const char * const
86 charCatNames
[U_CHAR_EXTENDED_CATEGORY_COUNT
];
88 /* implementation ----------------------------------------------------------- */
90 static UBool U_CALLCONV
unames_cleanup(void)
93 udata_close(uCharNamesData
);
94 uCharNamesData
= NULL
;
103 static UBool U_CALLCONV
104 isAcceptable(void *context
,
105 const char *type
, const char *name
,
106 const UDataInfo
*pInfo
) {
109 pInfo
->isBigEndian
==U_IS_BIG_ENDIAN
&&
110 pInfo
->charsetFamily
==U_CHARSET_FAMILY
&&
111 pInfo
->dataFormat
[0]==0x75 && /* dataFormat="unam" */
112 pInfo
->dataFormat
[1]==0x6e &&
113 pInfo
->dataFormat
[2]==0x61 &&
114 pInfo
->dataFormat
[3]==0x6d &&
115 pInfo
->formatVersion
[0]==1);
119 isDataLoaded(UErrorCode
*pErrorCode
) {
120 /* load UCharNames from file if necessary */
123 /* do this because double-checked locking is broken */
125 isCached
=uCharNames
!=NULL
;
132 /* check error code from previous attempt */
133 if(U_FAILURE(gLoadErrorCode
)) {
134 *pErrorCode
=gLoadErrorCode
;
138 /* open the data outside the mutex block */
139 data
=udata_openChoice(NULL
, DATA_TYPE
, DATA_NAME
, isAcceptable
, NULL
, pErrorCode
);
140 if(U_FAILURE(*pErrorCode
)) {
141 gLoadErrorCode
=*pErrorCode
;
145 names
=(UCharNames
*)udata_getMemory(data
);
147 /* in the mutex block, set the data for this process */
150 if(uCharNames
==NULL
) {
155 ucln_common_registerCleanup(UCLN_COMMON_UNAMES
, unames_cleanup
);
160 /* if a different thread set it first, then close the extra data */
162 udata_close(data
); /* NULL if it was set correctly */
168 #define WRITE_CHAR(buffer, bufferLength, bufferPos, c) { \
169 if((bufferLength)>0) { \
176 #define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT
179 * Important: expandName() and compareName() are almost the same -
180 * apply fixes to both.
182 * UnicodeData.txt uses ';' as a field separator, so no
183 * field can contain ';' as part of its contents.
184 * In unames.dat, it is marked as token[';']==-1 only if the
185 * semicolon is used in the data file - which is iff we
186 * have Unicode 1.0 names or ISO comments.
187 * So, it will be token[';']==-1 if we store U1.0 names/ISO comments
188 * although we know that it will never be part of a name.
191 expandName(UCharNames
*names
,
192 const uint8_t *name
, uint16_t nameLength
, UCharNameChoice nameChoice
,
193 char *buffer
, uint16_t bufferLength
) {
194 uint16_t *tokens
=(uint16_t *)names
+8;
195 uint16_t token
, tokenCount
=*tokens
++, bufferPos
=0;
196 uint8_t *tokenStrings
=(uint8_t *)names
+names
->tokenStringOffset
;
199 if(nameChoice
==U_UNICODE_10_CHAR_NAME
|| nameChoice
==U_ISO_COMMENT
) {
201 * skip the modern name if it is not requested _and_
202 * if the semicolon byte value is a character, not a token number
204 if((uint8_t)';'>=tokenCount
|| tokens
[(uint8_t)';']==(uint16_t)(-1)) {
205 while(nameLength
>0) {
211 if(nameChoice
==U_ISO_COMMENT
) {
212 /* skip the Unicode 1.0 name as well to get the ISO comment */
213 while(nameLength
>0) {
222 * the semicolon byte value is a token number, therefore
223 * only modern names are stored in unames.dat and there is no
224 * such requested Unicode 1.0 name here
230 /* write each letter directly, and write a token word per token */
231 while(nameLength
>0) {
237 /* implicit letter */
238 WRITE_CHAR(buffer
, bufferLength
, bufferPos
, c
);
245 if(token
==(uint16_t)(-2)) {
246 /* this is a lead byte for a double-byte token */
247 token
=tokens
[c
<<8|*name
++];
250 if(token
==(uint16_t)(-1)) {
252 /* explicit letter */
253 WRITE_CHAR(buffer
, bufferLength
, bufferPos
, c
);
255 /* stop, but skip the semicolon if we are seeking
256 extended names and there was no 2.0 name but there
258 if(!bufferPos
&& nameChoice
== U_EXTENDED_CHAR_NAME
) {
259 if ((uint8_t)';'>=tokenCount
|| tokens
[(uint8_t)';']==(uint16_t)(-1)) {
267 /* write token word */
268 uint8_t *tokenString
=tokenStrings
+token
;
269 while((c
=*tokenString
++)!=0) {
270 WRITE_CHAR(buffer
, bufferLength
, bufferPos
, c
);
285 * compareName() is almost the same as expandName() except that it compares
286 * the currently expanded name to an input name.
287 * It returns the match/no match result as soon as possible.
290 compareName(UCharNames
*names
,
291 const uint8_t *name
, uint16_t nameLength
, UCharNameChoice nameChoice
,
292 const char *otherName
) {
293 uint16_t *tokens
=(uint16_t *)names
+8;
294 uint16_t token
, tokenCount
=*tokens
++;
295 uint8_t *tokenStrings
=(uint8_t *)names
+names
->tokenStringOffset
;
297 const char *origOtherName
= otherName
;
299 if(nameChoice
==U_UNICODE_10_CHAR_NAME
) {
301 * skip the modern name if it is not requested _and_
302 * if the semicolon byte value is a character, not a token number
304 if((uint8_t)';'>=tokenCount
|| tokens
[(uint8_t)';']==(uint16_t)(-1)) {
305 while(nameLength
>0) {
313 * the semicolon byte value is a token number, therefore
314 * only modern names are stored in unames.dat and there is no
315 * such requested Unicode 1.0 name here
321 /* compare each letter directly, and compare a token word per token */
322 while(nameLength
>0) {
328 /* implicit letter */
329 if((char)c
!=*otherName
++) {
338 if(token
==(uint16_t)(-2)) {
339 /* this is a lead byte for a double-byte token */
340 token
=tokens
[c
<<8|*name
++];
343 if(token
==(uint16_t)(-1)) {
345 /* explicit letter */
346 if((char)c
!=*otherName
++) {
350 /* stop, but skip the semicolon if we are seeking
351 extended names and there was no 2.0 name but there
353 if(otherName
== origOtherName
&& nameChoice
== U_EXTENDED_CHAR_NAME
) {
354 if ((uint8_t)';'>=tokenCount
|| tokens
[(uint8_t)';']==(uint16_t)(-1)) {
362 /* write token word */
363 uint8_t *tokenString
=tokenStrings
+token
;
364 while((c
=*tokenString
++)!=0) {
365 if((char)c
!=*otherName
++) {
373 /* complete match? */
374 return (UBool
)(*otherName
==0);
377 static const char * const charCatNames
[U_CHAR_EXTENDED_CATEGORY_COUNT
] = {
386 "combining spacing mark",
387 "decimal digit number",
392 "paragraph separator",
400 "connector punctuation",
406 "initial punctuation",
413 static uint8_t getCharCat(UChar32 cp
) {
416 if (UTF_IS_UNICODE_NONCHAR(cp
)) {
417 return U_NONCHARACTER_CODE_POINT
;
420 if ((cat
= u_charType(cp
)) == U_SURROGATE
) {
421 cat
= UTF_IS_LEAD(cp
) ? U_LEAD_SURROGATE
: U_TRAIL_SURROGATE
;
427 static const char *getCharCatName(UChar32 cp
) {
428 uint8_t cat
= getCharCat(cp
);
430 /* Return unknown if the table of names above is not up to
433 if (cat
>= LENGTHOF(charCatNames
)) {
436 return charCatNames
[cat
];
440 static uint16_t getExtName(uint32_t code
, char *buffer
, uint16_t bufferLength
) {
441 const char *catname
= getCharCatName(code
);
447 WRITE_CHAR(buffer
, bufferLength
, length
, '<');
448 while (catname
[length
- 1]) {
449 WRITE_CHAR(buffer
, bufferLength
, length
, catname
[length
- 1]);
451 WRITE_CHAR(buffer
, bufferLength
, length
, '-');
452 for (cp
= code
, ndigits
= 0; cp
; ++ndigits
, cp
>>= 4)
456 for (cp
= code
, i
= ndigits
; (cp
|| i
> 0) && bufferLength
; cp
>>= 4, bufferLength
--) {
457 uint8_t v
= (uint8_t)(cp
& 0xf);
458 buffer
[--i
] = (v
< 10 ? '0' + v
: 'A' + v
- 10);
462 WRITE_CHAR(buffer
, bufferLength
, length
, '>');
468 * getGroup() does a binary search for the group that contains the
469 * Unicode code point "code".
470 * The return value is always a valid Group* that may contain "code"
471 * or else is the highest group before "code".
472 * If the lowest group is after "code", then that one is returned.
475 getGroup(UCharNames
*names
, uint32_t code
) {
476 uint16_t groupMSB
=(uint16_t)(code
>>GROUP_SHIFT
),
478 limit
=*(uint16_t *)((char *)names
+names
->groupsOffset
),
480 Group
*groups
=(Group
*)((char *)names
+names
->groupsOffset
+2);
482 /* binary search for the group of names that contains the one for code */
483 while(start
<limit
-1) {
484 number
=(uint16_t)((start
+limit
)/2);
485 if(groupMSB
<groups
[number
].groupMSB
) {
492 /* return this regardless of whether it is an exact match */
497 * expandGroupLengths() reads a block of compressed lengths of 32 strings and
498 * expands them into offsets and lengths for each string.
499 * Lengths are stored with a variable-width encoding in consecutive nibbles:
500 * If a nibble<0xc, then it is the length itself (0=empty string).
501 * If a nibble>=0xc, then it forms a length value with the following nibble.
502 * Calculation see below.
503 * The offsets and lengths arrays must be at least 33 (one more) long because
504 * there is no check here at the end if the last nibble is still used.
506 static const uint8_t *
507 expandGroupLengths(const uint8_t *s
,
508 uint16_t offsets
[LINES_PER_GROUP
+1], uint16_t lengths
[LINES_PER_GROUP
+1]) {
509 /* read the lengths of the 32 strings in this group and get each string's offset */
510 uint16_t i
=0, offset
=0, length
=0;
513 /* all 32 lengths must be read to get the offset of the first group string */
514 while(i
<LINES_PER_GROUP
) {
517 /* read even nibble - MSBs of lengthByte */
519 /* double-nibble length spread across two bytes */
520 length
=(uint16_t)(((length
&0x3)<<4|lengthByte
>>4)+12);
522 } else if((lengthByte
/* &0xf0 */)>=0xc0) {
523 /* double-nibble length spread across this one byte */
524 length
=(uint16_t)((lengthByte
&0x3f)+12);
526 /* single-nibble length in MSBs */
527 length
=(uint16_t)(lengthByte
>>4);
537 /* read odd nibble - LSBs of lengthByte */
538 if((lengthByte
&0xf0)==0) {
539 /* this nibble was not consumed for a double-nibble length above */
542 /* single-nibble length in LSBs */
550 length
=0; /* prevent double-nibble detection in the next iteration */
554 /* now, s is at the first group string */
559 expandGroupName(UCharNames
*names
, Group
*group
,
560 uint16_t lineNumber
, UCharNameChoice nameChoice
,
561 char *buffer
, uint16_t bufferLength
) {
562 uint16_t offsets
[LINES_PER_GROUP
+2], lengths
[LINES_PER_GROUP
+2];
563 const uint8_t *s
=(uint8_t *)names
+names
->groupStringOffset
+
564 (group
->offsetHigh
<<16|group
->offsetLow
);
565 s
=expandGroupLengths(s
, offsets
, lengths
);
566 return expandName(names
, s
+offsets
[lineNumber
], lengths
[lineNumber
], nameChoice
,
567 buffer
, bufferLength
);
571 getName(UCharNames
*names
, uint32_t code
, UCharNameChoice nameChoice
,
572 char *buffer
, uint16_t bufferLength
) {
573 Group
*group
=getGroup(names
, code
);
574 if((uint16_t)(code
>>GROUP_SHIFT
)==group
->groupMSB
) {
575 return expandGroupName(names
, group
, (uint16_t)(code
&GROUP_MASK
), nameChoice
,
576 buffer
, bufferLength
);
578 /* group not found */
588 * enumGroupNames() enumerates all the names in a group of 32 names
589 * and either calls the enumerator function or finds a given input name.
592 enumGroupNames(UCharNames
*names
, Group
*group
,
593 UChar32 start
, UChar32 end
,
594 UEnumCharNamesFn
*fn
, void *context
,
595 UCharNameChoice nameChoice
) {
596 uint16_t offsets
[LINES_PER_GROUP
+2], lengths
[LINES_PER_GROUP
+2];
597 const uint8_t *s
=(uint8_t *)names
+names
->groupStringOffset
+
598 (group
->offsetHigh
<<16|group
->offsetLow
);
600 s
=expandGroupLengths(s
, offsets
, lengths
);
601 if(fn
!=DO_FIND_NAME
) {
606 length
=expandName(names
, s
+offsets
[start
&GROUP_MASK
], lengths
[start
&GROUP_MASK
], nameChoice
, buffer
, sizeof(buffer
));
607 if (!length
&& nameChoice
== U_EXTENDED_CHAR_NAME
) {
608 buffer
[length
= getExtName(start
, buffer
, sizeof(buffer
))] = 0;
610 /* here, we assume that the buffer is large enough */
612 if(!fn(context
, start
, nameChoice
, buffer
, length
)) {
619 const char *otherName
=((FindName
*)context
)->otherName
;
621 if(compareName(names
, s
+offsets
[start
&GROUP_MASK
], lengths
[start
&GROUP_MASK
], nameChoice
, otherName
)) {
622 ((FindName
*)context
)->code
=start
;
632 * enumExtNames() enumerates extended names.
633 * It only needs to do it if it is called with a real function and not
634 * with the dummy DO_FIND_NAME, because u_charFromName() does a check
635 * for extended names by itself.
638 enumExtNames(UChar32 start
, UChar32 end
,
639 UEnumCharNamesFn
*fn
, void *context
)
641 if(fn
!=DO_FIND_NAME
) {
646 buffer
[length
= getExtName(start
, buffer
, sizeof(buffer
))] = 0;
647 /* here, we assume that the buffer is large enough */
649 if(!fn(context
, start
, U_EXTENDED_CHAR_NAME
, buffer
, length
)) {
661 enumNames(UCharNames
*names
,
662 UChar32 start
, UChar32 limit
,
663 UEnumCharNamesFn
*fn
, void *context
,
664 UCharNameChoice nameChoice
) {
665 uint16_t startGroupMSB
, endGroupMSB
, groupCount
;
666 Group
*group
, *groupLimit
;
668 startGroupMSB
=(uint16_t)(start
>>GROUP_SHIFT
);
669 endGroupMSB
=(uint16_t)((limit
-1)>>GROUP_SHIFT
);
671 /* find the group that contains start, or the highest before it */
672 group
=getGroup(names
, start
);
674 if(startGroupMSB
==endGroupMSB
) {
675 if(startGroupMSB
==group
->groupMSB
) {
676 /* if start and limit-1 are in the same group, then enumerate only in that one */
677 return enumGroupNames(names
, group
, start
, limit
-1, fn
, context
, nameChoice
);
680 groupCount
=*(uint16_t *)((char *)names
+names
->groupsOffset
);
681 groupLimit
=(Group
*)((char *)names
+names
->groupsOffset
+2)+groupCount
;
683 if(startGroupMSB
==group
->groupMSB
) {
684 /* enumerate characters in the partial start group */
685 if((start
&GROUP_MASK
)!=0) {
686 if(!enumGroupNames(names
, group
,
687 start
, ((UChar32
)startGroupMSB
<<GROUP_SHIFT
)+LINES_PER_GROUP
-1,
688 fn
, context
, nameChoice
)) {
691 ++group
; /* continue with the next group */
693 } else if(startGroupMSB
>group
->groupMSB
) {
694 /* make sure that we start enumerating with the first group after start */
695 if (group
+ 1 < groupLimit
&& (group
+ 1)->groupMSB
> startGroupMSB
&& nameChoice
== U_EXTENDED_CHAR_NAME
) {
696 UChar32 end
= (group
+ 1)->groupMSB
<< GROUP_SHIFT
;
700 if (!enumExtNames(start
, end
- 1, fn
, context
)) {
707 /* enumerate entire groups between the start- and end-groups */
708 while(group
<groupLimit
&& group
->groupMSB
<endGroupMSB
) {
709 start
=(UChar32
)group
->groupMSB
<<GROUP_SHIFT
;
710 if(!enumGroupNames(names
, group
, start
, start
+LINES_PER_GROUP
-1, fn
, context
, nameChoice
)) {
713 if (group
+ 1 < groupLimit
&& (group
+ 1)->groupMSB
> group
->groupMSB
+ 1 && nameChoice
== U_EXTENDED_CHAR_NAME
) {
714 UChar32 end
= (group
+ 1)->groupMSB
<< GROUP_SHIFT
;
718 if (!enumExtNames((group
->groupMSB
+ 1) << GROUP_SHIFT
, end
- 1, fn
, context
)) {
725 /* enumerate within the end group (group->groupMSB==endGroupMSB) */
726 if(group
<groupLimit
&& group
->groupMSB
==endGroupMSB
) {
727 return enumGroupNames(names
, group
, (limit
-1)&~GROUP_MASK
, limit
-1, fn
, context
, nameChoice
);
728 } else if (nameChoice
== U_EXTENDED_CHAR_NAME
&& group
== groupLimit
) {
729 UChar32 next
= ((group
- 1)->groupMSB
+ 1) << GROUP_SHIFT
;
738 /* we have not found a group, which means everything is made of
740 if (nameChoice
== U_EXTENDED_CHAR_NAME
) {
741 if (limit
> UCHAR_MAX_VALUE
+ 1) {
742 limit
= UCHAR_MAX_VALUE
+ 1;
744 return enumExtNames(start
, limit
- 1, fn
, context
);
751 writeFactorSuffix(const uint16_t *factors
, uint16_t count
,
752 const char *s
, /* suffix elements */
754 uint16_t indexes
[8], /* output fields from here */
755 const char *elementBases
[8], const char *elements
[8],
756 char *buffer
, uint16_t bufferLength
) {
757 uint16_t i
, factor
, bufferPos
=0;
760 /* write elements according to the factors */
763 * the factorized elements are determined by modulo arithmetic
764 * with the factors of this algorithm
766 * note that for fewer operations, count is decremented here
769 for(i
=count
; i
>0; --i
) {
771 indexes
[i
]=(uint16_t)(code%factor
);
775 * we don't need to calculate the last modulus because start<=code<=end
776 * guarantees here that code<=factors[0]
778 indexes
[0]=(uint16_t)code
;
780 /* write each element */
782 if(elementBases
!=NULL
) {
786 /* skip indexes[i] strings */
798 WRITE_CHAR(buffer
, bufferLength
, bufferPos
, c
);
801 /* we do not need to perform the rest of this loop for i==count - break here */
806 /* skip the rest of the strings for this factors[i] */
807 factor
=(uint16_t)(factors
[i
]-indexes
[i
]-1);
826 * Parts of findAlgName() are almost the same as some of getAlgName().
827 * Fixes must be applied to both.
830 getAlgName(AlgorithmicRange
*range
, uint32_t code
, UCharNameChoice nameChoice
,
831 char *buffer
, uint16_t bufferLength
) {
832 uint16_t bufferPos
=0;
835 * Do not write algorithmic Unicode 1.0 names because
836 * Unihan names are the same as the modern ones,
837 * extension A was only introduced with Unicode 3.0, and
838 * the Hangul syllable block was moved and changed around Unicode 1.1.5.
840 if(nameChoice
==U_UNICODE_10_CHAR_NAME
) {
848 switch(range
->type
) {
850 /* name = prefix hex-digits */
851 const char *s
=(const char *)(range
+1);
858 WRITE_CHAR(buffer
, bufferLength
, bufferPos
, c
);
861 /* write hexadecimal code point value */
862 count
=range
->variant
;
865 if(count
<bufferLength
) {
870 if(--i
<bufferLength
) {
886 /* name = prefix factorized-elements */
888 const uint16_t *factors
=(const uint16_t *)(range
+1);
889 uint16_t count
=range
->variant
;
890 const char *s
=(const char *)(factors
+count
);
895 WRITE_CHAR(buffer
, bufferLength
, bufferPos
, c
);
898 bufferPos
+=writeFactorSuffix(factors
, count
,
899 s
, code
-range
->start
, indexes
, NULL
, NULL
, buffer
, bufferLength
);
915 * Important: enumAlgNames() and findAlgName() are almost the same.
916 * Any fix must be applied to both.
919 enumAlgNames(AlgorithmicRange
*range
,
920 UChar32 start
, UChar32 limit
,
921 UEnumCharNamesFn
*fn
, void *context
,
922 UCharNameChoice nameChoice
) {
926 if(nameChoice
==U_UNICODE_10_CHAR_NAME
) {
930 switch(range
->type
) {
935 /* get the full name of the start character */
936 length
=getAlgName(range
, (uint32_t)start
, nameChoice
, buffer
, sizeof(buffer
));
941 /* call the enumerator function with this first character */
942 if(!fn(context
, start
, nameChoice
, buffer
, length
)) {
946 /* go to the end of the name; all these names have the same length */
952 /* enumerate the rest of the names */
953 while(++start
<limit
) {
954 /* increment the hexadecimal number on a character-basis */
958 if(('0'<=c
&& c
<'9') || ('A'<=c
&& c
<'F')) {
969 if(!fn(context
, start
, nameChoice
, buffer
, length
)) {
977 const char *elementBases
[8], *elements
[8];
978 const uint16_t *factors
=(const uint16_t *)(range
+1);
979 uint16_t count
=range
->variant
;
980 const char *s
=(const char *)(factors
+count
);
982 uint16_t prefixLength
, i
, index
;
986 /* name = prefix factorized-elements */
996 /* append the suffix of the start character */
997 length
=(uint16_t)(prefixLength
+writeFactorSuffix(factors
, count
,
998 s
, (uint32_t)start
-range
->start
,
999 indexes
, elementBases
, elements
,
1000 suffix
, (uint16_t)(sizeof(buffer
)-prefixLength
)));
1002 /* call the enumerator function with this first character */
1003 if(!fn(context
, start
, nameChoice
, buffer
, length
)) {
1007 /* enumerate the rest of the names */
1008 while(++start
<limit
) {
1009 /* increment the indexes in lexical order bound by the factors */
1012 index
=(uint16_t)(indexes
[--i
]+1);
1013 if(index
<factors
[i
]) {
1014 /* skip one index and its element string */
1022 /* reset this index to 0 and its element string to the first one */
1024 elements
[i
]=elementBases
[i
];
1028 /* to make matters a little easier, just append all elements to the suffix */
1030 length
=prefixLength
;
1031 for(i
=0; i
<count
; ++i
) {
1033 while((c
=*s
++)!=0) {
1038 /* zero-terminate */
1041 if(!fn(context
, start
, nameChoice
, buffer
, length
)) {
1048 /* undefined type */
1056 * findAlgName() is almost the same as enumAlgNames() except that it
1057 * returns the code point for a name if it fits into the range.
1058 * It returns 0xffff otherwise.
1061 findAlgName(AlgorithmicRange
*range
, UCharNameChoice nameChoice
, const char *otherName
) {
1064 if(nameChoice
==U_UNICODE_10_CHAR_NAME
) {
1068 switch(range
->type
) {
1070 /* name = prefix hex-digits */
1071 const char *s
=(const char *)(range
+1);
1076 /* compare prefix */
1077 while((c
=*s
++)!=0) {
1078 if((char)c
!=*otherName
++) {
1083 /* read hexadecimal code point value */
1084 count
=range
->variant
;
1086 for(i
=0; i
<count
; ++i
) {
1088 if('0'<=c
&& c
<='9') {
1089 code
=(code
<<4)|(c
-'0');
1090 } else if('A'<=c
&& c
<='F') {
1091 code
=(code
<<4)|(c
-'A'+10);
1097 /* does it fit into the range? */
1098 if(*otherName
==0 && range
->start
<=(uint32_t)code
&& (uint32_t)code
<=range
->end
) {
1105 uint16_t indexes
[8];
1106 const char *elementBases
[8], *elements
[8];
1107 const uint16_t *factors
=(const uint16_t *)(range
+1);
1108 uint16_t count
=range
->variant
;
1109 const char *s
=(const char *)(factors
+count
), *t
;
1110 UChar32 start
, limit
;
1115 /* name = prefix factorized-elements */
1117 /* compare prefix */
1118 while((c
=*s
++)!=0) {
1119 if((char)c
!=*otherName
++) {
1124 start
=(UChar32
)range
->start
;
1125 limit
=(UChar32
)(range
->end
+1);
1127 /* initialize the suffix elements for enumeration; indexes should all be set to 0 */
1128 writeFactorSuffix(factors
, count
, s
, 0,
1129 indexes
, elementBases
, elements
, buffer
, sizeof(buffer
));
1131 /* compare the first suffix */
1132 if(0==uprv_strcmp(otherName
, buffer
)) {
1136 /* enumerate and compare the rest of the suffixes */
1137 while(++start
<limit
) {
1138 /* increment the indexes in lexical order bound by the factors */
1141 index
=(uint16_t)(indexes
[--i
]+1);
1142 if(index
<factors
[i
]) {
1143 /* skip one index and its element string */
1150 /* reset this index to 0 and its element string to the first one */
1152 elements
[i
]=elementBases
[i
];
1156 /* to make matters a little easier, just compare all elements of the suffix */
1158 for(i
=0; i
<count
; ++i
) {
1160 while((c
=*s
++)!=0) {
1162 s
=""; /* does not match */
1174 /* undefined type */
1181 /* sets of name characters, maximum name lengths ---------------------------- */
/*
 * Bit-set helpers for the sets of characters that occur in names.
 * "set" is an array of 8 uint32_t words, i.e. 256 bits, one bit per byte
 * value. "c" is converted to uint8_t before use so that a negative char
 * value still selects a word in 0..7 and a bit in 0..31.
 *
 * SET_ADD(set, c)      sets the bit for c.
 * SET_CONTAINS(set, c) evaluates to non-zero iff the bit for c is set.
 *
 * The argument c is parenthesized so that an expression argument
 * (e.g. a|b) is converted as a whole rather than only its first operand.
 * Note: c is still evaluated twice, so do not pass arguments with side
 * effects (such as *s++).
 */
#define SET_ADD(set, c) ((set)[(uint8_t)(c)>>5]|=((uint32_t)1<<((uint8_t)(c)&0x1f)))
#define SET_CONTAINS(set, c) (((set)[(uint8_t)(c)>>5]&((uint32_t)1<<((uint8_t)(c)&0x1f)))!=0)
1187 calcStringSetLength(uint32_t set
[8], const char *s
) {
1191 while((c
=*s
++)!=0) {
1199 calcAlgNameSetsLengths(int32_t maxNameLength
) {
1200 AlgorithmicRange
*range
;
1202 uint32_t rangeCount
;
1205 /* enumerate algorithmic ranges */
1206 p
=(uint32_t *)((uint8_t *)uCharNames
+uCharNames
->algNamesOffset
);
1208 range
=(AlgorithmicRange
*)(p
+1);
1209 while(rangeCount
>0) {
1210 switch(range
->type
) {
1212 /* name = prefix + (range->variant times) hex-digits */
1214 length
=calcStringSetLength(gNameSet
, (const char *)(range
+1))+range
->variant
;
1215 if(length
>maxNameLength
) {
1216 maxNameLength
=length
;
1220 /* name = prefix factorized-elements */
1221 const uint16_t *factors
=(const uint16_t *)(range
+1);
1223 int32_t i
, count
=range
->variant
, factor
, factorLength
, maxFactorLength
;
1226 s
=(const char *)(factors
+count
);
1227 length
=calcStringSetLength(gNameSet
, s
);
1228 s
+=length
+1; /* start of factor suffixes */
1230 /* get the set and maximum factor suffix length for each factor */
1231 for(i
=0; i
<count
; ++i
) {
1233 for(factor
=factors
[i
]; factor
>0; --factor
) {
1234 factorLength
=calcStringSetLength(gNameSet
, s
);
1236 if(factorLength
>maxFactorLength
) {
1237 maxFactorLength
=factorLength
;
1240 length
+=maxFactorLength
;
1243 if(length
>maxNameLength
) {
1244 maxNameLength
=length
;
1253 range
=(AlgorithmicRange
*)((uint8_t *)range
+range
->size
);
1256 return maxNameLength
;
1260 calcExtNameSetsLengths(int32_t maxNameLength
) {
1263 for(i
=0; i
<LENGTHOF(charCatNames
); ++i
) {
1265 * for each category, count the length of the category name
1269 * 6 for most hex digits per code point
1271 length
=9+calcStringSetLength(gNameSet
, charCatNames
[i
]);
1272 if(length
>maxNameLength
) {
1273 maxNameLength
=length
;
1276 return maxNameLength
;
1280 calcNameSetLength(const uint16_t *tokens
, uint16_t tokenCount
, const uint8_t *tokenStrings
, int8_t *tokenLengths
,
1282 const uint8_t **pLine
, const uint8_t *lineLimit
) {
1283 const uint8_t *line
=*pLine
;
1284 int32_t length
=0, tokenLength
;
1287 while(line
!=lineLimit
&& (c
=*line
++)!=(uint8_t)';') {
1289 /* implicit letter */
1294 if(token
==(uint16_t)(-2)) {
1295 /* this is a lead byte for a double-byte token */
1299 if(token
==(uint16_t)(-1)) {
1300 /* explicit letter */
1304 /* count token word */
1305 if(tokenLengths
!=NULL
) {
1306 /* use cached token length */
1307 tokenLength
=tokenLengths
[c
];
1308 if(tokenLength
==0) {
1309 tokenLength
=calcStringSetLength(set
, (const char *)tokenStrings
+token
);
1310 tokenLengths
[c
]=(int8_t)tokenLength
;
1313 tokenLength
=calcStringSetLength(set
, (const char *)tokenStrings
+token
);
1315 length
+=tokenLength
;
1325 calcGroupNameSetsLengths(int32_t maxNameLength
) {
1326 uint16_t offsets
[LINES_PER_GROUP
+2], lengths
[LINES_PER_GROUP
+2];
1328 uint16_t *tokens
=(uint16_t *)uCharNames
+8;
1329 uint16_t tokenCount
=*tokens
++;
1330 uint8_t *tokenStrings
=(uint8_t *)uCharNames
+uCharNames
->tokenStringOffset
;
1332 int8_t *tokenLengths
;
1336 const uint8_t *s
, *line
, *lineLimit
;
1338 int32_t maxISOCommentLength
=0;
1339 int32_t groupCount
, lineNumber
, length
;
1341 tokenLengths
=(int8_t *)uprv_malloc(tokenCount
);
1342 if(tokenLengths
!=NULL
) {
1343 uprv_memset(tokenLengths
, 0, tokenCount
);
1346 groups
=(uint16_t *)((char *)uCharNames
+uCharNames
->groupsOffset
);
1347 groupCount
=*groups
++;
1348 group
=(Group
*)groups
;
1350 /* enumerate all groups */
1351 while(groupCount
>0) {
1352 s
=(uint8_t *)uCharNames
+uCharNames
->groupStringOffset
+
1353 ((int32_t)group
->offsetHigh
<<16|group
->offsetLow
);
1354 s
=expandGroupLengths(s
, offsets
, lengths
);
1356 /* enumerate all lines in each group */
1357 for(lineNumber
=0; lineNumber
<LINES_PER_GROUP
; ++lineNumber
) {
1358 line
=s
+offsets
[lineNumber
];
1359 length
=lengths
[lineNumber
];
1364 lineLimit
=line
+length
;
1366 /* read regular name */
1367 length
=calcNameSetLength(tokens
, tokenCount
, tokenStrings
, tokenLengths
, gNameSet
, &line
, lineLimit
);
1368 if(length
>maxNameLength
) {
1369 maxNameLength
=length
;
1371 if(line
==lineLimit
) {
1375 /* read Unicode 1.0 name */
1376 length
=calcNameSetLength(tokens
, tokenCount
, tokenStrings
, tokenLengths
, gNameSet
, &line
, lineLimit
);
1377 if(length
>maxNameLength
) {
1378 maxNameLength
=length
;
1380 if(line
==lineLimit
) {
1384 /* read ISO comment */
1385 length
=calcNameSetLength(tokens
, tokenCount
, tokenStrings
, tokenLengths
, gISOCommentSet
, &line
, lineLimit
);
1386 if(length
>maxISOCommentLength
) {
1387 maxISOCommentLength
=length
;
1395 if(tokenLengths
!=NULL
) {
1396 uprv_free(tokenLengths
);
1399 /* set gMax... - name length last for threading */
1400 gMaxISOCommentLength
=maxISOCommentLength
;
1401 gMaxNameLength
=maxNameLength
;
1405 calcNameSetsLengths(UErrorCode
*pErrorCode
) {
1406 static const char extChars
[]="0123456789ABCDEF<>-";
1407 int32_t i
, maxNameLength
;
1409 if(gMaxNameLength
!=0) {
1413 if(!isDataLoaded(pErrorCode
)) {
1417 /* set hex digits, used in various names, and <>-, used in extended names */
1418 for(i
=0; i
<sizeof(extChars
)-1; ++i
) {
1419 SET_ADD(gNameSet
, extChars
[i
]);
1422 /* set sets and lengths from algorithmic names */
1423 maxNameLength
=calcAlgNameSetsLengths(0);
1425 /* set sets and lengths from extended names */
1426 maxNameLength
=calcExtNameSetsLengths(maxNameLength
);
1428 /* set sets and lengths from group names, set global maximum values */
1429 calcGroupNameSetsLengths(maxNameLength
);
1434 /* public API --------------------------------------------------------------- */
1436 U_CAPI
int32_t U_EXPORT2
1437 u_charName(UChar32 code
, UCharNameChoice nameChoice
,
1438 char *buffer
, int32_t bufferLength
,
1439 UErrorCode
*pErrorCode
) {
1440 AlgorithmicRange
*algRange
;
1445 /* check the argument values */
1446 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
1448 } else if(nameChoice
>=U_CHAR_NAME_CHOICE_COUNT
||
1449 bufferLength
<0 || (bufferLength
>0 && buffer
==NULL
)
1451 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1455 if((uint32_t)code
>UCHAR_MAX_VALUE
|| !isDataLoaded(pErrorCode
)) {
1456 return u_terminateChars(buffer
, bufferLength
, 0, pErrorCode
);
1461 /* try algorithmic names first */
1462 p
=(uint32_t *)((uint8_t *)uCharNames
+uCharNames
->algNamesOffset
);
1464 algRange
=(AlgorithmicRange
*)(p
+1);
1466 if(algRange
->start
<=(uint32_t)code
&& (uint32_t)code
<=algRange
->end
) {
1467 length
=getAlgName(algRange
, (uint32_t)code
, nameChoice
, buffer
, (uint16_t)bufferLength
);
1470 algRange
=(AlgorithmicRange
*)((uint8_t *)algRange
+algRange
->size
);
1475 if (nameChoice
== U_EXTENDED_CHAR_NAME
) {
1476 length
= getName(uCharNames
, (uint32_t )code
, U_EXTENDED_CHAR_NAME
, buffer
, (uint16_t) bufferLength
);
1478 /* extended character name */
1479 length
= getExtName((uint32_t) code
, buffer
, (uint16_t) bufferLength
);
1482 /* normal character name */
1483 length
=getName(uCharNames
, (uint32_t)code
, nameChoice
, buffer
, (uint16_t)bufferLength
);
1487 return u_terminateChars(buffer
, bufferLength
, length
, pErrorCode
);
1490 U_CAPI
int32_t U_EXPORT2
1491 u_getISOComment(UChar32 c
,
1492 char *dest
, int32_t destCapacity
,
1493 UErrorCode
*pErrorCode
) {
1496 /* check the argument values */
1497 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
1499 } else if(destCapacity
<0 || (destCapacity
>0 && dest
==NULL
)) {
1500 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1504 if((uint32_t)c
>UCHAR_MAX_VALUE
|| !isDataLoaded(pErrorCode
)) {
1505 return u_terminateChars(dest
, destCapacity
, 0, pErrorCode
);
1508 /* the ISO comment is stored like a normal character name */
1509 length
=getName(uCharNames
, (uint32_t)c
, U_ISO_COMMENT
, dest
, (uint16_t)destCapacity
);
1510 return u_terminateChars(dest
, destCapacity
, length
, pErrorCode
);
1513 U_CAPI UChar32 U_EXPORT2
1514 u_charFromName(UCharNameChoice nameChoice
,
1516 UErrorCode
*pErrorCode
) {
1517 char upper
[120], lower
[120];
1519 AlgorithmicRange
*algRange
;
1524 UChar32 error
= 0xffff; /* Undefined, but use this for backwards compatibility. */
1526 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
1530 if(nameChoice
>=U_CHAR_NAME_CHOICE_COUNT
|| name
==NULL
|| *name
==0) {
1531 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1535 if(!isDataLoaded(pErrorCode
)) {
1539 /* construct the uppercase and lowercase of the name first */
1540 for(i
=0; i
<sizeof(upper
); ++i
) {
1541 if((c0
=*name
++)!=0) {
1542 upper
[i
]=uprv_toupper(c0
);
1543 lower
[i
]=uprv_tolower(c0
);
1545 upper
[i
]=lower
[i
]=0;
1549 if(i
==sizeof(upper
)) {
1550 /* name too long, there is no such character */
1551 *pErrorCode
= U_ILLEGAL_CHAR_FOUND
;
1555 /* try extended names first */
1556 if (lower
[0] == '<') {
1557 if (nameChoice
== U_EXTENDED_CHAR_NAME
) {
1558 if (lower
[--i
] == '>') {
1559 for (--i
; lower
[i
] && lower
[i
] != '-'; --i
) {
1562 if (lower
[i
] == '-') { /* We've got a category. */
1567 for (++i
; lower
[i
] != '>'; ++i
) {
1568 if (lower
[i
] >= '0' && lower
[i
] <= '9') {
1569 cp
= (cp
<< 4) + lower
[i
] - '0';
1570 } else if (lower
[i
] >= 'a' && lower
[i
] <= 'f') {
1571 cp
= (cp
<< 4) + lower
[i
] - 'a' + 10;
1573 *pErrorCode
= U_ILLEGAL_CHAR_FOUND
;
1578 /* Now validate the category name.
1579 We could use a binary search, or a trie, if
1580 we really wanted to. */
1582 for (lower
[i
] = 0, cIdx
= 0; cIdx
< LENGTHOF(charCatNames
); ++cIdx
) {
1584 if (!uprv_strcmp(lower
+ 1, charCatNames
[cIdx
])) {
1585 if (getCharCat(cp
) == cIdx
) {
1595 *pErrorCode
= U_ILLEGAL_CHAR_FOUND
;
1599 /* try algorithmic names now */
1600 p
=(uint32_t *)((uint8_t *)uCharNames
+uCharNames
->algNamesOffset
);
1602 algRange
=(AlgorithmicRange
*)(p
+1);
1604 if((cp
=findAlgName(algRange
, nameChoice
, upper
))!=0xffff) {
1607 algRange
=(AlgorithmicRange
*)((uint8_t *)algRange
+algRange
->size
);
1611 /* normal character name */
1612 findName
.otherName
=upper
;
1613 findName
.code
=error
;
1614 enumNames(uCharNames
, 0, UCHAR_MAX_VALUE
+ 1, DO_FIND_NAME
, &findName
, nameChoice
);
1615 if (findName
.code
== error
) {
1616 *pErrorCode
= U_ILLEGAL_CHAR_FOUND
;
1618 return findName
.code
;
/*
 * u_enumCharNames: enumerates character names for the code points from start
 * up to (but not including) limit, handing fn and context to enumNames() for
 * data-driven names and to enumAlgNames() for algorithmic ranges.
 *
 * start, limit  code point range; limit is clamped to UCHAR_MAX_VALUE+1
 * fn            callback; must not be NULL
 * nameChoice    must be below U_CHAR_NAME_CHOICE_COUNT
 * pErrorCode    in/out error code; set to U_ILLEGAL_ARGUMENT_ERROR when
 *               nameChoice or fn is invalid
 *
 * NOTE(review): this listing is missing several original lines (the line that
 * declares the context parameter, some local declarations, the early returns,
 * the loop header over the algorithmic ranges and the closing braces).
 */
1621 U_CAPI
void U_EXPORT2
1622 u_enumCharNames(UChar32 start
, UChar32 limit
,
1623 UEnumCharNamesFn
*fn
,
1625 UCharNameChoice nameChoice
,
1626 UErrorCode
*pErrorCode
) {
1627 AlgorithmicRange
*algRange
;
/* do nothing useful when the caller passed no error code or a failure code */
1631 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
/* argument validation: name choice in range and a callback supplied */
1635 if(nameChoice
>=U_CHAR_NAME_CHOICE_COUNT
|| fn
==NULL
) {
1636 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
/* the unsigned casts make a negative limit/start compare as a huge value */
1640 if((uint32_t) limit
> UCHAR_MAX_VALUE
+ 1) {
1641 limit
= UCHAR_MAX_VALUE
+ 1;
/* empty or inverted range: presumably an early return follows - body not visible here */
1643 if((uint32_t)start
>=(uint32_t)limit
) {
/* isDataLoaded() is consulted before uCharNames is dereferenced below */
1647 if(!isDataLoaded(pErrorCode
)) {
1651 /* interleave the data-driven ones with the algorithmic ones */
1652 /* iterate over all algorithmic ranges; assume that they are in ascending order */
/* p points at the algorithmic-names section, algNamesOffset bytes from the start of uCharNames */
1653 p
=(uint32_t *)((uint8_t *)uCharNames
+uCharNames
->algNamesOffset
);
/* the first range begins one 32-bit word after p */
/* NOTE(review): that word is presumably the range count; the line reading it is not visible */
1655 algRange
=(AlgorithmicRange
*)(p
+1);
1657 /* enumerate the character names before the current algorithmic range */
1658 /* here: start<limit */
1659 if((uint32_t)start
<algRange
->start
) {
/* whole remaining request lies before this range: one enumNames() call covers it */
1660 if((uint32_t)limit
<=algRange
->start
) {
1661 enumNames(uCharNames
, start
, limit
, fn
, context
, nameChoice
);
/* otherwise enumerate up to the range start; a false result from enumNames() is tested here */
1664 if(!enumNames(uCharNames
, start
, (UChar32
)algRange
->start
, fn
, context
, nameChoice
)) {
1667 start
=(UChar32
)algRange
->start
;
1669 /* enumerate the character names in the current algorithmic range */
1670 /* here: algRange->start<=start<limit */
1671 if((uint32_t)start
<=algRange
->end
) {
/* algRange->end is compared as an inclusive bound, hence the +1 against the exclusive limit */
1672 if((uint32_t)limit
<=(algRange
->end
+1)) {
1673 enumAlgNames(algRange
, start
, limit
, fn
, context
, nameChoice
);
1676 if(!enumAlgNames(algRange
, start
, (UChar32
)algRange
->end
+1, fn
, context
, nameChoice
)) {
1679 start
=(UChar32
)algRange
->end
+1;
1681 /* continue to the next algorithmic range (here: start<limit) */
/* ranges are variable-sized: advance by the byte count stored in the size field */
1682 algRange
=(AlgorithmicRange
*)((uint8_t *)algRange
+algRange
->size
);
1685 /* enumerate the character names after the last algorithmic range */
1686 enumNames(uCharNames
, start
, limit
, fn
, context
, nameChoice
);
/*
 * uprv_getMaxCharNameLength: returns gMaxNameLength (the maximum length of
 * character names, regular & 1.0) once calcNameSetsLengths() reports success.
 * A local error code is used, so no error is reported to the caller.
 * NOTE(review): the branch taken when calcNameSetsLengths() fails is not
 * visible in this listing.
 */
1689 U_CAPI
int32_t U_EXPORT2
1690 uprv_getMaxCharNameLength() {
1691 UErrorCode errorCode
=U_ZERO_ERROR
;
1692 if(calcNameSetsLengths(&errorCode
)) {
1693 return gMaxNameLength
;
1701 Currently not used but left for future use. Probably by UnicodeSet.
1702 urename.h and uprops.h changed accordingly.
/*
 * uprv_getMaxISOCommentLength: returns gMaxISOCommentLength (the maximum
 * length of ISO comments) once calcNameSetsLengths() reports success.
 * A local error code is used, so no error is reported to the caller.
 * NOTE(review): the branch taken when calcNameSetsLengths() fails is not
 * visible in this listing.
 */
1704 U_CAPI
int32_t U_EXPORT2
1705 uprv_getMaxISOCommentLength() {
1706 UErrorCode errorCode
=U_ZERO_ERROR
;
1707 if(calcNameSetsLengths(&errorCode
)) {
1708 return gMaxISOCommentLength
;
1716 * Converts the char set cset into a Unicode set uset.
1717 * @param cset Set of 256 bit flags corresponding to a set of chars.
1718 * @param sa USetAdder to receive characters; each character is added via sa->add.
/*
 * charSetToUSet: walks the 256 bit flags in cset, collects the chars whose
 * flag is set, converts them with u_charsToUChars() and passes each resulting
 * UChar to sa->add().
 *
 * cset  set of 256 bit flags, tested with SET_CONTAINS()
 * sa    adder that receives the characters through sa->add(sa->set, ...)
 *
 * NOTE(review): the storage-class/return-type line and the declarations of
 * cs, us, i and length are not visible in this listing.
 */
1721 charSetToUSet(uint32_t cset
[8], USetAdder
*sa
) {
1726 UErrorCode errorCode
;
1728 errorCode
=U_ZERO_ERROR
;
/* calcNameSetsLengths() is called first; its failure path is not visible in this listing */
1730 if(!calcNameSetsLengths(&errorCode
)) {
1734 /* build a char string with all chars that are used in character names */
1736 for(i
=0; i
<256; ++i
) {
1737 if(SET_CONTAINS(cset
, i
)) {
1738 cs
[length
++]=(char)i
;
1742 /* convert the char string to a UChar string */
1743 u_charsToUChars(cs
, us
, length
);
1745 /* add each UChar to the USet */
1746 for(i
=0; i
<length
; ++i
) {
/* a zero UChar is only accepted when the source char was itself zero */
1747 if(us
[i
]!=0 || cs
[i
]==0) { /* non-invariant chars become (UChar)0 */
1748 sa
->add(sa
->set
, us
[i
]);
1754 * Fills set with characters that are used in Unicode character names.
1755 * @param sa USetAdder to receive characters.
/*
 * uprv_getCharNameCharacters: forwards gNameSet (the set of chars used in
 * character names, regular & 1.0) and the caller's adder to charSetToUSet().
 */
1757 U_CAPI
void U_EXPORT2
1758 uprv_getCharNameCharacters(USetAdder
*sa
) {
1759 charSetToUSet(gNameSet
, sa
);
1764 Currently not used but left for future use. Probably by UnicodeSet.
1765 urename.h and uprops.h changed accordingly.
1768 * Fills set with characters that are used in ISO comments.
1769 * @param sa USetAdder to receive characters.
/*
 * uprv_getISOCommentCharacters: forwards gISOCommentSet (the set of chars
 * used in ISO comments) and the caller's adder to charSetToUSet().
 */
1771 U_CAPI
void U_EXPORT2
1772 uprv_getISOCommentCharacters(USetAdder
*sa
) {
1773 charSetToUSet(gISOCommentSet
, sa
);
1777 /* data swapping ------------------------------------------------------------ */
1780 * The token table contains non-negative entries for token bytes,
1781 * and -1 for bytes that represent themselves in the data file's charset.
1782 * -2 entries are used for lead bytes.
1784 * Direct bytes (-1 entries) must be translated from the input charset family
1785 * to the output charset family.
1786 * makeTokenMap() writes a permutation mapping for this.
1787 * Use it once for single-/lead-byte tokens and once more for all trail byte
1788 * tokens. (';' is an unused trail byte marked with -1.)
/*
 * makeTokenMap: fills map[] with a byte permutation for converting token
 * bytes from ds->inCharset to ds->outCharset.
 *
 * ds          swapper providing inCharset, outCharset and swapInvChars()
 * tokens      token table entries
 * tokenCount  number of entries considered
 * pErrorCode  in/out error code; checked on entry and after swapInvChars()
 *
 * NOTE(review): the storage-class line, the line declaring the map parameter,
 * the declarations of i, j, c1 and c2, and several statements (loop bodies,
 * the tests on tokens[i], returns, closing braces) are not visible in this
 * listing.
 */
1791 makeTokenMap(const UDataSwapper
*ds
,
1792 int16_t tokens
[], uint16_t tokenCount
,
1794 UErrorCode
*pErrorCode
) {
/* one flag per possible output byte value */
1795 UBool usedOutChar
[256];
1799 if(U_FAILURE(*pErrorCode
)) {
1803 if(ds
->inCharset
==ds
->outCharset
) {
1804 /* Same charset family: identity permutation */
1805 for(i
=0; i
<256; ++i
) {
/* different charset families: start from an all-zero map and no used output bytes */
1809 uprv_memset(map
, 0, 256);
/* NOTE(review): clearing 256 bytes covers the whole array only if sizeof(UBool)==1 - confirm */
1810 uprv_memset(usedOutChar
, 0, 256);
/* NOTE(review): presumably tokenCount is capped at 256 here; the body is not visible */
1812 if(tokenCount
>256) {
1816 /* set the direct bytes (byte 0 always maps to itself) */
1817 for(i
=1; i
<tokenCount
; ++i
) {
1819 /* convert the direct byte character */
/* converts the single char c1 into c2 via the swapper's invariant-character conversion */
1821 ds
->swapInvChars(ds
, &c1
, 1, &c2
, pErrorCode
);
1822 if(U_FAILURE(*pErrorCode
)) {
1823 udata_printError(ds
, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d) - %s\n",
1824 i
, ds
->inCharset
, u_errorName(*pErrorCode
));
1828 /* enter the converted character into the map and mark it used */
1830 usedOutChar
[c2
]=TRUE
;
1834 /* set the mappings for the rest of the permutation */
/* j scans upward for the next output byte value not yet marked in usedOutChar[] */
1835 for(i
=j
=1; i
<tokenCount
; ++i
) {
1836 /* set mappings that were not set for direct bytes */
1838 /* set an output byte value that was not used as an output byte above */
1839 while(usedOutChar
[j
]) {
1842 map
[i
]=(uint8_t)j
++;
1847 * leave mappings at tokenCount and above unset if tokenCount<256
1848 * because they won't be used
/*
 * uchar_swapNames: swaps a unames.icu data image from the input platform
 * properties of ds to its output properties.
 *
 * Visible steps, in order:
 *   - swap the data header with udata_swapDataHeader()
 *   - verify dataFormat "unam" and formatVersion[0]==1
 *   - read algNamesOffset and walk the algorithmic ranges to find the end
 *   - swap the four leading offsets, the token table (permuted through
 *     map[]/trailMap[] built by makeTokenMap()), the token strings, the
 *     group table, the group strings and finally the algorithmic ranges
 *
 * ds          swapper supplying read/swap callbacks and charset families
 * inData      input image, starting at the data header
 * length      byte length of the input
 * outData     output image; may be the same memory as inData (the code
 *             compares inBytes with outBytes before copying)
 * pErrorCode  in/out error code
 *
 * Returns headerSize plus the offset reached after the last algorithmic range.
 *
 * NOTE(review): this listing is missing many original lines (some local
 * declarations, if/else headers, returns, case labels, loop bodies and
 * closing braces), so the control flow between the visible statements cannot
 * be fully determined from here.
 */
1853 U_CAPI
int32_t U_EXPORT2
1854 uchar_swapNames(const UDataSwapper
*ds
,
1855 const void *inData
, int32_t length
, void *outData
,
1856 UErrorCode
*pErrorCode
) {
1857 const UDataInfo
*pInfo
;
1860 const uint8_t *inBytes
;
1863 uint32_t tokenStringOffset
, groupsOffset
, groupStringOffset
, algNamesOffset
,
1864 offset
, i
, count
, stringsCount
;
1866 const AlgorithmicRange
*inRange
;
1867 AlgorithmicRange
*outRange
;
1869 /* udata_swapDataHeader checks the arguments */
1870 headerSize
=udata_swapDataHeader(ds
, inData
, length
, outData
, pErrorCode
);
1871 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
1875 /* check data format and format version */
/* the UDataInfo is read at byte offset 4 of the input image */
1876 pInfo
=(const UDataInfo
*)((const char *)inData
+4);
1878 pInfo
->dataFormat
[0]==0x75 && /* dataFormat="unam" */
1879 pInfo
->dataFormat
[1]==0x6e &&
1880 pInfo
->dataFormat
[2]==0x61 &&
1881 pInfo
->dataFormat
[3]==0x6d &&
1882 pInfo
->formatVersion
[0]==1
1884 udata_printError(ds
, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n",
1885 pInfo
->dataFormat
[0], pInfo
->dataFormat
[1],
1886 pInfo
->dataFormat
[2], pInfo
->dataFormat
[3],
1887 pInfo
->formatVersion
[0]);
1888 *pErrorCode
=U_UNSUPPORTED_ERROR
;
/* all offsets below are relative to the first byte after the data header */
1892 inBytes
=(const uint8_t *)inData
+headerSize
;
1893 outBytes
=(uint8_t *)outData
+headerSize
;
/* algNamesOffset is the fourth 32-bit word after the header */
/* NOTE(review): the conditions selecting between the two reads below are not visible */
1895 algNamesOffset
=ds
->readUInt32(((const uint32_t *)inBytes
)[3]);
1899 (uint32_t)length
<(algNamesOffset
=ds
->readUInt32(((const uint32_t *)inBytes
)[3]))
1901 udata_printError(ds
, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n",
1903 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
1909 /* preflighting: iterate through algorithmic ranges */
1910 offset
=algNamesOffset
;
/* the 32-bit word at algNamesOffset is used as the number of ranges */
1911 count
=ds
->readUInt32(*((const uint32_t *)(inBytes
+offset
)));
1914 for(i
=0; i
<count
; ++i
) {
1915 inRange
=(const AlgorithmicRange
*)(inBytes
+offset
);
/* each range records its own byte size; summing them yields the end offset */
1916 offset
+=ds
->readUInt16(inRange
->size
);
/* room for 512 token entries: the first 256 are passed to makeTokenMap() for map[], the rest for trailMap[] */
1923 int16_t tokens
[512];
1924 uint16_t tokenCount
;
1926 uint8_t map
[256], trailMap
[256];
1928 /* copy the data for inaccessible bytes */
1929 if(inBytes
!=outBytes
) {
1930 uprv_memcpy(outBytes
, inBytes
, length
);
1933 /* the initial 4 offsets first */
1934 tokenStringOffset
=ds
->readUInt32(((const uint32_t *)inBytes
)[0]);
1935 groupsOffset
=ds
->readUInt32(((const uint32_t *)inBytes
)[1]);
1936 groupStringOffset
=ds
->readUInt32(((const uint32_t *)inBytes
)[2]);
/* 16 bytes = the four 32-bit offsets */
1937 ds
->swapArray32(ds
, inBytes
, 16, outBytes
, pErrorCode
);
1940 * now the tokens table
1941 * it needs to be permutated along with the compressed name strings
/* the token table starts right after the 16 bytes of offsets */
1943 p
=(const uint16_t *)(inBytes
+16);
1944 q
=(uint16_t *)(outBytes
+16);
1946 /* read and swap the tokenCount */
1947 tokenCount
=ds
->readUInt16(*p
);
1948 ds
->swapArray16(ds
, p
, 2, q
, pErrorCode
);
1952 /* read the first 512 tokens and make the token maps */
/* NOTE(review): count is presumably set from tokenCount or 512 here; those lines are not visible */
1953 if(tokenCount
<=512) {
1958 for(i
=0; i
<count
; ++i
) {
1959 tokens
[i
]=udata_readInt16(ds
, p
[i
]);
1962 tokens
[i
]=0; /* fill the rest of the tokens array if tokenCount<512 */
/* first call covers single-/lead-byte tokens, second call the trail-byte tokens from index 256 on */
1964 makeTokenMap(ds
, tokens
, tokenCount
, map
, pErrorCode
);
1965 makeTokenMap(ds
, tokens
+256, (uint16_t)(tokenCount
>256 ? tokenCount
-256 : 0), trailMap
, pErrorCode
);
1966 if(U_FAILURE(*pErrorCode
)) {
1971 * swap and permutate the tokens
1972 * go through a temporary array to support in-place swapping
/* tokenCount*2 bytes: one 16-bit slot per token */
1974 temp
=(uint16_t *)uprv_malloc(tokenCount
*2);
1976 udata_printError(ds
, "out of memory swapping %u unames.icu tokens\n",
1978 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
1982 /* swap and permutate single-/lead-byte tokens */
1983 for(i
=0; i
<tokenCount
&& i
<256; ++i
) {
1984 ds
->swapArray16(ds
, p
+i
, 2, temp
+map
[i
], pErrorCode
);
1987 /* swap and permutate trail-byte tokens */
1988 for(; i
<tokenCount
; ++i
) {
/* the high bits of the index are kept; only the low byte is permuted through trailMap[] */
1989 ds
->swapArray16(ds
, p
+i
, 2, temp
+(i
&0xffffff00)+trailMap
[i
&0xff], pErrorCode
);
1992 /* copy the result into the output and free the temporary array */
1993 uprv_memcpy(q
, temp
, tokenCount
*2);
1997 * swap the token strings but not a possible padding byte after
1998 * the terminating NUL of the last string
/* the token strings occupy the bytes from tokenStringOffset up to groupsOffset */
2000 udata_swapInvStringBlock(ds
, inBytes
+tokenStringOffset
, (int32_t)(groupsOffset
-tokenStringOffset
),
2001 outBytes
+tokenStringOffset
, pErrorCode
);
2002 if(U_FAILURE(*pErrorCode
)) {
2003 udata_printError(ds
, "uchar_swapNames(token strings) failed - %s\n",
2004 u_errorName(*pErrorCode
));
2008 /* swap the group table */
/* the first 16-bit value at groupsOffset is used as the group count */
2009 count
=ds
->readUInt16(*((const uint16_t *)(inBytes
+groupsOffset
)));
/* (1+count*3)*2 bytes: the count itself plus three 16-bit values per group */
2010 ds
->swapArray16(ds
, inBytes
+groupsOffset
, (int32_t)((1+count
*3)*2),
2011 outBytes
+groupsOffset
, pErrorCode
);
2014 * swap the group strings
2015 * swap the string bytes but not the nibble-encoded string lengths
/* only needed when the charset family changes; otherwise the bytes stay as copied */
2017 if(ds
->inCharset
!=ds
->outCharset
) {
2018 uint16_t offsets
[LINES_PER_GROUP
+1], lengths
[LINES_PER_GROUP
+1];
2020 const uint8_t *inStrings
, *nextInStrings
;
2021 uint8_t *outStrings
;
2025 inStrings
=inBytes
+groupStringOffset
;
2026 outStrings
=outBytes
+groupStringOffset
;
/* the group strings run from groupStringOffset up to algNamesOffset */
2028 stringsCount
=algNamesOffset
-groupStringOffset
;
2030 /* iterate through string groups until only a few padding bytes are left */
2031 while(stringsCount
>32) {
/* expandGroupLengths() fills offsets[]/lengths[]; its result is used as the position after the length bytes */
2032 nextInStrings
=expandGroupLengths(inStrings
, offsets
, lengths
);
2034 /* move past the length bytes */
2035 stringsCount
-=(uint32_t)(nextInStrings
-inStrings
);
2036 outStrings
+=nextInStrings
-inStrings
;
2037 inStrings
=nextInStrings
;
/* NOTE(review): the literal index 31 presumably equals LINES_PER_GROUP-1 - confirm against GROUP_SHIFT */
2039 count
=offsets
[31]+lengths
[31]; /* total number of string bytes in this group */
2040 stringsCount
-=count
;
2042 /* swap the string bytes using map[] and trailMap[] */
2045 *outStrings
++=map
[c
];
2049 /* token lead byte: swap the trail byte, too */
2050 *outStrings
++=trailMap
[*inStrings
++];
2057 /* swap the algorithmic ranges */
2058 offset
=algNamesOffset
;
2059 count
=ds
->readUInt32(*((const uint32_t *)(inBytes
+offset
)));
/* 4 bytes: the 32-bit range count itself */
2060 ds
->swapArray32(ds
, inBytes
+offset
, 4, outBytes
+offset
, pErrorCode
);
2063 for(i
=0; i
<count
; ++i
) {
/* bounds check before the range at this offset is dereferenced */
2064 if(offset
>(uint32_t)length
) {
2065 udata_printError(ds
, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n",
2067 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
2071 inRange
=(const AlgorithmicRange
*)(inBytes
+offset
);
2072 outRange
=(AlgorithmicRange
*)(outBytes
+offset
);
/* offset now addresses the byte after this range; it is used further down as the range's end */
2073 offset
+=ds
->readUInt16(inRange
->size
);
/* NOTE(review): the 8 bytes swapped as 32-bit units are presumably the start and end fields - confirm against the struct layout */
2075 ds
->swapArray32(ds
, inRange
, 8, outRange
, pErrorCode
);
2076 ds
->swapArray16(ds
, &inRange
->size
, 2, &outRange
->size
, pErrorCode
);
/* type is read without a swap callback */
/* NOTE(review): the case labels of this switch are not visible in this listing */
2077 switch(inRange
->type
) {
2079 /* swap prefix string */
/* the prefix string starts directly after the AlgorithmicRange struct (inRange+1) */
2080 ds
->swapInvChars(ds
, inRange
+1, (int32_t)uprv_strlen((const char *)(inRange
+1)),
2081 outRange
+1, pErrorCode
);
2082 if(U_FAILURE(*pErrorCode
)) {
2083 udata_printError(ds
, "uchar_swapNames(prefix string of algorithmic range %u) failed - %s\n",
2084 i
, u_errorName(*pErrorCode
));
2090 /* swap factors and the prefix and factor strings */
2091 uint16_t factors
[8];
2092 uint32_t j
, factorsCount
;
/* for this range type the variant field holds the number of factors */
2094 factorsCount
=inRange
->variant
;
/* rejects zero factors as well as more than fit into factors[] */
2095 if(factorsCount
==0 || factorsCount
>LENGTHOF(factors
)) {
2096 udata_printError(ds
, "uchar_swapNames(): too many factors (%u) in algorithmic range %u\n",
2098 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
2102 /* read and swap the factors */
2103 p
=(const uint16_t *)(inRange
+1);
2104 q
=(uint16_t *)(outRange
+1);
2105 for(j
=0; j
<factorsCount
; ++j
) {
2106 factors
[j
]=ds
->readUInt16(p
[j
]);
2108 ds
->swapArray16(ds
, p
, (int32_t)(factorsCount
*2), q
, pErrorCode
);
2110 /* swap the strings, up to the last terminating NUL */
/* byte distance from p to the end of this range */
/* NOTE(review): p and q are presumably advanced past the factors first; those lines are not visible */
2113 stringsCount
=(uint32_t)((inBytes
+offset
)-(const uint8_t *)p
);
/* NOTE(review): the loop body is not visible; presumably it shrinks stringsCount until the last byte is NUL */
2114 while(stringsCount
>0 && ((const uint8_t *)p
)[stringsCount
-1]!=0) {
2117 ds
->swapInvChars(ds
, p
, (int32_t)stringsCount
, q
, pErrorCode
);
2121 udata_printError(ds
, "uchar_swapNames(): unknown type %u of algorithmic range %u\n",
2123 *pErrorCode
=U_UNSUPPORTED_ERROR
;
/* total size handled: header plus everything up to the end of the last algorithmic range */
2129 return headerSize
+(int32_t)offset
;
2133 * Hey, Emacs, please set the following:
2136 * indent-tabs-mode: nil