2 ******************************************************************************
4 * Copyright (C) 1999-2014, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 ******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 1999oct04
14 * created by: Markus W. Scherer
17 #include "unicode/utypes.h"
18 #include "unicode/putil.h"
19 #include "unicode/uchar.h"
20 #include "unicode/udata.h"
21 #include "unicode/utf.h"
22 #include "unicode/utf16.h"
34 /* prototypes ------------------------------------------------------------- */
/*
 * Name and type of the ICU data item that holds the character names.
 * Both strings are handed to udata_openChoice() when the data is loaded.
 */
static const char DATA_NAME[] = "unames";
static const char DATA_TYPE[] = "icu";
/*
 * LINES_PER_GROUP: number of name lines in one group, a power of two
 * determined by GROUP_SHIFT.
 * GROUP_MASK: mask for the low GROUP_SHIFT bits of a code point, i.e. the
 * index of its line within the group.
 */
#define LINES_PER_GROUP (1L << GROUP_SHIFT)
#define GROUP_MASK (LINES_PER_GROUP - 1)
44 * This struct was replaced by explicitly accessing equivalent
45 * fields from triples of uint16_t.
46 * The Group struct was padded to 8 bytes on compilers for early ARM CPUs,
47 * which broke the assumption that sizeof(Group)==6 and that the ++ operator
48 * would advance by 6 bytes (3 uint16_t).
50 * We can't just change the data structure because it's loaded from a data file,
51 * and we don't want to make it less compact, so we changed the access code.
53 * For details see ICU tickets 6331 and 6008.
56 offsetHigh, offsetLow; / * avoid padding * /
67 * Get the 32-bit group offset.
68 * @param group (const uint16_t *) pointer to a Group triple of uint16_t
69 * @return group offset (int32_t)
/*
 * Assemble the 32-bit group offset from the high and low uint16_t halves of a
 * group triple ("group" is a const uint16_t * pointing at the triple).
 * The high half is widened to uint32_t before the shift so that a half with
 * its top bit set is never left-shifted into the sign bit of a signed int
 * (undefined behavior); only the combined value is converted to int32_t.
 */
#define GET_GROUP_OFFSET(group) \
    ((int32_t)(((uint32_t)(group)[GROUP_OFFSET_HIGH] << 16) | (group)[GROUP_OFFSET_LOW]))

/* Step a group pointer forward/backward by one triple (GROUP_LENGTH uint16_t units). */
#define NEXT_GROUP(group) ((group) + GROUP_LENGTH)
#define PREV_GROUP(group) ((group) - GROUP_LENGTH)
78 uint8_t type
, variant
;
83 uint32_t tokenStringOffset
, groupsOffset
, groupStringOffset
, algNamesOffset
;
87 * Get the groups table from a UCharNames struct.
88 * The groups table consists of one uint16_t groupCount followed by
89 * groupCount groups. Each group is a triple of uint16_t, see GROUP_LENGTH
90 * and the comment for the old struct Group above.
92 * @param names (const UCharNames *) pointer to the UCharNames indexes
93 * @return (const uint16_t *) pointer to the groups table
/*
 * Locate the groups table: the uint16_t data that starts groupsOffset bytes
 * after the beginning of the UCharNames header that "names" points to.
 * The argument and the whole expansion are parenthesized so that the macro
 * accepts any pointer expression and can be used inside a larger expression
 * (for example, indexed or dereferenced directly).
 */
#define GET_GROUPS(names) \
    ((const uint16_t *)((const char *)(names) + (names)->groupsOffset))
98 const char *otherName
;
/*
 * Sentinel passed in place of an enumerator callback: the enumeration helpers
 * compare their fn argument against DO_FIND_NAME to decide between calling the
 * callback and searching for a name.
 */
#define DO_FIND_NAME NULL
104 static UDataMemory
*uCharNamesData
=NULL
;
105 static UCharNames
*uCharNames
=NULL
;
106 static icu::UInitOnce gCharNamesInitOnce
= U_INITONCE_INITIALIZER
;
/**
 * Maximum length of character names (regular & 1.0).
 * Stays 0 until the name sets and lengths have been calculated.
 */
static int32_t gMaxNameLength = 0;

/**
 * Set of chars used in character names (regular & 1.0).
 * Chars are platform-dependent (can be EBCDIC).
 * Bit set with one bit per 8-bit char value: 8 words of 32 bits.
 */
static uint32_t gNameSet[8] = { 0 };
/*
 * Extra pseudo-categories used for extended names, numbered directly after
 * the regular general categories (U_CHAR_CATEGORY_COUNT): getCharCat() returns
 * them for noncharacters and for lead/trail surrogates, and charCatNames[] is
 * sized by U_CHAR_EXTENDED_CATEGORY_COUNT.
 * Each expansion is parenthesized so that the macros evaluate correctly when
 * they appear inside a larger expression.
 */
#define U_NONCHARACTER_CODE_POINT (U_CHAR_CATEGORY_COUNT)
#define U_LEAD_SURROGATE (U_CHAR_CATEGORY_COUNT + 1)
#define U_TRAIL_SURROGATE (U_CHAR_CATEGORY_COUNT + 2)

#define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3)
125 static const char * const charCatNames
[U_CHAR_EXTENDED_CATEGORY_COUNT
] = {
134 "combining spacing mark",
135 "decimal digit number",
140 "paragraph separator",
148 "connector punctuation",
154 "initial punctuation",
161 /* implementation ----------------------------------------------------------- */
163 static UBool U_CALLCONV
unames_cleanup(void)
166 udata_close(uCharNamesData
);
167 uCharNamesData
= NULL
;
172 gCharNamesInitOnce
.reset();
177 static UBool U_CALLCONV
178 isAcceptable(void * /*context*/,
179 const char * /*type*/, const char * /*name*/,
180 const UDataInfo
*pInfo
) {
183 pInfo
->isBigEndian
==U_IS_BIG_ENDIAN
&&
184 pInfo
->charsetFamily
==U_CHARSET_FAMILY
&&
185 pInfo
->dataFormat
[0]==0x75 && /* dataFormat="unam" */
186 pInfo
->dataFormat
[1]==0x6e &&
187 pInfo
->dataFormat
[2]==0x61 &&
188 pInfo
->dataFormat
[3]==0x6d &&
189 pInfo
->formatVersion
[0]==1);
192 static void U_CALLCONV
193 loadCharNames(UErrorCode
&status
) {
194 U_ASSERT(uCharNamesData
== NULL
);
195 U_ASSERT(uCharNames
== NULL
);
197 uCharNamesData
= udata_openChoice(NULL
, DATA_TYPE
, DATA_NAME
, isAcceptable
, NULL
, &status
);
198 if(U_FAILURE(status
)) {
199 uCharNamesData
= NULL
;
201 uCharNames
= (UCharNames
*)udata_getMemory(uCharNamesData
);
203 ucln_common_registerCleanup(UCLN_COMMON_UNAMES
, unames_cleanup
);
208 isDataLoaded(UErrorCode
*pErrorCode
) {
209 umtx_initOnce(gCharNamesInitOnce
, &loadCharNames
, *pErrorCode
);
210 return U_SUCCESS(*pErrorCode
);
213 #define WRITE_CHAR(buffer, bufferLength, bufferPos, c) { \
214 if((bufferLength)>0) { \
/*
 * Internal name choice for the ISO comment field; it takes the value
 * U_CHAR_NAME_CHOICE_COUNT, one that the public entry points reject
 * (they refuse nameChoice >= U_CHAR_NAME_CHOICE_COUNT).
 */
#define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT
224 * Important: expandName() and compareName() are almost the same -
225 * apply fixes to both.
227 * UnicodeData.txt uses ';' as a field separator, so no
228 * field can contain ';' as part of its contents.
229 * In unames.dat, it is marked as token[';']==-1 only if the
230 * semicolon is used in the data file - which is iff we
231 * have Unicode 1.0 names or ISO comments or aliases.
232 * So, it will be token[';']==-1 if we store U1.0 names/ISO comments/aliases
233 * although we know that it will never be part of a name.
236 expandName(UCharNames
*names
,
237 const uint8_t *name
, uint16_t nameLength
, UCharNameChoice nameChoice
,
238 char *buffer
, uint16_t bufferLength
) {
239 uint16_t *tokens
=(uint16_t *)names
+8;
240 uint16_t token
, tokenCount
=*tokens
++, bufferPos
=0;
241 uint8_t *tokenStrings
=(uint8_t *)names
+names
->tokenStringOffset
;
244 if(nameChoice
!=U_UNICODE_CHAR_NAME
&& nameChoice
!=U_EXTENDED_CHAR_NAME
) {
246 * skip the modern name if it is not requested _and_
247 * if the semicolon byte value is a character, not a token number
249 if((uint8_t)';'>=tokenCount
|| tokens
[(uint8_t)';']==(uint16_t)(-1)) {
250 int fieldIndex
= nameChoice
==U_ISO_COMMENT
? 2 : nameChoice
;
252 while(nameLength
>0) {
258 } while(--fieldIndex
>0);
261 * the semicolon byte value is a token number, therefore
262 * only modern names are stored in unames.dat and there is no
263 * such requested alternate name here
269 /* write each letter directly, and write a token word per token */
270 while(nameLength
>0) {
276 /* implicit letter */
277 WRITE_CHAR(buffer
, bufferLength
, bufferPos
, c
);
284 if(token
==(uint16_t)(-2)) {
285 /* this is a lead byte for a double-byte token */
286 token
=tokens
[c
<<8|*name
++];
289 if(token
==(uint16_t)(-1)) {
291 /* explicit letter */
292 WRITE_CHAR(buffer
, bufferLength
, bufferPos
, c
);
294 /* stop, but skip the semicolon if we are seeking
295 extended names and there was no 2.0 name but there
297 if(!bufferPos
&& nameChoice
== U_EXTENDED_CHAR_NAME
) {
298 if ((uint8_t)';'>=tokenCount
|| tokens
[(uint8_t)';']==(uint16_t)(-1)) {
306 /* write token word */
307 uint8_t *tokenString
=tokenStrings
+token
;
308 while((c
=*tokenString
++)!=0) {
309 WRITE_CHAR(buffer
, bufferLength
, bufferPos
, c
);
324 * compareName() is almost the same as expandName() except that it compares
325 * the currently expanded name to an input name.
326 * It returns the match/no match result as soon as possible.
329 compareName(UCharNames
*names
,
330 const uint8_t *name
, uint16_t nameLength
, UCharNameChoice nameChoice
,
331 const char *otherName
) {
332 uint16_t *tokens
=(uint16_t *)names
+8;
333 uint16_t token
, tokenCount
=*tokens
++;
334 uint8_t *tokenStrings
=(uint8_t *)names
+names
->tokenStringOffset
;
336 const char *origOtherName
= otherName
;
338 if(nameChoice
!=U_UNICODE_CHAR_NAME
&& nameChoice
!=U_EXTENDED_CHAR_NAME
) {
340 * skip the modern name if it is not requested _and_
341 * if the semicolon byte value is a character, not a token number
343 if((uint8_t)';'>=tokenCount
|| tokens
[(uint8_t)';']==(uint16_t)(-1)) {
344 int fieldIndex
= nameChoice
==U_ISO_COMMENT
? 2 : nameChoice
;
346 while(nameLength
>0) {
352 } while(--fieldIndex
>0);
355 * the semicolon byte value is a token number, therefore
356 * only modern names are stored in unames.dat and there is no
357 * such requested alternate name here
363 /* compare each letter directly, and compare a token word per token */
364 while(nameLength
>0) {
370 /* implicit letter */
371 if((char)c
!=*otherName
++) {
380 if(token
==(uint16_t)(-2)) {
381 /* this is a lead byte for a double-byte token */
382 token
=tokens
[c
<<8|*name
++];
385 if(token
==(uint16_t)(-1)) {
387 /* explicit letter */
388 if((char)c
!=*otherName
++) {
392 /* stop, but skip the semicolon if we are seeking
393 extended names and there was no 2.0 name but there
395 if(otherName
== origOtherName
&& nameChoice
== U_EXTENDED_CHAR_NAME
) {
396 if ((uint8_t)';'>=tokenCount
|| tokens
[(uint8_t)';']==(uint16_t)(-1)) {
404 /* write token word */
405 uint8_t *tokenString
=tokenStrings
+token
;
406 while((c
=*tokenString
++)!=0) {
407 if((char)c
!=*otherName
++) {
415 /* complete match? */
416 return (UBool
)(*otherName
==0);
419 static uint8_t getCharCat(UChar32 cp
) {
422 if (U_IS_UNICODE_NONCHAR(cp
)) {
423 return U_NONCHARACTER_CODE_POINT
;
426 if ((cat
= u_charType(cp
)) == U_SURROGATE
) {
427 cat
= U_IS_LEAD(cp
) ? U_LEAD_SURROGATE
: U_TRAIL_SURROGATE
;
433 static const char *getCharCatName(UChar32 cp
) {
434 uint8_t cat
= getCharCat(cp
);
436 /* Return unknown if the table of names above is not up to
439 if (cat
>= UPRV_LENGTHOF(charCatNames
)) {
442 return charCatNames
[cat
];
446 static uint16_t getExtName(uint32_t code
, char *buffer
, uint16_t bufferLength
) {
447 const char *catname
= getCharCatName(code
);
453 WRITE_CHAR(buffer
, bufferLength
, length
, '<');
454 while (catname
[length
- 1]) {
455 WRITE_CHAR(buffer
, bufferLength
, length
, catname
[length
- 1]);
457 WRITE_CHAR(buffer
, bufferLength
, length
, '-');
458 for (cp
= code
, ndigits
= 0; cp
; ++ndigits
, cp
>>= 4)
462 for (cp
= code
, i
= ndigits
; (cp
|| i
> 0) && bufferLength
; cp
>>= 4, bufferLength
--) {
463 uint8_t v
= (uint8_t)(cp
& 0xf);
464 buffer
[--i
] = (v
< 10 ? '0' + v
: 'A' + v
- 10);
468 WRITE_CHAR(buffer
, bufferLength
, length
, '>');
474 * getGroup() does a binary search for the group that contains the
475 * Unicode code point "code".
476 * The return value is always a valid Group* that may contain "code"
477 * or else is the highest group before "code".
478 * If the lowest group is after "code", then that one is returned.
480 static const uint16_t *
481 getGroup(UCharNames
*names
, uint32_t code
) {
482 const uint16_t *groups
=GET_GROUPS(names
);
483 uint16_t groupMSB
=(uint16_t)(code
>>GROUP_SHIFT
),
488 /* binary search for the group of names that contains the one for code */
489 while(start
<limit
-1) {
490 number
=(uint16_t)((start
+limit
)/2);
491 if(groupMSB
<groups
[number
*GROUP_LENGTH
+GROUP_MSB
]) {
498 /* return this regardless of whether it is an exact match */
499 return groups
+start
*GROUP_LENGTH
;
503 * expandGroupLengths() reads a block of compressed lengths of 32 strings and
504 * expands them into offsets and lengths for each string.
505 * Lengths are stored with a variable-width encoding in consecutive nibbles:
506 * If a nibble<0xc, then it is the length itself (0=empty string).
507 * If a nibble>=0xc, then it forms a length value with the following nibble.
508 * See the calculation below.
509 * The offsets and lengths arrays must be at least 33 (one more) long because
510 * there is no check here at the end if the last nibble is still used.
512 static const uint8_t *
513 expandGroupLengths(const uint8_t *s
,
514 uint16_t offsets
[LINES_PER_GROUP
+1], uint16_t lengths
[LINES_PER_GROUP
+1]) {
515 /* read the lengths of the 32 strings in this group and get each string's offset */
516 uint16_t i
=0, offset
=0, length
=0;
519 /* all 32 lengths must be read to get the offset of the first group string */
520 while(i
<LINES_PER_GROUP
) {
523 /* read even nibble - MSBs of lengthByte */
525 /* double-nibble length spread across two bytes */
526 length
=(uint16_t)(((length
&0x3)<<4|lengthByte
>>4)+12);
528 } else if((lengthByte
/* &0xf0 */)>=0xc0) {
529 /* double-nibble length spread across this one byte */
530 length
=(uint16_t)((lengthByte
&0x3f)+12);
532 /* single-nibble length in MSBs */
533 length
=(uint16_t)(lengthByte
>>4);
543 /* read odd nibble - LSBs of lengthByte */
544 if((lengthByte
&0xf0)==0) {
545 /* this nibble was not consumed for a double-nibble length above */
548 /* single-nibble length in LSBs */
556 length
=0; /* prevent double-nibble detection in the next iteration */
560 /* now, s is at the first group string */
565 expandGroupName(UCharNames
*names
, const uint16_t *group
,
566 uint16_t lineNumber
, UCharNameChoice nameChoice
,
567 char *buffer
, uint16_t bufferLength
) {
568 uint16_t offsets
[LINES_PER_GROUP
+2], lengths
[LINES_PER_GROUP
+2];
569 const uint8_t *s
=(uint8_t *)names
+names
->groupStringOffset
+GET_GROUP_OFFSET(group
);
570 s
=expandGroupLengths(s
, offsets
, lengths
);
571 return expandName(names
, s
+offsets
[lineNumber
], lengths
[lineNumber
], nameChoice
,
572 buffer
, bufferLength
);
576 getName(UCharNames
*names
, uint32_t code
, UCharNameChoice nameChoice
,
577 char *buffer
, uint16_t bufferLength
) {
578 const uint16_t *group
=getGroup(names
, code
);
579 if((uint16_t)(code
>>GROUP_SHIFT
)==group
[GROUP_MSB
]) {
580 return expandGroupName(names
, group
, (uint16_t)(code
&GROUP_MASK
), nameChoice
,
581 buffer
, bufferLength
);
583 /* group not found */
593 * enumGroupNames() enumerates the names in one group of 32 lines
594 * and either calls the enumerator function for each name or searches for a given input name.
597 enumGroupNames(UCharNames
*names
, const uint16_t *group
,
598 UChar32 start
, UChar32 end
,
599 UEnumCharNamesFn
*fn
, void *context
,
600 UCharNameChoice nameChoice
) {
601 uint16_t offsets
[LINES_PER_GROUP
+2], lengths
[LINES_PER_GROUP
+2];
602 const uint8_t *s
=(uint8_t *)names
+names
->groupStringOffset
+GET_GROUP_OFFSET(group
);
604 s
=expandGroupLengths(s
, offsets
, lengths
);
605 if(fn
!=DO_FIND_NAME
) {
610 length
=expandName(names
, s
+offsets
[start
&GROUP_MASK
], lengths
[start
&GROUP_MASK
], nameChoice
, buffer
, sizeof(buffer
));
611 if (!length
&& nameChoice
== U_EXTENDED_CHAR_NAME
) {
612 buffer
[length
= getExtName(start
, buffer
, sizeof(buffer
))] = 0;
614 /* here, we assume that the buffer is large enough */
616 if(!fn(context
, start
, nameChoice
, buffer
, length
)) {
623 const char *otherName
=((FindName
*)context
)->otherName
;
625 if(compareName(names
, s
+offsets
[start
&GROUP_MASK
], lengths
[start
&GROUP_MASK
], nameChoice
, otherName
)) {
626 ((FindName
*)context
)->code
=start
;
636 * enumExtNames() enumerates extended names.
637 * It only needs to do it if it is called with a real function and not
638 * with the dummy DO_FIND_NAME, because u_charFromName() does a check
639 * for extended names by itself.
642 enumExtNames(UChar32 start
, UChar32 end
,
643 UEnumCharNamesFn
*fn
, void *context
)
645 if(fn
!=DO_FIND_NAME
) {
650 buffer
[length
= getExtName(start
, buffer
, sizeof(buffer
))] = 0;
651 /* here, we assume that the buffer is large enough */
653 if(!fn(context
, start
, U_EXTENDED_CHAR_NAME
, buffer
, length
)) {
665 enumNames(UCharNames
*names
,
666 UChar32 start
, UChar32 limit
,
667 UEnumCharNamesFn
*fn
, void *context
,
668 UCharNameChoice nameChoice
) {
669 uint16_t startGroupMSB
, endGroupMSB
, groupCount
;
670 const uint16_t *group
, *groupLimit
;
672 startGroupMSB
=(uint16_t)(start
>>GROUP_SHIFT
);
673 endGroupMSB
=(uint16_t)((limit
-1)>>GROUP_SHIFT
);
675 /* find the group that contains start, or the highest before it */
676 group
=getGroup(names
, start
);
678 if(startGroupMSB
<group
[GROUP_MSB
] && nameChoice
==U_EXTENDED_CHAR_NAME
) {
679 /* enumerate synthetic names between start and the group start */
680 UChar32 extLimit
=((UChar32
)group
[GROUP_MSB
]<<GROUP_SHIFT
);
684 if(!enumExtNames(start
, extLimit
-1, fn
, context
)) {
690 if(startGroupMSB
==endGroupMSB
) {
691 if(startGroupMSB
==group
[GROUP_MSB
]) {
692 /* if start and limit-1 are in the same group, then enumerate only in that one */
693 return enumGroupNames(names
, group
, start
, limit
-1, fn
, context
, nameChoice
);
696 const uint16_t *groups
=GET_GROUPS(names
);
697 groupCount
=*groups
++;
698 groupLimit
=groups
+groupCount
*GROUP_LENGTH
;
700 if(startGroupMSB
==group
[GROUP_MSB
]) {
701 /* enumerate characters in the partial start group */
702 if((start
&GROUP_MASK
)!=0) {
703 if(!enumGroupNames(names
, group
,
704 start
, ((UChar32
)startGroupMSB
<<GROUP_SHIFT
)+LINES_PER_GROUP
-1,
705 fn
, context
, nameChoice
)) {
708 group
=NEXT_GROUP(group
); /* continue with the next group */
710 } else if(startGroupMSB
>group
[GROUP_MSB
]) {
711 /* make sure that we start enumerating with the first group after start */
712 const uint16_t *nextGroup
=NEXT_GROUP(group
);
713 if (nextGroup
< groupLimit
&& nextGroup
[GROUP_MSB
] > startGroupMSB
&& nameChoice
== U_EXTENDED_CHAR_NAME
) {
714 UChar32 end
= nextGroup
[GROUP_MSB
] << GROUP_SHIFT
;
718 if (!enumExtNames(start
, end
- 1, fn
, context
)) {
725 /* enumerate entire groups between the start- and end-groups */
726 while(group
<groupLimit
&& group
[GROUP_MSB
]<endGroupMSB
) {
727 const uint16_t *nextGroup
;
728 start
=(UChar32
)group
[GROUP_MSB
]<<GROUP_SHIFT
;
729 if(!enumGroupNames(names
, group
, start
, start
+LINES_PER_GROUP
-1, fn
, context
, nameChoice
)) {
732 nextGroup
=NEXT_GROUP(group
);
733 if (nextGroup
< groupLimit
&& nextGroup
[GROUP_MSB
] > group
[GROUP_MSB
] + 1 && nameChoice
== U_EXTENDED_CHAR_NAME
) {
734 UChar32 end
= nextGroup
[GROUP_MSB
] << GROUP_SHIFT
;
738 if (!enumExtNames((group
[GROUP_MSB
] + 1) << GROUP_SHIFT
, end
- 1, fn
, context
)) {
745 /* enumerate within the end group (group[GROUP_MSB]==endGroupMSB) */
746 if(group
<groupLimit
&& group
[GROUP_MSB
]==endGroupMSB
) {
747 return enumGroupNames(names
, group
, (limit
-1)&~GROUP_MASK
, limit
-1, fn
, context
, nameChoice
);
748 } else if (nameChoice
== U_EXTENDED_CHAR_NAME
&& group
== groupLimit
) {
749 UChar32 next
= (PREV_GROUP(group
)[GROUP_MSB
] + 1) << GROUP_SHIFT
;
758 /* we have not found a group, which means everything is made of
760 if (nameChoice
== U_EXTENDED_CHAR_NAME
) {
761 if (limit
> UCHAR_MAX_VALUE
+ 1) {
762 limit
= UCHAR_MAX_VALUE
+ 1;
764 return enumExtNames(start
, limit
- 1, fn
, context
);
771 writeFactorSuffix(const uint16_t *factors
, uint16_t count
,
772 const char *s
, /* suffix elements */
774 uint16_t indexes
[8], /* output fields from here */
775 const char *elementBases
[8], const char *elements
[8],
776 char *buffer
, uint16_t bufferLength
) {
777 uint16_t i
, factor
, bufferPos
=0;
780 /* write elements according to the factors */
783 * the factorized elements are determined by modulo arithmetic
784 * with the factors of this algorithm
786 * note that for fewer operations, count is decremented here
789 for(i
=count
; i
>0; --i
) {
791 indexes
[i
]=(uint16_t)(code%factor
);
795 * we don't need to calculate the last modulus because start<=code<=end
796 * guarantees here that code<=factors[0]
798 indexes
[0]=(uint16_t)code
;
800 /* write each element */
802 if(elementBases
!=NULL
) {
806 /* skip indexes[i] strings */
818 WRITE_CHAR(buffer
, bufferLength
, bufferPos
, c
);
821 /* we do not need to perform the rest of this loop for i==count - break here */
826 /* skip the rest of the strings for this factors[i] */
827 factor
=(uint16_t)(factors
[i
]-indexes
[i
]-1);
846 * Parts of findAlgName() are almost the same as some of getAlgName().
847 * Fixes must be applied to both.
850 getAlgName(AlgorithmicRange
*range
, uint32_t code
, UCharNameChoice nameChoice
,
851 char *buffer
, uint16_t bufferLength
) {
852 uint16_t bufferPos
=0;
854 /* Only the normative character name can be algorithmic. */
855 if(nameChoice
!=U_UNICODE_CHAR_NAME
&& nameChoice
!=U_EXTENDED_CHAR_NAME
) {
863 switch(range
->type
) {
865 /* name = prefix hex-digits */
866 const char *s
=(const char *)(range
+1);
873 WRITE_CHAR(buffer
, bufferLength
, bufferPos
, c
);
876 /* write hexadecimal code point value */
877 count
=range
->variant
;
880 if(count
<bufferLength
) {
885 if(--i
<bufferLength
) {
901 /* name = prefix factorized-elements */
903 const uint16_t *factors
=(const uint16_t *)(range
+1);
904 uint16_t count
=range
->variant
;
905 const char *s
=(const char *)(factors
+count
);
910 WRITE_CHAR(buffer
, bufferLength
, bufferPos
, c
);
913 bufferPos
+=writeFactorSuffix(factors
, count
,
914 s
, code
-range
->start
, indexes
, NULL
, NULL
, buffer
, bufferLength
);
930 * Important: enumAlgNames() and findAlgName() are almost the same.
931 * Any fix must be applied to both.
934 enumAlgNames(AlgorithmicRange
*range
,
935 UChar32 start
, UChar32 limit
,
936 UEnumCharNamesFn
*fn
, void *context
,
937 UCharNameChoice nameChoice
) {
941 if(nameChoice
!=U_UNICODE_CHAR_NAME
&& nameChoice
!=U_EXTENDED_CHAR_NAME
) {
945 switch(range
->type
) {
950 /* get the full name of the start character */
951 length
=getAlgName(range
, (uint32_t)start
, nameChoice
, buffer
, sizeof(buffer
));
956 /* call the enumerator function with this first character */
957 if(!fn(context
, start
, nameChoice
, buffer
, length
)) {
961 /* go to the end of the name; all these names have the same length */
967 /* enumerate the rest of the names */
968 while(++start
<limit
) {
969 /* increment the hexadecimal number on a character-basis */
973 if(('0'<=c
&& c
<'9') || ('A'<=c
&& c
<'F')) {
984 if(!fn(context
, start
, nameChoice
, buffer
, length
)) {
992 const char *elementBases
[8], *elements
[8];
993 const uint16_t *factors
=(const uint16_t *)(range
+1);
994 uint16_t count
=range
->variant
;
995 const char *s
=(const char *)(factors
+count
);
997 uint16_t prefixLength
, i
, idx
;
1001 /* name = prefix factorized-elements */
1006 while((c
=*s
++)!=0) {
1011 /* append the suffix of the start character */
1012 length
=(uint16_t)(prefixLength
+writeFactorSuffix(factors
, count
,
1013 s
, (uint32_t)start
-range
->start
,
1014 indexes
, elementBases
, elements
,
1015 suffix
, (uint16_t)(sizeof(buffer
)-prefixLength
)));
1017 /* call the enumerator function with this first character */
1018 if(!fn(context
, start
, nameChoice
, buffer
, length
)) {
1022 /* enumerate the rest of the names */
1023 while(++start
<limit
) {
1024 /* increment the indexes in lexical order bound by the factors */
1027 idx
=(uint16_t)(indexes
[--i
]+1);
1028 if(idx
<factors
[i
]) {
1029 /* skip one index and its element string */
1037 /* reset this index to 0 and its element string to the first one */
1039 elements
[i
]=elementBases
[i
];
1043 /* to make matters a little easier, just append all elements to the suffix */
1045 length
=prefixLength
;
1046 for(i
=0; i
<count
; ++i
) {
1048 while((c
=*s
++)!=0) {
1053 /* zero-terminate */
1056 if(!fn(context
, start
, nameChoice
, buffer
, length
)) {
1063 /* undefined type */
1071 * findAlgName() is almost the same as enumAlgNames() except that it
1072 * returns the code point for a name if it fits into the range.
1073 * It returns 0xffff otherwise.
1076 findAlgName(AlgorithmicRange
*range
, UCharNameChoice nameChoice
, const char *otherName
) {
1079 if(nameChoice
!=U_UNICODE_CHAR_NAME
&& nameChoice
!=U_EXTENDED_CHAR_NAME
) {
1083 switch(range
->type
) {
1085 /* name = prefix hex-digits */
1086 const char *s
=(const char *)(range
+1);
1091 /* compare prefix */
1092 while((c
=*s
++)!=0) {
1093 if((char)c
!=*otherName
++) {
1098 /* read hexadecimal code point value */
1099 count
=range
->variant
;
1101 for(i
=0; i
<count
; ++i
) {
1103 if('0'<=c
&& c
<='9') {
1104 code
=(code
<<4)|(c
-'0');
1105 } else if('A'<=c
&& c
<='F') {
1106 code
=(code
<<4)|(c
-'A'+10);
1112 /* does it fit into the range? */
1113 if(*otherName
==0 && range
->start
<=(uint32_t)code
&& (uint32_t)code
<=range
->end
) {
1120 uint16_t indexes
[8];
1121 const char *elementBases
[8], *elements
[8];
1122 const uint16_t *factors
=(const uint16_t *)(range
+1);
1123 uint16_t count
=range
->variant
;
1124 const char *s
=(const char *)(factors
+count
), *t
;
1125 UChar32 start
, limit
;
1130 /* name = prefix factorized-elements */
1132 /* compare prefix */
1133 while((c
=*s
++)!=0) {
1134 if((char)c
!=*otherName
++) {
1139 start
=(UChar32
)range
->start
;
1140 limit
=(UChar32
)(range
->end
+1);
1142 /* initialize the suffix elements for enumeration; indexes should all be set to 0 */
1143 writeFactorSuffix(factors
, count
, s
, 0,
1144 indexes
, elementBases
, elements
, buffer
, sizeof(buffer
));
1146 /* compare the first suffix */
1147 if(0==uprv_strcmp(otherName
, buffer
)) {
1151 /* enumerate and compare the rest of the suffixes */
1152 while(++start
<limit
) {
1153 /* increment the indexes in lexical order bound by the factors */
1156 idx
=(uint16_t)(indexes
[--i
]+1);
1157 if(idx
<factors
[i
]) {
1158 /* skip one index and its element string */
1165 /* reset this index to 0 and its element string to the first one */
1167 elements
[i
]=elementBases
[i
];
1171 /* to make matters a little easier, just compare all elements of the suffix */
1173 for(i
=0; i
<count
; ++i
) {
1175 while((c
=*s
++)!=0) {
1177 s
=""; /* does not match */
1189 /* undefined type */
1196 /* sets of name characters, maximum name lengths ---------------------------- */
/*
 * Bit-set helpers for a uint32_t[8] set with one bit per 8-bit char value:
 * the top 3 bits of the char select the word, the low 5 bits select the bit.
 * The char argument is parenthesized before the cast so that an expression
 * argument (e.g. a|b or a conditional) is converted as a whole instead of
 * having the cast bind only to its first operand.
 */
#define SET_ADD(set, c) ((set)[(uint8_t)(c)>>5]|=((uint32_t)1<<((uint8_t)(c)&0x1f)))
#define SET_CONTAINS(set, c) (((set)[(uint8_t)(c)>>5]&((uint32_t)1<<((uint8_t)(c)&0x1f)))!=0)
1202 calcStringSetLength(uint32_t set
[8], const char *s
) {
1206 while((c
=*s
++)!=0) {
1214 calcAlgNameSetsLengths(int32_t maxNameLength
) {
1215 AlgorithmicRange
*range
;
1217 uint32_t rangeCount
;
1220 /* enumerate algorithmic ranges */
1221 p
=(uint32_t *)((uint8_t *)uCharNames
+uCharNames
->algNamesOffset
);
1223 range
=(AlgorithmicRange
*)(p
+1);
1224 while(rangeCount
>0) {
1225 switch(range
->type
) {
1227 /* name = prefix + (range->variant times) hex-digits */
1229 length
=calcStringSetLength(gNameSet
, (const char *)(range
+1))+range
->variant
;
1230 if(length
>maxNameLength
) {
1231 maxNameLength
=length
;
1235 /* name = prefix factorized-elements */
1236 const uint16_t *factors
=(const uint16_t *)(range
+1);
1238 int32_t i
, count
=range
->variant
, factor
, factorLength
, maxFactorLength
;
1241 s
=(const char *)(factors
+count
);
1242 length
=calcStringSetLength(gNameSet
, s
);
1243 s
+=length
+1; /* start of factor suffixes */
1245 /* get the set and maximum factor suffix length for each factor */
1246 for(i
=0; i
<count
; ++i
) {
1248 for(factor
=factors
[i
]; factor
>0; --factor
) {
1249 factorLength
=calcStringSetLength(gNameSet
, s
);
1251 if(factorLength
>maxFactorLength
) {
1252 maxFactorLength
=factorLength
;
1255 length
+=maxFactorLength
;
1258 if(length
>maxNameLength
) {
1259 maxNameLength
=length
;
1268 range
=(AlgorithmicRange
*)((uint8_t *)range
+range
->size
);
1271 return maxNameLength
;
1275 calcExtNameSetsLengths(int32_t maxNameLength
) {
1278 for(i
=0; i
<UPRV_LENGTHOF(charCatNames
); ++i
) {
1280 * for each category, count the length of the category name
1284 * 6 for most hex digits per code point
1286 length
=9+calcStringSetLength(gNameSet
, charCatNames
[i
]);
1287 if(length
>maxNameLength
) {
1288 maxNameLength
=length
;
1291 return maxNameLength
;
1295 calcNameSetLength(const uint16_t *tokens
, uint16_t tokenCount
, const uint8_t *tokenStrings
, int8_t *tokenLengths
,
1297 const uint8_t **pLine
, const uint8_t *lineLimit
) {
1298 const uint8_t *line
=*pLine
;
1299 int32_t length
=0, tokenLength
;
1302 while(line
!=lineLimit
&& (c
=*line
++)!=(uint8_t)';') {
1304 /* implicit letter */
1309 if(token
==(uint16_t)(-2)) {
1310 /* this is a lead byte for a double-byte token */
1314 if(token
==(uint16_t)(-1)) {
1315 /* explicit letter */
1319 /* count token word */
1320 if(tokenLengths
!=NULL
) {
1321 /* use cached token length */
1322 tokenLength
=tokenLengths
[c
];
1323 if(tokenLength
==0) {
1324 tokenLength
=calcStringSetLength(set
, (const char *)tokenStrings
+token
);
1325 tokenLengths
[c
]=(int8_t)tokenLength
;
1328 tokenLength
=calcStringSetLength(set
, (const char *)tokenStrings
+token
);
1330 length
+=tokenLength
;
1340 calcGroupNameSetsLengths(int32_t maxNameLength
) {
1341 uint16_t offsets
[LINES_PER_GROUP
+2], lengths
[LINES_PER_GROUP
+2];
1343 uint16_t *tokens
=(uint16_t *)uCharNames
+8;
1344 uint16_t tokenCount
=*tokens
++;
1345 uint8_t *tokenStrings
=(uint8_t *)uCharNames
+uCharNames
->tokenStringOffset
;
1347 int8_t *tokenLengths
;
1349 const uint16_t *group
;
1350 const uint8_t *s
, *line
, *lineLimit
;
1352 int32_t groupCount
, lineNumber
, length
;
1354 tokenLengths
=(int8_t *)uprv_malloc(tokenCount
);
1355 if(tokenLengths
!=NULL
) {
1356 uprv_memset(tokenLengths
, 0, tokenCount
);
1359 group
=GET_GROUPS(uCharNames
);
1360 groupCount
=*group
++;
1362 /* enumerate all groups */
1363 while(groupCount
>0) {
1364 s
=(uint8_t *)uCharNames
+uCharNames
->groupStringOffset
+GET_GROUP_OFFSET(group
);
1365 s
=expandGroupLengths(s
, offsets
, lengths
);
1367 /* enumerate all lines in each group */
1368 for(lineNumber
=0; lineNumber
<LINES_PER_GROUP
; ++lineNumber
) {
1369 line
=s
+offsets
[lineNumber
];
1370 length
=lengths
[lineNumber
];
1375 lineLimit
=line
+length
;
1377 /* read regular name */
1378 length
=calcNameSetLength(tokens
, tokenCount
, tokenStrings
, tokenLengths
, gNameSet
, &line
, lineLimit
);
1379 if(length
>maxNameLength
) {
1380 maxNameLength
=length
;
1382 if(line
==lineLimit
) {
1386 /* read Unicode 1.0 name */
1387 length
=calcNameSetLength(tokens
, tokenCount
, tokenStrings
, tokenLengths
, gNameSet
, &line
, lineLimit
);
1388 if(length
>maxNameLength
) {
1389 maxNameLength
=length
;
1391 if(line
==lineLimit
) {
1395 /* read ISO comment */
1396 /*length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit);*/
1399 group
=NEXT_GROUP(group
);
1403 if(tokenLengths
!=NULL
) {
1404 uprv_free(tokenLengths
);
1407 /* set gMax... - name length last for threading */
1408 gMaxNameLength
=maxNameLength
;
1412 calcNameSetsLengths(UErrorCode
*pErrorCode
) {
1413 static const char extChars
[]="0123456789ABCDEF<>-";
1414 int32_t i
, maxNameLength
;
1416 if(gMaxNameLength
!=0) {
1420 if(!isDataLoaded(pErrorCode
)) {
1424 /* set hex digits, used in various names, and <>-, used in extended names */
1425 for(i
=0; i
<(int32_t)sizeof(extChars
)-1; ++i
) {
1426 SET_ADD(gNameSet
, extChars
[i
]);
1429 /* set sets and lengths from algorithmic names */
1430 maxNameLength
=calcAlgNameSetsLengths(0);
1432 /* set sets and lengths from extended names */
1433 maxNameLength
=calcExtNameSetsLengths(maxNameLength
);
1435 /* set sets and lengths from group names, set global maximum values */
1436 calcGroupNameSetsLengths(maxNameLength
);
1443 /* public API --------------------------------------------------------------- */
1447 U_CAPI
int32_t U_EXPORT2
1448 u_charName(UChar32 code
, UCharNameChoice nameChoice
,
1449 char *buffer
, int32_t bufferLength
,
1450 UErrorCode
*pErrorCode
) {
1451 AlgorithmicRange
*algRange
;
1456 /* check the argument values */
1457 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
1459 } else if(nameChoice
>=U_CHAR_NAME_CHOICE_COUNT
||
1460 bufferLength
<0 || (bufferLength
>0 && buffer
==NULL
)
1462 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1466 if((uint32_t)code
>UCHAR_MAX_VALUE
|| !isDataLoaded(pErrorCode
)) {
1467 return u_terminateChars(buffer
, bufferLength
, 0, pErrorCode
);
1472 /* try algorithmic names first */
1473 p
=(uint32_t *)((uint8_t *)uCharNames
+uCharNames
->algNamesOffset
);
1475 algRange
=(AlgorithmicRange
*)(p
+1);
1477 if(algRange
->start
<=(uint32_t)code
&& (uint32_t)code
<=algRange
->end
) {
1478 length
=getAlgName(algRange
, (uint32_t)code
, nameChoice
, buffer
, (uint16_t)bufferLength
);
1481 algRange
=(AlgorithmicRange
*)((uint8_t *)algRange
+algRange
->size
);
1486 if (nameChoice
== U_EXTENDED_CHAR_NAME
) {
1487 length
= getName(uCharNames
, (uint32_t )code
, U_EXTENDED_CHAR_NAME
, buffer
, (uint16_t) bufferLength
);
1489 /* extended character name */
1490 length
= getExtName((uint32_t) code
, buffer
, (uint16_t) bufferLength
);
1493 /* normal character name */
1494 length
=getName(uCharNames
, (uint32_t)code
, nameChoice
, buffer
, (uint16_t)bufferLength
);
1498 return u_terminateChars(buffer
, bufferLength
, length
, pErrorCode
);
1501 U_CAPI
int32_t U_EXPORT2
1502 u_getISOComment(UChar32
/*c*/,
1503 char *dest
, int32_t destCapacity
,
1504 UErrorCode
*pErrorCode
) {
1505 /* check the argument values */
1506 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
1508 } else if(destCapacity
<0 || (destCapacity
>0 && dest
==NULL
)) {
1509 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1513 return u_terminateChars(dest
, destCapacity
, 0, pErrorCode
);
1516 U_CAPI UChar32 U_EXPORT2
1517 u_charFromName(UCharNameChoice nameChoice
,
1519 UErrorCode
*pErrorCode
) {
1520 char upper
[120], lower
[120];
1522 AlgorithmicRange
*algRange
;
1527 UChar32 error
= 0xffff; /* Undefined, but use this for backwards compatibility. */
1529 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
1533 if(nameChoice
>=U_CHAR_NAME_CHOICE_COUNT
|| name
==NULL
|| *name
==0) {
1534 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1538 if(!isDataLoaded(pErrorCode
)) {
1542 /* construct the uppercase and lowercase of the name first */
1543 for(i
=0; i
<sizeof(upper
); ++i
) {
1544 if((c0
=*name
++)!=0) {
1545 upper
[i
]=uprv_toupper(c0
);
1546 lower
[i
]=uprv_tolower(c0
);
1548 upper
[i
]=lower
[i
]=0;
1552 if(i
==sizeof(upper
)) {
1553 /* name too long, there is no such character */
1554 *pErrorCode
= U_ILLEGAL_CHAR_FOUND
;
1557 // i==strlen(name)==strlen(lower)==strlen(upper)
1559 /* try extended names first */
1560 if (lower
[0] == '<') {
1561 if (nameChoice
== U_EXTENDED_CHAR_NAME
) {
1562 // Parse a string like "<category-HHHH>" where HHHH is a hex code point.
1563 if (lower
[--i
] == '>' && i
>= 3 && lower
[--i
] != '-') {
1564 while (i
>= 3 && lower
[--i
] != '-') {}
1566 if (i
>= 2 && lower
[i
] == '-') {
1571 for (++i
; lower
[i
] != '>'; ++i
) {
1572 if (lower
[i
] >= '0' && lower
[i
] <= '9') {
1573 cp
= (cp
<< 4) + lower
[i
] - '0';
1574 } else if (lower
[i
] >= 'a' && lower
[i
] <= 'f') {
1575 cp
= (cp
<< 4) + lower
[i
] - 'a' + 10;
1577 *pErrorCode
= U_ILLEGAL_CHAR_FOUND
;
1582 /* Now validate the category name.
1583 We could use a binary search, or a trie, if
1584 we really wanted to. */
1586 for (lower
[i
] = 0, cIdx
= 0; cIdx
< UPRV_LENGTHOF(charCatNames
); ++cIdx
) {
1588 if (!uprv_strcmp(lower
+ 1, charCatNames
[cIdx
])) {
1589 if (getCharCat(cp
) == cIdx
) {
1599 *pErrorCode
= U_ILLEGAL_CHAR_FOUND
;
1603 /* try algorithmic names now */
1604 p
=(uint32_t *)((uint8_t *)uCharNames
+uCharNames
->algNamesOffset
);
1606 algRange
=(AlgorithmicRange
*)(p
+1);
1608 if((cp
=findAlgName(algRange
, nameChoice
, upper
))!=0xffff) {
1611 algRange
=(AlgorithmicRange
*)((uint8_t *)algRange
+algRange
->size
);
1615 /* normal character name */
1616 findName
.otherName
=upper
;
1617 findName
.code
=error
;
1618 enumNames(uCharNames
, 0, UCHAR_MAX_VALUE
+ 1, DO_FIND_NAME
, &findName
, nameChoice
);
1619 if (findName
.code
== error
) {
1620 *pErrorCode
= U_ILLEGAL_CHAR_FOUND
;
1622 return findName
.code
;
/*
 * Enumerates character names for the code points from start up to, but not
 * including, limit.
 * Data-driven names are enumerated with enumNames() and algorithmic ranges
 * with enumAlgNames(); fn, context and nameChoice are forwarded to both.
 * limit is clamped to UCHAR_MAX_VALUE+1.
 * Sets U_ILLEGAL_ARGUMENT_ERROR when nameChoice>=U_CHAR_NAME_CHOICE_COUNT
 * or fn is NULL.
 */
1625 U_CAPI
void U_EXPORT2
1626 u_enumCharNames(UChar32 start
, UChar32 limit
,
1627 UEnumCharNamesFn
*fn
,
1629 UCharNameChoice nameChoice
,
1630 UErrorCode
*pErrorCode
) {
1631 AlgorithmicRange
*algRange
;
/* guard: missing error code pointer or an error already set by the caller */
1635 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
/* argument validation: unknown name choice or no callback */
1639 if(nameChoice
>=U_CHAR_NAME_CHOICE_COUNT
|| fn
==NULL
) {
1640 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
/*
 * Clamp limit to one past UCHAR_MAX_VALUE. The comparison is done on the
 * uint32_t value, so a negative limit (assuming UChar32 is signed - TODO
 * confirm) compares as a large number and is clamped as well.
 */
1644 if((uint32_t) limit
> UCHAR_MAX_VALUE
+ 1) {
1645 limit
= UCHAR_MAX_VALUE
+ 1;
/* empty-range check, also compared as unsigned values */
1647 if((uint32_t)start
>=(uint32_t)limit
) {
1651 if(!isDataLoaded(pErrorCode
)) {
1655 /* interleave the data-driven ones with the algorithmic ones */
1656 /* iterate over all algorithmic ranges; assume that they are in ascending order */
/*
 * p addresses the algorithmic-names area, algNamesOffset bytes from the
 * start of uCharNames; the first range record begins one uint32_t later.
 * NOTE(review): that leading uint32_t is presumably the range count -
 * confirm against the data format.
 */
1657 p
=(uint32_t *)((uint8_t *)uCharNames
+uCharNames
->algNamesOffset
);
1659 algRange
=(AlgorithmicRange
*)(p
+1);
1661 /* enumerate the character names before the current algorithmic range */
1662 /* here: start<limit */
1663 if((uint32_t)start
<algRange
->start
) {
/* the requested range ends before this algorithmic range begins */
1664 if((uint32_t)limit
<=algRange
->start
) {
1665 enumNames(uCharNames
, start
, limit
, fn
, context
, nameChoice
);
/* otherwise enumerate only up to the start of the algorithmic range */
1668 if(!enumNames(uCharNames
, start
, (UChar32
)algRange
->start
, fn
, context
, nameChoice
)) {
1671 start
=(UChar32
)algRange
->start
;
1673 /* enumerate the character names in the current algorithmic range */
1674 /* here: algRange->start<=start<limit */
1675 if((uint32_t)start
<=algRange
->end
) {
/* algRange->end is inclusive, hence the +1 when comparing with limit */
1676 if((uint32_t)limit
<=(algRange
->end
+1)) {
1677 enumAlgNames(algRange
, start
, limit
, fn
, context
, nameChoice
);
1680 if(!enumAlgNames(algRange
, start
, (UChar32
)algRange
->end
+1, fn
, context
, nameChoice
)) {
1683 start
=(UChar32
)algRange
->end
+1;
1685 /* continue to the next algorithmic range (here: start<limit) */
/* range records are variable-length: step forward by size bytes */
1686 algRange
=(AlgorithmicRange
*)((uint8_t *)algRange
+algRange
->size
);
1689 /* enumerate the character names after the last algorithmic range */
1690 enumNames(uCharNames
, start
, limit
, fn
, context
, nameChoice
);
/*
 * Returns gMaxNameLength when calcNameSetsLengths() reports success.
 * A local error code, initialized to U_ZERO_ERROR, is passed to
 * calcNameSetsLengths() and is not reported to the caller.
 */
1693 U_CAPI
int32_t U_EXPORT2
1694 uprv_getMaxCharNameLength() {
1695 UErrorCode errorCode
=U_ZERO_ERROR
;
1696 if(calcNameSetsLengths(&errorCode
)) {
1697 return gMaxNameLength
;
1704 * Converts the char set cset into a Unicode set uset.
1705 * @param cset Set of 256 bit flags corresponding to a set of chars.
1706 * @param uset USet to receive characters. Existing contents are deleted.
/*
 * NOTE(review): the receiving parameter is named sa (a USetAdder); the
 * characters are added through sa->add(sa->set, ...). No visible line
 * clears the set first - confirm the "contents are deleted" statement.
 */
1709 charSetToUSet(uint32_t cset
[8], const USetAdder
*sa
) {
1714 UErrorCode errorCode
;
1716 errorCode
=U_ZERO_ERROR
;
/* the name sets must be available before cset can be read */
1718 if(!calcNameSetsLengths(&errorCode
)) {
1722 /* build a char string with all chars that are used in character names */
/* test every byte value 0..255 against the bit set */
1724 for(i
=0; i
<256; ++i
) {
1725 if(SET_CONTAINS(cset
, i
)) {
1726 cs
[length
++]=(char)i
;
1730 /* convert the char string to a UChar string */
1731 u_charsToUChars(cs
, us
, length
);
1733 /* add each UChar to the USet */
1734 for(i
=0; i
<length
; ++i
) {
/*
 * Skip chars that could not be converted: a zero UChar is added only
 * when the source char was itself zero.
 */
1735 if(us
[i
]!=0 || cs
[i
]==0) { /* non-invariant chars become (UChar)0 */
1736 sa
->add(sa
->set
, us
[i
]);
1742 * Fills set with characters that are used in Unicode character names.
1743 * @param set USet to receive characters.
/*
 * The work is delegated to charSetToUSet() with the file-level gNameSet;
 * sa is the adder through which the characters are delivered.
 */
1745 U_CAPI
void U_EXPORT2
1746 uprv_getCharNameCharacters(const USetAdder
*sa
) {
1747 charSetToUSet(gNameSet
, sa
);
1750 /* data swapping ------------------------------------------------------------ */
/*
 * makeTokenMap: builds a byte permutation (map, 256 entries) used when the
 * input and output charset families differ.
 * ds          swapper describing input and output charsets
 * tokens      token table entries to inspect
 * tokenCount  number of entries to consider; compared against 256 below
 * map         receives the permutation
 * pErrorCode  in/out error code; nothing is done if it already indicates
 *             a failure
 */
1753 * The token table contains non-negative entries for token bytes,
1754 * and -1 for bytes that represent themselves in the data file's charset.
1755 * -2 entries are used for lead bytes.
1757 * Direct bytes (-1 entries) must be translated from the input charset family
1758 * to the output charset family.
1759 * makeTokenMap() writes a permutation mapping for this.
1760 * Use it once for single-/lead-byte tokens and once more for all trail byte
1761 * tokens. (';' is an unused trail byte marked with -1.)
1764 makeTokenMap(const UDataSwapper
*ds
,
1765 int16_t tokens
[], uint16_t tokenCount
,
1767 UErrorCode
*pErrorCode
) {
/* usedOutChar[b] is set once output byte b is the image of a direct byte */
1768 UBool usedOutChar
[256];
1772 if(U_FAILURE(*pErrorCode
)) {
1776 if(ds
->inCharset
==ds
->outCharset
) {
1777 /* Same charset family: identity permutation */
1778 for(i
=0; i
<256; ++i
) {
/*
 * Different charset families: start from an all-zero map.
 * NOTE(review): the second memset clears 256 bytes, which covers all of
 * usedOutChar only if sizeof(UBool)==1 - TODO confirm.
 */
1782 uprv_memset(map
, 0, 256);
1783 uprv_memset(usedOutChar
, 0, 256);
1785 if(tokenCount
>256) {
1789 /* set the direct bytes (byte 0 always maps to itself) */
1790 for(i
=1; i
<tokenCount
; ++i
) {
1792 /* convert the direct byte character */
/* c1 is the input-charset byte, c2 receives the output-charset byte */
1794 ds
->swapInvChars(ds
, &c1
, 1, &c2
, pErrorCode
);
1795 if(U_FAILURE(*pErrorCode
)) {
1796 udata_printError(ds
, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n",
1801 /* enter the converted character into the map and mark it used */
1803 usedOutChar
[c2
]=TRUE
;
1807 /* set the mappings for the rest of the permutation */
/* i walks the input bytes, j walks the candidate output bytes */
1808 for(i
=j
=1; i
<tokenCount
; ++i
) {
1809 /* set mappings that were not set for direct bytes */
1811 /* set an output byte value that was not used as an output byte above */
1812 while(usedOutChar
[j
]) {
1815 map
[i
]=(uint8_t)j
++;
1820 * leave mappings at tokenCount and above unset if tokenCount<256
1821 * because they won't be used
/*
 * Swaps unames.icu data between the platform properties described by ds.
 * Visible steps: swap the data header with udata_swapDataHeader(), verify
 * dataFormat "unam" with formatVersion[0]==1, read the four uint32_t
 * offsets at the start of the payload, then swap the token table, the
 * token strings, the group table, the group strings and the algorithmic
 * ranges. A preflighting pass only walks the algorithmic ranges to find
 * the end offset.
 * @return headerSize plus the offset just past the last algorithmic range
 * Errors set here: U_UNSUPPORTED_ERROR (unrecognized data format or
 * algorithmic range type), U_INDEX_OUTOFBOUNDS_ERROR (too few bytes),
 * U_MEMORY_ALLOCATION_ERROR (temporary token array).
 */
1826 U_CAPI
int32_t U_EXPORT2
1827 uchar_swapNames(const UDataSwapper
*ds
,
1828 const void *inData
, int32_t length
, void *outData
,
1829 UErrorCode
*pErrorCode
) {
1830 const UDataInfo
*pInfo
;
1833 const uint8_t *inBytes
;
1836 uint32_t tokenStringOffset
, groupsOffset
, groupStringOffset
, algNamesOffset
,
1837 offset
, i
, count
, stringsCount
;
1839 const AlgorithmicRange
*inRange
;
1840 AlgorithmicRange
*outRange
;
1842 /* udata_swapDataHeader checks the arguments */
1843 headerSize
=udata_swapDataHeader(ds
, inData
, length
, outData
, pErrorCode
);
1844 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
1848 /* check data format and format version */
/* the UDataInfo is read at byte offset 4 of the input data */
1849 pInfo
=(const UDataInfo
*)((const char *)inData
+4);
1851 pInfo
->dataFormat
[0]==0x75 && /* dataFormat="unam" */
1852 pInfo
->dataFormat
[1]==0x6e &&
1853 pInfo
->dataFormat
[2]==0x61 &&
1854 pInfo
->dataFormat
[3]==0x6d &&
1855 pInfo
->formatVersion
[0]==1
1857 udata_printError(ds
, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n",
1858 pInfo
->dataFormat
[0], pInfo
->dataFormat
[1],
1859 pInfo
->dataFormat
[2], pInfo
->dataFormat
[3],
1860 pInfo
->formatVersion
[0]);
1861 *pErrorCode
=U_UNSUPPORTED_ERROR
;
/* the payload starts right after the swapped header */
1865 inBytes
=(const uint8_t *)inData
+headerSize
;
1866 outBytes
=(uint8_t *)outData
+headerSize
;
/* the fourth uint32_t of the payload is the algorithmic-names offset */
1868 algNamesOffset
=ds
->readUInt32(((const uint32_t *)inBytes
)[3]);
1872 (uint32_t)length
<(algNamesOffset
=ds
->readUInt32(((const uint32_t *)inBytes
)[3]))
1874 udata_printError(ds
, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n",
1876 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
1882 /* preflighting: iterate through algorithmic ranges */
1883 offset
=algNamesOffset
;
/* a uint32_t range count precedes the range records */
1884 count
=ds
->readUInt32(*((const uint32_t *)(inBytes
+offset
)));
1887 for(i
=0; i
<count
; ++i
) {
1888 inRange
=(const AlgorithmicRange
*)(inBytes
+offset
);
/* each record stores its own byte length in size */
1889 offset
+=ds
->readUInt16(inRange
->size
);
/*
 * tokens[] is used as two halves of 256 entries: makeTokenMap() is called
 * below on tokens and on tokens+256, producing map[] and trailMap[].
 */
1896 int16_t tokens
[512];
1897 uint16_t tokenCount
;
1899 uint8_t map
[256], trailMap
[256];
1901 /* copy the data for inaccessible bytes */
1902 if(inBytes
!=outBytes
) {
1903 uprv_memcpy(outBytes
, inBytes
, length
);
1906 /* the initial 4 offsets first */
1907 tokenStringOffset
=ds
->readUInt32(((const uint32_t *)inBytes
)[0]);
1908 groupsOffset
=ds
->readUInt32(((const uint32_t *)inBytes
)[1]);
1909 groupStringOffset
=ds
->readUInt32(((const uint32_t *)inBytes
)[2]);
/* 16 bytes = the four uint32_t offsets */
1910 ds
->swapArray32(ds
, inBytes
, 16, outBytes
, pErrorCode
);
1913 * now the tokens table
1914 * it needs to be permutated along with the compressed name strings
/* the token table follows the 16 bytes of offsets */
1916 p
=(const uint16_t *)(inBytes
+16);
1917 q
=(uint16_t *)(outBytes
+16);
1919 /* read and swap the tokenCount */
1920 tokenCount
=ds
->readUInt16(*p
);
1921 ds
->swapArray16(ds
, p
, 2, q
, pErrorCode
);
1925 /* read the first 512 tokens and make the token maps */
1926 if(tokenCount
<=512) {
1931 for(i
=0; i
<count
; ++i
) {
1932 tokens
[i
]=udata_readInt16(ds
, p
[i
]);
1935 tokens
[i
]=0; /* fill the rest of the tokens array if tokenCount<512 */
/* first call: single-/lead-byte tokens; second call: trail-byte tokens */
1937 makeTokenMap(ds
, tokens
, tokenCount
, map
, pErrorCode
);
1938 makeTokenMap(ds
, tokens
+256, (uint16_t)(tokenCount
>256 ? tokenCount
-256 : 0), trailMap
, pErrorCode
);
1939 if(U_FAILURE(*pErrorCode
)) {
1944 * swap and permutate the tokens
1945 * go through a temporary array to support in-place swapping
/* tokenCount*2 bytes: one uint16_t per token */
1947 temp
=(uint16_t *)uprv_malloc(tokenCount
*2);
1949 udata_printError(ds
, "out of memory swapping %u unames.icu tokens\n",
1951 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
1955 /* swap and permutate single-/lead-byte tokens */
1956 for(i
=0; i
<tokenCount
&& i
<256; ++i
) {
1957 ds
->swapArray16(ds
, p
+i
, 2, temp
+map
[i
], pErrorCode
);
1960 /* swap and permutate trail-byte tokens */
/* keep the upper bits of the index and permute only its low byte */
1961 for(; i
<tokenCount
; ++i
) {
1962 ds
->swapArray16(ds
, p
+i
, 2, temp
+(i
&0xffffff00)+trailMap
[i
&0xff], pErrorCode
);
1965 /* copy the result into the output and free the temporary array */
1966 uprv_memcpy(q
, temp
, tokenCount
*2);
1970 * swap the token strings but not a possible padding byte after
1971 * the terminating NUL of the last string
/* the token strings occupy the bytes from tokenStringOffset to groupsOffset */
1973 udata_swapInvStringBlock(ds
, inBytes
+tokenStringOffset
, (int32_t)(groupsOffset
-tokenStringOffset
),
1974 outBytes
+tokenStringOffset
, pErrorCode
);
1975 if(U_FAILURE(*pErrorCode
)) {
1976 udata_printError(ds
, "uchar_swapNames(token strings) failed\n");
1980 /* swap the group table */
/* byte length: one uint16_t count plus count groups of three uint16_t */
1981 count
=ds
->readUInt16(*((const uint16_t *)(inBytes
+groupsOffset
)));
1982 ds
->swapArray16(ds
, inBytes
+groupsOffset
, (int32_t)((1+count
*3)*2),
1983 outBytes
+groupsOffset
, pErrorCode
);
1986 * swap the group strings
1987 * swap the string bytes but not the nibble-encoded string lengths
/* only needed when the charset family changes */
1989 if(ds
->inCharset
!=ds
->outCharset
) {
1990 uint16_t offsets
[LINES_PER_GROUP
+1], lengths
[LINES_PER_GROUP
+1];
1992 const uint8_t *inStrings
, *nextInStrings
;
1993 uint8_t *outStrings
;
1997 inStrings
=inBytes
+groupStringOffset
;
1998 outStrings
=outBytes
+groupStringOffset
;
/* the group strings run from groupStringOffset up to algNamesOffset */
2000 stringsCount
=algNamesOffset
-groupStringOffset
;
2002 /* iterate through string groups until only a few padding bytes are left */
2003 while(stringsCount
>32) {
/* expandGroupLengths() returns the position just past the length bytes */
2004 nextInStrings
=expandGroupLengths(inStrings
, offsets
, lengths
);
2006 /* move past the length bytes */
2007 stringsCount
-=(uint32_t)(nextInStrings
-inStrings
);
2008 outStrings
+=nextInStrings
-inStrings
;
2009 inStrings
=nextInStrings
;
2011 count
=offsets
[31]+lengths
[31]; /* total number of string bytes in this group */
2012 stringsCount
-=count
;
2014 /* swap the string bytes using map[] and trailMap[] */
2017 *outStrings
++=map
[c
];
2021 /* token lead byte: swap the trail byte, too */
2022 *outStrings
++=trailMap
[*inStrings
++];
2029 /* swap the algorithmic ranges */
2030 offset
=algNamesOffset
;
2031 count
=ds
->readUInt32(*((const uint32_t *)(inBytes
+offset
)));
/* swap the 4-byte range count itself */
2032 ds
->swapArray32(ds
, inBytes
+offset
, 4, outBytes
+offset
, pErrorCode
);
2035 for(i
=0; i
<count
; ++i
) {
/* bounds check before the record at offset is read */
2036 if(offset
>(uint32_t)length
) {
2037 udata_printError(ds
, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n",
2039 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
2043 inRange
=(const AlgorithmicRange
*)(inBytes
+offset
);
2044 outRange
=(AlgorithmicRange
*)(outBytes
+offset
);
/* advance offset now; the remaining code works through inRange and outRange */
2045 offset
+=ds
->readUInt16(inRange
->size
);
/*
 * The first 8 bytes of the record are swapped as 32-bit units and the size
 * field as a 16-bit unit.
 * NOTE(review): the 8 bytes are presumably the start and end fields -
 * confirm against the AlgorithmicRange definition.
 */
2047 ds
->swapArray32(ds
, inRange
, 8, outRange
, pErrorCode
);
2048 ds
->swapArray16(ds
, &inRange
->size
, 2, &outRange
->size
, pErrorCode
);
2049 switch(inRange
->type
) {
2051 /* swap prefix string */
/* the prefix string starts directly after the fixed-size record (inRange+1) */
2052 ds
->swapInvChars(ds
, inRange
+1, (int32_t)uprv_strlen((const char *)(inRange
+1)),
2053 outRange
+1, pErrorCode
);
2054 if(U_FAILURE(*pErrorCode
)) {
2055 udata_printError(ds
, "uchar_swapNames(prefix string of algorithmic range %u) failed\n",
2062 /* swap factors and the prefix and factor strings */
2063 uint32_t factorsCount
;
/* variant holds the number of uint16_t factors that follow the record */
2065 factorsCount
=inRange
->variant
;
2066 p
=(const uint16_t *)(inRange
+1);
2067 q
=(uint16_t *)(outRange
+1);
2068 ds
->swapArray16(ds
, p
, (int32_t)(factorsCount
*2), q
, pErrorCode
);
2070 /* swap the strings, up to the last terminating NUL */
/* start with every byte up to the end of the record, then trim trailing non-NUL bytes */
2073 stringsCount
=(uint32_t)((inBytes
+offset
)-(const uint8_t *)p
);
2074 while(stringsCount
>0 && ((const uint8_t *)p
)[stringsCount
-1]!=0) {
2077 ds
->swapInvChars(ds
, p
, (int32_t)stringsCount
, q
, pErrorCode
);
2081 udata_printError(ds
, "uchar_swapNames(): unknown type %u of algorithmic range %u\n",
2083 *pErrorCode
=U_UNSUPPORTED_ERROR
;
/* total size: header plus the payload offset reached after the last range */
2089 return headerSize
+(int32_t)offset
;
2093 * Hey, Emacs, please set the following:
2096 * indent-tabs-mode: nil