]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/unames.c
ICU-8.11.1.tar.gz
[apple/icu.git] / icuSources / common / unames.c
CommitLineData
b75a7d8f
A
1/*
2******************************************************************************
3*
73c04bcf 4* Copyright (C) 1999-2006, International Business Machines
b75a7d8f
A
5* Corporation and others. All Rights Reserved.
6*
7******************************************************************************
8* file name: unames.c
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 1999oct04
14* created by: Markus W. Scherer
15*/
16
b75a7d8f 17#include "unicode/utypes.h"
374ca955 18#include "unicode/putil.h"
b75a7d8f
A
19#include "unicode/uchar.h"
20#include "unicode/udata.h"
b75a7d8f
A
21#include "ustr_imp.h"
22#include "umutex.h"
23#include "cmemory.h"
24#include "cstring.h"
25#include "ucln_cmn.h"
374ca955 26#include "udataswp.h"
b75a7d8f
A
27#include "uprops.h"
28
29/* prototypes ------------------------------------------------------------- */
30
/* Element count of a true array (must not be applied to a pointer). */
#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))

/* Name and type of the data item opened by isDataLoaded(). */
static const char DATA_NAME[] = "unames";
static const char DATA_TYPE[] = "icu";

/*
 * Names are stored in groups of LINES_PER_GROUP consecutive code points;
 * the group is selected by code>>GROUP_SHIFT and the line inside the
 * group by code&GROUP_MASK.
 */
#define GROUP_SHIFT 5
#define LINES_PER_GROUP (1UL<<GROUP_SHIFT)
#define GROUP_MASK (LINES_PER_GROUP-1)
39
40typedef struct {
41 uint16_t groupMSB,
42 offsetHigh, offsetLow; /* avoid padding */
43} Group;
44
45typedef struct {
46 uint32_t start, end;
47 uint8_t type, variant;
48 uint16_t size;
49} AlgorithmicRange;
50
51typedef struct {
52 uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset;
53} UCharNames;
54
55typedef struct {
56 const char *otherName;
57 UChar32 code;
58} FindName;
59
60#define DO_FIND_NAME NULL
61
62static UDataMemory *uCharNamesData=NULL;
63static UCharNames *uCharNames=NULL;
64static UErrorCode gLoadErrorCode=U_ZERO_ERROR;
65
66/*
67 * Maximum length of character names (regular & 1.0).
b75a7d8f 68 */
73c04bcf 69static int32_t gMaxNameLength=0;
b75a7d8f
A
70
71/*
72 * Set of chars used in character names (regular & 1.0).
b75a7d8f
A
73 * Chars are platform-dependent (can be EBCDIC).
74 */
73c04bcf 75static uint32_t gNameSet[8]={ 0 };
b75a7d8f 76
b75a7d8f
A
/*
 * Extended character categories, numbered directly after the standard
 * ones so that they can index charCatNames[].
 * Each expansion is fully parenthesized so that the macros can be used
 * safely inside larger expressions (e.g. 2*U_LEAD_SURROGATE).
 */
#define U_NONCHARACTER_CODE_POINT (U_CHAR_CATEGORY_COUNT)
#define U_LEAD_SURROGATE (U_CHAR_CATEGORY_COUNT + 1)
#define U_TRAIL_SURROGATE (U_CHAR_CATEGORY_COUNT + 2)

#define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3)
82
73c04bcf
A
83static const char * const charCatNames[U_CHAR_EXTENDED_CATEGORY_COUNT] = {
84 "unassigned",
85 "uppercase letter",
86 "lowercase letter",
87 "titlecase letter",
88 "modifier letter",
89 "other letter",
90 "non spacing mark",
91 "enclosing mark",
92 "combining spacing mark",
93 "decimal digit number",
94 "letter number",
95 "other number",
96 "space separator",
97 "line separator",
98 "paragraph separator",
99 "control",
100 "format",
101 "private use area",
102 "surrogate",
103 "dash punctuation",
104 "start punctuation",
105 "end punctuation",
106 "connector punctuation",
107 "other punctuation",
108 "math symbol",
109 "currency symbol",
110 "modifier symbol",
111 "other symbol",
112 "initial punctuation",
113 "final punctuation",
114 "noncharacter",
115 "lead surrogate",
116 "trail surrogate"
117};
b75a7d8f 118
374ca955 119/* implementation ----------------------------------------------------------- */
b75a7d8f 120
374ca955
A
121static UBool U_CALLCONV unames_cleanup(void)
122{
123 if(uCharNamesData) {
124 udata_close(uCharNamesData);
125 uCharNamesData = NULL;
126 }
127 if(uCharNames) {
128 uCharNames = NULL;
129 }
130 gMaxNameLength=0;
131 return TRUE;
132}
b75a7d8f 133
374ca955
A
134static UBool U_CALLCONV
135isAcceptable(void *context,
136 const char *type, const char *name,
137 const UDataInfo *pInfo) {
138 return (UBool)(
139 pInfo->size>=20 &&
140 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
141 pInfo->charsetFamily==U_CHARSET_FAMILY &&
142 pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */
143 pInfo->dataFormat[1]==0x6e &&
144 pInfo->dataFormat[2]==0x61 &&
145 pInfo->dataFormat[3]==0x6d &&
146 pInfo->formatVersion[0]==1);
147}
b75a7d8f 148
374ca955
A
149static UBool
150isDataLoaded(UErrorCode *pErrorCode) {
151 /* load UCharNames from file if necessary */
152 UBool isCached;
b75a7d8f 153
374ca955 154 /* do this because double-checked locking is broken */
73c04bcf 155 UMTX_CHECK(NULL, (uCharNames!=NULL), isCached);
b75a7d8f 156
374ca955
A
157 if(!isCached) {
158 UCharNames *names;
159 UDataMemory *data;
b75a7d8f 160
374ca955
A
161 /* check error code from previous attempt */
162 if(U_FAILURE(gLoadErrorCode)) {
163 *pErrorCode=gLoadErrorCode;
164 return FALSE;
b75a7d8f 165 }
b75a7d8f 166
374ca955
A
167 /* open the data outside the mutex block */
168 data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, pErrorCode);
169 if(U_FAILURE(*pErrorCode)) {
170 gLoadErrorCode=*pErrorCode;
171 return FALSE;
b75a7d8f 172 }
b75a7d8f 173
374ca955 174 names=(UCharNames *)udata_getMemory(data);
b75a7d8f 175
374ca955
A
176 /* in the mutex block, set the data for this process */
177 {
178 umtx_lock(NULL);
179 if(uCharNames==NULL) {
180 uCharNames=names;
181 uCharNamesData=data;
182 data=NULL;
183 names=NULL;
184 ucln_common_registerCleanup(UCLN_COMMON_UNAMES, unames_cleanup);
185 }
186 umtx_unlock(NULL);
187 }
b75a7d8f 188
374ca955
A
189 /* if a different thread set it first, then close the extra data */
190 if(data!=NULL) {
191 udata_close(data); /* NULL if it was set correctly */
192 }
b75a7d8f 193 }
374ca955
A
194 return TRUE;
195}
b75a7d8f 196
374ca955
A
/*
 * Appends c to the output if there is room left, and always counts it.
 *   buffer        output pointer, advanced when a char is stored
 *   bufferLength  remaining capacity, decremented when a char is stored
 *   bufferPos     running length of the full (untruncated) output
 * buffer, bufferLength and bufferPos must be modifiable lvalues.
 * Wrapped in do { } while(0) so that "WRITE_CHAR(...);" is exactly one
 * statement and can be used as the body of an unbraced if/else.
 */
#define WRITE_CHAR(buffer, bufferLength, bufferPos, c) do { \
    if((bufferLength)>0) { \
        *(buffer)++=(c); \
        --(bufferLength); \
    } \
    ++(bufferPos); \
} while(0)

#define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT
b75a7d8f 206
374ca955
A
207/*
208 * Important: expandName() and compareName() are almost the same -
209 * apply fixes to both.
210 *
211 * UnicodeData.txt uses ';' as a field separator, so no
212 * field can contain ';' as part of its contents.
213 * In unames.dat, it is marked as token[';']==-1 only if the
214 * semicolon is used in the data file - which is iff we
215 * have Unicode 1.0 names or ISO comments.
216 * So, it will be token[';']==-1 if we store U1.0 names/ISO comments
217 * although we know that it will never be part of a name.
218 */
219static uint16_t
220expandName(UCharNames *names,
221 const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
222 char *buffer, uint16_t bufferLength) {
223 uint16_t *tokens=(uint16_t *)names+8;
224 uint16_t token, tokenCount=*tokens++, bufferPos=0;
225 uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
226 uint8_t c;
227
228 if(nameChoice==U_UNICODE_10_CHAR_NAME || nameChoice==U_ISO_COMMENT) {
229 /*
230 * skip the modern name if it is not requested _and_
231 * if the semicolon byte value is a character, not a token number
232 */
233 if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
234 while(nameLength>0) {
235 --nameLength;
236 if(*name++==';') {
237 break;
238 }
239 }
240 if(nameChoice==U_ISO_COMMENT) {
241 /* skip the Unicode 1.0 name as well to get the ISO comment */
242 while(nameLength>0) {
243 --nameLength;
244 if(*name++==';') {
245 break;
246 }
247 }
248 }
249 } else {
250 /*
251 * the semicolon byte value is a token number, therefore
252 * only modern names are stored in unames.dat and there is no
253 * such requested Unicode 1.0 name here
254 */
255 nameLength=0;
256 }
b75a7d8f
A
257 }
258
259 /* write each letter directly, and write a token word per token */
260 while(nameLength>0) {
261 --nameLength;
262 c=*name++;
263
264 if(c>=tokenCount) {
265 if(c!=';') {
266 /* implicit letter */
267 WRITE_CHAR(buffer, bufferLength, bufferPos, c);
268 } else {
269 /* finished */
270 break;
271 }
272 } else {
273 token=tokens[c];
274 if(token==(uint16_t)(-2)) {
275 /* this is a lead byte for a double-byte token */
276 token=tokens[c<<8|*name++];
277 --nameLength;
278 }
279 if(token==(uint16_t)(-1)) {
280 if(c!=';') {
281 /* explicit letter */
282 WRITE_CHAR(buffer, bufferLength, bufferPos, c);
283 } else {
284 /* stop, but skip the semicolon if we are seeking
285 extended names and there was no 2.0 name but there
286 is a 1.0 name. */
287 if(!bufferPos && nameChoice == U_EXTENDED_CHAR_NAME) {
288 if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
289 continue;
290 }
291 }
292 /* finished */
293 break;
294 }
295 } else {
296 /* write token word */
297 uint8_t *tokenString=tokenStrings+token;
298 while((c=*tokenString++)!=0) {
299 WRITE_CHAR(buffer, bufferLength, bufferPos, c);
300 }
301 }
302 }
303 }
304
305 /* zero-terminate */
306 if(bufferLength>0) {
307 *buffer=0;
308 }
309
310 return bufferPos;
311}
312
313/*
314 * compareName() is almost the same as expandName() except that it compares
315 * the currently expanded name to an input name.
316 * It returns the match/no match result as soon as possible.
317 */
318static UBool
319compareName(UCharNames *names,
320 const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
321 const char *otherName) {
322 uint16_t *tokens=(uint16_t *)names+8;
323 uint16_t token, tokenCount=*tokens++;
324 uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
325 uint8_t c;
326 const char *origOtherName = otherName;
327
328 if(nameChoice==U_UNICODE_10_CHAR_NAME) {
329 /*
330 * skip the modern name if it is not requested _and_
331 * if the semicolon byte value is a character, not a token number
332 */
333 if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
334 while(nameLength>0) {
335 --nameLength;
336 if(*name++==';') {
337 break;
338 }
339 }
340 } else {
341 /*
342 * the semicolon byte value is a token number, therefore
343 * only modern names are stored in unames.dat and there is no
344 * such requested Unicode 1.0 name here
345 */
346 nameLength=0;
347 }
348 }
349
350 /* compare each letter directly, and compare a token word per token */
351 while(nameLength>0) {
352 --nameLength;
353 c=*name++;
354
355 if(c>=tokenCount) {
356 if(c!=';') {
357 /* implicit letter */
358 if((char)c!=*otherName++) {
359 return FALSE;
360 }
361 } else {
362 /* finished */
363 break;
364 }
365 } else {
366 token=tokens[c];
367 if(token==(uint16_t)(-2)) {
368 /* this is a lead byte for a double-byte token */
369 token=tokens[c<<8|*name++];
370 --nameLength;
371 }
372 if(token==(uint16_t)(-1)) {
373 if(c!=';') {
374 /* explicit letter */
375 if((char)c!=*otherName++) {
376 return FALSE;
377 }
378 } else {
379 /* stop, but skip the semicolon if we are seeking
380 extended names and there was no 2.0 name but there
381 is a 1.0 name. */
382 if(otherName == origOtherName && nameChoice == U_EXTENDED_CHAR_NAME) {
383 if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
384 continue;
385 }
386 }
387 /* finished */
388 break;
389 }
390 } else {
391 /* write token word */
392 uint8_t *tokenString=tokenStrings+token;
393 while((c=*tokenString++)!=0) {
394 if((char)c!=*otherName++) {
395 return FALSE;
396 }
397 }
398 }
399 }
400 }
374ca955
A
401
402 /* complete match? */
403 return (UBool)(*otherName==0);
404}
405
374ca955
A
406static uint8_t getCharCat(UChar32 cp) {
407 uint8_t cat;
408
409 if (UTF_IS_UNICODE_NONCHAR(cp)) {
410 return U_NONCHARACTER_CODE_POINT;
411 }
412
413 if ((cat = u_charType(cp)) == U_SURROGATE) {
414 cat = UTF_IS_LEAD(cp) ? U_LEAD_SURROGATE : U_TRAIL_SURROGATE;
415 }
416
417 return cat;
418}
419
420static const char *getCharCatName(UChar32 cp) {
421 uint8_t cat = getCharCat(cp);
422
423 /* Return unknown if the table of names above is not up to
424 date. */
425
426 if (cat >= LENGTHOF(charCatNames)) {
427 return "unknown";
428 } else {
429 return charCatNames[cat];
430 }
431}
432
/*
 * Writes the extended name "<category-XXXX>" of code into buffer, where
 * category comes from getCharCatName() and XXXX is the code point in
 * uppercase hexadecimal with at least 4 digits.
 *
 * At most bufferLength chars are stored; the output is NOT zero-terminated
 * here. Returns the length of the full (untruncated) name.
 *
 * The hex digits are written only at positions that are still inside the
 * buffer, so a buffer that is too short receives the leading digits that
 * fit and nothing is stored past its end.
 */
static uint16_t getExtName(uint32_t code, char *buffer, uint16_t bufferLength) {
    const char *catname = getCharCatName(code);
    uint16_t length = 0;
    uint16_t stored;

    uint32_t cp;    /* unsigned so that the shift loops always terminate */
    int ndigits, i;

    WRITE_CHAR(buffer, bufferLength, length, '<');
    /* length is 1 after the '<', so catname[length - 1] walks the category name */
    while (catname[length - 1]) {
        WRITE_CHAR(buffer, bufferLength, length, catname[length - 1]);
    }
    WRITE_CHAR(buffer, bufferLength, length, '-');

    /* count the hex digits of code, with a minimum of 4 */
    for (cp = code, ndigits = 0; cp; ++ndigits, cp >>= 4) {
    }
    if (ndigits < 4) {
        ndigits = 4;
    }

    /* produce the digits from the least significant one, store those that fit */
    for (cp = code, i = ndigits; i > 0; cp >>= 4) {
        uint8_t v = (uint8_t)(cp & 0xf);
        if (--i < (int)bufferLength) {
            buffer[i] = (char)(v < 10 ? '0' + v : 'A' + v - 10);
        }
    }

    /* advance past the digits that were actually stored */
    stored = (uint16_t)(ndigits < (int)bufferLength ? ndigits : bufferLength);
    buffer += stored;
    bufferLength = (uint16_t)(bufferLength - stored);
    length = (uint16_t)(length + ndigits);

    WRITE_CHAR(buffer, bufferLength, length, '>');

    return length;
}
459
460/*
461 * getGroup() does a binary search for the group that contains the
462 * Unicode code point "code".
463 * The return value is always a valid Group* that may contain "code"
464 * or else is the highest group before "code".
465 * If the lowest group is after "code", then that one is returned.
466 */
467static Group *
468getGroup(UCharNames *names, uint32_t code) {
469 uint16_t groupMSB=(uint16_t)(code>>GROUP_SHIFT),
470 start=0,
471 limit=*(uint16_t *)((char *)names+names->groupsOffset),
472 number;
473 Group *groups=(Group *)((char *)names+names->groupsOffset+2);
474
475 /* binary search for the group of names that contains the one for code */
476 while(start<limit-1) {
477 number=(uint16_t)((start+limit)/2);
478 if(groupMSB<groups[number].groupMSB) {
479 limit=number;
480 } else {
481 start=number;
482 }
483 }
484
485 /* return this regardless of whether it is an exact match */
486 return groups+start;
487}
488
489/*
490 * expandGroupLengths() reads a block of compressed lengths of 32 strings and
491 * expands them into offsets and lengths for each string.
492 * Lengths are stored with a variable-width encoding in consecutive nibbles:
493 * If a nibble<0xc, then it is the length itself (0=empty string).
494 * If a nibble>=0xc, then it forms a length value with the following nibble.
495 * Calculation see below.
496 * The offsets and lengths arrays must be at least 33 (one more) long because
497 * there is no check here at the end if the last nibble is still used.
498 */
499static const uint8_t *
500expandGroupLengths(const uint8_t *s,
501 uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_PER_GROUP+1]) {
502 /* read the lengths of the 32 strings in this group and get each string's offset */
503 uint16_t i=0, offset=0, length=0;
504 uint8_t lengthByte;
505
506 /* all 32 lengths must be read to get the offset of the first group string */
507 while(i<LINES_PER_GROUP) {
508 lengthByte=*s++;
509
510 /* read even nibble - MSBs of lengthByte */
511 if(length>=12) {
512 /* double-nibble length spread across two bytes */
513 length=(uint16_t)(((length&0x3)<<4|lengthByte>>4)+12);
514 lengthByte&=0xf;
515 } else if((lengthByte /* &0xf0 */)>=0xc0) {
516 /* double-nibble length spread across this one byte */
517 length=(uint16_t)((lengthByte&0x3f)+12);
518 } else {
519 /* single-nibble length in MSBs */
520 length=(uint16_t)(lengthByte>>4);
521 lengthByte&=0xf;
522 }
523
524 *offsets++=offset;
525 *lengths++=length;
526
527 offset+=length;
528 ++i;
529
530 /* read odd nibble - LSBs of lengthByte */
531 if((lengthByte&0xf0)==0) {
532 /* this nibble was not consumed for a double-nibble length above */
533 length=lengthByte;
534 if(length<12) {
535 /* single-nibble length in LSBs */
536 *offsets++=offset;
537 *lengths++=length;
538
539 offset+=length;
540 ++i;
541 }
542 } else {
543 length=0; /* prevent double-nibble detection in the next iteration */
544 }
545 }
546
547 /* now, s is at the first group string */
548 return s;
549}
550
551static uint16_t
552expandGroupName(UCharNames *names, Group *group,
553 uint16_t lineNumber, UCharNameChoice nameChoice,
554 char *buffer, uint16_t bufferLength) {
555 uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
556 const uint8_t *s=(uint8_t *)names+names->groupStringOffset+
557 (group->offsetHigh<<16|group->offsetLow);
558 s=expandGroupLengths(s, offsets, lengths);
559 return expandName(names, s+offsets[lineNumber], lengths[lineNumber], nameChoice,
560 buffer, bufferLength);
561}
562
563static uint16_t
564getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice,
565 char *buffer, uint16_t bufferLength) {
566 Group *group=getGroup(names, code);
567 if((uint16_t)(code>>GROUP_SHIFT)==group->groupMSB) {
568 return expandGroupName(names, group, (uint16_t)(code&GROUP_MASK), nameChoice,
569 buffer, bufferLength);
570 } else {
571 /* group not found */
572 /* zero-terminate */
573 if(bufferLength>0) {
574 *buffer=0;
575 }
576 return 0;
577 }
b75a7d8f
A
578}
579
580/*
581 * enumGroupNames() enumerates all the names in a 32-group
582 * and either calls the enumerator function or finds a given input name.
583 */
584static UBool
585enumGroupNames(UCharNames *names, Group *group,
586 UChar32 start, UChar32 end,
587 UEnumCharNamesFn *fn, void *context,
588 UCharNameChoice nameChoice) {
589 uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
590 const uint8_t *s=(uint8_t *)names+names->groupStringOffset+
591 (group->offsetHigh<<16|group->offsetLow);
592
593 s=expandGroupLengths(s, offsets, lengths);
594 if(fn!=DO_FIND_NAME) {
595 char buffer[200];
596 uint16_t length;
597
598 while(start<=end) {
599 length=expandName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, buffer, sizeof(buffer));
600 if (!length && nameChoice == U_EXTENDED_CHAR_NAME) {
601 buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
602 }
603 /* here, we assume that the buffer is large enough */
604 if(length>0) {
605 if(!fn(context, start, nameChoice, buffer, length)) {
606 return FALSE;
607 }
608 }
609 ++start;
610 }
611 } else {
612 const char *otherName=((FindName *)context)->otherName;
613 while(start<=end) {
614 if(compareName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, otherName)) {
615 ((FindName *)context)->code=start;
616 return FALSE;
617 }
618 ++start;
619 }
620 }
621 return TRUE;
622}
623
624/*
625 * enumExtNames enumerate extended names.
626 * It only needs to do it if it is called with a real function and not
627 * with the dummy DO_FIND_NAME, because u_charFromName() does a check
628 * for extended names by itself.
629 */
630static UBool
631enumExtNames(UChar32 start, UChar32 end,
632 UEnumCharNamesFn *fn, void *context)
633{
634 if(fn!=DO_FIND_NAME) {
635 char buffer[200];
636 uint16_t length;
637
638 while(start<=end) {
639 buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
640 /* here, we assume that the buffer is large enough */
641 if(length>0) {
642 if(!fn(context, start, U_EXTENDED_CHAR_NAME, buffer, length)) {
643 return FALSE;
644 }
645 }
646 ++start;
647 }
648 }
649
650 return TRUE;
651}
652
653static UBool
654enumNames(UCharNames *names,
655 UChar32 start, UChar32 limit,
656 UEnumCharNamesFn *fn, void *context,
657 UCharNameChoice nameChoice) {
658 uint16_t startGroupMSB, endGroupMSB, groupCount;
659 Group *group, *groupLimit;
660
661 startGroupMSB=(uint16_t)(start>>GROUP_SHIFT);
662 endGroupMSB=(uint16_t)((limit-1)>>GROUP_SHIFT);
663
664 /* find the group that contains start, or the highest before it */
665 group=getGroup(names, start);
666
667 if(startGroupMSB==endGroupMSB) {
668 if(startGroupMSB==group->groupMSB) {
669 /* if start and limit-1 are in the same group, then enumerate only in that one */
670 return enumGroupNames(names, group, start, limit-1, fn, context, nameChoice);
671 }
672 } else {
673 groupCount=*(uint16_t *)((char *)names+names->groupsOffset);
674 groupLimit=(Group *)((char *)names+names->groupsOffset+2)+groupCount;
675
676 if(startGroupMSB==group->groupMSB) {
677 /* enumerate characters in the partial start group */
678 if((start&GROUP_MASK)!=0) {
679 if(!enumGroupNames(names, group,
680 start, ((UChar32)startGroupMSB<<GROUP_SHIFT)+LINES_PER_GROUP-1,
681 fn, context, nameChoice)) {
682 return FALSE;
683 }
684 ++group; /* continue with the next group */
685 }
686 } else if(startGroupMSB>group->groupMSB) {
687 /* make sure that we start enumerating with the first group after start */
688 if (group + 1 < groupLimit && (group + 1)->groupMSB > startGroupMSB && nameChoice == U_EXTENDED_CHAR_NAME) {
689 UChar32 end = (group + 1)->groupMSB << GROUP_SHIFT;
690 if (end > limit) {
691 end = limit;
692 }
693 if (!enumExtNames(start, end - 1, fn, context)) {
694 return FALSE;
695 }
696 }
697 ++group;
698 }
699
700 /* enumerate entire groups between the start- and end-groups */
701 while(group<groupLimit && group->groupMSB<endGroupMSB) {
702 start=(UChar32)group->groupMSB<<GROUP_SHIFT;
703 if(!enumGroupNames(names, group, start, start+LINES_PER_GROUP-1, fn, context, nameChoice)) {
704 return FALSE;
705 }
706 if (group + 1 < groupLimit && (group + 1)->groupMSB > group->groupMSB + 1 && nameChoice == U_EXTENDED_CHAR_NAME) {
707 UChar32 end = (group + 1)->groupMSB << GROUP_SHIFT;
708 if (end > limit) {
709 end = limit;
710 }
711 if (!enumExtNames((group->groupMSB + 1) << GROUP_SHIFT, end - 1, fn, context)) {
712 return FALSE;
713 }
714 }
715 ++group;
716 }
717
718 /* enumerate within the end group (group->groupMSB==endGroupMSB) */
719 if(group<groupLimit && group->groupMSB==endGroupMSB) {
720 return enumGroupNames(names, group, (limit-1)&~GROUP_MASK, limit-1, fn, context, nameChoice);
721 } else if (nameChoice == U_EXTENDED_CHAR_NAME && group == groupLimit) {
722 UChar32 next = ((group - 1)->groupMSB + 1) << GROUP_SHIFT;
723 if (next > start) {
724 start = next;
725 }
726 } else {
727 return TRUE;
728 }
729 }
730
731 /* we have not found a group, which means everything is made of
732 extended names. */
733 if (nameChoice == U_EXTENDED_CHAR_NAME) {
734 if (limit > UCHAR_MAX_VALUE + 1) {
735 limit = UCHAR_MAX_VALUE + 1;
736 }
737 return enumExtNames(start, limit - 1, fn, context);
738 }
739
740 return TRUE;
741}
742
374ca955
A
743static uint16_t
744writeFactorSuffix(const uint16_t *factors, uint16_t count,
745 const char *s, /* suffix elements */
746 uint32_t code,
747 uint16_t indexes[8], /* output fields from here */
748 const char *elementBases[8], const char *elements[8],
749 char *buffer, uint16_t bufferLength) {
750 uint16_t i, factor, bufferPos=0;
751 char c;
752
753 /* write elements according to the factors */
754
755 /*
756 * the factorized elements are determined by modulo arithmetic
757 * with the factors of this algorithm
758 *
759 * note that for fewer operations, count is decremented here
760 */
761 --count;
762 for(i=count; i>0; --i) {
763 factor=factors[i];
764 indexes[i]=(uint16_t)(code%factor);
765 code/=factor;
766 }
767 /*
768 * we don't need to calculate the last modulus because start<=code<=end
769 * guarantees here that code<=factors[0]
770 */
771 indexes[0]=(uint16_t)code;
772
773 /* write each element */
774 for(;;) {
775 if(elementBases!=NULL) {
776 *elementBases++=s;
777 }
778
779 /* skip indexes[i] strings */
780 factor=indexes[i];
781 while(factor>0) {
782 while(*s++!=0) {}
783 --factor;
784 }
785 if(elements!=NULL) {
786 *elements++=s;
787 }
788
789 /* write element */
790 while((c=*s++)!=0) {
791 WRITE_CHAR(buffer, bufferLength, bufferPos, c);
792 }
793
794 /* we do not need to perform the rest of this loop for i==count - break here */
795 if(i>=count) {
796 break;
797 }
798
799 /* skip the rest of the strings for this factors[i] */
800 factor=(uint16_t)(factors[i]-indexes[i]-1);
801 while(factor>0) {
802 while(*s++!=0) {}
803 --factor;
804 }
805
806 ++i;
807 }
808
809 /* zero-terminate */
810 if(bufferLength>0) {
811 *buffer=0;
812 }
813
814 return bufferPos;
815}
816
b75a7d8f
A
817/*
818 * Important:
819 * Parts of findAlgName() are almost the same as some of getAlgName().
820 * Fixes must be applied to both.
821 */
822static uint16_t
823getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice,
824 char *buffer, uint16_t bufferLength) {
825 uint16_t bufferPos=0;
826
827 /*
828 * Do not write algorithmic Unicode 1.0 names because
829 * Unihan names are the same as the modern ones,
830 * extension A was only introduced with Unicode 3.0, and
831 * the Hangul syllable block was moved and changed around Unicode 1.1.5.
832 */
833 if(nameChoice==U_UNICODE_10_CHAR_NAME) {
834 /* zero-terminate */
835 if(bufferLength>0) {
836 *buffer=0;
837 }
838 return 0;
839 }
840
841 switch(range->type) {
842 case 0: {
843 /* name = prefix hex-digits */
844 const char *s=(const char *)(range+1);
845 char c;
846
847 uint16_t i, count;
848
849 /* copy prefix */
850 while((c=*s++)!=0) {
851 WRITE_CHAR(buffer, bufferLength, bufferPos, c);
852 }
853
854 /* write hexadecimal code point value */
855 count=range->variant;
856
857 /* zero-terminate */
858 if(count<bufferLength) {
859 buffer[count]=0;
860 }
861
862 for(i=count; i>0;) {
863 if(--i<bufferLength) {
864 c=(char)(code&0xf);
865 if(c<10) {
866 c+='0';
867 } else {
868 c+='A'-10;
869 }
870 buffer[i]=c;
871 }
872 code>>=4;
873 }
874
875 bufferPos+=count;
876 break;
877 }
878 case 1: {
879 /* name = prefix factorized-elements */
880 uint16_t indexes[8];
881 const uint16_t *factors=(const uint16_t *)(range+1);
882 uint16_t count=range->variant;
883 const char *s=(const char *)(factors+count);
884 char c;
885
886 /* copy prefix */
887 while((c=*s++)!=0) {
888 WRITE_CHAR(buffer, bufferLength, bufferPos, c);
889 }
890
891 bufferPos+=writeFactorSuffix(factors, count,
892 s, code-range->start, indexes, NULL, NULL, buffer, bufferLength);
893 break;
894 }
895 default:
896 /* undefined type */
897 /* zero-terminate */
898 if(bufferLength>0) {
899 *buffer=0;
900 }
901 break;
902 }
903
904 return bufferPos;
905}
906
b75a7d8f
A
907/*
908 * Important: enumAlgNames() and findAlgName() are almost the same.
909 * Any fix must be applied to both.
910 */
911static UBool
912enumAlgNames(AlgorithmicRange *range,
913 UChar32 start, UChar32 limit,
914 UEnumCharNamesFn *fn, void *context,
915 UCharNameChoice nameChoice) {
916 char buffer[200];
917 uint16_t length;
918
919 if(nameChoice==U_UNICODE_10_CHAR_NAME) {
920 return TRUE;
921 }
922
923 switch(range->type) {
924 case 0: {
925 char *s, *end;
926 char c;
927
928 /* get the full name of the start character */
929 length=getAlgName(range, (uint32_t)start, nameChoice, buffer, sizeof(buffer));
930 if(length<=0) {
931 return TRUE;
932 }
933
934 /* call the enumerator function with this first character */
935 if(!fn(context, start, nameChoice, buffer, length)) {
936 return FALSE;
937 }
938
939 /* go to the end of the name; all these names have the same length */
940 end=buffer;
941 while(*end!=0) {
942 ++end;
943 }
944
945 /* enumerate the rest of the names */
946 while(++start<limit) {
947 /* increment the hexadecimal number on a character-basis */
948 s=end;
949 for (;;) {
950 c=*--s;
951 if(('0'<=c && c<'9') || ('A'<=c && c<'F')) {
952 *s=(char)(c+1);
953 break;
954 } else if(c=='9') {
955 *s='A';
956 break;
957 } else if(c=='F') {
958 *s='0';
959 }
960 }
961
962 if(!fn(context, start, nameChoice, buffer, length)) {
963 return FALSE;
964 }
965 }
966 break;
967 }
968 case 1: {
969 uint16_t indexes[8];
970 const char *elementBases[8], *elements[8];
971 const uint16_t *factors=(const uint16_t *)(range+1);
972 uint16_t count=range->variant;
973 const char *s=(const char *)(factors+count);
974 char *suffix, *t;
975 uint16_t prefixLength, i, index;
976
977 char c;
978
979 /* name = prefix factorized-elements */
980
981 /* copy prefix */
982 suffix=buffer;
983 prefixLength=0;
984 while((c=*s++)!=0) {
985 *suffix++=c;
986 ++prefixLength;
987 }
988
989 /* append the suffix of the start character */
990 length=(uint16_t)(prefixLength+writeFactorSuffix(factors, count,
991 s, (uint32_t)start-range->start,
992 indexes, elementBases, elements,
993 suffix, (uint16_t)(sizeof(buffer)-prefixLength)));
994
995 /* call the enumerator function with this first character */
996 if(!fn(context, start, nameChoice, buffer, length)) {
997 return FALSE;
998 }
999
1000 /* enumerate the rest of the names */
1001 while(++start<limit) {
1002 /* increment the indexes in lexical order bound by the factors */
1003 i=count;
1004 for (;;) {
1005 index=(uint16_t)(indexes[--i]+1);
1006 if(index<factors[i]) {
1007 /* skip one index and its element string */
1008 indexes[i]=index;
1009 s=elements[i];
1010 while(*s++!=0) {
1011 }
1012 elements[i]=s;
1013 break;
1014 } else {
1015 /* reset this index to 0 and its element string to the first one */
1016 indexes[i]=0;
1017 elements[i]=elementBases[i];
1018 }
1019 }
1020
1021 /* to make matters a little easier, just append all elements to the suffix */
1022 t=suffix;
1023 length=prefixLength;
1024 for(i=0; i<count; ++i) {
1025 s=elements[i];
1026 while((c=*s++)!=0) {
1027 *t++=c;
1028 ++length;
1029 }
1030 }
1031 /* zero-terminate */
1032 *t=0;
1033
1034 if(!fn(context, start, nameChoice, buffer, length)) {
1035 return FALSE;
1036 }
1037 }
1038 break;
1039 }
1040 default:
1041 /* undefined type */
1042 break;
1043 }
1044
1045 return TRUE;
1046}
1047
1048/*
1049 * findAlgName() is almost the same as enumAlgNames() except that it
1050 * returns the code point for a name if it fits into the range.
1051 * It returns 0xffff otherwise.
1052 */
1053static UChar32
1054findAlgName(AlgorithmicRange *range, UCharNameChoice nameChoice, const char *otherName) {
1055 UChar32 code;
1056
1057 if(nameChoice==U_UNICODE_10_CHAR_NAME) {
1058 return 0xffff;
1059 }
1060
1061 switch(range->type) {
1062 case 0: {
1063 /* name = prefix hex-digits */
1064 const char *s=(const char *)(range+1);
1065 char c;
1066
1067 uint16_t i, count;
1068
1069 /* compare prefix */
1070 while((c=*s++)!=0) {
1071 if((char)c!=*otherName++) {
1072 return 0xffff;
1073 }
1074 }
1075
1076 /* read hexadecimal code point value */
1077 count=range->variant;
1078 code=0;
1079 for(i=0; i<count; ++i) {
1080 c=*otherName++;
1081 if('0'<=c && c<='9') {
1082 code=(code<<4)|(c-'0');
1083 } else if('A'<=c && c<='F') {
1084 code=(code<<4)|(c-'A'+10);
1085 } else {
1086 return 0xffff;
1087 }
1088 }
1089
1090 /* does it fit into the range? */
1091 if(*otherName==0 && range->start<=(uint32_t)code && (uint32_t)code<=range->end) {
1092 return code;
1093 }
1094 break;
1095 }
1096 case 1: {
1097 char buffer[64];
1098 uint16_t indexes[8];
1099 const char *elementBases[8], *elements[8];
1100 const uint16_t *factors=(const uint16_t *)(range+1);
1101 uint16_t count=range->variant;
1102 const char *s=(const char *)(factors+count), *t;
1103 UChar32 start, limit;
1104 uint16_t i, index;
1105
1106 char c;
1107
1108 /* name = prefix factorized-elements */
1109
1110 /* compare prefix */
1111 while((c=*s++)!=0) {
1112 if((char)c!=*otherName++) {
1113 return 0xffff;
1114 }
1115 }
1116
1117 start=(UChar32)range->start;
1118 limit=(UChar32)(range->end+1);
1119
1120 /* initialize the suffix elements for enumeration; indexes should all be set to 0 */
1121 writeFactorSuffix(factors, count, s, 0,
1122 indexes, elementBases, elements, buffer, sizeof(buffer));
1123
1124 /* compare the first suffix */
1125 if(0==uprv_strcmp(otherName, buffer)) {
1126 return start;
1127 }
1128
1129 /* enumerate and compare the rest of the suffixes */
1130 while(++start<limit) {
1131 /* increment the indexes in lexical order bound by the factors */
1132 i=count;
1133 for (;;) {
1134 index=(uint16_t)(indexes[--i]+1);
1135 if(index<factors[i]) {
1136 /* skip one index and its element string */
374ca955
A
1137 indexes[i]=index;
1138 s=elements[i];
1139 while(*s++!=0) {}
1140 elements[i]=s;
1141 break;
1142 } else {
1143 /* reset this index to 0 and its element string to the first one */
1144 indexes[i]=0;
1145 elements[i]=elementBases[i];
1146 }
1147 }
b75a7d8f 1148
374ca955
A
1149 /* to make matters a little easier, just compare all elements of the suffix */
1150 t=otherName;
1151 for(i=0; i<count; ++i) {
1152 s=elements[i];
1153 while((c=*s++)!=0) {
1154 if(c!=*t++) {
1155 s=""; /* does not match */
1156 i=99;
1157 }
1158 }
1159 }
1160 if(i<99 && *t==0) {
1161 return start;
1162 }
1163 }
1164 break;
b75a7d8f 1165 }
374ca955
A
1166 default:
1167 /* undefined type */
1168 break;
b75a7d8f 1169 }
b75a7d8f 1170
374ca955 1171 return 0xffff;
b75a7d8f
A
1172}
1173
1174/* sets of name characters, maximum name lengths ---------------------------- */
1175
/*
 * Bit-set helpers for a set of byte values stored as eight uint32_t words
 * (256 bits): bits 7..5 of the byte select the word, bits 4..0 the bit in it.
 *
 * The argument c is parenthesized and converted to uint8_t as a whole, so any
 * expression (ternary, sum, ...) may be passed and wider values are reduced
 * to their low 8 bits before indexing.
 * c is evaluated twice; do not pass an expression with side effects.
 */
#define SET_ADD(set, c) ((set)[(uint8_t)(c)>>5]|=((uint32_t)1<<((uint8_t)(c)&0x1f)))
#define SET_CONTAINS(set, c) (((set)[(uint8_t)(c)>>5]&((uint32_t)1<<((uint8_t)(c)&0x1f)))!=0)
1178
/*
 * Adds every char of the NUL-terminated string s to the 256-bit char set
 * and returns the number of chars that precede the terminating NUL.
 */
static int32_t
calcStringSetLength(uint32_t set[8], const char *s) {
    const char *cursor;

    for(cursor=s; *cursor!=0; ++cursor) {
        char ch=*cursor;
        SET_ADD(set, ch);
    }
    return (int32_t)(cursor-s);
}
1190
1191static int32_t
1192calcAlgNameSetsLengths(int32_t maxNameLength) {
1193 AlgorithmicRange *range;
1194 uint32_t *p;
1195 uint32_t rangeCount;
1196 int32_t length;
1197
1198 /* enumerate algorithmic ranges */
1199 p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1200 rangeCount=*p;
1201 range=(AlgorithmicRange *)(p+1);
1202 while(rangeCount>0) {
1203 switch(range->type) {
1204 case 0:
1205 /* name = prefix + (range->variant times) hex-digits */
1206 /* prefix */
1207 length=calcStringSetLength(gNameSet, (const char *)(range+1))+range->variant;
1208 if(length>maxNameLength) {
1209 maxNameLength=length;
1210 }
1211 break;
1212 case 1: {
1213 /* name = prefix factorized-elements */
1214 const uint16_t *factors=(const uint16_t *)(range+1);
1215 const char *s;
1216 int32_t i, count=range->variant, factor, factorLength, maxFactorLength;
1217
1218 /* prefix length */
1219 s=(const char *)(factors+count);
1220 length=calcStringSetLength(gNameSet, s);
1221 s+=length+1; /* start of factor suffixes */
1222
1223 /* get the set and maximum factor suffix length for each factor */
1224 for(i=0; i<count; ++i) {
1225 maxFactorLength=0;
1226 for(factor=factors[i]; factor>0; --factor) {
1227 factorLength=calcStringSetLength(gNameSet, s);
1228 s+=factorLength+1;
1229 if(factorLength>maxFactorLength) {
1230 maxFactorLength=factorLength;
1231 }
1232 }
1233 length+=maxFactorLength;
1234 }
1235
1236 if(length>maxNameLength) {
1237 maxNameLength=length;
1238 }
1239 break;
1240 }
1241 default:
1242 /* unknown type */
1243 break;
1244 }
1245
1246 range=(AlgorithmicRange *)((uint8_t *)range+range->size);
1247 --rangeCount;
1248 }
1249 return maxNameLength;
1250}
1251
1252static int32_t
1253calcExtNameSetsLengths(int32_t maxNameLength) {
1254 int32_t i, length;
1255
1256 for(i=0; i<LENGTHOF(charCatNames); ++i) {
1257 /*
1258 * for each category, count the length of the category name
1259 * plus 9=
1260 * 2 for <>
1261 * 1 for -
1262 * 6 for most hex digits per code point
1263 */
1264 length=9+calcStringSetLength(gNameSet, charCatNames[i]);
1265 if(length>maxNameLength) {
1266 maxNameLength=length;
1267 }
1268 }
1269 return maxNameLength;
1270}
1271
/*
 * Processes one ';'-terminated (or lineLimit-terminated) field of a
 * compressed name line: adds all characters of the expanded name to set
 * and returns the expanded length.
 *
 * tokens/tokenCount  token table; -2 marks a lead byte, -1 an explicit letter,
 *                    any other value is an offset into tokenStrings
 * tokenLengths       optional cache of token word lengths (may be NULL);
 *                    an entry of 0 means "not computed yet"
 * pLine              in/out: read position; on return it is after the
 *                    consumed ';' or equal to lineLimit
 */
static int32_t
calcNameSetLength(const uint16_t *tokens, uint16_t tokenCount, const uint8_t *tokenStrings, int8_t *tokenLengths,
                  uint32_t set[8],
                  const uint8_t **pLine, const uint8_t *lineLimit) {
    const uint8_t *cursor=*pLine;
    int32_t total=0;

    while(cursor!=lineLimit) {
        uint16_t code=*cursor++;
        uint16_t token;
        int32_t wordLength;

        if(code==(uint8_t)';') {
            /* end of this field; the separator has been consumed */
            break;
        }

        if(code>=tokenCount) {
            /* implicit letter */
            SET_ADD(set, code);
            ++total;
            continue;
        }

        token=tokens[code];
        if(token==(uint16_t)(-2)) {
            /* lead byte: combine with the following trail byte */
            code=(uint16_t)(code<<8|*cursor++);
            token=tokens[code];
        }

        if(token==(uint16_t)(-1)) {
            /* explicit letter */
            SET_ADD(set, code);
            ++total;
            continue;
        }

        /* token word: measure it, using the cache when there is one */
        if(tokenLengths==NULL) {
            wordLength=calcStringSetLength(set, (const char *)tokenStrings+token);
        } else {
            wordLength=tokenLengths[code];
            if(wordLength==0) {
                wordLength=calcStringSetLength(set, (const char *)tokenStrings+token);
                tokenLengths[code]=(int8_t)wordLength;
            }
        }
        total+=wordLength;
    }

    *pLine=cursor;
    return total;
}
1316
1317static void
1318calcGroupNameSetsLengths(int32_t maxNameLength) {
1319 uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
1320
1321 uint16_t *tokens=(uint16_t *)uCharNames+8;
1322 uint16_t tokenCount=*tokens++;
1323 uint8_t *tokenStrings=(uint8_t *)uCharNames+uCharNames->tokenStringOffset;
1324
1325 int8_t *tokenLengths;
1326
1327 uint16_t *groups;
1328 Group *group;
1329 const uint8_t *s, *line, *lineLimit;
1330
b75a7d8f
A
1331 int32_t groupCount, lineNumber, length;
1332
1333 tokenLengths=(int8_t *)uprv_malloc(tokenCount);
1334 if(tokenLengths!=NULL) {
1335 uprv_memset(tokenLengths, 0, tokenCount);
1336 }
1337
1338 groups=(uint16_t *)((char *)uCharNames+uCharNames->groupsOffset);
1339 groupCount=*groups++;
1340 group=(Group *)groups;
1341
1342 /* enumerate all groups */
1343 while(groupCount>0) {
1344 s=(uint8_t *)uCharNames+uCharNames->groupStringOffset+
1345 ((int32_t)group->offsetHigh<<16|group->offsetLow);
1346 s=expandGroupLengths(s, offsets, lengths);
1347
1348 /* enumerate all lines in each group */
1349 for(lineNumber=0; lineNumber<LINES_PER_GROUP; ++lineNumber) {
1350 line=s+offsets[lineNumber];
1351 length=lengths[lineNumber];
1352 if(length==0) {
1353 continue;
1354 }
1355
1356 lineLimit=line+length;
1357
374ca955
A
1358 /* read regular name */
1359 length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
1360 if(length>maxNameLength) {
1361 maxNameLength=length;
1362 }
1363 if(line==lineLimit) {
1364 continue;
1365 }
1366
1367 /* read Unicode 1.0 name */
1368 length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
1369 if(length>maxNameLength) {
1370 maxNameLength=length;
1371 }
1372 if(line==lineLimit) {
1373 continue;
1374 }
1375
1376 /* read ISO comment */
73c04bcf 1377 /*length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit);*/
374ca955
A
1378 }
1379
1380 ++group;
1381 --groupCount;
1382 }
1383
1384 if(tokenLengths!=NULL) {
1385 uprv_free(tokenLengths);
1386 }
1387
1388 /* set gMax... - name length last for threading */
374ca955
A
1389 gMaxNameLength=maxNameLength;
1390}
1391
1392static UBool
1393calcNameSetsLengths(UErrorCode *pErrorCode) {
1394 static const char extChars[]="0123456789ABCDEF<>-";
1395 int32_t i, maxNameLength;
1396
1397 if(gMaxNameLength!=0) {
1398 return TRUE;
1399 }
1400
1401 if(!isDataLoaded(pErrorCode)) {
1402 return FALSE;
1403 }
1404
1405 /* set hex digits, used in various names, and <>-, used in extended names */
1406 for(i=0; i<sizeof(extChars)-1; ++i) {
1407 SET_ADD(gNameSet, extChars[i]);
1408 }
1409
1410 /* set sets and lengths from algorithmic names */
1411 maxNameLength=calcAlgNameSetsLengths(0);
1412
1413 /* set sets and lengths from extended names */
1414 maxNameLength=calcExtNameSetsLengths(maxNameLength);
1415
1416 /* set sets and lengths from group names, set global maximum values */
1417 calcGroupNameSetsLengths(maxNameLength);
1418
1419 return TRUE;
1420}
1421
1422/* public API --------------------------------------------------------------- */
1423
1424U_CAPI int32_t U_EXPORT2
1425u_charName(UChar32 code, UCharNameChoice nameChoice,
1426 char *buffer, int32_t bufferLength,
1427 UErrorCode *pErrorCode) {
1428 AlgorithmicRange *algRange;
1429 uint32_t *p;
1430 uint32_t i;
1431 int32_t length;
1432
1433 /* check the argument values */
1434 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1435 return 0;
1436 } else if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT ||
1437 bufferLength<0 || (bufferLength>0 && buffer==NULL)
1438 ) {
1439 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1440 return 0;
1441 }
1442
1443 if((uint32_t)code>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) {
1444 return u_terminateChars(buffer, bufferLength, 0, pErrorCode);
1445 }
1446
1447 length=0;
1448
1449 /* try algorithmic names first */
1450 p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1451 i=*p;
1452 algRange=(AlgorithmicRange *)(p+1);
1453 while(i>0) {
1454 if(algRange->start<=(uint32_t)code && (uint32_t)code<=algRange->end) {
1455 length=getAlgName(algRange, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
1456 break;
1457 }
1458 algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1459 --i;
1460 }
1461
1462 if(i==0) {
1463 if (nameChoice == U_EXTENDED_CHAR_NAME) {
1464 length = getName(uCharNames, (uint32_t )code, U_EXTENDED_CHAR_NAME, buffer, (uint16_t) bufferLength);
1465 if (!length) {
1466 /* extended character name */
1467 length = getExtName((uint32_t) code, buffer, (uint16_t) bufferLength);
1468 }
1469 } else {
1470 /* normal character name */
1471 length=getName(uCharNames, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
1472 }
1473 }
1474
1475 return u_terminateChars(buffer, bufferLength, length, pErrorCode);
1476}
1477
1478U_CAPI int32_t U_EXPORT2
1479u_getISOComment(UChar32 c,
1480 char *dest, int32_t destCapacity,
1481 UErrorCode *pErrorCode) {
1482 int32_t length;
1483
1484 /* check the argument values */
1485 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1486 return 0;
1487 } else if(destCapacity<0 || (destCapacity>0 && dest==NULL)) {
1488 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1489 return 0;
1490 }
1491
1492 if((uint32_t)c>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) {
1493 return u_terminateChars(dest, destCapacity, 0, pErrorCode);
1494 }
1495
1496 /* the ISO comment is stored like a normal character name */
1497 length=getName(uCharNames, (uint32_t)c, U_ISO_COMMENT, dest, (uint16_t)destCapacity);
1498 return u_terminateChars(dest, destCapacity, length, pErrorCode);
1499}
1500
1501U_CAPI UChar32 U_EXPORT2
1502u_charFromName(UCharNameChoice nameChoice,
1503 const char *name,
1504 UErrorCode *pErrorCode) {
1505 char upper[120], lower[120];
1506 FindName findName;
1507 AlgorithmicRange *algRange;
1508 uint32_t *p;
1509 uint32_t i;
1510 UChar32 cp = 0;
1511 char c0;
1512 UChar32 error = 0xffff; /* Undefined, but use this for backwards compatibility. */
1513
1514 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1515 return error;
1516 }
1517
1518 if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || name==NULL || *name==0) {
1519 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1520 return error;
1521 }
1522
1523 if(!isDataLoaded(pErrorCode)) {
1524 return error;
1525 }
1526
1527 /* construct the uppercase and lowercase of the name first */
1528 for(i=0; i<sizeof(upper); ++i) {
1529 if((c0=*name++)!=0) {
1530 upper[i]=uprv_toupper(c0);
1531 lower[i]=uprv_tolower(c0);
1532 } else {
1533 upper[i]=lower[i]=0;
1534 break;
1535 }
1536 }
1537 if(i==sizeof(upper)) {
1538 /* name too long, there is no such character */
1539 *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1540 return error;
1541 }
1542
1543 /* try extended names first */
1544 if (lower[0] == '<') {
1545 if (nameChoice == U_EXTENDED_CHAR_NAME) {
1546 if (lower[--i] == '>') {
1547 for (--i; lower[i] && lower[i] != '-'; --i) {
1548 }
1549
1550 if (lower[i] == '-') { /* We've got a category. */
1551 uint32_t cIdx;
1552
1553 lower[i] = 0;
1554
1555 for (++i; lower[i] != '>'; ++i) {
1556 if (lower[i] >= '0' && lower[i] <= '9') {
1557 cp = (cp << 4) + lower[i] - '0';
1558 } else if (lower[i] >= 'a' && lower[i] <= 'f') {
1559 cp = (cp << 4) + lower[i] - 'a' + 10;
1560 } else {
1561 *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1562 return error;
1563 }
1564 }
1565
1566 /* Now validate the category name.
1567 We could use a binary search, or a trie, if
1568 we really wanted to. */
b75a7d8f 1569
374ca955 1570 for (lower[i] = 0, cIdx = 0; cIdx < LENGTHOF(charCatNames); ++cIdx) {
b75a7d8f 1571
374ca955
A
1572 if (!uprv_strcmp(lower + 1, charCatNames[cIdx])) {
1573 if (getCharCat(cp) == cIdx) {
1574 return cp;
1575 }
1576 break;
1577 }
1578 }
1579 }
b75a7d8f
A
1580 }
1581 }
1582
374ca955
A
1583 *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1584 return error;
b75a7d8f
A
1585 }
1586
374ca955
A
1587 /* try algorithmic names now */
1588 p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1589 i=*p;
1590 algRange=(AlgorithmicRange *)(p+1);
1591 while(i>0) {
1592 if((cp=findAlgName(algRange, nameChoice, upper))!=0xffff) {
1593 return cp;
1594 }
1595 algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1596 --i;
b75a7d8f
A
1597 }
1598
374ca955
A
1599 /* normal character name */
1600 findName.otherName=upper;
1601 findName.code=error;
1602 enumNames(uCharNames, 0, UCHAR_MAX_VALUE + 1, DO_FIND_NAME, &findName, nameChoice);
1603 if (findName.code == error) {
1604 *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1605 }
1606 return findName.code;
b75a7d8f
A
1607}
1608
374ca955
A
1609U_CAPI void U_EXPORT2
1610u_enumCharNames(UChar32 start, UChar32 limit,
1611 UEnumCharNamesFn *fn,
1612 void *context,
1613 UCharNameChoice nameChoice,
1614 UErrorCode *pErrorCode) {
1615 AlgorithmicRange *algRange;
1616 uint32_t *p;
1617 uint32_t i;
b75a7d8f 1618
374ca955
A
1619 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1620 return;
b75a7d8f
A
1621 }
1622
374ca955
A
1623 if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || fn==NULL) {
1624 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1625 return;
b75a7d8f
A
1626 }
1627
374ca955
A
1628 if((uint32_t) limit > UCHAR_MAX_VALUE + 1) {
1629 limit = UCHAR_MAX_VALUE + 1;
1630 }
1631 if((uint32_t)start>=(uint32_t)limit) {
1632 return;
b75a7d8f
A
1633 }
1634
374ca955
A
1635 if(!isDataLoaded(pErrorCode)) {
1636 return;
1637 }
b75a7d8f 1638
374ca955
A
1639 /* interleave the data-driven ones with the algorithmic ones */
1640 /* iterate over all algorithmic ranges; assume that they are in ascending order */
1641 p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1642 i=*p;
1643 algRange=(AlgorithmicRange *)(p+1);
1644 while(i>0) {
1645 /* enumerate the character names before the current algorithmic range */
1646 /* here: start<limit */
1647 if((uint32_t)start<algRange->start) {
1648 if((uint32_t)limit<=algRange->start) {
1649 enumNames(uCharNames, start, limit, fn, context, nameChoice);
1650 return;
1651 }
1652 if(!enumNames(uCharNames, start, (UChar32)algRange->start, fn, context, nameChoice)) {
1653 return;
1654 }
1655 start=(UChar32)algRange->start;
1656 }
1657 /* enumerate the character names in the current algorithmic range */
1658 /* here: algRange->start<=start<limit */
1659 if((uint32_t)start<=algRange->end) {
1660 if((uint32_t)limit<=(algRange->end+1)) {
1661 enumAlgNames(algRange, start, limit, fn, context, nameChoice);
1662 return;
1663 }
1664 if(!enumAlgNames(algRange, start, (UChar32)algRange->end+1, fn, context, nameChoice)) {
1665 return;
1666 }
1667 start=(UChar32)algRange->end+1;
1668 }
1669 /* continue to the next algorithmic range (here: start<limit) */
1670 algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1671 --i;
1672 }
1673 /* enumerate the character names after the last algorithmic range */
1674 enumNames(uCharNames, start, limit, fn, context, nameChoice);
b75a7d8f
A
1675}
1676
1677U_CAPI int32_t U_EXPORT2
1678uprv_getMaxCharNameLength() {
1679 UErrorCode errorCode=U_ZERO_ERROR;
1680 if(calcNameSetsLengths(&errorCode)) {
1681 return gMaxNameLength;
1682 } else {
1683 return 0;
1684 }
1685}
1686
b75a7d8f
A
1687/**
1688 * Converts the char set cset into a Unicode set uset.
1689 * @param cset Set of 256 bit flags corresponding to a set of chars.
1690 * @param uset USet to receive characters. Existing contents are deleted.
1691 */
1692static void
73c04bcf 1693charSetToUSet(uint32_t cset[8], const USetAdder *sa) {
b75a7d8f
A
1694 UChar us[256];
1695 char cs[256];
1696
1697 int32_t i, length;
1698 UErrorCode errorCode;
1699
1700 errorCode=U_ZERO_ERROR;
b75a7d8f
A
1701
1702 if(!calcNameSetsLengths(&errorCode)) {
1703 return;
1704 }
1705
1706 /* build a char string with all chars that are used in character names */
1707 length=0;
1708 for(i=0; i<256; ++i) {
1709 if(SET_CONTAINS(cset, i)) {
1710 cs[length++]=(char)i;
1711 }
1712 }
1713
1714 /* convert the char string to a UChar string */
1715 u_charsToUChars(cs, us, length);
1716
1717 /* add each UChar to the USet */
1718 for(i=0; i<length; ++i) {
1719 if(us[i]!=0 || cs[i]==0) { /* non-invariant chars become (UChar)0 */
374ca955 1720 sa->add(sa->set, us[i]);
b75a7d8f
A
1721 }
1722 }
1723}
1724
1725/**
1726 * Fills set with characters that are used in Unicode character names.
374ca955 1727 * @param set USet to receive characters.
b75a7d8f
A
1728 */
1729U_CAPI void U_EXPORT2
73c04bcf 1730uprv_getCharNameCharacters(const USetAdder *sa) {
374ca955 1731 charSetToUSet(gNameSet, sa);
b75a7d8f
A
1732}
1733
374ca955
A
1734/* data swapping ------------------------------------------------------------ */
1735
1736/*
1737 * The token table contains non-negative entries for token bytes,
1738 * and -1 for bytes that represent themselves in the data file's charset.
1739 * -2 entries are used for lead bytes.
1740 *
1741 * Direct bytes (-1 entries) must be translated from the input charset family
1742 * to the output charset family.
1743 * makeTokenMap() writes a permutation mapping for this.
1744 * Use it once for single-/lead-byte tokens and once more for all trail byte
1745 * tokens. (';' is an unused trail byte marked with -1.)
1746 */
1747static void
1748makeTokenMap(const UDataSwapper *ds,
1749 int16_t tokens[], uint16_t tokenCount,
1750 uint8_t map[256],
1751 UErrorCode *pErrorCode) {
1752 UBool usedOutChar[256];
1753 uint16_t i, j;
1754 uint8_t c1, c2;
1755
1756 if(U_FAILURE(*pErrorCode)) {
1757 return;
1758 }
1759
1760 if(ds->inCharset==ds->outCharset) {
1761 /* Same charset family: identity permutation */
1762 for(i=0; i<256; ++i) {
1763 map[i]=(uint8_t)i;
1764 }
1765 } else {
1766 uprv_memset(map, 0, 256);
1767 uprv_memset(usedOutChar, 0, 256);
1768
1769 if(tokenCount>256) {
1770 tokenCount=256;
1771 }
1772
1773 /* set the direct bytes (byte 0 always maps to itself) */
1774 for(i=1; i<tokenCount; ++i) {
1775 if(tokens[i]==-1) {
1776 /* convert the direct byte character */
1777 c1=(uint8_t)i;
1778 ds->swapInvChars(ds, &c1, 1, &c2, pErrorCode);
1779 if(U_FAILURE(*pErrorCode)) {
73c04bcf
A
1780 udata_printError(ds, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n",
1781 i, ds->inCharset);
374ca955
A
1782 return;
1783 }
1784
1785 /* enter the converted character into the map and mark it used */
1786 map[c1]=c2;
1787 usedOutChar[c2]=TRUE;
1788 }
1789 }
1790
1791 /* set the mappings for the rest of the permutation */
1792 for(i=j=1; i<tokenCount; ++i) {
1793 /* set mappings that were not set for direct bytes */
1794 if(map[i]==0) {
1795 /* set an output byte value that was not used as an output byte above */
1796 while(usedOutChar[j]) {
1797 ++j;
1798 }
1799 map[i]=(uint8_t)j++;
1800 }
1801 }
1802
1803 /*
1804 * leave mappings at tokenCount and above unset if tokenCount<256
1805 * because they won't be used
1806 */
1807 }
1808}
1809
1810U_CAPI int32_t U_EXPORT2
1811uchar_swapNames(const UDataSwapper *ds,
1812 const void *inData, int32_t length, void *outData,
1813 UErrorCode *pErrorCode) {
1814 const UDataInfo *pInfo;
1815 int32_t headerSize;
1816
1817 const uint8_t *inBytes;
1818 uint8_t *outBytes;
1819
1820 uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset,
1821 offset, i, count, stringsCount;
1822
1823 const AlgorithmicRange *inRange;
1824 AlgorithmicRange *outRange;
1825
1826 /* udata_swapDataHeader checks the arguments */
1827 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
1828 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1829 return 0;
1830 }
1831
1832 /* check data format and format version */
1833 pInfo=(const UDataInfo *)((const char *)inData+4);
1834 if(!(
1835 pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */
1836 pInfo->dataFormat[1]==0x6e &&
1837 pInfo->dataFormat[2]==0x61 &&
1838 pInfo->dataFormat[3]==0x6d &&
1839 pInfo->formatVersion[0]==1
1840 )) {
1841 udata_printError(ds, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n",
1842 pInfo->dataFormat[0], pInfo->dataFormat[1],
1843 pInfo->dataFormat[2], pInfo->dataFormat[3],
1844 pInfo->formatVersion[0]);
1845 *pErrorCode=U_UNSUPPORTED_ERROR;
1846 return 0;
1847 }
1848
1849 inBytes=(const uint8_t *)inData+headerSize;
1850 outBytes=(uint8_t *)outData+headerSize;
1851 if(length<0) {
1852 algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]);
1853 } else {
1854 length-=headerSize;
1855 if( length<20 ||
1856 (uint32_t)length<(algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]))
1857 ) {
1858 udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n",
1859 length);
1860 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1861 return 0;
1862 }
1863 }
1864
1865 if(length<0) {
1866 /* preflighting: iterate through algorithmic ranges */
1867 offset=algNamesOffset;
1868 count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
1869 offset+=4;
1870
1871 for(i=0; i<count; ++i) {
1872 inRange=(const AlgorithmicRange *)(inBytes+offset);
1873 offset+=ds->readUInt16(inRange->size);
1874 }
1875 } else {
1876 /* swap data */
1877 const uint16_t *p;
1878 uint16_t *q, *temp;
1879
1880 int16_t tokens[512];
1881 uint16_t tokenCount;
1882
1883 uint8_t map[256], trailMap[256];
1884
1885 /* copy the data for inaccessible bytes */
1886 if(inBytes!=outBytes) {
1887 uprv_memcpy(outBytes, inBytes, length);
1888 }
1889
1890 /* the initial 4 offsets first */
1891 tokenStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[0]);
1892 groupsOffset=ds->readUInt32(((const uint32_t *)inBytes)[1]);
1893 groupStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[2]);
1894 ds->swapArray32(ds, inBytes, 16, outBytes, pErrorCode);
1895
1896 /*
1897 * now the tokens table
1898 * it needs to be permutated along with the compressed name strings
1899 */
1900 p=(const uint16_t *)(inBytes+16);
1901 q=(uint16_t *)(outBytes+16);
1902
1903 /* read and swap the tokenCount */
1904 tokenCount=ds->readUInt16(*p);
1905 ds->swapArray16(ds, p, 2, q, pErrorCode);
1906 ++p;
1907 ++q;
1908
1909 /* read the first 512 tokens and make the token maps */
1910 if(tokenCount<=512) {
1911 count=tokenCount;
1912 } else {
1913 count=512;
1914 }
1915 for(i=0; i<count; ++i) {
1916 tokens[i]=udata_readInt16(ds, p[i]);
1917 }
1918 for(; i<512; ++i) {
1919 tokens[i]=0; /* fill the rest of the tokens array if tokenCount<512 */
1920 }
1921 makeTokenMap(ds, tokens, tokenCount, map, pErrorCode);
1922 makeTokenMap(ds, tokens+256, (uint16_t)(tokenCount>256 ? tokenCount-256 : 0), trailMap, pErrorCode);
1923 if(U_FAILURE(*pErrorCode)) {
1924 return 0;
1925 }
1926
1927 /*
1928 * swap and permutate the tokens
1929 * go through a temporary array to support in-place swapping
1930 */
1931 temp=(uint16_t *)uprv_malloc(tokenCount*2);
1932 if(temp==NULL) {
1933 udata_printError(ds, "out of memory swapping %u unames.icu tokens\n",
1934 tokenCount);
1935 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1936 return 0;
1937 }
1938
1939 /* swap and permutate single-/lead-byte tokens */
1940 for(i=0; i<tokenCount && i<256; ++i) {
1941 ds->swapArray16(ds, p+i, 2, temp+map[i], pErrorCode);
1942 }
1943
1944 /* swap and permutate trail-byte tokens */
1945 for(; i<tokenCount; ++i) {
1946 ds->swapArray16(ds, p+i, 2, temp+(i&0xffffff00)+trailMap[i&0xff], pErrorCode);
1947 }
1948
1949 /* copy the result into the output and free the temporary array */
1950 uprv_memcpy(q, temp, tokenCount*2);
1951 uprv_free(temp);
1952
1953 /*
1954 * swap the token strings but not a possible padding byte after
1955 * the terminating NUL of the last string
1956 */
1957 udata_swapInvStringBlock(ds, inBytes+tokenStringOffset, (int32_t)(groupsOffset-tokenStringOffset),
1958 outBytes+tokenStringOffset, pErrorCode);
1959 if(U_FAILURE(*pErrorCode)) {
73c04bcf 1960 udata_printError(ds, "uchar_swapNames(token strings) failed\n");
374ca955
A
1961 return 0;
1962 }
1963
1964 /* swap the group table */
1965 count=ds->readUInt16(*((const uint16_t *)(inBytes+groupsOffset)));
1966 ds->swapArray16(ds, inBytes+groupsOffset, (int32_t)((1+count*3)*2),
1967 outBytes+groupsOffset, pErrorCode);
1968
1969 /*
1970 * swap the group strings
1971 * swap the string bytes but not the nibble-encoded string lengths
1972 */
1973 if(ds->inCharset!=ds->outCharset) {
1974 uint16_t offsets[LINES_PER_GROUP+1], lengths[LINES_PER_GROUP+1];
1975
1976 const uint8_t *inStrings, *nextInStrings;
1977 uint8_t *outStrings;
1978
1979 uint8_t c;
1980
1981 inStrings=inBytes+groupStringOffset;
1982 outStrings=outBytes+groupStringOffset;
1983
1984 stringsCount=algNamesOffset-groupStringOffset;
1985
1986 /* iterate through string groups until only a few padding bytes are left */
1987 while(stringsCount>32) {
1988 nextInStrings=expandGroupLengths(inStrings, offsets, lengths);
1989
1990 /* move past the length bytes */
1991 stringsCount-=(uint32_t)(nextInStrings-inStrings);
1992 outStrings+=nextInStrings-inStrings;
1993 inStrings=nextInStrings;
1994
1995 count=offsets[31]+lengths[31]; /* total number of string bytes in this group */
1996 stringsCount-=count;
1997
1998 /* swap the string bytes using map[] and trailMap[] */
1999 while(count>0) {
2000 c=*inStrings++;
2001 *outStrings++=map[c];
2002 if(tokens[c]!=-2) {
2003 --count;
2004 } else {
2005 /* token lead byte: swap the trail byte, too */
2006 *outStrings++=trailMap[*inStrings++];
2007 count-=2;
2008 }
2009 }
2010 }
2011 }
2012
2013 /* swap the algorithmic ranges */
2014 offset=algNamesOffset;
2015 count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
2016 ds->swapArray32(ds, inBytes+offset, 4, outBytes+offset, pErrorCode);
2017 offset+=4;
2018
2019 for(i=0; i<count; ++i) {
2020 if(offset>(uint32_t)length) {
2021 udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n",
2022 length, i);
2023 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2024 return 0;
2025 }
2026
2027 inRange=(const AlgorithmicRange *)(inBytes+offset);
2028 outRange=(AlgorithmicRange *)(outBytes+offset);
2029 offset+=ds->readUInt16(inRange->size);
2030
2031 ds->swapArray32(ds, inRange, 8, outRange, pErrorCode);
2032 ds->swapArray16(ds, &inRange->size, 2, &outRange->size, pErrorCode);
2033 switch(inRange->type) {
2034 case 0:
2035 /* swap prefix string */
2036 ds->swapInvChars(ds, inRange+1, (int32_t)uprv_strlen((const char *)(inRange+1)),
2037 outRange+1, pErrorCode);
2038 if(U_FAILURE(*pErrorCode)) {
73c04bcf
A
2039 udata_printError(ds, "uchar_swapNames(prefix string of algorithmic range %u) failed\n",
2040 i);
374ca955
A
2041 return 0;
2042 }
2043 break;
2044 case 1:
2045 {
2046 /* swap factors and the prefix and factor strings */
73c04bcf 2047 uint32_t factorsCount;
374ca955
A
2048
2049 factorsCount=inRange->variant;
374ca955
A
2050 p=(const uint16_t *)(inRange+1);
2051 q=(uint16_t *)(outRange+1);
374ca955
A
2052 ds->swapArray16(ds, p, (int32_t)(factorsCount*2), q, pErrorCode);
2053
2054 /* swap the strings, up to the last terminating NUL */
2055 p+=factorsCount;
2056 q+=factorsCount;
2057 stringsCount=(uint32_t)((inBytes+offset)-(const uint8_t *)p);
2058 while(stringsCount>0 && ((const uint8_t *)p)[stringsCount-1]!=0) {
2059 --stringsCount;
2060 }
2061 ds->swapInvChars(ds, p, (int32_t)stringsCount, q, pErrorCode);
2062 }
2063 break;
2064 default:
2065 udata_printError(ds, "uchar_swapNames(): unknown type %u of algorithmic range %u\n",
2066 inRange->type, i);
2067 *pErrorCode=U_UNSUPPORTED_ERROR;
2068 return 0;
2069 }
2070 }
2071 }
2072
2073 return headerSize+(int32_t)offset;
2074}
2075
b75a7d8f
A
2076/*
2077 * Hey, Emacs, please set the following:
2078 *
2079 * Local Variables:
2080 * indent-tabs-mode: nil
2081 * End:
2082 *
2083 */