]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/unames.cpp
ICU-62141.0.1.tar.gz
[apple/icu.git] / icuSources / common / unames.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 *
6 * Copyright (C) 1999-2014, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 ******************************************************************************
10 * file name: unames.c
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 1999oct04
16 * created by: Markus W. Scherer
17 */
18
19 #include "unicode/utypes.h"
20 #include "unicode/putil.h"
21 #include "unicode/uchar.h"
22 #include "unicode/udata.h"
23 #include "unicode/utf.h"
24 #include "unicode/utf16.h"
25 #include "uassert.h"
26 #include "ustr_imp.h"
27 #include "umutex.h"
28 #include "cmemory.h"
29 #include "cstring.h"
30 #include "ucln_cmn.h"
31 #include "udataswp.h"
32 #include "uprops.h"
33
34 U_NAMESPACE_BEGIN
35
36 /* prototypes ------------------------------------------------------------- */
37
/* Name and type of the data item that loadCharNames() opens via udata_openChoice(). */
static const char DATA_NAME[] = "unames";
static const char DATA_TYPE[] = "icu";

/*
 * A code point is split into a group number (code >> GROUP_SHIFT) and a
 * line number inside that group (code & GROUP_MASK).
 * Each group therefore covers LINES_PER_GROUP consecutive code points.
 */
#define GROUP_SHIFT     5
#define LINES_PER_GROUP (1L << GROUP_SHIFT)
#define GROUP_MASK      (LINES_PER_GROUP - 1)
44
45 /*
46 * This struct was replaced by explicitly accessing equivalent
47 * fields from triples of uint16_t.
48 * The Group struct was padded to 8 bytes on compilers for early ARM CPUs,
49 * which broke the assumption that sizeof(Group)==6 and that the ++ operator
50 * would advance by 6 bytes (3 uint16_t).
51 *
52 * We can't just change the data structure because it's loaded from a data file,
53 * and we don't want to make it less compact, so we changed the access code.
54 *
55 * For details see ICU tickets 6331 and 6008.
56 typedef struct {
57 uint16_t groupMSB,
58 offsetHigh, offsetLow; / * avoid padding * /
59 } Group;
60 */
/*
 * Indexes of the three uint16_t fields that make up one group entry,
 * plus the number of uint16_t per entry (GROUP_LENGTH).
 */
enum {
    GROUP_MSB,
    GROUP_OFFSET_HIGH,
    GROUP_OFFSET_LOW,
    GROUP_LENGTH
};

/*
 * Get the 32-bit group offset.
 * The two 16-bit halves are combined in unsigned arithmetic and only then
 * converted to int32_t, so that a high half of 0x8000 or above never
 * left-shifts a 1 into the sign bit of a signed integer.
 * @param group (const uint16_t *) pointer to a Group triple of uint16_t
 * @return group offset (int32_t)
 */
#define GET_GROUP_OFFSET(group) \
    ((int32_t)(((uint32_t)(group)[GROUP_OFFSET_HIGH] << 16) | (group)[GROUP_OFFSET_LOW]))

/* Step forward/backward by one group triple. */
#define NEXT_GROUP(group) ((group) + GROUP_LENGTH)
#define PREV_GROUP(group) ((group) - GROUP_LENGTH)
77
/*
 * Header of one algorithmic name range.
 * start/end delimit the code points covered (both inclusive, see findAlgName()).
 * type selects the naming scheme: 0 = prefix followed by hex digits,
 * 1 = prefix followed by factorized elements.
 * variant is the number of hex digits (type 0) or the number of factors (type 1).
 * The type-specific data (factors and/or strings) starts right after this struct.
 * NOTE(review): size is not read by the code visible here; presumably it is
 * the byte size of the whole range record - confirm against its users.
 */
typedef struct {
    uint32_t start;
    uint32_t end;
    uint8_t type;
    uint8_t variant;
    uint16_t size;
} AlgorithmicRange;

/*
 * Index block at the very start of the loaded names data.
 * Each field is a byte offset, counted from the address of this struct,
 * to the corresponding section.
 */
typedef struct {
    uint32_t tokenStringOffset;
    uint32_t groupsOffset;
    uint32_t groupStringOffset;
    uint32_t algNamesOffset;
} UCharNames;
87
88 /*
89 * Get the groups table from a UCharNames struct.
90 * The groups table consists of one uint16_t groupCount followed by
91 * groupCount groups. Each group is a triple of uint16_t, see GROUP_LENGTH
92 * and the comment for the old struct Group above.
93 *
94 * @param names (const UCharNames *) pointer to the UCharNames indexes
95 * @return (const uint16_t *) pointer to the groups table
96 */
/*
 * The argument and the whole expansion are parenthesized so that any pointer
 * expression can be passed (e.g. &x) and the result can be subscripted or
 * dereferenced directly.
 */
#define GET_GROUPS(names) ((const uint16_t *)((const char *)(names) + (names)->groupsOffset))
98
99 typedef struct {
100 const char *otherName;
101 UChar32 code;
102 } FindName;
103
104 #define DO_FIND_NAME NULL
105
106 static UDataMemory *uCharNamesData=NULL;
107 static UCharNames *uCharNames=NULL;
108 static icu::UInitOnce gCharNamesInitOnce = U_INITONCE_INITIALIZER;
109
110 /*
111 * Maximum length of character names (regular & 1.0).
112 */
113 static int32_t gMaxNameLength=0;
114
115 /*
116 * Set of chars used in character names (regular & 1.0).
117 * Chars are platform-dependent (can be EBCDIC).
118 */
119 static uint32_t gNameSet[8]={ 0 };
120
/*
 * Extra pseudo-categories that extend the general category values for
 * extended names. Every expansion is parenthesized so that the macros stay
 * correct inside larger expressions (e.g. x * U_LEAD_SURROGATE).
 */
#define U_NONCHARACTER_CODE_POINT (U_CHAR_CATEGORY_COUNT)
#define U_LEAD_SURROGATE (U_CHAR_CATEGORY_COUNT + 1)
#define U_TRAIL_SURROGATE (U_CHAR_CATEGORY_COUNT + 2)

/* Number of general categories plus the three pseudo-categories above. */
#define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3)
126
127 static const char * const charCatNames[U_CHAR_EXTENDED_CATEGORY_COUNT] = {
128 "unassigned",
129 "uppercase letter",
130 "lowercase letter",
131 "titlecase letter",
132 "modifier letter",
133 "other letter",
134 "non spacing mark",
135 "enclosing mark",
136 "combining spacing mark",
137 "decimal digit number",
138 "letter number",
139 "other number",
140 "space separator",
141 "line separator",
142 "paragraph separator",
143 "control",
144 "format",
145 "private use area",
146 "surrogate",
147 "dash punctuation",
148 "start punctuation",
149 "end punctuation",
150 "connector punctuation",
151 "other punctuation",
152 "math symbol",
153 "currency symbol",
154 "modifier symbol",
155 "other symbol",
156 "initial punctuation",
157 "final punctuation",
158 "noncharacter",
159 "lead surrogate",
160 "trail surrogate"
161 };
162
163 /* implementation ----------------------------------------------------------- */
164
165 static UBool U_CALLCONV unames_cleanup(void)
166 {
167 if(uCharNamesData) {
168 udata_close(uCharNamesData);
169 uCharNamesData = NULL;
170 }
171 if(uCharNames) {
172 uCharNames = NULL;
173 }
174 gCharNamesInitOnce.reset();
175 gMaxNameLength=0;
176 return TRUE;
177 }
178
179 static UBool U_CALLCONV
180 isAcceptable(void * /*context*/,
181 const char * /*type*/, const char * /*name*/,
182 const UDataInfo *pInfo) {
183 return (UBool)(
184 pInfo->size>=20 &&
185 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
186 pInfo->charsetFamily==U_CHARSET_FAMILY &&
187 pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */
188 pInfo->dataFormat[1]==0x6e &&
189 pInfo->dataFormat[2]==0x61 &&
190 pInfo->dataFormat[3]==0x6d &&
191 pInfo->formatVersion[0]==1);
192 }
193
194 static void U_CALLCONV
195 loadCharNames(UErrorCode &status) {
196 U_ASSERT(uCharNamesData == NULL);
197 U_ASSERT(uCharNames == NULL);
198
199 uCharNamesData = udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &status);
200 if(U_FAILURE(status)) {
201 uCharNamesData = NULL;
202 } else {
203 uCharNames = (UCharNames *)udata_getMemory(uCharNamesData);
204 }
205 ucln_common_registerCleanup(UCLN_COMMON_UNAMES, unames_cleanup);
206 }
207
208
209 static UBool
210 isDataLoaded(UErrorCode *pErrorCode) {
211 umtx_initOnce(gCharNamesInitOnce, &loadCharNames, *pErrorCode);
212 return U_SUCCESS(*pErrorCode);
213 }
214
/*
 * Append one char to an output buffer with "preflighting" semantics:
 * the char is stored (and buffer/bufferLength are advanced) only while
 * there is room left, but bufferPos is always incremented so that it ends
 * up as the length the complete output needs.
 * All of buffer, bufferLength and bufferPos must be modifiable lvalues.
 */
#define WRITE_CHAR(buffer, bufferLength, bufferPos, c) { \
    if ((bufferLength) > 0) { \
        *(buffer)++ = (c); \
        --(bufferLength); \
    } \
    ++(bufferPos); \
}

/* Pseudo name choice, one past the last regular UCharNameChoice value. */
#define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT
224
225 /*
226 * Important: expandName() and compareName() are almost the same -
227 * apply fixes to both.
228 *
229 * UnicodeData.txt uses ';' as a field separator, so no
230 * field can contain ';' as part of its contents.
231 * In unames.dat, it is marked as token[';']==-1 only if the
232 * semicolon is used in the data file - which is iff we
233 * have Unicode 1.0 names or ISO comments or aliases.
234 * So, it will be token[';']==-1 if we store U1.0 names/ISO comments/aliases
235 * although we know that it will never be part of a name.
236 */
237 static uint16_t
238 expandName(UCharNames *names,
239 const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
240 char *buffer, uint16_t bufferLength) {
241 uint16_t *tokens=(uint16_t *)names+8;
242 uint16_t token, tokenCount=*tokens++, bufferPos=0;
243 uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
244 uint8_t c;
245
246 if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
247 /*
248 * skip the modern name if it is not requested _and_
249 * if the semicolon byte value is a character, not a token number
250 */
251 if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
252 int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
253 do {
254 while(nameLength>0) {
255 --nameLength;
256 if(*name++==';') {
257 break;
258 }
259 }
260 } while(--fieldIndex>0);
261 } else {
262 /*
263 * the semicolon byte value is a token number, therefore
264 * only modern names are stored in unames.dat and there is no
265 * such requested alternate name here
266 */
267 nameLength=0;
268 }
269 }
270
271 /* write each letter directly, and write a token word per token */
272 while(nameLength>0) {
273 --nameLength;
274 c=*name++;
275
276 if(c>=tokenCount) {
277 if(c!=';') {
278 /* implicit letter */
279 WRITE_CHAR(buffer, bufferLength, bufferPos, c);
280 } else {
281 /* finished */
282 break;
283 }
284 } else {
285 token=tokens[c];
286 if(token==(uint16_t)(-2)) {
287 /* this is a lead byte for a double-byte token */
288 token=tokens[c<<8|*name++];
289 --nameLength;
290 }
291 if(token==(uint16_t)(-1)) {
292 if(c!=';') {
293 /* explicit letter */
294 WRITE_CHAR(buffer, bufferLength, bufferPos, c);
295 } else {
296 /* stop, but skip the semicolon if we are seeking
297 extended names and there was no 2.0 name but there
298 is a 1.0 name. */
299 if(!bufferPos && nameChoice == U_EXTENDED_CHAR_NAME) {
300 if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
301 continue;
302 }
303 }
304 /* finished */
305 break;
306 }
307 } else {
308 /* write token word */
309 uint8_t *tokenString=tokenStrings+token;
310 while((c=*tokenString++)!=0) {
311 WRITE_CHAR(buffer, bufferLength, bufferPos, c);
312 }
313 }
314 }
315 }
316
317 /* zero-terminate */
318 if(bufferLength>0) {
319 *buffer=0;
320 }
321
322 return bufferPos;
323 }
324
325 /*
326 * compareName() is almost the same as expandName() except that it compares
327 * the currently expanded name to an input name.
328 * It returns the match/no match result as soon as possible.
329 */
330 static UBool
331 compareName(UCharNames *names,
332 const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
333 const char *otherName) {
334 uint16_t *tokens=(uint16_t *)names+8;
335 uint16_t token, tokenCount=*tokens++;
336 uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
337 uint8_t c;
338 const char *origOtherName = otherName;
339
340 if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
341 /*
342 * skip the modern name if it is not requested _and_
343 * if the semicolon byte value is a character, not a token number
344 */
345 if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
346 int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
347 do {
348 while(nameLength>0) {
349 --nameLength;
350 if(*name++==';') {
351 break;
352 }
353 }
354 } while(--fieldIndex>0);
355 } else {
356 /*
357 * the semicolon byte value is a token number, therefore
358 * only modern names are stored in unames.dat and there is no
359 * such requested alternate name here
360 */
361 nameLength=0;
362 }
363 }
364
365 /* compare each letter directly, and compare a token word per token */
366 while(nameLength>0) {
367 --nameLength;
368 c=*name++;
369
370 if(c>=tokenCount) {
371 if(c!=';') {
372 /* implicit letter */
373 if((char)c!=*otherName++) {
374 return FALSE;
375 }
376 } else {
377 /* finished */
378 break;
379 }
380 } else {
381 token=tokens[c];
382 if(token==(uint16_t)(-2)) {
383 /* this is a lead byte for a double-byte token */
384 token=tokens[c<<8|*name++];
385 --nameLength;
386 }
387 if(token==(uint16_t)(-1)) {
388 if(c!=';') {
389 /* explicit letter */
390 if((char)c!=*otherName++) {
391 return FALSE;
392 }
393 } else {
394 /* stop, but skip the semicolon if we are seeking
395 extended names and there was no 2.0 name but there
396 is a 1.0 name. */
397 if(otherName == origOtherName && nameChoice == U_EXTENDED_CHAR_NAME) {
398 if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
399 continue;
400 }
401 }
402 /* finished */
403 break;
404 }
405 } else {
406 /* write token word */
407 uint8_t *tokenString=tokenStrings+token;
408 while((c=*tokenString++)!=0) {
409 if((char)c!=*otherName++) {
410 return FALSE;
411 }
412 }
413 }
414 }
415 }
416
417 /* complete match? */
418 return (UBool)(*otherName==0);
419 }
420
421 static uint8_t getCharCat(UChar32 cp) {
422 uint8_t cat;
423
424 if (U_IS_UNICODE_NONCHAR(cp)) {
425 return U_NONCHARACTER_CODE_POINT;
426 }
427
428 if ((cat = u_charType(cp)) == U_SURROGATE) {
429 cat = U_IS_LEAD(cp) ? U_LEAD_SURROGATE : U_TRAIL_SURROGATE;
430 }
431
432 return cat;
433 }
434
435 static const char *getCharCatName(UChar32 cp) {
436 uint8_t cat = getCharCat(cp);
437
438 /* Return unknown if the table of names above is not up to
439 date. */
440
441 if (cat >= UPRV_LENGTHOF(charCatNames)) {
442 return "unknown";
443 } else {
444 return charCatNames[cat];
445 }
446 }
447
448 static uint16_t getExtName(uint32_t code, char *buffer, uint16_t bufferLength) {
449 const char *catname = getCharCatName(code);
450 uint16_t length = 0;
451
452 UChar32 cp;
453 int ndigits, i;
454
455 WRITE_CHAR(buffer, bufferLength, length, '<');
456 while (catname[length - 1]) {
457 WRITE_CHAR(buffer, bufferLength, length, catname[length - 1]);
458 }
459 WRITE_CHAR(buffer, bufferLength, length, '-');
460 for (cp = code, ndigits = 0; cp; ++ndigits, cp >>= 4)
461 ;
462 if (ndigits < 4)
463 ndigits = 4;
464 for (cp = code, i = ndigits; (cp || i > 0) && bufferLength; cp >>= 4, bufferLength--) {
465 uint8_t v = (uint8_t)(cp & 0xf);
466 buffer[--i] = (v < 10 ? '0' + v : 'A' + v - 10);
467 }
468 buffer += ndigits;
469 length += ndigits;
470 WRITE_CHAR(buffer, bufferLength, length, '>');
471
472 return length;
473 }
474
475 /*
476 * getGroup() does a binary search for the group that contains the
477 * Unicode code point "code".
478 * The return value is always a valid Group* that may contain "code"
479 * or else is the highest group before "code".
480 * If the lowest group is after "code", then that one is returned.
481 */
482 static const uint16_t *
483 getGroup(UCharNames *names, uint32_t code) {
484 const uint16_t *groups=GET_GROUPS(names);
485 uint16_t groupMSB=(uint16_t)(code>>GROUP_SHIFT),
486 start=0,
487 limit=*groups++,
488 number;
489
490 /* binary search for the group of names that contains the one for code */
491 while(start<limit-1) {
492 number=(uint16_t)((start+limit)/2);
493 if(groupMSB<groups[number*GROUP_LENGTH+GROUP_MSB]) {
494 limit=number;
495 } else {
496 start=number;
497 }
498 }
499
500 /* return this regardless of whether it is an exact match */
501 return groups+start*GROUP_LENGTH;
502 }
503
504 /*
505 * expandGroupLengths() reads a block of compressed lengths of 32 strings and
506 * expands them into offsets and lengths for each string.
507 * Lengths are stored with a variable-width encoding in consecutive nibbles:
508 * If a nibble<0xc, then it is the length itself (0=empty string).
509 * If a nibble>=0xc, then it forms a length value with the following nibble.
510 * Calculation see below.
511 * The offsets and lengths arrays must be at least 33 (one more) long because
512 * there is no check here at the end if the last nibble is still used.
513 */
514 static const uint8_t *
515 expandGroupLengths(const uint8_t *s,
516 uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_PER_GROUP+1]) {
517 /* read the lengths of the 32 strings in this group and get each string's offset */
518 uint16_t i=0, offset=0, length=0;
519 uint8_t lengthByte;
520
521 /* all 32 lengths must be read to get the offset of the first group string */
522 while(i<LINES_PER_GROUP) {
523 lengthByte=*s++;
524
525 /* read even nibble - MSBs of lengthByte */
526 if(length>=12) {
527 /* double-nibble length spread across two bytes */
528 length=(uint16_t)(((length&0x3)<<4|lengthByte>>4)+12);
529 lengthByte&=0xf;
530 } else if((lengthByte /* &0xf0 */)>=0xc0) {
531 /* double-nibble length spread across this one byte */
532 length=(uint16_t)((lengthByte&0x3f)+12);
533 } else {
534 /* single-nibble length in MSBs */
535 length=(uint16_t)(lengthByte>>4);
536 lengthByte&=0xf;
537 }
538
539 *offsets++=offset;
540 *lengths++=length;
541
542 offset+=length;
543 ++i;
544
545 /* read odd nibble - LSBs of lengthByte */
546 if((lengthByte&0xf0)==0) {
547 /* this nibble was not consumed for a double-nibble length above */
548 length=lengthByte;
549 if(length<12) {
550 /* single-nibble length in LSBs */
551 *offsets++=offset;
552 *lengths++=length;
553
554 offset+=length;
555 ++i;
556 }
557 } else {
558 length=0; /* prevent double-nibble detection in the next iteration */
559 }
560 }
561
562 /* now, s is at the first group string */
563 return s;
564 }
565
566 static uint16_t
567 expandGroupName(UCharNames *names, const uint16_t *group,
568 uint16_t lineNumber, UCharNameChoice nameChoice,
569 char *buffer, uint16_t bufferLength) {
570 uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
571 const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group);
572 s=expandGroupLengths(s, offsets, lengths);
573 return expandName(names, s+offsets[lineNumber], lengths[lineNumber], nameChoice,
574 buffer, bufferLength);
575 }
576
577 static uint16_t
578 getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice,
579 char *buffer, uint16_t bufferLength) {
580 const uint16_t *group=getGroup(names, code);
581 if((uint16_t)(code>>GROUP_SHIFT)==group[GROUP_MSB]) {
582 return expandGroupName(names, group, (uint16_t)(code&GROUP_MASK), nameChoice,
583 buffer, bufferLength);
584 } else {
585 /* group not found */
586 /* zero-terminate */
587 if(bufferLength>0) {
588 *buffer=0;
589 }
590 return 0;
591 }
592 }
593
594 /*
595 * enumGroupNames() enumerates all the names in a 32-group
596 * and either calls the enumerator function or finds a given input name.
597 */
598 static UBool
599 enumGroupNames(UCharNames *names, const uint16_t *group,
600 UChar32 start, UChar32 end,
601 UEnumCharNamesFn *fn, void *context,
602 UCharNameChoice nameChoice) {
603 uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
604 const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group);
605
606 s=expandGroupLengths(s, offsets, lengths);
607 if(fn!=DO_FIND_NAME) {
608 char buffer[200];
609 uint16_t length;
610
611 while(start<=end) {
612 length=expandName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, buffer, sizeof(buffer));
613 if (!length && nameChoice == U_EXTENDED_CHAR_NAME) {
614 buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
615 }
616 /* here, we assume that the buffer is large enough */
617 if(length>0) {
618 if(!fn(context, start, nameChoice, buffer, length)) {
619 return FALSE;
620 }
621 }
622 ++start;
623 }
624 } else {
625 const char *otherName=((FindName *)context)->otherName;
626 while(start<=end) {
627 if(compareName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, otherName)) {
628 ((FindName *)context)->code=start;
629 return FALSE;
630 }
631 ++start;
632 }
633 }
634 return TRUE;
635 }
636
637 /*
638 * enumExtNames() enumerates extended names.
639 * It only needs to do it if it is called with a real function and not
640 * with the dummy DO_FIND_NAME, because u_charFromName() does a check
641 * for extended names by itself.
642 */
643 static UBool
644 enumExtNames(UChar32 start, UChar32 end,
645 UEnumCharNamesFn *fn, void *context)
646 {
647 if(fn!=DO_FIND_NAME) {
648 char buffer[200];
649 uint16_t length;
650
651 while(start<=end) {
652 buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
653 /* here, we assume that the buffer is large enough */
654 if(length>0) {
655 if(!fn(context, start, U_EXTENDED_CHAR_NAME, buffer, length)) {
656 return FALSE;
657 }
658 }
659 ++start;
660 }
661 }
662
663 return TRUE;
664 }
665
666 static UBool
667 enumNames(UCharNames *names,
668 UChar32 start, UChar32 limit,
669 UEnumCharNamesFn *fn, void *context,
670 UCharNameChoice nameChoice) {
671 uint16_t startGroupMSB, endGroupMSB, groupCount;
672 const uint16_t *group, *groupLimit;
673
674 startGroupMSB=(uint16_t)(start>>GROUP_SHIFT);
675 endGroupMSB=(uint16_t)((limit-1)>>GROUP_SHIFT);
676
677 /* find the group that contains start, or the highest before it */
678 group=getGroup(names, start);
679
680 if(startGroupMSB<group[GROUP_MSB] && nameChoice==U_EXTENDED_CHAR_NAME) {
681 /* enumerate synthetic names between start and the group start */
682 UChar32 extLimit=((UChar32)group[GROUP_MSB]<<GROUP_SHIFT);
683 if(extLimit>limit) {
684 extLimit=limit;
685 }
686 if(!enumExtNames(start, extLimit-1, fn, context)) {
687 return FALSE;
688 }
689 start=extLimit;
690 }
691
692 if(startGroupMSB==endGroupMSB) {
693 if(startGroupMSB==group[GROUP_MSB]) {
694 /* if start and limit-1 are in the same group, then enumerate only in that one */
695 return enumGroupNames(names, group, start, limit-1, fn, context, nameChoice);
696 }
697 } else {
698 const uint16_t *groups=GET_GROUPS(names);
699 groupCount=*groups++;
700 groupLimit=groups+groupCount*GROUP_LENGTH;
701
702 if(startGroupMSB==group[GROUP_MSB]) {
703 /* enumerate characters in the partial start group */
704 if((start&GROUP_MASK)!=0) {
705 if(!enumGroupNames(names, group,
706 start, ((UChar32)startGroupMSB<<GROUP_SHIFT)+LINES_PER_GROUP-1,
707 fn, context, nameChoice)) {
708 return FALSE;
709 }
710 group=NEXT_GROUP(group); /* continue with the next group */
711 }
712 } else if(startGroupMSB>group[GROUP_MSB]) {
713 /* make sure that we start enumerating with the first group after start */
714 const uint16_t *nextGroup=NEXT_GROUP(group);
715 if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > startGroupMSB && nameChoice == U_EXTENDED_CHAR_NAME) {
716 UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
717 if (end > limit) {
718 end = limit;
719 }
720 if (!enumExtNames(start, end - 1, fn, context)) {
721 return FALSE;
722 }
723 }
724 group=nextGroup;
725 }
726
727 /* enumerate entire groups between the start- and end-groups */
728 while(group<groupLimit && group[GROUP_MSB]<endGroupMSB) {
729 const uint16_t *nextGroup;
730 start=(UChar32)group[GROUP_MSB]<<GROUP_SHIFT;
731 if(!enumGroupNames(names, group, start, start+LINES_PER_GROUP-1, fn, context, nameChoice)) {
732 return FALSE;
733 }
734 nextGroup=NEXT_GROUP(group);
735 if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > group[GROUP_MSB] + 1 && nameChoice == U_EXTENDED_CHAR_NAME) {
736 UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
737 if (end > limit) {
738 end = limit;
739 }
740 if (!enumExtNames((group[GROUP_MSB] + 1) << GROUP_SHIFT, end - 1, fn, context)) {
741 return FALSE;
742 }
743 }
744 group=nextGroup;
745 }
746
747 /* enumerate within the end group (group[GROUP_MSB]==endGroupMSB) */
748 if(group<groupLimit && group[GROUP_MSB]==endGroupMSB) {
749 return enumGroupNames(names, group, (limit-1)&~GROUP_MASK, limit-1, fn, context, nameChoice);
750 } else if (nameChoice == U_EXTENDED_CHAR_NAME && group == groupLimit) {
751 UChar32 next = (PREV_GROUP(group)[GROUP_MSB] + 1) << GROUP_SHIFT;
752 if (next > start) {
753 start = next;
754 }
755 } else {
756 return TRUE;
757 }
758 }
759
760 /* we have not found a group, which means everything is made of
761 extended names. */
762 if (nameChoice == U_EXTENDED_CHAR_NAME) {
763 if (limit > UCHAR_MAX_VALUE + 1) {
764 limit = UCHAR_MAX_VALUE + 1;
765 }
766 return enumExtNames(start, limit - 1, fn, context);
767 }
768
769 return TRUE;
770 }
771
772 static uint16_t
773 writeFactorSuffix(const uint16_t *factors, uint16_t count,
774 const char *s, /* suffix elements */
775 uint32_t code,
776 uint16_t indexes[8], /* output fields from here */
777 const char *elementBases[8], const char *elements[8],
778 char *buffer, uint16_t bufferLength) {
779 uint16_t i, factor, bufferPos=0;
780 char c;
781
782 /* write elements according to the factors */
783
784 /*
785 * the factorized elements are determined by modulo arithmetic
786 * with the factors of this algorithm
787 *
788 * note that for fewer operations, count is decremented here
789 */
790 --count;
791 for(i=count; i>0; --i) {
792 factor=factors[i];
793 indexes[i]=(uint16_t)(code%factor);
794 code/=factor;
795 }
796 /*
797 * we don't need to calculate the last modulus because start<=code<=end
798 * guarantees here that code<=factors[0]
799 */
800 indexes[0]=(uint16_t)code;
801
802 /* write each element */
803 for(;;) {
804 if(elementBases!=NULL) {
805 *elementBases++=s;
806 }
807
808 /* skip indexes[i] strings */
809 factor=indexes[i];
810 while(factor>0) {
811 while(*s++!=0) {}
812 --factor;
813 }
814 if(elements!=NULL) {
815 *elements++=s;
816 }
817
818 /* write element */
819 while((c=*s++)!=0) {
820 WRITE_CHAR(buffer, bufferLength, bufferPos, c);
821 }
822
823 /* we do not need to perform the rest of this loop for i==count - break here */
824 if(i>=count) {
825 break;
826 }
827
828 /* skip the rest of the strings for this factors[i] */
829 factor=(uint16_t)(factors[i]-indexes[i]-1);
830 while(factor>0) {
831 while(*s++!=0) {}
832 --factor;
833 }
834
835 ++i;
836 }
837
838 /* zero-terminate */
839 if(bufferLength>0) {
840 *buffer=0;
841 }
842
843 return bufferPos;
844 }
845
846 /*
847 * Important:
848 * Parts of findAlgName() are almost the same as some of getAlgName().
849 * Fixes must be applied to both.
850 */
851 static uint16_t
852 getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice,
853 char *buffer, uint16_t bufferLength) {
854 uint16_t bufferPos=0;
855
856 /* Only the normative character name can be algorithmic. */
857 if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
858 /* zero-terminate */
859 if(bufferLength>0) {
860 *buffer=0;
861 }
862 return 0;
863 }
864
865 switch(range->type) {
866 case 0: {
867 /* name = prefix hex-digits */
868 const char *s=(const char *)(range+1);
869 char c;
870
871 uint16_t i, count;
872
873 /* copy prefix */
874 while((c=*s++)!=0) {
875 WRITE_CHAR(buffer, bufferLength, bufferPos, c);
876 }
877
878 /* write hexadecimal code point value */
879 count=range->variant;
880
881 /* zero-terminate */
882 if(count<bufferLength) {
883 buffer[count]=0;
884 }
885
886 for(i=count; i>0;) {
887 if(--i<bufferLength) {
888 c=(char)(code&0xf);
889 if(c<10) {
890 c+='0';
891 } else {
892 c+='A'-10;
893 }
894 buffer[i]=c;
895 }
896 code>>=4;
897 }
898
899 bufferPos+=count;
900 break;
901 }
902 case 1: {
903 /* name = prefix factorized-elements */
904 uint16_t indexes[8];
905 const uint16_t *factors=(const uint16_t *)(range+1);
906 uint16_t count=range->variant;
907 const char *s=(const char *)(factors+count);
908 char c;
909
910 /* copy prefix */
911 while((c=*s++)!=0) {
912 WRITE_CHAR(buffer, bufferLength, bufferPos, c);
913 }
914
915 bufferPos+=writeFactorSuffix(factors, count,
916 s, code-range->start, indexes, NULL, NULL, buffer, bufferLength);
917 break;
918 }
919 default:
920 /* undefined type */
921 /* zero-terminate */
922 if(bufferLength>0) {
923 *buffer=0;
924 }
925 break;
926 }
927
928 return bufferPos;
929 }
930
931 /*
932 * Important: enumAlgNames() and findAlgName() are almost the same.
933 * Any fix must be applied to both.
934 */
935 static UBool
936 enumAlgNames(AlgorithmicRange *range,
937 UChar32 start, UChar32 limit,
938 UEnumCharNamesFn *fn, void *context,
939 UCharNameChoice nameChoice) {
940 char buffer[200];
941 uint16_t length;
942
943 if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
944 return TRUE;
945 }
946
947 switch(range->type) {
948 case 0: {
949 char *s, *end;
950 char c;
951
952 /* get the full name of the start character */
953 length=getAlgName(range, (uint32_t)start, nameChoice, buffer, sizeof(buffer));
954 if(length<=0) {
955 return TRUE;
956 }
957
958 /* call the enumerator function with this first character */
959 if(!fn(context, start, nameChoice, buffer, length)) {
960 return FALSE;
961 }
962
963 /* go to the end of the name; all these names have the same length */
964 end=buffer;
965 while(*end!=0) {
966 ++end;
967 }
968
969 /* enumerate the rest of the names */
970 while(++start<limit) {
971 /* increment the hexadecimal number on a character-basis */
972 s=end;
973 for (;;) {
974 c=*--s;
975 if(('0'<=c && c<'9') || ('A'<=c && c<'F')) {
976 *s=(char)(c+1);
977 break;
978 } else if(c=='9') {
979 *s='A';
980 break;
981 } else if(c=='F') {
982 *s='0';
983 }
984 }
985
986 if(!fn(context, start, nameChoice, buffer, length)) {
987 return FALSE;
988 }
989 }
990 break;
991 }
992 case 1: {
993 uint16_t indexes[8];
994 const char *elementBases[8], *elements[8];
995 const uint16_t *factors=(const uint16_t *)(range+1);
996 uint16_t count=range->variant;
997 const char *s=(const char *)(factors+count);
998 char *suffix, *t;
999 uint16_t prefixLength, i, idx;
1000
1001 char c;
1002
1003 /* name = prefix factorized-elements */
1004
1005 /* copy prefix */
1006 suffix=buffer;
1007 prefixLength=0;
1008 while((c=*s++)!=0) {
1009 *suffix++=c;
1010 ++prefixLength;
1011 }
1012
1013 /* append the suffix of the start character */
1014 length=(uint16_t)(prefixLength+writeFactorSuffix(factors, count,
1015 s, (uint32_t)start-range->start,
1016 indexes, elementBases, elements,
1017 suffix, (uint16_t)(sizeof(buffer)-prefixLength)));
1018
1019 /* call the enumerator function with this first character */
1020 if(!fn(context, start, nameChoice, buffer, length)) {
1021 return FALSE;
1022 }
1023
1024 /* enumerate the rest of the names */
1025 while(++start<limit) {
1026 /* increment the indexes in lexical order bound by the factors */
1027 i=count;
1028 for (;;) {
1029 idx=(uint16_t)(indexes[--i]+1);
1030 if(idx<factors[i]) {
1031 /* skip one index and its element string */
1032 indexes[i]=idx;
1033 s=elements[i];
1034 while(*s++!=0) {
1035 }
1036 elements[i]=s;
1037 break;
1038 } else {
1039 /* reset this index to 0 and its element string to the first one */
1040 indexes[i]=0;
1041 elements[i]=elementBases[i];
1042 }
1043 }
1044
1045 /* to make matters a little easier, just append all elements to the suffix */
1046 t=suffix;
1047 length=prefixLength;
1048 for(i=0; i<count; ++i) {
1049 s=elements[i];
1050 while((c=*s++)!=0) {
1051 *t++=c;
1052 ++length;
1053 }
1054 }
1055 /* zero-terminate */
1056 *t=0;
1057
1058 if(!fn(context, start, nameChoice, buffer, length)) {
1059 return FALSE;
1060 }
1061 }
1062 break;
1063 }
1064 default:
1065 /* undefined type */
1066 break;
1067 }
1068
1069 return TRUE;
1070 }
1071
1072 /*
1073 * findAlgName() is almost the same as enumAlgNames() except that it
1074 * returns the code point for a name if it fits into the range.
1075 * It returns 0xffff otherwise.
1076 */
1077 static UChar32
1078 findAlgName(AlgorithmicRange *range, UCharNameChoice nameChoice, const char *otherName) {
1079 UChar32 code;
1080
1081 if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
1082 return 0xffff;
1083 }
1084
1085 switch(range->type) {
1086 case 0: {
1087 /* name = prefix hex-digits */
1088 const char *s=(const char *)(range+1);
1089 char c;
1090
1091 uint16_t i, count;
1092
1093 /* compare prefix */
1094 while((c=*s++)!=0) {
1095 if((char)c!=*otherName++) {
1096 return 0xffff;
1097 }
1098 }
1099
1100 /* read hexadecimal code point value */
1101 count=range->variant;
1102 code=0;
1103 for(i=0; i<count; ++i) {
1104 c=*otherName++;
1105 if('0'<=c && c<='9') {
1106 code=(code<<4)|(c-'0');
1107 } else if('A'<=c && c<='F') {
1108 code=(code<<4)|(c-'A'+10);
1109 } else {
1110 return 0xffff;
1111 }
1112 }
1113
1114 /* does it fit into the range? */
1115 if(*otherName==0 && range->start<=(uint32_t)code && (uint32_t)code<=range->end) {
1116 return code;
1117 }
1118 break;
1119 }
1120 case 1: {
1121 char buffer[64];
1122 uint16_t indexes[8];
1123 const char *elementBases[8], *elements[8];
1124 const uint16_t *factors=(const uint16_t *)(range+1);
1125 uint16_t count=range->variant;
1126 const char *s=(const char *)(factors+count), *t;
1127 UChar32 start, limit;
1128 uint16_t i, idx;
1129
1130 char c;
1131
1132 /* name = prefix factorized-elements */
1133
1134 /* compare prefix */
1135 while((c=*s++)!=0) {
1136 if((char)c!=*otherName++) {
1137 return 0xffff;
1138 }
1139 }
1140
1141 start=(UChar32)range->start;
1142 limit=(UChar32)(range->end+1);
1143
1144 /* initialize the suffix elements for enumeration; indexes should all be set to 0 */
1145 writeFactorSuffix(factors, count, s, 0,
1146 indexes, elementBases, elements, buffer, sizeof(buffer));
1147
1148 /* compare the first suffix */
1149 if(0==uprv_strcmp(otherName, buffer)) {
1150 return start;
1151 }
1152
1153 /* enumerate and compare the rest of the suffixes */
1154 while(++start<limit) {
1155 /* increment the indexes in lexical order bound by the factors */
1156 i=count;
1157 for (;;) {
1158 idx=(uint16_t)(indexes[--i]+1);
1159 if(idx<factors[i]) {
1160 /* skip one index and its element string */
1161 indexes[i]=idx;
1162 s=elements[i];
1163 while(*s++!=0) {}
1164 elements[i]=s;
1165 break;
1166 } else {
1167 /* reset this index to 0 and its element string to the first one */
1168 indexes[i]=0;
1169 elements[i]=elementBases[i];
1170 }
1171 }
1172
1173 /* to make matters a little easier, just compare all elements of the suffix */
1174 t=otherName;
1175 for(i=0; i<count; ++i) {
1176 s=elements[i];
1177 while((c=*s++)!=0) {
1178 if(c!=*t++) {
1179 s=""; /* does not match */
1180 i=99;
1181 }
1182 }
1183 }
1184 if(i<99 && *t==0) {
1185 return start;
1186 }
1187 }
1188 break;
1189 }
1190 default:
1191 /* undefined type */
1192 break;
1193 }
1194
1195 return 0xffff;
1196 }
1197
1198 /* sets of name characters, maximum name lengths ---------------------------- */
1199
/*
 * Bit-set helpers for sets of byte values.
 * A set is an array of 8 uint32_t (256 bits); the bit for value v is
 * bit (v&0x1f) of word (v>>5).
 * The character argument is converted to uint8_t first, so a negative char
 * value selects one of the bits 0x80..0xff.
 * The argument is fully parenthesized so that any expression can be passed;
 * note that it is evaluated twice, so it must not have side effects.
 */
#define SET_ADD(set, c) ((set)[(uint8_t)(c)>>5]|=((uint32_t)1<<((uint8_t)(c)&0x1f)))
#define SET_CONTAINS(set, c) (((set)[(uint8_t)(c)>>5]&((uint32_t)1<<((uint8_t)(c)&0x1f)))!=0)
1202
1203 static int32_t
1204 calcStringSetLength(uint32_t set[8], const char *s) {
1205 int32_t length=0;
1206 char c;
1207
1208 while((c=*s++)!=0) {
1209 SET_ADD(set, c);
1210 ++length;
1211 }
1212 return length;
1213 }
1214
1215 static int32_t
1216 calcAlgNameSetsLengths(int32_t maxNameLength) {
1217 AlgorithmicRange *range;
1218 uint32_t *p;
1219 uint32_t rangeCount;
1220 int32_t length;
1221
1222 /* enumerate algorithmic ranges */
1223 p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1224 rangeCount=*p;
1225 range=(AlgorithmicRange *)(p+1);
1226 while(rangeCount>0) {
1227 switch(range->type) {
1228 case 0:
1229 /* name = prefix + (range->variant times) hex-digits */
1230 /* prefix */
1231 length=calcStringSetLength(gNameSet, (const char *)(range+1))+range->variant;
1232 if(length>maxNameLength) {
1233 maxNameLength=length;
1234 }
1235 break;
1236 case 1: {
1237 /* name = prefix factorized-elements */
1238 const uint16_t *factors=(const uint16_t *)(range+1);
1239 const char *s;
1240 int32_t i, count=range->variant, factor, factorLength, maxFactorLength;
1241
1242 /* prefix length */
1243 s=(const char *)(factors+count);
1244 length=calcStringSetLength(gNameSet, s);
1245 s+=length+1; /* start of factor suffixes */
1246
1247 /* get the set and maximum factor suffix length for each factor */
1248 for(i=0; i<count; ++i) {
1249 maxFactorLength=0;
1250 for(factor=factors[i]; factor>0; --factor) {
1251 factorLength=calcStringSetLength(gNameSet, s);
1252 s+=factorLength+1;
1253 if(factorLength>maxFactorLength) {
1254 maxFactorLength=factorLength;
1255 }
1256 }
1257 length+=maxFactorLength;
1258 }
1259
1260 if(length>maxNameLength) {
1261 maxNameLength=length;
1262 }
1263 break;
1264 }
1265 default:
1266 /* unknown type */
1267 break;
1268 }
1269
1270 range=(AlgorithmicRange *)((uint8_t *)range+range->size);
1271 --rangeCount;
1272 }
1273 return maxNameLength;
1274 }
1275
1276 static int32_t
1277 calcExtNameSetsLengths(int32_t maxNameLength) {
1278 int32_t i, length;
1279
1280 for(i=0; i<UPRV_LENGTHOF(charCatNames); ++i) {
1281 /*
1282 * for each category, count the length of the category name
1283 * plus 9=
1284 * 2 for <>
1285 * 1 for -
1286 * 6 for most hex digits per code point
1287 */
1288 length=9+calcStringSetLength(gNameSet, charCatNames[i]);
1289 if(length>maxNameLength) {
1290 maxNameLength=length;
1291 }
1292 }
1293 return maxNameLength;
1294 }
1295
1296 static int32_t
1297 calcNameSetLength(const uint16_t *tokens, uint16_t tokenCount, const uint8_t *tokenStrings, int8_t *tokenLengths,
1298 uint32_t set[8],
1299 const uint8_t **pLine, const uint8_t *lineLimit) {
1300 const uint8_t *line=*pLine;
1301 int32_t length=0, tokenLength;
1302 uint16_t c, token;
1303
1304 while(line!=lineLimit && (c=*line++)!=(uint8_t)';') {
1305 if(c>=tokenCount) {
1306 /* implicit letter */
1307 SET_ADD(set, c);
1308 ++length;
1309 } else {
1310 token=tokens[c];
1311 if(token==(uint16_t)(-2)) {
1312 /* this is a lead byte for a double-byte token */
1313 c=c<<8|*line++;
1314 token=tokens[c];
1315 }
1316 if(token==(uint16_t)(-1)) {
1317 /* explicit letter */
1318 SET_ADD(set, c);
1319 ++length;
1320 } else {
1321 /* count token word */
1322 if(tokenLengths!=NULL) {
1323 /* use cached token length */
1324 tokenLength=tokenLengths[c];
1325 if(tokenLength==0) {
1326 tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token);
1327 tokenLengths[c]=(int8_t)tokenLength;
1328 }
1329 } else {
1330 tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token);
1331 }
1332 length+=tokenLength;
1333 }
1334 }
1335 }
1336
1337 *pLine=line;
1338 return length;
1339 }
1340
1341 static void
1342 calcGroupNameSetsLengths(int32_t maxNameLength) {
1343 uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
1344
1345 uint16_t *tokens=(uint16_t *)uCharNames+8;
1346 uint16_t tokenCount=*tokens++;
1347 uint8_t *tokenStrings=(uint8_t *)uCharNames+uCharNames->tokenStringOffset;
1348
1349 int8_t *tokenLengths;
1350
1351 const uint16_t *group;
1352 const uint8_t *s, *line, *lineLimit;
1353
1354 int32_t groupCount, lineNumber, length;
1355
1356 tokenLengths=(int8_t *)uprv_malloc(tokenCount);
1357 if(tokenLengths!=NULL) {
1358 uprv_memset(tokenLengths, 0, tokenCount);
1359 }
1360
1361 group=GET_GROUPS(uCharNames);
1362 groupCount=*group++;
1363
1364 /* enumerate all groups */
1365 while(groupCount>0) {
1366 s=(uint8_t *)uCharNames+uCharNames->groupStringOffset+GET_GROUP_OFFSET(group);
1367 s=expandGroupLengths(s, offsets, lengths);
1368
1369 /* enumerate all lines in each group */
1370 for(lineNumber=0; lineNumber<LINES_PER_GROUP; ++lineNumber) {
1371 line=s+offsets[lineNumber];
1372 length=lengths[lineNumber];
1373 if(length==0) {
1374 continue;
1375 }
1376
1377 lineLimit=line+length;
1378
1379 /* read regular name */
1380 length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
1381 if(length>maxNameLength) {
1382 maxNameLength=length;
1383 }
1384 if(line==lineLimit) {
1385 continue;
1386 }
1387
1388 /* read Unicode 1.0 name */
1389 length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
1390 if(length>maxNameLength) {
1391 maxNameLength=length;
1392 }
1393 if(line==lineLimit) {
1394 continue;
1395 }
1396
1397 /* read ISO comment */
1398 /*length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit);*/
1399 }
1400
1401 group=NEXT_GROUP(group);
1402 --groupCount;
1403 }
1404
1405 if(tokenLengths!=NULL) {
1406 uprv_free(tokenLengths);
1407 }
1408
1409 /* set gMax... - name length last for threading */
1410 gMaxNameLength=maxNameLength;
1411 }
1412
1413 static UBool
1414 calcNameSetsLengths(UErrorCode *pErrorCode) {
1415 static const char extChars[]="0123456789ABCDEF<>-";
1416 int32_t i, maxNameLength;
1417
1418 if(gMaxNameLength!=0) {
1419 return TRUE;
1420 }
1421
1422 if(!isDataLoaded(pErrorCode)) {
1423 return FALSE;
1424 }
1425
1426 /* set hex digits, used in various names, and <>-, used in extended names */
1427 for(i=0; i<(int32_t)sizeof(extChars)-1; ++i) {
1428 SET_ADD(gNameSet, extChars[i]);
1429 }
1430
1431 /* set sets and lengths from algorithmic names */
1432 maxNameLength=calcAlgNameSetsLengths(0);
1433
1434 /* set sets and lengths from extended names */
1435 maxNameLength=calcExtNameSetsLengths(maxNameLength);
1436
1437 /* set sets and lengths from group names, set global maximum values */
1438 calcGroupNameSetsLengths(maxNameLength);
1439
1440 return TRUE;
1441 }
1442
1443 U_NAMESPACE_END
1444
1445 /* public API --------------------------------------------------------------- */
1446
1447 U_NAMESPACE_USE
1448
1449 U_CAPI int32_t U_EXPORT2
1450 u_charName(UChar32 code, UCharNameChoice nameChoice,
1451 char *buffer, int32_t bufferLength,
1452 UErrorCode *pErrorCode) {
1453 AlgorithmicRange *algRange;
1454 uint32_t *p;
1455 uint32_t i;
1456 int32_t length;
1457
1458 /* check the argument values */
1459 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1460 return 0;
1461 } else if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT ||
1462 bufferLength<0 || (bufferLength>0 && buffer==NULL)
1463 ) {
1464 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1465 return 0;
1466 }
1467
1468 if((uint32_t)code>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) {
1469 return u_terminateChars(buffer, bufferLength, 0, pErrorCode);
1470 }
1471
1472 length=0;
1473
1474 /* try algorithmic names first */
1475 p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1476 i=*p;
1477 algRange=(AlgorithmicRange *)(p+1);
1478 while(i>0) {
1479 if(algRange->start<=(uint32_t)code && (uint32_t)code<=algRange->end) {
1480 length=getAlgName(algRange, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
1481 break;
1482 }
1483 algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1484 --i;
1485 }
1486
1487 if(i==0) {
1488 if (nameChoice == U_EXTENDED_CHAR_NAME) {
1489 length = getName(uCharNames, (uint32_t )code, U_EXTENDED_CHAR_NAME, buffer, (uint16_t) bufferLength);
1490 if (!length) {
1491 /* extended character name */
1492 length = getExtName((uint32_t) code, buffer, (uint16_t) bufferLength);
1493 }
1494 } else {
1495 /* normal character name */
1496 length=getName(uCharNames, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
1497 }
1498 }
1499
1500 return u_terminateChars(buffer, bufferLength, length, pErrorCode);
1501 }
1502
1503 U_CAPI int32_t U_EXPORT2
1504 u_getISOComment(UChar32 /*c*/,
1505 char *dest, int32_t destCapacity,
1506 UErrorCode *pErrorCode) {
1507 /* check the argument values */
1508 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1509 return 0;
1510 } else if(destCapacity<0 || (destCapacity>0 && dest==NULL)) {
1511 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1512 return 0;
1513 }
1514
1515 return u_terminateChars(dest, destCapacity, 0, pErrorCode);
1516 }
1517
1518 U_CAPI UChar32 U_EXPORT2
1519 u_charFromName(UCharNameChoice nameChoice,
1520 const char *name,
1521 UErrorCode *pErrorCode) {
1522 char upper[120], lower[120];
1523 FindName findName;
1524 AlgorithmicRange *algRange;
1525 uint32_t *p;
1526 uint32_t i;
1527 UChar32 cp = 0;
1528 char c0;
1529 UChar32 error = 0xffff; /* Undefined, but use this for backwards compatibility. */
1530
1531 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1532 return error;
1533 }
1534
1535 if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || name==NULL || *name==0) {
1536 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1537 return error;
1538 }
1539
1540 if(!isDataLoaded(pErrorCode)) {
1541 return error;
1542 }
1543
1544 /* construct the uppercase and lowercase of the name first */
1545 for(i=0; i<sizeof(upper); ++i) {
1546 if((c0=*name++)!=0) {
1547 upper[i]=uprv_toupper(c0);
1548 lower[i]=uprv_tolower(c0);
1549 } else {
1550 upper[i]=lower[i]=0;
1551 break;
1552 }
1553 }
1554 if(i==sizeof(upper)) {
1555 /* name too long, there is no such character */
1556 *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1557 return error;
1558 }
1559 // i==strlen(name)==strlen(lower)==strlen(upper)
1560
1561 /* try extended names first */
1562 if (lower[0] == '<') {
1563 if (nameChoice == U_EXTENDED_CHAR_NAME) {
1564 // Parse a string like "<category-HHHH>" where HHHH is a hex code point.
1565 if (lower[--i] == '>' && i >= 3 && lower[--i] != '-') {
1566 while (i >= 3 && lower[--i] != '-') {}
1567
1568 if (i >= 2 && lower[i] == '-') {
1569 uint32_t cIdx;
1570
1571 lower[i] = 0;
1572
1573 for (++i; lower[i] != '>'; ++i) {
1574 if (lower[i] >= '0' && lower[i] <= '9') {
1575 cp = (cp << 4) + lower[i] - '0';
1576 } else if (lower[i] >= 'a' && lower[i] <= 'f') {
1577 cp = (cp << 4) + lower[i] - 'a' + 10;
1578 } else {
1579 *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1580 return error;
1581 }
1582 }
1583
1584 /* Now validate the category name.
1585 We could use a binary search, or a trie, if
1586 we really wanted to. */
1587
1588 for (lower[i] = 0, cIdx = 0; cIdx < UPRV_LENGTHOF(charCatNames); ++cIdx) {
1589
1590 if (!uprv_strcmp(lower + 1, charCatNames[cIdx])) {
1591 if (getCharCat(cp) == cIdx) {
1592 return cp;
1593 }
1594 break;
1595 }
1596 }
1597 }
1598 }
1599 }
1600
1601 *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1602 return error;
1603 }
1604
1605 /* try algorithmic names now */
1606 p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1607 i=*p;
1608 algRange=(AlgorithmicRange *)(p+1);
1609 while(i>0) {
1610 if((cp=findAlgName(algRange, nameChoice, upper))!=0xffff) {
1611 return cp;
1612 }
1613 algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1614 --i;
1615 }
1616
1617 /* normal character name */
1618 findName.otherName=upper;
1619 findName.code=error;
1620 enumNames(uCharNames, 0, UCHAR_MAX_VALUE + 1, DO_FIND_NAME, &findName, nameChoice);
1621 if (findName.code == error) {
1622 *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1623 }
1624 return findName.code;
1625 }
1626
1627 U_CAPI void U_EXPORT2
1628 u_enumCharNames(UChar32 start, UChar32 limit,
1629 UEnumCharNamesFn *fn,
1630 void *context,
1631 UCharNameChoice nameChoice,
1632 UErrorCode *pErrorCode) {
1633 AlgorithmicRange *algRange;
1634 uint32_t *p;
1635 uint32_t i;
1636
1637 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1638 return;
1639 }
1640
1641 if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || fn==NULL) {
1642 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1643 return;
1644 }
1645
1646 if((uint32_t) limit > UCHAR_MAX_VALUE + 1) {
1647 limit = UCHAR_MAX_VALUE + 1;
1648 }
1649 if((uint32_t)start>=(uint32_t)limit) {
1650 return;
1651 }
1652
1653 if(!isDataLoaded(pErrorCode)) {
1654 return;
1655 }
1656
1657 /* interleave the data-driven ones with the algorithmic ones */
1658 /* iterate over all algorithmic ranges; assume that they are in ascending order */
1659 p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1660 i=*p;
1661 algRange=(AlgorithmicRange *)(p+1);
1662 while(i>0) {
1663 /* enumerate the character names before the current algorithmic range */
1664 /* here: start<limit */
1665 if((uint32_t)start<algRange->start) {
1666 if((uint32_t)limit<=algRange->start) {
1667 enumNames(uCharNames, start, limit, fn, context, nameChoice);
1668 return;
1669 }
1670 if(!enumNames(uCharNames, start, (UChar32)algRange->start, fn, context, nameChoice)) {
1671 return;
1672 }
1673 start=(UChar32)algRange->start;
1674 }
1675 /* enumerate the character names in the current algorithmic range */
1676 /* here: algRange->start<=start<limit */
1677 if((uint32_t)start<=algRange->end) {
1678 if((uint32_t)limit<=(algRange->end+1)) {
1679 enumAlgNames(algRange, start, limit, fn, context, nameChoice);
1680 return;
1681 }
1682 if(!enumAlgNames(algRange, start, (UChar32)algRange->end+1, fn, context, nameChoice)) {
1683 return;
1684 }
1685 start=(UChar32)algRange->end+1;
1686 }
1687 /* continue to the next algorithmic range (here: start<limit) */
1688 algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1689 --i;
1690 }
1691 /* enumerate the character names after the last algorithmic range */
1692 enumNames(uCharNames, start, limit, fn, context, nameChoice);
1693 }
1694
1695 U_CAPI int32_t U_EXPORT2
1696 uprv_getMaxCharNameLength() {
1697 UErrorCode errorCode=U_ZERO_ERROR;
1698 if(calcNameSetsLengths(&errorCode)) {
1699 return gMaxNameLength;
1700 } else {
1701 return 0;
1702 }
1703 }
1704
1705 /**
1706 * Converts the char set cset into a Unicode set uset.
1707 * @param cset Set of 256 bit flags corresponding to a set of chars.
1708 * @param uset USet to receive characters. Existing contents are deleted.
1709 */
1710 static void
1711 charSetToUSet(uint32_t cset[8], const USetAdder *sa) {
1712 UChar us[256];
1713 char cs[256];
1714
1715 int32_t i, length;
1716 UErrorCode errorCode;
1717
1718 errorCode=U_ZERO_ERROR;
1719
1720 if(!calcNameSetsLengths(&errorCode)) {
1721 return;
1722 }
1723
1724 /* build a char string with all chars that are used in character names */
1725 length=0;
1726 for(i=0; i<256; ++i) {
1727 if(SET_CONTAINS(cset, i)) {
1728 cs[length++]=(char)i;
1729 }
1730 }
1731
1732 /* convert the char string to a UChar string */
1733 u_charsToUChars(cs, us, length);
1734
1735 /* add each UChar to the USet */
1736 for(i=0; i<length; ++i) {
1737 if(us[i]!=0 || cs[i]==0) { /* non-invariant chars become (UChar)0 */
1738 sa->add(sa->set, us[i]);
1739 }
1740 }
1741 }
1742
1743 /**
1744 * Fills set with characters that are used in Unicode character names.
1745 * @param set USet to receive characters.
1746 */
1747 U_CAPI void U_EXPORT2
1748 uprv_getCharNameCharacters(const USetAdder *sa) {
1749 charSetToUSet(gNameSet, sa);
1750 }
1751
1752 /* data swapping ------------------------------------------------------------ */
1753
1754 /*
1755 * The token table contains non-negative entries for token bytes,
1756 * and -1 for bytes that represent themselves in the data file's charset.
1757 * -2 entries are used for lead bytes.
1758 *
1759 * Direct bytes (-1 entries) must be translated from the input charset family
1760 * to the output charset family.
1761 * makeTokenMap() writes a permutation mapping for this.
1762 * Use it once for single-/lead-byte tokens and once more for all trail byte
1763 * tokens. (';' is an unused trail byte marked with -1.)
1764 */
/*
 * Builds map[] so that map[inputByte] is the byte to write to the output.
 * Does nothing if *pErrorCode already indicates a failure.
 * tokens[] holds the token table entries for the 256 byte values that are
 * being mapped; tokenCount is how many of them are defined (pinned to 256).
 * If a direct byte cannot be converted by ds->swapInvChars(), an error is
 * printed and the function returns with *pErrorCode set by that call.
 */
1765 static void
1766 makeTokenMap(const UDataSwapper *ds,
1767 int16_t tokens[], uint16_t tokenCount,
1768 uint8_t map[256],
1769 UErrorCode *pErrorCode) {
1770 UBool usedOutChar[256]; /* usedOutChar[b]!=0: output byte b is already the target of a direct byte */
1771 uint16_t i, j;
1772 uint8_t c1, c2;
1773
1774 if(U_FAILURE(*pErrorCode)) {
1775 return;
1776 }
1777
1778 if(ds->inCharset==ds->outCharset) {
1779 /* Same charset family: identity permutation */
1780 for(i=0; i<256; ++i) {
1781 map[i]=(uint8_t)i;
1782 }
1783 } else {
/* 0 in map[] means "not assigned yet"; byte 0 itself is never remapped */
1784 uprv_memset(map, 0, 256);
1785 uprv_memset(usedOutChar, 0, 256);
1786
1787 if(tokenCount>256) {
1788 tokenCount=256;
1789 }
1790
1791 /* set the direct bytes (byte 0 always maps to itself) */
1792 for(i=1; i<tokenCount; ++i) {
1793 if(tokens[i]==-1) {
1794 /* convert the direct byte character */
1795 c1=(uint8_t)i;
1796 ds->swapInvChars(ds, &c1, 1, &c2, pErrorCode);
1797 if(U_FAILURE(*pErrorCode)) {
1798 udata_printError(ds, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n",
1799 i, ds->inCharset);
1800 return;
1801 }
1802
1803 /* enter the converted character into the map and mark it used */
1804 map[c1]=c2;
1805 usedOutChar[c2]=TRUE;
1806 }
1807 }
1808
1809 /* set the mappings for the rest of the permutation */
/* j walks upwards through the output byte values and never goes back */
1810 for(i=j=1; i<tokenCount; ++i) {
1811 /* set mappings that were not set for direct bytes */
1812 if(map[i]==0) {
1813 /* set an output byte value that was not used as an output byte above */
1814 while(usedOutChar[j]) {
1815 ++j;
1816 }
1817 map[i]=(uint8_t)j++;
1818 }
1819 }
1820
1821 /*
1822 * leave mappings at tokenCount and above unset if tokenCount<256
1823 * because they won't be used
1824 */
1825 }
1826 }
1827
/*
 * Swaps unames.icu data from the input platform properties of ds to its
 * output properties (byte order and/or charset family).
 *
 * A negative length requests preflighting: nothing is written, and the
 * function only walks the algorithmic ranges to determine the data size.
 * Otherwise the data is swapped from inData to outData (which may be the
 * same buffer).
 *
 * Returns the header size plus the offset behind the last algorithmic range,
 * or 0 if an error occurred (see *pErrorCode).
 */
1828 U_CAPI int32_t U_EXPORT2
1829 uchar_swapNames(const UDataSwapper *ds,
1830 const void *inData, int32_t length, void *outData,
1831 UErrorCode *pErrorCode) {
1832 const UDataInfo *pInfo;
1833 int32_t headerSize;
1834
1835 const uint8_t *inBytes;
1836 uint8_t *outBytes;
1837
1838 uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset,
1839 offset, i, count, stringsCount;
1840
1841 const AlgorithmicRange *inRange;
1842 AlgorithmicRange *outRange;
1843
1844 /* udata_swapDataHeader checks the arguments */
1845 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
1846 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1847 return 0;
1848 }
1849
1850 /* check data format and format version */
1851 pInfo=(const UDataInfo *)((const char *)inData+4);
1852 if(!(
1853 pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */
1854 pInfo->dataFormat[1]==0x6e &&
1855 pInfo->dataFormat[2]==0x61 &&
1856 pInfo->dataFormat[3]==0x6d &&
1857 pInfo->formatVersion[0]==1
1858 )) {
1859 udata_printError(ds, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n",
1860 pInfo->dataFormat[0], pInfo->dataFormat[1],
1861 pInfo->dataFormat[2], pInfo->dataFormat[3],
1862 pInfo->formatVersion[0]);
1863 *pErrorCode=U_UNSUPPORTED_ERROR;
1864 return 0;
1865 }
1866
1867 inBytes=(const uint8_t *)inData+headerSize;
1868 outBytes=(uint8_t *)outData+headerSize;
/* the data starts with 4 uint32_t offsets; the one at index 3 is the algorithmic names offset */
1869 if(length<0) {
1870 algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]);
1871 } else {
/* from here on, length counts only the bytes after the header */
1872 length-=headerSize;
1873 if( length<20 ||
1874 (uint32_t)length<(algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]))
1875 ) {
1876 udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n",
1877 length);
1878 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1879 return 0;
1880 }
1881 }
1882
1883 if(length<0) {
1884 /* preflighting: iterate through algorithmic ranges */
1885 offset=algNamesOffset;
1886 count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
1887 offset+=4;
1888
/* each range stores its own size; adding them up yields the end of the data */
1889 for(i=0; i<count; ++i) {
1890 inRange=(const AlgorithmicRange *)(inBytes+offset);
1891 offset+=ds->readUInt16(inRange->size);
1892 }
1893 } else {
1894 /* swap data */
1895 const uint16_t *p;
1896 uint16_t *q, *temp;
1897
1898 int16_t tokens[512]; /* platform-endian copy of the first 512 token table entries */
1899 uint16_t tokenCount;
1900
1901 uint8_t map[256], trailMap[256]; /* byte permutations for single/lead bytes and for trail bytes */
1902
1903 /* copy the data for inaccessible bytes */
1904 if(inBytes!=outBytes) {
1905 uprv_memcpy(outBytes, inBytes, length);
1906 }
1907
1908 /* the initial 4 offsets first */
1909 tokenStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[0]);
1910 groupsOffset=ds->readUInt32(((const uint32_t *)inBytes)[1]);
1911 groupStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[2]);
1912 ds->swapArray32(ds, inBytes, 16, outBytes, pErrorCode);
1913
1914 /*
1915 * now the tokens table
1916 * it needs to be permutated along with the compressed name strings
1917 */
1918 p=(const uint16_t *)(inBytes+16);
1919 q=(uint16_t *)(outBytes+16);
1920
1921 /* read and swap the tokenCount */
1922 tokenCount=ds->readUInt16(*p);
1923 ds->swapArray16(ds, p, 2, q, pErrorCode);
1924 ++p;
1925 ++q;
1926
1927 /* read the first 512 tokens and make the token maps */
1928 if(tokenCount<=512) {
1929 count=tokenCount;
1930 } else {
1931 count=512;
1932 }
1933 for(i=0; i<count; ++i) {
1934 tokens[i]=udata_readInt16(ds, p[i]);
1935 }
1936 for(; i<512; ++i) {
1937 tokens[i]=0; /* fill the rest of the tokens array if tokenCount<512 */
1938 }
/* map[] is built from tokens 0..255, trailMap[] from tokens 256..511 */
1939 makeTokenMap(ds, tokens, tokenCount, map, pErrorCode);
1940 makeTokenMap(ds, tokens+256, (uint16_t)(tokenCount>256 ? tokenCount-256 : 0), trailMap, pErrorCode);
1941 if(U_FAILURE(*pErrorCode)) {
1942 return 0;
1943 }
1944
1945 /*
1946 * swap and permutate the tokens
1947 * go through a temporary array to support in-place swapping
1948 */
1949 temp=(uint16_t *)uprv_malloc(tokenCount*2);
1950 if(temp==NULL) {
1951 udata_printError(ds, "out of memory swapping %u unames.icu tokens\n",
1952 tokenCount);
1953 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1954 return 0;
1955 }
1956
1957 /* swap and permutate single-/lead-byte tokens */
1958 for(i=0; i<tokenCount && i<256; ++i) {
1959 ds->swapArray16(ds, p+i, 2, temp+map[i], pErrorCode);
1960 }
1961
1962 /* swap and permutate trail-byte tokens */
/* the upper bits of the token index (the lead byte part) are kept, only the low byte is permutated */
1963 for(; i<tokenCount; ++i) {
1964 ds->swapArray16(ds, p+i, 2, temp+(i&0xffffff00)+trailMap[i&0xff], pErrorCode);
1965 }
1966
1967 /* copy the result into the output and free the temporary array */
1968 uprv_memcpy(q, temp, tokenCount*2);
1969 uprv_free(temp);
1970
1971 /*
1972 * swap the token strings but not a possible padding byte after
1973 * the terminating NUL of the last string
1974 */
1975 udata_swapInvStringBlock(ds, inBytes+tokenStringOffset, (int32_t)(groupsOffset-tokenStringOffset),
1976 outBytes+tokenStringOffset, pErrorCode);
1977 if(U_FAILURE(*pErrorCode)) {
1978 udata_printError(ds, "uchar_swapNames(token strings) failed\n");
1979 return 0;
1980 }
1981
1982 /* swap the group table */
/* one uint16_t group count followed by 3 uint16_t per group */
1983 count=ds->readUInt16(*((const uint16_t *)(inBytes+groupsOffset)));
1984 ds->swapArray16(ds, inBytes+groupsOffset, (int32_t)((1+count*3)*2),
1985 outBytes+groupsOffset, pErrorCode);
1986
1987 /*
1988 * swap the group strings
1989 * swap the string bytes but not the nibble-encoded string lengths
1990 */
1991 if(ds->inCharset!=ds->outCharset) {
1992 uint16_t offsets[LINES_PER_GROUP+1], lengths[LINES_PER_GROUP+1];
1993
1994 const uint8_t *inStrings, *nextInStrings;
1995 uint8_t *outStrings;
1996
1997 uint8_t c;
1998
1999 inStrings=inBytes+groupStringOffset;
2000 outStrings=outBytes+groupStringOffset;
2001
/* the group strings extend up to the algorithmic names */
2002 stringsCount=algNamesOffset-groupStringOffset;
2003
2004 /* iterate through string groups until only a few padding bytes are left */
2005 while(stringsCount>32) {
2006 nextInStrings=expandGroupLengths(inStrings, offsets, lengths);
2007
2008 /* move past the length bytes */
/* they were already copied unchanged by the uprv_memcpy() above, or are in place */
2009 stringsCount-=(uint32_t)(nextInStrings-inStrings);
2010 outStrings+=nextInStrings-inStrings;
2011 inStrings=nextInStrings;
2012
2013 count=offsets[31]+lengths[31]; /* total number of string bytes in this group */
2014 stringsCount-=count;
2015
2016 /* swap the string bytes using map[] and trailMap[] */
2017 while(count>0) {
2018 c=*inStrings++;
2019 *outStrings++=map[c];
2020 if(tokens[c]!=-2) {
2021 --count;
2022 } else {
2023 /* token lead byte: swap the trail byte, too */
2024 *outStrings++=trailMap[*inStrings++];
2025 count-=2;
2026 }
2027 }
2028 }
2029 }
2030
2031 /* swap the algorithmic ranges */
2032 offset=algNamesOffset;
2033 count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
2034 ds->swapArray32(ds, inBytes+offset, 4, outBytes+offset, pErrorCode);
2035 offset+=4;
2036
2037 for(i=0; i<count; ++i) {
/* make sure that the range starts inside the data */
2038 if(offset>(uint32_t)length) {
2039 udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n",
2040 length, i);
2041 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2042 return 0;
2043 }
2044
2045 inRange=(const AlgorithmicRange *)(inBytes+offset);
2046 outRange=(AlgorithmicRange *)(outBytes+offset);
/* read the size before swapping because the swap may be in place */
2047 offset+=ds->readUInt16(inRange->size);
2048
/* the first 8 bytes are swapped as 32-bit units, then the 16-bit size field */
2049 ds->swapArray32(ds, inRange, 8, outRange, pErrorCode);
2050 ds->swapArray16(ds, &inRange->size, 2, &outRange->size, pErrorCode);
2051 switch(inRange->type) {
2052 case 0:
2053 /* swap prefix string */
2054 ds->swapInvChars(ds, inRange+1, (int32_t)uprv_strlen((const char *)(inRange+1)),
2055 outRange+1, pErrorCode);
2056 if(U_FAILURE(*pErrorCode)) {
2057 udata_printError(ds, "uchar_swapNames(prefix string of algorithmic range %u) failed\n",
2058 i);
2059 return 0;
2060 }
2061 break;
2062 case 1:
2063 {
2064 /* swap factors and the prefix and factor strings */
2065 uint32_t factorsCount;
2066
2067 factorsCount=inRange->variant;
2068 p=(const uint16_t *)(inRange+1);
2069 q=(uint16_t *)(outRange+1);
2070 ds->swapArray16(ds, p, (int32_t)(factorsCount*2), q, pErrorCode);
2071
2072 /* swap the strings, up to the last terminating NUL */
2073 p+=factorsCount;
2074 q+=factorsCount;
/* offset already points behind this range, so this is the number of bytes after the factors */
2075 stringsCount=(uint32_t)((inBytes+offset)-(const uint8_t *)p);
/* drop trailing non-NUL bytes after the last string */
2076 while(stringsCount>0 && ((const uint8_t *)p)[stringsCount-1]!=0) {
2077 --stringsCount;
2078 }
2079 ds->swapInvChars(ds, p, (int32_t)stringsCount, q, pErrorCode);
2080 }
2081 break;
2082 default:
2083 udata_printError(ds, "uchar_swapNames(): unknown type %u of algorithmic range %u\n",
2084 inRange->type, i);
2085 *pErrorCode=U_UNSUPPORTED_ERROR;
2086 return 0;
2087 }
2088 }
2089 }
2090
/* offset is now the size of the data after the header, in both modes */
2091 return headerSize+(int32_t)offset;
2092 }
2093
2094 /*
2095 * Hey, Emacs, please set the following:
2096 *
2097 * Local Variables:
2098 * indent-tabs-mode: nil
2099 * End:
2100 *
2101 */