]> git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/common/unames.cpp
ICU-57131.0.1.tar.gz
[apple/icu.git] / icuSources / common / unames.cpp
... / ...
CommitLineData
1/*
2******************************************************************************
3*
4* Copyright (C) 1999-2014, International Business Machines
5* Corporation and others. All Rights Reserved.
6*
7******************************************************************************
8* file name: unames.c
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 1999oct04
14* created by: Markus W. Scherer
15*/
16
17#include "unicode/utypes.h"
18#include "unicode/putil.h"
19#include "unicode/uchar.h"
20#include "unicode/udata.h"
21#include "unicode/utf.h"
22#include "unicode/utf16.h"
23#include "uassert.h"
24#include "ustr_imp.h"
25#include "umutex.h"
26#include "cmemory.h"
27#include "cstring.h"
28#include "ucln_cmn.h"
29#include "udataswp.h"
30#include "uprops.h"
31
32U_NAMESPACE_BEGIN
33
34/* prototypes ------------------------------------------------------------- */
35
/* Name and type of the data item that loadCharNames() opens via udata_openChoice(). */
static const char DATA_NAME[] = "unames";
static const char DATA_TYPE[] = "icu";

/*
 * Names are stored in groups of 32 consecutive "lines" (one per code point).
 * code>>GROUP_SHIFT is the group number (compared with group[GROUP_MSB]),
 * code&GROUP_MASK is the line number inside that group.
 */
#define GROUP_SHIFT 5
#define LINES_PER_GROUP (1L << GROUP_SHIFT)
#define GROUP_MASK (LINES_PER_GROUP - 1)
42
43/*
44 * This struct was replaced by explicitly accessing equivalent
45 * fields from triples of uint16_t.
46 * The Group struct was padded to 8 bytes on compilers for early ARM CPUs,
47 * which broke the assumption that sizeof(Group)==6 and that the ++ operator
48 * would advance by 6 bytes (3 uint16_t).
49 *
50 * We can't just change the data structure because it's loaded from a data file,
51 * and we don't want to make it less compact, so we changed the access code.
52 *
53 * For details see ICU tickets 6331 and 6008.
54typedef struct {
55 uint16_t groupMSB,
56 offsetHigh, offsetLow; / * avoid padding * /
57} Group;
58 */
/*
 * Field indexes inside one group record. A group is stored as
 * GROUP_LENGTH consecutive uint16_t values instead of a struct so that
 * compiler padding cannot change the 6-byte record size.
 */
enum {
    GROUP_MSB,          /* code point >> GROUP_SHIFT of the group */
    GROUP_OFFSET_HIGH,  /* upper 16 bits of the group's string offset */
    GROUP_OFFSET_LOW,   /* lower 16 bits of the group's string offset */
    GROUP_LENGTH        /* number of uint16_t per group record */
};

/*
 * Get the 32-bit group offset.
 * The two halves are combined as uint32_t so that a high half >=0x8000
 * is never shifted into the sign bit of a signed integer (undefined behavior).
 * @param group (const uint16_t *) pointer to a Group triple of uint16_t
 * @return group offset (int32_t)
 */
#define GET_GROUP_OFFSET(group) \
    ((int32_t)((uint32_t)(group)[GROUP_OFFSET_HIGH]<<16|(uint32_t)(group)[GROUP_OFFSET_LOW]))

/* Step one group record (GROUP_LENGTH uint16_t) forward or backward. */
#define NEXT_GROUP(group) ((group)+GROUP_LENGTH)
#define PREV_GROUP(group) ((group)-GROUP_LENGTH)
75
/*
 * Header of one algorithmic-name range as stored in the data file.
 * The range-specific data (name prefix, and for type 1 the factors)
 * follows directly after this header; the code reads it via (range+1).
 */
typedef struct {
    uint32_t start;     /* first code point of the range */
    uint32_t end;       /* last code point of the range (inclusive) */
    uint8_t type;       /* 0: prefix + hex digits, 1: prefix + factorized elements */
    uint8_t variant;    /* type 0: number of hex digits; type 1: number of factors */
    uint16_t size;      /* presumably the byte size of the whole range record - TODO confirm */
} AlgorithmicRange;

/*
 * Index header at the start of the loaded names data.
 * All offsets are byte offsets from the start of this struct.
 */
typedef struct {
    uint32_t tokenStringOffset;
    uint32_t groupsOffset;
    uint32_t groupStringOffset;
    uint32_t algNamesOffset;
} UCharNames;
85
86/*
87 * Get the groups table from a UCharNames struct.
88 * The groups table consists of one uint16_t groupCount followed by
89 * groupCount groups. Each group is a triple of uint16_t, see GROUP_LENGTH
90 * and the comment for the old struct Group above.
91 *
92 * @param names (const UCharNames *) pointer to the UCharNames indexes
93 * @return (const uint16_t *) pointer to the groups table
94 */
/* The argument and the whole expansion are parenthesized so that any pointer expression can be passed. */
#define GET_GROUPS(names) ((const uint16_t *)((const char *)(names)+(names)->groupsOffset))
96
97typedef struct {
98 const char *otherName;
99 UChar32 code;
100} FindName;
101
102#define DO_FIND_NAME NULL
103
104static UDataMemory *uCharNamesData=NULL;
105static UCharNames *uCharNames=NULL;
106static icu::UInitOnce gCharNamesInitOnce = U_INITONCE_INITIALIZER;
107
/*
 * Longest character name length (regular & 1.0 names).
 * 0 until it has been calculated; reset to 0 by unames_cleanup().
 */
static int32_t gMaxNameLength = 0;

/*
 * Bit set (8*32=256 bits, one per byte value) of the chars that occur in
 * character names (regular & 1.0).
 * Chars are platform-dependent (can be EBCDIC).
 */
static uint32_t gNameSet[8] = { 0 };
118
/*
 * Extra pseudo-categories used only for extended names; they are numbered
 * right after the regular general categories.
 * Each expansion is parenthesized so that it is safe inside larger expressions.
 */
#define U_NONCHARACTER_CODE_POINT (U_CHAR_CATEGORY_COUNT)
#define U_LEAD_SURROGATE (U_CHAR_CATEGORY_COUNT + 1)
#define U_TRAIL_SURROGATE (U_CHAR_CATEGORY_COUNT + 2)

/* Number of regular plus extended categories (size of charCatNames[]). */
#define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3)
124
125static const char * const charCatNames[U_CHAR_EXTENDED_CATEGORY_COUNT] = {
126 "unassigned",
127 "uppercase letter",
128 "lowercase letter",
129 "titlecase letter",
130 "modifier letter",
131 "other letter",
132 "non spacing mark",
133 "enclosing mark",
134 "combining spacing mark",
135 "decimal digit number",
136 "letter number",
137 "other number",
138 "space separator",
139 "line separator",
140 "paragraph separator",
141 "control",
142 "format",
143 "private use area",
144 "surrogate",
145 "dash punctuation",
146 "start punctuation",
147 "end punctuation",
148 "connector punctuation",
149 "other punctuation",
150 "math symbol",
151 "currency symbol",
152 "modifier symbol",
153 "other symbol",
154 "initial punctuation",
155 "final punctuation",
156 "noncharacter",
157 "lead surrogate",
158 "trail surrogate"
159};
160
161/* implementation ----------------------------------------------------------- */
162
163static UBool U_CALLCONV unames_cleanup(void)
164{
165 if(uCharNamesData) {
166 udata_close(uCharNamesData);
167 uCharNamesData = NULL;
168 }
169 if(uCharNames) {
170 uCharNames = NULL;
171 }
172 gCharNamesInitOnce.reset();
173 gMaxNameLength=0;
174 return TRUE;
175}
176
177static UBool U_CALLCONV
178isAcceptable(void * /*context*/,
179 const char * /*type*/, const char * /*name*/,
180 const UDataInfo *pInfo) {
181 return (UBool)(
182 pInfo->size>=20 &&
183 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
184 pInfo->charsetFamily==U_CHARSET_FAMILY &&
185 pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */
186 pInfo->dataFormat[1]==0x6e &&
187 pInfo->dataFormat[2]==0x61 &&
188 pInfo->dataFormat[3]==0x6d &&
189 pInfo->formatVersion[0]==1);
190}
191
192static void U_CALLCONV
193loadCharNames(UErrorCode &status) {
194 U_ASSERT(uCharNamesData == NULL);
195 U_ASSERT(uCharNames == NULL);
196
197 uCharNamesData = udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &status);
198 if(U_FAILURE(status)) {
199 uCharNamesData = NULL;
200 } else {
201 uCharNames = (UCharNames *)udata_getMemory(uCharNamesData);
202 }
203 ucln_common_registerCleanup(UCLN_COMMON_UNAMES, unames_cleanup);
204}
205
206
207static UBool
208isDataLoaded(UErrorCode *pErrorCode) {
209 umtx_initOnce(gCharNamesInitOnce, &loadCharNames, *pErrorCode);
210 return U_SUCCESS(*pErrorCode);
211}
212
/*
 * Append c to the output if there is room, and always count it.
 * buffer is advanced and bufferLength decremented only when the char is
 * stored; bufferPos is incremented unconditionally so that the caller
 * learns the full (untruncated) length.
 * Wrapped in do { } while(0) so that "WRITE_CHAR(...);" is a single
 * statement, also in an unbraced if/else.
 */
#define WRITE_CHAR(buffer, bufferLength, bufferPos, c) do { \
    if((bufferLength)>0) { \
        *(buffer)++=(c); \
        --(bufferLength); \
    } \
    ++(bufferPos); \
} while(0)

/* Pseudo name choice right after the public UCharNameChoice values. */
#define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT
222
223/*
224 * Important: expandName() and compareName() are almost the same -
225 * apply fixes to both.
226 *
227 * UnicodeData.txt uses ';' as a field separator, so no
228 * field can contain ';' as part of its contents.
229 * In unames.dat, it is marked as token[';']==-1 only if the
230 * semicolon is used in the data file - which is iff we
231 * have Unicode 1.0 names or ISO comments or aliases.
232 * So, it will be token[';']==-1 if we store U1.0 names/ISO comments/aliases
233 * although we know that it will never be part of a name.
234 */
235static uint16_t
236expandName(UCharNames *names,
237 const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
238 char *buffer, uint16_t bufferLength) {
239 uint16_t *tokens=(uint16_t *)names+8;
240 uint16_t token, tokenCount=*tokens++, bufferPos=0;
241 uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
242 uint8_t c;
243
244 if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
245 /*
246 * skip the modern name if it is not requested _and_
247 * if the semicolon byte value is a character, not a token number
248 */
249 if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
250 int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
251 do {
252 while(nameLength>0) {
253 --nameLength;
254 if(*name++==';') {
255 break;
256 }
257 }
258 } while(--fieldIndex>0);
259 } else {
260 /*
261 * the semicolon byte value is a token number, therefore
262 * only modern names are stored in unames.dat and there is no
263 * such requested alternate name here
264 */
265 nameLength=0;
266 }
267 }
268
269 /* write each letter directly, and write a token word per token */
270 while(nameLength>0) {
271 --nameLength;
272 c=*name++;
273
274 if(c>=tokenCount) {
275 if(c!=';') {
276 /* implicit letter */
277 WRITE_CHAR(buffer, bufferLength, bufferPos, c);
278 } else {
279 /* finished */
280 break;
281 }
282 } else {
283 token=tokens[c];
284 if(token==(uint16_t)(-2)) {
285 /* this is a lead byte for a double-byte token */
286 token=tokens[c<<8|*name++];
287 --nameLength;
288 }
289 if(token==(uint16_t)(-1)) {
290 if(c!=';') {
291 /* explicit letter */
292 WRITE_CHAR(buffer, bufferLength, bufferPos, c);
293 } else {
294 /* stop, but skip the semicolon if we are seeking
295 extended names and there was no 2.0 name but there
296 is a 1.0 name. */
297 if(!bufferPos && nameChoice == U_EXTENDED_CHAR_NAME) {
298 if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
299 continue;
300 }
301 }
302 /* finished */
303 break;
304 }
305 } else {
306 /* write token word */
307 uint8_t *tokenString=tokenStrings+token;
308 while((c=*tokenString++)!=0) {
309 WRITE_CHAR(buffer, bufferLength, bufferPos, c);
310 }
311 }
312 }
313 }
314
315 /* zero-terminate */
316 if(bufferLength>0) {
317 *buffer=0;
318 }
319
320 return bufferPos;
321}
322
323/*
324 * compareName() is almost the same as expandName() except that it compares
325 * the currently expanded name to an input name.
326 * It returns the match/no match result as soon as possible.
327 */
328static UBool
329compareName(UCharNames *names,
330 const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
331 const char *otherName) {
332 uint16_t *tokens=(uint16_t *)names+8;
333 uint16_t token, tokenCount=*tokens++;
334 uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
335 uint8_t c;
336 const char *origOtherName = otherName;
337
338 if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
339 /*
340 * skip the modern name if it is not requested _and_
341 * if the semicolon byte value is a character, not a token number
342 */
343 if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
344 int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
345 do {
346 while(nameLength>0) {
347 --nameLength;
348 if(*name++==';') {
349 break;
350 }
351 }
352 } while(--fieldIndex>0);
353 } else {
354 /*
355 * the semicolon byte value is a token number, therefore
356 * only modern names are stored in unames.dat and there is no
357 * such requested alternate name here
358 */
359 nameLength=0;
360 }
361 }
362
363 /* compare each letter directly, and compare a token word per token */
364 while(nameLength>0) {
365 --nameLength;
366 c=*name++;
367
368 if(c>=tokenCount) {
369 if(c!=';') {
370 /* implicit letter */
371 if((char)c!=*otherName++) {
372 return FALSE;
373 }
374 } else {
375 /* finished */
376 break;
377 }
378 } else {
379 token=tokens[c];
380 if(token==(uint16_t)(-2)) {
381 /* this is a lead byte for a double-byte token */
382 token=tokens[c<<8|*name++];
383 --nameLength;
384 }
385 if(token==(uint16_t)(-1)) {
386 if(c!=';') {
387 /* explicit letter */
388 if((char)c!=*otherName++) {
389 return FALSE;
390 }
391 } else {
392 /* stop, but skip the semicolon if we are seeking
393 extended names and there was no 2.0 name but there
394 is a 1.0 name. */
395 if(otherName == origOtherName && nameChoice == U_EXTENDED_CHAR_NAME) {
396 if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
397 continue;
398 }
399 }
400 /* finished */
401 break;
402 }
403 } else {
404 /* write token word */
405 uint8_t *tokenString=tokenStrings+token;
406 while((c=*tokenString++)!=0) {
407 if((char)c!=*otherName++) {
408 return FALSE;
409 }
410 }
411 }
412 }
413 }
414
415 /* complete match? */
416 return (UBool)(*otherName==0);
417}
418
419static uint8_t getCharCat(UChar32 cp) {
420 uint8_t cat;
421
422 if (U_IS_UNICODE_NONCHAR(cp)) {
423 return U_NONCHARACTER_CODE_POINT;
424 }
425
426 if ((cat = u_charType(cp)) == U_SURROGATE) {
427 cat = U_IS_LEAD(cp) ? U_LEAD_SURROGATE : U_TRAIL_SURROGATE;
428 }
429
430 return cat;
431}
432
433static const char *getCharCatName(UChar32 cp) {
434 uint8_t cat = getCharCat(cp);
435
436 /* Return unknown if the table of names above is not up to
437 date. */
438
439 if (cat >= UPRV_LENGTHOF(charCatNames)) {
440 return "unknown";
441 } else {
442 return charCatNames[cat];
443 }
444}
445
446static uint16_t getExtName(uint32_t code, char *buffer, uint16_t bufferLength) {
447 const char *catname = getCharCatName(code);
448 uint16_t length = 0;
449
450 UChar32 cp;
451 int ndigits, i;
452
453 WRITE_CHAR(buffer, bufferLength, length, '<');
454 while (catname[length - 1]) {
455 WRITE_CHAR(buffer, bufferLength, length, catname[length - 1]);
456 }
457 WRITE_CHAR(buffer, bufferLength, length, '-');
458 for (cp = code, ndigits = 0; cp; ++ndigits, cp >>= 4)
459 ;
460 if (ndigits < 4)
461 ndigits = 4;
462 for (cp = code, i = ndigits; (cp || i > 0) && bufferLength; cp >>= 4, bufferLength--) {
463 uint8_t v = (uint8_t)(cp & 0xf);
464 buffer[--i] = (v < 10 ? '0' + v : 'A' + v - 10);
465 }
466 buffer += ndigits;
467 length += ndigits;
468 WRITE_CHAR(buffer, bufferLength, length, '>');
469
470 return length;
471}
472
473/*
474 * getGroup() does a binary search for the group that contains the
475 * Unicode code point "code".
476 * The return value is always a valid Group* that may contain "code"
477 * or else is the highest group before "code".
478 * If the lowest group is after "code", then that one is returned.
479 */
480static const uint16_t *
481getGroup(UCharNames *names, uint32_t code) {
482 const uint16_t *groups=GET_GROUPS(names);
483 uint16_t groupMSB=(uint16_t)(code>>GROUP_SHIFT),
484 start=0,
485 limit=*groups++,
486 number;
487
488 /* binary search for the group of names that contains the one for code */
489 while(start<limit-1) {
490 number=(uint16_t)((start+limit)/2);
491 if(groupMSB<groups[number*GROUP_LENGTH+GROUP_MSB]) {
492 limit=number;
493 } else {
494 start=number;
495 }
496 }
497
498 /* return this regardless of whether it is an exact match */
499 return groups+start*GROUP_LENGTH;
500}
501
502/*
503 * expandGroupLengths() reads a block of compressed lengths of 32 strings and
504 * expands them into offsets and lengths for each string.
505 * Lengths are stored with a variable-width encoding in consecutive nibbles:
506 * If a nibble<0xc, then it is the length itself (0=empty string).
507 * If a nibble>=0xc, then it forms a length value with the following nibble.
508 * Calculation see below.
509 * The offsets and lengths arrays must be at least 33 (one more) long because
510 * there is no check here at the end if the last nibble is still used.
511 */
512static const uint8_t *
513expandGroupLengths(const uint8_t *s,
514 uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_PER_GROUP+1]) {
515 /* read the lengths of the 32 strings in this group and get each string's offset */
516 uint16_t i=0, offset=0, length=0;
517 uint8_t lengthByte;
518
519 /* all 32 lengths must be read to get the offset of the first group string */
520 while(i<LINES_PER_GROUP) {
521 lengthByte=*s++;
522
523 /* read even nibble - MSBs of lengthByte */
524 if(length>=12) {
525 /* double-nibble length spread across two bytes */
526 length=(uint16_t)(((length&0x3)<<4|lengthByte>>4)+12);
527 lengthByte&=0xf;
528 } else if((lengthByte /* &0xf0 */)>=0xc0) {
529 /* double-nibble length spread across this one byte */
530 length=(uint16_t)((lengthByte&0x3f)+12);
531 } else {
532 /* single-nibble length in MSBs */
533 length=(uint16_t)(lengthByte>>4);
534 lengthByte&=0xf;
535 }
536
537 *offsets++=offset;
538 *lengths++=length;
539
540 offset+=length;
541 ++i;
542
543 /* read odd nibble - LSBs of lengthByte */
544 if((lengthByte&0xf0)==0) {
545 /* this nibble was not consumed for a double-nibble length above */
546 length=lengthByte;
547 if(length<12) {
548 /* single-nibble length in LSBs */
549 *offsets++=offset;
550 *lengths++=length;
551
552 offset+=length;
553 ++i;
554 }
555 } else {
556 length=0; /* prevent double-nibble detection in the next iteration */
557 }
558 }
559
560 /* now, s is at the first group string */
561 return s;
562}
563
564static uint16_t
565expandGroupName(UCharNames *names, const uint16_t *group,
566 uint16_t lineNumber, UCharNameChoice nameChoice,
567 char *buffer, uint16_t bufferLength) {
568 uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
569 const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group);
570 s=expandGroupLengths(s, offsets, lengths);
571 return expandName(names, s+offsets[lineNumber], lengths[lineNumber], nameChoice,
572 buffer, bufferLength);
573}
574
575static uint16_t
576getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice,
577 char *buffer, uint16_t bufferLength) {
578 const uint16_t *group=getGroup(names, code);
579 if((uint16_t)(code>>GROUP_SHIFT)==group[GROUP_MSB]) {
580 return expandGroupName(names, group, (uint16_t)(code&GROUP_MASK), nameChoice,
581 buffer, bufferLength);
582 } else {
583 /* group not found */
584 /* zero-terminate */
585 if(bufferLength>0) {
586 *buffer=0;
587 }
588 return 0;
589 }
590}
591
592/*
593 * enumGroupNames() enumerates all the names in a 32-group
594 * and either calls the enumerator function or finds a given input name.
595 */
596static UBool
597enumGroupNames(UCharNames *names, const uint16_t *group,
598 UChar32 start, UChar32 end,
599 UEnumCharNamesFn *fn, void *context,
600 UCharNameChoice nameChoice) {
601 uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
602 const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group);
603
604 s=expandGroupLengths(s, offsets, lengths);
605 if(fn!=DO_FIND_NAME) {
606 char buffer[200];
607 uint16_t length;
608
609 while(start<=end) {
610 length=expandName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, buffer, sizeof(buffer));
611 if (!length && nameChoice == U_EXTENDED_CHAR_NAME) {
612 buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
613 }
614 /* here, we assume that the buffer is large enough */
615 if(length>0) {
616 if(!fn(context, start, nameChoice, buffer, length)) {
617 return FALSE;
618 }
619 }
620 ++start;
621 }
622 } else {
623 const char *otherName=((FindName *)context)->otherName;
624 while(start<=end) {
625 if(compareName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, otherName)) {
626 ((FindName *)context)->code=start;
627 return FALSE;
628 }
629 ++start;
630 }
631 }
632 return TRUE;
633}
634
635/*
636 * enumExtNames enumerate extended names.
637 * It only needs to do it if it is called with a real function and not
638 * with the dummy DO_FIND_NAME, because u_charFromName() does a check
639 * for extended names by itself.
640 */
641static UBool
642enumExtNames(UChar32 start, UChar32 end,
643 UEnumCharNamesFn *fn, void *context)
644{
645 if(fn!=DO_FIND_NAME) {
646 char buffer[200];
647 uint16_t length;
648
649 while(start<=end) {
650 buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
651 /* here, we assume that the buffer is large enough */
652 if(length>0) {
653 if(!fn(context, start, U_EXTENDED_CHAR_NAME, buffer, length)) {
654 return FALSE;
655 }
656 }
657 ++start;
658 }
659 }
660
661 return TRUE;
662}
663
/*
 * enumNames() walks the code points in [start, limit) using the groups table.
 * Code points that fall into a stored group are handled by enumGroupNames()
 * (which either calls fn or, with DO_FIND_NAME, compares against the name in
 * the FindName context). For U_EXTENDED_CHAR_NAME, stretches of code points
 * that are not covered by any group are handled by enumExtNames().
 *
 * Returns FALSE as soon as one of those helpers returns FALSE (the callback
 * asked to stop, or the name was found); otherwise TRUE.
 */
664static UBool
665enumNames(UCharNames *names,
666 UChar32 start, UChar32 limit,
667 UEnumCharNamesFn *fn, void *context,
668 UCharNameChoice nameChoice) {
669 uint16_t startGroupMSB, endGroupMSB, groupCount;
670 const uint16_t *group, *groupLimit;
671
 /* group numbers of the first and the last code point of the range */
672 startGroupMSB=(uint16_t)(start>>GROUP_SHIFT);
673 endGroupMSB=(uint16_t)((limit-1)>>GROUP_SHIFT);
674
675 /* find the group that contains start, or the highest before it */
676 group=getGroup(names, start);
677
 /* getGroup() returns the lowest group if start is before all groups */
678 if(startGroupMSB<group[GROUP_MSB] && nameChoice==U_EXTENDED_CHAR_NAME) {
679 /* enumerate synthetic names between start and the group start */
680 UChar32 extLimit=((UChar32)group[GROUP_MSB]<<GROUP_SHIFT);
681 if(extLimit>limit) {
682 extLimit=limit;
683 }
684 if(!enumExtNames(start, extLimit-1, fn, context)) {
685 return FALSE;
686 }
687 start=extLimit;
688 }
689
690 if(startGroupMSB==endGroupMSB) {
691 if(startGroupMSB==group[GROUP_MSB]) {
692 /* if start and limit-1 are in the same group, then enumerate only in that one */
693 return enumGroupNames(names, group, start, limit-1, fn, context, nameChoice);
694 }
 /* otherwise fall through to the extended-names handling at the end */
695 } else {
696 const uint16_t *groups=GET_GROUPS(names);
697 groupCount=*groups++;
 /* groupLimit points just behind the last group record */
698 groupLimit=groups+groupCount*GROUP_LENGTH;
699
700 if(startGroupMSB==group[GROUP_MSB]) {
701 /* enumerate characters in the partial start group */
702 if((start&GROUP_MASK)!=0) {
703 if(!enumGroupNames(names, group,
704 start, ((UChar32)startGroupMSB<<GROUP_SHIFT)+LINES_PER_GROUP-1,
705 fn, context, nameChoice)) {
706 return FALSE;
707 }
708 group=NEXT_GROUP(group); /* continue with the next group */
709 }
710 } else if(startGroupMSB>group[GROUP_MSB]) {
711 /* make sure that we start enumerating with the first group after start */
712 const uint16_t *nextGroup=NEXT_GROUP(group);
 /* extended names for the gap between start and the next stored group */
713 if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > startGroupMSB && nameChoice == U_EXTENDED_CHAR_NAME) {
714 UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
715 if (end > limit) {
716 end = limit;
717 }
718 if (!enumExtNames(start, end - 1, fn, context)) {
719 return FALSE;
720 }
721 }
722 group=nextGroup;
723 }
724
725 /* enumerate entire groups between the start- and end-groups */
726 while(group<groupLimit && group[GROUP_MSB]<endGroupMSB) {
727 const uint16_t *nextGroup;
728 start=(UChar32)group[GROUP_MSB]<<GROUP_SHIFT;
729 if(!enumGroupNames(names, group, start, start+LINES_PER_GROUP-1, fn, context, nameChoice)) {
730 return FALSE;
731 }
732 nextGroup=NEXT_GROUP(group);
 /* extended names for a gap between this group and the next stored one */
733 if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > group[GROUP_MSB] + 1 && nameChoice == U_EXTENDED_CHAR_NAME) {
734 UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
735 if (end > limit) {
736 end = limit;
737 }
738 if (!enumExtNames((group[GROUP_MSB] + 1) << GROUP_SHIFT, end - 1, fn, context)) {
739 return FALSE;
740 }
741 }
742 group=nextGroup;
743 }
744
745 /* enumerate within the end group (group[GROUP_MSB]==endGroupMSB) */
746 if(group<groupLimit && group[GROUP_MSB]==endGroupMSB) {
747 return enumGroupNames(names, group, (limit-1)&~GROUP_MASK, limit-1, fn, context, nameChoice);
748 } else if (nameChoice == U_EXTENDED_CHAR_NAME && group == groupLimit) {
 /* ran past the last stored group: continue behind it with extended names below */
749 UChar32 next = (PREV_GROUP(group)[GROUP_MSB] + 1) << GROUP_SHIFT;
750 if (next > start) {
751 start = next;
752 }
753 } else {
754 return TRUE;
755 }
756 }
757
758 /* we have not found a group, which means everything is made of
759 extended names. */
760 if (nameChoice == U_EXTENDED_CHAR_NAME) {
 /* do not synthesize names beyond the last code point */
761 if (limit > UCHAR_MAX_VALUE + 1) {
762 limit = UCHAR_MAX_VALUE + 1;
763 }
764 return enumExtNames(start, limit - 1, fn, context);
765 }
766
767 return TRUE;
768}
769
770static uint16_t
771writeFactorSuffix(const uint16_t *factors, uint16_t count,
772 const char *s, /* suffix elements */
773 uint32_t code,
774 uint16_t indexes[8], /* output fields from here */
775 const char *elementBases[8], const char *elements[8],
776 char *buffer, uint16_t bufferLength) {
777 uint16_t i, factor, bufferPos=0;
778 char c;
779
780 /* write elements according to the factors */
781
782 /*
783 * the factorized elements are determined by modulo arithmetic
784 * with the factors of this algorithm
785 *
786 * note that for fewer operations, count is decremented here
787 */
788 --count;
789 for(i=count; i>0; --i) {
790 factor=factors[i];
791 indexes[i]=(uint16_t)(code%factor);
792 code/=factor;
793 }
794 /*
795 * we don't need to calculate the last modulus because start<=code<=end
796 * guarantees here that code<=factors[0]
797 */
798 indexes[0]=(uint16_t)code;
799
800 /* write each element */
801 for(;;) {
802 if(elementBases!=NULL) {
803 *elementBases++=s;
804 }
805
806 /* skip indexes[i] strings */
807 factor=indexes[i];
808 while(factor>0) {
809 while(*s++!=0) {}
810 --factor;
811 }
812 if(elements!=NULL) {
813 *elements++=s;
814 }
815
816 /* write element */
817 while((c=*s++)!=0) {
818 WRITE_CHAR(buffer, bufferLength, bufferPos, c);
819 }
820
821 /* we do not need to perform the rest of this loop for i==count - break here */
822 if(i>=count) {
823 break;
824 }
825
826 /* skip the rest of the strings for this factors[i] */
827 factor=(uint16_t)(factors[i]-indexes[i]-1);
828 while(factor>0) {
829 while(*s++!=0) {}
830 --factor;
831 }
832
833 ++i;
834 }
835
836 /* zero-terminate */
837 if(bufferLength>0) {
838 *buffer=0;
839 }
840
841 return bufferPos;
842}
843
844/*
845 * Important:
846 * Parts of findAlgName() are almost the same as some of getAlgName().
847 * Fixes must be applied to both.
848 */
849static uint16_t
850getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice,
851 char *buffer, uint16_t bufferLength) {
852 uint16_t bufferPos=0;
853
854 /* Only the normative character name can be algorithmic. */
855 if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
856 /* zero-terminate */
857 if(bufferLength>0) {
858 *buffer=0;
859 }
860 return 0;
861 }
862
863 switch(range->type) {
864 case 0: {
865 /* name = prefix hex-digits */
866 const char *s=(const char *)(range+1);
867 char c;
868
869 uint16_t i, count;
870
871 /* copy prefix */
872 while((c=*s++)!=0) {
873 WRITE_CHAR(buffer, bufferLength, bufferPos, c);
874 }
875
876 /* write hexadecimal code point value */
877 count=range->variant;
878
879 /* zero-terminate */
880 if(count<bufferLength) {
881 buffer[count]=0;
882 }
883
884 for(i=count; i>0;) {
885 if(--i<bufferLength) {
886 c=(char)(code&0xf);
887 if(c<10) {
888 c+='0';
889 } else {
890 c+='A'-10;
891 }
892 buffer[i]=c;
893 }
894 code>>=4;
895 }
896
897 bufferPos+=count;
898 break;
899 }
900 case 1: {
901 /* name = prefix factorized-elements */
902 uint16_t indexes[8];
903 const uint16_t *factors=(const uint16_t *)(range+1);
904 uint16_t count=range->variant;
905 const char *s=(const char *)(factors+count);
906 char c;
907
908 /* copy prefix */
909 while((c=*s++)!=0) {
910 WRITE_CHAR(buffer, bufferLength, bufferPos, c);
911 }
912
913 bufferPos+=writeFactorSuffix(factors, count,
914 s, code-range->start, indexes, NULL, NULL, buffer, bufferLength);
915 break;
916 }
917 default:
918 /* undefined type */
919 /* zero-terminate */
920 if(bufferLength>0) {
921 *buffer=0;
922 }
923 break;
924 }
925
926 return bufferPos;
927}
928
929/*
930 * Important: enumAlgNames() and findAlgName() are almost the same.
931 * Any fix must be applied to both.
932 */
933static UBool
934enumAlgNames(AlgorithmicRange *range,
935 UChar32 start, UChar32 limit,
936 UEnumCharNamesFn *fn, void *context,
937 UCharNameChoice nameChoice) {
938 char buffer[200];
939 uint16_t length;
940
941 if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
942 return TRUE;
943 }
944
945 switch(range->type) {
946 case 0: {
947 char *s, *end;
948 char c;
949
950 /* get the full name of the start character */
951 length=getAlgName(range, (uint32_t)start, nameChoice, buffer, sizeof(buffer));
952 if(length<=0) {
953 return TRUE;
954 }
955
956 /* call the enumerator function with this first character */
957 if(!fn(context, start, nameChoice, buffer, length)) {
958 return FALSE;
959 }
960
961 /* go to the end of the name; all these names have the same length */
962 end=buffer;
963 while(*end!=0) {
964 ++end;
965 }
966
967 /* enumerate the rest of the names */
968 while(++start<limit) {
969 /* increment the hexadecimal number on a character-basis */
970 s=end;
971 for (;;) {
972 c=*--s;
973 if(('0'<=c && c<'9') || ('A'<=c && c<'F')) {
974 *s=(char)(c+1);
975 break;
976 } else if(c=='9') {
977 *s='A';
978 break;
979 } else if(c=='F') {
980 *s='0';
981 }
982 }
983
984 if(!fn(context, start, nameChoice, buffer, length)) {
985 return FALSE;
986 }
987 }
988 break;
989 }
990 case 1: {
991 uint16_t indexes[8];
992 const char *elementBases[8], *elements[8];
993 const uint16_t *factors=(const uint16_t *)(range+1);
994 uint16_t count=range->variant;
995 const char *s=(const char *)(factors+count);
996 char *suffix, *t;
997 uint16_t prefixLength, i, idx;
998
999 char c;
1000
1001 /* name = prefix factorized-elements */
1002
1003 /* copy prefix */
1004 suffix=buffer;
1005 prefixLength=0;
1006 while((c=*s++)!=0) {
1007 *suffix++=c;
1008 ++prefixLength;
1009 }
1010
1011 /* append the suffix of the start character */
1012 length=(uint16_t)(prefixLength+writeFactorSuffix(factors, count,
1013 s, (uint32_t)start-range->start,
1014 indexes, elementBases, elements,
1015 suffix, (uint16_t)(sizeof(buffer)-prefixLength)));
1016
1017 /* call the enumerator function with this first character */
1018 if(!fn(context, start, nameChoice, buffer, length)) {
1019 return FALSE;
1020 }
1021
1022 /* enumerate the rest of the names */
1023 while(++start<limit) {
1024 /* increment the indexes in lexical order bound by the factors */
1025 i=count;
1026 for (;;) {
1027 idx=(uint16_t)(indexes[--i]+1);
1028 if(idx<factors[i]) {
1029 /* skip one index and its element string */
1030 indexes[i]=idx;
1031 s=elements[i];
1032 while(*s++!=0) {
1033 }
1034 elements[i]=s;
1035 break;
1036 } else {
1037 /* reset this index to 0 and its element string to the first one */
1038 indexes[i]=0;
1039 elements[i]=elementBases[i];
1040 }
1041 }
1042
1043 /* to make matters a little easier, just append all elements to the suffix */
1044 t=suffix;
1045 length=prefixLength;
1046 for(i=0; i<count; ++i) {
1047 s=elements[i];
1048 while((c=*s++)!=0) {
1049 *t++=c;
1050 ++length;
1051 }
1052 }
1053 /* zero-terminate */
1054 *t=0;
1055
1056 if(!fn(context, start, nameChoice, buffer, length)) {
1057 return FALSE;
1058 }
1059 }
1060 break;
1061 }
1062 default:
1063 /* undefined type */
1064 break;
1065 }
1066
1067 return TRUE;
1068}
1069
1070/*
1071 * findAlgName() is almost the same as enumAlgNames() except that it
1072 * returns the code point for a name if it fits into the range.
1073 * It returns 0xffff otherwise.
1074 */
1075static UChar32
1076findAlgName(AlgorithmicRange *range, UCharNameChoice nameChoice, const char *otherName) {
1077 UChar32 code;
1078
1079 if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
1080 return 0xffff;
1081 }
1082
1083 switch(range->type) {
1084 case 0: {
1085 /* name = prefix hex-digits */
1086 const char *s=(const char *)(range+1);
1087 char c;
1088
1089 uint16_t i, count;
1090
1091 /* compare prefix */
1092 while((c=*s++)!=0) {
1093 if((char)c!=*otherName++) {
1094 return 0xffff;
1095 }
1096 }
1097
1098 /* read hexadecimal code point value */
1099 count=range->variant;
1100 code=0;
1101 for(i=0; i<count; ++i) {
1102 c=*otherName++;
1103 if('0'<=c && c<='9') {
1104 code=(code<<4)|(c-'0');
1105 } else if('A'<=c && c<='F') {
1106 code=(code<<4)|(c-'A'+10);
1107 } else {
1108 return 0xffff;
1109 }
1110 }
1111
1112 /* does it fit into the range? */
1113 if(*otherName==0 && range->start<=(uint32_t)code && (uint32_t)code<=range->end) {
1114 return code;
1115 }
1116 break;
1117 }
1118 case 1: {
1119 char buffer[64];
1120 uint16_t indexes[8];
1121 const char *elementBases[8], *elements[8];
1122 const uint16_t *factors=(const uint16_t *)(range+1);
1123 uint16_t count=range->variant;
1124 const char *s=(const char *)(factors+count), *t;
1125 UChar32 start, limit;
1126 uint16_t i, idx;
1127
1128 char c;
1129
1130 /* name = prefix factorized-elements */
1131
1132 /* compare prefix */
1133 while((c=*s++)!=0) {
1134 if((char)c!=*otherName++) {
1135 return 0xffff;
1136 }
1137 }
1138
1139 start=(UChar32)range->start;
1140 limit=(UChar32)(range->end+1);
1141
1142 /* initialize the suffix elements for enumeration; indexes should all be set to 0 */
1143 writeFactorSuffix(factors, count, s, 0,
1144 indexes, elementBases, elements, buffer, sizeof(buffer));
1145
1146 /* compare the first suffix */
1147 if(0==uprv_strcmp(otherName, buffer)) {
1148 return start;
1149 }
1150
1151 /* enumerate and compare the rest of the suffixes */
1152 while(++start<limit) {
1153 /* increment the indexes in lexical order bound by the factors */
1154 i=count;
1155 for (;;) {
1156 idx=(uint16_t)(indexes[--i]+1);
1157 if(idx<factors[i]) {
1158 /* skip one index and its element string */
1159 indexes[i]=idx;
1160 s=elements[i];
1161 while(*s++!=0) {}
1162 elements[i]=s;
1163 break;
1164 } else {
1165 /* reset this index to 0 and its element string to the first one */
1166 indexes[i]=0;
1167 elements[i]=elementBases[i];
1168 }
1169 }
1170
1171 /* to make matters a little easier, just compare all elements of the suffix */
1172 t=otherName;
1173 for(i=0; i<count; ++i) {
1174 s=elements[i];
1175 while((c=*s++)!=0) {
1176 if(c!=*t++) {
1177 s=""; /* does not match */
1178 i=99;
1179 }
1180 }
1181 }
1182 if(i<99 && *t==0) {
1183 return start;
1184 }
1185 }
1186 break;
1187 }
1188 default:
1189 /* undefined type */
1190 break;
1191 }
1192
1193 return 0xffff;
1194}
1195
1196/* sets of name characters, maximum name lengths ---------------------------- */
1197
/*
 * A character set is an array of 8 uint32_t words = 256 bit flags, one per byte value.
 * The byte value's upper 3 bits select the word, its lower 5 bits select the bit.
 * The argument c is parenthesized so that an expression argument is converted
 * to uint8_t as a whole; note that c is evaluated twice.
 */
#define SET_ADD(set, c) ((set)[(uint8_t)(c)>>5]|=((uint32_t)1<<((uint8_t)(c)&0x1f)))
#define SET_CONTAINS(set, c) (((set)[(uint8_t)(c)>>5]&((uint32_t)1<<((uint8_t)(c)&0x1f)))!=0)
1200
1201static int32_t
1202calcStringSetLength(uint32_t set[8], const char *s) {
1203 int32_t length=0;
1204 char c;
1205
1206 while((c=*s++)!=0) {
1207 SET_ADD(set, c);
1208 ++length;
1209 }
1210 return length;
1211}
1212
1213static int32_t
1214calcAlgNameSetsLengths(int32_t maxNameLength) {
1215 AlgorithmicRange *range;
1216 uint32_t *p;
1217 uint32_t rangeCount;
1218 int32_t length;
1219
1220 /* enumerate algorithmic ranges */
1221 p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1222 rangeCount=*p;
1223 range=(AlgorithmicRange *)(p+1);
1224 while(rangeCount>0) {
1225 switch(range->type) {
1226 case 0:
1227 /* name = prefix + (range->variant times) hex-digits */
1228 /* prefix */
1229 length=calcStringSetLength(gNameSet, (const char *)(range+1))+range->variant;
1230 if(length>maxNameLength) {
1231 maxNameLength=length;
1232 }
1233 break;
1234 case 1: {
1235 /* name = prefix factorized-elements */
1236 const uint16_t *factors=(const uint16_t *)(range+1);
1237 const char *s;
1238 int32_t i, count=range->variant, factor, factorLength, maxFactorLength;
1239
1240 /* prefix length */
1241 s=(const char *)(factors+count);
1242 length=calcStringSetLength(gNameSet, s);
1243 s+=length+1; /* start of factor suffixes */
1244
1245 /* get the set and maximum factor suffix length for each factor */
1246 for(i=0; i<count; ++i) {
1247 maxFactorLength=0;
1248 for(factor=factors[i]; factor>0; --factor) {
1249 factorLength=calcStringSetLength(gNameSet, s);
1250 s+=factorLength+1;
1251 if(factorLength>maxFactorLength) {
1252 maxFactorLength=factorLength;
1253 }
1254 }
1255 length+=maxFactorLength;
1256 }
1257
1258 if(length>maxNameLength) {
1259 maxNameLength=length;
1260 }
1261 break;
1262 }
1263 default:
1264 /* unknown type */
1265 break;
1266 }
1267
1268 range=(AlgorithmicRange *)((uint8_t *)range+range->size);
1269 --rangeCount;
1270 }
1271 return maxNameLength;
1272}
1273
1274static int32_t
1275calcExtNameSetsLengths(int32_t maxNameLength) {
1276 int32_t i, length;
1277
1278 for(i=0; i<UPRV_LENGTHOF(charCatNames); ++i) {
1279 /*
1280 * for each category, count the length of the category name
1281 * plus 9=
1282 * 2 for <>
1283 * 1 for -
1284 * 6 for most hex digits per code point
1285 */
1286 length=9+calcStringSetLength(gNameSet, charCatNames[i]);
1287 if(length>maxNameLength) {
1288 maxNameLength=length;
1289 }
1290 }
1291 return maxNameLength;
1292}
1293
1294static int32_t
1295calcNameSetLength(const uint16_t *tokens, uint16_t tokenCount, const uint8_t *tokenStrings, int8_t *tokenLengths,
1296 uint32_t set[8],
1297 const uint8_t **pLine, const uint8_t *lineLimit) {
1298 const uint8_t *line=*pLine;
1299 int32_t length=0, tokenLength;
1300 uint16_t c, token;
1301
1302 while(line!=lineLimit && (c=*line++)!=(uint8_t)';') {
1303 if(c>=tokenCount) {
1304 /* implicit letter */
1305 SET_ADD(set, c);
1306 ++length;
1307 } else {
1308 token=tokens[c];
1309 if(token==(uint16_t)(-2)) {
1310 /* this is a lead byte for a double-byte token */
1311 c=c<<8|*line++;
1312 token=tokens[c];
1313 }
1314 if(token==(uint16_t)(-1)) {
1315 /* explicit letter */
1316 SET_ADD(set, c);
1317 ++length;
1318 } else {
1319 /* count token word */
1320 if(tokenLengths!=NULL) {
1321 /* use cached token length */
1322 tokenLength=tokenLengths[c];
1323 if(tokenLength==0) {
1324 tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token);
1325 tokenLengths[c]=(int8_t)tokenLength;
1326 }
1327 } else {
1328 tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token);
1329 }
1330 length+=tokenLength;
1331 }
1332 }
1333 }
1334
1335 *pLine=line;
1336 return length;
1337}
1338
1339static void
1340calcGroupNameSetsLengths(int32_t maxNameLength) {
1341 uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
1342
1343 uint16_t *tokens=(uint16_t *)uCharNames+8;
1344 uint16_t tokenCount=*tokens++;
1345 uint8_t *tokenStrings=(uint8_t *)uCharNames+uCharNames->tokenStringOffset;
1346
1347 int8_t *tokenLengths;
1348
1349 const uint16_t *group;
1350 const uint8_t *s, *line, *lineLimit;
1351
1352 int32_t groupCount, lineNumber, length;
1353
1354 tokenLengths=(int8_t *)uprv_malloc(tokenCount);
1355 if(tokenLengths!=NULL) {
1356 uprv_memset(tokenLengths, 0, tokenCount);
1357 }
1358
1359 group=GET_GROUPS(uCharNames);
1360 groupCount=*group++;
1361
1362 /* enumerate all groups */
1363 while(groupCount>0) {
1364 s=(uint8_t *)uCharNames+uCharNames->groupStringOffset+GET_GROUP_OFFSET(group);
1365 s=expandGroupLengths(s, offsets, lengths);
1366
1367 /* enumerate all lines in each group */
1368 for(lineNumber=0; lineNumber<LINES_PER_GROUP; ++lineNumber) {
1369 line=s+offsets[lineNumber];
1370 length=lengths[lineNumber];
1371 if(length==0) {
1372 continue;
1373 }
1374
1375 lineLimit=line+length;
1376
1377 /* read regular name */
1378 length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
1379 if(length>maxNameLength) {
1380 maxNameLength=length;
1381 }
1382 if(line==lineLimit) {
1383 continue;
1384 }
1385
1386 /* read Unicode 1.0 name */
1387 length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
1388 if(length>maxNameLength) {
1389 maxNameLength=length;
1390 }
1391 if(line==lineLimit) {
1392 continue;
1393 }
1394
1395 /* read ISO comment */
1396 /*length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit);*/
1397 }
1398
1399 group=NEXT_GROUP(group);
1400 --groupCount;
1401 }
1402
1403 if(tokenLengths!=NULL) {
1404 uprv_free(tokenLengths);
1405 }
1406
1407 /* set gMax... - name length last for threading */
1408 gMaxNameLength=maxNameLength;
1409}
1410
1411static UBool
1412calcNameSetsLengths(UErrorCode *pErrorCode) {
1413 static const char extChars[]="0123456789ABCDEF<>-";
1414 int32_t i, maxNameLength;
1415
1416 if(gMaxNameLength!=0) {
1417 return TRUE;
1418 }
1419
1420 if(!isDataLoaded(pErrorCode)) {
1421 return FALSE;
1422 }
1423
1424 /* set hex digits, used in various names, and <>-, used in extended names */
1425 for(i=0; i<(int32_t)sizeof(extChars)-1; ++i) {
1426 SET_ADD(gNameSet, extChars[i]);
1427 }
1428
1429 /* set sets and lengths from algorithmic names */
1430 maxNameLength=calcAlgNameSetsLengths(0);
1431
1432 /* set sets and lengths from extended names */
1433 maxNameLength=calcExtNameSetsLengths(maxNameLength);
1434
1435 /* set sets and lengths from group names, set global maximum values */
1436 calcGroupNameSetsLengths(maxNameLength);
1437
1438 return TRUE;
1439}
1440
1441U_NAMESPACE_END
1442
1443/* public API --------------------------------------------------------------- */
1444
1445U_NAMESPACE_USE
1446
1447U_CAPI int32_t U_EXPORT2
1448u_charName(UChar32 code, UCharNameChoice nameChoice,
1449 char *buffer, int32_t bufferLength,
1450 UErrorCode *pErrorCode) {
1451 AlgorithmicRange *algRange;
1452 uint32_t *p;
1453 uint32_t i;
1454 int32_t length;
1455
1456 /* check the argument values */
1457 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1458 return 0;
1459 } else if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT ||
1460 bufferLength<0 || (bufferLength>0 && buffer==NULL)
1461 ) {
1462 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1463 return 0;
1464 }
1465
1466 if((uint32_t)code>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) {
1467 return u_terminateChars(buffer, bufferLength, 0, pErrorCode);
1468 }
1469
1470 length=0;
1471
1472 /* try algorithmic names first */
1473 p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1474 i=*p;
1475 algRange=(AlgorithmicRange *)(p+1);
1476 while(i>0) {
1477 if(algRange->start<=(uint32_t)code && (uint32_t)code<=algRange->end) {
1478 length=getAlgName(algRange, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
1479 break;
1480 }
1481 algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1482 --i;
1483 }
1484
1485 if(i==0) {
1486 if (nameChoice == U_EXTENDED_CHAR_NAME) {
1487 length = getName(uCharNames, (uint32_t )code, U_EXTENDED_CHAR_NAME, buffer, (uint16_t) bufferLength);
1488 if (!length) {
1489 /* extended character name */
1490 length = getExtName((uint32_t) code, buffer, (uint16_t) bufferLength);
1491 }
1492 } else {
1493 /* normal character name */
1494 length=getName(uCharNames, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
1495 }
1496 }
1497
1498 return u_terminateChars(buffer, bufferLength, length, pErrorCode);
1499}
1500
1501U_CAPI int32_t U_EXPORT2
1502u_getISOComment(UChar32 /*c*/,
1503 char *dest, int32_t destCapacity,
1504 UErrorCode *pErrorCode) {
1505 /* check the argument values */
1506 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1507 return 0;
1508 } else if(destCapacity<0 || (destCapacity>0 && dest==NULL)) {
1509 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1510 return 0;
1511 }
1512
1513 return u_terminateChars(dest, destCapacity, 0, pErrorCode);
1514}
1515
1516U_CAPI UChar32 U_EXPORT2
1517u_charFromName(UCharNameChoice nameChoice,
1518 const char *name,
1519 UErrorCode *pErrorCode) {
1520 char upper[120], lower[120];
1521 FindName findName;
1522 AlgorithmicRange *algRange;
1523 uint32_t *p;
1524 uint32_t i;
1525 UChar32 cp = 0;
1526 char c0;
1527 UChar32 error = 0xffff; /* Undefined, but use this for backwards compatibility. */
1528
1529 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1530 return error;
1531 }
1532
1533 if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || name==NULL || *name==0) {
1534 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1535 return error;
1536 }
1537
1538 if(!isDataLoaded(pErrorCode)) {
1539 return error;
1540 }
1541
1542 /* construct the uppercase and lowercase of the name first */
1543 for(i=0; i<sizeof(upper); ++i) {
1544 if((c0=*name++)!=0) {
1545 upper[i]=uprv_toupper(c0);
1546 lower[i]=uprv_tolower(c0);
1547 } else {
1548 upper[i]=lower[i]=0;
1549 break;
1550 }
1551 }
1552 if(i==sizeof(upper)) {
1553 /* name too long, there is no such character */
1554 *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1555 return error;
1556 }
1557 // i==strlen(name)==strlen(lower)==strlen(upper)
1558
1559 /* try extended names first */
1560 if (lower[0] == '<') {
1561 if (nameChoice == U_EXTENDED_CHAR_NAME) {
1562 // Parse a string like "<category-HHHH>" where HHHH is a hex code point.
1563 if (lower[--i] == '>' && i >= 3 && lower[--i] != '-') {
1564 while (i >= 3 && lower[--i] != '-') {}
1565
1566 if (i >= 2 && lower[i] == '-') {
1567 uint32_t cIdx;
1568
1569 lower[i] = 0;
1570
1571 for (++i; lower[i] != '>'; ++i) {
1572 if (lower[i] >= '0' && lower[i] <= '9') {
1573 cp = (cp << 4) + lower[i] - '0';
1574 } else if (lower[i] >= 'a' && lower[i] <= 'f') {
1575 cp = (cp << 4) + lower[i] - 'a' + 10;
1576 } else {
1577 *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1578 return error;
1579 }
1580 }
1581
1582 /* Now validate the category name.
1583 We could use a binary search, or a trie, if
1584 we really wanted to. */
1585
1586 for (lower[i] = 0, cIdx = 0; cIdx < UPRV_LENGTHOF(charCatNames); ++cIdx) {
1587
1588 if (!uprv_strcmp(lower + 1, charCatNames[cIdx])) {
1589 if (getCharCat(cp) == cIdx) {
1590 return cp;
1591 }
1592 break;
1593 }
1594 }
1595 }
1596 }
1597 }
1598
1599 *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1600 return error;
1601 }
1602
1603 /* try algorithmic names now */
1604 p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1605 i=*p;
1606 algRange=(AlgorithmicRange *)(p+1);
1607 while(i>0) {
1608 if((cp=findAlgName(algRange, nameChoice, upper))!=0xffff) {
1609 return cp;
1610 }
1611 algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1612 --i;
1613 }
1614
1615 /* normal character name */
1616 findName.otherName=upper;
1617 findName.code=error;
1618 enumNames(uCharNames, 0, UCHAR_MAX_VALUE + 1, DO_FIND_NAME, &findName, nameChoice);
1619 if (findName.code == error) {
1620 *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1621 }
1622 return findName.code;
1623}
1624
1625U_CAPI void U_EXPORT2
1626u_enumCharNames(UChar32 start, UChar32 limit,
1627 UEnumCharNamesFn *fn,
1628 void *context,
1629 UCharNameChoice nameChoice,
1630 UErrorCode *pErrorCode) {
1631 AlgorithmicRange *algRange;
1632 uint32_t *p;
1633 uint32_t i;
1634
1635 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1636 return;
1637 }
1638
1639 if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || fn==NULL) {
1640 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1641 return;
1642 }
1643
1644 if((uint32_t) limit > UCHAR_MAX_VALUE + 1) {
1645 limit = UCHAR_MAX_VALUE + 1;
1646 }
1647 if((uint32_t)start>=(uint32_t)limit) {
1648 return;
1649 }
1650
1651 if(!isDataLoaded(pErrorCode)) {
1652 return;
1653 }
1654
1655 /* interleave the data-driven ones with the algorithmic ones */
1656 /* iterate over all algorithmic ranges; assume that they are in ascending order */
1657 p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1658 i=*p;
1659 algRange=(AlgorithmicRange *)(p+1);
1660 while(i>0) {
1661 /* enumerate the character names before the current algorithmic range */
1662 /* here: start<limit */
1663 if((uint32_t)start<algRange->start) {
1664 if((uint32_t)limit<=algRange->start) {
1665 enumNames(uCharNames, start, limit, fn, context, nameChoice);
1666 return;
1667 }
1668 if(!enumNames(uCharNames, start, (UChar32)algRange->start, fn, context, nameChoice)) {
1669 return;
1670 }
1671 start=(UChar32)algRange->start;
1672 }
1673 /* enumerate the character names in the current algorithmic range */
1674 /* here: algRange->start<=start<limit */
1675 if((uint32_t)start<=algRange->end) {
1676 if((uint32_t)limit<=(algRange->end+1)) {
1677 enumAlgNames(algRange, start, limit, fn, context, nameChoice);
1678 return;
1679 }
1680 if(!enumAlgNames(algRange, start, (UChar32)algRange->end+1, fn, context, nameChoice)) {
1681 return;
1682 }
1683 start=(UChar32)algRange->end+1;
1684 }
1685 /* continue to the next algorithmic range (here: start<limit) */
1686 algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1687 --i;
1688 }
1689 /* enumerate the character names after the last algorithmic range */
1690 enumNames(uCharNames, start, limit, fn, context, nameChoice);
1691}
1692
1693U_CAPI int32_t U_EXPORT2
1694uprv_getMaxCharNameLength() {
1695 UErrorCode errorCode=U_ZERO_ERROR;
1696 if(calcNameSetsLengths(&errorCode)) {
1697 return gMaxNameLength;
1698 } else {
1699 return 0;
1700 }
1701}
1702
1703/**
1704 * Converts the char set cset into a Unicode set uset.
1705 * @param cset Set of 256 bit flags corresponding to a set of chars.
1706 * @param uset USet to receive characters. Existing contents are deleted.
1707 */
1708static void
1709charSetToUSet(uint32_t cset[8], const USetAdder *sa) {
1710 UChar us[256];
1711 char cs[256];
1712
1713 int32_t i, length;
1714 UErrorCode errorCode;
1715
1716 errorCode=U_ZERO_ERROR;
1717
1718 if(!calcNameSetsLengths(&errorCode)) {
1719 return;
1720 }
1721
1722 /* build a char string with all chars that are used in character names */
1723 length=0;
1724 for(i=0; i<256; ++i) {
1725 if(SET_CONTAINS(cset, i)) {
1726 cs[length++]=(char)i;
1727 }
1728 }
1729
1730 /* convert the char string to a UChar string */
1731 u_charsToUChars(cs, us, length);
1732
1733 /* add each UChar to the USet */
1734 for(i=0; i<length; ++i) {
1735 if(us[i]!=0 || cs[i]==0) { /* non-invariant chars become (UChar)0 */
1736 sa->add(sa->set, us[i]);
1737 }
1738 }
1739}
1740
1741/**
1742 * Fills set with characters that are used in Unicode character names.
1743 * @param set USet to receive characters.
1744 */
1745U_CAPI void U_EXPORT2
1746uprv_getCharNameCharacters(const USetAdder *sa) {
1747 charSetToUSet(gNameSet, sa);
1748}
1749
1750/* data swapping ------------------------------------------------------------ */
1751
1752/*
1753 * The token table contains non-negative entries for token bytes,
1754 * and -1 for bytes that represent themselves in the data file's charset.
1755 * -2 entries are used for lead bytes.
1756 *
1757 * Direct bytes (-1 entries) must be translated from the input charset family
1758 * to the output charset family.
1759 * makeTokenMap() writes a permutation mapping for this.
1760 * Use it once for single-/lead-byte tokens and once more for all trail byte
1761 * tokens. (';' is an unused trail byte marked with -1.)
1762 */
1763static void
1764makeTokenMap(const UDataSwapper *ds,
1765 int16_t tokens[], uint16_t tokenCount,
1766 uint8_t map[256],
1767 UErrorCode *pErrorCode) {
1768 UBool usedOutChar[256];
1769 uint16_t i, j;
1770 uint8_t c1, c2;
1771
1772 if(U_FAILURE(*pErrorCode)) {
1773 return;
1774 }
1775
1776 if(ds->inCharset==ds->outCharset) {
1777 /* Same charset family: identity permutation */
1778 for(i=0; i<256; ++i) {
1779 map[i]=(uint8_t)i;
1780 }
1781 } else {
1782 uprv_memset(map, 0, 256);
1783 uprv_memset(usedOutChar, 0, 256);
1784
1785 if(tokenCount>256) {
1786 tokenCount=256;
1787 }
1788
1789 /* set the direct bytes (byte 0 always maps to itself) */
1790 for(i=1; i<tokenCount; ++i) {
1791 if(tokens[i]==-1) {
1792 /* convert the direct byte character */
1793 c1=(uint8_t)i;
1794 ds->swapInvChars(ds, &c1, 1, &c2, pErrorCode);
1795 if(U_FAILURE(*pErrorCode)) {
1796 udata_printError(ds, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n",
1797 i, ds->inCharset);
1798 return;
1799 }
1800
1801 /* enter the converted character into the map and mark it used */
1802 map[c1]=c2;
1803 usedOutChar[c2]=TRUE;
1804 }
1805 }
1806
1807 /* set the mappings for the rest of the permutation */
1808 for(i=j=1; i<tokenCount; ++i) {
1809 /* set mappings that were not set for direct bytes */
1810 if(map[i]==0) {
1811 /* set an output byte value that was not used as an output byte above */
1812 while(usedOutChar[j]) {
1813 ++j;
1814 }
1815 map[i]=(uint8_t)j++;
1816 }
1817 }
1818
1819 /*
1820 * leave mappings at tokenCount and above unset if tokenCount<256
1821 * because they won't be used
1822 */
1823 }
1824}
1825
1826U_CAPI int32_t U_EXPORT2
1827uchar_swapNames(const UDataSwapper *ds,
1828 const void *inData, int32_t length, void *outData,
1829 UErrorCode *pErrorCode) {
1830 const UDataInfo *pInfo;
1831 int32_t headerSize;
1832
1833 const uint8_t *inBytes;
1834 uint8_t *outBytes;
1835
1836 uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset,
1837 offset, i, count, stringsCount;
1838
1839 const AlgorithmicRange *inRange;
1840 AlgorithmicRange *outRange;
1841
1842 /* udata_swapDataHeader checks the arguments */
1843 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
1844 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1845 return 0;
1846 }
1847
1848 /* check data format and format version */
1849 pInfo=(const UDataInfo *)((const char *)inData+4);
1850 if(!(
1851 pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */
1852 pInfo->dataFormat[1]==0x6e &&
1853 pInfo->dataFormat[2]==0x61 &&
1854 pInfo->dataFormat[3]==0x6d &&
1855 pInfo->formatVersion[0]==1
1856 )) {
1857 udata_printError(ds, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n",
1858 pInfo->dataFormat[0], pInfo->dataFormat[1],
1859 pInfo->dataFormat[2], pInfo->dataFormat[3],
1860 pInfo->formatVersion[0]);
1861 *pErrorCode=U_UNSUPPORTED_ERROR;
1862 return 0;
1863 }
1864
1865 inBytes=(const uint8_t *)inData+headerSize;
1866 outBytes=(uint8_t *)outData+headerSize;
1867 if(length<0) {
1868 algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]);
1869 } else {
1870 length-=headerSize;
1871 if( length<20 ||
1872 (uint32_t)length<(algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]))
1873 ) {
1874 udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n",
1875 length);
1876 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1877 return 0;
1878 }
1879 }
1880
1881 if(length<0) {
1882 /* preflighting: iterate through algorithmic ranges */
1883 offset=algNamesOffset;
1884 count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
1885 offset+=4;
1886
1887 for(i=0; i<count; ++i) {
1888 inRange=(const AlgorithmicRange *)(inBytes+offset);
1889 offset+=ds->readUInt16(inRange->size);
1890 }
1891 } else {
1892 /* swap data */
1893 const uint16_t *p;
1894 uint16_t *q, *temp;
1895
1896 int16_t tokens[512];
1897 uint16_t tokenCount;
1898
1899 uint8_t map[256], trailMap[256];
1900
1901 /* copy the data for inaccessible bytes */
1902 if(inBytes!=outBytes) {
1903 uprv_memcpy(outBytes, inBytes, length);
1904 }
1905
1906 /* the initial 4 offsets first */
1907 tokenStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[0]);
1908 groupsOffset=ds->readUInt32(((const uint32_t *)inBytes)[1]);
1909 groupStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[2]);
1910 ds->swapArray32(ds, inBytes, 16, outBytes, pErrorCode);
1911
1912 /*
1913 * now the tokens table
1914 * it needs to be permutated along with the compressed name strings
1915 */
1916 p=(const uint16_t *)(inBytes+16);
1917 q=(uint16_t *)(outBytes+16);
1918
1919 /* read and swap the tokenCount */
1920 tokenCount=ds->readUInt16(*p);
1921 ds->swapArray16(ds, p, 2, q, pErrorCode);
1922 ++p;
1923 ++q;
1924
1925 /* read the first 512 tokens and make the token maps */
1926 if(tokenCount<=512) {
1927 count=tokenCount;
1928 } else {
1929 count=512;
1930 }
1931 for(i=0; i<count; ++i) {
1932 tokens[i]=udata_readInt16(ds, p[i]);
1933 }
1934 for(; i<512; ++i) {
1935 tokens[i]=0; /* fill the rest of the tokens array if tokenCount<512 */
1936 }
1937 makeTokenMap(ds, tokens, tokenCount, map, pErrorCode);
1938 makeTokenMap(ds, tokens+256, (uint16_t)(tokenCount>256 ? tokenCount-256 : 0), trailMap, pErrorCode);
1939 if(U_FAILURE(*pErrorCode)) {
1940 return 0;
1941 }
1942
1943 /*
1944 * swap and permutate the tokens
1945 * go through a temporary array to support in-place swapping
1946 */
1947 temp=(uint16_t *)uprv_malloc(tokenCount*2);
1948 if(temp==NULL) {
1949 udata_printError(ds, "out of memory swapping %u unames.icu tokens\n",
1950 tokenCount);
1951 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1952 return 0;
1953 }
1954
1955 /* swap and permutate single-/lead-byte tokens */
1956 for(i=0; i<tokenCount && i<256; ++i) {
1957 ds->swapArray16(ds, p+i, 2, temp+map[i], pErrorCode);
1958 }
1959
1960 /* swap and permutate trail-byte tokens */
1961 for(; i<tokenCount; ++i) {
1962 ds->swapArray16(ds, p+i, 2, temp+(i&0xffffff00)+trailMap[i&0xff], pErrorCode);
1963 }
1964
1965 /* copy the result into the output and free the temporary array */
1966 uprv_memcpy(q, temp, tokenCount*2);
1967 uprv_free(temp);
1968
1969 /*
1970 * swap the token strings but not a possible padding byte after
1971 * the terminating NUL of the last string
1972 */
1973 udata_swapInvStringBlock(ds, inBytes+tokenStringOffset, (int32_t)(groupsOffset-tokenStringOffset),
1974 outBytes+tokenStringOffset, pErrorCode);
1975 if(U_FAILURE(*pErrorCode)) {
1976 udata_printError(ds, "uchar_swapNames(token strings) failed\n");
1977 return 0;
1978 }
1979
1980 /* swap the group table */
1981 count=ds->readUInt16(*((const uint16_t *)(inBytes+groupsOffset)));
1982 ds->swapArray16(ds, inBytes+groupsOffset, (int32_t)((1+count*3)*2),
1983 outBytes+groupsOffset, pErrorCode);
1984
1985 /*
1986 * swap the group strings
1987 * swap the string bytes but not the nibble-encoded string lengths
1988 */
1989 if(ds->inCharset!=ds->outCharset) {
1990 uint16_t offsets[LINES_PER_GROUP+1], lengths[LINES_PER_GROUP+1];
1991
1992 const uint8_t *inStrings, *nextInStrings;
1993 uint8_t *outStrings;
1994
1995 uint8_t c;
1996
1997 inStrings=inBytes+groupStringOffset;
1998 outStrings=outBytes+groupStringOffset;
1999
2000 stringsCount=algNamesOffset-groupStringOffset;
2001
2002 /* iterate through string groups until only a few padding bytes are left */
2003 while(stringsCount>32) {
2004 nextInStrings=expandGroupLengths(inStrings, offsets, lengths);
2005
2006 /* move past the length bytes */
2007 stringsCount-=(uint32_t)(nextInStrings-inStrings);
2008 outStrings+=nextInStrings-inStrings;
2009 inStrings=nextInStrings;
2010
2011 count=offsets[31]+lengths[31]; /* total number of string bytes in this group */
2012 stringsCount-=count;
2013
2014 /* swap the string bytes using map[] and trailMap[] */
2015 while(count>0) {
2016 c=*inStrings++;
2017 *outStrings++=map[c];
2018 if(tokens[c]!=-2) {
2019 --count;
2020 } else {
2021 /* token lead byte: swap the trail byte, too */
2022 *outStrings++=trailMap[*inStrings++];
2023 count-=2;
2024 }
2025 }
2026 }
2027 }
2028
2029 /* swap the algorithmic ranges */
2030 offset=algNamesOffset;
2031 count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
2032 ds->swapArray32(ds, inBytes+offset, 4, outBytes+offset, pErrorCode);
2033 offset+=4;
2034
2035 for(i=0; i<count; ++i) {
2036 if(offset>(uint32_t)length) {
2037 udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n",
2038 length, i);
2039 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2040 return 0;
2041 }
2042
2043 inRange=(const AlgorithmicRange *)(inBytes+offset);
2044 outRange=(AlgorithmicRange *)(outBytes+offset);
2045 offset+=ds->readUInt16(inRange->size);
2046
2047 ds->swapArray32(ds, inRange, 8, outRange, pErrorCode);
2048 ds->swapArray16(ds, &inRange->size, 2, &outRange->size, pErrorCode);
2049 switch(inRange->type) {
2050 case 0:
2051 /* swap prefix string */
2052 ds->swapInvChars(ds, inRange+1, (int32_t)uprv_strlen((const char *)(inRange+1)),
2053 outRange+1, pErrorCode);
2054 if(U_FAILURE(*pErrorCode)) {
2055 udata_printError(ds, "uchar_swapNames(prefix string of algorithmic range %u) failed\n",
2056 i);
2057 return 0;
2058 }
2059 break;
2060 case 1:
2061 {
2062 /* swap factors and the prefix and factor strings */
2063 uint32_t factorsCount;
2064
2065 factorsCount=inRange->variant;
2066 p=(const uint16_t *)(inRange+1);
2067 q=(uint16_t *)(outRange+1);
2068 ds->swapArray16(ds, p, (int32_t)(factorsCount*2), q, pErrorCode);
2069
2070 /* swap the strings, up to the last terminating NUL */
2071 p+=factorsCount;
2072 q+=factorsCount;
2073 stringsCount=(uint32_t)((inBytes+offset)-(const uint8_t *)p);
2074 while(stringsCount>0 && ((const uint8_t *)p)[stringsCount-1]!=0) {
2075 --stringsCount;
2076 }
2077 ds->swapInvChars(ds, p, (int32_t)stringsCount, q, pErrorCode);
2078 }
2079 break;
2080 default:
2081 udata_printError(ds, "uchar_swapNames(): unknown type %u of algorithmic range %u\n",
2082 inRange->type, i);
2083 *pErrorCode=U_UNSUPPORTED_ERROR;
2084 return 0;
2085 }
2086 }
2087 }
2088
2089 return headerSize+(int32_t)offset;
2090}
2091
2092/*
2093 * Hey, Emacs, please set the following:
2094 *
2095 * Local Variables:
2096 * indent-tabs-mode: nil
2097 * End:
2098 *
2099 */