]> git.saurik.com Git - apple/icu.git/blob - icuSources/tools/toolutil/ucm.c
ICU-8.11.1.tar.gz
[apple/icu.git] / icuSources / tools / toolutil / ucm.c
1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2003-2005, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: ucm.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2003jun20
14 * created by: Markus W. Scherer
15 *
16 * This file reads a .ucm file, stores its mappings and sorts them.
17 * It implements handling of Unicode conversion mappings from .ucm files
18 * for makeconv, canonucm, rptp2ucm, etc.
19 *
20 * Unicode code point sequences with a length of more than 1,
21 * as well as byte sequences with more than 4 bytes or more than one complete
22 * character sequence are handled to support m:n mappings.
23 */
24
25 #include "unicode/utypes.h"
26 #include "unicode/ustring.h"
27 #include "cstring.h"
28 #include "cmemory.h"
29 #include "filestrm.h"
30 #include "uarrsort.h"
31 #include "ucnvmbcs.h"
32 #include "ucnv_bld.h"
33 #include "ucnv_ext.h"
34 #include "uparse.h"
35 #include "ucm.h"
36 #include <stdio.h>
37
38 #if !UCONFIG_NO_CONVERSION
39
40 /* -------------------------------------------------------------------------- */
41
42 static void
43 printMapping(UCMapping *m, UChar32 *codePoints, uint8_t *bytes, FILE *f) {
44 int32_t j;
45
46 for(j=0; j<m->uLen; ++j) {
47 fprintf(f, "<U%04lX>", (long)codePoints[j]);
48 }
49
50 fputc(' ', f);
51
52 for(j=0; j<m->bLen; ++j) {
53 fprintf(f, "\\x%02X", bytes[j]);
54 }
55
56 if(m->f>=0) {
57 fprintf(f, " |%u\n", m->f);
58 } else {
59 fputs("\n", f);
60 }
61 }
62
63 U_CAPI void U_EXPORT2
64 ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f) {
65 printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), f);
66 }
67
68 U_CAPI void U_EXPORT2
69 ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode) {
70 UCMapping *m;
71 int32_t i, length;
72
73 m=table->mappings;
74 length=table->mappingsLength;
75 if(byUnicode) {
76 for(i=0; i<length; ++m, ++i) {
77 ucm_printMapping(table, m, f);
78 }
79 } else {
80 const int32_t *map=table->reverseMap;
81 for(i=0; i<length; ++i) {
82 ucm_printMapping(table, m+map[i], f);
83 }
84 }
85 }
86
87 /* mapping comparisons ------------------------------------------------------ */
88
89 static int32_t
90 compareUnicode(UCMTable *lTable, const UCMapping *l,
91 UCMTable *rTable, const UCMapping *r) {
92 const UChar32 *lu, *ru;
93 int32_t result, i, length;
94
95 if(l->uLen==1 && r->uLen==1) {
96 /* compare two single code points */
97 return l->u-r->u;
98 }
99
100 /* get pointers to the code point sequences */
101 lu=UCM_GET_CODE_POINTS(lTable, l);
102 ru=UCM_GET_CODE_POINTS(rTable, r);
103
104 /* get the minimum length */
105 if(l->uLen<=r->uLen) {
106 length=l->uLen;
107 } else {
108 length=r->uLen;
109 }
110
111 /* compare the code points */
112 for(i=0; i<length; ++i) {
113 result=lu[i]-ru[i];
114 if(result!=0) {
115 return result;
116 }
117 }
118
119 /* compare the lengths */
120 return l->uLen-r->uLen;
121 }
122
123 static int32_t
124 compareBytes(UCMTable *lTable, const UCMapping *l,
125 UCMTable *rTable, const UCMapping *r,
126 UBool lexical) {
127 const uint8_t *lb, *rb;
128 int32_t result, i, length;
129
130 /*
131 * A lexical comparison is used for sorting in the builder, to allow
132 * an efficient search for a byte sequence that could be a prefix
133 * of a previously entered byte sequence.
134 *
135 * Comparing by lengths first is for compatibility with old .ucm tools
136 * like canonucm and rptp2ucm.
137 */
138 if(lexical) {
139 /* get the minimum length and continue */
140 if(l->bLen<=r->bLen) {
141 length=l->bLen;
142 } else {
143 length=r->bLen;
144 }
145 } else {
146 /* compare lengths first */
147 result=l->bLen-r->bLen;
148 if(result!=0) {
149 return result;
150 } else {
151 length=l->bLen;
152 }
153 }
154
155 /* get pointers to the byte sequences */
156 lb=UCM_GET_BYTES(lTable, l);
157 rb=UCM_GET_BYTES(rTable, r);
158
159 /* compare the bytes */
160 for(i=0; i<length; ++i) {
161 result=lb[i]-rb[i];
162 if(result!=0) {
163 return result;
164 }
165 }
166
167 /* compare the lengths */
168 return l->bLen-r->bLen;
169 }
170
171 /* compare UCMappings for sorting */
172 static int32_t
173 compareMappings(UCMTable *lTable, const UCMapping *l,
174 UCMTable *rTable, const UCMapping *r,
175 UBool uFirst) {
176 int32_t result;
177
178 /* choose which side to compare first */
179 if(uFirst) {
180 /* Unicode then bytes */
181 result=compareUnicode(lTable, l, rTable, r);
182 if(result==0) {
183 result=compareBytes(lTable, l, rTable, r, FALSE); /* not lexically, like canonucm */
184 }
185 } else {
186 /* bytes then Unicode */
187 result=compareBytes(lTable, l, rTable, r, TRUE); /* lexically, for builder */
188 if(result==0) {
189 result=compareUnicode(lTable, l, rTable, r);
190 }
191 }
192
193 if(result!=0) {
194 return result;
195 }
196
197 /* compare the flags */
198 return l->f-r->f;
199 }
200
201 /* sorting by Unicode first sorts mappings directly */
202 static int32_t
203 compareMappingsUnicodeFirst(const void *context, const void *left, const void *right) {
204 return compareMappings(
205 (UCMTable *)context, (const UCMapping *)left,
206 (UCMTable *)context, (const UCMapping *)right, TRUE);
207 }
208
209 /* sorting by bytes first sorts the reverseMap; use indirection to mappings */
210 static int32_t
211 compareMappingsBytesFirst(const void *context, const void *left, const void *right) {
212 UCMTable *table=(UCMTable *)context;
213 int32_t l=*(const int32_t *)left, r=*(const int32_t *)right;
214 return compareMappings(
215 table, table->mappings+l,
216 table, table->mappings+r, FALSE);
217 }
218
219 U_CAPI void U_EXPORT2
220 ucm_sortTable(UCMTable *t) {
221 UErrorCode errorCode;
222 int32_t i;
223
224 if(t->isSorted) {
225 return;
226 }
227
228 errorCode=U_ZERO_ERROR;
229
230 /* 1. sort by Unicode first */
231 uprv_sortArray(t->mappings, t->mappingsLength, sizeof(UCMapping),
232 compareMappingsUnicodeFirst, t,
233 FALSE, &errorCode);
234
235 /* build the reverseMap */
236 if(t->reverseMap==NULL) {
237 /*
238 * allocate mappingsCapacity instead of mappingsLength so that
239 * if mappings are added, the reverseMap need not be
240 * reallocated each time
241 * (see moveMappings() and ucm_addMapping())
242 */
243 t->reverseMap=(int32_t *)uprv_malloc(t->mappingsCapacity*sizeof(int32_t));
244 if(t->reverseMap==NULL) {
245 fprintf(stderr, "ucm error: unable to allocate reverseMap\n");
246 exit(U_MEMORY_ALLOCATION_ERROR);
247 }
248 }
249 for(i=0; i<t->mappingsLength; ++i) {
250 t->reverseMap[i]=i;
251 }
252
253 /* 2. sort reverseMap by mappings bytes first */
254 uprv_sortArray(t->reverseMap, t->mappingsLength, sizeof(int32_t),
255 compareMappingsBytesFirst, t,
256 FALSE, &errorCode);
257
258 if(U_FAILURE(errorCode)) {
259 fprintf(stderr, "ucm error: sortTable()/uprv_sortArray() fails - %s\n",
260 u_errorName(errorCode));
261 exit(errorCode);
262 }
263
264 t->isSorted=TRUE;
265 }
266
/* bits that are ORed into UCMapping.moveFlag and consumed by moveMappings() */
enum {
    MOVE_TO_EXT=0x1,    /* copy the mapping to the extension table when it is removed */
    REMOVE_MAPPING=0x2  /* just remove the mapping */
};
271
272 /*
273 * move mappings with their move flag set from the base table
274 * and optionally to the extension table
275 *
276 * works only with explicit precision flags because it uses some of the
277 * flags bits
278 */
279 static void
280 moveMappings(UCMTable *base, UCMTable *ext) {
281 UCMapping *mb, *mbLimit;
282 int8_t flag;
283
284 mb=base->mappings;
285 mbLimit=mb+base->mappingsLength;
286
287 while(mb<mbLimit) {
288 flag=mb->moveFlag;
289 if(flag!=0) {
290 /* reset the move flag */
291 mb->moveFlag=0;
292
293 if(ext!=NULL && (flag&MOVE_TO_EXT)) {
294 /* add the mapping to the extension table */
295 ucm_addMapping(ext, mb, UCM_GET_CODE_POINTS(base, mb), UCM_GET_BYTES(base, mb));
296 }
297
298 /* move the last base mapping down and overwrite the current one */
299 if(mb<(mbLimit-1)) {
300 uprv_memcpy(mb, mbLimit-1, sizeof(UCMapping));
301 }
302 --mbLimit;
303 --base->mappingsLength;
304 base->isSorted=FALSE;
305 } else {
306 ++mb;
307 }
308 }
309 }
310
/* result bits of checkBaseExtUnicode() and checkBaseExtBytes() */
enum {
    NEEDS_MOVE=0x1,  /* at least one moveFlag was set; moveMappings() must run */
    HAS_ERRORS=0x2   /* at least one conflict was reported to stderr */
};
315
316 static uint8_t
317 checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
318 UBool moveToExt, UBool intersectBase) {
319 UCMapping *mb, *me, *mbLimit, *meLimit;
320 int32_t cmp;
321 uint8_t result;
322
323 mb=base->mappings;
324 mbLimit=mb+base->mappingsLength;
325
326 me=ext->mappings;
327 meLimit=me+ext->mappingsLength;
328
329 result=0;
330
331 for(;;) {
332 /* skip irrelevant mappings on both sides */
333 for(;;) {
334 if(mb==mbLimit) {
335 return result;
336 }
337
338 if(0<=mb->f && mb->f<=2) {
339 break;
340 }
341
342 ++mb;
343 }
344
345 for(;;) {
346 if(me==meLimit) {
347 return result;
348 }
349
350 if(0<=me->f && me->f<=2) {
351 break;
352 }
353
354 ++me;
355 }
356
357 /* compare the base and extension mappings */
358 cmp=compareUnicode(base, mb, ext, me);
359 if(cmp<0) {
360 if(intersectBase && (intersectBase!=2 || mb->bLen>1)) {
361 /*
362 * mapping in base but not in ext, move it
363 *
364 * if ext is DBCS, move DBCS mappings here
365 * and check SBCS ones for Unicode prefix below
366 */
367 mb->moveFlag|=MOVE_TO_EXT;
368 result|=NEEDS_MOVE;
369
370 /* does mb map from an input sequence that is a prefix of me's? */
371 } else if( mb->uLen<me->uLen &&
372 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
373 ) {
374 if(moveToExt) {
375 /* mark this mapping to be moved to the extension table */
376 mb->moveFlag|=MOVE_TO_EXT;
377 result|=NEEDS_MOVE;
378 } else {
379 fprintf(stderr,
380 "ucm error: the base table contains a mapping whose input sequence\n"
381 " is a prefix of the input sequence of an extension mapping\n");
382 ucm_printMapping(base, mb, stderr);
383 ucm_printMapping(ext, me, stderr);
384 result|=HAS_ERRORS;
385 }
386 }
387
388 ++mb;
389 } else if(cmp==0) {
390 /*
391 * same output: remove the extension mapping,
392 * otherwise treat as an error
393 */
394 if( mb->f==me->f && mb->bLen==me->bLen &&
395 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
396 ) {
397 me->moveFlag|=REMOVE_MAPPING;
398 result|=NEEDS_MOVE;
399 } else if(intersectBase) {
400 /* mapping in base but not in ext, move it */
401 mb->moveFlag|=MOVE_TO_EXT;
402 result|=NEEDS_MOVE;
403 } else {
404 fprintf(stderr,
405 "ucm error: the base table contains a mapping whose input sequence\n"
406 " is the same as the input sequence of an extension mapping\n"
407 " but it maps differently\n");
408 ucm_printMapping(base, mb, stderr);
409 ucm_printMapping(ext, me, stderr);
410 result|=HAS_ERRORS;
411 }
412
413 ++mb;
414 } else /* cmp>0 */ {
415 ++me;
416 }
417 }
418 }
419
420 static uint8_t
421 checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
422 UBool moveToExt, UBool intersectBase) {
423 UCMapping *mb, *me;
424 int32_t *baseMap, *extMap;
425 int32_t b, e, bLimit, eLimit, cmp;
426 uint8_t result;
427 UBool isSISO;
428
429 baseMap=base->reverseMap;
430 extMap=ext->reverseMap;
431
432 b=e=0;
433 bLimit=base->mappingsLength;
434 eLimit=ext->mappingsLength;
435
436 result=0;
437
438 isSISO=(UBool)(baseStates->outputType==MBCS_OUTPUT_2_SISO);
439
440 for(;;) {
441 /* skip irrelevant mappings on both sides */
442 for(;; ++b) {
443 if(b==bLimit) {
444 return result;
445 }
446 mb=base->mappings+baseMap[b];
447
448 if(intersectBase==2 && mb->bLen==1) {
449 /*
450 * comparing a base against a DBCS extension:
451 * leave SBCS base mappings alone
452 */
453 continue;
454 }
455
456 if(mb->f==0 || mb->f==3) {
457 break;
458 }
459 }
460
461 for(;;) {
462 if(e==eLimit) {
463 return result;
464 }
465 me=ext->mappings+extMap[e];
466
467 if(me->f==0 || me->f==3) {
468 break;
469 }
470
471 ++e;
472 }
473
474 /* compare the base and extension mappings */
475 cmp=compareBytes(base, mb, ext, me, TRUE);
476 if(cmp<0) {
477 if(intersectBase) {
478 /* mapping in base but not in ext, move it */
479 mb->moveFlag|=MOVE_TO_EXT;
480 result|=NEEDS_MOVE;
481
482 /*
483 * does mb map from an input sequence that is a prefix of me's?
484 * for SI/SO tables, a single byte is never a prefix because it
485 * occurs in a separate single-byte state
486 */
487 } else if( mb->bLen<me->bLen &&
488 (!isSISO || mb->bLen>1) &&
489 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
490 ) {
491 if(moveToExt) {
492 /* mark this mapping to be moved to the extension table */
493 mb->moveFlag|=MOVE_TO_EXT;
494 result|=NEEDS_MOVE;
495 } else {
496 fprintf(stderr,
497 "ucm error: the base table contains a mapping whose input sequence\n"
498 " is a prefix of the input sequence of an extension mapping\n");
499 ucm_printMapping(base, mb, stderr);
500 ucm_printMapping(ext, me, stderr);
501 result|=HAS_ERRORS;
502 }
503 }
504
505 ++b;
506 } else if(cmp==0) {
507 /*
508 * same output: remove the extension mapping,
509 * otherwise treat as an error
510 */
511 if( mb->f==me->f && mb->uLen==me->uLen &&
512 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
513 ) {
514 me->moveFlag|=REMOVE_MAPPING;
515 result|=NEEDS_MOVE;
516 } else if(intersectBase) {
517 /* mapping in base but not in ext, move it */
518 mb->moveFlag|=MOVE_TO_EXT;
519 result|=NEEDS_MOVE;
520 } else {
521 fprintf(stderr,
522 "ucm error: the base table contains a mapping whose input sequence\n"
523 " is the same as the input sequence of an extension mapping\n"
524 " but it maps differently\n");
525 ucm_printMapping(base, mb, stderr);
526 ucm_printMapping(ext, me, stderr);
527 result|=HAS_ERRORS;
528 }
529
530 ++b;
531 } else /* cmp>0 */ {
532 ++e;
533 }
534 }
535 }
536
537 U_CAPI UBool U_EXPORT2
538 ucm_checkValidity(UCMTable *table, UCMStates *baseStates) {
539 UCMapping *m, *mLimit;
540 int32_t count;
541 UBool isOK;
542
543 m=table->mappings;
544 mLimit=m+table->mappingsLength;
545 isOK=TRUE;
546
547 while(m<mLimit) {
548 count=ucm_countChars(baseStates, UCM_GET_BYTES(table, m), m->bLen);
549 if(count<1) {
550 ucm_printMapping(table, m, stderr);
551 isOK=FALSE;
552 }
553 ++m;
554 }
555
556 return isOK;
557 }
558
559 U_CAPI UBool U_EXPORT2
560 ucm_checkBaseExt(UCMStates *baseStates,
561 UCMTable *base, UCMTable *ext, UCMTable *moveTarget,
562 UBool intersectBase) {
563 uint8_t result;
564
565 /* if we have an extension table, we must always use precision flags */
566 if(base->flagsType&UCM_FLAGS_IMPLICIT) {
567 fprintf(stderr, "ucm error: the base table contains mappings without precision flags\n");
568 return FALSE;
569 }
570 if(ext->flagsType&UCM_FLAGS_IMPLICIT) {
571 fprintf(stderr, "ucm error: extension table contains mappings without precision flags\n");
572 return FALSE;
573 }
574
575 /* checking requires both tables to be sorted */
576 ucm_sortTable(base);
577 ucm_sortTable(ext);
578
579 /* check */
580 result=
581 checkBaseExtUnicode(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase)|
582 checkBaseExtBytes(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase);
583
584 if(result&HAS_ERRORS) {
585 return FALSE;
586 }
587
588 if(result&NEEDS_MOVE) {
589 moveMappings(ext, NULL);
590 moveMappings(base, moveTarget);
591 ucm_sortTable(base);
592 ucm_sortTable(ext);
593 if(moveTarget!=NULL) {
594 ucm_sortTable(moveTarget);
595 }
596 }
597
598 return TRUE;
599 }
600
601 /* merge tables for rptp2ucm ------------------------------------------------ */
602
603 U_CAPI void U_EXPORT2
604 ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable,
605 const uint8_t *subchar, int32_t subcharLength,
606 uint8_t subchar1) {
607 UCMapping *fromUMapping, *toUMapping;
608 int32_t fromUIndex, toUIndex, fromUTop, toUTop, cmp;
609
610 ucm_sortTable(fromUTable);
611 ucm_sortTable(toUTable);
612
613 fromUMapping=fromUTable->mappings;
614 toUMapping=toUTable->mappings;
615
616 fromUTop=fromUTable->mappingsLength;
617 toUTop=toUTable->mappingsLength;
618
619 fromUIndex=toUIndex=0;
620
621 while(fromUIndex<fromUTop && toUIndex<toUTop) {
622 cmp=compareMappings(fromUTable, fromUMapping, toUTable, toUMapping, TRUE);
623 if(cmp==0) {
624 /* equal: roundtrip, nothing to do (flags are initially 0) */
625 ++fromUMapping;
626 ++toUMapping;
627
628 ++fromUIndex;
629 ++toUIndex;
630 } else if(cmp<0) {
631 /*
632 * the fromU mapping does not have a toU counterpart:
633 * fallback Unicode->codepage
634 */
635 if( (fromUMapping->bLen==subcharLength &&
636 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
637 (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
638 ) {
639 fromUMapping->f=2; /* SUB mapping */
640 } else {
641 fromUMapping->f=1; /* normal fallback */
642 }
643
644 ++fromUMapping;
645 ++fromUIndex;
646 } else {
647 /*
648 * the toU mapping does not have a fromU counterpart:
649 * (reverse) fallback codepage->Unicode, copy it to the fromU table
650 */
651
652 /* ignore reverse fallbacks to Unicode SUB */
653 if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
654 toUMapping->f=3; /* reverse fallback */
655 ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
656
657 /* the table may have been reallocated */
658 fromUMapping=fromUTable->mappings+fromUIndex;
659 }
660
661 ++toUMapping;
662 ++toUIndex;
663 }
664 }
665
666 /* either one or both tables are exhausted */
667 while(fromUIndex<fromUTop) {
668 /* leftover fromU mappings are fallbacks */
669 if( (fromUMapping->bLen==subcharLength &&
670 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
671 (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
672 ) {
673 fromUMapping->f=2; /* SUB mapping */
674 } else {
675 fromUMapping->f=1; /* normal fallback */
676 }
677
678 ++fromUMapping;
679 ++fromUIndex;
680 }
681
682 while(toUIndex<toUTop) {
683 /* leftover toU mappings are reverse fallbacks */
684
685 /* ignore reverse fallbacks to Unicode SUB */
686 if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
687 toUMapping->f=3; /* reverse fallback */
688 ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
689 }
690
691 ++toUMapping;
692 ++toUIndex;
693 }
694
695 fromUTable->isSorted=FALSE;
696 }
697
698 /* separate extension mappings out of base table for rptp2ucm --------------- */
699
700 U_CAPI UBool U_EXPORT2
701 ucm_separateMappings(UCMFile *ucm, UBool isSISO) {
702 UCMTable *table;
703 UCMapping *m, *mLimit;
704 int32_t type;
705 UBool needsMove, isOK;
706
707 table=ucm->base;
708 m=table->mappings;
709 mLimit=m+table->mappingsLength;
710
711 needsMove=FALSE;
712 isOK=TRUE;
713
714 for(; m<mLimit; ++m) {
715 if(isSISO && m->bLen==1 && (m->b.bytes[0]==0xe || m->b.bytes[0]==0xf)) {
716 fprintf(stderr, "warning: removing illegal mapping from an SI/SO-stateful table\n");
717 ucm_printMapping(table, m, stderr);
718 m->moveFlag|=REMOVE_MAPPING;
719 needsMove=TRUE;
720 continue;
721 }
722
723 type=ucm_mappingType(
724 &ucm->states, m,
725 UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m));
726 if(type<0) {
727 /* illegal byte sequence */
728 printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), stderr);
729 isOK=FALSE;
730 } else if(type>0) {
731 m->moveFlag|=MOVE_TO_EXT;
732 needsMove=TRUE;
733 }
734 }
735
736 if(!isOK) {
737 return FALSE;
738 }
739 if(needsMove) {
740 moveMappings(ucm->base, ucm->ext);
741 return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, ucm->ext, FALSE);
742 } else {
743 ucm_sortTable(ucm->base);
744 return TRUE;
745 }
746 }
747
748 /* ucm parser --------------------------------------------------------------- */
749
750 U_CAPI int8_t U_EXPORT2
751 ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps) {
752 const char *s=*ps;
753 char *end;
754 uint8_t byte;
755 int8_t bLen;
756
757 bLen=0;
758 for(;;) {
759 /* skip an optional plus sign */
760 if(bLen>0 && *s=='+') {
761 ++s;
762 }
763 if(*s!='\\') {
764 break;
765 }
766
767 if( s[1]!='x' ||
768 (byte=(uint8_t)uprv_strtoul(s+2, &end, 16), end)!=s+4
769 ) {
770 fprintf(stderr, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line);
771 return -1;
772 }
773
774 if(bLen==UCNV_EXT_MAX_BYTES) {
775 fprintf(stderr, "ucm error: too many bytes on \"%s\"\n", line);
776 return -1;
777 }
778 bytes[bLen++]=byte;
779 s=end;
780 }
781
782 *ps=s;
783 return bLen;
784 }
785
786 /* parse a mapping line; must not be empty */
787 U_CAPI UBool U_EXPORT2
788 ucm_parseMappingLine(UCMapping *m,
789 UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
790 uint8_t bytes[UCNV_EXT_MAX_BYTES],
791 const char *line) {
792 const char *s;
793 char *end;
794 UChar32 cp;
795 int32_t u16Length;
796 int8_t uLen, bLen, f;
797
798 s=line;
799 uLen=bLen=0;
800
801 /* parse code points */
802 for(;;) {
803 /* skip an optional plus sign */
804 if(uLen>0 && *s=='+') {
805 ++s;
806 }
807 if(*s!='<') {
808 break;
809 }
810
811 if( s[1]!='U' ||
812 (cp=(UChar32)uprv_strtoul(s+2, &end, 16), end)==s+2 ||
813 *end!='>'
814 ) {
815 fprintf(stderr, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line);
816 return FALSE;
817 }
818 if((uint32_t)cp>0x10ffff || U_IS_SURROGATE(cp)) {
819 fprintf(stderr, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line);
820 return FALSE;
821 }
822
823 if(uLen==UCNV_EXT_MAX_UCHARS) {
824 fprintf(stderr, "ucm error: too many code points on \"%s\"\n", line);
825 return FALSE;
826 }
827 codePoints[uLen++]=cp;
828 s=end+1;
829 }
830
831 if(uLen==0) {
832 fprintf(stderr, "ucm error: no Unicode code points on \"%s\"\n", line);
833 return FALSE;
834 } else if(uLen==1) {
835 m->u=codePoints[0];
836 } else {
837 UErrorCode errorCode=U_ZERO_ERROR;
838 u_strFromUTF32(NULL, 0, &u16Length, codePoints, uLen, &errorCode);
839 if( (U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) ||
840 u16Length>UCNV_EXT_MAX_UCHARS
841 ) {
842 fprintf(stderr, "ucm error: too many UChars on \"%s\"\n", line);
843 return FALSE;
844 }
845 }
846
847 s=u_skipWhitespace(s);
848
849 /* parse bytes */
850 bLen=ucm_parseBytes(bytes, line, &s);
851
852 if(bLen<0) {
853 return FALSE;
854 } else if(bLen==0) {
855 fprintf(stderr, "ucm error: no bytes on \"%s\"\n", line);
856 return FALSE;
857 } else if(bLen<=4) {
858 uprv_memcpy(m->b.bytes, bytes, bLen);
859 }
860
861 /* skip everything until the fallback indicator, even the start of a comment */
862 for(;;) {
863 if(*s==0) {
864 f=-1; /* no fallback indicator */
865 break;
866 } else if(*s=='|') {
867 f=(int8_t)(s[1]-'0');
868 if((uint8_t)f>3) {
869 fprintf(stderr, "ucm error: fallback indicator must be |0..|3 - \"%s\"\n", line);
870 return FALSE;
871 }
872 break;
873 }
874 ++s;
875 }
876
877 m->uLen=uLen;
878 m->bLen=bLen;
879 m->f=f;
880 return TRUE;
881 }
882
883 /* general APIs ------------------------------------------------------------- */
884
885 U_CAPI UCMTable * U_EXPORT2
886 ucm_openTable() {
887 UCMTable *table=(UCMTable *)uprv_malloc(sizeof(UCMTable));
888 if(table==NULL) {
889 fprintf(stderr, "ucm error: unable to allocate a UCMTable\n");
890 exit(U_MEMORY_ALLOCATION_ERROR);
891 }
892
893 memset(table, 0, sizeof(UCMTable));
894 return table;
895 }
896
897 U_CAPI void U_EXPORT2
898 ucm_closeTable(UCMTable *table) {
899 if(table!=NULL) {
900 uprv_free(table->mappings);
901 uprv_free(table->codePoints);
902 uprv_free(table->bytes);
903 uprv_free(table->reverseMap);
904 uprv_free(table);
905 }
906 }
907
908 U_CAPI void U_EXPORT2
909 ucm_resetTable(UCMTable *table) {
910 if(table!=NULL) {
911 table->mappingsLength=0;
912 table->flagsType=0;
913 table->unicodeMask=0;
914 table->bytesLength=table->codePointsLength=0;
915 table->isSorted=FALSE;
916 }
917 }
918
919 U_CAPI void U_EXPORT2
920 ucm_addMapping(UCMTable *table,
921 UCMapping *m,
922 UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
923 uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
924 UCMapping *tm;
925 UChar32 c;
926 int32_t index;
927
928 if(table->mappingsLength>=table->mappingsCapacity) {
929 /* make the mappings array larger */
930 if(table->mappingsCapacity==0) {
931 table->mappingsCapacity=1000;
932 } else {
933 table->mappingsCapacity*=10;
934 }
935 table->mappings=(UCMapping *)uprv_realloc(table->mappings,
936 table->mappingsCapacity*sizeof(UCMapping));
937 if(table->mappings==NULL) {
938 fprintf(stderr, "ucm error: unable to allocate %d UCMappings\n",
939 (int)table->mappingsCapacity);
940 exit(U_MEMORY_ALLOCATION_ERROR);
941 }
942
943 if(table->reverseMap!=NULL) {
944 /* the reverseMap must be reallocated in a new sort */
945 uprv_free(table->reverseMap);
946 table->reverseMap=NULL;
947 }
948 }
949
950 if(m->uLen>1 && table->codePointsCapacity==0) {
951 table->codePointsCapacity=10000;
952 table->codePoints=(UChar32 *)uprv_malloc(table->codePointsCapacity*4);
953 if(table->codePoints==NULL) {
954 fprintf(stderr, "ucm error: unable to allocate %d UChar32s\n",
955 (int)table->codePointsCapacity);
956 exit(U_MEMORY_ALLOCATION_ERROR);
957 }
958 }
959
960 if(m->bLen>4 && table->bytesCapacity==0) {
961 table->bytesCapacity=10000;
962 table->bytes=(uint8_t *)uprv_malloc(table->bytesCapacity);
963 if(table->bytes==NULL) {
964 fprintf(stderr, "ucm error: unable to allocate %d bytes\n",
965 (int)table->bytesCapacity);
966 exit(U_MEMORY_ALLOCATION_ERROR);
967 }
968 }
969
970 if(m->uLen>1) {
971 index=table->codePointsLength;
972 table->codePointsLength+=m->uLen;
973 if(table->codePointsLength>table->codePointsCapacity) {
974 fprintf(stderr, "ucm error: too many code points in multiple-code point mappings\n");
975 exit(U_MEMORY_ALLOCATION_ERROR);
976 }
977
978 uprv_memcpy(table->codePoints+index, codePoints, m->uLen*4);
979 m->u=index;
980 }
981
982 if(m->bLen>4) {
983 index=table->bytesLength;
984 table->bytesLength+=m->bLen;
985 if(table->bytesLength>table->bytesCapacity) {
986 fprintf(stderr, "ucm error: too many bytes in mappings with >4 charset bytes\n");
987 exit(U_MEMORY_ALLOCATION_ERROR);
988 }
989
990 uprv_memcpy(table->bytes+index, bytes, m->bLen);
991 m->b.index=index;
992 }
993
994 /* set unicodeMask */
995 for(index=0; index<m->uLen; ++index) {
996 c=codePoints[index];
997 if(c>=0x10000) {
998 table->unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */
999 } else if(U_IS_SURROGATE(c)) {
1000 table->unicodeMask|=UCNV_HAS_SURROGATES; /* there are surrogate code points */
1001 }
1002 }
1003
1004 /* set flagsType */
1005 if(m->f<0) {
1006 table->flagsType|=UCM_FLAGS_IMPLICIT;
1007 } else {
1008 table->flagsType|=UCM_FLAGS_EXPLICIT;
1009 }
1010
1011 tm=table->mappings+table->mappingsLength++;
1012 uprv_memcpy(tm, m, sizeof(UCMapping));
1013
1014 table->isSorted=FALSE;
1015 }
1016
1017 U_CAPI UCMFile * U_EXPORT2
1018 ucm_open() {
1019 UCMFile *ucm=(UCMFile *)uprv_malloc(sizeof(UCMFile));
1020 if(ucm==NULL) {
1021 fprintf(stderr, "ucm error: unable to allocate a UCMFile\n");
1022 exit(U_MEMORY_ALLOCATION_ERROR);
1023 }
1024
1025 memset(ucm, 0, sizeof(UCMFile));
1026
1027 ucm->base=ucm_openTable();
1028 ucm->ext=ucm_openTable();
1029
1030 ucm->states.stateFlags[0]=MBCS_STATE_FLAG_DIRECT;
1031 ucm->states.conversionType=UCNV_UNSUPPORTED_CONVERTER;
1032 ucm->states.outputType=-1;
1033 ucm->states.minCharLength=ucm->states.maxCharLength=1;
1034
1035 return ucm;
1036 }
1037
1038 U_CAPI void U_EXPORT2
1039 ucm_close(UCMFile *ucm) {
1040 if(ucm!=NULL) {
1041 uprv_free(ucm->base);
1042 uprv_free(ucm->ext);
1043 uprv_free(ucm);
1044 }
1045 }
1046
1047 U_CAPI int32_t U_EXPORT2
1048 ucm_mappingType(UCMStates *baseStates,
1049 UCMapping *m,
1050 UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
1051 uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
1052 /* check validity of the bytes and count the characters in them */
1053 int32_t count=ucm_countChars(baseStates, bytes, m->bLen);
1054 if(count<1) {
1055 /* illegal byte sequence */
1056 return -1;
1057 }
1058
1059 /*
1060 * Suitable for an ICU conversion base table means:
1061 * - a 1:1 mapping
1062 * - not a |2 SUB mappings for <subchar1>
1063 * - not a |1 fallback to 0x00
1064 * - no leading 0x00 bytes
1065 */
1066 if( m->uLen==1 && count==1 &&
1067 !((m->f==2 && m->bLen==1 && baseStates->maxCharLength>1) ||
1068 (m->f==1 && m->bLen==1 && bytes[0]==0) ||
1069 (m->bLen>1 && bytes[0]==0))
1070 ) {
1071 return 0; /* suitable for a base table */
1072 } else {
1073 return 1; /* needs to go into an extension table */
1074 }
1075 }
1076
1077 U_CAPI UBool U_EXPORT2
1078 ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates,
1079 UCMapping *m,
1080 UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
1081 uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
1082 int32_t type;
1083
1084 if(m->f==2 && m->uLen>1) {
1085 fprintf(stderr, "ucm error: illegal <subchar1> |2 mapping from multiple code points\n");
1086 printMapping(m, codePoints, bytes, stderr);
1087 return FALSE;
1088 }
1089
1090 if(baseStates!=NULL) {
1091 /* check validity of the bytes and count the characters in them */
1092 type=ucm_mappingType(baseStates, m, codePoints, bytes);
1093 if(type<0) {
1094 /* illegal byte sequence */
1095 printMapping(m, codePoints, bytes, stderr);
1096 return FALSE;
1097 }
1098 } else {
1099 /* not used - adding a mapping for an extension-only table before its base table is read */
1100 type=1;
1101 }
1102
1103 /*
1104 * Add the mapping to the base table if this is requested and suitable.
1105 * Otherwise, add it to the extension table.
1106 */
1107 if(forBase && type==0) {
1108 ucm_addMapping(ucm->base, m, codePoints, bytes);
1109 } else {
1110 ucm_addMapping(ucm->ext, m, codePoints, bytes);
1111 }
1112
1113 return TRUE;
1114 }
1115
1116 U_CAPI UBool U_EXPORT2
1117 ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates) {
1118 UCMapping m={ 0 };
1119 UChar32 codePoints[UCNV_EXT_MAX_UCHARS];
1120 uint8_t bytes[UCNV_EXT_MAX_BYTES];
1121
1122 const char *s;
1123
1124 /* ignore empty and comment lines */
1125 if(line[0]=='#' || *(s=u_skipWhitespace(line))==0 || *s=='\n' || *s=='\r') {
1126 return TRUE;
1127 }
1128
1129 return
1130 ucm_parseMappingLine(&m, codePoints, bytes, line) &&
1131 ucm_addMappingAuto(ucm, forBase, baseStates, &m, codePoints, bytes);
1132 }
1133
1134 U_CAPI void U_EXPORT2
1135 ucm_readTable(UCMFile *ucm, FileStream* convFile,
1136 UBool forBase, UCMStates *baseStates,
1137 UErrorCode *pErrorCode) {
1138 char line[500];
1139 char *end;
1140 UBool isOK;
1141
1142 if(U_FAILURE(*pErrorCode)) {
1143 return;
1144 }
1145
1146 isOK=TRUE;
1147
1148 for(;;) {
1149 /* read the next line */
1150 if(!T_FileStream_readLine(convFile, line, sizeof(line))) {
1151 fprintf(stderr, "incomplete charmap section\n");
1152 isOK=FALSE;
1153 break;
1154 }
1155
1156 /* remove CR LF */
1157 end=uprv_strchr(line, 0);
1158 while(line<end && (*(end-1)=='\r' || *(end-1)=='\n')) {
1159 --end;
1160 }
1161 *end=0;
1162
1163 /* ignore empty and comment lines */
1164 if(line[0]==0 || line[0]=='#') {
1165 continue;
1166 }
1167
1168 /* stop at the end of the mapping table */
1169 if(0==uprv_strcmp(line, "END CHARMAP")) {
1170 break;
1171 }
1172
1173 isOK&=ucm_addMappingFromLine(ucm, line, forBase, baseStates);
1174 }
1175
1176 if(!isOK) {
1177 *pErrorCode=U_INVALID_TABLE_FORMAT;
1178 }
1179 }
1180 #endif
1181