]> git.saurik.com Git - apple/icu.git/blame - icuSources/tools/toolutil/ucm.c
ICU-6.2.15.tar.gz
[apple/icu.git] / icuSources / tools / toolutil / ucm.c
CommitLineData
374ca955
A
1/*
2*******************************************************************************
3*
4* Copyright (C) 2003-2004, International Business Machines
5* Corporation and others. All Rights Reserved.
6*
7*******************************************************************************
8* file name: ucm.c
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 2003jun20
14* created by: Markus W. Scherer
15*
16* This file reads a .ucm file, stores its mappings and sorts them.
17* It implements handling of Unicode conversion mappings from .ucm files
18* for makeconv, canonucm, rptp2ucm, etc.
19*
20* Unicode code point sequences with a length of more than 1,
21* as well as byte sequences with more than 4 bytes or more than one complete
22* character sequence are handled to support m:n mappings.
23*/
24
25#include "unicode/utypes.h"
26#include "unicode/ustring.h"
27#include "cstring.h"
28#include "cmemory.h"
29#include "filestrm.h"
30#include "uarrsort.h"
31#include "ucnvmbcs.h"
32#include "ucnv_bld.h"
33#include "ucnv_ext.h"
34#include "uparse.h"
35#include "ucm.h"
36#include <stdio.h>
37
38/* -------------------------------------------------------------------------- */
39
40static void
41printMapping(UCMapping *m, UChar32 *codePoints, uint8_t *bytes, FILE *f) {
42 int32_t j;
43
44 for(j=0; j<m->uLen; ++j) {
45 fprintf(f, "<U%04lX>", (long)codePoints[j]);
46 }
47
48 fputc(' ', f);
49
50 for(j=0; j<m->bLen; ++j) {
51 fprintf(f, "\\x%02X", bytes[j]);
52 }
53
54 if(m->f>=0) {
55 fprintf(f, " |%u\n", m->f);
56 } else {
57 fputs("\n", f);
58 }
59}
60
61U_CAPI void U_EXPORT2
62ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f) {
63 printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), f);
64}
65
66U_CAPI void U_EXPORT2
67ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode) {
68 UCMapping *m;
69 int32_t i, length;
70
71 m=table->mappings;
72 length=table->mappingsLength;
73 if(byUnicode) {
74 for(i=0; i<length; ++m, ++i) {
75 ucm_printMapping(table, m, f);
76 }
77 } else {
78 const int32_t *map=table->reverseMap;
79 for(i=0; i<length; ++i) {
80 ucm_printMapping(table, m+map[i], f);
81 }
82 }
83}
84
85/* mapping comparisons ------------------------------------------------------ */
86
87static int32_t
88compareUnicode(UCMTable *lTable, const UCMapping *l,
89 UCMTable *rTable, const UCMapping *r) {
90 const UChar32 *lu, *ru;
91 int32_t result, i, length;
92
93 if(l->uLen==1 && r->uLen==1) {
94 /* compare two single code points */
95 return l->u-r->u;
96 }
97
98 /* get pointers to the code point sequences */
99 lu=UCM_GET_CODE_POINTS(lTable, l);
100 ru=UCM_GET_CODE_POINTS(rTable, r);
101
102 /* get the minimum length */
103 if(l->uLen<=r->uLen) {
104 length=l->uLen;
105 } else {
106 length=r->uLen;
107 }
108
109 /* compare the code points */
110 for(i=0; i<length; ++i) {
111 result=lu[i]-ru[i];
112 if(result!=0) {
113 return result;
114 }
115 }
116
117 /* compare the lengths */
118 return l->uLen-r->uLen;
119}
120
121static int32_t
122compareBytes(UCMTable *lTable, const UCMapping *l,
123 UCMTable *rTable, const UCMapping *r,
124 UBool lexical) {
125 const uint8_t *lb, *rb;
126 int32_t result, i, length;
127
128 /*
129 * A lexical comparison is used for sorting in the builder, to allow
130 * an efficient search for a byte sequence that could be a prefix
131 * of a previously entered byte sequence.
132 *
133 * Comparing by lengths first is for compatibility with old .ucm tools
134 * like canonucm and rptp2ucm.
135 */
136 if(lexical) {
137 /* get the minimum length and continue */
138 if(l->bLen<=r->bLen) {
139 length=l->bLen;
140 } else {
141 length=r->bLen;
142 }
143 } else {
144 /* compare lengths first */
145 result=l->bLen-r->bLen;
146 if(result!=0) {
147 return result;
148 } else {
149 length=l->bLen;
150 }
151 }
152
153 /* get pointers to the byte sequences */
154 lb=UCM_GET_BYTES(lTable, l);
155 rb=UCM_GET_BYTES(rTable, r);
156
157 /* compare the bytes */
158 for(i=0; i<length; ++i) {
159 result=lb[i]-rb[i];
160 if(result!=0) {
161 return result;
162 }
163 }
164
165 /* compare the lengths */
166 return l->bLen-r->bLen;
167}
168
169/* compare UCMappings for sorting */
170static int32_t
171compareMappings(UCMTable *lTable, const UCMapping *l,
172 UCMTable *rTable, const UCMapping *r,
173 UBool uFirst) {
174 int32_t result;
175
176 /* choose which side to compare first */
177 if(uFirst) {
178 /* Unicode then bytes */
179 result=compareUnicode(lTable, l, rTable, r);
180 if(result==0) {
181 result=compareBytes(lTable, l, rTable, r, FALSE); /* not lexically, like canonucm */
182 }
183 } else {
184 /* bytes then Unicode */
185 result=compareBytes(lTable, l, rTable, r, TRUE); /* lexically, for builder */
186 if(result==0) {
187 result=compareUnicode(lTable, l, rTable, r);
188 }
189 }
190
191 if(result!=0) {
192 return result;
193 }
194
195 /* compare the flags */
196 return l->f-r->f;
197}
198
199/* sorting by Unicode first sorts mappings directly */
200static int32_t
201compareMappingsUnicodeFirst(const void *context, const void *left, const void *right) {
202 return compareMappings(
203 (UCMTable *)context, (const UCMapping *)left,
204 (UCMTable *)context, (const UCMapping *)right, TRUE);
205}
206
207/* sorting by bytes first sorts the reverseMap; use indirection to mappings */
208static int32_t
209compareMappingsBytesFirst(const void *context, const void *left, const void *right) {
210 UCMTable *table=(UCMTable *)context;
211 int32_t l=*(const int32_t *)left, r=*(const int32_t *)right;
212 return compareMappings(
213 table, table->mappings+l,
214 table, table->mappings+r, FALSE);
215}
216
217U_CAPI void U_EXPORT2
218ucm_sortTable(UCMTable *t) {
219 UErrorCode errorCode;
220 int32_t i;
221
222 if(t->isSorted) {
223 return;
224 }
225
226 errorCode=U_ZERO_ERROR;
227
228 /* 1. sort by Unicode first */
229 uprv_sortArray(t->mappings, t->mappingsLength, sizeof(UCMapping),
230 compareMappingsUnicodeFirst, t,
231 FALSE, &errorCode);
232
233 /* build the reverseMap */
234 if(t->reverseMap==NULL) {
235 /*
236 * allocate mappingsCapacity instead of mappingsLength so that
237 * if mappings are added, the reverseMap need not be
238 * reallocated each time
239 * (see moveMappings() and ucm_addMapping())
240 */
241 t->reverseMap=(int32_t *)uprv_malloc(t->mappingsCapacity*sizeof(int32_t));
242 if(t->reverseMap==NULL) {
243 fprintf(stderr, "ucm error: unable to allocate reverseMap\n");
244 exit(U_MEMORY_ALLOCATION_ERROR);
245 }
246 }
247 for(i=0; i<t->mappingsLength; ++i) {
248 t->reverseMap[i]=i;
249 }
250
251 /* 2. sort reverseMap by mappings bytes first */
252 uprv_sortArray(t->reverseMap, t->mappingsLength, sizeof(int32_t),
253 compareMappingsBytesFirst, t,
254 FALSE, &errorCode);
255
256 if(U_FAILURE(errorCode)) {
257 fprintf(stderr, "ucm error: sortTable()/uprv_sortArray() fails - %s\n",
258 u_errorName(errorCode));
259 exit(errorCode);
260 }
261
262 t->isSorted=TRUE;
263}
264
/* bits for UCMapping.moveFlag, see moveMappings() */
enum {
    MOVE_TO_EXT    = 0x1, /* remove from the base table and add to the extension table */
    REMOVE_MAPPING = 0x2  /* remove from its table */
};
269
270/*
271 * move mappings with their move flag set from the base table
272 * and optionally to the extension table
273 *
274 * works only with explicit precision flags because it uses some of the
275 * flags bits
276 */
277static void
278moveMappings(UCMTable *base, UCMTable *ext) {
279 UCMapping *mb, *mbLimit;
280 int8_t flag;
281
282 mb=base->mappings;
283 mbLimit=mb+base->mappingsLength;
284
285 while(mb<mbLimit) {
286 flag=mb->moveFlag;
287 if(flag!=0) {
288 /* reset the move flag */
289 mb->moveFlag=0;
290
291 if(ext!=NULL && (flag&MOVE_TO_EXT)) {
292 /* add the mapping to the extension table */
293 ucm_addMapping(ext, mb, UCM_GET_CODE_POINTS(base, mb), UCM_GET_BYTES(base, mb));
294 }
295
296 /* move the last base mapping down and overwrite the current one */
297 if(mb<(mbLimit-1)) {
298 uprv_memcpy(mb, mbLimit-1, sizeof(UCMapping));
299 }
300 --mbLimit;
301 --base->mappingsLength;
302 base->isSorted=FALSE;
303 } else {
304 ++mb;
305 }
306 }
307}
308
/* result bits of the base/extension table checks */
enum {
    NEEDS_MOVE = 0x1, /* at least one mapping had its move flag set */
    HAS_ERRORS = 0x2  /* at least one conflict was reported */
};
313
314static uint8_t
315checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
316 UBool moveToExt, UBool intersectBase) {
317 UCMapping *mb, *me, *mbLimit, *meLimit;
318 int32_t cmp;
319 uint8_t result;
320
321 mb=base->mappings;
322 mbLimit=mb+base->mappingsLength;
323
324 me=ext->mappings;
325 meLimit=me+ext->mappingsLength;
326
327 result=0;
328
329 for(;;) {
330 /* skip irrelevant mappings on both sides */
331 for(;;) {
332 if(mb==mbLimit) {
333 return result;
334 }
335
336 if(0<=mb->f && mb->f<=2) {
337 break;
338 }
339
340 ++mb;
341 }
342
343 for(;;) {
344 if(me==meLimit) {
345 return result;
346 }
347
348 if(0<=me->f && me->f<=2) {
349 break;
350 }
351
352 ++me;
353 }
354
355 /* compare the base and extension mappings */
356 cmp=compareUnicode(base, mb, ext, me);
357 if(cmp<0) {
358 if(intersectBase && (intersectBase!=2 || mb->bLen>1)) {
359 /*
360 * mapping in base but not in ext, move it
361 *
362 * if ext is DBCS, move DBCS mappings here
363 * and check SBCS ones for Unicode prefix below
364 */
365 mb->moveFlag|=MOVE_TO_EXT;
366 result|=NEEDS_MOVE;
367
368 /* does mb map from an input sequence that is a prefix of me's? */
369 } else if( mb->uLen<me->uLen &&
370 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
371 ) {
372 if(moveToExt) {
373 /* mark this mapping to be moved to the extension table */
374 mb->moveFlag|=MOVE_TO_EXT;
375 result|=NEEDS_MOVE;
376 } else {
377 fprintf(stderr,
378 "ucm error: the base table contains a mapping whose input sequence\n"
379 " is a prefix of the input sequence of an extension mapping\n");
380 ucm_printMapping(base, mb, stderr);
381 ucm_printMapping(ext, me, stderr);
382 result|=HAS_ERRORS;
383 }
384 }
385
386 ++mb;
387 } else if(cmp==0) {
388 /*
389 * same output: remove the extension mapping,
390 * otherwise treat as an error
391 */
392 if( mb->f==me->f && mb->bLen==me->bLen &&
393 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
394 ) {
395 me->moveFlag|=REMOVE_MAPPING;
396 result|=NEEDS_MOVE;
397 } else if(intersectBase) {
398 /* mapping in base but not in ext, move it */
399 mb->moveFlag|=MOVE_TO_EXT;
400 result|=NEEDS_MOVE;
401 } else {
402 fprintf(stderr,
403 "ucm error: the base table contains a mapping whose input sequence\n"
404 " is the same as the input sequence of an extension mapping\n"
405 " but it maps differently\n");
406 ucm_printMapping(base, mb, stderr);
407 ucm_printMapping(ext, me, stderr);
408 result|=HAS_ERRORS;
409 }
410
411 ++mb;
412 } else /* cmp>0 */ {
413 ++me;
414 }
415 }
416}
417
418static uint8_t
419checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
420 UBool moveToExt, UBool intersectBase) {
421 UCMapping *mb, *me;
422 int32_t *baseMap, *extMap;
423 int32_t b, e, bLimit, eLimit, cmp;
424 uint8_t result;
425 UBool isSISO;
426
427 baseMap=base->reverseMap;
428 extMap=ext->reverseMap;
429
430 b=e=0;
431 bLimit=base->mappingsLength;
432 eLimit=ext->mappingsLength;
433
434 result=0;
435
436 isSISO=(UBool)(baseStates->outputType==MBCS_OUTPUT_2_SISO);
437
438 for(;;) {
439 /* skip irrelevant mappings on both sides */
440 for(;; ++b) {
441 if(b==bLimit) {
442 return result;
443 }
444 mb=base->mappings+baseMap[b];
445
446 if(intersectBase==2 && mb->bLen==1) {
447 /*
448 * comparing a base against a DBCS extension:
449 * leave SBCS base mappings alone
450 */
451 continue;
452 }
453
454 if(mb->f==0 || mb->f==3) {
455 break;
456 }
457 }
458
459 for(;;) {
460 if(e==eLimit) {
461 return result;
462 }
463 me=ext->mappings+extMap[e];
464
465 if(me->f==0 || me->f==3) {
466 break;
467 }
468
469 ++e;
470 }
471
472 /* compare the base and extension mappings */
473 cmp=compareBytes(base, mb, ext, me, TRUE);
474 if(cmp<0) {
475 if(intersectBase) {
476 /* mapping in base but not in ext, move it */
477 mb->moveFlag|=MOVE_TO_EXT;
478 result|=NEEDS_MOVE;
479
480 /*
481 * does mb map from an input sequence that is a prefix of me's?
482 * for SI/SO tables, a single byte is never a prefix because it
483 * occurs in a separate single-byte state
484 */
485 } else if( mb->bLen<me->bLen &&
486 (!isSISO || mb->bLen>1) &&
487 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
488 ) {
489 if(moveToExt) {
490 /* mark this mapping to be moved to the extension table */
491 mb->moveFlag|=MOVE_TO_EXT;
492 result|=NEEDS_MOVE;
493 } else {
494 fprintf(stderr,
495 "ucm error: the base table contains a mapping whose input sequence\n"
496 " is a prefix of the input sequence of an extension mapping\n");
497 ucm_printMapping(base, mb, stderr);
498 ucm_printMapping(ext, me, stderr);
499 result|=HAS_ERRORS;
500 }
501 }
502
503 ++b;
504 } else if(cmp==0) {
505 /*
506 * same output: remove the extension mapping,
507 * otherwise treat as an error
508 */
509 if( mb->f==me->f && mb->uLen==me->uLen &&
510 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
511 ) {
512 me->moveFlag|=REMOVE_MAPPING;
513 result|=NEEDS_MOVE;
514 } else if(intersectBase) {
515 /* mapping in base but not in ext, move it */
516 mb->moveFlag|=MOVE_TO_EXT;
517 result|=NEEDS_MOVE;
518 } else {
519 fprintf(stderr,
520 "ucm error: the base table contains a mapping whose input sequence\n"
521 " is the same as the input sequence of an extension mapping\n"
522 " but it maps differently\n");
523 ucm_printMapping(base, mb, stderr);
524 ucm_printMapping(ext, me, stderr);
525 result|=HAS_ERRORS;
526 }
527
528 ++b;
529 } else /* cmp>0 */ {
530 ++e;
531 }
532 }
533}
534
535U_CAPI UBool U_EXPORT2
536ucm_checkValidity(UCMTable *table, UCMStates *baseStates) {
537 UCMapping *m, *mLimit;
538 int32_t count;
539 UBool isOK;
540
541 m=table->mappings;
542 mLimit=m+table->mappingsLength;
543 isOK=TRUE;
544
545 while(m<mLimit) {
546 count=ucm_countChars(baseStates, UCM_GET_BYTES(table, m), m->bLen);
547 if(count<1) {
548 ucm_printMapping(table, m, stderr);
549 isOK=FALSE;
550 }
551 ++m;
552 }
553
554 return isOK;
555}
556
557U_CAPI UBool U_EXPORT2
558ucm_checkBaseExt(UCMStates *baseStates,
559 UCMTable *base, UCMTable *ext, UCMTable *moveTarget,
560 UBool intersectBase) {
561 uint8_t result;
562
563 /* if we have an extension table, we must always use precision flags */
564 if(base->flagsType&UCM_FLAGS_IMPLICIT) {
565 fprintf(stderr, "ucm error: the base table contains mappings without precision flags\n");
566 return FALSE;
567 }
568 if(ext->flagsType&UCM_FLAGS_IMPLICIT) {
569 fprintf(stderr, "ucm error: extension table contains mappings without precision flags\n");
570 return FALSE;
571 }
572
573 /* checking requires both tables to be sorted */
574 ucm_sortTable(base);
575 ucm_sortTable(ext);
576
577 /* check */
578 result=
579 checkBaseExtUnicode(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase)|
580 checkBaseExtBytes(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase);
581
582 if(result&HAS_ERRORS) {
583 return FALSE;
584 }
585
586 if(result&NEEDS_MOVE) {
587 moveMappings(ext, NULL);
588 moveMappings(base, moveTarget);
589 ucm_sortTable(base);
590 ucm_sortTable(ext);
591 if(moveTarget!=NULL) {
592 ucm_sortTable(moveTarget);
593 }
594 }
595
596 return TRUE;
597}
598
599/* merge tables for rptp2ucm ------------------------------------------------ */
600
601U_CAPI void U_EXPORT2
602ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable,
603 const uint8_t *subchar, int32_t subcharLength,
604 uint8_t subchar1) {
605 UCMapping *fromUMapping, *toUMapping;
606 int32_t fromUIndex, toUIndex, fromUTop, toUTop, cmp;
607
608 ucm_sortTable(fromUTable);
609 ucm_sortTable(toUTable);
610
611 fromUMapping=fromUTable->mappings;
612 toUMapping=toUTable->mappings;
613
614 fromUTop=fromUTable->mappingsLength;
615 toUTop=toUTable->mappingsLength;
616
617 fromUIndex=toUIndex=0;
618
619 while(fromUIndex<fromUTop && toUIndex<toUTop) {
620 cmp=compareMappings(fromUTable, fromUMapping, toUTable, toUMapping, TRUE);
621 if(cmp==0) {
622 /* equal: roundtrip, nothing to do (flags are initially 0) */
623 ++fromUMapping;
624 ++toUMapping;
625
626 ++fromUIndex;
627 ++toUIndex;
628 } else if(cmp<0) {
629 /*
630 * the fromU mapping does not have a toU counterpart:
631 * fallback Unicode->codepage
632 */
633 if( (fromUMapping->bLen==subcharLength &&
634 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
635 (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
636 ) {
637 fromUMapping->f=2; /* SUB mapping */
638 } else {
639 fromUMapping->f=1; /* normal fallback */
640 }
641
642 ++fromUMapping;
643 ++fromUIndex;
644 } else {
645 /*
646 * the toU mapping does not have a fromU counterpart:
647 * (reverse) fallback codepage->Unicode, copy it to the fromU table
648 */
649
650 /* ignore reverse fallbacks to Unicode SUB */
651 if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
652 toUMapping->f=3; /* reverse fallback */
653 ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
654
655 /* the table may have been reallocated */
656 fromUMapping=fromUTable->mappings+fromUIndex;
657 }
658
659 ++toUMapping;
660 ++toUIndex;
661 }
662 }
663
664 /* either one or both tables are exhausted */
665 while(fromUIndex<fromUTop) {
666 /* leftover fromU mappings are fallbacks */
667 if( (fromUMapping->bLen==subcharLength &&
668 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
669 (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
670 ) {
671 fromUMapping->f=2; /* SUB mapping */
672 } else {
673 fromUMapping->f=1; /* normal fallback */
674 }
675
676 ++fromUMapping;
677 ++fromUIndex;
678 }
679
680 while(toUIndex<toUTop) {
681 /* leftover toU mappings are reverse fallbacks */
682
683 /* ignore reverse fallbacks to Unicode SUB */
684 if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
685 toUMapping->f=3; /* reverse fallback */
686 ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
687 }
688
689 ++toUMapping;
690 ++toUIndex;
691 }
692
693 fromUTable->isSorted=FALSE;
694}
695
696/* separate extension mappings out of base table for rptp2ucm --------------- */
697
698U_CAPI UBool U_EXPORT2
699ucm_separateMappings(UCMFile *ucm, UBool isSISO) {
700 UCMTable *table;
701 UCMapping *m, *mLimit;
702 int32_t type;
703 UBool needsMove, isOK;
704
705 table=ucm->base;
706 m=table->mappings;
707 mLimit=m+table->mappingsLength;
708
709 needsMove=FALSE;
710 isOK=TRUE;
711
712 for(; m<mLimit; ++m) {
713 if(isSISO && m->bLen==1 && (m->b.bytes[0]==0xe || m->b.bytes[0]==0xf)) {
714 fprintf(stderr, "warning: removing illegal mapping from an SI/SO-stateful table\n");
715 ucm_printMapping(table, m, stderr);
716 m->moveFlag|=REMOVE_MAPPING;
717 needsMove=TRUE;
718 continue;
719 }
720
721 type=ucm_mappingType(
722 &ucm->states, m,
723 UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m));
724 if(type<0) {
725 /* illegal byte sequence */
726 printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), stderr);
727 isOK=FALSE;
728 } else if(type>0) {
729 m->moveFlag|=MOVE_TO_EXT;
730 needsMove=TRUE;
731 }
732 }
733
734 if(!isOK) {
735 return FALSE;
736 }
737 if(needsMove) {
738 moveMappings(ucm->base, ucm->ext);
739 return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, ucm->ext, FALSE);
740 } else {
741 ucm_sortTable(ucm->base);
742 return TRUE;
743 }
744}
745
746/* ucm parser --------------------------------------------------------------- */
747
748U_CAPI int8_t U_EXPORT2
749ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps) {
750 const char *s=*ps;
751 char *end;
752 uint8_t byte;
753 int8_t bLen;
754
755 bLen=0;
756 for(;;) {
757 /* skip an optional plus sign */
758 if(bLen>0 && *s=='+') {
759 ++s;
760 }
761 if(*s!='\\') {
762 break;
763 }
764
765 if( s[1]!='x' ||
766 (byte=(uint8_t)uprv_strtoul(s+2, &end, 16), end)!=s+4
767 ) {
768 fprintf(stderr, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line);
769 return -1;
770 }
771
772 if(bLen==UCNV_EXT_MAX_BYTES) {
773 fprintf(stderr, "ucm error: too many bytes on \"%s\"\n", line);
774 return -1;
775 }
776 bytes[bLen++]=byte;
777 s=end;
778 }
779
780 *ps=s;
781 return bLen;
782}
783
784/* parse a mapping line; must not be empty */
785U_CAPI UBool U_EXPORT2
786ucm_parseMappingLine(UCMapping *m,
787 UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
788 uint8_t bytes[UCNV_EXT_MAX_BYTES],
789 const char *line) {
790 const char *s;
791 char *end;
792 UChar32 cp;
793 int32_t u16Length;
794 int8_t uLen, bLen, f;
795
796 s=line;
797 uLen=bLen=0;
798
799 /* parse code points */
800 for(;;) {
801 /* skip an optional plus sign */
802 if(uLen>0 && *s=='+') {
803 ++s;
804 }
805 if(*s!='<') {
806 break;
807 }
808
809 if( s[1]!='U' ||
810 (cp=(UChar32)uprv_strtoul(s+2, &end, 16), end)==s+2 ||
811 *end!='>'
812 ) {
813 fprintf(stderr, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line);
814 return FALSE;
815 }
816 if((uint32_t)cp>0x10ffff || U_IS_SURROGATE(cp)) {
817 fprintf(stderr, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line);
818 return FALSE;
819 }
820
821 if(uLen==UCNV_EXT_MAX_UCHARS) {
822 fprintf(stderr, "ucm error: too many code points on \"%s\"\n", line);
823 return FALSE;
824 }
825 codePoints[uLen++]=cp;
826 s=end+1;
827 }
828
829 if(uLen==0) {
830 fprintf(stderr, "ucm error: no Unicode code points on \"%s\"\n", line);
831 return FALSE;
832 } else if(uLen==1) {
833 m->u=codePoints[0];
834 } else {
835 UErrorCode errorCode=U_ZERO_ERROR;
836 u_strFromUTF32(NULL, 0, &u16Length, codePoints, uLen, &errorCode);
837 if( (U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) ||
838 u16Length>UCNV_EXT_MAX_UCHARS
839 ) {
840 fprintf(stderr, "ucm error: too many UChars on \"%s\"\n", line);
841 return FALSE;
842 }
843 }
844
845 s=u_skipWhitespace(s);
846
847 /* parse bytes */
848 bLen=ucm_parseBytes(bytes, line, &s);
849
850 if(bLen<0) {
851 return FALSE;
852 } else if(bLen==0) {
853 fprintf(stderr, "ucm error: no bytes on \"%s\"\n", line);
854 return FALSE;
855 } else if(bLen<=4) {
856 uprv_memcpy(m->b.bytes, bytes, bLen);
857 }
858
859 /* skip everything until the fallback indicator, even the start of a comment */
860 for(;;) {
861 if(*s==0) {
862 f=-1; /* no fallback indicator */
863 break;
864 } else if(*s=='|') {
865 f=(int8_t)(s[1]-'0');
866 if((uint8_t)f>3) {
867 fprintf(stderr, "ucm error: fallback indicator must be |0..|3 - \"%s\"\n", line);
868 return FALSE;
869 }
870 break;
871 }
872 ++s;
873 }
874
875 m->uLen=uLen;
876 m->bLen=bLen;
877 m->f=f;
878 return TRUE;
879}
880
881/* general APIs ------------------------------------------------------------- */
882
883U_CAPI UCMTable * U_EXPORT2
884ucm_openTable() {
885 UCMTable *table=(UCMTable *)uprv_malloc(sizeof(UCMTable));
886 if(table==NULL) {
887 fprintf(stderr, "ucm error: unable to allocate a UCMTable\n");
888 exit(U_MEMORY_ALLOCATION_ERROR);
889 }
890
891 memset(table, 0, sizeof(UCMTable));
892 return table;
893}
894
895U_CAPI void U_EXPORT2
896ucm_closeTable(UCMTable *table) {
897 if(table!=NULL) {
898 uprv_free(table->mappings);
899 uprv_free(table->codePoints);
900 uprv_free(table->bytes);
901 uprv_free(table->reverseMap);
902 uprv_free(table);
903 }
904}
905
906U_CAPI void U_EXPORT2
907ucm_resetTable(UCMTable *table) {
908 if(table!=NULL) {
909 table->mappingsLength=0;
910 table->flagsType=0;
911 table->unicodeMask=0;
912 table->bytesLength=table->codePointsLength=0;
913 table->isSorted=FALSE;
914 }
915}
916
917U_CAPI void U_EXPORT2
918ucm_addMapping(UCMTable *table,
919 UCMapping *m,
920 UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
921 uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
922 UCMapping *tm;
923 UChar32 c;
924 int32_t index;
925
926 if(table->mappingsLength>=table->mappingsCapacity) {
927 /* make the mappings array larger */
928 if(table->mappingsCapacity==0) {
929 table->mappingsCapacity=1000;
930 } else {
931 table->mappingsCapacity*=10;
932 }
933 table->mappings=(UCMapping *)uprv_realloc(table->mappings,
934 table->mappingsCapacity*sizeof(UCMapping));
935 if(table->mappings==NULL) {
936 fprintf(stderr, "ucm error: unable to allocate %d UCMappings\n",
937 (int)table->mappingsCapacity);
938 exit(U_MEMORY_ALLOCATION_ERROR);
939 }
940
941 if(table->reverseMap!=NULL) {
942 /* the reverseMap must be reallocated in a new sort */
943 uprv_free(table->reverseMap);
944 table->reverseMap=NULL;
945 }
946 }
947
948 if(m->uLen>1 && table->codePointsCapacity==0) {
949 table->codePointsCapacity=10000;
950 table->codePoints=(UChar32 *)uprv_malloc(table->codePointsCapacity*4);
951 if(table->codePoints==NULL) {
952 fprintf(stderr, "ucm error: unable to allocate %d UChar32s\n",
953 (int)table->codePointsCapacity);
954 exit(U_MEMORY_ALLOCATION_ERROR);
955 }
956 }
957
958 if(m->bLen>4 && table->bytesCapacity==0) {
959 table->bytesCapacity=10000;
960 table->bytes=(uint8_t *)uprv_malloc(table->bytesCapacity);
961 if(table->bytes==NULL) {
962 fprintf(stderr, "ucm error: unable to allocate %d bytes\n",
963 (int)table->bytesCapacity);
964 exit(U_MEMORY_ALLOCATION_ERROR);
965 }
966 }
967
968 if(m->uLen>1) {
969 index=table->codePointsLength;
970 table->codePointsLength+=m->uLen;
971 if(table->codePointsLength>table->codePointsCapacity) {
972 fprintf(stderr, "ucm error: too many code points in multiple-code point mappings\n");
973 exit(U_MEMORY_ALLOCATION_ERROR);
974 }
975
976 uprv_memcpy(table->codePoints+index, codePoints, m->uLen*4);
977 m->u=index;
978 }
979
980 if(m->bLen>4) {
981 index=table->bytesLength;
982 table->bytesLength+=m->bLen;
983 if(table->bytesLength>table->bytesCapacity) {
984 fprintf(stderr, "ucm error: too many bytes in mappings with >4 charset bytes\n");
985 exit(U_MEMORY_ALLOCATION_ERROR);
986 }
987
988 uprv_memcpy(table->bytes+index, bytes, m->bLen);
989 m->b.index=index;
990 }
991
992 /* set unicodeMask */
993 for(index=0; index<m->uLen; ++index) {
994 c=codePoints[index];
995 if(c>=0x10000) {
996 table->unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */
997 } else if(U_IS_SURROGATE(c)) {
998 table->unicodeMask|=UCNV_HAS_SURROGATES; /* there are surrogate code points */
999 }
1000 }
1001
1002 /* set flagsType */
1003 if(m->f<0) {
1004 table->flagsType|=UCM_FLAGS_IMPLICIT;
1005 } else {
1006 table->flagsType|=UCM_FLAGS_EXPLICIT;
1007 }
1008
1009 tm=table->mappings+table->mappingsLength++;
1010 uprv_memcpy(tm, m, sizeof(UCMapping));
1011
1012 table->isSorted=FALSE;
1013}
1014
1015U_CAPI UCMFile * U_EXPORT2
1016ucm_open() {
1017 UCMFile *ucm=(UCMFile *)uprv_malloc(sizeof(UCMFile));
1018 if(ucm==NULL) {
1019 fprintf(stderr, "ucm error: unable to allocate a UCMFile\n");
1020 exit(U_MEMORY_ALLOCATION_ERROR);
1021 }
1022
1023 memset(ucm, 0, sizeof(UCMFile));
1024
1025 ucm->base=ucm_openTable();
1026 ucm->ext=ucm_openTable();
1027
1028 ucm->states.stateFlags[0]=MBCS_STATE_FLAG_DIRECT;
1029 ucm->states.conversionType=UCNV_UNSUPPORTED_CONVERTER;
1030 ucm->states.outputType=-1;
1031 ucm->states.minCharLength=ucm->states.maxCharLength=1;
1032
1033 return ucm;
1034}
1035
1036U_CAPI void U_EXPORT2
1037ucm_close(UCMFile *ucm) {
1038 if(ucm!=NULL) {
1039 uprv_free(ucm->base);
1040 uprv_free(ucm->ext);
1041 uprv_free(ucm);
1042 }
1043}
1044
1045U_CAPI int32_t U_EXPORT2
1046ucm_mappingType(UCMStates *baseStates,
1047 UCMapping *m,
1048 UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
1049 uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
1050 /* check validity of the bytes and count the characters in them */
1051 int32_t count=ucm_countChars(baseStates, bytes, m->bLen);
1052 if(count<1) {
1053 /* illegal byte sequence */
1054 return -1;
1055 }
1056
1057 /*
1058 * Suitable for an ICU conversion base table means:
1059 * - a 1:1 mapping
1060 * - not a |2 SUB mappings for <subchar1>
1061 * - not a |1 fallback to 0x00
1062 * - no leading 0x00 bytes
1063 */
1064 if( m->uLen==1 && count==1 &&
1065 !((m->f==2 && m->bLen==1 && baseStates->maxCharLength>1) ||
1066 (m->f==1 && m->bLen==1 && bytes[0]==0) ||
1067 (m->bLen>1 && bytes[0]==0))
1068 ) {
1069 return 0; /* suitable for a base table */
1070 } else {
1071 return 1; /* needs to go into an extension table */
1072 }
1073}
1074
1075U_CAPI UBool U_EXPORT2
1076ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates,
1077 UCMapping *m,
1078 UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
1079 uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
1080 int32_t type;
1081
1082 if(m->f==2 && m->uLen>1) {
1083 fprintf(stderr, "ucm error: illegal <subchar1> |2 mapping from multiple code points\n");
1084 printMapping(m, codePoints, bytes, stderr);
1085 return FALSE;
1086 }
1087
1088 if(baseStates!=NULL) {
1089 /* check validity of the bytes and count the characters in them */
1090 type=ucm_mappingType(baseStates, m, codePoints, bytes);
1091 if(type<0) {
1092 /* illegal byte sequence */
1093 printMapping(m, codePoints, bytes, stderr);
1094 return FALSE;
1095 }
1096 } else {
1097 /* not used - adding a mapping for an extension-only table before its base table is read */
1098 type=1;
1099 }
1100
1101 /*
1102 * Add the mapping to the base table if this is requested and suitable.
1103 * Otherwise, add it to the extension table.
1104 */
1105 if(forBase && type==0) {
1106 ucm_addMapping(ucm->base, m, codePoints, bytes);
1107 } else {
1108 ucm_addMapping(ucm->ext, m, codePoints, bytes);
1109 }
1110
1111 return TRUE;
1112}
1113
1114U_CAPI UBool U_EXPORT2
1115ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates) {
1116 UCMapping m={ 0 };
1117 UChar32 codePoints[UCNV_EXT_MAX_UCHARS];
1118 uint8_t bytes[UCNV_EXT_MAX_BYTES];
1119
1120 const char *s;
1121
1122 /* ignore empty and comment lines */
1123 if(line[0]=='#' || *(s=u_skipWhitespace(line))==0 || *s=='\n' || *s=='\r') {
1124 return TRUE;
1125 }
1126
1127 return
1128 ucm_parseMappingLine(&m, codePoints, bytes, line) &&
1129 ucm_addMappingAuto(ucm, forBase, baseStates, &m, codePoints, bytes);
1130}
1131
1132U_CAPI void U_EXPORT2
1133ucm_readTable(UCMFile *ucm, FileStream* convFile,
1134 UBool forBase, UCMStates *baseStates,
1135 UErrorCode *pErrorCode) {
1136 char line[500];
1137 char *end;
1138 UBool isOK;
1139
1140 if(U_FAILURE(*pErrorCode)) {
1141 return;
1142 }
1143
1144 isOK=TRUE;
1145
1146 for(;;) {
1147 /* read the next line */
1148 if(!T_FileStream_readLine(convFile, line, sizeof(line))) {
1149 fprintf(stderr, "incomplete charmap section\n");
1150 isOK=FALSE;
1151 break;
1152 }
1153
1154 /* remove CR LF */
1155 end=uprv_strchr(line, 0);
1156 while(line<end && (*(end-1)=='\r' || *(end-1)=='\n')) {
1157 --end;
1158 }
1159 *end=0;
1160
1161 /* ignore empty and comment lines */
1162 if(line[0]==0 || line[0]=='#') {
1163 continue;
1164 }
1165
1166 /* stop at the end of the mapping table */
1167 if(0==uprv_strcmp(line, "END CHARMAP")) {
1168 break;
1169 }
1170
1171 isOK&=ucm_addMappingFromLine(ucm, line, forBase, baseStates);
1172 }
1173
1174 if(!isOK) {
1175 *pErrorCode=U_INVALID_TABLE_FORMAT;
1176 }
1177}