2 *******************************************************************************
4 * Copyright (C) 2003-2004, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 2003jun20
14 * created by: Markus W. Scherer
16 * This file reads a .ucm file, stores its mappings and sorts them.
17 * It implements handling of Unicode conversion mappings from .ucm files
18 * for makeconv, canonucm, rptp2ucm, etc.
20 * Unicode code point sequences with a length of more than 1,
21 * as well as byte sequences with more than 4 bytes or more than one complete
22 * character sequence are handled to support m:n mappings.
25 #include "unicode/utypes.h"
26 #include "unicode/ustring.h"
38 /* -------------------------------------------------------------------------- */
41 printMapping(UCMapping
*m
, UChar32
*codePoints
, uint8_t *bytes
, FILE *f
) {
44 for(j
=0; j
<m
->uLen
; ++j
) {
45 fprintf(f
, "<U%04lX>", (long)codePoints
[j
]);
50 for(j
=0; j
<m
->bLen
; ++j
) {
51 fprintf(f
, "\\x%02X", bytes
[j
]);
55 fprintf(f
, " |%u\n", m
->f
);
/*
 * Print a single mapping that belongs to `table` to the stream `f`.
 *
 * The mapping's code points and bytes are fetched with
 * UCM_GET_CODE_POINTS(table, m) and UCM_GET_BYTES(table, m) and then
 * handed to printMapping(), which does the actual formatting.
 */
62 ucm_printMapping(UCMTable
*table
, UCMapping
*m
, FILE *f
) {
63 printMapping(m
, UCM_GET_CODE_POINTS(table
, m
), UCM_GET_BYTES(table
, m
), f
);
67 ucm_printTable(UCMTable
*table
, FILE *f
, UBool byUnicode
) {
72 length
=table
->mappingsLength
;
74 for(i
=0; i
<length
; ++m
, ++i
) {
75 ucm_printMapping(table
, m
, f
);
78 const int32_t *map
=table
->reverseMap
;
79 for(i
=0; i
<length
; ++i
) {
80 ucm_printMapping(table
, m
+map
[i
], f
);
85 /* mapping comparisons ------------------------------------------------------ */
88 compareUnicode(UCMTable
*lTable
, const UCMapping
*l
,
89 UCMTable
*rTable
, const UCMapping
*r
) {
90 const UChar32
*lu
, *ru
;
91 int32_t result
, i
, length
;
93 if(l
->uLen
==1 && r
->uLen
==1) {
94 /* compare two single code points */
98 /* get pointers to the code point sequences */
99 lu
=UCM_GET_CODE_POINTS(lTable
, l
);
100 ru
=UCM_GET_CODE_POINTS(rTable
, r
);
102 /* get the minimum length */
103 if(l
->uLen
<=r
->uLen
) {
109 /* compare the code points */
110 for(i
=0; i
<length
; ++i
) {
117 /* compare the lengths */
118 return l
->uLen
-r
->uLen
;
122 compareBytes(UCMTable
*lTable
, const UCMapping
*l
,
123 UCMTable
*rTable
, const UCMapping
*r
,
125 const uint8_t *lb
, *rb
;
126 int32_t result
, i
, length
;
129 * A lexical comparison is used for sorting in the builder, to allow
130 * an efficient search for a byte sequence that could be a prefix
131 * of a previously entered byte sequence.
133 * Comparing by lengths first is for compatibility with old .ucm tools
134 * like canonucm and rptp2ucm.
137 /* get the minimum length and continue */
138 if(l
->bLen
<=r
->bLen
) {
144 /* compare lengths first */
145 result
=l
->bLen
-r
->bLen
;
153 /* get pointers to the byte sequences */
154 lb
=UCM_GET_BYTES(lTable
, l
);
155 rb
=UCM_GET_BYTES(rTable
, r
);
157 /* compare the bytes */
158 for(i
=0; i
<length
; ++i
) {
165 /* compare the lengths */
166 return l
->bLen
-r
->bLen
;
169 /* compare UCMappings for sorting */
171 compareMappings(UCMTable
*lTable
, const UCMapping
*l
,
172 UCMTable
*rTable
, const UCMapping
*r
,
176 /* choose which side to compare first */
178 /* Unicode then bytes */
179 result
=compareUnicode(lTable
, l
, rTable
, r
);
181 result
=compareBytes(lTable
, l
, rTable
, r
, FALSE
); /* not lexically, like canonucm */
184 /* bytes then Unicode */
185 result
=compareBytes(lTable
, l
, rTable
, r
, TRUE
); /* lexically, for builder */
187 result
=compareUnicode(lTable
, l
, rTable
, r
);
195 /* compare the flags */
199 /* sorting by Unicode first sorts mappings directly */
/*
 * Comparison callback that works directly on UCMapping elements.
 *
 * context     is cast to UCMTable * and used as the table for both sides
 * left, right are cast to const UCMapping * (the elements being compared)
 *
 * Forwards to compareMappings() with TRUE as the final argument.
 * NOTE(review): TRUE presumably selects the "Unicode then bytes" order of
 * compareMappings() - confirm against its parameter list.
 */
201 compareMappingsUnicodeFirst(const void *context
, const void *left
, const void *right
) {
202 return compareMappings(
203 (UCMTable
*)context
, (const UCMapping
*)left
,
204 (UCMTable
*)context
, (const UCMapping
*)right
, TRUE
);
207 /* sorting by bytes first sorts the reverseMap; use indirection to mappings */
/*
 * Comparison callback that works on int32_t indexes instead of on the
 * mappings themselves.
 *
 * context     is cast to UCMTable *; its mappings array is the one indexed
 * left, right point to int32_t values l and r; the mappings compared are
 *             table->mappings+l and table->mappings+r
 *
 * Forwards to compareMappings() with FALSE as the final argument.
 * NOTE(review): FALSE presumably selects the "bytes then Unicode" order of
 * compareMappings() - confirm against its parameter list.
 */
209 compareMappingsBytesFirst(const void *context
, const void *left
, const void *right
) {
210 UCMTable
*table
=(UCMTable
*)context
;
211 int32_t l
=*(const int32_t *)left
, r
=*(const int32_t *)right
;
212 return compareMappings(
213 table
, table
->mappings
+l
,
214 table
, table
->mappings
+r
, FALSE
);
217 U_CAPI
void U_EXPORT2
218 ucm_sortTable(UCMTable
*t
) {
219 UErrorCode errorCode
;
226 errorCode
=U_ZERO_ERROR
;
228 /* 1. sort by Unicode first */
229 uprv_sortArray(t
->mappings
, t
->mappingsLength
, sizeof(UCMapping
),
230 compareMappingsUnicodeFirst
, t
,
233 /* build the reverseMap */
234 if(t
->reverseMap
==NULL
) {
236 * allocate mappingsCapacity instead of mappingsLength so that
237 * if mappings are added, the reverseMap need not be
238 * reallocated each time
239 * (see moveMappings() and ucm_addMapping())
241 t
->reverseMap
=(int32_t *)uprv_malloc(t
->mappingsCapacity
*sizeof(int32_t));
242 if(t
->reverseMap
==NULL
) {
243 fprintf(stderr
, "ucm error: unable to allocate reverseMap\n");
244 exit(U_MEMORY_ALLOCATION_ERROR
);
247 for(i
=0; i
<t
->mappingsLength
; ++i
) {
251 /* 2. sort reverseMap by mappings bytes first */
252 uprv_sortArray(t
->reverseMap
, t
->mappingsLength
, sizeof(int32_t),
253 compareMappingsBytesFirst
, t
,
256 if(U_FAILURE(errorCode
)) {
257 fprintf(stderr
, "ucm error: sortTable()/uprv_sortArray() fails - %s\n",
258 u_errorName(errorCode
));
271 * move mappings with their move flag set from the base table
272 * and optionally to the extension table
274 * works only with explicit precision flags because it uses some of the
278 moveMappings(UCMTable
*base
, UCMTable
*ext
) {
279 UCMapping
*mb
, *mbLimit
;
283 mbLimit
=mb
+base
->mappingsLength
;
288 /* reset the move flag */
291 if(ext
!=NULL
&& (flag
&MOVE_TO_EXT
)) {
292 /* add the mapping to the extension table */
293 ucm_addMapping(ext
, mb
, UCM_GET_CODE_POINTS(base
, mb
), UCM_GET_BYTES(base
, mb
));
296 /* move the last base mapping down and overwrite the current one */
298 uprv_memcpy(mb
, mbLimit
-1, sizeof(UCMapping
));
301 --base
->mappingsLength
;
302 base
->isSorted
=FALSE
;
315 checkBaseExtUnicode(UCMStates
*baseStates
, UCMTable
*base
, UCMTable
*ext
,
316 UBool moveToExt
, UBool intersectBase
) {
317 UCMapping
*mb
, *me
, *mbLimit
, *meLimit
;
322 mbLimit
=mb
+base
->mappingsLength
;
325 meLimit
=me
+ext
->mappingsLength
;
330 /* skip irrelevant mappings on both sides */
336 if(0<=mb
->f
&& mb
->f
<=2) {
348 if(0<=me
->f
&& me
->f
<=2) {
355 /* compare the base and extension mappings */
356 cmp
=compareUnicode(base
, mb
, ext
, me
);
358 if(intersectBase
&& (intersectBase
!=2 || mb
->bLen
>1)) {
360 * mapping in base but not in ext, move it
362 * if ext is DBCS, move DBCS mappings here
363 * and check SBCS ones for Unicode prefix below
365 mb
->moveFlag
|=MOVE_TO_EXT
;
368 /* does mb map from an input sequence that is a prefix of me's? */
369 } else if( mb
->uLen
<me
->uLen
&&
370 0==uprv_memcmp(UCM_GET_CODE_POINTS(base
, mb
), UCM_GET_CODE_POINTS(ext
, me
), 4*mb
->uLen
)
373 /* mark this mapping to be moved to the extension table */
374 mb
->moveFlag
|=MOVE_TO_EXT
;
378 "ucm error: the base table contains a mapping whose input sequence\n"
379 " is a prefix of the input sequence of an extension mapping\n");
380 ucm_printMapping(base
, mb
, stderr
);
381 ucm_printMapping(ext
, me
, stderr
);
389 * same output: remove the extension mapping,
390 * otherwise treat as an error
392 if( mb
->f
==me
->f
&& mb
->bLen
==me
->bLen
&&
393 0==uprv_memcmp(UCM_GET_BYTES(base
, mb
), UCM_GET_BYTES(ext
, me
), mb
->bLen
)
395 me
->moveFlag
|=REMOVE_MAPPING
;
397 } else if(intersectBase
) {
398 /* mapping in base but not in ext, move it */
399 mb
->moveFlag
|=MOVE_TO_EXT
;
403 "ucm error: the base table contains a mapping whose input sequence\n"
404 " is the same as the input sequence of an extension mapping\n"
405 " but it maps differently\n");
406 ucm_printMapping(base
, mb
, stderr
);
407 ucm_printMapping(ext
, me
, stderr
);
419 checkBaseExtBytes(UCMStates
*baseStates
, UCMTable
*base
, UCMTable
*ext
,
420 UBool moveToExt
, UBool intersectBase
) {
422 int32_t *baseMap
, *extMap
;
423 int32_t b
, e
, bLimit
, eLimit
, cmp
;
427 baseMap
=base
->reverseMap
;
428 extMap
=ext
->reverseMap
;
431 bLimit
=base
->mappingsLength
;
432 eLimit
=ext
->mappingsLength
;
436 isSISO
=(UBool
)(baseStates
->outputType
==MBCS_OUTPUT_2_SISO
);
439 /* skip irrelevant mappings on both sides */
444 mb
=base
->mappings
+baseMap
[b
];
446 if(intersectBase
==2 && mb
->bLen
==1) {
448 * comparing a base against a DBCS extension:
449 * leave SBCS base mappings alone
454 if(mb
->f
==0 || mb
->f
==3) {
463 me
=ext
->mappings
+extMap
[e
];
465 if(me
->f
==0 || me
->f
==3) {
472 /* compare the base and extension mappings */
473 cmp
=compareBytes(base
, mb
, ext
, me
, TRUE
);
476 /* mapping in base but not in ext, move it */
477 mb
->moveFlag
|=MOVE_TO_EXT
;
481 * does mb map from an input sequence that is a prefix of me's?
482 * for SI/SO tables, a single byte is never a prefix because it
483 * occurs in a separate single-byte state
485 } else if( mb
->bLen
<me
->bLen
&&
486 (!isSISO
|| mb
->bLen
>1) &&
487 0==uprv_memcmp(UCM_GET_BYTES(base
, mb
), UCM_GET_BYTES(ext
, me
), mb
->bLen
)
490 /* mark this mapping to be moved to the extension table */
491 mb
->moveFlag
|=MOVE_TO_EXT
;
495 "ucm error: the base table contains a mapping whose input sequence\n"
496 " is a prefix of the input sequence of an extension mapping\n");
497 ucm_printMapping(base
, mb
, stderr
);
498 ucm_printMapping(ext
, me
, stderr
);
506 * same output: remove the extension mapping,
507 * otherwise treat as an error
509 if( mb
->f
==me
->f
&& mb
->uLen
==me
->uLen
&&
510 0==uprv_memcmp(UCM_GET_CODE_POINTS(base
, mb
), UCM_GET_CODE_POINTS(ext
, me
), 4*mb
->uLen
)
512 me
->moveFlag
|=REMOVE_MAPPING
;
514 } else if(intersectBase
) {
515 /* mapping in base but not in ext, move it */
516 mb
->moveFlag
|=MOVE_TO_EXT
;
520 "ucm error: the base table contains a mapping whose input sequence\n"
521 " is the same as the input sequence of an extension mapping\n"
522 " but it maps differently\n");
523 ucm_printMapping(base
, mb
, stderr
);
524 ucm_printMapping(ext
, me
, stderr
);
535 U_CAPI UBool U_EXPORT2
536 ucm_checkValidity(UCMTable
*table
, UCMStates
*baseStates
) {
537 UCMapping
*m
, *mLimit
;
542 mLimit
=m
+table
->mappingsLength
;
546 count
=ucm_countChars(baseStates
, UCM_GET_BYTES(table
, m
), m
->bLen
);
548 ucm_printMapping(table
, m
, stderr
);
557 U_CAPI UBool U_EXPORT2
558 ucm_checkBaseExt(UCMStates
*baseStates
,
559 UCMTable
*base
, UCMTable
*ext
, UCMTable
*moveTarget
,
560 UBool intersectBase
) {
563 /* if we have an extension table, we must always use precision flags */
564 if(base
->flagsType
&UCM_FLAGS_IMPLICIT
) {
565 fprintf(stderr
, "ucm error: the base table contains mappings without precision flags\n");
568 if(ext
->flagsType
&UCM_FLAGS_IMPLICIT
) {
569 fprintf(stderr
, "ucm error: extension table contains mappings without precision flags\n");
573 /* checking requires both tables to be sorted */
579 checkBaseExtUnicode(baseStates
, base
, ext
, (UBool
)(moveTarget
!=NULL
), intersectBase
)|
580 checkBaseExtBytes(baseStates
, base
, ext
, (UBool
)(moveTarget
!=NULL
), intersectBase
);
582 if(result
&HAS_ERRORS
) {
586 if(result
&NEEDS_MOVE
) {
587 moveMappings(ext
, NULL
);
588 moveMappings(base
, moveTarget
);
591 if(moveTarget
!=NULL
) {
592 ucm_sortTable(moveTarget
);
599 /* merge tables for rptp2ucm ------------------------------------------------ */
601 U_CAPI
void U_EXPORT2
602 ucm_mergeTables(UCMTable
*fromUTable
, UCMTable
*toUTable
,
603 const uint8_t *subchar
, int32_t subcharLength
,
605 UCMapping
*fromUMapping
, *toUMapping
;
606 int32_t fromUIndex
, toUIndex
, fromUTop
, toUTop
, cmp
;
608 ucm_sortTable(fromUTable
);
609 ucm_sortTable(toUTable
);
611 fromUMapping
=fromUTable
->mappings
;
612 toUMapping
=toUTable
->mappings
;
614 fromUTop
=fromUTable
->mappingsLength
;
615 toUTop
=toUTable
->mappingsLength
;
617 fromUIndex
=toUIndex
=0;
619 while(fromUIndex
<fromUTop
&& toUIndex
<toUTop
) {
620 cmp
=compareMappings(fromUTable
, fromUMapping
, toUTable
, toUMapping
, TRUE
);
622 /* equal: roundtrip, nothing to do (flags are initially 0) */
630 * the fromU mapping does not have a toU counterpart:
631 * fallback Unicode->codepage
633 if( (fromUMapping
->bLen
==subcharLength
&&
634 0==uprv_memcmp(UCM_GET_BYTES(fromUTable
, fromUMapping
), subchar
, subcharLength
)) ||
635 (subchar1
!=0 && fromUMapping
->bLen
==1 && fromUMapping
->b
.bytes
[0]==subchar1
)
637 fromUMapping
->f
=2; /* SUB mapping */
639 fromUMapping
->f
=1; /* normal fallback */
646 * the toU mapping does not have a fromU counterpart:
647 * (reverse) fallback codepage->Unicode, copy it to the fromU table
650 /* ignore reverse fallbacks to Unicode SUB */
651 if(!(toUMapping
->uLen
==1 && (toUMapping
->u
==0xfffd || toUMapping
->u
==0x1a))) {
652 toUMapping
->f
=3; /* reverse fallback */
653 ucm_addMapping(fromUTable
, toUMapping
, UCM_GET_CODE_POINTS(toUTable
, toUMapping
), UCM_GET_BYTES(toUTable
, toUMapping
));
655 /* the table may have been reallocated */
656 fromUMapping
=fromUTable
->mappings
+fromUIndex
;
664 /* either one or both tables are exhausted */
665 while(fromUIndex
<fromUTop
) {
666 /* leftover fromU mappings are fallbacks */
667 if( (fromUMapping
->bLen
==subcharLength
&&
668 0==uprv_memcmp(UCM_GET_BYTES(fromUTable
, fromUMapping
), subchar
, subcharLength
)) ||
669 (subchar1
!=0 && fromUMapping
->bLen
==1 && fromUMapping
->b
.bytes
[0]==subchar1
)
671 fromUMapping
->f
=2; /* SUB mapping */
673 fromUMapping
->f
=1; /* normal fallback */
680 while(toUIndex
<toUTop
) {
681 /* leftover toU mappings are reverse fallbacks */
683 /* ignore reverse fallbacks to Unicode SUB */
684 if(!(toUMapping
->uLen
==1 && (toUMapping
->u
==0xfffd || toUMapping
->u
==0x1a))) {
685 toUMapping
->f
=3; /* reverse fallback */
686 ucm_addMapping(fromUTable
, toUMapping
, UCM_GET_CODE_POINTS(toUTable
, toUMapping
), UCM_GET_BYTES(toUTable
, toUMapping
));
693 fromUTable
->isSorted
=FALSE
;
696 /* separate extension mappings out of base table for rptp2ucm --------------- */
698 U_CAPI UBool U_EXPORT2
699 ucm_separateMappings(UCMFile
*ucm
, UBool isSISO
) {
701 UCMapping
*m
, *mLimit
;
703 UBool needsMove
, isOK
;
707 mLimit
=m
+table
->mappingsLength
;
712 for(; m
<mLimit
; ++m
) {
713 if(isSISO
&& m
->bLen
==1 && (m
->b
.bytes
[0]==0xe || m
->b
.bytes
[0]==0xf)) {
714 fprintf(stderr
, "warning: removing illegal mapping from an SI/SO-stateful table\n");
715 ucm_printMapping(table
, m
, stderr
);
716 m
->moveFlag
|=REMOVE_MAPPING
;
721 type
=ucm_mappingType(
723 UCM_GET_CODE_POINTS(table
, m
), UCM_GET_BYTES(table
, m
));
725 /* illegal byte sequence */
726 printMapping(m
, UCM_GET_CODE_POINTS(table
, m
), UCM_GET_BYTES(table
, m
), stderr
);
729 m
->moveFlag
|=MOVE_TO_EXT
;
738 moveMappings(ucm
->base
, ucm
->ext
);
739 return ucm_checkBaseExt(&ucm
->states
, ucm
->base
, ucm
->ext
, ucm
->ext
, FALSE
);
741 ucm_sortTable(ucm
->base
);
746 /* ucm parser --------------------------------------------------------------- */
748 U_CAPI
int8_t U_EXPORT2
749 ucm_parseBytes(uint8_t bytes
[UCNV_EXT_MAX_BYTES
], const char *line
, const char **ps
) {
757 /* skip an optional plus sign */
758 if(bLen
>0 && *s
=='+') {
766 (byte
=(uint8_t)uprv_strtoul(s
+2, &end
, 16), end
)!=s
+4
768 fprintf(stderr
, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line
);
772 if(bLen
==UCNV_EXT_MAX_BYTES
) {
773 fprintf(stderr
, "ucm error: too many bytes on \"%s\"\n", line
);
784 /* parse a mapping line; must not be empty */
785 U_CAPI UBool U_EXPORT2
786 ucm_parseMappingLine(UCMapping
*m
,
787 UChar32 codePoints
[UCNV_EXT_MAX_UCHARS
],
788 uint8_t bytes
[UCNV_EXT_MAX_BYTES
],
794 int8_t uLen
, bLen
, f
;
799 /* parse code points */
801 /* skip an optional plus sign */
802 if(uLen
>0 && *s
=='+') {
810 (cp
=(UChar32
)uprv_strtoul(s
+2, &end
, 16), end
)==s
+2 ||
813 fprintf(stderr
, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line
);
816 if((uint32_t)cp
>0x10ffff || U_IS_SURROGATE(cp
)) {
817 fprintf(stderr
, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line
);
821 if(uLen
==UCNV_EXT_MAX_UCHARS
) {
822 fprintf(stderr
, "ucm error: too many code points on \"%s\"\n", line
);
825 codePoints
[uLen
++]=cp
;
830 fprintf(stderr
, "ucm error: no Unicode code points on \"%s\"\n", line
);
835 UErrorCode errorCode
=U_ZERO_ERROR
;
836 u_strFromUTF32(NULL
, 0, &u16Length
, codePoints
, uLen
, &errorCode
);
837 if( (U_FAILURE(errorCode
) && errorCode
!=U_BUFFER_OVERFLOW_ERROR
) ||
838 u16Length
>UCNV_EXT_MAX_UCHARS
840 fprintf(stderr
, "ucm error: too many UChars on \"%s\"\n", line
);
845 s
=u_skipWhitespace(s
);
848 bLen
=ucm_parseBytes(bytes
, line
, &s
);
853 fprintf(stderr
, "ucm error: no bytes on \"%s\"\n", line
);
856 uprv_memcpy(m
->b
.bytes
, bytes
, bLen
);
859 /* skip everything until the fallback indicator, even the start of a comment */
862 f
=-1; /* no fallback indicator */
865 f
=(int8_t)(s
[1]-'0');
867 fprintf(stderr
, "ucm error: fallback indicator must be |0..|3 - \"%s\"\n", line
);
881 /* general APIs ------------------------------------------------------------- */
883 U_CAPI UCMTable
* U_EXPORT2
885 UCMTable
*table
=(UCMTable
*)uprv_malloc(sizeof(UCMTable
));
887 fprintf(stderr
, "ucm error: unable to allocate a UCMTable\n");
888 exit(U_MEMORY_ALLOCATION_ERROR
);
891 memset(table
, 0, sizeof(UCMTable
));
/*
 * Release the memory owned by a UCMTable.
 *
 * The visible statements free the four arrays that hang off the table:
 * mappings, codePoints, bytes and reverseMap.
 * NOTE(review): this listing does not show the lines around these calls
 * (e.g. a NULL check or the release of the table struct itself) - confirm
 * against the full source.
 */
895 U_CAPI
void U_EXPORT2
896 ucm_closeTable(UCMTable
*table
) {
898 uprv_free(table
->mappings
);
899 uprv_free(table
->codePoints
);
900 uprv_free(table
->bytes
);
901 uprv_free(table
->reverseMap
);
/*
 * Empty a UCMTable so that it can be filled again.
 *
 * The visible statements set mappingsLength, unicodeMask, bytesLength and
 * codePointsLength to 0 and mark the table as not sorted. No array is
 * freed here, so the capacities allocated so far stay available.
 * NOTE(review): some lines of this function are not visible in this
 * listing; other fields may be reset as well - confirm.
 */
906 U_CAPI
void U_EXPORT2
907 ucm_resetTable(UCMTable
*table
) {
909 table
->mappingsLength
=0;
911 table
->unicodeMask
=0;
912 table
->bytesLength
=table
->codePointsLength
=0;
913 table
->isSorted
=FALSE
;
917 U_CAPI
void U_EXPORT2
918 ucm_addMapping(UCMTable
*table
,
920 UChar32 codePoints
[UCNV_EXT_MAX_UCHARS
],
921 uint8_t bytes
[UCNV_EXT_MAX_BYTES
]) {
926 if(table
->mappingsLength
>=table
->mappingsCapacity
) {
927 /* make the mappings array larger */
928 if(table
->mappingsCapacity
==0) {
929 table
->mappingsCapacity
=1000;
931 table
->mappingsCapacity
*=10;
933 table
->mappings
=(UCMapping
*)uprv_realloc(table
->mappings
,
934 table
->mappingsCapacity
*sizeof(UCMapping
));
935 if(table
->mappings
==NULL
) {
936 fprintf(stderr
, "ucm error: unable to allocate %d UCMappings\n",
937 (int)table
->mappingsCapacity
);
938 exit(U_MEMORY_ALLOCATION_ERROR
);
941 if(table
->reverseMap
!=NULL
) {
942 /* the reverseMap must be reallocated in a new sort */
943 uprv_free(table
->reverseMap
);
944 table
->reverseMap
=NULL
;
948 if(m
->uLen
>1 && table
->codePointsCapacity
==0) {
949 table
->codePointsCapacity
=10000;
950 table
->codePoints
=(UChar32
*)uprv_malloc(table
->codePointsCapacity
*4);
951 if(table
->codePoints
==NULL
) {
952 fprintf(stderr
, "ucm error: unable to allocate %d UChar32s\n",
953 (int)table
->codePointsCapacity
);
954 exit(U_MEMORY_ALLOCATION_ERROR
);
958 if(m
->bLen
>4 && table
->bytesCapacity
==0) {
959 table
->bytesCapacity
=10000;
960 table
->bytes
=(uint8_t *)uprv_malloc(table
->bytesCapacity
);
961 if(table
->bytes
==NULL
) {
962 fprintf(stderr
, "ucm error: unable to allocate %d bytes\n",
963 (int)table
->bytesCapacity
);
964 exit(U_MEMORY_ALLOCATION_ERROR
);
969 index
=table
->codePointsLength
;
970 table
->codePointsLength
+=m
->uLen
;
971 if(table
->codePointsLength
>table
->codePointsCapacity
) {
972 fprintf(stderr
, "ucm error: too many code points in multiple-code point mappings\n");
973 exit(U_MEMORY_ALLOCATION_ERROR
);
976 uprv_memcpy(table
->codePoints
+index
, codePoints
, m
->uLen
*4);
981 index
=table
->bytesLength
;
982 table
->bytesLength
+=m
->bLen
;
983 if(table
->bytesLength
>table
->bytesCapacity
) {
984 fprintf(stderr
, "ucm error: too many bytes in mappings with >4 charset bytes\n");
985 exit(U_MEMORY_ALLOCATION_ERROR
);
988 uprv_memcpy(table
->bytes
+index
, bytes
, m
->bLen
);
992 /* set unicodeMask */
993 for(index
=0; index
<m
->uLen
; ++index
) {
996 table
->unicodeMask
|=UCNV_HAS_SUPPLEMENTARY
; /* there are supplementary code points */
997 } else if(U_IS_SURROGATE(c
)) {
998 table
->unicodeMask
|=UCNV_HAS_SURROGATES
; /* there are surrogate code points */
1004 table
->flagsType
|=UCM_FLAGS_IMPLICIT
;
1006 table
->flagsType
|=UCM_FLAGS_EXPLICIT
;
1009 tm
=table
->mappings
+table
->mappingsLength
++;
1010 uprv_memcpy(tm
, m
, sizeof(UCMapping
));
1012 table
->isSorted
=FALSE
;
1015 U_CAPI UCMFile
* U_EXPORT2
1017 UCMFile
*ucm
=(UCMFile
*)uprv_malloc(sizeof(UCMFile
));
1019 fprintf(stderr
, "ucm error: unable to allocate a UCMFile\n");
1020 exit(U_MEMORY_ALLOCATION_ERROR
);
1023 memset(ucm
, 0, sizeof(UCMFile
));
1025 ucm
->base
=ucm_openTable();
1026 ucm
->ext
=ucm_openTable();
1028 ucm
->states
.stateFlags
[0]=MBCS_STATE_FLAG_DIRECT
;
1029 ucm
->states
.conversionType
=UCNV_UNSUPPORTED_CONVERTER
;
1030 ucm
->states
.outputType
=-1;
1031 ucm
->states
.minCharLength
=ucm
->states
.maxCharLength
=1;
1036 U_CAPI
void U_EXPORT2
1037 ucm_close(UCMFile
*ucm
) {
1039 uprv_free(ucm
->base
);
1040 uprv_free(ucm
->ext
);
/*
 * Classify a mapping as belonging to the base table or the extension table.
 *
 * The number of characters in the byte sequence is obtained from
 * ucm_countChars(baseStates, bytes, m->bLen).
 *
 * Returns 0 (suitable for a base table) when the mapping has exactly one
 * code point, the bytes form exactly one character, and none of these hold:
 *   - f==2 with a single byte while baseStates->maxCharLength>1
 *   - f==1 with the single byte 0x00
 *   - more than one byte with a leading 0x00
 * Returns 1 (needs to go into an extension table) otherwise.
 *
 * NOTE(review): the branch that handles an illegal byte sequence is not
 * visible in this listing - confirm its return value in the full source.
 */
1045 U_CAPI
int32_t U_EXPORT2
1046 ucm_mappingType(UCMStates
*baseStates
,
1048 UChar32 codePoints
[UCNV_EXT_MAX_UCHARS
],
1049 uint8_t bytes
[UCNV_EXT_MAX_BYTES
]) {
1050 /* check validity of the bytes and count the characters in them */
1051 int32_t count
=ucm_countChars(baseStates
, bytes
, m
->bLen
);
1053 /* illegal byte sequence */
1058 * Suitable for an ICU conversion base table means:
1060 * - not a |2 SUB mappings for <subchar1>
1061 * - not a |1 fallback to 0x00
1062 * - no leading 0x00 bytes
1064 if( m
->uLen
==1 && count
==1 &&
1065 !((m
->f
==2 && m
->bLen
==1 && baseStates
->maxCharLength
>1) ||
1066 (m
->f
==1 && m
->bLen
==1 && bytes
[0]==0) ||
1067 (m
->bLen
>1 && bytes
[0]==0))
1069 return 0; /* suitable for a base table */
1071 return 1; /* needs to go into an extension table */
1075 U_CAPI UBool U_EXPORT2
1076 ucm_addMappingAuto(UCMFile
*ucm
, UBool forBase
, UCMStates
*baseStates
,
1078 UChar32 codePoints
[UCNV_EXT_MAX_UCHARS
],
1079 uint8_t bytes
[UCNV_EXT_MAX_BYTES
]) {
1082 if(m
->f
==2 && m
->uLen
>1) {
1083 fprintf(stderr
, "ucm error: illegal <subchar1> |2 mapping from multiple code points\n");
1084 printMapping(m
, codePoints
, bytes
, stderr
);
1088 if(baseStates
!=NULL
) {
1089 /* check validity of the bytes and count the characters in them */
1090 type
=ucm_mappingType(baseStates
, m
, codePoints
, bytes
);
1092 /* illegal byte sequence */
1093 printMapping(m
, codePoints
, bytes
, stderr
);
1097 /* not used - adding a mapping for an extension-only table before its base table is read */
1102 * Add the mapping to the base table if this is requested and suitable.
1103 * Otherwise, add it to the extension table.
1105 if(forBase
&& type
==0) {
1106 ucm_addMapping(ucm
->base
, m
, codePoints
, bytes
);
1108 ucm_addMapping(ucm
->ext
, m
, codePoints
, bytes
);
1114 U_CAPI UBool U_EXPORT2
1115 ucm_addMappingFromLine(UCMFile
*ucm
, const char *line
, UBool forBase
, UCMStates
*baseStates
) {
1117 UChar32 codePoints
[UCNV_EXT_MAX_UCHARS
];
1118 uint8_t bytes
[UCNV_EXT_MAX_BYTES
];
1122 /* ignore empty and comment lines */
1123 if(line
[0]=='#' || *(s
=u_skipWhitespace(line
))==0 || *s
=='\n' || *s
=='\r') {
1128 ucm_parseMappingLine(&m
, codePoints
, bytes
, line
) &&
1129 ucm_addMappingAuto(ucm
, forBase
, baseStates
, &m
, codePoints
, bytes
);
1132 U_CAPI
void U_EXPORT2
1133 ucm_readTable(UCMFile
*ucm
, FileStream
* convFile
,
1134 UBool forBase
, UCMStates
*baseStates
,
1135 UErrorCode
*pErrorCode
) {
1140 if(U_FAILURE(*pErrorCode
)) {
1147 /* read the next line */
1148 if(!T_FileStream_readLine(convFile
, line
, sizeof(line
))) {
1149 fprintf(stderr
, "incomplete charmap section\n");
1155 end
=uprv_strchr(line
, 0);
1156 while(line
<end
&& (*(end
-1)=='\r' || *(end
-1)=='\n')) {
1161 /* ignore empty and comment lines */
1162 if(line
[0]==0 || line
[0]=='#') {
1166 /* stop at the end of the mapping table */
1167 if(0==uprv_strcmp(line
, "END CHARMAP")) {
1171 isOK
&=ucm_addMappingFromLine(ucm
, line
, forBase
, baseStates
);
1175 *pErrorCode
=U_INVALID_TABLE_FORMAT
;