2 *******************************************************************************
4 * Copyright (C) 2003-2005, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 2003jun20
14 * created by: Markus W. Scherer
16 * This file reads a .ucm file, stores its mappings and sorts them.
17 * It implements handling of Unicode conversion mappings from .ucm files
18 * for makeconv, canonucm, rptp2ucm, etc.
20 * Unicode code point sequences with a length of more than 1,
21 * as well as byte sequences with more than 4 bytes or more than one complete
22 * character sequence are handled to support m:n mappings.
25 #include "unicode/utypes.h"
26 #include "unicode/ustring.h"
38 #if !UCONFIG_NO_CONVERSION
40 /* -------------------------------------------------------------------------- */
/*
 * Write one mapping to f as a line of .ucm text: every code point as a
 * <U...> token with at least 4 uppercase hex digits, every byte as a
 * two-digit hex escape, and finally the flag m->f after a '|'.
 * m supplies the two lengths (uLen, bLen) and the flag; the code points and
 * bytes themselves are passed in separately.
 * NOTE(review): this listing does not show the declaration of j, whatever
 * is written between the two sequences, or the closing braces - confirm
 * against the complete file before relying on the exact output format.
 */
43 printMapping(UCMapping
*m
, UChar32
*codePoints
, uint8_t *bytes
, FILE *f
) {
/* Unicode side: one token per code point */
46 for(j
=0; j
<m
->uLen
; ++j
) {
47 fprintf(f
, "<U%04lX>", (long)codePoints
[j
]);
/* byte side: one hex escape per byte */
52 for(j
=0; j
<m
->bLen
; ++j
) {
53 fprintf(f
, "\\x%02X", bytes
[j
]);
/* end the line with the flag stored in m->f */
57 fprintf(f
, " |%u\n", m
->f
);
/*
 * Print mapping m, which belongs to table, to f.
 * The code points and bytes of m are looked up through the
 * UCM_GET_CODE_POINTS and UCM_GET_BYTES macros and the actual output is
 * delegated to printMapping().
 */
64 ucm_printMapping(UCMTable
*table
, UCMapping
*m
, FILE *f
) {
65 printMapping(m
, UCM_GET_CODE_POINTS(table
, m
), UCM_GET_BYTES(table
, m
), f
);
/*
 * Print all table->mappingsLength mappings of table to f, one at a time
 * through ucm_printMapping().
 * Two traversal orders are visible: a direct walk over the mappings array,
 * and a walk that reaches each mapping through table->reverseMap.
 * NOTE(review): the test on byUnicode that chooses between the two walks is
 * not visible in this listing - presumably a true value selects the direct
 * walk; confirm.
 */
69 ucm_printTable(UCMTable
*table
, FILE *f
, UBool byUnicode
) {
74 length
=table
->mappingsLength
;
/* direct order: m advances through the mappings array together with i */
76 for(i
=0; i
<length
; ++m
, ++i
) {
77 ucm_printMapping(table
, m
, f
);
/* indirect order: each reverseMap entry is an offset added to m */
80 const int32_t *map
=table
->reverseMap
;
81 for(i
=0; i
<length
; ++i
) {
82 ucm_printMapping(table
, m
+map
[i
], f
);
87 /* mapping comparisons ------------------------------------------------------ */
/*
 * Compare the Unicode sides of two mappings. l and r may live in different
 * tables, which is why each comes with its own table pointer for the
 * UCM_GET_CODE_POINTS lookup.
 * Visible steps: a shortcut when both mappings have exactly one code point;
 * otherwise the code points are compared over the shorter of the two
 * lengths, and the difference l->uLen - r->uLen is returned last.
 * NOTE(review): the return type line, the body of the single-code-point
 * shortcut, the assignments to length and the body of the comparison loop
 * are not visible in this listing.
 */
90 compareUnicode(UCMTable
*lTable
, const UCMapping
*l
,
91 UCMTable
*rTable
, const UCMapping
*r
) {
92 const UChar32
*lu
, *ru
;
93 int32_t result
, i
, length
;
95 if(l
->uLen
==1 && r
->uLen
==1) {
96 /* compare two single code points */
100 /* get pointers to the code point sequences */
101 lu
=UCM_GET_CODE_POINTS(lTable
, l
);
102 ru
=UCM_GET_CODE_POINTS(rTable
, r
);
104 /* get the minimum length */
105 if(l
->uLen
<=r
->uLen
) {
111 /* compare the code points */
112 for(i
=0; i
<length
; ++i
) {
119 /* compare the lengths */
/* reached after the loop above; the length difference is the result */
120 return l
->uLen
-r
->uLen
;
/*
 * Compare the byte sides of two mappings. l and r may live in different
 * tables, each passed with its own table pointer for UCM_GET_BYTES.
 * Visible steps: determine the shorter length, optionally take the length
 * difference as the result first, then compare the bytes over the shorter
 * length, and return l->bLen - r->bLen last.
 * NOTE(review): the last parameter is cut off in this listing. Callers in
 * this file pass TRUE where they want a lexical comparison and FALSE where
 * they want lengths compared first, so it is presumably a UBool switch
 * between those two modes; confirm against the complete file.
 */
124 compareBytes(UCMTable
*lTable
, const UCMapping
*l
,
125 UCMTable
*rTable
, const UCMapping
*r
,
127 const uint8_t *lb
, *rb
;
128 int32_t result
, i
, length
;
131 * A lexical comparison is used for sorting in the builder, to allow
132 * an efficient search for a byte sequence that could be a prefix
133 * of a previously entered byte sequence.
135 * Comparing by lengths first is for compatibility with old .ucm tools
136 * like canonucm and rptp2ucm.
139 /* get the minimum length and continue */
140 if(l
->bLen
<=r
->bLen
) {
146 /* compare lengths first */
147 result
=l
->bLen
-r
->bLen
;
155 /* get pointers to the byte sequences */
156 lb
=UCM_GET_BYTES(lTable
, l
);
157 rb
=UCM_GET_BYTES(rTable
, r
);
159 /* compare the bytes */
160 for(i
=0; i
<length
; ++i
) {
167 /* compare the lengths */
168 return l
->bLen
-r
->bLen
;
171 /* compare UCMappings for sorting */
/*
 * Full ordering of two mappings, used by the sort callbacks and by
 * ucm_mergeTables() in this file.
 * Two orders are visible:
 * - Unicode side first via compareUnicode(), then the bytes compared
 *   non-lexically
 * - bytes first, compared lexically, then the Unicode side
 * The flags are compared after both sides.
 * NOTE(review): the last parameter is cut off in this listing. The callers
 * pass TRUE from compareMappingsUnicodeFirst() and FALSE from
 * compareMappingsBytesFirst(), so it presumably selects the Unicode-first
 * order; the tests on result between the steps are not visible either.
 */
173 compareMappings(UCMTable
*lTable
, const UCMapping
*l
,
174 UCMTable
*rTable
, const UCMapping
*r
,
178 /* choose which side to compare first */
180 /* Unicode then bytes */
181 result
=compareUnicode(lTable
, l
, rTable
, r
);
183 result
=compareBytes(lTable
, l
, rTable
, r
, FALSE
); /* not lexically, like canonucm */
186 /* bytes then Unicode */
187 result
=compareBytes(lTable
, l
, rTable
, r
, TRUE
); /* lexically, for builder */
189 result
=compareUnicode(lTable
, l
, rTable
, r
);
197 /* compare the flags */
201 /* sorting by Unicode first sorts mappings directly */
/*
 * Comparison callback handed to uprv_sortArray() for the mappings array.
 * context is the UCMTable that owns both elements; left and right point
 * directly at UCMapping elements. Both sides use the same table, and TRUE
 * is passed as the last argument of compareMappings().
 */
203 compareMappingsUnicodeFirst(const void *context
, const void *left
, const void *right
) {
204 return compareMappings(
205 (UCMTable
*)context
, (const UCMapping
*)left
,
206 (UCMTable
*)context
, (const UCMapping
*)right
, TRUE
);
209 /* sorting by bytes first sorts the reverseMap; use indirection to mappings */
/*
 * Comparison callback handed to uprv_sortArray() for the reverseMap array.
 * context is the UCMTable; left and right point at int32_t elements whose
 * values are used as offsets into table->mappings. The two mappings found
 * that way are compared with FALSE as the last argument of
 * compareMappings().
 */
211 compareMappingsBytesFirst(const void *context
, const void *left
, const void *right
) {
212 UCMTable
*table
=(UCMTable
*)context
;
/* the sorted elements are indexes, not mappings */
213 int32_t l
=*(const int32_t *)left
, r
=*(const int32_t *)right
;
214 return compareMappings(
215 table
, table
->mappings
+l
,
216 table
, table
->mappings
+r
, FALSE
);
/*
 * Sort table t in two ways:
 * 1. the mappings array itself, with compareMappingsUnicodeFirst()
 * 2. t->reverseMap, an int32_t array, with compareMappingsBytesFirst()
 * reverseMap is allocated here when it is NULL, sized by mappingsCapacity;
 * if that allocation fails a message is printed and the process exits.
 * A failing uprv_sortArray() is reported on stderr.
 * NOTE(review): any early return (for example when the table is already
 * sorted), the remaining arguments of both uprv_sortArray() calls and what
 * happens after the final error message are not visible in this listing.
 */
219 U_CAPI
void U_EXPORT2
220 ucm_sortTable(UCMTable
*t
) {
221 UErrorCode errorCode
;
228 errorCode
=U_ZERO_ERROR
;
230 /* 1. sort by Unicode first */
231 uprv_sortArray(t
->mappings
, t
->mappingsLength
, sizeof(UCMapping
),
232 compareMappingsUnicodeFirst
, t
,
235 /* build the reverseMap */
236 if(t
->reverseMap
==NULL
) {
238 * allocate mappingsCapacity instead of mappingsLength so that
239 * if mappings are added, the reverseMap need not be
240 * reallocated each time
241 * (see moveMappings() and ucm_addMapping())
243 t
->reverseMap
=(int32_t *)uprv_malloc(t
->mappingsCapacity
*sizeof(int32_t));
244 if(t
->reverseMap
==NULL
) {
245 fprintf(stderr
, "ucm error: unable to allocate reverseMap\n");
246 exit(U_MEMORY_ALLOCATION_ERROR
);
/* NOTE(review): loop body not visible - presumably stores i in reverseMap[i] before the second sort; confirm */
249 for(i
=0; i
<t
->mappingsLength
; ++i
) {
253 /* 2. sort reverseMap by mappings bytes first */
254 uprv_sortArray(t
->reverseMap
, t
->mappingsLength
, sizeof(int32_t),
255 compareMappingsBytesFirst
, t
,
258 if(U_FAILURE(errorCode
)) {
259 fprintf(stderr
, "ucm error: sortTable()/uprv_sortArray() fails - %s\n",
260 u_errorName(errorCode
));
273 * move mappings with their move flag set from the base table
274 * and optionally to the extension table
276 * works only with explicit precision flags because it uses some of the
/*
 * Take flagged mappings out of base.
 * For a flagged mapping: when ext is not NULL and the flag contains
 * MOVE_TO_EXT, the mapping is first added to ext with ucm_addMapping();
 * then the last base mapping is copied over the current slot, the base
 * length is decremented and base->isSorted is cleared.
 * ucm_checkBaseExt() in this file also calls this with ext==NULL, in which
 * case nothing is added anywhere.
 * NOTE(review): the loop header, the initialization of mb, the reading and
 * testing of flag, and the adjustment of mbLimit are not visible in this
 * listing.
 */
280 moveMappings(UCMTable
*base
, UCMTable
*ext
) {
281 UCMapping
*mb
, *mbLimit
;
285 mbLimit
=mb
+base
->mappingsLength
;
290 /* reset the move flag */
293 if(ext
!=NULL
&& (flag
&MOVE_TO_EXT
)) {
294 /* add the mapping to the extension table */
295 ucm_addMapping(ext
, mb
, UCM_GET_CODE_POINTS(base
, mb
), UCM_GET_BYTES(base
, mb
));
298 /* move the last base mapping down and overwrite the current one */
300 uprv_memcpy(mb
, mbLimit
-1, sizeof(UCMapping
));
/* one mapping fewer, and the swap above breaks the sort order */
303 --base
->mappingsLength
;
304 base
->isSorted
=FALSE
;
/*
 * Check the base table against the extension table by their Unicode sides.
 * Both mappings arrays are walked with a pair of pointers (mb up to mbLimit
 * for base, me up to meLimit for ext) and compared with compareUnicode().
 * Only mappings whose flag f is in 0..2 are tested on this side.
 * Visible outcomes:
 * - base mapping with no counterpart in ext: flagged MOVE_TO_EXT when
 *   intersectBase is set (for intersectBase==2 only if it has more than
 *   one byte)
 * - base input is a proper prefix of an ext input (4 bytes are compared per
 *   code point): flagged MOVE_TO_EXT, or reported as an error with both
 *   mappings printed to stderr
 * - same input on both sides: if flag and bytes are identical the ext
 *   mapping is flagged REMOVE_MAPPING; otherwise the base mapping is moved
 *   when intersectBase is set, or an error is printed
 * NOTE(review): the return type, the loop structure, the use of moveToExt
 * and the statements that build the result are not visible in this
 * listing. ucm_checkBaseExt() ORs this result with that of
 * checkBaseExtBytes() and tests the HAS_ERRORS and NEEDS_MOVE bits.
 */
317 checkBaseExtUnicode(UCMStates
*baseStates
, UCMTable
*base
, UCMTable
*ext
,
318 UBool moveToExt
, UBool intersectBase
) {
319 UCMapping
*mb
, *me
, *mbLimit
, *meLimit
;
324 mbLimit
=mb
+base
->mappingsLength
;
327 meLimit
=me
+ext
->mappingsLength
;
332 /* skip irrelevant mappings on both sides */
/* NOTE(review): presumably a mapping with f in 0..2 ends the skipping; the branch body is not visible */
338 if(0<=mb
->f
&& mb
->f
<=2) {
350 if(0<=me
->f
&& me
->f
<=2) {
357 /* compare the base and extension mappings */
358 cmp
=compareUnicode(base
, mb
, ext
, me
);
360 if(intersectBase
&& (intersectBase
!=2 || mb
->bLen
>1)) {
362 * mapping in base but not in ext, move it
364 * if ext is DBCS, move DBCS mappings here
365 * and check SBCS ones for Unicode prefix below
367 mb
->moveFlag
|=MOVE_TO_EXT
;
370 /* does mb map from an input sequence that is a prefix of me's? */
371 } else if( mb
->uLen
<me
->uLen
&&
372 0==uprv_memcmp(UCM_GET_CODE_POINTS(base
, mb
), UCM_GET_CODE_POINTS(ext
, me
), 4*mb
->uLen
)
375 /* mark this mapping to be moved to the extension table */
376 mb
->moveFlag
|=MOVE_TO_EXT
;
380 "ucm error: the base table contains a mapping whose input sequence\n"
381 " is a prefix of the input sequence of an extension mapping\n");
382 ucm_printMapping(base
, mb
, stderr
);
383 ucm_printMapping(ext
, me
, stderr
);
391 * same output: remove the extension mapping,
392 * otherwise treat as an error
394 if( mb
->f
==me
->f
&& mb
->bLen
==me
->bLen
&&
395 0==uprv_memcmp(UCM_GET_BYTES(base
, mb
), UCM_GET_BYTES(ext
, me
), mb
->bLen
)
397 me
->moveFlag
|=REMOVE_MAPPING
;
399 } else if(intersectBase
) {
400 /* mapping in base but not in ext, move it */
401 mb
->moveFlag
|=MOVE_TO_EXT
;
405 "ucm error: the base table contains a mapping whose input sequence\n"
406 " is the same as the input sequence of an extension mapping\n"
407 " but it maps differently\n");
408 ucm_printMapping(base
, mb
, stderr
);
409 ucm_printMapping(ext
, me
, stderr
);
/*
 * Check the base table against the extension table by their byte sides.
 * Unlike the Unicode-side check, the mappings are reached through each
 * table's reverseMap (baseMap, extMap) with indexes b and e running up to
 * bLimit and eLimit, and compared lexically with compareBytes().
 * Only mappings whose flag f is 0 or 3 are tested on this side, and for
 * intersectBase==2 single-byte base mappings are left alone.
 * Visible outcomes:
 * - base mapping with no counterpart in ext: flagged MOVE_TO_EXT
 * - base bytes are a proper prefix of ext bytes: flagged MOVE_TO_EXT, or
 *   reported as an error with both mappings printed to stderr; for tables
 *   whose outputType is MBCS_OUTPUT_2_SISO a single byte never counts as a
 *   prefix
 * - same bytes on both sides: if flag and code points are identical the ext
 *   mapping is flagged REMOVE_MAPPING; otherwise the base mapping is moved
 *   when intersectBase is set, or an error is printed
 * NOTE(review): the return type, the loop structure, the use of moveToExt
 * and the statements that build the result are not visible in this
 * listing.
 */
421 checkBaseExtBytes(UCMStates
*baseStates
, UCMTable
*base
, UCMTable
*ext
,
422 UBool moveToExt
, UBool intersectBase
) {
424 int32_t *baseMap
, *extMap
;
425 int32_t b
, e
, bLimit
, eLimit
, cmp
;
429 baseMap
=base
->reverseMap
;
430 extMap
=ext
->reverseMap
;
433 bLimit
=base
->mappingsLength
;
434 eLimit
=ext
->mappingsLength
;
438 isSISO
=(UBool
)(baseStates
->outputType
==MBCS_OUTPUT_2_SISO
);
441 /* skip irrelevant mappings on both sides */
/* reverseMap entries are indexes into the mappings array */
446 mb
=base
->mappings
+baseMap
[b
];
448 if(intersectBase
==2 && mb
->bLen
==1) {
450 * comparing a base against a DBCS extension:
451 * leave SBCS base mappings alone
456 if(mb
->f
==0 || mb
->f
==3) {
465 me
=ext
->mappings
+extMap
[e
];
467 if(me
->f
==0 || me
->f
==3) {
474 /* compare the base and extension mappings */
475 cmp
=compareBytes(base
, mb
, ext
, me
, TRUE
);
478 /* mapping in base but not in ext, move it */
479 mb
->moveFlag
|=MOVE_TO_EXT
;
483 * does mb map from an input sequence that is a prefix of me's?
484 * for SI/SO tables, a single byte is never a prefix because it
485 * occurs in a separate single-byte state
487 } else if( mb
->bLen
<me
->bLen
&&
488 (!isSISO
|| mb
->bLen
>1) &&
489 0==uprv_memcmp(UCM_GET_BYTES(base
, mb
), UCM_GET_BYTES(ext
, me
), mb
->bLen
)
492 /* mark this mapping to be moved to the extension table */
493 mb
->moveFlag
|=MOVE_TO_EXT
;
497 "ucm error: the base table contains a mapping whose input sequence\n"
498 " is a prefix of the input sequence of an extension mapping\n");
499 ucm_printMapping(base
, mb
, stderr
);
500 ucm_printMapping(ext
, me
, stderr
);
508 * same output: remove the extension mapping,
509 * otherwise treat as an error
511 if( mb
->f
==me
->f
&& mb
->uLen
==me
->uLen
&&
512 0==uprv_memcmp(UCM_GET_CODE_POINTS(base
, mb
), UCM_GET_CODE_POINTS(ext
, me
), 4*mb
->uLen
)
514 me
->moveFlag
|=REMOVE_MAPPING
;
516 } else if(intersectBase
) {
517 /* mapping in base but not in ext, move it */
518 mb
->moveFlag
|=MOVE_TO_EXT
;
522 "ucm error: the base table contains a mapping whose input sequence\n"
523 " is the same as the input sequence of an extension mapping\n"
524 " but it maps differently\n");
525 ucm_printMapping(base
, mb
, stderr
);
526 ucm_printMapping(ext
, me
, stderr
);
/*
 * Check the byte sequences of all mappings in table against baseStates.
 * For each mapping ucm_countChars() is run on its bytes; a mapping can be
 * printed to stderr with ucm_printMapping().
 * NOTE(review): the loop header, the test on count that leads to the
 * printing, and the returned UBool are not visible in this listing -
 * presumably a mapping is printed and the result becomes false when its
 * bytes are not valid for the state table; confirm.
 */
537 U_CAPI UBool U_EXPORT2
538 ucm_checkValidity(UCMTable
*table
, UCMStates
*baseStates
) {
539 UCMapping
*m
, *mLimit
;
544 mLimit
=m
+table
->mappingsLength
;
548 count
=ucm_countChars(baseStates
, UCM_GET_BYTES(table
, m
), m
->bLen
);
550 ucm_printMapping(table
, m
, stderr
);
/*
 * Check a base table against an extension table and move mappings between
 * them where the checks ask for it.
 * - Both tables must carry explicit precision flags: a table with
 *   UCM_FLAGS_IMPLICIT set in flagsType gets an error message.
 * - checkBaseExtUnicode() and checkBaseExtBytes() are both run and their
 *   results are ORed; each receives (moveTarget!=NULL) as its moveToExt
 *   argument.
 * - If the combined result has NEEDS_MOVE, flagged mappings are taken out
 *   of ext (moveMappings() with a NULL target) and flagged base mappings
 *   are moved into moveTarget; a non-NULL moveTarget is sorted afterwards.
 * NOTE(review): the sorting announced by the comment inside the body, the
 * handling after the HAS_ERRORS test and all return statements are not
 * visible in this listing.
 */
559 U_CAPI UBool U_EXPORT2
560 ucm_checkBaseExt(UCMStates
*baseStates
,
561 UCMTable
*base
, UCMTable
*ext
, UCMTable
*moveTarget
,
562 UBool intersectBase
) {
565 /* if we have an extension table, we must always use precision flags */
566 if(base
->flagsType
&UCM_FLAGS_IMPLICIT
) {
567 fprintf(stderr
, "ucm error: the base table contains mappings without precision flags\n");
570 if(ext
->flagsType
&UCM_FLAGS_IMPLICIT
) {
571 fprintf(stderr
, "ucm error: extension table contains mappings without precision flags\n");
575 /* checking requires both tables to be sorted */
/* bitwise OR, so both checks always run and their result bits are combined */
581 checkBaseExtUnicode(baseStates
, base
, ext
, (UBool
)(moveTarget
!=NULL
), intersectBase
)|
582 checkBaseExtBytes(baseStates
, base
, ext
, (UBool
)(moveTarget
!=NULL
), intersectBase
);
584 if(result
&HAS_ERRORS
) {
588 if(result
&NEEDS_MOVE
) {
/* first drop flagged extension mappings, then move flagged base mappings */
589 moveMappings(ext
, NULL
);
590 moveMappings(base
, moveTarget
);
593 if(moveTarget
!=NULL
) {
594 ucm_sortTable(moveTarget
);
601 /* merge tables for rptp2ucm ------------------------------------------------ */
/*
 * Merge the mappings of toUTable into fromUTable and set precision flags.
 * Both tables are sorted first and then walked in parallel, comparing the
 * current mappings with compareMappings() (last argument TRUE):
 * - equal: a roundtrip mapping, the flag is left alone
 * - fromU mapping without toU counterpart: flag 2 (SUB mapping) if its
 *   bytes equal subchar (subcharLength bytes) or if it is a single byte
 *   equal to a non-zero subchar1; otherwise flag 1 (fallback)
 * - toU mapping without fromU counterpart: flag 3 (reverse fallback) and
 *   the mapping is added to fromUTable, unless it maps to the single code
 *   point 0xfffd or 0x1a, which is ignored
 * Mappings left over in either table after the parallel walk get the same
 * treatment. fromUTable->isSorted is cleared at the end.
 * NOTE(review): the last parameter is cut off in this listing - the body
 * uses subchar1, which is presumably that parameter. The branches on cmp
 * and the statements advancing the indexes and pointers are not visible.
 */
603 U_CAPI
void U_EXPORT2
604 ucm_mergeTables(UCMTable
*fromUTable
, UCMTable
*toUTable
,
605 const uint8_t *subchar
, int32_t subcharLength
,
607 UCMapping
*fromUMapping
, *toUMapping
;
608 int32_t fromUIndex
, toUIndex
, fromUTop
, toUTop
, cmp
;
610 ucm_sortTable(fromUTable
);
611 ucm_sortTable(toUTable
);
613 fromUMapping
=fromUTable
->mappings
;
614 toUMapping
=toUTable
->mappings
;
/* the limits are taken once, before any mapping is appended to fromUTable */
616 fromUTop
=fromUTable
->mappingsLength
;
617 toUTop
=toUTable
->mappingsLength
;
619 fromUIndex
=toUIndex
=0;
621 while(fromUIndex
<fromUTop
&& toUIndex
<toUTop
) {
622 cmp
=compareMappings(fromUTable
, fromUMapping
, toUTable
, toUMapping
, TRUE
);
624 /* equal: roundtrip, nothing to do (flags are initially 0) */
632 * the fromU mapping does not have a toU counterpart:
633 * fallback Unicode->codepage
635 if( (fromUMapping
->bLen
==subcharLength
&&
636 0==uprv_memcmp(UCM_GET_BYTES(fromUTable
, fromUMapping
), subchar
, subcharLength
)) ||
637 (subchar1
!=0 && fromUMapping
->bLen
==1 && fromUMapping
->b
.bytes
[0]==subchar1
)
639 fromUMapping
->f
=2; /* SUB mapping */
641 fromUMapping
->f
=1; /* normal fallback */
648 * the toU mapping does not have a fromU counterpart:
649 * (reverse) fallback codepage->Unicode, copy it to the fromU table
652 /* ignore reverse fallbacks to Unicode SUB */
653 if(!(toUMapping
->uLen
==1 && (toUMapping
->u
==0xfffd || toUMapping
->u
==0x1a))) {
654 toUMapping
->f
=3; /* reverse fallback */
655 ucm_addMapping(fromUTable
, toUMapping
, UCM_GET_CODE_POINTS(toUTable
, toUMapping
), UCM_GET_BYTES(toUTable
, toUMapping
));
657 /* the table may have been reallocated */
658 fromUMapping
=fromUTable
->mappings
+fromUIndex
;
666 /* either one or both tables are exhausted */
667 while(fromUIndex
<fromUTop
) {
668 /* leftover fromU mappings are fallbacks */
669 if( (fromUMapping
->bLen
==subcharLength
&&
670 0==uprv_memcmp(UCM_GET_BYTES(fromUTable
, fromUMapping
), subchar
, subcharLength
)) ||
671 (subchar1
!=0 && fromUMapping
->bLen
==1 && fromUMapping
->b
.bytes
[0]==subchar1
)
673 fromUMapping
->f
=2; /* SUB mapping */
675 fromUMapping
->f
=1; /* normal fallback */
682 while(toUIndex
<toUTop
) {
683 /* leftover toU mappings are reverse fallbacks */
685 /* ignore reverse fallbacks to Unicode SUB */
686 if(!(toUMapping
->uLen
==1 && (toUMapping
->u
==0xfffd || toUMapping
->u
==0x1a))) {
687 toUMapping
->f
=3; /* reverse fallback */
688 ucm_addMapping(fromUTable
, toUMapping
, UCM_GET_CODE_POINTS(toUTable
, toUMapping
), UCM_GET_BYTES(toUTable
, toUMapping
));
695 fromUTable
->isSorted
=FALSE
;
698 /* separate extension mappings out of base table for rptp2ucm --------------- */
/*
 * Go through a table of ucm and flag mappings that must not stay in it.
 * - For an SI/SO table (isSISO), a single-byte mapping for byte 0x0e or
 *   0x0f draws a warning, is printed, and is flagged REMOVE_MAPPING.
 * - Other mappings are classified with ucm_mappingType(); a mapping with
 *   an illegal byte sequence is printed to stderr, and a mapping can be
 *   flagged MOVE_TO_EXT.
 * Afterwards two continuations are visible: moving the flagged mappings
 * from ucm->base to ucm->ext and returning the result of
 * ucm_checkBaseExt(), or sorting ucm->base.
 * NOTE(review): the initialization of table and m, the tests on type,
 * needsMove and isOK, and the remaining return statements are not visible
 * in this listing - presumably table is ucm->base and the move path is
 * taken only when some mapping was flagged; confirm.
 */
700 U_CAPI UBool U_EXPORT2
701 ucm_separateMappings(UCMFile
*ucm
, UBool isSISO
) {
703 UCMapping
*m
, *mLimit
;
705 UBool needsMove
, isOK
;
709 mLimit
=m
+table
->mappingsLength
;
714 for(; m
<mLimit
; ++m
) {
/* 0x0e and 0x0f as single bytes are not allowed as mappings in an SI/SO table */
715 if(isSISO
&& m
->bLen
==1 && (m
->b
.bytes
[0]==0xe || m
->b
.bytes
[0]==0xf)) {
716 fprintf(stderr
, "warning: removing illegal mapping from an SI/SO-stateful table\n");
717 ucm_printMapping(table
, m
, stderr
);
718 m
->moveFlag
|=REMOVE_MAPPING
;
723 type
=ucm_mappingType(
725 UCM_GET_CODE_POINTS(table
, m
), UCM_GET_BYTES(table
, m
));
727 /* illegal byte sequence */
728 printMapping(m
, UCM_GET_CODE_POINTS(table
, m
), UCM_GET_BYTES(table
, m
), stderr
);
731 m
->moveFlag
|=MOVE_TO_EXT
;
740 moveMappings(ucm
->base
, ucm
->ext
);
741 return ucm_checkBaseExt(&ucm
->states
, ucm
->base
, ucm
->ext
, ucm
->ext
, FALSE
);
743 ucm_sortTable(ucm
->base
);
748 /* ucm parser --------------------------------------------------------------- */
/*
 * Parse a sequence of hex byte escapes from a .ucm line into bytes[].
 * Visible rules:
 * - a '+' in front of any byte after the first one is optional
 * - the hex value is read with uprv_strtoul() starting two characters into
 *   the escape and must end exactly four characters into it, i.e. exactly
 *   two hex digits; otherwise an error naming the line is printed
 * - at most UCNV_EXT_MAX_BYTES bytes are accepted; more is an error
 * line is used for error messages. The int8_t result is stored as the byte
 * length by ucm_parseMappingLine().
 * NOTE(review): the loop, the use of ps (presumably the parse position,
 * read at the start and updated at the end) and the return values are not
 * visible in this listing.
 */
750 U_CAPI
int8_t U_EXPORT2
751 ucm_parseBytes(uint8_t bytes
[UCNV_EXT_MAX_BYTES
], const char *line
, const char **ps
) {
759 /* skip an optional plus sign */
760 if(bLen
>0 && *s
=='+') {
/* comma expression: store the parsed value in byte, then compare the end pointer */
768 (byte
=(uint8_t)uprv_strtoul(s
+2, &end
, 16), end
)!=s
+4
770 fprintf(stderr
, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line
);
774 if(bLen
==UCNV_EXT_MAX_BYTES
) {
775 fprintf(stderr
, "ucm error: too many bytes on \"%s\"\n", line
);
786 /* parse a mapping line; must not be empty */
/*
 * Parse one mapping line of a .ucm file into m, codePoints[] and bytes[].
 * Visible steps, each with an error message that names the line:
 * - code points: a '+' before any code point after the first is optional;
 *   the hex value is read with uprv_strtoul() and must contain at least
 *   one digit; values above 0x10ffff and surrogates are rejected; at most
 *   UCNV_EXT_MAX_UCHARS code points are accepted and at least one is
 *   required
 * - the UTF-16 length of the code points is measured with a preflighting
 *   u_strFromUTF32() call (U_BUFFER_OVERFLOW_ERROR is tolerated) and must
 *   not exceed UCNV_EXT_MAX_UCHARS
 * - bytes: parsed by ucm_parseBytes() after skipping white space; at least
 *   one byte is required; the bytes are also copied into m->b.bytes
 * - fallback indicator: a single digit, required to be |0..|3; f is -1
 *   when there is none
 * NOTE(review): the end of the parameter list is cut off in this listing -
 * the body uses line and the caller in this file passes the line as fourth
 * argument. The conditions around several statements and the final stores
 * into m are not visible either.
 */
787 U_CAPI UBool U_EXPORT2
788 ucm_parseMappingLine(UCMapping
*m
,
789 UChar32 codePoints
[UCNV_EXT_MAX_UCHARS
],
790 uint8_t bytes
[UCNV_EXT_MAX_BYTES
],
796 int8_t uLen
, bLen
, f
;
801 /* parse code points */
803 /* skip an optional plus sign */
804 if(uLen
>0 && *s
=='+') {
/* end==s+2 means that no hex digit was consumed */
812 (cp
=(UChar32
)uprv_strtoul(s
+2, &end
, 16), end
)==s
+2 ||
815 fprintf(stderr
, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line
);
818 if((uint32_t)cp
>0x10ffff || U_IS_SURROGATE(cp
)) {
819 fprintf(stderr
, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line
);
823 if(uLen
==UCNV_EXT_MAX_UCHARS
) {
824 fprintf(stderr
, "ucm error: too many code points on \"%s\"\n", line
);
827 codePoints
[uLen
++]=cp
;
832 fprintf(stderr
, "ucm error: no Unicode code points on \"%s\"\n", line
);
837 UErrorCode errorCode
=U_ZERO_ERROR
;
/* NULL destination with capacity 0: only the required length is computed */
838 u_strFromUTF32(NULL
, 0, &u16Length
, codePoints
, uLen
, &errorCode
);
839 if( (U_FAILURE(errorCode
) && errorCode
!=U_BUFFER_OVERFLOW_ERROR
) ||
840 u16Length
>UCNV_EXT_MAX_UCHARS
842 fprintf(stderr
, "ucm error: too many UChars on \"%s\"\n", line
);
847 s
=u_skipWhitespace(s
);
850 bLen
=ucm_parseBytes(bytes
, line
, &s
);
855 fprintf(stderr
, "ucm error: no bytes on \"%s\"\n", line
);
858 uprv_memcpy(m
->b
.bytes
, bytes
, bLen
);
861 /* skip everything until the fallback indicator, even the start of a comment */
864 f
=-1; /* no fallback indicator */
/* the character after the indicator sign is taken as a decimal digit */
867 f
=(int8_t)(s
[1]-'0');
869 fprintf(stderr
, "ucm error: fallback indicator must be |0..|3 - \"%s\"\n", line
);
883 /* general APIs ------------------------------------------------------------- */
/*
 * Allocate a UCMTable and clear it to all zero bytes.
 * If the allocation fails, an error is printed and the process exits with
 * U_MEMORY_ALLOCATION_ERROR.
 * NOTE(review): the line with the function name is missing from this
 * listing - presumably this is ucm_openTable(), which is called elsewhere
 * in this file to create the base and extension tables; the NULL test and
 * the return of table are not visible either.
 */
885 U_CAPI UCMTable
* U_EXPORT2
887 UCMTable
*table
=(UCMTable
*)uprv_malloc(sizeof(UCMTable
));
889 fprintf(stderr
, "ucm error: unable to allocate a UCMTable\n");
890 exit(U_MEMORY_ALLOCATION_ERROR
);
893 memset(table
, 0, sizeof(UCMTable
));
/*
 * Release the arrays owned by table: mappings, codePoints, bytes and
 * reverseMap.
 * NOTE(review): a NULL check on table and the release of the table
 * structure itself are not visible in this listing - presumably both exist
 * in the complete file; confirm.
 */
897 U_CAPI
void U_EXPORT2
898 ucm_closeTable(UCMTable
*table
) {
900 uprv_free(table
->mappings
);
901 uprv_free(table
->codePoints
);
902 uprv_free(table
->bytes
);
903 uprv_free(table
->reverseMap
);
/*
 * Empty table without freeing anything: the mapping count, the Unicode
 * mask and the used lengths of the byte and code point storage are set to
 * zero and isSorted is cleared. The capacities and the allocated arrays are
 * not touched by the visible statements.
 * NOTE(review): at least one statement of this function is missing from
 * this listing, as is any NULL check on table.
 */
908 U_CAPI
void U_EXPORT2
909 ucm_resetTable(UCMTable
*table
) {
911 table
->mappingsLength
=0;
913 table
->unicodeMask
=0;
914 table
->bytesLength
=table
->codePointsLength
=0;
915 table
->isSorted
=FALSE
;
/*
 * Append mapping m, with its code points and bytes, to table.
 * - When the mappings array is full it is enlarged with uprv_realloc():
 *   the capacity becomes 1000 when it was 0, and a multiplication by 10 is
 *   also visible. An existing reverseMap is freed and set to NULL so that
 *   the next sort allocates a new one.
 * - Storage for 10000 UChar32 is allocated the first time a mapping with
 *   more than one code point arrives, and storage for 10000 bytes the
 *   first time a mapping with more than 4 bytes arrives. Neither store is
 *   enlarged here: running past its capacity is a fatal error.
 * - table->unicodeMask collects UCNV_HAS_SUPPLEMENTARY and
 *   UCNV_HAS_SURROGATES, and table->flagsType collects UCM_FLAGS_IMPLICIT
 *   or UCM_FLAGS_EXPLICIT.
 * - *m is copied to the end of the mappings array and isSorted is cleared.
 * Every allocation failure prints a message and exits the process.
 * NOTE(review): the parameter line declaring m, the conditions around the
 * two copy sections, the stores of index into the mapping and the tests
 * for the mask and flag updates are not visible in this listing.
 */
919 U_CAPI
void U_EXPORT2
920 ucm_addMapping(UCMTable
*table
,
922 UChar32 codePoints
[UCNV_EXT_MAX_UCHARS
],
923 uint8_t bytes
[UCNV_EXT_MAX_BYTES
]) {
928 if(table
->mappingsLength
>=table
->mappingsCapacity
) {
929 /* make the mappings array larger */
930 if(table
->mappingsCapacity
==0) {
931 table
->mappingsCapacity
=1000;
933 table
->mappingsCapacity
*=10;
935 table
->mappings
=(UCMapping
*)uprv_realloc(table
->mappings
,
936 table
->mappingsCapacity
*sizeof(UCMapping
));
937 if(table
->mappings
==NULL
) {
938 fprintf(stderr
, "ucm error: unable to allocate %d UCMappings\n",
939 (int)table
->mappingsCapacity
);
940 exit(U_MEMORY_ALLOCATION_ERROR
);
943 if(table
->reverseMap
!=NULL
) {
944 /* the reverseMap must be reallocated in a new sort */
945 uprv_free(table
->reverseMap
);
946 table
->reverseMap
=NULL
;
/* lazily create the shared store for multi-code point sequences */
950 if(m
->uLen
>1 && table
->codePointsCapacity
==0) {
951 table
->codePointsCapacity
=10000;
952 table
->codePoints
=(UChar32
*)uprv_malloc(table
->codePointsCapacity
*4);
953 if(table
->codePoints
==NULL
) {
954 fprintf(stderr
, "ucm error: unable to allocate %d UChar32s\n",
955 (int)table
->codePointsCapacity
);
956 exit(U_MEMORY_ALLOCATION_ERROR
);
/* lazily create the shared store for byte sequences longer than 4 bytes */
960 if(m
->bLen
>4 && table
->bytesCapacity
==0) {
961 table
->bytesCapacity
=10000;
962 table
->bytes
=(uint8_t *)uprv_malloc(table
->bytesCapacity
);
963 if(table
->bytes
==NULL
) {
964 fprintf(stderr
, "ucm error: unable to allocate %d bytes\n",
965 (int)table
->bytesCapacity
);
966 exit(U_MEMORY_ALLOCATION_ERROR
);
/* reserve uLen entries at the end of the code point store and copy into them */
971 index
=table
->codePointsLength
;
972 table
->codePointsLength
+=m
->uLen
;
973 if(table
->codePointsLength
>table
->codePointsCapacity
) {
974 fprintf(stderr
, "ucm error: too many code points in multiple-code point mappings\n");
975 exit(U_MEMORY_ALLOCATION_ERROR
);
978 uprv_memcpy(table
->codePoints
+index
, codePoints
, m
->uLen
*4);
/* reserve bLen bytes at the end of the byte store and copy into them */
983 index
=table
->bytesLength
;
984 table
->bytesLength
+=m
->bLen
;
985 if(table
->bytesLength
>table
->bytesCapacity
) {
986 fprintf(stderr
, "ucm error: too many bytes in mappings with >4 charset bytes\n");
987 exit(U_MEMORY_ALLOCATION_ERROR
);
990 uprv_memcpy(table
->bytes
+index
, bytes
, m
->bLen
);
994 /* set unicodeMask */
995 for(index
=0; index
<m
->uLen
; ++index
) {
998 table
->unicodeMask
|=UCNV_HAS_SUPPLEMENTARY
; /* there are supplementary code points */
999 } else if(U_IS_SURROGATE(c
)) {
1000 table
->unicodeMask
|=UCNV_HAS_SURROGATES
; /* there are surrogate code points */
1006 table
->flagsType
|=UCM_FLAGS_IMPLICIT
;
1008 table
->flagsType
|=UCM_FLAGS_EXPLICIT
;
/* append: tm is the first unused slot, and the length grows by one */
1011 tm
=table
->mappings
+table
->mappingsLength
++;
1012 uprv_memcpy(tm
, m
, sizeof(UCMapping
));
1014 table
->isSorted
=FALSE
;
/*
 * Allocate a UCMFile, clear it to all zero bytes, create its base and
 * extension tables with ucm_openTable() and set initial state-table
 * values: state 0 is flagged MBCS_STATE_FLAG_DIRECT, the conversion type
 * is UCNV_UNSUPPORTED_CONVERTER, the output type is -1, and the minimum
 * and maximum character lengths are both 1.
 * If the allocation fails, an error is printed and the process exits with
 * U_MEMORY_ALLOCATION_ERROR.
 * NOTE(review): the line with the function name is missing from this
 * listing - presumably this is ucm_open(); the NULL test and the return of
 * ucm are not visible either.
 */
1017 U_CAPI UCMFile
* U_EXPORT2
1019 UCMFile
*ucm
=(UCMFile
*)uprv_malloc(sizeof(UCMFile
));
1021 fprintf(stderr
, "ucm error: unable to allocate a UCMFile\n");
1022 exit(U_MEMORY_ALLOCATION_ERROR
);
1025 memset(ucm
, 0, sizeof(UCMFile
));
1027 ucm
->base
=ucm_openTable();
1028 ucm
->ext
=ucm_openTable();
1030 ucm
->states
.stateFlags
[0]=MBCS_STATE_FLAG_DIRECT
;
1031 ucm
->states
.conversionType
=UCNV_UNSUPPORTED_CONVERTER
;
1032 ucm
->states
.outputType
=-1;
1033 ucm
->states
.minCharLength
=ucm
->states
.maxCharLength
=1;
1038 U_CAPI
void U_EXPORT2
1039 ucm_close(UCMFile
*ucm
) {
1041 uprv_free(ucm
->base
);
1042 uprv_free(ucm
->ext
);
/*
 * Classify a mapping for placement in a base or an extension table.
 * The bytes are first checked and counted with ucm_countChars() against
 * baseStates. Visible results:
 * - 0: suitable for a base table. This requires exactly one code point and
 *   exactly one counted character, and none of the following:
 *   a |2 mapping with a single byte while baseStates->maxCharLength is
 *   greater than 1, a |1 mapping to the single byte 0x00, or a multi-byte
 *   sequence that starts with a 0x00 byte
 * - 1: needs to go into an extension table
 * NOTE(review): the parameter line declaring m and the handling of an
 * illegal byte sequence are not visible in this listing - presumably a
 * negative value is returned in that case; confirm.
 */
1047 U_CAPI
int32_t U_EXPORT2
1048 ucm_mappingType(UCMStates
*baseStates
,
1050 UChar32 codePoints
[UCNV_EXT_MAX_UCHARS
],
1051 uint8_t bytes
[UCNV_EXT_MAX_BYTES
]) {
1052 /* check validity of the bytes and count the characters in them */
1053 int32_t count
=ucm_countChars(baseStates
, bytes
, m
->bLen
);
1055 /* illegal byte sequence */
1060 * Suitable for an ICU conversion base table means:
1062 * - not a |2 SUB mappings for <subchar1>
1063 * - not a |1 fallback to 0x00
1064 * - no leading 0x00 bytes
1066 if( m
->uLen
==1 && count
==1 &&
1067 !((m
->f
==2 && m
->bLen
==1 && baseStates
->maxCharLength
>1) ||
1068 (m
->f
==1 && m
->bLen
==1 && bytes
[0]==0) ||
1069 (m
->bLen
>1 && bytes
[0]==0))
1071 return 0; /* suitable for a base table */
1073 return 1; /* needs to go into an extension table */
/*
 * Add a parsed mapping to either the base or the extension table of ucm.
 * - A |2 mapping from more than one code point is rejected with an error
 *   message and the mapping is printed.
 * - When baseStates is not NULL the mapping is classified with
 *   ucm_mappingType(); a mapping with an illegal byte sequence is printed.
 * - The mapping goes into ucm->base when forBase is set and the type is 0,
 *   otherwise into ucm->ext.
 * NOTE(review): the parameter line declaring m, the value of type when
 * baseStates is NULL and all return statements are not visible in this
 * listing.
 */
1077 U_CAPI UBool U_EXPORT2
1078 ucm_addMappingAuto(UCMFile
*ucm
, UBool forBase
, UCMStates
*baseStates
,
1080 UChar32 codePoints
[UCNV_EXT_MAX_UCHARS
],
1081 uint8_t bytes
[UCNV_EXT_MAX_BYTES
]) {
1084 if(m
->f
==2 && m
->uLen
>1) {
1085 fprintf(stderr
, "ucm error: illegal <subchar1> |2 mapping from multiple code points\n");
1086 printMapping(m
, codePoints
, bytes
, stderr
);
1090 if(baseStates
!=NULL
) {
1091 /* check validity of the bytes and count the characters in them */
1092 type
=ucm_mappingType(baseStates
, m
, codePoints
, bytes
);
1094 /* illegal byte sequence */
1095 printMapping(m
, codePoints
, bytes
, stderr
);
1099 /* not used - adding a mapping for an extension-only table before its base table is read */
1104 * Add the mapping to the base table if this is requested and suitable.
1105 * Otherwise, add it to the extension table.
1107 if(forBase
&& type
==0) {
1108 ucm_addMapping(ucm
->base
, m
, codePoints
, bytes
);
1110 ucm_addMapping(ucm
->ext
, m
, codePoints
, bytes
);
/*
 * Parse one text line and add the resulting mapping to ucm.
 * A line is ignored when its first character is '#', or when the first
 * character after leading white space is the end of the string, a line
 * feed or a carriage return. Any other line is parsed with
 * ucm_parseMappingLine() into local buffers and, only if that succeeds,
 * added with ucm_addMappingAuto().
 * NOTE(review): the declarations of m and s and the return statements are
 * not visible in this listing - presumably an ignored line counts as
 * success; confirm.
 */
1116 U_CAPI UBool U_EXPORT2
1117 ucm_addMappingFromLine(UCMFile
*ucm
, const char *line
, UBool forBase
, UCMStates
*baseStates
) {
1119 UChar32 codePoints
[UCNV_EXT_MAX_UCHARS
];
1120 uint8_t bytes
[UCNV_EXT_MAX_BYTES
];
1124 /* ignore empty and comment lines */
1125 if(line
[0]=='#' || *(s
=u_skipWhitespace(line
))==0 || *s
=='\n' || *s
=='\r') {
/* the && makes the add step depend on a successful parse */
1130 ucm_parseMappingLine(&m
, codePoints
, bytes
, line
) &&
1131 ucm_addMappingAuto(ucm
, forBase
, baseStates
, &m
, codePoints
, bytes
);
1134 U_CAPI
void U_EXPORT2
1135 ucm_readTable(UCMFile
*ucm
, FileStream
* convFile
,
1136 UBool forBase
, UCMStates
*baseStates
,
1137 UErrorCode
*pErrorCode
) {
1142 if(U_FAILURE(*pErrorCode
)) {
1149 /* read the next line */
1150 if(!T_FileStream_readLine(convFile
, line
, sizeof(line
))) {
1151 fprintf(stderr
, "incomplete charmap section\n");
1157 end
=uprv_strchr(line
, 0);
1158 while(line
<end
&& (*(end
-1)=='\r' || *(end
-1)=='\n')) {
1163 /* ignore empty and comment lines */
1164 if(line
[0]==0 || line
[0]=='#') {
1168 /* stop at the end of the mapping table */
1169 if(0==uprv_strcmp(line
, "END CHARMAP")) {
1173 isOK
&=ucm_addMappingFromLine(ucm
, line
, forBase
, baseStates
);
1177 *pErrorCode
=U_INVALID_TABLE_FORMAT
;