2 *******************************************************************************
4 * Copyright (C) 2003-2013, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 2003jun20
14 * created by: Markus W. Scherer
16 * This file reads a .ucm file, stores its mappings and sorts them.
17 * It implements handling of Unicode conversion mappings from .ucm files
18 * for makeconv, canonucm, rptp2ucm, etc.
20 * Unicode code point sequences with a length of more than 1,
21 * as well as byte sequences with more than 4 bytes or more than one complete
22 * character sequence are handled to support m:n mappings.
25 #include "unicode/utypes.h"
26 #include "unicode/ustring.h"
38 #if !UCONFIG_NO_CONVERSION
40 /* -------------------------------------------------------------------------- */
43 printMapping(UCMapping
*m
, UChar32
*codePoints
, uint8_t *bytes
, FILE *f
) {
46 for(j
=0; j
<m
->uLen
; ++j
) {
47 fprintf(f
, "<U%04lX>", (long)codePoints
[j
]);
52 for(j
=0; j
<m
->bLen
; ++j
) {
53 fprintf(f
, "\\x%02X", bytes
[j
]);
57 fprintf(f
, " |%u\n", m
->f
);
64 ucm_printMapping(UCMTable
*table
, UCMapping
*m
, FILE *f
) {
65 printMapping(m
, UCM_GET_CODE_POINTS(table
, m
), UCM_GET_BYTES(table
, m
), f
);
69 ucm_printTable(UCMTable
*table
, FILE *f
, UBool byUnicode
) {
74 length
=table
->mappingsLength
;
76 for(i
=0; i
<length
; ++m
, ++i
) {
77 ucm_printMapping(table
, m
, f
);
80 const int32_t *map
=table
->reverseMap
;
81 for(i
=0; i
<length
; ++i
) {
82 ucm_printMapping(table
, m
+map
[i
], f
);
87 /* mapping comparisons ------------------------------------------------------ */
90 compareUnicode(UCMTable
*lTable
, const UCMapping
*l
,
91 UCMTable
*rTable
, const UCMapping
*r
) {
92 const UChar32
*lu
, *ru
;
93 int32_t result
, i
, length
;
95 if(l
->uLen
==1 && r
->uLen
==1) {
96 /* compare two single code points */
100 /* get pointers to the code point sequences */
101 lu
=UCM_GET_CODE_POINTS(lTable
, l
);
102 ru
=UCM_GET_CODE_POINTS(rTable
, r
);
104 /* get the minimum length */
105 if(l
->uLen
<=r
->uLen
) {
111 /* compare the code points */
112 for(i
=0; i
<length
; ++i
) {
119 /* compare the lengths */
120 return l
->uLen
-r
->uLen
;
124 compareBytes(UCMTable
*lTable
, const UCMapping
*l
,
125 UCMTable
*rTable
, const UCMapping
*r
,
127 const uint8_t *lb
, *rb
;
128 int32_t result
, i
, length
;
131 * A lexical comparison is used for sorting in the builder, to allow
132 * an efficient search for a byte sequence that could be a prefix
133 * of a previously entered byte sequence.
135 * Comparing by lengths first is for compatibility with old .ucm tools
136 * like canonucm and rptp2ucm.
139 /* get the minimum length and continue */
140 if(l
->bLen
<=r
->bLen
) {
146 /* compare lengths first */
147 result
=l
->bLen
-r
->bLen
;
155 /* get pointers to the byte sequences */
156 lb
=UCM_GET_BYTES(lTable
, l
);
157 rb
=UCM_GET_BYTES(rTable
, r
);
159 /* compare the bytes */
160 for(i
=0; i
<length
; ++i
) {
167 /* compare the lengths */
168 return l
->bLen
-r
->bLen
;
171 /* compare UCMappings for sorting */
173 compareMappings(UCMTable
*lTable
, const UCMapping
*l
,
174 UCMTable
*rTable
, const UCMapping
*r
,
178 /* choose which side to compare first */
180 /* Unicode then bytes */
181 result
=compareUnicode(lTable
, l
, rTable
, r
);
183 result
=compareBytes(lTable
, l
, rTable
, r
, FALSE
); /* not lexically, like canonucm */
186 /* bytes then Unicode */
187 result
=compareBytes(lTable
, l
, rTable
, r
, TRUE
); /* lexically, for builder */
189 result
=compareUnicode(lTable
, l
, rTable
, r
);
197 /* compare the flags */
201 /* sorting by Unicode first sorts mappings directly */
203 compareMappingsUnicodeFirst(const void *context
, const void *left
, const void *right
) {
204 return compareMappings(
205 (UCMTable
*)context
, (const UCMapping
*)left
,
206 (UCMTable
*)context
, (const UCMapping
*)right
, TRUE
);
209 /* sorting by bytes first sorts the reverseMap; use indirection to mappings */
211 compareMappingsBytesFirst(const void *context
, const void *left
, const void *right
) {
212 UCMTable
*table
=(UCMTable
*)context
;
213 int32_t l
=*(const int32_t *)left
, r
=*(const int32_t *)right
;
214 return compareMappings(
215 table
, table
->mappings
+l
,
216 table
, table
->mappings
+r
, FALSE
);
219 U_CAPI
void U_EXPORT2
220 ucm_sortTable(UCMTable
*t
) {
221 UErrorCode errorCode
;
228 errorCode
=U_ZERO_ERROR
;
230 /* 1. sort by Unicode first */
231 uprv_sortArray(t
->mappings
, t
->mappingsLength
, sizeof(UCMapping
),
232 compareMappingsUnicodeFirst
, t
,
235 /* build the reverseMap */
236 if(t
->reverseMap
==NULL
) {
238 * allocate mappingsCapacity instead of mappingsLength so that
239 * if mappings are added, the reverseMap need not be
240 * reallocated each time
241 * (see ucm_moveMappings() and ucm_addMapping())
243 t
->reverseMap
=(int32_t *)uprv_malloc(t
->mappingsCapacity
*sizeof(int32_t));
244 if(t
->reverseMap
==NULL
) {
245 fprintf(stderr
, "ucm error: unable to allocate reverseMap\n");
246 exit(U_MEMORY_ALLOCATION_ERROR
);
249 for(i
=0; i
<t
->mappingsLength
; ++i
) {
253 /* 2. sort reverseMap by mappings bytes first */
254 uprv_sortArray(t
->reverseMap
, t
->mappingsLength
, sizeof(int32_t),
255 compareMappingsBytesFirst
, t
,
258 if(U_FAILURE(errorCode
)) {
259 fprintf(stderr
, "ucm error: sortTable()/uprv_sortArray() fails - %s\n",
260 u_errorName(errorCode
));
268 * remove mappings with their move flag set from the base table
269 * and move some of them (with UCM_MOVE_TO_EXT) to the extension table
271 U_CAPI
void U_EXPORT2
272 ucm_moveMappings(UCMTable
*base
, UCMTable
*ext
) {
273 UCMapping
*mb
, *mbLimit
;
277 mbLimit
=mb
+base
->mappingsLength
;
282 /* reset the move flag */
285 if(ext
!=NULL
&& (flag
&UCM_MOVE_TO_EXT
)) {
286 /* add the mapping to the extension table */
287 ucm_addMapping(ext
, mb
, UCM_GET_CODE_POINTS(base
, mb
), UCM_GET_BYTES(base
, mb
));
290 /* remove this mapping: move the last base mapping down and overwrite the current one */
292 uprv_memcpy(mb
, mbLimit
-1, sizeof(UCMapping
));
295 --base
->mappingsLength
;
296 base
->isSorted
=FALSE
;
309 checkBaseExtUnicode(UCMStates
*baseStates
, UCMTable
*base
, UCMTable
*ext
,
310 UBool moveToExt
, UBool intersectBase
) {
311 UCMapping
*mb
, *me
, *mbLimit
, *meLimit
;
316 mbLimit
=mb
+base
->mappingsLength
;
319 meLimit
=me
+ext
->mappingsLength
;
324 /* skip irrelevant mappings on both sides */
330 if((0<=mb
->f
&& mb
->f
<=2) || mb
->f
==4) {
342 if((0<=me
->f
&& me
->f
<=2) || me
->f
==4) {
349 /* compare the base and extension mappings */
350 cmp
=compareUnicode(base
, mb
, ext
, me
);
352 if(intersectBase
&& (intersectBase
!=2 || mb
->bLen
>1)) {
354 * mapping in base but not in ext, move it
356 * if ext is DBCS, move DBCS mappings here
357 * and check SBCS ones for Unicode prefix below
359 mb
->moveFlag
|=UCM_MOVE_TO_EXT
;
362 /* does mb map from an input sequence that is a prefix of me's? */
363 } else if( mb
->uLen
<me
->uLen
&&
364 0==uprv_memcmp(UCM_GET_CODE_POINTS(base
, mb
), UCM_GET_CODE_POINTS(ext
, me
), 4*mb
->uLen
)
367 /* mark this mapping to be moved to the extension table */
368 mb
->moveFlag
|=UCM_MOVE_TO_EXT
;
372 "ucm error: the base table contains a mapping whose input sequence\n"
373 " is a prefix of the input sequence of an extension mapping\n");
374 ucm_printMapping(base
, mb
, stderr
);
375 ucm_printMapping(ext
, me
, stderr
);
383 * same output: remove the extension mapping,
384 * otherwise treat as an error
386 if( mb
->f
==me
->f
&& mb
->bLen
==me
->bLen
&&
387 0==uprv_memcmp(UCM_GET_BYTES(base
, mb
), UCM_GET_BYTES(ext
, me
), mb
->bLen
)
389 me
->moveFlag
|=UCM_REMOVE_MAPPING
;
391 } else if(intersectBase
) {
392 /* mapping in base but not in ext, move it */
393 mb
->moveFlag
|=UCM_MOVE_TO_EXT
;
397 "ucm error: the base table contains a mapping whose input sequence\n"
398 " is the same as the input sequence of an extension mapping\n"
399 " but it maps differently\n");
400 ucm_printMapping(base
, mb
, stderr
);
401 ucm_printMapping(ext
, me
, stderr
);
413 checkBaseExtBytes(UCMStates
*baseStates
, UCMTable
*base
, UCMTable
*ext
,
414 UBool moveToExt
, UBool intersectBase
) {
416 int32_t *baseMap
, *extMap
;
417 int32_t b
, e
, bLimit
, eLimit
, cmp
;
421 baseMap
=base
->reverseMap
;
422 extMap
=ext
->reverseMap
;
425 bLimit
=base
->mappingsLength
;
426 eLimit
=ext
->mappingsLength
;
430 isSISO
=(UBool
)(baseStates
->outputType
==MBCS_OUTPUT_2_SISO
);
433 /* skip irrelevant mappings on both sides */
438 mb
=base
->mappings
+baseMap
[b
];
440 if(intersectBase
==2 && mb
->bLen
==1) {
442 * comparing a base against a DBCS extension:
443 * leave SBCS base mappings alone
448 if(mb
->f
==0 || mb
->f
==3) {
457 me
=ext
->mappings
+extMap
[e
];
459 if(me
->f
==0 || me
->f
==3) {
466 /* compare the base and extension mappings */
467 cmp
=compareBytes(base
, mb
, ext
, me
, TRUE
);
470 /* mapping in base but not in ext, move it */
471 mb
->moveFlag
|=UCM_MOVE_TO_EXT
;
475 * does mb map from an input sequence that is a prefix of me's?
476 * for SI/SO tables, a single byte is never a prefix because it
477 * occurs in a separate single-byte state
479 } else if( mb
->bLen
<me
->bLen
&&
480 (!isSISO
|| mb
->bLen
>1) &&
481 0==uprv_memcmp(UCM_GET_BYTES(base
, mb
), UCM_GET_BYTES(ext
, me
), mb
->bLen
)
484 /* mark this mapping to be moved to the extension table */
485 mb
->moveFlag
|=UCM_MOVE_TO_EXT
;
489 "ucm error: the base table contains a mapping whose input sequence\n"
490 " is a prefix of the input sequence of an extension mapping\n");
491 ucm_printMapping(base
, mb
, stderr
);
492 ucm_printMapping(ext
, me
, stderr
);
500 * same output: remove the extension mapping,
501 * otherwise treat as an error
503 if( mb
->f
==me
->f
&& mb
->uLen
==me
->uLen
&&
504 0==uprv_memcmp(UCM_GET_CODE_POINTS(base
, mb
), UCM_GET_CODE_POINTS(ext
, me
), 4*mb
->uLen
)
506 me
->moveFlag
|=UCM_REMOVE_MAPPING
;
508 } else if(intersectBase
) {
509 /* mapping in base but not in ext, move it */
510 mb
->moveFlag
|=UCM_MOVE_TO_EXT
;
514 "ucm error: the base table contains a mapping whose input sequence\n"
515 " is the same as the input sequence of an extension mapping\n"
516 " but it maps differently\n");
517 ucm_printMapping(base
, mb
, stderr
);
518 ucm_printMapping(ext
, me
, stderr
);
529 U_CAPI UBool U_EXPORT2
530 ucm_checkValidity(UCMTable
*table
, UCMStates
*baseStates
) {
531 UCMapping
*m
, *mLimit
;
536 mLimit
=m
+table
->mappingsLength
;
540 count
=ucm_countChars(baseStates
, UCM_GET_BYTES(table
, m
), m
->bLen
);
542 ucm_printMapping(table
, m
, stderr
);
551 U_CAPI UBool U_EXPORT2
552 ucm_checkBaseExt(UCMStates
*baseStates
,
553 UCMTable
*base
, UCMTable
*ext
, UCMTable
*moveTarget
,
554 UBool intersectBase
) {
557 /* if we have an extension table, we must always use precision flags */
558 if(base
->flagsType
&UCM_FLAGS_IMPLICIT
) {
559 fprintf(stderr
, "ucm error: the base table contains mappings without precision flags\n");
562 if(ext
->flagsType
&UCM_FLAGS_IMPLICIT
) {
563 fprintf(stderr
, "ucm error: extension table contains mappings without precision flags\n");
567 /* checking requires both tables to be sorted */
573 checkBaseExtUnicode(baseStates
, base
, ext
, (UBool
)(moveTarget
!=NULL
), intersectBase
)|
574 checkBaseExtBytes(baseStates
, base
, ext
, (UBool
)(moveTarget
!=NULL
), intersectBase
);
576 if(result
&HAS_ERRORS
) {
580 if(result
&NEEDS_MOVE
) {
581 ucm_moveMappings(ext
, NULL
);
582 ucm_moveMappings(base
, moveTarget
);
585 if(moveTarget
!=NULL
) {
586 ucm_sortTable(moveTarget
);
593 /* merge tables for rptp2ucm ------------------------------------------------ */
595 U_CAPI
void U_EXPORT2
596 ucm_mergeTables(UCMTable
*fromUTable
, UCMTable
*toUTable
,
597 const uint8_t *subchar
, int32_t subcharLength
,
599 UCMapping
*fromUMapping
, *toUMapping
;
600 int32_t fromUIndex
, toUIndex
, fromUTop
, toUTop
, cmp
;
602 ucm_sortTable(fromUTable
);
603 ucm_sortTable(toUTable
);
605 fromUMapping
=fromUTable
->mappings
;
606 toUMapping
=toUTable
->mappings
;
608 fromUTop
=fromUTable
->mappingsLength
;
609 toUTop
=toUTable
->mappingsLength
;
611 fromUIndex
=toUIndex
=0;
613 while(fromUIndex
<fromUTop
&& toUIndex
<toUTop
) {
614 cmp
=compareMappings(fromUTable
, fromUMapping
, toUTable
, toUMapping
, TRUE
);
616 /* equal: roundtrip, nothing to do (flags are initially 0) */
624 * the fromU mapping does not have a toU counterpart:
625 * fallback Unicode->codepage
627 if( (fromUMapping
->bLen
==subcharLength
&&
628 0==uprv_memcmp(UCM_GET_BYTES(fromUTable
, fromUMapping
), subchar
, subcharLength
)) ||
629 (subchar1
!=0 && fromUMapping
->bLen
==1 && fromUMapping
->b
.bytes
[0]==subchar1
)
631 fromUMapping
->f
=2; /* SUB mapping */
633 fromUMapping
->f
=1; /* normal fallback */
640 * the toU mapping does not have a fromU counterpart:
641 * (reverse) fallback codepage->Unicode, copy it to the fromU table
644 /* ignore reverse fallbacks to Unicode SUB */
645 if(!(toUMapping
->uLen
==1 && (toUMapping
->u
==0xfffd || toUMapping
->u
==0x1a))) {
646 toUMapping
->f
=3; /* reverse fallback */
647 ucm_addMapping(fromUTable
, toUMapping
, UCM_GET_CODE_POINTS(toUTable
, toUMapping
), UCM_GET_BYTES(toUTable
, toUMapping
));
649 /* the table may have been reallocated */
650 fromUMapping
=fromUTable
->mappings
+fromUIndex
;
658 /* either one or both tables are exhausted */
659 while(fromUIndex
<fromUTop
) {
660 /* leftover fromU mappings are fallbacks */
661 if( (fromUMapping
->bLen
==subcharLength
&&
662 0==uprv_memcmp(UCM_GET_BYTES(fromUTable
, fromUMapping
), subchar
, subcharLength
)) ||
663 (subchar1
!=0 && fromUMapping
->bLen
==1 && fromUMapping
->b
.bytes
[0]==subchar1
)
665 fromUMapping
->f
=2; /* SUB mapping */
667 fromUMapping
->f
=1; /* normal fallback */
674 while(toUIndex
<toUTop
) {
675 /* leftover toU mappings are reverse fallbacks */
677 /* ignore reverse fallbacks to Unicode SUB */
678 if(!(toUMapping
->uLen
==1 && (toUMapping
->u
==0xfffd || toUMapping
->u
==0x1a))) {
679 toUMapping
->f
=3; /* reverse fallback */
680 ucm_addMapping(fromUTable
, toUMapping
, UCM_GET_CODE_POINTS(toUTable
, toUMapping
), UCM_GET_BYTES(toUTable
, toUMapping
));
687 fromUTable
->isSorted
=FALSE
;
690 /* separate extension mappings out of base table for rptp2ucm --------------- */
692 U_CAPI UBool U_EXPORT2
693 ucm_separateMappings(UCMFile
*ucm
, UBool isSISO
) {
695 UCMapping
*m
, *mLimit
;
697 UBool needsMove
, isOK
;
701 mLimit
=m
+table
->mappingsLength
;
706 for(; m
<mLimit
; ++m
) {
707 if(isSISO
&& m
->bLen
==1 && (m
->b
.bytes
[0]==0xe || m
->b
.bytes
[0]==0xf)) {
708 fprintf(stderr
, "warning: removing illegal mapping from an SI/SO-stateful table\n");
709 ucm_printMapping(table
, m
, stderr
);
710 m
->moveFlag
|=UCM_REMOVE_MAPPING
;
715 type
=ucm_mappingType(
717 UCM_GET_CODE_POINTS(table
, m
), UCM_GET_BYTES(table
, m
));
719 /* illegal byte sequence */
720 printMapping(m
, UCM_GET_CODE_POINTS(table
, m
), UCM_GET_BYTES(table
, m
), stderr
);
723 m
->moveFlag
|=UCM_MOVE_TO_EXT
;
732 ucm_moveMappings(ucm
->base
, ucm
->ext
);
733 return ucm_checkBaseExt(&ucm
->states
, ucm
->base
, ucm
->ext
, ucm
->ext
, FALSE
);
735 ucm_sortTable(ucm
->base
);
740 /* ucm parser --------------------------------------------------------------- */
742 U_CAPI
int8_t U_EXPORT2
743 ucm_parseBytes(uint8_t bytes
[UCNV_EXT_MAX_BYTES
], const char *line
, const char **ps
) {
751 /* skip an optional plus sign */
752 if(bLen
>0 && *s
=='+') {
760 (byte
=(uint8_t)uprv_strtoul(s
+2, &end
, 16), end
)!=s
+4
762 fprintf(stderr
, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line
);
766 if(bLen
==UCNV_EXT_MAX_BYTES
) {
767 fprintf(stderr
, "ucm error: too many bytes on \"%s\"\n", line
);
778 /* parse a mapping line; must not be empty */
779 U_CAPI UBool U_EXPORT2
780 ucm_parseMappingLine(UCMapping
*m
,
781 UChar32 codePoints
[UCNV_EXT_MAX_UCHARS
],
782 uint8_t bytes
[UCNV_EXT_MAX_BYTES
],
788 int8_t uLen
, bLen
, f
;
793 /* parse code points */
795 /* skip an optional plus sign */
796 if(uLen
>0 && *s
=='+') {
804 (cp
=(UChar32
)uprv_strtoul(s
+2, &end
, 16), end
)==s
+2 ||
807 fprintf(stderr
, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line
);
810 if((uint32_t)cp
>0x10ffff || U_IS_SURROGATE(cp
)) {
811 fprintf(stderr
, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line
);
815 if(uLen
==UCNV_EXT_MAX_UCHARS
) {
816 fprintf(stderr
, "ucm error: too many code points on \"%s\"\n", line
);
819 codePoints
[uLen
++]=cp
;
824 fprintf(stderr
, "ucm error: no Unicode code points on \"%s\"\n", line
);
829 UErrorCode errorCode
=U_ZERO_ERROR
;
830 u_strFromUTF32(NULL
, 0, &u16Length
, codePoints
, uLen
, &errorCode
);
831 if( (U_FAILURE(errorCode
) && errorCode
!=U_BUFFER_OVERFLOW_ERROR
) ||
832 u16Length
>UCNV_EXT_MAX_UCHARS
834 fprintf(stderr
, "ucm error: too many UChars on \"%s\"\n", line
);
839 s
=u_skipWhitespace(s
);
842 bLen
=ucm_parseBytes(bytes
, line
, &s
);
847 fprintf(stderr
, "ucm error: no bytes on \"%s\"\n", line
);
850 uprv_memcpy(m
->b
.bytes
, bytes
, bLen
);
853 /* skip everything until the fallback indicator, even the start of a comment */
856 f
=-1; /* no fallback indicator */
859 f
=(int8_t)(s
[1]-'0');
861 fprintf(stderr
, "ucm error: fallback indicator must be |0..|4 - \"%s\"\n", line
);
875 /* general APIs ------------------------------------------------------------- */
877 U_CAPI UCMTable
* U_EXPORT2
879 UCMTable
*table
=(UCMTable
*)uprv_malloc(sizeof(UCMTable
));
881 fprintf(stderr
, "ucm error: unable to allocate a UCMTable\n");
882 exit(U_MEMORY_ALLOCATION_ERROR
);
885 memset(table
, 0, sizeof(UCMTable
));
889 U_CAPI
void U_EXPORT2
890 ucm_closeTable(UCMTable
*table
) {
892 uprv_free(table
->mappings
);
893 uprv_free(table
->codePoints
);
894 uprv_free(table
->bytes
);
895 uprv_free(table
->reverseMap
);
900 U_CAPI
void U_EXPORT2
901 ucm_resetTable(UCMTable
*table
) {
903 table
->mappingsLength
=0;
905 table
->unicodeMask
=0;
906 table
->bytesLength
=table
->codePointsLength
=0;
907 table
->isSorted
=FALSE
;
911 U_CAPI
void U_EXPORT2
912 ucm_addMapping(UCMTable
*table
,
914 UChar32 codePoints
[UCNV_EXT_MAX_UCHARS
],
915 uint8_t bytes
[UCNV_EXT_MAX_BYTES
]) {
920 if(table
->mappingsLength
>=table
->mappingsCapacity
) {
921 /* make the mappings array larger */
922 if(table
->mappingsCapacity
==0) {
923 table
->mappingsCapacity
=1000;
925 table
->mappingsCapacity
*=10;
927 table
->mappings
=(UCMapping
*)uprv_realloc(table
->mappings
,
928 table
->mappingsCapacity
*sizeof(UCMapping
));
929 if(table
->mappings
==NULL
) {
930 fprintf(stderr
, "ucm error: unable to allocate %d UCMappings\n",
931 (int)table
->mappingsCapacity
);
932 exit(U_MEMORY_ALLOCATION_ERROR
);
935 if(table
->reverseMap
!=NULL
) {
936 /* the reverseMap must be reallocated in a new sort */
937 uprv_free(table
->reverseMap
);
938 table
->reverseMap
=NULL
;
942 if(m
->uLen
>1 && table
->codePointsCapacity
==0) {
943 table
->codePointsCapacity
=10000;
944 table
->codePoints
=(UChar32
*)uprv_malloc(table
->codePointsCapacity
*4);
945 if(table
->codePoints
==NULL
) {
946 fprintf(stderr
, "ucm error: unable to allocate %d UChar32s\n",
947 (int)table
->codePointsCapacity
);
948 exit(U_MEMORY_ALLOCATION_ERROR
);
952 if(m
->bLen
>4 && table
->bytesCapacity
==0) {
953 table
->bytesCapacity
=10000;
954 table
->bytes
=(uint8_t *)uprv_malloc(table
->bytesCapacity
);
955 if(table
->bytes
==NULL
) {
956 fprintf(stderr
, "ucm error: unable to allocate %d bytes\n",
957 (int)table
->bytesCapacity
);
958 exit(U_MEMORY_ALLOCATION_ERROR
);
963 idx
=table
->codePointsLength
;
964 table
->codePointsLength
+=m
->uLen
;
965 if(table
->codePointsLength
>table
->codePointsCapacity
) {
966 fprintf(stderr
, "ucm error: too many code points in multiple-code point mappings\n");
967 exit(U_MEMORY_ALLOCATION_ERROR
);
970 uprv_memcpy(table
->codePoints
+idx
, codePoints
, m
->uLen
*4);
975 idx
=table
->bytesLength
;
976 table
->bytesLength
+=m
->bLen
;
977 if(table
->bytesLength
>table
->bytesCapacity
) {
978 fprintf(stderr
, "ucm error: too many bytes in mappings with >4 charset bytes\n");
979 exit(U_MEMORY_ALLOCATION_ERROR
);
982 uprv_memcpy(table
->bytes
+idx
, bytes
, m
->bLen
);
986 /* set unicodeMask */
987 for(idx
=0; idx
<m
->uLen
; ++idx
) {
990 table
->unicodeMask
|=UCNV_HAS_SUPPLEMENTARY
; /* there are supplementary code points */
991 } else if(U_IS_SURROGATE(c
)) {
992 table
->unicodeMask
|=UCNV_HAS_SURROGATES
; /* there are surrogate code points */
998 table
->flagsType
|=UCM_FLAGS_IMPLICIT
;
1000 table
->flagsType
|=UCM_FLAGS_EXPLICIT
;
1003 tm
=table
->mappings
+table
->mappingsLength
++;
1004 uprv_memcpy(tm
, m
, sizeof(UCMapping
));
1006 table
->isSorted
=FALSE
;
1009 U_CAPI UCMFile
* U_EXPORT2
1011 UCMFile
*ucm
=(UCMFile
*)uprv_malloc(sizeof(UCMFile
));
1013 fprintf(stderr
, "ucm error: unable to allocate a UCMFile\n");
1014 exit(U_MEMORY_ALLOCATION_ERROR
);
1017 memset(ucm
, 0, sizeof(UCMFile
));
1019 ucm
->base
=ucm_openTable();
1020 ucm
->ext
=ucm_openTable();
1022 ucm
->states
.stateFlags
[0]=MBCS_STATE_FLAG_DIRECT
;
1023 ucm
->states
.conversionType
=UCNV_UNSUPPORTED_CONVERTER
;
1024 ucm
->states
.outputType
=-1;
1025 ucm
->states
.minCharLength
=ucm
->states
.maxCharLength
=1;
1030 U_CAPI
void U_EXPORT2
1031 ucm_close(UCMFile
*ucm
) {
1033 ucm_closeTable(ucm
->base
);
1034 ucm_closeTable(ucm
->ext
);
1039 U_CAPI
int32_t U_EXPORT2
1040 ucm_mappingType(UCMStates
*baseStates
,
1042 UChar32 codePoints
[UCNV_EXT_MAX_UCHARS
],
1043 uint8_t bytes
[UCNV_EXT_MAX_BYTES
]) {
1044 /* check validity of the bytes and count the characters in them */
1045 int32_t count
=ucm_countChars(baseStates
, bytes
, m
->bLen
);
1047 /* illegal byte sequence */
1052 * Suitable for an ICU conversion base table means:
1053 * - a 1:1 mapping (1 Unicode code point : 1 byte sequence)
1054 * - precision flag 0..3
1055 * - SBCS: any 1:1 mapping
1056 * (the table stores additional bits to distinguish mapping types)
1057 * - MBCS: not a |2 SUB mapping for <subchar1>
1058 * - MBCS: not a |1 fallback to 0x00
1059 * - MBCS: not a multi-byte mapping with leading 0x00 bytes
1061 * Further restrictions for fromUnicode tables
1062 * are enforced in makeconv (MBCSOkForBaseFromUnicode()).
1064 * All of the MBCS fromUnicode specific tests could be removed from here,
1065 * but the ones above are for unusual mappings, and removing the tests
1066 * from here would change canonucm output which seems gratuitous.
1067 * (Markus Scherer 2006-nov-28)
1069 * Exception: All implicit mappings (f<0) that need to be moved
1070 * because of fromUnicode restrictions _must_ be moved here because
1071 * makeconv uses a hack for moving mappings only for the fromUnicode table
1072 * that only works with non-negative values of f.
1074 if( m
->uLen
==1 && count
==1 && m
->f
<=3 &&
1075 (baseStates
->maxCharLength
==1 ||
1076 !((m
->f
==2 && m
->bLen
==1) ||
1077 (m
->f
==1 && bytes
[0]==0) ||
1078 (m
->f
<=1 && m
->bLen
>1 && bytes
[0]==0)))
1080 return 0; /* suitable for a base table */
1082 return 1; /* needs to go into an extension table */
1086 U_CAPI UBool U_EXPORT2
1087 ucm_addMappingAuto(UCMFile
*ucm
, UBool forBase
, UCMStates
*baseStates
,
1089 UChar32 codePoints
[UCNV_EXT_MAX_UCHARS
],
1090 uint8_t bytes
[UCNV_EXT_MAX_BYTES
]) {
1093 if(m
->f
==2 && m
->uLen
>1) {
1094 fprintf(stderr
, "ucm error: illegal <subchar1> |2 mapping from multiple code points\n");
1095 printMapping(m
, codePoints
, bytes
, stderr
);
1099 if(baseStates
!=NULL
) {
1100 /* check validity of the bytes and count the characters in them */
1101 type
=ucm_mappingType(baseStates
, m
, codePoints
, bytes
);
1103 /* illegal byte sequence */
1104 printMapping(m
, codePoints
, bytes
, stderr
);
1108 /* not used - adding a mapping for an extension-only table before its base table is read */
1113 * Add the mapping to the base table if this is requested and suitable.
1114 * Otherwise, add it to the extension table.
1116 if(forBase
&& type
==0) {
1117 ucm_addMapping(ucm
->base
, m
, codePoints
, bytes
);
1119 ucm_addMapping(ucm
->ext
, m
, codePoints
, bytes
);
1125 U_CAPI UBool U_EXPORT2
1126 ucm_addMappingFromLine(UCMFile
*ucm
, const char *line
, UBool forBase
, UCMStates
*baseStates
) {
1128 UChar32 codePoints
[UCNV_EXT_MAX_UCHARS
];
1129 uint8_t bytes
[UCNV_EXT_MAX_BYTES
];
1133 /* ignore empty and comment lines */
1134 if(line
[0]=='#' || *(s
=u_skipWhitespace(line
))==0 || *s
=='\n' || *s
=='\r') {
1139 ucm_parseMappingLine(&m
, codePoints
, bytes
, line
) &&
1140 ucm_addMappingAuto(ucm
, forBase
, baseStates
, &m
, codePoints
, bytes
);
1143 U_CAPI
void U_EXPORT2
1144 ucm_readTable(UCMFile
*ucm
, FileStream
* convFile
,
1145 UBool forBase
, UCMStates
*baseStates
,
1146 UErrorCode
*pErrorCode
) {
1151 if(U_FAILURE(*pErrorCode
)) {
1158 /* read the next line */
1159 if(!T_FileStream_readLine(convFile
, line
, sizeof(line
))) {
1160 fprintf(stderr
, "incomplete charmap section\n");
1166 end
=uprv_strchr(line
, 0);
1167 while(line
<end
&& (*(end
-1)=='\r' || *(end
-1)=='\n')) {
1172 /* ignore empty and comment lines */
1173 if(line
[0]==0 || line
[0]=='#') {
1177 /* stop at the end of the mapping table */
1178 if(0==uprv_strcmp(line
, "END CHARMAP")) {
1182 isOK
&=ucm_addMappingFromLine(ucm
, line
, forBase
, baseStates
);
1186 *pErrorCode
=U_INVALID_TABLE_FORMAT
;