1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
6 * Copyright (C) 2003-2013, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 *******************************************************************************
12 * tab size: 8 (not used)
15 * created on: 2003jun20
16 * created by: Markus W. Scherer
18 * This file reads a .ucm file, stores its mappings and sorts them.
19 * It implements handling of Unicode conversion mappings from .ucm files
20 * for makeconv, canonucm, rptp2ucm, etc.
22 * Unicode code point sequences with a length of more than 1,
23 * as well as byte sequences with more than 4 bytes or more than one complete
24 * character sequence are handled to support m:n mappings.
27 #include "unicode/utypes.h"
28 #include "unicode/ustring.h"
40 #if !UCONFIG_NO_CONVERSION
42 /* -------------------------------------------------------------------------- */
45 printMapping(UCMapping
*m
, UChar32
*codePoints
, uint8_t *bytes
, FILE *f
) {
48 for(j
=0; j
<m
->uLen
; ++j
) {
49 fprintf(f
, "<U%04lX>", (long)codePoints
[j
]);
54 for(j
=0; j
<m
->bLen
; ++j
) {
55 fprintf(f
, "\\x%02X", bytes
[j
]);
59 fprintf(f
, " |%u\n", m
->f
);
66 ucm_printMapping(UCMTable
*table
, UCMapping
*m
, FILE *f
) {
67 printMapping(m
, UCM_GET_CODE_POINTS(table
, m
), UCM_GET_BYTES(table
, m
), f
);
71 ucm_printTable(UCMTable
*table
, FILE *f
, UBool byUnicode
) {
76 length
=table
->mappingsLength
;
78 for(i
=0; i
<length
; ++m
, ++i
) {
79 ucm_printMapping(table
, m
, f
);
82 const int32_t *map
=table
->reverseMap
;
83 for(i
=0; i
<length
; ++i
) {
84 ucm_printMapping(table
, m
+map
[i
], f
);
89 /* mapping comparisons ------------------------------------------------------ */
92 compareUnicode(UCMTable
*lTable
, const UCMapping
*l
,
93 UCMTable
*rTable
, const UCMapping
*r
) {
94 const UChar32
*lu
, *ru
;
95 int32_t result
, i
, length
;
97 if(l
->uLen
==1 && r
->uLen
==1) {
98 /* compare two single code points */
102 /* get pointers to the code point sequences */
103 lu
=UCM_GET_CODE_POINTS(lTable
, l
);
104 ru
=UCM_GET_CODE_POINTS(rTable
, r
);
106 /* get the minimum length */
107 if(l
->uLen
<=r
->uLen
) {
113 /* compare the code points */
114 for(i
=0; i
<length
; ++i
) {
121 /* compare the lengths */
122 return l
->uLen
-r
->uLen
;
126 compareBytes(UCMTable
*lTable
, const UCMapping
*l
,
127 UCMTable
*rTable
, const UCMapping
*r
,
129 const uint8_t *lb
, *rb
;
130 int32_t result
, i
, length
;
133 * A lexical comparison is used for sorting in the builder, to allow
134 * an efficient search for a byte sequence that could be a prefix
135 * of a previously entered byte sequence.
137 * Comparing by lengths first is for compatibility with old .ucm tools
138 * like canonucm and rptp2ucm.
141 /* get the minimum length and continue */
142 if(l
->bLen
<=r
->bLen
) {
148 /* compare lengths first */
149 result
=l
->bLen
-r
->bLen
;
157 /* get pointers to the byte sequences */
158 lb
=UCM_GET_BYTES(lTable
, l
);
159 rb
=UCM_GET_BYTES(rTable
, r
);
161 /* compare the bytes */
162 for(i
=0; i
<length
; ++i
) {
169 /* compare the lengths */
170 return l
->bLen
-r
->bLen
;
173 /* compare UCMappings for sorting */
175 compareMappings(UCMTable
*lTable
, const UCMapping
*l
,
176 UCMTable
*rTable
, const UCMapping
*r
,
180 /* choose which side to compare first */
182 /* Unicode then bytes */
183 result
=compareUnicode(lTable
, l
, rTable
, r
);
185 result
=compareBytes(lTable
, l
, rTable
, r
, FALSE
); /* not lexically, like canonucm */
188 /* bytes then Unicode */
189 result
=compareBytes(lTable
, l
, rTable
, r
, TRUE
); /* lexically, for builder */
191 result
=compareUnicode(lTable
, l
, rTable
, r
);
199 /* compare the flags */
203 /* sorting by Unicode first sorts mappings directly */
204 static int32_t U_CALLCONV
205 compareMappingsUnicodeFirst(const void *context
, const void *left
, const void *right
) {
206 return compareMappings(
207 (UCMTable
*)context
, (const UCMapping
*)left
,
208 (UCMTable
*)context
, (const UCMapping
*)right
, TRUE
);
211 /* sorting by bytes first sorts the reverseMap; use indirection to mappings */
212 static int32_t U_CALLCONV
213 compareMappingsBytesFirst(const void *context
, const void *left
, const void *right
) {
214 UCMTable
*table
=(UCMTable
*)context
;
215 int32_t l
=*(const int32_t *)left
, r
=*(const int32_t *)right
;
216 return compareMappings(
217 table
, table
->mappings
+l
,
218 table
, table
->mappings
+r
, FALSE
);
222 U_CAPI
void U_EXPORT2
223 ucm_sortTable(UCMTable
*t
) {
224 UErrorCode errorCode
;
231 errorCode
=U_ZERO_ERROR
;
233 /* 1. sort by Unicode first */
234 uprv_sortArray(t
->mappings
, t
->mappingsLength
, sizeof(UCMapping
),
235 compareMappingsUnicodeFirst
, t
,
238 /* build the reverseMap */
239 if(t
->reverseMap
==NULL
) {
241 * allocate mappingsCapacity instead of mappingsLength so that
242 * if mappings are added, the reverseMap need not be
243 * reallocated each time
244 * (see ucm_moveMappings() and ucm_addMapping())
246 t
->reverseMap
=(int32_t *)uprv_malloc(t
->mappingsCapacity
*sizeof(int32_t));
247 if(t
->reverseMap
==NULL
) {
248 fprintf(stderr
, "ucm error: unable to allocate reverseMap\n");
249 exit(U_MEMORY_ALLOCATION_ERROR
);
252 for(i
=0; i
<t
->mappingsLength
; ++i
) {
256 /* 2. sort reverseMap by mappings bytes first */
257 uprv_sortArray(t
->reverseMap
, t
->mappingsLength
, sizeof(int32_t),
258 compareMappingsBytesFirst
, t
,
261 if(U_FAILURE(errorCode
)) {
262 fprintf(stderr
, "ucm error: sortTable()/uprv_sortArray() fails - %s\n",
263 u_errorName(errorCode
));
271 * remove mappings with their move flag set from the base table
272 * and move some of them (with UCM_MOVE_TO_EXT) to the extension table
274 U_CAPI
void U_EXPORT2
275 ucm_moveMappings(UCMTable
*base
, UCMTable
*ext
) {
276 UCMapping
*mb
, *mbLimit
;
280 mbLimit
=mb
+base
->mappingsLength
;
285 /* reset the move flag */
288 if(ext
!=NULL
&& (flag
&UCM_MOVE_TO_EXT
)) {
289 /* add the mapping to the extension table */
290 ucm_addMapping(ext
, mb
, UCM_GET_CODE_POINTS(base
, mb
), UCM_GET_BYTES(base
, mb
));
293 /* remove this mapping: move the last base mapping down and overwrite the current one */
295 uprv_memcpy(mb
, mbLimit
-1, sizeof(UCMapping
));
298 --base
->mappingsLength
;
299 base
->isSorted
=FALSE
;
312 checkBaseExtUnicode(UCMStates
*baseStates
, UCMTable
*base
, UCMTable
*ext
,
313 UBool moveToExt
, UBool intersectBase
) {
316 UCMapping
*mb
, *me
, *mbLimit
, *meLimit
;
321 mbLimit
=mb
+base
->mappingsLength
;
324 meLimit
=me
+ext
->mappingsLength
;
329 /* skip irrelevant mappings on both sides */
335 if((0<=mb
->f
&& mb
->f
<=2) || mb
->f
==4) {
347 if((0<=me
->f
&& me
->f
<=2) || me
->f
==4) {
354 /* compare the base and extension mappings */
355 cmp
=compareUnicode(base
, mb
, ext
, me
);
357 if(intersectBase
&& (intersectBase
!=2 || mb
->bLen
>1)) {
359 * mapping in base but not in ext, move it
361 * if ext is DBCS, move DBCS mappings here
362 * and check SBCS ones for Unicode prefix below
364 mb
->moveFlag
|=UCM_MOVE_TO_EXT
;
367 /* does mb map from an input sequence that is a prefix of me's? */
368 } else if( mb
->uLen
<me
->uLen
&&
369 0==uprv_memcmp(UCM_GET_CODE_POINTS(base
, mb
), UCM_GET_CODE_POINTS(ext
, me
), 4*mb
->uLen
)
372 /* mark this mapping to be moved to the extension table */
373 mb
->moveFlag
|=UCM_MOVE_TO_EXT
;
377 "ucm error: the base table contains a mapping whose input sequence\n"
378 " is a prefix of the input sequence of an extension mapping\n");
379 ucm_printMapping(base
, mb
, stderr
);
380 ucm_printMapping(ext
, me
, stderr
);
388 * same output: remove the extension mapping,
389 * otherwise treat as an error
391 if( mb
->f
==me
->f
&& mb
->bLen
==me
->bLen
&&
392 0==uprv_memcmp(UCM_GET_BYTES(base
, mb
), UCM_GET_BYTES(ext
, me
), mb
->bLen
)
394 me
->moveFlag
|=UCM_REMOVE_MAPPING
;
396 } else if(intersectBase
) {
397 /* mapping in base but not in ext, move it */
398 mb
->moveFlag
|=UCM_MOVE_TO_EXT
;
402 "ucm error: the base table contains a mapping whose input sequence\n"
403 " is the same as the input sequence of an extension mapping\n"
404 " but it maps differently\n");
405 ucm_printMapping(base
, mb
, stderr
);
406 ucm_printMapping(ext
, me
, stderr
);
418 checkBaseExtBytes(UCMStates
*baseStates
, UCMTable
*base
, UCMTable
*ext
,
419 UBool moveToExt
, UBool intersectBase
) {
421 int32_t *baseMap
, *extMap
;
422 int32_t b
, e
, bLimit
, eLimit
, cmp
;
426 baseMap
=base
->reverseMap
;
427 extMap
=ext
->reverseMap
;
430 bLimit
=base
->mappingsLength
;
431 eLimit
=ext
->mappingsLength
;
435 isSISO
=(UBool
)(baseStates
->outputType
==MBCS_OUTPUT_2_SISO
);
438 /* skip irrelevant mappings on both sides */
443 mb
=base
->mappings
+baseMap
[b
];
445 if(intersectBase
==2 && mb
->bLen
==1) {
447 * comparing a base against a DBCS extension:
448 * leave SBCS base mappings alone
453 if(mb
->f
==0 || mb
->f
==3) {
462 me
=ext
->mappings
+extMap
[e
];
464 if(me
->f
==0 || me
->f
==3) {
471 /* compare the base and extension mappings */
472 cmp
=compareBytes(base
, mb
, ext
, me
, TRUE
);
475 /* mapping in base but not in ext, move it */
476 mb
->moveFlag
|=UCM_MOVE_TO_EXT
;
480 * does mb map from an input sequence that is a prefix of me's?
481 * for SI/SO tables, a single byte is never a prefix because it
482 * occurs in a separate single-byte state
484 } else if( mb
->bLen
<me
->bLen
&&
485 (!isSISO
|| mb
->bLen
>1) &&
486 0==uprv_memcmp(UCM_GET_BYTES(base
, mb
), UCM_GET_BYTES(ext
, me
), mb
->bLen
)
489 /* mark this mapping to be moved to the extension table */
490 mb
->moveFlag
|=UCM_MOVE_TO_EXT
;
494 "ucm error: the base table contains a mapping whose input sequence\n"
495 " is a prefix of the input sequence of an extension mapping\n");
496 ucm_printMapping(base
, mb
, stderr
);
497 ucm_printMapping(ext
, me
, stderr
);
505 * same output: remove the extension mapping,
506 * otherwise treat as an error
508 if( mb
->f
==me
->f
&& mb
->uLen
==me
->uLen
&&
509 0==uprv_memcmp(UCM_GET_CODE_POINTS(base
, mb
), UCM_GET_CODE_POINTS(ext
, me
), 4*mb
->uLen
)
511 me
->moveFlag
|=UCM_REMOVE_MAPPING
;
513 } else if(intersectBase
) {
514 /* mapping in base but not in ext, move it */
515 mb
->moveFlag
|=UCM_MOVE_TO_EXT
;
519 "ucm error: the base table contains a mapping whose input sequence\n"
520 " is the same as the input sequence of an extension mapping\n"
521 " but it maps differently\n");
522 ucm_printMapping(base
, mb
, stderr
);
523 ucm_printMapping(ext
, me
, stderr
);
534 U_CAPI UBool U_EXPORT2
535 ucm_checkValidity(UCMTable
*table
, UCMStates
*baseStates
) {
536 UCMapping
*m
, *mLimit
;
541 mLimit
=m
+table
->mappingsLength
;
545 count
=ucm_countChars(baseStates
, UCM_GET_BYTES(table
, m
), m
->bLen
);
547 ucm_printMapping(table
, m
, stderr
);
556 U_CAPI UBool U_EXPORT2
557 ucm_checkBaseExt(UCMStates
*baseStates
,
558 UCMTable
*base
, UCMTable
*ext
, UCMTable
*moveTarget
,
559 UBool intersectBase
) {
562 /* if we have an extension table, we must always use precision flags */
563 if(base
->flagsType
&UCM_FLAGS_IMPLICIT
) {
564 fprintf(stderr
, "ucm error: the base table contains mappings without precision flags\n");
567 if(ext
->flagsType
&UCM_FLAGS_IMPLICIT
) {
568 fprintf(stderr
, "ucm error: extension table contains mappings without precision flags\n");
572 /* checking requires both tables to be sorted */
578 checkBaseExtUnicode(baseStates
, base
, ext
, (UBool
)(moveTarget
!=NULL
), intersectBase
)|
579 checkBaseExtBytes(baseStates
, base
, ext
, (UBool
)(moveTarget
!=NULL
), intersectBase
);
581 if(result
&HAS_ERRORS
) {
585 if(result
&NEEDS_MOVE
) {
586 ucm_moveMappings(ext
, NULL
);
587 ucm_moveMappings(base
, moveTarget
);
590 if(moveTarget
!=NULL
) {
591 ucm_sortTable(moveTarget
);
598 /* merge tables for rptp2ucm ------------------------------------------------ */
600 U_CAPI
void U_EXPORT2
601 ucm_mergeTables(UCMTable
*fromUTable
, UCMTable
*toUTable
,
602 const uint8_t *subchar
, int32_t subcharLength
,
604 UCMapping
*fromUMapping
, *toUMapping
;
605 int32_t fromUIndex
, toUIndex
, fromUTop
, toUTop
, cmp
;
607 ucm_sortTable(fromUTable
);
608 ucm_sortTable(toUTable
);
610 fromUMapping
=fromUTable
->mappings
;
611 toUMapping
=toUTable
->mappings
;
613 fromUTop
=fromUTable
->mappingsLength
;
614 toUTop
=toUTable
->mappingsLength
;
616 fromUIndex
=toUIndex
=0;
618 while(fromUIndex
<fromUTop
&& toUIndex
<toUTop
) {
619 cmp
=compareMappings(fromUTable
, fromUMapping
, toUTable
, toUMapping
, TRUE
);
621 /* equal: roundtrip, nothing to do (flags are initially 0) */
629 * the fromU mapping does not have a toU counterpart:
630 * fallback Unicode->codepage
632 if( (fromUMapping
->bLen
==subcharLength
&&
633 0==uprv_memcmp(UCM_GET_BYTES(fromUTable
, fromUMapping
), subchar
, subcharLength
)) ||
634 (subchar1
!=0 && fromUMapping
->bLen
==1 && fromUMapping
->b
.bytes
[0]==subchar1
)
636 fromUMapping
->f
=2; /* SUB mapping */
638 fromUMapping
->f
=1; /* normal fallback */
645 * the toU mapping does not have a fromU counterpart:
646 * (reverse) fallback codepage->Unicode, copy it to the fromU table
649 /* ignore reverse fallbacks to Unicode SUB */
650 if(!(toUMapping
->uLen
==1 && (toUMapping
->u
==0xfffd || toUMapping
->u
==0x1a))) {
651 toUMapping
->f
=3; /* reverse fallback */
652 ucm_addMapping(fromUTable
, toUMapping
, UCM_GET_CODE_POINTS(toUTable
, toUMapping
), UCM_GET_BYTES(toUTable
, toUMapping
));
654 /* the table may have been reallocated */
655 fromUMapping
=fromUTable
->mappings
+fromUIndex
;
663 /* either one or both tables are exhausted */
664 while(fromUIndex
<fromUTop
) {
665 /* leftover fromU mappings are fallbacks */
666 if( (fromUMapping
->bLen
==subcharLength
&&
667 0==uprv_memcmp(UCM_GET_BYTES(fromUTable
, fromUMapping
), subchar
, subcharLength
)) ||
668 (subchar1
!=0 && fromUMapping
->bLen
==1 && fromUMapping
->b
.bytes
[0]==subchar1
)
670 fromUMapping
->f
=2; /* SUB mapping */
672 fromUMapping
->f
=1; /* normal fallback */
679 while(toUIndex
<toUTop
) {
680 /* leftover toU mappings are reverse fallbacks */
682 /* ignore reverse fallbacks to Unicode SUB */
683 if(!(toUMapping
->uLen
==1 && (toUMapping
->u
==0xfffd || toUMapping
->u
==0x1a))) {
684 toUMapping
->f
=3; /* reverse fallback */
685 ucm_addMapping(fromUTable
, toUMapping
, UCM_GET_CODE_POINTS(toUTable
, toUMapping
), UCM_GET_BYTES(toUTable
, toUMapping
));
692 fromUTable
->isSorted
=FALSE
;
695 /* separate extension mappings out of base table for rptp2ucm --------------- */
697 U_CAPI UBool U_EXPORT2
698 ucm_separateMappings(UCMFile
*ucm
, UBool isSISO
) {
700 UCMapping
*m
, *mLimit
;
702 UBool needsMove
, isOK
;
706 mLimit
=m
+table
->mappingsLength
;
711 for(; m
<mLimit
; ++m
) {
712 if(isSISO
&& m
->bLen
==1 && (m
->b
.bytes
[0]==0xe || m
->b
.bytes
[0]==0xf)) {
713 fprintf(stderr
, "warning: removing illegal mapping from an SI/SO-stateful table\n");
714 ucm_printMapping(table
, m
, stderr
);
715 m
->moveFlag
|=UCM_REMOVE_MAPPING
;
720 type
=ucm_mappingType(
722 UCM_GET_CODE_POINTS(table
, m
), UCM_GET_BYTES(table
, m
));
724 /* illegal byte sequence */
725 printMapping(m
, UCM_GET_CODE_POINTS(table
, m
), UCM_GET_BYTES(table
, m
), stderr
);
728 m
->moveFlag
|=UCM_MOVE_TO_EXT
;
737 ucm_moveMappings(ucm
->base
, ucm
->ext
);
738 return ucm_checkBaseExt(&ucm
->states
, ucm
->base
, ucm
->ext
, ucm
->ext
, FALSE
);
740 ucm_sortTable(ucm
->base
);
745 /* ucm parser --------------------------------------------------------------- */
747 U_CAPI
int8_t U_EXPORT2
748 ucm_parseBytes(uint8_t bytes
[UCNV_EXT_MAX_BYTES
], const char *line
, const char **ps
) {
756 /* skip an optional plus sign */
757 if(bLen
>0 && *s
=='+') {
765 (byte
=(uint8_t)uprv_strtoul(s
+2, &end
, 16), end
)!=s
+4
767 fprintf(stderr
, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line
);
771 if(bLen
==UCNV_EXT_MAX_BYTES
) {
772 fprintf(stderr
, "ucm error: too many bytes on \"%s\"\n", line
);
783 /* parse a mapping line; must not be empty */
784 U_CAPI UBool U_EXPORT2
785 ucm_parseMappingLine(UCMapping
*m
,
786 UChar32 codePoints
[UCNV_EXT_MAX_UCHARS
],
787 uint8_t bytes
[UCNV_EXT_MAX_BYTES
],
793 int8_t uLen
, bLen
, f
;
798 /* parse code points */
800 /* skip an optional plus sign */
801 if(uLen
>0 && *s
=='+') {
809 (cp
=(UChar32
)uprv_strtoul(s
+2, &end
, 16), end
)==s
+2 ||
812 fprintf(stderr
, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line
);
815 if((uint32_t)cp
>0x10ffff || U_IS_SURROGATE(cp
)) {
816 fprintf(stderr
, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line
);
820 if(uLen
==UCNV_EXT_MAX_UCHARS
) {
821 fprintf(stderr
, "ucm error: too many code points on \"%s\"\n", line
);
824 codePoints
[uLen
++]=cp
;
829 fprintf(stderr
, "ucm error: no Unicode code points on \"%s\"\n", line
);
834 UErrorCode errorCode
=U_ZERO_ERROR
;
835 u_strFromUTF32(NULL
, 0, &u16Length
, codePoints
, uLen
, &errorCode
);
836 if( (U_FAILURE(errorCode
) && errorCode
!=U_BUFFER_OVERFLOW_ERROR
) ||
837 u16Length
>UCNV_EXT_MAX_UCHARS
839 fprintf(stderr
, "ucm error: too many UChars on \"%s\"\n", line
);
844 s
=u_skipWhitespace(s
);
847 bLen
=ucm_parseBytes(bytes
, line
, &s
);
852 fprintf(stderr
, "ucm error: no bytes on \"%s\"\n", line
);
855 uprv_memcpy(m
->b
.bytes
, bytes
, bLen
);
858 /* skip everything until the fallback indicator, even the start of a comment */
861 f
=-1; /* no fallback indicator */
864 f
=(int8_t)(s
[1]-'0');
866 fprintf(stderr
, "ucm error: fallback indicator must be |0..|4 - \"%s\"\n", line
);
880 /* general APIs ------------------------------------------------------------- */
882 U_CAPI UCMTable
* U_EXPORT2
884 UCMTable
*table
=(UCMTable
*)uprv_malloc(sizeof(UCMTable
));
886 fprintf(stderr
, "ucm error: unable to allocate a UCMTable\n");
887 exit(U_MEMORY_ALLOCATION_ERROR
);
890 memset(table
, 0, sizeof(UCMTable
));
894 U_CAPI
void U_EXPORT2
895 ucm_closeTable(UCMTable
*table
) {
897 uprv_free(table
->mappings
);
898 uprv_free(table
->codePoints
);
899 uprv_free(table
->bytes
);
900 uprv_free(table
->reverseMap
);
905 U_CAPI
void U_EXPORT2
906 ucm_resetTable(UCMTable
*table
) {
908 table
->mappingsLength
=0;
910 table
->unicodeMask
=0;
911 table
->bytesLength
=table
->codePointsLength
=0;
912 table
->isSorted
=FALSE
;
916 U_CAPI
void U_EXPORT2
917 ucm_addMapping(UCMTable
*table
,
919 UChar32 codePoints
[UCNV_EXT_MAX_UCHARS
],
920 uint8_t bytes
[UCNV_EXT_MAX_BYTES
]) {
925 if(table
->mappingsLength
>=table
->mappingsCapacity
) {
926 /* make the mappings array larger */
927 if(table
->mappingsCapacity
==0) {
928 table
->mappingsCapacity
=1000;
930 table
->mappingsCapacity
*=10;
932 table
->mappings
=(UCMapping
*)uprv_realloc(table
->mappings
,
933 table
->mappingsCapacity
*sizeof(UCMapping
));
934 if(table
->mappings
==NULL
) {
935 fprintf(stderr
, "ucm error: unable to allocate %d UCMappings\n",
936 (int)table
->mappingsCapacity
);
937 exit(U_MEMORY_ALLOCATION_ERROR
);
940 if(table
->reverseMap
!=NULL
) {
941 /* the reverseMap must be reallocated in a new sort */
942 uprv_free(table
->reverseMap
);
943 table
->reverseMap
=NULL
;
947 if(m
->uLen
>1 && table
->codePointsCapacity
==0) {
948 table
->codePointsCapacity
=10000;
949 table
->codePoints
=(UChar32
*)uprv_malloc(table
->codePointsCapacity
*4);
950 if(table
->codePoints
==NULL
) {
951 fprintf(stderr
, "ucm error: unable to allocate %d UChar32s\n",
952 (int)table
->codePointsCapacity
);
953 exit(U_MEMORY_ALLOCATION_ERROR
);
957 if(m
->bLen
>4 && table
->bytesCapacity
==0) {
958 table
->bytesCapacity
=10000;
959 table
->bytes
=(uint8_t *)uprv_malloc(table
->bytesCapacity
);
960 if(table
->bytes
==NULL
) {
961 fprintf(stderr
, "ucm error: unable to allocate %d bytes\n",
962 (int)table
->bytesCapacity
);
963 exit(U_MEMORY_ALLOCATION_ERROR
);
968 idx
=table
->codePointsLength
;
969 table
->codePointsLength
+=m
->uLen
;
970 if(table
->codePointsLength
>table
->codePointsCapacity
) {
971 fprintf(stderr
, "ucm error: too many code points in multiple-code point mappings\n");
972 exit(U_MEMORY_ALLOCATION_ERROR
);
975 uprv_memcpy(table
->codePoints
+idx
, codePoints
, (size_t)m
->uLen
*4);
980 idx
=table
->bytesLength
;
981 table
->bytesLength
+=m
->bLen
;
982 if(table
->bytesLength
>table
->bytesCapacity
) {
983 fprintf(stderr
, "ucm error: too many bytes in mappings with >4 charset bytes\n");
984 exit(U_MEMORY_ALLOCATION_ERROR
);
987 uprv_memcpy(table
->bytes
+idx
, bytes
, m
->bLen
);
991 /* set unicodeMask */
992 for(idx
=0; idx
<m
->uLen
; ++idx
) {
995 table
->unicodeMask
|=UCNV_HAS_SUPPLEMENTARY
; /* there are supplementary code points */
996 } else if(U_IS_SURROGATE(c
)) {
997 table
->unicodeMask
|=UCNV_HAS_SURROGATES
; /* there are surrogate code points */
1003 table
->flagsType
|=UCM_FLAGS_IMPLICIT
;
1005 table
->flagsType
|=UCM_FLAGS_EXPLICIT
;
1008 tm
=table
->mappings
+table
->mappingsLength
++;
1009 uprv_memcpy(tm
, m
, sizeof(UCMapping
));
1011 table
->isSorted
=FALSE
;
1014 U_CAPI UCMFile
* U_EXPORT2
1016 UCMFile
*ucm
=(UCMFile
*)uprv_malloc(sizeof(UCMFile
));
1018 fprintf(stderr
, "ucm error: unable to allocate a UCMFile\n");
1019 exit(U_MEMORY_ALLOCATION_ERROR
);
1022 memset(ucm
, 0, sizeof(UCMFile
));
1024 ucm
->base
=ucm_openTable();
1025 ucm
->ext
=ucm_openTable();
1027 ucm
->states
.stateFlags
[0]=MBCS_STATE_FLAG_DIRECT
;
1028 ucm
->states
.conversionType
=UCNV_UNSUPPORTED_CONVERTER
;
1029 ucm
->states
.outputType
=-1;
1030 ucm
->states
.minCharLength
=ucm
->states
.maxCharLength
=1;
1035 U_CAPI
void U_EXPORT2
1036 ucm_close(UCMFile
*ucm
) {
1038 ucm_closeTable(ucm
->base
);
1039 ucm_closeTable(ucm
->ext
);
1044 U_CAPI
int32_t U_EXPORT2
1045 ucm_mappingType(UCMStates
*baseStates
,
1047 UChar32 codePoints
[UCNV_EXT_MAX_UCHARS
],
1048 uint8_t bytes
[UCNV_EXT_MAX_BYTES
]) {
1050 /* check validity of the bytes and count the characters in them */
1051 int32_t count
=ucm_countChars(baseStates
, bytes
, m
->bLen
);
1053 /* illegal byte sequence */
1058 * Suitable for an ICU conversion base table means:
1059 * - a 1:1 mapping (1 Unicode code point : 1 byte sequence)
1060 * - precision flag 0..3
1061 * - SBCS: any 1:1 mapping
1062 * (the table stores additional bits to distinguish mapping types)
1063 * - MBCS: not a |2 SUB mapping for <subchar1>
1064 * - MBCS: not a |1 fallback to 0x00
1065 * - MBCS: not a multi-byte mapping with leading 0x00 bytes
1067 * Further restrictions for fromUnicode tables
1068 * are enforced in makeconv (MBCSOkForBaseFromUnicode()).
1070 * All of the MBCS fromUnicode specific tests could be removed from here,
1071 * but the ones above are for unusual mappings, and removing the tests
1072 * from here would change canonucm output which seems gratuitous.
1073 * (Markus Scherer 2006-nov-28)
1075 * Exception: All implicit mappings (f<0) that need to be moved
1076 * because of fromUnicode restrictions _must_ be moved here because
1077 * makeconv uses a hack for moving mappings only for the fromUnicode table
1078 * that only works with non-negative values of f.
1080 if( m
->uLen
==1 && count
==1 && m
->f
<=3 &&
1081 (baseStates
->maxCharLength
==1 ||
1082 !((m
->f
==2 && m
->bLen
==1) ||
1083 (m
->f
==1 && bytes
[0]==0) ||
1084 (m
->f
<=1 && m
->bLen
>1 && bytes
[0]==0)))
1086 return 0; /* suitable for a base table */
1088 return 1; /* needs to go into an extension table */
1092 U_CAPI UBool U_EXPORT2
1093 ucm_addMappingAuto(UCMFile
*ucm
, UBool forBase
, UCMStates
*baseStates
,
1095 UChar32 codePoints
[UCNV_EXT_MAX_UCHARS
],
1096 uint8_t bytes
[UCNV_EXT_MAX_BYTES
]) {
1099 if(m
->f
==2 && m
->uLen
>1) {
1100 fprintf(stderr
, "ucm error: illegal <subchar1> |2 mapping from multiple code points\n");
1101 printMapping(m
, codePoints
, bytes
, stderr
);
1105 if(baseStates
!=NULL
) {
1106 /* check validity of the bytes and count the characters in them */
1107 type
=ucm_mappingType(baseStates
, m
, codePoints
, bytes
);
1109 /* illegal byte sequence */
1110 printMapping(m
, codePoints
, bytes
, stderr
);
1114 /* not used - adding a mapping for an extension-only table before its base table is read */
1119 * Add the mapping to the base table if this is requested and suitable.
1120 * Otherwise, add it to the extension table.
1122 if(forBase
&& type
==0) {
1123 ucm_addMapping(ucm
->base
, m
, codePoints
, bytes
);
1125 ucm_addMapping(ucm
->ext
, m
, codePoints
, bytes
);
1131 U_CAPI UBool U_EXPORT2
1132 ucm_addMappingFromLine(UCMFile
*ucm
, const char *line
, UBool forBase
, UCMStates
*baseStates
) {
1133 UCMapping m
={ 0, {0}, 0, 0, 0, 0 };
1134 UChar32 codePoints
[UCNV_EXT_MAX_UCHARS
];
1135 uint8_t bytes
[UCNV_EXT_MAX_BYTES
];
1139 /* ignore empty and comment lines */
1140 if(line
[0]=='#' || *(s
=u_skipWhitespace(line
))==0 || *s
=='\n' || *s
=='\r') {
1145 ucm_parseMappingLine(&m
, codePoints
, bytes
, line
) &&
1146 ucm_addMappingAuto(ucm
, forBase
, baseStates
, &m
, codePoints
, bytes
);
1149 U_CAPI
void U_EXPORT2
1150 ucm_readTable(UCMFile
*ucm
, FileStream
* convFile
,
1151 UBool forBase
, UCMStates
*baseStates
,
1152 UErrorCode
*pErrorCode
) {
1157 if(U_FAILURE(*pErrorCode
)) {
1164 /* read the next line */
1165 if(!T_FileStream_readLine(convFile
, line
, sizeof(line
))) {
1166 fprintf(stderr
, "incomplete charmap section\n");
1172 end
=uprv_strchr(line
, 0);
1173 while(line
<end
&& (*(end
-1)=='\r' || *(end
-1)=='\n')) {
1178 /* ignore empty and comment lines */
1179 if(line
[0]==0 || line
[0]=='#') {
1183 /* stop at the end of the mapping table */
1184 if(0==uprv_strcmp(line
, "END CHARMAP")) {
1188 isOK
&=ucm_addMappingFromLine(ucm
, line
, forBase
, baseStates
);
1192 *pErrorCode
=U_INVALID_TABLE_FORMAT
;