1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
6 * Copyright (C) 2000-2008, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 *******************************************************************************
10 * file name: genmbcs.h
12 * tab size: 8 (not used)
15 * created on: 2000jul10
16 * created by: Markus W. Scherer
26 * TODO: Consider using ucnvmbcs.h constants.
27 * However, not all values need to be exactly the same, for example
28 * the xxx_UTF8_MAX values may be different. (Especially SBCS_UTF8_MAX
29 * may be higher in makeconv than in the runtime code because that
30 * affects only a small number of .cnv files [if any] but all
31 * runtime UConverterSharedData objects.)
34 MBCS_STAGE_2_BLOCK_SIZE
=0x40, /* =64=1<<6 for 6 bits in stage 2 */
35 MBCS_STAGE_2_BLOCK_SIZE_SHIFT
=6, /* log2(MBCS_STAGE_2_BLOCK_SIZE) */
36 MBCS_STAGE_2_BLOCK_MASK
=0x3f, /* =MBCS_STAGE_2_BLOCK_SIZE-1; for after shifting by MBCS_STAGE_2_SHIFT */
37 MBCS_STAGE_1_SHIFT
=10, /* 1<<10: one stage 1 entry per 1k code points */
38 MBCS_STAGE_1_BMP_SIZE
=0x40, /* 0x10000>>MBCS_STAGE_1_SHIFT, or 64 for one entry per 1k code points on the BMP */
39 MBCS_STAGE_1_SIZE
=0x440, /* 0x110000>>MBCS_STAGE_1_SHIFT, or 17*64 for one entry per 1k code points */
40 MBCS_STAGE_2_SIZE
=0xfbc0, /* 0x10000-MBCS_STAGE_1_SIZE: stages 1 & 2 share a 16-bit-indexed array */
41 MBCS_MAX_STAGE_2_TOP
=MBCS_STAGE_2_SIZE
, /* =0xfbc0 */
42 MBCS_STAGE_2_MAX_BLOCKS
=MBCS_STAGE_2_SIZE
>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT
, /* =0x3ef: number of 64-entry stage 2 blocks that fit into MBCS_STAGE_2_SIZE */
44 MBCS_STAGE_2_ALL_UNASSIGNED_INDEX
=0, /* stage 1 entry for the all-unassigned stage 2 block */
45 MBCS_STAGE_2_FIRST_ASSIGNED
=MBCS_STAGE_2_BLOCK_SIZE
, /* =0x40: start of the first stage 2 block after the all-unassigned one */
47 MBCS_STAGE_3_BLOCK_SIZE
=16, /* =16=1<<4 for 4 bits in stage 3 */
48 MBCS_STAGE_3_BLOCK_MASK
=0xf, /* =MBCS_STAGE_3_BLOCK_SIZE-1 */
49 MBCS_STAGE_3_FIRST_ASSIGNED
=MBCS_STAGE_3_BLOCK_SIZE
, /* =16: start of the first stage 3 block after the all-unassigned one */
51 MBCS_STAGE_3_GRANULARITY
=16, /* =1<<4: MBCS stage 2 indexes are shifted left 4 */
52 MBCS_STAGE_3_SBCS_SIZE
=0x10000, /* max 64k mappings for SBCS */
53 MBCS_STAGE_3_MBCS_SIZE
=0x10000*MBCS_STAGE_3_GRANULARITY
, /* =0x100000: max mappings for MBCS */
56 * SBCS_UTF8_MAX: Maximum code point with UTF-8-friendly SBCS data structures.
57 * Possible values are 0x01ff..0xffff, in steps of 0x100.
59 * Unlike for MBCS, this constant only affects the stage 3 block allocation size;
60 * there is no additional stage 1/2 table stored in the .cnv file.
61 * The max value should be at least 0x7ff to cover 2-byte UTF-8.
62 * 0xfff also covers a number of other small scripts which have legacy charsets
64 * Higher values up to 0x1fff are harmless and potentially useful because
65 * that covers small-script blocks which usually have either dense mappings
66 * or no mappings at all.
67 * Starting at U+2000, there are mostly symbols and format characters
68 * with a low density of SBCS mappings, which would result in more wasted
69 * stage 3 entries with the larger block size.
74 * MBCS_UTF8_MAX: Maximum code point with UTF-8-friendly MBCS data structures.
75 * Possible values are 0x01ff..0xffff, in steps of 0x100.
77 * Note that with 0xffff, MBCSAddFromUnicode() may overflow the additional UTF-8 stage table
78 * with extreme input data. The function checks for this overflow.
80 * 0xd7ff is chosen for the majority of common characters including Unihan and Hangul.
81 * At U+d800 there are mostly surrogates, private use codes, compatibility characters, etc.
82 * Larger values cause slightly larger MBCS .cnv files.
85 MBCS_UTF8_LIMIT
=MBCS_UTF8_MAX
+1, /* =0xd800 */
87 MBCS_UTF8_STAGE_SHIFT
=6, /* =log2(MBCS_UTF8_STAGE_3_BLOCK_SIZE); used to compute MBCS_UTF8_STAGE_SIZE */
88 MBCS_UTF8_STAGE_3_BLOCK_SIZE
=0x40, /* =64=1<<6 for 6 bits from last trail byte */
89 MBCS_UTF8_STAGE_3_BLOCK_MASK
=0x3f, /* =MBCS_UTF8_STAGE_3_BLOCK_SIZE-1 */
91 /* size of the single-stage table for up to U+d7ff (used instead of stage1/2) */
92 MBCS_UTF8_STAGE_SIZE
=MBCS_UTF8_LIMIT
>>MBCS_UTF8_STAGE_SHIFT
, /* =0x360 */
94 MBCS_FROM_U_EXT_FLAG
=0x10, /* UCMapping.f bit for base table mappings that fit into the base toU table but need to go into the extension fromU table */
95 MBCS_FROM_U_EXT_MASK
=0x0f, /* the four bits below MBCS_FROM_U_EXT_FLAG; presumably masks the remaining UCMapping.f value - confirm against genmbcs.c */
97 /* =4: number of regular (16-entry) stage 3 blocks per 64-entry block for the final UTF-8 trail byte */
98 MBCS_UTF8_STAGE_3_BLOCKS
=MBCS_UTF8_STAGE_3_BLOCK_SIZE
/MBCS_STAGE_3_BLOCK_SIZE
,
100 MBCS_MAX_FALLBACK_COUNT
=8192 /* =0x2000; presumably an upper bound on the number of fallback mappings - confirm against genmbcs.c */
103 U_CFUNC NewConverter
*
104 MBCSOpen(UCMFile
*ucm
);
/*
 * Forward declaration: gives the incomplete type struct MBCSData a typedef
 * name so that prototypes can take and return pointers to it.
 */
typedef struct MBCSData MBCSData;
110 * Get a dummy MBCSData for use with MBCSOkForBaseFromUnicode()
111 * for creating an extension-only file.
112 * Assume maxCharLength>1.
114 U_CFUNC
const MBCSData
*
117 /* Test if a 1:1 mapping fits into the MBCS base table's fromUnicode structure. */
119 MBCSOkForBaseFromUnicode(const MBCSData
*mbcsData
,
120 const uint8_t *bytes
, int32_t length
,
121 UChar32 c
, int8_t flag
);
123 U_CFUNC NewConverter
*
124 CnvExtOpen(UCMFile
*ucm
);
126 #endif /* __GENMBCS_H__ */