]>
Commit | Line | Data |
---|---|---|
1 | // © 2016 and later: Unicode, Inc. and others. | |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
3 | /* | |
4 | ******************************************************************************* | |
5 | * | |
6 | * Copyright (C) 2000-2008, International Business Machines | |
7 | * Corporation and others. All Rights Reserved. | |
8 | * | |
9 | ******************************************************************************* | |
10 | * file name: genmbcs.h | |
11 | * encoding: UTF-8 | |
12 | * tab size: 8 (not used) | |
13 | * indentation:4 | |
14 | * | |
15 | * created on: 2000jul10 | |
16 | * created by: Markus W. Scherer | |
17 | */ | |
18 | ||
19 | #ifndef __GENMBCS_H__ | |
20 | #define __GENMBCS_H__ | |
21 | ||
22 | #include "makeconv.h" | |
23 | ||
24 | enum { | |
25 | /* Sizes, shifts, masks and limits for the MBCS stage tables, as used by makeconv. | |
26 | * TODO: Consider using ucnvmbcs.h constants. | |
27 | * However, not all values need to be exactly the same, for example | |
28 | * the xxx_UTF8_MAX values may be different. (Especially SBCS_UTF8_MAX | |
29 | * may be higher in makeconv than in the runtime code because that | |
30 | * affects only a small number of .cnv files [if any] but all | |
31 | * runtime UConverterSharedData objects.) | |
32 | */ | |
33 | MBCS_STAGE_2_SHIFT=4, | |
34 | MBCS_STAGE_2_BLOCK_SIZE=0x40, /* =64=1<<6 for 6 bits in stage 2 */ | |
35 | MBCS_STAGE_2_BLOCK_SIZE_SHIFT=6, /* log2(MBCS_STAGE_2_BLOCK_SIZE) */ | |
36 | MBCS_STAGE_2_BLOCK_MASK=0x3f, /* mask to apply after shifting by MBCS_STAGE_2_SHIFT */ | |
37 | MBCS_STAGE_1_SHIFT=10, /* 1<<10: one stage 1 entry per 1k code points */ | |
38 | MBCS_STAGE_1_BMP_SIZE=0x40, /* 0x10000>>MBCS_STAGE_1_SHIFT, or 64 for one entry per 1k code points on the BMP */ | |
39 | MBCS_STAGE_1_SIZE=0x440, /* 0x110000>>MBCS_STAGE_1_SHIFT, or 17*64 for one entry per 1k code points */ | |
40 | MBCS_STAGE_2_SIZE=0xfbc0, /* 0x10000-MBCS_STAGE_1_SIZE: stages 1 & 2 share a 16-bit-indexed array */ | |
41 | MBCS_MAX_STAGE_2_TOP=MBCS_STAGE_2_SIZE, | |
42 | MBCS_STAGE_2_MAX_BLOCKS=MBCS_STAGE_2_SIZE>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT, /* =0x3ef=1007 */ | |
43 | ||
44 | MBCS_STAGE_2_ALL_UNASSIGNED_INDEX=0, /* stage 1 entry for the all-unassigned stage 2 block */ | |
45 | MBCS_STAGE_2_FIRST_ASSIGNED=MBCS_STAGE_2_BLOCK_SIZE, /* start of the first stage 2 block after the all-unassigned one */ | |
46 | ||
47 | MBCS_STAGE_3_BLOCK_SIZE=16, /* =16=1<<4 for 4 bits in stage 3 */ | |
48 | MBCS_STAGE_3_BLOCK_MASK=0xf, | |
49 | MBCS_STAGE_3_FIRST_ASSIGNED=MBCS_STAGE_3_BLOCK_SIZE, /* start of the first stage 3 block after the all-unassigned one */ | |
50 | ||
51 | MBCS_STAGE_3_GRANULARITY=16, /* =1<<4: MBCS stage 2 indexes are shifted left 4 */ | |
52 | MBCS_STAGE_3_SBCS_SIZE=0x10000, /* max 64k mappings for SBCS */ | |
53 | MBCS_STAGE_3_MBCS_SIZE=0x10000*MBCS_STAGE_3_GRANULARITY, /* =0x100000: max mappings for MBCS */ | |
54 | ||
55 | /* | |
56 | * SBCS_UTF8_MAX: Maximum code point with UTF-8-friendly SBCS data structures. | |
57 | * Possible values are 0x01ff..0xffff, in steps of 0x100. | |
58 | * | |
59 | * Unlike for MBCS, this constant only affects the stage 3 block allocation size; | |
60 | * there is no additional stage 1/2 table stored in the .cnv file. | |
61 | * The max value should be at least 0x7ff to cover 2-byte UTF-8. | |
62 | * 0xfff also covers a number of other small scripts which have legacy charsets | |
63 | * (like Thai). | |
64 | * Higher values up to 0x1fff are harmless and potentially useful because | |
65 | * that covers small-script blocks which usually have either dense mappings | |
66 | * or no mappings at all. | |
67 | * Starting at U+2000, there are mostly symbols and format characters | |
68 | * with a low density of SBCS mappings, which would result in more wasted | |
69 | * stage 3 entries with the larger block size. | |
70 | */ | |
71 | SBCS_UTF8_MAX=0x1fff, | |
72 | ||
73 | /* | |
74 | * MBCS_UTF8_MAX: Maximum code point with UTF-8-friendly MBCS data structures. | |
75 | * Possible values are 0x01ff..0xffff, in steps of 0x100. | |
76 | * | |
77 | * Note that with 0xffff, MBCSAddFromUnicode() may overflow the additional UTF-8 stage table | |
78 | * with extreme input data. The function checks for this overflow. | |
79 | * | |
80 | * 0xd7ff is chosen for the majority of common characters including Unihan and Hangul. | |
81 | * At U+d800 there are mostly surrogates, private use codes, compatibility characters, etc. | |
82 | * Larger values cause slightly larger MBCS .cnv files. | |
83 | */ | |
84 | MBCS_UTF8_MAX=0xd7ff, | |
85 | MBCS_UTF8_LIMIT=MBCS_UTF8_MAX+1, /* =0xd800 */ | |
86 | ||
87 | MBCS_UTF8_STAGE_SHIFT=6, | |
88 | MBCS_UTF8_STAGE_3_BLOCK_SIZE=0x40, /* =64=1<<6 for 6 bits from last trail byte */ | |
89 | MBCS_UTF8_STAGE_3_BLOCK_MASK=0x3f, | |
90 | ||
91 | /* size of the single-stage table for up to U+d7ff (used instead of stage1/2) */ | |
92 | MBCS_UTF8_STAGE_SIZE=MBCS_UTF8_LIMIT>>MBCS_UTF8_STAGE_SHIFT, /* =0x360 */ | |
93 | ||
94 | MBCS_FROM_U_EXT_FLAG=0x10, /* UCMapping.f bit for base table mappings that fit into the base toU table but need to go into the extension fromU table */ | |
95 | MBCS_FROM_U_EXT_MASK=0x0f, /* low four bits, i.e. all bits below MBCS_FROM_U_EXT_FLAG */ | |
96 | ||
97 | /* =4: number of regular stage 3 blocks for the final UTF-8 trail byte */ | |
98 | MBCS_UTF8_STAGE_3_BLOCKS=MBCS_UTF8_STAGE_3_BLOCK_SIZE/MBCS_STAGE_3_BLOCK_SIZE, | |
99 | ||
100 | MBCS_MAX_FALLBACK_COUNT=8192 | |
101 | }; | |
102 | ||
103 | U_CFUNC NewConverter * | |
104 | MBCSOpen(UCMFile *ucm); /* Takes a UCMFile and returns a NewConverter pointer; presumably builds the MBCS converter data for that file - confirm in the implementation. Ownership of the result is not visible from this declaration. */ | |
105 | ||
106 | struct MBCSData; /* forward declaration only: the struct body is not defined in this header, so the type is opaque to includers */ | |
107 | typedef struct MBCSData MBCSData; | |
108 | ||
109 | /* | |
110 | * Get a dummy MBCSData for use with MBCSOkForBaseFromUnicode() | |
111 | * when creating an extension-only file. | |
112 | * Assumes maxCharLength>1. Takes no arguments; the result is returned as a pointer to const. | |
113 | */ | |
114 | U_CFUNC const MBCSData * | |
115 | MBCSGetDummy(void); | |
116 | ||
117 | /* Test if a 1:1 mapping fits into the MBCS base table's fromUnicode structure. mbcsData: the MBCS data to test against (MBCSGetDummy() supplies one for extension-only files). bytes/length: the charset byte sequence - presumably `length` bytes starting at `bytes`; confirm. c: the Unicode code point of the mapping. flag: presumably the mapping's UCMapping.f value - confirm against callers. Returns a UBool. */ | |
118 | U_CFUNC UBool | |
119 | MBCSOkForBaseFromUnicode(const MBCSData *mbcsData, | |
120 | const uint8_t *bytes, int32_t length, | |
121 | UChar32 c, int8_t flag); | |
122 | ||
123 | U_CFUNC NewConverter * | |
124 | CnvExtOpen(UCMFile *ucm); /* Takes a UCMFile and returns a NewConverter pointer; judging by the name, presumably the builder for extension (CnvExt) table data - confirm in the implementation. Ownership of the result is not visible from this declaration. */ | |
125 | ||
126 | #endif /* __GENMBCS_H__ */ |