]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ******************************************************************************* | |
3 | * | |
374ca955 | 4 | * Copyright (C) 2001-2004, International Business Machines |
b75a7d8f A |
5 | * Corporation and others. All Rights Reserved. |
6 | * | |
7 | ******************************************************************************* | |
8 | * file name: unormimp.h | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 2001may25 | |
14 | * created by: Markus W. Scherer | |
15 | */ | |
16 | ||
17 | #ifndef __UNORMIMP_H__ | |
18 | #define __UNORMIMP_H__ | |
19 | ||
20 | #include "unicode/utypes.h" | |
21 | ||
22 | #if !UCONFIG_NO_NORMALIZATION | |
23 | ||
374ca955 A |
24 | #ifdef XP_CPLUSPLUS |
25 | #include "unicode/uniset.h" | |
26 | #endif | |
27 | ||
b75a7d8f A |
28 | #include "unicode/uiter.h" |
29 | #include "unicode/unorm.h" | |
30 | #include "unicode/uset.h" | |
31 | #include "utrie.h" | |
32 | #include "ustr_imp.h" | |
374ca955 | 33 | #include "udataswp.h" |
b75a7d8f A |
34 | |
35 | /* | |
36 | * This new implementation of the normalization code loads its data from | |
374ca955 | 37 | * unorm.icu, which is generated with the gennorm tool. |
b75a7d8f A |
38 | * The format of that file is described at the end of this file. |
39 | */ | |
40 | ||
41 | /* norm32 value constants */ | |
42 | enum { | |
43 | /* quick check flags 0..3 set mean "no" for their forms */ | |
44 | _NORM_QC_NFC=0x11, /* no|maybe */ | |
45 | _NORM_QC_NFKC=0x22, /* no|maybe */ | |
46 | _NORM_QC_NFD=4, /* no */ | |
47 | _NORM_QC_NFKD=8, /* no */ | |
48 | ||
49 | _NORM_QC_ANY_NO=0xf, | |
50 | ||
51 | /* quick check flags 4..5 mean "maybe" for their forms; test flags>=_NORM_QC_MAYBE */ | |
52 | _NORM_QC_MAYBE=0x10, | |
53 | _NORM_QC_ANY_MAYBE=0x30, | |
54 | ||
55 | _NORM_QC_MASK=0x3f, | |
56 | ||
57 | _NORM_COMBINES_FWD=0x40, | |
58 | _NORM_COMBINES_BACK=0x80, | |
59 | _NORM_COMBINES_ANY=0xc0, | |
60 | ||
61 | _NORM_CC_SHIFT=8, /* UnicodeData.txt combining class in bits 15..8 */ | |
62 | _NORM_CC_MASK=0xff00, | |
63 | ||
64 | _NORM_EXTRA_SHIFT=16, /* 16 bits for the index to UChars and other extra data */ | |
65 | _NORM_EXTRA_INDEX_TOP=0xfc00, /* start of surrogate specials after shift */ | |
66 | ||
67 | _NORM_EXTRA_SURROGATE_MASK=0x3ff, | |
68 | _NORM_EXTRA_SURROGATE_TOP=0x3f0, /* hangul etc. */ | |
69 | ||
70 | _NORM_EXTRA_HANGUL=_NORM_EXTRA_SURROGATE_TOP, | |
71 | _NORM_EXTRA_JAMO_L, | |
72 | _NORM_EXTRA_JAMO_V, | |
73 | _NORM_EXTRA_JAMO_T | |
74 | }; | |
75 | ||
76 | /* norm32 value constants using >16 bits */ | |
77 | #define _NORM_MIN_SPECIAL 0xfc000000 | |
78 | #define _NORM_SURROGATES_TOP 0xfff00000 | |
79 | #define _NORM_MIN_HANGUL 0xfff00000 | |
80 | #define _NORM_MIN_JAMO_V 0xfff20000 | |
81 | #define _NORM_JAMO_V_TOP 0xfff30000 | |
82 | ||
83 | /* value constants for auxTrie */ | |
84 | enum { | |
85 | _NORM_AUX_COMP_EX_SHIFT=10, | |
86 | _NORM_AUX_UNSAFE_SHIFT=11, | |
87 | _NORM_AUX_NFC_SKIPPABLE_F_SHIFT=12 | |
88 | }; | |
89 | ||
90 | #define _NORM_AUX_MAX_FNC ((int32_t)1<<_NORM_AUX_COMP_EX_SHIFT) | |
91 | ||
92 | #define _NORM_AUX_FNC_MASK (uint32_t)(_NORM_AUX_MAX_FNC-1) | |
93 | #define _NORM_AUX_COMP_EX_MASK ((uint32_t)1<<_NORM_AUX_COMP_EX_SHIFT) | |
94 | #define _NORM_AUX_UNSAFE_MASK ((uint32_t)1<<_NORM_AUX_UNSAFE_SHIFT) | |
95 | #define _NORM_AUX_NFC_SKIP_F_MASK ((uint32_t)1<<_NORM_AUX_NFC_SKIPPABLE_F_SHIFT) | |
96 | ||
97 | /* canonStartSets[0..31] contains indexes for what is in the array */ | |
98 | enum { | |
374ca955 | 99 | _NORM_SET_INDEX_CANON_SETS_LENGTH, /* number of uint16_t in canonical starter sets */ |
b75a7d8f A |
100 | _NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH, /* number of uint16_t in the BMP search table (contains pairs) */ |
101 | _NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH,/* number of uint16_t in the supplementary search table (contains triplets) */ | |
102 | ||
374ca955 A |
103 | /* from formatVersion 2.3: */ |
104 | _NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET, /* uint16_t offset from canonStartSets[0] to the | |
105 | exclusion set for CJK compatibility characters */ | |
106 | _NORM_SET_INDEX_NX_UNICODE32_OFFSET, /* uint16_t offset from canonStartSets[0] to the | |
107 | exclusion set for Unicode 3.2 characters */ | |
108 | _NORM_SET_INDEX_NX_RESERVED_OFFSET, /* uint16_t offset from canonStartSets[0] to the | |
109 | end of the previous exclusion set */ | |
110 | ||
111 | _NORM_SET_INDEX_TOP=32 /* changing this requires a new formatVersion */ | |
b75a7d8f A |
112 | }; |
113 | ||
114 | /* more constants for canonical starter sets */ | |
115 | ||
116 | /* 14 bit indexes to canonical USerializedSets */ | |
117 | #define _NORM_MAX_CANON_SETS 0x4000 | |
118 | ||
119 | /* single-code point BMP sets are encoded directly in the search table except if result=0x4000..0x7fff */ | |
120 | #define _NORM_CANON_SET_BMP_MASK 0xc000 | |
121 | #define _NORM_CANON_SET_BMP_IS_INDEX 0x4000 | |
122 | ||
123 | /* indexes[] value names */ | |
124 | enum { | |
125 | _NORM_INDEX_TRIE_SIZE, /* number of bytes in normalization trie */ | |
126 | _NORM_INDEX_UCHAR_COUNT, /* number of UChars in extra data */ | |
127 | ||
128 | _NORM_INDEX_COMBINE_DATA_COUNT, /* number of uint16_t words for combining data */ | |
129 | _NORM_INDEX_COMBINE_FWD_COUNT, /* number of code points that combine forward */ | |
130 | _NORM_INDEX_COMBINE_BOTH_COUNT, /* number of code points that combine forward and backward */ | |
131 | _NORM_INDEX_COMBINE_BACK_COUNT, /* number of code points that combine backward */ | |
132 | ||
133 | _NORM_INDEX_MIN_NFC_NO_MAYBE, /* first code point with quick check NFC NO/MAYBE */ | |
134 | _NORM_INDEX_MIN_NFKC_NO_MAYBE, /* first code point with quick check NFKC NO/MAYBE */ | |
135 | _NORM_INDEX_MIN_NFD_NO_MAYBE, /* first code point with quick check NFD NO/MAYBE */ | |
136 | _NORM_INDEX_MIN_NFKD_NO_MAYBE, /* first code point with quick check NFKD NO/MAYBE */ | |
137 | ||
138 | _NORM_INDEX_FCD_TRIE_SIZE, /* number of bytes in FCD trie */ | |
139 | ||
140 | _NORM_INDEX_AUX_TRIE_SIZE, /* number of bytes in the auxiliary trie */ | |
141 | _NORM_INDEX_CANON_SET_COUNT, /* number of uint16_t in the array of serialized USet */ | |
142 | ||
143 | _NORM_INDEX_TOP=32 /* changing this requires a new formatVersion */ | |
144 | }; | |
145 | ||
146 | enum { | |
147 | /* FCD check: everything below this code point is known to have a 0 lead combining class */ | |
148 | _NORM_MIN_WITH_LEAD_CC=0x300 | |
149 | }; | |
150 | ||
151 | enum { | |
152 | /** | |
153 | * Bit 7 of the length byte for a decomposition string in extra data is | |
154 | * a flag indicating whether the decomposition string is | |
155 | * preceded by a 16-bit word with the leading and trailing cc | |
156 | * of the decomposition (like for A-umlaut); | |
157 | * if not, then both cc's are zero (like for compatibility ideographs). | |
158 | */ | |
159 | _NORM_DECOMP_FLAG_LENGTH_HAS_CC=0x80, | |
160 | /** | |
161 | * Bits 6..0 of the length byte contain the actual length. | |
162 | */ | |
163 | _NORM_DECOMP_LENGTH_MASK=0x7f | |
164 | }; | |
165 | ||
166 | #endif /* #if !UCONFIG_NO_NORMALIZATION */ | |
167 | ||
168 | /* Korean Hangul and Jamo constants */ | |
169 | enum { | |
170 | JAMO_L_BASE=0x1100, /* "lead" jamo */ | |
171 | JAMO_V_BASE=0x1161, /* "vowel" jamo */ | |
172 | JAMO_T_BASE=0x11a7, /* "trail" jamo */ | |
173 | ||
174 | HANGUL_BASE=0xac00, | |
175 | ||
176 | JAMO_L_COUNT=19, | |
177 | JAMO_V_COUNT=21, | |
178 | JAMO_T_COUNT=28, | |
179 | ||
180 | HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT | |
181 | }; | |
182 | ||
183 | #if !UCONFIG_NO_NORMALIZATION | |
184 | ||
185 | /* Constants for options flags for normalization. @draft ICU 2.6 */ | |
186 | enum { | |
187 | /** Options bit 0, do not decompose Hangul syllables. @draft ICU 2.6 */ | |
188 | UNORM_NX_HANGUL=1, | |
189 | /** Options bit 1, do not decompose CJK compatibility characters. @draft ICU 2.6 */ | |
374ca955 A |
190 | UNORM_NX_CJK_COMPAT=2, |
191 | /** | |
192 | * Options bit 8, use buggy recomposition described in | |
193 | * Unicode Public Review Issue #29 | |
194 | * at http://www.unicode.org/review/resolved-pri.html#pri29 | |
195 | * | |
196 | * Used in IDNA implementation according to strict interpretation | |
197 | * of IDNA definition based on Unicode 3.2 which predates PRI #29. | |
198 | */ | |
199 | UNORM_BEFORE_PRI_29=0x100 | |
b75a7d8f A |
200 | }; |
201 | ||
202 | /** | |
203 | * Is the normalizer data loaded? | |
204 | * This is used internally before other internal normalizer functions | |
205 | * are called. | |
206 | * It saves this check in each of many normalization calls that | |
207 | * are made for, e.g., collation. | |
208 | * | |
209 | * @param pErrorCode as usual | |
210 | * @return boolean value for whether the normalization data is loaded | |
211 | * | |
212 | * @internal | |
213 | */ | |
214 | U_CAPI UBool U_EXPORT2 | |
215 | unorm_haveData(UErrorCode *pErrorCode); | |
216 | ||
217 | /** | |
218 | * Internal API for normalizing. | |
219 | * Does not check for bad input. | |
220 | * @internal | |
221 | */ | |
222 | U_CAPI int32_t U_EXPORT2 | |
223 | unorm_internalNormalize(UChar *dest, int32_t destCapacity, | |
224 | const UChar *src, int32_t srcLength, | |
225 | UNormalizationMode mode, int32_t options, | |
226 | UErrorCode *pErrorCode); | |
227 | ||
374ca955 A |
228 | #ifdef XP_CPLUSPLUS |
229 | ||
230 | /** | |
231 | * Internal API for normalizing. | |
232 | * Does not check for bad input. | |
233 | * Requires _haveData() to be true. | |
234 | * @internal | |
235 | */ | |
236 | U_CFUNC int32_t | |
237 | unorm_internalNormalizeWithNX(UChar *dest, int32_t destCapacity, | |
238 | const UChar *src, int32_t srcLength, | |
239 | UNormalizationMode mode, int32_t options, const UnicodeSet *nx, | |
240 | UErrorCode *pErrorCode); | |
241 | ||
242 | #endif | |
243 | ||
b75a7d8f A |
244 | /** |
245 | * internal API, used by normlzr.cpp | |
246 | * @internal | |
247 | */ | |
248 | U_CAPI int32_t U_EXPORT2 | |
249 | unorm_decompose(UChar *dest, int32_t destCapacity, | |
250 | const UChar *src, int32_t srcLength, | |
251 | UBool compat, int32_t options, | |
252 | UErrorCode *pErrorCode); | |
253 | ||
254 | /** | |
255 | * internal API, used by normlzr.cpp | |
256 | * @internal | |
257 | */ | |
258 | U_CAPI int32_t U_EXPORT2 | |
259 | unorm_compose(UChar *dest, int32_t destCapacity, | |
260 | const UChar *src, int32_t srcLength, | |
261 | UBool compat, int32_t options, | |
262 | UErrorCode *pErrorCode); | |
263 | ||
374ca955 A |
264 | #ifdef XP_CPLUSPLUS |
265 | ||
266 | /** | |
267 | * internal API, used by unormcmp.cpp | |
268 | * @internal | |
269 | */ | |
270 | U_CFUNC UNormalizationCheckResult | |
271 | unorm_internalQuickCheck(const UChar *src, | |
272 | int32_t srcLength, | |
273 | UNormalizationMode mode, | |
274 | UBool allowMaybe, | |
275 | const UnicodeSet *nx, | |
276 | UErrorCode *pErrorCode); | |
277 | ||
278 | #endif | |
279 | ||
b75a7d8f A |
280 | #endif /* #if !UCONFIG_NO_NORMALIZATION */ |
281 | ||
282 | /** | |
283 | * Internal option for unorm_cmpEquivFold() for decomposing. | |
284 | * If not set, just do strcasecmp(). | |
285 | * @internal | |
286 | */ | |
287 | #define _COMPARE_EQUIV 0x80000 | |
288 | ||
289 | #ifndef U_COMPARE_IGNORE_CASE | |
290 | /* see also unorm.h */ | |
291 | /** | |
292 | * Option bit for unorm_compare: | |
293 | * Perform case-insensitive comparison. | |
294 | * @draft ICU 2.2 | |
295 | */ | |
296 | #define U_COMPARE_IGNORE_CASE 0x10000 | |
297 | #endif | |
298 | ||
299 | /** | |
300 | * Internal option for unorm_cmpEquivFold() for strncmp style. | |
301 | * If set, checks for both string length and terminating NUL. | |
302 | * @internal | |
303 | */ | |
304 | #define _STRNCMP_STYLE 0x1000 | |
305 | ||
374ca955 A |
306 | #if !UCONFIG_NO_NORMALIZATION |
307 | ||
b75a7d8f | 308 | /** |
374ca955 A |
309 | * Internal API to get the 16-bit FCD value (lccc + tccc) for c, |
310 | * for u_getIntPropertyValue(). | |
b75a7d8f A |
311 | * @internal |
312 | */ | |
374ca955 A |
313 | U_CAPI uint16_t U_EXPORT2 |
314 | unorm_getFCD16FromCodePoint(UChar32 c); | |
b75a7d8f A |
315 | |
316 | /** | |
317 | * Internal API, used by collation code. | |
318 | * Get access to the internal FCD trie table to be able to perform | |
319 | * incremental, per-code unit, FCD checks in collation. | |
320 | * One pointer is sufficient because the trie index values are offset | |
321 | * by the index size, so that the same pointer is used to access the trie data. | |
322 | * @internal | |
323 | */ | |
324 | U_CAPI const uint16_t * U_EXPORT2 | |
325 | unorm_getFCDTrie(UErrorCode *pErrorCode); | |
326 | ||
327 | #ifdef XP_CPLUSPLUS | |
328 | ||
329 | U_NAMESPACE_BEGIN | |
330 | /** | |
331 | * Internal API, used by collation code. | |
332 | * Get the FCD value for a code unit, with | |
333 | * bits 15..8 lead combining class | |
334 | * bits 7..0 trail combining class | |
335 | * | |
336 | * If c is a lead surrogate and the value is not 0, | |
337 | * then instead of combining classes the value | |
338 | * is used in unorm_getFCD16FromSurrogatePair() to get the real value | |
339 | * of the supplementary code point. | |
340 | * | |
341 | * @internal | |
342 | */ | |
343 | inline uint16_t | |
344 | unorm_getFCD16(const uint16_t *fcdTrieIndex, UChar c) { | |
345 | return | |
346 | fcdTrieIndex[ | |
347 | (fcdTrieIndex[ | |
348 | c>>UTRIE_SHIFT | |
349 | ]<<UTRIE_INDEX_SHIFT)+ | |
350 | (c&UTRIE_MASK) | |
351 | ]; | |
352 | } | |
353 | ||
354 | /** | |
355 | * Internal API, used by collation code. | |
356 | * Get the FCD value for a supplementary code point, with | |
357 | * bits 15..8 lead combining class | |
358 | * bits 7..0 trail combining class | |
359 | * | |
360 | * @param fcd16 The FCD value for the lead surrogate, not 0. | |
361 | * @param c2 The trail surrogate code unit. | |
362 | * | |
363 | * @internal | |
364 | */ | |
365 | inline uint16_t | |
366 | unorm_getFCD16FromSurrogatePair(const uint16_t *fcdTrieIndex, uint16_t fcd16, UChar c2) { | |
367 | return | |
368 | fcdTrieIndex[ | |
369 | (fcdTrieIndex[ | |
370 | (int32_t)fcd16+((c2&0x3ff)>>UTRIE_SHIFT) | |
371 | ]<<UTRIE_INDEX_SHIFT)+ | |
372 | (c2&UTRIE_MASK) | |
373 | ]; | |
374 | } | |
375 | ||
376 | U_NAMESPACE_END | |
377 | ||
378 | #endif | |
379 | ||
374ca955 A |
380 | /** |
381 | * internal API, used by StringPrep | |
382 | * @internal | |
383 | */ | |
384 | U_CAPI void U_EXPORT2 | |
385 | unorm_getUnicodeVersion(UVersionInfo *versionInfo, UErrorCode *pErrorCode); | |
386 | ||
387 | /** | |
388 | * Get the canonical decomposition for one code point. | |
389 | * Requires unorm_haveData() and buffer!=NULL and pLength!=NULL. | |
390 | * @param c code point | |
391 | * @param buffer out-only buffer for algorithmic decompositions of Hangul | |
392 | * @param pLength out-only, takes the length of the decomposition, if any | |
393 | * @return pointer to decomposition, or 0 if none | |
394 | * @internal | |
395 | */ | |
396 | U_CFUNC const UChar * | |
397 | unorm_getCanonicalDecomposition(UChar32 c, UChar buffer[4], int32_t *pLength); | |
398 | ||
b75a7d8f A |
399 | /** |
400 | * internal API, used by the canonical iterator | |
374ca955 A |
401 | * TODO Consider using signature similar to unorm_getCanonicalDecomposition() |
402 | * for more efficiency | |
b75a7d8f A |
403 | * @internal |
404 | */ | |
405 | U_CAPI int32_t U_EXPORT2 | |
406 | unorm_getDecomposition(UChar32 c, UBool compat, | |
407 | UChar *dest, int32_t destCapacity); | |
408 | ||
409 | /** | |
410 | * internal API, used by uprops.cpp | |
411 | * @internal | |
412 | */ | |
413 | U_CAPI UBool U_EXPORT2 | |
414 | unorm_internalIsFullCompositionExclusion(UChar32 c); | |
415 | ||
416 | /** | |
417 | * Internal API, used by enumeration of canonically equivalent strings | |
418 | * @internal | |
419 | */ | |
420 | U_CAPI UBool U_EXPORT2 | |
421 | unorm_isCanonSafeStart(UChar32 c); | |
422 | ||
423 | /** | |
424 | * Internal API, used by enumeration of canonically equivalent strings | |
425 | * @internal | |
426 | */ | |
427 | U_CAPI UBool U_EXPORT2 | |
428 | unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet); | |
429 | ||
430 | /** | |
431 | * Is c an NF<mode>-skippable code point? See the "NF* Skippable" conditions in the format description at the end of this file. | |
432 | * @internal | |
433 | */ | |
434 | U_CAPI UBool U_EXPORT2 | |
435 | unorm_isNFSkippable(UChar32 c, UNormalizationMode mode); | |
436 | ||
374ca955 A |
437 | #ifdef XP_CPLUSPLUS |
438 | ||
439 | /** | |
440 | * Get normalization exclusion set for the options. | |
441 | * Requires unorm_haveData(). | |
442 | * @internal | |
443 | */ | |
444 | U_CFUNC const UnicodeSet * | |
445 | unorm_getNX(int32_t options, UErrorCode *pErrorCode); | |
446 | ||
447 | #endif | |
448 | ||
b75a7d8f A |
449 | /** |
450 | * Enumerate each normalization data trie and add the | |
451 | * start of each range of same properties to the set. | |
452 | * @internal | |
453 | */ | |
454 | U_CAPI void U_EXPORT2 | |
73c04bcf | 455 | unorm_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode); |
374ca955 A |
456 | |
457 | /** | |
458 | * Swap unorm.icu. See udataswp.h. | |
459 | * @internal | |
460 | */ | |
461 | U_CAPI int32_t U_EXPORT2 | |
462 | unorm_swap(const UDataSwapper *ds, | |
463 | const void *inData, int32_t length, void *outData, | |
464 | UErrorCode *pErrorCode); | |
b75a7d8f A |
465 | |
466 | /** | |
374ca955 A |
467 | * Get the NF*_QC property for a code point, for u_getIntPropertyValue(). |
468 | * @internal | |
469 | */ | |
470 | U_CAPI UNormalizationCheckResult U_EXPORT2 | |
471 | unorm_getQuickCheck(UChar32 c, UNormalizationMode mode); | |
472 | ||
473 | /** | |
474 | * Description of the format of unorm.icu version 2.3. | |
b75a7d8f A |
475 | * |
476 | * Main change from version 1 to version 2: | |
477 | * Use of new, common UTrie instead of normalization-specific tries. | |
478 | * Change to version 2.1: add third/auxiliary trie with associated data. | |
479 | * Change to version 2.2: add skippable (f) flag data (_NORM_AUX_NFC_SKIP_F_MASK). | |
374ca955 A |
480 | * Change to version 2.3: add serialized sets for normalization exclusions |
481 | * stored inside canonStartSets[] | |
b75a7d8f A |
482 | * |
483 | * For more details of how to use the data structures see the code | |
484 | * in unorm.cpp (runtime normalization code) and | |
485 | * in gennorm.c and gennorm/store.c (build-time data generation). | |
486 | * | |
487 | * For the serialized format of UTrie see utrie.c/UTrieHeader. | |
488 | * | |
489 | * - Overall partition | |
490 | * | |
491 | * unorm.icu customarily begins with a UDataInfo structure, see udata.h and udata.c. | |
492 | * After that there are the following structures: | |
493 | * | |
374ca955 | 494 | * int32_t indexes[_NORM_INDEX_TOP]; -- _NORM_INDEX_TOP=32, see enum in this file |
b75a7d8f A |
495 | * |
496 | * UTrie normTrie; -- size in bytes=indexes[_NORM_INDEX_TRIE_SIZE] | |
497 | * | |
498 | * uint16_t extraData[extraDataTop]; -- extraDataTop=indexes[_NORM_INDEX_UCHAR_COUNT] | |
499 | * extraData[0] contains the number of units for | |
500 | * FC_NFKC_Closure (formatVersion>=2.1) | |
501 | * | |
502 | * uint16_t combiningTable[combiningTableTop]; -- combiningTableTop=indexes[_NORM_INDEX_COMBINE_DATA_COUNT] | |
503 | * combiningTableTop may include one 16-bit padding unit | |
504 | * to make sure that fcdTrie is 32-bit-aligned | |
505 | * | |
506 | * UTrie fcdTrie; -- size in bytes=indexes[_NORM_INDEX_FCD_TRIE_SIZE] | |
507 | * | |
508 | * UTrie auxTrie; -- size in bytes=indexes[_NORM_INDEX_AUX_TRIE_SIZE] | |
509 | * | |
510 | * uint16_t canonStartSets[canonStartSetsTop] -- canonStartSetsTop=indexes[_NORM_INDEX_CANON_SET_COUNT] | |
511 | * serialized USets and binary search tables, see below | |
512 | * | |
513 | * | |
514 | * The indexes array contains lengths and sizes of the following arrays and structures | |
515 | * as well as the following values: | |
516 | * indexes[_NORM_INDEX_COMBINE_FWD_COUNT]=combineFwdTop | |
517 | * -- one more than the highest combining index computed for forward-only-combining characters | |
518 | * indexes[_NORM_INDEX_COMBINE_BOTH_COUNT]=combineBothTop-combineFwdTop | |
519 | * -- number of combining indexes computed for both-ways-combining characters | |
520 | * indexes[_NORM_INDEX_COMBINE_BACK_COUNT]=combineBackTop-combineBothTop | |
521 | * -- number of combining indexes computed for backward-only-combining characters | |
522 | * | |
523 | * indexes[_NORM_INDEX_MIN_NF*_NO_MAYBE] (where *={ C, D, KC, KD }) | |
524 | * -- first code point with a quick check NF* value of NO/MAYBE | |
525 | * | |
526 | * | |
527 | * - Tries | |
528 | * | |
529 | * The main structures are two UTrie tables ("compact arrays"), | |
530 | * each with one index array and one data array. | |
531 | * See utrie.h and utrie.c. | |
532 | * | |
533 | * | |
534 | * - Tries in unorm.icu | |
535 | * | |
536 | * The first trie (normTrie above) | |
537 | * provides data for the NF* quick checks and normalization. | |
538 | * The second trie (fcdTrie above) provides data just for FCD checks. | |
539 | * | |
540 | * | |
541 | * - norm32 data words from the first trie | |
542 | * | |
543 | * The norm32Table contains one 32-bit word "norm32" per code point. | |
544 | * It contains the following bit fields: | |
545 | * 31..16 extra data index, _NORM_EXTRA_SHIFT is used to shift this field down | |
546 | * if this index is <_NORM_EXTRA_INDEX_TOP then it is an index into | |
547 | * extraData[] where variable-length normalization data for this | |
548 | * code point is found | |
549 | * if this index is <_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_SURROGATE_TOP | |
550 | * then this is a norm32 for a leading surrogate, and the index | |
551 | * value is used together with the following trailing surrogate | |
552 | * code unit in the second trie access | |
553 | * if this index is >=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_SURROGATE_TOP | |
554 | * then this is a norm32 for a "special" character, | |
555 | * i.e., the character is a Hangul syllable or a Jamo | |
556 | * see _NORM_EXTRA_HANGUL etc. | |
557 | * generally, instead of extracting this index from the norm32 and | |
558 | * comparing it with the above constants, | |
559 | * the normalization code compares the entire norm32 value | |
560 | * with _NORM_MIN_SPECIAL, _NORM_SURROGATES_TOP, _NORM_MIN_HANGUL etc. | |
561 | * | |
562 | * 15..8 combining class (cc) according to UnicodeData.txt | |
563 | * | |
564 | * 7..6 _NORM_COMBINES_ANY flags, used in composition to see if a character | |
565 | * combines with any following or preceding character(s) | |
566 | * at all | |
567 | * 7 _NORM_COMBINES_BACK | |
568 | * 6 _NORM_COMBINES_FWD | |
569 | * | |
570 | * 5..0 quick check flags, set for "no" or "maybe", with separate flags for | |
571 | * each normalization form | |
572 | * the higher bits are "maybe" flags; for NF*D there are no such flags | |
573 | * the lower bits are "no" flags for all forms, in the same order | |
574 | * as the "maybe" flags, | |
575 | * which is (MSB to LSB): NFKD NFD NFKC NFC | |
576 | * 5..4 _NORM_QC_ANY_MAYBE | |
577 | * 3..0 _NORM_QC_ANY_NO | |
578 | * see further related constants | |
579 | * | |
580 | * | |
581 | * - Extra data per code point | |
582 | * | |
583 | * "Extra data" is referenced by the index in norm32. | |
584 | * It is variable-length data. It is only present, and only those parts | |
585 | * of it are, as needed for a given character. | |
586 | * The norm32 extra data index is added to the beginning of extraData[] | |
587 | * to get to a vector of 16-bit words with data at the following offsets: | |
588 | * | |
589 | * [-1] Combining index for composition. | |
590 | * Stored only if norm32&_NORM_COMBINES_ANY . | |
591 | * [0] Lengths of the canonical and compatibility decomposition strings. | |
592 | * Stored only if there are decompositions, i.e., | |
593 | * if norm32&(_NORM_QC_NFD|_NORM_QC_NFKD) | |
594 | * High byte: length of NFKD, or 0 if none | |
595 | * Low byte: length of NFD, or 0 if none | |
596 | * Each length byte also has another flag: | |
597 | * Bit 7 of a length byte is set if there are non-zero | |
598 | * combining classes (cc's) associated with the respective | |
599 | * decomposition. If this flag is set, then the decomposition | |
600 | * is preceded by a 16-bit word that contains the | |
601 | * leading and trailing cc's. | |
602 | * Bits 6..0 of a length byte are the length of the | |
603 | * decomposition string, not counting the cc word. | |
604 | * [1..n] NFD | |
605 | * [n+1..] NFKD | |
606 | * | |
607 | * Each of the two decompositions consists of up to two parts: | |
608 | * - The 16-bit word with the leading and trailing cc's. | |
609 | * This is only stored if bit 7 of the corresponding length byte | |
610 | * is set. In this case, at least one of the cc's is not zero. | |
611 | * High byte: leading cc==cc of the first code point in the decomposition string | |
612 | * Low byte: trailing cc==cc of the last code point in the decomposition string | |
613 | * - The decomposition string in UTF-16, with length code units. | |
614 | * | |
615 | * | |
616 | * - Combining indexes and combiningTable[] | |
617 | * | |
618 | * Combining indexes are stored at the [-1] offset of the extra data | |
619 | * if the character combines forward or backward with any other characters. | |
620 | * They are used for (re)composition in NF*C. | |
621 | * Values of combining indexes are arranged according to whether a character | |
622 | * combines forward, backward, or both ways: | |
623 | * forward-only < both ways < backward-only | |
624 | * | |
625 | * The index values for forward-only and both-ways combining characters | |
626 | * are indexes into the combiningTable[]. | |
627 | * The index values for backward-only combining characters are simply | |
628 | * incremented from the preceding index values to be unique. | |
629 | * | |
630 | * In the combiningTable[], a variable-length list | |
631 | * of variable-length (back-index, code point) pair entries is stored | |
632 | * for each forward-combining character. | |
633 | * | |
634 | * These back-indexes are the combining indexes of both-ways or backward-only | |
635 | * combining characters that the forward-combining character combines with. | |
636 | * | |
637 | * Each list is sorted in ascending order of back-indexes. | |
638 | * Each list is terminated with the last back-index having bit 15 set. | |
639 | * | |
640 | * Each pair (back-index, code point) takes up either 2 or 3 | |
641 | * 16-bit words. | |
642 | * The first word of a list entry is the back-index, with its bit 15 set if | |
643 | * this is the last pair in the list. | |
644 | * | |
645 | * The second word contains flags in bits 15..13 that determine | |
646 | * if there is a third word and how the combined character is encoded: | |
647 | * 15 set if there is a third word in this list entry | |
648 | * 14 set if the result is a supplementary character | |
649 | * 13 set if the result itself combines forward | |
650 | * | |
651 | * According to these bits 15..14 of the second word, | |
652 | * the result character is encoded as follows: | |
653 | * 00 or 01 The result is <=0x1fff and stored in bits 12..0 of | |
654 | * the second word. | |
655 | * 10 The result is 0x2000..0xffff and stored in the third word. | |
656 | * Bits 12..0 of the second word are not used. | |
657 | * 11 The result is a supplementary character. | |
658 | * Bits 9..0 of the leading surrogate are in bits 9..0 of | |
659 | * the second word. | |
660 | * Add 0xd800 to these bits to get the complete surrogate. | |
661 | * Bits 12..10 of the second word are not used. | |
662 | * The trailing surrogate is stored in the third word. | |
663 | * | |
664 | * | |
665 | * - FCD trie | |
666 | * | |
667 | * The FCD trie is very simple. | |
668 | * It is a folded trie with 16-bit data words. | |
669 | * In each word, the high byte contains the leading cc of the character, | |
670 | * and the low byte contains the trailing cc of the character. | |
671 | * These cc's are the cc's of the first and last code points in the | |
672 | * canonical decomposition of the character. | |
673 | * | |
674 | * Since all 16 bits are used for cc's, lead surrogates must be tested | |
675 | * by checking the code unit instead of the trie data. | |
676 | * This is done only if the 16-bit data word is not zero. | |
677 | * If the code unit is a leading surrogate and the data word is not zero, | |
678 | * then instead of cc's it contains the offset for the second trie lookup. | |
679 | * | |
680 | * | |
681 | * - Auxiliary trie and data | |
682 | * | |
683 | * The auxiliary 16-bit trie contains data for additional properties. | |
684 | * Bits | |
685 | * 15..13 reserved | |
686 | * 12 not NFC_Skippable (f) (formatVersion>=2.2) | |
687 | * 11 flag: not a safe starter for canonical closure | |
688 | * 10 composition exclusion | |
689 | * 9.. 0 index into extraData[] to FC_NFKC_Closure string | |
690 | * (not for lead surrogate), | |
691 | * or lead surrogate offset (for lead surrogate, if 9..0 not zero) | |
692 | * | |
693 | * - FC_NFKC_Closure strings in extraData[] | |
694 | * | |
695 | * Strings are either stored as a single code unit or as the length | |
696 | * followed by that many units. | |
697 | * const UChar *s=extraData+(index from auxTrie data bits 9..0); | |
698 | * int32_t length; | |
699 | * if(*s<0xff00) { | |
700 | * // s points to the single-unit string | |
701 | * length=1; | |
702 | * } else { | |
703 | * length=*s&0xff; | |
704 | * ++s; | |
705 | * } | |
706 | * | |
707 | * Conditions for "NF* Skippable" from Mark Davis' com.ibm.text.UCD.NFSkippable: | |
708 | * (used in NormalizerTransliterator) | |
709 | * | |
710 | * A skippable character is | |
711 | * a) unassigned, or ALL of the following: | |
712 | * b) of combining class 0. | |
713 | * c) not decomposed by this normalization form. | |
714 | * AND if NFC or NFKC, | |
715 | * d) can never compose with a previous character. | |
716 | * e) can never compose with a following character. | |
717 | * f) can never change if another character is added. | |
718 | * Example: a-breve might satisfy all but f, but if you | |
719 | * add an ogonek it changes to a-ogonek + breve | |
720 | * | |
721 | * a)..e) must be tested from norm32. | |
722 | * Since f) is more complicated, the (not-)NFC_Skippable flag (f) is built | |
723 | * into the auxiliary trie. | |
724 | * The same bit is used for NFC and NFKC; (c) differs for them. | |
725 | * As usual, we build the "not skippable" flags so that unassigned | |
726 | * code points get a 0 bit. | |
727 | * This bit is only valid after (a)..(e) test FALSE; test NFD_NO before (f) as well. | |
728 | * Test Hangul LV syllables entirely in code. | |
729 | * | |
730 | * | |
731 | * - structure inside canonStartSets[] | |
732 | * | |
733 | * This array maps from code points c to sets of code points (USerializedSet). | |
734 | * The result sets are the code points whose canonical decompositions start | |
735 | * with c. | |
736 | * | |
737 | * canonStartSets[] contains the following sub-arrays: | |
738 | * | |
739 | * indexes[_NORM_SET_INDEX_TOP] | |
740 | * - contains lengths of sub-arrays etc. | |
741 | * | |
742 | * startSets[indexes[_NORM_SET_INDEX_CANON_SETS_LENGTH]-_NORM_SET_INDEX_TOP] | |
743 | * - contains serialized sets (USerializedSet) of canonical starters for | |
744 | * enumerating canonically equivalent strings | |
745 | * indexes[_NORM_SET_INDEX_CANON_SETS_LENGTH] includes _NORM_SET_INDEX_TOP | |
746 | * for details about the structure see uset.c | |
747 | * | |
748 | * bmpTable[indexes[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]] | |
749 | * - a sorted search table for BMP code points whose results are | |
750 | * either indexes to USerializedSets or single code points for | |
751 | * single-code point sets; | |
752 | * each entry is a pair of { code point, result } with result=(binary) yy xxxxxx xxxxxxxx | |
753 | * if yy==01 then there is a USerializedSet at canonStartSets+x | |
754 | * else build a USerializedSet with result as the single code point | |
755 | * | |
756 | * suppTable[indexes[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]] | |
757 | * - a sorted search table for supplementary code points whose results are | |
758 | * either indexes to USerializedSets or single code points for | |
759 | * single-code point sets; | |
760 | * each entry is a triplet of { high16(cp), low16(cp), result } | |
761 | * each code point's high-word may contain extra data in bits 15..5: | |
762 | * if the high word has bit 15 set, then build a set with a single code point | |
763 | * which is ((high16(cp)&0x1f00)<<8)|result; | |
764 | * else there is a USerializedSet at canonStartSets+result | |
374ca955 A |
765 | * |
766 | * FormatVersion 2.3 adds 2 serialized sets for normalization exclusions. | |
767 | * They are stored in the data file so that the runtime normalization code need | |
768 | * not depend on other properties and their data and implementation files. | |
769 | * The _NORM_SET_INDEX_NX_..._OFFSET offsets in the canonStartSets index table | |
770 | * give the location for each set. | |
771 | * There is no set stored for UNORM_NX_HANGUL because it's trivial to create | |
772 | * without using properties. | |
773 | * | |
774 | * Set contents: | |
775 | * | |
776 | * _NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET (for UNORM_NX_CJK_COMPAT) | |
777 | * [[:Ideographic:]&[:NFD_QC=No:]] | |
778 | * =[CJK Ideographs]&[has canonical decomposition] | |
779 | * | |
780 | * _NORM_SET_INDEX_NX_UNICODE32_OFFSET (for UNORM_UNICODE_3_2) | |
781 | * [:^Age=3.2:] | |
782 | * =set with all code points that were not designated by the specified Unicode version | |
783 | * | |
784 | * _NORM_SET_INDEX_NX_RESERVED_OFFSET | |
785 | * This is an offset that points to where the next, future set would start. | |
786 | * Currently it indicates where the previous set ends, and thus its length. | |
787 | * The name for this enum constant may in the future be applied to different | |
788 | * index slots. In order to get the limit of a set, use its index slot and | |
789 | * the immediately following one regardless of that one's enum name. | |
b75a7d8f A |
790 | */ |
791 | ||
792 | #endif /* #if !UCONFIG_NO_NORMALIZATION */ | |
793 | ||
794 | #endif |