1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
6 * Copyright (C) 2009-2014, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 *******************************************************************************
10 * file name: normalizer2impl.h
12 * tab size: 8 (not used)
15 * created on: 2009nov22
16 * created by: Markus W. Scherer
19 #ifndef __NORMALIZER2IMPL_H__
20 #define __NORMALIZER2IMPL_H__
22 #include "unicode/utypes.h"
24 #if !UCONFIG_NO_NORMALIZATION
26 #include "unicode/normalizer2.h"
27 #include "unicode/unistr.h"
28 #include "unicode/unorm.h"
29 #include "unicode/utf16.h"
40 class InitCanonIterData
;
43 class U_COMMON_API Hangul
{
45 /* Korean Hangul and Jamo constants */
47 JAMO_L_BASE
=0x1100, /* "lead" jamo */
49 JAMO_V_BASE
=0x1161, /* "vowel" jamo */
51 JAMO_T_BASE
=0x11a7, /* "trail" jamo */
61 JAMO_VT_COUNT
=JAMO_V_COUNT
*JAMO_T_COUNT
,
63 HANGUL_COUNT
=JAMO_L_COUNT
*JAMO_V_COUNT
*JAMO_T_COUNT
,
64 HANGUL_LIMIT
=HANGUL_BASE
+HANGUL_COUNT
// Range test: true when HANGUL_BASE<=c<HANGUL_LIMIT.
// HANGUL_LIMIT is exclusive (it is defined as HANGUL_BASE+HANGUL_COUNT).
67 static inline UBool
isHangul(UChar32 c
) {
68 return HANGUL_BASE
<=c
&& c
<HANGUL_LIMIT
;
// The visible test treats c as a zero-based index: true when
// 0<=c<HANGUL_COUNT and c is an exact multiple of JAMO_T_COUNT.
// NOTE(review): the line between the signature and this return is not visible
// in this view; presumably it rebases c relative to HANGUL_BASE -- confirm.
71 isHangulLV(UChar32 c
) {
73 return 0<=c
&& c
<HANGUL_COUNT
&& c%JAMO_T_COUNT
==0;
// True when c lies in [JAMO_L_BASE, JAMO_L_BASE+JAMO_L_COUNT).
// The difference is cast to uint32_t so that values below JAMO_L_BASE wrap to
// large numbers; a single unsigned comparison then checks both bounds.
75 static inline UBool
isJamoL(UChar32 c
) {
76 return (uint32_t)(c
-JAMO_L_BASE
)<JAMO_L_COUNT
;
// True when c lies in [JAMO_V_BASE, JAMO_V_BASE+JAMO_V_COUNT).
// Same unsigned-difference idiom as isJamoL(): the uint32_t cast folds the
// lower-bound check into the single comparison.
78 static inline UBool
isJamoV(UChar32 c
) {
79 return (uint32_t)(c
-JAMO_V_BASE
)<JAMO_V_COUNT
;
// True when c is strictly above JAMO_T_BASE and below JAMO_T_BASE+JAMO_T_COUNT.
// The lower bound is exclusive (0<t), so JAMO_T_BASE itself is rejected;
// this is why a signed difference is used here instead of the unsigned idiom
// of isJamoL()/isJamoV().
81 static inline UBool
isJamoT(UChar32 c
) {
82 int32_t t
=c
-JAMO_T_BASE
;
83 return 0<t
&& t
<JAMO_T_COUNT
; // not JAMO_T_BASE itself
// True when c falls into one of three sub-ranges of [JAMO_L_BASE, JAMO_T_END]:
// up to JAMO_L_END, JAMO_V_BASE..JAMO_V_END, or strictly above JAMO_T_BASE.
// The outer bounds are checked first, so the first and last sub-range tests
// only need the one bound that the outer check does not already imply.
85 static UBool
isJamo(UChar32 c
) {
86 return JAMO_L_BASE
<=c
&& c
<=JAMO_T_END
&&
87 (c
<=JAMO_L_END
|| (JAMO_V_BASE
<=c
&& c
<=JAMO_V_END
) || JAMO_T_BASE
<c
);
91 * Decomposes c, which must be a Hangul syllable, into buffer
92 * and returns the length of the decomposition (2 or 3).
// Visible steps: c2 is the remainder of c modulo JAMO_T_COUNT; buffer[0] and
// buffer[1] are computed as JAMO_L_BASE+c/JAMO_V_COUNT and
// JAMO_V_BASE+c%JAMO_V_COUNT; buffer[2] is JAMO_T_BASE+c2.
// NOTE(review): several lines of this body are not visible in this view
// (presumably the rebasing/division of c, the branch that decides whether
// buffer[2] is written, and the return statements) -- confirm against the
// full file before relying on this summary.
94 static inline int32_t decompose(UChar32 c
, UChar buffer
[3]) {
96 UChar32 c2
=c%JAMO_T_COUNT
;
98 buffer
[0]=(UChar
)(JAMO_L_BASE
+c
/JAMO_V_COUNT
);
99 buffer
[1]=(UChar
)(JAMO_V_BASE
+c%JAMO_V_COUNT
);
103 buffer
[2]=(UChar
)(JAMO_T_BASE
+c2
);
109 * Decomposes c, which must be a Hangul syllable, into buffer.
110 * This is the raw, not recursive, decomposition. Its length is always 2.
// Visible steps: two alternative ways of filling buffer[0..1], with
// c2=c%JAMO_T_COUNT. One writes JAMO_L_BASE+c/JAMO_V_COUNT and
// JAMO_V_BASE+c%JAMO_V_COUNT; the other writes orig-c2 (labelled "LV syllable")
// and JAMO_T_BASE+c2.
// NOTE(review): the declaration of orig and the condition that selects between
// the two fills are not visible in this view -- confirm against the full file.
112 static inline void getRawDecomposition(UChar32 c
, UChar buffer
[2]) {
115 UChar32 c2
=c%JAMO_T_COUNT
;
118 buffer
[0]=(UChar
)(JAMO_L_BASE
+c
/JAMO_V_COUNT
);
119 buffer
[1]=(UChar
)(JAMO_V_BASE
+c%JAMO_V_COUNT
);
121 buffer
[0]=orig
-c2
; // LV syllable
122 buffer
[1]=(UChar
)(JAMO_T_BASE
+c2
);
126 Hangul(); // no instantiation
129 class Normalizer2Impl
;
131 class U_COMMON_API ReorderingBuffer
: public UMemory
{
133 /** Constructs only; init() should be called. */
// The visible initializers leave start, reorderStart and limit NULL and set
// remainingCapacity and lastCC to 0, so the object has no usable storage yet.
// NOTE(review): the first initializer line (between the signature and
// start(NULL)) is not visible in this view; presumably it binds ni and dest to
// members -- confirm.
134 ReorderingBuffer(const Normalizer2Impl
&ni
, UnicodeString
&dest
) :
136 start(NULL
), reorderStart(NULL
), limit(NULL
),
137 remainingCapacity(0), lastCC(0) {}
138 /** Constructs, removes the string contents, and initializes for a small initial capacity. */
139 ReorderingBuffer(const Normalizer2Impl
&ni
, UnicodeString
&dest
, UErrorCode
&errorCode
);
140 ~ReorderingBuffer() {
142 str
.releaseBuffer((int32_t)(limit
-start
));
145 UBool
init(int32_t destCapacity
, UErrorCode
&errorCode
);
// True when start==limit, i.e. when length() would return 0.
147 UBool
isEmpty() const { return start
==limit
; }
// Number of UChar units between start and limit (pointer difference,
// narrowed to int32_t).
148 int32_t length() const { return (int32_t)(limit
-start
); }
// Returns the start pointer as a mutable UChar *.
149 UChar
*getStart() { return start
; }
// Returns the limit pointer as a mutable UChar *; length() is limit-start.
150 UChar
*getLimit() { return limit
; }
// Returns the stored lastCC value (initialized to 0 by the constructor).
151 uint8_t getLastCC() const { return lastCC
; }
153 UBool
equals(const UChar
*start
, const UChar
*limit
) const;
154 UBool
equals(const uint8_t *otherStart
, const uint8_t *otherLimit
) const;
156 UBool
append(UChar32 c
, uint8_t cc
, UErrorCode
&errorCode
) {
158 appendBMP((UChar
)c
, cc
, errorCode
) :
159 appendSupplementary(c
, cc
, errorCode
);
161 // s must be in NFD, otherwise change the implementation.
162 UBool
append(const UChar
*s
, int32_t length
,
163 uint8_t leadCC
, uint8_t trailCC
,
164 UErrorCode
&errorCode
);
165 UBool
appendBMP(UChar c
, uint8_t cc
, UErrorCode
&errorCode
) {
166 if(remainingCapacity
==0 && !resize(1, errorCode
)) {
169 if(lastCC
<=cc
|| cc
==0) {
181 UBool
appendZeroCC(UChar32 c
, UErrorCode
&errorCode
);
182 UBool
appendZeroCC(const UChar
*s
, const UChar
*sLimit
, UErrorCode
&errorCode
);
184 void removeSuffix(int32_t suffixLength
);
// Moves limit to newLimit and adjusts remainingCapacity by the distance
// between the old limit and newLimit (capacity grows when newLimit<limit).
// reorderStart is reset to the same position as the new limit.
// remainingCapacity must be updated first because it reads the old limit.
185 void setReorderingLimit(UChar
*newLimit
) {
186 remainingCapacity
+=(int32_t)(limit
-newLimit
);
187 reorderStart
=limit
=newLimit
;
// Sets s to the units in [reorderStart, limit), passing the start pointer and
// the unit count limit-reorderStart to UnicodeString::setTo().
// This object is not modified (the method is const).
190 void copyReorderableSuffixTo(UnicodeString
&s
) const {
191 s
.setTo(ConstChar16Ptr(reorderStart
), (int32_t)(limit
-reorderStart
));
195 * TODO: Revisit whether it makes sense to track reorderStart.
196 * It is set to after the last known character with cc<=1,
197 * which stops previousCC() before it reads that character and looks up its cc.
198 * previousCC() is normally only called from insert().
199 * In other words, reorderStart speeds up the insertion of a combining mark
200 * into a multi-combining mark sequence where it does not belong at the end.
201 * This might not be worth the trouble.
202 * On the other hand, it's not a huge amount of trouble.
204 * We probably need it for UNORM_SIMPLE_APPEND.
207 UBool
appendSupplementary(UChar32 c
, uint8_t cc
, UErrorCode
&errorCode
);
208 void insert(UChar32 c
, uint8_t cc
);
209 static void writeCodePoint(UChar
*p
, UChar32 c
) {
217 UBool
resize(int32_t appendLength
, UErrorCode
&errorCode
);
219 const Normalizer2Impl
&impl
;
221 UChar
*start
, *reorderStart
, *limit
;
222 int32_t remainingCapacity
;
225 // private backward iterator
// Positions the private backward iterator at the end of the buffer by setting
// codePointStart to limit.
226 void setIterator() { codePointStart
=limit
; }
227 void skipPrevious(); // Requires start<codePointStart.
228 uint8_t previousCC(); // Returns 0 if there is no previous character.
230 UChar
*codePointStart
, *codePointLimit
;
234 * Low-level implementation of the Unicode Normalization Algorithm.
235 * For the data structure and details see the documentation at the end of
236 * this normalizer2impl.h and in the design doc at
237 * http://site.icu-project.org/design/normalization/custom
239 class U_COMMON_API Normalizer2Impl
: public UObject
{
// Default constructor: normTrie and fCanonIterData start out NULL and the
// fCanonIterDataInitOnce guard is reset. The other data members are not
// initialized here; init() takes the index, trie, extra-data and smallFCD
// pointers.
241 Normalizer2Impl() : normTrie(NULL
), fCanonIterData(NULL
) {
242 fCanonIterDataInitOnce
.reset();
244 virtual ~Normalizer2Impl();
246 void init(const int32_t *inIndexes
, const UTrie2
*inTrie
,
247 const uint16_t *inExtraData
, const uint8_t *inSmallFCD
);
249 void addLcccChars(UnicodeSet
&set
) const;
250 void addPropertyStarts(const USetAdder
*sa
, UErrorCode
&errorCode
) const;
251 void addCanonIterPropertyStarts(const USetAdder
*sa
, UErrorCode
&errorCode
) const;
253 // low-level properties ------------------------------------------------ ***
255 UBool
ensureCanonIterData(UErrorCode
&errorCode
) const;
// Looks up the 16-bit value stored for code point c in normTrie
// (via the UTRIE2_GET16 macro).
257 uint16_t getNorm16(UChar32 c
) const { return UTRIE2_GET16(normTrie
, c
); }
259 UNormalizationCheckResult
getCompQuickCheck(uint16_t norm16
) const {
260 if(norm16
<minNoNo
|| MIN_YES_YES_WITH_CC
<=norm16
) {
262 } else if(minMaybeYes
<=norm16
) {
// True when limitNoNo<=norm16<minMaybeYes.
268 UBool
isAlgorithmicNoNo(uint16_t norm16
) const { return limitNoNo
<=norm16
&& norm16
<minMaybeYes
; }
// True when minNoNo<=norm16<minMaybeYes. Since isAlgorithmicNoNo() tests
// limitNoNo<=norm16<minMaybeYes, this range also includes the algorithmic
// values as long as minNoNo<=limitNoNo.
269 UBool
isCompNo(uint16_t norm16
) const { return minNoNo
<=norm16
&& norm16
<minMaybeYes
; }
// True when norm16 lies outside [minYesNo, minMaybeYes): either below minYesNo
// or at/above minMaybeYes.
270 UBool
isDecompYes(uint16_t norm16
) const { return norm16
<minYesNo
|| minMaybeYes
<=norm16
; }
272 uint8_t getCC(uint16_t norm16
) const {
273 if(norm16
>=MIN_NORMAL_MAYBE_YES
) {
274 return getCCFromNormalYesOrMaybe(norm16
);
276 if(norm16
<minNoNo
|| limitNoNo
<=norm16
) {
279 return getCCFromNoNo(norm16
);
// Returns norm16 shifted right by OFFSET_SHIFT, truncated to 8 bits.
// No range check is done here; the visible callers (getCC() and
// getCCFromYesOrMaybe()) test norm16>=MIN_NORMAL_MAYBE_YES first.
281 static uint8_t getCCFromNormalYesOrMaybe(uint16_t norm16
) {
282 return (uint8_t)(norm16
>> OFFSET_SHIFT
);
// Returns the value extracted by getCCFromNormalYesOrMaybe() when
// norm16>=MIN_NORMAL_MAYBE_YES, and 0 for every smaller norm16.
284 static uint8_t getCCFromYesOrMaybe(uint16_t norm16
) {
285 return norm16
>=MIN_NORMAL_MAYBE_YES
? getCCFromNormalYesOrMaybe(norm16
) : 0;
// Code point variant of getCCFromYesOrMaybe().
// Fast path: code points below minCompNoMaybeCP return 0 without a trie lookup;
// otherwise the norm16 value is fetched with getNorm16(c) and converted.
287 uint8_t getCCFromYesOrMaybeCP(UChar32 c
) const {
288 if (c
< minCompNoMaybeCP
) { return 0; }
289 return getCCFromYesOrMaybe(getNorm16(c
));
293 * Returns the FCD data for code point c.
294 * @param c A Unicode code point.
295 * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
297 uint16_t getFCD16(UChar32 c
) const {
298 if(c
<minDecompNoCP
) {
300 } else if(c
<=0xffff) {
301 if(!singleLeadMightHaveNonZeroFCD16(c
)) { return 0; }
303 return getFCD16FromNormData(c
);
306 * Returns the FCD data for the next code point (post-increment).
307 * Might skip only a lead surrogate rather than the whole surrogate pair if none of
308 * the supplementary code points associated with the lead surrogate have non-zero FCD data.
309 * @param s A valid pointer into a string. Requires s!=limit.
310 * @param limit The end of the string, or NULL.
311 * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
313 uint16_t nextFCD16(const UChar
*&s
, const UChar
*limit
) const {
315 if(c
<minDecompNoCP
|| !singleLeadMightHaveNonZeroFCD16(c
)) {
319 if(U16_IS_LEAD(c
) && s
!=limit
&& U16_IS_TRAIL(c2
=*s
)) {
320 c
=U16_GET_SUPPLEMENTARY(c
, c2
);
323 return getFCD16FromNormData(c
);
326 * Returns the FCD data for the previous code point (pre-decrement).
327 * @param start The start of the string.
328 * @param s A valid pointer into a string. Requires start<s.
329 * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
331 uint16_t previousFCD16(const UChar
*start
, const UChar
*&s
) const {
333 if(c
<minDecompNoCP
) {
336 if(!U16_IS_TRAIL(c
)) {
337 if(!singleLeadMightHaveNonZeroFCD16(c
)) {
342 if(start
<s
&& U16_IS_LEAD(c2
=*(s
-1))) {
343 c
=U16_GET_SUPPLEMENTARY(c2
, c
);
347 return getFCD16FromNormData(c
);
350 /** Returns TRUE if the single-or-lead code unit c might have non-zero FCD data. */
// smallFCD is indexed by lead>>8, so one byte covers 256 code units; within
// that byte, bit (lead>>5)&7 covers a run of 32 code units.
// A zero byte short-circuits to false before the bit is extracted.
// NOTE(review): assumes 0<=lead<=0xffff so that lead>>8 stays inside the
// 0x100-entry smallFCD array; one line of this body is not visible in this
// view -- confirm.
351 UBool
singleLeadMightHaveNonZeroFCD16(UChar32 lead
) const {
353 uint8_t bits
=smallFCD
[lead
>>8];
354 if(bits
==0) { return false; }
355 return (UBool
)((bits
>>((lead
>>5)&7))&1);
357 /** Returns the FCD value from the regular normalization data. */
358 uint16_t getFCD16FromNormData(UChar32 c
) const;
361 * Gets the decomposition for one code point.
362 * @param c code point
363 * @param buffer out-only buffer for algorithmic decompositions
364 * @param length out-only, takes the length of the decomposition, if any
365 * @return pointer to the decomposition, or NULL if none
367 const UChar
*getDecomposition(UChar32 c
, UChar buffer
[4], int32_t &length
) const;
370 * Gets the raw decomposition for one code point.
371 * @param c code point
372 * @param buffer out-only buffer for algorithmic decompositions
373 * @param length out-only, takes the length of the decomposition, if any
374 * @return pointer to the decomposition, or NULL if none
376 const UChar
*getRawDecomposition(UChar32 c
, UChar buffer
[30], int32_t &length
) const;
378 UChar32
composePair(UChar32 a
, UChar32 b
) const;
380 UBool
isCanonSegmentStarter(UChar32 c
) const;
381 UBool
getCanonStartSet(UChar32 c
, UnicodeSet
&set
) const;
384 // Fixed norm16 values.
385 MIN_YES_YES_WITH_CC
=0xfe02,
387 MIN_NORMAL_MAYBE_YES
=0xfc00,
388 JAMO_L
=2, // offset=1 hasCompBoundaryAfter=FALSE
389 INERT
=1, // offset=0 hasCompBoundaryAfter=TRUE
391 // norm16 bit 0 is comp-boundary-after.
392 HAS_COMP_BOUNDARY_AFTER
=1,
395 // For algorithmic one-way mappings, norm16 bits 2..1 indicate the
396 // tccc (0, 1, >1) for quick FCC boundary-after tests.
407 // Byte offsets from the start of the data, after the generic header.
409 IX_EXTRA_DATA_OFFSET
,
417 // Code point thresholds for quick check codes.
419 IX_MIN_COMP_NO_MAYBE_CP
,
421 // Norm16 value thresholds for quick check combinations and types of extra data.
423 /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */
425 /** Mappings are comp-normalized. */
430 /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */
431 IX_MIN_YES_NO_MAPPINGS_ONLY
,
432 /** Mappings are not comp-normalized but have a comp boundary before. */
433 IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE
,
434 /** Mappings do not have a comp boundary before. */
435 IX_MIN_NO_NO_COMP_NO_MAYBE_CC
,
436 /** Mappings to the empty string. */
445 MAPPING_HAS_CCC_LCCC_WORD
=0x80,
446 MAPPING_HAS_RAW_MAPPING
=0x40,
448 MAPPING_LENGTH_MASK
=0x1f
452 COMP_1_LAST_TUPLE
=0x8000,
454 COMP_1_TRAIL_LIMIT
=0x3400,
455 COMP_1_TRAIL_MASK
=0x7ffe,
456 COMP_1_TRAIL_SHIFT
=9, // 10-1 for the "triple" bit
457 COMP_2_TRAIL_SHIFT
=6,
458 COMP_2_TRAIL_MASK
=0xffc0
461 // higher-level functionality ------------------------------------------ ***
463 // NFD without an NFD Normalizer2 instance.
464 UnicodeString
&decompose(const UnicodeString
&src
, UnicodeString
&dest
,
465 UErrorCode
&errorCode
) const;
467 * Decomposes [src, limit[ and writes the result to dest.
468 * limit can be NULL if src is NUL-terminated.
469 * destLengthEstimate is the initial dest buffer capacity and can be -1.
471 void decompose(const UChar
*src
, const UChar
*limit
,
472 UnicodeString
&dest
, int32_t destLengthEstimate
,
473 UErrorCode
&errorCode
) const;
475 const UChar
*decompose(const UChar
*src
, const UChar
*limit
,
476 ReorderingBuffer
*buffer
, UErrorCode
&errorCode
) const;
477 void decomposeAndAppend(const UChar
*src
, const UChar
*limit
,
479 UnicodeString
&safeMiddle
,
480 ReorderingBuffer
&buffer
,
481 UErrorCode
&errorCode
) const;
482 UBool
compose(const UChar
*src
, const UChar
*limit
,
483 UBool onlyContiguous
,
485 ReorderingBuffer
&buffer
,
486 UErrorCode
&errorCode
) const;
487 const UChar
*composeQuickCheck(const UChar
*src
, const UChar
*limit
,
488 UBool onlyContiguous
,
489 UNormalizationCheckResult
*pQCResult
) const;
490 void composeAndAppend(const UChar
*src
, const UChar
*limit
,
492 UBool onlyContiguous
,
493 UnicodeString
&safeMiddle
,
494 ReorderingBuffer
&buffer
,
495 UErrorCode
&errorCode
) const;
497 /** sink==nullptr: isNormalized() */
498 UBool
composeUTF8(uint32_t options
, UBool onlyContiguous
,
499 const uint8_t *src
, const uint8_t *limit
,
500 ByteSink
*sink
, icu::Edits
*edits
, UErrorCode
&errorCode
) const;
502 const UChar
*makeFCD(const UChar
*src
, const UChar
*limit
,
503 ReorderingBuffer
*buffer
, UErrorCode
&errorCode
) const;
504 void makeFCDAndAppend(const UChar
*src
, const UChar
*limit
,
506 UnicodeString
&safeMiddle
,
507 ReorderingBuffer
&buffer
,
508 UErrorCode
&errorCode
) const;
510 UBool
hasDecompBoundaryBefore(UChar32 c
) const;
511 UBool
norm16HasDecompBoundaryBefore(uint16_t norm16
) const;
512 UBool
hasDecompBoundaryAfter(UChar32 c
) const;
513 UBool
norm16HasDecompBoundaryAfter(uint16_t norm16
) const;
// Code point wrapper: fetches norm16 for c and returns
// isDecompYesAndZeroCC(norm16).
514 UBool
isDecompInert(UChar32 c
) const { return isDecompYesAndZeroCC(getNorm16(c
)); }
// True when c<minCompNoMaybeCP (answered without a trie lookup), or otherwise
// when norm16HasCompBoundaryBefore() accepts the norm16 value of c.
516 UBool
hasCompBoundaryBefore(UChar32 c
) const {
517 return c
<minCompNoMaybeCP
|| norm16HasCompBoundaryBefore(getNorm16(c
));
// Code point wrapper: fetches norm16 for c and forwards it, together with
// onlyContiguous, to norm16HasCompBoundaryAfter().
519 UBool
hasCompBoundaryAfter(UChar32 c
, UBool onlyContiguous
) const {
520 return norm16HasCompBoundaryAfter(getNorm16(c
), onlyContiguous
);
// True when all three conditions hold for the norm16 value of c:
//  1. isCompYesAndZeroCC(norm16);
//  2. the HAS_COMP_BOUNDARY_AFTER bit is set;
//  3. if onlyContiguous is set: norm16 is INERT, or the first unit of its
//     mapping is <=0x1ff (same test as in isTrailCC01ForCompBoundaryAfter(),
//     whose doc comment describes it as tccc<=1).
// The short-circuit order matters: getMapping() is only dereferenced after the
// cheaper checks have passed.
522 UBool
isCompInert(UChar32 c
, UBool onlyContiguous
) const {
523 uint16_t norm16
=getNorm16(c
);
524 return isCompYesAndZeroCC(norm16
) &&
525 (norm16
& HAS_COMP_BOUNDARY_AFTER
) != 0 &&
526 (!onlyContiguous
|| isInert(norm16
) || *getMapping(norm16
) <= 0x1ff);
// FCD uses the decomposition boundary: simply forwards to
// hasDecompBoundaryBefore(c).
529 UBool
hasFCDBoundaryBefore(UChar32 c
) const { return hasDecompBoundaryBefore(c
); }
// FCD uses the decomposition boundary: simply forwards to
// hasDecompBoundaryAfter(c).
530 UBool
hasFCDBoundaryAfter(UChar32 c
) const { return hasDecompBoundaryAfter(c
); }
// True when getFCD16(c)<=1. getFCD16() is documented as returning lccc in
// bits 15..8 and tccc in bits 7..0, so this accepts only lccc==0 with tccc<=1.
531 UBool
isFCDInert(UChar32 c
) const { return getFCD16(c
)<=1; }
533 friend class InitCanonIterData
;
534 friend class LcccContext
;
// True when minMaybeYes<=norm16<=JAMO_VT (both bounds inclusive).
536 UBool
isMaybe(uint16_t norm16
) const { return minMaybeYes
<=norm16
&& norm16
<=JAMO_VT
; }
// True for every norm16 at or above minMaybeYes. Unlike isMaybe(), there is no
// upper bound, so values above JAMO_VT are accepted as well.
537 UBool
isMaybeOrNonZeroCC(uint16_t norm16
) const { return norm16
>=minMaybeYes
; }
// Exact match against the fixed INERT norm16 value.
538 static UBool
isInert(uint16_t norm16
) { return norm16
==INERT
; }
// Exact match against the fixed JAMO_L norm16 value. This tests a norm16
// value, not a code point (compare Hangul::isJamoL(UChar32)).
539 static UBool
isJamoL(uint16_t norm16
) { return norm16
==JAMO_L
; }
// Exact match against the fixed JAMO_VT norm16 value.
540 static UBool
isJamoVT(uint16_t norm16
) { return norm16
==JAMO_VT
; }
// The norm16 value that isHangulLVT() compares against:
// minYesNoMappingsOnly with the HAS_COMP_BOUNDARY_AFTER bit set.
541 uint16_t hangulLVT() const { return minYesNoMappingsOnly
|HAS_COMP_BOUNDARY_AFTER
; }
// Exact match against minYesNo, the norm16 value compared here for Hangul LV.
542 UBool
isHangulLV(uint16_t norm16
) const { return norm16
==minYesNo
; }
// Exact match against the value computed by hangulLVT().
543 UBool
isHangulLVT(uint16_t norm16
) const {
544 return norm16
==hangulLVT();
// Single threshold test: true for every norm16 below minNoNo.
546 UBool
isCompYesAndZeroCC(uint16_t norm16
) const { return norm16
<minNoNo
; }
547 // UBool isCompYes(uint16_t norm16) const {
548 // return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;
550 // UBool isCompYesOrMaybe(uint16_t norm16) const {
551 // return norm16<minNoNo || minMaybeYes<=norm16;
553 // UBool hasZeroCCFromDecompYes(uint16_t norm16) const {
554 // return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
// Visible alternatives: norm16<minYesNo, or
// minMaybeYes<=norm16<=MIN_NORMAL_MAYBE_YES.
// NOTE(review): one line between these two alternatives is not visible in this
// view; presumably it adds a further case (isMostDecompYesAndZeroCC() also
// accepts norm16==JAMO_VT) -- confirm against the full file.
556 UBool
isDecompYesAndZeroCC(uint16_t norm16
) const {
557 return norm16
<minYesNo
||
559 (minMaybeYes
<=norm16
&& norm16
<=MIN_NORMAL_MAYBE_YES
);
562 * A little faster and simpler than isDecompYesAndZeroCC() but does not include
563 * the MaybeYes which combine-forward and have ccc=0.
564 * (Standard Unicode 10 normalization does not have such characters.)
// Three accepted cases: norm16 below minYesNo, exactly MIN_NORMAL_MAYBE_YES,
// or exactly JAMO_VT. Values in [minMaybeYes, MIN_NORMAL_MAYBE_YES) are not
// accepted here, whereas isDecompYesAndZeroCC() does accept them.
566 UBool
isMostDecompYesAndZeroCC(uint16_t norm16
) const {
567 return norm16
<minYesNo
|| norm16
==MIN_NORMAL_MAYBE_YES
|| norm16
==JAMO_VT
;
// True for every norm16 at or above limitNoNo.
// Unlike isAlgorithmicNoNo(), there is no norm16<minMaybeYes upper bound here;
// presumably callers have already excluded values >=minMaybeYes -- confirm.
569 UBool
isDecompNoAlgorithmic(uint16_t norm16
) const { return norm16
>=limitNoNo
; }
571 // For use with isCompYes().
572 // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
573 // static uint8_t getCCFromYes(uint16_t norm16) {
574 // return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0;
576 uint8_t getCCFromNoNo(uint16_t norm16
) const {
577 const uint16_t *mapping
=getMapping(norm16
);
578 if(*mapping
&MAPPING_HAS_CCC_LCCC_WORD
) {
579 return (uint8_t)*(mapping
-1);
584 // Requires that norm16 passes isCompYesAndZeroCC().
// Values up to and including minYesNo return 0 directly; anything larger is
// treated as having a mapping, and the result is the high byte of the
// mapping's first unit.
585 uint8_t getTrailCCFromCompYesAndZeroCC(uint16_t norm16
) const {
586 if(norm16
<=minYesNo
) {
587 return 0; // yesYes and Hangul LV have ccc=tccc=0
589 // For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here.
590 return (uint8_t)(*getMapping(norm16
)>>8); // tccc from yesNo
593 uint8_t getPreviousTrailCC(const UChar
*start
, const UChar
*p
) const;
594 uint8_t getPreviousTrailCC(const uint8_t *start
, const uint8_t *p
) const;
596 // Requires algorithmic-NoNo.
// Returns c plus a signed delta. The delta is stored biased in norm16:
// (norm16>>DELTA_SHIFT) is the biased value and centerNoNoDelta is subtracted
// to recover it (same computation as getAlgorithmicDelta()).
597 UChar32
mapAlgorithmic(UChar32 c
, uint16_t norm16
) const {
598 return c
+(norm16
>>DELTA_SHIFT
)-centerNoNoDelta
;
// Returns the delta that mapAlgorithmic() adds to a code point:
// (norm16>>DELTA_SHIFT) minus the centerNoNoDelta bias.
// The result can be negative when the shifted value is below centerNoNoDelta.
600 UChar32
getAlgorithmicDelta(uint16_t norm16
) const {
601 return (norm16
>>DELTA_SHIFT
)-centerNoNoDelta
;
604 // Requires minYesNo<norm16<limitNoNo.
// Returns a pointer into extraData at index norm16>>OFFSET_SHIFT.
// No bounds check is performed.
// NOTE(review): getCompositionsListForDecompYes() also calls this for values
// below minYesNo (its comment says "for yesYes"), so the stated lower bound
// looks narrower than actual usage -- confirm the intended precondition.
605 const uint16_t *getMapping(uint16_t norm16
) const { return extraData
+(norm16
>>OFFSET_SHIFT
); }
606 const uint16_t *getCompositionsListForDecompYes(uint16_t norm16
) const {
607 if(norm16
<JAMO_L
|| MIN_NORMAL_MAYBE_YES
<=norm16
) {
609 } else if(norm16
<minMaybeYes
) {
610 return getMapping(norm16
); // for yesYes; if Jamo L: harmless empty list
612 return maybeYesCompositions
+norm16
-minMaybeYes
;
// Returns the pointer just past the mapping of norm16: starting from
// getMapping(norm16), it skips the first unit and then as many units as the
// MAPPING_LENGTH_MASK bits of that first unit indicate.
615 const uint16_t *getCompositionsListForComposite(uint16_t norm16
) const {
616 // A composite has both mapping & compositions list.
617 const uint16_t *list
=getMapping(norm16
);
618 return list
+ // mapping pointer
619 1+ // +1 to skip the first unit with the mapping length
620 (*list
&MAPPING_LENGTH_MASK
); // + mapping length
// Returns a pointer into maybeYesCompositions. The index is the distance of
// norm16 above minMaybeYes, shifted right by OFFSET_SHIFT.
// No range check is performed; the comment below states the expected range.
622 const uint16_t *getCompositionsListForMaybe(uint16_t norm16
) const {
623 // minMaybeYes<=norm16<MIN_NORMAL_MAYBE_YES
624 return maybeYesCompositions
+((norm16
-minMaybeYes
)>>OFFSET_SHIFT
);
627 * @param norm16 the norm16 value of a code point that must have compositions
628 * @return compositions list pointer
// Dispatches on isDecompYes(norm16): DecompYes values use
// getCompositionsListForDecompYes(); all others are treated as composites and
// use getCompositionsListForComposite(), which skips past the mapping.
630 const uint16_t *getCompositionsList(uint16_t norm16
) const {
631 return isDecompYes(norm16
) ?
632 getCompositionsListForDecompYes(norm16
) :
633 getCompositionsListForComposite(norm16
);
636 const UChar
*copyLowPrefixFromNulTerminated(const UChar
*src
,
637 UChar32 minNeedDataCP
,
638 ReorderingBuffer
*buffer
,
639 UErrorCode
&errorCode
) const;
640 const UChar
*decomposeShort(const UChar
*src
, const UChar
*limit
,
641 UBool stopAtCompBoundary
, UBool onlyContiguous
,
642 ReorderingBuffer
&buffer
, UErrorCode
&errorCode
) const;
643 UBool
decompose(UChar32 c
, uint16_t norm16
,
644 ReorderingBuffer
&buffer
, UErrorCode
&errorCode
) const;
646 const uint8_t *decomposeShort(const uint8_t *src
, const uint8_t *limit
,
647 UBool stopAtCompBoundary
, UBool onlyContiguous
,
648 ReorderingBuffer
&buffer
, UErrorCode
&errorCode
) const;
650 static int32_t combine(const uint16_t *list
, UChar32 trail
);
651 void addComposites(const uint16_t *list
, UnicodeSet
&set
) const;
652 void recompose(ReorderingBuffer
&buffer
, int32_t recomposeStartIndex
,
653 UBool onlyContiguous
) const;
// Same test as hasCompBoundaryBefore(UChar32), for callers that already hold
// the norm16 value of c: true when c<minCompNoMaybeCP or when
// norm16HasCompBoundaryBefore(norm16) is true. No trie lookup is done here.
655 UBool
hasCompBoundaryBefore(UChar32 c
, uint16_t norm16
) const {
656 return c
<minCompNoMaybeCP
|| norm16HasCompBoundaryBefore(norm16
);
// True when norm16 is below minNoNoCompNoMaybeCC, or when it is in the
// algorithmic range tested by isAlgorithmicNoNo()
// (limitNoNo<=norm16<minMaybeYes).
658 UBool
norm16HasCompBoundaryBefore(uint16_t norm16
) const {
659 return norm16
< minNoNoCompNoMaybeCC
|| isAlgorithmicNoNo(norm16
);
661 UBool
hasCompBoundaryBefore(const UChar
*src
, const UChar
*limit
) const;
662 UBool
hasCompBoundaryBefore(const uint8_t *src
, const uint8_t *limit
) const;
663 UBool
hasCompBoundaryAfter(const UChar
*start
, const UChar
*p
,
664 UBool onlyContiguous
) const;
665 UBool
hasCompBoundaryAfter(const uint8_t *start
, const uint8_t *p
,
666 UBool onlyContiguous
) const;
// True when the HAS_COMP_BOUNDARY_AFTER bit of norm16 is set and, if
// onlyContiguous is set, isTrailCC01ForCompBoundaryAfter(norm16) also holds.
// The bit test comes first, so the trailing-cc check only runs on values that
// already carry the bit (its stated precondition).
667 UBool
norm16HasCompBoundaryAfter(uint16_t norm16
, UBool onlyContiguous
) const {
668 return (norm16
& HAS_COMP_BOUNDARY_AFTER
) != 0 &&
669 (!onlyContiguous
|| isTrailCC01ForCompBoundaryAfter(norm16
));
671 /** For FCC: Given norm16 HAS_COMP_BOUNDARY_AFTER, does it have tccc<=1? */
// Three cases:
//  - INERT: true immediately;
//  - norm16>=limitNoNo (isDecompNoAlgorithmic): the DELTA_TCCC_MASK bits of
//    norm16 itself are compared against DELTA_TCCC_1;
//  - otherwise: the first unit of the mapping is compared against 0x1ff.
672 UBool
isTrailCC01ForCompBoundaryAfter(uint16_t norm16
) const {
673 return isInert(norm16
) || (isDecompNoAlgorithmic(norm16
) ?
674 (norm16
& DELTA_TCCC_MASK
) <= DELTA_TCCC_1
: *getMapping(norm16
) <= 0x1ff);
677 const UChar
*findPreviousCompBoundary(const UChar
*start
, const UChar
*p
, UBool onlyContiguous
) const;
678 const UChar
*findNextCompBoundary(const UChar
*p
, const UChar
*limit
, UBool onlyContiguous
) const;
680 const UChar
*findPreviousFCDBoundary(const UChar
*start
, const UChar
*p
) const;
681 const UChar
*findNextFCDBoundary(const UChar
*p
, const UChar
*limit
) const;
683 void makeCanonIterDataFromNorm16(UChar32 start
, UChar32 end
, const uint16_t norm16
,
684 CanonIterData
&newData
, UErrorCode
&errorCode
) const;
686 int32_t getCanonValue(UChar32 c
) const;
687 const UnicodeSet
&getCanonStartSet(int32_t n
) const;
689 // UVersionInfo dataVersion;
691 // BMP code point thresholds for quick check loops looking at single UTF-16 code units.
693 UChar minCompNoMaybeCP
;
696 // Norm16 value thresholds for quick check combinations and types of extra data.
698 uint16_t minYesNoMappingsOnly
;
700 uint16_t minNoNoCompBoundaryBefore
;
701 uint16_t minNoNoCompNoMaybeCC
;
702 uint16_t minNoNoEmpty
;
704 uint16_t centerNoNoDelta
;
705 uint16_t minMaybeYes
;
707 const UTrie2
*normTrie
;
708 const uint16_t *maybeYesCompositions
;
709 const uint16_t *extraData
; // mappings and/or compositions for yesYes, yesNo & noNo characters
710 const uint8_t *smallFCD
; // [0x100] one bit per 32 BMP code points, set if any FCD!=0
712 UInitOnce fCanonIterDataInitOnce
;
713 CanonIterData
*fCanonIterData
;
716 // bits in canonIterData
717 #define CANON_NOT_SEGMENT_STARTER 0x80000000
718 #define CANON_HAS_COMPOSITIONS 0x40000000
719 #define CANON_HAS_SET 0x200000
720 #define CANON_VALUE_MASK 0x1fffff
723 * ICU-internal shortcut for quick access to standard Unicode normalization.
725 class U_COMMON_API Normalizer2Factory
{
727 static const Normalizer2
*getFCDInstance(UErrorCode
&errorCode
);
728 static const Normalizer2
*getFCCInstance(UErrorCode
&errorCode
);
729 static const Normalizer2
*getNoopInstance(UErrorCode
&errorCode
);
731 static const Normalizer2
*getInstance(UNormalizationMode mode
, UErrorCode
&errorCode
);
733 static const Normalizer2Impl
*getNFCImpl(UErrorCode
&errorCode
);
734 static const Normalizer2Impl
*getNFKCImpl(UErrorCode
&errorCode
);
735 static const Normalizer2Impl
*getNFKC_CFImpl(UErrorCode
&errorCode
);
737 // Get the Impl instance of the Normalizer2.
738 // Must be used only when it is known that norm2 is a Normalizer2WithImpl instance.
739 static const Normalizer2Impl
*getImpl(const Normalizer2
*norm2
);
741 Normalizer2Factory(); // No instantiation.
746 U_CAPI
int32_t U_EXPORT2
747 unorm2_swap(const UDataSwapper
*ds
,
748 const void *inData
, int32_t length
, void *outData
,
749 UErrorCode
*pErrorCode
);
752 * Get the NF*_QC property for a code point, for u_getIntPropertyValue().
755 U_CFUNC UNormalizationCheckResult
756 unorm_getQuickCheck(UChar32 c
, UNormalizationMode mode
);
759 * Gets the 16-bit FCD value (lead & trail CCs) for a code point, for u_getIntPropertyValue().
763 unorm_getFCD16(UChar32 c
);
766 * Format of Normalizer2 .nrm data files.
767 * Format version 3.0.
769 * Normalizer2 .nrm data files provide data for the Unicode Normalization algorithms.
770 * ICU ships with data files for standard Unicode Normalization Forms
771 * NFC and NFD (nfc.nrm), NFKC and NFKD (nfkc.nrm) and NFKC_Casefold (nfkc_cf.nrm).
772 * Custom (application-specific) data can be built into additional .nrm files
773 * with the gennorm2 build tool.
774 * ICU ships with one such file, uts46.nrm, for the implementation of UTS #46.
776 * Normalizer2.getInstance() causes a .nrm file to be loaded, unless it has been
777 * cached already. Internally, Normalizer2Impl.load() reads the .nrm file.
779 * A .nrm file begins with a standard ICU data file header
780 * (DataHeader, see ucmndata.h and unicode/udata.h).
781 * The UDataInfo.dataVersion field usually contains the Unicode version
782 * for which the data was generated.
784 * After the header, the file contains the following parts.
785 * Constants are defined as enum values of the Normalizer2Impl class.
787 * Many details of the data structures are described in the design doc
788 * which is at http://site.icu-project.org/design/normalization/custom
790 * int32_t indexes[indexesLength]; -- indexesLength=indexes[IX_NORM_TRIE_OFFSET]/4;
792 * The first eight indexes are byte offsets in ascending order.
793 * Each byte offset marks the start of the next part in the data file,
794 * and the end of the previous one.
795 * When two consecutive byte offsets are the same, then the corresponding part is empty.
796 * Byte offsets are offsets from after the header,
797 * that is, from the beginning of the indexes[].
798 * Each part starts at an offset with proper alignment for its data.
799 * If necessary, the previous part may include padding bytes to achieve this alignment.
801 * minDecompNoCP=indexes[IX_MIN_DECOMP_NO_CP] is the lowest code point
802 * with a decomposition mapping, that is, with NF*D_QC=No.
803 * minCompNoMaybeCP=indexes[IX_MIN_COMP_NO_MAYBE_CP] is the lowest code point
804 * with NF*C_QC=No (has a one-way mapping) or Maybe (combines backward).
805 * minLcccCP=indexes[IX_MIN_LCCC_CP] (index 18, new in formatVersion 3)
806 * is the lowest code point with lccc!=0.
808 * The next eight indexes are thresholds of 16-bit trie values for ranges of
809 * values indicating multiple normalization properties.
810 * They are listed here in threshold order, not in the order they are stored in the indexes.
811 * minYesNo=indexes[IX_MIN_YES_NO];
812 * minYesNoMappingsOnly=indexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
813 * minNoNo=indexes[IX_MIN_NO_NO];
814 * minNoNoCompBoundaryBefore=indexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE];
815 * minNoNoCompNoMaybeCC=indexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC];
816 * minNoNoEmpty=indexes[IX_MIN_NO_NO_EMPTY];
817 * limitNoNo=indexes[IX_LIMIT_NO_NO];
818 * minMaybeYes=indexes[IX_MIN_MAYBE_YES];
819 * See the normTrie description below and the design doc for details.
821 * UTrie2 normTrie; -- see utrie2_impl.h and utrie2.h
823 * The trie holds the main normalization data. Each code point is mapped to a 16-bit value.
824 * Rather than using independent bits in the value (which would require more than 16 bits),
825 * information is extracted primarily via range checks.
826 * Except, format version 3 uses bit 0 for hasCompBoundaryAfter().
827 * For example, a 16-bit value norm16 in the range minYesNo<=norm16<minNoNo
828 * means that the character has NF*C_QC=Yes and NF*D_QC=No properties,
829 * which means it has a two-way (round-trip) decomposition mapping.
830 * Values in the range 2<=norm16<limitNoNo are also directly indexes into the extraData
831 * pointing to mappings, compositions lists, or both.
832 * Value norm16==INERT (0 in versions 1 & 2, 1 in version 3)
833 * means that the character is normalization-inert, that is,
834 * it does not have a mapping, does not participate in composition, has a zero
835 * canonical combining class, and forms a boundary where text before it and after it
836 * can be normalized independently.
837 * For details about how multiple properties are encoded in 16-bit values
838 * see the design doc.
839 * Note that the encoding cannot express all combinations of the properties involved;
840 * it only supports those combinations that are allowed by
841 * the Unicode Normalization algorithms. Details are in the design doc as well.
842 * The gennorm2 tool only builds .nrm files for data that conforms to the limitations.
844 * The trie has a value for each lead surrogate code unit representing the "worst case"
845 * properties of the 1024 supplementary characters whose UTF-16 form starts with
846 * the lead surrogate. If all of the 1024 supplementary characters are normalization-inert,
847 * then their lead surrogate code unit has the trie value INERT.
848 * When the lead surrogate unit's value exceeds the quick check minimum during processing,
849 * the properties for the full supplementary code point need to be looked up.
851 * uint16_t maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes];
852 * uint16_t extraData[];
854 * There is only one byte offset for the end of these two arrays.
855 * The split between them is given by the constant and variable mentioned above.
856 * In version 3, the difference must be shifted right by OFFSET_SHIFT.
858 * The maybeYesCompositions array contains compositions lists for characters that
859 * combine both forward (as starters in composition pairs)
860 * and backward (as trailing characters in composition pairs).
861 * Such characters do not occur in Unicode 5.2 but are allowed by
862 * the Unicode Normalization algorithms.
863 * If there are no such characters, then minMaybeYes==MIN_NORMAL_MAYBE_YES
864 * and the maybeYesCompositions array is empty.
865 * If there are such characters, then minMaybeYes is subtracted from their norm16 values
866 * to get the index into this array.
868 * The extraData array contains compositions lists for "YesYes" characters,
869 * followed by mappings and optional compositions lists for "YesNo" characters,
870 * followed by only mappings for "NoNo" characters.
871 * (Referring to pairs of NFC/NFD quick check values.)
872 * The norm16 values of those characters are directly indexes into the extraData array.
873 * In version 3, the norm16 values must be shifted right by OFFSET_SHIFT
874 * for accessing extraData.
876 * The data structures for compositions lists and mappings are described in the design doc.
878 * uint8_t smallFCD[0x100]; -- new in format version 2
880 * This is a bit set to help speed up FCD value lookups in the absence of a full
881 * UTrie2 or other large data structure with the full FCD value mapping.
883 * Each smallFCD bit is set if any of the corresponding 32 BMP code points
884 * has a non-zero FCD value (lccc!=0 or tccc!=0).
885 * Bit 0 of smallFCD[0] is for U+0000..U+001F. Bit 7 of smallFCD[0xff] is for U+FFE0..U+FFFF.
886 * A bit for 32 lead surrogates is set if any of the 32k corresponding
887 * _supplementary_ code points has a non-zero FCD value.
889 * This bit set is most useful for the large blocks of CJK characters with FCD=0.
891 * Changes from format version 1 to format version 2 ---------------------------
893 * - Addition of data for raw (not recursively decomposed) mappings.
894 * + The MAPPING_NO_COMP_BOUNDARY_AFTER bit in the extraData is now also set when
895 * the mapping is to an empty string or when the character combines-forward.
896 * This subsumes the one actual use of the MAPPING_PLUS_COMPOSITION_LIST bit which
897 * is then repurposed for the MAPPING_HAS_RAW_MAPPING bit.
898 * + For details see the design doc.
899 * - Addition of indexes[IX_MIN_YES_NO_MAPPINGS_ONLY] and separation of the yesNo extraData into
900 * distinct ranges (combines-forward vs. not)
901 * so that a range check can be used to find out if there is a compositions list.
902 * This is fully equivalent with formatVersion 1's MAPPING_PLUS_COMPOSITION_LIST flag.
903 * It is needed for the new (in ICU 49) composePair(), not for other normalization.
904 * - Addition of the smallFCD[] bit set.
906 * Changes from format version 2 to format version 3 (ICU 60) ------------------
908 * - norm16 bit 0 indicates hasCompBoundaryAfter(),
909 * except that for contiguous composition (FCC) the tccc must be checked as well.
910 * Data indexes and ccc values are shifted left by one (OFFSET_SHIFT).
911 * Thresholds like minNoNo are tested before shifting.
913 * - Algorithmic mapping deltas are shifted left by two more bits (total DELTA_SHIFT),
914 * to make room for two bits (three values) indicating whether the tccc is 0, 1, or greater.
915 * See DELTA_TCCC_MASK etc.
916 * This helps with fetching tccc/FCD values and FCC hasCompBoundaryAfter().
917 * minMaybeYes is 8-aligned so that the DELTA_TCCC_MASK bits can be tested directly.
919 * - Algorithmic mappings are only used for mapping to "comp yes and ccc=0" characters,
920 * and ASCII characters are mapped algorithmically only to other ASCII characters.
921 * This helps with hasCompBoundaryBefore() and compose() fast paths.
922 * It is never necessary any more to loop for algorithmic mappings.
924 * - Addition of indexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE],
925 * indexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC], and indexes[IX_MIN_NO_NO_EMPTY],
926 * and separation of the noNo extraData into distinct ranges.
927 * With this, the noNo norm16 value indicates whether the mapping is
928 * compose-normalized, not normalized but hasCompBoundaryBefore(),
929 * not even that, or maps to an empty string.
930 * hasCompBoundaryBefore() can be determined solely from the norm16 value.
932 * - The norm16 value for Hangul LVT is now different from that for Hangul LV,
933 * so that hasCompBoundaryAfter() need not check for the syllable type.
934 * For Hangul LV, minYesNo continues to be used (no comp-boundary-after).
935 * For Hangul LVT, minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER is used.
936 * The extraData units at these indexes are set to firstUnit=2 and firstUnit=3, respectively,
937 * to simplify some code.
939 * - The extraData firstUnit bit 5 is no longer necessary
940 * (norm16 bit 0 used instead of firstUnit MAPPING_NO_COMP_BOUNDARY_AFTER),
941 * is reserved again, and always set to 0.
943 * - Addition of indexes[IX_MIN_LCCC_CP], the first code point where lccc!=0.
944 * This used to be hardcoded to U+0300, but in data like NFKC_Casefold it is lower:
945 * U+00AD Soft Hyphen maps to an empty string,
946 * which is artificially assigned "worst case" values lccc=1 and tccc=255.
948 * - A mapping to an empty string has explicit lccc=1 and tccc=255 values.
951 #endif /* !UCONFIG_NO_NORMALIZATION */
952 #endif /* __NORMALIZER2IMPL_H__ */