2 ******************************************************************************* 
   4 *   Copyright (C) 2009-2014, International Business Machines 
   5 *   Corporation and others.  All Rights Reserved. 
   7 ******************************************************************************* 
   8 *   file name:  normalizer2impl.h 
  10 *   tab size:   8 (not used) 
  13 *   created on: 2009nov22 
  14 *   created by: Markus W. Scherer 
  17 #ifndef __NORMALIZER2IMPL_H__ 
  18 #define __NORMALIZER2IMPL_H__ 
  20 #include "unicode/utypes.h" 
  22 #if !UCONFIG_NO_NORMALIZATION 
  24 #include "unicode/normalizer2.h" 
  25 #include "unicode/unistr.h" 
  26 #include "unicode/unorm.h" 
  27 #include "unicode/utf16.h" 
  36 class U_COMMON_API Hangul 
{ 
  38     /* Korean Hangul and Jamo constants */ 
  40         JAMO_L_BASE
=0x1100,     /* "lead" jamo */ 
  42         JAMO_V_BASE
=0x1161,     /* "vowel" jamo */ 
  44         JAMO_T_BASE
=0x11a7,     /* "trail" jamo */ 
  54         JAMO_VT_COUNT
=JAMO_V_COUNT
*JAMO_T_COUNT
, 
  56         HANGUL_COUNT
=JAMO_L_COUNT
*JAMO_V_COUNT
*JAMO_T_COUNT
, 
  57         HANGUL_LIMIT
=HANGUL_BASE
+HANGUL_COUNT
 
  60     static inline UBool 
isHangul(UChar32 c
) { 
  61         return HANGUL_BASE
<=c 
&& c
<HANGUL_LIMIT
; 
  64     isHangulWithoutJamoT(UChar c
) { 
  66         return c
<HANGUL_COUNT 
&& c%JAMO_T_COUNT
==0; 
  68     static inline UBool 
isJamoL(UChar32 c
) { 
  69         return (uint32_t)(c
-JAMO_L_BASE
)<JAMO_L_COUNT
; 
  71     static inline UBool 
isJamoV(UChar32 c
) { 
  72         return (uint32_t)(c
-JAMO_V_BASE
)<JAMO_V_COUNT
; 
  76      * Decomposes c, which must be a Hangul syllable, into buffer 
  77      * and returns the length of the decomposition (2 or 3). 
  79     static inline int32_t decompose(UChar32 c
, UChar buffer
[3]) { 
  81         UChar32 c2
=c%JAMO_T_COUNT
; 
  83         buffer
[0]=(UChar
)(JAMO_L_BASE
+c
/JAMO_V_COUNT
); 
  84         buffer
[1]=(UChar
)(JAMO_V_BASE
+c%JAMO_V_COUNT
); 
  88             buffer
[2]=(UChar
)(JAMO_T_BASE
+c2
); 
  94      * Decomposes c, which must be a Hangul syllable, into buffer. 
  95      * This is the raw, not recursive, decomposition. Its length is always 2. 
  97     static inline void getRawDecomposition(UChar32 c
, UChar buffer
[2]) { 
 100         UChar32 c2
=c%JAMO_T_COUNT
; 
 103             buffer
[0]=(UChar
)(JAMO_L_BASE
+c
/JAMO_V_COUNT
); 
 104             buffer
[1]=(UChar
)(JAMO_V_BASE
+c%JAMO_V_COUNT
); 
 106             buffer
[0]=orig
-c2
;  // LV syllable 
 107             buffer
[1]=(UChar
)(JAMO_T_BASE
+c2
); 
 111     Hangul();  // no instantiation 
 114 class Normalizer2Impl
; 
 116 class U_COMMON_API ReorderingBuffer 
: public UMemory 
{ 
 118     ReorderingBuffer(const Normalizer2Impl 
&ni
, UnicodeString 
&dest
) : 
 120         start(NULL
), reorderStart(NULL
), limit(NULL
), 
 121         remainingCapacity(0), lastCC(0) {} 
 122     ~ReorderingBuffer() { 
 124             str
.releaseBuffer((int32_t)(limit
-start
)); 
 127     UBool 
init(int32_t destCapacity
, UErrorCode 
&errorCode
); 
 129     UBool 
isEmpty() const { return start
==limit
; } 
 130     int32_t length() const { return (int32_t)(limit
-start
); } 
 131     UChar 
*getStart() { return start
; } 
 132     UChar 
*getLimit() { return limit
; } 
 133     uint8_t getLastCC() const { return lastCC
; } 
 135     UBool 
equals(const UChar 
*start
, const UChar 
*limit
) const; 
 137     // For Hangul composition, replacing the Leading consonant Jamo with the syllable. 
 138     void setLastChar(UChar c
) { 
 142     UBool 
append(UChar32 c
, uint8_t cc
, UErrorCode 
&errorCode
) { 
 144             appendBMP((UChar
)c
, cc
, errorCode
) : 
 145             appendSupplementary(c
, cc
, errorCode
); 
 147     // s must be in NFD, otherwise change the implementation. 
 148     UBool 
append(const UChar 
*s
, int32_t length
, 
 149                  uint8_t leadCC
, uint8_t trailCC
, 
 150                  UErrorCode 
&errorCode
); 
 151     UBool 
appendBMP(UChar c
, uint8_t cc
, UErrorCode 
&errorCode
) { 
 152         if(remainingCapacity
==0 && !resize(1, errorCode
)) { 
 155         if(lastCC
<=cc 
|| cc
==0) { 
 167     UBool 
appendZeroCC(UChar32 c
, UErrorCode 
&errorCode
); 
 168     UBool 
appendZeroCC(const UChar 
*s
, const UChar 
*sLimit
, UErrorCode 
&errorCode
); 
 170     void removeSuffix(int32_t suffixLength
); 
 171     void setReorderingLimit(UChar 
*newLimit
) {  // Moves the end of the buffer contents (and the reorder start) to newLimit. 
 172         remainingCapacity
+=(int32_t)(limit
-newLimit
);  // adjust remainingCapacity by the number of units between newLimit and the old limit 
 173         reorderStart
=limit
=newLimit
;  // both limit and reorderStart now sit at newLimit 
 176     void copyReorderableSuffixTo(UnicodeString 
&s
) const { 
 177         s
.setTo(reorderStart
, (int32_t)(limit
-reorderStart
)); 
 181      * TODO: Revisit whether it makes sense to track reorderStart. 
 182      * It is set to after the last known character with cc<=1, 
 183      * which stops previousCC() before it reads that character and looks up its cc. 
 184      * previousCC() is normally only called from insert(). 
 185      * In other words, reorderStart speeds up the insertion of a combining mark 
 186      * into a multi-combining mark sequence where it does not belong at the end. 
 187      * This might not be worth the trouble. 
 188      * On the other hand, it's not a huge amount of trouble. 
 190      * We probably need it for UNORM_SIMPLE_APPEND. 
 193     UBool 
appendSupplementary(UChar32 c
, uint8_t cc
, UErrorCode 
&errorCode
); 
 194     void insert(UChar32 c
, uint8_t cc
); 
 195     static void writeCodePoint(UChar 
*p
, UChar32 c
) { 
 203     UBool 
resize(int32_t appendLength
, UErrorCode 
&errorCode
); 
 205     const Normalizer2Impl 
&impl
; 
 207     UChar 
*start
, *reorderStart
, *limit
; 
 208     int32_t remainingCapacity
; 
 211     // private backward iterator 
 212     void setIterator() { codePointStart
=limit
; } 
 213     void skipPrevious();  // Requires start<codePointStart. 
 214     uint8_t previousCC();  // Returns 0 if there is no previous character. 
 216     UChar 
*codePointStart
, *codePointLimit
; 
 219 class U_COMMON_API Normalizer2Impl 
: public UObject 
{ 
 221     Normalizer2Impl() : normTrie(NULL
), fCanonIterData(NULL
) { 
 222         fCanonIterDataInitOnce
.reset(); 
 224     virtual ~Normalizer2Impl(); 
 226     void init(const int32_t *inIndexes
, const UTrie2 
*inTrie
, 
 227               const uint16_t *inExtraData
, const uint8_t *inSmallFCD
); 
 229     void addLcccChars(UnicodeSet 
&set
) const; 
 230     void addPropertyStarts(const USetAdder 
*sa
, UErrorCode 
&errorCode
) const; 
 231     void addCanonIterPropertyStarts(const USetAdder 
*sa
, UErrorCode 
&errorCode
) const; 
 233     // low-level properties ------------------------------------------------ *** 
 235     const UTrie2 
*getNormTrie() const { return normTrie
; } 
 237     UBool 
ensureCanonIterData(UErrorCode 
&errorCode
) const; 
 239     uint16_t getNorm16(UChar32 c
) const { return UTRIE2_GET16(normTrie
, c
); } 
 241     UNormalizationCheckResult 
getCompQuickCheck(uint16_t norm16
) const { 
 242         if(norm16
<minNoNo 
|| MIN_YES_YES_WITH_CC
<=norm16
) { 
 244         } else if(minMaybeYes
<=norm16
) { 
 250     UBool 
isAlgorithmicNoNo(uint16_t norm16
) const { return limitNoNo
<=norm16 
&& norm16
<minMaybeYes
; } 
 251     UBool 
isCompNo(uint16_t norm16
) const { return minNoNo
<=norm16 
&& norm16
<minMaybeYes
; } 
 252     UBool 
isDecompYes(uint16_t norm16
) const { return norm16
<minYesNo 
|| minMaybeYes
<=norm16
; } 
 254     uint8_t getCC(uint16_t norm16
) const {  // Derives a uint8_t combining class value from a norm16 trie value. 
 255         if(norm16
>=MIN_NORMAL_MAYBE_YES
) { 
 256             return (uint8_t)norm16
;  // at or above MIN_NORMAL_MAYBE_YES the low byte of norm16 is returned directly 
 258         if(norm16
<minNoNo 
|| limitNoNo
<=norm16
) {  // outside [minNoNo, limitNoNo[ ; NOTE(review): the value returned here is not visible in this excerpt 
 261         return getCCFromNoNo(norm16
);  // presumably only reached for minNoNo<=norm16<limitNoNo; resolved through the mapping data 
 263     static uint8_t getCCFromYesOrMaybe(uint16_t norm16
) { 
 264         return norm16
>=MIN_NORMAL_MAYBE_YES 
? (uint8_t)norm16 
: 0;  // low byte of norm16 when norm16>=MIN_NORMAL_MAYBE_YES, otherwise 0 
 268      * Returns the FCD data for code point c. 
 269      * @param c A Unicode code point. 
 270      * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. 
 272     uint16_t getFCD16(UChar32 c
) const { 
 277         } else if(c
<=0xffff) { 
 278             if(!singleLeadMightHaveNonZeroFCD16(c
)) { return 0; } 
 280         return getFCD16FromNormData(c
); 
 283      * Returns the FCD data for the next code point (post-increment). 
 284      * Might skip only a lead surrogate rather than the whole surrogate pair if none of 
 285      * the supplementary code points associated with the lead surrogate have non-zero FCD data. 
 286      * @param s A valid pointer into a string. Requires s!=limit. 
 287      * @param limit The end of the string, or NULL. 
 288      * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. 
 290     uint16_t nextFCD16(const UChar 
*&s
, const UChar 
*limit
) const { 
 294         } else if(!singleLeadMightHaveNonZeroFCD16(c
)) { 
 298         if(U16_IS_LEAD(c
) && s
!=limit 
&& U16_IS_TRAIL(c2
=*s
)) { 
 299             c
=U16_GET_SUPPLEMENTARY(c
, c2
); 
 302         return getFCD16FromNormData(c
); 
 305      * Returns the FCD data for the previous code point (pre-decrement). 
 306      * @param start The start of the string. 
 307      * @param s A valid pointer into a string. Requires start<s. 
 308      * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. 
 310     uint16_t previousFCD16(const UChar 
*start
, const UChar 
*&s
) const { 
 315         if(!U16_IS_TRAIL(c
)) { 
 316             if(!singleLeadMightHaveNonZeroFCD16(c
)) { 
 321             if(start
<s 
&& U16_IS_LEAD(c2
=*(s
-1))) { 
 322                 c
=U16_GET_SUPPLEMENTARY(c2
, c
); 
 326         return getFCD16FromNormData(c
); 
 329     /** Returns the FCD data for U+0000<=c<U+0180. */ 
 330     uint16_t getFCD16FromBelow180(UChar32 c
) const { return tccc180
[c
]; } 
 331     /** Returns TRUE if the single-or-lead code unit lead might have non-zero FCD data. */ 
 332     UBool 
singleLeadMightHaveNonZeroFCD16(UChar32 lead
) const { 
 334         uint8_t bits
=smallFCD
[lead
>>8];  // one smallFCD byte per 256 code units 
 335         if(bits
==0) { return false; }  // no bit is set for any of these 256 code units 
 336         return (UBool
)((bits
>>((lead
>>5)&7))&1);  // bit (lead>>5)&7 of that byte covers the group of 32 code units containing lead 
 338     /** Returns the FCD value from the regular normalization data. */ 
 339     uint16_t getFCD16FromNormData(UChar32 c
) const; 
 341     void makeCanonIterDataFromNorm16(UChar32 start
, UChar32 end
, uint16_t norm16
, 
 342                                      CanonIterData 
&newData
, UErrorCode 
&errorCode
) const; 
 345      * Gets the decomposition for one code point. 
 346      * @param c code point 
 347      * @param buffer out-only buffer for algorithmic decompositions 
 348      * @param length out-only, takes the length of the decomposition, if any 
 349      * @return pointer to the decomposition, or NULL if none 
 351     const UChar 
*getDecomposition(UChar32 c
, UChar buffer
[4], int32_t &length
) const; 
 354      * Gets the raw decomposition for one code point. 
 355      * @param c code point 
 356      * @param buffer out-only buffer for algorithmic decompositions 
 357      * @param length out-only, takes the length of the decomposition, if any 
 358      * @return pointer to the decomposition, or NULL if none 
 360     const UChar 
*getRawDecomposition(UChar32 c
, UChar buffer
[30], int32_t &length
) const; 
 362     UChar32 
composePair(UChar32 a
, UChar32 b
) const; 
 364     UBool 
isCanonSegmentStarter(UChar32 c
) const; 
 365     UBool 
getCanonStartSet(UChar32 c
, UnicodeSet 
&set
) const; 
 368         MIN_CCC_LCCC_CP
=0x300 
 372         MIN_YES_YES_WITH_CC
=0xff01, 
 374         MIN_NORMAL_MAYBE_YES
=0xfe00, 
 380         // Byte offsets from the start of the data, after the generic header. 
 382         IX_EXTRA_DATA_OFFSET
, 
 390         // Code point thresholds for quick check codes. 
 392         IX_MIN_COMP_NO_MAYBE_CP
, 
 394         // Norm16 value thresholds for quick check combinations and types of extra data. 
 395         IX_MIN_YES_NO
,  // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. 
 400         IX_MIN_YES_NO_MAPPINGS_ONLY
,  // Mappings only in [minYesNoMappingsOnly..minNoNo[. 
 407         MAPPING_HAS_CCC_LCCC_WORD
=0x80, 
 408         MAPPING_HAS_RAW_MAPPING
=0x40, 
 409         MAPPING_NO_COMP_BOUNDARY_AFTER
=0x20, 
 410         MAPPING_LENGTH_MASK
=0x1f 
 414         COMP_1_LAST_TUPLE
=0x8000, 
 416         COMP_1_TRAIL_LIMIT
=0x3400, 
 417         COMP_1_TRAIL_MASK
=0x7ffe, 
 418         COMP_1_TRAIL_SHIFT
=9,  // 10-1 for the "triple" bit 
 419         COMP_2_TRAIL_SHIFT
=6, 
 420         COMP_2_TRAIL_MASK
=0xffc0 
 423     // higher-level functionality ------------------------------------------ *** 
 425     // NFD without an NFD Normalizer2 instance. 
 426     UnicodeString 
&decompose(const UnicodeString 
&src
, UnicodeString 
&dest
, 
 427                              UErrorCode 
&errorCode
) const; 
 429      * Decomposes [src, limit[ and writes the result to dest. 
 430      * limit can be NULL if src is NUL-terminated. 
 431      * destLengthEstimate is the initial dest buffer capacity and can be -1. 
 433     void decompose(const UChar 
*src
, const UChar 
*limit
, 
 434                    UnicodeString 
&dest
, int32_t destLengthEstimate
, 
 435                    UErrorCode 
&errorCode
) const; 
 437     const UChar 
*decompose(const UChar 
*src
, const UChar 
*limit
, 
 438                            ReorderingBuffer 
*buffer
, UErrorCode 
&errorCode
) const; 
 439     void decomposeAndAppend(const UChar 
*src
, const UChar 
*limit
, 
 441                             UnicodeString 
&safeMiddle
, 
 442                             ReorderingBuffer 
&buffer
, 
 443                             UErrorCode 
&errorCode
) const; 
 444     UBool 
compose(const UChar 
*src
, const UChar 
*limit
, 
 445                   UBool onlyContiguous
, 
 447                   ReorderingBuffer 
&buffer
, 
 448                   UErrorCode 
&errorCode
) const; 
 449     const UChar 
*composeQuickCheck(const UChar 
*src
, const UChar 
*limit
, 
 450                                    UBool onlyContiguous
, 
 451                                    UNormalizationCheckResult 
*pQCResult
) const; 
 452     void composeAndAppend(const UChar 
*src
, const UChar 
*limit
, 
 454                           UBool onlyContiguous
, 
 455                           UnicodeString 
&safeMiddle
, 
 456                           ReorderingBuffer 
&buffer
, 
 457                           UErrorCode 
&errorCode
) const; 
 458     const UChar 
*makeFCD(const UChar 
*src
, const UChar 
*limit
, 
 459                          ReorderingBuffer 
*buffer
, UErrorCode 
&errorCode
) const; 
 460     void makeFCDAndAppend(const UChar 
*src
, const UChar 
*limit
, 
 462                           UnicodeString 
&safeMiddle
, 
 463                           ReorderingBuffer 
&buffer
, 
 464                           UErrorCode 
&errorCode
) const; 
 466     UBool 
hasDecompBoundary(UChar32 c
, UBool before
) const; 
 467     UBool 
isDecompInert(UChar32 c
) const { return isDecompYesAndZeroCC(getNorm16(c
)); } 
 469     UBool 
hasCompBoundaryBefore(UChar32 c
) const { 
 470         return c
<minCompNoMaybeCP 
|| hasCompBoundaryBefore(c
, getNorm16(c
)); 
 472     UBool 
hasCompBoundaryAfter(UChar32 c
, UBool onlyContiguous
, UBool testInert
) const; 
 474     UBool 
hasFCDBoundaryBefore(UChar32 c
) const { return c
<MIN_CCC_LCCC_CP 
|| getFCD16(c
)<=0xff; }  // true below MIN_CCC_LCCC_CP, or when lccc (bits 15..8 of the FCD value) is 0 
 475     UBool 
hasFCDBoundaryAfter(UChar32 c
) const { 
 476         uint16_t fcd16
=getFCD16(c
); 
 477         return fcd16
<=1 || (fcd16
&0xff)==0;  // true if the FCD value is 0 or 1, or if tccc (bits 7..0 of the FCD value) is 0 
 479     UBool 
isFCDInert(UChar32 c
) const { return getFCD16(c
)<=1; } 
 481     UBool 
isMaybe(uint16_t norm16
) const { return minMaybeYes
<=norm16 
&& norm16
<=JAMO_VT
; } 
 482     UBool 
isMaybeOrNonZeroCC(uint16_t norm16
) const { return norm16
>=minMaybeYes
; } 
 483     static UBool 
isInert(uint16_t norm16
) { return norm16
==0; } 
 484     static UBool 
isJamoL(uint16_t norm16
) { return norm16
==1; } 
 485     static UBool 
isJamoVT(uint16_t norm16
) { return norm16
==JAMO_VT
; } 
 486     UBool 
isHangul(uint16_t norm16
) const { return norm16
==minYesNo
; } 
 487     UBool 
isCompYesAndZeroCC(uint16_t norm16
) const { return norm16
<minNoNo
; } 
 488     // UBool isCompYes(uint16_t norm16) const { 
 489     //     return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo; 
 491     // UBool isCompYesOrMaybe(uint16_t norm16) const { 
 492     //     return norm16<minNoNo || minMaybeYes<=norm16; 
 494     // UBool hasZeroCCFromDecompYes(uint16_t norm16) const { 
 495     //     return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; 
 497     UBool 
isDecompYesAndZeroCC(uint16_t norm16
) const { 
 498         return norm16
<minYesNo 
|| 
 500                (minMaybeYes
<=norm16 
&& norm16
<=MIN_NORMAL_MAYBE_YES
); 
 503      * A little faster and simpler than isDecompYesAndZeroCC() but does not include 
 504      * the MaybeYes which combine-forward and have ccc=0. 
 505      * (Standard Unicode 5.2 normalization does not have such characters.) 
 507     UBool 
isMostDecompYesAndZeroCC(uint16_t norm16
) const { 
 508         return norm16
<minYesNo 
|| norm16
==MIN_NORMAL_MAYBE_YES 
|| norm16
==JAMO_VT
; 
 510     UBool 
isDecompNoAlgorithmic(uint16_t norm16
) const { return norm16
>=limitNoNo
; } 
 512     // For use with isCompYes(). 
 513     // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC. 
 514     // static uint8_t getCCFromYes(uint16_t norm16) { 
 515     //     return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0; 
 517     uint8_t getCCFromNoNo(uint16_t norm16
) const {  // Reads a combining class value from the mapping data that norm16 points to. 
 518         const uint16_t *mapping
=getMapping(norm16
); 
 519         if(*mapping
&MAPPING_HAS_CCC_LCCC_WORD
) {  // the first mapping unit carries the MAPPING_HAS_CCC_LCCC_WORD flag 
 520             return (uint8_t)*(mapping
-1);  // low 8 bits of the 16-bit unit stored just before the mapping 
    // NOTE(review): the result when the flag is clear is not visible in this excerpt. 
 525     // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC() 
 526     uint8_t getTrailCCFromCompYesAndZeroCC(const UChar 
*cpStart
, const UChar 
*cpLimit
) const; 
 528     // Requires algorithmic-NoNo. 
    // Maps c by adding the offset norm16-(minMaybeYes-MAX_DELTA-1) to it. 
 529     UChar32 
mapAlgorithmic(UChar32 c
, uint16_t norm16
) const { 
 530         return c
+norm16
-(minMaybeYes
-MAX_DELTA
-1); 
 533     // Requires minYesNo<norm16<limitNoNo. 
 534     const uint16_t *getMapping(uint16_t norm16
) const { return extraData
+norm16
; } 
 535     const uint16_t *getCompositionsListForDecompYes(uint16_t norm16
) const { 
 536         if(norm16
==0 || MIN_NORMAL_MAYBE_YES
<=norm16
) {  // NOTE(review): the result for these values is not visible in this excerpt 
 538         } else if(norm16
<minMaybeYes
) { 
 539             return extraData
+norm16
;  // for yesYes; if Jamo L: harmless empty list 
 541             return maybeYesCompositions
+norm16
-minMaybeYes
;  // index into maybeYesCompositions is norm16 relative to minMaybeYes 
 544     const uint16_t *getCompositionsListForComposite(uint16_t norm16
) const {  // Returns the address just past the mapping stored at extraData+norm16. 
 545         const uint16_t *list
=extraData
+norm16
;  // composite has both mapping & compositions list 
 546         return list
+  // mapping pointer 
 547             1+  // +1 to skip the first unit with the mapping length 
 548             (*list
&MAPPING_LENGTH_MASK
);  // + mapping length 
 551      * @param norm16 norm16 value of a code point that must have compositions 
 552      * @return compositions list pointer 
 554     const uint16_t *getCompositionsList(uint16_t norm16
) const { 
 555         return isDecompYes(norm16
) ? 
 556                 getCompositionsListForDecompYes(norm16
) : 
 557                 getCompositionsListForComposite(norm16
); 
 560     const UChar 
*copyLowPrefixFromNulTerminated(const UChar 
*src
, 
 561                                                 UChar32 minNeedDataCP
, 
 562                                                 ReorderingBuffer 
*buffer
, 
 563                                                 UErrorCode 
&errorCode
) const; 
 564     UBool 
decomposeShort(const UChar 
*src
, const UChar 
*limit
, 
 565                          ReorderingBuffer 
&buffer
, UErrorCode 
&errorCode
) const; 
 566     UBool 
decompose(UChar32 c
, uint16_t norm16
, 
 567                     ReorderingBuffer 
&buffer
, UErrorCode 
&errorCode
) const; 
 569     static int32_t combine(const uint16_t *list
, UChar32 trail
); 
 570     void addComposites(const uint16_t *list
, UnicodeSet 
&set
) const; 
 571     void recompose(ReorderingBuffer 
&buffer
, int32_t recomposeStartIndex
, 
 572                    UBool onlyContiguous
) const; 
 574     UBool 
hasCompBoundaryBefore(UChar32 c
, uint16_t norm16
) const; 
 575     const UChar 
*findPreviousCompBoundary(const UChar 
*start
, const UChar 
*p
) const; 
 576     const UChar 
*findNextCompBoundary(const UChar 
*p
, const UChar 
*limit
) const; 
 578     const UChar 
*findPreviousFCDBoundary(const UChar 
*start
, const UChar 
*p
) const; 
 579     const UChar 
*findNextFCDBoundary(const UChar 
*p
, const UChar 
*limit
) const; 
 581     int32_t getCanonValue(UChar32 c
) const; 
 582     const UnicodeSet 
&getCanonStartSet(int32_t n
) const; 
 584     // UVersionInfo dataVersion; 
 586     // Code point thresholds for quick check codes. 
 587     UChar32 minDecompNoCP
; 
 588     UChar32 minCompNoMaybeCP
; 
 590     // Norm16 value thresholds for quick check combinations and types of extra data. 
 592     uint16_t minYesNoMappingsOnly
; 
 595     uint16_t minMaybeYes
; 
 597     const UTrie2 
*normTrie
; 
 598     const uint16_t *maybeYesCompositions
; 
 599     const uint16_t *extraData
;  // mappings and/or compositions for yesYes, yesNo & noNo characters 
 600     const uint8_t *smallFCD
;  // [0x100] one bit per 32 BMP code points, set if any FCD!=0 
 601     uint8_t tccc180
[0x180];  // tccc values for U+0000..U+017F 
 603 public:  // CanonIterData is public to allow access from C callback functions. 
 604     UInitOnce       fCanonIterDataInitOnce
; 
 605     CanonIterData  
*fCanonIterData
; 
 608 // bits in canonIterData 
 609 #define CANON_NOT_SEGMENT_STARTER 0x80000000 
 610 #define CANON_HAS_COMPOSITIONS 0x40000000 
 611 #define CANON_HAS_SET 0x200000 
 612 #define CANON_VALUE_MASK 0x1fffff 
 615  * ICU-internal shortcut for quick access to standard Unicode normalization. 
 617 class U_COMMON_API Normalizer2Factory 
{ 
 619     static const Normalizer2 
*getFCDInstance(UErrorCode 
&errorCode
); 
 620     static const Normalizer2 
*getFCCInstance(UErrorCode 
&errorCode
); 
 621     static const Normalizer2 
*getNoopInstance(UErrorCode 
&errorCode
); 
 623     static const Normalizer2 
*getInstance(UNormalizationMode mode
, UErrorCode 
&errorCode
); 
 625     static const Normalizer2Impl 
*getNFCImpl(UErrorCode 
&errorCode
); 
 626     static const Normalizer2Impl 
*getNFKCImpl(UErrorCode 
&errorCode
); 
 627     static const Normalizer2Impl 
*getNFKC_CFImpl(UErrorCode 
&errorCode
); 
 629     // Get the Impl instance of the Normalizer2. 
 630     // Must be used only when it is known that norm2 is a Normalizer2WithImpl instance. 
 631     static const Normalizer2Impl 
*getImpl(const Normalizer2 
*norm2
); 
 633     Normalizer2Factory();  // No instantiation. 
 638 U_CAPI 
int32_t U_EXPORT2
 
 639 unorm2_swap(const UDataSwapper 
*ds
, 
 640             const void *inData
, int32_t length
, void *outData
, 
 641             UErrorCode 
*pErrorCode
); 
 644  * Get the NF*_QC property for a code point, for u_getIntPropertyValue(). 
 647 U_CFUNC UNormalizationCheckResult
 
 648 unorm_getQuickCheck(UChar32 c
, UNormalizationMode mode
); 
 651  * Gets the 16-bit FCD value (lead & trail CCs) for a code point, for u_getIntPropertyValue(). 
 655 unorm_getFCD16(UChar32 c
); 
 658  * Format of Normalizer2 .nrm data files. 
 659  * Format version 2.0. 
 661  * Normalizer2 .nrm data files provide data for the Unicode Normalization algorithms. 
 662  * ICU ships with data files for standard Unicode Normalization Forms 
 663  * NFC and NFD (nfc.nrm), NFKC and NFKD (nfkc.nrm) and NFKC_Casefold (nfkc_cf.nrm). 
 664  * Custom (application-specific) data can be built into additional .nrm files 
 665  * with the gennorm2 build tool. 
 667  * Normalizer2.getInstance() causes a .nrm file to be loaded, unless it has been 
 668  * cached already. Internally, Normalizer2Impl.load() reads the .nrm file. 
 670  * A .nrm file begins with a standard ICU data file header 
 671  * (DataHeader, see ucmndata.h and unicode/udata.h). 
 672  * The UDataInfo.dataVersion field usually contains the Unicode version 
 673  * for which the data was generated. 
 675  * After the header, the file contains the following parts. 
 676  * Constants are defined as enum values of the Normalizer2Impl class. 
 678  * Many details of the data structures are described in the design doc 
 679  * which is at http://site.icu-project.org/design/normalization/custom 
 681  * int32_t indexes[indexesLength]; -- indexesLength=indexes[IX_NORM_TRIE_OFFSET]/4; 
 683  *      The first eight indexes are byte offsets in ascending order. 
 684  *      Each byte offset marks the start of the next part in the data file, 
 685  *      and the end of the previous one. 
 686  *      When two consecutive byte offsets are the same, then the corresponding part is empty. 
 687  *      Byte offsets are offsets from after the header, 
 688  *      that is, from the beginning of the indexes[]. 
 689  *      Each part starts at an offset with proper alignment for its data. 
 690  *      If necessary, the previous part may include padding bytes to achieve this alignment. 
 692  *      minDecompNoCP=indexes[IX_MIN_DECOMP_NO_CP] is the lowest code point 
 693  *      with a decomposition mapping, that is, with NF*D_QC=No. 
 694  *      minCompNoMaybeCP=indexes[IX_MIN_COMP_NO_MAYBE_CP] is the lowest code point 
 695  *      with NF*C_QC=No (has a one-way mapping) or Maybe (combines backward). 
 697  *      The next five indexes are thresholds of 16-bit trie values for ranges of 
 698  *      values indicating multiple normalization properties. 
 699  *          minYesNo=indexes[IX_MIN_YES_NO]; 
 700  *          minNoNo=indexes[IX_MIN_NO_NO]; 
 701  *          limitNoNo=indexes[IX_LIMIT_NO_NO]; 
 702  *          minMaybeYes=indexes[IX_MIN_MAYBE_YES]; 
 703  *          minYesNoMappingsOnly=indexes[IX_MIN_YES_NO_MAPPINGS_ONLY]; 
 704  *      See the normTrie description below and the design doc for details. 
 706  * UTrie2 normTrie; -- see utrie2_impl.h and utrie2.h 
 708  *      The trie holds the main normalization data. Each code point is mapped to a 16-bit value. 
 709  *      Rather than using independent bits in the value (which would require more than 16 bits), 
 710  *      information is extracted primarily via range checks. 
 711  *      For example, a 16-bit value norm16 in the range minYesNo<=norm16<minNoNo 
 712  *      means that the character has NF*C_QC=Yes and NF*D_QC=No properties, 
 713  *      which means it has a two-way (round-trip) decomposition mapping. 
 714  *      Values in the range 2<=norm16<limitNoNo are also directly indexes into the extraData 
 715  *      pointing to mappings, compositions lists, or both. 
 716  *      Value norm16==0 means that the character is normalization-inert, that is, 
 717  *      it does not have a mapping, does not participate in composition, has a zero 
 718  *      canonical combining class, and forms a boundary where text before it and after it 
 719  *      can be normalized independently. 
 720  *      For details about how multiple properties are encoded in 16-bit values 
 721  *      see the design doc. 
 722  *      Note that the encoding cannot express all combinations of the properties involved; 
 723  *      it only supports those combinations that are allowed by 
 724  *      the Unicode Normalization algorithms. Details are in the design doc as well. 
 725  *      The gennorm2 tool only builds .nrm files for data that conforms to the limitations. 
 727  *      The trie has a value for each lead surrogate code unit representing the "worst case" 
 728  *      properties of the 1024 supplementary characters whose UTF-16 form starts with 
 729  *      the lead surrogate. If all of the 1024 supplementary characters are normalization-inert, 
 730  *      then their lead surrogate code unit has the trie value 0. 
 731  *      When the lead surrogate unit's value exceeds the quick check minimum during processing, 
 732  *      the properties for the full supplementary code point need to be looked up. 
 734  * uint16_t maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes]; 
 735  * uint16_t extraData[]; 
 737  *      There is only one byte offset for the end of these two arrays. 
 738  *      The split between them is given by the constant and variable mentioned above. 
 740  *      The maybeYesCompositions array contains compositions lists for characters that 
 741  *      combine both forward (as starters in composition pairs) 
 742  *      and backward (as trailing characters in composition pairs). 
 743  *      Such characters do not occur in Unicode 5.2 but are allowed by 
 744  *      the Unicode Normalization algorithms. 
 745  *      If there are no such characters, then minMaybeYes==MIN_NORMAL_MAYBE_YES 
 746  *      and the maybeYesCompositions array is empty. 
 747  *      If there are such characters, then minMaybeYes is subtracted from their norm16 values 
 748  *      to get the index into this array. 
 750  *      The extraData array contains compositions lists for "YesYes" characters, 
 751  *      followed by mappings and optional compositions lists for "YesNo" characters, 
 752  *      followed by only mappings for "NoNo" characters. 
 753  *      (Referring to pairs of NFC/NFD quick check values.) 
 754  *      The norm16 values of those characters are directly indexes into the extraData array. 
 756  *      The data structures for compositions lists and mappings are described in the design doc. 
 758  * uint8_t smallFCD[0x100]; -- new in format version 2 
 760  *      This is a bit set to help speed up FCD value lookups in the absence of a full 
 761  *      UTrie2 or other large data structure with the full FCD value mapping. 
 763  *      Each smallFCD bit is set if any of the corresponding 32 BMP code points 
 764  *      has a non-zero FCD value (lccc!=0 or tccc!=0). 
 765  *      Bit 0 of smallFCD[0] is for U+0000..U+001F. Bit 7 of smallFCD[0xff] is for U+FFE0..U+FFFF. 
 766  *      A bit for 32 lead surrogates is set if any of the 32k corresponding 
 767  *      _supplementary_ code points has a non-zero FCD value. 
 769  *      This bit set is most useful for the large blocks of CJK characters with FCD=0. 
 771  * Changes from format version 1 to format version 2 --------------------------- 
 773  * - Addition of data for raw (not recursively decomposed) mappings. 
 774  *   + The MAPPING_NO_COMP_BOUNDARY_AFTER bit in the extraData is now also set when 
 775  *     the mapping is to an empty string or when the character combines-forward. 
 776  *     This subsumes the one actual use of the MAPPING_PLUS_COMPOSITION_LIST bit which 
 777  *     is then repurposed for the MAPPING_HAS_RAW_MAPPING bit. 
 778  *   + For details see the design doc. 
 779  * - Addition of indexes[IX_MIN_YES_NO_MAPPINGS_ONLY] and separation of the yesNo extraData into 
 780  *   distinct ranges (combines-forward vs. not) 
 781  *   so that a range check can be used to find out if there is a compositions list. 
 782  *   This is fully equivalent to formatVersion 1's MAPPING_PLUS_COMPOSITION_LIST flag. 
 783  *   It is needed for the new (in ICU 49) composePair(), not for other normalization. 
 784  * - Addition of the smallFCD[] bit set. 
 787 #endif  /* !UCONFIG_NO_NORMALIZATION */ 
 788 #endif  /* __NORMALIZER2IMPL_H__ */