2 ******************************************************************************
3 * Copyright (c) 1996-2006, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 ******************************************************************************
8 * Created by: Vladimir Weinstein 12052000
10 * Modification history :
12 * Date Name Description
13 * 02/01/01 synwee Added normalization quickcheck enum and method.
14 * 02/12/01 synwee Commented out quickcheck util api has been approved
15 * Added private method for doing FCD checks
16 * 02/23/01 synwee Modified quickcheck and checkFCE to run through
17 * string for codepoints < 0x300 for the normalization
19 * 05/25/01+ Markus Scherer total rewrite, implement all normalization here
20 * instead of just wrappers around normlzr.cpp,
21 * load unorm.dat, support Unicode 3.1 with
22 * supplementary code points, etc.
25 #include "unicode/utypes.h"
27 #if !UCONFIG_NO_NORMALIZATION
29 #include "unicode/udata.h"
30 #include "unicode/uchar.h"
31 #include "unicode/ustring.h"
32 #include "unicode/uiter.h"
33 #include "unicode/uniset.h"
34 #include "unicode/usetiter.h"
35 #include "unicode/unorm.h"
42 #include "unicode/uset.h"
47 * Status of tailored normalization
49 * This was done initially for investigation on Unicode public review issue 7
50 * (http://www.unicode.org/review/). See Jitterbug 2481.
51 * While the UTC at meeting #94 (2003mar) did not take up the issue, this is
52 * a permanent feature in ICU 2.6 in support of IDNA which requires true
53 * Unicode 3.2 normalization.
54 * (NormalizationCorrections are rolled into IDNA mapping tables.)
56 * Tailored normalization as implemented here makes it possible to "normalize less"
57 * than full Unicode normalization would.
58 * Based internally on a UnicodeSet of code points that are
59 * "excluded from normalization", the normalization functions leave those
60 * code points alone ("inert"). This means that tailored normalization
61 * still transforms text into a canonically equivalent form.
62 * It does not add decompositions to code points that do not have any or
63 * change decomposition results.
65 * Any function that searches for a safe boundary has not been touched,
66 * which means that these functions will be over-pessimistic when
67 * exclusions are applied.
68 * This should not matter because subsequent checks and normalizations
69 * do apply the exclusions; only a little more of the text may be processed
70 * than necessary under exclusions.
72 * Normalization exclusions have the following effect on excluded code points c:
73 * - c is not decomposed
74 * - c is not a composition target
75 * - c does not combine forward or backward for composition
76 * except that this is not implemented for Jamo
77 * - c is treated as having a combining class of 0
79 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
82 * This new implementation of the normalization code loads its data from
83 * unorm.dat, which is generated with the gennorm tool.
84 * The format of that file is described in unormimp.h .
87 /* -------------------------------------------------------------------------- */
90 _STACK_BUFFER_CAPACITY
=100
94 * Constants for the bit fields in the options bit set parameter.
95 * These need not be public.
96 * A user only needs to know the currently assigned values.
97 * The number and positions of reserved bits per field can remain private
98 * and may change in future implementations.
101 _NORM_OPTIONS_NX_MASK
=0x1f,
102 _NORM_OPTIONS_UNICODE_MASK
=0x60,
103 _NORM_OPTIONS_SETS_MASK
=0x7f,
105 _NORM_OPTIONS_UNICODE_SHIFT
=5,
108 * The following options are used only in some composition functions.
109 * They use bits 12 and up to preserve lower bits for the available options
110 * space in unorm_compare() -
111 * see documentation for UNORM_COMPARE_NORM_OPTIONS_SHIFT.
114 /** Options bit 12, for compatibility vs. canonical decomposition. */
115 _NORM_OPTIONS_COMPAT
=0x1000,
116 /** Options bit 13, no discontiguous composition (FCC vs. NFC). */
117 _NORM_OPTIONS_COMPOSE_CONTIGUOUS
=0x2000
122 isHangulWithoutJamoT(UChar c
) {
124 return c
<HANGUL_COUNT
&& c%JAMO_T_COUNT
==0;
129 /* is this a norm32 with a regular index? */
131 isNorm32Regular(uint32_t norm32
) {
/* "regular" means strictly below _NORM_MIN_SPECIAL; anything at or above that threshold is not regular */
132 return norm32
<_NORM_MIN_SPECIAL
;
135 /* is this a norm32 with a special index for a lead surrogate? */
137 isNorm32LeadSurrogate(uint32_t norm32
) {
/* true exactly for the half-open range [_NORM_MIN_SPECIAL, _NORM_SURROGATES_TOP) */
138 return _NORM_MIN_SPECIAL
<=norm32
&& norm32
<_NORM_SURROGATES_TOP
;
141 /* is this a norm32 with a special index for a Hangul syllable or a Jamo? */
143 isNorm32HangulOrJamo(uint32_t norm32
) {
/* every value at or above _NORM_MIN_HANGUL qualifies; there is no upper-bound test */
144 return norm32
>=_NORM_MIN_HANGUL
;
148 * Given isNorm32HangulOrJamo(),
149 * is this a Hangul syllable or a Jamo?
151 /*static inline UBool
152 isHangulJamoNorm32HangulOrJamoL(uint32_t norm32) {
153 return norm32<_NORM_MIN_JAMO_V;
157 * Given norm32 for Jamo V or T,
161 isJamoVTNorm32JamoV(uint32_t norm32
) {
/*
 * Precondition (per the preceding comment): norm32 belongs to a Jamo V or T.
 * The result is true when norm32 lies below _NORM_JAMO_V_TOP, which - going by
 * the function name - identifies a Jamo V.
 */
162 return norm32
<_NORM_JAMO_V_TOP
;
165 /* load unorm.dat ----------------------------------------------------------- */
167 /* normTrie: 32-bit trie result may contain a special extraData index with the folding offset */
168 static int32_t U_CALLCONV
169 getFoldingNormOffset(uint32_t norm32
) {
170 if(isNorm32LeadSurrogate(norm32
)) {
172 UTRIE_BMP_INDEX_LENGTH
+
173 (((int32_t)norm32
>>(_NORM_EXTRA_SHIFT
-UTRIE_SURROGATE_BLOCK_BITS
))&
174 (0x3ff<<UTRIE_SURROGATE_BLOCK_BITS
));
180 /* auxTrie: the folding offset is in bits 9..0 of the 16-bit trie result */
/*
 * Folding-offset callback for the aux trie; loadNormData() stores it in the
 * getFoldingOffset member of the trie that becomes auxTrie.
 * @param data 16-bit aux trie result (passed as uint32_t)
 * @return (data & _NORM_AUX_FNC_MASK) scaled by 1<<UTRIE_SURROGATE_BLOCK_BITS
 */
181 static int32_t U_CALLCONV
182 getFoldingAuxOffset(uint32_t data
) {
183 return (int32_t)(data
&_NORM_AUX_FNC_MASK
)<<UTRIE_SURROGATE_BLOCK_BITS
;
187 #define UNORM_HARDCODE_DATA 1
189 #if UNORM_HARDCODE_DATA
191 /* unorm_props_data.c is machine-generated by gennorm --csource */
192 #include "unorm_props_data.c"
194 static const UBool formatVersion_2_2
=TRUE
;
198 #define DATA_NAME "unorm"
199 #define DATA_TYPE "icu"
201 static UDataMemory
*normData
=NULL
;
202 static UErrorCode dataErrorCode
=U_ZERO_ERROR
;
203 static int8_t haveNormData
=0;
205 static int32_t indexes
[_NORM_INDEX_TOP
]={ 0 };
206 static UTrie normTrie
={ 0,0,0,0,0,0,0 }, fcdTrie
={ 0,0,0,0,0,0,0 }, auxTrie
={ 0,0,0,0,0,0,0 };
209 * pointers into the memory-mapped unorm.icu
211 static const uint16_t *extraData
=NULL
,
212 *combiningTable
=NULL
,
213 *canonStartSets
=NULL
;
215 static uint8_t formatVersion
[4]={ 0, 0, 0, 0 };
216 static UBool formatVersion_2_1
=FALSE
, formatVersion_2_2
=FALSE
;
218 /* the Unicode version of the normalization data */
219 static UVersionInfo dataVersion
={ 0, 0, 0, 0 };
223 /* cache UnicodeSets for each combination of exclusion flags */
224 static UnicodeSet
*nxCache
[_NORM_OPTIONS_SETS_MASK
+1]={ NULL
};
228 static UBool U_CALLCONV
229 unorm_cleanup(void) {
232 #if !UNORM_HARDCODE_DATA
234 udata_close(normData
);
237 dataErrorCode
=U_ZERO_ERROR
;
241 for(i
=0; i
<(int32_t)LENGTHOF(nxCache
); ++i
) {
251 #if !UNORM_HARDCODE_DATA
253 static UBool U_CALLCONV
254 isAcceptable(void * /* context */,
255 const char * /* type */, const char * /* name */,
256 const UDataInfo
*pInfo
) {
259 pInfo
->isBigEndian
==U_IS_BIG_ENDIAN
&&
260 pInfo
->charsetFamily
==U_CHARSET_FAMILY
&&
261 pInfo
->dataFormat
[0]==0x4e && /* dataFormat="Norm" */
262 pInfo
->dataFormat
[1]==0x6f &&
263 pInfo
->dataFormat
[2]==0x72 &&
264 pInfo
->dataFormat
[3]==0x6d &&
265 pInfo
->formatVersion
[0]==2 &&
266 pInfo
->formatVersion
[2]==UTRIE_SHIFT
&&
267 pInfo
->formatVersion
[3]==UTRIE_INDEX_SHIFT
269 uprv_memcpy(formatVersion
, pInfo
->formatVersion
, 4);
270 uprv_memcpy(dataVersion
, pInfo
->dataVersion
, 4);
279 static UBool U_CALLCONV
280 _enumPropertyStartsRange(const void *context
, UChar32 start
, UChar32
/*limit*/, uint32_t /*value*/) {
281 /* add the start code point to the USet */
282 const USetAdder
*sa
=(const USetAdder
*)context
;
283 sa
->add(sa
->set
, start
);
289 #if !UNORM_HARDCODE_DATA
292 loadNormData(UErrorCode
&errorCode
) {
293 /* load Unicode normalization data from file */
296 * This lazy initialization with double-checked locking (without mutex protection for
297 * haveNormData==0) is transiently unsafe under certain circumstances.
298 * Check the readme and use u_init() if necessary.
300 * While u_init() initializes the main normalization data via this function,
301 * it does not do so for exclusion sets (which are fully mutexed).
303 * - there can be many exclusion sets
304 * - they are rarely used
305 * - they are not usually used in execution paths that are
306 * as performance-sensitive as others
307 * (e.g., IDNA takes more time than unorm_quickCheck() anyway)
309 if(haveNormData
==0) {
310 UTrie _normTrie
={ 0,0,0,0,0,0,0 }, _fcdTrie
={ 0,0,0,0,0,0,0 }, _auxTrie
={ 0,0,0,0,0,0,0 };
313 const int32_t *p
=NULL
;
316 if(&errorCode
==NULL
|| U_FAILURE(errorCode
)) {
320 /* open the data outside the mutex block */
321 data
=udata_openChoice(NULL
, DATA_TYPE
, DATA_NAME
, isAcceptable
, NULL
, &errorCode
);
322 dataErrorCode
=errorCode
;
323 if(U_FAILURE(errorCode
)) {
324 return haveNormData
=-1;
327 p
=(const int32_t *)udata_getMemory(data
);
328 pb
=(const uint8_t *)(p
+_NORM_INDEX_TOP
);
329 utrie_unserialize(&_normTrie
, pb
, p
[_NORM_INDEX_TRIE_SIZE
], &errorCode
);
330 _normTrie
.getFoldingOffset
=getFoldingNormOffset
;
332 pb
+=p
[_NORM_INDEX_TRIE_SIZE
]+p
[_NORM_INDEX_UCHAR_COUNT
]*2+p
[_NORM_INDEX_COMBINE_DATA_COUNT
]*2;
333 if(p
[_NORM_INDEX_FCD_TRIE_SIZE
]!=0) {
334 utrie_unserialize(&_fcdTrie
, pb
, p
[_NORM_INDEX_FCD_TRIE_SIZE
], &errorCode
);
336 pb
+=p
[_NORM_INDEX_FCD_TRIE_SIZE
];
338 if(p
[_NORM_INDEX_AUX_TRIE_SIZE
]!=0) {
339 utrie_unserialize(&_auxTrie
, pb
, p
[_NORM_INDEX_AUX_TRIE_SIZE
], &errorCode
);
340 _auxTrie
.getFoldingOffset
=getFoldingAuxOffset
;
343 if(U_FAILURE(errorCode
)) {
344 dataErrorCode
=errorCode
;
346 return haveNormData
=-1;
349 /* in the mutex block, set the data for this process */
355 uprv_memcpy(&indexes
, p
, sizeof(indexes
));
356 uprv_memcpy(&normTrie
, &_normTrie
, sizeof(UTrie
));
357 uprv_memcpy(&fcdTrie
, &_fcdTrie
, sizeof(UTrie
));
358 uprv_memcpy(&auxTrie
, &_auxTrie
, sizeof(UTrie
));
360 p
=(const int32_t *)udata_getMemory(normData
);
363 /* initialize some variables */
364 extraData
=(uint16_t *)((uint8_t *)(p
+_NORM_INDEX_TOP
)+indexes
[_NORM_INDEX_TRIE_SIZE
]);
365 combiningTable
=extraData
+indexes
[_NORM_INDEX_UCHAR_COUNT
];
366 formatVersion_2_1
=formatVersion
[0]>2 || (formatVersion
[0]==2 && formatVersion
[1]>=1);
367 formatVersion_2_2
=formatVersion
[0]>2 || (formatVersion
[0]==2 && formatVersion
[1]>=2);
368 if(formatVersion_2_1
) {
369 canonStartSets
=combiningTable
+
370 indexes
[_NORM_INDEX_COMBINE_DATA_COUNT
]+
371 (indexes
[_NORM_INDEX_FCD_TRIE_SIZE
]+indexes
[_NORM_INDEX_AUX_TRIE_SIZE
])/2;
374 ucln_common_registerCleanup(UCLN_COMMON_UNORM
, unorm_cleanup
);
377 /* if a different thread set it first, then close the extra data */
379 udata_close(data
); /* NULL if it was set correctly */
389 _haveData(UErrorCode
&errorCode
) {
390 #if UNORM_HARDCODE_DATA
391 return U_SUCCESS(errorCode
);
393 if(U_FAILURE(errorCode
)) {
395 } else if(haveNormData
>0) {
397 } else if(haveNormData
<0) {
398 errorCode
=dataErrorCode
;
400 } else /* haveNormData==0 */ {
401 return (UBool
)(loadNormData(errorCode
)>0);
/*
 * Exported wrapper around the file-local _haveData().
 * @param pErrorCode in/out error code; it is dereferenced unconditionally,
 *                   so it must not be NULL
 * @return whatever _haveData(*pErrorCode) returns
 */
406 U_CAPI UBool U_EXPORT2
407 unorm_haveData(UErrorCode
*pErrorCode
) {
408 return _haveData(*pErrorCode
);
411 U_CAPI
const uint16_t * U_EXPORT2
412 unorm_getFCDTrie(UErrorCode
*pErrorCode
) {
413 if(_haveData(*pErrorCode
)) {
414 return fcdTrie
.index
;
420 /* data access primitives --------------------------------------------------- */
/*
 * Look up the 32-bit normTrie value for a single UTF-16 code unit with
 * UTRIE_GET32_FROM_LEAD.
 * When c is a lead surrogate the result is only a surrogate special; callers
 * that need the value for the full code point pass it on to
 * _getNorm32FromSurrogatePair() together with the trail surrogate.
 */
422 static inline uint32_t
423 _getNorm32(UChar c
) {
424 return UTRIE_GET32_FROM_LEAD(&normTrie
, c
);
427 static inline uint32_t
428 _getNorm32FromSurrogatePair(uint32_t norm32
, UChar c2
) {
430 * the surrogate index in norm32 stores only the number of the surrogate index block
431 * see gennorm/store.c/getFoldedNormValue()
434 UTRIE_BMP_INDEX_LENGTH
+
435 ((norm32
>>(_NORM_EXTRA_SHIFT
-UTRIE_SURROGATE_BLOCK_BITS
))&
436 (0x3ff<<UTRIE_SURROGATE_BLOCK_BITS
));
437 return UTRIE_GET32_FROM_OFFSET_TRAIL(&normTrie
, norm32
, c2
);
441 * get a norm32 from text with complete code points
442 * (like from decompositions)
444 static inline uint32_t
445 _getNorm32(const UChar
*p
, uint32_t mask
) {
446 uint32_t norm32
=_getNorm32(*p
);
447 if((norm32
&mask
) && isNorm32LeadSurrogate(norm32
)) {
448 /* *p is a lead surrogate, get the real norm32 */
449 norm32
=_getNorm32FromSurrogatePair(norm32
, *(p
+1));
454 static inline uint16_t
456 return UTRIE_GET16_FROM_LEAD(&fcdTrie
, c
);
/*
 * Get the 16-bit fcdTrie value for a surrogate pair.
 * @param fcd16 used as the offset argument of UTRIE_GET16_FROM_OFFSET_TRAIL;
 *              presumably the fcdTrie value previously looked up for the
 *              lead surrogate - TODO confirm against callers
 * @param c2    second code unit of the pair, passed as the trail argument
 */
459 static inline uint16_t
460 _getFCD16FromSurrogatePair(uint16_t fcd16
, UChar c2
) {
461 /* the surrogate index in fcd16 is an absolute offset over the start of stage 1 */
462 return UTRIE_GET16_FROM_OFFSET_TRAIL(&fcdTrie
, fcd16
, c2
);
/*
 * Return a pointer into the extraData array for a norm32 value.
 * The bits of norm32 above _NORM_EXTRA_SHIFT are used as an index counted in
 * 16-bit units; no range check is performed here.
 */
465 static inline const uint16_t *
466 _getExtraData(uint32_t norm32
) {
467 return extraData
+(norm32
>>_NORM_EXTRA_SHIFT
);
472 * It is possible to get the FCD data from the main trie if unorm.icu
473 * was built without the FCD trie, although it is slower.
474 * This is not implemented because it is hard to test, and because it seems
475 * unusual to want to use FCD and not build the data file for it.
477 * Untested sample code:
479 static inline uint16_t
480 _getFCD16FromNormData(UChar32 c
) {
481 uint32_t norm32
, fcd
;
483 norm32
=_getNorm32(c
);
484 if((norm32
&_NORM_QC_NFD
) && isNorm32Regular(norm32
)) {
485 /* get the lead/trail cc from the decomposition data */
486 const uint16_t *nfd
=_getExtraData(norm32
);
487 if(*nfd
&_NORM_DECOMP_FLAG_LENGTH_HAS_CC
) {
491 fcd
=norm32
&_NORM_CC_MASK
;
493 /* use the code point cc value for both lead and trail cc's */
494 fcd
|=fcd
>>_NORM_CC_SHIFT
; /* assume that the cc is in bits 15..8 */
498 return (uint16_t)fcd
;
502 /* normalization exclusion sets --------------------------------------------- */
505 * Normalization exclusion UnicodeSets are used for tailored normalization;
506 * see the comment near the beginning of this file.
508 * By specifying one or several sets of code points,
509 * those code points become inert for normalization.
512 static const UnicodeSet
*
513 internalGetNXHangul(UErrorCode
&errorCode
) {
514 /* internal function, does not check for incoming U_FAILURE */
517 UMTX_CHECK(NULL
, (UBool
)(nxCache
[UNORM_NX_HANGUL
]!=NULL
), isCached
);
520 UnicodeSet
*set
=new UnicodeSet(0xac00, 0xd7a3);
522 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
527 if(nxCache
[UNORM_NX_HANGUL
]==NULL
) {
528 nxCache
[UNORM_NX_HANGUL
]=set
;
530 ucln_common_registerCleanup(UCLN_COMMON_UNORM
, unorm_cleanup
);
537 return nxCache
[UNORM_NX_HANGUL
];
540 /* unorm.cpp 1.116 had and used
541 static const UnicodeSet *
542 internalGetNXFromPattern(int32_t options, const char *pattern, UErrorCode &errorCode) {
547 /* get and set an exclusion set from a serialized UnicodeSet */
548 static const UnicodeSet
*
549 internalGetSerializedNX(int32_t options
, int32_t nxIndex
, UErrorCode
&errorCode
) {
550 /* internal function, does not check for incoming U_FAILURE */
553 UMTX_CHECK(NULL
, (UBool
)(nxCache
[options
]!=NULL
), isCached
);
556 canonStartSets
!=NULL
&&
557 canonStartSets
[nxIndex
]!=0 && canonStartSets
[nxIndex
+1]>canonStartSets
[nxIndex
]
564 if( !uset_getSerializedSet(
566 canonStartSets
+canonStartSets
[nxIndex
],
567 canonStartSets
[nxIndex
+1]-canonStartSets
[nxIndex
])
569 errorCode
=U_INVALID_FORMAT_ERROR
;
573 /* turn the serialized set into a UnicodeSet */
574 set
=new UnicodeSet();
576 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
579 for(i
=0; uset_getSerializedRange(&sset
, i
, &start
, &end
); ++i
) {
580 set
->add(start
, end
);
584 if(nxCache
[options
]==NULL
) {
585 nxCache
[options
]=set
;
587 ucln_common_registerCleanup(UCLN_COMMON_UNORM
, unorm_cleanup
);
594 return nxCache
[options
];
597 static const UnicodeSet
*
598 internalGetNXCJKCompat(UErrorCode
&errorCode
) {
599 /* build a set from [[:Ideographic:]&[:NFD_QC=No:]]=[CJK Ideographs]&[has canonical decomposition] */
600 return internalGetSerializedNX(
602 _NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET
,
606 static const UnicodeSet
*
607 internalGetNXUnicode(uint32_t options
, UErrorCode
&errorCode
) {
608 /* internal function, does not check for incoming U_FAILURE */
611 options
&=_NORM_OPTIONS_UNICODE_MASK
;
615 case UNORM_UNICODE_3_2
:
617 nxIndex
=_NORM_SET_INDEX_NX_UNICODE32_OFFSET
;
620 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
624 /* build a set with all code points that were not designated by the specified Unicode version */
625 return internalGetSerializedNX(options
, nxIndex
, errorCode
);
628 /* Get a decomposition exclusion set. The data must be loaded. */
629 static const UnicodeSet
*
630 internalGetNX(int32_t options
, UErrorCode
&errorCode
) {
631 options
&=_NORM_OPTIONS_SETS_MASK
;
635 UMTX_CHECK(NULL
, (UBool
)(nxCache
[options
]!=NULL
), isCached
);
638 /* return basic sets */
639 if(options
==UNORM_NX_HANGUL
) {
640 return internalGetNXHangul(errorCode
);
642 if(options
==UNORM_NX_CJK_COMPAT
) {
643 return internalGetNXCJKCompat(errorCode
);
645 if((options
&_NORM_OPTIONS_UNICODE_MASK
)!=0 && (options
&_NORM_OPTIONS_NX_MASK
)==0) {
646 return internalGetNXUnicode(options
, errorCode
);
649 /* build a set from multiple subsets */
651 const UnicodeSet
*other
;
653 set
=new UnicodeSet();
655 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
659 if((options
&UNORM_NX_HANGUL
)!=0 && NULL
!=(other
=internalGetNXHangul(errorCode
))) {
662 if((options
&UNORM_NX_CJK_COMPAT
)!=0 && NULL
!=(other
=internalGetNXCJKCompat(errorCode
))) {
665 if((options
&_NORM_OPTIONS_UNICODE_MASK
)!=0 && NULL
!=(other
=internalGetNXUnicode(options
, errorCode
))) {
669 if(U_FAILURE(errorCode
)) {
675 if(nxCache
[options
]==NULL
) {
676 nxCache
[options
]=set
;
678 ucln_common_registerCleanup(UCLN_COMMON_UNORM
, unorm_cleanup
);
685 return nxCache
[options
];
688 static inline const UnicodeSet
*
689 getNX(int32_t options
, UErrorCode
&errorCode
) {
690 if(U_FAILURE(errorCode
) || (options
&=_NORM_OPTIONS_SETS_MASK
)==0) {
691 /* incoming failure, or no decomposition exclusions requested */
694 return internalGetNX(options
, errorCode
);
/*
 * U_CFUNC wrapper that forwards to the file-local getNX().
 * @param options    normalization options word, passed through unchanged
 * @param pErrorCode in/out error code; dereferenced unconditionally, so it
 *                   must not be NULL
 * @return whatever getNX(options, *pErrorCode) returns
 */
698 U_CFUNC
const UnicodeSet
*
699 unorm_getNX(int32_t options
, UErrorCode
*pErrorCode
) {
700 return getNX(options
, *pErrorCode
);
704 nx_contains(const UnicodeSet
*nx
, UChar32 c
) {
/* a NULL set contains nothing, so the result is false without calling into nx; otherwise ask the set whether it contains c */
705 return nx
!=NULL
&& nx
->contains(c
);
709 nx_contains(const UnicodeSet
*nx
, UChar c
, UChar c2
) {
/*
 * Code-unit variant: a NULL set contains nothing.
 * c2==0 means c stands alone and is tested as is; otherwise (c, c2) is
 * combined with U16_GET_SUPPLEMENTARY and the resulting code point is tested.
 */
710 return nx
!=NULL
&& nx
->contains(c2
==0 ? c
: U16_GET_SUPPLEMENTARY(c
, c2
));
713 /* other normalization primitives ------------------------------------------- */
715 /* get the canonical or compatibility decomposition for one character */
716 static inline const UChar
*
717 _decompose(uint32_t norm32
, uint32_t qcMask
, int32_t &length
,
718 uint8_t &cc
, uint8_t &trailCC
) {
719 const UChar
*p
=(const UChar
*)_getExtraData(norm32
);
722 if((norm32
&qcMask
&_NORM_QC_NFKD
)!=0 && length
>=0x100) {
723 /* use compatibility decomposition, skip canonical data */
724 p
+=((length
>>7)&1)+(length
&_NORM_DECOMP_LENGTH_MASK
);
728 if(length
&_NORM_DECOMP_FLAG_LENGTH_HAS_CC
) {
729 /* get the lead and trail cc's */
731 cc
=(uint8_t)(bothCCs
>>8);
732 trailCC
=(uint8_t)bothCCs
;
734 /* lead and trail cc's are both 0 */
738 length
&=_NORM_DECOMP_LENGTH_MASK
;
742 /* get the canonical decomposition for one character */
743 static inline const UChar
*
744 _decompose(uint32_t norm32
, int32_t &length
,
745 uint8_t &cc
, uint8_t &trailCC
) {
746 const UChar
*p
=(const UChar
*)_getExtraData(norm32
);
749 if(length
&_NORM_DECOMP_FLAG_LENGTH_HAS_CC
) {
750 /* get the lead and trail cc's */
752 cc
=(uint8_t)(bothCCs
>>8);
753 trailCC
=(uint8_t)bothCCs
;
755 /* lead and trail cc's are both 0 */
759 length
&=_NORM_DECOMP_LENGTH_MASK
;
764 * Get the canonical decomposition for one code point.
765 * @param c code point
766 * @param buffer out-only buffer for algorithmic decompositions of Hangul
767 * @param length out-only, takes the length of the decomposition, if any
768 * @return pointer to decomposition, or 0 if none
771 U_CFUNC
const UChar
*
772 unorm_getCanonicalDecomposition(UChar32 c
, UChar buffer
[4], int32_t *pLength
) {
775 if(c
<indexes
[_NORM_INDEX_MIN_NFD_NO_MAYBE
]) {
780 UTRIE_GET32(&normTrie
, c
, norm32
);
781 if(norm32
&_NORM_QC_NFD
) {
782 if(isNorm32HangulOrJamo(norm32
)) {
783 /* Hangul syllable: decompose algorithmically */
788 c2
=(UChar
)(c%JAMO_T_COUNT
);
791 buffer
[2]=(UChar
)(JAMO_T_BASE
+c2
);
797 buffer
[1]=(UChar
)(JAMO_V_BASE
+c%JAMO_V_COUNT
);
798 buffer
[0]=(UChar
)(JAMO_L_BASE
+c
/JAMO_V_COUNT
);
801 /* normal decomposition */
803 return _decompose(norm32
, *pLength
, cc
, trailCC
);
811 * get the combining class of (c, c2)=*p++
812 * before: p<limit after: p<=limit
813 * if only one code unit is used, then c2==0
815 static inline uint8_t
816 _getNextCC(const UChar
*&p
, const UChar
*limit
, UChar
&c
, UChar
&c2
) {
820 norm32
=_getNorm32(c
);
821 if((norm32
&_NORM_CC_MASK
)==0) {
825 if(!isNorm32LeadSurrogate(norm32
)) {
828 /* c is a lead surrogate, get the real norm32 */
829 if(p
!=limit
&& UTF_IS_SECOND_SURROGATE(c2
=*p
)) {
831 norm32
=_getNorm32FromSurrogatePair(norm32
, c2
);
838 return (uint8_t)(norm32
>>_NORM_CC_SHIFT
);
843 * read backwards and get norm32
844 * return 0 if the character is <minC
845 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
847 static inline uint32_t
848 _getPrevNorm32(const UChar
*start
, const UChar
*&src
,
849 uint32_t minC
, uint32_t mask
,
850 UChar
&c
, UChar
&c2
) {
856 /* check for a surrogate before getting norm32 to see if we need to predecrement further */
859 } else if(!UTF_IS_SURROGATE(c
)) {
860 return _getNorm32(c
);
861 } else if(UTF_IS_SURROGATE_FIRST(c
)) {
862 /* unpaired first surrogate */
864 } else if(src
!=start
&& UTF_IS_FIRST_SURROGATE(c2
=*(src
-1))) {
866 norm32
=_getNorm32(c2
);
868 if((norm32
&mask
)==0) {
869 /* all surrogate pairs with this lead surrogate have only irrelevant data */
872 /* norm32 must be a surrogate special */
873 return _getNorm32FromSurrogatePair(norm32
, c
);
876 /* unpaired second surrogate */
883 * get the combining class of (c, c2)=*--p
884 * before: start<p after: start<=p
886 static inline uint8_t
887 _getPrevCC(const UChar
*start
, const UChar
*&p
) {
890 return (uint8_t)(_getPrevNorm32(start
, p
, _NORM_MIN_WITH_LEAD_CC
, _NORM_CC_MASK
, c
, c2
)>>_NORM_CC_SHIFT
);
894 * is this a safe boundary character for NF*D?
898 _isNFDSafe(uint32_t norm32
, uint32_t ccOrQCMask
, uint32_t decompQCMask
) {
899 if((norm32
&ccOrQCMask
)==0) {
900 return TRUE
; /* cc==0 and no decomposition: this is NF*D safe */
903 /* inspect its decomposition - maybe a Hangul but not a surrogate here */
904 if(isNorm32Regular(norm32
) && (norm32
&decompQCMask
)!=0) {
908 /* decomposes, get everything from the variable-length extra data */
909 _decompose(norm32
, decompQCMask
, length
, cc
, trailCC
);
912 /* no decomposition (or Hangul), test the cc directly */
913 return (norm32
&_NORM_CC_MASK
)==0;
918 * is this (or does its decomposition begin with) a "true starter"?
919 * (cc==0 and NF*C_YES)
922 _isTrueStarter(uint32_t norm32
, uint32_t ccOrQCMask
, uint32_t decompQCMask
) {
923 if((norm32
&ccOrQCMask
)==0) {
924 return TRUE
; /* this is a true starter (could be Hangul or Jamo L) */
927 /* inspect its decomposition - not a Hangul or a surrogate here */
928 if((norm32
&decompQCMask
)!=0) {
933 /* decomposes, get everything from the variable-length extra data */
934 p
=_decompose(norm32
, decompQCMask
, length
, cc
, trailCC
);
936 uint32_t qcMask
=ccOrQCMask
&_NORM_QC_MASK
;
938 /* does it begin with NFC_YES? */
939 if((_getNorm32(p
, qcMask
)&qcMask
)==0) {
940 /* yes, the decomposition begins with a true starter */
949 U_CAPI
uint8_t U_EXPORT2
950 u_getCombiningClass(UChar32 c
) {
951 #if !UNORM_HARDCODE_DATA
952 UErrorCode errorCode
=U_ZERO_ERROR
;
953 if(_haveData(errorCode
)) {
957 UTRIE_GET32(&normTrie
, c
, norm32
);
958 return (uint8_t)(norm32
>>_NORM_CC_SHIFT
);
959 #if !UNORM_HARDCODE_DATA
966 U_CAPI UBool U_EXPORT2
967 unorm_internalIsFullCompositionExclusion(UChar32 c
) {
968 #if UNORM_HARDCODE_DATA
969 if(auxTrie
.index
!=NULL
) {
971 UErrorCode errorCode
=U_ZERO_ERROR
;
972 if(_haveData(errorCode
) && auxTrie
.index
!=NULL
) {
976 UTRIE_GET16(&auxTrie
, c
, aux
);
977 return (UBool
)((aux
&_NORM_AUX_COMP_EX_MASK
)!=0);
983 U_CAPI UBool U_EXPORT2
984 unorm_isCanonSafeStart(UChar32 c
) {
985 #if UNORM_HARDCODE_DATA
986 if(auxTrie
.index
!=NULL
) {
988 UErrorCode errorCode
=U_ZERO_ERROR
;
989 if(_haveData(errorCode
) && auxTrie
.index
!=NULL
) {
993 UTRIE_GET16(&auxTrie
, c
, aux
);
994 return (UBool
)((aux
&_NORM_AUX_UNSAFE_MASK
)==0);
/*
 * Copy the Unicode version of the normalization data into *versionInfo.
 * The copy (4 bytes taken from dataVersion) happens only when
 * unorm_haveData(pErrorCode) reports success; no other write to *versionInfo
 * is visible in this function.
 * @param versionInfo destination, dereferenced for the copy
 * @param pErrorCode  in/out error code, handed to unorm_haveData()
 */
1000 U_CAPI
void U_EXPORT2
1001 unorm_getUnicodeVersion(UVersionInfo
*versionInfo
, UErrorCode
*pErrorCode
){
1002 if(unorm_haveData(pErrorCode
)){
1003 uprv_memcpy(*versionInfo
, dataVersion
, 4);
1008 U_CAPI UBool U_EXPORT2
1009 unorm_getCanonStartSet(UChar32 c
, USerializedSet
*fillSet
) {
1010 #if !UNORM_HARDCODE_DATA
1011 UErrorCode errorCode
=U_ZERO_ERROR
;
1013 if( fillSet
!=NULL
&& (uint32_t)c
<=0x10ffff &&
1014 #if !UNORM_HARDCODE_DATA
1015 _haveData(errorCode
) &&
1017 canonStartSets
!=NULL
1019 const uint16_t *table
;
1020 int32_t i
, start
, limit
;
1023 * binary search for c
1025 * There are two search tables,
1026 * one for BMP code points and one for supplementary ones.
1027 * See unormimp.h for details.
1030 table
=canonStartSets
+canonStartSets
[_NORM_SET_INDEX_CANON_SETS_LENGTH
];
1032 limit
=canonStartSets
[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH
];
1034 /* each entry is a pair { c, result } */
1035 while(start
<limit
-2) {
1036 i
=(uint16_t)(((start
+limit
)/4)*2); /* (start+limit)/2 and address pairs */
1045 if(c
==table
[start
]) {
1047 if((i
&_NORM_CANON_SET_BMP_MASK
)==_NORM_CANON_SET_BMP_IS_INDEX
) {
1048 /* result 01xxxxxx xxxxxx contains index x to a USerializedSet */
1049 i
&=(_NORM_MAX_CANON_SETS
-1);
1050 return uset_getSerializedSet(fillSet
,
1052 canonStartSets
[_NORM_SET_INDEX_CANON_SETS_LENGTH
]-i
);
1054 /* other result values are BMP code points for single-code point sets */
1055 uset_setSerializedToOne(fillSet
, (UChar32
)i
);
1060 uint16_t high
, low
, h
;
1062 table
=canonStartSets
+canonStartSets
[_NORM_SET_INDEX_CANON_SETS_LENGTH
]+
1063 canonStartSets
[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH
];
1065 limit
=canonStartSets
[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH
];
1067 high
=(uint16_t)(c
>>16);
1070 /* each entry is a triplet { high(c), low(c), result } */
1071 while(start
<limit
-3) {
1072 i
=(uint16_t)(((start
+limit
)/6)*3); /* (start+limit)/2 and address triplets */
1073 h
=table
[i
]&0x1f; /* high word */
1074 if(high
<h
|| (high
==h
&& low
<table
[i
+1])) {
1083 if(high
==(h
&0x1f) && low
==table
[start
+1]) {
1086 /* the result is an index to a USerializedSet */
1087 return uset_getSerializedSet(fillSet
,
1089 canonStartSets
[_NORM_SET_INDEX_CANON_SETS_LENGTH
]-i
);
1092 * single-code point set {x} in
1093 * triplet { 100xxxxx 000hhhhh llllllll llllllll xxxxxxxx xxxxxxxx }
1095 i
|=((int32_t)h
&0x1f00)<<8; /* add high bits from high(c) */
1096 uset_setSerializedToOne(fillSet
, (UChar32
)i
);
1103 return FALSE
; /* not found */
1106 U_CAPI
int32_t U_EXPORT2
1107 u_getFC_NFKC_Closure(UChar32 c
, UChar
*dest
, int32_t destCapacity
, UErrorCode
*pErrorCode
) {
1110 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
1113 if(destCapacity
<0 || (dest
==NULL
&& destCapacity
>0)) {
1114 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1117 if(!_haveData(*pErrorCode
) || auxTrie
.index
==NULL
) {
1121 UTRIE_GET16(&auxTrie
, c
, aux
);
1122 aux
&=_NORM_AUX_FNC_MASK
;
1127 s
=(const UChar
*)(extraData
+aux
);
1129 /* s points to the single-unit string */
1135 if(0<length
&& length
<=destCapacity
) {
1136 uprv_memcpy(dest
, s
, length
*U_SIZEOF_UCHAR
);
1138 return u_terminateUChars(dest
, destCapacity
, length
, pErrorCode
);
1140 return u_terminateUChars(dest
, destCapacity
, 0, pErrorCode
);
1144 /* Is c an NF<mode>-skippable code point? See unormimp.h. */
1145 U_CAPI UBool U_EXPORT2
1146 unorm_isNFSkippable(UChar32 c
, UNormalizationMode mode
) {
1147 uint32_t norm32
, mask
;
1150 #if !UNORM_HARDCODE_DATA
1151 UErrorCode errorCode
=U_ZERO_ERROR
;
1152 if(!_haveData(errorCode
)) {
1157 /* handle trivial cases; set the comparison mask for the normal ones */
1162 mask
=_NORM_CC_MASK
|_NORM_QC_NFD
;
1165 mask
=_NORM_CC_MASK
|_NORM_QC_NFKD
;
1168 /* case UNORM_FCC: */
1169 mask
=_NORM_CC_MASK
|_NORM_COMBINES_ANY
|(_NORM_QC_NFC
&_NORM_QC_ANY_NO
);
1172 mask
=_NORM_CC_MASK
|_NORM_COMBINES_ANY
|(_NORM_QC_NFKC
&_NORM_QC_ANY_NO
);
1175 /* FCD: skippable if lead cc==0 and trail cc<=1 */
1176 if(fcdTrie
.index
!=NULL
) {
1177 UTRIE_GET16(&fcdTrie
, c
, fcd
);
1186 /* check conditions (a)..(e), see unormimp.h */
1187 UTRIE_GET32(&normTrie
, c
, norm32
);
1188 if((norm32
&mask
)!=0) {
1189 return FALSE
; /* fails (a)..(e), not skippable */
1192 if(mode
<UNORM_NFC
) {
1193 return TRUE
; /* NF*D, passed (a)..(c), is skippable */
1196 /* NF*C/FCC, passed (a)..(e) */
1197 if((norm32
&_NORM_QC_NFD
)==0) {
1198 return TRUE
; /* no canonical decomposition, is skippable */
1201 /* check Hangul syllables algorithmically */
1202 if(isNorm32HangulOrJamo(norm32
)) {
1203 /* Jamo passed (a)..(e) above, must be Hangul */
1204 return !isHangulWithoutJamoT((UChar
)c
); /* LVT are skippable, LV are not */
1207 /* if(mode<=UNORM_NFKC) { -- enable when implementing FCC */
1208 /* NF*C, test (f) flag */
1209 if(!formatVersion_2_2
|| auxTrie
.index
==NULL
) {
1210 return FALSE
; /* no (f) data, say not skippable to be safe */
1213 UTRIE_GET16(&auxTrie
, c
, aux
);
1214 return (aux
&_NORM_AUX_NFC_SKIP_F_MASK
)==0; /* TRUE=skippable if the (f) flag is not set */
1216 /* } else { FCC, test fcd<=1 instead of the above } */
/*
 * unorm_addPropertyStarts(sa, pErrorCode): feeds the USetAdder sa with code
 * points at which normalization properties may change value.
 *
 * Visible flow:
 *  - bail out path when _haveData(*pErrorCode) fails (body not visible),
 *  - enumerate normTrie, and fcdTrie / auxTrie when their index is non-NULL,
 *    through utrie_enum with the _enumPropertyStartsRange callback,
 *  - add every Hangul LV syllable (step JAMO_T_COUNT) and the code point
 *    right after it, then the first code point after the Hangul block.
 *
 * NOTE(review): the declaration of c and the closing braces are not visible
 * in this listing.
 */
1219 U_CAPI
void U_EXPORT2
1220 unorm_addPropertyStarts(const USetAdder
*sa
, UErrorCode
*pErrorCode
) {
1223 if(!_haveData(*pErrorCode
)) {
1227 /* add the start code point of each same-value range of each trie */
1228 utrie_enum(&normTrie
, NULL
, _enumPropertyStartsRange
, sa
);
/* the FCD and auxiliary tries are only enumerated when they are present */
1229 if(fcdTrie
.index
!=NULL
) {
1230 utrie_enum(&fcdTrie
, NULL
, _enumPropertyStartsRange
, sa
);
1232 if(auxTrie
.index
!=NULL
) {
1233 utrie_enum(&auxTrie
, NULL
, _enumPropertyStartsRange
, sa
);
1236 /* add Hangul LV syllables and LV+1 because of skippables */
1237 for(c
=HANGUL_BASE
; c
<HANGUL_BASE
+HANGUL_COUNT
; c
+=JAMO_T_COUNT
) {
1238 sa
->add(sa
->set
, c
);
1239 sa
->add(sa
->set
, c
+1);
1241 sa
->add(sa
->set
, HANGUL_BASE
+HANGUL_COUNT
); /* add Hangul+1 to continue with other properties */
/*
 * unorm_getQuickCheck(c, mode): returns the quick-check result of code point
 * c for the given normalization mode.
 *
 * Visible flow: norm32 for c is read from normTrie and reduced to the
 * quick-check bits selected by qcMask[mode]; the result is then classified
 * in an if / else-if / else chain whose middle branch tests _NORM_QC_ANY_NO
 * and whose last branch is the maybe case.
 *
 * NOTE(review): the return statements and the first condition of the chain
 * are not visible here. mode indexes qcMask directly with no visible range
 * check - confirm that callers pass a value below UNORM_MODE_COUNT.
 */
1244 U_CAPI UNormalizationCheckResult U_EXPORT2
1245 unorm_getQuickCheck(UChar32 c
, UNormalizationMode mode
) {
/* quick-check bit mask per mode; the first two entries carry no bits */
1246 static const uint32_t qcMask
[UNORM_MODE_COUNT
]={
1247 0, 0, _NORM_QC_NFD
, _NORM_QC_NFKD
, _NORM_QC_NFC
, _NORM_QC_NFKC
1252 #if !UNORM_HARDCODE_DATA
1253 UErrorCode errorCode
=U_ZERO_ERROR
;
1254 if(!_haveData(errorCode
)) {
1259 UTRIE_GET32(&normTrie
, c
, norm32
);
/* keep only the quick-check bits that matter for this mode */
1260 norm32
&=qcMask
[mode
];
1264 } else if(norm32
&_NORM_QC_ANY_NO
) {
1266 } else /* _NORM_QC_ANY_MAYBE */ {
/*
 * unorm_getFCD16FromCodePoint(c): returns the 16-bit FCD value stored for
 * code point c, read from fcdTrie with UTRIE_GET16.
 *
 * When UNORM_HARDCODE_DATA is off, a _haveData() check is part of the guard
 * condition that precedes the lookup.
 *
 * NOTE(review): the rest of the guard condition, the declaration of fcd and
 * the return statements are not visible in this listing.
 */
1271 U_CAPI
uint16_t U_EXPORT2
1272 unorm_getFCD16FromCodePoint(UChar32 c
) {
1273 UErrorCode errorCode
;
1276 errorCode
=U_ZERO_ERROR
;
1278 #if !UNORM_HARDCODE_DATA
1279 !_haveData(errorCode
) ||
1286 UTRIE_GET16(&fcdTrie
, c
, fcd
);
1290 /* reorder UTF-16 in-place -------------------------------------------------- */
1293 * simpler, single-character version of _mergeOrdered() -
1294 * bubble-insert one single code point into the preceding string
1295 * which is already canonically ordered
1296 * (c, c2) may or may not yet have been inserted at [current..p[
1298 * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2)
1300 * before: [start..current[ is already ordered, and
1301 * [current..p[ may or may not hold (c, c2) but
1302 * must be exactly the same length as (c, c2)
1303 * after: [start..p[ is ordered
1305 * returns the trailing combining class
/*
 * _insertOrdered(start, current, p, c, c2, cc): inserts the single code
 * point (c, c2), whose combining class is cc, into the already ordered text
 * [start..current[ so that [start..p[ ends up canonically ordered.
 * Returns the trailing combining class (trailCC starts out as cc).
 *
 * Visible flow: only when there is preceding text and cc is non-zero does
 * the function walk backwards with _getPrevCC() to find the insertion
 * point, shift the code units in between upwards, and store (c, c2).
 *
 * NOTE(review): the return type line, the comparisons inside the search
 * loop and the copy statements are missing from this listing.
 */
1308 _insertOrdered(const UChar
*start
, UChar
*current
, UChar
*p
,
1309 UChar c
, UChar c2
, uint8_t cc
) {
1310 const UChar
*pBack
, *pPreBack
;
1312 uint8_t prevCC
, trailCC
=cc
;
1314 if(start
<current
&& cc
!=0) {
1315 /* search for the insertion point where cc>=prevCC */
1316 pPreBack
=pBack
=current
;
1317 prevCC
=_getPrevCC(start
, pPreBack
);
1319 /* this will be the last code point, so keep its cc */
1322 while(start
<pPreBack
) {
1323 prevCC
=_getPrevCC(start
, pPreBack
);
1331 * this is where we are right now with all these pointers:
1332 * [start..pPreBack[ 0..? code points that we can ignore
1333 * [pPreBack..pBack[ 0..1 code points with prevCC<=cc
1334 * [pBack..current[ 0..n code points with >cc, move up to insert (c, c2)
1335 * [current..p[ 1 code point (c, c2) with cc
1338 /* move the code units in between up */
1342 } while(pBack
!=current
);
1346 /* insert (c, c2) */
1352 /* we know the cc of the last code point */
1357 * merge two UTF-16 string parts together
1358 * to canonically order (order by combining classes) their concatenation
1360 * the two strings may already be adjacent, so that the merging is done in-place
1361 * if the two strings are not adjacent, then the buffer holding the first one
1362 * must be large enough
1363 * the second string may or may not be ordered in itself
1365 * before: [start..current[ is already ordered, and
1366 * [next..limit[ may be ordered in itself, but
1367 * is not in relation to [start..current[
1368 * after: [start..current+(limit-next)[ is ordered
1370 * the algorithm is a simple bubble-sort that takes the characters from *next++
1371 * and inserts them in correct combining class order into the preceding part
1374 * since this function is called much less often than the single-code point
1375 * _insertOrdered(), it just uses that for easier maintenance
1376 * (see file version from before 2001aug31 for a more optimized version)
1378 * returns the trailing combining class
/*
 * _mergeOrdered(start, current, next, limit, isOrdered): merges the text
 * [next..limit[ into the ordered text [start..current[ so that the
 * concatenation is canonically ordered. isOrdered defaults to TRUE.
 * The final visible statement returns _getPrevCC(start, limit); other
 * return paths may exist in the omitted lines.
 *
 * Visible flow: the two parts are adjacent when current==next. If there is
 * preceding text, or the second part is not known to be ordered, each code
 * point is fetched with _getNextCC() and placed with _insertOrdered();
 * the loop runs until next reaches limit.
 *
 * NOTE(review): declarations of c, c2, r and adjacent, plus the branches
 * that copy code units, are missing from this listing.
 */
1381 _mergeOrdered(UChar
*start
, UChar
*current
,
1382 const UChar
*next
, const UChar
*limit
, UBool isOrdered
=TRUE
) {
1385 uint8_t cc
, trailCC
=0;
1388 adjacent
= current
==next
;
1390 if(start
!=current
|| !isOrdered
) {
1392 cc
=_getNextCC(next
, limit
, c
, c2
);
1394 /* does not bubble back */
1397 current
=(UChar
*)next
;
1410 r
=current
+(c2
==0 ? 1 : 2);
1411 trailCC
=_insertOrdered(start
, current
, r
, c
, c2
, cc
);
1418 /* we know the cc of the last code point */
1422 /* copy the second string part */
1425 } while(next
!=limit
);
1428 return _getPrevCC(start
, limit
);
/*
 * _findPreviousStarter(start, src, ccOrQCMask, decompQCMask, minNoMaybe):
 * scans backwards from src towards start and returns a pointer to the last
 * true starter in [start..src[.
 *
 * Visible flow: each step reads the previous code point and its norm32 via
 * _getPrevNorm32() using the combined mask, and tests it with
 * _isTrueStarter().
 *
 * NOTE(review): the loop construct, local declarations and the return are
 * missing from this listing.
 */
1432 /* find the last true starter in [start..src[ and return the pointer to it */
1433 static const UChar
*
1434 _findPreviousStarter(const UChar
*start
, const UChar
*src
,
1435 uint32_t ccOrQCMask
, uint32_t decompQCMask
, UChar minNoMaybe
) {
1440 norm32
=_getPrevNorm32(start
, src
, minNoMaybe
, ccOrQCMask
|decompQCMask
, c
, c2
);
1441 if(_isTrueStarter(norm32
, ccOrQCMask
, decompQCMask
)) {
/*
 * _findNextStarter(src, limit, qcMask, decompQCMask, minNoMaybe): scans
 * forward from src and returns a pointer to the first true starter in
 * [src..limit[.
 *
 * The scan stops at the end of the string, at a code unit caught by the
 * early test (which also catches a terminating NUL), at a code point whose
 * norm32 has no cc or quick-check bits set, at an unpaired lead surrogate,
 * or at a code point whose decomposition begins with a true starter.
 * Otherwise src advances by one or two code units.
 *
 * NOTE(review): the loop construct, some declarations and the return are
 * missing from this listing.
 */
1448 /* find the first true starter in [src..limit[ and return the pointer to it */
1449 static const UChar
*
1450 _findNextStarter(const UChar
*src
, const UChar
*limit
,
1451 uint32_t qcMask
, uint32_t decompQCMask
, UChar minNoMaybe
) {
1453 uint32_t norm32
, ccOrQCMask
;
1456 uint8_t cc
, trailCC
;
1458 ccOrQCMask
=_NORM_CC_MASK
|qcMask
;
1462 break; /* end of string */
1466 break; /* catches NUL terminator, too */
1469 norm32
=_getNorm32(c
);
1470 if((norm32
&ccOrQCMask
)==0) {
1471 break; /* true starter */
1474 if(isNorm32LeadSurrogate(norm32
)) {
1475 /* c is a lead surrogate, get the real norm32 */
1476 if((src
+1)==limit
|| !UTF_IS_SECOND_SURROGATE(c2
=*(src
+1))) {
1477 break; /* unmatched first surrogate: counts as a true starter */
1479 norm32
=_getNorm32FromSurrogatePair(norm32
, c2
);
1481 if((norm32
&ccOrQCMask
)==0) {
1482 break; /* true starter */
1488 /* (c, c2) is not a true starter but its decomposition may be */
1489 if(norm32
&decompQCMask
) {
1490 /* (c, c2) decomposes, get everything from the variable-length extra data */
1491 p
=_decompose(norm32
, decompQCMask
, length
, cc
, trailCC
);
1493 /* get the first character's norm32 to check if it is a true starter */
1494 if(cc
==0 && (_getNorm32(p
, qcMask
)&qcMask
)==0) {
1495 break; /* true starter */
1499 src
+= c2
==0 ? 1 : 2; /* not a true starter, continue */
1505 /* make NFD & NFKD ---------------------------------------------------------- */
/*
 * unorm_getDecomposition(c, compat, dest, destCapacity): writes the
 * decomposition of the single code point c into dest.
 *
 * The work is only done when c is at most 0x10ffff, the data is available
 * (checked when UNORM_HARDCODE_DATA is off) and the buffer arguments are
 * consistent: either dest is non-NULL with a positive capacity, or the
 * capacity is 0.
 *
 * Visible cases after the norm32 lookup in normTrie:
 *  - no decomposition bit set for the chosen mask: c itself is written,
 *    as a surrogate pair when the capacity is at least 2,
 *  - Hangul syllable: decomposed arithmetically into Jamo L, V and T,
 *  - otherwise: the decomposition is fetched with the table-driven
 *    _decompose() overload and used when it fits in destCapacity.
 *
 * NOTE(review): the compat test that selects between the canonical and
 * compatibility masks, the arithmetic between the Hangul lines, and all
 * return values are missing from this listing.
 */
1507 U_CAPI
int32_t U_EXPORT2
1508 unorm_getDecomposition(UChar32 c
, UBool compat
,
1509 UChar
*dest
, int32_t destCapacity
) {
1510 #if !UNORM_HARDCODE_DATA
1511 UErrorCode errorCode
=U_ZERO_ERROR
;
1513 if( (uint32_t)c
<=0x10ffff &&
1514 #if !UNORM_HARDCODE_DATA
1515 _haveData(errorCode
) &&
1517 ((dest
!=NULL
&& destCapacity
>0) || destCapacity
==0)
1519 uint32_t norm32
, qcMask
;
1525 minNoMaybe
=(UChar32
)indexes
[_NORM_INDEX_MIN_NFD_NO_MAYBE
];
1526 qcMask
=_NORM_QC_NFD
;
1528 minNoMaybe
=(UChar32
)indexes
[_NORM_INDEX_MIN_NFKD_NO_MAYBE
];
1529 qcMask
=_NORM_QC_NFKD
;
1534 if(destCapacity
>0) {
1541 UTRIE_GET32(&normTrie
, c
, norm32
);
1542 if((norm32
&qcMask
)==0) {
1543 /* simple case: no decomposition */
1545 if(destCapacity
>0) {
1550 if(destCapacity
>=2) {
1551 dest
[0]=UTF16_LEAD(c
);
1552 dest
[1]=UTF16_TRAIL(c
);
1556 } else if(isNorm32HangulOrJamo(norm32
)) {
1557 /* Hangul syllable: decompose algorithmically */
1562 c2
=(UChar
)(c%JAMO_T_COUNT
);
1565 if(destCapacity
>=3) {
1566 dest
[2]=(UChar
)(JAMO_T_BASE
+c2
);
1573 if(destCapacity
>=2) {
1574 dest
[1]=(UChar
)(JAMO_V_BASE
+c%JAMO_V_COUNT
);
1575 dest
[0]=(UChar
)(JAMO_L_BASE
+c
/JAMO_V_COUNT
);
1579 /* c decomposes, get everything from the variable-length extra data */
1580 const UChar
*p
, *limit
;
1581 uint8_t cc
, trailCC
;
1583 p
=_decompose(norm32
, qcMask
, length
, cc
, trailCC
);
1584 if(length
<=destCapacity
) {
/*
 * _decompose(dest, destCapacity, src, srcLength, compat, nx, outTrailCC):
 * decomposes the string src into dest and keeps the output canonically
 * ordered. nx is consulted through nx_contains() to leave excluded code
 * points undecomposed.
 *
 * Visible structure:
 *  1. choose minNoMaybe and qcMask for canonical or compatibility
 *     decomposition; ccOrQCMask adds the combining-class bits,
 *  2. fast loop: skip code units below minNoMaybe or with no relevant bits,
 *     for both the length-delimited and the zero-terminated form, then copy
 *     that run with uprv_memcpy when it fits,
 *  3. slow path for one code point: Hangul is decomposed arithmetically
 *     into a local buffer, lead surrogates are paired up, excluded or
 *     non-decomposing code points are kept, all others go through the
 *     table-driven _decompose() overload,
 *  4. append: when the new lead cc is non-zero and lower than prevCC the
 *     text is reordered with _insertOrdered() or _mergeOrdered(), otherwise
 *     the code units are appended directly,
 *  5. on overflow destIndex keeps counting so the caller can preflight.
 *
 * NOTE(review): many lines are missing from this listing (loop constructs,
 * the compat test, buffer declaration, prevCC updates, the return of
 * destIndex and the store to outTrailCC) - confirm against the full source.
 */
1598 _decompose(UChar
*dest
, int32_t destCapacity
,
1599 const UChar
*src
, int32_t srcLength
,
1600 UBool compat
, const UnicodeSet
*nx
,
1601 uint8_t &outTrailCC
) {
1603 const UChar
*limit
, *prevSrc
, *p
;
1604 uint32_t norm32
, ccOrQCMask
, qcMask
;
1605 int32_t destIndex
, reorderStartIndex
, length
;
1606 UChar c
, c2
, minNoMaybe
;
1607 uint8_t cc
, prevCC
, trailCC
;
1610 minNoMaybe
=(UChar
)indexes
[_NORM_INDEX_MIN_NFD_NO_MAYBE
];
1611 qcMask
=_NORM_QC_NFD
;
1613 minNoMaybe
=(UChar
)indexes
[_NORM_INDEX_MIN_NFKD_NO_MAYBE
];
1614 qcMask
=_NORM_QC_NFKD
;
1618 ccOrQCMask
=_NORM_CC_MASK
|qcMask
;
1619 destIndex
=reorderStartIndex
=0;
1622 /* avoid compiler warnings */
1629 /* string with length */
1630 limit
=src
+srcLength
;
1631 } else /* srcLength==-1 */ {
1632 /* zero-terminated string */
1639 /* count code units below the minimum or with irrelevant data for the quick check */
1642 while((c
=*src
)<minNoMaybe
? c
!=0 : ((norm32
=_getNorm32(c
))&ccOrQCMask
)==0) {
1647 while(src
!=limit
&& ((c
=*src
)<minNoMaybe
|| ((norm32
=_getNorm32(c
))&ccOrQCMask
)==0)) {
1653 /* copy these code units all at once */
1655 length
=(int32_t)(src
-prevSrc
);
1656 if((destIndex
+length
)<=destCapacity
) {
1657 uprv_memcpy(dest
+destIndex
, prevSrc
, length
*U_SIZEOF_UCHAR
);
1660 reorderStartIndex
=destIndex
;
1663 /* end of source reached? */
1664 if(limit
==NULL
? c
==0 : src
==limit
) {
1668 /* c already contains *src and norm32 is set for it, increment src */
1671 /* check one above-minimum, relevant code unit */
1673 * generally, set p and length to the decomposition string
1674 * in simple cases, p==NULL and (c, c2) will hold the length code units to append
1675 * in all cases, set cc to the lead and trailCC to the trail combining class
1677 * the following merge-sort of the current character into the preceding,
1678 * canonically ordered result text will use the optimized _insertOrdered()
1679 * if there is only one single code point to process;
1680 * this is indicated with p==NULL, and (c, c2) is the character to insert
1681 * ((c, 0) for a BMP character and (lead surrogate, trail surrogate)
1682 * for a supplementary character)
1683 * otherwise, p[length] is merged in with _mergeOrdered()
1685 if(isNorm32HangulOrJamo(norm32
)) {
1686 if(nx_contains(nx
, c
)) {
1691 /* Hangul syllable: decompose algorithmically */
1697 c2
=(UChar
)(c%JAMO_T_COUNT
);
1700 buffer
[2]=(UChar
)(JAMO_T_BASE
+c2
);
1706 buffer
[1]=(UChar
)(JAMO_V_BASE
+c%JAMO_V_COUNT
);
1707 buffer
[0]=(UChar
)(JAMO_L_BASE
+c
/JAMO_V_COUNT
);
1710 if(isNorm32Regular(norm32
)) {
1714 /* c is a lead surrogate, get the real norm32 */
1715 if(src
!=limit
&& UTF_IS_SECOND_SURROGATE(c2
=*src
)) {
1718 norm32
=_getNorm32FromSurrogatePair(norm32
, c2
);
1726 /* get the decomposition and the lead and trail cc's */
1727 if(nx_contains(nx
, c
, c2
)) {
1728 /* excluded: norm32==0 */
1731 } else if((norm32
&qcMask
)==0) {
1732 /* c does not decompose */
1733 cc
=trailCC
=(uint8_t)(norm32
>>_NORM_CC_SHIFT
);
1736 /* c decomposes, get everything from the variable-length extra data */
1737 p
=_decompose(norm32
, qcMask
, length
, cc
, trailCC
);
1739 /* fastpath a single code unit from decomposition */
1747 /* append the decomposition to the destination buffer, assume length>0 */
1748 if((destIndex
+length
)<=destCapacity
) {
1749 UChar
*reorderSplit
=dest
+destIndex
;
1751 /* fastpath: single code point */
1752 if(cc
!=0 && cc
<prevCC
) {
1753 /* (c, c2) is out of order with respect to the preceding text */
1755 trailCC
=_insertOrdered(dest
+reorderStartIndex
, reorderSplit
, dest
+destIndex
, c
, c2
, cc
);
1757 /* just append (c, c2) */
1758 dest
[destIndex
++]=c
;
1760 dest
[destIndex
++]=c2
;
1764 /* general: multiple code points (ordered by themselves) from decomposition */
1765 if(cc
!=0 && cc
<prevCC
) {
1766 /* the decomposition is out of order with respect to the preceding text */
1768 trailCC
=_mergeOrdered(dest
+reorderStartIndex
, reorderSplit
, p
, p
+length
);
1770 /* just append the decomposition */
1772 dest
[destIndex
++]=*p
++;
1773 } while(--length
>0);
1777 /* buffer overflow */
1778 /* keep incrementing the destIndex for preflighting */
1784 reorderStartIndex
=destIndex
;
/*
 * unorm_decompose(dest, destCapacity, src, srcLength, compat, options,
 * pErrorCode): public entry point for decomposition.
 *
 * Visible flow: check that the data is available, obtain the exclusion set
 * nx from getNX(options) and stop on failure, run the internal _decompose(),
 * and return through u_terminateUChars() with the resulting destIndex.
 *
 * NOTE(review): the bodies of the two early-exit branches and the remaining
 * arguments of the _decompose() call are not visible in this listing.
 */
1792 U_CAPI
int32_t U_EXPORT2
1793 unorm_decompose(UChar
*dest
, int32_t destCapacity
,
1794 const UChar
*src
, int32_t srcLength
,
1795 UBool compat
, int32_t options
,
1796 UErrorCode
*pErrorCode
) {
1797 const UnicodeSet
*nx
;
1801 if(!_haveData(*pErrorCode
)) {
1805 nx
=getNX(options
, *pErrorCode
);
1806 if(U_FAILURE(*pErrorCode
)) {
1810 destIndex
=_decompose(dest
, destCapacity
,
1815 return u_terminateUChars(dest
, destCapacity
, destIndex
, pErrorCode
);
1818 /* make NFC & NFKC ---------------------------------------------------------- */
/*
 * _getNextCombining(p, limit, c, c2, combiningIndex, cc, nx): reads the next
 * code point at p and reports its composition properties.
 *
 * Outputs (all by reference): c and c2 receive the code unit(s),
 * combiningIndex the index taken from the extra data, cc the combining
 * class. The return value is the _NORM_COMBINES_ANY bits of norm32, or 0
 * for a code point contained in nx.
 *
 * Visible cases: code points with neither cc nor combining bits take an
 * early path; Hangul/Jamo get a combiningIndex of 0xfff0 or-ed with the
 * norm32 extra bits; a lead surrogate followed by a trail surrogate is
 * resolved with _getNorm32FromSurrogatePair().
 *
 * NOTE(review): the read of c from p, the preset of the outputs and some
 * returns are missing from this listing.
 */
1820 /* get the composition properties of the next character */
1821 static inline uint32_t
1822 _getNextCombining(UChar
*&p
, const UChar
*limit
,
1823 UChar
&c
, UChar
&c2
,
1824 uint16_t &combiningIndex
, uint8_t &cc
,
1825 const UnicodeSet
*nx
) {
1826 uint32_t norm32
, combineFlags
;
1828 /* get properties */
1830 norm32
=_getNorm32(c
);
1832 /* preset output values for most characters */
1837 if((norm32
&(_NORM_CC_MASK
|_NORM_COMBINES_ANY
))==0) {
1840 if(isNorm32Regular(norm32
)) {
1841 /* set cc etc. below */
1842 } else if(isNorm32HangulOrJamo(norm32
)) {
1843 /* a compatibility decomposition contained Jamos */
1844 combiningIndex
=(uint16_t)(0xfff0|(norm32
>>_NORM_EXTRA_SHIFT
));
1845 return norm32
&_NORM_COMBINES_ANY
;
1847 /* c is a lead surrogate, get the real norm32 */
1848 if(p
!=limit
&& UTF_IS_SECOND_SURROGATE(c2
=*p
)) {
1850 norm32
=_getNorm32FromSurrogatePair(norm32
, c2
);
1857 if(nx_contains(nx
, c
, c2
)) {
1858 return 0; /* excluded: norm32==0 */
1861 cc
=(uint8_t)(norm32
>>_NORM_CC_SHIFT
);
1863 combineFlags
=norm32
&_NORM_COMBINES_ANY
;
1864 if(combineFlags
!=0) {
/* the combining index is the 16-bit unit right before the extra data */
1865 combiningIndex
=*(_getExtraData(norm32
)-1);
1867 return combineFlags
;
1872 * given a composition-result starter (c, c2) - which means its cc==0,
1873 * it combines forward, it has extra data, its norm32!=0,
1874 * it is not a Hangul or Jamo,
1875 * get just its combineFwdIndex
1877 * norm32(c) is special if and only if c2!=0
/*
 * _getCombiningIndexFromStarter(c, c2): returns the combining index of the
 * starter (c, c2), read from the 16-bit unit right before its extra data.
 *
 * norm32 is looked up for c and, for a surrogate pair, refined with
 * _getNorm32FromSurrogatePair().
 *
 * NOTE(review): the declaration of norm32 and the test on c2 that guards
 * the surrogate-pair lookup are not visible in this listing.
 */
1879 static inline uint16_t
1880 _getCombiningIndexFromStarter(UChar c
, UChar c2
) {
1883 norm32
=_getNorm32(c
);
1885 norm32
=_getNorm32FromSurrogatePair(norm32
, c2
);
1887 return *(_getExtraData(norm32
)-1);
1891 * Find the recomposition result for
1892 * a forward-combining character
1893 * (specified with a pointer to its part of the combiningTable[])
1894 * and a backward-combining character
1895 * (specified with its combineBackIndex).
1897 * If these two characters combine, then set (value, value2)
1898 * with the code unit(s) of the composition character.
1903 * >1 combine, and the composition is a forward-combining starter
1905 * See unormimp.h for a description of the composition table format.
/*
 * _combine(table, combineBackIndex, value, value2): searches the composition
 * list of a forward-combining starter (pointed to by table) for an entry
 * matching combineBackIndex.
 *
 * Visible flow: the list is walked until a key at or above combineBackIndex
 * is found; entries are skipped by 1 or 2 units depending on bit 15 of the
 * unit at table. A match is tested with bit 15 of the key masked off. On a
 * match, value and value2 receive the code unit(s) of the composition,
 * decoded from a variable-length result (surrogate pair, BMP from U+2000,
 * or BMP below U+2000), and key is rebuilt from bit 0x2000 of value plus 1.
 *
 * NOTE(review): the loop construct, the reads of key and value and the
 * return statements are missing from this listing.
 */
1907 static inline uint16_t
1908 _combine(const uint16_t *table
, uint16_t combineBackIndex
,
1909 uint16_t &value
, uint16_t &value2
) {
1912 /* search in the starter's composition table */
1915 if(key
>=combineBackIndex
) {
1918 table
+= *table
&0x8000 ? 2 : 1;
1921 /* mask off bit 15, the last-entry-in-the-list flag */
1922 if((key
&0x7fff)==combineBackIndex
) {
1923 /* found! combine! */
1926 /* is the composition a starter that combines forward? */
1927 key
=(uint16_t)((value
&0x2000)+1);
1929 /* get the composition result code point from the variable-length result value */
1932 /* surrogate pair composition result */
1933 value
=(uint16_t)((value
&0x3ff)|0xd800);
1936 /* BMP composition result U+2000..U+ffff */
1941 /* BMP composition result U+0000..U+1fff */
/*
 * _composeHangul(prev, c, norm32, src, limit, compat, dest, nx): tries to
 * compose the Jamo c with the preceding character prev into a Hangul
 * syllable.
 *
 * Visible cases:
 *  - c is a Jamo V and prev is a Jamo L: an LV syllable is computed, then
 *    the next character is checked for a Jamo T; under compat a character
 *    whose compatibility decomposition is a single Jamo T also qualifies,
 *  - c is a Jamo T and prev is a Hangul syllable without a Jamo T: the T is
 *    added to prev.
 * In both cases a result contained in nx is rejected, and in the first case
 * src is stepped back when a Jamo T had already been consumed.
 *
 * NOTE(review): the return type, the reads of the next character, the
 * stores through dest and all returns are missing from this listing.
 */
1954 _composeHangul(UChar prev
, UChar c
, uint32_t norm32
, const UChar
*&src
, const UChar
*limit
,
1955 UBool compat
, UChar
*dest
, const UnicodeSet
*nx
) {
1956 if(isJamoVTNorm32JamoV(norm32
)) {
1957 /* c is a Jamo V, compose with previous Jamo L and following Jamo T */
1958 prev
=(UChar
)(prev
-JAMO_L_BASE
);
1959 if(prev
<JAMO_L_COUNT
) {
1960 c
=(UChar
)(HANGUL_BASE
+(prev
*JAMO_V_COUNT
+(c
-JAMO_V_BASE
))*JAMO_T_COUNT
);
1962 /* check if the next character is a Jamo T (normal or compatibility) */
1967 if((t
=(UChar
)(next
-JAMO_T_BASE
))<JAMO_T_COUNT
) {
1972 /* if NFKC, then check for compatibility Jamo T (BMP only) */
1973 norm32
=_getNorm32(next
);
1974 if(isNorm32Regular(norm32
) && (norm32
&_NORM_QC_NFKD
)) {
1977 uint8_t cc
, trailCC
;
1979 p
=_decompose(norm32
, _NORM_QC_NFKD
, length
, cc
, trailCC
);
1980 if(length
==1 && (t
=(UChar
)(*p
-JAMO_T_BASE
))<JAMO_T_COUNT
) {
1981 /* compatibility Jamo T */
1988 if(nx_contains(nx
, c
)) {
1989 if(!isHangulWithoutJamoT(c
)) {
1990 --src
; /* undo ++src from reading the Jamo T */
1999 } else if(isHangulWithoutJamoT(prev
)) {
2000 /* c is a Jamo T, compose with previous Hangul LV that does not contain a Jamo T */
2001 c
=(UChar
)(prev
+(c
-JAMO_T_BASE
));
2002 if(nx_contains(nx
, c
)) {
2014 * recompose the characters in [p..limit[
2015 * (which is in NFD - decomposed and canonically ordered),
2016 * adjust limit, and return the trailing cc
2018 * since for NFKC we may get Jamos in decompositions, we need to
2019 * recompose those too
2021 * note that recomposition never lengthens the text:
2022 * any character consists of either one or two code units;
2023 * a composition may contain at most one more code unit than the original starter,
2024 * while the combining mark that is removed has at least one code unit
/*
 * _recompose(p, limit, options, nx): recomposes the decomposed, canonically
 * ordered text [p..limit[ in place. limit is passed by reference and is
 * adjusted as code units are removed.
 *
 * Visible structure of each step:
 *  1. _getNextCombining() fetches the next code point with its combining
 *     flags, back-combining index and cc,
 *  2. if it combines backward and a starter is known:
 *     - bit 15 of combineBackIndex marks a Jamo V or T, handled
 *       arithmetically (L+V, optionally followed by T; or LV+T), subject
 *       to the UNORM_BEFORE_PRI_29 option or prevCC==0, and to nx,
 *     - otherwise _combine() is tried when the starter is not Hangul/Jamo
 *       and the mark is not blocked; the blocking test differs with
 *       UNORM_BEFORE_PRI_29 (prevCC!=cc versus prevCC<cc); a result in nx
 *       is rejected,
 *     - on success the starter is overwritten with the composition, text is
 *       shifted when the length in code units changes, and the combining
 *       mark is removed,
 *  3. if nothing combined, a code point that combines forward becomes the
 *     new starter; with _NORM_OPTIONS_COMPOSE_CONTIGUOUS any intervening
 *     character blocks further composition.
 *
 * NOTE(review): the return type, the loop construct, declarations of c, c2,
 * cc and prevCC, the shifting loops and the returns are missing from this
 * listing - confirm against the full source.
 */
2027 _recompose(UChar
*p
, UChar
*&limit
, int32_t options
, const UnicodeSet
*nx
) {
2028 UChar
*starter
, *pRemove
, *q
, *r
;
2029 uint32_t combineFlags
;
2031 uint16_t combineFwdIndex
, combineBackIndex
;
2032 uint16_t result
, value
, value2
;
2034 UBool starterIsSupplementary
;
2036 starter
=NULL
; /* no starter */
2037 combineFwdIndex
=0; /* will not be used until starter!=NULL - avoid compiler warnings */
2038 combineBackIndex
=0; /* will always be set if combineFlags!=0 - avoid compiler warnings */
2039 value
=value2
=0; /* always set by _combine() before used - avoid compiler warnings */
2040 starterIsSupplementary
=FALSE
; /* will not be used until starter!=NULL - avoid compiler warnings */
2044 combineFlags
=_getNextCombining(p
, limit
, c
, c2
, combineBackIndex
, cc
, nx
);
2045 if((combineFlags
&_NORM_COMBINES_BACK
) && starter
!=NULL
) {
2046 if(combineBackIndex
&0x8000) {
2047 /* c is a Jamo V/T, see if we can compose it with the previous character */
2048 /* for the PRI #29 fix, check that there is no intervening combining mark */
2049 if((options
&UNORM_BEFORE_PRI_29
) || prevCC
==0) {
2050 pRemove
=NULL
; /* NULL while no Hangul composition */
2053 if(combineBackIndex
==0xfff2) {
2054 /* Jamo V, compose with previous Jamo L and following Jamo T */
2055 c2
=(UChar
)(c2
-JAMO_L_BASE
);
2056 if(c2
<JAMO_L_COUNT
) {
2058 c
=(UChar
)(HANGUL_BASE
+(c2
*JAMO_V_COUNT
+(c
-JAMO_V_BASE
))*JAMO_T_COUNT
);
2059 if(p
!=limit
&& (c2
=(UChar
)(*p
-JAMO_T_BASE
))<JAMO_T_COUNT
) {
2063 /* the result is an LV syllable, which is a starter (unlike LVT) */
2064 combineFlags
=_NORM_COMBINES_FWD
;
2066 if(!nx_contains(nx
, c
)) {
2070 if(!isHangulWithoutJamoT(c
)) {
2071 --p
; /* undo the ++p from reading the Jamo T */
2073 /* c is modified but not used any more -- c=*(p-1); -- re-read the Jamo V/T */
2079 * Normally, the following can not occur:
2080 * Since the input is in NFD, there are no Hangul LV syllables that
2081 * a Jamo T could combine with.
2082 * All Jamo Ts are combined above when handling Jamo Vs.
2084 * However, before the PRI #29 fix, this can occur due to
2085 * an intervening combining mark between the Hangul LV and the Jamo T.
2088 /* Jamo T, compose with previous Hangul that does not have a Jamo T */
2089 if(isHangulWithoutJamoT(c2
)) {
2090 c2
+=(UChar
)(c
-JAMO_T_BASE
);
2091 if(!nx_contains(nx
, c2
)) {
2099 /* remove the Jamo(s) */
2109 c2
=0; /* c2 held *starter temporarily */
2111 if(combineFlags
!=0) {
2113 * not starter=NULL because the composition is a Hangul LV syllable
2114 * and might combine once more (but only before the PRI #29 fix)
2122 /* the composition is a Hangul LV syllable which is a starter that combines forward */
2123 combineFwdIndex
=0xfff0;
2125 /* we combined; continue with looking for compositions */
2131 * now: cc==0 and the combining index does not include "forward" ->
2132 * the rest of the loop body will reset starter to NULL;
2133 * technically, a composed Hangul syllable is a starter, but it
2134 * does not combine forward now that we have consumed all eligible Jamos;
2135 * for Jamo V/T, combineFlags does not contain _NORM_COMBINES_FWD
2139 /* the starter is not a Hangul LV or Jamo V/T and */
2140 !(combineFwdIndex
&0x8000) &&
2141 /* the combining mark is not blocked and */
2142 ((options
&UNORM_BEFORE_PRI_29
) ?
2143 (prevCC
!=cc
|| prevCC
==0) :
2144 (prevCC
<cc
|| prevCC
==0)) &&
2145 /* the starter and the combining mark (c, c2) do combine and */
2146 0!=(result
=_combine(combiningTable
+combineFwdIndex
, combineBackIndex
, value
, value2
)) &&
2147 /* the composition result is not excluded */
2148 !nx_contains(nx
, value
, value2
)
2150 /* replace the starter with the composition, remove the combining mark */
2151 pRemove
= c2
==0 ? p
-1 : p
-2; /* pointer to the combining mark */
2153 /* replace the starter with the composition */
2154 *starter
=(UChar
)value
;
2155 if(starterIsSupplementary
) {
2157 /* both are supplementary */
2158 *(starter
+1)=(UChar
)value2
;
2160 /* the composition is shorter than the starter, move the intermediate characters forward one */
2161 starterIsSupplementary
=FALSE
;
2169 } else if(value2
!=0) {
2170 /* the composition is longer than the starter, move the intermediate characters back one */
2171 starterIsSupplementary
=TRUE
;
2172 ++starter
; /* temporarily increment for the loop boundary */
2178 *starter
=(UChar
)value2
;
2179 --starter
; /* undo the temporary increment */
2180 /* } else { both are on the BMP, nothing more to do */
2183 /* remove the combining mark by moving the following text over it */
2194 /* keep prevCC because we removed the combining mark */
2201 /* is the composition a starter that combines forward? */
2203 combineFwdIndex
=_getCombiningIndexFromStarter((UChar
)value
, (UChar
)value2
);
2208 /* we combined; continue with looking for compositions */
2213 /* no combination this time */
2219 /* if (c, c2) did not combine, then check if it is a starter */
2221 /* found a new starter; combineFlags==0 if (c, c2) is excluded */
2222 if(combineFlags
&_NORM_COMBINES_FWD
) {
2223 /* it may combine with something, prepare for it */
2225 starterIsSupplementary
=FALSE
;
2228 starterIsSupplementary
=TRUE
;
2231 combineFwdIndex
=combineBackIndex
;
2233 /* it will not combine with anything */
2236 } else if(options
&_NORM_OPTIONS_COMPOSE_CONTIGUOUS
) {
2237 /* FCC: no discontiguous compositions; any intervening character blocks */
/*
 * _composePart(stackBuffer, buffer, bufferCapacity, length, prevStarter,
 * src, ..., options, nx, pErrorCode): decomposes the source range
 * [prevStarter..src[ into the side buffer and recomposes it there.
 *
 * Visible flow:
 *  - compat is derived from the _NORM_OPTIONS_COMPAT bit of options,
 *  - a first _decompose() call reports the needed length; if it exceeds
 *    bufferCapacity the buffer is grown with u_growBufferFromStatic() to
 *    twice that length (U_MEMORY_ALLOCATION_ERROR on failure) and the
 *    decomposition is repeated,
 *  - _recompose() runs over [buffer..buffer+length[ and may move the limit,
 *  - length (by reference) is set to the recomposed length.
 *
 * NOTE(review): one parameter line, the trailing arguments of the
 * _decompose() calls and the return statements are missing from this
 * listing.
 */
2243 /* decompose and recompose [prevStarter..src[ */
2244 static const UChar
*
2245 _composePart(UChar
*stackBuffer
, UChar
*&buffer
, int32_t &bufferCapacity
, int32_t &length
,
2246 const UChar
*prevStarter
, const UChar
*src
,
2248 int32_t options
, const UnicodeSet
*nx
,
2249 UErrorCode
*pErrorCode
) {
2250 UChar
*recomposeLimit
;
2254 compat
=(UBool
)((options
&_NORM_OPTIONS_COMPAT
)!=0);
2256 /* decompose [prevStarter..src[ */
2257 length
=_decompose(buffer
, bufferCapacity
,
2258 prevStarter
, (int32_t)(src
-prevStarter
),
/* the first pass only measured; grow the buffer and decompose again */
2261 if(length
>bufferCapacity
) {
2262 if(!u_growBufferFromStatic(stackBuffer
, &buffer
, &bufferCapacity
, 2*length
, 0)) {
2263 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
2266 length
=_decompose(buffer
, bufferCapacity
,
2267 prevStarter
, (int32_t)(src
-prevStarter
),
2272 /* recompose the decomposition */
2273 recomposeLimit
=buffer
+length
;
2275 prevCC
=_recompose(buffer
, recomposeLimit
, options
, nx
);
2278 /* return with a pointer to the recomposition and its length */
2279 length
=(int32_t)(recomposeLimit
-buffer
);
/*
 * _compose(dest, destCapacity, src, srcLength, options, nx, pErrorCode):
 * composes the string src into dest. The _NORM_OPTIONS_COMPAT bit of
 * options selects the compatibility masks.
 *
 * Visible structure:
 *  1. choose minNoMaybe and qcMask from options; ccOrQCMask adds cc bits;
 *     the side buffer starts out as stackBuffer,
 *  2. fast loop: skip code units below minNoMaybe or with no relevant bits
 *     (length-delimited and zero-terminated forms), copy the run with
 *     uprv_memcpy when it fits, and remember prevStarter as the last
 *     character of the run (stepping back over a surrogate pair),
 *  3. slow path for one code point:
 *     - Jamo V/T: try _composeHangul() with the previous source character,
 *     - lead surrogate: pair up and get the real norm32,
 *     - excluded or quick-check yes: keep the code point, taking cc from
 *       norm32,
 *     - otherwise: find the surrounding true starters, back destIndex out
 *       by the already copied units, call _composePart() and append its
 *       result,
 *  4. a single code point is appended directly, or through _insertOrdered()
 *     when its non-zero cc is lower than prevCC,
 *  5. on overflow destIndex keeps counting for preflighting; on an error
 *     from _composePart() destIndex is reset to 0,
 *  6. at the end a check on buffer!=stackBuffer handles a heap-grown buffer.
 *
 * NOTE(review): many lines are missing from this listing (return type, loop
 * constructs, declarations of buffer, p, cc and prevCC, several assignments
 * and the return) - confirm against the full source.
 */
2284 _compose(UChar
*dest
, int32_t destCapacity
,
2285 const UChar
*src
, int32_t srcLength
,
2286 int32_t options
, const UnicodeSet
*nx
,
2287 UErrorCode
*pErrorCode
) {
2288 UChar stackBuffer
[_STACK_BUFFER_CAPACITY
];
2290 int32_t bufferCapacity
;
2292 const UChar
*limit
, *prevSrc
, *prevStarter
;
2293 uint32_t norm32
, ccOrQCMask
, qcMask
;
2294 int32_t destIndex
, reorderStartIndex
, length
;
2295 UChar c
, c2
, minNoMaybe
;
2298 if(options
&_NORM_OPTIONS_COMPAT
) {
2299 minNoMaybe
=(UChar
)indexes
[_NORM_INDEX_MIN_NFKC_NO_MAYBE
];
2300 qcMask
=_NORM_QC_NFKC
;
2302 minNoMaybe
=(UChar
)indexes
[_NORM_INDEX_MIN_NFC_NO_MAYBE
];
2303 qcMask
=_NORM_QC_NFC
;
2308 bufferCapacity
=_STACK_BUFFER_CAPACITY
;
2311 * prevStarter points to the last character before the current one
2312 * that is a "true" starter with cc==0 and quick check "yes".
2314 * prevStarter will be used instead of looking for a true starter
2315 * while incrementally decomposing [prevStarter..prevSrc[
2316 * in _composePart(). Having a good prevStarter allows to just decompose
2317 * the entire [prevStarter..prevSrc[.
2319 * When _composePart() backs out from prevSrc back to prevStarter,
2320 * then it also backs out destIndex by the same amount.
2321 * Therefore, at all times, the (prevSrc-prevStarter) source units
2322 * must correspond 1:1 to destination units counted with destIndex,
2323 * except for reordering.
2324 * This is true for the qc "yes" characters copied in the fast loop,
2325 * and for pure reordering.
2326 * prevStarter must be set forward to src when this is not true:
2327 * In _composePart() and after composing a Hangul syllable.
2329 * This mechanism relies on the assumption that the decomposition of a true starter
2330 * also begins with a true starter. gennorm/store.c checks for this.
2334 ccOrQCMask
=_NORM_CC_MASK
|qcMask
;
2335 destIndex
=reorderStartIndex
=0;
2338 /* avoid compiler warnings */
2343 /* string with length */
2344 limit
=src
+srcLength
;
2345 } else /* srcLength==-1 */ {
2346 /* zero-terminated string */
2353 /* count code units below the minimum or with irrelevant data for the quick check */
2356 while((c
=*src
)<minNoMaybe
? c
!=0 : ((norm32
=_getNorm32(c
))&ccOrQCMask
)==0) {
2361 while(src
!=limit
&& ((c
=*src
)<minNoMaybe
|| ((norm32
=_getNorm32(c
))&ccOrQCMask
)==0)) {
2367 /* copy these code units all at once */
2369 length
=(int32_t)(src
-prevSrc
);
2370 if((destIndex
+length
)<=destCapacity
) {
2371 uprv_memcpy(dest
+destIndex
, prevSrc
, length
*U_SIZEOF_UCHAR
);
2374 reorderStartIndex
=destIndex
;
2376 /* set prevStarter to the last character in the quick check loop */
2378 if(UTF_IS_SECOND_SURROGATE(*prevStarter
) && prevSrc
<prevStarter
&& UTF_IS_FIRST_SURROGATE(*(prevStarter
-1))) {
2385 /* end of source reached? */
2386 if(limit
==NULL
? c
==0 : src
==limit
) {
2390 /* c already contains *src and norm32 is set for it, increment src */
2394 * source buffer pointers:
2396 * all done quick check current char not yet
2397 * "yes" but (c, c2) processed
2400 * [-------------[-------------[-------------[-------------[
2402 * start prevStarter prevSrc src limit
2405 * destination buffer pointers and indexes:
2407 * all done might take not filled yet
2410 * [-------------[-------------[-------------[
2412 * dest reorderStartIndex destIndex destCapacity
2415 /* check one above-minimum, relevant code unit */
2417 * norm32 is for c=*(src-1), and the quick check flag is "no" or "maybe", and/or cc!=0
2418 * check for Jamo V/T, then for surrogates and regular characters
2419 * c is not a Hangul syllable or Jamo L because
2420 * they are not marked with no/maybe for NFC & NFKC (and their cc==0)
2422 if(isNorm32HangulOrJamo(norm32
)) {
2425 * try to compose with the previous character, Jamo V also with a following Jamo T,
2426 * and set values here right now in case we just continue with the main loop
2429 reorderStartIndex
=destIndex
;
2434 *(prevSrc
-1), c
, norm32
, src
, limit
, (UBool
)((options
&_NORM_OPTIONS_COMPAT
)!=0),
2435 destIndex
<=destCapacity
? dest
+(destIndex
-1) : 0,
2442 /* the Jamo V/T did not compose into a Hangul syllable, just append to dest */
2445 prevStarter
=prevSrc
;
2447 if(isNorm32Regular(norm32
)) {
2451 /* c is a lead surrogate, get the real norm32 */
2452 if(src
!=limit
&& UTF_IS_SECOND_SURROGATE(c2
=*src
)) {
2455 norm32
=_getNorm32FromSurrogatePair(norm32
, c2
);
2457 /* c is an unpaired lead surrogate, nothing to do */
2464 /* we are looking at the character (c, c2) at [prevSrc..src[ */
2465 if(nx_contains(nx
, c
, c2
)) {
2466 /* excluded: norm32==0 */
2468 } else if((norm32
&qcMask
)==0) {
2469 cc
=(uint8_t)(norm32
>>_NORM_CC_SHIFT
);
2472 uint32_t decompQCMask
;
2475 * find appropriate boundaries around this character,
2476 * decompose the source text from between the boundaries,
2479 * this puts the intermediate text into the side buffer because
2480 * it might be longer than the recomposition end result,
2481 * or the destination buffer may be too short or missing
2483 * note that destIndex may be adjusted backwards to account
2484 * for source text that passed the quick check but needed to
2485 * take part in the recomposition
2487 decompQCMask
=(qcMask
<<2)&0xf; /* decomposition quick check mask */
2490 * find the last true starter in [prevStarter..src[
2491 * it is either the decomposition of the current character (at prevSrc),
2494 if(_isTrueStarter(norm32
, ccOrQCMask
, decompQCMask
)) {
2495 prevStarter
=prevSrc
;
2497 /* adjust destIndex: back out what had been copied with qc "yes" */
2498 destIndex
-=(int32_t)(prevSrc
-prevStarter
);
2501 /* find the next true starter in [src..limit[ - modifies src to point to the next starter */
2502 src
=_findNextStarter(src
, limit
, qcMask
, decompQCMask
, minNoMaybe
);
2504 /* compose [prevStarter..src[ */
2505 p
=_composePart(stackBuffer
, buffer
, bufferCapacity
,
2506 length
, /* output */
2508 prevCC
, /* output */
2513 destIndex
=0; /* an error occurred (out of memory) */
2517 /* append the recomposed buffer contents to the destination buffer */
2518 if((destIndex
+length
)<=destCapacity
) {
2520 dest
[destIndex
++]=*p
++;
2524 /* buffer overflow */
2525 /* keep incrementing the destIndex for preflighting */
2529 /* set the next starter */
2536 /* append the single code point (c, c2) to the destination buffer */
2537 if((destIndex
+length
)<=destCapacity
) {
2538 if(cc
!=0 && cc
<prevCC
) {
2539 /* (c, c2) is out of order with respect to the preceding text */
2540 UChar
*reorderSplit
=dest
+destIndex
;
2542 prevCC
=_insertOrdered(dest
+reorderStartIndex
, reorderSplit
, dest
+destIndex
, c
, c2
, cc
);
2544 /* just append (c, c2) */
2545 dest
[destIndex
++]=c
;
2547 dest
[destIndex
++]=c2
;
2552 /* buffer overflow */
2553 /* keep incrementing the destIndex for preflighting */
2560 if(buffer
!=stackBuffer
) {
/*
 * unorm_compose(dest, destCapacity, src, srcLength, compat, options,
 * pErrorCode): public entry point for composition.
 *
 * Visible flow: check that the data is available, obtain the exclusion set
 * nx from getNX(options) and stop on failure, clear the option bits that
 * are reserved for internal use, set _NORM_OPTIONS_COMPAT again (presumably
 * only when compat is requested - the condition is not visible), run the
 * internal _compose(), and return through u_terminateUChars() with the
 * resulting destIndex.
 *
 * NOTE(review): the bodies of the early-exit branches and the remaining
 * arguments of the _compose() call are not visible in this listing.
 */
2567 U_CAPI
int32_t U_EXPORT2
2568 unorm_compose(UChar
*dest
, int32_t destCapacity
,
2569 const UChar
*src
, int32_t srcLength
,
2570 UBool compat
, int32_t options
,
2571 UErrorCode
*pErrorCode
) {
2572 const UnicodeSet
*nx
;
2575 if(!_haveData(*pErrorCode
)) {
2579 nx
=getNX(options
, *pErrorCode
);
2580 if(U_FAILURE(*pErrorCode
)) {
2584 /* reset options bits that should only be set here or inside _compose() */
2585 options
&=~(_NORM_OPTIONS_SETS_MASK
|_NORM_OPTIONS_COMPAT
|_NORM_OPTIONS_COMPOSE_CONTIGUOUS
);
2588 options
|=_NORM_OPTIONS_COMPAT
;
2591 destIndex
=_compose(dest
, destCapacity
,
2596 return u_terminateUChars(dest
, destCapacity
, destIndex
, pErrorCode
);
2599 /* make FCD ----------------------------------------------------------------- */
/*
 * _findSafeFCD(src, limit, fcd16): returns the first position in
 * [src..limit[ that is safe with respect to FCD data. On entry fcd16 is the
 * FCD value of the character before src.
 *
 * The scan stops when the previous character has trail cc 0 (low byte of
 * fcd16), at the end of the source, or before a character whose lead cc is
 * 0 (code unit below _NORM_MIN_WITH_LEAD_CC or an FCD value of 0; this also
 * catches a terminating NUL). A lead surrogate followed by a trail
 * surrogate is resolved with _getFCD16FromSurrogatePair(); an unpaired lead
 * surrogate counts as lead cc 0.
 *
 * NOTE(review): the loop construct, declarations of c and c2, the pointer
 * increments and the return are missing from this listing.
 */
2601 static const UChar
*
2602 _findSafeFCD(const UChar
*src
, const UChar
*limit
, uint16_t fcd16
) {
2606 * find the first position in [src..limit[ after some cc==0 according to FCD data
2608 * at the beginning of the loop, we have fcd16 from before src
2610 * stop at positions:
2611 * - after trail cc==0
2612 * - at the end of the source
2613 * - before lead cc==0
2616 /* stop if trail cc==0 for the previous character */
2617 if((fcd16
&0xff)==0) {
2621 /* get c=*src - stop at end of string */
2627 /* stop if lead cc==0 for this character */
2628 if(c
<_NORM_MIN_WITH_LEAD_CC
|| (fcd16
=_getFCD16(c
))==0) {
2629 break; /* catches terminating NUL, too */
2632 if(!UTF_IS_FIRST_SURROGATE(c
)) {
2637 } else if((src
+1)!=limit
&& (c2
=*(src
+1), UTF_IS_SECOND_SURROGATE(c2
))) {
2638 /* c is a lead surrogate, get the real fcd16 */
2639 fcd16
=_getFCD16FromSurrogatePair(fcd16
, c2
);
2645 /* c is an unpaired first surrogate, lead cc==0 */
2654 _decomposeFCD(const UChar
*src
, const UChar
*decompLimit
,
2655 UChar
*dest
, int32_t &destIndex
, int32_t destCapacity
,
2656 const UnicodeSet
*nx
) {
2659 int32_t reorderStartIndex
, length
;
2661 uint8_t cc
, prevCC
, trailCC
;
2664 * canonically decompose [src..decompLimit[
2666 * all characters in this range have some non-zero cc,
2667 * directly or in decomposition,
2668 * so that we do not need to check in the following for quick-check limits etc.
2670 * there _are_ _no_ Hangul syllables or Jamos in here because they are FCD-safe (cc==0)!
2672 * we also do not need to check for c==0 because we have an established decompLimit
2674 reorderStartIndex
=destIndex
;
2677 while(src
<decompLimit
) {
2679 norm32
=_getNorm32(c
);
2680 if(isNorm32Regular(norm32
)) {
2685 * reminder: this function is called with [src..decompLimit[
2686 * not containing any Hangul/Jamo characters,
2687 * therefore the only specials are lead surrogates
2689 /* c is a lead surrogate, get the real norm32 */
2690 if(src
!=decompLimit
&& UTF_IS_SECOND_SURROGATE(c2
=*src
)) {
2693 norm32
=_getNorm32FromSurrogatePair(norm32
, c2
);
2701 /* get the decomposition and the lead and trail cc's */
2702 if(nx_contains(nx
, c
, c2
)) {
2703 /* excluded: norm32==0 */
2706 } else if((norm32
&_NORM_QC_NFD
)==0) {
2707 /* c does not decompose */
2708 cc
=trailCC
=(uint8_t)(norm32
>>_NORM_CC_SHIFT
);
2711 /* c decomposes, get everything from the variable-length extra data */
2712 p
=_decompose(norm32
, length
, cc
, trailCC
);
2714 /* fastpath a single code unit from decomposition */
2721 /* append the decomposition to the destination buffer, assume length>0 */
2722 if((destIndex
+length
)<=destCapacity
) {
2723 UChar
*reorderSplit
=dest
+destIndex
;
2725 /* fastpath: single code point */
2726 if(cc
!=0 && cc
<prevCC
) {
2727 /* (c, c2) is out of order with respect to the preceding text */
2729 trailCC
=_insertOrdered(dest
+reorderStartIndex
, reorderSplit
, dest
+destIndex
, c
, c2
, cc
);
2731 /* just append (c, c2) */
2732 dest
[destIndex
++]=c
;
2734 dest
[destIndex
++]=c2
;
2738 /* general: multiple code points (ordered by themselves) from decomposition */
2739 if(cc
!=0 && cc
<prevCC
) {
2740 /* the decomposition is out of order with respect to the preceding text */
2742 trailCC
=_mergeOrdered(dest
+reorderStartIndex
, reorderSplit
, p
, p
+length
);
2744 /* just append the decomposition */
2746 dest
[destIndex
++]=*p
++;
2747 } while(--length
>0);
2751 /* buffer overflow */
2752 /* keep incrementing the destIndex for preflighting */
2758 reorderStartIndex
=destIndex
;
2766 unorm_makeFCD(UChar
*dest
, int32_t destCapacity
,
2767 const UChar
*src
, int32_t srcLength
,
2768 const UnicodeSet
*nx
,
2769 UErrorCode
*pErrorCode
) {
2770 const UChar
*limit
, *prevSrc
, *decompStart
;
2771 int32_t destIndex
, length
;
2776 if(!_haveData(*pErrorCode
)) {
2785 /* avoid compiler warnings */
2790 /* string with length */
2791 limit
=src
+srcLength
;
2792 } else /* srcLength==-1 */ {
2793 /* zero-terminated string */
2800 /* skip a run of code units below the minimum or with irrelevant data for the FCD check */
2805 if(c
<_NORM_MIN_WITH_LEAD_CC
) {
2810 } else if((fcd16
=_getFCD16(c
))==0) {
2821 } else if((c
=*src
)<_NORM_MIN_WITH_LEAD_CC
) {
2823 } else if((fcd16
=_getFCD16(c
))==0) {
2833 * prevCC has values from the following ranges:
2834 * 0..0xff - the previous trail combining class
2835 * <0 - the negative value of the previous code unit;
2836 * that code unit was <_NORM_MIN_WITH_LEAD_CC and its _getFCD16()
2837 * was deferred so that average text is checked faster
2840 /* copy these code units all at once */
2842 length
=(int32_t)(src
-prevSrc
);
2843 if((destIndex
+length
)<=destCapacity
) {
2844 uprv_memcpy(dest
+destIndex
, prevSrc
, length
*U_SIZEOF_UCHAR
);
2849 /* prevCC<0 is only possible from the above loop, i.e., only if prevSrc<src */
2851 /* the previous character was <_NORM_MIN_WITH_LEAD_CC, we need to get its trail cc */
2852 if(!nx_contains(nx
, (UChar32
)-prevCC
)) {
2853 prevCC
=(int16_t)(_getFCD16((UChar
)-prevCC
)&0xff);
2855 prevCC
=0; /* excluded: fcd16==0 */
2859 * set a pointer to this below-U+0300 character;
2860 * if prevCC==0 then it will be moved to after this character below
2862 decompStart
=prevSrc
-1;
2867 * prevSrc==src - used later to adjust destIndex before decomposition
2871 /* end of source reached? */
2872 if(limit
==NULL
? c
==0 : src
==limit
) {
2876 /* set a pointer to after the last source position where prevCC==0 */
2878 decompStart
=prevSrc
;
2881 /* c already contains *src and fcd16 is set for it, increment src */
2884 /* check one above-minimum, relevant code unit */
2885 if(UTF_IS_FIRST_SURROGATE(c
)) {
2886 /* c is a lead surrogate, get the real fcd16 */
2887 if(src
!=limit
&& UTF_IS_SECOND_SURROGATE(c2
=*src
)) {
2889 fcd16
=_getFCD16FromSurrogatePair(fcd16
, c2
);
2898 /* we are looking at the character (c, c2) at [prevSrc..src[ */
2899 if(nx_contains(nx
, c
, c2
)) {
2900 fcd16
=0; /* excluded: fcd16==0 */
2903 /* check the combining order, get the lead cc */
2904 cc
=(int16_t)(fcd16
>>8);
2905 if(cc
==0 || cc
>=prevCC
) {
2906 /* the order is ok */
2908 decompStart
=prevSrc
;
2910 prevCC
=(int16_t)(fcd16
&0xff);
2912 /* just append (c, c2) */
2913 length
= c2
==0 ? 1 : 2;
2914 if((destIndex
+length
)<=destCapacity
) {
2915 dest
[destIndex
++]=c
;
2917 dest
[destIndex
++]=c2
;
2924 * back out the part of the source that we copied already but
2925 * is now going to be decomposed;
2926 * prevSrc is set to after what was copied
2928 destIndex
-=(int32_t)(prevSrc
-decompStart
);
2931 * find the part of the source that needs to be decomposed;
2932 * to be safe and simple, decompose to before the next character with lead cc==0
2934 src
=_findSafeFCD(src
, limit
, fcd16
);
2937 * the source text does not fulfill the conditions for FCD;
2938 * decompose and reorder a limited piece of the text
2940 prevCC
=_decomposeFCD(decompStart
, src
,
2941 dest
, destIndex
, destCapacity
,
2947 return u_terminateUChars(dest
, destCapacity
, destIndex
, pErrorCode
);
2950 /* quick check functions ---------------------------------------------------- */
2953 unorm_checkFCD(const UChar
*src
, int32_t srcLength
, const UnicodeSet
*nx
) {
2963 /* string with length */
2964 limit
=src
+srcLength
;
2965 } else /* srcLength==-1 */ {
2966 /* zero-terminated string */
2973 /* skip a run of code units below the minimum or with irrelevant data for the FCD check */
2977 if(c
<_NORM_MIN_WITH_LEAD_CC
) {
2982 * delay _getFCD16(c) for any character <_NORM_MIN_WITH_LEAD_CC
2983 * because chances are good that the next one will have
2984 * a leading cc of 0;
2985 * _getFCD16(-prevCC) is later called when necessary -
2986 * -c fits into int16_t because it is <_NORM_MIN_WITH_LEAD_CC==0x300
2989 } else if((fcd16
=_getFCD16(c
))==0) {
2999 } else if((c
=*src
++)<_NORM_MIN_WITH_LEAD_CC
) {
3001 } else if((fcd16
=_getFCD16(c
))==0) {
3009 /* check one above-minimum, relevant code unit */
3010 if(UTF_IS_FIRST_SURROGATE(c
)) {
3011 /* c is a lead surrogate, get the real fcd16 */
3012 if(src
!=limit
&& UTF_IS_SECOND_SURROGATE(c2
=*src
)) {
3014 fcd16
=_getFCD16FromSurrogatePair(fcd16
, c2
);
3023 if(nx_contains(nx
, c
, c2
)) {
3024 prevCC
=0; /* excluded: fcd16==0 */
3029 * prevCC has values from the following ranges:
3030 * 0..0xff - the previous trail combining class
3031 * <0 - the negative value of the previous code unit;
3032 * that code unit was <_NORM_MIN_WITH_LEAD_CC and its _getFCD16()
3033 * was deferred so that average text is checked faster
3036 /* check the combining order */
3037 cc
=(int16_t)(fcd16
>>8);
3040 /* the previous character was <_NORM_MIN_WITH_LEAD_CC, we need to get its trail cc */
3041 if(!nx_contains(nx
, (UChar32
)-prevCC
)) {
3042 prevCC
=(int16_t)(_getFCD16((UChar
)-prevCC
)&0xff);
3044 prevCC
=0; /* excluded: fcd16==0 */
3052 prevCC
=(int16_t)(fcd16
&0xff);
3056 static UNormalizationCheckResult
3057 _quickCheck(const UChar
*src
,
3059 UNormalizationMode mode
,
3061 const UnicodeSet
*nx
,
3062 UErrorCode
*pErrorCode
) {
3063 UChar stackBuffer
[_STACK_BUFFER_CAPACITY
];
3065 int32_t bufferCapacity
;
3067 const UChar
*start
, *limit
;
3068 uint32_t norm32
, qcNorm32
, ccOrQCMask
, qcMask
;
3070 UChar c
, c2
, minNoMaybe
;
3072 UNormalizationCheckResult result
;
3074 /* check arguments */
3075 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
3079 if(src
==NULL
|| srcLength
<-1) {
3080 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
3084 if(!_haveData(*pErrorCode
)) {
3088 /* check for a valid mode and set the quick check minimum and mask */
3091 minNoMaybe
=(UChar
)indexes
[_NORM_INDEX_MIN_NFC_NO_MAYBE
];
3092 qcMask
=_NORM_QC_NFC
;
3096 minNoMaybe
=(UChar
)indexes
[_NORM_INDEX_MIN_NFKC_NO_MAYBE
];
3097 qcMask
=_NORM_QC_NFKC
;
3098 options
=_NORM_OPTIONS_COMPAT
;
3101 minNoMaybe
=(UChar
)indexes
[_NORM_INDEX_MIN_NFD_NO_MAYBE
];
3102 qcMask
=_NORM_QC_NFD
;
3106 minNoMaybe
=(UChar
)indexes
[_NORM_INDEX_MIN_NFKD_NO_MAYBE
];
3107 qcMask
=_NORM_QC_NFKD
;
3108 options
=_NORM_OPTIONS_COMPAT
;
3111 if(fcdTrie
.index
==NULL
) {
3112 *pErrorCode
=U_UNSUPPORTED_ERROR
;
3115 return unorm_checkFCD(src
, srcLength
, nx
) ? UNORM_YES
: UNORM_NO
;
3117 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
3123 bufferCapacity
=_STACK_BUFFER_CAPACITY
;
3125 ccOrQCMask
=_NORM_CC_MASK
|qcMask
;
3131 /* string with length */
3132 limit
=src
+srcLength
;
3133 } else /* srcLength==-1 */ {
3134 /* zero-terminated string */
3141 /* skip a run of code units below the minimum or with irrelevant data for the quick check */
3147 goto endloop
; /* break out of outer loop */
3149 } else if(((norm32
=_getNorm32(c
))&ccOrQCMask
)!=0) {
3157 goto endloop
; /* break out of outer loop */
3158 } else if((c
=*src
++)>=minNoMaybe
&& ((norm32
=_getNorm32(c
))&ccOrQCMask
)!=0) {
3165 /* check one above-minimum, relevant code unit */
3166 if(isNorm32LeadSurrogate(norm32
)) {
3167 /* c is a lead surrogate, get the real norm32 */
3168 if(src
!=limit
&& UTF_IS_SECOND_SURROGATE(c2
=*src
)) {
3170 norm32
=_getNorm32FromSurrogatePair(norm32
, c2
);
3179 if(nx_contains(nx
, c
, c2
)) {
3180 /* excluded: norm32==0 */
3184 /* check the combining order */
3185 cc
=(uint8_t)(norm32
>>_NORM_CC_SHIFT
);
3186 if(cc
!=0 && cc
<prevCC
) {
3192 /* check for "no" or "maybe" quick check flags */
3193 qcNorm32
=norm32
&qcMask
;
3194 if(qcNorm32
&_NORM_QC_ANY_NO
) {
3197 } else if(qcNorm32
!=0) {
3198 /* "maybe" can only occur for NFC and NFKC */
3202 /* normalize a section around here to see if it is really normalized or not */
3203 const UChar
*prevStarter
;
3204 uint32_t decompQCMask
;
3207 decompQCMask
=(qcMask
<<2)&0xf; /* decomposition quick check mask */
3209 /* find the previous starter */
3210 prevStarter
=src
-1; /* set prevStarter to the beginning of the current character */
3211 if(UTF_IS_TRAIL(*prevStarter
)) {
3212 --prevStarter
; /* safe because unpaired surrogates do not result in "maybe" */
3214 prevStarter
=_findPreviousStarter(start
, prevStarter
, ccOrQCMask
, decompQCMask
, minNoMaybe
);
3216 /* find the next true starter in [src..limit[ - modifies src to point to the next starter */
3217 src
=_findNextStarter(src
, limit
, qcMask
, decompQCMask
, minNoMaybe
);
3219 /* decompose and recompose [prevStarter..src[ */
3220 _composePart(stackBuffer
, buffer
, bufferCapacity
,
3225 options
, nx
, pErrorCode
);
3226 if(U_FAILURE(*pErrorCode
)) {
3227 result
=UNORM_MAYBE
; /* error (out of memory) */
3231 /* compare the normalized version with the original */
3232 if(0!=uprv_strCompare(prevStarter
, (int32_t)(src
-prevStarter
), buffer
, length
, FALSE
, FALSE
)) {
3233 result
=UNORM_NO
; /* normalization differs */
3237 /* continue after the next starter */
3243 if(buffer
!=stackBuffer
) {
3250 U_CAPI UNormalizationCheckResult U_EXPORT2
3251 unorm_quickCheck(const UChar
*src
,
3253 UNormalizationMode mode
,
3254 UErrorCode
*pErrorCode
) {
3255 return _quickCheck(src
, srcLength
, mode
, TRUE
, NULL
, pErrorCode
);
3258 U_CAPI UNormalizationCheckResult U_EXPORT2
3259 unorm_quickCheckWithOptions(const UChar
*src
, int32_t srcLength
,
3260 UNormalizationMode mode
, int32_t options
,
3261 UErrorCode
*pErrorCode
) {
3262 return _quickCheck(src
, srcLength
, mode
, TRUE
, getNX(options
, *pErrorCode
), pErrorCode
);
3265 U_CFUNC UNormalizationCheckResult
3266 unorm_internalQuickCheck(const UChar
*src
,
3268 UNormalizationMode mode
,
3270 const UnicodeSet
*nx
,
3271 UErrorCode
*pErrorCode
) {
3272 return _quickCheck(src
, srcLength
, mode
, allowMaybe
, nx
, pErrorCode
);
3275 U_CAPI UBool U_EXPORT2
3276 unorm_isNormalized(const UChar
*src
, int32_t srcLength
,
3277 UNormalizationMode mode
,
3278 UErrorCode
*pErrorCode
) {
3279 return (UBool
)(UNORM_YES
==_quickCheck(src
, srcLength
, mode
, FALSE
, NULL
, pErrorCode
));
3282 U_CAPI UBool U_EXPORT2
3283 unorm_isNormalizedWithOptions(const UChar
*src
, int32_t srcLength
,
3284 UNormalizationMode mode
, int32_t options
,
3285 UErrorCode
*pErrorCode
) {
3286 return (UBool
)(UNORM_YES
==_quickCheck(src
, srcLength
, mode
, FALSE
, getNX(options
, *pErrorCode
), pErrorCode
));
3289 /* normalize() API ---------------------------------------------------------- */
3292 * Internal API for normalizing.
3293 * Does not check for bad input.
3294 * Requires _haveData() to be true.
3298 unorm_internalNormalizeWithNX(UChar
*dest
, int32_t destCapacity
,
3299 const UChar
*src
, int32_t srcLength
,
3300 UNormalizationMode mode
, int32_t options
, const UnicodeSet
*nx
,
3301 UErrorCode
*pErrorCode
) {
3307 destLength
=_decompose(dest
, destCapacity
,
3309 FALSE
, nx
, trailCC
);
3312 destLength
=_decompose(dest
, destCapacity
,
3317 destLength
=_compose(dest
, destCapacity
,
3319 options
, nx
, pErrorCode
);
3322 destLength
=_compose(dest
, destCapacity
,
3324 options
|_NORM_OPTIONS_COMPAT
, nx
, pErrorCode
);
3327 if(fcdTrie
.index
==NULL
) {
3328 *pErrorCode
=U_UNSUPPORTED_ERROR
;
3331 return unorm_makeFCD(dest
, destCapacity
,
3337 destLength
=_compose(dest
, destCapacity
,
3339 options
|_NORM_OPTIONS_COMPOSE_CONTIGUOUS
, nx
, pErrorCode
);
3343 /* just copy the string */
3345 srcLength
=u_strlen(src
);
3347 if(srcLength
>0 && srcLength
<=destCapacity
) {
3348 uprv_memcpy(dest
, src
, srcLength
*U_SIZEOF_UCHAR
);
3350 destLength
=srcLength
;
3353 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
3357 return u_terminateUChars(dest
, destCapacity
, destLength
, pErrorCode
);
3361 * Internal API for normalizing.
3362 * Does not check for bad input.
3365 U_CAPI
int32_t U_EXPORT2
3366 unorm_internalNormalize(UChar
*dest
, int32_t destCapacity
,
3367 const UChar
*src
, int32_t srcLength
,
3368 UNormalizationMode mode
, int32_t options
,
3369 UErrorCode
*pErrorCode
) {
3370 const UnicodeSet
*nx
;
3372 if(!_haveData(*pErrorCode
)) {
3376 nx
=getNX(options
, *pErrorCode
);
3377 if(U_FAILURE(*pErrorCode
)) {
3381 /* reset options bits that should only be set inside unorm_internalNormalizeWithNX() */
3382 options
&=~(_NORM_OPTIONS_SETS_MASK
|_NORM_OPTIONS_COMPAT
|_NORM_OPTIONS_COMPOSE_CONTIGUOUS
);
3384 return unorm_internalNormalizeWithNX(dest
, destCapacity
,
3390 /** Public API for normalizing. */
3391 U_CAPI
int32_t U_EXPORT2
3392 unorm_normalize(const UChar
*src
, int32_t srcLength
,
3393 UNormalizationMode mode
, int32_t options
,
3394 UChar
*dest
, int32_t destCapacity
,
3395 UErrorCode
*pErrorCode
) {
3396 /* check argument values */
3397 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
3401 if( destCapacity
<0 || (dest
==NULL
&& destCapacity
>0) ||
3402 src
==NULL
|| srcLength
<-1
3404 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
3408 /* check for overlapping src and destination */
3410 ((src
>=dest
&& src
<(dest
+destCapacity
)) ||
3411 (srcLength
>0 && dest
>=src
&& dest
<(src
+srcLength
)))
3413 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
3417 return unorm_internalNormalize(dest
, destCapacity
,
3424 /* iteration functions ------------------------------------------------------ */
3427 * These iteration functions are the core implementations of the
3428 * Normalizer class iteration API.
3429 * They read from a UCharIterator into their own buffer
3430 * and normalize into the Normalizer iteration buffer.
3431 * Normalizer itself then iterates over its buffer until that needs to be
3437 * Now that UCharIterator.next/previous return (int32_t)-1 not (UChar)0xffff
3438 * if iteration bounds are reached,
3439 * try to not call hasNext/hasPrevious and instead check for >=0.
3442 /* backward iteration ------------------------------------------------------- */
3445 * read backwards and get norm32
3446 * return 0 if the character is <minC
3447 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3449 static inline uint32_t
3450 _getPrevNorm32(UCharIterator
&src
, uint32_t minC
, uint32_t mask
, UChar
&c
, UChar
&c2
) {
3453 /* need src.hasPrevious() */
3454 c
=(UChar
)src
.previous(&src
);
3457 /* check for a surrogate before getting norm32 to see if we need to predecrement further */
3460 } else if(!UTF_IS_SURROGATE(c
)) {
3461 return _getNorm32(c
);
3462 } else if(UTF_IS_SURROGATE_FIRST(c
) || !src
.hasPrevious(&src
)) {
3463 /* unpaired surrogate */
3465 } else if(UTF_IS_FIRST_SURROGATE(c2
=(UChar
)src
.previous(&src
))) {
3466 norm32
=_getNorm32(c2
);
3467 if((norm32
&mask
)==0) {
3468 /* all surrogate pairs with this lead surrogate have irrelevant data */
3471 /* norm32 must be a surrogate special */
3472 return _getNorm32FromSurrogatePair(norm32
, c
);
3475 /* unpaired second surrogate, undo the c2=src.previous() movement */
3476 src
.move(&src
, 1, UITER_CURRENT
);
3483 * read backwards and check if the character is a previous-iteration boundary
3484 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3487 IsPrevBoundaryFn(UCharIterator
&src
, uint32_t minC
, uint32_t mask
, UChar
&c
, UChar
&c2
);
3491 * read backwards and check if the lead combining class is 0
3492 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3495 _isPrevNFDSafe(UCharIterator
&src
, uint32_t minC
, uint32_t ccOrQCMask
, UChar
&c
, UChar
&c2
) {
3496 return _isNFDSafe(_getPrevNorm32(src
, minC
, ccOrQCMask
, c
, c2
), ccOrQCMask
, ccOrQCMask
&_NORM_QC_MASK
);
3500 * read backwards and check if the character is (or its decomposition begins with)
3501 * a "true starter" (cc==0 and NF*C_YES)
3502 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3505 _isPrevTrueStarter(UCharIterator
&src
, uint32_t minC
, uint32_t ccOrQCMask
, UChar
&c
, UChar
&c2
) {
3506 uint32_t norm32
, decompQCMask
;
3508 decompQCMask
=(ccOrQCMask
<<2)&0xf; /* decomposition quick check mask */
3509 norm32
=_getPrevNorm32(src
, minC
, ccOrQCMask
|decompQCMask
, c
, c2
);
3510 return _isTrueStarter(norm32
, ccOrQCMask
, decompQCMask
);
3514 _findPreviousIterationBoundary(UCharIterator
&src
,
3515 IsPrevBoundaryFn
*isPrevBoundary
, uint32_t minC
, uint32_t mask
,
3516 UChar
*&buffer
, int32_t &bufferCapacity
,
3517 int32_t &startIndex
,
3518 UErrorCode
*pErrorCode
) {
3525 startIndex
=bufferCapacity
; /* fill the buffer from the end backwards */
3527 while(src
.hasPrevious(&src
)) {
3528 isBoundary
=isPrevBoundary(src
, minC
, mask
, c
, c2
);
3530 /* always write this character to the front of the buffer */
3531 /* make sure there is enough space in the buffer */
3532 if(startIndex
< (c2
==0 ? 1 : 2)) {
3533 int32_t bufferLength
=bufferCapacity
;
3535 if(!u_growBufferFromStatic(stackBuffer
, &buffer
, &bufferCapacity
, 2*bufferCapacity
, bufferLength
)) {
3536 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
3537 src
.move(&src
, 0, UITER_START
);
3541 /* move the current buffer contents up */
3542 uprv_memmove(buffer
+(bufferCapacity
-bufferLength
), buffer
, bufferLength
*U_SIZEOF_UCHAR
);
3543 startIndex
+=bufferCapacity
-bufferLength
;
3546 buffer
[--startIndex
]=c
;
3548 buffer
[--startIndex
]=c2
;
3551 /* stop if this just-copied character is a boundary */
3557 /* return the length of the buffer contents */
3558 return bufferCapacity
-startIndex
;
3561 U_CAPI
int32_t U_EXPORT2
3562 unorm_previous(UCharIterator
*src
,
3563 UChar
*dest
, int32_t destCapacity
,
3564 UNormalizationMode mode
, int32_t options
,
3565 UBool doNormalize
, UBool
*pNeededToNormalize
,
3566 UErrorCode
*pErrorCode
) {
3567 UChar stackBuffer
[100];
3569 IsPrevBoundaryFn
*isPreviousBoundary
=NULL
;
3571 int32_t startIndex
=0, bufferLength
=0, bufferCapacity
=0, destLength
=0;
3575 /* check argument values */
3576 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
3580 if( destCapacity
<0 || (dest
==NULL
&& destCapacity
>0) ||
3583 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
3587 if(!_haveData(*pErrorCode
)) {
3591 if(pNeededToNormalize
!=NULL
) {
3592 *pNeededToNormalize
=FALSE
;
3597 if(fcdTrie
.index
==NULL
) {
3598 *pErrorCode
=U_UNSUPPORTED_ERROR
;
3601 /* fall through to NFD */
3603 isPreviousBoundary
=_isPrevNFDSafe
;
3604 minC
=_NORM_MIN_WITH_LEAD_CC
;
3605 mask
=_NORM_CC_MASK
|_NORM_QC_NFD
;
3608 isPreviousBoundary
=_isPrevNFDSafe
;
3609 minC
=_NORM_MIN_WITH_LEAD_CC
;
3610 mask
=_NORM_CC_MASK
|_NORM_QC_NFKD
;
3613 isPreviousBoundary
=_isPrevTrueStarter
;
3614 minC
=(UChar
)indexes
[_NORM_INDEX_MIN_NFC_NO_MAYBE
];
3615 mask
=_NORM_CC_MASK
|_NORM_QC_NFC
;
3618 isPreviousBoundary
=_isPrevTrueStarter
;
3619 minC
=(UChar
)indexes
[_NORM_INDEX_MIN_NFKC_NO_MAYBE
];
3620 mask
=_NORM_CC_MASK
|_NORM_QC_NFKC
;
3624 if((c
=src
->previous(src
))>=0) {
3626 if(UTF_IS_TRAIL(c
) && (c2
=src
->previous(src
))>=0) {
3627 if(UTF_IS_LEAD(c2
)) {
3628 if(destCapacity
>=2) {
3629 dest
[1]=(UChar
)c
; /* trail surrogate */
3632 c
=c2
; /* lead surrogate to be written below */
3634 src
->move(src
, 1, UITER_CURRENT
);
3638 if(destCapacity
>0) {
3642 return u_terminateUChars(dest
, destCapacity
, destLength
, pErrorCode
);
3644 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
3649 bufferCapacity
=(int32_t)(sizeof(stackBuffer
)/U_SIZEOF_UCHAR
);
3650 bufferLength
=_findPreviousIterationBoundary(*src
,
3651 isPreviousBoundary
, minC
, mask
,
3652 buffer
, bufferCapacity
,
3655 if(bufferLength
>0) {
3657 destLength
=unorm_internalNormalize(dest
, destCapacity
,
3658 buffer
+startIndex
, bufferLength
,
3661 if(pNeededToNormalize
!=0 && U_SUCCESS(*pErrorCode
)) {
3662 *pNeededToNormalize
=
3663 (UBool
)(destLength
!=bufferLength
||
3664 0!=uprv_memcmp(dest
, buffer
+startIndex
, destLength
*U_SIZEOF_UCHAR
));
3667 /* just copy the source characters */
3668 if(destCapacity
>0) {
3669 uprv_memcpy(dest
, buffer
+startIndex
, uprv_min(bufferLength
, destCapacity
)*U_SIZEOF_UCHAR
);
3671 destLength
=u_terminateUChars(dest
, destCapacity
, bufferLength
, pErrorCode
);
3674 destLength
=u_terminateUChars(dest
, destCapacity
, 0, pErrorCode
);
3678 if(buffer
!=stackBuffer
) {
3685 /* forward iteration -------------------------------------------------------- */
3688 * read forward and get norm32
3689 * return 0 if the character is <minC
3690 * if c2!=0 then (c2, c) is a surrogate pair
3691 * always reads complete characters
3693 static inline uint32_t
3694 _getNextNorm32(UCharIterator
&src
, uint32_t minC
, uint32_t mask
, UChar
&c
, UChar
&c2
) {
3697 /* need src.hasNext() to be true */
3698 c
=(UChar
)src
.next(&src
);
3705 norm32
=_getNorm32(c
);
3706 if(UTF_IS_FIRST_SURROGATE(c
)) {
3707 if(src
.hasNext(&src
) && UTF_IS_SECOND_SURROGATE(c2
=(UChar
)src
.current(&src
))) {
3708 src
.move(&src
, 1, UITER_CURRENT
); /* skip the c2 surrogate */
3709 if((norm32
&mask
)==0) {
3710 /* irrelevant data */
3713 /* norm32 must be a surrogate special */
3714 return _getNorm32FromSurrogatePair(norm32
, c2
);
3717 /* unmatched surrogate */
3726 * read forward and check if the character is a next-iteration boundary
3727 * if c2!=0 then (c, c2) is a surrogate pair
3730 IsNextBoundaryFn(UCharIterator
&src
, uint32_t minC
, uint32_t mask
, UChar
&c
, UChar
&c2
);
3734 * read forward and check if the lead combining class is 0
3735 * if c2!=0 then (c, c2) is a surrogate pair
3738 _isNextNFDSafe(UCharIterator
&src
, uint32_t minC
, uint32_t ccOrQCMask
, UChar
&c
, UChar
&c2
) {
3739 return _isNFDSafe(_getNextNorm32(src
, minC
, ccOrQCMask
, c
, c2
), ccOrQCMask
, ccOrQCMask
&_NORM_QC_MASK
);
3744 * read forward and check if the character is (or its decomposition begins with)
3745 * a "true starter" (cc==0 and NF*C_YES)
3746 * if c2!=0 then (c, c2) is a surrogate pair
3749 _isNextTrueStarter(UCharIterator
&src
, uint32_t minC
, uint32_t ccOrQCMask
, UChar
&c
, UChar
&c2
) {
3750 uint32_t norm32
, decompQCMask
;
3752 decompQCMask
=(ccOrQCMask
<<2)&0xf; /* decomposition quick check mask */
3753 norm32
=_getNextNorm32(src
, minC
, ccOrQCMask
|decompQCMask
, c
, c2
);
3754 return _isTrueStarter(norm32
, ccOrQCMask
, decompQCMask
);
3758 _findNextIterationBoundary(UCharIterator
&src
,
3759 IsNextBoundaryFn
*isNextBoundary
, uint32_t minC
, uint32_t mask
,
3760 UChar
*&buffer
, int32_t &bufferCapacity
,
3761 UErrorCode
*pErrorCode
) {
3763 int32_t bufferIndex
;
3766 if(!src
.hasNext(&src
)) {
3773 /* get one character and ignore its properties */
3774 buffer
[0]=c
=(UChar
)src
.next(&src
);
3776 if(UTF_IS_FIRST_SURROGATE(c
) && src
.hasNext(&src
)) {
3777 if(UTF_IS_SECOND_SURROGATE(c2
=(UChar
)src
.next(&src
))) {
3778 buffer
[bufferIndex
++]=c2
;
3780 src
.move(&src
, -1, UITER_CURRENT
); /* back out the non-trail-surrogate */
3784 /* get all following characters until we see a boundary */
3785 /* checking hasNext() instead of c!=DONE on the off-chance that U+ffff is part of the string */
3786 while(src
.hasNext(&src
)) {
3787 if(isNextBoundary(src
, minC
, mask
, c
, c2
)) {
3788 /* back out the latest movement to stop at the boundary */
3789 src
.move(&src
, c2
==0 ? -1 : -2, UITER_CURRENT
);
3792 if(bufferIndex
+(c2
==0 ? 1 : 2)<=bufferCapacity
||
3793 /* attempt to grow the buffer */
3794 u_growBufferFromStatic(stackBuffer
, &buffer
, &bufferCapacity
,
3798 buffer
[bufferIndex
++]=c
;
3800 buffer
[bufferIndex
++]=c2
;
3803 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
3804 src
.move(&src
, 0, UITER_LIMIT
);
3810 /* return the length of the buffer contents */
3814 U_CAPI
int32_t U_EXPORT2
3815 unorm_next(UCharIterator
*src
,
3816 UChar
*dest
, int32_t destCapacity
,
3817 UNormalizationMode mode
, int32_t options
,
3818 UBool doNormalize
, UBool
*pNeededToNormalize
,
3819 UErrorCode
*pErrorCode
) {
3820 UChar stackBuffer
[100];
3822 IsNextBoundaryFn
*isNextBoundary
;
3824 int32_t bufferLength
, bufferCapacity
, destLength
;
3828 /* check argument values */
3829 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
3833 if( destCapacity
<0 || (dest
==NULL
&& destCapacity
>0) ||
3836 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
3840 if(!_haveData(*pErrorCode
)) {
3844 if(pNeededToNormalize
!=NULL
) {
3845 *pNeededToNormalize
=FALSE
;
3850 if(fcdTrie
.index
==NULL
) {
3851 *pErrorCode
=U_UNSUPPORTED_ERROR
;
3854 /* fall through to NFD */
3856 isNextBoundary
=_isNextNFDSafe
;
3857 minC
=_NORM_MIN_WITH_LEAD_CC
;
3858 mask
=_NORM_CC_MASK
|_NORM_QC_NFD
;
3861 isNextBoundary
=_isNextNFDSafe
;
3862 minC
=_NORM_MIN_WITH_LEAD_CC
;
3863 mask
=_NORM_CC_MASK
|_NORM_QC_NFKD
;
3866 isNextBoundary
=_isNextTrueStarter
;
3867 minC
=(UChar
)indexes
[_NORM_INDEX_MIN_NFC_NO_MAYBE
];
3868 mask
=_NORM_CC_MASK
|_NORM_QC_NFC
;
3871 isNextBoundary
=_isNextTrueStarter
;
3872 minC
=(UChar
)indexes
[_NORM_INDEX_MIN_NFKC_NO_MAYBE
];
3873 mask
=_NORM_CC_MASK
|_NORM_QC_NFKC
;
3877 if((c
=src
->next(src
))>=0) {
3879 if(UTF_IS_LEAD(c
) && (c2
=src
->next(src
))>=0) {
3880 if(UTF_IS_TRAIL(c2
)) {
3881 if(destCapacity
>=2) {
3882 dest
[1]=(UChar
)c2
; /* trail surrogate */
3885 /* lead surrogate to be written below */
3887 src
->move(src
, -1, UITER_CURRENT
);
3891 if(destCapacity
>0) {
3895 return u_terminateUChars(dest
, destCapacity
, destLength
, pErrorCode
);
3897 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
3902 bufferCapacity
=(int32_t)(sizeof(stackBuffer
)/U_SIZEOF_UCHAR
);
3903 bufferLength
=_findNextIterationBoundary(*src
,
3904 isNextBoundary
, minC
, mask
,
3905 buffer
, bufferCapacity
,
3907 if(bufferLength
>0) {
3909 destLength
=unorm_internalNormalize(dest
, destCapacity
,
3910 buffer
, bufferLength
,
3913 if(pNeededToNormalize
!=0 && U_SUCCESS(*pErrorCode
)) {
3914 *pNeededToNormalize
=
3915 (UBool
)(destLength
!=bufferLength
||
3916 0!=uprv_memcmp(dest
, buffer
, destLength
*U_SIZEOF_UCHAR
));
3919 /* just copy the source characters */
3920 if(destCapacity
>0) {
3921 uprv_memcpy(dest
, buffer
, uprv_min(bufferLength
, destCapacity
)*U_SIZEOF_UCHAR
);
3923 destLength
=u_terminateUChars(dest
, destCapacity
, bufferLength
, pErrorCode
);
3926 destLength
=u_terminateUChars(dest
, destCapacity
, 0, pErrorCode
);
3930 if(buffer
!=stackBuffer
) {
3938 * ### TODO: check if NF*D and FCD iteration finds optimal boundaries
3939 * and if not, how hard it would be to improve it.
3940 * For example, see _findSafeFCD().
3943 /* Concatenation of normalized strings -------------------------------------- */
3945 U_CAPI
int32_t U_EXPORT2
3946 unorm_concatenate(const UChar
*left
, int32_t leftLength
,
3947 const UChar
*right
, int32_t rightLength
,
3948 UChar
*dest
, int32_t destCapacity
,
3949 UNormalizationMode mode
, int32_t options
,
3950 UErrorCode
*pErrorCode
) {
3951 UChar stackBuffer
[100];
3953 int32_t bufferLength
, bufferCapacity
;
3956 int32_t leftBoundary
, rightBoundary
, destLength
;
3958 /* check argument values */
3959 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
3963 if( destCapacity
<0 || (dest
==NULL
&& destCapacity
>0) ||
3964 left
==NULL
|| leftLength
<-1 ||
3965 right
==NULL
|| rightLength
<-1
3967 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
3971 /* check for overlapping right and destination */
3973 ((right
>=dest
&& right
<(dest
+destCapacity
)) ||
3974 (rightLength
>0 && dest
>=right
&& dest
<(right
+rightLength
)))
3976 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
3980 /* allow left==dest */
3982 /* set up intermediate buffer */
3984 bufferCapacity
=(int32_t)(sizeof(stackBuffer
)/U_SIZEOF_UCHAR
);
3987 * Input: left[0..leftLength[ + right[0..rightLength[
3989 * Find normalization-safe boundaries leftBoundary and rightBoundary
3990 * and copy the end parts together:
3991 * buffer=left[leftBoundary..leftLength[ + right[0..rightBoundary[
3993 * dest=left[0..leftBoundary[ +
3994 * normalize(buffer) +
3995 * right[rightBoundary..rightLength[
3999 * find a normalization boundary at the end of the left string
4000 * and copy the end part into the buffer
4002 uiter_setString(&iter
, left
, leftLength
);
4003 iter
.index
=leftLength
=iter
.length
; /* end of left string */
4005 bufferLength
=unorm_previous(&iter
, buffer
, bufferCapacity
,
4009 leftBoundary
=iter
.index
;
4010 if(*pErrorCode
==U_BUFFER_OVERFLOW_ERROR
) {
4011 *pErrorCode
=U_ZERO_ERROR
;
4012 if(!u_growBufferFromStatic(stackBuffer
, &buffer
, &bufferCapacity
, 2*bufferLength
, 0)) {
4013 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
4014 /* don't need to clean up here since
4015 * u_growBufferFromStatic frees buffer if(buffer!=stackBuffer)
4020 /* just copy from the left string: we know the boundary already */
4021 uprv_memcpy(buffer
, left
+leftBoundary
, bufferLength
*U_SIZEOF_UCHAR
);
4025 * find a normalization boundary at the beginning of the right string
4026 * and concatenate the beginning part to the buffer
4028 uiter_setString(&iter
, right
, rightLength
);
4029 rightLength
=iter
.length
; /* in case it was -1 */
4031 rightBoundary
=unorm_next(&iter
, buffer
+bufferLength
, bufferCapacity
-bufferLength
,
4035 if(*pErrorCode
==U_BUFFER_OVERFLOW_ERROR
) {
4036 *pErrorCode
=U_ZERO_ERROR
;
4037 if(!u_growBufferFromStatic(stackBuffer
, &buffer
, &bufferCapacity
, bufferLength
+rightBoundary
, 0)) {
4038 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
4039 /* don't need to clean up here since
4040 * u_growBufferFromStatic frees buffer if(buffer!=stackBuffer)
4045 /* just copy from the right string: we know the boundary already */
4046 uprv_memcpy(buffer
+bufferLength
, right
, rightBoundary
*U_SIZEOF_UCHAR
);
4049 bufferLength
+=rightBoundary
;
4051 /* copy left[0..leftBoundary[ to dest */
4052 if(left
!=dest
&& leftBoundary
>0 && destCapacity
>0) {
4053 uprv_memcpy(dest
, left
, uprv_min(leftBoundary
, destCapacity
)*U_SIZEOF_UCHAR
);
4055 destLength
=leftBoundary
;
4057 /* concatenate the normalization of the buffer to dest */
4058 if(destCapacity
>destLength
) {
4059 destLength
+=unorm_internalNormalize(dest
+destLength
, destCapacity
-destLength
,
4060 buffer
, bufferLength
,
4064 destLength
+=unorm_internalNormalize(NULL
, 0,
4065 buffer
, bufferLength
,
4070 * the only error code expected here is U_BUFFER_OVERFLOW_ERROR,
4071 * so we don't check the error code here; just let it pass through
4073 /* concatenate right[rightBoundary..rightLength[ to dest */
4074 right
+=rightBoundary
;
4075 rightLength
-=rightBoundary
;
4076 if(rightLength
>0 && destCapacity
>destLength
) {
4077 uprv_memcpy(dest
+destLength
, right
, uprv_min(rightLength
, destCapacity
-destLength
)*U_SIZEOF_UCHAR
);
4079 destLength
+=rightLength
;
4082 if(buffer
!=stackBuffer
) {
4086 return u_terminateUChars(dest
, destCapacity
, destLength
, pErrorCode
);
4089 #endif /* #if !UCONFIG_NO_NORMALIZATION */