2 ******************************************************************************
3 * Copyright (c) 1996-2007, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 ******************************************************************************
8 * Created by: Vladimir Weinstein 12052000
10 * Modification history :
12 * Date Name Description
13 * 02/01/01 synwee Added normalization quickcheck enum and method.
14 * 02/12/01 synwee Commented out quickcheck util api has been approved
15 * Added private method for doing FCD checks
16 * 02/23/01 synwee Modified quickcheck and checkFCE to run through
17 * string for codepoints < 0x300 for the normalization
19 * 05/25/01+ Markus Scherer total rewrite, implement all normalization here
20 * instead of just wrappers around normlzr.cpp,
21 * load unorm.dat, support Unicode 3.1 with
22 * supplementary code points, etc.
25 #include "unicode/utypes.h"
27 #if !UCONFIG_NO_NORMALIZATION
29 #include "unicode/udata.h"
30 #include "unicode/uchar.h"
31 #include "unicode/ustring.h"
32 #include "unicode/uiter.h"
33 #include "unicode/uniset.h"
34 #include "unicode/usetiter.h"
35 #include "unicode/unorm.h"
42 #include "unicode/uset.h"
47 * Status of tailored normalization
49 * This was done initially for investigation on Unicode public review issue 7
50 * (http://www.unicode.org/review/). See Jitterbug 2481.
51 * While the UTC at meeting #94 (2003mar) did not take up the issue, this is
52 * a permanent feature in ICU 2.6 in support of IDNA which requires true
53 * Unicode 3.2 normalization.
54 * (NormalizationCorrections are rolled into IDNA mapping tables.)
56 * Tailored normalization as implemented here allows one to "normalize less"
57 * than full Unicode normalization would.
58 * Based internally on a UnicodeSet of code points that are
59 * "excluded from normalization", the normalization functions leave those
60 * code points alone ("inert"). This means that tailored normalization
61 * still transforms text into a canonically equivalent form.
62 * It does not add decompositions to code points that do not have any or
63 * change decomposition results.
65 * Any function that searches for a safe boundary has not been touched,
66 * which means that these functions will be over-pessimistic when
67 * exclusions are applied.
68 * This should not matter because subsequent checks and normalizations
69 * do apply the exclusions; only a little more of the text may be processed
70 * than necessary under exclusions.
72 * Normalization exclusions have the following effect on excluded code points c:
73 * - c is not decomposed
74 * - c is not a composition target
75 * - c does not combine forward or backward for composition
76 * except that this is not implemented for Jamo
77 * - c is treated as having a combining class of 0
/*
 * Number of elements in a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the result behaves like a
 * single primary expression wherever the macro is used (for example as the
 * operand of sizeof or next to a postfix operator).
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
84 * This new implementation of the normalization code loads its data from
85 * unorm.dat, which is generated with the gennorm tool.
86 * The format of that file is described in unormimp.h .
89 /* -------------------------------------------------------------------------- */
92 _STACK_BUFFER_CAPACITY
=100
96 * Constants for the bit fields in the options bit set parameter.
97 * These need not be public.
98 * A user only needs to know the currently assigned values.
99 * The number and positions of reserved bits per field can remain private
100 * and may change in future implementations.
103 _NORM_OPTIONS_NX_MASK
=0x1f,
104 _NORM_OPTIONS_UNICODE_MASK
=0x60,
105 _NORM_OPTIONS_SETS_MASK
=0x7f,
107 _NORM_OPTIONS_UNICODE_SHIFT
=5,
110 * The following options are used only in some composition functions.
111 * They use bits 12 and up to preserve lower bits for the available options
112 * space in unorm_compare() -
113 * see documentation for UNORM_COMPARE_NORM_OPTIONS_SHIFT.
116 /** Options bit 12, for compatibility vs. canonical decomposition. */
117 _NORM_OPTIONS_COMPAT
=0x1000,
118 /** Options bit 13, no discontiguous composition (FCC vs. NFC). */
119 _NORM_OPTIONS_COMPOSE_CONTIGUOUS
=0x2000
124 isHangulWithoutJamoT(UChar c
) {
126 return c
<HANGUL_COUNT
&& c%JAMO_T_COUNT
==0;
131 /* is this a norm32 with a regular index? */
133 isNorm32Regular(uint32_t norm32
) {
134 return norm32
<_NORM_MIN_SPECIAL
;
137 /* is this a norm32 with a special index for a lead surrogate? */
139 isNorm32LeadSurrogate(uint32_t norm32
) {
140 return _NORM_MIN_SPECIAL
<=norm32
&& norm32
<_NORM_SURROGATES_TOP
;
143 /* is this a norm32 with a special index for a Hangul syllable or a Jamo? */
145 isNorm32HangulOrJamo(uint32_t norm32
) {
146 return norm32
>=_NORM_MIN_HANGUL
;
150 * Given isNorm32HangulOrJamo(),
151 * is this a Hangul syllable or a Jamo?
153 /*static inline UBool
154 isHangulJamoNorm32HangulOrJamoL(uint32_t norm32) {
155 return norm32<_NORM_MIN_JAMO_V;
159 * Given norm32 for Jamo V or T,
163 isJamoVTNorm32JamoV(uint32_t norm32
) {
164 return norm32
<_NORM_JAMO_V_TOP
;
167 /* load unorm.dat ----------------------------------------------------------- */
169 /* normTrie: 32-bit trie result may contain a special extraData index with the folding offset */
170 static int32_t U_CALLCONV
171 getFoldingNormOffset(uint32_t norm32
) {
172 if(isNorm32LeadSurrogate(norm32
)) {
174 UTRIE_BMP_INDEX_LENGTH
+
175 (((int32_t)norm32
>>(_NORM_EXTRA_SHIFT
-UTRIE_SURROGATE_BLOCK_BITS
))&
176 (0x3ff<<UTRIE_SURROGATE_BLOCK_BITS
));
182 /* auxTrie: the folding offset is in bits 9..0 of the 16-bit trie result */
183 static int32_t U_CALLCONV
184 getFoldingAuxOffset(uint32_t data
) {
185 return (int32_t)(data
&_NORM_AUX_FNC_MASK
)<<UTRIE_SURROGATE_BLOCK_BITS
;
189 #define UNORM_HARDCODE_DATA 1
191 #if UNORM_HARDCODE_DATA
193 /* unorm_props_data.c is machine-generated by gennorm --csource */
194 #include "unorm_props_data.c"
196 static const UBool formatVersion_2_2
=TRUE
;
200 #define DATA_NAME "unorm"
201 #define DATA_TYPE "icu"
203 static UDataMemory
*normData
=NULL
;
204 static UErrorCode dataErrorCode
=U_ZERO_ERROR
;
205 static int8_t haveNormData
=0;
207 static int32_t indexes
[_NORM_INDEX_TOP
]={ 0 };
208 static UTrie normTrie
={ 0,0,0,0,0,0,0 }, fcdTrie
={ 0,0,0,0,0,0,0 }, auxTrie
={ 0,0,0,0,0,0,0 };
211 * pointers into the memory-mapped unorm.icu
213 static const uint16_t *extraData
=NULL
,
214 *combiningTable
=NULL
,
215 *canonStartSets
=NULL
;
217 static uint8_t formatVersion
[4]={ 0, 0, 0, 0 };
218 static UBool formatVersion_2_1
=FALSE
, formatVersion_2_2
=FALSE
;
220 /* the Unicode version of the normalization data */
221 static UVersionInfo dataVersion
={ 0, 0, 0, 0 };
225 /* cache UnicodeSets for each combination of exclusion flags */
226 static UnicodeSet
*nxCache
[_NORM_OPTIONS_SETS_MASK
+1]={ NULL
};
230 static UBool U_CALLCONV
231 unorm_cleanup(void) {
234 #if !UNORM_HARDCODE_DATA
236 udata_close(normData
);
239 dataErrorCode
=U_ZERO_ERROR
;
243 for(i
=0; i
<(int32_t)LENGTHOF(nxCache
); ++i
) {
253 #if !UNORM_HARDCODE_DATA
255 static UBool U_CALLCONV
256 isAcceptable(void * /* context */,
257 const char * /* type */, const char * /* name */,
258 const UDataInfo
*pInfo
) {
261 pInfo
->isBigEndian
==U_IS_BIG_ENDIAN
&&
262 pInfo
->charsetFamily
==U_CHARSET_FAMILY
&&
263 pInfo
->dataFormat
[0]==0x4e && /* dataFormat="Norm" */
264 pInfo
->dataFormat
[1]==0x6f &&
265 pInfo
->dataFormat
[2]==0x72 &&
266 pInfo
->dataFormat
[3]==0x6d &&
267 pInfo
->formatVersion
[0]==2 &&
268 pInfo
->formatVersion
[2]==UTRIE_SHIFT
&&
269 pInfo
->formatVersion
[3]==UTRIE_INDEX_SHIFT
271 uprv_memcpy(formatVersion
, pInfo
->formatVersion
, 4);
272 uprv_memcpy(dataVersion
, pInfo
->dataVersion
, 4);
281 static UBool U_CALLCONV
282 _enumPropertyStartsRange(const void *context
, UChar32 start
, UChar32
/*limit*/, uint32_t /*value*/) {
283 /* add the start code point to the USet */
284 const USetAdder
*sa
=(const USetAdder
*)context
;
285 sa
->add(sa
->set
, start
);
291 #if !UNORM_HARDCODE_DATA
294 loadNormData(UErrorCode
&errorCode
) {
295 /* load Unicode normalization data from file */
298 * This lazy initialization with double-checked locking (without mutex protection for
299 * haveNormData==0) is transiently unsafe under certain circumstances.
300 * Check the readme and use u_init() if necessary.
302 * While u_init() initializes the main normalization data via this function,
303 * it does not do so for exclusion sets (which are fully mutexed).
305 * - there can be many exclusion sets
306 * - they are rarely used
307 * - they are not usually used in execution paths that are
308 * as performance-sensitive as others
309 * (e.g., IDNA takes more time than unorm_quickCheck() anyway)
311 if(haveNormData
==0) {
312 UTrie _normTrie
={ 0,0,0,0,0,0,0 }, _fcdTrie
={ 0,0,0,0,0,0,0 }, _auxTrie
={ 0,0,0,0,0,0,0 };
315 const int32_t *p
=NULL
;
318 if(&errorCode
==NULL
|| U_FAILURE(errorCode
)) {
322 /* open the data outside the mutex block */
323 data
=udata_openChoice(NULL
, DATA_TYPE
, DATA_NAME
, isAcceptable
, NULL
, &errorCode
);
324 dataErrorCode
=errorCode
;
325 if(U_FAILURE(errorCode
)) {
326 return haveNormData
=-1;
329 p
=(const int32_t *)udata_getMemory(data
);
330 pb
=(const uint8_t *)(p
+_NORM_INDEX_TOP
);
331 utrie_unserialize(&_normTrie
, pb
, p
[_NORM_INDEX_TRIE_SIZE
], &errorCode
);
332 _normTrie
.getFoldingOffset
=getFoldingNormOffset
;
334 pb
+=p
[_NORM_INDEX_TRIE_SIZE
]+p
[_NORM_INDEX_UCHAR_COUNT
]*2+p
[_NORM_INDEX_COMBINE_DATA_COUNT
]*2;
335 if(p
[_NORM_INDEX_FCD_TRIE_SIZE
]!=0) {
336 utrie_unserialize(&_fcdTrie
, pb
, p
[_NORM_INDEX_FCD_TRIE_SIZE
], &errorCode
);
338 pb
+=p
[_NORM_INDEX_FCD_TRIE_SIZE
];
340 if(p
[_NORM_INDEX_AUX_TRIE_SIZE
]!=0) {
341 utrie_unserialize(&_auxTrie
, pb
, p
[_NORM_INDEX_AUX_TRIE_SIZE
], &errorCode
);
342 _auxTrie
.getFoldingOffset
=getFoldingAuxOffset
;
345 if(U_FAILURE(errorCode
)) {
346 dataErrorCode
=errorCode
;
348 return haveNormData
=-1;
351 /* in the mutex block, set the data for this process */
357 uprv_memcpy(&indexes
, p
, sizeof(indexes
));
358 uprv_memcpy(&normTrie
, &_normTrie
, sizeof(UTrie
));
359 uprv_memcpy(&fcdTrie
, &_fcdTrie
, sizeof(UTrie
));
360 uprv_memcpy(&auxTrie
, &_auxTrie
, sizeof(UTrie
));
362 p
=(const int32_t *)udata_getMemory(normData
);
365 /* initialize some variables */
366 extraData
=(uint16_t *)((uint8_t *)(p
+_NORM_INDEX_TOP
)+indexes
[_NORM_INDEX_TRIE_SIZE
]);
367 combiningTable
=extraData
+indexes
[_NORM_INDEX_UCHAR_COUNT
];
368 formatVersion_2_1
=formatVersion
[0]>2 || (formatVersion
[0]==2 && formatVersion
[1]>=1);
369 formatVersion_2_2
=formatVersion
[0]>2 || (formatVersion
[0]==2 && formatVersion
[1]>=2);
370 if(formatVersion_2_1
) {
371 canonStartSets
=combiningTable
+
372 indexes
[_NORM_INDEX_COMBINE_DATA_COUNT
]+
373 (indexes
[_NORM_INDEX_FCD_TRIE_SIZE
]+indexes
[_NORM_INDEX_AUX_TRIE_SIZE
])/2;
376 ucln_common_registerCleanup(UCLN_COMMON_UNORM
, unorm_cleanup
);
379 /* if a different thread set it first, then close the extra data */
381 udata_close(data
); /* NULL if it was set correctly */
391 _haveData(UErrorCode
&errorCode
) {
392 #if UNORM_HARDCODE_DATA
393 return U_SUCCESS(errorCode
);
395 if(U_FAILURE(errorCode
)) {
397 } else if(haveNormData
>0) {
399 } else if(haveNormData
<0) {
400 errorCode
=dataErrorCode
;
402 } else /* haveNormData==0 */ {
403 return (UBool
)(loadNormData(errorCode
)>0);
408 U_CAPI UBool U_EXPORT2
409 unorm_haveData(UErrorCode
*pErrorCode
) {
410 return _haveData(*pErrorCode
);
413 U_CAPI
const uint16_t * U_EXPORT2
414 unorm_getFCDTrie(UErrorCode
*pErrorCode
) {
415 if(_haveData(*pErrorCode
)) {
416 return fcdTrie
.index
;
422 /* data access primitives --------------------------------------------------- */
424 static inline uint32_t
425 _getNorm32(UChar c
) {
426 return UTRIE_GET32_FROM_LEAD(&normTrie
, c
);
429 static inline uint32_t
430 _getNorm32FromSurrogatePair(uint32_t norm32
, UChar c2
) {
432 * the surrogate index in norm32 stores only the number of the surrogate index block
433 * see gennorm/store.c/getFoldedNormValue()
436 UTRIE_BMP_INDEX_LENGTH
+
437 ((norm32
>>(_NORM_EXTRA_SHIFT
-UTRIE_SURROGATE_BLOCK_BITS
))&
438 (0x3ff<<UTRIE_SURROGATE_BLOCK_BITS
));
439 return UTRIE_GET32_FROM_OFFSET_TRAIL(&normTrie
, norm32
, c2
);
443 * get a norm32 from text with complete code points
444 * (like from decompositions)
446 static inline uint32_t
447 _getNorm32(const UChar
*p
, uint32_t mask
) {
448 uint32_t norm32
=_getNorm32(*p
);
449 if((norm32
&mask
) && isNorm32LeadSurrogate(norm32
)) {
450 /* *p is a lead surrogate, get the real norm32 */
451 norm32
=_getNorm32FromSurrogatePair(norm32
, *(p
+1));
456 static inline uint16_t
458 return UTRIE_GET16_FROM_LEAD(&fcdTrie
, c
);
461 static inline uint16_t
462 _getFCD16FromSurrogatePair(uint16_t fcd16
, UChar c2
) {
463 /* the surrogate index in fcd16 is an absolute offset over the start of stage 1 */
464 return UTRIE_GET16_FROM_OFFSET_TRAIL(&fcdTrie
, fcd16
, c2
);
467 static inline const uint16_t *
468 _getExtraData(uint32_t norm32
) {
469 return extraData
+(norm32
>>_NORM_EXTRA_SHIFT
);
474 * It is possible to get the FCD data from the main trie if unorm.icu
475 * was built without the FCD trie, although it is slower.
476 * This is not implemented because it is hard to test, and because it seems
477 * unusual to want to use FCD and not build the data file for it.
479 * Untested sample code:
481 static inline uint16_t
482 _getFCD16FromNormData(UChar32 c
) {
483 uint32_t norm32
, fcd
;
485 norm32
=_getNorm32(c
);
486 if((norm32
&_NORM_QC_NFD
) && isNorm32Regular(norm32
)) {
487 /* get the lead/trail cc from the decomposition data */
488 const uint16_t *nfd
=_getExtraData(norm32
);
489 if(*nfd
&_NORM_DECOMP_FLAG_LENGTH_HAS_CC
) {
493 fcd
=norm32
&_NORM_CC_MASK
;
495 /* use the code point cc value for both lead and trail cc's */
496 fcd
|=fcd
>>_NORM_CC_SHIFT
; /* assume that the cc is in bits 15..8 */
500 return (uint16_t)fcd
;
504 /* normalization exclusion sets --------------------------------------------- */
507 * Normalization exclusion UnicodeSets are used for tailored normalization;
508 * see the comment near the beginning of this file.
510 * By specifying one or several sets of code points,
511 * those code points become inert for normalization.
514 static const UnicodeSet
*
515 internalGetNXHangul(UErrorCode
&errorCode
) {
516 /* internal function, does not check for incoming U_FAILURE */
519 UMTX_CHECK(NULL
, (UBool
)(nxCache
[UNORM_NX_HANGUL
]!=NULL
), isCached
);
522 UnicodeSet
*set
=new UnicodeSet(0xac00, 0xd7a3);
524 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
527 // Compact the set for caching.
531 if(nxCache
[UNORM_NX_HANGUL
]==NULL
) {
532 nxCache
[UNORM_NX_HANGUL
]=set
;
534 ucln_common_registerCleanup(UCLN_COMMON_UNORM
, unorm_cleanup
);
541 return nxCache
[UNORM_NX_HANGUL
];
544 /* unorm.cpp 1.116 had and used
545 static const UnicodeSet *
546 internalGetNXFromPattern(int32_t options, const char *pattern, UErrorCode &errorCode) {
551 /* get and set an exclusion set from a serialized UnicodeSet */
552 static const UnicodeSet
*
553 internalGetSerializedNX(int32_t options
, int32_t nxIndex
, UErrorCode
&errorCode
) {
554 /* internal function, does not check for incoming U_FAILURE */
557 UMTX_CHECK(NULL
, (UBool
)(nxCache
[options
]!=NULL
), isCached
);
560 canonStartSets
!=NULL
&&
561 canonStartSets
[nxIndex
]!=0 && canonStartSets
[nxIndex
+1]>canonStartSets
[nxIndex
]
568 if( !uset_getSerializedSet(
570 canonStartSets
+canonStartSets
[nxIndex
],
571 canonStartSets
[nxIndex
+1]-canonStartSets
[nxIndex
])
573 errorCode
=U_INVALID_FORMAT_ERROR
;
577 /* turn the serialized set into a UnicodeSet */
578 set
=new UnicodeSet();
580 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
583 for(i
=0; uset_getSerializedRange(&sset
, i
, &start
, &end
); ++i
) {
584 set
->add(start
, end
);
586 // Compact the set for caching.
590 if(nxCache
[options
]==NULL
) {
591 nxCache
[options
]=set
;
593 ucln_common_registerCleanup(UCLN_COMMON_UNORM
, unorm_cleanup
);
600 return nxCache
[options
];
603 static const UnicodeSet
*
604 internalGetNXCJKCompat(UErrorCode
&errorCode
) {
605 /* build a set from [[:Ideographic:]&[:NFD_QC=No:]]=[CJK Ideographs]&[has canonical decomposition] */
606 return internalGetSerializedNX(
608 _NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET
,
612 static const UnicodeSet
*
613 internalGetNXUnicode(uint32_t options
, UErrorCode
&errorCode
) {
614 /* internal function, does not check for incoming U_FAILURE */
617 options
&=_NORM_OPTIONS_UNICODE_MASK
;
621 case UNORM_UNICODE_3_2
:
623 nxIndex
=_NORM_SET_INDEX_NX_UNICODE32_OFFSET
;
626 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
630 /* build a set with all code points that were not designated by the specified Unicode version */
631 return internalGetSerializedNX(options
, nxIndex
, errorCode
);
634 /* Get a decomposition exclusion set. The data must be loaded. */
635 static const UnicodeSet
*
636 internalGetNX(int32_t options
, UErrorCode
&errorCode
) {
637 options
&=_NORM_OPTIONS_SETS_MASK
;
641 UMTX_CHECK(NULL
, (UBool
)(nxCache
[options
]!=NULL
), isCached
);
644 /* return basic sets */
645 if(options
==UNORM_NX_HANGUL
) {
646 return internalGetNXHangul(errorCode
);
648 if(options
==UNORM_NX_CJK_COMPAT
) {
649 return internalGetNXCJKCompat(errorCode
);
651 if((options
&_NORM_OPTIONS_UNICODE_MASK
)!=0 && (options
&_NORM_OPTIONS_NX_MASK
)==0) {
652 return internalGetNXUnicode(options
, errorCode
);
655 /* build a set from multiple subsets */
657 const UnicodeSet
*other
;
659 set
=new UnicodeSet();
661 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
665 if((options
&UNORM_NX_HANGUL
)!=0 && NULL
!=(other
=internalGetNXHangul(errorCode
))) {
668 if((options
&UNORM_NX_CJK_COMPAT
)!=0 && NULL
!=(other
=internalGetNXCJKCompat(errorCode
))) {
671 if((options
&_NORM_OPTIONS_UNICODE_MASK
)!=0 && NULL
!=(other
=internalGetNXUnicode(options
, errorCode
))) {
675 if(U_FAILURE(errorCode
)) {
679 // Compact the set for caching.
683 if(nxCache
[options
]==NULL
) {
684 nxCache
[options
]=set
;
686 ucln_common_registerCleanup(UCLN_COMMON_UNORM
, unorm_cleanup
);
693 return nxCache
[options
];
696 static inline const UnicodeSet
*
697 getNX(int32_t options
, UErrorCode
&errorCode
) {
698 if(U_FAILURE(errorCode
) || (options
&=_NORM_OPTIONS_SETS_MASK
)==0) {
699 /* incoming failure, or no decomposition exclusions requested */
702 return internalGetNX(options
, errorCode
);
706 U_CFUNC
const UnicodeSet
*
707 unorm_getNX(int32_t options
, UErrorCode
*pErrorCode
) {
708 return getNX(options
, *pErrorCode
);
712 nx_contains(const UnicodeSet
*nx
, UChar32 c
) {
713 return nx
!=NULL
&& nx
->contains(c
);
717 nx_contains(const UnicodeSet
*nx
, UChar c
, UChar c2
) {
718 return nx
!=NULL
&& nx
->contains(c2
==0 ? c
: U16_GET_SUPPLEMENTARY(c
, c2
));
721 /* other normalization primitives ------------------------------------------- */
723 /* get the canonical or compatibility decomposition for one character */
724 static inline const UChar
*
725 _decompose(uint32_t norm32
, uint32_t qcMask
, int32_t &length
,
726 uint8_t &cc
, uint8_t &trailCC
) {
727 const UChar
*p
=(const UChar
*)_getExtraData(norm32
);
730 if((norm32
&qcMask
&_NORM_QC_NFKD
)!=0 && length
>=0x100) {
731 /* use compatibility decomposition, skip canonical data */
732 p
+=((length
>>7)&1)+(length
&_NORM_DECOMP_LENGTH_MASK
);
736 if(length
&_NORM_DECOMP_FLAG_LENGTH_HAS_CC
) {
737 /* get the lead and trail cc's */
739 cc
=(uint8_t)(bothCCs
>>8);
740 trailCC
=(uint8_t)bothCCs
;
742 /* lead and trail cc's are both 0 */
746 length
&=_NORM_DECOMP_LENGTH_MASK
;
750 /* get the canonical decomposition for one character */
751 static inline const UChar
*
752 _decompose(uint32_t norm32
, int32_t &length
,
753 uint8_t &cc
, uint8_t &trailCC
) {
754 const UChar
*p
=(const UChar
*)_getExtraData(norm32
);
757 if(length
&_NORM_DECOMP_FLAG_LENGTH_HAS_CC
) {
758 /* get the lead and trail cc's */
760 cc
=(uint8_t)(bothCCs
>>8);
761 trailCC
=(uint8_t)bothCCs
;
763 /* lead and trail cc's are both 0 */
767 length
&=_NORM_DECOMP_LENGTH_MASK
;
772 * Get the canonical decomposition for one code point.
773 * @param c code point
774 * @param buffer out-only buffer for algorithmic decompositions of Hangul
775 * @param length out-only, takes the length of the decomposition, if any
776 * @return pointer to decomposition, or 0 if none
779 U_CFUNC
const UChar
*
780 unorm_getCanonicalDecomposition(UChar32 c
, UChar buffer
[4], int32_t *pLength
) {
783 if(c
<indexes
[_NORM_INDEX_MIN_NFD_NO_MAYBE
]) {
788 UTRIE_GET32(&normTrie
, c
, norm32
);
789 if(norm32
&_NORM_QC_NFD
) {
790 if(isNorm32HangulOrJamo(norm32
)) {
791 /* Hangul syllable: decompose algorithmically */
796 c2
=(UChar
)(c%JAMO_T_COUNT
);
799 buffer
[2]=(UChar
)(JAMO_T_BASE
+c2
);
805 buffer
[1]=(UChar
)(JAMO_V_BASE
+c%JAMO_V_COUNT
);
806 buffer
[0]=(UChar
)(JAMO_L_BASE
+c
/JAMO_V_COUNT
);
809 /* normal decomposition */
811 return _decompose(norm32
, *pLength
, cc
, trailCC
);
819 * get the combining class of (c, c2)=*p++
820 * before: p<limit after: p<=limit
821 * if only one code unit is used, then c2==0
823 static inline uint8_t
824 _getNextCC(const UChar
*&p
, const UChar
*limit
, UChar
&c
, UChar
&c2
) {
828 norm32
=_getNorm32(c
);
829 if((norm32
&_NORM_CC_MASK
)==0) {
833 if(!isNorm32LeadSurrogate(norm32
)) {
836 /* c is a lead surrogate, get the real norm32 */
837 if(p
!=limit
&& UTF_IS_SECOND_SURROGATE(c2
=*p
)) {
839 norm32
=_getNorm32FromSurrogatePair(norm32
, c2
);
846 return (uint8_t)(norm32
>>_NORM_CC_SHIFT
);
851 * read backwards and get norm32
852 * return 0 if the character is <minC
853 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
855 static inline uint32_t
856 _getPrevNorm32(const UChar
*start
, const UChar
*&src
,
857 uint32_t minC
, uint32_t mask
,
858 UChar
&c
, UChar
&c2
) {
864 /* check for a surrogate before getting norm32 to see if we need to predecrement further */
867 } else if(!UTF_IS_SURROGATE(c
)) {
868 return _getNorm32(c
);
869 } else if(UTF_IS_SURROGATE_FIRST(c
)) {
870 /* unpaired first surrogate */
872 } else if(src
!=start
&& UTF_IS_FIRST_SURROGATE(c2
=*(src
-1))) {
874 norm32
=_getNorm32(c2
);
876 if((norm32
&mask
)==0) {
877 /* all surrogate pairs with this lead surrogate have only irrelevant data */
880 /* norm32 must be a surrogate special */
881 return _getNorm32FromSurrogatePair(norm32
, c
);
884 /* unpaired second surrogate */
891 * get the combining class of (c, c2)=*--p
892 * before: start<p after: start<=p
894 static inline uint8_t
895 _getPrevCC(const UChar
*start
, const UChar
*&p
) {
898 return (uint8_t)(_getPrevNorm32(start
, p
, _NORM_MIN_WITH_LEAD_CC
, _NORM_CC_MASK
, c
, c2
)>>_NORM_CC_SHIFT
);
902 * is this a safe boundary character for NF*D?
906 _isNFDSafe(uint32_t norm32
, uint32_t ccOrQCMask
, uint32_t decompQCMask
) {
907 if((norm32
&ccOrQCMask
)==0) {
908 return TRUE
; /* cc==0 and no decomposition: this is NF*D safe */
911 /* inspect its decomposition - maybe a Hangul but not a surrogate here */
912 if(isNorm32Regular(norm32
) && (norm32
&decompQCMask
)!=0) {
916 /* decomposes, get everything from the variable-length extra data */
917 _decompose(norm32
, decompQCMask
, length
, cc
, trailCC
);
920 /* no decomposition (or Hangul), test the cc directly */
921 return (norm32
&_NORM_CC_MASK
)==0;
926 * is this (or does its decomposition begin with) a "true starter"?
927 * (cc==0 and NF*C_YES)
930 _isTrueStarter(uint32_t norm32
, uint32_t ccOrQCMask
, uint32_t decompQCMask
) {
931 if((norm32
&ccOrQCMask
)==0) {
932 return TRUE
; /* this is a true starter (could be Hangul or Jamo L) */
935 /* inspect its decomposition - not a Hangul or a surrogate here */
936 if((norm32
&decompQCMask
)!=0) {
941 /* decomposes, get everything from the variable-length extra data */
942 p
=_decompose(norm32
, decompQCMask
, length
, cc
, trailCC
);
944 uint32_t qcMask
=ccOrQCMask
&_NORM_QC_MASK
;
946 /* does it begin with NFC_YES? */
947 if((_getNorm32(p
, qcMask
)&qcMask
)==0) {
948 /* yes, the decomposition begins with a true starter */
957 U_CAPI
uint8_t U_EXPORT2
958 u_getCombiningClass(UChar32 c
) {
959 #if !UNORM_HARDCODE_DATA
960 UErrorCode errorCode
=U_ZERO_ERROR
;
961 if(_haveData(errorCode
)) {
965 UTRIE_GET32(&normTrie
, c
, norm32
);
966 return (uint8_t)(norm32
>>_NORM_CC_SHIFT
);
967 #if !UNORM_HARDCODE_DATA
974 U_CFUNC UBool U_EXPORT2
975 unorm_internalIsFullCompositionExclusion(UChar32 c
) {
976 #if UNORM_HARDCODE_DATA
977 if(auxTrie
.index
!=NULL
) {
979 UErrorCode errorCode
=U_ZERO_ERROR
;
980 if(_haveData(errorCode
) && auxTrie
.index
!=NULL
) {
984 UTRIE_GET16(&auxTrie
, c
, aux
);
985 return (UBool
)((aux
&_NORM_AUX_COMP_EX_MASK
)!=0);
991 U_CFUNC UBool U_EXPORT2
992 unorm_isCanonSafeStart(UChar32 c
) {
993 #if UNORM_HARDCODE_DATA
994 if(auxTrie
.index
!=NULL
) {
996 UErrorCode errorCode
=U_ZERO_ERROR
;
997 if(_haveData(errorCode
) && auxTrie
.index
!=NULL
) {
1001 UTRIE_GET16(&auxTrie
, c
, aux
);
1002 return (UBool
)((aux
&_NORM_AUX_UNSAFE_MASK
)==0);
1008 U_CAPI
void U_EXPORT2
1009 unorm_getUnicodeVersion(UVersionInfo
*versionInfo
, UErrorCode
*pErrorCode
){
1010 if(unorm_haveData(pErrorCode
)){
1011 uprv_memcpy(*versionInfo
, dataVersion
, 4);
1016 U_CAPI UBool U_EXPORT2
1017 unorm_getCanonStartSet(UChar32 c
, USerializedSet
*fillSet
) {
1018 #if !UNORM_HARDCODE_DATA
1019 UErrorCode errorCode
=U_ZERO_ERROR
;
1021 if( fillSet
!=NULL
&& (uint32_t)c
<=0x10ffff &&
1022 #if !UNORM_HARDCODE_DATA
1023 _haveData(errorCode
) &&
1025 canonStartSets
!=NULL
1027 const uint16_t *table
;
1028 int32_t i
, start
, limit
;
1031 * binary search for c
1033 * There are two search tables,
1034 * one for BMP code points and one for supplementary ones.
1035 * See unormimp.h for details.
1038 table
=canonStartSets
+canonStartSets
[_NORM_SET_INDEX_CANON_SETS_LENGTH
];
1040 limit
=canonStartSets
[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH
];
1042 /* each entry is a pair { c, result } */
1043 while(start
<limit
-2) {
1044 i
=(uint16_t)(((start
+limit
)/4)*2); /* (start+limit)/2 and address pairs */
1053 if(c
==table
[start
]) {
1055 if((i
&_NORM_CANON_SET_BMP_MASK
)==_NORM_CANON_SET_BMP_IS_INDEX
) {
1056 /* result 01xxxxxx xxxxxx contains index x to a USerializedSet */
1057 i
&=(_NORM_MAX_CANON_SETS
-1);
1058 return uset_getSerializedSet(fillSet
,
1060 canonStartSets
[_NORM_SET_INDEX_CANON_SETS_LENGTH
]-i
);
1062 /* other result values are BMP code points for single-code point sets */
1063 uset_setSerializedToOne(fillSet
, (UChar32
)i
);
1068 uint16_t high
, low
, h
;
1070 table
=canonStartSets
+canonStartSets
[_NORM_SET_INDEX_CANON_SETS_LENGTH
]+
1071 canonStartSets
[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH
];
1073 limit
=canonStartSets
[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH
];
1075 high
=(uint16_t)(c
>>16);
1078 /* each entry is a triplet { high(c), low(c), result } */
1079 while(start
<limit
-3) {
1080 i
=(uint16_t)(((start
+limit
)/6)*3); /* (start+limit)/2 and address triplets */
1081 h
=table
[i
]&0x1f; /* high word */
1082 if(high
<h
|| (high
==h
&& low
<table
[i
+1])) {
1091 if(high
==(h
&0x1f) && low
==table
[start
+1]) {
1094 /* the result is an index to a USerializedSet */
1095 return uset_getSerializedSet(fillSet
,
1097 canonStartSets
[_NORM_SET_INDEX_CANON_SETS_LENGTH
]-i
);
1100 * single-code point set {x} in
1101 * triplet { 100xxxxx 000hhhhh llllllll llllllll xxxxxxxx xxxxxxxx }
1103 i
|=((int32_t)h
&0x1f00)<<8; /* add high bits from high(c) */
1104 uset_setSerializedToOne(fillSet
, (UChar32
)i
);
1111 return FALSE
; /* not found */
1114 U_CAPI
int32_t U_EXPORT2
1115 u_getFC_NFKC_Closure(UChar32 c
, UChar
*dest
, int32_t destCapacity
, UErrorCode
*pErrorCode
) {
1118 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
1121 if(destCapacity
<0 || (dest
==NULL
&& destCapacity
>0)) {
1122 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1125 if(!_haveData(*pErrorCode
) || auxTrie
.index
==NULL
) {
1129 UTRIE_GET16(&auxTrie
, c
, aux
);
1130 aux
&=_NORM_AUX_FNC_MASK
;
1135 s
=(const UChar
*)(extraData
+aux
);
1137 /* s points to the single-unit string */
1143 if(0<length
&& length
<=destCapacity
) {
1144 uprv_memcpy(dest
, s
, length
*U_SIZEOF_UCHAR
);
1146 return u_terminateUChars(dest
, destCapacity
, length
, pErrorCode
);
1148 return u_terminateUChars(dest
, destCapacity
, 0, pErrorCode
);
1152 /* Is c an NF<mode>-skippable code point? See unormimp.h. */
1153 U_CAPI UBool U_EXPORT2
1154 unorm_isNFSkippable(UChar32 c
, UNormalizationMode mode
) {
1155 uint32_t norm32
, mask
;
1158 #if !UNORM_HARDCODE_DATA
1159 UErrorCode errorCode
=U_ZERO_ERROR
;
1160 if(!_haveData(errorCode
)) {
1165 /* handle trivial cases; set the comparison mask for the normal ones */
1170 mask
=_NORM_CC_MASK
|_NORM_QC_NFD
;
1173 mask
=_NORM_CC_MASK
|_NORM_QC_NFKD
;
1176 /* case UNORM_FCC: */
1177 mask
=_NORM_CC_MASK
|_NORM_COMBINES_ANY
|(_NORM_QC_NFC
&_NORM_QC_ANY_NO
);
1180 mask
=_NORM_CC_MASK
|_NORM_COMBINES_ANY
|(_NORM_QC_NFKC
&_NORM_QC_ANY_NO
);
1183 /* FCD: skippable if lead cc==0 and trail cc<=1 */
1184 if(fcdTrie
.index
!=NULL
) {
1185 UTRIE_GET16(&fcdTrie
, c
, fcd
);
1194 /* check conditions (a)..(e), see unormimp.h */
1195 UTRIE_GET32(&normTrie
, c
, norm32
);
1196 if((norm32
&mask
)!=0) {
1197 return FALSE
; /* fails (a)..(e), not skippable */
1200 if(mode
<UNORM_NFC
) {
1201 return TRUE
; /* NF*D, passed (a)..(c), is skippable */
1204 /* NF*C/FCC, passed (a)..(e) */
1205 if((norm32
&_NORM_QC_NFD
)==0) {
1206 return TRUE
; /* no canonical decomposition, is skippable */
1209 /* check Hangul syllables algorithmically */
1210 if(isNorm32HangulOrJamo(norm32
)) {
1211 /* Jamo passed (a)..(e) above, must be Hangul */
1212 return !isHangulWithoutJamoT((UChar
)c
); /* LVT are skippable, LV are not */
1215 /* if(mode<=UNORM_NFKC) { -- enable when implementing FCC */
1216 /* NF*C, test (f) flag */
1217 if(!formatVersion_2_2
|| auxTrie
.index
==NULL
) {
1218 return FALSE
; /* no (f) data, say not skippable to be safe */
1221 UTRIE_GET16(&auxTrie
, c
, aux
);
1222 return (aux
&_NORM_AUX_NFC_SKIP_F_MASK
)==0; /* TRUE=skippable if the (f) flag is not set */
1224 /* } else { FCC, test fcd<=1 instead of the above } */
/*
 * unorm_addPropertyStarts(sa, pErrorCode)
 *
 * Reports range-start code points to the caller-supplied USetAdder:
 *   1. enumerates normTrie, and fcdTrie/auxTrie when their index pointers are
 *      non-NULL, passing _enumPropertyStartsRange as the range callback and
 *      sa as its context;
 *   2. walks [HANGUL_BASE, HANGUL_BASE+HANGUL_COUNT) in steps of JAMO_T_COUNT
 *      and adds both c and c+1 for each step;
 *   3. adds HANGUL_BASE+HANGUL_COUNT itself.
 * All additions go through sa->add(sa->set, ...).
 *
 * NOTE(review): what happens when _haveData(*pErrorCode) fails is not visible
 * here (presumably an early return) -- confirm.
 */
1227 U_CAPI
void U_EXPORT2
1228 unorm_addPropertyStarts(const USetAdder
*sa
, UErrorCode
*pErrorCode
) {
1231 if(!_haveData(*pErrorCode
)) {
1235 /* add the start code point of each same-value range of each trie */
1236 utrie_enum(&normTrie
, NULL
, _enumPropertyStartsRange
, sa
);
/* the FCD trie is enumerated only when present */
1237 if(fcdTrie
.index
!=NULL
) {
1238 utrie_enum(&fcdTrie
, NULL
, _enumPropertyStartsRange
, sa
);
/* likewise for the auxiliary trie */
1240 if(auxTrie
.index
!=NULL
) {
1241 utrie_enum(&auxTrie
, NULL
, _enumPropertyStartsRange
, sa
);
1244 /* add Hangul LV syllables and LV+1 because of skippables */
/* stepping by JAMO_T_COUNT visits one code point per block of JAMO_T_COUNT syllables */
1245 for(c
=HANGUL_BASE
; c
<HANGUL_BASE
+HANGUL_COUNT
; c
+=JAMO_T_COUNT
) {
1246 sa
->add(sa
->set
, c
);
1247 sa
->add(sa
->set
, c
+1);
1249 sa
->add(sa
->set
, HANGUL_BASE
+HANGUL_COUNT
); /* add Hangul+1 to continue with other properties */
/*
 * unorm_getQuickCheck(c, mode)
 *
 * Reads the norm32 value of c from normTrie and keeps only the quick-check
 * bits that belong to the requested mode (qcMask[mode]). The result is then
 * classified three ways: zero, any "no" bit (_NORM_QC_ANY_NO), or otherwise
 * "maybe". The values returned from each branch are not visible in this
 * listing.
 *
 * qcMask has UNORM_MODE_COUNT slots but only six initializers are visible;
 * slots without an initializer are zero per the language rules.
 *
 * NOTE(review): mode is used as an array index with no visible range check;
 * callers are presumably expected to pass a valid UNormalizationMode.
 */
1252 U_CFUNC UNormalizationCheckResult U_EXPORT2
1253 unorm_getQuickCheck(UChar32 c
, UNormalizationMode mode
) {
/* per-mode quick-check bit, indexed by the UNormalizationMode value */
1254 static const uint32_t qcMask
[UNORM_MODE_COUNT
]={
1255 0, 0, _NORM_QC_NFD
, _NORM_QC_NFKD
, _NORM_QC_NFC
, _NORM_QC_NFKC
1260 #if !UNORM_HARDCODE_DATA
1261 UErrorCode errorCode
=U_ZERO_ERROR
;
1262 if(!_haveData(errorCode
)) {
1267 UTRIE_GET32(&normTrie
, c
, norm32
);
/* keep only the bits relevant for this mode */
1268 norm32
&=qcMask
[mode
];
1272 } else if(norm32
&_NORM_QC_ANY_NO
) {
1274 } else /* _NORM_QC_ANY_MAYBE */ {
/*
 * unorm_getFCD16FromCodePoint(c)
 *
 * Looks up the 16-bit FCD value for code point c in fcdTrie with UTRIE_GET16.
 * When data is not hardcoded (!UNORM_HARDCODE_DATA), _haveData() takes part
 * in a guard condition evaluated before the lookup.
 *
 * NOTE(review): the remainder of the guard condition and the return
 * statements are not visible in this listing.
 */
1279 U_CFUNC
uint16_t U_EXPORT2
1280 unorm_getFCD16FromCodePoint(UChar32 c
) {
1282 #if !UNORM_HARDCODE_DATA
/* local status only; this function has no error-code parameter */
1283 UErrorCode errorCode
;
1284 errorCode
=U_ZERO_ERROR
;
1288 #if !UNORM_HARDCODE_DATA
1289 !_haveData(errorCode
) ||
1296 UTRIE_GET16(&fcdTrie
, c
, fcd
);
1300 /* reorder UTF-16 in-place -------------------------------------------------- */
/*
 * _insertOrdered(start, current, p, c, c2, cc)
 *
 * Inserts the single code point (c, c2) with combining class cc into the
 * already ordered text [start..current[ so that [start..p[ ends up ordered.
 * Work is only needed when there is preceding text (start<current) and cc!=0:
 * the code then walks backwards with _getPrevCC() looking for the insertion
 * point, shifts the code units in between upwards, and writes (c, c2).
 * trailCC starts out as cc, i.e. the inserted character's class is the
 * default trailing class.
 *
 * NOTE(review): the original block-comment delimiters and several statements
 * are missing from this listing; the lines starting with a number and an
 * asterisk below are the body of the original header comment.
 */
1303 * simpler, single-character version of _mergeOrdered() -
1304 * bubble-insert one single code point into the preceding string
1305 * which is already canonically ordered
1306 * (c, c2) may or may not yet have been inserted at [current..p[
1308 * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2)
1310 * before: [start..current[ is already ordered, and
1311 * [current..p[ may or may not hold (c, c2) but
1312 * must be exactly the same length as (c, c2)
1313 * after: [start..p[ is ordered
1315 * returns the trailing combining class
1318 _insertOrdered(const UChar
*start
, UChar
*current
, UChar
*p
,
1319 UChar c
, UChar c2
, uint8_t cc
) {
1320 const UChar
*pBack
, *pPreBack
;
1322 uint8_t prevCC
, trailCC
=cc
;
/* nothing to reorder when there is no preceding text or the character is a starter (cc==0) */
1324 if(start
<current
&& cc
!=0) {
1325 /* search for the insertion point where cc>=prevCC */
1326 pPreBack
=pBack
=current
;
/* _getPrevCC() is given pPreBack; presumably it moves the pointer back by one code point -- confirm */
1327 prevCC
=_getPrevCC(start
, pPreBack
);
1329 /* this will be the last code point, so keep its cc */
1332 while(start
<pPreBack
) {
1333 prevCC
=_getPrevCC(start
, pPreBack
);
1341 * this is where we are right now with all these pointers:
1342 * [start..pPreBack[ 0..? code points that we can ignore
1343 * [pPreBack..pBack[ 0..1 code points with prevCC<=cc
1344 * [pBack..current[ 0..n code points with >cc, move up to insert (c, c2)
1345 * [current..p[ 1 code point (c, c2) with cc
1348 /* move the code units in between up */
1352 } while(pBack
!=current
);
1356 /* insert (c, c2) */
1362 /* we know the cc of the last code point */
/*
 * _mergeOrdered(start, current, next, limit, isOrdered=TRUE)
 *
 * Merges the second part [next..limit[ into the ordered first part
 * [start..current[ one code point at a time: each code point is fetched with
 * _getNextCC() and, where needed, placed with _insertOrdered().
 * `adjacent` records whether the second part begins exactly where the first
 * one ends (current==next). The merge loop is entered when the first part is
 * non-empty or when isOrdered is FALSE.
 * The last visible return recomputes the trailing class with _getPrevCC().
 *
 * NOTE(review): the original block-comment delimiters and several statements
 * are missing from this listing; the numbered asterisk lines are the body of
 * the original header comment.
 */
1367 * merge two UTF-16 string parts together
1368 * to canonically order (order by combining classes) their concatenation
1370 * the two strings may already be adjacent, so that the merging is done in-place
1371 * if the two strings are not adjacent, then the buffer holding the first one
1372 * must be large enough
1373 * the second string may or may not be ordered in itself
1375 * before: [start..current[ is already ordered, and
1376 * [next..limit[ may be ordered in itself, but
1377 * is not in relation to [start..current[
1378 * after: [start..current+(limit-next)[ is ordered
1380 * the algorithm is a simple bubble-sort that takes the characters from *next++
1381 * and inserts them in correct combining class order into the preceding part
1384 * since this function is called much less often than the single-code point
1385 * _insertOrdered(), it just uses that for easier maintenance
1386 * (see file version from before 2001aug31 for a more optimized version)
1388 * returns the trailing combining class
1391 _mergeOrdered(UChar
*start
, UChar
*current
,
1392 const UChar
*next
, const UChar
*limit
, UBool isOrdered
=TRUE
) {
1395 uint8_t cc
, trailCC
=0;
/* TRUE when the second part directly follows the first one in memory */
1398 adjacent
= current
==next
;
1400 if(start
!=current
|| !isOrdered
) {
/* fetch the next code point (c, c2) of the second part together with its combining class */
1402 cc
=_getNextCC(next
, limit
, c
, c2
);
1404 /* does not bubble back */
1407 current
=(UChar
*)next
;
/* r marks the end of the slot for (c, c2): one unit for BMP, two for a surrogate pair */
1420 r
=current
+(c2
==0 ? 1 : 2);
1421 trailCC
=_insertOrdered(start
, current
, r
, c
, c2
, cc
);
1428 /* we know the cc of the last code point */
1432 /* copy the second string part */
1435 } while(next
!=limit
);
1438 return _getPrevCC(start
, limit
);
/*
 * _findPreviousStarter(start, src, ccOrQCMask, decompQCMask, minNoMaybe)
 *
 * Each visible step obtains the norm32 of the character before src via
 * _getPrevNorm32() (masking with ccOrQCMask|decompQCMask) and tests it with
 * _isTrueStarter().
 *
 * NOTE(review): the enclosing loop and the return statement are not visible
 * in this listing; presumably the scan stops at start or at the first true
 * starter found -- confirm.
 */
1442 /* find the last true starter in [start..src[ and return the pointer to it */
1443 static const UChar
*
1444 _findPreviousStarter(const UChar
*start
, const UChar
*src
,
1445 uint32_t ccOrQCMask
, uint32_t decompQCMask
, UChar minNoMaybe
) {
1450 norm32
=_getPrevNorm32(start
, src
, minNoMaybe
, ccOrQCMask
|decompQCMask
, c
, c2
);
1451 if(_isTrueStarter(norm32
, ccOrQCMask
, decompQCMask
)) {
/*
 * _findNextStarter(src, limit, qcMask, decompQCMask, minNoMaybe)
 *
 * Scans forward from src and stops (break) at the first position that
 * qualifies as a true starter:
 *   - end of string or a code unit caught by the early checks,
 *   - a character whose norm32 has none of the ccOrQCMask bits set,
 *   - an unmatched lead surrogate,
 *   - a character that decomposes (norm32&decompQCMask) where the
 *     decomposition has lead cc==0 and its first character has no qcMask bits.
 * Otherwise src advances by one or two code units (c2==0 ? 1 : 2).
 *
 * NOTE(review): the loop header, some early tests and the final return are
 * not visible in this listing.
 */
1458 /* find the first true starter in [src..limit[ and return the pointer to it */
1459 static const UChar
*
1460 _findNextStarter(const UChar
*src
, const UChar
*limit
,
1461 uint32_t qcMask
, uint32_t decompQCMask
, UChar minNoMaybe
) {
1463 uint32_t norm32
, ccOrQCMask
;
1466 uint8_t cc
, trailCC
;
/* a true starter must have cc==0 and no quick-check bit for this mode */
1468 ccOrQCMask
=_NORM_CC_MASK
|qcMask
;
1472 break; /* end of string */
1476 break; /* catches NUL terminator, too */
1479 norm32
=_getNorm32(c
);
1480 if((norm32
&ccOrQCMask
)==0) {
1481 break; /* true starter */
1484 if(isNorm32LeadSurrogate(norm32
)) {
1485 /* c is a lead surrogate, get the real norm32 */
/* peek at the following unit without consuming it; it must exist and be a trail surrogate */
1486 if((src
+1)==limit
|| !UTF_IS_SECOND_SURROGATE(c2
=*(src
+1))) {
1487 break; /* unmatched first surrogate: counts as a true starter */
1489 norm32
=_getNorm32FromSurrogatePair(norm32
, c2
);
1491 if((norm32
&ccOrQCMask
)==0) {
1492 break; /* true starter */
1498 /* (c, c2) is not a true starter but its decomposition may be */
1499 if(norm32
&decompQCMask
) {
1500 /* (c, c2) decomposes, get everything from the variable-length extra data */
1501 p
=_decompose(norm32
, decompQCMask
, length
, cc
, trailCC
);
1503 /* get the first character's norm32 to check if it is a true starter */
1504 if(cc
==0 && (_getNorm32(p
, qcMask
)&qcMask
)==0) {
1505 break; /* true starter */
1509 src
+= c2
==0 ? 1 : 2; /* not a true starter, continue */
1515 /* make NFD & NFKD ---------------------------------------------------------- */
/*
 * unorm_getDecomposition(c, compat, dest, destCapacity)
 *
 * Writes the decomposition of the single code point c into dest.
 * Arguments are accepted only if c<=0x10ffff (the unsigned cast also rejects
 * negative values), data is available, and dest/destCapacity are consistent
 * (non-NULL dest with positive capacity, or capacity 0).
 * Two (minNoMaybe, qcMask) pairs are visible, one using the NFD constants and
 * one using the NFKD constants; the selecting condition is not visible but is
 * presumably the compat flag.
 * Three cases follow from the norm32 of c:
 *   - no qcMask bit: c is copied as is (one unit, or lead+trail surrogates);
 *   - Hangul/Jamo norm32: algorithmic decomposition into Jamo L, V and
 *     optionally T;
 *   - otherwise: the decomposition comes from _decompose() and is copied when
 *     it fits (length<=destCapacity).
 *
 * NOTE(review): return values and several statements are not visible in this
 * listing.
 */
1517 U_CAPI
int32_t U_EXPORT2
1518 unorm_getDecomposition(UChar32 c
, UBool compat
,
1519 UChar
*dest
, int32_t destCapacity
) {
1520 #if !UNORM_HARDCODE_DATA
1521 UErrorCode errorCode
=U_ZERO_ERROR
;
1523 if( (uint32_t)c
<=0x10ffff &&
1524 #if !UNORM_HARDCODE_DATA
1525 _haveData(errorCode
) &&
1527 ((dest
!=NULL
&& destCapacity
>0) || destCapacity
==0)
1529 uint32_t norm32
, qcMask
;
/* canonical variant: NFD threshold and quick-check bit */
1535 minNoMaybe
=(UChar32
)indexes
[_NORM_INDEX_MIN_NFD_NO_MAYBE
];
1536 qcMask
=_NORM_QC_NFD
;
/* compatibility variant: NFKD threshold and quick-check bit */
1538 minNoMaybe
=(UChar32
)indexes
[_NORM_INDEX_MIN_NFKD_NO_MAYBE
];
1539 qcMask
=_NORM_QC_NFKD
;
1544 if(destCapacity
>0) {
1551 UTRIE_GET32(&normTrie
, c
, norm32
);
1552 if((norm32
&qcMask
)==0) {
1553 /* simple case: no decomposition */
1555 if(destCapacity
>0) {
/* supplementary code point: needs room for both surrogates */
1560 if(destCapacity
>=2) {
1561 dest
[0]=UTF16_LEAD(c
);
1562 dest
[1]=UTF16_TRAIL(c
);
1566 } else if(isNorm32HangulOrJamo(norm32
)) {
1567 /* Hangul syllable: decompose algorithmically */
/* c2 becomes the Jamo T index within the syllable */
1572 c2
=(UChar
)(c%JAMO_T_COUNT
);
1575 if(destCapacity
>=3) {
1576 dest
[2]=(UChar
)(JAMO_T_BASE
+c2
);
1583 if(destCapacity
>=2) {
/* NOTE(review): these formulas assume c was reduced beforehand (syllable offset divided by JAMO_T_COUNT); those statements are not visible */
1584 dest
[1]=(UChar
)(JAMO_V_BASE
+c%JAMO_V_COUNT
);
1585 dest
[0]=(UChar
)(JAMO_L_BASE
+c
/JAMO_V_COUNT
);
1589 /* c decomposes, get everything from the variable-length extra data */
1590 const UChar
*p
, *limit
;
1591 uint8_t cc
, trailCC
;
1593 p
=_decompose(norm32
, qcMask
, length
, cc
, trailCC
);
1594 if(length
<=destCapacity
) {
/*
 * _decompose(dest, destCapacity, src, srcLength, compat, nx, outTrailCC)
 *
 * String-level decomposition with canonical reordering.
 * Outline of the visible code:
 *   1. (minNoMaybe, qcMask) is set from either the NFD or the NFKD constants
 *      (the selecting condition is not visible; presumably compat).
 *   2. A fast loop skips code units that are below minNoMaybe or whose norm32
 *      has no ccOrQCMask bits; there is one loop form for zero-terminated
 *      input and one for input with an explicit limit. The skipped run is
 *      copied with uprv_memcpy() when it fits.
 *   3. The next character is classified: Hangul/Jamo (algorithmic
 *      decomposition into a local buffer unless excluded by nx), regular,
 *      or lead surrogate (paired up with a following trail surrogate).
 *   4. Excluded characters (nx_contains) and characters without the qcMask
 *      bit are kept as is; others are decomposed via the table-based
 *      _decompose() overload.
 *   5. The result is appended to dest; if its lead cc is non-zero and lower
 *      than prevCC it is put in order with _insertOrdered() (single code
 *      point) or _mergeOrdered() (multi-unit decomposition).
 * Per the in-code comments, destIndex keeps growing on overflow so that the
 * caller can preflight the required length.
 * outTrailCC is an output parameter; its assignment is not visible here.
 *
 * NOTE(review): many statements (returns, else branches, closing braces) are
 * missing from this listing.
 */
1608 _decompose(UChar
*dest
, int32_t destCapacity
,
1609 const UChar
*src
, int32_t srcLength
,
1610 UBool compat
, const UnicodeSet
*nx
,
1611 uint8_t &outTrailCC
) {
1613 const UChar
*limit
, *prevSrc
, *p
;
1614 uint32_t norm32
, ccOrQCMask
, qcMask
;
1615 int32_t destIndex
, reorderStartIndex
, length
;
1616 UChar c
, c2
, minNoMaybe
;
1617 uint8_t cc
, prevCC
, trailCC
;
/* canonical variant */
1620 minNoMaybe
=(UChar
)indexes
[_NORM_INDEX_MIN_NFD_NO_MAYBE
];
1621 qcMask
=_NORM_QC_NFD
;
/* compatibility variant */
1623 minNoMaybe
=(UChar
)indexes
[_NORM_INDEX_MIN_NFKD_NO_MAYBE
];
1624 qcMask
=_NORM_QC_NFKD
;
/* a character needs work if it has a non-zero cc or the quick-check bit for this mode */
1628 ccOrQCMask
=_NORM_CC_MASK
|qcMask
;
1629 destIndex
=reorderStartIndex
=0;
1632 /* avoid compiler warnings */
1639 /* string with length */
1640 limit
=src
+srcLength
;
1641 } else /* srcLength==-1 */ {
1642 /* zero-terminated string */
1649 /* count code units below the minimum or with irrelevant data for the quick check */
/* zero-terminated form: below minNoMaybe only a NUL stops the loop */
1652 while((c
=*src
)<minNoMaybe
? c
!=0 : ((norm32
=_getNorm32(c
))&ccOrQCMask
)==0) {
/* explicit-limit form of the same fast loop */
1657 while(src
!=limit
&& ((c
=*src
)<minNoMaybe
|| ((norm32
=_getNorm32(c
))&ccOrQCMask
)==0)) {
1663 /* copy these code units all at once */
1665 length
=(int32_t)(src
-prevSrc
);
1666 if((destIndex
+length
)<=destCapacity
) {
1667 uprv_memcpy(dest
+destIndex
, prevSrc
, length
*U_SIZEOF_UCHAR
);
/* the reordering window restarts at the current output position */
1670 reorderStartIndex
=destIndex
;
1673 /* end of source reached? */
/* limit==NULL denotes the zero-terminated case, where c==0 marks the end */
1674 if(limit
==NULL
? c
==0 : src
==limit
) {
1678 /* c already contains *src and norm32 is set for it, increment src */
1681 /* check one above-minimum, relevant code unit */
1683 * generally, set p and length to the decomposition string
1684 * in simple cases, p==NULL and (c, c2) will hold the length code units to append
1685 * in all cases, set cc to the lead and trailCC to the trail combining class
1687 * the following merge-sort of the current character into the preceding,
1688 * canonically ordered result text will use the optimized _insertOrdered()
1689 * if there is only one single code point to process;
1690 * this is indicated with p==NULL, and (c, c2) is the character to insert
1691 * ((c, 0) for a BMP character and (lead surrogate, trail surrogate)
1692 * for a supplementary character)
1693 * otherwise, p[length] is merged in with _mergeOrdered()
1695 if(isNorm32HangulOrJamo(norm32
)) {
/* characters in the exclusion set nx are left untouched */
1696 if(nx_contains(nx
, c
)) {
1701 /* Hangul syllable: decompose algorithmically */
1707 c2
=(UChar
)(c%JAMO_T_COUNT
);
1710 buffer
[2]=(UChar
)(JAMO_T_BASE
+c2
);
1716 buffer
[1]=(UChar
)(JAMO_V_BASE
+c%JAMO_V_COUNT
);
1717 buffer
[0]=(UChar
)(JAMO_L_BASE
+c
/JAMO_V_COUNT
);
1720 if(isNorm32Regular(norm32
)) {
1724 /* c is a lead surrogate, get the real norm32 */
1725 if(src
!=limit
&& UTF_IS_SECOND_SURROGATE(c2
=*src
)) {
1728 norm32
=_getNorm32FromSurrogatePair(norm32
, c2
);
1736 /* get the decomposition and the lead and trail cc's */
1737 if(nx_contains(nx
, c
, c2
)) {
1738 /* excluded: norm32==0 */
1741 } else if((norm32
&qcMask
)==0) {
1742 /* c does not decompose */
/* the combining class is stored in the norm32 bits at and above _NORM_CC_SHIFT */
1743 cc
=trailCC
=(uint8_t)(norm32
>>_NORM_CC_SHIFT
);
1746 /* c decomposes, get everything from the variable-length extra data */
1747 p
=_decompose(norm32
, qcMask
, length
, cc
, trailCC
);
1749 /* fastpath a single code unit from decomposition */
1757 /* append the decomposition to the destination buffer, assume length>0 */
1758 if((destIndex
+length
)<=destCapacity
) {
/* reorderSplit: end of the text already written, i.e. where the new piece starts */
1759 UChar
*reorderSplit
=dest
+destIndex
;
1761 /* fastpath: single code point */
1762 if(cc
!=0 && cc
<prevCC
) {
1763 /* (c, c2) is out of order with respect to the preceding text */
1765 trailCC
=_insertOrdered(dest
+reorderStartIndex
, reorderSplit
, dest
+destIndex
, c
, c2
, cc
);
1767 /* just append (c, c2) */
1768 dest
[destIndex
++]=c
;
1770 dest
[destIndex
++]=c2
;
1774 /* general: multiple code points (ordered by themselves) from decomposition */
1775 if(cc
!=0 && cc
<prevCC
) {
1776 /* the decomposition is out of order with respect to the preceding text */
1778 trailCC
=_mergeOrdered(dest
+reorderStartIndex
, reorderSplit
, p
, p
+length
);
1780 /* just append the decomposition */
1782 dest
[destIndex
++]=*p
++;
1783 } while(--length
>0);
1787 /* buffer overflow */
1788 /* keep incrementing the destIndex for preflighting */
1794 reorderStartIndex
=destIndex
;
/*
 * unorm_decompose(dest, destCapacity, src, srcLength, compat, options, pErrorCode)
 *
 * Public wrapper around the internal _decompose():
 *   1. checks that data is available (_haveData),
 *   2. obtains the exclusion set nx for the given options (getNX) and checks
 *      the error code afterwards,
 *   3. runs _decompose() to get destIndex,
 *   4. returns the result of u_terminateUChars(dest, destCapacity, destIndex,
 *      pErrorCode).
 *
 * NOTE(review): the early-exit bodies and the remaining _decompose()
 * arguments are not visible in this listing.
 */
1802 U_CAPI
int32_t U_EXPORT2
1803 unorm_decompose(UChar
*dest
, int32_t destCapacity
,
1804 const UChar
*src
, int32_t srcLength
,
1805 UBool compat
, int32_t options
,
1806 UErrorCode
*pErrorCode
) {
1807 const UnicodeSet
*nx
;
1811 if(!_haveData(*pErrorCode
)) {
/* getNX() receives the error code and may set it, hence the check that follows */
1815 nx
=getNX(options
, *pErrorCode
);
1816 if(U_FAILURE(*pErrorCode
)) {
1820 destIndex
=_decompose(dest
, destCapacity
,
1825 return u_terminateUChars(dest
, destCapacity
, destIndex
, pErrorCode
);
1828 /* make NFC & NFKC ---------------------------------------------------------- */
/*
 * _getNextCombining(p, limit, c, c2, combiningIndex, cc, nx)
 *
 * Reads the next character at p (p is passed by reference) and reports its
 * composition properties through the reference parameters:
 *   c, c2           the code unit(s) of the character,
 *   combiningIndex  for Hangul/Jamo: 0xfff0 ORed with the norm32 "extra" bits;
 *                   otherwise, when any combine flag is set, the 16-bit unit
 *                   stored directly before the character's extra data,
 *   cc              the combining class taken from norm32.
 * Return value: the _NORM_COMBINES_ANY bits of norm32, or 0 when the
 * character is in the exclusion set nx.
 *
 * NOTE(review): the statements that read *p, preset the outputs and handle
 * the "no cc, no combine flags" case are not visible in this listing.
 */
1830 /* get the composition properties of the next character */
1831 static inline uint32_t
1832 _getNextCombining(UChar
*&p
, const UChar
*limit
,
1833 UChar
&c
, UChar
&c2
,
1834 uint16_t &combiningIndex
, uint8_t &cc
,
1835 const UnicodeSet
*nx
) {
1836 uint32_t norm32
, combineFlags
;
1838 /* get properties */
1840 norm32
=_getNorm32(c
);
1842 /* preset output values for most characters */
/* neither a combining class nor any combine flag: nothing special about this character */
1847 if((norm32
&(_NORM_CC_MASK
|_NORM_COMBINES_ANY
))==0) {
1850 if(isNorm32Regular(norm32
)) {
1851 /* set cc etc. below */
1852 } else if(isNorm32HangulOrJamo(norm32
)) {
1853 /* a compatibility decomposition contained Jamos */
/* the 0xfff0 prefix sets bit 15, which callers test via &0x8000 to recognize Jamo */
1854 combiningIndex
=(uint16_t)(0xfff0|(norm32
>>_NORM_EXTRA_SHIFT
));
1855 return norm32
&_NORM_COMBINES_ANY
;
1857 /* c is a lead surrogate, get the real norm32 */
1858 if(p
!=limit
&& UTF_IS_SECOND_SURROGATE(c2
=*p
)) {
1860 norm32
=_getNorm32FromSurrogatePair(norm32
, c2
);
1867 if(nx_contains(nx
, c
, c2
)) {
1868 return 0; /* excluded: norm32==0 */
1871 cc
=(uint8_t)(norm32
>>_NORM_CC_SHIFT
);
1873 combineFlags
=norm32
&_NORM_COMBINES_ANY
;
1874 if(combineFlags
!=0) {
/* the combining index is the unit immediately preceding the extra data */
1875 combiningIndex
=*(_getExtraData(norm32
)-1);
1877 return combineFlags
;
/*
 * _getCombiningIndexFromStarter(c, c2)
 *
 * Looks up norm32 for c, refines it with _getNorm32FromSurrogatePair() for
 * the surrogate-pair case, and returns the 16-bit unit stored directly before
 * the character's extra data (the same location _getNextCombining() reads the
 * combining index from).
 * Preconditions are listed in the original header text below.
 *
 * NOTE(review): the condition guarding the surrogate-pair lookup is not
 * visible here; per the header text it is presumably c2!=0.
 */
1882 * given a composition-result starter (c, c2) - which means its cc==0,
1883 * it combines forward, it has extra data, its norm32!=0,
1884 * it is not a Hangul or Jamo,
1885 * get just its combineFwdIndex
1887 * norm32(c) is special if and only if c2!=0
1889 static inline uint16_t
1890 _getCombiningIndexFromStarter(UChar c
, UChar c2
) {
1893 norm32
=_getNorm32(c
);
1895 norm32
=_getNorm32FromSurrogatePair(norm32
, c2
);
1897 return *(_getExtraData(norm32
)-1);
/*
 * _combine(table, combineBackIndex, value, value2)
 *
 * Searches the starter's composition list at `table` for an entry whose key
 * (with bit 15 masked off) equals combineBackIndex. The scan stops at the
 * first key that is >=combineBackIndex; entries are skipped by 1 or 2 units
 * depending on bit 15 of the unit at *table.
 * On a match, (value, value2) receive the code unit(s) of the composition,
 * decoded from a variable-length result in three forms (surrogate pair,
 * BMP U+2000..U+ffff, BMP U+0000..U+1fff).
 * The intermediate (value&0x2000)+1 yields 1, or a value >1 when bit 13 is
 * set; per the header text a result >1 means the composition is itself a
 * forward-combining starter.
 *
 * NOTE(review): the loop structure, the no-match return and parts of the
 * decoding are not visible in this listing.
 */
1901 * Find the recomposition result for
1902 * a forward-combining character
1903 * (specified with a pointer to its part of the combiningTable[])
1904 * and a backward-combining character
1905 * (specified with its combineBackIndex).
1907 * If these two characters combine, then set (value, value2)
1908 * with the code unit(s) of the composition character.
1913 * >1 combine, and the composition is a forward-combining starter
1915 * See unormimp.h for a description of the composition table format.
1917 static inline uint16_t
1918 _combine(const uint16_t *table
, uint16_t combineBackIndex
,
1919 uint16_t &value
, uint16_t &value2
) {
1922 /* search in the starter's composition table */
1925 if(key
>=combineBackIndex
) {
/* step over this entry: two units when bit 15 of its current unit is set, else one */
1928 table
+= *table
&0x8000 ? 2 : 1;
1931 /* mask off bit 15, the last-entry-in-the-list flag */
1932 if((key
&0x7fff)==combineBackIndex
) {
1933 /* found! combine! */
1936 /* is the composition a starter that combines forward? */
1937 key
=(uint16_t)((value
&0x2000)+1);
1939 /* get the composition result code point from the variable-length result value */
1942 /* surrogate pair composition result */
/* low 10 bits ORed with 0xd800 give a code unit in the lead-surrogate range */
1943 value
=(uint16_t)((value
&0x3ff)|0xd800);
1946 /* BMP composition result U+2000..U+ffff */
1951 /* BMP composition result U+0000..U+1fff */
/*
 * _composeHangul(prev, c, norm32, src, limit, compat, dest, nx)
 *
 * Tries to compose the Jamo c with the preceding character prev.
 *   - c is a Jamo V (isJamoVTNorm32JamoV): if prev is a Jamo L, an LV
 *     syllable is computed; the following character is then examined for a
 *     Jamo T, either directly or -- per the comment, for NFKC -- via a
 *     one-unit compatibility decomposition. If the resulting syllable is in
 *     the exclusion set nx and a Jamo T had been consumed, src is stepped
 *     back again.
 *   - otherwise, if prev is a Hangul LV syllable without Jamo T, c is treated
 *     as a Jamo T and added to prev, subject to the same nx check.
 * src is passed by reference so that consumed Jamo T are reflected in the
 * caller.
 *
 * NOTE(review): the return statements and the writes through dest are not
 * visible in this listing.
 */
1964 _composeHangul(UChar prev
, UChar c
, uint32_t norm32
, const UChar
*&src
, const UChar
*limit
,
1965 UBool compat
, UChar
*dest
, const UnicodeSet
*nx
) {
1966 if(isJamoVTNorm32JamoV(norm32
)) {
1967 /* c is a Jamo V, compose with previous Jamo L and following Jamo T */
/* unsigned subtraction: values below JAMO_L_BASE wrap around and fail the range test below */
1968 prev
=(UChar
)(prev
-JAMO_L_BASE
);
1969 if(prev
<JAMO_L_COUNT
) {
/* LV syllable code point computed from the L and V indexes */
1970 c
=(UChar
)(HANGUL_BASE
+(prev
*JAMO_V_COUNT
+(c
-JAMO_V_BASE
))*JAMO_T_COUNT
);
1972 /* check if the next character is a Jamo T (normal or compatibility) */
1977 if((t
=(UChar
)(next
-JAMO_T_BASE
))<JAMO_T_COUNT
) {
1982 /* if NFKC, then check for compatibility Jamo T (BMP only) */
1983 norm32
=_getNorm32(next
);
1984 if(isNorm32Regular(norm32
) && (norm32
&_NORM_QC_NFKD
)) {
1987 uint8_t cc
, trailCC
;
1989 p
=_decompose(norm32
, _NORM_QC_NFKD
, length
, cc
, trailCC
);
/* only a decomposition of exactly one unit that is a Jamo T qualifies */
1990 if(length
==1 && (t
=(UChar
)(*p
-JAMO_T_BASE
))<JAMO_T_COUNT
) {
1991 /* compatibility Jamo T */
1998 if(nx_contains(nx
, c
)) {
1999 if(!isHangulWithoutJamoT(c
)) {
2000 --src
; /* undo ++src from reading the Jamo T */
2009 } else if(isHangulWithoutJamoT(prev
)) {
2010 /* c is a Jamo T, compose with previous Hangul LV that does not contain a Jamo T */
2011 c
=(UChar
)(prev
+(c
-JAMO_T_BASE
));
2012 if(nx_contains(nx
, c
)) {
/*
 * _recompose(p, limit, options, nx)
 *
 * In-place recomposition of the decomposed text [p..limit[; limit is passed
 * by reference and, per the header text below, adjusted as text shrinks.
 * State kept while scanning:
 *   starter                 most recent forward-combining starter, or NULL
 *   combineFwdIndex         that starter's index into combiningTable
 *                           (0xfff0 marks a Hangul LV syllable)
 *   starterIsSupplementary  whether the starter occupies two code units
 *   prevCC                  combining class used for the blocking test
 * For each character obtained with _getNextCombining():
 *   - if it combines backward and a starter exists:
 *       * Jamo V/T (bit 15 of combineBackIndex set) are composed
 *         algorithmically, unless blocked by an intervening mark
 *         (prevCC!=0 without UNORM_BEFORE_PRI_29) or excluded by nx;
 *       * otherwise _combine() is consulted, subject to the blocking rule
 *         (prevCC<cc, or prevCC!=cc with UNORM_BEFORE_PRI_29; prevCC==0
 *         always passes) and the nx exclusion of the result. On success the
 *         starter is overwritten with the composition, growing or shrinking
 *         by one unit when the BMP/supplementary status changes, and the
 *         combining mark is removed.
 *   - if it did not combine, it may become the new starter when its
 *     combineFlags include _NORM_COMBINES_FWD.
 *   - with _NORM_OPTIONS_COMPOSE_CONTIGUOUS, a further branch handles
 *     characters that did not combine; per the comment, any intervening
 *     character then blocks (the branch body is not visible).
 *
 * NOTE(review): loop headers, many assignments and the return statement are
 * not visible in this listing.
 */
2024 * recompose the characters in [p..limit[
2025 * (which is in NFD - decomposed and canonically ordered),
2026 * adjust limit, and return the trailing cc
2028 * since for NFKC we may get Jamos in decompositions, we need to
2029 * recompose those too
2031 * note that recomposition never lengthens the text:
2032 * any character consists of either one or two code units;
2033 * a composition may contain at most one more code unit than the original starter,
2034 * while the combining mark that is removed has at least one code unit
2037 _recompose(UChar
*p
, UChar
*&limit
, int32_t options
, const UnicodeSet
*nx
) {
2038 UChar
*starter
, *pRemove
, *q
, *r
;
2039 uint32_t combineFlags
;
2041 uint16_t combineFwdIndex
, combineBackIndex
;
2042 uint16_t result
, value
, value2
;
2044 UBool starterIsSupplementary
;
2046 starter
=NULL
; /* no starter */
2047 combineFwdIndex
=0; /* will not be used until starter!=NULL - avoid compiler warnings */
2048 combineBackIndex
=0; /* will always be set if combineFlags!=0 - avoid compiler warnings */
2049 value
=value2
=0; /* always set by _combine() before used - avoid compiler warnings */
2050 starterIsSupplementary
=FALSE
; /* will not be used until starter!=NULL - avoid compiler warnings */
/* fetch the next character and its composition properties */
2054 combineFlags
=_getNextCombining(p
, limit
, c
, c2
, combineBackIndex
, cc
, nx
);
2055 if((combineFlags
&_NORM_COMBINES_BACK
) && starter
!=NULL
) {
/* bit 15 set: combineBackIndex is one of the 0xfff0-based Jamo markers */
2056 if(combineBackIndex
&0x8000) {
2057 /* c is a Jamo V/T, see if we can compose it with the previous character */
2058 /* for the PRI #29 fix, check that there is no intervening combining mark */
2059 if((options
&UNORM_BEFORE_PRI_29
) || prevCC
==0) {
2060 pRemove
=NULL
; /* NULL while no Hangul composition */
2063 if(combineBackIndex
==0xfff2) {
2064 /* Jamo V, compose with previous Jamo L and following Jamo T */
2065 c2
=(UChar
)(c2
-JAMO_L_BASE
);
2066 if(c2
<JAMO_L_COUNT
) {
2068 c
=(UChar
)(HANGUL_BASE
+(c2
*JAMO_V_COUNT
+(c
-JAMO_V_BASE
))*JAMO_T_COUNT
);
/* look ahead for a Jamo T directly following the Jamo V */
2069 if(p
!=limit
&& (c2
=(UChar
)(*p
-JAMO_T_BASE
))<JAMO_T_COUNT
) {
2073 /* the result is an LV syllable, which is a starter (unlike LVT) */
2074 combineFlags
=_NORM_COMBINES_FWD
;
2076 if(!nx_contains(nx
, c
)) {
2080 if(!isHangulWithoutJamoT(c
)) {
2081 --p
; /* undo the ++p from reading the Jamo T */
2083 /* c is modified but not used any more -- c=*(p-1); -- re-read the Jamo V/T */
2089 * Normally, the following can not occur:
2090 * Since the input is in NFD, there are no Hangul LV syllables that
2091 * a Jamo T could combine with.
2092 * All Jamo Ts are combined above when handling Jamo Vs.
2094 * However, before the PRI #29 fix, this can occur due to
2095 * an intervening combining mark between the Hangul LV and the Jamo T.
2098 /* Jamo T, compose with previous Hangul that does not have a Jamo T */
2099 if(isHangulWithoutJamoT(c2
)) {
2100 c2
+=(UChar
)(c
-JAMO_T_BASE
);
2101 if(!nx_contains(nx
, c2
)) {
2109 /* remove the Jamo(s) */
2119 c2
=0; /* c2 held *starter temporarily */
2121 if(combineFlags
!=0) {
2123 * not starter=NULL because the composition is a Hangul LV syllable
2124 * and might combine once more (but only before the PRI #29 fix)
2132 /* the composition is a Hangul LV syllable which is a starter that combines forward */
2133 combineFwdIndex
=0xfff0;
2135 /* we combined; continue with looking for compositions */
2141 * now: cc==0 and the combining index does not include "forward" ->
2142 * the rest of the loop body will reset starter to NULL;
2143 * technically, a composed Hangul syllable is a starter, but it
2144 * does not combine forward now that we have consumed all eligible Jamos;
2145 * for Jamo V/T, combineFlags does not contain _NORM_COMBINES_FWD
2149 /* the starter is not a Hangul LV or Jamo V/T and */
2150 !(combineFwdIndex
&0x8000) &&
2151 /* the combining mark is not blocked and */
2152 ((options
&UNORM_BEFORE_PRI_29
) ?
2153 (prevCC
!=cc
|| prevCC
==0) :
2154 (prevCC
<cc
|| prevCC
==0)) &&
2155 /* the starter and the combining mark (c, c2) do combine and */
2156 0!=(result
=_combine(combiningTable
+combineFwdIndex
, combineBackIndex
, value
, value2
)) &&
2157 /* the composition result is not excluded */
2158 !nx_contains(nx
, value
, value2
)
2160 /* replace the starter with the composition, remove the combining mark */
2161 pRemove
= c2
==0 ? p
-1 : p
-2; /* pointer to the combining mark */
2163 /* replace the starter with the composition */
2164 *starter
=(UChar
)value
;
2165 if(starterIsSupplementary
) {
2167 /* both are supplementary */
2168 *(starter
+1)=(UChar
)value2
;
2170 /* the composition is shorter than the starter, move the intermediate characters forward one */
2171 starterIsSupplementary
=FALSE
;
2179 } else if(value2
!=0) {
2180 /* the composition is longer than the starter, move the intermediate characters back one */
2181 starterIsSupplementary
=TRUE
;
2182 ++starter
; /* temporarily increment for the loop boundary */
2188 *starter
=(UChar
)value2
;
2189 --starter
; /* undo the temporary increment */
2190 /* } else { both are on the BMP, nothing more to do */
2193 /* remove the combining mark by moving the following text over it */
2204 /* keep prevCC because we removed the combining mark */
2211 /* is the composition a starter that combines forward? */
2213 combineFwdIndex
=_getCombiningIndexFromStarter((UChar
)value
, (UChar
)value2
);
2218 /* we combined; continue with looking for compositions */
2223 /* no combination this time */
2229 /* if (c, c2) did not combine, then check if it is a starter */
2231 /* found a new starter; combineFlags==0 if (c, c2) is excluded */
2232 if(combineFlags
&_NORM_COMBINES_FWD
) {
2233 /* it may combine with something, prepare for it */
2235 starterIsSupplementary
=FALSE
;
2238 starterIsSupplementary
=TRUE
;
/* for a starter, the index delivered by _getNextCombining() is its forward index */
2241 combineFwdIndex
=combineBackIndex
;
2243 /* it will not combine with anything */
2246 } else if(options
&_NORM_OPTIONS_COMPOSE_CONTIGUOUS
) {
2247 /* FCC: no discontiguous compositions; any intervening character blocks */
/*
 * _composePart(stackBuffer, buffer, bufferCapacity, length,
 *              prevStarter, src, ..., options, nx, pErrorCode)
 *
 * Decomposes [prevStarter..src[ into `buffer` and recomposes it there.
 *   1. compat is derived from the _NORM_OPTIONS_COMPAT bit of options.
 *   2. _decompose() is run once; if the reported length exceeds
 *      bufferCapacity, the buffer is grown to 2*length with
 *      u_growBufferFromStatic() (U_MEMORY_ALLOCATION_ERROR on failure) and
 *      the decomposition is repeated.
 *   3. _recompose() works on [buffer..buffer+length[ and yields prevCC;
 *      it moves recomposeLimit, from which the final length is computed.
 * buffer, bufferCapacity and length are reference parameters and are updated
 * for the caller.
 *
 * NOTE(review): one parameter line, the remaining _decompose() arguments and
 * the return statements are not visible in this listing.
 */
2253 /* decompose and recompose [prevStarter..src[ */
2254 static const UChar
*
2255 _composePart(UChar
*stackBuffer
, UChar
*&buffer
, int32_t &bufferCapacity
, int32_t &length
,
2256 const UChar
*prevStarter
, const UChar
*src
,
2258 int32_t options
, const UnicodeSet
*nx
,
2259 UErrorCode
*pErrorCode
) {
2260 UChar
*recomposeLimit
;
2264 compat
=(UBool
)((options
&_NORM_OPTIONS_COMPAT
)!=0);
2266 /* decompose [prevStarter..src[ */
2267 length
=_decompose(buffer
, bufferCapacity
,
2268 prevStarter
, (int32_t)(src
-prevStarter
),
/* first attempt did not fit: enlarge the buffer and decompose again */
2271 if(length
>bufferCapacity
) {
2272 if(!u_growBufferFromStatic(stackBuffer
, &buffer
, &bufferCapacity
, 2*length
, 0)) {
2273 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
2276 length
=_decompose(buffer
, bufferCapacity
,
2277 prevStarter
, (int32_t)(src
-prevStarter
),
2282 /* recompose the decomposition */
2283 recomposeLimit
=buffer
+length
;
2285 prevCC
=_recompose(buffer
, recomposeLimit
, options
, nx
);
2288 /* return with a pointer to the recomposition and its length */
2289 length
=(int32_t)(recomposeLimit
-buffer
);
/*
 * _compose(dest, destCapacity, src, srcLength, options, nx, pErrorCode)
 *
 * String-level composition. Outline of the visible code:
 *   1. (minNoMaybe, qcMask) uses the NFKC constants when
 *      _NORM_OPTIONS_COMPAT is set in options; an NFC pair is also visible,
 *      presumably the else branch.
 *   2. A side buffer starts out as the on-stack stackBuffer with capacity
 *      _STACK_BUFFER_CAPACITY.
 *   3. Fast loop: code units below minNoMaybe or without ccOrQCMask bits are
 *      skipped and copied in bulk; prevStarter is then set near the end of
 *      that run, with a test for a trail surrogate preceded by a lead
 *      surrogate.
 *   4. Hangul/Jamo characters go through _composeHangul(), which receives the
 *      previous source unit and, when the output position is within capacity,
 *      a pointer to the previous destination unit.
 *   5. Other characters: excluded (nx) or quick-check "yes" characters are
 *      appended, using _insertOrdered() when their cc is out of order.
 *      Everything else triggers the slow path: locate the surrounding true
 *      starters, back out destIndex by (prevSrc-prevStarter), run
 *      _composePart(), and append its result.
 *   6. Finally the side buffer is checked against stackBuffer; presumably it
 *      is freed if it was heap-allocated (statement not visible).
 * Per the in-code comments, destIndex keeps growing on overflow for
 * preflighting, and is reset to 0 when _composePart() reports an error.
 *
 * NOTE(review): many statements (returns, else branches, closing braces,
 * comment delimiters) are missing from this listing.
 */
2294 _compose(UChar
*dest
, int32_t destCapacity
,
2295 const UChar
*src
, int32_t srcLength
,
2296 int32_t options
, const UnicodeSet
*nx
,
2297 UErrorCode
*pErrorCode
) {
2298 UChar stackBuffer
[_STACK_BUFFER_CAPACITY
];
2300 int32_t bufferCapacity
;
2302 const UChar
*limit
, *prevSrc
, *prevStarter
;
2303 uint32_t norm32
, ccOrQCMask
, qcMask
;
2304 int32_t destIndex
, reorderStartIndex
, length
;
2305 UChar c
, c2
, minNoMaybe
;
2308 if(options
&_NORM_OPTIONS_COMPAT
) {
2309 minNoMaybe
=(UChar
)indexes
[_NORM_INDEX_MIN_NFKC_NO_MAYBE
];
2310 qcMask
=_NORM_QC_NFKC
;
2312 minNoMaybe
=(UChar
)indexes
[_NORM_INDEX_MIN_NFC_NO_MAYBE
];
2313 qcMask
=_NORM_QC_NFC
;
2318 bufferCapacity
=_STACK_BUFFER_CAPACITY
;
2321 * prevStarter points to the last character before the current one
2322 * that is a "true" starter with cc==0 and quick check "yes".
2324 * prevStarter will be used instead of looking for a true starter
2325 * while incrementally decomposing [prevStarter..prevSrc[
2326 * in _composePart(). Having a good prevStarter allows to just decompose
2327 * the entire [prevStarter..prevSrc[.
2329 * When _composePart() backs out from prevSrc back to prevStarter,
2330 * then it also backs out destIndex by the same amount.
2331 * Therefore, at all times, the (prevSrc-prevStarter) source units
2332 * must correspond 1:1 to destination units counted with destIndex,
2333 * except for reordering.
2334 * This is true for the qc "yes" characters copied in the fast loop,
2335 * and for pure reordering.
2336 * prevStarter must be set forward to src when this is not true:
2337 * In _composePart() and after composing a Hangul syllable.
2339 * This mechanism relies on the assumption that the decomposition of a true starter
2340 * also begins with a true starter. gennorm/store.c checks for this.
2344 ccOrQCMask
=_NORM_CC_MASK
|qcMask
;
2345 destIndex
=reorderStartIndex
=0;
2348 /* avoid compiler warnings */
2353 /* string with length */
2354 limit
=src
+srcLength
;
2355 } else /* srcLength==-1 */ {
2356 /* zero-terminated string */
2363 /* count code units below the minimum or with irrelevant data for the quick check */
/* zero-terminated form: below minNoMaybe only a NUL stops the loop */
2366 while((c
=*src
)<minNoMaybe
? c
!=0 : ((norm32
=_getNorm32(c
))&ccOrQCMask
)==0) {
/* explicit-limit form of the same fast loop */
2371 while(src
!=limit
&& ((c
=*src
)<minNoMaybe
|| ((norm32
=_getNorm32(c
))&ccOrQCMask
)==0)) {
2377 /* copy these code units all at once */
2379 length
=(int32_t)(src
-prevSrc
);
2380 if((destIndex
+length
)<=destCapacity
) {
2381 uprv_memcpy(dest
+destIndex
, prevSrc
, length
*U_SIZEOF_UCHAR
);
2384 reorderStartIndex
=destIndex
;
2386 /* set prevStarter to the last character in the quick check loop */
/* trail surrogate preceded (within the copied run) by a lead surrogate; presumably prevStarter then steps back onto the lead -- confirm */
2388 if(UTF_IS_SECOND_SURROGATE(*prevStarter
) && prevSrc
<prevStarter
&& UTF_IS_FIRST_SURROGATE(*(prevStarter
-1))) {
2395 /* end of source reached? */
/* limit==NULL denotes the zero-terminated case, where c==0 marks the end */
2396 if(limit
==NULL
? c
==0 : src
==limit
) {
2400 /* c already contains *src and norm32 is set for it, increment src */
2404 * source buffer pointers:
2406 * all done quick check current char not yet
2407 * "yes" but (c, c2) processed
2410 * [-------------[-------------[-------------[-------------[
2412 * start prevStarter prevSrc src limit
2415 * destination buffer pointers and indexes:
2417 * all done might take not filled yet
2420 * [-------------[-------------[-------------[
2422 * dest reorderStartIndex destIndex destCapacity
2425 /* check one above-minimum, relevant code unit */
2427 * norm32 is for c=*(src-1), and the quick check flag is "no" or "maybe", and/or cc!=0
2428 * check for Jamo V/T, then for surrogates and regular characters
2429 * c is not a Hangul syllable or Jamo L because
2430 * they are not marked with no/maybe for NFC & NFKC (and their cc==0)
2432 if(isNorm32HangulOrJamo(norm32
)) {
2435 * try to compose with the previous character, Jamo V also with a following Jamo T,
2436 * and set values here right now in case we just continue with the main loop
2439 reorderStartIndex
=destIndex
;
/* arguments of the _composeHangul() call: previous source unit, current Jamo, compat flag */
2444 *(prevSrc
-1), c
, norm32
, src
, limit
, (UBool
)((options
&_NORM_OPTIONS_COMPAT
)!=0),
/* pass a pointer to the previous output unit only when the output position is within capacity, else 0 */
2445 destIndex
<=destCapacity
? dest
+(destIndex
-1) : 0,
2452 /* the Jamo V/T did not compose into a Hangul syllable, just append to dest */
2455 prevStarter
=prevSrc
;
2457 if(isNorm32Regular(norm32
)) {
2461 /* c is a lead surrogate, get the real norm32 */
2462 if(src
!=limit
&& UTF_IS_SECOND_SURROGATE(c2
=*src
)) {
2465 norm32
=_getNorm32FromSurrogatePair(norm32
, c2
);
2467 /* c is an unpaired lead surrogate, nothing to do */
2474 /* we are looking at the character (c, c2) at [prevSrc..src[ */
2475 if(nx_contains(nx
, c
, c2
)) {
2476 /* excluded: norm32==0 */
2478 } else if((norm32
&qcMask
)==0) {
2479 cc
=(uint8_t)(norm32
>>_NORM_CC_SHIFT
);
2482 uint32_t decompQCMask
;
2485 * find appropriate boundaries around this character,
2486 * decompose the source text from between the boundaries,
2489 * this puts the intermediate text into the side buffer because
2490 * it might be longer than the recomposition end result,
2491 * or the destination buffer may be too short or missing
2493 * note that destIndex may be adjusted backwards to account
2494 * for source text that passed the quick check but needed to
2495 * take part in the recomposition
/* derives the decomposition-form bit from the composition-form bit by shifting left two positions */
2497 decompQCMask
=(qcMask
<<2)&0xf; /* decomposition quick check mask */
2500 * find the last true starter in [prevStarter..src[
2501 * it is either the decomposition of the current character (at prevSrc),
2504 if(_isTrueStarter(norm32
, ccOrQCMask
, decompQCMask
)) {
2505 prevStarter
=prevSrc
;
2507 /* adjust destIndex: back out what had been copied with qc "yes" */
2508 destIndex
-=(int32_t)(prevSrc
-prevStarter
);
2511 /* find the next true starter in [src..limit[ - modifies src to point to the next starter */
2512 src
=_findNextStarter(src
, limit
, qcMask
, decompQCMask
, minNoMaybe
);
2514 /* compose [prevStarter..src[ */
2515 p
=_composePart(stackBuffer
, buffer
, bufferCapacity
,
2516 length
, /* output */
2518 prevCC
, /* output */
2523 destIndex
=0; /* an error occurred (out of memory) */
2527 /* append the recomposed buffer contents to the destination buffer */
2528 if((destIndex
+length
)<=destCapacity
) {
2530 dest
[destIndex
++]=*p
++;
2534 /* buffer overflow */
2535 /* keep incrementing the destIndex for preflighting */
2539 /* set the next starter */
2546 /* append the single code point (c, c2) to the destination buffer */
2547 if((destIndex
+length
)<=destCapacity
) {
2548 if(cc
!=0 && cc
<prevCC
) {
2549 /* (c, c2) is out of order with respect to the preceding text */
2550 UChar
*reorderSplit
=dest
+destIndex
;
2552 prevCC
=_insertOrdered(dest
+reorderStartIndex
, reorderSplit
, dest
+destIndex
, c
, c2
, cc
);
2554 /* just append (c, c2) */
2555 dest
[destIndex
++]=c
;
2557 dest
[destIndex
++]=c2
;
2562 /* buffer overflow */
2563 /* keep incrementing the destIndex for preflighting */
/* the side buffer differs from stackBuffer only after it was grown */
2570 if(buffer
!=stackBuffer
) {
/*
 * unorm_compose(dest, destCapacity, src, srcLength, compat, options, pErrorCode)
 *
 * Public wrapper around the internal _compose():
 *   1. checks that data is available (_haveData),
 *   2. obtains the exclusion set nx for the given options (getNX) and checks
 *      the error code afterwards,
 *   3. clears the option bits that are reserved for internal use
 *      (_NORM_OPTIONS_SETS_MASK, _NORM_OPTIONS_COMPAT,
 *      _NORM_OPTIONS_COMPOSE_CONTIGUOUS) and sets _NORM_OPTIONS_COMPAT again
 *      (the guarding condition is not visible; presumably when compat is
 *      TRUE),
 *   4. runs _compose() and returns the result of u_terminateUChars().
 *
 * Note that nx is fetched before the set-selection bits are cleared from
 * options.
 *
 * NOTE(review): the early-exit bodies and the remaining _compose() arguments
 * are not visible in this listing.
 */
2577 U_CAPI
int32_t U_EXPORT2
2578 unorm_compose(UChar
*dest
, int32_t destCapacity
,
2579 const UChar
*src
, int32_t srcLength
,
2580 UBool compat
, int32_t options
,
2581 UErrorCode
*pErrorCode
) {
2582 const UnicodeSet
*nx
;
2585 if(!_haveData(*pErrorCode
)) {
2589 nx
=getNX(options
, *pErrorCode
);
2590 if(U_FAILURE(*pErrorCode
)) {
2594 /* reset options bits that should only be set here or inside _compose() */
2595 options
&=~(_NORM_OPTIONS_SETS_MASK
|_NORM_OPTIONS_COMPAT
|_NORM_OPTIONS_COMPOSE_CONTIGUOUS
);
2598 options
|=_NORM_OPTIONS_COMPAT
;
2601 destIndex
=_compose(dest
, destCapacity
,
2606 return u_terminateUChars(dest
, destCapacity
, destIndex
, pErrorCode
);
2609 /* make FCD ----------------------------------------------------------------- */
/*
 * _findSafeFCD(src, limit, fcd16)
 *
 * Scans forward from src for the next FCD-safe position. On entry fcd16
 * belongs to the character before src. The stop conditions are listed in the
 * original comment text below; in the visible code:
 *   - the low byte of fcd16 (fcd16&0xff) is tested for 0 as the previous
 *     character's trail cc,
 *   - a code unit below _NORM_MIN_WITH_LEAD_CC, or one whose _getFCD16()
 *     value is 0, ends the scan,
 *   - for a lead surrogate followed by a trail surrogate the pair's value is
 *     fetched with _getFCD16FromSurrogatePair(); an unpaired lead surrogate
 *     is treated as lead cc==0.
 *
 * NOTE(review): the loop header, pointer increments and the return statement
 * are not visible in this listing.
 */
2611 static const UChar
*
2612 _findSafeFCD(const UChar
*src
, const UChar
*limit
, uint16_t fcd16
) {
2616 * find the first position in [src..limit[ after some cc==0 according to FCD data
2618 * at the beginning of the loop, we have fcd16 from before src
2620 * stop at positions:
2621 * - after trail cc==0
2622 * - at the end of the source
2623 * - before lead cc==0
2626 /* stop if trail cc==0 for the previous character */
2627 if((fcd16
&0xff)==0) {
2631 /* get c=*src - stop at end of string */
2637 /* stop if lead cc==0 for this character */
2638 if(c
<_NORM_MIN_WITH_LEAD_CC
|| (fcd16
=_getFCD16(c
))==0) {
2639 break; /* catches terminating NUL, too */
2642 if(!UTF_IS_FIRST_SURROGATE(c
)) {
/* peek at the next unit; the comma expression assigns c2 before testing it */
2647 } else if((src
+1)!=limit
&& (c2
=*(src
+1), UTF_IS_SECOND_SURROGATE(c2
))) {
2648 /* c is a lead surrogate, get the real fcd16 */
2649 fcd16
=_getFCD16FromSurrogatePair(fcd16
, c2
);
2655 /* c is an unpaired first surrogate, lead cc==0 */
/*
 * _decomposeFCD: canonically decomposes [src..decompLimit[ and writes the
 * result into dest starting at destIndex.
 *
 * src, decompLimit - the piece of source text to decompose
 * dest             - output buffer
 * destIndex        - passed by reference; advanced as output is produced
 * destCapacity     - every visible write is guarded by
 *                    (destIndex+length)<=destCapacity
 * nx               - exclusion set consulted through nx_contains()
 *
 * A piece whose lead cc is non-zero and lower than prevCC is placed with
 * _insertOrdered() (single code point) or _mergeOrdered() (multi code point
 * decomposition); otherwise it is simply appended.
 * The caller visible in this file, unorm_makeFCD(), stores the return value
 * as its new prevCC.
 */
2664 _decomposeFCD(const UChar
*src
, const UChar
*decompLimit
,
2665 UChar
*dest
, int32_t &destIndex
, int32_t destCapacity
,
2666 const UnicodeSet
*nx
) {
2669 int32_t reorderStartIndex
, length
;
2671 uint8_t cc
, prevCC
, trailCC
;
2674 * canonically decompose [src..decompLimit[
2676 * all characters in this range have some non-zero cc,
2677 * directly or in decomposition,
2678 * so that we do not need to check in the following for quick-check limits etc.
2680 * there _are_ _no_ Hangul syllables or Jamos in here because they are FCD-safe (cc==0)!
2682 * we also do not need to check for c==0 because we have an established decompLimit
/* reordering never reaches back before the output position at which this call started */
2684 reorderStartIndex
=destIndex
;
2687 while(src
<decompLimit
) {
2689 norm32
=_getNorm32(c
);
2690 if(isNorm32Regular(norm32
)) {
2695 * reminder: this function is called with [src..decompLimit[
2696 * not containing any Hangul/Jamo characters,
2697 * therefore the only specials are lead surrogates
2699 /* c is a lead surrogate, get the real norm32 */
2700 if(src
!=decompLimit
&& UTF_IS_SECOND_SURROGATE(c2
=*src
)) {
2703 norm32
=_getNorm32FromSurrogatePair(norm32
, c2
);
2711 /* get the decomposition and the lead and trail cc's */
2712 if(nx_contains(nx
, c
, c2
)) {
2713 /* excluded: norm32==0 */
2716 } else if((norm32
&_NORM_QC_NFD
)==0) {
2717 /* c does not decompose */
2718 cc
=trailCC
=(uint8_t)(norm32
>>_NORM_CC_SHIFT
);
2721 /* c decomposes, get everything from the variable-length extra data */
2722 p
=_decompose(norm32
, length
, cc
, trailCC
);
2724 /* fastpath a single code unit from decomposition */
2731 /* append the decomposition to the destination buffer, assume length>0 */
2732 if((destIndex
+length
)<=destCapacity
) {
/* remember where the new piece starts; it is handed to the ordering helpers below */
2733 UChar
*reorderSplit
=dest
+destIndex
;
2735 /* fastpath: single code point */
2736 if(cc
!=0 && cc
<prevCC
) {
2737 /* (c, c2) is out of order with respect to the preceding text */
2739 trailCC
=_insertOrdered(dest
+reorderStartIndex
, reorderSplit
, dest
+destIndex
, c
, c2
, cc
);
2741 /* just append (c, c2) */
2742 dest
[destIndex
++]=c
;
2744 dest
[destIndex
++]=c2
;
2748 /* general: multiple code points (ordered by themselves) from decomposition */
2749 if(cc
!=0 && cc
<prevCC
) {
2750 /* the decomposition is out of order with respect to the preceding text */
2752 trailCC
=_mergeOrdered(dest
+reorderStartIndex
, reorderSplit
, p
, p
+length
);
2754 /* just append the decomposition */
2756 dest
[destIndex
++]=*p
++;
2757 } while(--length
>0);
2761 /* buffer overflow */
2762 /* keep incrementing the destIndex for preflighting */
2768 reorderStartIndex
=destIndex
;
/*
 * unorm_makeFCD: copies src to dest while establishing the FCD condition.
 *
 * dest, destCapacity - output buffer; writes are guarded by
 *                      (destIndex+length)<=destCapacity
 * src, srcLength     - input; srcLength==-1 means zero-terminated, in which
 *                      case the end-of-source test looks at c==0 instead of
 *                      comparing src with limit
 * nx                 - exclusion set; excluded characters count as fcd16==0
 * pErrorCode         - passed to _haveData() and u_terminateUChars()
 *
 * Outline of the visible logic:
 *  - runs of code units that need no FCD work are copied with uprv_memcpy(),
 *  - a character whose lead cc is 0 or >= the previous trail cc is appended
 *    unchanged,
 *  - otherwise the output already copied since decompStart is backed out and
 *    the text up to the position returned by _findSafeFCD() is redone through
 *    _decomposeFCD().
 *
 * Returns the result of u_terminateUChars() for destIndex.
 */
2776 unorm_makeFCD(UChar
*dest
, int32_t destCapacity
,
2777 const UChar
*src
, int32_t srcLength
,
2778 const UnicodeSet
*nx
,
2779 UErrorCode
*pErrorCode
) {
2780 const UChar
*limit
, *prevSrc
, *decompStart
;
2781 int32_t destIndex
, length
;
2786 if(!_haveData(*pErrorCode
)) {
2795 /* avoid compiler warnings */
2800 /* string with length */
2801 limit
=src
+srcLength
;
2802 } else /* srcLength==-1 */ {
2803 /* zero-terminated string */
2810 /* skip a run of code units below the minimum or with irrelevant data for the FCD check */
2815 if(c
<_NORM_MIN_WITH_LEAD_CC
) {
2820 } else if((fcd16
=_getFCD16(c
))==0) {
2831 } else if((c
=*src
)<_NORM_MIN_WITH_LEAD_CC
) {
2833 } else if((fcd16
=_getFCD16(c
))==0) {
2843 * prevCC has values from the following ranges:
2844 * 0..0xff - the previous trail combining class
2845 * <0 - the negative value of the previous code unit;
2846 * that code unit was <_NORM_MIN_WITH_LEAD_CC and its _getFCD16()
2847 * was deferred so that average text is checked faster
2850 /* copy these code units all at once */
2852 length
=(int32_t)(src
-prevSrc
);
2853 if((destIndex
+length
)<=destCapacity
) {
2854 uprv_memcpy(dest
+destIndex
, prevSrc
, length
*U_SIZEOF_UCHAR
);
2859 /* prevCC<0 is only possible from the above loop, i.e., only if prevSrc<src */
2861 /* the previous character was <_NORM_MIN_WITH_LEAD_CC, we need to get its trail cc */
2862 if(!nx_contains(nx
, (UChar32
)-prevCC
)) {
2863 prevCC
=(int16_t)(_getFCD16((UChar
)-prevCC
)&0xff);
2865 prevCC
=0; /* excluded: fcd16==0 */
2869 * set a pointer to this below-U+0300 character;
2870 * if prevCC==0 then it will moved to after this character below
2872 decompStart
=prevSrc
-1;
2877 * prevSrc==src - used later to adjust destIndex before decomposition
2881 /* end of source reached? */
2882 if(limit
==NULL
? c
==0 : src
==limit
) {
2886 /* set a pointer to after the last source position where prevCC==0 */
2888 decompStart
=prevSrc
;
2891 /* c already contains *src and fcd16 is set for it, increment src */
2894 /* check one above-minimum, relevant code unit */
2895 if(UTF_IS_FIRST_SURROGATE(c
)) {
2896 /* c is a lead surrogate, get the real fcd16 */
2897 if(src
!=limit
&& UTF_IS_SECOND_SURROGATE(c2
=*src
)) {
2899 fcd16
=_getFCD16FromSurrogatePair(fcd16
, c2
);
2908 /* we are looking at the character (c, c2) at [prevSrc..src[ */
2909 if(nx_contains(nx
, c
, c2
)) {
2910 fcd16
=0; /* excluded: fcd16==0 */
2913 /* check the combining order, get the lead cc */
2914 cc
=(int16_t)(fcd16
>>8);
2915 if(cc
==0 || cc
>=prevCC
) {
2916 /* the order is ok */
2918 decompStart
=prevSrc
;
/* the low byte of fcd16 becomes the trail cc that the next character is compared against */
2920 prevCC
=(int16_t)(fcd16
&0xff);
2922 /* just append (c, c2) */
2923 length
= c2
==0 ? 1 : 2;
2924 if((destIndex
+length
)<=destCapacity
) {
2925 dest
[destIndex
++]=c
;
2927 dest
[destIndex
++]=c2
;
2934 * back out the part of the source that we copied already but
2935 * is now going to be decomposed;
2936 * prevSrc is set to after what was copied
2938 destIndex
-=(int32_t)(prevSrc
-decompStart
);
2941 * find the part of the source that needs to be decomposed;
2942 * to be safe and simple, decompose to before the next character with lead cc==0
2944 src
=_findSafeFCD(src
, limit
, fcd16
);
2947 * the source text does not fulfill the conditions for FCD;
2948 * decompose and reorder a limited piece of the text
2950 prevCC
=_decomposeFCD(decompStart
, src
,
2951 dest
, destIndex
, destCapacity
,
2957 return u_terminateUChars(dest
, destCapacity
, destIndex
, pErrorCode
);
2960 /* quick check functions ---------------------------------------------------- */
/*
 * unorm_checkFCD: walks src and checks the combining order using FCD data.
 *
 * src, srcLength - input; srcLength==-1 means zero-terminated
 * nx             - exclusion set; excluded characters count as fcd16==0
 *
 * For code units below _NORM_MIN_WITH_LEAD_CC the _getFCD16() lookup is
 * deferred: prevCC temporarily holds the negated code unit and is resolved to
 * a trail cc only when a later character makes that necessary.
 *
 * _quickCheck() treats the result as a boolean (true -> UNORM_YES,
 * false -> UNORM_NO); the return statements themselves are not visible in
 * this view.
 */
2963 unorm_checkFCD(const UChar
*src
, int32_t srcLength
, const UnicodeSet
*nx
) {
2973 /* string with length */
2974 limit
=src
+srcLength
;
2975 } else /* srcLength==-1 */ {
2976 /* zero-terminated string */
2983 /* skip a run of code units below the minimum or with irrelevant data for the FCD check */
2987 if(c
<_NORM_MIN_WITH_LEAD_CC
) {
2992 * delay _getFCD16(c) for any character <_NORM_MIN_WITH_LEAD_CC
2993 * because chances are good that the next one will have
2994 * a leading cc of 0;
2995 * _getFCD16(-prevCC) is later called when necessary -
2996 * -c fits into int16_t because it is <_NORM_MIN_WITH_LEAD_CC==0x300
2999 } else if((fcd16
=_getFCD16(c
))==0) {
3009 } else if((c
=*src
++)<_NORM_MIN_WITH_LEAD_CC
) {
3011 } else if((fcd16
=_getFCD16(c
))==0) {
3019 /* check one above-minimum, relevant code unit */
3020 if(UTF_IS_FIRST_SURROGATE(c
)) {
3021 /* c is a lead surrogate, get the real fcd16 */
3022 if(src
!=limit
&& UTF_IS_SECOND_SURROGATE(c2
=*src
)) {
3024 fcd16
=_getFCD16FromSurrogatePair(fcd16
, c2
);
3033 if(nx_contains(nx
, c
, c2
)) {
3034 prevCC
=0; /* excluded: fcd16==0 */
3039 * prevCC has values from the following ranges:
3040 * 0..0xff - the previous trail combining class
3041 * <0 - the negative value of the previous code unit;
3042 * that code unit was <_NORM_MIN_WITH_LEAD_CC and its _getFCD16()
3043 * was deferred so that average text is checked faster
3046 /* check the combining order */
/* the high byte of fcd16 is the lead cc of the current character */
3047 cc
=(int16_t)(fcd16
>>8);
3050 /* the previous character was <_NORM_MIN_WITH_LEAD_CC, we need to get its trail cc */
3051 if(!nx_contains(nx
, (UChar32
)-prevCC
)) {
3052 prevCC
=(int16_t)(_getFCD16((UChar
)-prevCC
)&0xff);
3054 prevCC
=0; /* excluded: fcd16==0 */
3062 prevCC
=(int16_t)(fcd16
&0xff);
/*
 * _quickCheck: shared implementation behind the quick-check and
 * isNormalized entry points in this file.
 *
 * Argument handling visible here:
 *  - returns early when pErrorCode is NULL or already a failure,
 *  - src==NULL or srcLength<-1 sets U_ILLEGAL_ARGUMENT_ERROR,
 *  - _haveData() must succeed.
 *
 * Mode handling: minNoMaybe and qcMask are taken from the NFC, NFKC, NFD or
 * NFKD constants; the two compatibility forms also set
 * options=_NORM_OPTIONS_COMPAT. One branch delegates to unorm_checkFCD()
 * (U_UNSUPPORTED_ERROR when fcdTrie.index is NULL). An unrecognized mode sets
 * U_ILLEGAL_ARGUMENT_ERROR. The case labels are not visible in this view.
 *
 * Main loop: skips irrelevant code units, checks the combining order, then
 * looks at the quick-check flags. For a "maybe" flag the text between the
 * surrounding starters is re-normalized with _composePart() and compared
 * with the original through uprv_strCompare().
 *
 * NOTE(review): callers pass a TRUE/FALSE fourth argument (named allowMaybe
 * in unorm_internalQuickCheck); its declaration and use are not visible
 * here - confirm against the full source.
 */
3066 static UNormalizationCheckResult
3067 _quickCheck(const UChar
*src
,
3069 UNormalizationMode mode
,
3071 const UnicodeSet
*nx
,
3072 UErrorCode
*pErrorCode
) {
3073 UChar stackBuffer
[_STACK_BUFFER_CAPACITY
];
3075 int32_t bufferCapacity
;
3077 const UChar
*start
, *limit
;
3078 uint32_t norm32
, qcNorm32
, ccOrQCMask
, qcMask
;
3080 UChar c
, c2
, minNoMaybe
;
3082 UNormalizationCheckResult result
;
3084 /* check arguments */
3085 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
3089 if(src
==NULL
|| srcLength
<-1) {
3090 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
3094 if(!_haveData(*pErrorCode
)) {
3098 /* check for a valid mode and set the quick check minimum and mask */
3101 minNoMaybe
=(UChar
)indexes
[_NORM_INDEX_MIN_NFC_NO_MAYBE
];
3102 qcMask
=_NORM_QC_NFC
;
3106 minNoMaybe
=(UChar
)indexes
[_NORM_INDEX_MIN_NFKC_NO_MAYBE
];
3107 qcMask
=_NORM_QC_NFKC
;
3108 options
=_NORM_OPTIONS_COMPAT
;
3111 minNoMaybe
=(UChar
)indexes
[_NORM_INDEX_MIN_NFD_NO_MAYBE
];
3112 qcMask
=_NORM_QC_NFD
;
3116 minNoMaybe
=(UChar
)indexes
[_NORM_INDEX_MIN_NFKD_NO_MAYBE
];
3117 qcMask
=_NORM_QC_NFKD
;
3118 options
=_NORM_OPTIONS_COMPAT
;
/* the FCD check needs the FCD trie; without it this mode is reported as unsupported */
3121 if(fcdTrie
.index
==NULL
) {
3122 *pErrorCode
=U_UNSUPPORTED_ERROR
;
3125 return unorm_checkFCD(src
, srcLength
, nx
) ? UNORM_YES
: UNORM_NO
;
3127 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
3133 bufferCapacity
=_STACK_BUFFER_CAPACITY
;
3135 ccOrQCMask
=_NORM_CC_MASK
|qcMask
;
3141 /* string with length */
3142 limit
=src
+srcLength
;
3143 } else /* srcLength==-1 */ {
3144 /* zero-terminated string */
3151 /* skip a run of code units below the minimum or with irrelevant data for the quick check */
3157 goto endloop
; /* break out of outer loop */
3159 } else if(((norm32
=_getNorm32(c
))&ccOrQCMask
)!=0) {
3167 goto endloop
; /* break out of outer loop */
3168 } else if((c
=*src
++)>=minNoMaybe
&& ((norm32
=_getNorm32(c
))&ccOrQCMask
)!=0) {
3175 /* check one above-minimum, relevant code unit */
3176 if(isNorm32LeadSurrogate(norm32
)) {
3177 /* c is a lead surrogate, get the real norm32 */
3178 if(src
!=limit
&& UTF_IS_SECOND_SURROGATE(c2
=*src
)) {
3180 norm32
=_getNorm32FromSurrogatePair(norm32
, c2
);
3189 if(nx_contains(nx
, c
, c2
)) {
3190 /* excluded: norm32==0 */
3194 /* check the combining order */
3195 cc
=(uint8_t)(norm32
>>_NORM_CC_SHIFT
);
3196 if(cc
!=0 && cc
<prevCC
) {
3202 /* check for "no" or "maybe" quick check flags */
3203 qcNorm32
=norm32
&qcMask
;
3204 if(qcNorm32
&_NORM_QC_ANY_NO
) {
3207 } else if(qcNorm32
!=0) {
3208 /* "maybe" can only occur for NFC and NFKC */
3212 /* normalize a section around here to see if it is really normalized or not */
3213 const UChar
*prevStarter
;
3214 uint32_t decompQCMask
;
3217 decompQCMask
=(qcMask
<<2)&0xf; /* decomposition quick check mask */
3219 /* find the previous starter */
3220 prevStarter
=src
-1; /* set prevStarter to the beginning of the current character */
3221 if(UTF_IS_TRAIL(*prevStarter
)) {
3222 --prevStarter
; /* safe because unpaired surrogates do not result in "maybe" */
3224 prevStarter
=_findPreviousStarter(start
, prevStarter
, ccOrQCMask
, decompQCMask
, minNoMaybe
);
3226 /* find the next true starter in [src..limit[ - modifies src to point to the next starter */
3227 src
=_findNextStarter(src
, limit
, qcMask
, decompQCMask
, minNoMaybe
);
3229 /* decompose and recompose [prevStarter..src[ */
3230 _composePart(stackBuffer
, buffer
, bufferCapacity
,
3235 options
, nx
, pErrorCode
);
3236 if(U_FAILURE(*pErrorCode
)) {
3237 result
=UNORM_MAYBE
; /* error (out of memory) */
3241 /* compare the normalized version with the original */
3242 if(0!=uprv_strCompare(prevStarter
, (int32_t)(src
-prevStarter
), buffer
, length
, FALSE
, FALSE
)) {
3243 result
=UNORM_NO
; /* normalization differs */
3247 /* continue after the next starter */
/* buffer differs from stackBuffer only after it was replaced by a separately allocated one */
3253 if(buffer
!=stackBuffer
) {
/*
 * unorm_quickCheck: exported quick check without options.
 * Forwards to _quickCheck() with TRUE as the fourth argument and NULL as the
 * exclusion set, and returns its result unchanged.
 */
3260 U_CAPI UNormalizationCheckResult U_EXPORT2
3261 unorm_quickCheck(const UChar
*src
,
3263 UNormalizationMode mode
,
3264 UErrorCode
*pErrorCode
) {
3265 return _quickCheck(src
, srcLength
, mode
, TRUE
, NULL
, pErrorCode
);
/*
 * unorm_quickCheckWithOptions: exported quick check that honors options.
 * The exclusion set is obtained from getNX(options, *pErrorCode) and passed
 * to _quickCheck() together with TRUE as the fourth argument.
 */
3268 U_CAPI UNormalizationCheckResult U_EXPORT2
3269 unorm_quickCheckWithOptions(const UChar
*src
, int32_t srcLength
,
3270 UNormalizationMode mode
, int32_t options
,
3271 UErrorCode
*pErrorCode
) {
3272 return _quickCheck(src
, srcLength
, mode
, TRUE
, getNX(options
, *pErrorCode
), pErrorCode
);
/*
 * unorm_internalQuickCheck: library-internal (U_CFUNC) access to
 * _quickCheck(). The caller supplies allowMaybe and the exclusion set nx
 * directly; all arguments are forwarded unchanged.
 */
3275 U_CFUNC UNormalizationCheckResult
3276 unorm_internalQuickCheck(const UChar
*src
,
3278 UNormalizationMode mode
,
3280 const UnicodeSet
*nx
,
3281 UErrorCode
*pErrorCode
) {
3282 return _quickCheck(src
, srcLength
, mode
, allowMaybe
, nx
, pErrorCode
);
/*
 * unorm_isNormalized: exported boolean test without options.
 * Calls _quickCheck() with FALSE as the fourth argument and NULL as the
 * exclusion set; the result is true only when _quickCheck() returns
 * UNORM_YES.
 */
3285 U_CAPI UBool U_EXPORT2
3286 unorm_isNormalized(const UChar
*src
, int32_t srcLength
,
3287 UNormalizationMode mode
,
3288 UErrorCode
*pErrorCode
) {
3289 return (UBool
)(UNORM_YES
==_quickCheck(src
, srcLength
, mode
, FALSE
, NULL
, pErrorCode
));
/*
 * unorm_isNormalizedWithOptions: exported boolean test that honors options.
 * Calls _quickCheck() with FALSE as the fourth argument and the exclusion set
 * from getNX(options, *pErrorCode); the result is true only when
 * _quickCheck() returns UNORM_YES.
 */
3292 U_CAPI UBool U_EXPORT2
3293 unorm_isNormalizedWithOptions(const UChar
*src
, int32_t srcLength
,
3294 UNormalizationMode mode
, int32_t options
,
3295 UErrorCode
*pErrorCode
) {
3296 return (UBool
)(UNORM_YES
==_quickCheck(src
, srcLength
, mode
, FALSE
, getNX(options
, *pErrorCode
), pErrorCode
));
3299 /* normalize() API ---------------------------------------------------------- */
3302 * Internal API for normalizing.
3303 * Does not check for bad input.
3304 * Requires _haveData() to be true.
/*
 * unorm_internalNormalizeWithNX: normalizes src into dest for the given mode
 * with an explicit exclusion set nx.
 *
 * The visible branches (the case labels are not part of this view):
 *  - two calls to _decompose(),
 *  - _compose() with options as given,
 *  - _compose() with options|_NORM_OPTIONS_COMPAT,
 *  - unorm_makeFCD(), or U_UNSUPPORTED_ERROR when fcdTrie.index is NULL,
 *  - _compose() with options|_NORM_OPTIONS_COMPOSE_CONTIGUOUS,
 *  - a plain copy of the string (length from u_strlen(), copied only when it
 *    is >0 and fits into destCapacity),
 *  - U_ILLEGAL_ARGUMENT_ERROR.
 *
 * Except for the unorm_makeFCD() branch, which returns directly, the result
 * is u_terminateUChars(dest, destCapacity, destLength, pErrorCode).
 */
3308 unorm_internalNormalizeWithNX(UChar
*dest
, int32_t destCapacity
,
3309 const UChar
*src
, int32_t srcLength
,
3310 UNormalizationMode mode
, int32_t options
, const UnicodeSet
*nx
,
3311 UErrorCode
*pErrorCode
) {
3317 destLength
=_decompose(dest
, destCapacity
,
3319 FALSE
, nx
, trailCC
);
3322 destLength
=_decompose(dest
, destCapacity
,
3327 destLength
=_compose(dest
, destCapacity
,
3329 options
, nx
, pErrorCode
);
3332 destLength
=_compose(dest
, destCapacity
,
3334 options
|_NORM_OPTIONS_COMPAT
, nx
, pErrorCode
);
3337 if(fcdTrie
.index
==NULL
) {
3338 *pErrorCode
=U_UNSUPPORTED_ERROR
;
3341 return unorm_makeFCD(dest
, destCapacity
,
3347 destLength
=_compose(dest
, destCapacity
,
3349 options
|_NORM_OPTIONS_COMPOSE_CONTIGUOUS
, nx
, pErrorCode
);
3353 /* just copy the string */
3355 srcLength
=u_strlen(src
);
/* the copy is skipped when it would not fit; destLength still reports the full length */
3357 if(srcLength
>0 && srcLength
<=destCapacity
) {
3358 uprv_memcpy(dest
, src
, srcLength
*U_SIZEOF_UCHAR
);
3360 destLength
=srcLength
;
3363 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
3367 return u_terminateUChars(dest
, destCapacity
, destLength
, pErrorCode
);
3371 * Internal API for normalizing.
3372 * Does not check for bad input.
/*
 * unorm_internalNormalize: normalizes src into dest without validating the
 * buffer arguments.
 *
 * Checks _haveData(), derives the exclusion set from options through getNX(),
 * clears the option bits that are reserved for internal use and then
 * delegates to unorm_internalNormalizeWithNX().
 */
3375 U_CAPI
int32_t U_EXPORT2
3376 unorm_internalNormalize(UChar
*dest
, int32_t destCapacity
,
3377 const UChar
*src
, int32_t srcLength
,
3378 UNormalizationMode mode
, int32_t options
,
3379 UErrorCode
*pErrorCode
) {
3380 const UnicodeSet
*nx
;
3382 if(!_haveData(*pErrorCode
)) {
/* the exclusion set must be looked up before the set bits are stripped from options below */
3386 nx
=getNX(options
, *pErrorCode
);
3387 if(U_FAILURE(*pErrorCode
)) {
3391 /* reset options bits that should only be set inside unorm_internalNormalizeWithNX() */
3392 options
&=~(_NORM_OPTIONS_SETS_MASK
|_NORM_OPTIONS_COMPAT
|_NORM_OPTIONS_COMPOSE_CONTIGUOUS
);
3394 return unorm_internalNormalizeWithNX(dest
, destCapacity
,
3400 /** Public API for normalizing. */
/*
 * unorm_normalize: validates the arguments and then delegates to
 * unorm_internalNormalize().
 *
 * U_ILLEGAL_ARGUMENT_ERROR is set when
 *  - destCapacity<0, or dest is NULL while destCapacity>0,
 *  - src is NULL or srcLength<-1,
 *  - src and dest overlap (src starts inside dest's capacity, or dest starts
 *    inside a src of known positive length).
 * Nothing is done when pErrorCode is NULL or already indicates a failure.
 */
3401 U_CAPI
int32_t U_EXPORT2
3402 unorm_normalize(const UChar
*src
, int32_t srcLength
,
3403 UNormalizationMode mode
, int32_t options
,
3404 UChar
*dest
, int32_t destCapacity
,
3405 UErrorCode
*pErrorCode
) {
3406 /* check argument values */
3407 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
3411 if( destCapacity
<0 || (dest
==NULL
&& destCapacity
>0) ||
3412 src
==NULL
|| srcLength
<-1
3414 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
3418 /* check for overlapping src and destination */
3420 ((src
>=dest
&& src
<(dest
+destCapacity
)) ||
3421 (srcLength
>0 && dest
>=src
&& dest
<(src
+srcLength
)))
3423 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
3427 return unorm_internalNormalize(dest
, destCapacity
,
3434 /* iteration functions ------------------------------------------------------ */
3437 * These iteration functions are the core implementations of the
3438 * Normalizer class iteration API.
3439 * They read from a UCharIterator into their own buffer
3440 * and normalize into the Normalizer iteration buffer.
3441 * Normalizer itself then iterates over its buffer until that needs to be
3447 * Now that UCharIterator.next/previous return (int32_t)-1 not (UChar)0xffff
3448 * if iteration bounds are reached,
3449 * try to not call hasNext/hasPrevious and instead check for >=0.
3452 /* backward iteration ------------------------------------------------------- */
3455 * read backwards and get norm32
3456 * return 0 if the character is <minC
3457 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
/*
 * _getPrevNorm32: reads one character backwards from src and returns its
 * norm32 value.
 *
 * c and c2 are outputs. A code unit that is not a surrogate is looked up
 * directly with _getNorm32(). A trail surrogate that is preceded by a lead
 * surrogate is looked up as a pair through _getNorm32FromSurrogatePair(),
 * unless the lead surrogate's data has no bits in mask. If the preceding
 * unit is not a lead surrogate, the extra previous() step is undone by
 * moving the iterator forward by one.
 */
3459 static inline uint32_t
3460 _getPrevNorm32(UCharIterator
&src
, uint32_t minC
, uint32_t mask
, UChar
&c
, UChar
&c2
) {
3463 /* need src.hasPrevious() */
3464 c
=(UChar
)src
.previous(&src
);
3467 /* check for a surrogate before getting norm32 to see if we need to predecrement further */
3470 } else if(!UTF_IS_SURROGATE(c
)) {
3471 return _getNorm32(c
);
3472 } else if(UTF_IS_SURROGATE_FIRST(c
) || !src
.hasPrevious(&src
)) {
3473 /* unpaired surrogate */
3475 } else if(UTF_IS_FIRST_SURROGATE(c2
=(UChar
)src
.previous(&src
))) {
3476 norm32
=_getNorm32(c2
);
3477 if((norm32
&mask
)==0) {
3478 /* all surrogate pairs with this lead surrogate have irrelevant data */
3481 /* norm32 must be a surrogate special */
3482 return _getNorm32FromSurrogatePair(norm32
, c
);
3485 /* unpaired second surrogate, undo the c2=src.previous() movement */
3486 src
.move(&src
, 1, UITER_CURRENT
);
3493 * read backwards and check if the character is a previous-iteration boundary
3494 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
/*
 * IsPrevBoundaryFn: signature shared by the backward boundary tests
 * (_isPrevNFDSafe and _isPrevTrueStarter are assigned to pointers of this
 * type in unorm_previous()). c and c2 receive the character that was read.
 */
3497 IsPrevBoundaryFn(UCharIterator
&src
, uint32_t minC
, uint32_t mask
, UChar
&c
, UChar
&c2
);
3501 * read backwards and check if the lead combining class is 0
3502 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
/*
 * _isPrevNFDSafe: backward boundary test used for the decomposing modes.
 * Reads the previous character with _getPrevNorm32() and passes its norm32
 * to _isNFDSafe(), together with ccOrQCMask and the quick-check part of that
 * mask (ccOrQCMask&_NORM_QC_MASK).
 */
3505 _isPrevNFDSafe(UCharIterator
&src
, uint32_t minC
, uint32_t ccOrQCMask
, UChar
&c
, UChar
&c2
) {
3506 return _isNFDSafe(_getPrevNorm32(src
, minC
, ccOrQCMask
, c
, c2
), ccOrQCMask
, ccOrQCMask
&_NORM_QC_MASK
);
3510 * read backwards and check if the character is (or its decomposition begins with)
3511 * a "true starter" (cc==0 and NF*C_YES)
3512 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
/*
 * _isPrevTrueStarter: backward boundary test used for the composing modes.
 * Derives the decomposition quick-check mask from ccOrQCMask, reads the
 * previous character with _getPrevNorm32() using both masks, and returns the
 * verdict of _isTrueStarter().
 */
3515 _isPrevTrueStarter(UCharIterator
&src
, uint32_t minC
, uint32_t ccOrQCMask
, UChar
&c
, UChar
&c2
) {
3516 uint32_t norm32
, decompQCMask
;
3518 decompQCMask
=(ccOrQCMask
<<2)&0xf; /* decomposition quick check mask */
3519 norm32
=_getPrevNorm32(src
, minC
, ccOrQCMask
|decompQCMask
, c
, c2
);
3520 return _isTrueStarter(norm32
, ccOrQCMask
, decompQCMask
);
/*
 * _findPreviousIterationBoundary: reads characters backwards from src into
 * buffer until isPrevBoundary() reports a boundary or the iterator has no
 * previous character.
 *
 * buffer, bufferCapacity - passed by reference; the buffer is filled from its
 *                          end towards its start and grown to twice its
 *                          capacity through u_growBufferFromStatic() when it
 *                          runs out of room, after which the existing
 *                          contents are moved up to the new end
 * startIndex             - output: index of the first used element
 * pErrorCode             - set to U_MEMORY_ALLOCATION_ERROR when growing
 *                          fails; the iterator is then moved to UITER_START
 *
 * Returns the number of code units stored (bufferCapacity-startIndex).
 */
3524 _findPreviousIterationBoundary(UCharIterator
&src
,
3525 IsPrevBoundaryFn
*isPrevBoundary
, uint32_t minC
, uint32_t mask
,
3526 UChar
*&buffer
, int32_t &bufferCapacity
,
3527 int32_t &startIndex
,
3528 UErrorCode
*pErrorCode
) {
3535 startIndex
=bufferCapacity
; /* fill the buffer from the end backwards */
3537 while(src
.hasPrevious(&src
)) {
3538 isBoundary
=isPrevBoundary(src
, minC
, mask
, c
, c2
);
3540 /* always write this character to the front of the buffer */
3541 /* make sure there is enough space in the buffer */
3542 if(startIndex
< (c2
==0 ? 1 : 2)) {
/* remember the old capacity: it is the number of code units that must be moved after growing */
3543 int32_t bufferLength
=bufferCapacity
;
3545 if(!u_growBufferFromStatic(stackBuffer
, &buffer
, &bufferCapacity
, 2*bufferCapacity
, bufferLength
)) {
3546 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
3547 src
.move(&src
, 0, UITER_START
);
3551 /* move the current buffer contents up */
3552 uprv_memmove(buffer
+(bufferCapacity
-bufferLength
), buffer
, bufferLength
*U_SIZEOF_UCHAR
);
3553 startIndex
+=bufferCapacity
-bufferLength
;
3556 buffer
[--startIndex
]=c
;
3558 buffer
[--startIndex
]=c2
;
3561 /* stop if this just-copied character is a boundary */
3567 /* return the length of the buffer contents */
3568 return bufferCapacity
-startIndex
;
/*
 * unorm_previous: reads backwards from the iterator to the previous
 * normalization boundary and writes that piece of text to dest, normalized
 * when doNormalize asks for it.
 *
 * Visible behavior:
 *  - argument errors set U_ILLEGAL_ARGUMENT_ERROR; _haveData() must succeed,
 *  - *pNeededToNormalize (when non-NULL) is preset to FALSE,
 *  - per mode a boundary function, minC and mask are chosen: _isPrevNFDSafe
 *    for the decomposing masks (a branch that needs fcdTrie falls through to
 *    the NFD settings), _isPrevTrueStarter for the composing masks,
 *  - one branch just steps back over a single code point (keeping a
 *    lead/trail surrogate pair together) and returns it,
 *  - otherwise _findPreviousIterationBoundary() collects the text, which is
 *    either normalized with unorm_internalNormalize() (also setting
 *    *pNeededToNormalize from a length/content comparison) or copied as is.
 *
 * The case labels selecting these branches are not visible in this view.
 */
3571 U_CAPI
int32_t U_EXPORT2
3572 unorm_previous(UCharIterator
*src
,
3573 UChar
*dest
, int32_t destCapacity
,
3574 UNormalizationMode mode
, int32_t options
,
3575 UBool doNormalize
, UBool
*pNeededToNormalize
,
3576 UErrorCode
*pErrorCode
) {
3577 UChar stackBuffer
[100];
3579 IsPrevBoundaryFn
*isPreviousBoundary
=NULL
;
3581 int32_t startIndex
=0, bufferLength
=0, bufferCapacity
=0, destLength
=0;
3585 /* check argument values */
3586 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
3590 if( destCapacity
<0 || (dest
==NULL
&& destCapacity
>0) ||
3593 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
3597 if(!_haveData(*pErrorCode
)) {
3601 if(pNeededToNormalize
!=NULL
) {
3602 *pNeededToNormalize
=FALSE
;
3607 if(fcdTrie
.index
==NULL
) {
3608 *pErrorCode
=U_UNSUPPORTED_ERROR
;
3611 /* fall through to NFD */
3613 isPreviousBoundary
=_isPrevNFDSafe
;
3614 minC
=_NORM_MIN_WITH_LEAD_CC
;
3615 mask
=_NORM_CC_MASK
|_NORM_QC_NFD
;
3618 isPreviousBoundary
=_isPrevNFDSafe
;
3619 minC
=_NORM_MIN_WITH_LEAD_CC
;
3620 mask
=_NORM_CC_MASK
|_NORM_QC_NFKD
;
3623 isPreviousBoundary
=_isPrevTrueStarter
;
3624 minC
=(UChar
)indexes
[_NORM_INDEX_MIN_NFC_NO_MAYBE
];
3625 mask
=_NORM_CC_MASK
|_NORM_QC_NFC
;
3628 isPreviousBoundary
=_isPrevTrueStarter
;
3629 minC
=(UChar
)indexes
[_NORM_INDEX_MIN_NFKC_NO_MAYBE
];
3630 mask
=_NORM_CC_MASK
|_NORM_QC_NFKC
;
/* single-code-point branch: a negative result from previous() means the start was reached */
3634 if((c
=src
->previous(src
))>=0) {
3636 if(UTF_IS_TRAIL(c
) && (c2
=src
->previous(src
))>=0) {
3637 if(UTF_IS_LEAD(c2
)) {
3638 if(destCapacity
>=2) {
3639 dest
[1]=(UChar
)c
; /* trail surrogate */
3642 c
=c2
; /* lead surrogate to be written below */
3644 src
->move(src
, 1, UITER_CURRENT
);
3648 if(destCapacity
>0) {
3652 return u_terminateUChars(dest
, destCapacity
, destLength
, pErrorCode
);
3654 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
3659 bufferCapacity
=(int32_t)(sizeof(stackBuffer
)/U_SIZEOF_UCHAR
);
3660 bufferLength
=_findPreviousIterationBoundary(*src
,
3661 isPreviousBoundary
, minC
, mask
,
3662 buffer
, bufferCapacity
,
3665 if(bufferLength
>0) {
/* the collected text occupies the end of buffer, hence buffer+startIndex */
3667 destLength
=unorm_internalNormalize(dest
, destCapacity
,
3668 buffer
+startIndex
, bufferLength
,
3671 if(pNeededToNormalize
!=0 && U_SUCCESS(*pErrorCode
)) {
3672 *pNeededToNormalize
=
3673 (UBool
)(destLength
!=bufferLength
||
3674 0!=uprv_memcmp(dest
, buffer
+startIndex
, destLength
*U_SIZEOF_UCHAR
));
3677 /* just copy the source characters */
3678 if(destCapacity
>0) {
3679 uprv_memcpy(dest
, buffer
+startIndex
, uprv_min(bufferLength
, destCapacity
)*U_SIZEOF_UCHAR
);
3681 destLength
=u_terminateUChars(dest
, destCapacity
, bufferLength
, pErrorCode
);
3684 destLength
=u_terminateUChars(dest
, destCapacity
, 0, pErrorCode
);
3688 if(buffer
!=stackBuffer
) {
3695 /* forward iteration -------------------------------------------------------- */
3698 * read forward and get norm32
3699 * return 0 if the character is <minC
3700 * if c2!=0 then (c2, c) is a surrogate pair
3701 * always reads complete characters
/*
 * _getNextNorm32: reads one character forwards from src and returns its
 * norm32 value.
 *
 * c and c2 are outputs. The first code unit is looked up with _getNorm32().
 * When it is a lead surrogate and the next unit (peeked with current()) is a
 * trail surrogate, the iterator is advanced past the trail surrogate and the
 * pair is looked up through _getNorm32FromSurrogatePair(), unless the lead
 * surrogate's data has no bits in mask.
 */
3703 static inline uint32_t
3704 _getNextNorm32(UCharIterator
&src
, uint32_t minC
, uint32_t mask
, UChar
&c
, UChar
&c2
) {
3707 /* need src.hasNext() to be true */
3708 c
=(UChar
)src
.next(&src
);
3715 norm32
=_getNorm32(c
);
3716 if(UTF_IS_FIRST_SURROGATE(c
)) {
3717 if(src
.hasNext(&src
) && UTF_IS_SECOND_SURROGATE(c2
=(UChar
)src
.current(&src
))) {
3718 src
.move(&src
, 1, UITER_CURRENT
); /* skip the c2 surrogate */
3719 if((norm32
&mask
)==0) {
3720 /* irrelevant data */
3723 /* norm32 must be a surrogate special */
3724 return _getNorm32FromSurrogatePair(norm32
, c2
);
3727 /* unmatched surrogate */
3736 * read forward and check if the character is a next-iteration boundary
3737 * if c2!=0 then (c, c2) is a surrogate pair
/*
 * IsNextBoundaryFn: signature shared by the forward boundary tests
 * (_isNextNFDSafe and _isNextTrueStarter are assigned to pointers of this
 * type in unorm_next()). c and c2 receive the character that was read.
 */
3740 IsNextBoundaryFn(UCharIterator
&src
, uint32_t minC
, uint32_t mask
, UChar
&c
, UChar
&c2
);
3744 * read forward and check if the lead combining class is 0
3745 * if c2!=0 then (c, c2) is a surrogate pair
/*
 * _isNextNFDSafe: forward boundary test used for the decomposing modes.
 * Reads the next character with _getNextNorm32() and passes its norm32 to
 * _isNFDSafe(), together with ccOrQCMask and the quick-check part of that
 * mask (ccOrQCMask&_NORM_QC_MASK).
 */
3748 _isNextNFDSafe(UCharIterator
&src
, uint32_t minC
, uint32_t ccOrQCMask
, UChar
&c
, UChar
&c2
) {
3749 return _isNFDSafe(_getNextNorm32(src
, minC
, ccOrQCMask
, c
, c2
), ccOrQCMask
, ccOrQCMask
&_NORM_QC_MASK
);
3754 * read forward and check if the character is (or its decomposition begins with)
3755 * a "true starter" (cc==0 and NF*C_YES)
3756 * if c2!=0 then (c, c2) is a surrogate pair
/*
 * _isNextTrueStarter: forward boundary test used for the composing modes.
 * Derives the decomposition quick-check mask from ccOrQCMask, reads the next
 * character with _getNextNorm32() using both masks, and returns the verdict
 * of _isTrueStarter().
 */
3759 _isNextTrueStarter(UCharIterator
&src
, uint32_t minC
, uint32_t ccOrQCMask
, UChar
&c
, UChar
&c2
) {
3760 uint32_t norm32
, decompQCMask
;
3762 decompQCMask
=(ccOrQCMask
<<2)&0xf; /* decomposition quick check mask */
3763 norm32
=_getNextNorm32(src
, minC
, ccOrQCMask
|decompQCMask
, c
, c2
);
3764 return _isTrueStarter(norm32
, ccOrQCMask
, decompQCMask
);
/*
 * _findNextIterationBoundary: reads characters forwards from src into buffer
 * until isNextBoundary() reports a boundary or the iterator is exhausted.
 *
 * The first character is stored unconditionally (a lead surrogate keeps its
 * following trail surrogate; any other following unit is pushed back).
 * When a boundary character is found, the iterator is moved back over it so
 * that iteration stops in front of the boundary.
 *
 * buffer, bufferCapacity - passed by reference; grown through
 *                          u_growBufferFromStatic() when the next character
 *                          does not fit
 * pErrorCode             - set to U_MEMORY_ALLOCATION_ERROR when growing
 *                          fails; the iterator is then moved to UITER_LIMIT
 *
 * Per the trailing comment, the length of the buffer contents is returned.
 */
3768 _findNextIterationBoundary(UCharIterator
&src
,
3769 IsNextBoundaryFn
*isNextBoundary
, uint32_t minC
, uint32_t mask
,
3770 UChar
*&buffer
, int32_t &bufferCapacity
,
3771 UErrorCode
*pErrorCode
) {
3773 int32_t bufferIndex
;
3776 if(!src
.hasNext(&src
)) {
3783 /* get one character and ignore its properties */
3784 buffer
[0]=c
=(UChar
)src
.next(&src
);
3786 if(UTF_IS_FIRST_SURROGATE(c
) && src
.hasNext(&src
)) {
3787 if(UTF_IS_SECOND_SURROGATE(c2
=(UChar
)src
.next(&src
))) {
3788 buffer
[bufferIndex
++]=c2
;
3790 src
.move(&src
, -1, UITER_CURRENT
); /* back out the non-trail-surrogate */
3794 /* get all following characters until we see a boundary */
3795 /* checking hasNext() instead of c!=DONE on the off-chance that U+ffff is part of the string */
3796 while(src
.hasNext(&src
)) {
3797 if(isNextBoundary(src
, minC
, mask
, c
, c2
)) {
3798 /* back out the latest movement to stop at the boundary */
3799 src
.move(&src
, c2
==0 ? -1 : -2, UITER_CURRENT
);
/* the character needs one code unit, or two when c2 holds a trail surrogate */
3802 if(bufferIndex
+(c2
==0 ? 1 : 2)<=bufferCapacity
||
3803 /* attempt to grow the buffer */
3804 u_growBufferFromStatic(stackBuffer
, &buffer
, &bufferCapacity
,
3808 buffer
[bufferIndex
++]=c
;
3810 buffer
[bufferIndex
++]=c2
;
3813 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
3814 src
.move(&src
, 0, UITER_LIMIT
);
3820 /* return the length of the buffer contents */
/*
 * unorm_next: reads forwards from the iterator to the next normalization
 * boundary and writes that piece of text to dest, normalized when doNormalize
 * asks for it.
 *
 * Visible behavior:
 *  - argument errors set U_ILLEGAL_ARGUMENT_ERROR; _haveData() must succeed,
 *  - *pNeededToNormalize (when non-NULL) is preset to FALSE,
 *  - per mode a boundary function, minC and mask are chosen: _isNextNFDSafe
 *    for the decomposing masks (a branch that needs fcdTrie falls through to
 *    the NFD settings), _isNextTrueStarter for the composing masks,
 *  - one branch just steps over a single code point (keeping a lead/trail
 *    surrogate pair together) and returns it,
 *  - otherwise _findNextIterationBoundary() collects the text, which is
 *    either normalized with unorm_internalNormalize() (also setting
 *    *pNeededToNormalize from a length/content comparison) or copied as is.
 *
 * The case labels selecting these branches are not visible in this view.
 */
3824 U_CAPI
int32_t U_EXPORT2
3825 unorm_next(UCharIterator
*src
,
3826 UChar
*dest
, int32_t destCapacity
,
3827 UNormalizationMode mode
, int32_t options
,
3828 UBool doNormalize
, UBool
*pNeededToNormalize
,
3829 UErrorCode
*pErrorCode
) {
3830 UChar stackBuffer
[100];
3832 IsNextBoundaryFn
*isNextBoundary
;
3834 int32_t bufferLength
, bufferCapacity
, destLength
;
3838 /* check argument values */
3839 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
3843 if( destCapacity
<0 || (dest
==NULL
&& destCapacity
>0) ||
3846 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
3850 if(!_haveData(*pErrorCode
)) {
3854 if(pNeededToNormalize
!=NULL
) {
3855 *pNeededToNormalize
=FALSE
;
3860 if(fcdTrie
.index
==NULL
) {
3861 *pErrorCode
=U_UNSUPPORTED_ERROR
;
3864 /* fall through to NFD */
3866 isNextBoundary
=_isNextNFDSafe
;
3867 minC
=_NORM_MIN_WITH_LEAD_CC
;
3868 mask
=_NORM_CC_MASK
|_NORM_QC_NFD
;
3871 isNextBoundary
=_isNextNFDSafe
;
3872 minC
=_NORM_MIN_WITH_LEAD_CC
;
3873 mask
=_NORM_CC_MASK
|_NORM_QC_NFKD
;
3876 isNextBoundary
=_isNextTrueStarter
;
3877 minC
=(UChar
)indexes
[_NORM_INDEX_MIN_NFC_NO_MAYBE
];
3878 mask
=_NORM_CC_MASK
|_NORM_QC_NFC
;
3881 isNextBoundary
=_isNextTrueStarter
;
3882 minC
=(UChar
)indexes
[_NORM_INDEX_MIN_NFKC_NO_MAYBE
];
3883 mask
=_NORM_CC_MASK
|_NORM_QC_NFKC
;
/* single-code-point branch: a negative result from next() means the end was reached */
3887 if((c
=src
->next(src
))>=0) {
3889 if(UTF_IS_LEAD(c
) && (c2
=src
->next(src
))>=0) {
3890 if(UTF_IS_TRAIL(c2
)) {
3891 if(destCapacity
>=2) {
3892 dest
[1]=(UChar
)c2
; /* trail surrogate */
3895 /* lead surrogate to be written below */
3897 src
->move(src
, -1, UITER_CURRENT
);
3901 if(destCapacity
>0) {
3905 return u_terminateUChars(dest
, destCapacity
, destLength
, pErrorCode
);
3907 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
3912 bufferCapacity
=(int32_t)(sizeof(stackBuffer
)/U_SIZEOF_UCHAR
);
3913 bufferLength
=_findNextIterationBoundary(*src
,
3914 isNextBoundary
, minC
, mask
,
3915 buffer
, bufferCapacity
,
3917 if(bufferLength
>0) {
3919 destLength
=unorm_internalNormalize(dest
, destCapacity
,
3920 buffer
, bufferLength
,
3923 if(pNeededToNormalize
!=0 && U_SUCCESS(*pErrorCode
)) {
3924 *pNeededToNormalize
=
3925 (UBool
)(destLength
!=bufferLength
||
3926 0!=uprv_memcmp(dest
, buffer
, destLength
*U_SIZEOF_UCHAR
));
3929 /* just copy the source characters */
3930 if(destCapacity
>0) {
3931 uprv_memcpy(dest
, buffer
, uprv_min(bufferLength
, destCapacity
)*U_SIZEOF_UCHAR
);
3933 destLength
=u_terminateUChars(dest
, destCapacity
, bufferLength
, pErrorCode
);
3936 destLength
=u_terminateUChars(dest
, destCapacity
, 0, pErrorCode
);
3940 if(buffer
!=stackBuffer
) {
3948 * ### TODO: check if NF*D and FCD iteration finds optimal boundaries
3949 * and if not, how hard it would be to improve it.
3950 * For example, see _findSafeFCD().
3953 /* Concatenation of normalized strings -------------------------------------- */
/*
 * unorm_concatenate: writes the normalized concatenation of left and right
 * to dest.
 *
 * Only the text around the seam is normalized: unorm_previous() finds a
 * boundary near the end of left, unorm_next() finds one near the start of
 * right, the text between the two boundaries is gathered in an intermediate
 * buffer and normalized with unorm_internalNormalize(); the parts of left
 * and right outside the boundaries are copied unchanged.
 *
 * Argument handling visible here:
 *  - invalid buffers, lengths < -1 or NULL inputs set
 *    U_ILLEGAL_ARGUMENT_ERROR,
 *  - right must not overlap dest; left==dest is allowed, in which case the
 *    leading part of left is not copied,
 *  - a U_BUFFER_OVERFLOW_ERROR from the boundary searches is reset and the
 *    intermediate buffer is grown with u_growBufferFromStatic()
 *    (U_MEMORY_ALLOCATION_ERROR when that fails),
 *  - when dest is already full, unorm_internalNormalize() is called with a
 *    NULL/0 destination and its result is still added to destLength.
 *
 * Returns u_terminateUChars(dest, destCapacity, destLength, pErrorCode).
 */
3955 U_CAPI
int32_t U_EXPORT2
3956 unorm_concatenate(const UChar
*left
, int32_t leftLength
,
3957 const UChar
*right
, int32_t rightLength
,
3958 UChar
*dest
, int32_t destCapacity
,
3959 UNormalizationMode mode
, int32_t options
,
3960 UErrorCode
*pErrorCode
) {
3961 UChar stackBuffer
[100];
3963 int32_t bufferLength
, bufferCapacity
;
3966 int32_t leftBoundary
, rightBoundary
, destLength
;
3968 /* check argument values */
3969 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
3973 if( destCapacity
<0 || (dest
==NULL
&& destCapacity
>0) ||
3974 left
==NULL
|| leftLength
<-1 ||
3975 right
==NULL
|| rightLength
<-1
3977 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
3981 /* check for overlapping right and destination */
3983 ((right
>=dest
&& right
<(dest
+destCapacity
)) ||
3984 (rightLength
>0 && dest
>=right
&& dest
<(right
+rightLength
)))
3986 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
3990 /* allow left==dest */
3992 /* set up intermediate buffer */
3994 bufferCapacity
=(int32_t)(sizeof(stackBuffer
)/U_SIZEOF_UCHAR
);
3997 * Input: left[0..leftLength[ + right[0..rightLength[
3999 * Find normalization-safe boundaries leftBoundary and rightBoundary
4000 * and copy the end parts together:
4001 * buffer=left[leftBoundary..leftLength[ + right[0..rightBoundary[
4003 * dest=left[0..leftBoundary[ +
4004 * normalize(buffer) +
4005 * right[rightBoundary..rightLength[
4009 * find a normalization boundary at the end of the left string
4010 * and copy the end part into the buffer
4012 uiter_setString(&iter
, left
, leftLength
);
4013 iter
.index
=leftLength
=iter
.length
; /* end of left string */
4015 bufferLength
=unorm_previous(&iter
, buffer
, bufferCapacity
,
/* after unorm_previous() the iterator position is the boundary inside left */
4019 leftBoundary
=iter
.index
;
4020 if(*pErrorCode
==U_BUFFER_OVERFLOW_ERROR
) {
4021 *pErrorCode
=U_ZERO_ERROR
;
4022 if(!u_growBufferFromStatic(stackBuffer
, &buffer
, &bufferCapacity
, 2*bufferLength
, 0)) {
4023 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
4024 /* dont need to cleanup here since
4025 * u_growBufferFromStatic frees buffer if(buffer!=stackBuffer)
4030 /* just copy from the left string: we know the boundary already */
4031 uprv_memcpy(buffer
, left
+leftBoundary
, bufferLength
*U_SIZEOF_UCHAR
);
4035 * find a normalization boundary at the beginning of the right string
4036 * and concatenate the beginning part to the buffer
4038 uiter_setString(&iter
, right
, rightLength
);
4039 rightLength
=iter
.length
; /* in case it was -1 */
4041 rightBoundary
=unorm_next(&iter
, buffer
+bufferLength
, bufferCapacity
-bufferLength
,
4045 if(*pErrorCode
==U_BUFFER_OVERFLOW_ERROR
) {
4046 *pErrorCode
=U_ZERO_ERROR
;
4047 if(!u_growBufferFromStatic(stackBuffer
, &buffer
, &bufferCapacity
, bufferLength
+rightBoundary
, 0)) {
4048 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
4049 /* dont need to cleanup here since
4050 * u_growBufferFromStatic frees buffer if(buffer!=stackBuffer)
4055 /* just copy from the right string: we know the boundary already */
4056 uprv_memcpy(buffer
+bufferLength
, right
, rightBoundary
*U_SIZEOF_UCHAR
);
4059 bufferLength
+=rightBoundary
;
4061 /* copy left[0..leftBoundary[ to dest */
4062 if(left
!=dest
&& leftBoundary
>0 && destCapacity
>0) {
4063 uprv_memcpy(dest
, left
, uprv_min(leftBoundary
, destCapacity
)*U_SIZEOF_UCHAR
);
4065 destLength
=leftBoundary
;
4067 /* concatenate the normalization of the buffer to dest */
4068 if(destCapacity
>destLength
) {
4069 destLength
+=unorm_internalNormalize(dest
+destLength
, destCapacity
-destLength
,
4070 buffer
, bufferLength
,
4074 destLength
+=unorm_internalNormalize(NULL
, 0,
4075 buffer
, bufferLength
,
4080 * only errorCode that is expected is a U_BUFFER_OVERFLOW_ERROR
4081 * so we dont check for the error code here..just let it pass through
4083 /* concatenate right[rightBoundary..rightLength[ to dest */
4084 right
+=rightBoundary
;
4085 rightLength
-=rightBoundary
;
4086 if(rightLength
>0 && destCapacity
>destLength
) {
4087 uprv_memcpy(dest
+destLength
, right
, uprv_min(rightLength
, destCapacity
-destLength
)*U_SIZEOF_UCHAR
);
/* the full remaining length is added even when only part of it was copied */
4089 destLength
+=rightLength
;
4092 if(buffer
!=stackBuffer
) {
4096 return u_terminateUChars(dest
, destCapacity
, destLength
, pErrorCode
);
4099 #endif /* #if !UCONFIG_NO_NORMALIZATION */