2 ******************************************************************************
3 * Copyright (c) 1996-2004, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 ******************************************************************************
8 * Created by: Vladimir Weinstein 12052000
10 * Modification history :
12 * Date Name Description
13 * 02/01/01 synwee Added normalization quickcheck enum and method.
14 * 02/12/01 synwee Commented out quickcheck util api has been approved
15 * Added private method for doing FCD checks
16 * 02/23/01 synwee Modified quickcheck and checkFCE to run through
17 * string for codepoints < 0x300 for the normalization
19 * 05/25/01+ Markus Scherer total rewrite, implement all normalization here
20 * instead of just wrappers around normlzr.cpp,
21 * load unorm.dat, support Unicode 3.1 with
22 * supplementary code points, etc.
25 #include "unicode/utypes.h"
27 #if !UCONFIG_NO_NORMALIZATION
29 #include "unicode/udata.h"
30 #include "unicode/uchar.h"
31 #include "unicode/ustring.h"
32 #include "unicode/uiter.h"
33 #include "unicode/uniset.h"
34 #include "unicode/usetiter.h"
35 #include "unicode/unorm.h"
42 #include "unicode/uset.h"
47 * Status of tailored normalization
49 * This was done initially for investigation on Unicode public review issue 7
50 * (http://www.unicode.org/review/). See Jitterbug 2481.
51 * While the UTC at meeting #94 (2003mar) did not take up the issue, this is
52 * a permanent feature in ICU 2.6 in support of IDNA which requires true
53 * Unicode 3.2 normalization.
54 * (NormalizationCorrections are rolled into IDNA mapping tables.)
56 * Tailored normalization as implemented here allows one to "normalize less"
57 * than full Unicode normalization would.
58 * Based internally on a UnicodeSet of code points that are
59 * "excluded from normalization", the normalization functions leave those
60 * code points alone ("inert"). This means that tailored normalization
61 * still transforms text into a canonically equivalent form.
62 * It does not add decompositions to code points that do not have any or
63 * change decomposition results.
65 * Any function that searches for a safe boundary has not been touched,
66 * which means that these functions will be over-pessimistic when
67 * exclusions are applied.
68 * This should not matter because subsequent checks and normalizations
69 * do apply the exclusions; only a little more of the text may be processed
70 * than necessary under exclusions.
72 * Normalization exclusions have the following effect on excluded code points c:
73 * - c is not decomposed
74 * - c is not a composition target
75 * - c does not combine forward or backward for composition
76 * except that this is not implemented for Jamo
77 * - c is treated as having a combining class of 0
/* Element count of an array object, cast to int32_t; only meaningful for true arrays, not pointers. */
79 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
82 * This new implementation of the normalization code loads its data from
83 * unorm.icu (DATA_NAME "unorm", DATA_TYPE "icu"), which is generated with the gennorm tool.
84 * The format of that file is described in unormimp.h .
87 /* -------------------------------------------------------------------------- */
/* Capacity constant (100). NOTE(review): its enclosing declaration and users are not visible in this view - presumably sizes on-stack buffers; confirm. */
90 _STACK_BUFFER_CAPACITY
=100
94 * Constants for the bit fields in the options bit set parameter.
95 * These need not be public.
96 * A user only needs to know the currently assigned values.
97 * The number and positions of reserved bits per field can remain private
98 * and may change in future implementations.
/*
 * Layout of the options word: 0x1f covers the exclusion (NX) flag bits,
 * 0x60 covers the Unicode-version field (located at shift 5), and 0x7f is
 * the union of both. nxCache is dimensioned as _NORM_OPTIONS_SETS_MASK+1.
 */
101 _NORM_OPTIONS_NX_MASK
=0x1f,
102 _NORM_OPTIONS_UNICODE_MASK
=0x60,
103 _NORM_OPTIONS_SETS_MASK
=0x7f,
105 _NORM_OPTIONS_UNICODE_SHIFT
=5,
108 * The following options are used only in some composition functions.
109 * They use bits 12 and up to preserve lower bits for the available options
110 * space in unorm_compare() -
111 * see documentation for UNORM_COMPARE_NORM_OPTIONS_SHIFT.
114 /** Options bit 12, for compatibility vs. canonical decomposition. */
115 _NORM_OPTIONS_COMPAT
=0x1000,
116 /** Options bit 13, no discontiguous composition (FCC vs. NFC). */
117 _NORM_OPTIONS_COMPOSE_CONTIGUOUS
=0x2000
/*
 * Visible test: c is below HANGUL_COUNT and is a multiple of JAMO_T_COUNT.
 * NOTE(review): by its name this detects a Hangul syllable without a trailing
 * Jamo T; any step that rebases c to the start of the Hangul block is not
 * visible in this view - confirm against the full file.
 */
121 isHangulWithoutJamoT(UChar c
) {
123 return c
<HANGUL_COUNT
&& c%JAMO_T_COUNT
==0;
128 /* is this a norm32 with a regular index? */
/* True when norm32 lies below _NORM_MIN_SPECIAL, i.e. it is not one of the special values. */
130 isNorm32Regular(uint32_t norm32
) {
131 return norm32
<_NORM_MIN_SPECIAL
;
134 /* is this a norm32 with a special index for a lead surrogate? */
/* True when norm32 falls in the half-open range [_NORM_MIN_SPECIAL, _NORM_SURROGATES_TOP). */
136 isNorm32LeadSurrogate(uint32_t norm32
) {
137 return _NORM_MIN_SPECIAL
<=norm32
&& norm32
<_NORM_SURROGATES_TOP
;
140 /* is this a norm32 with a special index for a Hangul syllable or a Jamo? */
/* True when norm32 is at or above _NORM_MIN_HANGUL. */
142 isNorm32HangulOrJamo(uint32_t norm32
) {
143 return norm32
>=_NORM_MIN_HANGUL
;
147 * Given isNorm32HangulOrJamo(),
148 * is this a Hangul syllable or a Jamo?
/*
 * Precondition (from the name and the preceding comment): isNorm32HangulOrJamo(norm32).
 * True when norm32 is below _NORM_MIN_JAMO_V.
 */
151 isHangulJamoNorm32HangulOrJamoL(uint32_t norm32
) {
152 return norm32
<_NORM_MIN_JAMO_V
;
156 * Given norm32 for Jamo V or T,
/* For a norm32 already known to belong to a Jamo V or T: true when it is below _NORM_JAMO_V_TOP. */
160 isJamoVTNorm32JamoV(uint32_t norm32
) {
161 return norm32
<_NORM_JAMO_V_TOP
;
164 /* load unorm.dat ----------------------------------------------------------- */
/*
 * File-level state for the loaded normalization data.
 * - normData / dataErrorCode: the opened data item and the error code recorded while loading.
 * - haveNormData: 0 until a load has been attempted; loadNormData() stores -1 on failure,
 *   and _haveData() treats any value >0 as "data available".
 * - indexes, normTrie, fcdTrie, auxTrie: copies filled in by loadNormData().
 * - extraData, combiningTable, canonStartSets: pointers computed by loadNormData().
 * - formatVersion / dataVersion: copied from the data header by isAcceptable().
 * - nxCache: one slot per value of the options word masked with _NORM_OPTIONS_SETS_MASK.
 */
166 #define DATA_NAME "unorm"
167 #define DATA_TYPE "icu"
169 static UDataMemory
*normData
=NULL
;
170 static UErrorCode dataErrorCode
=U_ZERO_ERROR
;
171 static int8_t haveNormData
=0;
173 static int32_t indexes
[_NORM_INDEX_TOP
]={ 0 };
174 static UTrie normTrie
={ 0,0,0,0,0,0,0 }, fcdTrie
={ 0,0,0,0,0,0,0 }, auxTrie
={ 0,0,0,0,0,0,0 };
177 * pointers into the memory-mapped unorm.icu
179 static const uint16_t *extraData
=NULL
,
180 *combiningTable
=NULL
,
181 *canonStartSets
=NULL
;
183 static uint8_t formatVersion
[4]={ 0, 0, 0, 0 };
184 static UBool formatVersion_2_1
=FALSE
, formatVersion_2_2
=FALSE
;
186 /* the Unicode version of the normalization data */
187 static UVersionInfo dataVersion
={ 0, 0, 0, 0 };
189 /* cache UnicodeSets for each combination of exclusion flags */
190 static UnicodeSet
*nxCache
[_NORM_OPTIONS_SETS_MASK
+1]={ NULL
};
/*
 * Cleanup callback. NOTE(review): the line carrying the function name is not
 * visible in this view; loadNormData() registers a callback called
 * unorm_cleanup, which is presumably this function - confirm.
 * Visible effects: closes normData, resets dataErrorCode to U_ZERO_ERROR,
 * iterates over every nxCache slot (loop body not visible here), then zeroes
 * the whole nxCache array.
 */
194 static UBool U_CALLCONV
199 udata_close(normData
);
202 dataErrorCode
=U_ZERO_ERROR
;
205 for(i
=0; i
<(int32_t)LENGTHOF(nxCache
); ++i
) {
208 uprv_memset(nxCache
, 0, sizeof(nxCache
));
213 /* normTrie: 32-bit trie result may contain a special extraData index with the folding offset */
/*
 * Folding-offset callback installed on the norm trie by loadNormData().
 * For a lead-surrogate norm32 the visible expression is
 * UTRIE_BMP_INDEX_LENGTH plus the value obtained by shifting norm32 right by
 * (_NORM_EXTRA_SHIFT-UTRIE_SURROGATE_BLOCK_BITS) and masking with
 * (0x3ff<<UTRIE_SURROGATE_BLOCK_BITS).
 * NOTE(review): the result for other norm32 values is not visible in this view.
 */
214 static int32_t U_CALLCONV
215 getFoldingNormOffset(uint32_t norm32
) {
216 if(isNorm32LeadSurrogate(norm32
)) {
218 UTRIE_BMP_INDEX_LENGTH
+
219 (((int32_t)norm32
>>(_NORM_EXTRA_SHIFT
-UTRIE_SURROGATE_BLOCK_BITS
))&
220 (0x3ff<<UTRIE_SURROGATE_BLOCK_BITS
));
226 /* fcdTrie: the folding offset is the lead FCD value itself */
/* Folding-offset callback installed on the FCD trie by loadNormData(): returns the trie value unchanged, cast to int32_t. */
227 static int32_t U_CALLCONV
228 getFoldingFCDOffset(uint32_t data
) {
229 return (int32_t)data
;
232 /* auxTrie: the folding offset is in bits 9..0 of the 16-bit trie result */
/*
 * Folding-offset callback installed on the aux trie by loadNormData():
 * keeps the bits selected by _NORM_AUX_FNC_MASK and shifts them left by
 * UTRIE_SURROGATE_BLOCK_BITS.
 */
233 static int32_t U_CALLCONV
234 getFoldingAuxOffset(uint32_t data
) {
235 return (int32_t)(data
&_NORM_AUX_FNC_MASK
)<<UTRIE_SURROGATE_BLOCK_BITS
;
/*
 * Acceptance callback passed to udata_openChoice() by loadNormData().
 * The visible condition requires: matching endianness and charset family,
 * dataFormat bytes 0x4e 0x6f 0x72 0x6d ("Norm"), formatVersion[0]==2, and
 * formatVersion[2]/[3] equal to UTRIE_SHIFT/UTRIE_INDEX_SHIFT.
 * Side effect: copies 4 bytes each of pInfo->formatVersion and
 * pInfo->dataVersion into the file-level formatVersion and dataVersion.
 * NOTE(review): the return statements are not visible in this view.
 */
238 static UBool U_CALLCONV
239 isAcceptable(void * /* context */,
240 const char * /* type */, const char * /* name */,
241 const UDataInfo
*pInfo
) {
244 pInfo
->isBigEndian
==U_IS_BIG_ENDIAN
&&
245 pInfo
->charsetFamily
==U_CHARSET_FAMILY
&&
246 pInfo
->dataFormat
[0]==0x4e && /* dataFormat="Norm" */
247 pInfo
->dataFormat
[1]==0x6f &&
248 pInfo
->dataFormat
[2]==0x72 &&
249 pInfo
->dataFormat
[3]==0x6d &&
250 pInfo
->formatVersion
[0]==2 &&
251 pInfo
->formatVersion
[2]==UTRIE_SHIFT
&&
252 pInfo
->formatVersion
[3]==UTRIE_INDEX_SHIFT
254 uprv_memcpy(formatVersion
, pInfo
->formatVersion
, 4);
255 uprv_memcpy(dataVersion
, pInfo
->dataVersion
, 4);
/*
 * Range callback handed to utrie_enum() by unorm_addPropertyStarts().
 * context is treated as a USetAdder; only the range start is used: it is
 * added to the adder's set. The limit and value arguments are unnamed and unused.
 * NOTE(review): the return value is not visible in this view.
 */
262 static UBool U_CALLCONV
263 _enumPropertyStartsRange(const void *context
, UChar32 start
, UChar32
/*limit*/, uint32_t /*value*/) {
264 /* add the start code point to the USet */
265 USetAdder
*sa
=(USetAdder
*)context
;
266 sa
->add(sa
->set
, start
);
/*
 * Loads the normalization data and publishes it into the file-level variables.
 * Visible steps when haveNormData==0:
 *  1. open the data item via udata_openChoice() with isAcceptable, recording
 *     the resulting error code in dataErrorCode;
 *  2. unserialize the norm, FCD and aux tries into local UTrie copies and
 *     attach their folding-offset callbacks;
 *  3. on any failure store -1 in haveNormData and return it;
 *  4. copy indexes and tries into the file-level variables, derive
 *     extraData/combiningTable/canonStartSets and the format-version flags,
 *     and register unorm_cleanup;
 *  5. close the local data handle.
 * NOTE(review): several lines (return type, locking calls, closing braces,
 * final return) are not visible in this view.
 */
273 loadNormData(UErrorCode
&errorCode
) {
274 /* load Unicode normalization data from file */
277 * This lazy intialization with double-checked locking (without mutex protection for
278 * haveNormData==0) is transiently unsafe under certain circumstances.
279 * Check the readme and use u_init() if necessary.
281 * While u_init() initializes the main normalization data via this functions,
282 * it does not do so for exclusion sets (which are fully mutexed).
284 * - there can be many exclusion sets
285 * - they are rarely used
286 * - they are not usually used in execution paths that are
287 * as performance-sensitive as others
288 * (e.g., IDNA takes more time than unorm_quickCheck() anyway)
290 if(haveNormData
==0) {
291 UTrie _normTrie
={ 0,0,0,0,0,0,0 }, _fcdTrie
={ 0,0,0,0,0,0,0 }, _auxTrie
={ 0,0,0,0,0,0,0 };
293 const int32_t *p
=NULL
;
/* NOTE(review): errorCode is a reference, so comparing its address with NULL is not a meaningful check in well-formed C++ - confirm intent. */
296 if(&errorCode
==NULL
|| U_FAILURE(errorCode
)) {
300 /* open the data outside the mutex block */
301 data
=udata_openChoice(NULL
, DATA_TYPE
, DATA_NAME
, isAcceptable
, NULL
, &errorCode
);
302 dataErrorCode
=errorCode
;
303 if(U_FAILURE(errorCode
)) {
304 return haveNormData
=-1;
/* The data starts with _NORM_INDEX_TOP int32_t indexes; the serialized norm trie follows them. */
307 p
=(const int32_t *)udata_getMemory(data
);
308 pb
=(const uint8_t *)(p
+_NORM_INDEX_TOP
);
309 utrie_unserialize(&_normTrie
, pb
, p
[_NORM_INDEX_TRIE_SIZE
], &errorCode
);
310 _normTrie
.getFoldingOffset
=getFoldingNormOffset
;
/* Skip the norm trie plus the extra-data and combining-data arrays (2 bytes per unit) to reach the FCD trie. */
312 pb
+=p
[_NORM_INDEX_TRIE_SIZE
]+p
[_NORM_INDEX_UCHAR_COUNT
]*2+p
[_NORM_INDEX_COMBINE_DATA_COUNT
]*2;
313 utrie_unserialize(&_fcdTrie
, pb
, p
[_NORM_INDEX_FCD_TRIE_SIZE
], &errorCode
);
314 _fcdTrie
.getFoldingOffset
=getFoldingFCDOffset
;
/* NOTE(review): this guard tests the FCD trie size although the guarded code unserializes the aux trie using the aux trie size - confirm which index is intended. */
316 if(p
[_NORM_INDEX_FCD_TRIE_SIZE
]!=0) {
317 pb
+=p
[_NORM_INDEX_FCD_TRIE_SIZE
];
318 utrie_unserialize(&_auxTrie
, pb
, p
[_NORM_INDEX_AUX_TRIE_SIZE
], &errorCode
);
319 _auxTrie
.getFoldingOffset
=getFoldingAuxOffset
;
322 if(U_FAILURE(errorCode
)) {
323 dataErrorCode
=errorCode
;
325 return haveNormData
=-1;
328 /* in the mutex block, set the data for this process */
334 uprv_memcpy(&indexes
, p
, sizeof(indexes
));
335 uprv_memcpy(&normTrie
, &_normTrie
, sizeof(UTrie
));
336 uprv_memcpy(&fcdTrie
, &_fcdTrie
, sizeof(UTrie
));
337 uprv_memcpy(&auxTrie
, &_auxTrie
, sizeof(UTrie
));
339 p
=(const int32_t *)udata_getMemory(normData
);
342 /* initialize some variables */
343 extraData
=(uint16_t *)((uint8_t *)(p
+_NORM_INDEX_TOP
)+indexes
[_NORM_INDEX_TRIE_SIZE
]);
344 combiningTable
=extraData
+indexes
[_NORM_INDEX_UCHAR_COUNT
];
345 formatVersion_2_1
=formatVersion
[0]>2 || (formatVersion
[0]==2 && formatVersion
[1]>=1);
346 formatVersion_2_2
=formatVersion
[0]>2 || (formatVersion
[0]==2 && formatVersion
[1]>=2);
/* canonStartSets lies after the combining data and both the FCD and aux tries; trie sizes are in bytes, hence the division by 2 for a uint16_t pointer. */
347 if(formatVersion_2_1
) {
348 canonStartSets
=combiningTable
+
349 indexes
[_NORM_INDEX_COMBINE_DATA_COUNT
]+
350 (indexes
[_NORM_INDEX_FCD_TRIE_SIZE
]+indexes
[_NORM_INDEX_AUX_TRIE_SIZE
])/2;
353 ucln_common_registerCleanup(UCLN_COMMON_UNORM
, unorm_cleanup
);
356 /* if a different thread set it first, then close the extra data */
358 udata_close(data
); /* NULL if it was set correctly */
/*
 * Reports whether normalization data is available.
 * If a load was already attempted (haveNormData!=0), errorCode is overwritten
 * with the recorded dataErrorCode and the result is haveNormData>0.
 * Otherwise loadNormData() is called and its result is tested the same way.
 */
366 _haveData(UErrorCode
&errorCode
) {
367 if(haveNormData
!=0) {
368 errorCode
=dataErrorCode
;
369 return (UBool
)(haveNormData
>0);
371 return (UBool
)(loadNormData(errorCode
)>0);
/*
 * Exported wrapper around _haveData().
 * pErrorCode is dereferenced without a visible NULL check, so callers must pass a valid pointer.
 */
375 U_CAPI UBool U_EXPORT2
376 unorm_haveData(UErrorCode
*pErrorCode
) {
377 return _haveData(*pErrorCode
);
/*
 * Returns the index array of the FCD trie when data is available.
 * pErrorCode is dereferenced without a visible NULL check.
 * NOTE(review): the result when data is unavailable is not visible in this view.
 */
380 U_CAPI
const uint16_t * U_EXPORT2
381 unorm_getFCDTrie(UErrorCode
*pErrorCode
) {
382 if(_haveData(*pErrorCode
)) {
383 return fcdTrie
.index
;
389 /* data access primitives --------------------------------------------------- */
/* Looks up the 32-bit norm trie value for a single UTF-16 code unit via UTRIE_GET32_FROM_LEAD. */
391 static inline uint32_t
392 _getNorm32(UChar c
) {
393 return UTRIE_GET32_FROM_LEAD(&normTrie
, c
);
/*
 * Resolves the norm32 of a supplementary code point.
 * norm32 is the value obtained for the lead surrogate, c2 is the trail
 * surrogate. The visible expression turns norm32 into a trie offset with the
 * same arithmetic as getFoldingNormOffset(), and the result is looked up with
 * UTRIE_GET32_FROM_OFFSET_TRAIL.
 * NOTE(review): the assignment that stores the computed offset is not visible in this view.
 */
396 static inline uint32_t
397 _getNorm32FromSurrogatePair(uint32_t norm32
, UChar c2
) {
399 * the surrogate index in norm32 stores only the number of the surrogate index block
400 * see gennorm/store.c/getFoldedNormValue()
403 UTRIE_BMP_INDEX_LENGTH
+
404 ((norm32
>>(_NORM_EXTRA_SHIFT
-UTRIE_SURROGATE_BLOCK_BITS
))&
405 (0x3ff<<UTRIE_SURROGATE_BLOCK_BITS
));
406 return UTRIE_GET32_FROM_OFFSET_TRAIL(&normTrie
, norm32
, c2
);
410 * get a norm32 from text with complete code points
411 * (like from decompositions)
/*
 * Pointer overload: reads the norm32 of the code point starting at p.
 * If the first unit's value has any bit of mask set and is a lead-surrogate
 * special value, the next code unit p[1] is read as the trail surrogate, so
 * callers must guarantee that p[1] is readable in that case.
 * NOTE(review): the final return is not visible in this view.
 */
413 static inline uint32_t
414 _getNorm32(const UChar
*p
, uint32_t mask
) {
415 uint32_t norm32
=_getNorm32(*p
);
416 if((norm32
&mask
) && isNorm32LeadSurrogate(norm32
)) {
417 /* *p is a lead surrogate, get the real norm32 */
418 norm32
=_getNorm32FromSurrogatePair(norm32
, *(p
+1));
/*
 * Looks up the 16-bit FCD trie value for one code unit c via UTRIE_GET16_FROM_LEAD.
 * NOTE(review): the line with the function name and parameter list is not
 * visible in this view (presumably _getFCD16(UChar c)) - confirm.
 */
423 static inline uint16_t
425 return UTRIE_GET16_FROM_LEAD(&fcdTrie
, c
);
/* Resolves the FCD value of a surrogate pair: fcd16 (the lead surrogate's value) is used directly as the trie offset, c2 is the trail surrogate. */
428 static inline uint16_t
429 _getFCD16FromSurrogatePair(uint16_t fcd16
, UChar c2
) {
430 /* the surrogate index in fcd16 is an absolute offset over the start of stage 1 */
431 return UTRIE_GET16_FROM_OFFSET_TRAIL(&fcdTrie
, fcd16
, c2
);
/* Returns a pointer into extraData at the index stored in the bits of norm32 above _NORM_EXTRA_SHIFT. */
434 static inline const uint16_t *
435 _getExtraData(uint32_t norm32
) {
436 return extraData
+(norm32
>>_NORM_EXTRA_SHIFT
);
439 /* normalization exclusion sets --------------------------------------------- */
442 * Normalization exclusion UnicodeSets are used for tailored normalization;
443 * see the comment near the beginning of this file.
445 * By specifying one or several sets of code points,
446 * those code points become inert for normalization.
/*
 * Returns the cached exclusion set for Hangul, creating it on first use.
 * The set is built as the code point range 0xac00..0xd7a3 and stored in
 * nxCache[UNORM_NX_HANGUL] only if that slot is still NULL; the slot's
 * content is what gets returned. On allocation failure errorCode is set to
 * U_MEMORY_ALLOCATION_ERROR.
 * NOTE(review): the locking calls and the handling of a set that lost the
 * race for the cache slot are not visible in this view.
 */
449 static const UnicodeSet
*
450 internalGetNXHangul(UErrorCode
&errorCode
) {
451 /* internal function, does not check for incoming U_FAILURE */
454 UMTX_CHECK(NULL
, (UBool
)(nxCache
[UNORM_NX_HANGUL
]!=NULL
), isCached
);
457 UnicodeSet
*set
=new UnicodeSet(0xac00, 0xd7a3);
459 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
464 if(nxCache
[UNORM_NX_HANGUL
]==NULL
) {
465 nxCache
[UNORM_NX_HANGUL
]=set
;
473 return nxCache
[UNORM_NX_HANGUL
];
476 /* unorm.cpp 1.116 had and used
477 static const UnicodeSet *
478 internalGetNXFromPattern(int32_t options, const char *pattern, UErrorCode &errorCode) {
483 /* get and set an exclusion set from a serialized UnicodeSet */
/*
 * Returns the cached exclusion set for `options`, building it on first use
 * from a serialized set stored in canonStartSets.
 * nxIndex selects a pair of adjacent entries in canonStartSets: entry
 * [nxIndex] is the start offset of the serialized set and the difference to
 * entry [nxIndex+1] is its length. The ranges are copied into a new
 * UnicodeSet, which is stored in nxCache[options] if that slot is still NULL.
 * Errors visible here: U_INVALID_FORMAT_ERROR when the serialized data is
 * rejected, U_MEMORY_ALLOCATION_ERROR when the set cannot be allocated.
 * `options` is used directly as the nxCache index, so callers must pass a
 * value within the cache bounds.
 * NOTE(review): locking and some control-flow lines are not visible in this view.
 */
484 static const UnicodeSet
*
485 internalGetSerializedNX(int32_t options
, int32_t nxIndex
, UErrorCode
&errorCode
) {
486 /* internal function, does not check for incoming U_FAILURE */
489 UMTX_CHECK(NULL
, (UBool
)(nxCache
[options
]!=NULL
), isCached
);
492 canonStartSets
!=NULL
&&
493 canonStartSets
[nxIndex
]!=0 && canonStartSets
[nxIndex
+1]>canonStartSets
[nxIndex
]
500 if( !uset_getSerializedSet(
502 canonStartSets
+canonStartSets
[nxIndex
],
503 canonStartSets
[nxIndex
+1]-canonStartSets
[nxIndex
])
505 errorCode
=U_INVALID_FORMAT_ERROR
;
509 /* turn the serialized set into a UnicodeSet */
510 set
=new UnicodeSet();
512 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
515 for(i
=0; uset_getSerializedRange(&sset
, i
, &start
, &end
); ++i
) {
516 set
->add(start
, end
);
520 if(nxCache
[options
]==NULL
) {
521 nxCache
[options
]=set
;
529 return nxCache
[options
];
/*
 * Returns the exclusion set for CJK compatibility ideographs by delegating to
 * internalGetSerializedNX() with _NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET.
 * NOTE(review): the other call arguments are not visible in this view.
 */
532 static const UnicodeSet
*
533 internalGetNXCJKCompat(UErrorCode
&errorCode
) {
534 /* build a set from [[:Ideographic:]&[:NFD_QC=No:]]=[CJK Ideographs]&[has canonical decomposition] */
535 return internalGetSerializedNX(
537 _NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET
,
/*
 * Returns the exclusion set for the Unicode version selected in `options`.
 * Only the bits under _NORM_OPTIONS_UNICODE_MASK are considered.
 * UNORM_UNICODE_3_2 maps to _NORM_SET_INDEX_NX_UNICODE32_OFFSET; a path that
 * sets U_ILLEGAL_ARGUMENT_ERROR is also visible (presumably for unsupported
 * values - the switch structure itself is not visible in this view).
 * The set itself is obtained from internalGetSerializedNX().
 */
541 static const UnicodeSet
*
542 internalGetNXUnicode(uint32_t options
, UErrorCode
&errorCode
) {
543 /* internal function, does not check for incoming U_FAILURE */
546 options
&=_NORM_OPTIONS_UNICODE_MASK
;
550 case UNORM_UNICODE_3_2
:
552 nxIndex
=_NORM_SET_INDEX_NX_UNICODE32_OFFSET
;
555 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
559 /* build a set with all code points that were not designated by the specified Unicode version */
560 return internalGetSerializedNX(options
, nxIndex
, errorCode
);
563 /* Get a decomposition exclusion set. The data must be loaded. */
/*
 * Returns the cached exclusion set for the given option bits, building it on
 * first use. `options` is first reduced to _NORM_OPTIONS_SETS_MASK.
 * Single-purpose requests are delegated: exactly UNORM_NX_HANGUL, exactly
 * UNORM_NX_CJK_COMPAT, or a Unicode-version field with no NX flag bits.
 * Any other combination creates a new UnicodeSet and consults each requested
 * basic set in turn (what is done with each one is not visible in this view);
 * the result is stored in nxCache[options] if that slot is still NULL.
 * Sets U_MEMORY_ALLOCATION_ERROR if the combined set cannot be allocated.
 * NOTE(review): locking and the failure path are not visible in this view.
 */
564 static const UnicodeSet
*
565 internalGetNX(int32_t options
, UErrorCode
&errorCode
) {
566 options
&=_NORM_OPTIONS_SETS_MASK
;
570 UMTX_CHECK(NULL
, (UBool
)(nxCache
[options
]!=NULL
), isCached
);
573 /* return basic sets */
574 if(options
==UNORM_NX_HANGUL
) {
575 return internalGetNXHangul(errorCode
);
577 if(options
==UNORM_NX_CJK_COMPAT
) {
578 return internalGetNXCJKCompat(errorCode
);
580 if((options
&_NORM_OPTIONS_UNICODE_MASK
)!=0 && (options
&_NORM_OPTIONS_NX_MASK
)==0) {
581 return internalGetNXUnicode(options
, errorCode
);
584 /* build a set from multiple subsets */
586 const UnicodeSet
*other
;
588 set
=new UnicodeSet();
590 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
594 if((options
&UNORM_NX_HANGUL
)!=0 && NULL
!=(other
=internalGetNXHangul(errorCode
))) {
597 if((options
&UNORM_NX_CJK_COMPAT
)!=0 && NULL
!=(other
=internalGetNXCJKCompat(errorCode
))) {
600 if((options
&_NORM_OPTIONS_UNICODE_MASK
)!=0 && NULL
!=(other
=internalGetNXUnicode(options
, errorCode
))) {
604 if(U_FAILURE(errorCode
)) {
610 if(nxCache
[options
]==NULL
) {
611 nxCache
[options
]=set
;
619 return nxCache
[options
];
/*
 * Front end for internalGetNX(): takes an early exit when errorCode already
 * indicates failure or when no bits under _NORM_OPTIONS_SETS_MASK are set
 * (the value returned on that path is not visible in this view); otherwise
 * forwards the masked options to internalGetNX().
 */
622 static inline const UnicodeSet
*
623 getNX(int32_t options
, UErrorCode
&errorCode
) {
624 if(U_FAILURE(errorCode
) || (options
&=_NORM_OPTIONS_SETS_MASK
)==0) {
625 /* incoming failure, or no decomposition exclusions requested */
628 return internalGetNX(options
, errorCode
);
/* Library-internal entry point for getNX(); pErrorCode is dereferenced without a visible NULL check. */
632 U_CFUNC
const UnicodeSet
*
633 unorm_getNX(int32_t options
, UErrorCode
*pErrorCode
) {
634 return getNX(options
, *pErrorCode
);
/* NULL-tolerant membership test: a NULL exclusion set contains nothing. */
638 nx_contains(const UnicodeSet
*nx
, UChar32 c
) {
639 return nx
!=NULL
&& nx
->contains(c
);
/*
 * NULL-tolerant membership test for a code point given as code units:
 * c2==0 means c alone is the code point, otherwise (c, c2) is combined with
 * U16_GET_SUPPLEMENTARY before the lookup.
 */
643 nx_contains(const UnicodeSet
*nx
, UChar c
, UChar c2
) {
644 return nx
!=NULL
&& nx
->contains(c2
==0 ? c
: U16_GET_SUPPLEMENTARY(c
, c2
));
647 /* other normalization primitives ------------------------------------------- */
649 /* get the canonical or compatibility decomposition for one character */
/*
 * Fetches the decomposition stored in the extra data for norm32.
 * Outputs: length (decomposition length after masking with
 * _NORM_DECOMP_LENGTH_MASK), cc and trailCC (lead and trail combining
 * classes, taken from the high and low byte of a combined value when
 * _NORM_DECOMP_FLAG_LENGTH_HAS_CC is set).
 * When norm32 and qcMask both carry _NORM_QC_NFKD and the raw length word is
 * >=0x100, the pointer is advanced past the canonical data to the
 * compatibility decomposition.
 * NOTE(review): the reads of the length word and of bothCCs, the else
 * branch and the return are not visible in this view.
 */
650 static inline const UChar
*
651 _decompose(uint32_t norm32
, uint32_t qcMask
, int32_t &length
,
652 uint8_t &cc
, uint8_t &trailCC
) {
653 const UChar
*p
=(const UChar
*)_getExtraData(norm32
);
656 if((norm32
&qcMask
&_NORM_QC_NFKD
)!=0 && length
>=0x100) {
657 /* use compatibility decomposition, skip canonical data */
658 p
+=((length
>>7)&1)+(length
&_NORM_DECOMP_LENGTH_MASK
);
662 if(length
&_NORM_DECOMP_FLAG_LENGTH_HAS_CC
) {
663 /* get the lead and trail cc's */
665 cc
=(uint8_t)(bothCCs
>>8);
666 trailCC
=(uint8_t)bothCCs
;
668 /* lead and trail cc's are both 0 */
672 length
&=_NORM_DECOMP_LENGTH_MASK
;
676 /* get the canonical decomposition for one character */
/*
 * Canonical-only overload: fetches the decomposition stored in the extra data
 * for norm32. Outputs: length (masked with _NORM_DECOMP_LENGTH_MASK), cc and
 * trailCC (high and low byte of a combined value when
 * _NORM_DECOMP_FLAG_LENGTH_HAS_CC is set).
 * NOTE(review): the reads of the length word and of bothCCs, the else
 * branch and the return are not visible in this view.
 */
677 static inline const UChar
*
678 _decompose(uint32_t norm32
, int32_t &length
,
679 uint8_t &cc
, uint8_t &trailCC
) {
680 const UChar
*p
=(const UChar
*)_getExtraData(norm32
);
683 if(length
&_NORM_DECOMP_FLAG_LENGTH_HAS_CC
) {
684 /* get the lead and trail cc's */
686 cc
=(uint8_t)(bothCCs
>>8);
687 trailCC
=(uint8_t)bothCCs
;
689 /* lead and trail cc's are both 0 */
693 length
&=_NORM_DECOMP_LENGTH_MASK
;
698 * Get the canonical decomposition for one code point.
699 * @param c code point
700 * @param buffer out-only buffer for algorithmic decompositions of Hangul
701 * @param length out-only, takes the length of the decomposition, if any
702 * @return pointer to decomposition, or 0 if none
/*
 * Returns the canonical decomposition of c.
 * Code points below indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE] take an early path
 * (its body is not visible in this view). Otherwise the norm trie is
 * consulted: if _NORM_QC_NFD is set, Hangul syllables are decomposed
 * arithmetically into `buffer` (L in [0], V in [1], optional T in [2]) and
 * all other code points use the stored decomposition from _decompose(),
 * which also sets *pLength.
 * NOTE(review): this function reads the loaded data without a visible
 * _haveData() check - callers presumably ensure data is loaded; confirm.
 */
705 U_CFUNC
const UChar
*
706 unorm_getCanonicalDecomposition(UChar32 c
, UChar buffer
[4], int32_t *pLength
) {
709 if(c
<indexes
[_NORM_INDEX_MIN_NFD_NO_MAYBE
]) {
714 UTRIE_GET32(&normTrie
, c
, norm32
);
715 if(norm32
&_NORM_QC_NFD
) {
716 if(isNorm32HangulOrJamo(norm32
)) {
717 /* Hangul syllable: decompose algorithmically */
722 c2
=(UChar
)(c%JAMO_T_COUNT
);
725 buffer
[2]=(UChar
)(JAMO_T_BASE
+c2
);
731 buffer
[1]=(UChar
)(JAMO_V_BASE
+c%JAMO_V_COUNT
);
732 buffer
[0]=(UChar
)(JAMO_L_BASE
+c
/JAMO_V_COUNT
);
735 /* normal decomposition */
737 return _decompose(norm32
, *pLength
, cc
, trailCC
);
745 * get the combining class of (c, c2)=*p++
746 * before: p<limit after: p<=limit
747 * if only one code unit is used, then c2==0
/*
 * Reads the next code point at p and returns its combining class, taken from
 * norm32 shifted right by _NORM_CC_SHIFT.
 * If the first unit's norm32 has no bits under _NORM_CC_MASK, or it is not a
 * lead-surrogate special value, an early path is taken (bodies not visible in
 * this view). For a lead surrogate followed (before limit) by a second
 * surrogate, the pair's norm32 is resolved first.
 * c and c2 receive the code units read; p is a reference and is advanced.
 */
749 static inline uint8_t
750 _getNextCC(const UChar
*&p
, const UChar
*limit
, UChar
&c
, UChar
&c2
) {
754 norm32
=_getNorm32(c
);
755 if((norm32
&_NORM_CC_MASK
)==0) {
759 if(!isNorm32LeadSurrogate(norm32
)) {
762 /* c is a lead surrogate, get the real norm32 */
763 if(p
!=limit
&& UTF_IS_SECOND_SURROGATE(c2
=*p
)) {
765 norm32
=_getNorm32FromSurrogatePair(norm32
, c2
);
772 return (uint8_t)(norm32
>>_NORM_CC_SHIFT
);
777 * read backwards and get norm32
778 * return 0 if the character is <minC
779 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
/*
 * Reads the code point that ends just before src (moving src backwards) and
 * returns its norm32.
 * Visible cases: a non-surrogate unit is looked up directly; a first
 * surrogate found here is unpaired; a second surrogate preceded (after
 * start) by a first surrogate forms a pair whose norm32 is resolved from the
 * lead's value, unless that value has no bits under `mask`; otherwise the
 * second surrogate is unpaired.
 * c receives the unit read first, c2 the lead surrogate of a pair.
 * NOTE(review): the minC comparison and the values returned on the
 * early/unpaired paths are not visible in this view.
 */
781 static inline uint32_t
782 _getPrevNorm32(const UChar
*start
, const UChar
*&src
,
783 uint32_t minC
, uint32_t mask
,
784 UChar
&c
, UChar
&c2
) {
790 /* check for a surrogate before getting norm32 to see if we need to predecrement further */
793 } else if(!UTF_IS_SURROGATE(c
)) {
794 return _getNorm32(c
);
795 } else if(UTF_IS_SURROGATE_FIRST(c
)) {
796 /* unpaired first surrogate */
798 } else if(src
!=start
&& UTF_IS_FIRST_SURROGATE(c2
=*(src
-1))) {
800 norm32
=_getNorm32(c2
);
802 if((norm32
&mask
)==0) {
803 /* all surrogate pairs with this lead surrogate have only irrelevant data */
806 /* norm32 must be a surrogate special */
807 return _getNorm32FromSurrogatePair(norm32
, c
);
810 /* unpaired second surrogate */
817 * get the combining class of (c, c2)=*--p
818 * before: start<p after: start<=p
/*
 * Returns the combining class of the code point ending just before p, using
 * _getPrevNorm32() with _NORM_MIN_WITH_LEAD_CC as the minimum code unit and
 * _NORM_CC_MASK as the relevance mask; p is moved backwards by that call.
 */
820 static inline uint8_t
821 _getPrevCC(const UChar
*start
, const UChar
*&p
) {
824 return (uint8_t)(_getPrevNorm32(start
, p
, _NORM_MIN_WITH_LEAD_CC
, _NORM_CC_MASK
, c
, c2
)>>_NORM_CC_SHIFT
);
828 * is this a safe boundary character for NF*D?
/*
 * Decides whether a character with this norm32 is a safe boundary for
 * decomposition.
 * - No bits under ccOrQCMask: safe.
 * - Regular norm32 with a decomposition under decompQCMask: the stored
 *   decomposition is fetched to inspect its combining classes (the test on
 *   the result is not visible in this view).
 * - Otherwise: safe exactly when the combining class bits are zero.
 */
832 _isNFDSafe(uint32_t norm32
, uint32_t ccOrQCMask
, uint32_t decompQCMask
) {
833 if((norm32
&ccOrQCMask
)==0) {
834 return TRUE
; /* cc==0 and no decomposition: this is NF*D safe */
837 /* inspect its decomposition - maybe a Hangul but not a surrogate here */
838 if(isNorm32Regular(norm32
) && (norm32
&decompQCMask
)!=0) {
842 /* decomposes, get everything from the variable-length extra data */
843 _decompose(norm32
, decompQCMask
, length
, cc
, trailCC
);
846 /* no decomposition (or Hangul), test the cc directly */
847 return (norm32
&_NORM_CC_MASK
)==0;
852 * is this (or does its decomposition begin with) a "true starter"?
853 * (cc==0 and NF*C_YES)
/*
 * Decides whether this character is, or decomposes into something beginning
 * with, a "true starter".
 * - No bits under ccOrQCMask: it is a true starter.
 * - If it decomposes under decompQCMask: the decomposition is fetched and its
 *   first code point is checked against the quick-check bits of ccOrQCMask.
 * NOTE(review): the use of the decomposition's lead cc and the remaining
 * return statements are not visible in this view.
 */
856 _isTrueStarter(uint32_t norm32
, uint32_t ccOrQCMask
, uint32_t decompQCMask
) {
857 if((norm32
&ccOrQCMask
)==0) {
858 return TRUE
; /* this is a true starter (could be Hangul or Jamo L) */
861 /* inspect its decomposition - not a Hangul or a surrogate here */
862 if((norm32
&decompQCMask
)!=0) {
867 /* decomposes, get everything from the variable-length extra data */
868 p
=_decompose(norm32
, decompQCMask
, length
, cc
, trailCC
);
870 uint32_t qcMask
=ccOrQCMask
&_NORM_QC_MASK
;
872 /* does it begin with NFC_YES? */
873 if((_getNorm32(p
, qcMask
)&qcMask
)==0) {
874 /* yes, the decomposition begins with a true starter */
/*
 * Returns the canonical combining class of c: the norm trie value shifted
 * right by _NORM_CC_SHIFT and truncated to 8 bits.
 * Data is loaded on demand with a local error code, so load failures are not
 * reported to the caller. NOTE(review): the fallback value used when data is
 * unavailable is not visible in this view.
 */
883 U_CAPI
uint8_t U_EXPORT2
884 u_getCombiningClass(UChar32 c
) {
885 UErrorCode errorCode
=U_ZERO_ERROR
;
886 if(_haveData(errorCode
)) {
889 UTRIE_GET32(&normTrie
, c
, norm32
);
890 return (uint8_t)(norm32
>>_NORM_CC_SHIFT
);
/*
 * Tests the _NORM_AUX_COMP_EX_MASK bit of c's aux trie value.
 * Requires loaded data with format version 2.1 or later.
 * NOTE(review): the result when that requirement is not met is not visible
 * in this view.
 */
896 U_CAPI UBool U_EXPORT2
897 unorm_internalIsFullCompositionExclusion(UChar32 c
) {
898 UErrorCode errorCode
=U_ZERO_ERROR
;
899 if(_haveData(errorCode
) && formatVersion_2_1
) {
902 UTRIE_GET16(&auxTrie
, c
, aux
);
903 return (UBool
)((aux
&_NORM_AUX_COMP_EX_MASK
)!=0);
/*
 * Returns TRUE when the _NORM_AUX_UNSAFE_MASK bit of c's aux trie value is
 * clear. Requires loaded data with format version 2.1 or later.
 * NOTE(review): the result when that requirement is not met is not visible
 * in this view.
 */
909 U_CAPI UBool U_EXPORT2
910 unorm_isCanonSafeStart(UChar32 c
) {
911 UErrorCode errorCode
=U_ZERO_ERROR
;
912 if(_haveData(errorCode
) && formatVersion_2_1
) {
915 UTRIE_GET16(&auxTrie
, c
, aux
);
916 return (UBool
)((aux
&_NORM_AUX_UNSAFE_MASK
)==0);
/*
 * Copies the 4-byte Unicode version of the loaded data into *versionInfo.
 * Nothing is written if the data cannot be loaded; in that case the error is
 * reported through pErrorCode by unorm_haveData().
 */
922 U_CAPI
void U_EXPORT2
923 unorm_getUnicodeVersion(UVersionInfo
*versionInfo
, UErrorCode
*pErrorCode
){
924 if(unorm_haveData(pErrorCode
)){
925 uprv_memcpy(*versionInfo
, dataVersion
, 4);
/*
 * Fills fillSet with the canonical start set of c.
 * Preconditions checked here: fillSet is non-NULL, c is at most 0x10ffff,
 * data is loaded and canonStartSets is available.
 * Two binary searches over tables inside canonStartSets are visible:
 *  - a table of { c, result } pairs, located right after the serialized
 *    sets; a result matching _NORM_CANON_SET_BMP_IS_INDEX under
 *    _NORM_CANON_SET_BMP_MASK is an index to a serialized set, any other
 *    result is itself the single member of the set;
 *  - a table of { high(c), low(c), result } triplets that follows the first
 *    table; the result is either an index to a serialized set or, combined
 *    with bits taken from the high word, a single code point.
 * Returns FALSE when c is not found.
 * NOTE(review): the branch choosing between the two tables, the updates of
 * start/limit inside the loops and several returns are not visible in this view.
 */
930 U_CAPI UBool U_EXPORT2
931 unorm_getCanonStartSet(UChar32 c
, USerializedSet
*fillSet
) {
932 UErrorCode errorCode
=U_ZERO_ERROR
;
933 if( fillSet
!=NULL
&& (uint32_t)c
<=0x10ffff &&
934 _haveData(errorCode
) && canonStartSets
!=NULL
936 const uint16_t *table
;
937 int32_t i
, start
, limit
;
940 * binary search for c
942 * There are two search tables,
943 * one for BMP code points and one for supplementary ones.
944 * See unormimp.h for details.
947 table
=canonStartSets
+canonStartSets
[_NORM_SET_INDEX_CANON_SETS_LENGTH
];
949 limit
=canonStartSets
[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH
];
951 /* each entry is a pair { c, result } */
952 while(start
<limit
-2) {
953 i
=(uint16_t)(((start
+limit
)/4)*2); /* (start+limit)/2 and address pairs */
962 if(c
==table
[start
]) {
964 if((i
&_NORM_CANON_SET_BMP_MASK
)==_NORM_CANON_SET_BMP_IS_INDEX
) {
965 /* result 01xxxxxx xxxxxx contains index x to a USerializedSet */
966 i
&=(_NORM_MAX_CANON_SETS
-1);
967 return uset_getSerializedSet(fillSet
,
969 canonStartSets
[_NORM_SET_INDEX_CANON_SETS_LENGTH
]-i
);
971 /* other result values are BMP code points for single-code point sets */
972 uset_setSerializedToOne(fillSet
, (UChar32
)i
);
977 uint16_t high
, low
, h
;
979 table
=canonStartSets
+canonStartSets
[_NORM_SET_INDEX_CANON_SETS_LENGTH
]+
980 canonStartSets
[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH
];
982 limit
=canonStartSets
[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH
];
984 high
=(uint16_t)(c
>>16);
987 /* each entry is a triplet { high(c), low(c), result } */
988 while(start
<limit
-3) {
989 i
=(uint16_t)(((start
+limit
)/6)*3); /* (start+limit)/2 and address triplets */
990 h
=table
[i
]&0x1f; /* high word */
991 if(high
<h
|| (high
==h
&& low
<table
[i
+1])) {
1000 if(high
==(h
&0x1f) && low
==table
[start
+1]) {
1003 /* the result is an index to a USerializedSet */
1004 return uset_getSerializedSet(fillSet
,
1006 canonStartSets
[_NORM_SET_INDEX_CANON_SETS_LENGTH
]-i
);
1009 * single-code point set {x} in
1010 * triplet { 100xxxxx 000hhhhh llllllll llllllll xxxxxxxx xxxxxxxx }
1012 i
|=((int32_t)h
&0x1f00)<<8; /* add high bits from high(c) */
1013 uset_setSerializedToOne(fillSet
, (UChar32
)i
);
1020 return FALSE
; /* not found */
/*
 * Writes the FC_NFKC_Closure string of c into dest and returns its length.
 * Argument checks visible here: pErrorCode must be non-NULL and not already a
 * failure; destCapacity must be >=0 and dest non-NULL when destCapacity>0,
 * otherwise U_ILLEGAL_ARGUMENT_ERROR is set.
 * Requires loaded data with format version 2.1 or later.
 * The aux trie value of c, masked with _NORM_AUX_FNC_MASK, is an index into
 * extraData where the string is stored. The string is copied only if it fits
 * in destCapacity; u_terminateUChars() produces the return value and handles
 * termination. A length of 0 is returned when there is no closure string.
 * NOTE(review): the decoding of the string length and the early-return values
 * are not visible in this view.
 */
1023 U_CAPI
int32_t U_EXPORT2
1024 u_getFC_NFKC_Closure(UChar32 c
, UChar
*dest
, int32_t destCapacity
, UErrorCode
*pErrorCode
) {
1027 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
1030 if(destCapacity
<0 || (dest
==NULL
&& destCapacity
>0)) {
1031 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
1034 if(!_haveData(*pErrorCode
) || !formatVersion_2_1
) {
1038 UTRIE_GET16(&auxTrie
, c
, aux
);
1039 aux
&=_NORM_AUX_FNC_MASK
;
1044 s
=(const UChar
*)(extraData
+aux
);
1046 /* s points to the single-unit string */
1052 if(0<length
&& length
<=destCapacity
) {
1053 uprv_memcpy(dest
, s
, length
*U_SIZEOF_UCHAR
);
1055 return u_terminateUChars(dest
, destCapacity
, length
, pErrorCode
);
1057 return u_terminateUChars(dest
, destCapacity
, 0, pErrorCode
);
1061 /* Is c an NF<mode>-skippable code point? See unormimp.h. */
/*
 * Decides whether c is skippable for the given normalization mode.
 * A per-mode mask is chosen first: combining class plus the mode's
 * decomposition bit for the decomposing modes, and combining class,
 * _NORM_COMBINES_ANY and the mode's "no" quick-check bit for the composing
 * modes. A separate path using the FCD trie is also visible.
 * Then, in order:
 *  - any masked bit set in norm32: not skippable;
 *  - mode below UNORM_NFC: skippable;
 *  - no canonical decomposition: skippable;
 *  - Hangul: skippable unless isHangulWithoutJamoT(c);
 *  - otherwise the _NORM_AUX_NFC_SKIP_F_MASK bit of the aux trie decides,
 *    and without format version 2.2 the answer is FALSE.
 * NOTE(review): the switch on mode and the result when data is unavailable
 * are not visible in this view.
 */
1062 U_CAPI UBool U_EXPORT2
1063 unorm_isNFSkippable(UChar32 c
, UNormalizationMode mode
) {
1064 UErrorCode errorCode
;
1065 uint32_t norm32
, mask
;
1068 errorCode
=U_ZERO_ERROR
;
1069 if(!_haveData(errorCode
)) {
1073 /* handle trivial cases; set the comparison mask for the normal ones */
1078 mask
=_NORM_CC_MASK
|_NORM_QC_NFD
;
1081 mask
=_NORM_CC_MASK
|_NORM_QC_NFKD
;
1084 /* case UNORM_FCC: */
1085 mask
=_NORM_CC_MASK
|_NORM_COMBINES_ANY
|(_NORM_QC_NFC
&_NORM_QC_ANY_NO
);
1088 mask
=_NORM_CC_MASK
|_NORM_COMBINES_ANY
|(_NORM_QC_NFKC
&_NORM_QC_ANY_NO
);
1091 /* FCD: skippable if lead cc==0 and trail cc<=1 */
1092 UTRIE_GET16(&fcdTrie
, c
, fcd
);
1098 /* check conditions (a)..(e), see unormimp.h */
1099 UTRIE_GET32(&normTrie
, c
, norm32
);
1100 if((norm32
&mask
)!=0) {
1101 return FALSE
; /* fails (a)..(e), not skippable */
1104 if(mode
<UNORM_NFC
) {
1105 return TRUE
; /* NF*D, passed (a)..(c), is skippable */
1108 /* NF*C/FCC, passed (a)..(e) */
1109 if((norm32
&_NORM_QC_NFD
)==0) {
1110 return TRUE
; /* no canonical decomposition, is skippable */
1113 /* check Hangul syllables algorithmically */
1114 if(isNorm32HangulOrJamo(norm32
)) {
1115 /* Jamo passed (a)..(e) above, must be Hangul */
1116 return !isHangulWithoutJamoT((UChar
)c
); /* LVT are skippable, LV are not */
1119 /* if(mode<=UNORM_NFKC) { -- enable when implementing FCC */
1120 /* NF*C, test (f) flag */
1121 if(!formatVersion_2_2
) {
1122 return FALSE
; /* no (f) data, say not skippable to be safe */
1125 UTRIE_GET16(&auxTrie
, c
, aux
);
1126 return (aux
&_NORM_AUX_NFC_SKIP_F_MASK
)==0; /* TRUE=skippable if the (f) flag is not set */
1128 /* } else { FCC, test fcd<=1 instead of the above } */
/*
 * Adds to the caller's set (through sa) every code point at which a
 * normalization property value may change:
 *  - the start of each same-value range of the norm and FCD tries, and of
 *    the aux trie when the data has format version 2.1 or later;
 *  - for the Hangul block, every JAMO_T_COUNT-th code point starting at
 *    HANGUL_BASE, plus the code point after each of those;
 *  - HANGUL_BASE+HANGUL_COUNT, the first code point after the block.
 * Does nothing if *pErrorCode already indicates failure or the data cannot be
 * loaded. pErrorCode is dereferenced without a visible NULL check.
 */
1131 U_CAPI
void U_EXPORT2
1132 unorm_addPropertyStarts(USetAdder
*sa
, UErrorCode
*pErrorCode
) {
1135 if(U_FAILURE(*pErrorCode
) || !_haveData(*pErrorCode
)) {
1139 /* add the start code point of each same-value range of each trie */
1140 utrie_enum(&normTrie
, NULL
, _enumPropertyStartsRange
, sa
);
1141 utrie_enum(&fcdTrie
, NULL
, _enumPropertyStartsRange
, sa
);
1142 if(formatVersion_2_1
) {
1143 utrie_enum(&auxTrie
, NULL
, _enumPropertyStartsRange
, sa
);
1146 /* add Hangul LV syllables and LV+1 because of skippables */
1147 for(c
=HANGUL_BASE
; c
<HANGUL_BASE
+HANGUL_COUNT
; c
+=JAMO_T_COUNT
) {
1148 sa
->add(sa
->set
, c
);
1149 sa
->add(sa
->set
, c
+1);
1151 sa
->add(sa
->set
, HANGUL_BASE
+HANGUL_COUNT
); /* add Hangul+1 to continue with other properties */
1154 U_CAPI UNormalizationCheckResult U_EXPORT2
1155 unorm_getQuickCheck(UChar32 c
, UNormalizationMode mode
) {
1156 static const uint32_t qcMask
[UNORM_MODE_COUNT
]={
1157 0, 0, _NORM_QC_NFD
, _NORM_QC_NFKD
, _NORM_QC_NFC
, _NORM_QC_NFKC
1160 UErrorCode errorCode
;
1163 errorCode
=U_ZERO_ERROR
;
1164 if(!_haveData(errorCode
)) {
1168 UTRIE_GET32(&normTrie
, c
, norm32
);
1169 norm32
&=qcMask
[mode
];
1173 } else if(norm32
&_NORM_QC_ANY_NO
) {
1175 } else /* _NORM_QC_ANY_MAYBE */ {
1180 U_CAPI
uint16_t U_EXPORT2
1181 unorm_getFCD16FromCodePoint(UChar32 c
) {
1182 UErrorCode errorCode
;
1185 errorCode
=U_ZERO_ERROR
;
1186 if(!_haveData(errorCode
)) {
1190 UTRIE_GET16(&fcdTrie
, c
, fcd
);
1194 /* reorder UTF-16 in-place -------------------------------------------------- */
1197 * simpler, single-character version of _mergeOrdered() -
1198 * bubble-insert one single code point into the preceding string
1199 * which is already canonically ordered
1200 * (c, c2) may or may not yet have been inserted at [current..p[
1202 * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2)
1204 * before: [start..current[ is already ordered, and
1205 * [current..p[ may or may not hold (c, c2) but
1206 * must be exactly the same length as (c, c2)
1207 * after: [start..p[ is ordered
1209 * returns the trailing combining class
1212 _insertOrdered(const UChar
*start
, UChar
*current
, UChar
*p
,
1213 UChar c
, UChar c2
, uint8_t cc
) {
1214 const UChar
*pBack
, *pPreBack
;
1216 uint8_t prevCC
, trailCC
=cc
;
1218 if(start
<current
&& cc
!=0) {
1219 /* search for the insertion point where cc>=prevCC */
1220 pPreBack
=pBack
=current
;
1221 prevCC
=_getPrevCC(start
, pPreBack
);
1223 /* this will be the last code point, so keep its cc */
1226 while(start
<pPreBack
) {
1227 prevCC
=_getPrevCC(start
, pPreBack
);
1235 * this is where we are right now with all these pointers:
1236 * [start..pPreBack[ 0..? code points that we can ignore
1237 * [pPreBack..pBack[ 0..1 code points with prevCC<=cc
1238 * [pBack..current[ 0..n code points with >cc, move up to insert (c, c2)
1239 * [current..p[ 1 code point (c, c2) with cc
1242 /* move the code units in between up */
1246 } while(pBack
!=current
);
1250 /* insert (c, c2) */
1256 /* we know the cc of the last code point */
1261 * merge two UTF-16 string parts together
1262 * to canonically order (order by combining classes) their concatenation
1264 * the two strings may already be adjacent, so that the merging is done in-place
1265 * if the two strings are not adjacent, then the buffer holding the first one
1266 * must be large enough
1267 * the second string may or may not be ordered in itself
1269 * before: [start..current[ is already ordered, and
1270 * [next..limit[ may be ordered in itself, but
1271 * is not in relation to [start..current[
1272 * after: [start..current+(limit-next)[ is ordered
1274 * the algorithm is a simple bubble-sort that takes the characters from *next++
1275 * and inserts them in correct combining class order into the preceding part
1278 * since this function is called much less often than the single-code point
1279 * _insertOrdered(), it just uses that for easier maintenance
1280 * (see file version from before 2001aug31 for a more optimized version)
1282 * returns the trailing combining class
1285 _mergeOrdered(UChar
*start
, UChar
*current
,
1286 const UChar
*next
, const UChar
*limit
, UBool isOrdered
=TRUE
) {
1289 uint8_t cc
, trailCC
=0;
1292 adjacent
= current
==next
;
1294 if(start
!=current
|| !isOrdered
) {
1296 cc
=_getNextCC(next
, limit
, c
, c2
);
1298 /* does not bubble back */
1301 current
=(UChar
*)next
;
1314 r
=current
+(c2
==0 ? 1 : 2);
1315 trailCC
=_insertOrdered(start
, current
, r
, c
, c2
, cc
);
1322 /* we know the cc of the last code point */
1326 /* copy the second string part */
1329 } while(next
!=limit
);
1332 return _getPrevCC(start
, limit
);
1336 /* find the last true starter in [start..src[ and return the pointer to it */
1337 static const UChar
*
1338 _findPreviousStarter(const UChar
*start
, const UChar
*src
,
1339 uint32_t ccOrQCMask
, uint32_t decompQCMask
, UChar minNoMaybe
) {
1344 norm32
=_getPrevNorm32(start
, src
, minNoMaybe
, ccOrQCMask
|decompQCMask
, c
, c2
);
1345 if(_isTrueStarter(norm32
, ccOrQCMask
, decompQCMask
)) {
1352 /* find the first true starter in [src..limit[ and return the pointer to it */
1353 static const UChar
*
1354 _findNextStarter(const UChar
*src
, const UChar
*limit
,
1355 uint32_t qcMask
, uint32_t decompQCMask
, UChar minNoMaybe
) {
1357 uint32_t norm32
, ccOrQCMask
;
1360 uint8_t cc
, trailCC
;
1362 ccOrQCMask
=_NORM_CC_MASK
|qcMask
;
1366 break; /* end of string */
1370 break; /* catches NUL terminater, too */
1373 norm32
=_getNorm32(c
);
1374 if((norm32
&ccOrQCMask
)==0) {
1375 break; /* true starter */
1378 if(isNorm32LeadSurrogate(norm32
)) {
1379 /* c is a lead surrogate, get the real norm32 */
1380 if((src
+1)==limit
|| !UTF_IS_SECOND_SURROGATE(c2
=*(src
+1))) {
1381 break; /* unmatched first surrogate: counts as a true starter */
1383 norm32
=_getNorm32FromSurrogatePair(norm32
, c2
);
1385 if((norm32
&ccOrQCMask
)==0) {
1386 break; /* true starter */
1392 /* (c, c2) is not a true starter but its decomposition may be */
1393 if(norm32
&decompQCMask
) {
1394 /* (c, c2) decomposes, get everything from the variable-length extra data */
1395 p
=_decompose(norm32
, decompQCMask
, length
, cc
, trailCC
);
1397 /* get the first character's norm32 to check if it is a true starter */
1398 if(cc
==0 && (_getNorm32(p
, qcMask
)&qcMask
)==0) {
1399 break; /* true starter */
1403 src
+= c2
==0 ? 1 : 2; /* not a true starter, continue */
1409 /* make NFD & NFKD ---------------------------------------------------------- */
1411 U_CAPI
int32_t U_EXPORT2
1412 unorm_getDecomposition(UChar32 c
, UBool compat
,
1413 UChar
*dest
, int32_t destCapacity
) {
1414 UErrorCode errorCode
=U_ZERO_ERROR
;
1415 if( (uint32_t)c
<=0x10ffff &&
1416 _haveData(errorCode
) &&
1417 ((dest
!=NULL
&& destCapacity
>0) || destCapacity
==0)
1419 uint32_t norm32
, qcMask
;
1425 minNoMaybe
=(UChar32
)indexes
[_NORM_INDEX_MIN_NFD_NO_MAYBE
];
1426 qcMask
=_NORM_QC_NFD
;
1428 minNoMaybe
=(UChar32
)indexes
[_NORM_INDEX_MIN_NFKD_NO_MAYBE
];
1429 qcMask
=_NORM_QC_NFKD
;
1434 if(destCapacity
>0) {
1441 UTRIE_GET32(&normTrie
, c
, norm32
);
1442 if((norm32
&qcMask
)==0) {
1443 /* simple case: no decomposition */
1445 if(destCapacity
>0) {
1450 if(destCapacity
>=2) {
1451 dest
[0]=UTF16_LEAD(c
);
1452 dest
[1]=UTF16_TRAIL(c
);
1456 } else if(isNorm32HangulOrJamo(norm32
)) {
1457 /* Hangul syllable: decompose algorithmically */
1462 c2
=(UChar
)(c%JAMO_T_COUNT
);
1465 if(destCapacity
>=3) {
1466 dest
[2]=(UChar
)(JAMO_T_BASE
+c2
);
1473 if(destCapacity
>=2) {
1474 dest
[1]=(UChar
)(JAMO_V_BASE
+c%JAMO_V_COUNT
);
1475 dest
[0]=(UChar
)(JAMO_L_BASE
+c
/JAMO_V_COUNT
);
1479 /* c decomposes, get everything from the variable-length extra data */
1480 const UChar
*p
, *limit
;
1481 uint8_t cc
, trailCC
;
1483 p
=_decompose(norm32
, qcMask
, length
, cc
, trailCC
);
1484 if(length
<=destCapacity
) {
1498 _decompose(UChar
*dest
, int32_t destCapacity
,
1499 const UChar
*src
, int32_t srcLength
,
1500 UBool compat
, const UnicodeSet
*nx
,
1501 uint8_t &outTrailCC
) {
1503 const UChar
*limit
, *prevSrc
, *p
;
1504 uint32_t norm32
, ccOrQCMask
, qcMask
;
1505 int32_t destIndex
, reorderStartIndex
, length
;
1506 UChar c
, c2
, minNoMaybe
;
1507 uint8_t cc
, prevCC
, trailCC
;
1510 minNoMaybe
=(UChar
)indexes
[_NORM_INDEX_MIN_NFD_NO_MAYBE
];
1511 qcMask
=_NORM_QC_NFD
;
1513 minNoMaybe
=(UChar
)indexes
[_NORM_INDEX_MIN_NFKD_NO_MAYBE
];
1514 qcMask
=_NORM_QC_NFKD
;
1518 ccOrQCMask
=_NORM_CC_MASK
|qcMask
;
1519 destIndex
=reorderStartIndex
=0;
1522 /* avoid compiler warnings */
1527 /* string with length */
1528 limit
=src
+srcLength
;
1529 } else /* srcLength==-1 */ {
1530 /* zero-terminated string */
1537 /* count code units below the minimum or with irrelevant data for the quick check */
1540 while((c
=*src
)<minNoMaybe
? c
!=0 : ((norm32
=_getNorm32(c
))&ccOrQCMask
)==0) {
1545 while(src
!=limit
&& ((c
=*src
)<minNoMaybe
|| ((norm32
=_getNorm32(c
))&ccOrQCMask
)==0)) {
1551 /* copy these code units all at once */
1553 length
=(int32_t)(src
-prevSrc
);
1554 if((destIndex
+length
)<=destCapacity
) {
1555 uprv_memcpy(dest
+destIndex
, prevSrc
, length
*U_SIZEOF_UCHAR
);
1558 reorderStartIndex
=destIndex
;
1561 /* end of source reached? */
1562 if(limit
==NULL
? c
==0 : src
==limit
) {
1566 /* c already contains *src and norm32 is set for it, increment src */
1569 /* check one above-minimum, relevant code unit */
1571 * generally, set p and length to the decomposition string
1572 * in simple cases, p==NULL and (c, c2) will hold the length code units to append
1573 * in all cases, set cc to the lead and trailCC to the trail combining class
1575 * the following merge-sort of the current character into the preceding,
1576 * canonically ordered result text will use the optimized _insertOrdered()
1577 * if there is only one single code point to process;
1578 * this is indicated with p==NULL, and (c, c2) is the character to insert
1579 * ((c, 0) for a BMP character and (lead surrogate, trail surrogate)
1580 * for a supplementary character)
1581 * otherwise, p[length] is merged in with _mergeOrdered()
1583 if(isNorm32HangulOrJamo(norm32
)) {
1584 if(nx_contains(nx
, c
)) {
1589 /* Hangul syllable: decompose algorithmically */
1595 c2
=(UChar
)(c%JAMO_T_COUNT
);
1598 buffer
[2]=(UChar
)(JAMO_T_BASE
+c2
);
1604 buffer
[1]=(UChar
)(JAMO_V_BASE
+c%JAMO_V_COUNT
);
1605 buffer
[0]=(UChar
)(JAMO_L_BASE
+c
/JAMO_V_COUNT
);
1608 if(isNorm32Regular(norm32
)) {
1612 /* c is a lead surrogate, get the real norm32 */
1613 if(src
!=limit
&& UTF_IS_SECOND_SURROGATE(c2
=*src
)) {
1616 norm32
=_getNorm32FromSurrogatePair(norm32
, c2
);
1624 /* get the decomposition and the lead and trail cc's */
1625 if(nx_contains(nx
, c
, c2
)) {
1626 /* excluded: norm32==0 */
1629 } else if((norm32
&qcMask
)==0) {
1630 /* c does not decompose */
1631 cc
=trailCC
=(uint8_t)(norm32
>>_NORM_CC_SHIFT
);
1634 /* c decomposes, get everything from the variable-length extra data */
1635 p
=_decompose(norm32
, qcMask
, length
, cc
, trailCC
);
1637 /* fastpath a single code unit from decomposition */
1645 /* append the decomposition to the destination buffer, assume length>0 */
1646 if((destIndex
+length
)<=destCapacity
) {
1647 UChar
*reorderSplit
=dest
+destIndex
;
1649 /* fastpath: single code point */
1650 if(cc
!=0 && cc
<prevCC
) {
1651 /* (c, c2) is out of order with respect to the preceding text */
1653 trailCC
=_insertOrdered(dest
+reorderStartIndex
, reorderSplit
, dest
+destIndex
, c
, c2
, cc
);
1655 /* just append (c, c2) */
1656 dest
[destIndex
++]=c
;
1658 dest
[destIndex
++]=c2
;
1662 /* general: multiple code points (ordered by themselves) from decomposition */
1663 if(cc
!=0 && cc
<prevCC
) {
1664 /* the decomposition is out of order with respect to the preceding text */
1666 trailCC
=_mergeOrdered(dest
+reorderStartIndex
, reorderSplit
, p
, p
+length
);
1668 /* just append the decomposition */
1670 dest
[destIndex
++]=*p
++;
1671 } while(--length
>0);
1675 /* buffer overflow */
1676 /* keep incrementing the destIndex for preflighting */
1682 reorderStartIndex
=destIndex
;
1690 U_CAPI
int32_t U_EXPORT2
1691 unorm_decompose(UChar
*dest
, int32_t destCapacity
,
1692 const UChar
*src
, int32_t srcLength
,
1693 UBool compat
, int32_t options
,
1694 UErrorCode
*pErrorCode
) {
1695 const UnicodeSet
*nx
;
1699 if(!_haveData(*pErrorCode
)) {
1703 nx
=getNX(options
, *pErrorCode
);
1704 if(U_FAILURE(*pErrorCode
)) {
1708 destIndex
=_decompose(dest
, destCapacity
,
1713 return u_terminateUChars(dest
, destCapacity
, destIndex
, pErrorCode
);
1716 /* make NFC & NFKC ---------------------------------------------------------- */
1718 /* get the composition properties of the next character */
1719 static inline uint32_t
1720 _getNextCombining(UChar
*&p
, const UChar
*limit
,
1721 UChar
&c
, UChar
&c2
,
1722 uint16_t &combiningIndex
, uint8_t &cc
,
1723 const UnicodeSet
*nx
) {
1724 uint32_t norm32
, combineFlags
;
1726 /* get properties */
1728 norm32
=_getNorm32(c
);
1730 /* preset output values for most characters */
1735 if((norm32
&(_NORM_CC_MASK
|_NORM_COMBINES_ANY
))==0) {
1738 if(isNorm32Regular(norm32
)) {
1739 /* set cc etc. below */
1740 } else if(isNorm32HangulOrJamo(norm32
)) {
1741 /* a compatibility decomposition contained Jamos */
1742 combiningIndex
=(uint16_t)(0xfff0|(norm32
>>_NORM_EXTRA_SHIFT
));
1743 return norm32
&_NORM_COMBINES_ANY
;
1745 /* c is a lead surrogate, get the real norm32 */
1746 if(p
!=limit
&& UTF_IS_SECOND_SURROGATE(c2
=*p
)) {
1748 norm32
=_getNorm32FromSurrogatePair(norm32
, c2
);
1755 if(nx_contains(nx
, c
, c2
)) {
1756 return 0; /* excluded: norm32==0 */
1759 cc
=(uint8_t)(norm32
>>_NORM_CC_SHIFT
);
1761 combineFlags
=norm32
&_NORM_COMBINES_ANY
;
1762 if(combineFlags
!=0) {
1763 combiningIndex
=*(_getExtraData(norm32
)-1);
1765 return combineFlags
;
1770 * given a composition-result starter (c, c2) - which means its cc==0,
1771 * it combines forward, it has extra data, its norm32!=0,
1772 * it is not a Hangul or Jamo,
1773 * get just its combineFwdIndex
1775 * norm32(c) is special if and only if c2!=0
1777 static inline uint16_t
1778 _getCombiningIndexFromStarter(UChar c
, UChar c2
) {
1781 norm32
=_getNorm32(c
);
1783 norm32
=_getNorm32FromSurrogatePair(norm32
, c2
);
1785 return *(_getExtraData(norm32
)-1);
1789 * Find the recomposition result for
1790 * a forward-combining character
1791 * (specified with a pointer to its part of the combiningTable[])
1792 * and a backward-combining character
1793 * (specified with its combineBackIndex).
1795 * If these two characters combine, then set (value, value2)
1796 * with the code unit(s) of the composition character.
1801 * >1 combine, and the composition is a forward-combining starter
1803 * See unormimp.h for a description of the composition table format.
1805 static inline uint16_t
1806 _combine(const uint16_t *table
, uint16_t combineBackIndex
,
1807 uint16_t &value
, uint16_t &value2
) {
1810 /* search in the starter's composition table */
1813 if(key
>=combineBackIndex
) {
1816 table
+= *table
&0x8000 ? 2 : 1;
1819 /* mask off bit 15, the last-entry-in-the-list flag */
1820 if((key
&0x7fff)==combineBackIndex
) {
1821 /* found! combine! */
1824 /* is the composition a starter that combines forward? */
1825 key
=(uint16_t)((value
&0x2000)+1);
1827 /* get the composition result code point from the variable-length result value */
1830 /* surrogate pair composition result */
1831 value
=(uint16_t)((value
&0x3ff)|0xd800);
1834 /* BMP composition result U+2000..U+ffff */
1839 /* BMP composition result U+0000..U+1fff */
1852 _composeHangul(UChar prev
, UChar c
, uint32_t norm32
, const UChar
*&src
, const UChar
*limit
,
1853 UBool compat
, UChar
*dest
, const UnicodeSet
*nx
) {
1854 if(isJamoVTNorm32JamoV(norm32
)) {
1855 /* c is a Jamo V, compose with previous Jamo L and following Jamo T */
1856 prev
=(UChar
)(prev
-JAMO_L_BASE
);
1857 if(prev
<JAMO_L_COUNT
) {
1858 c
=(UChar
)(HANGUL_BASE
+(prev
*JAMO_V_COUNT
+(c
-JAMO_V_BASE
))*JAMO_T_COUNT
);
1860 /* check if the next character is a Jamo T (normal or compatibility) */
1865 if((t
=(UChar
)(next
-JAMO_T_BASE
))<JAMO_T_COUNT
) {
1870 /* if NFKC, then check for compatibility Jamo T (BMP only) */
1871 norm32
=_getNorm32(next
);
1872 if(isNorm32Regular(norm32
) && (norm32
&_NORM_QC_NFKD
)) {
1875 uint8_t cc
, trailCC
;
1877 p
=_decompose(norm32
, _NORM_QC_NFKD
, length
, cc
, trailCC
);
1878 if(length
==1 && (t
=(UChar
)(*p
-JAMO_T_BASE
))<JAMO_T_COUNT
) {
1879 /* compatibility Jamo T */
1886 if(nx_contains(nx
, c
)) {
1887 if(!isHangulWithoutJamoT(c
)) {
1888 --src
; /* undo ++src from reading the Jamo T */
1897 } else if(isHangulWithoutJamoT(prev
)) {
1898 /* c is a Jamo T, compose with previous Hangul LV that does not contain a Jamo T */
1899 c
=(UChar
)(prev
+(c
-JAMO_T_BASE
));
1900 if(nx_contains(nx
, c
)) {
1912 * recompose the characters in [p..limit[
1913 * (which is in NFD - decomposed and canonically ordered),
1914 * adjust limit, and return the trailing cc
1916 * since for NFKC we may get Jamos in decompositions, we need to
1917 * recompose those too
1919 * note that recomposition never lengthens the text:
1920 * any character consists of either one or two code units;
1921 * a composition may contain at most one more code unit than the original starter,
1922 * while the combining mark that is removed has at least one code unit
1925 _recompose(UChar
*p
, UChar
*&limit
, int32_t options
, const UnicodeSet
*nx
) {
1926 UChar
*starter
, *pRemove
, *q
, *r
;
1927 uint32_t combineFlags
;
1929 uint16_t combineFwdIndex
, combineBackIndex
;
1930 uint16_t result
, value
, value2
;
1932 UBool starterIsSupplementary
;
1934 starter
=NULL
; /* no starter */
1935 combineFwdIndex
=0; /* will not be used until starter!=NULL - avoid compiler warnings */
1936 combineBackIndex
=0; /* will always be set if combineFlags!=0 - avoid compiler warnings */
1937 value
=value2
=0; /* always set by _combine() before used - avoid compiler warnings */
1938 starterIsSupplementary
=FALSE
; /* will not be used until starter!=NULL - avoid compiler warnings */
1942 combineFlags
=_getNextCombining(p
, limit
, c
, c2
, combineBackIndex
, cc
, nx
);
1943 if((combineFlags
&_NORM_COMBINES_BACK
) && starter
!=NULL
) {
1944 if(combineBackIndex
&0x8000) {
1945 /* c is a Jamo V/T, see if we can compose it with the previous character */
1946 /* for the PRI #29 fix, check that there is no intervening combining mark */
1947 if((options
&UNORM_BEFORE_PRI_29
) || prevCC
==0) {
1948 pRemove
=NULL
; /* NULL while no Hangul composition */
1951 if(combineBackIndex
==0xfff2) {
1952 /* Jamo V, compose with previous Jamo L and following Jamo T */
1953 c2
=(UChar
)(c2
-JAMO_L_BASE
);
1954 if(c2
<JAMO_L_COUNT
) {
1956 c
=(UChar
)(HANGUL_BASE
+(c2
*JAMO_V_COUNT
+(c
-JAMO_V_BASE
))*JAMO_T_COUNT
);
1957 if(p
!=limit
&& (c2
=(UChar
)(*p
-JAMO_T_BASE
))<JAMO_T_COUNT
) {
1961 /* the result is an LV syllable, which is a starter (unlike LVT) */
1962 combineFlags
=_NORM_COMBINES_FWD
;
1964 if(!nx_contains(nx
, c
)) {
1968 if(!isHangulWithoutJamoT(c
)) {
1969 --p
; /* undo the ++p from reading the Jamo T */
1971 /* c is modified but not used any more -- c=*(p-1); -- re-read the Jamo V/T */
1977 * Normally, the following can not occur:
1978 * Since the input is in NFD, there are no Hangul LV syllables that
1979 * a Jamo T could combine with.
1980 * All Jamo Ts are combined above when handling Jamo Vs.
1982 * However, before the PRI #29 fix, this can occur due to
1983 * an intervening combining mark between the Hangul LV and the Jamo T.
1986 /* Jamo T, compose with previous Hangul that does not have a Jamo T */
1987 if(isHangulWithoutJamoT(c2
)) {
1988 c2
+=(UChar
)(c
-JAMO_T_BASE
);
1989 if(!nx_contains(nx
, c2
)) {
1997 /* remove the Jamo(s) */
2007 c2
=0; /* c2 held *starter temporarily */
2009 if(combineFlags
!=0) {
2011 * not starter=NULL because the composition is a Hangul LV syllable
2012 * and might combine once more (but only before the PRI #29 fix)
2020 /* the composition is a Hangul LV syllable which is a starter that combines forward */
2021 combineFwdIndex
=0xfff0;
2023 /* we combined; continue with looking for compositions */
2029 * now: cc==0 and the combining index does not include "forward" ->
2030 * the rest of the loop body will reset starter to NULL;
2031 * technically, a composed Hangul syllable is a starter, but it
2032 * does not combine forward now that we have consumed all eligible Jamos;
2033 * for Jamo V/T, combineFlags does not contain _NORM_COMBINES_FWD
2037 /* the starter is not a Hangul LV or Jamo V/T and */
2038 !(combineFwdIndex
&0x8000) &&
2039 /* the combining mark is not blocked and */
2040 ((options
&UNORM_BEFORE_PRI_29
) ?
2041 (prevCC
!=cc
|| prevCC
==0) :
2042 (prevCC
<cc
|| prevCC
==0)) &&
2043 /* the starter and the combining mark (c, c2) do combine and */
2044 0!=(result
=_combine(combiningTable
+combineFwdIndex
, combineBackIndex
, value
, value2
)) &&
2045 /* the composition result is not excluded */
2046 !nx_contains(nx
, value
, value2
)
2048 /* replace the starter with the composition, remove the combining mark */
2049 pRemove
= c2
==0 ? p
-1 : p
-2; /* pointer to the combining mark */
2051 /* replace the starter with the composition */
2052 *starter
=(UChar
)value
;
2053 if(starterIsSupplementary
) {
2055 /* both are supplementary */
2056 *(starter
+1)=(UChar
)value2
;
2058 /* the composition is shorter than the starter, move the intermediate characters forward one */
2059 starterIsSupplementary
=FALSE
;
2067 } else if(value2
!=0) {
2068 /* the composition is longer than the starter, move the intermediate characters back one */
2069 starterIsSupplementary
=TRUE
;
2070 ++starter
; /* temporarily increment for the loop boundary */
2076 *starter
=(UChar
)value2
;
2077 --starter
; /* undo the temporary increment */
2078 /* } else { both are on the BMP, nothing more to do */
2081 /* remove the combining mark by moving the following text over it */
2092 /* keep prevCC because we removed the combining mark */
2099 /* is the composition a starter that combines forward? */
2101 combineFwdIndex
=_getCombiningIndexFromStarter((UChar
)value
, (UChar
)value2
);
2106 /* we combined; continue with looking for compositions */
2111 /* no combination this time */
2117 /* if (c, c2) did not combine, then check if it is a starter */
2119 /* found a new starter; combineFlags==0 if (c, c2) is excluded */
2120 if(combineFlags
&_NORM_COMBINES_FWD
) {
2121 /* it may combine with something, prepare for it */
2123 starterIsSupplementary
=FALSE
;
2126 starterIsSupplementary
=TRUE
;
2129 combineFwdIndex
=combineBackIndex
;
2131 /* it will not combine with anything */
2134 } else if(options
&_NORM_OPTIONS_COMPOSE_CONTIGUOUS
) {
2135 /* FCC: no discontiguous compositions; any intervening character blocks */
2141 /* decompose and recompose [prevStarter..src[ */
2142 static const UChar
*
2143 _composePart(UChar
*stackBuffer
, UChar
*&buffer
, int32_t &bufferCapacity
, int32_t &length
,
2144 const UChar
*prevStarter
, const UChar
*src
,
2146 int32_t options
, const UnicodeSet
*nx
,
2147 UErrorCode
*pErrorCode
) {
2148 UChar
*recomposeLimit
;
2152 compat
=(UBool
)((options
&_NORM_OPTIONS_COMPAT
)!=0);
2154 /* decompose [prevStarter..src[ */
2155 length
=_decompose(buffer
, bufferCapacity
,
2156 prevStarter
, src
-prevStarter
,
2159 if(length
>bufferCapacity
) {
2160 if(!u_growBufferFromStatic(stackBuffer
, &buffer
, &bufferCapacity
, 2*length
, 0)) {
2161 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
2164 length
=_decompose(buffer
, bufferCapacity
,
2165 prevStarter
, src
-prevStarter
,
2170 /* recompose the decomposition */
2171 recomposeLimit
=buffer
+length
;
2173 prevCC
=_recompose(buffer
, recomposeLimit
, options
, nx
);
2176 /* return with a pointer to the recomposition and its length */
2177 length
=recomposeLimit
-buffer
;
2182 _compose(UChar
*dest
, int32_t destCapacity
,
2183 const UChar
*src
, int32_t srcLength
,
2184 int32_t options
, const UnicodeSet
*nx
,
2185 UErrorCode
*pErrorCode
) {
2186 UChar stackBuffer
[_STACK_BUFFER_CAPACITY
];
2188 int32_t bufferCapacity
;
2190 const UChar
*limit
, *prevSrc
, *prevStarter
;
2191 uint32_t norm32
, ccOrQCMask
, qcMask
;
2192 int32_t destIndex
, reorderStartIndex
, length
;
2193 UChar c
, c2
, minNoMaybe
;
2196 if(options
&_NORM_OPTIONS_COMPAT
) {
2197 minNoMaybe
=(UChar
)indexes
[_NORM_INDEX_MIN_NFKC_NO_MAYBE
];
2198 qcMask
=_NORM_QC_NFKC
;
2200 minNoMaybe
=(UChar
)indexes
[_NORM_INDEX_MIN_NFC_NO_MAYBE
];
2201 qcMask
=_NORM_QC_NFC
;
2206 bufferCapacity
=_STACK_BUFFER_CAPACITY
;
2209 * prevStarter points to the last character before the current one
2210 * that is a "true" starter with cc==0 and quick check "yes".
2212 * prevStarter will be used instead of looking for a true starter
2213 * while incrementally decomposing [prevStarter..prevSrc[
2214 * in _composePart(). Having a good prevStarter allows to just decompose
2215 * the entire [prevStarter..prevSrc[.
2217 * When _composePart() backs out from prevSrc back to prevStarter,
2218 * then it also backs out destIndex by the same amount.
2219 * Therefore, at all times, the (prevSrc-prevStarter) source units
2220 * must correspond 1:1 to destination units counted with destIndex,
2221 * except for reordering.
2222 * This is true for the qc "yes" characters copied in the fast loop,
2223 * and for pure reordering.
2224 * prevStarter must be set forward to src when this is not true:
2225 * In _composePart() and after composing a Hangul syllable.
2227 * This mechanism relies on the assumption that the decomposition of a true starter
2228 * also begins with a true starter. gennorm/store.c checks for this.
2232 ccOrQCMask
=_NORM_CC_MASK
|qcMask
;
2233 destIndex
=reorderStartIndex
=0;
2236 /* avoid compiler warnings */
2241 /* string with length */
2242 limit
=src
+srcLength
;
2243 } else /* srcLength==-1 */ {
2244 /* zero-terminated string */
2251 /* count code units below the minimum or with irrelevant data for the quick check */
2254 while((c
=*src
)<minNoMaybe
? c
!=0 : ((norm32
=_getNorm32(c
))&ccOrQCMask
)==0) {
2259 while(src
!=limit
&& ((c
=*src
)<minNoMaybe
|| ((norm32
=_getNorm32(c
))&ccOrQCMask
)==0)) {
2265 /* copy these code units all at once */
2267 length
=(int32_t)(src
-prevSrc
);
2268 if((destIndex
+length
)<=destCapacity
) {
2269 uprv_memcpy(dest
+destIndex
, prevSrc
, length
*U_SIZEOF_UCHAR
);
2272 reorderStartIndex
=destIndex
;
2274 /* set prevStarter to the last character in the quick check loop */
2276 if(UTF_IS_SECOND_SURROGATE(*prevStarter
) && prevSrc
<prevStarter
&& UTF_IS_FIRST_SURROGATE(*(prevStarter
-1))) {
2283 /* end of source reached? */
2284 if(limit
==NULL
? c
==0 : src
==limit
) {
2288 /* c already contains *src and norm32 is set for it, increment src */
2292 * source buffer pointers:
2294 * all done quick check current char not yet
2295 * "yes" but (c, c2) processed
2298 * [-------------[-------------[-------------[-------------[
2300 * start prevStarter prevSrc src limit
2303 * destination buffer pointers and indexes:
2305 * all done might take not filled yet
2308 * [-------------[-------------[-------------[
2310 * dest reorderStartIndex destIndex destCapacity
2313 /* check one above-minimum, relevant code unit */
2315 * norm32 is for c=*(src-1), and the quick check flag is "no" or "maybe", and/or cc!=0
2316 * check for Jamo V/T, then for surrogates and regular characters
2317 * c is not a Hangul syllable or Jamo L because
2318 * they are not marked with no/maybe for NFC & NFKC (and their cc==0)
2320 if(isNorm32HangulOrJamo(norm32
)) {
2323 * try to compose with the previous character, Jamo V also with a following Jamo T,
2324 * and set values here right now in case we just continue with the main loop
2327 reorderStartIndex
=destIndex
;
2332 *(prevSrc
-1), c
, norm32
, src
, limit
, (UBool
)((options
&_NORM_OPTIONS_COMPAT
)!=0),
2333 destIndex
<=destCapacity
? dest
+(destIndex
-1) : 0,
2340 /* the Jamo V/T did not compose into a Hangul syllable, just append to dest */
2343 prevStarter
=prevSrc
;
2345 if(isNorm32Regular(norm32
)) {
2349 /* c is a lead surrogate, get the real norm32 */
2350 if(src
!=limit
&& UTF_IS_SECOND_SURROGATE(c2
=*src
)) {
2353 norm32
=_getNorm32FromSurrogatePair(norm32
, c2
);
2355 /* c is an unpaired lead surrogate, nothing to do */
2362 /* we are looking at the character (c, c2) at [prevSrc..src[ */
2363 if(nx_contains(nx
, c
, c2
)) {
2364 /* excluded: norm32==0 */
2366 } else if((norm32
&qcMask
)==0) {
2367 cc
=(uint8_t)(norm32
>>_NORM_CC_SHIFT
);
2370 uint32_t decompQCMask
;
2373 * find appropriate boundaries around this character,
2374 * decompose the source text from between the boundaries,
2377 * this puts the intermediate text into the side buffer because
2378 * it might be longer than the recomposition end result,
2379 * or the destination buffer may be too short or missing
2381 * note that destIndex may be adjusted backwards to account
2382 * for source text that passed the quick check but needed to
2383 * take part in the recomposition
2385 decompQCMask
=(qcMask
<<2)&0xf; /* decomposition quick check mask */
2388 * find the last true starter in [prevStarter..src[
2389 * it is either the decomposition of the current character (at prevSrc),
2392 if(_isTrueStarter(norm32
, ccOrQCMask
, decompQCMask
)) {
2393 prevStarter
=prevSrc
;
2395 /* adjust destIndex: back out what had been copied with qc "yes" */
2396 destIndex
-=(int32_t)(prevSrc
-prevStarter
);
2399 /* find the next true starter in [src..limit[ - modifies src to point to the next starter */
2400 src
=_findNextStarter(src
, limit
, qcMask
, decompQCMask
, minNoMaybe
);
2402 /* compose [prevStarter..src[ */
2403 p
=_composePart(stackBuffer
, buffer
, bufferCapacity
,
2404 length
, /* output */
2406 prevCC
, /* output */
2411 destIndex
=0; /* an error occurred (out of memory) */
2415 /* append the recomposed buffer contents to the destination buffer */
2416 if((destIndex
+length
)<=destCapacity
) {
2418 dest
[destIndex
++]=*p
++;
2422 /* buffer overflow */
2423 /* keep incrementing the destIndex for preflighting */
2427 /* set the next starter */
2434 /* append the single code point (c, c2) to the destination buffer */
2435 if((destIndex
+length
)<=destCapacity
) {
2436 if(cc
!=0 && cc
<prevCC
) {
2437 /* (c, c2) is out of order with respect to the preceding text */
2438 UChar
*reorderSplit
=dest
+destIndex
;
2440 prevCC
=_insertOrdered(dest
+reorderStartIndex
, reorderSplit
, dest
+destIndex
, c
, c2
, cc
);
2442 /* just append (c, c2) */
2443 dest
[destIndex
++]=c
;
2445 dest
[destIndex
++]=c2
;
2450 /* buffer overflow */
2451 /* keep incrementing the destIndex for preflighting */
2458 if(buffer
!=stackBuffer
) {
2465 U_CAPI
int32_t U_EXPORT2
2466 unorm_compose(UChar
*dest
, int32_t destCapacity
,
2467 const UChar
*src
, int32_t srcLength
,
2468 UBool compat
, int32_t options
,
2469 UErrorCode
*pErrorCode
) {
2470 const UnicodeSet
*nx
;
2473 if(!_haveData(*pErrorCode
)) {
2477 nx
=getNX(options
, *pErrorCode
);
2478 if(U_FAILURE(*pErrorCode
)) {
2482 /* reset options bits that should only be set here or inside _compose() */
2483 options
&=~(_NORM_OPTIONS_SETS_MASK
|_NORM_OPTIONS_COMPAT
|_NORM_OPTIONS_COMPOSE_CONTIGUOUS
);
2486 options
|=_NORM_OPTIONS_COMPAT
;
2489 destIndex
=_compose(dest
, destCapacity
,
2494 return u_terminateUChars(dest
, destCapacity
, destIndex
, pErrorCode
);
2497 /* make FCD ----------------------------------------------------------------- */
/*
 * Scan forward in [src..limit[ for a position where FCD processing can stop.
 * fcd16 is the FCD value of the character before src; its low byte
 * (fcd16&0xff) is tested as the trail cc, and a character whose _getFCD16()
 * value is 0 or that is below _NORM_MIN_WITH_LEAD_CC ends the scan.
 * NOTE(review): the return statement is not visible in this listing;
 * presumably the position found is returned - confirm.
 */
2499 static const UChar
*
2500 _findSafeFCD(const UChar
*src
, const UChar
*limit
, uint16_t fcd16
) {
2504 * find the first position in [src..limit[ after some cc==0 according to FCD data
2506 * at the beginning of the loop, we have fcd16 from before src
2508 * stop at positions:
2509 * - after trail cc==0
2510 * - at the end of the source
2511 * - before lead cc==0
2514 /* stop if trail cc==0 for the previous character */
2515 if((fcd16
&0xff)==0) {
2519 /* get c=*src - stop at end of string */
2525 /* stop if lead cc==0 for this character */
2526 if(c
<_NORM_MIN_WITH_LEAD_CC
|| (fcd16
=_getFCD16(c
))==0) {
2527 break; /* catches terminating NUL, too */
2530 if(!UTF_IS_FIRST_SURROGATE(c
)) {
/* a lead surrogate is only combined with the next unit if that unit lies before limit */
2535 } else if((src
+1)!=limit
&& (c2
=*(src
+1), UTF_IS_SECOND_SURROGATE(c2
))) {
2536 /* c is a lead surrogate, get the real fcd16 */
2537 fcd16
=_getFCD16FromSurrogatePair(fcd16
, c2
);
2543 /* c is an unpaired first surrogate, lead cc==0 */
/*
 * Canonically decompose [src..decompLimit[ and append the result to dest at
 * destIndex, keeping combining marks in canonical order.
 * - destIndex is passed by reference and is advanced by the output length
 * - code points contained in nx are treated as excluded (norm32==0)
 * - characters that arrive out of order relative to the preceding output are
 *   placed with _insertOrdered() (single code point) or _mergeOrdered()
 *   (multi-code point decomposition), working from reorderStartIndex
 * - when the output would not fit into destCapacity, destIndex is still
 *   incremented so that the caller can preflight the required length
 * NOTE(review): unorm_makeFCD() stores the return value in prevCC, so it is
 * presumably the trail cc of the last character written - confirm.
 */
2552 _decomposeFCD(const UChar
*src
, const UChar
*decompLimit
,
2553 UChar
*dest
, int32_t &destIndex
, int32_t destCapacity
,
2554 const UnicodeSet
*nx
) {
2557 int32_t reorderStartIndex
, length
;
2559 uint8_t cc
, prevCC
, trailCC
;
2562 * canonically decompose [src..decompLimit[
2564 * all characters in this range have some non-zero cc,
2565 * directly or in decomposition,
2566 * so that we do not need to check in the following for quick-check limits etc.
2568 * there _are_ _no_ Hangul syllables or Jamos in here because they are FCD-safe (cc==0)!
2570 * we also do not need to check for c==0 because we have an established decompLimit
2572 reorderStartIndex
=destIndex
;
2575 while(src
<decompLimit
) {
2577 norm32
=_getNorm32(c
);
2578 if(isNorm32Regular(norm32
)) {
2583 * reminder: this function is called with [src..decompLimit[
2584 * not containing any Hangul/Jamo characters,
2585 * therefore the only specials are lead surrogates
2587 /* c is a lead surrogate, get the real norm32 */
2588 if(src
!=decompLimit
&& UTF_IS_SECOND_SURROGATE(c2
=*src
)) {
2591 norm32
=_getNorm32FromSurrogatePair(norm32
, c2
);
2599 /* get the decomposition and the lead and trail cc's */
2600 if(nx_contains(nx
, c
, c2
)) {
2601 /* excluded: norm32==0 */
2604 } else if((norm32
&_NORM_QC_NFD
)==0) {
2605 /* c does not decompose */
2606 cc
=trailCC
=(uint8_t)(norm32
>>_NORM_CC_SHIFT
);
2609 /* c decomposes, get everything from the variable-length extra data */
2610 p
=_decompose(norm32
, length
, cc
, trailCC
);
2612 /* fastpath a single code unit from decomposition */
2619 /* append the decomposition to the destination buffer, assume length>0 */
2620 if((destIndex
+length
)<=destCapacity
) {
2621 UChar
*reorderSplit
=dest
+destIndex
;
2623 /* fastpath: single code point */
2624 if(cc
!=0 && cc
<prevCC
) {
2625 /* (c, c2) is out of order with respect to the preceding text */
2627 trailCC
=_insertOrdered(dest
+reorderStartIndex
, reorderSplit
, dest
+destIndex
, c
, c2
, cc
);
2629 /* just append (c, c2) */
2630 dest
[destIndex
++]=c
;
2632 dest
[destIndex
++]=c2
;
2636 /* general: multiple code points (ordered by themselves) from decomposition */
2637 if(cc
!=0 && cc
<prevCC
) {
2638 /* the decomposition is out of order with respect to the preceding text */
2640 trailCC
=_mergeOrdered(dest
+reorderStartIndex
, reorderSplit
, p
, p
+length
);
2642 /* just append the decomposition */
2644 dest
[destIndex
++]=*p
++;
2645 } while(--length
>0);
2649 /* buffer overflow */
2650 /* keep incrementing the destIndex for preflighting */
2656 reorderStartIndex
=destIndex
;
/*
 * Copy src to dest while bringing the text into FCD form.
 * - runs of code units that are below _NORM_MIN_WITH_LEAD_CC or have
 *   _getFCD16()==0 are copied in one uprv_memcpy() when they fit
 * - for every other character the lead cc (fcd16>>8) is compared with the
 *   previous trail cc; if the order is ok the character is appended as is
 * - otherwise the output is backed up to decompStart, the end of the piece
 *   is found with _findSafeFCD(), and that piece is decomposed and
 *   reordered by _decomposeFCD()
 * - code points contained in nx are treated as fcd16==0
 * - srcLength may be -1 for a zero-terminated string
 * The final length goes through u_terminateUChars().
 */
2664 unorm_makeFCD(UChar
*dest
, int32_t destCapacity
,
2665 const UChar
*src
, int32_t srcLength
,
2666 const UnicodeSet
*nx
,
2667 UErrorCode
*pErrorCode
) {
2668 const UChar
*limit
, *prevSrc
, *decompStart
;
2669 int32_t destIndex
, length
;
2674 if(!_haveData(*pErrorCode
)) {
2683 /* avoid compiler warnings */
2688 /* string with length */
2689 limit
=src
+srcLength
;
2690 } else /* srcLength==-1 */ {
2691 /* zero-terminated string */
2698 /* skip a run of code units below the minimum or with irrelevant data for the FCD check */
2703 if(c
<_NORM_MIN_WITH_LEAD_CC
) {
2708 } else if((fcd16
=_getFCD16(c
))==0) {
2719 } else if((c
=*src
)<_NORM_MIN_WITH_LEAD_CC
) {
2721 } else if((fcd16
=_getFCD16(c
))==0) {
2731 * prevCC has values from the following ranges:
2732 * 0..0xff - the previous trail combining class
2733 * <0 - the negative value of the previous code unit;
2734 * that code unit was <_NORM_MIN_WITH_LEAD_CC and its _getFCD16()
2735 * was deferred so that average text is checked faster
2738 /* copy these code units all at once */
2740 length
=(int32_t)(src
-prevSrc
);
2741 if((destIndex
+length
)<=destCapacity
) {
2742 uprv_memcpy(dest
+destIndex
, prevSrc
, length
*U_SIZEOF_UCHAR
);
2747 /* prevCC<0 is only possible from the above loop, i.e., only if prevSrc<src */
2749 /* the previous character was <_NORM_MIN_WITH_LEAD_CC, we need to get its trail cc */
2750 if(!nx_contains(nx
, (UChar32
)-prevCC
)) {
2751 prevCC
=(int16_t)(_getFCD16((UChar
)-prevCC
)&0xff);
2753 prevCC
=0; /* excluded: fcd16==0 */
2757 * set a pointer to this below-U+0300 character;
2758 * if prevCC==0 then it will moved to after this character below
2760 decompStart
=prevSrc
-1;
2765 * prevSrc==src - used later to adjust destIndex before decomposition
2769 /* end of source reached? */
2770 if(limit
==NULL
? c
==0 : src
==limit
) {
2774 /* set a pointer to after the last source position where prevCC==0 */
2776 decompStart
=prevSrc
;
2779 /* c already contains *src and fcd16 is set for it, increment src */
2782 /* check one above-minimum, relevant code unit */
2783 if(UTF_IS_FIRST_SURROGATE(c
)) {
2784 /* c is a lead surrogate, get the real fcd16 */
2785 if(src
!=limit
&& UTF_IS_SECOND_SURROGATE(c2
=*src
)) {
2787 fcd16
=_getFCD16FromSurrogatePair(fcd16
, c2
);
2796 /* we are looking at the character (c, c2) at [prevSrc..src[ */
2797 if(nx_contains(nx
, c
, c2
)) {
2798 fcd16
=0; /* excluded: fcd16==0 */
2801 /* check the combining order, get the lead cc */
2802 cc
=(int16_t)(fcd16
>>8);
2803 if(cc
==0 || cc
>=prevCC
) {
2804 /* the order is ok */
2806 decompStart
=prevSrc
;
2808 prevCC
=(int16_t)(fcd16
&0xff);
2810 /* just append (c, c2) */
2811 length
= c2
==0 ? 1 : 2;
2812 if((destIndex
+length
)<=destCapacity
) {
2813 dest
[destIndex
++]=c
;
2815 dest
[destIndex
++]=c2
;
2822 * back out the part of the source that we copied already but
2823 * is now going to be decomposed;
2824 * prevSrc is set to after what was copied
2826 destIndex
-=(int32_t)(prevSrc
-decompStart
);
2829 * find the part of the source that needs to be decomposed;
2830 * to be safe and simple, decompose to before the next character with lead cc==0
2832 src
=_findSafeFCD(src
, limit
, fcd16
);
2835 * the source text does not fulfill the conditions for FCD;
2836 * decompose and reorder a limited piece of the text
2838 prevCC
=_decomposeFCD(decompStart
, src
,
2839 dest
, destIndex
, destCapacity
,
2845 return u_terminateUChars(dest
, destCapacity
, destIndex
, pErrorCode
);
2848 /* quick check functions ---------------------------------------------------- */
/*
 * Check whether src is already in FCD form.
 * Walks the text and compares the lead combining class of each relevant
 * character (fcd16>>8) with the trail combining class of the previous one
 * (fcd16&0xff). For a previous code unit below _NORM_MIN_WITH_LEAD_CC the
 * _getFCD16() lookup is deferred by storing the negated code unit in prevCC.
 * Code points contained in nx are treated as fcd16==0.
 * srcLength may be -1 for a zero-terminated string.
 * _quickCheck() maps a true result to UNORM_YES and a false one to UNORM_NO.
 */
2851 unorm_checkFCD(const UChar
*src
, int32_t srcLength
, const UnicodeSet
*nx
) {
2861 /* string with length */
2862 limit
=src
+srcLength
;
2863 } else /* srcLength==-1 */ {
2864 /* zero-terminated string */
2871 /* skip a run of code units below the minimum or with irrelevant data for the FCD check */
2875 if(c
<_NORM_MIN_WITH_LEAD_CC
) {
2880 * delay _getFCD16(c) for any character <_NORM_MIN_WITH_LEAD_CC
2881 * because chances are good that the next one will have
2882 * a leading cc of 0;
2883 * _getFCD16(-prevCC) is later called when necessary -
2884 * -c fits into int16_t because it is <_NORM_MIN_WITH_LEAD_CC==0x300
2887 } else if((fcd16
=_getFCD16(c
))==0) {
2897 } else if((c
=*src
++)<_NORM_MIN_WITH_LEAD_CC
) {
2899 } else if((fcd16
=_getFCD16(c
))==0) {
2907 /* check one above-minimum, relevant code unit */
2908 if(UTF_IS_FIRST_SURROGATE(c
)) {
2909 /* c is a lead surrogate, get the real fcd16 */
2910 if(src
!=limit
&& UTF_IS_SECOND_SURROGATE(c2
=*src
)) {
2912 fcd16
=_getFCD16FromSurrogatePair(fcd16
, c2
);
2921 if(nx_contains(nx
, c
, c2
)) {
2922 prevCC
=0; /* excluded: fcd16==0 */
2927 * prevCC has values from the following ranges:
2928 * 0..0xff - the previous trail combining class
2929 * <0 - the negative value of the previous code unit;
2930 * that code unit was <_NORM_MIN_WITH_LEAD_CC and its _getFCD16()
2931 * was deferred so that average text is checked faster
2934 /* check the combining order */
2935 cc
=(int16_t)(fcd16
>>8);
2938 /* the previous character was <_NORM_MIN_WITH_LEAD_CC, we need to get its trail cc */
2939 if(!nx_contains(nx
, (UChar32
)-prevCC
)) {
2940 prevCC
=(int16_t)(_getFCD16((UChar
)-prevCC
)&0xff);
2942 prevCC
=0; /* excluded: fcd16==0 */
/* remember the trail cc of this character for the next iteration */
2950 prevCC
=(int16_t)(fcd16
&0xff);
/*
 * Common implementation behind the quick check and isNormalized functions.
 * - rejects pErrorCode==NULL or an already failed error code, and sets
 *   U_ILLEGAL_ARGUMENT_ERROR for src==NULL or srcLength<-1
 * - mode selects minNoMaybe (from indexes[]) and the quick check mask
 *   qcMask; the compatibility modes also set options=_NORM_OPTIONS_COMPAT
 * - the FCD case is answered by unorm_checkFCD(), mapped to
 *   UNORM_YES/UNORM_NO
 * - an unsupported mode sets U_ILLEGAL_ARGUMENT_ERROR
 * - in the main loop, code points contained in nx are treated as norm32==0,
 *   the combining class order is checked, and the quick check flags decide
 *   between "no" and "maybe"
 * - in the "maybe" handling visible below, the text between the previous
 *   starter and the next true starter is normalized with _composePart() and
 *   compared with the original; a difference yields UNORM_NO, an error
 *   yields UNORM_MAYBE
 * NOTE(review): the srcLength and allowMaybe parameters used by the callers
 * are not visible in this listing of the signature - confirm their position.
 */
2954 static UNormalizationCheckResult
2955 _quickCheck(const UChar
*src
,
2957 UNormalizationMode mode
,
2959 const UnicodeSet
*nx
,
2960 UErrorCode
*pErrorCode
) {
2961 UChar stackBuffer
[_STACK_BUFFER_CAPACITY
];
2963 int32_t bufferCapacity
;
2965 const UChar
*start
, *limit
;
2966 uint32_t norm32
, qcNorm32
, ccOrQCMask
, qcMask
;
2968 UChar c
, c2
, minNoMaybe
;
2970 UNormalizationCheckResult result
;
2972 /* check arguments */
2973 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
2977 if(src
==NULL
|| srcLength
<-1) {
2978 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
2982 if(!_haveData(*pErrorCode
)) {
2986 /* check for a valid mode and set the quick check minimum and mask */
2989 minNoMaybe
=(UChar
)indexes
[_NORM_INDEX_MIN_NFC_NO_MAYBE
];
2990 qcMask
=_NORM_QC_NFC
;
2994 minNoMaybe
=(UChar
)indexes
[_NORM_INDEX_MIN_NFKC_NO_MAYBE
];
2995 qcMask
=_NORM_QC_NFKC
;
2996 options
=_NORM_OPTIONS_COMPAT
;
2999 minNoMaybe
=(UChar
)indexes
[_NORM_INDEX_MIN_NFD_NO_MAYBE
];
3000 qcMask
=_NORM_QC_NFD
;
3004 minNoMaybe
=(UChar
)indexes
[_NORM_INDEX_MIN_NFKD_NO_MAYBE
];
3005 qcMask
=_NORM_QC_NFKD
;
3006 options
=_NORM_OPTIONS_COMPAT
;
3009 return unorm_checkFCD(src
, srcLength
, nx
) ? UNORM_YES
: UNORM_NO
;
3011 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
3017 bufferCapacity
=_STACK_BUFFER_CAPACITY
;
/* one mask that selects both the combining class and the quick check bits for this mode */
3019 ccOrQCMask
=_NORM_CC_MASK
|qcMask
;
3025 /* string with length */
3026 limit
=src
+srcLength
;
3027 } else /* srcLength==-1 */ {
3028 /* zero-terminated string */
3035 /* skip a run of code units below the minimum or with irrelevant data for the quick check */
3041 goto endloop
; /* break out of outer loop */
3043 } else if(((norm32
=_getNorm32(c
))&ccOrQCMask
)!=0) {
3051 goto endloop
; /* break out of outer loop */
3052 } else if((c
=*src
++)>=minNoMaybe
&& ((norm32
=_getNorm32(c
))&ccOrQCMask
)!=0) {
3059 /* check one above-minimum, relevant code unit */
3060 if(isNorm32LeadSurrogate(norm32
)) {
3061 /* c is a lead surrogate, get the real norm32 */
3062 if(src
!=limit
&& UTF_IS_SECOND_SURROGATE(c2
=*src
)) {
3064 norm32
=_getNorm32FromSurrogatePair(norm32
, c2
);
3073 if(nx_contains(nx
, c
, c2
)) {
3074 /* excluded: norm32==0 */
3078 /* check the combining order */
3079 cc
=(uint8_t)(norm32
>>_NORM_CC_SHIFT
);
3080 if(cc
!=0 && cc
<prevCC
) {
3086 /* check for "no" or "maybe" quick check flags */
3087 qcNorm32
=norm32
&qcMask
;
3088 if(qcNorm32
&_NORM_QC_ANY_NO
) {
3091 } else if(qcNorm32
!=0) {
3092 /* "maybe" can only occur for NFC and NFKC */
3096 /* normalize a section around here to see if it is really normalized or not */
3097 const UChar
*prevStarter
;
3098 uint32_t decompQCMask
;
3101 decompQCMask
=(qcMask
<<2)&0xf; /* decomposition quick check mask */
3103 /* find the previous starter */
3104 prevStarter
=src
-1; /* set prevStarter to the beginning of the current character */
3105 if(UTF_IS_TRAIL(*prevStarter
)) {
3106 --prevStarter
; /* safe because unpaired surrogates do not result in "maybe" */
3108 prevStarter
=_findPreviousStarter(start
, prevStarter
, ccOrQCMask
, decompQCMask
, minNoMaybe
);
3110 /* find the next true starter in [src..limit[ - modifies src to point to the next starter */
3111 src
=_findNextStarter(src
, limit
, qcMask
, decompQCMask
, minNoMaybe
);
3113 /* decompose and recompose [prevStarter..src[ */
3114 _composePart(stackBuffer
, buffer
, bufferCapacity
,
3119 options
, nx
, pErrorCode
);
3120 if(U_FAILURE(*pErrorCode
)) {
3121 result
=UNORM_MAYBE
; /* error (out of memory) */
3125 /* compare the normalized version with the original */
3126 if(0!=uprv_strCompare(prevStarter
, (int32_t)(src
-prevStarter
), buffer
, length
, FALSE
, FALSE
)) {
3127 result
=UNORM_NO
; /* normalization differs */
3131 /* continue after the next starter */
/* buffer differs from stackBuffer only if it was reallocated during the check */
3137 if(buffer
!=stackBuffer
) {
3144 U_CAPI UNormalizationCheckResult U_EXPORT2
3145 unorm_quickCheck(const UChar
*src
,
3147 UNormalizationMode mode
,
3148 UErrorCode
*pErrorCode
) {
3149 return _quickCheck(src
, srcLength
, mode
, TRUE
, NULL
, pErrorCode
);
3152 U_CAPI UNormalizationCheckResult U_EXPORT2
3153 unorm_quickCheckWithOptions(const UChar
*src
, int32_t srcLength
,
3154 UNormalizationMode mode
, int32_t options
,
3155 UErrorCode
*pErrorCode
) {
3156 return _quickCheck(src
, srcLength
, mode
, TRUE
, getNX(options
, *pErrorCode
), pErrorCode
);
3159 U_CFUNC UNormalizationCheckResult
3160 unorm_internalQuickCheck(const UChar
*src
,
3162 UNormalizationMode mode
,
3164 const UnicodeSet
*nx
,
3165 UErrorCode
*pErrorCode
) {
3166 return _quickCheck(src
, srcLength
, mode
, allowMaybe
, nx
, pErrorCode
);
3169 U_CAPI UBool U_EXPORT2
3170 unorm_isNormalized(const UChar
*src
, int32_t srcLength
,
3171 UNormalizationMode mode
,
3172 UErrorCode
*pErrorCode
) {
3173 return (UBool
)(UNORM_YES
==_quickCheck(src
, srcLength
, mode
, FALSE
, NULL
, pErrorCode
));
3176 U_CAPI UBool U_EXPORT2
3177 unorm_isNormalizedWithOptions(const UChar
*src
, int32_t srcLength
,
3178 UNormalizationMode mode
, int32_t options
,
3179 UErrorCode
*pErrorCode
) {
3180 return (UBool
)(UNORM_YES
==_quickCheck(src
, srcLength
, mode
, FALSE
, getNX(options
, *pErrorCode
), pErrorCode
));
3183 /* normalize() API ---------------------------------------------------------- */
3186 * Internal API for normalizing.
3187 * Does not check for bad input.
3188 * Requires _haveData() to be true.
/*
 * Normalize src into dest with an exclusion set nx that the caller has
 * already resolved.
 * The branches visible below call _decompose() or _compose() (adding
 * _NORM_OPTIONS_COMPAT or _NORM_OPTIONS_COMPOSE_CONTIGUOUS to options for
 * some of them), return unorm_makeFCD() directly, or just copy the string;
 * one branch sets U_ILLEGAL_ARGUMENT_ERROR.
 * All branches except the unorm_makeFCD() one end in u_terminateUChars().
 * NOTE(review): the mode values that select each branch are not visible in
 * this listing - confirm against the switch labels.
 */
3192 unorm_internalNormalizeWithNX(UChar
*dest
, int32_t destCapacity
,
3193 const UChar
*src
, int32_t srcLength
,
3194 UNormalizationMode mode
, int32_t options
, const UnicodeSet
*nx
,
3195 UErrorCode
*pErrorCode
) {
3201 destLength
=_decompose(dest
, destCapacity
,
3203 FALSE
, nx
, trailCC
);
3206 destLength
=_decompose(dest
, destCapacity
,
3211 destLength
=_compose(dest
, destCapacity
,
3213 options
, nx
, pErrorCode
);
3216 destLength
=_compose(dest
, destCapacity
,
3218 options
|_NORM_OPTIONS_COMPAT
, nx
, pErrorCode
);
3221 return unorm_makeFCD(dest
, destCapacity
,
3227 destLength
=_compose(dest
, destCapacity
,
3229 options
|_NORM_OPTIONS_COMPOSE_CONTIGUOUS
, nx
, pErrorCode
);
3233 /* just copy the string */
3235 srcLength
=u_strlen(src
);
/* copy only when the whole string fits; destLength is set to srcLength for preflighting */
3237 if(srcLength
>0 && srcLength
<=destCapacity
) {
3238 uprv_memcpy(dest
, src
, srcLength
*U_SIZEOF_UCHAR
);
3240 destLength
=srcLength
;
3243 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
3247 return u_terminateUChars(dest
, destCapacity
, destLength
, pErrorCode
);
3251 * Internal API for normalizing.
3252 * Does not check for bad input.
/*
 * Normalize src into dest for callers that have already validated their
 * arguments.
 * - checks _haveData() first
 * - resolves the exclusion set nx from options with getNX() and stops on
 *   failure
 * - clears the set-selection, compat and compose-contiguous bits from
 *   options
 * - forwards to unorm_internalNormalizeWithNX()
 */
3255 U_CAPI
int32_t U_EXPORT2
3256 unorm_internalNormalize(UChar
*dest
, int32_t destCapacity
,
3257 const UChar
*src
, int32_t srcLength
,
3258 UNormalizationMode mode
, int32_t options
,
3259 UErrorCode
*pErrorCode
) {
3260 const UnicodeSet
*nx
;
3262 if(!_haveData(*pErrorCode
)) {
3266 nx
=getNX(options
, *pErrorCode
);
3267 if(U_FAILURE(*pErrorCode
)) {
3271 /* reset options bits that should only be set inside unorm_internalNormalizeWithNX() */
3272 options
&=~(_NORM_OPTIONS_SETS_MASK
|_NORM_OPTIONS_COMPAT
|_NORM_OPTIONS_COMPOSE_CONTIGUOUS
);
3274 return unorm_internalNormalizeWithNX(dest
, destCapacity
,
3280 /** Public API for normalizing. */
/*
 * Validates the arguments before delegating to unorm_internalNormalize():
 * - pErrorCode must be non-NULL and must not already indicate a failure
 * - destCapacity must be >=0 and dest may be NULL only if destCapacity is 0
 * - src must be non-NULL and srcLength must be >=-1
 * - src and dest must not overlap
 * A violated argument condition sets U_ILLEGAL_ARGUMENT_ERROR.
 */
3281 U_CAPI
int32_t U_EXPORT2
3282 unorm_normalize(const UChar
*src
, int32_t srcLength
,
3283 UNormalizationMode mode
, int32_t options
,
3284 UChar
*dest
, int32_t destCapacity
,
3285 UErrorCode
*pErrorCode
) {
3286 /* check argument values */
3287 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
3291 if( destCapacity
<0 || (dest
==NULL
&& destCapacity
>0) ||
3292 src
==NULL
|| srcLength
<-1
3294 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
3298 /* check for overlapping src and destination */
3300 ((src
>=dest
&& src
<(dest
+destCapacity
)) ||
3301 (srcLength
>0 && dest
>=src
&& dest
<(src
+srcLength
)))
3303 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
3307 return unorm_internalNormalize(dest
, destCapacity
,
3314 /* iteration functions ------------------------------------------------------ */
3317 * These iteration functions are the core implementations of the
3318 * Normalizer class iteration API.
3319 * They read from a UCharIterator into their own buffer
3320 * and normalize into the Normalizer iteration buffer.
3321 * Normalizer itself then iterates over its buffer until that needs to be
3327 * Now that UCharIterator.next/previous return (int32_t)-1 not (UChar)0xffff
3328 * if iteration bounds are reached,
3329 * try to not call hasNext/hasPrevious and instead check for >=0.
3332 /* backward iteration ------------------------------------------------------- */
3335 * read backwards and get norm32
3336 * return 0 if the character is <minC
3337 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
/*
 * Step the iterator back by one character and return its norm32 value.
 * c receives the code unit read first; if a surrogate pair was read, c2
 * receives the lead surrogate that was read second.
 * A trail surrogate that is not preceded by a lead surrogate is handled as
 * unpaired, and the extra src.previous() step is undone with src.move().
 * The caller must make sure that src.hasPrevious() is true.
 */
3339 static inline uint32_t
3340 _getPrevNorm32(UCharIterator
&src
, uint32_t minC
, uint32_t mask
, UChar
&c
, UChar
&c2
) {
3343 /* need src.hasPrevious() */
3344 c
=(UChar
)src
.previous(&src
);
3347 /* check for a surrogate before getting norm32 to see if we need to predecrement further */
3350 } else if(!UTF_IS_SURROGATE(c
)) {
3351 return _getNorm32(c
);
3352 } else if(UTF_IS_SURROGATE_FIRST(c
) || !src
.hasPrevious(&src
)) {
3353 /* unpaired surrogate */
3355 } else if(UTF_IS_FIRST_SURROGATE(c2
=(UChar
)src
.previous(&src
))) {
3356 norm32
=_getNorm32(c2
);
3357 if((norm32
&mask
)==0) {
3358 /* all surrogate pairs with this lead surrogate have irrelevant data */
3361 /* norm32 must be a surrogate special */
3362 return _getNorm32FromSurrogatePair(norm32
, c
);
3365 /* unpaired second surrogate, undo the c2=src.previous() movement */
3366 src
.move(&src
, 1, UITER_CURRENT
);
3373 * read backwards and check if the character is a previous-iteration boundary
3374 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
/*
 * Signature shared by _isPrevNFDSafe() and _isPrevTrueStarter(); a pointer
 * to one of them is handed to _findPreviousIterationBoundary().
 */
3377 IsPrevBoundaryFn(UCharIterator
&src
, uint32_t minC
, uint32_t mask
, UChar
&c
, UChar
&c2
);
3381 * read backwards and check if the lead combining class is 0
3382 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3385 _isPrevNFDSafe(UCharIterator
&src
, uint32_t minC
, uint32_t ccOrQCMask
, UChar
&c
, UChar
&c2
) {
3386 return _isNFDSafe(_getPrevNorm32(src
, minC
, ccOrQCMask
, c
, c2
), ccOrQCMask
, ccOrQCMask
&_NORM_QC_MASK
);
3390 * read backwards and check if the character is (or its decomposition begins with)
3391 * a "true starter" (cc==0 and NF*C_YES)
3392 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3395 _isPrevTrueStarter(UCharIterator
&src
, uint32_t minC
, uint32_t ccOrQCMask
, UChar
&c
, UChar
&c2
) {
3396 uint32_t norm32
, decompQCMask
;
3398 decompQCMask
=(ccOrQCMask
<<2)&0xf; /* decomposition quick check mask */
3399 norm32
=_getPrevNorm32(src
, minC
, ccOrQCMask
|decompQCMask
, c
, c2
);
3400 return _isTrueStarter(norm32
, ccOrQCMask
, decompQCMask
);
/*
 * Read characters backwards from src until isPrevBoundary() reports a
 * boundary, writing each one to the front of the text collected so far.
 * - the buffer is filled from its end towards its start; startIndex is set
 *   to the index of the first collected code unit
 * - when there is no room in front, the buffer is grown to twice its
 *   capacity with u_growBufferFromStatic() and the contents are moved up
 * - on allocation failure U_MEMORY_ALLOCATION_ERROR is set and the iterator
 *   is moved to UITER_START
 * Returns the number of code units collected (bufferCapacity-startIndex).
 */
3404 _findPreviousIterationBoundary(UCharIterator
&src
,
3405 IsPrevBoundaryFn
*isPrevBoundary
, uint32_t minC
, uint32_t mask
,
3406 UChar
*&buffer
, int32_t &bufferCapacity
,
3407 int32_t &startIndex
,
3408 UErrorCode
*pErrorCode
) {
3415 startIndex
=bufferCapacity
; /* fill the buffer from the end backwards */
3417 while(src
.hasPrevious(&src
)) {
3418 isBoundary
=isPrevBoundary(src
, minC
, mask
, c
, c2
);
3420 /* always write this character to the front of the buffer */
3421 /* make sure there is enough space in the buffer */
3422 if(startIndex
< (c2
==0 ? 1 : 2)) {
3423 int32_t bufferLength
=bufferCapacity
;
3425 if(!u_growBufferFromStatic(stackBuffer
, &buffer
, &bufferCapacity
, 2*bufferCapacity
, bufferLength
)) {
3426 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
3427 src
.move(&src
, 0, UITER_START
);
3431 /* move the current buffer contents up */
3432 uprv_memmove(buffer
+(bufferCapacity
-bufferLength
), buffer
, bufferLength
*U_SIZEOF_UCHAR
);
3433 startIndex
+=bufferCapacity
-bufferLength
;
3436 buffer
[--startIndex
]=c
;
3438 buffer
[--startIndex
]=c2
;
3441 /* stop if this just-copied character is a boundary */
3447 /* return the length of the buffer contents */
3448 return bufferCapacity
-startIndex
;
/*
 * Backward iteration step: read the text before the iterator position back
 * to the previous boundary and write it, optionally normalized, to dest.
 * - validates pErrorCode, destCapacity/dest and the remaining arguments;
 *   a violation sets U_ILLEGAL_ARGUMENT_ERROR
 * - *pNeededToNormalize, if the pointer is non-NULL, is first set to FALSE
 * - mode selects the boundary test (_isPrevNFDSafe or _isPrevTrueStarter)
 *   together with minC and mask
 * - one branch just reads a single code point backwards (combining a
 *   trail/lead surrogate pair) and terminates dest
 *   (NOTE(review): the mode selecting it is not visible in this listing)
 * - otherwise the text is collected by _findPreviousIterationBoundary() and
 *   either normalized with unorm_internalNormalize() or copied as is
 * - after normalizing, *pNeededToNormalize reports whether the output
 *   differs from the collected input in length or content
 */
3451 U_CAPI
int32_t U_EXPORT2
3452 unorm_previous(UCharIterator
*src
,
3453 UChar
*dest
, int32_t destCapacity
,
3454 UNormalizationMode mode
, int32_t options
,
3455 UBool doNormalize
, UBool
*pNeededToNormalize
,
3456 UErrorCode
*pErrorCode
) {
3457 UChar stackBuffer
[100];
3459 IsPrevBoundaryFn
*isPreviousBoundary
=NULL
;
3461 int32_t startIndex
=0, bufferLength
=0, bufferCapacity
=0, destLength
=0;
3465 /* check argument values */
3466 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
3470 if( destCapacity
<0 || (dest
==NULL
&& destCapacity
>0) ||
3473 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
3477 if(!_haveData(*pErrorCode
)) {
3481 if(pNeededToNormalize
!=NULL
) {
3482 *pNeededToNormalize
=FALSE
;
3488 isPreviousBoundary
=_isPrevNFDSafe
;
3489 minC
=_NORM_MIN_WITH_LEAD_CC
;
3490 mask
=_NORM_CC_MASK
|_NORM_QC_NFD
;
3493 isPreviousBoundary
=_isPrevNFDSafe
;
3494 minC
=_NORM_MIN_WITH_LEAD_CC
;
3495 mask
=_NORM_CC_MASK
|_NORM_QC_NFKD
;
3498 isPreviousBoundary
=_isPrevTrueStarter
;
3499 minC
=(UChar
)indexes
[_NORM_INDEX_MIN_NFC_NO_MAYBE
];
3500 mask
=_NORM_CC_MASK
|_NORM_QC_NFC
;
3503 isPreviousBoundary
=_isPrevTrueStarter
;
3504 minC
=(UChar
)indexes
[_NORM_INDEX_MIN_NFKC_NO_MAYBE
];
3505 mask
=_NORM_CC_MASK
|_NORM_QC_NFKC
;
3509 if((c
=src
->previous(src
))>=0) {
3511 if(UTF_IS_TRAIL(c
) && (c2
=src
->previous(src
))>=0) {
3512 if(UTF_IS_LEAD(c2
)) {
3513 if(destCapacity
>=2) {
3514 dest
[1]=(UChar
)c
; /* trail surrogate */
3517 c
=c2
; /* lead surrogate to be written below */
3519 src
->move(src
, 1, UITER_CURRENT
);
3523 if(destCapacity
>0) {
3527 return u_terminateUChars(dest
, destCapacity
, destLength
, pErrorCode
);
3529 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
3534 bufferCapacity
=(int32_t)(sizeof(stackBuffer
)/U_SIZEOF_UCHAR
);
3535 bufferLength
=_findPreviousIterationBoundary(*src
,
3536 isPreviousBoundary
, minC
, mask
,
3537 buffer
, bufferCapacity
,
3540 if(bufferLength
>0) {
3542 destLength
=unorm_internalNormalize(dest
, destCapacity
,
3543 buffer
+startIndex
, bufferLength
,
3546 if(pNeededToNormalize
!=0 && U_SUCCESS(*pErrorCode
)) {
3547 *pNeededToNormalize
=
3548 (UBool
)(destLength
!=bufferLength
||
3549 0!=uprv_memcmp(dest
, buffer
+startIndex
, destLength
*U_SIZEOF_UCHAR
));
3552 /* just copy the source characters */
3553 if(destCapacity
>0) {
3554 uprv_memcpy(dest
, buffer
+startIndex
, uprv_min(bufferLength
, destCapacity
)*U_SIZEOF_UCHAR
);
3556 destLength
=u_terminateUChars(dest
, destCapacity
, bufferLength
, pErrorCode
);
3559 destLength
=u_terminateUChars(dest
, destCapacity
, 0, pErrorCode
);
3563 if(buffer
!=stackBuffer
) {
3570 /* forward iteration -------------------------------------------------------- */
3573 * read forward and get norm32
3574 * return 0 if the character is <minC
3575 * if c2!=0 then (c2, c) is a surrogate pair
3576 * always reads complete characters
/*
 * Read one character forward from the iterator and return its norm32 value.
 * c receives the first code unit; for a lead surrogate that is followed by
 * a trail surrogate, c2 receives the trail surrogate, which is then skipped.
 * If the lead surrogate has no bits of mask set, the pair carries no
 * relevant data. A lead surrogate without a following trail surrogate is
 * handled as unmatched.
 * The caller must make sure that src.hasNext() is true.
 */
3578 static inline uint32_t
3579 _getNextNorm32(UCharIterator
&src
, uint32_t minC
, uint32_t mask
, UChar
&c
, UChar
&c2
) {
3582 /* need src.hasNext() to be true */
3583 c
=(UChar
)src
.next(&src
);
3590 norm32
=_getNorm32(c
);
3591 if(UTF_IS_FIRST_SURROGATE(c
)) {
3592 if(src
.hasNext(&src
) && UTF_IS_SECOND_SURROGATE(c2
=(UChar
)src
.current(&src
))) {
3593 src
.move(&src
, 1, UITER_CURRENT
); /* skip the c2 surrogate */
3594 if((norm32
&mask
)==0) {
3595 /* irrelevant data */
3598 /* norm32 must be a surrogate special */
3599 return _getNorm32FromSurrogatePair(norm32
, c2
);
3602 /* unmatched surrogate */
3611 * read forward and check if the character is a next-iteration boundary
3612 * if c2!=0 then (c, c2) is a surrogate pair
/*
 * Signature shared by _isNextNFDSafe() and _isNextTrueStarter(); a pointer
 * to one of them is handed to _findNextIterationBoundary().
 */
3615 IsNextBoundaryFn(UCharIterator
&src
, uint32_t minC
, uint32_t mask
, UChar
&c
, UChar
&c2
);
3619 * read forward and check if the lead combining class is 0
3620 * if c2!=0 then (c, c2) is a surrogate pair
3623 _isNextNFDSafe(UCharIterator
&src
, uint32_t minC
, uint32_t ccOrQCMask
, UChar
&c
, UChar
&c2
) {
3624 return _isNFDSafe(_getNextNorm32(src
, minC
, ccOrQCMask
, c
, c2
), ccOrQCMask
, ccOrQCMask
&_NORM_QC_MASK
);
3629 * read forward and check if the character is (or its decomposition begins with)
3630 * a "true starter" (cc==0 and NF*C_YES)
3631 * if c2!=0 then (c, c2) is a surrogate pair
3634 _isNextTrueStarter(UCharIterator
&src
, uint32_t minC
, uint32_t ccOrQCMask
, UChar
&c
, UChar
&c2
) {
3635 uint32_t norm32
, decompQCMask
;
3637 decompQCMask
=(ccOrQCMask
<<2)&0xf; /* decomposition quick check mask */
3638 norm32
=_getNextNorm32(src
, minC
, ccOrQCMask
|decompQCMask
, c
, c2
);
3639 return _isTrueStarter(norm32
, ccOrQCMask
, decompQCMask
);
/*
 * Read characters forward from src into buffer until isNextBoundary()
 * reports a boundary.
 * - the first character is taken unconditionally (a lead surrogate together
 *   with a following trail surrogate); a following code unit that is not a
 *   trail surrogate is backed out again
 * - when a boundary character is found, the iterator is moved back by one
 *   or two code units so that it stops in front of that character
 * - the buffer is grown with u_growBufferFromStatic() when a character does
 *   not fit; on failure U_MEMORY_ALLOCATION_ERROR is set and the iterator
 *   is moved to UITER_LIMIT
 * NOTE(review): the return statement is not visible in this listing; per
 * the last comment it is the length of the buffer contents.
 */
3643 _findNextIterationBoundary(UCharIterator
&src
,
3644 IsNextBoundaryFn
*isNextBoundary
, uint32_t minC
, uint32_t mask
,
3645 UChar
*&buffer
, int32_t &bufferCapacity
,
3646 UErrorCode
*pErrorCode
) {
3648 int32_t bufferIndex
;
3651 if(!src
.hasNext(&src
)) {
3658 /* get one character and ignore its properties */
3659 buffer
[0]=c
=(UChar
)src
.next(&src
);
3661 if(UTF_IS_FIRST_SURROGATE(c
) && src
.hasNext(&src
)) {
3662 if(UTF_IS_SECOND_SURROGATE(c2
=(UChar
)src
.next(&src
))) {
3663 buffer
[bufferIndex
++]=c2
;
3665 src
.move(&src
, -1, UITER_CURRENT
); /* back out the non-trail-surrogate */
3669 /* get all following characters until we see a boundary */
3670 /* checking hasNext() instead of c!=DONE on the off-chance that U+ffff is part of the string */
3671 while(src
.hasNext(&src
)) {
3672 if(isNextBoundary(src
, minC
, mask
, c
, c2
)) {
3673 /* back out the latest movement to stop at the boundary */
3674 src
.move(&src
, c2
==0 ? -1 : -2, UITER_CURRENT
);
3677 if(bufferIndex
+(c2
==0 ? 1 : 2)<=bufferCapacity
||
3678 /* attempt to grow the buffer */
3679 u_growBufferFromStatic(stackBuffer
, &buffer
, &bufferCapacity
,
3683 buffer
[bufferIndex
++]=c
;
3685 buffer
[bufferIndex
++]=c2
;
3688 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
3689 src
.move(&src
, 0, UITER_LIMIT
);
3695 /* return the length of the buffer contents */
/*
 * Forward iteration step: read the text after the iterator position up to
 * the next boundary and write it, optionally normalized, to dest.
 * - validates pErrorCode, destCapacity/dest and the remaining arguments;
 *   a violation sets U_ILLEGAL_ARGUMENT_ERROR
 * - *pNeededToNormalize, if the pointer is non-NULL, is first set to FALSE
 * - mode selects the boundary test (_isNextNFDSafe or _isNextTrueStarter)
 *   together with minC and mask
 * - one branch just reads a single code point forward (combining a
 *   lead/trail surrogate pair) and terminates dest
 *   (NOTE(review): the mode selecting it is not visible in this listing)
 * - otherwise the text is collected by _findNextIterationBoundary() and
 *   either normalized with unorm_internalNormalize() or copied as is
 * - after normalizing, *pNeededToNormalize reports whether the output
 *   differs from the collected input in length or content
 */
3699 U_CAPI
int32_t U_EXPORT2
3700 unorm_next(UCharIterator
*src
,
3701 UChar
*dest
, int32_t destCapacity
,
3702 UNormalizationMode mode
, int32_t options
,
3703 UBool doNormalize
, UBool
*pNeededToNormalize
,
3704 UErrorCode
*pErrorCode
) {
3705 UChar stackBuffer
[100];
3707 IsNextBoundaryFn
*isNextBoundary
;
3709 int32_t bufferLength
, bufferCapacity
, destLength
;
3713 /* check argument values */
3714 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
3718 if( destCapacity
<0 || (dest
==NULL
&& destCapacity
>0) ||
3721 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
3725 if(!_haveData(*pErrorCode
)) {
3729 if(pNeededToNormalize
!=NULL
) {
3730 *pNeededToNormalize
=FALSE
;
3736 isNextBoundary
=_isNextNFDSafe
;
3737 minC
=_NORM_MIN_WITH_LEAD_CC
;
3738 mask
=_NORM_CC_MASK
|_NORM_QC_NFD
;
3741 isNextBoundary
=_isNextNFDSafe
;
3742 minC
=_NORM_MIN_WITH_LEAD_CC
;
3743 mask
=_NORM_CC_MASK
|_NORM_QC_NFKD
;
3746 isNextBoundary
=_isNextTrueStarter
;
3747 minC
=(UChar
)indexes
[_NORM_INDEX_MIN_NFC_NO_MAYBE
];
3748 mask
=_NORM_CC_MASK
|_NORM_QC_NFC
;
3751 isNextBoundary
=_isNextTrueStarter
;
3752 minC
=(UChar
)indexes
[_NORM_INDEX_MIN_NFKC_NO_MAYBE
];
3753 mask
=_NORM_CC_MASK
|_NORM_QC_NFKC
;
3757 if((c
=src
->next(src
))>=0) {
3759 if(UTF_IS_LEAD(c
) && (c2
=src
->next(src
))>=0) {
3760 if(UTF_IS_TRAIL(c2
)) {
3761 if(destCapacity
>=2) {
3762 dest
[1]=(UChar
)c2
; /* trail surrogate */
3765 /* lead surrogate to be written below */
3767 src
->move(src
, -1, UITER_CURRENT
);
3771 if(destCapacity
>0) {
3775 return u_terminateUChars(dest
, destCapacity
, destLength
, pErrorCode
);
3777 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
3782 bufferCapacity
=(int32_t)(sizeof(stackBuffer
)/U_SIZEOF_UCHAR
);
3783 bufferLength
=_findNextIterationBoundary(*src
,
3784 isNextBoundary
, minC
, mask
,
3785 buffer
, bufferCapacity
,
3787 if(bufferLength
>0) {
3789 destLength
=unorm_internalNormalize(dest
, destCapacity
,
3790 buffer
, bufferLength
,
3793 if(pNeededToNormalize
!=0 && U_SUCCESS(*pErrorCode
)) {
3794 *pNeededToNormalize
=
3795 (UBool
)(destLength
!=bufferLength
||
3796 0!=uprv_memcmp(dest
, buffer
, destLength
*U_SIZEOF_UCHAR
));
3799 /* just copy the source characters */
3800 if(destCapacity
>0) {
3801 uprv_memcpy(dest
, buffer
, uprv_min(bufferLength
, destCapacity
)*U_SIZEOF_UCHAR
);
3803 destLength
=u_terminateUChars(dest
, destCapacity
, bufferLength
, pErrorCode
);
3806 destLength
=u_terminateUChars(dest
, destCapacity
, 0, pErrorCode
);
3810 if(buffer
!=stackBuffer
) {
3818 * ### TODO: check if NF*D and FCD iteration finds optimal boundaries
3819 * and if not, how hard it would be to improve it.
3820 * For example, see _findSafeFCD().
3823 /* Concatenation of normalized strings -------------------------------------- */
/*
 * Concatenate two strings so that the result is normalized across the seam.
 * - validates pErrorCode, dest/destCapacity, left/leftLength and
 *   right/rightLength; right must not overlap dest, while left==dest is
 *   allowed; a violation sets U_ILLEGAL_ARGUMENT_ERROR
 * - unorm_previous() on left and unorm_next() on right find the boundaries
 *   around the seam and collect the text between them in an intermediate
 *   buffer; on U_BUFFER_OVERFLOW_ERROR the buffer is grown and the piece is
 *   copied directly from the source string
 * - dest receives left[0..leftBoundary[, then the normalized buffer, then
 *   right[rightBoundary..rightLength[; copies are limited to destCapacity
 *   while destLength keeps counting the full length
 * - the final length goes through u_terminateUChars()
 */
3825 U_CAPI
int32_t U_EXPORT2
3826 unorm_concatenate(const UChar
*left
, int32_t leftLength
,
3827 const UChar
*right
, int32_t rightLength
,
3828 UChar
*dest
, int32_t destCapacity
,
3829 UNormalizationMode mode
, int32_t options
,
3830 UErrorCode
*pErrorCode
) {
3831 UChar stackBuffer
[100];
3833 int32_t bufferLength
, bufferCapacity
;
3836 int32_t leftBoundary
, rightBoundary
, destLength
;
3838 /* check argument values */
3839 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
3843 if( destCapacity
<0 || (dest
==NULL
&& destCapacity
>0) ||
3844 left
==NULL
|| leftLength
<-1 ||
3845 right
==NULL
|| rightLength
<-1
3847 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
3851 /* check for overlapping right and destination */
3853 ((right
>=dest
&& right
<(dest
+destCapacity
)) ||
3854 (rightLength
>0 && dest
>=right
&& dest
<(right
+rightLength
)))
3856 *pErrorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
3860 /* allow left==dest */
3862 /* set up intermediate buffer */
3864 bufferCapacity
=(int32_t)(sizeof(stackBuffer
)/U_SIZEOF_UCHAR
);
3867 * Input: left[0..leftLength[ + right[0..rightLength[
3869 * Find normalization-safe boundaries leftBoundary and rightBoundary
3870 * and copy the end parts together:
3871 * buffer=left[leftBoundary..leftLength[ + right[0..rightBoundary[
3873 * dest=left[0..leftBoundary[ +
3874 * normalize(buffer) +
3875 * right[rightBoundary..rightLength[
3879 * find a normalization boundary at the end of the left string
3880 * and copy the end part into the buffer
3882 uiter_setString(&iter
, left
, leftLength
);
3883 iter
.index
=leftLength
=iter
.length
; /* end of left string */
3885 bufferLength
=unorm_previous(&iter
, buffer
, bufferCapacity
,
3889 leftBoundary
=iter
.index
;
3890 if(*pErrorCode
==U_BUFFER_OVERFLOW_ERROR
) {
3891 *pErrorCode
=U_ZERO_ERROR
;
3892 if(!u_growBufferFromStatic(stackBuffer
, &buffer
, &bufferCapacity
, 2*bufferLength
, 0)) {
3893 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
3894 /* do not need to clean up here since
3895 * u_growBufferFromStatic frees buffer if(buffer!=stackBuffer)
3900 /* just copy from the left string: we know the boundary already */
3901 uprv_memcpy(buffer
, left
+leftBoundary
, bufferLength
*U_SIZEOF_UCHAR
);
3905 * find a normalization boundary at the beginning of the right string
3906 * and concatenate the beginning part to the buffer
3908 uiter_setString(&iter
, right
, rightLength
);
3909 rightLength
=iter
.length
; /* in case it was -1 */
3911 rightBoundary
=unorm_next(&iter
, buffer
+bufferLength
, bufferCapacity
-bufferLength
,
3915 if(*pErrorCode
==U_BUFFER_OVERFLOW_ERROR
) {
3916 *pErrorCode
=U_ZERO_ERROR
;
3917 if(!u_growBufferFromStatic(stackBuffer
, &buffer
, &bufferCapacity
, bufferLength
+rightBoundary
, 0)) {
3918 *pErrorCode
=U_MEMORY_ALLOCATION_ERROR
;
3919 /* do not need to clean up here since
3920 * u_growBufferFromStatic frees buffer if(buffer!=stackBuffer)
3925 /* just copy from the right string: we know the boundary already */
3926 uprv_memcpy(buffer
+bufferLength
, right
, rightBoundary
*U_SIZEOF_UCHAR
);
3929 bufferLength
+=rightBoundary
;
3931 /* copy left[0..leftBoundary[ to dest */
3932 if(left
!=dest
&& leftBoundary
>0 && destCapacity
>0) {
3933 uprv_memcpy(dest
, left
, uprv_min(leftBoundary
, destCapacity
)*U_SIZEOF_UCHAR
);
3935 destLength
=leftBoundary
;
3937 /* concatenate the normalization of the buffer to dest */
3938 if(destCapacity
>destLength
) {
3939 destLength
+=unorm_internalNormalize(dest
+destLength
, destCapacity
-destLength
,
3940 buffer
, bufferLength
,
3944 destLength
+=unorm_internalNormalize(NULL
, 0,
3945 buffer
, bufferLength
,
3950 * only errorCode that is expected is a U_BUFFER_OVERFLOW_ERROR
3951 * so we dont check for the error code here..just let it pass through
3953 /* concatenate right[rightBoundary..rightLength[ to dest */
3954 right
+=rightBoundary
;
3955 rightLength
-=rightBoundary
;
3956 if(rightLength
>0 && destCapacity
>destLength
) {
3957 uprv_memcpy(dest
+destLength
, right
, uprv_min(rightLength
, destCapacity
-destLength
)*U_SIZEOF_UCHAR
);
3959 destLength
+=rightLength
;
3962 if(buffer
!=stackBuffer
) {
3966 return u_terminateUChars(dest
, destCapacity
, destLength
, pErrorCode
);
3969 /* data swapping ------------------------------------------------------------ */
3971 U_CAPI
int32_t U_EXPORT2
3972 unorm_swap(const UDataSwapper
*ds
,
3973 const void *inData
, int32_t length
, void *outData
,
3974 UErrorCode
*pErrorCode
) {
3975 const UDataInfo
*pInfo
;
3978 const uint8_t *inBytes
;
3981 const int32_t *inIndexes
;
3982 int32_t indexes
[32];
3984 int32_t i
, offset
, count
, size
;
3986 /* udata_swapDataHeader checks the arguments */
3987 headerSize
=udata_swapDataHeader(ds
, inData
, length
, outData
, pErrorCode
);
3988 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
3992 /* check data format and format version */
3993 pInfo
=(const UDataInfo
*)((const char *)inData
+4);
3995 pInfo
->dataFormat
[0]==0x4e && /* dataFormat="Norm" */
3996 pInfo
->dataFormat
[1]==0x6f &&
3997 pInfo
->dataFormat
[2]==0x72 &&
3998 pInfo
->dataFormat
[3]==0x6d &&
3999 pInfo
->formatVersion
[0]==2
4001 udata_printError(ds
, "unorm_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unorm.icu\n",
4002 pInfo
->dataFormat
[0], pInfo
->dataFormat
[1],
4003 pInfo
->dataFormat
[2], pInfo
->dataFormat
[3],
4004 pInfo
->formatVersion
[0]);
4005 *pErrorCode
=U_UNSUPPORTED_ERROR
;
4009 inBytes
=(const uint8_t *)inData
+headerSize
;
4010 outBytes
=(uint8_t *)outData
+headerSize
;
4012 inIndexes
=(const int32_t *)inBytes
;
4017 udata_printError(ds
, "unorm_swap(): too few bytes (%d after header) for unorm.icu\n",
4019 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
4024 /* read the first 32 indexes (ICU 2.8/format version 2.2: _NORM_INDEX_TOP==32, might grow) */
4025 for(i
=0; i
<32; ++i
) {
4026 indexes
[i
]=udata_readInt32(ds
, inIndexes
[i
]);
4029 /* calculate the total length of the data */
4031 32*4+ /* size of indexes[] */
4032 indexes
[_NORM_INDEX_TRIE_SIZE
]+
4033 indexes
[_NORM_INDEX_UCHAR_COUNT
]*2+
4034 indexes
[_NORM_INDEX_COMBINE_DATA_COUNT
]*2+
4035 indexes
[_NORM_INDEX_FCD_TRIE_SIZE
]+
4036 indexes
[_NORM_INDEX_AUX_TRIE_SIZE
]+
4037 indexes
[_NORM_INDEX_CANON_SET_COUNT
]*2;
4041 udata_printError(ds
, "unorm_swap(): too few bytes (%d after header) for all of unorm.icu\n",
4043 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
4047 /* copy the data for inaccessible bytes */
4048 if(inBytes
!=outBytes
) {
4049 uprv_memcpy(outBytes
, inBytes
, size
);
4054 /* swap the indexes[] */
4056 ds
->swapArray32(ds
, inBytes
, count
, outBytes
, pErrorCode
);
4059 /* swap the main UTrie */
4060 count
=indexes
[_NORM_INDEX_TRIE_SIZE
];
4061 utrie_swap(ds
, inBytes
+offset
, count
, outBytes
+offset
, pErrorCode
);
4064 /* swap the uint16_t extraData[] and the uint16_t combiningTable[] */
4065 count
=(indexes
[_NORM_INDEX_UCHAR_COUNT
]+indexes
[_NORM_INDEX_COMBINE_DATA_COUNT
])*2;
4066 ds
->swapArray16(ds
, inBytes
+offset
, count
, outBytes
+offset
, pErrorCode
);
4069 /* swap the FCD UTrie */
4070 count
=indexes
[_NORM_INDEX_FCD_TRIE_SIZE
];
4072 utrie_swap(ds
, inBytes
+offset
, count
, outBytes
+offset
, pErrorCode
);
4076 /* swap the aux UTrie */
4077 count
=indexes
[_NORM_INDEX_AUX_TRIE_SIZE
];
4079 utrie_swap(ds
, inBytes
+offset
, count
, outBytes
+offset
, pErrorCode
);
4083 /* swap the uint16_t combiningTable[] */
4084 count
=indexes
[_NORM_INDEX_CANON_SET_COUNT
]*2;
4085 ds
->swapArray16(ds
, inBytes
+offset
, count
, outBytes
+offset
, pErrorCode
);
4089 return headerSize
+size
;
4092 #endif /* #if !UCONFIG_NO_NORMALIZATION */