]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/unorm.cpp
ICU-8.11.1.tar.gz
[apple/icu.git] / icuSources / common / unorm.cpp
CommitLineData
b75a7d8f
A
1/*
2******************************************************************************
73c04bcf 3* Copyright (c) 1996-2006, International Business Machines
b75a7d8f
A
4* Corporation and others. All Rights Reserved.
5******************************************************************************
6* File unorm.cpp
7*
8* Created by: Vladimir Weinstein 12052000
9*
10* Modification history :
11*
12* Date Name Description
13* 02/01/01 synwee Added normalization quickcheck enum and method.
14* 02/12/01 synwee Commented out quickcheck util api has been approved
15* Added private method for doing FCD checks
16* 02/23/01 synwee Modified quickcheck and checkFCE to run through
17* string for codepoints < 0x300 for the normalization
18* mode NFC.
19* 05/25/01+ Markus Scherer total rewrite, implement all normalization here
20* instead of just wrappers around normlzr.cpp,
21* load unorm.dat, support Unicode 3.1 with
22* supplementary code points, etc.
23*/
24
25#include "unicode/utypes.h"
26
b75a7d8f
A
27#if !UCONFIG_NO_NORMALIZATION
28
29#include "unicode/udata.h"
30#include "unicode/uchar.h"
374ca955 31#include "unicode/ustring.h"
b75a7d8f
A
32#include "unicode/uiter.h"
33#include "unicode/uniset.h"
34#include "unicode/usetiter.h"
35#include "unicode/unorm.h"
374ca955
A
36#include "ucln_cmn.h"
37#include "unormimp.h"
38#include "ucase.h"
b75a7d8f
A
39#include "cmemory.h"
40#include "umutex.h"
41#include "utrie.h"
42#include "unicode/uset.h"
374ca955
A
43#include "udataswp.h"
44#include "putilimp.h"
b75a7d8f
A
45
46/*
47 * Status of tailored normalization
48 *
49 * This was done initially for investigation on Unicode public review issue 7
50 * (http://www.unicode.org/review/). See Jitterbug 2481.
51 * While the UTC at meeting #94 (2003mar) did not take up the issue, this is
52 * a permanent feature in ICU 2.6 in support of IDNA which requires true
53 * Unicode 3.2 normalization.
54 * (NormalizationCorrections are rolled into IDNA mapping tables.)
55 *
56 * Tailored normalization as implemented here makes it possible to "normalize less"
57 * than full Unicode normalization would.
58 * Based internally on a UnicodeSet of code points that are
59 * "excluded from normalization", the normalization functions leave those
60 * code points alone ("inert"). This means that tailored normalization
61 * still transforms text into a canonically equivalent form.
62 * It does not add decompositions to code points that do not have any or
63 * change decomposition results.
64 *
65 * Any function that searches for a safe boundary has not been touched,
66 * which means that these functions will be over-pessimistic when
67 * exclusions are applied.
68 * This should not matter because subsequent checks and normalizations
69 * do apply the exclusions; only a little more of the text may be processed
70 * than necessary under exclusions.
71 *
72 * Normalization exclusions have the following effect on excluded code points c:
73 * - c is not decomposed
74 * - c is not a composition target
75 * - c does not combine forward or backward for composition
76 * except that this is not implemented for Jamo
77 * - c is treated as having a combining class of 0
78 */
/*
 * Number of elements in a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the cast cannot bind to
 * anything other than the element count when used inside a larger expression.
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
b75a7d8f
A
80
81/*
82 * This new implementation of the normalization code loads its data from
83 * unorm.dat, which is generated with the gennorm tool.
84 * The format of that file is described in unormimp.h .
85 */
86
87/* -------------------------------------------------------------------------- */
88
/* File-local sizing constant; presumably the capacity of on-stack buffers used later in this file - confirm. */
enum { _STACK_BUFFER_CAPACITY=100 };
92
93/*
94 * Constants for the bit fields in the options bit set parameter.
95 * These need not be public.
96 * A user only needs to know the currently assigned values.
97 * The number and positions of reserved bits per field can remain private
98 * and may change in future implementations.
99 */
enum {
    /* bit position of the Unicode-version field inside the options word */
    _NORM_OPTIONS_UNICODE_SHIFT=5,

    /* exclusion-set bits below the Unicode-version field: 0x1f */
    _NORM_OPTIONS_NX_MASK=(1<<_NORM_OPTIONS_UNICODE_SHIFT)-1,
    /* two-bit Unicode-version field: 0x60 */
    _NORM_OPTIONS_UNICODE_MASK=3<<_NORM_OPTIONS_UNICODE_SHIFT,
    /* all bits that select an exclusion set: 0x7f */
    _NORM_OPTIONS_SETS_MASK=_NORM_OPTIONS_NX_MASK|_NORM_OPTIONS_UNICODE_MASK,

    /*
     * The following options are used only in some composition functions.
     * They use bits 12 and up to preserve lower bits for the available options
     * space in unorm_compare() -
     * see documentation for UNORM_COMPARE_NORM_OPTIONS_SHIFT.
     */

    /** Options bit 12, for compatibility vs. canonical decomposition. */
    _NORM_OPTIONS_COMPAT=1<<12,
    /** Options bit 13, no discontiguous composition (FCC vs. NFC). */
    _NORM_OPTIONS_COMPOSE_CONTIGUOUS=1<<13
};
119
73c04bcf 120U_CDECL_BEGIN
b75a7d8f
A
121static inline UBool
122isHangulWithoutJamoT(UChar c) {
123 c-=HANGUL_BASE;
124 return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
125}
126
127/* norm32 helpers */
128
129/* is this a norm32 with a regular index? */
130static inline UBool
131isNorm32Regular(uint32_t norm32) {
132 return norm32<_NORM_MIN_SPECIAL;
133}
134
135/* is this a norm32 with a special index for a lead surrogate? */
136static inline UBool
137isNorm32LeadSurrogate(uint32_t norm32) {
138 return _NORM_MIN_SPECIAL<=norm32 && norm32<_NORM_SURROGATES_TOP;
139}
140
141/* is this a norm32 with a special index for a Hangul syllable or a Jamo? */
142static inline UBool
143isNorm32HangulOrJamo(uint32_t norm32) {
144 return norm32>=_NORM_MIN_HANGUL;
145}
146
147/*
148 * Given isNorm32HangulOrJamo(),
149 * is this a Hangul syllable or a Jamo?
150 */
73c04bcf 151/*static inline UBool
b75a7d8f
A
152isHangulJamoNorm32HangulOrJamoL(uint32_t norm32) {
153 return norm32<_NORM_MIN_JAMO_V;
73c04bcf 154}*/
b75a7d8f
A
155
156/*
157 * Given norm32 for Jamo V or T,
158 * is this a Jamo V?
159 */
160static inline UBool
161isJamoVTNorm32JamoV(uint32_t norm32) {
162 return norm32<_NORM_JAMO_V_TOP;
163}
164
b75a7d8f
A
165/* load unorm.dat ----------------------------------------------------------- */
166
73c04bcf
A
167/* normTrie: 32-bit trie result may contain a special extraData index with the folding offset */
168static int32_t U_CALLCONV
169getFoldingNormOffset(uint32_t norm32) {
170 if(isNorm32LeadSurrogate(norm32)) {
171 return
172 UTRIE_BMP_INDEX_LENGTH+
173 (((int32_t)norm32>>(_NORM_EXTRA_SHIFT-UTRIE_SURROGATE_BLOCK_BITS))&
174 (0x3ff<<UTRIE_SURROGATE_BLOCK_BITS));
175 } else {
176 return 0;
177 }
178}
179
180/* auxTrie: the folding offset is in bits 9..0 of the 16-bit trie result */
181static int32_t U_CALLCONV
182getFoldingAuxOffset(uint32_t data) {
183 return (int32_t)(data&_NORM_AUX_FNC_MASK)<<UTRIE_SURROGATE_BLOCK_BITS;
184}
185U_CDECL_END
186
187#define UNORM_HARDCODE_DATA 1
188
189#if UNORM_HARDCODE_DATA
190
191/* unorm_props_data.c is machine-generated by gennorm --csource */
192#include "unorm_props_data.c"
193
194static const UBool formatVersion_2_2=TRUE;
195
196#else
197
b75a7d8f
A
198#define DATA_NAME "unorm"
199#define DATA_TYPE "icu"
200
201static UDataMemory *normData=NULL;
202static UErrorCode dataErrorCode=U_ZERO_ERROR;
203static int8_t haveNormData=0;
204
205static int32_t indexes[_NORM_INDEX_TOP]={ 0 };
206static UTrie normTrie={ 0,0,0,0,0,0,0 }, fcdTrie={ 0,0,0,0,0,0,0 }, auxTrie={ 0,0,0,0,0,0,0 };
207
208/*
209 * pointers into the memory-mapped unorm.icu
210 */
211static const uint16_t *extraData=NULL,
212 *combiningTable=NULL,
213 *canonStartSets=NULL;
214
215static uint8_t formatVersion[4]={ 0, 0, 0, 0 };
216static UBool formatVersion_2_1=FALSE, formatVersion_2_2=FALSE;
217
218/* the Unicode version of the normalization data */
219static UVersionInfo dataVersion={ 0, 0, 0, 0 };
220
73c04bcf
A
221#endif
222
b75a7d8f
A
223/* cache UnicodeSets for each combination of exclusion flags */
224static UnicodeSet *nxCache[_NORM_OPTIONS_SETS_MASK+1]={ NULL };
225
226U_CDECL_BEGIN
227
374ca955 228static UBool U_CALLCONV
73c04bcf 229unorm_cleanup(void) {
b75a7d8f
A
230 int32_t i;
231
73c04bcf 232#if !UNORM_HARDCODE_DATA
b75a7d8f
A
233 if(normData!=NULL) {
234 udata_close(normData);
235 normData=NULL;
236 }
237 dataErrorCode=U_ZERO_ERROR;
238 haveNormData=0;
73c04bcf 239#endif
b75a7d8f
A
240
241 for(i=0; i<(int32_t)LENGTHOF(nxCache); ++i) {
73c04bcf
A
242 if (nxCache[i]) {
243 delete nxCache[i];
244 nxCache[i] = 0;
245 }
b75a7d8f 246 }
b75a7d8f
A
247
248 return TRUE;
249}
250
73c04bcf 251#if !UNORM_HARDCODE_DATA
b75a7d8f
A
252
253static UBool U_CALLCONV
254isAcceptable(void * /* context */,
255 const char * /* type */, const char * /* name */,
256 const UDataInfo *pInfo) {
257 if(
258 pInfo->size>=20 &&
259 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
260 pInfo->charsetFamily==U_CHARSET_FAMILY &&
261 pInfo->dataFormat[0]==0x4e && /* dataFormat="Norm" */
262 pInfo->dataFormat[1]==0x6f &&
263 pInfo->dataFormat[2]==0x72 &&
264 pInfo->dataFormat[3]==0x6d &&
265 pInfo->formatVersion[0]==2 &&
266 pInfo->formatVersion[2]==UTRIE_SHIFT &&
267 pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
268 ) {
269 uprv_memcpy(formatVersion, pInfo->formatVersion, 4);
270 uprv_memcpy(dataVersion, pInfo->dataVersion, 4);
271 return TRUE;
272 } else {
273 return FALSE;
274 }
275}
276
73c04bcf
A
277#endif
278
b75a7d8f
A
279static UBool U_CALLCONV
280_enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*limit*/, uint32_t /*value*/) {
281 /* add the start code point to the USet */
73c04bcf 282 const USetAdder *sa=(const USetAdder *)context;
374ca955 283 sa->add(sa->set, start);
b75a7d8f
A
284 return TRUE;
285}
286
287U_CDECL_END
288
73c04bcf
A
289#if !UNORM_HARDCODE_DATA
290
b75a7d8f
A
291static int8_t
292loadNormData(UErrorCode &errorCode) {
293 /* load Unicode normalization data from file */
294
295 /*
296 * This lazy intialization with double-checked locking (without mutex protection for
297 * haveNormData==0) is transiently unsafe under certain circumstances.
298 * Check the readme and use u_init() if necessary.
299 *
300 * While u_init() initializes the main normalization data via this functions,
301 * it does not do so for exclusion sets (which are fully mutexed).
302 * This is because
303 * - there can be many exclusion sets
304 * - they are rarely used
305 * - they are not usually used in execution paths that are
306 * as performance-sensitive as others
307 * (e.g., IDNA takes more time than unorm_quickCheck() anyway)
308 */
309 if(haveNormData==0) {
310 UTrie _normTrie={ 0,0,0,0,0,0,0 }, _fcdTrie={ 0,0,0,0,0,0,0 }, _auxTrie={ 0,0,0,0,0,0,0 };
311 UDataMemory *data;
73c04bcf 312
b75a7d8f
A
313 const int32_t *p=NULL;
314 const uint8_t *pb;
315
316 if(&errorCode==NULL || U_FAILURE(errorCode)) {
317 return 0;
318 }
319
320 /* open the data outside the mutex block */
321 data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &errorCode);
322 dataErrorCode=errorCode;
323 if(U_FAILURE(errorCode)) {
324 return haveNormData=-1;
325 }
326
327 p=(const int32_t *)udata_getMemory(data);
328 pb=(const uint8_t *)(p+_NORM_INDEX_TOP);
329 utrie_unserialize(&_normTrie, pb, p[_NORM_INDEX_TRIE_SIZE], &errorCode);
330 _normTrie.getFoldingOffset=getFoldingNormOffset;
331
332 pb+=p[_NORM_INDEX_TRIE_SIZE]+p[_NORM_INDEX_UCHAR_COUNT]*2+p[_NORM_INDEX_COMBINE_DATA_COUNT]*2;
b75a7d8f 333 if(p[_NORM_INDEX_FCD_TRIE_SIZE]!=0) {
73c04bcf
A
334 utrie_unserialize(&_fcdTrie, pb, p[_NORM_INDEX_FCD_TRIE_SIZE], &errorCode);
335 }
336 pb+=p[_NORM_INDEX_FCD_TRIE_SIZE];
337
338 if(p[_NORM_INDEX_AUX_TRIE_SIZE]!=0) {
b75a7d8f
A
339 utrie_unserialize(&_auxTrie, pb, p[_NORM_INDEX_AUX_TRIE_SIZE], &errorCode);
340 _auxTrie.getFoldingOffset=getFoldingAuxOffset;
341 }
342
343 if(U_FAILURE(errorCode)) {
344 dataErrorCode=errorCode;
345 udata_close(data);
346 return haveNormData=-1;
347 }
348
349 /* in the mutex block, set the data for this process */
350 umtx_lock(NULL);
351 if(normData==NULL) {
352 normData=data;
353 data=NULL;
354
355 uprv_memcpy(&indexes, p, sizeof(indexes));
356 uprv_memcpy(&normTrie, &_normTrie, sizeof(UTrie));
357 uprv_memcpy(&fcdTrie, &_fcdTrie, sizeof(UTrie));
358 uprv_memcpy(&auxTrie, &_auxTrie, sizeof(UTrie));
359 } else {
360 p=(const int32_t *)udata_getMemory(normData);
361 }
b75a7d8f
A
362
363 /* initialize some variables */
364 extraData=(uint16_t *)((uint8_t *)(p+_NORM_INDEX_TOP)+indexes[_NORM_INDEX_TRIE_SIZE]);
365 combiningTable=extraData+indexes[_NORM_INDEX_UCHAR_COUNT];
366 formatVersion_2_1=formatVersion[0]>2 || (formatVersion[0]==2 && formatVersion[1]>=1);
367 formatVersion_2_2=formatVersion[0]>2 || (formatVersion[0]==2 && formatVersion[1]>=2);
368 if(formatVersion_2_1) {
369 canonStartSets=combiningTable+
370 indexes[_NORM_INDEX_COMBINE_DATA_COUNT]+
371 (indexes[_NORM_INDEX_FCD_TRIE_SIZE]+indexes[_NORM_INDEX_AUX_TRIE_SIZE])/2;
372 }
373 haveNormData=1;
374ca955
A
374 ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
375 umtx_unlock(NULL);
b75a7d8f
A
376
377 /* if a different thread set it first, then close the extra data */
378 if(data!=NULL) {
379 udata_close(data); /* NULL if it was set correctly */
380 }
381 }
382
383 return haveNormData;
384}
385
73c04bcf
A
386#endif
387
b75a7d8f
A
388static inline UBool
389_haveData(UErrorCode &errorCode) {
73c04bcf
A
390#if UNORM_HARDCODE_DATA
391 return U_SUCCESS(errorCode);
392#else
393 if(U_FAILURE(errorCode)) {
394 return FALSE;
395 } else if(haveNormData>0) {
396 return TRUE;
397 } else if(haveNormData<0) {
b75a7d8f 398 errorCode=dataErrorCode;
73c04bcf
A
399 return FALSE;
400 } else /* haveNormData==0 */ {
b75a7d8f
A
401 return (UBool)(loadNormData(errorCode)>0);
402 }
73c04bcf 403#endif
b75a7d8f
A
404}
405
406U_CAPI UBool U_EXPORT2
407unorm_haveData(UErrorCode *pErrorCode) {
408 return _haveData(*pErrorCode);
409}
410
411U_CAPI const uint16_t * U_EXPORT2
412unorm_getFCDTrie(UErrorCode *pErrorCode) {
413 if(_haveData(*pErrorCode)) {
414 return fcdTrie.index;
415 } else {
416 return NULL;
417 }
418}
419
420/* data access primitives --------------------------------------------------- */
421
422static inline uint32_t
423_getNorm32(UChar c) {
424 return UTRIE_GET32_FROM_LEAD(&normTrie, c);
425}
426
427static inline uint32_t
428_getNorm32FromSurrogatePair(uint32_t norm32, UChar c2) {
429 /*
430 * the surrogate index in norm32 stores only the number of the surrogate index block
431 * see gennorm/store.c/getFoldedNormValue()
432 */
433 norm32=
434 UTRIE_BMP_INDEX_LENGTH+
435 ((norm32>>(_NORM_EXTRA_SHIFT-UTRIE_SURROGATE_BLOCK_BITS))&
436 (0x3ff<<UTRIE_SURROGATE_BLOCK_BITS));
437 return UTRIE_GET32_FROM_OFFSET_TRAIL(&normTrie, norm32, c2);
438}
439
440/*
441 * get a norm32 from text with complete code points
442 * (like from decompositions)
443 */
444static inline uint32_t
445_getNorm32(const UChar *p, uint32_t mask) {
446 uint32_t norm32=_getNorm32(*p);
447 if((norm32&mask) && isNorm32LeadSurrogate(norm32)) {
448 /* *p is a lead surrogate, get the real norm32 */
449 norm32=_getNorm32FromSurrogatePair(norm32, *(p+1));
450 }
451 return norm32;
452}
453
454static inline uint16_t
455_getFCD16(UChar c) {
456 return UTRIE_GET16_FROM_LEAD(&fcdTrie, c);
457}
458
459static inline uint16_t
460_getFCD16FromSurrogatePair(uint16_t fcd16, UChar c2) {
461 /* the surrogate index in fcd16 is an absolute offset over the start of stage 1 */
462 return UTRIE_GET16_FROM_OFFSET_TRAIL(&fcdTrie, fcd16, c2);
463}
464
465static inline const uint16_t *
466_getExtraData(uint32_t norm32) {
467 return extraData+(norm32>>_NORM_EXTRA_SHIFT);
468}
469
73c04bcf
A
470#if 0
471/*
472 * It is possible to get the FCD data from the main trie if unorm.icu
473 * was built without the FCD trie, although it is slower.
474 * This is not implemented because it is hard to test, and because it seems
475 * unusual to want to use FCD and not build the data file for it.
476 *
477 * Untested sample code:
478 */
479static inline uint16_t
480_getFCD16FromNormData(UChar32 c) {
481 uint32_t norm32, fcd;
482
483 norm32=_getNorm32(c);
484 if((norm32&_NORM_QC_NFD) && isNorm32Regular(norm32)) {
485 /* get the lead/trail cc from the decomposition data */
486 const uint16_t *nfd=_getExtraData(norm32);
487 if(*nfd&_NORM_DECOMP_FLAG_LENGTH_HAS_CC) {
488 fcd=nfd[1];
489 }
490 } else {
491 fcd=norm32&_NORM_CC_MASK;
492 if(fcd!=0) {
493 /* use the code point cc value for both lead and trail cc's */
494 fcd|=fcd>>_NORM_CC_SHIFT; /* assume that the cc is in bits 15..8 */
495 }
496 }
497
498 return (uint16_t)fcd;
499}
500#endif
501
b75a7d8f
A
502/* normalization exclusion sets --------------------------------------------- */
503
504/*
505 * Normalization exclusion UnicodeSets are used for tailored normalization;
506 * see the comment near the beginning of this file.
507 *
508 * By specifying one or several sets of code points,
509 * those code points become inert for normalization.
510 */
511
512static const UnicodeSet *
513internalGetNXHangul(UErrorCode &errorCode) {
514 /* internal function, does not check for incoming U_FAILURE */
b75a7d8f
A
515 UBool isCached;
516
374ca955 517 UMTX_CHECK(NULL, (UBool)(nxCache[UNORM_NX_HANGUL]!=NULL), isCached);
b75a7d8f
A
518
519 if(!isCached) {
520 UnicodeSet *set=new UnicodeSet(0xac00, 0xd7a3);
521 if(set==NULL) {
522 errorCode=U_MEMORY_ALLOCATION_ERROR;
523 return NULL;
524 }
525
526 umtx_lock(NULL);
527 if(nxCache[UNORM_NX_HANGUL]==NULL) {
528 nxCache[UNORM_NX_HANGUL]=set;
529 set=NULL;
73c04bcf 530 ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
b75a7d8f
A
531 }
532 umtx_unlock(NULL);
533
534 delete set;
535 }
536
537 return nxCache[UNORM_NX_HANGUL];
538}
539
374ca955 540/* unorm.cpp 1.116 had and used
b75a7d8f 541static const UnicodeSet *
374ca955
A
542internalGetNXFromPattern(int32_t options, const char *pattern, UErrorCode &errorCode) {
543 ...
b75a7d8f 544}
374ca955 545*/
b75a7d8f 546
374ca955 547/* get and set an exclusion set from a serialized UnicodeSet */
b75a7d8f 548static const UnicodeSet *
374ca955 549internalGetSerializedNX(int32_t options, int32_t nxIndex, UErrorCode &errorCode) {
b75a7d8f 550 /* internal function, does not check for incoming U_FAILURE */
b75a7d8f
A
551 UBool isCached;
552
374ca955 553 UMTX_CHECK(NULL, (UBool)(nxCache[options]!=NULL), isCached);
b75a7d8f 554
374ca955
A
555 if( !isCached &&
556 canonStartSets!=NULL &&
557 canonStartSets[nxIndex]!=0 && canonStartSets[nxIndex+1]>canonStartSets[nxIndex]
558 ) {
559 USerializedSet sset;
b75a7d8f 560 UnicodeSet *set;
374ca955
A
561 UChar32 start, end;
562 int32_t i;
b75a7d8f 563
374ca955
A
564 if( !uset_getSerializedSet(
565 &sset,
566 canonStartSets+canonStartSets[nxIndex],
567 canonStartSets[nxIndex+1]-canonStartSets[nxIndex])
568 ) {
569 errorCode=U_INVALID_FORMAT_ERROR;
b75a7d8f
A
570 return NULL;
571 }
572
374ca955
A
573 /* turn the serialized set into a UnicodeSet */
574 set=new UnicodeSet();
b75a7d8f
A
575 if(set==NULL) {
576 errorCode=U_MEMORY_ALLOCATION_ERROR;
577 return NULL;
578 }
374ca955
A
579 for(i=0; uset_getSerializedRange(&sset, i, &start, &end); ++i) {
580 set->add(start, end);
b75a7d8f
A
581 }
582
583 umtx_lock(NULL);
584 if(nxCache[options]==NULL) {
585 nxCache[options]=set;
586 set=NULL;
73c04bcf 587 ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
b75a7d8f
A
588 }
589 umtx_unlock(NULL);
590
591 delete set;
592 }
593
594 return nxCache[options];
595}
596
374ca955
A
597static const UnicodeSet *
598internalGetNXCJKCompat(UErrorCode &errorCode) {
599 /* build a set from [[:Ideographic:]&[:NFD_QC=No:]]=[CJK Ideographs]&[has canonical decomposition] */
600 return internalGetSerializedNX(
601 UNORM_NX_CJK_COMPAT,
602 _NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET,
603 errorCode);
604}
605
606static const UnicodeSet *
607internalGetNXUnicode(uint32_t options, UErrorCode &errorCode) {
608 /* internal function, does not check for incoming U_FAILURE */
609 int32_t nxIndex;
610
611 options&=_NORM_OPTIONS_UNICODE_MASK;
612 switch(options) {
613 case 0:
614 return NULL;
615 case UNORM_UNICODE_3_2:
616 /* [:^Age=3.2:] */
617 nxIndex=_NORM_SET_INDEX_NX_UNICODE32_OFFSET;
618 break;
619 default:
620 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
621 return NULL;
622 }
623
624 /* build a set with all code points that were not designated by the specified Unicode version */
625 return internalGetSerializedNX(options, nxIndex, errorCode);
626}
627
b75a7d8f
A
628/* Get a decomposition exclusion set. The data must be loaded. */
629static const UnicodeSet *
630internalGetNX(int32_t options, UErrorCode &errorCode) {
631 options&=_NORM_OPTIONS_SETS_MASK;
632
633 UBool isCached;
634
374ca955 635 UMTX_CHECK(NULL, (UBool)(nxCache[options]!=NULL), isCached);
b75a7d8f
A
636
637 if(!isCached) {
638 /* return basic sets */
639 if(options==UNORM_NX_HANGUL) {
640 return internalGetNXHangul(errorCode);
641 }
642 if(options==UNORM_NX_CJK_COMPAT) {
643 return internalGetNXCJKCompat(errorCode);
644 }
645 if((options&_NORM_OPTIONS_UNICODE_MASK)!=0 && (options&_NORM_OPTIONS_NX_MASK)==0) {
646 return internalGetNXUnicode(options, errorCode);
647 }
648
649 /* build a set from multiple subsets */
650 UnicodeSet *set;
651 const UnicodeSet *other;
652
653 set=new UnicodeSet();
654 if(set==NULL) {
655 errorCode=U_MEMORY_ALLOCATION_ERROR;
656 return NULL;
657 }
658
659 if((options&UNORM_NX_HANGUL)!=0 && NULL!=(other=internalGetNXHangul(errorCode))) {
660 set->addAll(*other);
661 }
662 if((options&UNORM_NX_CJK_COMPAT)!=0 && NULL!=(other=internalGetNXCJKCompat(errorCode))) {
663 set->addAll(*other);
664 }
665 if((options&_NORM_OPTIONS_UNICODE_MASK)!=0 && NULL!=(other=internalGetNXUnicode(options, errorCode))) {
666 set->addAll(*other);
667 }
668
669 if(U_FAILURE(errorCode)) {
670 delete set;
671 return NULL;
672 }
673
674 umtx_lock(NULL);
675 if(nxCache[options]==NULL) {
676 nxCache[options]=set;
677 set=NULL;
73c04bcf 678 ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
b75a7d8f
A
679 }
680 umtx_unlock(NULL);
681
682 delete set;
683 }
684
685 return nxCache[options];
686}
687
688static inline const UnicodeSet *
689getNX(int32_t options, UErrorCode &errorCode) {
690 if(U_FAILURE(errorCode) || (options&=_NORM_OPTIONS_SETS_MASK)==0) {
691 /* incoming failure, or no decomposition exclusions requested */
692 return NULL;
693 } else {
694 return internalGetNX(options, errorCode);
695 }
696}
697
374ca955
A
698U_CFUNC const UnicodeSet *
699unorm_getNX(int32_t options, UErrorCode *pErrorCode) {
700 return getNX(options, *pErrorCode);
701}
702
b75a7d8f
A
703static inline UBool
704nx_contains(const UnicodeSet *nx, UChar32 c) {
705 return nx!=NULL && nx->contains(c);
706}
707
708static inline UBool
709nx_contains(const UnicodeSet *nx, UChar c, UChar c2) {
710 return nx!=NULL && nx->contains(c2==0 ? c : U16_GET_SUPPLEMENTARY(c, c2));
711}
712
713/* other normalization primitives ------------------------------------------- */
714
715/* get the canonical or compatibility decomposition for one character */
716static inline const UChar *
717_decompose(uint32_t norm32, uint32_t qcMask, int32_t &length,
718 uint8_t &cc, uint8_t &trailCC) {
719 const UChar *p=(const UChar *)_getExtraData(norm32);
720 length=*p++;
721
722 if((norm32&qcMask&_NORM_QC_NFKD)!=0 && length>=0x100) {
723 /* use compatibility decomposition, skip canonical data */
724 p+=((length>>7)&1)+(length&_NORM_DECOMP_LENGTH_MASK);
725 length>>=8;
726 }
727
728 if(length&_NORM_DECOMP_FLAG_LENGTH_HAS_CC) {
729 /* get the lead and trail cc's */
730 UChar bothCCs=*p++;
731 cc=(uint8_t)(bothCCs>>8);
732 trailCC=(uint8_t)bothCCs;
733 } else {
734 /* lead and trail cc's are both 0 */
735 cc=trailCC=0;
736 }
737
738 length&=_NORM_DECOMP_LENGTH_MASK;
739 return p;
740}
741
742/* get the canonical decomposition for one character */
743static inline const UChar *
744_decompose(uint32_t norm32, int32_t &length,
745 uint8_t &cc, uint8_t &trailCC) {
746 const UChar *p=(const UChar *)_getExtraData(norm32);
747 length=*p++;
748
749 if(length&_NORM_DECOMP_FLAG_LENGTH_HAS_CC) {
750 /* get the lead and trail cc's */
751 UChar bothCCs=*p++;
752 cc=(uint8_t)(bothCCs>>8);
753 trailCC=(uint8_t)bothCCs;
754 } else {
755 /* lead and trail cc's are both 0 */
756 cc=trailCC=0;
757 }
758
759 length&=_NORM_DECOMP_LENGTH_MASK;
760 return p;
761}
762
763/**
764 * Get the canonical decomposition for one code point.
765 * @param c code point
766 * @param buffer out-only buffer for algorithmic decompositions of Hangul
767 * @param length out-only, takes the length of the decomposition, if any
768 * @return pointer to decomposition, or 0 if none
769 * @internal
770 */
374ca955
A
771U_CFUNC const UChar *
772unorm_getCanonicalDecomposition(UChar32 c, UChar buffer[4], int32_t *pLength) {
b75a7d8f
A
773 uint32_t norm32;
774
374ca955
A
775 if(c<indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]) {
776 /* trivial case */
777 return NULL;
778 }
779
b75a7d8f
A
780 UTRIE_GET32(&normTrie, c, norm32);
781 if(norm32&_NORM_QC_NFD) {
782 if(isNorm32HangulOrJamo(norm32)) {
783 /* Hangul syllable: decompose algorithmically */
784 UChar c2;
785
786 c-=HANGUL_BASE;
787
788 c2=(UChar)(c%JAMO_T_COUNT);
789 c/=JAMO_T_COUNT;
790 if(c2>0) {
791 buffer[2]=(UChar)(JAMO_T_BASE+c2);
374ca955 792 *pLength=3;
b75a7d8f 793 } else {
374ca955 794 *pLength=2;
b75a7d8f
A
795 }
796
797 buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
798 buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
799 return buffer;
800 } else {
801 /* normal decomposition */
802 uint8_t cc, trailCC;
374ca955 803 return _decompose(norm32, *pLength, cc, trailCC);
b75a7d8f
A
804 }
805 } else {
806 return 0;
807 }
808}
809
810/*
811 * get the combining class of (c, c2)=*p++
812 * before: p<limit after: p<=limit
813 * if only one code unit is used, then c2==0
814 */
815static inline uint8_t
816_getNextCC(const UChar *&p, const UChar *limit, UChar &c, UChar &c2) {
817 uint32_t norm32;
818
819 c=*p++;
820 norm32=_getNorm32(c);
821 if((norm32&_NORM_CC_MASK)==0) {
822 c2=0;
823 return 0;
824 } else {
825 if(!isNorm32LeadSurrogate(norm32)) {
826 c2=0;
827 } else {
828 /* c is a lead surrogate, get the real norm32 */
829 if(p!=limit && UTF_IS_SECOND_SURROGATE(c2=*p)) {
830 ++p;
831 norm32=_getNorm32FromSurrogatePair(norm32, c2);
832 } else {
833 c2=0;
834 return 0;
835 }
836 }
837
838 return (uint8_t)(norm32>>_NORM_CC_SHIFT);
839 }
840}
841
842/*
843 * read backwards and get norm32
844 * return 0 if the character is <minC
845 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
846 */
847static inline uint32_t
848_getPrevNorm32(const UChar *start, const UChar *&src,
849 uint32_t minC, uint32_t mask,
850 UChar &c, UChar &c2) {
851 uint32_t norm32;
852
853 c=*--src;
854 c2=0;
855
856 /* check for a surrogate before getting norm32 to see if we need to predecrement further */
857 if(c<minC) {
858 return 0;
859 } else if(!UTF_IS_SURROGATE(c)) {
860 return _getNorm32(c);
861 } else if(UTF_IS_SURROGATE_FIRST(c)) {
862 /* unpaired first surrogate */
863 return 0;
864 } else if(src!=start && UTF_IS_FIRST_SURROGATE(c2=*(src-1))) {
865 --src;
866 norm32=_getNorm32(c2);
867
868 if((norm32&mask)==0) {
869 /* all surrogate pairs with this lead surrogate have only irrelevant data */
870 return 0;
871 } else {
872 /* norm32 must be a surrogate special */
873 return _getNorm32FromSurrogatePair(norm32, c);
874 }
875 } else {
876 /* unpaired second surrogate */
877 c2=0;
878 return 0;
879 }
880}
881
882/*
883 * get the combining class of (c, c2)=*--p
884 * before: start<p after: start<=p
885 */
886static inline uint8_t
887_getPrevCC(const UChar *start, const UChar *&p) {
888 UChar c, c2;
889
890 return (uint8_t)(_getPrevNorm32(start, p, _NORM_MIN_WITH_LEAD_CC, _NORM_CC_MASK, c, c2)>>_NORM_CC_SHIFT);
891}
892
893/*
894 * is this a safe boundary character for NF*D?
895 * (lead cc==0)
896 */
897static inline UBool
898_isNFDSafe(uint32_t norm32, uint32_t ccOrQCMask, uint32_t decompQCMask) {
899 if((norm32&ccOrQCMask)==0) {
900 return TRUE; /* cc==0 and no decomposition: this is NF*D safe */
901 }
902
903 /* inspect its decomposition - maybe a Hangul but not a surrogate here */
904 if(isNorm32Regular(norm32) && (norm32&decompQCMask)!=0) {
905 int32_t length;
906 uint8_t cc, trailCC;
907
908 /* decomposes, get everything from the variable-length extra data */
909 _decompose(norm32, decompQCMask, length, cc, trailCC);
910 return cc==0;
911 } else {
912 /* no decomposition (or Hangul), test the cc directly */
913 return (norm32&_NORM_CC_MASK)==0;
914 }
915}
916
917/*
918 * is this (or does its decomposition begin with) a "true starter"?
919 * (cc==0 and NF*C_YES)
920 */
921static inline UBool
922_isTrueStarter(uint32_t norm32, uint32_t ccOrQCMask, uint32_t decompQCMask) {
923 if((norm32&ccOrQCMask)==0) {
924 return TRUE; /* this is a true starter (could be Hangul or Jamo L) */
925 }
926
927 /* inspect its decomposition - not a Hangul or a surrogate here */
928 if((norm32&decompQCMask)!=0) {
929 const UChar *p;
930 int32_t length;
931 uint8_t cc, trailCC;
932
933 /* decomposes, get everything from the variable-length extra data */
934 p=_decompose(norm32, decompQCMask, length, cc, trailCC);
935 if(cc==0) {
936 uint32_t qcMask=ccOrQCMask&_NORM_QC_MASK;
937
938 /* does it begin with NFC_YES? */
939 if((_getNorm32(p, qcMask)&qcMask)==0) {
940 /* yes, the decomposition begins with a true starter */
941 return TRUE;
942 }
943 }
944 }
945 return FALSE;
946}
947
948/* uchar.h */
949U_CAPI uint8_t U_EXPORT2
950u_getCombiningClass(UChar32 c) {
73c04bcf 951#if !UNORM_HARDCODE_DATA
b75a7d8f
A
952 UErrorCode errorCode=U_ZERO_ERROR;
953 if(_haveData(errorCode)) {
73c04bcf 954#endif
b75a7d8f
A
955 uint32_t norm32;
956
957 UTRIE_GET32(&normTrie, c, norm32);
958 return (uint8_t)(norm32>>_NORM_CC_SHIFT);
73c04bcf 959#if !UNORM_HARDCODE_DATA
b75a7d8f
A
960 } else {
961 return 0;
962 }
73c04bcf 963#endif
b75a7d8f
A
964}
965
966U_CAPI UBool U_EXPORT2
967unorm_internalIsFullCompositionExclusion(UChar32 c) {
73c04bcf
A
968#if UNORM_HARDCODE_DATA
969 if(auxTrie.index!=NULL) {
970#else
b75a7d8f 971 UErrorCode errorCode=U_ZERO_ERROR;
73c04bcf
A
972 if(_haveData(errorCode) && auxTrie.index!=NULL) {
973#endif
b75a7d8f
A
974 uint16_t aux;
975
976 UTRIE_GET16(&auxTrie, c, aux);
977 return (UBool)((aux&_NORM_AUX_COMP_EX_MASK)!=0);
978 } else {
979 return FALSE;
980 }
981}
982
983U_CAPI UBool U_EXPORT2
984unorm_isCanonSafeStart(UChar32 c) {
73c04bcf
A
985#if UNORM_HARDCODE_DATA
986 if(auxTrie.index!=NULL) {
987#else
b75a7d8f 988 UErrorCode errorCode=U_ZERO_ERROR;
73c04bcf
A
989 if(_haveData(errorCode) && auxTrie.index!=NULL) {
990#endif
b75a7d8f
A
991 uint16_t aux;
992
993 UTRIE_GET16(&auxTrie, c, aux);
994 return (UBool)((aux&_NORM_AUX_UNSAFE_MASK)==0);
995 } else {
996 return FALSE;
997 }
998}
999
374ca955
A
1000U_CAPI void U_EXPORT2
1001unorm_getUnicodeVersion(UVersionInfo *versionInfo, UErrorCode *pErrorCode){
1002 if(unorm_haveData(pErrorCode)){
1003 uprv_memcpy(*versionInfo, dataVersion, 4);
1004 }
1005}
1006
1007
b75a7d8f
A
1008U_CAPI UBool U_EXPORT2
1009unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet) {
73c04bcf 1010#if !UNORM_HARDCODE_DATA
b75a7d8f 1011 UErrorCode errorCode=U_ZERO_ERROR;
73c04bcf 1012#endif
b75a7d8f 1013 if( fillSet!=NULL && (uint32_t)c<=0x10ffff &&
73c04bcf
A
1014#if !UNORM_HARDCODE_DATA
1015 _haveData(errorCode) &&
1016#endif
1017 canonStartSets!=NULL
b75a7d8f
A
1018 ) {
1019 const uint16_t *table;
1020 int32_t i, start, limit;
1021
1022 /*
1023 * binary search for c
1024 *
1025 * There are two search tables,
1026 * one for BMP code points and one for supplementary ones.
1027 * See unormimp.h for details.
1028 */
1029 if(c<=0xffff) {
1030 table=canonStartSets+canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH];
1031 start=0;
1032 limit=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
1033
1034 /* each entry is a pair { c, result } */
1035 while(start<limit-2) {
1036 i=(uint16_t)(((start+limit)/4)*2); /* (start+limit)/2 and address pairs */
1037 if(c<table[i]) {
1038 limit=i;
1039 } else {
1040 start=i;
1041 }
1042 }
1043
1044 /* found? */
1045 if(c==table[start]) {
1046 i=table[start+1];
1047 if((i&_NORM_CANON_SET_BMP_MASK)==_NORM_CANON_SET_BMP_IS_INDEX) {
1048 /* result 01xxxxxx xxxxxx contains index x to a USerializedSet */
1049 i&=(_NORM_MAX_CANON_SETS-1);
1050 return uset_getSerializedSet(fillSet,
1051 canonStartSets+i,
1052 canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-i);
1053 } else {
1054 /* other result values are BMP code points for single-code point sets */
1055 uset_setSerializedToOne(fillSet, (UChar32)i);
1056 return TRUE;
1057 }
1058 }
1059 } else {
1060 uint16_t high, low, h;
1061
1062 table=canonStartSets+canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]+
1063 canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
1064 start=0;
1065 limit=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];
1066
1067 high=(uint16_t)(c>>16);
1068 low=(uint16_t)c;
1069
1070 /* each entry is a triplet { high(c), low(c), result } */
1071 while(start<limit-3) {
1072 i=(uint16_t)(((start+limit)/6)*3); /* (start+limit)/2 and address triplets */
1073 h=table[i]&0x1f; /* high word */
1074 if(high<h || (high==h && low<table[i+1])) {
1075 limit=i;
1076 } else {
1077 start=i;
1078 }
1079 }
1080
1081 /* found? */
1082 h=table[start];
1083 if(high==(h&0x1f) && low==table[start+1]) {
1084 i=table[start+2];
1085 if((h&0x8000)==0) {
1086 /* the result is an index to a USerializedSet */
1087 return uset_getSerializedSet(fillSet,
1088 canonStartSets+i,
1089 canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-i);
1090 } else {
1091 /*
1092 * single-code point set {x} in
1093 * triplet { 100xxxxx 000hhhhh llllllll llllllll xxxxxxxx xxxxxxxx }
1094 */
1095 i|=((int32_t)h&0x1f00)<<8; /* add high bits from high(c) */
1096 uset_setSerializedToOne(fillSet, (UChar32)i);
1097 return TRUE;
1098 }
1099 }
1100 }
1101 }
1102
1103 return FALSE; /* not found */
1104}
1105
1106U_CAPI int32_t U_EXPORT2
1107u_getFC_NFKC_Closure(UChar32 c, UChar *dest, int32_t destCapacity, UErrorCode *pErrorCode) {
1108 uint16_t aux;
1109
1110 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1111 return 0;
1112 }
1113 if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
1114 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1115 return 0;
1116 }
73c04bcf 1117 if(!_haveData(*pErrorCode) || auxTrie.index==NULL) {
b75a7d8f
A
1118 return 0;
1119 }
1120
1121 UTRIE_GET16(&auxTrie, c, aux);
1122 aux&=_NORM_AUX_FNC_MASK;
1123 if(aux!=0) {
1124 const UChar *s;
1125 int32_t length;
1126
1127 s=(const UChar *)(extraData+aux);
1128 if(*s<0xff00) {
1129 /* s points to the single-unit string */
1130 length=1;
1131 } else {
1132 length=*s&0xff;
1133 ++s;
1134 }
1135 if(0<length && length<=destCapacity) {
1136 uprv_memcpy(dest, s, length*U_SIZEOF_UCHAR);
1137 }
1138 return u_terminateUChars(dest, destCapacity, length, pErrorCode);
1139 } else {
1140 return u_terminateUChars(dest, destCapacity, 0, pErrorCode);
1141 }
1142}
1143
1144/* Is c an NF<mode>-skippable code point? See unormimp.h. */
1145U_CAPI UBool U_EXPORT2
1146unorm_isNFSkippable(UChar32 c, UNormalizationMode mode) {
b75a7d8f
A
1147 uint32_t norm32, mask;
1148 uint16_t aux, fcd;
1149
73c04bcf
A
1150#if !UNORM_HARDCODE_DATA
1151 UErrorCode errorCode=U_ZERO_ERROR;
b75a7d8f
A
1152 if(!_haveData(errorCode)) {
1153 return FALSE;
1154 }
73c04bcf 1155#endif
b75a7d8f
A
1156
1157 /* handle trivial cases; set the comparison mask for the normal ones */
1158 switch(mode) {
1159 case UNORM_NONE:
1160 return TRUE;
1161 case UNORM_NFD:
1162 mask=_NORM_CC_MASK|_NORM_QC_NFD;
1163 break;
1164 case UNORM_NFKD:
1165 mask=_NORM_CC_MASK|_NORM_QC_NFKD;
1166 break;
1167 case UNORM_NFC:
1168 /* case UNORM_FCC: */
1169 mask=_NORM_CC_MASK|_NORM_COMBINES_ANY|(_NORM_QC_NFC&_NORM_QC_ANY_NO);
1170 break;
1171 case UNORM_NFKC:
1172 mask=_NORM_CC_MASK|_NORM_COMBINES_ANY|(_NORM_QC_NFKC&_NORM_QC_ANY_NO);
1173 break;
1174 case UNORM_FCD:
1175 /* FCD: skippable if lead cc==0 and trail cc<=1 */
73c04bcf
A
1176 if(fcdTrie.index!=NULL) {
1177 UTRIE_GET16(&fcdTrie, c, fcd);
1178 return fcd<=1;
1179 } else {
1180 return FALSE;
1181 }
b75a7d8f
A
1182 default:
1183 return FALSE;
1184 }
1185
1186 /* check conditions (a)..(e), see unormimp.h */
1187 UTRIE_GET32(&normTrie, c, norm32);
1188 if((norm32&mask)!=0) {
1189 return FALSE; /* fails (a)..(e), not skippable */
1190 }
1191
1192 if(mode<UNORM_NFC) {
1193 return TRUE; /* NF*D, passed (a)..(c), is skippable */
1194 }
1195
1196 /* NF*C/FCC, passed (a)..(e) */
1197 if((norm32&_NORM_QC_NFD)==0) {
1198 return TRUE; /* no canonical decomposition, is skippable */
1199 }
1200
1201 /* check Hangul syllables algorithmically */
1202 if(isNorm32HangulOrJamo(norm32)) {
1203 /* Jamo passed (a)..(e) above, must be Hangul */
1204 return !isHangulWithoutJamoT((UChar)c); /* LVT are skippable, LV are not */
1205 }
1206
1207 /* if(mode<=UNORM_NFKC) { -- enable when implementing FCC */
1208 /* NF*C, test (f) flag */
73c04bcf 1209 if(!formatVersion_2_2 || auxTrie.index==NULL) {
b75a7d8f
A
1210 return FALSE; /* no (f) data, say not skippable to be safe */
1211 }
1212
1213 UTRIE_GET16(&auxTrie, c, aux);
1214 return (aux&_NORM_AUX_NFC_SKIP_F_MASK)==0; /* TRUE=skippable if the (f) flag is not set */
1215
1216 /* } else { FCC, test fcd<=1 instead of the above } */
1217}
1218
1219U_CAPI void U_EXPORT2
73c04bcf 1220unorm_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
b75a7d8f
A
1221 UChar c;
1222
73c04bcf 1223 if(!_haveData(*pErrorCode)) {
b75a7d8f
A
1224 return;
1225 }
1226
1227 /* add the start code point of each same-value range of each trie */
374ca955 1228 utrie_enum(&normTrie, NULL, _enumPropertyStartsRange, sa);
73c04bcf
A
1229 if(fcdTrie.index!=NULL) {
1230 utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, sa);
1231 }
1232 if(auxTrie.index!=NULL) {
374ca955 1233 utrie_enum(&auxTrie, NULL, _enumPropertyStartsRange, sa);
b75a7d8f
A
1234 }
1235
1236 /* add Hangul LV syllables and LV+1 because of skippables */
1237 for(c=HANGUL_BASE; c<HANGUL_BASE+HANGUL_COUNT; c+=JAMO_T_COUNT) {
374ca955
A
1238 sa->add(sa->set, c);
1239 sa->add(sa->set, c+1);
1240 }
1241 sa->add(sa->set, HANGUL_BASE+HANGUL_COUNT); /* add Hangul+1 to continue with other properties */
1242}
1243
1244U_CAPI UNormalizationCheckResult U_EXPORT2
1245unorm_getQuickCheck(UChar32 c, UNormalizationMode mode) {
1246 static const uint32_t qcMask[UNORM_MODE_COUNT]={
1247 0, 0, _NORM_QC_NFD, _NORM_QC_NFKD, _NORM_QC_NFC, _NORM_QC_NFKC
1248 };
1249
374ca955
A
1250 uint32_t norm32;
1251
73c04bcf
A
1252#if !UNORM_HARDCODE_DATA
1253 UErrorCode errorCode=U_ZERO_ERROR;
374ca955
A
1254 if(!_haveData(errorCode)) {
1255 return UNORM_YES;
1256 }
73c04bcf 1257#endif
374ca955
A
1258
1259 UTRIE_GET32(&normTrie, c, norm32);
1260 norm32&=qcMask[mode];
1261
1262 if(norm32==0) {
1263 return UNORM_YES;
1264 } else if(norm32&_NORM_QC_ANY_NO) {
1265 return UNORM_NO;
1266 } else /* _NORM_QC_ANY_MAYBE */ {
1267 return UNORM_MAYBE;
1268 }
1269}
1270
1271U_CAPI uint16_t U_EXPORT2
1272unorm_getFCD16FromCodePoint(UChar32 c) {
1273 UErrorCode errorCode;
1274 uint16_t fcd;
1275
1276 errorCode=U_ZERO_ERROR;
73c04bcf
A
1277 if(
1278#if !UNORM_HARDCODE_DATA
1279 !_haveData(errorCode) ||
1280#endif
1281 fcdTrie.index==NULL
1282 ) {
374ca955 1283 return 0;
b75a7d8f 1284 }
374ca955
A
1285
1286 UTRIE_GET16(&fcdTrie, c, fcd);
1287 return fcd;
b75a7d8f
A
1288}
1289
1290/* reorder UTF-16 in-place -------------------------------------------------- */
1291
1292/*
1293 * simpler, single-character version of _mergeOrdered() -
1294 * bubble-insert one single code point into the preceding string
1295 * which is already canonically ordered
1296 * (c, c2) may or may not yet have been inserted at [current..p[
1297 *
1298 * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2)
1299 *
1300 * before: [start..current[ is already ordered, and
1301 * [current..p[ may or may not hold (c, c2) but
1302 * must be exactly the same length as (c, c2)
1303 * after: [start..p[ is ordered
1304 *
1305 * returns the trailing combining class
1306 */
1307static uint8_t
1308_insertOrdered(const UChar *start, UChar *current, UChar *p,
1309 UChar c, UChar c2, uint8_t cc) {
1310 const UChar *pBack, *pPreBack;
1311 UChar *r;
1312 uint8_t prevCC, trailCC=cc;
1313
1314 if(start<current && cc!=0) {
1315 /* search for the insertion point where cc>=prevCC */
1316 pPreBack=pBack=current;
1317 prevCC=_getPrevCC(start, pPreBack);
1318 if(cc<prevCC) {
1319 /* this will be the last code point, so keep its cc */
1320 trailCC=prevCC;
1321 pBack=pPreBack;
1322 while(start<pPreBack) {
1323 prevCC=_getPrevCC(start, pPreBack);
1324 if(cc>=prevCC) {
1325 break;
1326 }
1327 pBack=pPreBack;
1328 }
1329
1330 /*
1331 * this is where we are right now with all these pointers:
1332 * [start..pPreBack[ 0..? code points that we can ignore
1333 * [pPreBack..pBack[ 0..1 code points with prevCC<=cc
1334 * [pBack..current[ 0..n code points with >cc, move up to insert (c, c2)
1335 * [current..p[ 1 code point (c, c2) with cc
1336 */
1337
1338 /* move the code units in between up */
1339 r=p;
1340 do {
1341 *--r=*--current;
1342 } while(pBack!=current);
1343 }
1344 }
1345
1346 /* insert (c, c2) */
1347 *current=c;
1348 if(c2!=0) {
1349 *(current+1)=c2;
1350 }
1351
1352 /* we know the cc of the last code point */
1353 return trailCC;
1354}
1355
1356/*
1357 * merge two UTF-16 string parts together
1358 * to canonically order (order by combining classes) their concatenation
1359 *
1360 * the two strings may already be adjacent, so that the merging is done in-place
1361 * if the two strings are not adjacent, then the buffer holding the first one
1362 * must be large enough
1363 * the second string may or may not be ordered in itself
1364 *
1365 * before: [start..current[ is already ordered, and
1366 * [next..limit[ may be ordered in itself, but
1367 * is not in relation to [start..current[
1368 * after: [start..current+(limit-next)[ is ordered
1369 *
1370 * the algorithm is a simple bubble-sort that takes the characters from *next++
1371 * and inserts them in correct combining class order into the preceding part
1372 * of the string
1373 *
1374 * since this function is called much less often than the single-code point
1375 * _insertOrdered(), it just uses that for easier maintenance
1376 * (see file version from before 2001aug31 for a more optimized version)
1377 *
1378 * returns the trailing combining class
1379 */
1380static uint8_t
1381_mergeOrdered(UChar *start, UChar *current,
1382 const UChar *next, const UChar *limit, UBool isOrdered=TRUE) {
1383 UChar *r;
1384 UChar c, c2;
1385 uint8_t cc, trailCC=0;
1386 UBool adjacent;
1387
1388 adjacent= current==next;
1389
1390 if(start!=current || !isOrdered) {
1391 while(next<limit) {
1392 cc=_getNextCC(next, limit, c, c2);
1393 if(cc==0) {
1394 /* does not bubble back */
1395 trailCC=0;
1396 if(adjacent) {
1397 current=(UChar *)next;
1398 } else {
1399 *current++=c;
1400 if(c2!=0) {
1401 *current++=c2;
1402 }
1403 }
1404 if(isOrdered) {
1405 break;
1406 } else {
1407 start=current;
1408 }
1409 } else {
1410 r=current+(c2==0 ? 1 : 2);
1411 trailCC=_insertOrdered(start, current, r, c, c2, cc);
1412 current=r;
1413 }
1414 }
1415 }
1416
1417 if(next==limit) {
1418 /* we know the cc of the last code point */
1419 return trailCC;
1420 } else {
1421 if(!adjacent) {
1422 /* copy the second string part */
1423 do {
1424 *current++=*next++;
1425 } while(next!=limit);
1426 limit=current;
1427 }
1428 return _getPrevCC(start, limit);
1429 }
1430}
1431
374ca955
A
1432/* find the last true starter in [start..src[ and return the pointer to it */
1433static const UChar *
1434_findPreviousStarter(const UChar *start, const UChar *src,
1435 uint32_t ccOrQCMask, uint32_t decompQCMask, UChar minNoMaybe) {
1436 uint32_t norm32;
b75a7d8f 1437 UChar c, c2;
b75a7d8f 1438
374ca955
A
1439 while(start<src) {
1440 norm32=_getPrevNorm32(start, src, minNoMaybe, ccOrQCMask|decompQCMask, c, c2);
1441 if(_isTrueStarter(norm32, ccOrQCMask, decompQCMask)) {
1442 break;
1443 }
b75a7d8f 1444 }
374ca955
A
1445 return src;
1446}
b75a7d8f 1447
374ca955
A
1448/* find the first true starter in [src..limit[ and return the pointer to it */
1449static const UChar *
1450_findNextStarter(const UChar *src, const UChar *limit,
1451 uint32_t qcMask, uint32_t decompQCMask, UChar minNoMaybe) {
1452 const UChar *p;
1453 uint32_t norm32, ccOrQCMask;
1454 int32_t length;
1455 UChar c, c2;
1456 uint8_t cc, trailCC;
1457
1458 ccOrQCMask=_NORM_CC_MASK|qcMask;
b75a7d8f
A
1459
1460 for(;;) {
374ca955
A
1461 if(src==limit) {
1462 break; /* end of string */
1463 }
1464 c=*src;
1465 if(c<minNoMaybe) {
1466 break; /* catches NUL terminater, too */
b75a7d8f
A
1467 }
1468
374ca955
A
1469 norm32=_getNorm32(c);
1470 if((norm32&ccOrQCMask)==0) {
1471 break; /* true starter */
1472 }
1473
1474 if(isNorm32LeadSurrogate(norm32)) {
1475 /* c is a lead surrogate, get the real norm32 */
1476 if((src+1)==limit || !UTF_IS_SECOND_SURROGATE(c2=*(src+1))) {
1477 break; /* unmatched first surrogate: counts as a true starter */
1478 }
1479 norm32=_getNorm32FromSurrogatePair(norm32, c2);
1480
1481 if((norm32&ccOrQCMask)==0) {
1482 break; /* true starter */
b75a7d8f
A
1483 }
1484 } else {
1485 c2=0;
1486 }
1487
374ca955
A
1488 /* (c, c2) is not a true starter but its decomposition may be */
1489 if(norm32&decompQCMask) {
1490 /* (c, c2) decomposes, get everything from the variable-length extra data */
1491 p=_decompose(norm32, decompQCMask, length, cc, trailCC);
1492
1493 /* get the first character's norm32 to check if it is a true starter */
1494 if(cc==0 && (_getNorm32(p, qcMask)&qcMask)==0) {
1495 break; /* true starter */
1496 }
b75a7d8f
A
1497 }
1498
374ca955
A
1499 src+= c2==0 ? 1 : 2; /* not a true starter, continue */
1500 }
b75a7d8f 1501
374ca955 1502 return src;
b75a7d8f
A
1503}
1504
1505/* make NFD & NFKD ---------------------------------------------------------- */
1506
1507U_CAPI int32_t U_EXPORT2
1508unorm_getDecomposition(UChar32 c, UBool compat,
1509 UChar *dest, int32_t destCapacity) {
73c04bcf 1510#if !UNORM_HARDCODE_DATA
b75a7d8f 1511 UErrorCode errorCode=U_ZERO_ERROR;
73c04bcf 1512#endif
b75a7d8f 1513 if( (uint32_t)c<=0x10ffff &&
73c04bcf 1514#if !UNORM_HARDCODE_DATA
b75a7d8f 1515 _haveData(errorCode) &&
73c04bcf 1516#endif
b75a7d8f
A
1517 ((dest!=NULL && destCapacity>0) || destCapacity==0)
1518 ) {
1519 uint32_t norm32, qcMask;
1520 UChar32 minNoMaybe;
1521 int32_t length;
1522
1523 /* initialize */
1524 if(!compat) {
1525 minNoMaybe=(UChar32)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE];
1526 qcMask=_NORM_QC_NFD;
1527 } else {
1528 minNoMaybe=(UChar32)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE];
1529 qcMask=_NORM_QC_NFKD;
1530 }
1531
1532 if(c<minNoMaybe) {
1533 /* trivial case */
1534 if(destCapacity>0) {
1535 dest[0]=(UChar)c;
1536 }
1537 return -1;
1538 }
1539
1540 /* data lookup */
1541 UTRIE_GET32(&normTrie, c, norm32);
1542 if((norm32&qcMask)==0) {
1543 /* simple case: no decomposition */
1544 if(c<=0xffff) {
1545 if(destCapacity>0) {
1546 dest[0]=(UChar)c;
1547 }
1548 return -1;
1549 } else {
1550 if(destCapacity>=2) {
1551 dest[0]=UTF16_LEAD(c);
1552 dest[1]=UTF16_TRAIL(c);
1553 }
1554 return -2;
1555 }
1556 } else if(isNorm32HangulOrJamo(norm32)) {
1557 /* Hangul syllable: decompose algorithmically */
1558 UChar c2;
1559
1560 c-=HANGUL_BASE;
1561
1562 c2=(UChar)(c%JAMO_T_COUNT);
1563 c/=JAMO_T_COUNT;
1564 if(c2>0) {
1565 if(destCapacity>=3) {
1566 dest[2]=(UChar)(JAMO_T_BASE+c2);
1567 }
1568 length=3;
1569 } else {
1570 length=2;
1571 }
1572
1573 if(destCapacity>=2) {
1574 dest[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
1575 dest[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
1576 }
1577 return length;
1578 } else {
1579 /* c decomposes, get everything from the variable-length extra data */
1580 const UChar *p, *limit;
1581 uint8_t cc, trailCC;
1582
1583 p=_decompose(norm32, qcMask, length, cc, trailCC);
1584 if(length<=destCapacity) {
1585 limit=p+length;
1586 do {
1587 *dest++=*p++;
1588 } while(p<limit);
1589 }
1590 return length;
1591 }
1592 } else {
1593 return 0;
1594 }
1595}
1596
1597static int32_t
1598_decompose(UChar *dest, int32_t destCapacity,
1599 const UChar *src, int32_t srcLength,
1600 UBool compat, const UnicodeSet *nx,
1601 uint8_t &outTrailCC) {
1602 UChar buffer[3];
1603 const UChar *limit, *prevSrc, *p;
1604 uint32_t norm32, ccOrQCMask, qcMask;
1605 int32_t destIndex, reorderStartIndex, length;
1606 UChar c, c2, minNoMaybe;
1607 uint8_t cc, prevCC, trailCC;
1608
1609 if(!compat) {
1610 minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE];
1611 qcMask=_NORM_QC_NFD;
1612 } else {
1613 minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE];
1614 qcMask=_NORM_QC_NFKD;
1615 }
1616
1617 /* initialize */
1618 ccOrQCMask=_NORM_CC_MASK|qcMask;
1619 destIndex=reorderStartIndex=0;
1620 prevCC=0;
1621
1622 /* avoid compiler warnings */
1623 norm32=0;
1624 c=0;
73c04bcf
A
1625 cc=0;
1626 trailCC=0;
b75a7d8f
A
1627
1628 if(srcLength>=0) {
1629 /* string with length */
1630 limit=src+srcLength;
1631 } else /* srcLength==-1 */ {
1632 /* zero-terminated string */
1633 limit=NULL;
1634 }
1635
1636 U_ALIGN_CODE(16);
1637
1638 for(;;) {
1639 /* count code units below the minimum or with irrelevant data for the quick check */
1640 prevSrc=src;
1641 if(limit==NULL) {
1642 while((c=*src)<minNoMaybe ? c!=0 : ((norm32=_getNorm32(c))&ccOrQCMask)==0) {
1643 prevCC=0;
1644 ++src;
1645 }
1646 } else {
1647 while(src!=limit && ((c=*src)<minNoMaybe || ((norm32=_getNorm32(c))&ccOrQCMask)==0)) {
1648 prevCC=0;
1649 ++src;
1650 }
1651 }
1652
1653 /* copy these code units all at once */
1654 if(src!=prevSrc) {
1655 length=(int32_t)(src-prevSrc);
1656 if((destIndex+length)<=destCapacity) {
1657 uprv_memcpy(dest+destIndex, prevSrc, length*U_SIZEOF_UCHAR);
1658 }
1659 destIndex+=length;
1660 reorderStartIndex=destIndex;
1661 }
1662
1663 /* end of source reached? */
1664 if(limit==NULL ? c==0 : src==limit) {
1665 break;
1666 }
1667
1668 /* c already contains *src and norm32 is set for it, increment src */
1669 ++src;
1670
1671 /* check one above-minimum, relevant code unit */
1672 /*
1673 * generally, set p and length to the decomposition string
1674 * in simple cases, p==NULL and (c, c2) will hold the length code units to append
1675 * in all cases, set cc to the lead and trailCC to the trail combining class
1676 *
1677 * the following merge-sort of the current character into the preceding,
1678 * canonically ordered result text will use the optimized _insertOrdered()
1679 * if there is only one single code point to process;
1680 * this is indicated with p==NULL, and (c, c2) is the character to insert
1681 * ((c, 0) for a BMP character and (lead surrogate, trail surrogate)
1682 * for a supplementary character)
1683 * otherwise, p[length] is merged in with _mergeOrdered()
1684 */
1685 if(isNorm32HangulOrJamo(norm32)) {
1686 if(nx_contains(nx, c)) {
1687 c2=0;
1688 p=NULL;
1689 length=1;
1690 } else {
1691 /* Hangul syllable: decompose algorithmically */
1692 p=buffer;
1693 cc=trailCC=0;
1694
1695 c-=HANGUL_BASE;
1696
1697 c2=(UChar)(c%JAMO_T_COUNT);
1698 c/=JAMO_T_COUNT;
1699 if(c2>0) {
1700 buffer[2]=(UChar)(JAMO_T_BASE+c2);
1701 length=3;
1702 } else {
1703 length=2;
1704 }
1705
1706 buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
1707 buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
1708 }
1709 } else {
1710 if(isNorm32Regular(norm32)) {
1711 c2=0;
1712 length=1;
1713 } else {
1714 /* c is a lead surrogate, get the real norm32 */
1715 if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
1716 ++src;
1717 length=2;
1718 norm32=_getNorm32FromSurrogatePair(norm32, c2);
1719 } else {
1720 c2=0;
1721 length=1;
1722 norm32=0;
1723 }
1724 }
1725
1726 /* get the decomposition and the lead and trail cc's */
1727 if(nx_contains(nx, c, c2)) {
1728 /* excluded: norm32==0 */
1729 cc=trailCC=0;
1730 p=NULL;
1731 } else if((norm32&qcMask)==0) {
1732 /* c does not decompose */
1733 cc=trailCC=(uint8_t)(norm32>>_NORM_CC_SHIFT);
1734 p=NULL;
1735 } else {
1736 /* c decomposes, get everything from the variable-length extra data */
1737 p=_decompose(norm32, qcMask, length, cc, trailCC);
1738 if(length==1) {
1739 /* fastpath a single code unit from decomposition */
1740 c=*p;
1741 c2=0;
1742 p=NULL;
1743 }
1744 }
1745 }
1746
1747 /* append the decomposition to the destination buffer, assume length>0 */
1748 if((destIndex+length)<=destCapacity) {
1749 UChar *reorderSplit=dest+destIndex;
1750 if(p==NULL) {
1751 /* fastpath: single code point */
1752 if(cc!=0 && cc<prevCC) {
1753 /* (c, c2) is out of order with respect to the preceding text */
1754 destIndex+=length;
1755 trailCC=_insertOrdered(dest+reorderStartIndex, reorderSplit, dest+destIndex, c, c2, cc);
1756 } else {
1757 /* just append (c, c2) */
1758 dest[destIndex++]=c;
1759 if(c2!=0) {
1760 dest[destIndex++]=c2;
1761 }
1762 }
1763 } else {
1764 /* general: multiple code points (ordered by themselves) from decomposition */
1765 if(cc!=0 && cc<prevCC) {
1766 /* the decomposition is out of order with respect to the preceding text */
1767 destIndex+=length;
1768 trailCC=_mergeOrdered(dest+reorderStartIndex, reorderSplit, p, p+length);
1769 } else {
1770 /* just append the decomposition */
1771 do {
1772 dest[destIndex++]=*p++;
1773 } while(--length>0);
1774 }
1775 }
1776 } else {
1777 /* buffer overflow */
1778 /* keep incrementing the destIndex for preflighting */
1779 destIndex+=length;
1780 }
1781
1782 prevCC=trailCC;
1783 if(prevCC==0) {
1784 reorderStartIndex=destIndex;
1785 }
1786 }
1787
1788 outTrailCC=prevCC;
1789 return destIndex;
1790}
1791
1792U_CAPI int32_t U_EXPORT2
1793unorm_decompose(UChar *dest, int32_t destCapacity,
1794 const UChar *src, int32_t srcLength,
1795 UBool compat, int32_t options,
1796 UErrorCode *pErrorCode) {
1797 const UnicodeSet *nx;
1798 int32_t destIndex;
1799 uint8_t trailCC;
1800
1801 if(!_haveData(*pErrorCode)) {
1802 return 0;
1803 }
1804
1805 nx=getNX(options, *pErrorCode);
1806 if(U_FAILURE(*pErrorCode)) {
1807 return 0;
1808 }
1809
1810 destIndex=_decompose(dest, destCapacity,
1811 src, srcLength,
1812 compat, nx,
1813 trailCC);
1814
1815 return u_terminateUChars(dest, destCapacity, destIndex, pErrorCode);
1816}
1817
374ca955 1818/* make NFC & NFKC ---------------------------------------------------------- */
b75a7d8f 1819
374ca955
A
1820/* get the composition properties of the next character */
1821static inline uint32_t
1822_getNextCombining(UChar *&p, const UChar *limit,
1823 UChar &c, UChar &c2,
1824 uint16_t &combiningIndex, uint8_t &cc,
1825 const UnicodeSet *nx) {
1826 uint32_t norm32, combineFlags;
b75a7d8f 1827
374ca955
A
1828 /* get properties */
1829 c=*p++;
1830 norm32=_getNorm32(c);
b75a7d8f 1831
374ca955
A
1832 /* preset output values for most characters */
1833 c2=0;
1834 combiningIndex=0;
1835 cc=0;
b75a7d8f 1836
374ca955
A
1837 if((norm32&(_NORM_CC_MASK|_NORM_COMBINES_ANY))==0) {
1838 return 0;
1839 } else {
b75a7d8f 1840 if(isNorm32Regular(norm32)) {
374ca955
A
1841 /* set cc etc. below */
1842 } else if(isNorm32HangulOrJamo(norm32)) {
1843 /* a compatibility decomposition contained Jamos */
1844 combiningIndex=(uint16_t)(0xfff0|(norm32>>_NORM_EXTRA_SHIFT));
1845 return norm32&_NORM_COMBINES_ANY;
b75a7d8f 1846 } else {
b75a7d8f 1847 /* c is a lead surrogate, get the real norm32 */
374ca955
A
1848 if(p!=limit && UTF_IS_SECOND_SURROGATE(c2=*p)) {
1849 ++p;
b75a7d8f
A
1850 norm32=_getNorm32FromSurrogatePair(norm32, c2);
1851 } else {
1852 c2=0;
374ca955 1853 return 0;
b75a7d8f
A
1854 }
1855 }
1856
b75a7d8f 1857 if(nx_contains(nx, c, c2)) {
374ca955 1858 return 0; /* excluded: norm32==0 */
b75a7d8f
A
1859 }
1860
374ca955 1861 cc=(uint8_t)(norm32>>_NORM_CC_SHIFT);
b75a7d8f 1862
374ca955
A
1863 combineFlags=norm32&_NORM_COMBINES_ANY;
1864 if(combineFlags!=0) {
1865 combiningIndex=*(_getExtraData(norm32)-1);
b75a7d8f 1866 }
374ca955 1867 return combineFlags;
b75a7d8f 1868 }
b75a7d8f
A
1869}
1870
374ca955
A
1871/*
1872 * given a composition-result starter (c, c2) - which means its cc==0,
1873 * it combines forward, it has extra data, its norm32!=0,
1874 * it is not a Hangul or Jamo,
1875 * get just its combineFwdIndex
1876 *
1877 * norm32(c) is special if and only if c2!=0
1878 */
1879static inline uint16_t
1880_getCombiningIndexFromStarter(UChar c, UChar c2) {
1881 uint32_t norm32;
b75a7d8f 1882
374ca955
A
1883 norm32=_getNorm32(c);
1884 if(c2!=0) {
1885 norm32=_getNorm32FromSurrogatePair(norm32, c2);
b75a7d8f 1886 }
374ca955
A
1887 return *(_getExtraData(norm32)-1);
1888}
b75a7d8f 1889
374ca955
A
1890/*
1891 * Find the recomposition result for
1892 * a forward-combining character
1893 * (specified with a pointer to its part of the combiningTable[])
1894 * and a backward-combining character
1895 * (specified with its combineBackIndex).
1896 *
1897 * If these two characters combine, then set (value, value2)
1898 * with the code unit(s) of the composition character.
1899 *
1900 * Return value:
1901 * 0 do not combine
1902 * 1 combine
1903 * >1 combine, and the composition is a forward-combining starter
1904 *
1905 * See unormimp.h for a description of the composition table format.
1906 */
1907static inline uint16_t
1908_combine(const uint16_t *table, uint16_t combineBackIndex,
1909 uint16_t &value, uint16_t &value2) {
1910 uint16_t key;
b75a7d8f 1911
374ca955
A
1912 /* search in the starter's composition table */
1913 for(;;) {
1914 key=*table++;
1915 if(key>=combineBackIndex) {
1916 break;
1917 }
1918 table+= *table&0x8000 ? 2 : 1;
b75a7d8f
A
1919 }
1920
374ca955
A
1921 /* mask off bit 15, the last-entry-in-the-list flag */
1922 if((key&0x7fff)==combineBackIndex) {
1923 /* found! combine! */
1924 value=*table;
b75a7d8f 1925
374ca955
A
1926 /* is the composition a starter that combines forward? */
1927 key=(uint16_t)((value&0x2000)+1);
1928
1929 /* get the composition result code point from the variable-length result value */
1930 if(value&0x8000) {
1931 if(value&0x4000) {
1932 /* surrogate pair composition result */
1933 value=(uint16_t)((value&0x3ff)|0xd800);
1934 value2=*(table+1);
1935 } else {
1936 /* BMP composition result U+2000..U+ffff */
1937 value=*(table+1);
1938 value2=0;
b75a7d8f
A
1939 }
1940 } else {
374ca955
A
1941 /* BMP composition result U+0000..U+1fff */
1942 value&=0x1fff;
1943 value2=0;
b75a7d8f
A
1944 }
1945
374ca955
A
1946 return key;
1947 } else {
1948 /* not found */
1949 return 0;
1950 }
1951}
b75a7d8f 1952
374ca955
A
1953static inline UBool
1954_composeHangul(UChar prev, UChar c, uint32_t norm32, const UChar *&src, const UChar *limit,
1955 UBool compat, UChar *dest, const UnicodeSet *nx) {
1956 if(isJamoVTNorm32JamoV(norm32)) {
1957 /* c is a Jamo V, compose with previous Jamo L and following Jamo T */
1958 prev=(UChar)(prev-JAMO_L_BASE);
1959 if(prev<JAMO_L_COUNT) {
1960 c=(UChar)(HANGUL_BASE+(prev*JAMO_V_COUNT+(c-JAMO_V_BASE))*JAMO_T_COUNT);
b75a7d8f 1961
374ca955
A
1962 /* check if the next character is a Jamo T (normal or compatibility) */
1963 if(src!=limit) {
1964 UChar next, t;
b75a7d8f 1965
374ca955
A
1966 next=*src;
1967 if((t=(UChar)(next-JAMO_T_BASE))<JAMO_T_COUNT) {
1968 /* normal Jamo T */
1969 ++src;
1970 c+=t;
1971 } else if(compat) {
1972 /* if NFKC, then check for compatibility Jamo T (BMP only) */
1973 norm32=_getNorm32(next);
1974 if(isNorm32Regular(norm32) && (norm32&_NORM_QC_NFKD)) {
1975 const UChar *p;
1976 int32_t length;
1977 uint8_t cc, trailCC;
1978
1979 p=_decompose(norm32, _NORM_QC_NFKD, length, cc, trailCC);
1980 if(length==1 && (t=(UChar)(*p-JAMO_T_BASE))<JAMO_T_COUNT) {
1981 /* compatibility Jamo T */
1982 ++src;
1983 c+=t;
1984 }
1985 }
1986 }
1987 }
1988 if(nx_contains(nx, c)) {
1989 if(!isHangulWithoutJamoT(c)) {
1990 --src; /* undo ++src from reading the Jamo T */
1991 }
1992 return FALSE;
1993 }
1994 if(dest!=0) {
1995 *dest=c;
b75a7d8f 1996 }
374ca955 1997 return TRUE;
b75a7d8f 1998 }
374ca955
A
1999 } else if(isHangulWithoutJamoT(prev)) {
2000 /* c is a Jamo T, compose with previous Hangul LV that does not contain a Jamo T */
2001 c=(UChar)(prev+(c-JAMO_T_BASE));
2002 if(nx_contains(nx, c)) {
2003 return FALSE;
b75a7d8f 2004 }
374ca955
A
2005 if(dest!=0) {
2006 *dest=c;
b75a7d8f 2007 }
374ca955
A
2008 return TRUE;
2009 }
2010 return FALSE;
2011}
b75a7d8f 2012
374ca955
A
2013/*
2014 * recompose the characters in [p..limit[
2015 * (which is in NFD - decomposed and canonically ordered),
2016 * adjust limit, and return the trailing cc
2017 *
2018 * since for NFKC we may get Jamos in decompositions, we need to
2019 * recompose those too
2020 *
2021 * note that recomposition never lengthens the text:
2022 * any character consists of either one or two code units;
2023 * a composition may contain at most one more code unit than the original starter,
2024 * while the combining mark that is removed has at least one code unit
2025 */
2026static uint8_t
2027_recompose(UChar *p, UChar *&limit, int32_t options, const UnicodeSet *nx) {
2028 UChar *starter, *pRemove, *q, *r;
2029 uint32_t combineFlags;
2030 UChar c, c2;
2031 uint16_t combineFwdIndex, combineBackIndex;
2032 uint16_t result, value, value2;
2033 uint8_t cc, prevCC;
2034 UBool starterIsSupplementary;
b75a7d8f 2035
374ca955
A
2036 starter=NULL; /* no starter */
2037 combineFwdIndex=0; /* will not be used until starter!=NULL - avoid compiler warnings */
2038 combineBackIndex=0; /* will always be set if combineFlags!=0 - avoid compiler warnings */
2039 value=value2=0; /* always set by _combine() before used - avoid compiler warnings */
2040 starterIsSupplementary=FALSE; /* will not be used until starter!=NULL - avoid compiler warnings */
2041 prevCC=0;
b75a7d8f 2042
374ca955
A
2043 for(;;) {
2044 combineFlags=_getNextCombining(p, limit, c, c2, combineBackIndex, cc, nx);
2045 if((combineFlags&_NORM_COMBINES_BACK) && starter!=NULL) {
2046 if(combineBackIndex&0x8000) {
2047 /* c is a Jamo V/T, see if we can compose it with the previous character */
2048 /* for the PRI #29 fix, check that there is no intervening combining mark */
2049 if((options&UNORM_BEFORE_PRI_29) || prevCC==0) {
2050 pRemove=NULL; /* NULL while no Hangul composition */
2051 combineFlags=0;
2052 c2=*starter;
2053 if(combineBackIndex==0xfff2) {
2054 /* Jamo V, compose with previous Jamo L and following Jamo T */
2055 c2=(UChar)(c2-JAMO_L_BASE);
2056 if(c2<JAMO_L_COUNT) {
2057 pRemove=p-1;
2058 c=(UChar)(HANGUL_BASE+(c2*JAMO_V_COUNT+(c-JAMO_V_BASE))*JAMO_T_COUNT);
2059 if(p!=limit && (c2=(UChar)(*p-JAMO_T_BASE))<JAMO_T_COUNT) {
2060 ++p;
2061 c+=c2;
2062 } else {
2063 /* the result is an LV syllable, which is a starter (unlike LVT) */
2064 combineFlags=_NORM_COMBINES_FWD;
2065 }
2066 if(!nx_contains(nx, c)) {
2067 *starter=c;
2068 } else {
2069 /* excluded */
2070 if(!isHangulWithoutJamoT(c)) {
2071 --p; /* undo the ++p from reading the Jamo T */
2072 }
2073 /* c is modified but not used any more -- c=*(p-1); -- re-read the Jamo V/T */
2074 pRemove=NULL;
2075 }
2076 }
b75a7d8f 2077
374ca955
A
2078 /*
2079 * Normally, the following can not occur:
2080 * Since the input is in NFD, there are no Hangul LV syllables that
2081 * a Jamo T could combine with.
2082 * All Jamo Ts are combined above when handling Jamo Vs.
2083 *
2084 * However, before the PRI #29 fix, this can occur due to
2085 * an intervening combining mark between the Hangul LV and the Jamo T.
2086 */
2087 } else {
2088 /* Jamo T, compose with previous Hangul that does not have a Jamo T */
2089 if(isHangulWithoutJamoT(c2)) {
2090 c2+=(UChar)(c-JAMO_T_BASE);
2091 if(!nx_contains(nx, c2)) {
2092 pRemove=p-1;
2093 *starter=c2;
2094 }
2095 }
2096 }
b75a7d8f 2097
374ca955
A
2098 if(pRemove!=NULL) {
2099 /* remove the Jamo(s) */
2100 q=pRemove;
2101 r=p;
2102 while(r<limit) {
2103 *q++=*r++;
2104 }
2105 p=pRemove;
2106 limit=q;
2107 }
b75a7d8f 2108
374ca955 2109 c2=0; /* c2 held *starter temporarily */
b75a7d8f 2110
374ca955
A
2111 if(combineFlags!=0) {
2112 /*
2113 * not starter=NULL because the composition is a Hangul LV syllable
2114 * and might combine once more (but only before the PRI #29 fix)
2115 */
b75a7d8f 2116
374ca955
A
2117 /* done? */
2118 if(p==limit) {
2119 return prevCC;
2120 }
b75a7d8f 2121
374ca955
A
2122 /* the composition is a Hangul LV syllable which is a starter that combines forward */
2123 combineFwdIndex=0xfff0;
b75a7d8f 2124
374ca955
A
2125 /* we combined; continue with looking for compositions */
2126 continue;
2127 }
2128 }
b75a7d8f 2129
374ca955
A
2130 /*
2131 * now: cc==0 and the combining index does not include "forward" ->
2132 * the rest of the loop body will reset starter to NULL;
2133 * technically, a composed Hangul syllable is a starter, but it
2134 * does not combine forward now that we have consumed all eligible Jamos;
2135 * for Jamo V/T, combineFlags does not contain _NORM_COMBINES_FWD
2136 */
b75a7d8f 2137
374ca955
A
2138 } else if(
2139 /* the starter is not a Hangul LV or Jamo V/T and */
2140 !(combineFwdIndex&0x8000) &&
2141 /* the combining mark is not blocked and */
2142 ((options&UNORM_BEFORE_PRI_29) ?
2143 (prevCC!=cc || prevCC==0) :
2144 (prevCC<cc || prevCC==0)) &&
2145 /* the starter and the combining mark (c, c2) do combine and */
2146 0!=(result=_combine(combiningTable+combineFwdIndex, combineBackIndex, value, value2)) &&
2147 /* the composition result is not excluded */
2148 !nx_contains(nx, value, value2)
2149 ) {
2150 /* replace the starter with the composition, remove the combining mark */
2151 pRemove= c2==0 ? p-1 : p-2; /* pointer to the combining mark */
b75a7d8f
A
2152
2153 /* replace the starter with the composition */
2154 *starter=(UChar)value;
2155 if(starterIsSupplementary) {
2156 if(value2!=0) {
2157 /* both are supplementary */
2158 *(starter+1)=(UChar)value2;
2159 } else {
2160 /* the composition is shorter than the starter, move the intermediate characters forward one */
2161 starterIsSupplementary=FALSE;
2162 q=starter+1;
2163 r=q+1;
2164 while(r<pRemove) {
2165 *q++=*r++;
2166 }
2167 --pRemove;
2168 }
2169 } else if(value2!=0) {
2170 /* the composition is longer than the starter, move the intermediate characters back one */
2171 starterIsSupplementary=TRUE;
2172 ++starter; /* temporarily increment for the loop boundary */
2173 q=pRemove;
2174 r=++pRemove;
2175 while(starter<q) {
2176 *--r=*--q;
2177 }
2178 *starter=(UChar)value2;
2179 --starter; /* undo the temporary increment */
2180 /* } else { both are on the BMP, nothing more to do */
2181 }
2182
2183 /* remove the combining mark by moving the following text over it */
2184 if(pRemove<p) {
2185 q=pRemove;
2186 r=p;
2187 while(r<limit) {
2188 *q++=*r++;
2189 }
2190 p=pRemove;
2191 limit=q;
2192 }
2193
2194 /* keep prevCC because we removed the combining mark */
2195
2196 /* done? */
2197 if(p==limit) {
2198 return prevCC;
2199 }
2200
2201 /* is the composition a starter that combines forward? */
2202 if(result>1) {
2203 combineFwdIndex=_getCombiningIndexFromStarter((UChar)value, (UChar)value2);
2204 } else {
2205 starter=NULL;
2206 }
2207
374ca955 2208 /* we combined; continue with looking for compositions */
b75a7d8f
A
2209 continue;
2210 }
2211 }
2212
374ca955
A
2213 /* no combination this time */
2214 prevCC=cc;
2215 if(p==limit) {
2216 return prevCC;
2217 }
2218
2219 /* if (c, c2) did not combine, then check if it is a starter */
2220 if(cc==0) {
2221 /* found a new starter; combineFlags==0 if (c, c2) is excluded */
2222 if(combineFlags&_NORM_COMBINES_FWD) {
2223 /* it may combine with something, prepare for it */
2224 if(c2==0) {
2225 starterIsSupplementary=FALSE;
2226 starter=p-1;
2227 } else {
2228 starterIsSupplementary=TRUE;
2229 starter=p-2;
2230 }
2231 combineFwdIndex=combineBackIndex;
2232 } else {
2233 /* it will not combine with anything */
2234 starter=NULL;
2235 }
2236 } else if(options&_NORM_OPTIONS_COMPOSE_CONTIGUOUS) {
2237 /* FCC: no discontiguous compositions; any intervening character blocks */
2238 starter=NULL;
2239 }
2240 }
2241}
2242
2243/* decompose and recompose [prevStarter..src[ */
2244static const UChar *
2245_composePart(UChar *stackBuffer, UChar *&buffer, int32_t &bufferCapacity, int32_t &length,
2246 const UChar *prevStarter, const UChar *src,
2247 uint8_t &prevCC,
2248 int32_t options, const UnicodeSet *nx,
2249 UErrorCode *pErrorCode) {
2250 UChar *recomposeLimit;
2251 uint8_t trailCC;
2252 UBool compat;
2253
2254 compat=(UBool)((options&_NORM_OPTIONS_COMPAT)!=0);
2255
2256 /* decompose [prevStarter..src[ */
2257 length=_decompose(buffer, bufferCapacity,
73c04bcf 2258 prevStarter, (int32_t)(src-prevStarter),
374ca955
A
2259 compat, nx,
2260 trailCC);
2261 if(length>bufferCapacity) {
2262 if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*length, 0)) {
2263 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
2264 return NULL;
2265 }
2266 length=_decompose(buffer, bufferCapacity,
73c04bcf 2267 prevStarter, (int32_t)(src-prevStarter),
374ca955
A
2268 compat, nx,
2269 trailCC);
2270 }
2271
2272 /* recompose the decomposition */
2273 recomposeLimit=buffer+length;
2274 if(length>=2) {
2275 prevCC=_recompose(buffer, recomposeLimit, options, nx);
2276 }
2277
2278 /* return with a pointer to the recomposition and its length */
73c04bcf 2279 length=(int32_t)(recomposeLimit-buffer);
374ca955
A
2280 return buffer;
2281}
2282
2283static int32_t
2284_compose(UChar *dest, int32_t destCapacity,
2285 const UChar *src, int32_t srcLength,
2286 int32_t options, const UnicodeSet *nx,
2287 UErrorCode *pErrorCode) {
2288 UChar stackBuffer[_STACK_BUFFER_CAPACITY];
2289 UChar *buffer;
2290 int32_t bufferCapacity;
2291
2292 const UChar *limit, *prevSrc, *prevStarter;
2293 uint32_t norm32, ccOrQCMask, qcMask;
2294 int32_t destIndex, reorderStartIndex, length;
2295 UChar c, c2, minNoMaybe;
2296 uint8_t cc, prevCC;
2297
2298 if(options&_NORM_OPTIONS_COMPAT) {
2299 minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
2300 qcMask=_NORM_QC_NFKC;
2301 } else {
2302 minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
2303 qcMask=_NORM_QC_NFC;
2304 }
2305
2306 /* initialize */
2307 buffer=stackBuffer;
2308 bufferCapacity=_STACK_BUFFER_CAPACITY;
2309
2310 /*
2311 * prevStarter points to the last character before the current one
2312 * that is a "true" starter with cc==0 and quick check "yes".
2313 *
2314 * prevStarter will be used instead of looking for a true starter
2315 * while incrementally decomposing [prevStarter..prevSrc[
2316 * in _composePart(). Having a good prevStarter allows to just decompose
2317 * the entire [prevStarter..prevSrc[.
2318 *
2319 * When _composePart() backs out from prevSrc back to prevStarter,
2320 * then it also backs out destIndex by the same amount.
2321 * Therefore, at all times, the (prevSrc-prevStarter) source units
2322 * must correspond 1:1 to destination units counted with destIndex,
2323 * except for reordering.
2324 * This is true for the qc "yes" characters copied in the fast loop,
2325 * and for pure reordering.
2326 * prevStarter must be set forward to src when this is not true:
2327 * In _composePart() and after composing a Hangul syllable.
2328 *
2329 * This mechanism relies on the assumption that the decomposition of a true starter
2330 * also begins with a true starter. gennorm/store.c checks for this.
2331 */
2332 prevStarter=src;
2333
2334 ccOrQCMask=_NORM_CC_MASK|qcMask;
2335 destIndex=reorderStartIndex=0;
2336 prevCC=0;
2337
2338 /* avoid compiler warnings */
2339 norm32=0;
2340 c=0;
2341
2342 if(srcLength>=0) {
2343 /* string with length */
2344 limit=src+srcLength;
2345 } else /* srcLength==-1 */ {
2346 /* zero-terminated string */
2347 limit=NULL;
2348 }
2349
2350 U_ALIGN_CODE(16);
2351
2352 for(;;) {
2353 /* count code units below the minimum or with irrelevant data for the quick check */
2354 prevSrc=src;
2355 if(limit==NULL) {
2356 while((c=*src)<minNoMaybe ? c!=0 : ((norm32=_getNorm32(c))&ccOrQCMask)==0) {
2357 prevCC=0;
2358 ++src;
2359 }
2360 } else {
2361 while(src!=limit && ((c=*src)<minNoMaybe || ((norm32=_getNorm32(c))&ccOrQCMask)==0)) {
2362 prevCC=0;
2363 ++src;
2364 }
2365 }
2366
2367 /* copy these code units all at once */
2368 if(src!=prevSrc) {
2369 length=(int32_t)(src-prevSrc);
2370 if((destIndex+length)<=destCapacity) {
2371 uprv_memcpy(dest+destIndex, prevSrc, length*U_SIZEOF_UCHAR);
2372 }
2373 destIndex+=length;
2374 reorderStartIndex=destIndex;
2375
2376 /* set prevStarter to the last character in the quick check loop */
2377 prevStarter=src-1;
2378 if(UTF_IS_SECOND_SURROGATE(*prevStarter) && prevSrc<prevStarter && UTF_IS_FIRST_SURROGATE(*(prevStarter-1))) {
2379 --prevStarter;
2380 }
2381
2382 prevSrc=src;
2383 }
2384
2385 /* end of source reached? */
2386 if(limit==NULL ? c==0 : src==limit) {
2387 break;
2388 }
2389
2390 /* c already contains *src and norm32 is set for it, increment src */
2391 ++src;
2392
2393 /*
2394 * source buffer pointers:
2395 *
2396 * all done quick check current char not yet
2397 * "yes" but (c, c2) processed
2398 * may combine
2399 * forward
2400 * [-------------[-------------[-------------[-------------[
2401 * | | | | |
2402 * start prevStarter prevSrc src limit
2403 *
2404 *
2405 * destination buffer pointers and indexes:
2406 *
2407 * all done might take not filled yet
2408 * characters for
2409 * reordering
2410 * [-------------[-------------[-------------[
2411 * | | | |
2412 * dest reorderStartIndex destIndex destCapacity
2413 */
2414
2415 /* check one above-minimum, relevant code unit */
2416 /*
2417 * norm32 is for c=*(src-1), and the quick check flag is "no" or "maybe", and/or cc!=0
2418 * check for Jamo V/T, then for surrogates and regular characters
2419 * c is not a Hangul syllable or Jamo L because
2420 * they are not marked with no/maybe for NFC & NFKC (and their cc==0)
2421 */
2422 if(isNorm32HangulOrJamo(norm32)) {
2423 /*
2424 * c is a Jamo V/T:
2425 * try to compose with the previous character, Jamo V also with a following Jamo T,
2426 * and set values here right now in case we just continue with the main loop
2427 */
2428 prevCC=cc=0;
2429 reorderStartIndex=destIndex;
2430
2431 if(
2432 destIndex>0 &&
2433 _composeHangul(
2434 *(prevSrc-1), c, norm32, src, limit, (UBool)((options&_NORM_OPTIONS_COMPAT)!=0),
2435 destIndex<=destCapacity ? dest+(destIndex-1) : 0,
2436 nx)
2437 ) {
2438 prevStarter=src;
2439 continue;
2440 }
2441
2442 /* the Jamo V/T did not compose into a Hangul syllable, just append to dest */
2443 c2=0;
2444 length=1;
2445 prevStarter=prevSrc;
2446 } else {
2447 if(isNorm32Regular(norm32)) {
2448 c2=0;
2449 length=1;
2450 } else {
2451 /* c is a lead surrogate, get the real norm32 */
2452 if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
2453 ++src;
2454 length=2;
2455 norm32=_getNorm32FromSurrogatePair(norm32, c2);
2456 } else {
2457 /* c is an unpaired lead surrogate, nothing to do */
2458 c2=0;
2459 length=1;
2460 norm32=0;
2461 }
2462 }
2463
2464 /* we are looking at the character (c, c2) at [prevSrc..src[ */
2465 if(nx_contains(nx, c, c2)) {
2466 /* excluded: norm32==0 */
2467 cc=0;
2468 } else if((norm32&qcMask)==0) {
2469 cc=(uint8_t)(norm32>>_NORM_CC_SHIFT);
2470 } else {
2471 const UChar *p;
2472 uint32_t decompQCMask;
2473
2474 /*
2475 * find appropriate boundaries around this character,
2476 * decompose the source text from between the boundaries,
2477 * and recompose it
2478 *
2479 * this puts the intermediate text into the side buffer because
2480 * it might be longer than the recomposition end result,
2481 * or the destination buffer may be too short or missing
2482 *
2483 * note that destIndex may be adjusted backwards to account
2484 * for source text that passed the quick check but needed to
2485 * take part in the recomposition
2486 */
2487 decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */
2488
2489 /*
2490 * find the last true starter in [prevStarter..src[
2491 * it is either the decomposition of the current character (at prevSrc),
2492 * or prevStarter
2493 */
2494 if(_isTrueStarter(norm32, ccOrQCMask, decompQCMask)) {
2495 prevStarter=prevSrc;
2496 } else {
2497 /* adjust destIndex: back out what had been copied with qc "yes" */
2498 destIndex-=(int32_t)(prevSrc-prevStarter);
2499 }
2500
2501 /* find the next true starter in [src..limit[ - modifies src to point to the next starter */
2502 src=_findNextStarter(src, limit, qcMask, decompQCMask, minNoMaybe);
2503
2504 /* compose [prevStarter..src[ */
2505 p=_composePart(stackBuffer, buffer, bufferCapacity,
2506 length, /* output */
2507 prevStarter, src,
2508 prevCC, /* output */
2509 options, nx,
2510 pErrorCode);
2511
2512 if(p==NULL) {
2513 destIndex=0; /* an error occurred (out of memory) */
2514 break;
2515 }
2516
2517 /* append the recomposed buffer contents to the destination buffer */
2518 if((destIndex+length)<=destCapacity) {
2519 while(length>0) {
2520 dest[destIndex++]=*p++;
2521 --length;
2522 }
2523 } else {
2524 /* buffer overflow */
2525 /* keep incrementing the destIndex for preflighting */
2526 destIndex+=length;
2527 }
2528
2529 /* set the next starter */
2530 prevStarter=src;
2531
2532 continue;
2533 }
2534 }
2535
2536 /* append the single code point (c, c2) to the destination buffer */
2537 if((destIndex+length)<=destCapacity) {
2538 if(cc!=0 && cc<prevCC) {
2539 /* (c, c2) is out of order with respect to the preceding text */
2540 UChar *reorderSplit=dest+destIndex;
2541 destIndex+=length;
2542 prevCC=_insertOrdered(dest+reorderStartIndex, reorderSplit, dest+destIndex, c, c2, cc);
2543 } else {
2544 /* just append (c, c2) */
2545 dest[destIndex++]=c;
2546 if(c2!=0) {
2547 dest[destIndex++]=c2;
2548 }
2549 prevCC=cc;
2550 }
2551 } else {
2552 /* buffer overflow */
2553 /* keep incrementing the destIndex for preflighting */
2554 destIndex+=length;
2555 prevCC=cc;
2556 }
2557 }
2558
2559 /* cleanup */
2560 if(buffer!=stackBuffer) {
2561 uprv_free(buffer);
2562 }
2563
2564 return destIndex;
2565}
2566
2567U_CAPI int32_t U_EXPORT2
2568unorm_compose(UChar *dest, int32_t destCapacity,
2569 const UChar *src, int32_t srcLength,
2570 UBool compat, int32_t options,
2571 UErrorCode *pErrorCode) {
2572 const UnicodeSet *nx;
2573 int32_t destIndex;
2574
2575 if(!_haveData(*pErrorCode)) {
2576 return 0;
2577 }
2578
2579 nx=getNX(options, *pErrorCode);
2580 if(U_FAILURE(*pErrorCode)) {
2581 return 0;
2582 }
2583
2584 /* reset options bits that should only be set here or inside _compose() */
2585 options&=~(_NORM_OPTIONS_SETS_MASK|_NORM_OPTIONS_COMPAT|_NORM_OPTIONS_COMPOSE_CONTIGUOUS);
2586
2587 if(compat) {
2588 options|=_NORM_OPTIONS_COMPAT;
2589 }
2590
2591 destIndex=_compose(dest, destCapacity,
2592 src, srcLength,
2593 options, nx,
2594 pErrorCode);
2595
2596 return u_terminateUChars(dest, destCapacity, destIndex, pErrorCode);
2597}
2598
2599/* make FCD ----------------------------------------------------------------- */
2600
2601static const UChar *
2602_findSafeFCD(const UChar *src, const UChar *limit, uint16_t fcd16) {
2603 UChar c, c2;
2604
2605 /*
2606 * find the first position in [src..limit[ after some cc==0 according to FCD data
2607 *
2608 * at the beginning of the loop, we have fcd16 from before src
2609 *
2610 * stop at positions:
2611 * - after trail cc==0
2612 * - at the end of the source
2613 * - before lead cc==0
2614 */
2615 for(;;) {
2616 /* stop if trail cc==0 for the previous character */
2617 if((fcd16&0xff)==0) {
2618 break;
2619 }
2620
2621 /* get c=*src - stop at end of string */
2622 if(src==limit) {
2623 break;
2624 }
2625 c=*src;
2626
2627 /* stop if lead cc==0 for this character */
2628 if(c<_NORM_MIN_WITH_LEAD_CC || (fcd16=_getFCD16(c))==0) {
2629 break; /* catches terminating NUL, too */
2630 }
2631
2632 if(!UTF_IS_FIRST_SURROGATE(c)) {
2633 if(fcd16<=0xff) {
2634 break;
2635 }
2636 ++src;
2637 } else if((src+1)!=limit && (c2=*(src+1), UTF_IS_SECOND_SURROGATE(c2))) {
2638 /* c is a lead surrogate, get the real fcd16 */
2639 fcd16=_getFCD16FromSurrogatePair(fcd16, c2);
2640 if(fcd16<=0xff) {
2641 break;
2642 }
2643 src+=2;
2644 } else {
2645 /* c is an unpaired first surrogate, lead cc==0 */
2646 break;
2647 }
2648 }
2649
2650 return src;
2651}
2652
2653static uint8_t
2654_decomposeFCD(const UChar *src, const UChar *decompLimit,
2655 UChar *dest, int32_t &destIndex, int32_t destCapacity,
2656 const UnicodeSet *nx) {
2657 const UChar *p;
2658 uint32_t norm32;
2659 int32_t reorderStartIndex, length;
2660 UChar c, c2;
2661 uint8_t cc, prevCC, trailCC;
2662
2663 /*
2664 * canonically decompose [src..decompLimit[
2665 *
2666 * all characters in this range have some non-zero cc,
2667 * directly or in decomposition,
2668 * so that we do not need to check in the following for quick-check limits etc.
2669 *
2670 * there _are_ _no_ Hangul syllables or Jamos in here because they are FCD-safe (cc==0)!
2671 *
2672 * we also do not need to check for c==0 because we have an established decompLimit
2673 */
2674 reorderStartIndex=destIndex;
2675 prevCC=0;
2676
2677 while(src<decompLimit) {
2678 c=*src++;
2679 norm32=_getNorm32(c);
2680 if(isNorm32Regular(norm32)) {
2681 c2=0;
2682 length=1;
2683 } else {
2684 /*
2685 * reminder: this function is called with [src..decompLimit[
2686 * not containing any Hangul/Jamo characters,
2687 * therefore the only specials are lead surrogates
2688 */
2689 /* c is a lead surrogate, get the real norm32 */
2690 if(src!=decompLimit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
2691 ++src;
2692 length=2;
2693 norm32=_getNorm32FromSurrogatePair(norm32, c2);
2694 } else {
2695 c2=0;
2696 length=1;
2697 norm32=0;
2698 }
2699 }
2700
2701 /* get the decomposition and the lead and trail cc's */
2702 if(nx_contains(nx, c, c2)) {
2703 /* excluded: norm32==0 */
2704 cc=trailCC=0;
2705 p=NULL;
2706 } else if((norm32&_NORM_QC_NFD)==0) {
2707 /* c does not decompose */
2708 cc=trailCC=(uint8_t)(norm32>>_NORM_CC_SHIFT);
2709 p=NULL;
2710 } else {
2711 /* c decomposes, get everything from the variable-length extra data */
2712 p=_decompose(norm32, length, cc, trailCC);
2713 if(length==1) {
2714 /* fastpath a single code unit from decomposition */
2715 c=*p;
2716 c2=0;
2717 p=NULL;
2718 }
2719 }
2720
2721 /* append the decomposition to the destination buffer, assume length>0 */
2722 if((destIndex+length)<=destCapacity) {
2723 UChar *reorderSplit=dest+destIndex;
2724 if(p==NULL) {
2725 /* fastpath: single code point */
2726 if(cc!=0 && cc<prevCC) {
2727 /* (c, c2) is out of order with respect to the preceding text */
2728 destIndex+=length;
2729 trailCC=_insertOrdered(dest+reorderStartIndex, reorderSplit, dest+destIndex, c, c2, cc);
b75a7d8f 2730 } else {
374ca955
A
2731 /* just append (c, c2) */
2732 dest[destIndex++]=c;
2733 if(c2!=0) {
2734 dest[destIndex++]=c2;
2735 }
b75a7d8f 2736 }
b75a7d8f 2737 } else {
374ca955
A
2738 /* general: multiple code points (ordered by themselves) from decomposition */
2739 if(cc!=0 && cc<prevCC) {
2740 /* the decomposition is out of order with respect to the preceding text */
2741 destIndex+=length;
2742 trailCC=_mergeOrdered(dest+reorderStartIndex, reorderSplit, p, p+length);
2743 } else {
2744 /* just append the decomposition */
2745 do {
2746 dest[destIndex++]=*p++;
2747 } while(--length>0);
2748 }
b75a7d8f 2749 }
374ca955
A
2750 } else {
2751 /* buffer overflow */
2752 /* keep incrementing the destIndex for preflighting */
2753 destIndex+=length;
2754 }
2755
2756 prevCC=trailCC;
2757 if(prevCC==0) {
2758 reorderStartIndex=destIndex;
b75a7d8f
A
2759 }
2760 }
374ca955
A
2761
2762 return prevCC;
b75a7d8f
A
2763}
2764
374ca955
A
2765static int32_t
2766unorm_makeFCD(UChar *dest, int32_t destCapacity,
2767 const UChar *src, int32_t srcLength,
2768 const UnicodeSet *nx,
2769 UErrorCode *pErrorCode) {
2770 const UChar *limit, *prevSrc, *decompStart;
2771 int32_t destIndex, length;
b75a7d8f 2772 UChar c, c2;
374ca955
A
2773 uint16_t fcd16;
2774 int16_t prevCC, cc;
b75a7d8f 2775
374ca955
A
2776 if(!_haveData(*pErrorCode)) {
2777 return 0;
b75a7d8f 2778 }
b75a7d8f 2779
374ca955
A
2780 /* initialize */
2781 decompStart=src;
2782 destIndex=0;
2783 prevCC=0;
b75a7d8f 2784
374ca955
A
2785 /* avoid compiler warnings */
2786 c=0;
2787 fcd16=0;
2788
2789 if(srcLength>=0) {
2790 /* string with length */
2791 limit=src+srcLength;
2792 } else /* srcLength==-1 */ {
2793 /* zero-terminated string */
2794 limit=NULL;
2795 }
2796
2797 U_ALIGN_CODE(16);
b75a7d8f
A
2798
2799 for(;;) {
374ca955
A
2800 /* skip a run of code units below the minimum or with irrelevant data for the FCD check */
2801 prevSrc=src;
2802 if(limit==NULL) {
2803 for(;;) {
2804 c=*src;
2805 if(c<_NORM_MIN_WITH_LEAD_CC) {
2806 if(c==0) {
2807 break;
2808 }
2809 prevCC=(int16_t)-c;
2810 } else if((fcd16=_getFCD16(c))==0) {
2811 prevCC=0;
2812 } else {
2813 break;
2814 }
2815 ++src;
2816 }
2817 } else {
2818 for(;;) {
2819 if(src==limit) {
2820 break;
2821 } else if((c=*src)<_NORM_MIN_WITH_LEAD_CC) {
2822 prevCC=(int16_t)-c;
2823 } else if((fcd16=_getFCD16(c))==0) {
2824 prevCC=0;
2825 } else {
2826 break;
2827 }
2828 ++src;
2829 }
b75a7d8f 2830 }
374ca955
A
2831
2832 /*
2833 * prevCC has values from the following ranges:
2834 * 0..0xff - the previous trail combining class
2835 * <0 - the negative value of the previous code unit;
2836 * that code unit was <_NORM_MIN_WITH_LEAD_CC and its _getFCD16()
2837 * was deferred so that average text is checked faster
2838 */
2839
2840 /* copy these code units all at once */
2841 if(src!=prevSrc) {
2842 length=(int32_t)(src-prevSrc);
2843 if((destIndex+length)<=destCapacity) {
2844 uprv_memcpy(dest+destIndex, prevSrc, length*U_SIZEOF_UCHAR);
2845 }
2846 destIndex+=length;
2847 prevSrc=src;
2848
2849 /* prevCC<0 is only possible from the above loop, i.e., only if prevSrc<src */
2850 if(prevCC<0) {
2851 /* the previous character was <_NORM_MIN_WITH_LEAD_CC, we need to get its trail cc */
2852 if(!nx_contains(nx, (UChar32)-prevCC)) {
2853 prevCC=(int16_t)(_getFCD16((UChar)-prevCC)&0xff);
2854 } else {
2855 prevCC=0; /* excluded: fcd16==0 */
2856 }
2857
2858 /*
2859 * set a pointer to this below-U+0300 character;
2860 * if prevCC==0 then it will moved to after this character below
2861 */
2862 decompStart=prevSrc-1;
2863 }
b75a7d8f 2864 }
374ca955
A
2865 /*
2866 * now:
2867 * prevSrc==src - used later to adjust destIndex before decomposition
2868 * prevCC>=0
2869 */
b75a7d8f 2870
374ca955
A
2871 /* end of source reached? */
2872 if(limit==NULL ? c==0 : src==limit) {
2873 break;
b75a7d8f
A
2874 }
2875
374ca955
A
2876 /* set a pointer to after the last source position where prevCC==0 */
2877 if(prevCC==0) {
2878 decompStart=prevSrc;
2879 }
b75a7d8f 2880
374ca955
A
2881 /* c already contains *src and fcd16 is set for it, increment src */
2882 ++src;
2883
2884 /* check one above-minimum, relevant code unit */
2885 if(UTF_IS_FIRST_SURROGATE(c)) {
2886 /* c is a lead surrogate, get the real fcd16 */
2887 if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
2888 ++src;
2889 fcd16=_getFCD16FromSurrogatePair(fcd16, c2);
2890 } else {
2891 c2=0;
2892 fcd16=0;
b75a7d8f
A
2893 }
2894 } else {
2895 c2=0;
2896 }
2897
374ca955
A
2898 /* we are looking at the character (c, c2) at [prevSrc..src[ */
2899 if(nx_contains(nx, c, c2)) {
2900 fcd16=0; /* excluded: fcd16==0 */
2901 }
b75a7d8f 2902
374ca955
A
2903 /* check the combining order, get the lead cc */
2904 cc=(int16_t)(fcd16>>8);
2905 if(cc==0 || cc>=prevCC) {
2906 /* the order is ok */
2907 if(cc==0) {
2908 decompStart=prevSrc;
2909 }
2910 prevCC=(int16_t)(fcd16&0xff);
2911
2912 /* just append (c, c2) */
2913 length= c2==0 ? 1 : 2;
2914 if((destIndex+length)<=destCapacity) {
2915 dest[destIndex++]=c;
2916 if(c2!=0) {
2917 dest[destIndex++]=c2;
2918 }
2919 } else {
2920 destIndex+=length;
b75a7d8f 2921 }
374ca955
A
2922 } else {
2923 /*
2924 * back out the part of the source that we copied already but
2925 * is now going to be decomposed;
2926 * prevSrc is set to after what was copied
2927 */
2928 destIndex-=(int32_t)(prevSrc-decompStart);
2929
2930 /*
2931 * find the part of the source that needs to be decomposed;
2932 * to be safe and simple, decompose to before the next character with lead cc==0
2933 */
2934 src=_findSafeFCD(src, limit, fcd16);
2935
2936 /*
2937 * the source text does not fulfill the conditions for FCD;
2938 * decompose and reorder a limited piece of the text
2939 */
2940 prevCC=_decomposeFCD(decompStart, src,
2941 dest, destIndex, destCapacity,
2942 nx);
2943 decompStart=src;
b75a7d8f 2944 }
b75a7d8f
A
2945 }
2946
374ca955 2947 return u_terminateUChars(dest, destCapacity, destIndex, pErrorCode);
b75a7d8f
A
2948}
2949
374ca955 2950/* quick check functions ---------------------------------------------------- */
b75a7d8f 2951
374ca955
A
2952static UBool
2953unorm_checkFCD(const UChar *src, int32_t srcLength, const UnicodeSet *nx) {
2954 const UChar *limit;
2955 UChar c, c2;
2956 uint16_t fcd16;
2957 int16_t prevCC, cc;
b75a7d8f 2958
374ca955
A
2959 /* initialize */
2960 prevCC=0;
b75a7d8f 2961
374ca955
A
2962 if(srcLength>=0) {
2963 /* string with length */
2964 limit=src+srcLength;
2965 } else /* srcLength==-1 */ {
2966 /* zero-terminated string */
2967 limit=NULL;
b75a7d8f
A
2968 }
2969
374ca955 2970 U_ALIGN_CODE(16);
b75a7d8f 2971
374ca955
A
2972 for(;;) {
2973 /* skip a run of code units below the minimum or with irrelevant data for the FCD check */
2974 if(limit==NULL) {
2975 for(;;) {
2976 c=*src++;
2977 if(c<_NORM_MIN_WITH_LEAD_CC) {
2978 if(c==0) {
2979 return TRUE;
b75a7d8f 2980 }
374ca955
A
2981 /*
2982 * delay _getFCD16(c) for any character <_NORM_MIN_WITH_LEAD_CC
2983 * because chances are good that the next one will have
2984 * a leading cc of 0;
2985 * _getFCD16(-prevCC) is later called when necessary -
2986 * -c fits into int16_t because it is <_NORM_MIN_WITH_LEAD_CC==0x300
2987 */
2988 prevCC=(int16_t)-c;
2989 } else if((fcd16=_getFCD16(c))==0) {
2990 prevCC=0;
2991 } else {
2992 break;
b75a7d8f
A
2993 }
2994 }
374ca955
A
2995 } else {
2996 for(;;) {
2997 if(src==limit) {
2998 return TRUE;
2999 } else if((c=*src++)<_NORM_MIN_WITH_LEAD_CC) {
3000 prevCC=(int16_t)-c;
3001 } else if((fcd16=_getFCD16(c))==0) {
3002 prevCC=0;
3003 } else {
3004 break;
b75a7d8f 3005 }
b75a7d8f 3006 }
374ca955
A
3007 }
3008
3009 /* check one above-minimum, relevant code unit */
3010 if(UTF_IS_FIRST_SURROGATE(c)) {
3011 /* c is a lead surrogate, get the real fcd16 */
3012 if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
3013 ++src;
3014 fcd16=_getFCD16FromSurrogatePair(fcd16, c2);
3015 } else {
3016 c2=0;
3017 fcd16=0;
b75a7d8f 3018 }
374ca955
A
3019 } else {
3020 c2=0;
b75a7d8f 3021 }
374ca955
A
3022
3023 if(nx_contains(nx, c, c2)) {
3024 prevCC=0; /* excluded: fcd16==0 */
3025 continue;
b75a7d8f 3026 }
374ca955
A
3027
3028 /*
3029 * prevCC has values from the following ranges:
3030 * 0..0xff - the previous trail combining class
3031 * <0 - the negative value of the previous code unit;
3032 * that code unit was <_NORM_MIN_WITH_LEAD_CC and its _getFCD16()
3033 * was deferred so that average text is checked faster
3034 */
3035
3036 /* check the combining order */
3037 cc=(int16_t)(fcd16>>8);
3038 if(cc!=0) {
3039 if(prevCC<0) {
3040 /* the previous character was <_NORM_MIN_WITH_LEAD_CC, we need to get its trail cc */
3041 if(!nx_contains(nx, (UChar32)-prevCC)) {
3042 prevCC=(int16_t)(_getFCD16((UChar)-prevCC)&0xff);
3043 } else {
3044 prevCC=0; /* excluded: fcd16==0 */
3045 }
3046 }
3047
3048 if(cc<prevCC) {
3049 return FALSE;
3050 }
b75a7d8f 3051 }
374ca955 3052 prevCC=(int16_t)(fcd16&0xff);
b75a7d8f 3053 }
b75a7d8f
A
3054}
3055
374ca955
A
3056static UNormalizationCheckResult
3057_quickCheck(const UChar *src,
3058 int32_t srcLength,
3059 UNormalizationMode mode,
3060 UBool allowMaybe,
3061 const UnicodeSet *nx,
3062 UErrorCode *pErrorCode) {
b75a7d8f
A
3063 UChar stackBuffer[_STACK_BUFFER_CAPACITY];
3064 UChar *buffer;
3065 int32_t bufferCapacity;
3066
374ca955
A
3067 const UChar *start, *limit;
3068 uint32_t norm32, qcNorm32, ccOrQCMask, qcMask;
3069 int32_t options;
b75a7d8f
A
3070 UChar c, c2, minNoMaybe;
3071 uint8_t cc, prevCC;
374ca955 3072 UNormalizationCheckResult result;
b75a7d8f 3073
374ca955
A
3074 /* check arguments */
3075 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3076 return UNORM_MAYBE;
3077 }
3078
3079 if(src==NULL || srcLength<-1) {
3080 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3081 return UNORM_MAYBE;
3082 }
3083
3084 if(!_haveData(*pErrorCode)) {
3085 return UNORM_MAYBE;
3086 }
3087
3088 /* check for a valid mode and set the quick check minimum and mask */
3089 switch(mode) {
3090 case UNORM_NFC:
b75a7d8f
A
3091 minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
3092 qcMask=_NORM_QC_NFC;
374ca955
A
3093 options=0;
3094 break;
3095 case UNORM_NFKC:
b75a7d8f
A
3096 minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
3097 qcMask=_NORM_QC_NFKC;
374ca955
A
3098 options=_NORM_OPTIONS_COMPAT;
3099 break;
3100 case UNORM_NFD:
3101 minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE];
3102 qcMask=_NORM_QC_NFD;
3103 options=0;
3104 break;
3105 case UNORM_NFKD:
3106 minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE];
3107 qcMask=_NORM_QC_NFKD;
3108 options=_NORM_OPTIONS_COMPAT;
3109 break;
3110 case UNORM_FCD:
73c04bcf
A
3111 if(fcdTrie.index==NULL) {
3112 *pErrorCode=U_UNSUPPORTED_ERROR;
3113 return UNORM_MAYBE;
3114 }
374ca955
A
3115 return unorm_checkFCD(src, srcLength, nx) ? UNORM_YES : UNORM_NO;
3116 default:
3117 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3118 return UNORM_MAYBE;
b75a7d8f
A
3119 }
3120
374ca955
A
3121 /* initialize */
3122 buffer=stackBuffer;
3123 bufferCapacity=_STACK_BUFFER_CAPACITY;
3124
b75a7d8f 3125 ccOrQCMask=_NORM_CC_MASK|qcMask;
374ca955 3126 result=UNORM_YES;
b75a7d8f
A
3127 prevCC=0;
3128
374ca955 3129 start=src;
b75a7d8f
A
3130 if(srcLength>=0) {
3131 /* string with length */
3132 limit=src+srcLength;
3133 } else /* srcLength==-1 */ {
3134 /* zero-terminated string */
3135 limit=NULL;
3136 }
3137
3138 U_ALIGN_CODE(16);
3139
3140 for(;;) {
374ca955 3141 /* skip a run of code units below the minimum or with irrelevant data for the quick check */
b75a7d8f 3142 if(limit==NULL) {
374ca955
A
3143 for(;;) {
3144 c=*src++;
3145 if(c<minNoMaybe) {
3146 if(c==0) {
3147 goto endloop; /* break out of outer loop */
3148 }
3149 } else if(((norm32=_getNorm32(c))&ccOrQCMask)!=0) {
3150 break;
3151 }
b75a7d8f 3152 prevCC=0;
b75a7d8f
A
3153 }
3154 } else {
374ca955
A
3155 for(;;) {
3156 if(src==limit) {
3157 goto endloop; /* break out of outer loop */
3158 } else if((c=*src++)>=minNoMaybe && ((norm32=_getNorm32(c))&ccOrQCMask)!=0) {
3159 break;
3160 }
b75a7d8f 3161 prevCC=0;
b75a7d8f
A
3162 }
3163 }
3164
374ca955
A
3165 /* check one above-minimum, relevant code unit */
3166 if(isNorm32LeadSurrogate(norm32)) {
3167 /* c is a lead surrogate, get the real norm32 */
3168 if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
3169 ++src;
3170 norm32=_getNorm32FromSurrogatePair(norm32, c2);
3171 } else {
3172 c2=0;
3173 norm32=0;
b75a7d8f 3174 }
374ca955
A
3175 } else {
3176 c2=0;
3177 }
b75a7d8f 3178
374ca955
A
3179 if(nx_contains(nx, c, c2)) {
3180 /* excluded: norm32==0 */
3181 norm32=0;
b75a7d8f
A
3182 }
3183
374ca955
A
3184 /* check the combining order */
3185 cc=(uint8_t)(norm32>>_NORM_CC_SHIFT);
3186 if(cc!=0 && cc<prevCC) {
3187 result=UNORM_NO;
b75a7d8f
A
3188 break;
3189 }
374ca955 3190 prevCC=cc;
b75a7d8f 3191
374ca955
A
3192 /* check for "no" or "maybe" quick check flags */
3193 qcNorm32=norm32&qcMask;
3194 if(qcNorm32&_NORM_QC_ANY_NO) {
3195 result=UNORM_NO;
3196 break;
3197 } else if(qcNorm32!=0) {
3198 /* "maybe" can only occur for NFC and NFKC */
3199 if(allowMaybe) {
3200 result=UNORM_MAYBE;
b75a7d8f 3201 } else {
374ca955
A
3202 /* normalize a section around here to see if it is really normalized or not */
3203 const UChar *prevStarter;
b75a7d8f 3204 uint32_t decompQCMask;
374ca955 3205 int32_t length;
b75a7d8f 3206
b75a7d8f
A
3207 decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */
3208
374ca955
A
3209 /* find the previous starter */
3210 prevStarter=src-1; /* set prevStarter to the beginning of the current character */
3211 if(UTF_IS_TRAIL(*prevStarter)) {
3212 --prevStarter; /* safe because unpaired surrogates do not result in "maybe" */
b75a7d8f 3213 }
374ca955 3214 prevStarter=_findPreviousStarter(start, prevStarter, ccOrQCMask, decompQCMask, minNoMaybe);
b75a7d8f
A
3215
3216 /* find the next true starter in [src..limit[ - modifies src to point to the next starter */
3217 src=_findNextStarter(src, limit, qcMask, decompQCMask, minNoMaybe);
3218
374ca955
A
3219 /* decompose and recompose [prevStarter..src[ */
3220 _composePart(stackBuffer, buffer, bufferCapacity,
3221 length,
3222 prevStarter,
3223 src,
3224 prevCC,
3225 options, nx, pErrorCode);
3226 if(U_FAILURE(*pErrorCode)) {
3227 result=UNORM_MAYBE; /* error (out of memory) */
b75a7d8f
A
3228 break;
3229 }
3230
374ca955
A
3231 /* compare the normalized version with the original */
3232 if(0!=uprv_strCompare(prevStarter, (int32_t)(src-prevStarter), buffer, length, FALSE, FALSE)) {
3233 result=UNORM_NO; /* normalization differs */
3234 break;
b75a7d8f
A
3235 }
3236
374ca955 3237 /* continue after the next starter */
b75a7d8f 3238 }
b75a7d8f
A
3239 }
3240 }
374ca955 3241endloop:
b75a7d8f 3242
b75a7d8f
A
3243 if(buffer!=stackBuffer) {
3244 uprv_free(buffer);
3245 }
3246
374ca955
A
3247 return result;
3248}
3249
3250U_CAPI UNormalizationCheckResult U_EXPORT2
3251unorm_quickCheck(const UChar *src,
3252 int32_t srcLength,
3253 UNormalizationMode mode,
3254 UErrorCode *pErrorCode) {
3255 return _quickCheck(src, srcLength, mode, TRUE, NULL, pErrorCode);
b75a7d8f
A
3256}
3257
374ca955
A
3258U_CAPI UNormalizationCheckResult U_EXPORT2
3259unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength,
3260 UNormalizationMode mode, int32_t options,
3261 UErrorCode *pErrorCode) {
3262 return _quickCheck(src, srcLength, mode, TRUE, getNX(options, *pErrorCode), pErrorCode);
3263}
b75a7d8f 3264
374ca955
A
3265U_CFUNC UNormalizationCheckResult
3266unorm_internalQuickCheck(const UChar *src,
3267 int32_t srcLength,
3268 UNormalizationMode mode,
3269 UBool allowMaybe,
3270 const UnicodeSet *nx,
3271 UErrorCode *pErrorCode) {
3272 return _quickCheck(src, srcLength, mode, allowMaybe, nx, pErrorCode);
3273}
b75a7d8f 3274
374ca955
A
3275U_CAPI UBool U_EXPORT2
3276unorm_isNormalized(const UChar *src, int32_t srcLength,
3277 UNormalizationMode mode,
3278 UErrorCode *pErrorCode) {
3279 return (UBool)(UNORM_YES==_quickCheck(src, srcLength, mode, FALSE, NULL, pErrorCode));
3280}
b75a7d8f 3281
374ca955
A
3282U_CAPI UBool U_EXPORT2
3283unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength,
3284 UNormalizationMode mode, int32_t options,
3285 UErrorCode *pErrorCode) {
3286 return (UBool)(UNORM_YES==_quickCheck(src, srcLength, mode, FALSE, getNX(options, *pErrorCode), pErrorCode));
b75a7d8f
A
3287}
3288
3289/* normalize() API ---------------------------------------------------------- */
3290
3291/**
3292 * Internal API for normalizing.
3293 * Does not check for bad input.
3294 * Requires _haveData() to be true.
3295 * @internal
3296 */
374ca955
A
3297U_CFUNC int32_t
3298unorm_internalNormalizeWithNX(UChar *dest, int32_t destCapacity,
3299 const UChar *src, int32_t srcLength,
3300 UNormalizationMode mode, int32_t options, const UnicodeSet *nx,
3301 UErrorCode *pErrorCode) {
b75a7d8f
A
3302 int32_t destLength;
3303 uint8_t trailCC;
3304
3305 switch(mode) {
3306 case UNORM_NFD:
3307 destLength=_decompose(dest, destCapacity,
3308 src, srcLength,
3309 FALSE, nx, trailCC);
3310 break;
3311 case UNORM_NFKD:
3312 destLength=_decompose(dest, destCapacity,
3313 src, srcLength,
3314 TRUE, nx, trailCC);
3315 break;
3316 case UNORM_NFC:
3317 destLength=_compose(dest, destCapacity,
3318 src, srcLength,
374ca955 3319 options, nx, pErrorCode);
b75a7d8f
A
3320 break;
3321 case UNORM_NFKC:
3322 destLength=_compose(dest, destCapacity,
3323 src, srcLength,
374ca955 3324 options|_NORM_OPTIONS_COMPAT, nx, pErrorCode);
b75a7d8f
A
3325 break;
3326 case UNORM_FCD:
73c04bcf
A
3327 if(fcdTrie.index==NULL) {
3328 *pErrorCode=U_UNSUPPORTED_ERROR;
3329 return 0;
3330 }
b75a7d8f
A
3331 return unorm_makeFCD(dest, destCapacity,
3332 src, srcLength,
3333 nx,
3334 pErrorCode);
374ca955
A
3335#if 0
3336 case UNORM_FCC:
3337 destLength=_compose(dest, destCapacity,
3338 src, srcLength,
3339 options|_NORM_OPTIONS_COMPOSE_CONTIGUOUS, nx, pErrorCode);
3340 break;
3341#endif
b75a7d8f
A
3342 case UNORM_NONE:
3343 /* just copy the string */
3344 if(srcLength==-1) {
3345 srcLength=u_strlen(src);
3346 }
3347 if(srcLength>0 && srcLength<=destCapacity) {
3348 uprv_memcpy(dest, src, srcLength*U_SIZEOF_UCHAR);
3349 }
3350 destLength=srcLength;
3351 break;
3352 default:
3353 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3354 return 0;
3355 }
3356
3357 return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
3358}
3359
3360/**
3361 * Internal API for normalizing.
3362 * Does not check for bad input.
3363 * @internal
3364 */
3365U_CAPI int32_t U_EXPORT2
3366unorm_internalNormalize(UChar *dest, int32_t destCapacity,
3367 const UChar *src, int32_t srcLength,
3368 UNormalizationMode mode, int32_t options,
3369 UErrorCode *pErrorCode) {
3370 const UnicodeSet *nx;
3371
3372 if(!_haveData(*pErrorCode)) {
3373 return 0;
3374 }
3375
3376 nx=getNX(options, *pErrorCode);
3377 if(U_FAILURE(*pErrorCode)) {
3378 return 0;
3379 }
3380
374ca955
A
3381 /* reset options bits that should only be set inside unorm_internalNormalizeWithNX() */
3382 options&=~(_NORM_OPTIONS_SETS_MASK|_NORM_OPTIONS_COMPAT|_NORM_OPTIONS_COMPOSE_CONTIGUOUS);
3383
3384 return unorm_internalNormalizeWithNX(dest, destCapacity,
3385 src, srcLength,
3386 mode, options, nx,
3387 pErrorCode);
b75a7d8f
A
3388}
3389
3390/** Public API for normalizing. */
3391U_CAPI int32_t U_EXPORT2
3392unorm_normalize(const UChar *src, int32_t srcLength,
3393 UNormalizationMode mode, int32_t options,
3394 UChar *dest, int32_t destCapacity,
3395 UErrorCode *pErrorCode) {
3396 /* check argument values */
3397 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3398 return 0;
3399 }
3400
3401 if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
3402 src==NULL || srcLength<-1
3403 ) {
3404 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3405 return 0;
3406 }
3407
3408 /* check for overlapping src and destination */
3409 if( dest!=NULL &&
3410 ((src>=dest && src<(dest+destCapacity)) ||
3411 (srcLength>0 && dest>=src && dest<(src+srcLength)))
3412 ) {
3413 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3414 return 0;
3415 }
3416
3417 return unorm_internalNormalize(dest, destCapacity,
3418 src, srcLength,
3419 mode, options,
3420 pErrorCode);
3421}
3422
3423
3424/* iteration functions ------------------------------------------------------ */
3425
3426/*
3427 * These iteration functions are the core implementations of the
3428 * Normalizer class iteration API.
3429 * They read from a UCharIterator into their own buffer
3430 * and normalize into the Normalizer iteration buffer.
3431 * Normalizer itself then iterates over its buffer until that needs to be
3432 * filled again.
3433 */
3434
3435/*
3436 * ### TODO:
3437 * Now that UCharIterator.next/previous return (int32_t)-1 not (UChar)0xffff
3438 * if iteration bounds are reached,
3439 * try to not call hasNext/hasPrevious and instead check for >=0.
3440 */
3441
3442/* backward iteration ------------------------------------------------------- */
3443
3444/*
3445 * read backwards and get norm32
3446 * return 0 if the character is <minC
3447 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3448 */
3449static inline uint32_t
3450_getPrevNorm32(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2) {
3451 uint32_t norm32;
3452
3453 /* need src.hasPrevious() */
3454 c=(UChar)src.previous(&src);
3455 c2=0;
3456
3457 /* check for a surrogate before getting norm32 to see if we need to predecrement further */
3458 if(c<minC) {
3459 return 0;
3460 } else if(!UTF_IS_SURROGATE(c)) {
3461 return _getNorm32(c);
3462 } else if(UTF_IS_SURROGATE_FIRST(c) || !src.hasPrevious(&src)) {
3463 /* unpaired surrogate */
3464 return 0;
3465 } else if(UTF_IS_FIRST_SURROGATE(c2=(UChar)src.previous(&src))) {
3466 norm32=_getNorm32(c2);
3467 if((norm32&mask)==0) {
3468 /* all surrogate pairs with this lead surrogate have irrelevant data */
3469 return 0;
3470 } else {
3471 /* norm32 must be a surrogate special */
3472 return _getNorm32FromSurrogatePair(norm32, c);
3473 }
3474 } else {
3475 /* unpaired second surrogate, undo the c2=src.previous() movement */
3476 src.move(&src, 1, UITER_CURRENT);
3477 c2=0;
3478 return 0;
3479 }
3480}
3481
3482/*
3483 * read backwards and check if the character is a previous-iteration boundary
3484 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3485 */
3486typedef UBool
3487IsPrevBoundaryFn(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2);
3488
3489/*
3490 * for NF*D:
3491 * read backwards and check if the lead combining class is 0
3492 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3493 */
3494static UBool
3495_isPrevNFDSafe(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
3496 return _isNFDSafe(_getPrevNorm32(src, minC, ccOrQCMask, c, c2), ccOrQCMask, ccOrQCMask&_NORM_QC_MASK);
3497}
3498
3499/*
3500 * read backwards and check if the character is (or its decomposition begins with)
3501 * a "true starter" (cc==0 and NF*C_YES)
3502 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3503 */
3504static UBool
3505_isPrevTrueStarter(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
3506 uint32_t norm32, decompQCMask;
3507
3508 decompQCMask=(ccOrQCMask<<2)&0xf; /* decomposition quick check mask */
3509 norm32=_getPrevNorm32(src, minC, ccOrQCMask|decompQCMask, c, c2);
3510 return _isTrueStarter(norm32, ccOrQCMask, decompQCMask);
3511}
3512
3513static int32_t
3514_findPreviousIterationBoundary(UCharIterator &src,
3515 IsPrevBoundaryFn *isPrevBoundary, uint32_t minC, uint32_t mask,
3516 UChar *&buffer, int32_t &bufferCapacity,
3517 int32_t &startIndex,
3518 UErrorCode *pErrorCode) {
3519 UChar *stackBuffer;
3520 UChar c, c2;
3521 UBool isBoundary;
3522
3523 /* initialize */
3524 stackBuffer=buffer;
3525 startIndex=bufferCapacity; /* fill the buffer from the end backwards */
3526
3527 while(src.hasPrevious(&src)) {
3528 isBoundary=isPrevBoundary(src, minC, mask, c, c2);
3529
3530 /* always write this character to the front of the buffer */
3531 /* make sure there is enough space in the buffer */
3532 if(startIndex < (c2==0 ? 1 : 2)) {
3533 int32_t bufferLength=bufferCapacity;
3534
3535 if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*bufferCapacity, bufferLength)) {
3536 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
3537 src.move(&src, 0, UITER_START);
3538 return 0;
3539 }
3540
3541 /* move the current buffer contents up */
3542 uprv_memmove(buffer+(bufferCapacity-bufferLength), buffer, bufferLength*U_SIZEOF_UCHAR);
3543 startIndex+=bufferCapacity-bufferLength;
3544 }
3545
3546 buffer[--startIndex]=c;
3547 if(c2!=0) {
3548 buffer[--startIndex]=c2;
3549 }
3550
3551 /* stop if this just-copied character is a boundary */
3552 if(isBoundary) {
3553 break;
3554 }
3555 }
3556
3557 /* return the length of the buffer contents */
3558 return bufferCapacity-startIndex;
3559}
3560
3561U_CAPI int32_t U_EXPORT2
3562unorm_previous(UCharIterator *src,
3563 UChar *dest, int32_t destCapacity,
3564 UNormalizationMode mode, int32_t options,
3565 UBool doNormalize, UBool *pNeededToNormalize,
3566 UErrorCode *pErrorCode) {
3567 UChar stackBuffer[100];
3568 UChar *buffer=NULL;
3569 IsPrevBoundaryFn *isPreviousBoundary=NULL;
3570 uint32_t mask=0;
3571 int32_t startIndex=0, bufferLength=0, bufferCapacity=0, destLength=0;
3572 int32_t c=0, c2=0;
3573 UChar minC=0;
3574
3575 /* check argument values */
3576 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3577 return 0;
3578 }
3579
3580 if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
3581 src==NULL
3582 ) {
3583 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3584 return 0;
3585 }
3586
3587 if(!_haveData(*pErrorCode)) {
3588 return 0;
3589 }
3590
3591 if(pNeededToNormalize!=NULL) {
3592 *pNeededToNormalize=FALSE;
3593 }
3594
3595 switch(mode) {
b75a7d8f 3596 case UNORM_FCD:
73c04bcf
A
3597 if(fcdTrie.index==NULL) {
3598 *pErrorCode=U_UNSUPPORTED_ERROR;
3599 return 0;
3600 }
3601 /* fall through to NFD */
3602 case UNORM_NFD:
b75a7d8f
A
3603 isPreviousBoundary=_isPrevNFDSafe;
3604 minC=_NORM_MIN_WITH_LEAD_CC;
3605 mask=_NORM_CC_MASK|_NORM_QC_NFD;
3606 break;
3607 case UNORM_NFKD:
3608 isPreviousBoundary=_isPrevNFDSafe;
3609 minC=_NORM_MIN_WITH_LEAD_CC;
3610 mask=_NORM_CC_MASK|_NORM_QC_NFKD;
3611 break;
3612 case UNORM_NFC:
3613 isPreviousBoundary=_isPrevTrueStarter;
3614 minC=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
3615 mask=_NORM_CC_MASK|_NORM_QC_NFC;
3616 break;
3617 case UNORM_NFKC:
3618 isPreviousBoundary=_isPrevTrueStarter;
3619 minC=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
3620 mask=_NORM_CC_MASK|_NORM_QC_NFKC;
3621 break;
3622 case UNORM_NONE:
3623 destLength=0;
3624 if((c=src->previous(src))>=0) {
3625 destLength=1;
3626 if(UTF_IS_TRAIL(c) && (c2=src->previous(src))>=0) {
3627 if(UTF_IS_LEAD(c2)) {
3628 if(destCapacity>=2) {
3629 dest[1]=(UChar)c; /* trail surrogate */
3630 destLength=2;
3631 }
3632 c=c2; /* lead surrogate to be written below */
3633 } else {
3634 src->move(src, 1, UITER_CURRENT);
3635 }
3636 }
3637
3638 if(destCapacity>0) {
3639 dest[0]=(UChar)c;
3640 }
3641 }
3642 return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
3643 default:
3644 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3645 return 0;
3646 }
3647
3648 buffer=stackBuffer;
3649 bufferCapacity=(int32_t)(sizeof(stackBuffer)/U_SIZEOF_UCHAR);
3650 bufferLength=_findPreviousIterationBoundary(*src,
3651 isPreviousBoundary, minC, mask,
3652 buffer, bufferCapacity,
3653 startIndex,
3654 pErrorCode);
3655 if(bufferLength>0) {
3656 if(doNormalize) {
3657 destLength=unorm_internalNormalize(dest, destCapacity,
3658 buffer+startIndex, bufferLength,
3659 mode, options,
3660 pErrorCode);
3661 if(pNeededToNormalize!=0 && U_SUCCESS(*pErrorCode)) {
3662 *pNeededToNormalize=
3663 (UBool)(destLength!=bufferLength ||
3664 0!=uprv_memcmp(dest, buffer+startIndex, destLength*U_SIZEOF_UCHAR));
3665 }
3666 } else {
3667 /* just copy the source characters */
3668 if(destCapacity>0) {
3669 uprv_memcpy(dest, buffer+startIndex, uprv_min(bufferLength, destCapacity)*U_SIZEOF_UCHAR);
3670 }
3671 destLength=u_terminateUChars(dest, destCapacity, bufferLength, pErrorCode);
3672 }
3673 } else {
3674 destLength=u_terminateUChars(dest, destCapacity, 0, pErrorCode);
3675 }
3676
3677 /* cleanup */
3678 if(buffer!=stackBuffer) {
3679 uprv_free(buffer);
3680 }
3681
3682 return destLength;
3683}
3684
3685/* forward iteration -------------------------------------------------------- */
3686
3687/*
3688 * read forward and get norm32
3689 * return 0 if the character is <minC
3690 * if c2!=0 then (c2, c) is a surrogate pair
3691 * always reads complete characters
3692 */
3693static inline uint32_t
3694_getNextNorm32(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2) {
3695 uint32_t norm32;
3696
3697 /* need src.hasNext() to be true */
3698 c=(UChar)src.next(&src);
3699 c2=0;
3700
3701 if(c<minC) {
3702 return 0;
3703 }
3704
3705 norm32=_getNorm32(c);
3706 if(UTF_IS_FIRST_SURROGATE(c)) {
3707 if(src.hasNext(&src) && UTF_IS_SECOND_SURROGATE(c2=(UChar)src.current(&src))) {
3708 src.move(&src, 1, UITER_CURRENT); /* skip the c2 surrogate */
3709 if((norm32&mask)==0) {
3710 /* irrelevant data */
3711 return 0;
3712 } else {
3713 /* norm32 must be a surrogate special */
3714 return _getNorm32FromSurrogatePair(norm32, c2);
3715 }
3716 } else {
3717 /* unmatched surrogate */
3718 c2=0;
3719 return 0;
3720 }
3721 }
3722 return norm32;
3723}
3724
3725/*
3726 * read forward and check if the character is a next-iteration boundary
3727 * if c2!=0 then (c, c2) is a surrogate pair
3728 */
3729typedef UBool
3730IsNextBoundaryFn(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2);
3731
3732/*
3733 * for NF*D:
3734 * read forward and check if the lead combining class is 0
3735 * if c2!=0 then (c, c2) is a surrogate pair
3736 */
3737static UBool
3738_isNextNFDSafe(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
3739 return _isNFDSafe(_getNextNorm32(src, minC, ccOrQCMask, c, c2), ccOrQCMask, ccOrQCMask&_NORM_QC_MASK);
3740}
3741
3742/*
3743 * for NF*C:
3744 * read forward and check if the character is (or its decomposition begins with)
3745 * a "true starter" (cc==0 and NF*C_YES)
3746 * if c2!=0 then (c, c2) is a surrogate pair
3747 */
3748static UBool
3749_isNextTrueStarter(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
3750 uint32_t norm32, decompQCMask;
3751
3752 decompQCMask=(ccOrQCMask<<2)&0xf; /* decomposition quick check mask */
3753 norm32=_getNextNorm32(src, minC, ccOrQCMask|decompQCMask, c, c2);
3754 return _isTrueStarter(norm32, ccOrQCMask, decompQCMask);
3755}
3756
3757static int32_t
3758_findNextIterationBoundary(UCharIterator &src,
3759 IsNextBoundaryFn *isNextBoundary, uint32_t minC, uint32_t mask,
3760 UChar *&buffer, int32_t &bufferCapacity,
3761 UErrorCode *pErrorCode) {
3762 UChar *stackBuffer;
3763 int32_t bufferIndex;
3764 UChar c, c2;
3765
3766 if(!src.hasNext(&src)) {
3767 return 0;
3768 }
3769
3770 /* initialize */
3771 stackBuffer=buffer;
3772
3773 /* get one character and ignore its properties */
3774 buffer[0]=c=(UChar)src.next(&src);
3775 bufferIndex=1;
3776 if(UTF_IS_FIRST_SURROGATE(c) && src.hasNext(&src)) {
3777 if(UTF_IS_SECOND_SURROGATE(c2=(UChar)src.next(&src))) {
3778 buffer[bufferIndex++]=c2;
3779 } else {
3780 src.move(&src, -1, UITER_CURRENT); /* back out the non-trail-surrogate */
3781 }
3782 }
3783
3784 /* get all following characters until we see a boundary */
3785 /* checking hasNext() instead of c!=DONE on the off-chance that U+ffff is part of the string */
3786 while(src.hasNext(&src)) {
3787 if(isNextBoundary(src, minC, mask, c, c2)) {
3788 /* back out the latest movement to stop at the boundary */
3789 src.move(&src, c2==0 ? -1 : -2, UITER_CURRENT);
3790 break;
3791 } else {
3792 if(bufferIndex+(c2==0 ? 1 : 2)<=bufferCapacity ||
3793 /* attempt to grow the buffer */
3794 u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity,
3795 2*bufferCapacity,
3796 bufferIndex)
3797 ) {
3798 buffer[bufferIndex++]=c;
3799 if(c2!=0) {
3800 buffer[bufferIndex++]=c2;
3801 }
3802 } else {
3803 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
3804 src.move(&src, 0, UITER_LIMIT);
3805 return 0;
3806 }
3807 }
3808 }
3809
3810 /* return the length of the buffer contents */
3811 return bufferIndex;
3812}
3813
3814U_CAPI int32_t U_EXPORT2
3815unorm_next(UCharIterator *src,
3816 UChar *dest, int32_t destCapacity,
3817 UNormalizationMode mode, int32_t options,
3818 UBool doNormalize, UBool *pNeededToNormalize,
3819 UErrorCode *pErrorCode) {
3820 UChar stackBuffer[100];
3821 UChar *buffer;
3822 IsNextBoundaryFn *isNextBoundary;
3823 uint32_t mask;
3824 int32_t bufferLength, bufferCapacity, destLength;
3825 int32_t c, c2;
3826 UChar minC;
3827
3828 /* check argument values */
3829 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3830 return 0;
3831 }
3832
3833 if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
3834 src==NULL
3835 ) {
3836 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3837 return 0;
3838 }
3839
3840 if(!_haveData(*pErrorCode)) {
3841 return 0;
3842 }
3843
3844 if(pNeededToNormalize!=NULL) {
3845 *pNeededToNormalize=FALSE;
3846 }
3847
3848 switch(mode) {
b75a7d8f 3849 case UNORM_FCD:
73c04bcf
A
3850 if(fcdTrie.index==NULL) {
3851 *pErrorCode=U_UNSUPPORTED_ERROR;
3852 return 0;
3853 }
3854 /* fall through to NFD */
3855 case UNORM_NFD:
b75a7d8f
A
3856 isNextBoundary=_isNextNFDSafe;
3857 minC=_NORM_MIN_WITH_LEAD_CC;
3858 mask=_NORM_CC_MASK|_NORM_QC_NFD;
3859 break;
3860 case UNORM_NFKD:
3861 isNextBoundary=_isNextNFDSafe;
3862 minC=_NORM_MIN_WITH_LEAD_CC;
3863 mask=_NORM_CC_MASK|_NORM_QC_NFKD;
3864 break;
3865 case UNORM_NFC:
3866 isNextBoundary=_isNextTrueStarter;
3867 minC=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
3868 mask=_NORM_CC_MASK|_NORM_QC_NFC;
3869 break;
3870 case UNORM_NFKC:
3871 isNextBoundary=_isNextTrueStarter;
3872 minC=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
3873 mask=_NORM_CC_MASK|_NORM_QC_NFKC;
3874 break;
3875 case UNORM_NONE:
3876 destLength=0;
3877 if((c=src->next(src))>=0) {
3878 destLength=1;
3879 if(UTF_IS_LEAD(c) && (c2=src->next(src))>=0) {
3880 if(UTF_IS_TRAIL(c2)) {
3881 if(destCapacity>=2) {
3882 dest[1]=(UChar)c2; /* trail surrogate */
3883 destLength=2;
3884 }
3885 /* lead surrogate to be written below */
3886 } else {
3887 src->move(src, -1, UITER_CURRENT);
3888 }
3889 }
3890
3891 if(destCapacity>0) {
3892 dest[0]=(UChar)c;
3893 }
3894 }
3895 return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
3896 default:
3897 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3898 return 0;
3899 }
3900
3901 buffer=stackBuffer;
3902 bufferCapacity=(int32_t)(sizeof(stackBuffer)/U_SIZEOF_UCHAR);
3903 bufferLength=_findNextIterationBoundary(*src,
3904 isNextBoundary, minC, mask,
3905 buffer, bufferCapacity,
3906 pErrorCode);
3907 if(bufferLength>0) {
3908 if(doNormalize) {
3909 destLength=unorm_internalNormalize(dest, destCapacity,
3910 buffer, bufferLength,
3911 mode, options,
3912 pErrorCode);
3913 if(pNeededToNormalize!=0 && U_SUCCESS(*pErrorCode)) {
3914 *pNeededToNormalize=
3915 (UBool)(destLength!=bufferLength ||
3916 0!=uprv_memcmp(dest, buffer, destLength*U_SIZEOF_UCHAR));
3917 }
3918 } else {
3919 /* just copy the source characters */
3920 if(destCapacity>0) {
3921 uprv_memcpy(dest, buffer, uprv_min(bufferLength, destCapacity)*U_SIZEOF_UCHAR);
3922 }
3923 destLength=u_terminateUChars(dest, destCapacity, bufferLength, pErrorCode);
3924 }
3925 } else {
3926 destLength=u_terminateUChars(dest, destCapacity, 0, pErrorCode);
3927 }
3928
3929 /* cleanup */
3930 if(buffer!=stackBuffer) {
3931 uprv_free(buffer);
3932 }
3933
3934 return destLength;
3935}
3936
3937/*
3938 * ### TODO: check if NF*D and FCD iteration finds optimal boundaries
3939 * and if not, how hard it would be to improve it.
3940 * For example, see _findSafeFCD().
3941 */
3942
3943/* Concatenation of normalized strings -------------------------------------- */
3944
3945U_CAPI int32_t U_EXPORT2
3946unorm_concatenate(const UChar *left, int32_t leftLength,
3947 const UChar *right, int32_t rightLength,
3948 UChar *dest, int32_t destCapacity,
3949 UNormalizationMode mode, int32_t options,
3950 UErrorCode *pErrorCode) {
3951 UChar stackBuffer[100];
3952 UChar *buffer;
3953 int32_t bufferLength, bufferCapacity;
3954
3955 UCharIterator iter;
3956 int32_t leftBoundary, rightBoundary, destLength;
3957
3958 /* check argument values */
3959 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3960 return 0;
3961 }
3962
3963 if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
3964 left==NULL || leftLength<-1 ||
3965 right==NULL || rightLength<-1
3966 ) {
3967 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3968 return 0;
3969 }
3970
3971 /* check for overlapping right and destination */
3972 if( dest!=NULL &&
3973 ((right>=dest && right<(dest+destCapacity)) ||
3974 (rightLength>0 && dest>=right && dest<(right+rightLength)))
3975 ) {
3976 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3977 return 0;
3978 }
3979
3980 /* allow left==dest */
3981
3982 /* set up intermediate buffer */
3983 buffer=stackBuffer;
3984 bufferCapacity=(int32_t)(sizeof(stackBuffer)/U_SIZEOF_UCHAR);
3985
3986 /*
3987 * Input: left[0..leftLength[ + right[0..rightLength[
3988 *
3989 * Find normalization-safe boundaries leftBoundary and rightBoundary
3990 * and copy the end parts together:
3991 * buffer=left[leftBoundary..leftLength[ + right[0..rightBoundary[
3992 *
3993 * dest=left[0..leftBoundary[ +
3994 * normalize(buffer) +
3995 * right[rightBoundary..rightLength[
3996 */
3997
3998 /*
3999 * find a normalization boundary at the end of the left string
4000 * and copy the end part into the buffer
4001 */
4002 uiter_setString(&iter, left, leftLength);
4003 iter.index=leftLength=iter.length; /* end of left string */
4004
4005 bufferLength=unorm_previous(&iter, buffer, bufferCapacity,
4006 mode, options,
4007 FALSE, NULL,
4008 pErrorCode);
4009 leftBoundary=iter.index;
4010 if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
4011 *pErrorCode=U_ZERO_ERROR;
4012 if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*bufferLength, 0)) {
4013 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
4014 /* dont need to cleanup here since
4015 * u_growBufferFromStatic frees buffer if(buffer!=stackBuffer)
4016 */
4017 return 0;
4018 }
4019
4020 /* just copy from the left string: we know the boundary already */
4021 uprv_memcpy(buffer, left+leftBoundary, bufferLength*U_SIZEOF_UCHAR);
4022 }
4023
4024 /*
4025 * find a normalization boundary at the beginning of the right string
4026 * and concatenate the beginning part to the buffer
4027 */
4028 uiter_setString(&iter, right, rightLength);
4029 rightLength=iter.length; /* in case it was -1 */
4030
4031 rightBoundary=unorm_next(&iter, buffer+bufferLength, bufferCapacity-bufferLength,
4032 mode, options,
4033 FALSE, NULL,
4034 pErrorCode);
4035 if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
4036 *pErrorCode=U_ZERO_ERROR;
4037 if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, bufferLength+rightBoundary, 0)) {
4038 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
4039 /* dont need to cleanup here since
4040 * u_growBufferFromStatic frees buffer if(buffer!=stackBuffer)
4041 */
4042 return 0;
4043 }
4044
4045 /* just copy from the right string: we know the boundary already */
4046 uprv_memcpy(buffer+bufferLength, right, rightBoundary*U_SIZEOF_UCHAR);
4047 }
4048
4049 bufferLength+=rightBoundary;
4050
4051 /* copy left[0..leftBoundary[ to dest */
4052 if(left!=dest && leftBoundary>0 && destCapacity>0) {
4053 uprv_memcpy(dest, left, uprv_min(leftBoundary, destCapacity)*U_SIZEOF_UCHAR);
4054 }
4055 destLength=leftBoundary;
4056
4057 /* concatenate the normalization of the buffer to dest */
4058 if(destCapacity>destLength) {
4059 destLength+=unorm_internalNormalize(dest+destLength, destCapacity-destLength,
4060 buffer, bufferLength,
4061 mode, options,
4062 pErrorCode);
4063 } else {
4064 destLength+=unorm_internalNormalize(NULL, 0,
4065 buffer, bufferLength,
4066 mode, options,
4067 pErrorCode);
4068 }
4069 /*
4070 * only errorCode that is expected is a U_BUFFER_OVERFLOW_ERROR
4071 * so we dont check for the error code here..just let it pass through
4072 */
4073 /* concatenate right[rightBoundary..rightLength[ to dest */
4074 right+=rightBoundary;
4075 rightLength-=rightBoundary;
4076 if(rightLength>0 && destCapacity>destLength) {
4077 uprv_memcpy(dest+destLength, right, uprv_min(rightLength, destCapacity-destLength)*U_SIZEOF_UCHAR);
4078 }
4079 destLength+=rightLength;
4080
4081 /* cleanup */
4082 if(buffer!=stackBuffer) {
4083 uprv_free(buffer);
4084 }
4085
4086 return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
4087}
4088
b75a7d8f 4089#endif /* #if !UCONFIG_NO_NORMALIZATION */