]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/unorm.cpp
ICU-400.42.tar.gz
[apple/icu.git] / icuSources / common / unorm.cpp
CommitLineData
b75a7d8f
A
1/*
2******************************************************************************
46f4442e 3* Copyright (c) 1996-2007, International Business Machines
b75a7d8f
A
4* Corporation and others. All Rights Reserved.
5******************************************************************************
6* File unorm.cpp
7*
8* Created by: Vladimir Weinstein 12052000
9*
10* Modification history :
11*
12* Date Name Description
13* 02/01/01 synwee Added normalization quickcheck enum and method.
14* 02/12/01 synwee Commented out quickcheck util api has been approved
15* Added private method for doing FCD checks
16* 02/23/01 synwee Modified quickcheck and checkFCE to run through
17* string for codepoints < 0x300 for the normalization
18* mode NFC.
19* 05/25/01+ Markus Scherer total rewrite, implement all normalization here
20* instead of just wrappers around normlzr.cpp,
21* load unorm.dat, support Unicode 3.1 with
22* supplementary code points, etc.
23*/
24
25#include "unicode/utypes.h"
26
b75a7d8f
A
27#if !UCONFIG_NO_NORMALIZATION
28
29#include "unicode/udata.h"
30#include "unicode/uchar.h"
374ca955 31#include "unicode/ustring.h"
b75a7d8f
A
32#include "unicode/uiter.h"
33#include "unicode/uniset.h"
34#include "unicode/usetiter.h"
35#include "unicode/unorm.h"
374ca955
A
36#include "ucln_cmn.h"
37#include "unormimp.h"
38#include "ucase.h"
b75a7d8f
A
39#include "cmemory.h"
40#include "umutex.h"
41#include "utrie.h"
42#include "unicode/uset.h"
374ca955
A
43#include "udataswp.h"
44#include "putilimp.h"
b75a7d8f
A
45
46/*
47 * Status of tailored normalization
48 *
49 * This was done initially for investigation on Unicode public review issue 7
50 * (http://www.unicode.org/review/). See Jitterbug 2481.
51 * While the UTC at meeting #94 (2003mar) did not take up the issue, this is
52 * a permanent feature in ICU 2.6 in support of IDNA which requires true
53 * Unicode 3.2 normalization.
54 * (NormalizationCorrections are rolled into IDNA mapping tables.)
55 *
56 * Tailored normalization as implemented here makes it possible to "normalize less"
57 * than full Unicode normalization would.
58 * Based internally on a UnicodeSet of code points that are
59 * "excluded from normalization", the normalization functions leave those
60 * code points alone ("inert"). This means that tailored normalization
61 * still transforms text into a canonically equivalent form.
62 * It does not add decompositions to code points that do not have any or
63 * change decomposition results.
64 *
65 * Any function that searches for a safe boundary has not been touched,
66 * which means that these functions will be over-pessimistic when
67 * exclusions are applied.
68 * This should not matter because subsequent checks and normalizations
69 * do apply the exclusions; only a little more of the text may be processed
70 * than necessary under exclusions.
71 *
72 * Normalization exclusions have the following effect on excluded code points c:
73 * - c is not decomposed
74 * - c is not a composition target
75 * - c does not combine forward or backward for composition
76 * except that this is not implemented for Jamo
77 * - c is treated as having a combining class of 0
78 */
374ca955 79#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
b75a7d8f 80
46f4442e
A
81U_NAMESPACE_USE
82
b75a7d8f
A
83/*
84 * This new implementation of the normalization code loads its data from
85 * unorm.dat, which is generated with the gennorm tool.
86 * The format of that file is described in unormimp.h .
87 */
88
89/* -------------------------------------------------------------------------- */
90
/* Capacity constant for stack-allocated buffers (its users are outside this part of the file). */
enum { _STACK_BUFFER_CAPACITY=100 };
94
/*
 * Bit-field constants for the "options" bit set parameter.
 * They are private on purpose: a caller only needs the currently assigned
 * values, while the number and position of the reserved bits in each field
 * may change in a future implementation.
 */
enum {
    /* bits 4..0: normalization exclusion (NX) flags */
    _NORM_OPTIONS_NX_MASK=0x1f,
    /* bits 6..5: Unicode version selector */
    _NORM_OPTIONS_UNICODE_MASK=0x60,
    /* both fields together; also the largest index into the exclusion set cache */
    _NORM_OPTIONS_SETS_MASK=_NORM_OPTIONS_NX_MASK|_NORM_OPTIONS_UNICODE_MASK,

    _NORM_OPTIONS_UNICODE_SHIFT=5,

    /*
     * The options below are used only in some composition functions.
     * They start at bit 12 so that the lower bits stay free for the options
     * space of unorm_compare() -
     * see the documentation for UNORM_COMPARE_NORM_OPTIONS_SHIFT.
     */

    /** Options bit 12, for compatibility vs. canonical decomposition. */
    _NORM_OPTIONS_COMPAT=1<<12,
    /** Options bit 13, no discontiguous composition (FCC vs. NFC). */
    _NORM_OPTIONS_COMPOSE_CONTIGUOUS=1<<13
};
121
73c04bcf 122U_CDECL_BEGIN
b75a7d8f
A
123static inline UBool
124isHangulWithoutJamoT(UChar c) {
125 c-=HANGUL_BASE;
126 return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
127}
128
129/* norm32 helpers */
130
131/* is this a norm32 with a regular index? */
132static inline UBool
133isNorm32Regular(uint32_t norm32) {
134 return norm32<_NORM_MIN_SPECIAL;
135}
136
137/* is this a norm32 with a special index for a lead surrogate? */
138static inline UBool
139isNorm32LeadSurrogate(uint32_t norm32) {
140 return _NORM_MIN_SPECIAL<=norm32 && norm32<_NORM_SURROGATES_TOP;
141}
142
143/* is this a norm32 with a special index for a Hangul syllable or a Jamo? */
144static inline UBool
145isNorm32HangulOrJamo(uint32_t norm32) {
146 return norm32>=_NORM_MIN_HANGUL;
147}
148
149/*
150 * Given isNorm32HangulOrJamo(),
151 * is this a Hangul syllable or a Jamo?
152 */
73c04bcf 153/*static inline UBool
b75a7d8f
A
154isHangulJamoNorm32HangulOrJamoL(uint32_t norm32) {
155 return norm32<_NORM_MIN_JAMO_V;
73c04bcf 156}*/
b75a7d8f
A
157
158/*
159 * Given norm32 for Jamo V or T,
160 * is this a Jamo V?
161 */
162static inline UBool
163isJamoVTNorm32JamoV(uint32_t norm32) {
164 return norm32<_NORM_JAMO_V_TOP;
165}
166
b75a7d8f
A
167/* load unorm.dat ----------------------------------------------------------- */
168
73c04bcf
A
169/* normTrie: 32-bit trie result may contain a special extraData index with the folding offset */
170static int32_t U_CALLCONV
171getFoldingNormOffset(uint32_t norm32) {
172 if(isNorm32LeadSurrogate(norm32)) {
173 return
174 UTRIE_BMP_INDEX_LENGTH+
175 (((int32_t)norm32>>(_NORM_EXTRA_SHIFT-UTRIE_SURROGATE_BLOCK_BITS))&
176 (0x3ff<<UTRIE_SURROGATE_BLOCK_BITS));
177 } else {
178 return 0;
179 }
180}
181
182/* auxTrie: the folding offset is in bits 9..0 of the 16-bit trie result */
183static int32_t U_CALLCONV
184getFoldingAuxOffset(uint32_t data) {
185 return (int32_t)(data&_NORM_AUX_FNC_MASK)<<UTRIE_SURROGATE_BLOCK_BITS;
186}
187U_CDECL_END
188
189#define UNORM_HARDCODE_DATA 1
190
191#if UNORM_HARDCODE_DATA
192
193/* unorm_props_data.c is machine-generated by gennorm --csource */
194#include "unorm_props_data.c"
195
196static const UBool formatVersion_2_2=TRUE;
197
198#else
199
b75a7d8f
A
200#define DATA_NAME "unorm"
201#define DATA_TYPE "icu"
202
203static UDataMemory *normData=NULL;
204static UErrorCode dataErrorCode=U_ZERO_ERROR;
205static int8_t haveNormData=0;
206
207static int32_t indexes[_NORM_INDEX_TOP]={ 0 };
208static UTrie normTrie={ 0,0,0,0,0,0,0 }, fcdTrie={ 0,0,0,0,0,0,0 }, auxTrie={ 0,0,0,0,0,0,0 };
209
210/*
211 * pointers into the memory-mapped unorm.icu
212 */
213static const uint16_t *extraData=NULL,
214 *combiningTable=NULL,
215 *canonStartSets=NULL;
216
217static uint8_t formatVersion[4]={ 0, 0, 0, 0 };
218static UBool formatVersion_2_1=FALSE, formatVersion_2_2=FALSE;
219
220/* the Unicode version of the normalization data */
221static UVersionInfo dataVersion={ 0, 0, 0, 0 };
222
73c04bcf
A
223#endif
224
b75a7d8f
A
225/* cache UnicodeSets for each combination of exclusion flags */
226static UnicodeSet *nxCache[_NORM_OPTIONS_SETS_MASK+1]={ NULL };
227
228U_CDECL_BEGIN
229
374ca955 230static UBool U_CALLCONV
73c04bcf 231unorm_cleanup(void) {
b75a7d8f
A
232 int32_t i;
233
73c04bcf 234#if !UNORM_HARDCODE_DATA
b75a7d8f
A
235 if(normData!=NULL) {
236 udata_close(normData);
237 normData=NULL;
238 }
239 dataErrorCode=U_ZERO_ERROR;
240 haveNormData=0;
73c04bcf 241#endif
b75a7d8f
A
242
243 for(i=0; i<(int32_t)LENGTHOF(nxCache); ++i) {
73c04bcf
A
244 if (nxCache[i]) {
245 delete nxCache[i];
246 nxCache[i] = 0;
247 }
b75a7d8f 248 }
b75a7d8f
A
249
250 return TRUE;
251}
252
73c04bcf 253#if !UNORM_HARDCODE_DATA
b75a7d8f
A
254
255static UBool U_CALLCONV
256isAcceptable(void * /* context */,
257 const char * /* type */, const char * /* name */,
258 const UDataInfo *pInfo) {
259 if(
260 pInfo->size>=20 &&
261 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
262 pInfo->charsetFamily==U_CHARSET_FAMILY &&
263 pInfo->dataFormat[0]==0x4e && /* dataFormat="Norm" */
264 pInfo->dataFormat[1]==0x6f &&
265 pInfo->dataFormat[2]==0x72 &&
266 pInfo->dataFormat[3]==0x6d &&
267 pInfo->formatVersion[0]==2 &&
268 pInfo->formatVersion[2]==UTRIE_SHIFT &&
269 pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
270 ) {
271 uprv_memcpy(formatVersion, pInfo->formatVersion, 4);
272 uprv_memcpy(dataVersion, pInfo->dataVersion, 4);
273 return TRUE;
274 } else {
275 return FALSE;
276 }
277}
278
73c04bcf
A
279#endif
280
b75a7d8f
A
281static UBool U_CALLCONV
282_enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*limit*/, uint32_t /*value*/) {
283 /* add the start code point to the USet */
73c04bcf 284 const USetAdder *sa=(const USetAdder *)context;
374ca955 285 sa->add(sa->set, start);
b75a7d8f
A
286 return TRUE;
287}
288
289U_CDECL_END
290
73c04bcf
A
291#if !UNORM_HARDCODE_DATA
292
b75a7d8f
A
293static int8_t
294loadNormData(UErrorCode &errorCode) {
295 /* load Unicode normalization data from file */
296
297 /*
298 * This lazy intialization with double-checked locking (without mutex protection for
299 * haveNormData==0) is transiently unsafe under certain circumstances.
300 * Check the readme and use u_init() if necessary.
301 *
302 * While u_init() initializes the main normalization data via this functions,
303 * it does not do so for exclusion sets (which are fully mutexed).
304 * This is because
305 * - there can be many exclusion sets
306 * - they are rarely used
307 * - they are not usually used in execution paths that are
308 * as performance-sensitive as others
309 * (e.g., IDNA takes more time than unorm_quickCheck() anyway)
310 */
311 if(haveNormData==0) {
312 UTrie _normTrie={ 0,0,0,0,0,0,0 }, _fcdTrie={ 0,0,0,0,0,0,0 }, _auxTrie={ 0,0,0,0,0,0,0 };
313 UDataMemory *data;
73c04bcf 314
b75a7d8f
A
315 const int32_t *p=NULL;
316 const uint8_t *pb;
317
318 if(&errorCode==NULL || U_FAILURE(errorCode)) {
319 return 0;
320 }
321
322 /* open the data outside the mutex block */
323 data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &errorCode);
324 dataErrorCode=errorCode;
325 if(U_FAILURE(errorCode)) {
326 return haveNormData=-1;
327 }
328
329 p=(const int32_t *)udata_getMemory(data);
330 pb=(const uint8_t *)(p+_NORM_INDEX_TOP);
331 utrie_unserialize(&_normTrie, pb, p[_NORM_INDEX_TRIE_SIZE], &errorCode);
332 _normTrie.getFoldingOffset=getFoldingNormOffset;
333
334 pb+=p[_NORM_INDEX_TRIE_SIZE]+p[_NORM_INDEX_UCHAR_COUNT]*2+p[_NORM_INDEX_COMBINE_DATA_COUNT]*2;
b75a7d8f 335 if(p[_NORM_INDEX_FCD_TRIE_SIZE]!=0) {
73c04bcf
A
336 utrie_unserialize(&_fcdTrie, pb, p[_NORM_INDEX_FCD_TRIE_SIZE], &errorCode);
337 }
338 pb+=p[_NORM_INDEX_FCD_TRIE_SIZE];
339
340 if(p[_NORM_INDEX_AUX_TRIE_SIZE]!=0) {
b75a7d8f
A
341 utrie_unserialize(&_auxTrie, pb, p[_NORM_INDEX_AUX_TRIE_SIZE], &errorCode);
342 _auxTrie.getFoldingOffset=getFoldingAuxOffset;
343 }
344
345 if(U_FAILURE(errorCode)) {
346 dataErrorCode=errorCode;
347 udata_close(data);
348 return haveNormData=-1;
349 }
350
351 /* in the mutex block, set the data for this process */
352 umtx_lock(NULL);
353 if(normData==NULL) {
354 normData=data;
355 data=NULL;
356
357 uprv_memcpy(&indexes, p, sizeof(indexes));
358 uprv_memcpy(&normTrie, &_normTrie, sizeof(UTrie));
359 uprv_memcpy(&fcdTrie, &_fcdTrie, sizeof(UTrie));
360 uprv_memcpy(&auxTrie, &_auxTrie, sizeof(UTrie));
361 } else {
362 p=(const int32_t *)udata_getMemory(normData);
363 }
b75a7d8f
A
364
365 /* initialize some variables */
366 extraData=(uint16_t *)((uint8_t *)(p+_NORM_INDEX_TOP)+indexes[_NORM_INDEX_TRIE_SIZE]);
367 combiningTable=extraData+indexes[_NORM_INDEX_UCHAR_COUNT];
368 formatVersion_2_1=formatVersion[0]>2 || (formatVersion[0]==2 && formatVersion[1]>=1);
369 formatVersion_2_2=formatVersion[0]>2 || (formatVersion[0]==2 && formatVersion[1]>=2);
370 if(formatVersion_2_1) {
371 canonStartSets=combiningTable+
372 indexes[_NORM_INDEX_COMBINE_DATA_COUNT]+
373 (indexes[_NORM_INDEX_FCD_TRIE_SIZE]+indexes[_NORM_INDEX_AUX_TRIE_SIZE])/2;
374 }
375 haveNormData=1;
374ca955
A
376 ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
377 umtx_unlock(NULL);
b75a7d8f
A
378
379 /* if a different thread set it first, then close the extra data */
380 if(data!=NULL) {
381 udata_close(data); /* NULL if it was set correctly */
382 }
383 }
384
385 return haveNormData;
386}
387
73c04bcf
A
388#endif
389
b75a7d8f
A
390static inline UBool
391_haveData(UErrorCode &errorCode) {
73c04bcf
A
392#if UNORM_HARDCODE_DATA
393 return U_SUCCESS(errorCode);
394#else
395 if(U_FAILURE(errorCode)) {
396 return FALSE;
397 } else if(haveNormData>0) {
398 return TRUE;
399 } else if(haveNormData<0) {
b75a7d8f 400 errorCode=dataErrorCode;
73c04bcf
A
401 return FALSE;
402 } else /* haveNormData==0 */ {
b75a7d8f
A
403 return (UBool)(loadNormData(errorCode)>0);
404 }
73c04bcf 405#endif
b75a7d8f
A
406}
407
408U_CAPI UBool U_EXPORT2
409unorm_haveData(UErrorCode *pErrorCode) {
410 return _haveData(*pErrorCode);
411}
412
413U_CAPI const uint16_t * U_EXPORT2
414unorm_getFCDTrie(UErrorCode *pErrorCode) {
415 if(_haveData(*pErrorCode)) {
416 return fcdTrie.index;
417 } else {
418 return NULL;
419 }
420}
421
422/* data access primitives --------------------------------------------------- */
423
424static inline uint32_t
425_getNorm32(UChar c) {
426 return UTRIE_GET32_FROM_LEAD(&normTrie, c);
427}
428
429static inline uint32_t
430_getNorm32FromSurrogatePair(uint32_t norm32, UChar c2) {
431 /*
432 * the surrogate index in norm32 stores only the number of the surrogate index block
433 * see gennorm/store.c/getFoldedNormValue()
434 */
435 norm32=
436 UTRIE_BMP_INDEX_LENGTH+
437 ((norm32>>(_NORM_EXTRA_SHIFT-UTRIE_SURROGATE_BLOCK_BITS))&
438 (0x3ff<<UTRIE_SURROGATE_BLOCK_BITS));
439 return UTRIE_GET32_FROM_OFFSET_TRAIL(&normTrie, norm32, c2);
440}
441
442/*
443 * get a norm32 from text with complete code points
444 * (like from decompositions)
445 */
446static inline uint32_t
447_getNorm32(const UChar *p, uint32_t mask) {
448 uint32_t norm32=_getNorm32(*p);
449 if((norm32&mask) && isNorm32LeadSurrogate(norm32)) {
450 /* *p is a lead surrogate, get the real norm32 */
451 norm32=_getNorm32FromSurrogatePair(norm32, *(p+1));
452 }
453 return norm32;
454}
455
456static inline uint16_t
457_getFCD16(UChar c) {
458 return UTRIE_GET16_FROM_LEAD(&fcdTrie, c);
459}
460
461static inline uint16_t
462_getFCD16FromSurrogatePair(uint16_t fcd16, UChar c2) {
463 /* the surrogate index in fcd16 is an absolute offset over the start of stage 1 */
464 return UTRIE_GET16_FROM_OFFSET_TRAIL(&fcdTrie, fcd16, c2);
465}
466
467static inline const uint16_t *
468_getExtraData(uint32_t norm32) {
469 return extraData+(norm32>>_NORM_EXTRA_SHIFT);
470}
471
73c04bcf
A
472#if 0
473/*
474 * It is possible to get the FCD data from the main trie if unorm.icu
475 * was built without the FCD trie, although it is slower.
476 * This is not implemented because it is hard to test, and because it seems
477 * unusual to want to use FCD and not build the data file for it.
478 *
479 * Untested sample code:
480 */
481static inline uint16_t
482_getFCD16FromNormData(UChar32 c) {
483 uint32_t norm32, fcd;
484
485 norm32=_getNorm32(c);
486 if((norm32&_NORM_QC_NFD) && isNorm32Regular(norm32)) {
487 /* get the lead/trail cc from the decomposition data */
488 const uint16_t *nfd=_getExtraData(norm32);
489 if(*nfd&_NORM_DECOMP_FLAG_LENGTH_HAS_CC) {
490 fcd=nfd[1];
491 }
492 } else {
493 fcd=norm32&_NORM_CC_MASK;
494 if(fcd!=0) {
495 /* use the code point cc value for both lead and trail cc's */
496 fcd|=fcd>>_NORM_CC_SHIFT; /* assume that the cc is in bits 15..8 */
497 }
498 }
499
500 return (uint16_t)fcd;
501}
502#endif
503
b75a7d8f
A
504/* normalization exclusion sets --------------------------------------------- */
505
506/*
507 * Normalization exclusion UnicodeSets are used for tailored normalization;
508 * see the comment near the beginning of this file.
509 *
510 * By specifying one or several sets of code points,
511 * those code points become inert for normalization.
512 */
513
514static const UnicodeSet *
515internalGetNXHangul(UErrorCode &errorCode) {
516 /* internal function, does not check for incoming U_FAILURE */
b75a7d8f
A
517 UBool isCached;
518
374ca955 519 UMTX_CHECK(NULL, (UBool)(nxCache[UNORM_NX_HANGUL]!=NULL), isCached);
b75a7d8f
A
520
521 if(!isCached) {
522 UnicodeSet *set=new UnicodeSet(0xac00, 0xd7a3);
523 if(set==NULL) {
524 errorCode=U_MEMORY_ALLOCATION_ERROR;
525 return NULL;
526 }
46f4442e
A
527 // Compact the set for caching.
528 set->compact();
b75a7d8f
A
529
530 umtx_lock(NULL);
531 if(nxCache[UNORM_NX_HANGUL]==NULL) {
532 nxCache[UNORM_NX_HANGUL]=set;
533 set=NULL;
73c04bcf 534 ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
b75a7d8f
A
535 }
536 umtx_unlock(NULL);
537
538 delete set;
539 }
540
541 return nxCache[UNORM_NX_HANGUL];
542}
543
374ca955 544/* unorm.cpp 1.116 had and used
b75a7d8f 545static const UnicodeSet *
374ca955
A
546internalGetNXFromPattern(int32_t options, const char *pattern, UErrorCode &errorCode) {
547 ...
b75a7d8f 548}
374ca955 549*/
b75a7d8f 550
374ca955 551/* get and set an exclusion set from a serialized UnicodeSet */
b75a7d8f 552static const UnicodeSet *
374ca955 553internalGetSerializedNX(int32_t options, int32_t nxIndex, UErrorCode &errorCode) {
b75a7d8f 554 /* internal function, does not check for incoming U_FAILURE */
b75a7d8f
A
555 UBool isCached;
556
374ca955 557 UMTX_CHECK(NULL, (UBool)(nxCache[options]!=NULL), isCached);
b75a7d8f 558
374ca955
A
559 if( !isCached &&
560 canonStartSets!=NULL &&
561 canonStartSets[nxIndex]!=0 && canonStartSets[nxIndex+1]>canonStartSets[nxIndex]
562 ) {
563 USerializedSet sset;
b75a7d8f 564 UnicodeSet *set;
374ca955
A
565 UChar32 start, end;
566 int32_t i;
b75a7d8f 567
374ca955
A
568 if( !uset_getSerializedSet(
569 &sset,
570 canonStartSets+canonStartSets[nxIndex],
571 canonStartSets[nxIndex+1]-canonStartSets[nxIndex])
572 ) {
573 errorCode=U_INVALID_FORMAT_ERROR;
b75a7d8f
A
574 return NULL;
575 }
576
374ca955
A
577 /* turn the serialized set into a UnicodeSet */
578 set=new UnicodeSet();
b75a7d8f
A
579 if(set==NULL) {
580 errorCode=U_MEMORY_ALLOCATION_ERROR;
581 return NULL;
582 }
374ca955
A
583 for(i=0; uset_getSerializedRange(&sset, i, &start, &end); ++i) {
584 set->add(start, end);
b75a7d8f 585 }
46f4442e
A
586 // Compact the set for caching.
587 set->compact();
b75a7d8f
A
588
589 umtx_lock(NULL);
590 if(nxCache[options]==NULL) {
591 nxCache[options]=set;
592 set=NULL;
73c04bcf 593 ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
b75a7d8f
A
594 }
595 umtx_unlock(NULL);
596
597 delete set;
598 }
599
600 return nxCache[options];
601}
602
374ca955
A
603static const UnicodeSet *
604internalGetNXCJKCompat(UErrorCode &errorCode) {
605 /* build a set from [[:Ideographic:]&[:NFD_QC=No:]]=[CJK Ideographs]&[has canonical decomposition] */
606 return internalGetSerializedNX(
607 UNORM_NX_CJK_COMPAT,
608 _NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET,
609 errorCode);
610}
611
612static const UnicodeSet *
613internalGetNXUnicode(uint32_t options, UErrorCode &errorCode) {
614 /* internal function, does not check for incoming U_FAILURE */
615 int32_t nxIndex;
616
617 options&=_NORM_OPTIONS_UNICODE_MASK;
618 switch(options) {
619 case 0:
620 return NULL;
621 case UNORM_UNICODE_3_2:
622 /* [:^Age=3.2:] */
623 nxIndex=_NORM_SET_INDEX_NX_UNICODE32_OFFSET;
624 break;
625 default:
626 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
627 return NULL;
628 }
629
630 /* build a set with all code points that were not designated by the specified Unicode version */
631 return internalGetSerializedNX(options, nxIndex, errorCode);
632}
633
b75a7d8f
A
634/* Get a decomposition exclusion set. The data must be loaded. */
635static const UnicodeSet *
636internalGetNX(int32_t options, UErrorCode &errorCode) {
637 options&=_NORM_OPTIONS_SETS_MASK;
638
639 UBool isCached;
640
374ca955 641 UMTX_CHECK(NULL, (UBool)(nxCache[options]!=NULL), isCached);
b75a7d8f
A
642
643 if(!isCached) {
644 /* return basic sets */
645 if(options==UNORM_NX_HANGUL) {
646 return internalGetNXHangul(errorCode);
647 }
648 if(options==UNORM_NX_CJK_COMPAT) {
649 return internalGetNXCJKCompat(errorCode);
650 }
651 if((options&_NORM_OPTIONS_UNICODE_MASK)!=0 && (options&_NORM_OPTIONS_NX_MASK)==0) {
652 return internalGetNXUnicode(options, errorCode);
653 }
654
655 /* build a set from multiple subsets */
656 UnicodeSet *set;
657 const UnicodeSet *other;
658
659 set=new UnicodeSet();
660 if(set==NULL) {
661 errorCode=U_MEMORY_ALLOCATION_ERROR;
662 return NULL;
663 }
664
665 if((options&UNORM_NX_HANGUL)!=0 && NULL!=(other=internalGetNXHangul(errorCode))) {
666 set->addAll(*other);
667 }
668 if((options&UNORM_NX_CJK_COMPAT)!=0 && NULL!=(other=internalGetNXCJKCompat(errorCode))) {
669 set->addAll(*other);
670 }
671 if((options&_NORM_OPTIONS_UNICODE_MASK)!=0 && NULL!=(other=internalGetNXUnicode(options, errorCode))) {
672 set->addAll(*other);
673 }
674
675 if(U_FAILURE(errorCode)) {
676 delete set;
677 return NULL;
678 }
46f4442e
A
679 // Compact the set for caching.
680 set->compact();
b75a7d8f
A
681
682 umtx_lock(NULL);
683 if(nxCache[options]==NULL) {
684 nxCache[options]=set;
685 set=NULL;
73c04bcf 686 ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
b75a7d8f
A
687 }
688 umtx_unlock(NULL);
689
690 delete set;
691 }
692
693 return nxCache[options];
694}
695
696static inline const UnicodeSet *
697getNX(int32_t options, UErrorCode &errorCode) {
698 if(U_FAILURE(errorCode) || (options&=_NORM_OPTIONS_SETS_MASK)==0) {
699 /* incoming failure, or no decomposition exclusions requested */
700 return NULL;
701 } else {
702 return internalGetNX(options, errorCode);
703 }
704}
705
374ca955
A
706U_CFUNC const UnicodeSet *
707unorm_getNX(int32_t options, UErrorCode *pErrorCode) {
708 return getNX(options, *pErrorCode);
709}
710
b75a7d8f
A
711static inline UBool
712nx_contains(const UnicodeSet *nx, UChar32 c) {
713 return nx!=NULL && nx->contains(c);
714}
715
716static inline UBool
717nx_contains(const UnicodeSet *nx, UChar c, UChar c2) {
718 return nx!=NULL && nx->contains(c2==0 ? c : U16_GET_SUPPLEMENTARY(c, c2));
719}
720
721/* other normalization primitives ------------------------------------------- */
722
723/* get the canonical or compatibility decomposition for one character */
724static inline const UChar *
725_decompose(uint32_t norm32, uint32_t qcMask, int32_t &length,
726 uint8_t &cc, uint8_t &trailCC) {
727 const UChar *p=(const UChar *)_getExtraData(norm32);
728 length=*p++;
729
730 if((norm32&qcMask&_NORM_QC_NFKD)!=0 && length>=0x100) {
731 /* use compatibility decomposition, skip canonical data */
732 p+=((length>>7)&1)+(length&_NORM_DECOMP_LENGTH_MASK);
733 length>>=8;
734 }
735
736 if(length&_NORM_DECOMP_FLAG_LENGTH_HAS_CC) {
737 /* get the lead and trail cc's */
738 UChar bothCCs=*p++;
739 cc=(uint8_t)(bothCCs>>8);
740 trailCC=(uint8_t)bothCCs;
741 } else {
742 /* lead and trail cc's are both 0 */
743 cc=trailCC=0;
744 }
745
746 length&=_NORM_DECOMP_LENGTH_MASK;
747 return p;
748}
749
750/* get the canonical decomposition for one character */
751static inline const UChar *
752_decompose(uint32_t norm32, int32_t &length,
753 uint8_t &cc, uint8_t &trailCC) {
754 const UChar *p=(const UChar *)_getExtraData(norm32);
755 length=*p++;
756
757 if(length&_NORM_DECOMP_FLAG_LENGTH_HAS_CC) {
758 /* get the lead and trail cc's */
759 UChar bothCCs=*p++;
760 cc=(uint8_t)(bothCCs>>8);
761 trailCC=(uint8_t)bothCCs;
762 } else {
763 /* lead and trail cc's are both 0 */
764 cc=trailCC=0;
765 }
766
767 length&=_NORM_DECOMP_LENGTH_MASK;
768 return p;
769}
770
771/**
772 * Get the canonical decomposition for one code point.
773 * @param c code point
774 * @param buffer out-only buffer for algorithmic decompositions of Hangul
775 * @param length out-only, takes the length of the decomposition, if any
776 * @return pointer to decomposition, or 0 if none
777 * @internal
778 */
374ca955
A
779U_CFUNC const UChar *
780unorm_getCanonicalDecomposition(UChar32 c, UChar buffer[4], int32_t *pLength) {
b75a7d8f
A
781 uint32_t norm32;
782
374ca955
A
783 if(c<indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]) {
784 /* trivial case */
785 return NULL;
786 }
787
b75a7d8f
A
788 UTRIE_GET32(&normTrie, c, norm32);
789 if(norm32&_NORM_QC_NFD) {
790 if(isNorm32HangulOrJamo(norm32)) {
791 /* Hangul syllable: decompose algorithmically */
792 UChar c2;
793
794 c-=HANGUL_BASE;
795
796 c2=(UChar)(c%JAMO_T_COUNT);
797 c/=JAMO_T_COUNT;
798 if(c2>0) {
799 buffer[2]=(UChar)(JAMO_T_BASE+c2);
374ca955 800 *pLength=3;
b75a7d8f 801 } else {
374ca955 802 *pLength=2;
b75a7d8f
A
803 }
804
805 buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
806 buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
807 return buffer;
808 } else {
809 /* normal decomposition */
810 uint8_t cc, trailCC;
374ca955 811 return _decompose(norm32, *pLength, cc, trailCC);
b75a7d8f
A
812 }
813 } else {
814 return 0;
815 }
816}
817
818/*
819 * get the combining class of (c, c2)=*p++
820 * before: p<limit after: p<=limit
821 * if only one code unit is used, then c2==0
822 */
823static inline uint8_t
824_getNextCC(const UChar *&p, const UChar *limit, UChar &c, UChar &c2) {
825 uint32_t norm32;
826
827 c=*p++;
828 norm32=_getNorm32(c);
829 if((norm32&_NORM_CC_MASK)==0) {
830 c2=0;
831 return 0;
832 } else {
833 if(!isNorm32LeadSurrogate(norm32)) {
834 c2=0;
835 } else {
836 /* c is a lead surrogate, get the real norm32 */
837 if(p!=limit && UTF_IS_SECOND_SURROGATE(c2=*p)) {
838 ++p;
839 norm32=_getNorm32FromSurrogatePair(norm32, c2);
840 } else {
841 c2=0;
842 return 0;
843 }
844 }
845
846 return (uint8_t)(norm32>>_NORM_CC_SHIFT);
847 }
848}
849
850/*
851 * read backwards and get norm32
852 * return 0 if the character is <minC
853 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
854 */
855static inline uint32_t
856_getPrevNorm32(const UChar *start, const UChar *&src,
857 uint32_t minC, uint32_t mask,
858 UChar &c, UChar &c2) {
859 uint32_t norm32;
860
861 c=*--src;
862 c2=0;
863
864 /* check for a surrogate before getting norm32 to see if we need to predecrement further */
865 if(c<minC) {
866 return 0;
867 } else if(!UTF_IS_SURROGATE(c)) {
868 return _getNorm32(c);
869 } else if(UTF_IS_SURROGATE_FIRST(c)) {
870 /* unpaired first surrogate */
871 return 0;
872 } else if(src!=start && UTF_IS_FIRST_SURROGATE(c2=*(src-1))) {
873 --src;
874 norm32=_getNorm32(c2);
875
876 if((norm32&mask)==0) {
877 /* all surrogate pairs with this lead surrogate have only irrelevant data */
878 return 0;
879 } else {
880 /* norm32 must be a surrogate special */
881 return _getNorm32FromSurrogatePair(norm32, c);
882 }
883 } else {
884 /* unpaired second surrogate */
885 c2=0;
886 return 0;
887 }
888}
889
890/*
891 * get the combining class of (c, c2)=*--p
892 * before: start<p after: start<=p
893 */
894static inline uint8_t
895_getPrevCC(const UChar *start, const UChar *&p) {
896 UChar c, c2;
897
898 return (uint8_t)(_getPrevNorm32(start, p, _NORM_MIN_WITH_LEAD_CC, _NORM_CC_MASK, c, c2)>>_NORM_CC_SHIFT);
899}
900
901/*
902 * is this a safe boundary character for NF*D?
903 * (lead cc==0)
904 */
905static inline UBool
906_isNFDSafe(uint32_t norm32, uint32_t ccOrQCMask, uint32_t decompQCMask) {
907 if((norm32&ccOrQCMask)==0) {
908 return TRUE; /* cc==0 and no decomposition: this is NF*D safe */
909 }
910
911 /* inspect its decomposition - maybe a Hangul but not a surrogate here */
912 if(isNorm32Regular(norm32) && (norm32&decompQCMask)!=0) {
913 int32_t length;
914 uint8_t cc, trailCC;
915
916 /* decomposes, get everything from the variable-length extra data */
917 _decompose(norm32, decompQCMask, length, cc, trailCC);
918 return cc==0;
919 } else {
920 /* no decomposition (or Hangul), test the cc directly */
921 return (norm32&_NORM_CC_MASK)==0;
922 }
923}
924
925/*
926 * is this (or does its decomposition begin with) a "true starter"?
927 * (cc==0 and NF*C_YES)
928 */
929static inline UBool
930_isTrueStarter(uint32_t norm32, uint32_t ccOrQCMask, uint32_t decompQCMask) {
931 if((norm32&ccOrQCMask)==0) {
932 return TRUE; /* this is a true starter (could be Hangul or Jamo L) */
933 }
934
935 /* inspect its decomposition - not a Hangul or a surrogate here */
936 if((norm32&decompQCMask)!=0) {
937 const UChar *p;
938 int32_t length;
939 uint8_t cc, trailCC;
940
941 /* decomposes, get everything from the variable-length extra data */
942 p=_decompose(norm32, decompQCMask, length, cc, trailCC);
943 if(cc==0) {
944 uint32_t qcMask=ccOrQCMask&_NORM_QC_MASK;
945
946 /* does it begin with NFC_YES? */
947 if((_getNorm32(p, qcMask)&qcMask)==0) {
948 /* yes, the decomposition begins with a true starter */
949 return TRUE;
950 }
951 }
952 }
953 return FALSE;
954}
955
956/* uchar.h */
957U_CAPI uint8_t U_EXPORT2
958u_getCombiningClass(UChar32 c) {
73c04bcf 959#if !UNORM_HARDCODE_DATA
b75a7d8f
A
960 UErrorCode errorCode=U_ZERO_ERROR;
961 if(_haveData(errorCode)) {
73c04bcf 962#endif
b75a7d8f
A
963 uint32_t norm32;
964
965 UTRIE_GET32(&normTrie, c, norm32);
966 return (uint8_t)(norm32>>_NORM_CC_SHIFT);
73c04bcf 967#if !UNORM_HARDCODE_DATA
b75a7d8f
A
968 } else {
969 return 0;
970 }
73c04bcf 971#endif
b75a7d8f
A
972}
973
46f4442e 974U_CFUNC UBool U_EXPORT2
b75a7d8f 975unorm_internalIsFullCompositionExclusion(UChar32 c) {
73c04bcf
A
976#if UNORM_HARDCODE_DATA
977 if(auxTrie.index!=NULL) {
978#else
b75a7d8f 979 UErrorCode errorCode=U_ZERO_ERROR;
73c04bcf
A
980 if(_haveData(errorCode) && auxTrie.index!=NULL) {
981#endif
b75a7d8f
A
982 uint16_t aux;
983
984 UTRIE_GET16(&auxTrie, c, aux);
985 return (UBool)((aux&_NORM_AUX_COMP_EX_MASK)!=0);
986 } else {
987 return FALSE;
988 }
989}
990
46f4442e 991U_CFUNC UBool U_EXPORT2
b75a7d8f 992unorm_isCanonSafeStart(UChar32 c) {
73c04bcf
A
993#if UNORM_HARDCODE_DATA
994 if(auxTrie.index!=NULL) {
995#else
b75a7d8f 996 UErrorCode errorCode=U_ZERO_ERROR;
73c04bcf
A
997 if(_haveData(errorCode) && auxTrie.index!=NULL) {
998#endif
b75a7d8f
A
999 uint16_t aux;
1000
1001 UTRIE_GET16(&auxTrie, c, aux);
1002 return (UBool)((aux&_NORM_AUX_UNSAFE_MASK)==0);
1003 } else {
1004 return FALSE;
1005 }
1006}
1007
374ca955
A
1008U_CAPI void U_EXPORT2
1009unorm_getUnicodeVersion(UVersionInfo *versionInfo, UErrorCode *pErrorCode){
1010 if(unorm_haveData(pErrorCode)){
1011 uprv_memcpy(*versionInfo, dataVersion, 4);
1012 }
1013}
1014
1015
b75a7d8f
A
1016U_CAPI UBool U_EXPORT2
1017unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet) {
73c04bcf 1018#if !UNORM_HARDCODE_DATA
b75a7d8f 1019 UErrorCode errorCode=U_ZERO_ERROR;
73c04bcf 1020#endif
b75a7d8f 1021 if( fillSet!=NULL && (uint32_t)c<=0x10ffff &&
73c04bcf
A
1022#if !UNORM_HARDCODE_DATA
1023 _haveData(errorCode) &&
1024#endif
1025 canonStartSets!=NULL
b75a7d8f
A
1026 ) {
1027 const uint16_t *table;
1028 int32_t i, start, limit;
1029
1030 /*
1031 * binary search for c
1032 *
1033 * There are two search tables,
1034 * one for BMP code points and one for supplementary ones.
1035 * See unormimp.h for details.
1036 */
1037 if(c<=0xffff) {
1038 table=canonStartSets+canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH];
1039 start=0;
1040 limit=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
1041
1042 /* each entry is a pair { c, result } */
1043 while(start<limit-2) {
1044 i=(uint16_t)(((start+limit)/4)*2); /* (start+limit)/2 and address pairs */
1045 if(c<table[i]) {
1046 limit=i;
1047 } else {
1048 start=i;
1049 }
1050 }
1051
1052 /* found? */
1053 if(c==table[start]) {
1054 i=table[start+1];
1055 if((i&_NORM_CANON_SET_BMP_MASK)==_NORM_CANON_SET_BMP_IS_INDEX) {
1056 /* result 01xxxxxx xxxxxx contains index x to a USerializedSet */
1057 i&=(_NORM_MAX_CANON_SETS-1);
1058 return uset_getSerializedSet(fillSet,
1059 canonStartSets+i,
1060 canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-i);
1061 } else {
1062 /* other result values are BMP code points for single-code point sets */
1063 uset_setSerializedToOne(fillSet, (UChar32)i);
1064 return TRUE;
1065 }
1066 }
1067 } else {
1068 uint16_t high, low, h;
1069
1070 table=canonStartSets+canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]+
1071 canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
1072 start=0;
1073 limit=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];
1074
1075 high=(uint16_t)(c>>16);
1076 low=(uint16_t)c;
1077
1078 /* each entry is a triplet { high(c), low(c), result } */
1079 while(start<limit-3) {
1080 i=(uint16_t)(((start+limit)/6)*3); /* (start+limit)/2 and address triplets */
1081 h=table[i]&0x1f; /* high word */
1082 if(high<h || (high==h && low<table[i+1])) {
1083 limit=i;
1084 } else {
1085 start=i;
1086 }
1087 }
1088
1089 /* found? */
1090 h=table[start];
1091 if(high==(h&0x1f) && low==table[start+1]) {
1092 i=table[start+2];
1093 if((h&0x8000)==0) {
1094 /* the result is an index to a USerializedSet */
1095 return uset_getSerializedSet(fillSet,
1096 canonStartSets+i,
1097 canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-i);
1098 } else {
1099 /*
1100 * single-code point set {x} in
1101 * triplet { 100xxxxx 000hhhhh llllllll llllllll xxxxxxxx xxxxxxxx }
1102 */
1103 i|=((int32_t)h&0x1f00)<<8; /* add high bits from high(c) */
1104 uset_setSerializedToOne(fillSet, (UChar32)i);
1105 return TRUE;
1106 }
1107 }
1108 }
1109 }
1110
1111 return FALSE; /* not found */
1112}
1113
1114U_CAPI int32_t U_EXPORT2
1115u_getFC_NFKC_Closure(UChar32 c, UChar *dest, int32_t destCapacity, UErrorCode *pErrorCode) {
1116 uint16_t aux;
1117
1118 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1119 return 0;
1120 }
1121 if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
1122 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1123 return 0;
1124 }
73c04bcf 1125 if(!_haveData(*pErrorCode) || auxTrie.index==NULL) {
b75a7d8f
A
1126 return 0;
1127 }
1128
1129 UTRIE_GET16(&auxTrie, c, aux);
1130 aux&=_NORM_AUX_FNC_MASK;
1131 if(aux!=0) {
1132 const UChar *s;
1133 int32_t length;
1134
1135 s=(const UChar *)(extraData+aux);
1136 if(*s<0xff00) {
1137 /* s points to the single-unit string */
1138 length=1;
1139 } else {
1140 length=*s&0xff;
1141 ++s;
1142 }
1143 if(0<length && length<=destCapacity) {
1144 uprv_memcpy(dest, s, length*U_SIZEOF_UCHAR);
1145 }
1146 return u_terminateUChars(dest, destCapacity, length, pErrorCode);
1147 } else {
1148 return u_terminateUChars(dest, destCapacity, 0, pErrorCode);
1149 }
1150}
1151
1152/* Is c an NF<mode>-skippable code point? See unormimp.h. */
1153U_CAPI UBool U_EXPORT2
1154unorm_isNFSkippable(UChar32 c, UNormalizationMode mode) {
b75a7d8f
A
1155 uint32_t norm32, mask;
1156 uint16_t aux, fcd;
1157
73c04bcf
A
1158#if !UNORM_HARDCODE_DATA
1159 UErrorCode errorCode=U_ZERO_ERROR;
b75a7d8f
A
1160 if(!_haveData(errorCode)) {
1161 return FALSE;
1162 }
73c04bcf 1163#endif
b75a7d8f
A
1164
1165 /* handle trivial cases; set the comparison mask for the normal ones */
1166 switch(mode) {
1167 case UNORM_NONE:
1168 return TRUE;
1169 case UNORM_NFD:
1170 mask=_NORM_CC_MASK|_NORM_QC_NFD;
1171 break;
1172 case UNORM_NFKD:
1173 mask=_NORM_CC_MASK|_NORM_QC_NFKD;
1174 break;
1175 case UNORM_NFC:
1176 /* case UNORM_FCC: */
1177 mask=_NORM_CC_MASK|_NORM_COMBINES_ANY|(_NORM_QC_NFC&_NORM_QC_ANY_NO);
1178 break;
1179 case UNORM_NFKC:
1180 mask=_NORM_CC_MASK|_NORM_COMBINES_ANY|(_NORM_QC_NFKC&_NORM_QC_ANY_NO);
1181 break;
1182 case UNORM_FCD:
1183 /* FCD: skippable if lead cc==0 and trail cc<=1 */
73c04bcf
A
1184 if(fcdTrie.index!=NULL) {
1185 UTRIE_GET16(&fcdTrie, c, fcd);
1186 return fcd<=1;
1187 } else {
1188 return FALSE;
1189 }
b75a7d8f
A
1190 default:
1191 return FALSE;
1192 }
1193
1194 /* check conditions (a)..(e), see unormimp.h */
1195 UTRIE_GET32(&normTrie, c, norm32);
1196 if((norm32&mask)!=0) {
1197 return FALSE; /* fails (a)..(e), not skippable */
1198 }
1199
1200 if(mode<UNORM_NFC) {
1201 return TRUE; /* NF*D, passed (a)..(c), is skippable */
1202 }
1203
1204 /* NF*C/FCC, passed (a)..(e) */
1205 if((norm32&_NORM_QC_NFD)==0) {
1206 return TRUE; /* no canonical decomposition, is skippable */
1207 }
1208
1209 /* check Hangul syllables algorithmically */
1210 if(isNorm32HangulOrJamo(norm32)) {
1211 /* Jamo passed (a)..(e) above, must be Hangul */
1212 return !isHangulWithoutJamoT((UChar)c); /* LVT are skippable, LV are not */
1213 }
1214
1215 /* if(mode<=UNORM_NFKC) { -- enable when implementing FCC */
1216 /* NF*C, test (f) flag */
73c04bcf 1217 if(!formatVersion_2_2 || auxTrie.index==NULL) {
b75a7d8f
A
1218 return FALSE; /* no (f) data, say not skippable to be safe */
1219 }
1220
1221 UTRIE_GET16(&auxTrie, c, aux);
1222 return (aux&_NORM_AUX_NFC_SKIP_F_MASK)==0; /* TRUE=skippable if the (f) flag is not set */
1223
1224 /* } else { FCC, test fcd<=1 instead of the above } */
1225}
1226
1227U_CAPI void U_EXPORT2
73c04bcf 1228unorm_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
b75a7d8f
A
1229 UChar c;
1230
73c04bcf 1231 if(!_haveData(*pErrorCode)) {
b75a7d8f
A
1232 return;
1233 }
1234
1235 /* add the start code point of each same-value range of each trie */
374ca955 1236 utrie_enum(&normTrie, NULL, _enumPropertyStartsRange, sa);
73c04bcf
A
1237 if(fcdTrie.index!=NULL) {
1238 utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, sa);
1239 }
1240 if(auxTrie.index!=NULL) {
374ca955 1241 utrie_enum(&auxTrie, NULL, _enumPropertyStartsRange, sa);
b75a7d8f
A
1242 }
1243
1244 /* add Hangul LV syllables and LV+1 because of skippables */
1245 for(c=HANGUL_BASE; c<HANGUL_BASE+HANGUL_COUNT; c+=JAMO_T_COUNT) {
374ca955
A
1246 sa->add(sa->set, c);
1247 sa->add(sa->set, c+1);
1248 }
1249 sa->add(sa->set, HANGUL_BASE+HANGUL_COUNT); /* add Hangul+1 to continue with other properties */
1250}
1251
46f4442e 1252U_CFUNC UNormalizationCheckResult U_EXPORT2
374ca955
A
1253unorm_getQuickCheck(UChar32 c, UNormalizationMode mode) {
1254 static const uint32_t qcMask[UNORM_MODE_COUNT]={
1255 0, 0, _NORM_QC_NFD, _NORM_QC_NFKD, _NORM_QC_NFC, _NORM_QC_NFKC
1256 };
1257
374ca955
A
1258 uint32_t norm32;
1259
73c04bcf
A
1260#if !UNORM_HARDCODE_DATA
1261 UErrorCode errorCode=U_ZERO_ERROR;
374ca955
A
1262 if(!_haveData(errorCode)) {
1263 return UNORM_YES;
1264 }
73c04bcf 1265#endif
374ca955
A
1266
1267 UTRIE_GET32(&normTrie, c, norm32);
1268 norm32&=qcMask[mode];
1269
1270 if(norm32==0) {
1271 return UNORM_YES;
1272 } else if(norm32&_NORM_QC_ANY_NO) {
1273 return UNORM_NO;
1274 } else /* _NORM_QC_ANY_MAYBE */ {
1275 return UNORM_MAYBE;
1276 }
1277}
1278
46f4442e 1279U_CFUNC uint16_t U_EXPORT2
374ca955 1280unorm_getFCD16FromCodePoint(UChar32 c) {
374ca955 1281 uint16_t fcd;
46f4442e
A
1282#if !UNORM_HARDCODE_DATA
1283 UErrorCode errorCode;
374ca955 1284 errorCode=U_ZERO_ERROR;
46f4442e
A
1285#endif
1286
73c04bcf
A
1287 if(
1288#if !UNORM_HARDCODE_DATA
1289 !_haveData(errorCode) ||
1290#endif
1291 fcdTrie.index==NULL
1292 ) {
374ca955 1293 return 0;
b75a7d8f 1294 }
374ca955
A
1295
1296 UTRIE_GET16(&fcdTrie, c, fcd);
1297 return fcd;
b75a7d8f
A
1298}
1299
1300/* reorder UTF-16 in-place -------------------------------------------------- */
1301
1302/*
1303 * simpler, single-character version of _mergeOrdered() -
1304 * bubble-insert one single code point into the preceding string
1305 * which is already canonically ordered
1306 * (c, c2) may or may not yet have been inserted at [current..p[
1307 *
1308 * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2)
1309 *
1310 * before: [start..current[ is already ordered, and
1311 * [current..p[ may or may not hold (c, c2) but
1312 * must be exactly the same length as (c, c2)
1313 * after: [start..p[ is ordered
1314 *
1315 * returns the trailing combining class
1316 */
1317static uint8_t
1318_insertOrdered(const UChar *start, UChar *current, UChar *p,
1319 UChar c, UChar c2, uint8_t cc) {
1320 const UChar *pBack, *pPreBack;
1321 UChar *r;
1322 uint8_t prevCC, trailCC=cc;
1323
1324 if(start<current && cc!=0) {
1325 /* search for the insertion point where cc>=prevCC */
1326 pPreBack=pBack=current;
1327 prevCC=_getPrevCC(start, pPreBack);
1328 if(cc<prevCC) {
1329 /* this will be the last code point, so keep its cc */
1330 trailCC=prevCC;
1331 pBack=pPreBack;
1332 while(start<pPreBack) {
1333 prevCC=_getPrevCC(start, pPreBack);
1334 if(cc>=prevCC) {
1335 break;
1336 }
1337 pBack=pPreBack;
1338 }
1339
1340 /*
1341 * this is where we are right now with all these pointers:
1342 * [start..pPreBack[ 0..? code points that we can ignore
1343 * [pPreBack..pBack[ 0..1 code points with prevCC<=cc
1344 * [pBack..current[ 0..n code points with >cc, move up to insert (c, c2)
1345 * [current..p[ 1 code point (c, c2) with cc
1346 */
1347
1348 /* move the code units in between up */
1349 r=p;
1350 do {
1351 *--r=*--current;
1352 } while(pBack!=current);
1353 }
1354 }
1355
1356 /* insert (c, c2) */
1357 *current=c;
1358 if(c2!=0) {
1359 *(current+1)=c2;
1360 }
1361
1362 /* we know the cc of the last code point */
1363 return trailCC;
1364}
1365
1366/*
1367 * merge two UTF-16 string parts together
1368 * to canonically order (order by combining classes) their concatenation
1369 *
1370 * the two strings may already be adjacent, so that the merging is done in-place
1371 * if the two strings are not adjacent, then the buffer holding the first one
1372 * must be large enough
1373 * the second string may or may not be ordered in itself
1374 *
1375 * before: [start..current[ is already ordered, and
1376 * [next..limit[ may be ordered in itself, but
1377 * is not in relation to [start..current[
1378 * after: [start..current+(limit-next)[ is ordered
1379 *
1380 * the algorithm is a simple bubble-sort that takes the characters from *next++
1381 * and inserts them in correct combining class order into the preceding part
1382 * of the string
1383 *
1384 * since this function is called much less often than the single-code point
1385 * _insertOrdered(), it just uses that for easier maintenance
1386 * (see file version from before 2001aug31 for a more optimized version)
1387 *
1388 * returns the trailing combining class
1389 */
1390static uint8_t
1391_mergeOrdered(UChar *start, UChar *current,
1392 const UChar *next, const UChar *limit, UBool isOrdered=TRUE) {
1393 UChar *r;
1394 UChar c, c2;
1395 uint8_t cc, trailCC=0;
1396 UBool adjacent;
1397
1398 adjacent= current==next;
1399
1400 if(start!=current || !isOrdered) {
1401 while(next<limit) {
1402 cc=_getNextCC(next, limit, c, c2);
1403 if(cc==0) {
1404 /* does not bubble back */
1405 trailCC=0;
1406 if(adjacent) {
1407 current=(UChar *)next;
1408 } else {
1409 *current++=c;
1410 if(c2!=0) {
1411 *current++=c2;
1412 }
1413 }
1414 if(isOrdered) {
1415 break;
1416 } else {
1417 start=current;
1418 }
1419 } else {
1420 r=current+(c2==0 ? 1 : 2);
1421 trailCC=_insertOrdered(start, current, r, c, c2, cc);
1422 current=r;
1423 }
1424 }
1425 }
1426
1427 if(next==limit) {
1428 /* we know the cc of the last code point */
1429 return trailCC;
1430 } else {
1431 if(!adjacent) {
1432 /* copy the second string part */
1433 do {
1434 *current++=*next++;
1435 } while(next!=limit);
1436 limit=current;
1437 }
1438 return _getPrevCC(start, limit);
1439 }
1440}
1441
374ca955
A
1442/* find the last true starter in [start..src[ and return the pointer to it */
1443static const UChar *
1444_findPreviousStarter(const UChar *start, const UChar *src,
1445 uint32_t ccOrQCMask, uint32_t decompQCMask, UChar minNoMaybe) {
1446 uint32_t norm32;
b75a7d8f 1447 UChar c, c2;
b75a7d8f 1448
374ca955
A
1449 while(start<src) {
1450 norm32=_getPrevNorm32(start, src, minNoMaybe, ccOrQCMask|decompQCMask, c, c2);
1451 if(_isTrueStarter(norm32, ccOrQCMask, decompQCMask)) {
1452 break;
1453 }
b75a7d8f 1454 }
374ca955
A
1455 return src;
1456}
b75a7d8f 1457
374ca955
A
1458/* find the first true starter in [src..limit[ and return the pointer to it */
1459static const UChar *
1460_findNextStarter(const UChar *src, const UChar *limit,
1461 uint32_t qcMask, uint32_t decompQCMask, UChar minNoMaybe) {
1462 const UChar *p;
1463 uint32_t norm32, ccOrQCMask;
1464 int32_t length;
1465 UChar c, c2;
1466 uint8_t cc, trailCC;
1467
1468 ccOrQCMask=_NORM_CC_MASK|qcMask;
b75a7d8f
A
1469
1470 for(;;) {
374ca955
A
1471 if(src==limit) {
1472 break; /* end of string */
1473 }
1474 c=*src;
1475 if(c<minNoMaybe) {
1476 break; /* catches NUL terminater, too */
b75a7d8f
A
1477 }
1478
374ca955
A
1479 norm32=_getNorm32(c);
1480 if((norm32&ccOrQCMask)==0) {
1481 break; /* true starter */
1482 }
1483
1484 if(isNorm32LeadSurrogate(norm32)) {
1485 /* c is a lead surrogate, get the real norm32 */
1486 if((src+1)==limit || !UTF_IS_SECOND_SURROGATE(c2=*(src+1))) {
1487 break; /* unmatched first surrogate: counts as a true starter */
1488 }
1489 norm32=_getNorm32FromSurrogatePair(norm32, c2);
1490
1491 if((norm32&ccOrQCMask)==0) {
1492 break; /* true starter */
b75a7d8f
A
1493 }
1494 } else {
1495 c2=0;
1496 }
1497
374ca955
A
1498 /* (c, c2) is not a true starter but its decomposition may be */
1499 if(norm32&decompQCMask) {
1500 /* (c, c2) decomposes, get everything from the variable-length extra data */
1501 p=_decompose(norm32, decompQCMask, length, cc, trailCC);
1502
1503 /* get the first character's norm32 to check if it is a true starter */
1504 if(cc==0 && (_getNorm32(p, qcMask)&qcMask)==0) {
1505 break; /* true starter */
1506 }
b75a7d8f
A
1507 }
1508
374ca955
A
1509 src+= c2==0 ? 1 : 2; /* not a true starter, continue */
1510 }
b75a7d8f 1511
374ca955 1512 return src;
b75a7d8f
A
1513}
1514
1515/* make NFD & NFKD ---------------------------------------------------------- */
1516
1517U_CAPI int32_t U_EXPORT2
1518unorm_getDecomposition(UChar32 c, UBool compat,
1519 UChar *dest, int32_t destCapacity) {
73c04bcf 1520#if !UNORM_HARDCODE_DATA
b75a7d8f 1521 UErrorCode errorCode=U_ZERO_ERROR;
73c04bcf 1522#endif
b75a7d8f 1523 if( (uint32_t)c<=0x10ffff &&
73c04bcf 1524#if !UNORM_HARDCODE_DATA
b75a7d8f 1525 _haveData(errorCode) &&
73c04bcf 1526#endif
b75a7d8f
A
1527 ((dest!=NULL && destCapacity>0) || destCapacity==0)
1528 ) {
1529 uint32_t norm32, qcMask;
1530 UChar32 minNoMaybe;
1531 int32_t length;
1532
1533 /* initialize */
1534 if(!compat) {
1535 minNoMaybe=(UChar32)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE];
1536 qcMask=_NORM_QC_NFD;
1537 } else {
1538 minNoMaybe=(UChar32)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE];
1539 qcMask=_NORM_QC_NFKD;
1540 }
1541
1542 if(c<minNoMaybe) {
1543 /* trivial case */
1544 if(destCapacity>0) {
1545 dest[0]=(UChar)c;
1546 }
1547 return -1;
1548 }
1549
1550 /* data lookup */
1551 UTRIE_GET32(&normTrie, c, norm32);
1552 if((norm32&qcMask)==0) {
1553 /* simple case: no decomposition */
1554 if(c<=0xffff) {
1555 if(destCapacity>0) {
1556 dest[0]=(UChar)c;
1557 }
1558 return -1;
1559 } else {
1560 if(destCapacity>=2) {
1561 dest[0]=UTF16_LEAD(c);
1562 dest[1]=UTF16_TRAIL(c);
1563 }
1564 return -2;
1565 }
1566 } else if(isNorm32HangulOrJamo(norm32)) {
1567 /* Hangul syllable: decompose algorithmically */
1568 UChar c2;
1569
1570 c-=HANGUL_BASE;
1571
1572 c2=(UChar)(c%JAMO_T_COUNT);
1573 c/=JAMO_T_COUNT;
1574 if(c2>0) {
1575 if(destCapacity>=3) {
1576 dest[2]=(UChar)(JAMO_T_BASE+c2);
1577 }
1578 length=3;
1579 } else {
1580 length=2;
1581 }
1582
1583 if(destCapacity>=2) {
1584 dest[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
1585 dest[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
1586 }
1587 return length;
1588 } else {
1589 /* c decomposes, get everything from the variable-length extra data */
1590 const UChar *p, *limit;
1591 uint8_t cc, trailCC;
1592
1593 p=_decompose(norm32, qcMask, length, cc, trailCC);
1594 if(length<=destCapacity) {
1595 limit=p+length;
1596 do {
1597 *dest++=*p++;
1598 } while(p<limit);
1599 }
1600 return length;
1601 }
1602 } else {
1603 return 0;
1604 }
1605}
1606
1607static int32_t
1608_decompose(UChar *dest, int32_t destCapacity,
1609 const UChar *src, int32_t srcLength,
1610 UBool compat, const UnicodeSet *nx,
1611 uint8_t &outTrailCC) {
1612 UChar buffer[3];
1613 const UChar *limit, *prevSrc, *p;
1614 uint32_t norm32, ccOrQCMask, qcMask;
1615 int32_t destIndex, reorderStartIndex, length;
1616 UChar c, c2, minNoMaybe;
1617 uint8_t cc, prevCC, trailCC;
1618
1619 if(!compat) {
1620 minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE];
1621 qcMask=_NORM_QC_NFD;
1622 } else {
1623 minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE];
1624 qcMask=_NORM_QC_NFKD;
1625 }
1626
1627 /* initialize */
1628 ccOrQCMask=_NORM_CC_MASK|qcMask;
1629 destIndex=reorderStartIndex=0;
1630 prevCC=0;
1631
1632 /* avoid compiler warnings */
1633 norm32=0;
1634 c=0;
73c04bcf
A
1635 cc=0;
1636 trailCC=0;
b75a7d8f
A
1637
1638 if(srcLength>=0) {
1639 /* string with length */
1640 limit=src+srcLength;
1641 } else /* srcLength==-1 */ {
1642 /* zero-terminated string */
1643 limit=NULL;
1644 }
1645
1646 U_ALIGN_CODE(16);
1647
1648 for(;;) {
1649 /* count code units below the minimum or with irrelevant data for the quick check */
1650 prevSrc=src;
1651 if(limit==NULL) {
1652 while((c=*src)<minNoMaybe ? c!=0 : ((norm32=_getNorm32(c))&ccOrQCMask)==0) {
1653 prevCC=0;
1654 ++src;
1655 }
1656 } else {
1657 while(src!=limit && ((c=*src)<minNoMaybe || ((norm32=_getNorm32(c))&ccOrQCMask)==0)) {
1658 prevCC=0;
1659 ++src;
1660 }
1661 }
1662
1663 /* copy these code units all at once */
1664 if(src!=prevSrc) {
1665 length=(int32_t)(src-prevSrc);
1666 if((destIndex+length)<=destCapacity) {
1667 uprv_memcpy(dest+destIndex, prevSrc, length*U_SIZEOF_UCHAR);
1668 }
1669 destIndex+=length;
1670 reorderStartIndex=destIndex;
1671 }
1672
1673 /* end of source reached? */
1674 if(limit==NULL ? c==0 : src==limit) {
1675 break;
1676 }
1677
1678 /* c already contains *src and norm32 is set for it, increment src */
1679 ++src;
1680
1681 /* check one above-minimum, relevant code unit */
1682 /*
1683 * generally, set p and length to the decomposition string
1684 * in simple cases, p==NULL and (c, c2) will hold the length code units to append
1685 * in all cases, set cc to the lead and trailCC to the trail combining class
1686 *
1687 * the following merge-sort of the current character into the preceding,
1688 * canonically ordered result text will use the optimized _insertOrdered()
1689 * if there is only one single code point to process;
1690 * this is indicated with p==NULL, and (c, c2) is the character to insert
1691 * ((c, 0) for a BMP character and (lead surrogate, trail surrogate)
1692 * for a supplementary character)
1693 * otherwise, p[length] is merged in with _mergeOrdered()
1694 */
1695 if(isNorm32HangulOrJamo(norm32)) {
1696 if(nx_contains(nx, c)) {
1697 c2=0;
1698 p=NULL;
1699 length=1;
1700 } else {
1701 /* Hangul syllable: decompose algorithmically */
1702 p=buffer;
1703 cc=trailCC=0;
1704
1705 c-=HANGUL_BASE;
1706
1707 c2=(UChar)(c%JAMO_T_COUNT);
1708 c/=JAMO_T_COUNT;
1709 if(c2>0) {
1710 buffer[2]=(UChar)(JAMO_T_BASE+c2);
1711 length=3;
1712 } else {
1713 length=2;
1714 }
1715
1716 buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
1717 buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
1718 }
1719 } else {
1720 if(isNorm32Regular(norm32)) {
1721 c2=0;
1722 length=1;
1723 } else {
1724 /* c is a lead surrogate, get the real norm32 */
1725 if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
1726 ++src;
1727 length=2;
1728 norm32=_getNorm32FromSurrogatePair(norm32, c2);
1729 } else {
1730 c2=0;
1731 length=1;
1732 norm32=0;
1733 }
1734 }
1735
1736 /* get the decomposition and the lead and trail cc's */
1737 if(nx_contains(nx, c, c2)) {
1738 /* excluded: norm32==0 */
1739 cc=trailCC=0;
1740 p=NULL;
1741 } else if((norm32&qcMask)==0) {
1742 /* c does not decompose */
1743 cc=trailCC=(uint8_t)(norm32>>_NORM_CC_SHIFT);
1744 p=NULL;
1745 } else {
1746 /* c decomposes, get everything from the variable-length extra data */
1747 p=_decompose(norm32, qcMask, length, cc, trailCC);
1748 if(length==1) {
1749 /* fastpath a single code unit from decomposition */
1750 c=*p;
1751 c2=0;
1752 p=NULL;
1753 }
1754 }
1755 }
1756
1757 /* append the decomposition to the destination buffer, assume length>0 */
1758 if((destIndex+length)<=destCapacity) {
1759 UChar *reorderSplit=dest+destIndex;
1760 if(p==NULL) {
1761 /* fastpath: single code point */
1762 if(cc!=0 && cc<prevCC) {
1763 /* (c, c2) is out of order with respect to the preceding text */
1764 destIndex+=length;
1765 trailCC=_insertOrdered(dest+reorderStartIndex, reorderSplit, dest+destIndex, c, c2, cc);
1766 } else {
1767 /* just append (c, c2) */
1768 dest[destIndex++]=c;
1769 if(c2!=0) {
1770 dest[destIndex++]=c2;
1771 }
1772 }
1773 } else {
1774 /* general: multiple code points (ordered by themselves) from decomposition */
1775 if(cc!=0 && cc<prevCC) {
1776 /* the decomposition is out of order with respect to the preceding text */
1777 destIndex+=length;
1778 trailCC=_mergeOrdered(dest+reorderStartIndex, reorderSplit, p, p+length);
1779 } else {
1780 /* just append the decomposition */
1781 do {
1782 dest[destIndex++]=*p++;
1783 } while(--length>0);
1784 }
1785 }
1786 } else {
1787 /* buffer overflow */
1788 /* keep incrementing the destIndex for preflighting */
1789 destIndex+=length;
1790 }
1791
1792 prevCC=trailCC;
1793 if(prevCC==0) {
1794 reorderStartIndex=destIndex;
1795 }
1796 }
1797
1798 outTrailCC=prevCC;
1799 return destIndex;
1800}
1801
1802U_CAPI int32_t U_EXPORT2
1803unorm_decompose(UChar *dest, int32_t destCapacity,
1804 const UChar *src, int32_t srcLength,
1805 UBool compat, int32_t options,
1806 UErrorCode *pErrorCode) {
1807 const UnicodeSet *nx;
1808 int32_t destIndex;
1809 uint8_t trailCC;
1810
1811 if(!_haveData(*pErrorCode)) {
1812 return 0;
1813 }
1814
1815 nx=getNX(options, *pErrorCode);
1816 if(U_FAILURE(*pErrorCode)) {
1817 return 0;
1818 }
1819
1820 destIndex=_decompose(dest, destCapacity,
1821 src, srcLength,
1822 compat, nx,
1823 trailCC);
1824
1825 return u_terminateUChars(dest, destCapacity, destIndex, pErrorCode);
1826}
1827
374ca955 1828/* make NFC & NFKC ---------------------------------------------------------- */
b75a7d8f 1829
374ca955
A
1830/* get the composition properties of the next character */
1831static inline uint32_t
1832_getNextCombining(UChar *&p, const UChar *limit,
1833 UChar &c, UChar &c2,
1834 uint16_t &combiningIndex, uint8_t &cc,
1835 const UnicodeSet *nx) {
1836 uint32_t norm32, combineFlags;
b75a7d8f 1837
374ca955
A
1838 /* get properties */
1839 c=*p++;
1840 norm32=_getNorm32(c);
b75a7d8f 1841
374ca955
A
1842 /* preset output values for most characters */
1843 c2=0;
1844 combiningIndex=0;
1845 cc=0;
b75a7d8f 1846
374ca955
A
1847 if((norm32&(_NORM_CC_MASK|_NORM_COMBINES_ANY))==0) {
1848 return 0;
1849 } else {
b75a7d8f 1850 if(isNorm32Regular(norm32)) {
374ca955
A
1851 /* set cc etc. below */
1852 } else if(isNorm32HangulOrJamo(norm32)) {
1853 /* a compatibility decomposition contained Jamos */
1854 combiningIndex=(uint16_t)(0xfff0|(norm32>>_NORM_EXTRA_SHIFT));
1855 return norm32&_NORM_COMBINES_ANY;
b75a7d8f 1856 } else {
b75a7d8f 1857 /* c is a lead surrogate, get the real norm32 */
374ca955
A
1858 if(p!=limit && UTF_IS_SECOND_SURROGATE(c2=*p)) {
1859 ++p;
b75a7d8f
A
1860 norm32=_getNorm32FromSurrogatePair(norm32, c2);
1861 } else {
1862 c2=0;
374ca955 1863 return 0;
b75a7d8f
A
1864 }
1865 }
1866
b75a7d8f 1867 if(nx_contains(nx, c, c2)) {
374ca955 1868 return 0; /* excluded: norm32==0 */
b75a7d8f
A
1869 }
1870
374ca955 1871 cc=(uint8_t)(norm32>>_NORM_CC_SHIFT);
b75a7d8f 1872
374ca955
A
1873 combineFlags=norm32&_NORM_COMBINES_ANY;
1874 if(combineFlags!=0) {
1875 combiningIndex=*(_getExtraData(norm32)-1);
b75a7d8f 1876 }
374ca955 1877 return combineFlags;
b75a7d8f 1878 }
b75a7d8f
A
1879}
1880
374ca955
A
1881/*
1882 * given a composition-result starter (c, c2) - which means its cc==0,
1883 * it combines forward, it has extra data, its norm32!=0,
1884 * it is not a Hangul or Jamo,
1885 * get just its combineFwdIndex
1886 *
1887 * norm32(c) is special if and only if c2!=0
1888 */
1889static inline uint16_t
1890_getCombiningIndexFromStarter(UChar c, UChar c2) {
1891 uint32_t norm32;
b75a7d8f 1892
374ca955
A
1893 norm32=_getNorm32(c);
1894 if(c2!=0) {
1895 norm32=_getNorm32FromSurrogatePair(norm32, c2);
b75a7d8f 1896 }
374ca955
A
1897 return *(_getExtraData(norm32)-1);
1898}
b75a7d8f 1899
374ca955
A
/*
 * Look up the recomposition result for
 * a forward-combining character
 * (given as a pointer to its part of the combiningTable[])
 * and a backward-combining character
 * (given as its combineBackIndex).
 *
 * If the two characters combine, then (value, value2) is set
 * to the code unit(s) of the composition character;
 * otherwise (value, value2) is not modified.
 *
 * Return value:
 * 0    do not combine
 * 1    combine
 * >1   combine, and the composition is a forward-combining starter
 *
 * See unormimp.h for a description of the composition table format.
 */
static inline uint16_t
_combine(const uint16_t *table, uint16_t combineBackIndex,
         uint16_t &value, uint16_t &value2) {
    uint16_t entryKey, result;

    /*
     * search in the starter's composition table:
     * skip entries with smaller keys; each entry's result takes
     * two units if bit 15 of its first result unit is set, else one
     */
    entryKey=*table++;
    while(entryKey<combineBackIndex) {
        table+= (*table&0x8000)!=0 ? 2 : 1;
        entryKey=*table++;
    }

    /* mask off bit 15, the last-entry-in-the-list flag */
    if((entryKey&0x7fff)!=combineBackIndex) {
        /* not found */
        return 0;
    }

    /* found! combine! */
    value=*table;

    /* is the composition a starter that combines forward? */
    result=(uint16_t)((value&0x2000)+1);

    /* get the composition result code point from the variable-length result value */
    if((value&0x8000)==0) {
        /* BMP composition result U+0000..U+1fff */
        value&=0x1fff;
        value2=0;
    } else if((value&0x4000)!=0) {
        /* surrogate pair composition result */
        value=(uint16_t)((value&0x3ff)|0xd800);
        value2=*(table+1);
    } else {
        /* BMP composition result U+2000..U+ffff */
        value=*(table+1);
        value2=0;
    }

    return result;
}
b75a7d8f 1962
374ca955
A
1963static inline UBool
1964_composeHangul(UChar prev, UChar c, uint32_t norm32, const UChar *&src, const UChar *limit,
1965 UBool compat, UChar *dest, const UnicodeSet *nx) {
1966 if(isJamoVTNorm32JamoV(norm32)) {
1967 /* c is a Jamo V, compose with previous Jamo L and following Jamo T */
1968 prev=(UChar)(prev-JAMO_L_BASE);
1969 if(prev<JAMO_L_COUNT) {
1970 c=(UChar)(HANGUL_BASE+(prev*JAMO_V_COUNT+(c-JAMO_V_BASE))*JAMO_T_COUNT);
b75a7d8f 1971
374ca955
A
1972 /* check if the next character is a Jamo T (normal or compatibility) */
1973 if(src!=limit) {
1974 UChar next, t;
b75a7d8f 1975
374ca955
A
1976 next=*src;
1977 if((t=(UChar)(next-JAMO_T_BASE))<JAMO_T_COUNT) {
1978 /* normal Jamo T */
1979 ++src;
1980 c+=t;
1981 } else if(compat) {
1982 /* if NFKC, then check for compatibility Jamo T (BMP only) */
1983 norm32=_getNorm32(next);
1984 if(isNorm32Regular(norm32) && (norm32&_NORM_QC_NFKD)) {
1985 const UChar *p;
1986 int32_t length;
1987 uint8_t cc, trailCC;
1988
1989 p=_decompose(norm32, _NORM_QC_NFKD, length, cc, trailCC);
1990 if(length==1 && (t=(UChar)(*p-JAMO_T_BASE))<JAMO_T_COUNT) {
1991 /* compatibility Jamo T */
1992 ++src;
1993 c+=t;
1994 }
1995 }
1996 }
1997 }
1998 if(nx_contains(nx, c)) {
1999 if(!isHangulWithoutJamoT(c)) {
2000 --src; /* undo ++src from reading the Jamo T */
2001 }
2002 return FALSE;
2003 }
2004 if(dest!=0) {
2005 *dest=c;
b75a7d8f 2006 }
374ca955 2007 return TRUE;
b75a7d8f 2008 }
374ca955
A
2009 } else if(isHangulWithoutJamoT(prev)) {
2010 /* c is a Jamo T, compose with previous Hangul LV that does not contain a Jamo T */
2011 c=(UChar)(prev+(c-JAMO_T_BASE));
2012 if(nx_contains(nx, c)) {
2013 return FALSE;
b75a7d8f 2014 }
374ca955
A
2015 if(dest!=0) {
2016 *dest=c;
b75a7d8f 2017 }
374ca955
A
2018 return TRUE;
2019 }
2020 return FALSE;
2021}
b75a7d8f 2022
374ca955
A
2023/*
2024 * recompose the characters in [p..limit[
2025 * (which is in NFD - decomposed and canonically ordered),
2026 * adjust limit, and return the trailing cc
2027 *
2028 * since for NFKC we may get Jamos in decompositions, we need to
2029 * recompose those too
2030 *
2031 * note that recomposition never lengthens the text:
2032 * any character consists of either one or two code units;
2033 * a composition may contain at most one more code unit than the original starter,
2034 * while the combining mark that is removed has at least one code unit
2035 */
2036static uint8_t
2037_recompose(UChar *p, UChar *&limit, int32_t options, const UnicodeSet *nx) {
2038 UChar *starter, *pRemove, *q, *r;
2039 uint32_t combineFlags;
2040 UChar c, c2;
2041 uint16_t combineFwdIndex, combineBackIndex;
2042 uint16_t result, value, value2;
2043 uint8_t cc, prevCC;
2044 UBool starterIsSupplementary;
b75a7d8f 2045
374ca955
A
2046 starter=NULL; /* no starter */
2047 combineFwdIndex=0; /* will not be used until starter!=NULL - avoid compiler warnings */
2048 combineBackIndex=0; /* will always be set if combineFlags!=0 - avoid compiler warnings */
2049 value=value2=0; /* always set by _combine() before used - avoid compiler warnings */
2050 starterIsSupplementary=FALSE; /* will not be used until starter!=NULL - avoid compiler warnings */
2051 prevCC=0;
b75a7d8f 2052
374ca955
A
2053 for(;;) {
2054 combineFlags=_getNextCombining(p, limit, c, c2, combineBackIndex, cc, nx);
2055 if((combineFlags&_NORM_COMBINES_BACK) && starter!=NULL) {
2056 if(combineBackIndex&0x8000) {
2057 /* c is a Jamo V/T, see if we can compose it with the previous character */
2058 /* for the PRI #29 fix, check that there is no intervening combining mark */
2059 if((options&UNORM_BEFORE_PRI_29) || prevCC==0) {
2060 pRemove=NULL; /* NULL while no Hangul composition */
2061 combineFlags=0;
2062 c2=*starter;
2063 if(combineBackIndex==0xfff2) {
2064 /* Jamo V, compose with previous Jamo L and following Jamo T */
2065 c2=(UChar)(c2-JAMO_L_BASE);
2066 if(c2<JAMO_L_COUNT) {
2067 pRemove=p-1;
2068 c=(UChar)(HANGUL_BASE+(c2*JAMO_V_COUNT+(c-JAMO_V_BASE))*JAMO_T_COUNT);
2069 if(p!=limit && (c2=(UChar)(*p-JAMO_T_BASE))<JAMO_T_COUNT) {
2070 ++p;
2071 c+=c2;
2072 } else {
2073 /* the result is an LV syllable, which is a starter (unlike LVT) */
2074 combineFlags=_NORM_COMBINES_FWD;
2075 }
2076 if(!nx_contains(nx, c)) {
2077 *starter=c;
2078 } else {
2079 /* excluded */
2080 if(!isHangulWithoutJamoT(c)) {
2081 --p; /* undo the ++p from reading the Jamo T */
2082 }
2083 /* c is modified but not used any more -- c=*(p-1); -- re-read the Jamo V/T */
2084 pRemove=NULL;
2085 }
2086 }
b75a7d8f 2087
374ca955
A
2088 /*
2089 * Normally, the following can not occur:
2090 * Since the input is in NFD, there are no Hangul LV syllables that
2091 * a Jamo T could combine with.
2092 * All Jamo Ts are combined above when handling Jamo Vs.
2093 *
2094 * However, before the PRI #29 fix, this can occur due to
2095 * an intervening combining mark between the Hangul LV and the Jamo T.
2096 */
2097 } else {
2098 /* Jamo T, compose with previous Hangul that does not have a Jamo T */
2099 if(isHangulWithoutJamoT(c2)) {
2100 c2+=(UChar)(c-JAMO_T_BASE);
2101 if(!nx_contains(nx, c2)) {
2102 pRemove=p-1;
2103 *starter=c2;
2104 }
2105 }
2106 }
b75a7d8f 2107
374ca955
A
2108 if(pRemove!=NULL) {
2109 /* remove the Jamo(s) */
2110 q=pRemove;
2111 r=p;
2112 while(r<limit) {
2113 *q++=*r++;
2114 }
2115 p=pRemove;
2116 limit=q;
2117 }
b75a7d8f 2118
374ca955 2119 c2=0; /* c2 held *starter temporarily */
b75a7d8f 2120
374ca955
A
2121 if(combineFlags!=0) {
2122 /*
2123 * not starter=NULL because the composition is a Hangul LV syllable
2124 * and might combine once more (but only before the PRI #29 fix)
2125 */
b75a7d8f 2126
374ca955
A
2127 /* done? */
2128 if(p==limit) {
2129 return prevCC;
2130 }
b75a7d8f 2131
374ca955
A
2132 /* the composition is a Hangul LV syllable which is a starter that combines forward */
2133 combineFwdIndex=0xfff0;
b75a7d8f 2134
374ca955
A
2135 /* we combined; continue with looking for compositions */
2136 continue;
2137 }
2138 }
b75a7d8f 2139
374ca955
A
2140 /*
2141 * now: cc==0 and the combining index does not include "forward" ->
2142 * the rest of the loop body will reset starter to NULL;
2143 * technically, a composed Hangul syllable is a starter, but it
2144 * does not combine forward now that we have consumed all eligible Jamos;
2145 * for Jamo V/T, combineFlags does not contain _NORM_COMBINES_FWD
2146 */
b75a7d8f 2147
374ca955
A
2148 } else if(
2149 /* the starter is not a Hangul LV or Jamo V/T and */
2150 !(combineFwdIndex&0x8000) &&
2151 /* the combining mark is not blocked and */
2152 ((options&UNORM_BEFORE_PRI_29) ?
2153 (prevCC!=cc || prevCC==0) :
2154 (prevCC<cc || prevCC==0)) &&
2155 /* the starter and the combining mark (c, c2) do combine and */
2156 0!=(result=_combine(combiningTable+combineFwdIndex, combineBackIndex, value, value2)) &&
2157 /* the composition result is not excluded */
2158 !nx_contains(nx, value, value2)
2159 ) {
2160 /* replace the starter with the composition, remove the combining mark */
2161 pRemove= c2==0 ? p-1 : p-2; /* pointer to the combining mark */
b75a7d8f
A
2162
2163 /* replace the starter with the composition */
2164 *starter=(UChar)value;
2165 if(starterIsSupplementary) {
2166 if(value2!=0) {
2167 /* both are supplementary */
2168 *(starter+1)=(UChar)value2;
2169 } else {
2170 /* the composition is shorter than the starter, move the intermediate characters forward one */
2171 starterIsSupplementary=FALSE;
2172 q=starter+1;
2173 r=q+1;
2174 while(r<pRemove) {
2175 *q++=*r++;
2176 }
2177 --pRemove;
2178 }
2179 } else if(value2!=0) {
2180 /* the composition is longer than the starter, move the intermediate characters back one */
2181 starterIsSupplementary=TRUE;
2182 ++starter; /* temporarily increment for the loop boundary */
2183 q=pRemove;
2184 r=++pRemove;
2185 while(starter<q) {
2186 *--r=*--q;
2187 }
2188 *starter=(UChar)value2;
2189 --starter; /* undo the temporary increment */
2190 /* } else { both are on the BMP, nothing more to do */
2191 }
2192
2193 /* remove the combining mark by moving the following text over it */
2194 if(pRemove<p) {
2195 q=pRemove;
2196 r=p;
2197 while(r<limit) {
2198 *q++=*r++;
2199 }
2200 p=pRemove;
2201 limit=q;
2202 }
2203
2204 /* keep prevCC because we removed the combining mark */
2205
2206 /* done? */
2207 if(p==limit) {
2208 return prevCC;
2209 }
2210
2211 /* is the composition a starter that combines forward? */
2212 if(result>1) {
2213 combineFwdIndex=_getCombiningIndexFromStarter((UChar)value, (UChar)value2);
2214 } else {
2215 starter=NULL;
2216 }
2217
374ca955 2218 /* we combined; continue with looking for compositions */
b75a7d8f
A
2219 continue;
2220 }
2221 }
2222
374ca955
A
2223 /* no combination this time */
2224 prevCC=cc;
2225 if(p==limit) {
2226 return prevCC;
2227 }
2228
2229 /* if (c, c2) did not combine, then check if it is a starter */
2230 if(cc==0) {
2231 /* found a new starter; combineFlags==0 if (c, c2) is excluded */
2232 if(combineFlags&_NORM_COMBINES_FWD) {
2233 /* it may combine with something, prepare for it */
2234 if(c2==0) {
2235 starterIsSupplementary=FALSE;
2236 starter=p-1;
2237 } else {
2238 starterIsSupplementary=TRUE;
2239 starter=p-2;
2240 }
2241 combineFwdIndex=combineBackIndex;
2242 } else {
2243 /* it will not combine with anything */
2244 starter=NULL;
2245 }
2246 } else if(options&_NORM_OPTIONS_COMPOSE_CONTIGUOUS) {
2247 /* FCC: no discontiguous compositions; any intervening character blocks */
2248 starter=NULL;
2249 }
2250 }
2251}
2252
2253/* decompose and recompose [prevStarter..src[ */
2254static const UChar *
2255_composePart(UChar *stackBuffer, UChar *&buffer, int32_t &bufferCapacity, int32_t &length,
2256 const UChar *prevStarter, const UChar *src,
2257 uint8_t &prevCC,
2258 int32_t options, const UnicodeSet *nx,
2259 UErrorCode *pErrorCode) {
2260 UChar *recomposeLimit;
2261 uint8_t trailCC;
2262 UBool compat;
2263
2264 compat=(UBool)((options&_NORM_OPTIONS_COMPAT)!=0);
2265
2266 /* decompose [prevStarter..src[ */
2267 length=_decompose(buffer, bufferCapacity,
73c04bcf 2268 prevStarter, (int32_t)(src-prevStarter),
374ca955
A
2269 compat, nx,
2270 trailCC);
2271 if(length>bufferCapacity) {
2272 if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*length, 0)) {
2273 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
2274 return NULL;
2275 }
2276 length=_decompose(buffer, bufferCapacity,
73c04bcf 2277 prevStarter, (int32_t)(src-prevStarter),
374ca955
A
2278 compat, nx,
2279 trailCC);
2280 }
2281
2282 /* recompose the decomposition */
2283 recomposeLimit=buffer+length;
2284 if(length>=2) {
2285 prevCC=_recompose(buffer, recomposeLimit, options, nx);
2286 }
2287
2288 /* return with a pointer to the recomposition and its length */
73c04bcf 2289 length=(int32_t)(recomposeLimit-buffer);
374ca955
A
2290 return buffer;
2291}
2292
2293static int32_t
2294_compose(UChar *dest, int32_t destCapacity,
2295 const UChar *src, int32_t srcLength,
2296 int32_t options, const UnicodeSet *nx,
2297 UErrorCode *pErrorCode) {
2298 UChar stackBuffer[_STACK_BUFFER_CAPACITY];
2299 UChar *buffer;
2300 int32_t bufferCapacity;
2301
2302 const UChar *limit, *prevSrc, *prevStarter;
2303 uint32_t norm32, ccOrQCMask, qcMask;
2304 int32_t destIndex, reorderStartIndex, length;
2305 UChar c, c2, minNoMaybe;
2306 uint8_t cc, prevCC;
2307
2308 if(options&_NORM_OPTIONS_COMPAT) {
2309 minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
2310 qcMask=_NORM_QC_NFKC;
2311 } else {
2312 minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
2313 qcMask=_NORM_QC_NFC;
2314 }
2315
2316 /* initialize */
2317 buffer=stackBuffer;
2318 bufferCapacity=_STACK_BUFFER_CAPACITY;
2319
2320 /*
2321 * prevStarter points to the last character before the current one
2322 * that is a "true" starter with cc==0 and quick check "yes".
2323 *
2324 * prevStarter will be used instead of looking for a true starter
2325 * while incrementally decomposing [prevStarter..prevSrc[
2326 * in _composePart(). Having a good prevStarter allows to just decompose
2327 * the entire [prevStarter..prevSrc[.
2328 *
2329 * When _composePart() backs out from prevSrc back to prevStarter,
2330 * then it also backs out destIndex by the same amount.
2331 * Therefore, at all times, the (prevSrc-prevStarter) source units
2332 * must correspond 1:1 to destination units counted with destIndex,
2333 * except for reordering.
2334 * This is true for the qc "yes" characters copied in the fast loop,
2335 * and for pure reordering.
2336 * prevStarter must be set forward to src when this is not true:
2337 * In _composePart() and after composing a Hangul syllable.
2338 *
2339 * This mechanism relies on the assumption that the decomposition of a true starter
2340 * also begins with a true starter. gennorm/store.c checks for this.
2341 */
2342 prevStarter=src;
2343
2344 ccOrQCMask=_NORM_CC_MASK|qcMask;
2345 destIndex=reorderStartIndex=0;
2346 prevCC=0;
2347
2348 /* avoid compiler warnings */
2349 norm32=0;
2350 c=0;
2351
2352 if(srcLength>=0) {
2353 /* string with length */
2354 limit=src+srcLength;
2355 } else /* srcLength==-1 */ {
2356 /* zero-terminated string */
2357 limit=NULL;
2358 }
2359
2360 U_ALIGN_CODE(16);
2361
2362 for(;;) {
2363 /* count code units below the minimum or with irrelevant data for the quick check */
2364 prevSrc=src;
2365 if(limit==NULL) {
2366 while((c=*src)<minNoMaybe ? c!=0 : ((norm32=_getNorm32(c))&ccOrQCMask)==0) {
2367 prevCC=0;
2368 ++src;
2369 }
2370 } else {
2371 while(src!=limit && ((c=*src)<minNoMaybe || ((norm32=_getNorm32(c))&ccOrQCMask)==0)) {
2372 prevCC=0;
2373 ++src;
2374 }
2375 }
2376
2377 /* copy these code units all at once */
2378 if(src!=prevSrc) {
2379 length=(int32_t)(src-prevSrc);
2380 if((destIndex+length)<=destCapacity) {
2381 uprv_memcpy(dest+destIndex, prevSrc, length*U_SIZEOF_UCHAR);
2382 }
2383 destIndex+=length;
2384 reorderStartIndex=destIndex;
2385
2386 /* set prevStarter to the last character in the quick check loop */
2387 prevStarter=src-1;
2388 if(UTF_IS_SECOND_SURROGATE(*prevStarter) && prevSrc<prevStarter && UTF_IS_FIRST_SURROGATE(*(prevStarter-1))) {
2389 --prevStarter;
2390 }
2391
2392 prevSrc=src;
2393 }
2394
2395 /* end of source reached? */
2396 if(limit==NULL ? c==0 : src==limit) {
2397 break;
2398 }
2399
2400 /* c already contains *src and norm32 is set for it, increment src */
2401 ++src;
2402
2403 /*
2404 * source buffer pointers:
2405 *
2406 * all done quick check current char not yet
2407 * "yes" but (c, c2) processed
2408 * may combine
2409 * forward
2410 * [-------------[-------------[-------------[-------------[
2411 * | | | | |
2412 * start prevStarter prevSrc src limit
2413 *
2414 *
2415 * destination buffer pointers and indexes:
2416 *
2417 * all done might take not filled yet
2418 * characters for
2419 * reordering
2420 * [-------------[-------------[-------------[
2421 * | | | |
2422 * dest reorderStartIndex destIndex destCapacity
2423 */
2424
2425 /* check one above-minimum, relevant code unit */
2426 /*
2427 * norm32 is for c=*(src-1), and the quick check flag is "no" or "maybe", and/or cc!=0
2428 * check for Jamo V/T, then for surrogates and regular characters
2429 * c is not a Hangul syllable or Jamo L because
2430 * they are not marked with no/maybe for NFC & NFKC (and their cc==0)
2431 */
2432 if(isNorm32HangulOrJamo(norm32)) {
2433 /*
2434 * c is a Jamo V/T:
2435 * try to compose with the previous character, Jamo V also with a following Jamo T,
2436 * and set values here right now in case we just continue with the main loop
2437 */
2438 prevCC=cc=0;
2439 reorderStartIndex=destIndex;
2440
2441 if(
2442 destIndex>0 &&
2443 _composeHangul(
2444 *(prevSrc-1), c, norm32, src, limit, (UBool)((options&_NORM_OPTIONS_COMPAT)!=0),
2445 destIndex<=destCapacity ? dest+(destIndex-1) : 0,
2446 nx)
2447 ) {
2448 prevStarter=src;
2449 continue;
2450 }
2451
2452 /* the Jamo V/T did not compose into a Hangul syllable, just append to dest */
2453 c2=0;
2454 length=1;
2455 prevStarter=prevSrc;
2456 } else {
2457 if(isNorm32Regular(norm32)) {
2458 c2=0;
2459 length=1;
2460 } else {
2461 /* c is a lead surrogate, get the real norm32 */
2462 if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
2463 ++src;
2464 length=2;
2465 norm32=_getNorm32FromSurrogatePair(norm32, c2);
2466 } else {
2467 /* c is an unpaired lead surrogate, nothing to do */
2468 c2=0;
2469 length=1;
2470 norm32=0;
2471 }
2472 }
2473
2474 /* we are looking at the character (c, c2) at [prevSrc..src[ */
2475 if(nx_contains(nx, c, c2)) {
2476 /* excluded: norm32==0 */
2477 cc=0;
2478 } else if((norm32&qcMask)==0) {
2479 cc=(uint8_t)(norm32>>_NORM_CC_SHIFT);
2480 } else {
2481 const UChar *p;
2482 uint32_t decompQCMask;
2483
2484 /*
2485 * find appropriate boundaries around this character,
2486 * decompose the source text from between the boundaries,
2487 * and recompose it
2488 *
2489 * this puts the intermediate text into the side buffer because
2490 * it might be longer than the recomposition end result,
2491 * or the destination buffer may be too short or missing
2492 *
2493 * note that destIndex may be adjusted backwards to account
2494 * for source text that passed the quick check but needed to
2495 * take part in the recomposition
2496 */
2497 decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */
2498
2499 /*
2500 * find the last true starter in [prevStarter..src[
2501 * it is either the decomposition of the current character (at prevSrc),
2502 * or prevStarter
2503 */
2504 if(_isTrueStarter(norm32, ccOrQCMask, decompQCMask)) {
2505 prevStarter=prevSrc;
2506 } else {
2507 /* adjust destIndex: back out what had been copied with qc "yes" */
2508 destIndex-=(int32_t)(prevSrc-prevStarter);
2509 }
2510
2511 /* find the next true starter in [src..limit[ - modifies src to point to the next starter */
2512 src=_findNextStarter(src, limit, qcMask, decompQCMask, minNoMaybe);
2513
2514 /* compose [prevStarter..src[ */
2515 p=_composePart(stackBuffer, buffer, bufferCapacity,
2516 length, /* output */
2517 prevStarter, src,
2518 prevCC, /* output */
2519 options, nx,
2520 pErrorCode);
2521
2522 if(p==NULL) {
2523 destIndex=0; /* an error occurred (out of memory) */
2524 break;
2525 }
2526
2527 /* append the recomposed buffer contents to the destination buffer */
2528 if((destIndex+length)<=destCapacity) {
2529 while(length>0) {
2530 dest[destIndex++]=*p++;
2531 --length;
2532 }
2533 } else {
2534 /* buffer overflow */
2535 /* keep incrementing the destIndex for preflighting */
2536 destIndex+=length;
2537 }
2538
2539 /* set the next starter */
2540 prevStarter=src;
2541
2542 continue;
2543 }
2544 }
2545
2546 /* append the single code point (c, c2) to the destination buffer */
2547 if((destIndex+length)<=destCapacity) {
2548 if(cc!=0 && cc<prevCC) {
2549 /* (c, c2) is out of order with respect to the preceding text */
2550 UChar *reorderSplit=dest+destIndex;
2551 destIndex+=length;
2552 prevCC=_insertOrdered(dest+reorderStartIndex, reorderSplit, dest+destIndex, c, c2, cc);
2553 } else {
2554 /* just append (c, c2) */
2555 dest[destIndex++]=c;
2556 if(c2!=0) {
2557 dest[destIndex++]=c2;
2558 }
2559 prevCC=cc;
2560 }
2561 } else {
2562 /* buffer overflow */
2563 /* keep incrementing the destIndex for preflighting */
2564 destIndex+=length;
2565 prevCC=cc;
2566 }
2567 }
2568
2569 /* cleanup */
2570 if(buffer!=stackBuffer) {
2571 uprv_free(buffer);
2572 }
2573
2574 return destIndex;
2575}
2576
2577U_CAPI int32_t U_EXPORT2
2578unorm_compose(UChar *dest, int32_t destCapacity,
2579 const UChar *src, int32_t srcLength,
2580 UBool compat, int32_t options,
2581 UErrorCode *pErrorCode) {
2582 const UnicodeSet *nx;
2583 int32_t destIndex;
2584
2585 if(!_haveData(*pErrorCode)) {
2586 return 0;
2587 }
2588
2589 nx=getNX(options, *pErrorCode);
2590 if(U_FAILURE(*pErrorCode)) {
2591 return 0;
2592 }
2593
2594 /* reset options bits that should only be set here or inside _compose() */
2595 options&=~(_NORM_OPTIONS_SETS_MASK|_NORM_OPTIONS_COMPAT|_NORM_OPTIONS_COMPOSE_CONTIGUOUS);
2596
2597 if(compat) {
2598 options|=_NORM_OPTIONS_COMPAT;
2599 }
2600
2601 destIndex=_compose(dest, destCapacity,
2602 src, srcLength,
2603 options, nx,
2604 pErrorCode);
2605
2606 return u_terminateUChars(dest, destCapacity, destIndex, pErrorCode);
2607}
2608
2609/* make FCD ----------------------------------------------------------------- */
2610
2611static const UChar *
2612_findSafeFCD(const UChar *src, const UChar *limit, uint16_t fcd16) {
2613 UChar c, c2;
2614
2615 /*
2616 * find the first position in [src..limit[ after some cc==0 according to FCD data
2617 *
2618 * at the beginning of the loop, we have fcd16 from before src
2619 *
2620 * stop at positions:
2621 * - after trail cc==0
2622 * - at the end of the source
2623 * - before lead cc==0
2624 */
2625 for(;;) {
2626 /* stop if trail cc==0 for the previous character */
2627 if((fcd16&0xff)==0) {
2628 break;
2629 }
2630
2631 /* get c=*src - stop at end of string */
2632 if(src==limit) {
2633 break;
2634 }
2635 c=*src;
2636
2637 /* stop if lead cc==0 for this character */
2638 if(c<_NORM_MIN_WITH_LEAD_CC || (fcd16=_getFCD16(c))==0) {
2639 break; /* catches terminating NUL, too */
2640 }
2641
2642 if(!UTF_IS_FIRST_SURROGATE(c)) {
2643 if(fcd16<=0xff) {
2644 break;
2645 }
2646 ++src;
2647 } else if((src+1)!=limit && (c2=*(src+1), UTF_IS_SECOND_SURROGATE(c2))) {
2648 /* c is a lead surrogate, get the real fcd16 */
2649 fcd16=_getFCD16FromSurrogatePair(fcd16, c2);
2650 if(fcd16<=0xff) {
2651 break;
2652 }
2653 src+=2;
2654 } else {
2655 /* c is an unpaired first surrogate, lead cc==0 */
2656 break;
2657 }
2658 }
2659
2660 return src;
2661}
2662
2663static uint8_t
2664_decomposeFCD(const UChar *src, const UChar *decompLimit,
2665 UChar *dest, int32_t &destIndex, int32_t destCapacity,
2666 const UnicodeSet *nx) {
2667 const UChar *p;
2668 uint32_t norm32;
2669 int32_t reorderStartIndex, length;
2670 UChar c, c2;
2671 uint8_t cc, prevCC, trailCC;
2672
2673 /*
2674 * canonically decompose [src..decompLimit[
2675 *
2676 * all characters in this range have some non-zero cc,
2677 * directly or in decomposition,
2678 * so that we do not need to check in the following for quick-check limits etc.
2679 *
2680 * there _are_ _no_ Hangul syllables or Jamos in here because they are FCD-safe (cc==0)!
2681 *
2682 * we also do not need to check for c==0 because we have an established decompLimit
2683 */
2684 reorderStartIndex=destIndex;
2685 prevCC=0;
2686
2687 while(src<decompLimit) {
2688 c=*src++;
2689 norm32=_getNorm32(c);
2690 if(isNorm32Regular(norm32)) {
2691 c2=0;
2692 length=1;
2693 } else {
2694 /*
2695 * reminder: this function is called with [src..decompLimit[
2696 * not containing any Hangul/Jamo characters,
2697 * therefore the only specials are lead surrogates
2698 */
2699 /* c is a lead surrogate, get the real norm32 */
2700 if(src!=decompLimit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
2701 ++src;
2702 length=2;
2703 norm32=_getNorm32FromSurrogatePair(norm32, c2);
2704 } else {
2705 c2=0;
2706 length=1;
2707 norm32=0;
2708 }
2709 }
2710
2711 /* get the decomposition and the lead and trail cc's */
2712 if(nx_contains(nx, c, c2)) {
2713 /* excluded: norm32==0 */
2714 cc=trailCC=0;
2715 p=NULL;
2716 } else if((norm32&_NORM_QC_NFD)==0) {
2717 /* c does not decompose */
2718 cc=trailCC=(uint8_t)(norm32>>_NORM_CC_SHIFT);
2719 p=NULL;
2720 } else {
2721 /* c decomposes, get everything from the variable-length extra data */
2722 p=_decompose(norm32, length, cc, trailCC);
2723 if(length==1) {
2724 /* fastpath a single code unit from decomposition */
2725 c=*p;
2726 c2=0;
2727 p=NULL;
2728 }
2729 }
2730
2731 /* append the decomposition to the destination buffer, assume length>0 */
2732 if((destIndex+length)<=destCapacity) {
2733 UChar *reorderSplit=dest+destIndex;
2734 if(p==NULL) {
2735 /* fastpath: single code point */
2736 if(cc!=0 && cc<prevCC) {
2737 /* (c, c2) is out of order with respect to the preceding text */
2738 destIndex+=length;
2739 trailCC=_insertOrdered(dest+reorderStartIndex, reorderSplit, dest+destIndex, c, c2, cc);
b75a7d8f 2740 } else {
374ca955
A
2741 /* just append (c, c2) */
2742 dest[destIndex++]=c;
2743 if(c2!=0) {
2744 dest[destIndex++]=c2;
2745 }
b75a7d8f 2746 }
b75a7d8f 2747 } else {
374ca955
A
2748 /* general: multiple code points (ordered by themselves) from decomposition */
2749 if(cc!=0 && cc<prevCC) {
2750 /* the decomposition is out of order with respect to the preceding text */
2751 destIndex+=length;
2752 trailCC=_mergeOrdered(dest+reorderStartIndex, reorderSplit, p, p+length);
2753 } else {
2754 /* just append the decomposition */
2755 do {
2756 dest[destIndex++]=*p++;
2757 } while(--length>0);
2758 }
b75a7d8f 2759 }
374ca955
A
2760 } else {
2761 /* buffer overflow */
2762 /* keep incrementing the destIndex for preflighting */
2763 destIndex+=length;
2764 }
2765
2766 prevCC=trailCC;
2767 if(prevCC==0) {
2768 reorderStartIndex=destIndex;
b75a7d8f
A
2769 }
2770 }
374ca955
A
2771
2772 return prevCC;
b75a7d8f
A
2773}
2774
374ca955
A
2775static int32_t
2776unorm_makeFCD(UChar *dest, int32_t destCapacity,
2777 const UChar *src, int32_t srcLength,
2778 const UnicodeSet *nx,
2779 UErrorCode *pErrorCode) {
2780 const UChar *limit, *prevSrc, *decompStart;
2781 int32_t destIndex, length;
b75a7d8f 2782 UChar c, c2;
374ca955
A
2783 uint16_t fcd16;
2784 int16_t prevCC, cc;
b75a7d8f 2785
374ca955
A
2786 if(!_haveData(*pErrorCode)) {
2787 return 0;
b75a7d8f 2788 }
b75a7d8f 2789
374ca955
A
2790 /* initialize */
2791 decompStart=src;
2792 destIndex=0;
2793 prevCC=0;
b75a7d8f 2794
374ca955
A
2795 /* avoid compiler warnings */
2796 c=0;
2797 fcd16=0;
2798
2799 if(srcLength>=0) {
2800 /* string with length */
2801 limit=src+srcLength;
2802 } else /* srcLength==-1 */ {
2803 /* zero-terminated string */
2804 limit=NULL;
2805 }
2806
2807 U_ALIGN_CODE(16);
b75a7d8f
A
2808
2809 for(;;) {
374ca955
A
2810 /* skip a run of code units below the minimum or with irrelevant data for the FCD check */
2811 prevSrc=src;
2812 if(limit==NULL) {
2813 for(;;) {
2814 c=*src;
2815 if(c<_NORM_MIN_WITH_LEAD_CC) {
2816 if(c==0) {
2817 break;
2818 }
2819 prevCC=(int16_t)-c;
2820 } else if((fcd16=_getFCD16(c))==0) {
2821 prevCC=0;
2822 } else {
2823 break;
2824 }
2825 ++src;
2826 }
2827 } else {
2828 for(;;) {
2829 if(src==limit) {
2830 break;
2831 } else if((c=*src)<_NORM_MIN_WITH_LEAD_CC) {
2832 prevCC=(int16_t)-c;
2833 } else if((fcd16=_getFCD16(c))==0) {
2834 prevCC=0;
2835 } else {
2836 break;
2837 }
2838 ++src;
2839 }
b75a7d8f 2840 }
374ca955
A
2841
2842 /*
2843 * prevCC has values from the following ranges:
2844 * 0..0xff - the previous trail combining class
2845 * <0 - the negative value of the previous code unit;
2846 * that code unit was <_NORM_MIN_WITH_LEAD_CC and its _getFCD16()
2847 * was deferred so that average text is checked faster
2848 */
2849
2850 /* copy these code units all at once */
2851 if(src!=prevSrc) {
2852 length=(int32_t)(src-prevSrc);
2853 if((destIndex+length)<=destCapacity) {
2854 uprv_memcpy(dest+destIndex, prevSrc, length*U_SIZEOF_UCHAR);
2855 }
2856 destIndex+=length;
2857 prevSrc=src;
2858
2859 /* prevCC<0 is only possible from the above loop, i.e., only if prevSrc<src */
2860 if(prevCC<0) {
2861 /* the previous character was <_NORM_MIN_WITH_LEAD_CC, we need to get its trail cc */
2862 if(!nx_contains(nx, (UChar32)-prevCC)) {
2863 prevCC=(int16_t)(_getFCD16((UChar)-prevCC)&0xff);
2864 } else {
2865 prevCC=0; /* excluded: fcd16==0 */
2866 }
2867
2868 /*
2869 * set a pointer to this below-U+0300 character;
2870 * if prevCC==0 then it will moved to after this character below
2871 */
2872 decompStart=prevSrc-1;
2873 }
b75a7d8f 2874 }
374ca955
A
2875 /*
2876 * now:
2877 * prevSrc==src - used later to adjust destIndex before decomposition
2878 * prevCC>=0
2879 */
b75a7d8f 2880
374ca955
A
2881 /* end of source reached? */
2882 if(limit==NULL ? c==0 : src==limit) {
2883 break;
b75a7d8f
A
2884 }
2885
374ca955
A
2886 /* set a pointer to after the last source position where prevCC==0 */
2887 if(prevCC==0) {
2888 decompStart=prevSrc;
2889 }
b75a7d8f 2890
374ca955
A
2891 /* c already contains *src and fcd16 is set for it, increment src */
2892 ++src;
2893
2894 /* check one above-minimum, relevant code unit */
2895 if(UTF_IS_FIRST_SURROGATE(c)) {
2896 /* c is a lead surrogate, get the real fcd16 */
2897 if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
2898 ++src;
2899 fcd16=_getFCD16FromSurrogatePair(fcd16, c2);
2900 } else {
2901 c2=0;
2902 fcd16=0;
b75a7d8f
A
2903 }
2904 } else {
2905 c2=0;
2906 }
2907
374ca955
A
2908 /* we are looking at the character (c, c2) at [prevSrc..src[ */
2909 if(nx_contains(nx, c, c2)) {
2910 fcd16=0; /* excluded: fcd16==0 */
2911 }
b75a7d8f 2912
374ca955
A
2913 /* check the combining order, get the lead cc */
2914 cc=(int16_t)(fcd16>>8);
2915 if(cc==0 || cc>=prevCC) {
2916 /* the order is ok */
2917 if(cc==0) {
2918 decompStart=prevSrc;
2919 }
2920 prevCC=(int16_t)(fcd16&0xff);
2921
2922 /* just append (c, c2) */
2923 length= c2==0 ? 1 : 2;
2924 if((destIndex+length)<=destCapacity) {
2925 dest[destIndex++]=c;
2926 if(c2!=0) {
2927 dest[destIndex++]=c2;
2928 }
2929 } else {
2930 destIndex+=length;
b75a7d8f 2931 }
374ca955
A
2932 } else {
2933 /*
2934 * back out the part of the source that we copied already but
2935 * is now going to be decomposed;
2936 * prevSrc is set to after what was copied
2937 */
2938 destIndex-=(int32_t)(prevSrc-decompStart);
2939
2940 /*
2941 * find the part of the source that needs to be decomposed;
2942 * to be safe and simple, decompose to before the next character with lead cc==0
2943 */
2944 src=_findSafeFCD(src, limit, fcd16);
2945
2946 /*
2947 * the source text does not fulfill the conditions for FCD;
2948 * decompose and reorder a limited piece of the text
2949 */
2950 prevCC=_decomposeFCD(decompStart, src,
2951 dest, destIndex, destCapacity,
2952 nx);
2953 decompStart=src;
b75a7d8f 2954 }
b75a7d8f
A
2955 }
2956
374ca955 2957 return u_terminateUChars(dest, destCapacity, destIndex, pErrorCode);
b75a7d8f
A
2958}
2959
374ca955 2960/* quick check functions ---------------------------------------------------- */
b75a7d8f 2961
374ca955
A
2962static UBool
2963unorm_checkFCD(const UChar *src, int32_t srcLength, const UnicodeSet *nx) {
2964 const UChar *limit;
2965 UChar c, c2;
2966 uint16_t fcd16;
2967 int16_t prevCC, cc;
b75a7d8f 2968
374ca955
A
2969 /* initialize */
2970 prevCC=0;
b75a7d8f 2971
374ca955
A
2972 if(srcLength>=0) {
2973 /* string with length */
2974 limit=src+srcLength;
2975 } else /* srcLength==-1 */ {
2976 /* zero-terminated string */
2977 limit=NULL;
b75a7d8f
A
2978 }
2979
374ca955 2980 U_ALIGN_CODE(16);
b75a7d8f 2981
374ca955
A
2982 for(;;) {
2983 /* skip a run of code units below the minimum or with irrelevant data for the FCD check */
2984 if(limit==NULL) {
2985 for(;;) {
2986 c=*src++;
2987 if(c<_NORM_MIN_WITH_LEAD_CC) {
2988 if(c==0) {
2989 return TRUE;
b75a7d8f 2990 }
374ca955
A
2991 /*
2992 * delay _getFCD16(c) for any character <_NORM_MIN_WITH_LEAD_CC
2993 * because chances are good that the next one will have
2994 * a leading cc of 0;
2995 * _getFCD16(-prevCC) is later called when necessary -
2996 * -c fits into int16_t because it is <_NORM_MIN_WITH_LEAD_CC==0x300
2997 */
2998 prevCC=(int16_t)-c;
2999 } else if((fcd16=_getFCD16(c))==0) {
3000 prevCC=0;
3001 } else {
3002 break;
b75a7d8f
A
3003 }
3004 }
374ca955
A
3005 } else {
3006 for(;;) {
3007 if(src==limit) {
3008 return TRUE;
3009 } else if((c=*src++)<_NORM_MIN_WITH_LEAD_CC) {
3010 prevCC=(int16_t)-c;
3011 } else if((fcd16=_getFCD16(c))==0) {
3012 prevCC=0;
3013 } else {
3014 break;
b75a7d8f 3015 }
b75a7d8f 3016 }
374ca955
A
3017 }
3018
3019 /* check one above-minimum, relevant code unit */
3020 if(UTF_IS_FIRST_SURROGATE(c)) {
3021 /* c is a lead surrogate, get the real fcd16 */
3022 if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
3023 ++src;
3024 fcd16=_getFCD16FromSurrogatePair(fcd16, c2);
3025 } else {
3026 c2=0;
3027 fcd16=0;
b75a7d8f 3028 }
374ca955
A
3029 } else {
3030 c2=0;
b75a7d8f 3031 }
374ca955
A
3032
3033 if(nx_contains(nx, c, c2)) {
3034 prevCC=0; /* excluded: fcd16==0 */
3035 continue;
b75a7d8f 3036 }
374ca955
A
3037
3038 /*
3039 * prevCC has values from the following ranges:
3040 * 0..0xff - the previous trail combining class
3041 * <0 - the negative value of the previous code unit;
3042 * that code unit was <_NORM_MIN_WITH_LEAD_CC and its _getFCD16()
3043 * was deferred so that average text is checked faster
3044 */
3045
3046 /* check the combining order */
3047 cc=(int16_t)(fcd16>>8);
3048 if(cc!=0) {
3049 if(prevCC<0) {
3050 /* the previous character was <_NORM_MIN_WITH_LEAD_CC, we need to get its trail cc */
3051 if(!nx_contains(nx, (UChar32)-prevCC)) {
3052 prevCC=(int16_t)(_getFCD16((UChar)-prevCC)&0xff);
3053 } else {
3054 prevCC=0; /* excluded: fcd16==0 */
3055 }
3056 }
3057
3058 if(cc<prevCC) {
3059 return FALSE;
3060 }
b75a7d8f 3061 }
374ca955 3062 prevCC=(int16_t)(fcd16&0xff);
b75a7d8f 3063 }
b75a7d8f
A
3064}
3065
374ca955
A
3066static UNormalizationCheckResult
3067_quickCheck(const UChar *src,
3068 int32_t srcLength,
3069 UNormalizationMode mode,
3070 UBool allowMaybe,
3071 const UnicodeSet *nx,
3072 UErrorCode *pErrorCode) {
b75a7d8f
A
3073 UChar stackBuffer[_STACK_BUFFER_CAPACITY];
3074 UChar *buffer;
3075 int32_t bufferCapacity;
3076
374ca955
A
3077 const UChar *start, *limit;
3078 uint32_t norm32, qcNorm32, ccOrQCMask, qcMask;
3079 int32_t options;
b75a7d8f
A
3080 UChar c, c2, minNoMaybe;
3081 uint8_t cc, prevCC;
374ca955 3082 UNormalizationCheckResult result;
b75a7d8f 3083
374ca955
A
3084 /* check arguments */
3085 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3086 return UNORM_MAYBE;
3087 }
3088
3089 if(src==NULL || srcLength<-1) {
3090 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3091 return UNORM_MAYBE;
3092 }
3093
3094 if(!_haveData(*pErrorCode)) {
3095 return UNORM_MAYBE;
3096 }
3097
3098 /* check for a valid mode and set the quick check minimum and mask */
3099 switch(mode) {
3100 case UNORM_NFC:
b75a7d8f
A
3101 minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
3102 qcMask=_NORM_QC_NFC;
374ca955
A
3103 options=0;
3104 break;
3105 case UNORM_NFKC:
b75a7d8f
A
3106 minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
3107 qcMask=_NORM_QC_NFKC;
374ca955
A
3108 options=_NORM_OPTIONS_COMPAT;
3109 break;
3110 case UNORM_NFD:
3111 minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE];
3112 qcMask=_NORM_QC_NFD;
3113 options=0;
3114 break;
3115 case UNORM_NFKD:
3116 minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE];
3117 qcMask=_NORM_QC_NFKD;
3118 options=_NORM_OPTIONS_COMPAT;
3119 break;
3120 case UNORM_FCD:
73c04bcf
A
3121 if(fcdTrie.index==NULL) {
3122 *pErrorCode=U_UNSUPPORTED_ERROR;
3123 return UNORM_MAYBE;
3124 }
374ca955
A
3125 return unorm_checkFCD(src, srcLength, nx) ? UNORM_YES : UNORM_NO;
3126 default:
3127 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3128 return UNORM_MAYBE;
b75a7d8f
A
3129 }
3130
374ca955
A
3131 /* initialize */
3132 buffer=stackBuffer;
3133 bufferCapacity=_STACK_BUFFER_CAPACITY;
3134
b75a7d8f 3135 ccOrQCMask=_NORM_CC_MASK|qcMask;
374ca955 3136 result=UNORM_YES;
b75a7d8f
A
3137 prevCC=0;
3138
374ca955 3139 start=src;
b75a7d8f
A
3140 if(srcLength>=0) {
3141 /* string with length */
3142 limit=src+srcLength;
3143 } else /* srcLength==-1 */ {
3144 /* zero-terminated string */
3145 limit=NULL;
3146 }
3147
3148 U_ALIGN_CODE(16);
3149
3150 for(;;) {
374ca955 3151 /* skip a run of code units below the minimum or with irrelevant data for the quick check */
b75a7d8f 3152 if(limit==NULL) {
374ca955
A
3153 for(;;) {
3154 c=*src++;
3155 if(c<minNoMaybe) {
3156 if(c==0) {
3157 goto endloop; /* break out of outer loop */
3158 }
3159 } else if(((norm32=_getNorm32(c))&ccOrQCMask)!=0) {
3160 break;
3161 }
b75a7d8f 3162 prevCC=0;
b75a7d8f
A
3163 }
3164 } else {
374ca955
A
3165 for(;;) {
3166 if(src==limit) {
3167 goto endloop; /* break out of outer loop */
3168 } else if((c=*src++)>=minNoMaybe && ((norm32=_getNorm32(c))&ccOrQCMask)!=0) {
3169 break;
3170 }
b75a7d8f 3171 prevCC=0;
b75a7d8f
A
3172 }
3173 }
3174
374ca955
A
3175 /* check one above-minimum, relevant code unit */
3176 if(isNorm32LeadSurrogate(norm32)) {
3177 /* c is a lead surrogate, get the real norm32 */
3178 if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
3179 ++src;
3180 norm32=_getNorm32FromSurrogatePair(norm32, c2);
3181 } else {
3182 c2=0;
3183 norm32=0;
b75a7d8f 3184 }
374ca955
A
3185 } else {
3186 c2=0;
3187 }
b75a7d8f 3188
374ca955
A
3189 if(nx_contains(nx, c, c2)) {
3190 /* excluded: norm32==0 */
3191 norm32=0;
b75a7d8f
A
3192 }
3193
374ca955
A
3194 /* check the combining order */
3195 cc=(uint8_t)(norm32>>_NORM_CC_SHIFT);
3196 if(cc!=0 && cc<prevCC) {
3197 result=UNORM_NO;
b75a7d8f
A
3198 break;
3199 }
374ca955 3200 prevCC=cc;
b75a7d8f 3201
374ca955
A
3202 /* check for "no" or "maybe" quick check flags */
3203 qcNorm32=norm32&qcMask;
3204 if(qcNorm32&_NORM_QC_ANY_NO) {
3205 result=UNORM_NO;
3206 break;
3207 } else if(qcNorm32!=0) {
3208 /* "maybe" can only occur for NFC and NFKC */
3209 if(allowMaybe) {
3210 result=UNORM_MAYBE;
b75a7d8f 3211 } else {
374ca955
A
3212 /* normalize a section around here to see if it is really normalized or not */
3213 const UChar *prevStarter;
b75a7d8f 3214 uint32_t decompQCMask;
374ca955 3215 int32_t length;
b75a7d8f 3216
b75a7d8f
A
3217 decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */
3218
374ca955
A
3219 /* find the previous starter */
3220 prevStarter=src-1; /* set prevStarter to the beginning of the current character */
3221 if(UTF_IS_TRAIL(*prevStarter)) {
3222 --prevStarter; /* safe because unpaired surrogates do not result in "maybe" */
b75a7d8f 3223 }
374ca955 3224 prevStarter=_findPreviousStarter(start, prevStarter, ccOrQCMask, decompQCMask, minNoMaybe);
b75a7d8f
A
3225
3226 /* find the next true starter in [src..limit[ - modifies src to point to the next starter */
3227 src=_findNextStarter(src, limit, qcMask, decompQCMask, minNoMaybe);
3228
374ca955
A
3229 /* decompose and recompose [prevStarter..src[ */
3230 _composePart(stackBuffer, buffer, bufferCapacity,
3231 length,
3232 prevStarter,
3233 src,
3234 prevCC,
3235 options, nx, pErrorCode);
3236 if(U_FAILURE(*pErrorCode)) {
3237 result=UNORM_MAYBE; /* error (out of memory) */
b75a7d8f
A
3238 break;
3239 }
3240
374ca955
A
3241 /* compare the normalized version with the original */
3242 if(0!=uprv_strCompare(prevStarter, (int32_t)(src-prevStarter), buffer, length, FALSE, FALSE)) {
3243 result=UNORM_NO; /* normalization differs */
3244 break;
b75a7d8f
A
3245 }
3246
374ca955 3247 /* continue after the next starter */
b75a7d8f 3248 }
b75a7d8f
A
3249 }
3250 }
374ca955 3251endloop:
b75a7d8f 3252
b75a7d8f
A
3253 if(buffer!=stackBuffer) {
3254 uprv_free(buffer);
3255 }
3256
374ca955
A
3257 return result;
3258}
3259
3260U_CAPI UNormalizationCheckResult U_EXPORT2
3261unorm_quickCheck(const UChar *src,
3262 int32_t srcLength,
3263 UNormalizationMode mode,
3264 UErrorCode *pErrorCode) {
3265 return _quickCheck(src, srcLength, mode, TRUE, NULL, pErrorCode);
b75a7d8f
A
3266}
3267
374ca955
A
3268U_CAPI UNormalizationCheckResult U_EXPORT2
3269unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength,
3270 UNormalizationMode mode, int32_t options,
3271 UErrorCode *pErrorCode) {
3272 return _quickCheck(src, srcLength, mode, TRUE, getNX(options, *pErrorCode), pErrorCode);
3273}
b75a7d8f 3274
374ca955
A
3275U_CFUNC UNormalizationCheckResult
3276unorm_internalQuickCheck(const UChar *src,
3277 int32_t srcLength,
3278 UNormalizationMode mode,
3279 UBool allowMaybe,
3280 const UnicodeSet *nx,
3281 UErrorCode *pErrorCode) {
3282 return _quickCheck(src, srcLength, mode, allowMaybe, nx, pErrorCode);
3283}
b75a7d8f 3284
374ca955
A
3285U_CAPI UBool U_EXPORT2
3286unorm_isNormalized(const UChar *src, int32_t srcLength,
3287 UNormalizationMode mode,
3288 UErrorCode *pErrorCode) {
3289 return (UBool)(UNORM_YES==_quickCheck(src, srcLength, mode, FALSE, NULL, pErrorCode));
3290}
b75a7d8f 3291
374ca955
A
3292U_CAPI UBool U_EXPORT2
3293unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength,
3294 UNormalizationMode mode, int32_t options,
3295 UErrorCode *pErrorCode) {
3296 return (UBool)(UNORM_YES==_quickCheck(src, srcLength, mode, FALSE, getNX(options, *pErrorCode), pErrorCode));
b75a7d8f
A
3297}
3298
3299/* normalize() API ---------------------------------------------------------- */
3300
3301/**
3302 * Internal API for normalizing.
3303 * Does not check for bad input.
3304 * Requires _haveData() to be true.
3305 * @internal
3306 */
374ca955
A
3307U_CFUNC int32_t
3308unorm_internalNormalizeWithNX(UChar *dest, int32_t destCapacity,
3309 const UChar *src, int32_t srcLength,
3310 UNormalizationMode mode, int32_t options, const UnicodeSet *nx,
3311 UErrorCode *pErrorCode) {
b75a7d8f
A
3312 int32_t destLength;
3313 uint8_t trailCC;
3314
3315 switch(mode) {
3316 case UNORM_NFD:
3317 destLength=_decompose(dest, destCapacity,
3318 src, srcLength,
3319 FALSE, nx, trailCC);
3320 break;
3321 case UNORM_NFKD:
3322 destLength=_decompose(dest, destCapacity,
3323 src, srcLength,
3324 TRUE, nx, trailCC);
3325 break;
3326 case UNORM_NFC:
3327 destLength=_compose(dest, destCapacity,
3328 src, srcLength,
374ca955 3329 options, nx, pErrorCode);
b75a7d8f
A
3330 break;
3331 case UNORM_NFKC:
3332 destLength=_compose(dest, destCapacity,
3333 src, srcLength,
374ca955 3334 options|_NORM_OPTIONS_COMPAT, nx, pErrorCode);
b75a7d8f
A
3335 break;
3336 case UNORM_FCD:
73c04bcf
A
3337 if(fcdTrie.index==NULL) {
3338 *pErrorCode=U_UNSUPPORTED_ERROR;
3339 return 0;
3340 }
b75a7d8f
A
3341 return unorm_makeFCD(dest, destCapacity,
3342 src, srcLength,
3343 nx,
3344 pErrorCode);
374ca955
A
3345#if 0
3346 case UNORM_FCC:
3347 destLength=_compose(dest, destCapacity,
3348 src, srcLength,
3349 options|_NORM_OPTIONS_COMPOSE_CONTIGUOUS, nx, pErrorCode);
3350 break;
3351#endif
b75a7d8f
A
3352 case UNORM_NONE:
3353 /* just copy the string */
3354 if(srcLength==-1) {
3355 srcLength=u_strlen(src);
3356 }
3357 if(srcLength>0 && srcLength<=destCapacity) {
3358 uprv_memcpy(dest, src, srcLength*U_SIZEOF_UCHAR);
3359 }
3360 destLength=srcLength;
3361 break;
3362 default:
3363 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3364 return 0;
3365 }
3366
3367 return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
3368}
3369
3370/**
3371 * Internal API for normalizing.
3372 * Does not check for bad input.
3373 * @internal
3374 */
3375U_CAPI int32_t U_EXPORT2
3376unorm_internalNormalize(UChar *dest, int32_t destCapacity,
3377 const UChar *src, int32_t srcLength,
3378 UNormalizationMode mode, int32_t options,
3379 UErrorCode *pErrorCode) {
3380 const UnicodeSet *nx;
3381
3382 if(!_haveData(*pErrorCode)) {
3383 return 0;
3384 }
3385
3386 nx=getNX(options, *pErrorCode);
3387 if(U_FAILURE(*pErrorCode)) {
3388 return 0;
3389 }
3390
374ca955
A
3391 /* reset options bits that should only be set inside unorm_internalNormalizeWithNX() */
3392 options&=~(_NORM_OPTIONS_SETS_MASK|_NORM_OPTIONS_COMPAT|_NORM_OPTIONS_COMPOSE_CONTIGUOUS);
3393
3394 return unorm_internalNormalizeWithNX(dest, destCapacity,
3395 src, srcLength,
3396 mode, options, nx,
3397 pErrorCode);
b75a7d8f
A
3398}
3399
3400/** Public API for normalizing. */
3401U_CAPI int32_t U_EXPORT2
3402unorm_normalize(const UChar *src, int32_t srcLength,
3403 UNormalizationMode mode, int32_t options,
3404 UChar *dest, int32_t destCapacity,
3405 UErrorCode *pErrorCode) {
3406 /* check argument values */
3407 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3408 return 0;
3409 }
3410
3411 if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
3412 src==NULL || srcLength<-1
3413 ) {
3414 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3415 return 0;
3416 }
3417
3418 /* check for overlapping src and destination */
3419 if( dest!=NULL &&
3420 ((src>=dest && src<(dest+destCapacity)) ||
3421 (srcLength>0 && dest>=src && dest<(src+srcLength)))
3422 ) {
3423 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3424 return 0;
3425 }
3426
3427 return unorm_internalNormalize(dest, destCapacity,
3428 src, srcLength,
3429 mode, options,
3430 pErrorCode);
3431}
3432
3433
3434/* iteration functions ------------------------------------------------------ */
3435
3436/*
3437 * These iteration functions are the core implementations of the
3438 * Normalizer class iteration API.
3439 * They read from a UCharIterator into their own buffer
3440 * and normalize into the Normalizer iteration buffer.
3441 * Normalizer itself then iterates over its buffer until that needs to be
3442 * filled again.
3443 */
3444
3445/*
3446 * ### TODO:
3447 * Now that UCharIterator.next/previous return (int32_t)-1 not (UChar)0xffff
3448 * if iteration bounds are reached,
3449 * try to not call hasNext/hasPrevious and instead check for >=0.
3450 */
3451
3452/* backward iteration ------------------------------------------------------- */
3453
3454/*
3455 * read backwards and get norm32
3456 * return 0 if the character is <minC
3457 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3458 */
3459static inline uint32_t
3460_getPrevNorm32(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2) {
3461 uint32_t norm32;
3462
3463 /* need src.hasPrevious() */
3464 c=(UChar)src.previous(&src);
3465 c2=0;
3466
3467 /* check for a surrogate before getting norm32 to see if we need to predecrement further */
3468 if(c<minC) {
3469 return 0;
3470 } else if(!UTF_IS_SURROGATE(c)) {
3471 return _getNorm32(c);
3472 } else if(UTF_IS_SURROGATE_FIRST(c) || !src.hasPrevious(&src)) {
3473 /* unpaired surrogate */
3474 return 0;
3475 } else if(UTF_IS_FIRST_SURROGATE(c2=(UChar)src.previous(&src))) {
3476 norm32=_getNorm32(c2);
3477 if((norm32&mask)==0) {
3478 /* all surrogate pairs with this lead surrogate have irrelevant data */
3479 return 0;
3480 } else {
3481 /* norm32 must be a surrogate special */
3482 return _getNorm32FromSurrogatePair(norm32, c);
3483 }
3484 } else {
3485 /* unpaired second surrogate, undo the c2=src.previous() movement */
3486 src.move(&src, 1, UITER_CURRENT);
3487 c2=0;
3488 return 0;
3489 }
3490}
3491
3492/*
3493 * read backwards and check if the character is a previous-iteration boundary
3494 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3495 */
3496typedef UBool
3497IsPrevBoundaryFn(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2);
3498
3499/*
3500 * for NF*D:
3501 * read backwards and check if the lead combining class is 0
3502 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3503 */
3504static UBool
3505_isPrevNFDSafe(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
3506 return _isNFDSafe(_getPrevNorm32(src, minC, ccOrQCMask, c, c2), ccOrQCMask, ccOrQCMask&_NORM_QC_MASK);
3507}
3508
3509/*
3510 * read backwards and check if the character is (or its decomposition begins with)
3511 * a "true starter" (cc==0 and NF*C_YES)
3512 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3513 */
3514static UBool
3515_isPrevTrueStarter(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
3516 uint32_t norm32, decompQCMask;
3517
3518 decompQCMask=(ccOrQCMask<<2)&0xf; /* decomposition quick check mask */
3519 norm32=_getPrevNorm32(src, minC, ccOrQCMask|decompQCMask, c, c2);
3520 return _isTrueStarter(norm32, ccOrQCMask, decompQCMask);
3521}
3522
3523static int32_t
3524_findPreviousIterationBoundary(UCharIterator &src,
3525 IsPrevBoundaryFn *isPrevBoundary, uint32_t minC, uint32_t mask,
3526 UChar *&buffer, int32_t &bufferCapacity,
3527 int32_t &startIndex,
3528 UErrorCode *pErrorCode) {
3529 UChar *stackBuffer;
3530 UChar c, c2;
3531 UBool isBoundary;
3532
3533 /* initialize */
3534 stackBuffer=buffer;
3535 startIndex=bufferCapacity; /* fill the buffer from the end backwards */
3536
3537 while(src.hasPrevious(&src)) {
3538 isBoundary=isPrevBoundary(src, minC, mask, c, c2);
3539
3540 /* always write this character to the front of the buffer */
3541 /* make sure there is enough space in the buffer */
3542 if(startIndex < (c2==0 ? 1 : 2)) {
3543 int32_t bufferLength=bufferCapacity;
3544
3545 if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*bufferCapacity, bufferLength)) {
3546 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
3547 src.move(&src, 0, UITER_START);
3548 return 0;
3549 }
3550
3551 /* move the current buffer contents up */
3552 uprv_memmove(buffer+(bufferCapacity-bufferLength), buffer, bufferLength*U_SIZEOF_UCHAR);
3553 startIndex+=bufferCapacity-bufferLength;
3554 }
3555
3556 buffer[--startIndex]=c;
3557 if(c2!=0) {
3558 buffer[--startIndex]=c2;
3559 }
3560
3561 /* stop if this just-copied character is a boundary */
3562 if(isBoundary) {
3563 break;
3564 }
3565 }
3566
3567 /* return the length of the buffer contents */
3568 return bufferCapacity-startIndex;
3569}
3570
3571U_CAPI int32_t U_EXPORT2
3572unorm_previous(UCharIterator *src,
3573 UChar *dest, int32_t destCapacity,
3574 UNormalizationMode mode, int32_t options,
3575 UBool doNormalize, UBool *pNeededToNormalize,
3576 UErrorCode *pErrorCode) {
3577 UChar stackBuffer[100];
3578 UChar *buffer=NULL;
3579 IsPrevBoundaryFn *isPreviousBoundary=NULL;
3580 uint32_t mask=0;
3581 int32_t startIndex=0, bufferLength=0, bufferCapacity=0, destLength=0;
3582 int32_t c=0, c2=0;
3583 UChar minC=0;
3584
3585 /* check argument values */
3586 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3587 return 0;
3588 }
3589
3590 if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
3591 src==NULL
3592 ) {
3593 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3594 return 0;
3595 }
3596
3597 if(!_haveData(*pErrorCode)) {
3598 return 0;
3599 }
3600
3601 if(pNeededToNormalize!=NULL) {
3602 *pNeededToNormalize=FALSE;
3603 }
3604
3605 switch(mode) {
b75a7d8f 3606 case UNORM_FCD:
73c04bcf
A
3607 if(fcdTrie.index==NULL) {
3608 *pErrorCode=U_UNSUPPORTED_ERROR;
3609 return 0;
3610 }
3611 /* fall through to NFD */
3612 case UNORM_NFD:
b75a7d8f
A
3613 isPreviousBoundary=_isPrevNFDSafe;
3614 minC=_NORM_MIN_WITH_LEAD_CC;
3615 mask=_NORM_CC_MASK|_NORM_QC_NFD;
3616 break;
3617 case UNORM_NFKD:
3618 isPreviousBoundary=_isPrevNFDSafe;
3619 minC=_NORM_MIN_WITH_LEAD_CC;
3620 mask=_NORM_CC_MASK|_NORM_QC_NFKD;
3621 break;
3622 case UNORM_NFC:
3623 isPreviousBoundary=_isPrevTrueStarter;
3624 minC=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
3625 mask=_NORM_CC_MASK|_NORM_QC_NFC;
3626 break;
3627 case UNORM_NFKC:
3628 isPreviousBoundary=_isPrevTrueStarter;
3629 minC=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
3630 mask=_NORM_CC_MASK|_NORM_QC_NFKC;
3631 break;
3632 case UNORM_NONE:
3633 destLength=0;
3634 if((c=src->previous(src))>=0) {
3635 destLength=1;
3636 if(UTF_IS_TRAIL(c) && (c2=src->previous(src))>=0) {
3637 if(UTF_IS_LEAD(c2)) {
3638 if(destCapacity>=2) {
3639 dest[1]=(UChar)c; /* trail surrogate */
3640 destLength=2;
3641 }
3642 c=c2; /* lead surrogate to be written below */
3643 } else {
3644 src->move(src, 1, UITER_CURRENT);
3645 }
3646 }
3647
3648 if(destCapacity>0) {
3649 dest[0]=(UChar)c;
3650 }
3651 }
3652 return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
3653 default:
3654 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3655 return 0;
3656 }
3657
3658 buffer=stackBuffer;
3659 bufferCapacity=(int32_t)(sizeof(stackBuffer)/U_SIZEOF_UCHAR);
3660 bufferLength=_findPreviousIterationBoundary(*src,
3661 isPreviousBoundary, minC, mask,
3662 buffer, bufferCapacity,
3663 startIndex,
3664 pErrorCode);
3665 if(bufferLength>0) {
3666 if(doNormalize) {
3667 destLength=unorm_internalNormalize(dest, destCapacity,
3668 buffer+startIndex, bufferLength,
3669 mode, options,
3670 pErrorCode);
3671 if(pNeededToNormalize!=0 && U_SUCCESS(*pErrorCode)) {
3672 *pNeededToNormalize=
3673 (UBool)(destLength!=bufferLength ||
3674 0!=uprv_memcmp(dest, buffer+startIndex, destLength*U_SIZEOF_UCHAR));
3675 }
3676 } else {
3677 /* just copy the source characters */
3678 if(destCapacity>0) {
3679 uprv_memcpy(dest, buffer+startIndex, uprv_min(bufferLength, destCapacity)*U_SIZEOF_UCHAR);
3680 }
3681 destLength=u_terminateUChars(dest, destCapacity, bufferLength, pErrorCode);
3682 }
3683 } else {
3684 destLength=u_terminateUChars(dest, destCapacity, 0, pErrorCode);
3685 }
3686
3687 /* cleanup */
3688 if(buffer!=stackBuffer) {
3689 uprv_free(buffer);
3690 }
3691
3692 return destLength;
3693}
3694
3695/* forward iteration -------------------------------------------------------- */
3696
3697/*
3698 * read forward and get norm32
3699 * return 0 if the character is <minC
3700 * if c2!=0 then (c2, c) is a surrogate pair
3701 * always reads complete characters
3702 */
3703static inline uint32_t
3704_getNextNorm32(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2) {
3705 uint32_t norm32;
3706
3707 /* need src.hasNext() to be true */
3708 c=(UChar)src.next(&src);
3709 c2=0;
3710
3711 if(c<minC) {
3712 return 0;
3713 }
3714
3715 norm32=_getNorm32(c);
3716 if(UTF_IS_FIRST_SURROGATE(c)) {
3717 if(src.hasNext(&src) && UTF_IS_SECOND_SURROGATE(c2=(UChar)src.current(&src))) {
3718 src.move(&src, 1, UITER_CURRENT); /* skip the c2 surrogate */
3719 if((norm32&mask)==0) {
3720 /* irrelevant data */
3721 return 0;
3722 } else {
3723 /* norm32 must be a surrogate special */
3724 return _getNorm32FromSurrogatePair(norm32, c2);
3725 }
3726 } else {
3727 /* unmatched surrogate */
3728 c2=0;
3729 return 0;
3730 }
3731 }
3732 return norm32;
3733}
3734
3735/*
3736 * read forward and check if the character is a next-iteration boundary
3737 * if c2!=0 then (c, c2) is a surrogate pair
3738 */
3739typedef UBool
3740IsNextBoundaryFn(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2);
3741
3742/*
3743 * for NF*D:
3744 * read forward and check if the lead combining class is 0
3745 * if c2!=0 then (c, c2) is a surrogate pair
3746 */
3747static UBool
3748_isNextNFDSafe(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
3749 return _isNFDSafe(_getNextNorm32(src, minC, ccOrQCMask, c, c2), ccOrQCMask, ccOrQCMask&_NORM_QC_MASK);
3750}
3751
3752/*
3753 * for NF*C:
3754 * read forward and check if the character is (or its decomposition begins with)
3755 * a "true starter" (cc==0 and NF*C_YES)
3756 * if c2!=0 then (c, c2) is a surrogate pair
3757 */
3758static UBool
3759_isNextTrueStarter(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
3760 uint32_t norm32, decompQCMask;
3761
3762 decompQCMask=(ccOrQCMask<<2)&0xf; /* decomposition quick check mask */
3763 norm32=_getNextNorm32(src, minC, ccOrQCMask|decompQCMask, c, c2);
3764 return _isTrueStarter(norm32, ccOrQCMask, decompQCMask);
3765}
3766
3767static int32_t
3768_findNextIterationBoundary(UCharIterator &src,
3769 IsNextBoundaryFn *isNextBoundary, uint32_t minC, uint32_t mask,
3770 UChar *&buffer, int32_t &bufferCapacity,
3771 UErrorCode *pErrorCode) {
3772 UChar *stackBuffer;
3773 int32_t bufferIndex;
3774 UChar c, c2;
3775
3776 if(!src.hasNext(&src)) {
3777 return 0;
3778 }
3779
3780 /* initialize */
3781 stackBuffer=buffer;
3782
3783 /* get one character and ignore its properties */
3784 buffer[0]=c=(UChar)src.next(&src);
3785 bufferIndex=1;
3786 if(UTF_IS_FIRST_SURROGATE(c) && src.hasNext(&src)) {
3787 if(UTF_IS_SECOND_SURROGATE(c2=(UChar)src.next(&src))) {
3788 buffer[bufferIndex++]=c2;
3789 } else {
3790 src.move(&src, -1, UITER_CURRENT); /* back out the non-trail-surrogate */
3791 }
3792 }
3793
3794 /* get all following characters until we see a boundary */
3795 /* checking hasNext() instead of c!=DONE on the off-chance that U+ffff is part of the string */
3796 while(src.hasNext(&src)) {
3797 if(isNextBoundary(src, minC, mask, c, c2)) {
3798 /* back out the latest movement to stop at the boundary */
3799 src.move(&src, c2==0 ? -1 : -2, UITER_CURRENT);
3800 break;
3801 } else {
3802 if(bufferIndex+(c2==0 ? 1 : 2)<=bufferCapacity ||
3803 /* attempt to grow the buffer */
3804 u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity,
3805 2*bufferCapacity,
3806 bufferIndex)
3807 ) {
3808 buffer[bufferIndex++]=c;
3809 if(c2!=0) {
3810 buffer[bufferIndex++]=c2;
3811 }
3812 } else {
3813 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
3814 src.move(&src, 0, UITER_LIMIT);
3815 return 0;
3816 }
3817 }
3818 }
3819
3820 /* return the length of the buffer contents */
3821 return bufferIndex;
3822}
3823
3824U_CAPI int32_t U_EXPORT2
3825unorm_next(UCharIterator *src,
3826 UChar *dest, int32_t destCapacity,
3827 UNormalizationMode mode, int32_t options,
3828 UBool doNormalize, UBool *pNeededToNormalize,
3829 UErrorCode *pErrorCode) {
3830 UChar stackBuffer[100];
3831 UChar *buffer;
3832 IsNextBoundaryFn *isNextBoundary;
3833 uint32_t mask;
3834 int32_t bufferLength, bufferCapacity, destLength;
3835 int32_t c, c2;
3836 UChar minC;
3837
3838 /* check argument values */
3839 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3840 return 0;
3841 }
3842
3843 if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
3844 src==NULL
3845 ) {
3846 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3847 return 0;
3848 }
3849
3850 if(!_haveData(*pErrorCode)) {
3851 return 0;
3852 }
3853
3854 if(pNeededToNormalize!=NULL) {
3855 *pNeededToNormalize=FALSE;
3856 }
3857
3858 switch(mode) {
b75a7d8f 3859 case UNORM_FCD:
73c04bcf
A
3860 if(fcdTrie.index==NULL) {
3861 *pErrorCode=U_UNSUPPORTED_ERROR;
3862 return 0;
3863 }
3864 /* fall through to NFD */
3865 case UNORM_NFD:
b75a7d8f
A
3866 isNextBoundary=_isNextNFDSafe;
3867 minC=_NORM_MIN_WITH_LEAD_CC;
3868 mask=_NORM_CC_MASK|_NORM_QC_NFD;
3869 break;
3870 case UNORM_NFKD:
3871 isNextBoundary=_isNextNFDSafe;
3872 minC=_NORM_MIN_WITH_LEAD_CC;
3873 mask=_NORM_CC_MASK|_NORM_QC_NFKD;
3874 break;
3875 case UNORM_NFC:
3876 isNextBoundary=_isNextTrueStarter;
3877 minC=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
3878 mask=_NORM_CC_MASK|_NORM_QC_NFC;
3879 break;
3880 case UNORM_NFKC:
3881 isNextBoundary=_isNextTrueStarter;
3882 minC=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
3883 mask=_NORM_CC_MASK|_NORM_QC_NFKC;
3884 break;
3885 case UNORM_NONE:
3886 destLength=0;
3887 if((c=src->next(src))>=0) {
3888 destLength=1;
3889 if(UTF_IS_LEAD(c) && (c2=src->next(src))>=0) {
3890 if(UTF_IS_TRAIL(c2)) {
3891 if(destCapacity>=2) {
3892 dest[1]=(UChar)c2; /* trail surrogate */
3893 destLength=2;
3894 }
3895 /* lead surrogate to be written below */
3896 } else {
3897 src->move(src, -1, UITER_CURRENT);
3898 }
3899 }
3900
3901 if(destCapacity>0) {
3902 dest[0]=(UChar)c;
3903 }
3904 }
3905 return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
3906 default:
3907 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3908 return 0;
3909 }
3910
3911 buffer=stackBuffer;
3912 bufferCapacity=(int32_t)(sizeof(stackBuffer)/U_SIZEOF_UCHAR);
3913 bufferLength=_findNextIterationBoundary(*src,
3914 isNextBoundary, minC, mask,
3915 buffer, bufferCapacity,
3916 pErrorCode);
3917 if(bufferLength>0) {
3918 if(doNormalize) {
3919 destLength=unorm_internalNormalize(dest, destCapacity,
3920 buffer, bufferLength,
3921 mode, options,
3922 pErrorCode);
3923 if(pNeededToNormalize!=0 && U_SUCCESS(*pErrorCode)) {
3924 *pNeededToNormalize=
3925 (UBool)(destLength!=bufferLength ||
3926 0!=uprv_memcmp(dest, buffer, destLength*U_SIZEOF_UCHAR));
3927 }
3928 } else {
3929 /* just copy the source characters */
3930 if(destCapacity>0) {
3931 uprv_memcpy(dest, buffer, uprv_min(bufferLength, destCapacity)*U_SIZEOF_UCHAR);
3932 }
3933 destLength=u_terminateUChars(dest, destCapacity, bufferLength, pErrorCode);
3934 }
3935 } else {
3936 destLength=u_terminateUChars(dest, destCapacity, 0, pErrorCode);
3937 }
3938
3939 /* cleanup */
3940 if(buffer!=stackBuffer) {
3941 uprv_free(buffer);
3942 }
3943
3944 return destLength;
3945}
3946
3947/*
3948 * ### TODO: check if NF*D and FCD iteration finds optimal boundaries
3949 * and if not, how hard it would be to improve it.
3950 * For example, see _findSafeFCD().
3951 */
3952
3953/* Concatenation of normalized strings -------------------------------------- */
3954
3955U_CAPI int32_t U_EXPORT2
3956unorm_concatenate(const UChar *left, int32_t leftLength,
3957 const UChar *right, int32_t rightLength,
3958 UChar *dest, int32_t destCapacity,
3959 UNormalizationMode mode, int32_t options,
3960 UErrorCode *pErrorCode) {
3961 UChar stackBuffer[100];
3962 UChar *buffer;
3963 int32_t bufferLength, bufferCapacity;
3964
3965 UCharIterator iter;
3966 int32_t leftBoundary, rightBoundary, destLength;
3967
3968 /* check argument values */
3969 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3970 return 0;
3971 }
3972
3973 if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
3974 left==NULL || leftLength<-1 ||
3975 right==NULL || rightLength<-1
3976 ) {
3977 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3978 return 0;
3979 }
3980
3981 /* check for overlapping right and destination */
3982 if( dest!=NULL &&
3983 ((right>=dest && right<(dest+destCapacity)) ||
3984 (rightLength>0 && dest>=right && dest<(right+rightLength)))
3985 ) {
3986 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3987 return 0;
3988 }
3989
3990 /* allow left==dest */
3991
3992 /* set up intermediate buffer */
3993 buffer=stackBuffer;
3994 bufferCapacity=(int32_t)(sizeof(stackBuffer)/U_SIZEOF_UCHAR);
3995
3996 /*
3997 * Input: left[0..leftLength[ + right[0..rightLength[
3998 *
3999 * Find normalization-safe boundaries leftBoundary and rightBoundary
4000 * and copy the end parts together:
4001 * buffer=left[leftBoundary..leftLength[ + right[0..rightBoundary[
4002 *
4003 * dest=left[0..leftBoundary[ +
4004 * normalize(buffer) +
4005 * right[rightBoundary..rightLength[
4006 */
4007
4008 /*
4009 * find a normalization boundary at the end of the left string
4010 * and copy the end part into the buffer
4011 */
4012 uiter_setString(&iter, left, leftLength);
4013 iter.index=leftLength=iter.length; /* end of left string */
4014
4015 bufferLength=unorm_previous(&iter, buffer, bufferCapacity,
4016 mode, options,
4017 FALSE, NULL,
4018 pErrorCode);
4019 leftBoundary=iter.index;
4020 if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
4021 *pErrorCode=U_ZERO_ERROR;
4022 if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*bufferLength, 0)) {
4023 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
4024 /* dont need to cleanup here since
4025 * u_growBufferFromStatic frees buffer if(buffer!=stackBuffer)
4026 */
4027 return 0;
4028 }
4029
4030 /* just copy from the left string: we know the boundary already */
4031 uprv_memcpy(buffer, left+leftBoundary, bufferLength*U_SIZEOF_UCHAR);
4032 }
4033
4034 /*
4035 * find a normalization boundary at the beginning of the right string
4036 * and concatenate the beginning part to the buffer
4037 */
4038 uiter_setString(&iter, right, rightLength);
4039 rightLength=iter.length; /* in case it was -1 */
4040
4041 rightBoundary=unorm_next(&iter, buffer+bufferLength, bufferCapacity-bufferLength,
4042 mode, options,
4043 FALSE, NULL,
4044 pErrorCode);
4045 if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
4046 *pErrorCode=U_ZERO_ERROR;
4047 if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, bufferLength+rightBoundary, 0)) {
4048 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
4049 /* dont need to cleanup here since
4050 * u_growBufferFromStatic frees buffer if(buffer!=stackBuffer)
4051 */
4052 return 0;
4053 }
4054
4055 /* just copy from the right string: we know the boundary already */
4056 uprv_memcpy(buffer+bufferLength, right, rightBoundary*U_SIZEOF_UCHAR);
4057 }
4058
4059 bufferLength+=rightBoundary;
4060
4061 /* copy left[0..leftBoundary[ to dest */
4062 if(left!=dest && leftBoundary>0 && destCapacity>0) {
4063 uprv_memcpy(dest, left, uprv_min(leftBoundary, destCapacity)*U_SIZEOF_UCHAR);
4064 }
4065 destLength=leftBoundary;
4066
4067 /* concatenate the normalization of the buffer to dest */
4068 if(destCapacity>destLength) {
4069 destLength+=unorm_internalNormalize(dest+destLength, destCapacity-destLength,
4070 buffer, bufferLength,
4071 mode, options,
4072 pErrorCode);
4073 } else {
4074 destLength+=unorm_internalNormalize(NULL, 0,
4075 buffer, bufferLength,
4076 mode, options,
4077 pErrorCode);
4078 }
4079 /*
4080 * only errorCode that is expected is a U_BUFFER_OVERFLOW_ERROR
4081 * so we dont check for the error code here..just let it pass through
4082 */
4083 /* concatenate right[rightBoundary..rightLength[ to dest */
4084 right+=rightBoundary;
4085 rightLength-=rightBoundary;
4086 if(rightLength>0 && destCapacity>destLength) {
4087 uprv_memcpy(dest+destLength, right, uprv_min(rightLength, destCapacity-destLength)*U_SIZEOF_UCHAR);
4088 }
4089 destLength+=rightLength;
4090
4091 /* cleanup */
4092 if(buffer!=stackBuffer) {
4093 uprv_free(buffer);
4094 }
4095
4096 return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
4097}
4098
b75a7d8f 4099#endif /* #if !UCONFIG_NO_NORMALIZATION */