]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/unorm.cpp
ICU-6.2.22.tar.gz
[apple/icu.git] / icuSources / common / unorm.cpp
1 /*
2 ******************************************************************************
3 * Copyright (c) 1996-2004, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 ******************************************************************************
6 * File unorm.cpp
7 *
8 * Created by: Vladimir Weinstein 12052000
9 *
10 * Modification history :
11 *
12 * Date Name Description
13 * 02/01/01 synwee Added normalization quickcheck enum and method.
14 * 02/12/01 synwee Commented out quickcheck util api has been approved
15 * Added private method for doing FCD checks
16 * 02/23/01 synwee Modified quickcheck and checkFCE to run through
17 * string for codepoints < 0x300 for the normalization
18 * mode NFC.
19 * 05/25/01+ Markus Scherer total rewrite, implement all normalization here
20 * instead of just wrappers around normlzr.cpp,
21 * load unorm.dat, support Unicode 3.1 with
22 * supplementary code points, etc.
23 */
24
25 #include "unicode/utypes.h"
26
27 #if !UCONFIG_NO_NORMALIZATION
28
29 #include "unicode/udata.h"
30 #include "unicode/uchar.h"
31 #include "unicode/ustring.h"
32 #include "unicode/uiter.h"
33 #include "unicode/uniset.h"
34 #include "unicode/usetiter.h"
35 #include "unicode/unorm.h"
36 #include "ucln_cmn.h"
37 #include "unormimp.h"
38 #include "ucase.h"
39 #include "cmemory.h"
40 #include "umutex.h"
41 #include "utrie.h"
42 #include "unicode/uset.h"
43 #include "udataswp.h"
44 #include "putilimp.h"
45
46 /*
47 * Status of tailored normalization
48 *
49 * This was done initially for investigation on Unicode public review issue 7
50 * (http://www.unicode.org/review/). See Jitterbug 2481.
51 * While the UTC at meeting #94 (2003mar) did not take up the issue, this is
52 * a permanent feature in ICU 2.6 in support of IDNA which requires true
53 * Unicode 3.2 normalization.
54 * (NormalizationCorrections are rolled into IDNA mapping tables.)
55 *
 * Tailored normalization as implemented here makes it possible to "normalize less"
 * than full Unicode normalization would.
58 * Based internally on a UnicodeSet of code points that are
59 * "excluded from normalization", the normalization functions leave those
60 * code points alone ("inert"). This means that tailored normalization
61 * still transforms text into a canonically equivalent form.
62 * It does not add decompositions to code points that do not have any or
63 * change decomposition results.
64 *
65 * Any function that searches for a safe boundary has not been touched,
66 * which means that these functions will be over-pessimistic when
67 * exclusions are applied.
68 * This should not matter because subsequent checks and normalizations
69 * do apply the exclusions; only a little more of the text may be processed
70 * than necessary under exclusions.
71 *
72 * Normalization exclusions have the following effect on excluded code points c:
73 * - c is not decomposed
74 * - c is not a composition target
75 * - c does not combine forward or backward for composition
76 * except that this is not implemented for Jamo
77 * - c is treated as having a combining class of 0
78 */
/*
 * Number of elements of a true array (not a pointer), converted to int32_t.
 * The whole expansion is parenthesized so that the macro behaves as a single
 * primary expression next to any operator (for example "sizeof LENGTHOF(a)").
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
80
81 /*
82 * This new implementation of the normalization code loads its data from
83 * unorm.dat, which is generated with the gennorm tool.
84 * The format of that file is described in unormimp.h .
85 */
86
87 /* -------------------------------------------------------------------------- */
88
/* Compile-time buffer capacity constant; its users are outside this section of the file. */
enum { _STACK_BUFFER_CAPACITY=100 };
92
/*
 * Layout of the bit fields inside the "options" bit set parameter.
 * None of this has to be public: callers only need the currently assigned
 * option values, while the number and position of reserved bits in each field
 * stay private and may change in a future implementation.
 */
enum {
    /* bit position where the Unicode-version field starts */
    _NORM_OPTIONS_UNICODE_SHIFT=5,

    /* bits 4..0: normalization exclusion (NX) flags */
    _NORM_OPTIONS_NX_MASK=0x1f,
    /* bits 6..5: Unicode version selector */
    _NORM_OPTIONS_UNICODE_MASK=0x60,
    /* both fields together: every bit that selects an exclusion set */
    _NORM_OPTIONS_SETS_MASK=_NORM_OPTIONS_NX_MASK|_NORM_OPTIONS_UNICODE_MASK,

    /*
     * The remaining options are used only by some composition functions.
     * They sit at bit 12 and above so that the lower bits stay available for
     * the options space of unorm_compare() -
     * see the documentation for UNORM_COMPARE_NORM_OPTIONS_SHIFT.
     */

    /** Options bit 12, for compatibility vs. canonical decomposition. */
    _NORM_OPTIONS_COMPAT=1<<12,
    /** Options bit 13, no discontiguous composition (FCC vs. NFC). */
    _NORM_OPTIONS_COMPOSE_CONTIGUOUS=1<<13
};
119
120 static inline UBool
121 isHangulWithoutJamoT(UChar c) {
122 c-=HANGUL_BASE;
123 return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
124 }
125
126 /* norm32 helpers */
127
128 /* is this a norm32 with a regular index? */
129 static inline UBool
130 isNorm32Regular(uint32_t norm32) {
131 return norm32<_NORM_MIN_SPECIAL;
132 }
133
134 /* is this a norm32 with a special index for a lead surrogate? */
135 static inline UBool
136 isNorm32LeadSurrogate(uint32_t norm32) {
137 return _NORM_MIN_SPECIAL<=norm32 && norm32<_NORM_SURROGATES_TOP;
138 }
139
140 /* is this a norm32 with a special index for a Hangul syllable or a Jamo? */
141 static inline UBool
142 isNorm32HangulOrJamo(uint32_t norm32) {
143 return norm32>=_NORM_MIN_HANGUL;
144 }
145
146 /*
147 * Given isNorm32HangulOrJamo(),
148 * is this a Hangul syllable or a Jamo?
149 */
150 static inline UBool
151 isHangulJamoNorm32HangulOrJamoL(uint32_t norm32) {
152 return norm32<_NORM_MIN_JAMO_V;
153 }
154
155 /*
156 * Given norm32 for Jamo V or T,
157 * is this a Jamo V?
158 */
159 static inline UBool
160 isJamoVTNorm32JamoV(uint32_t norm32) {
161 return norm32<_NORM_JAMO_V_TOP;
162 }
163
164 /* load unorm.dat ----------------------------------------------------------- */
165
166 #define DATA_NAME "unorm"
167 #define DATA_TYPE "icu"
168
169 static UDataMemory *normData=NULL;
170 static UErrorCode dataErrorCode=U_ZERO_ERROR;
171 static int8_t haveNormData=0;
172
173 static int32_t indexes[_NORM_INDEX_TOP]={ 0 };
174 static UTrie normTrie={ 0,0,0,0,0,0,0 }, fcdTrie={ 0,0,0,0,0,0,0 }, auxTrie={ 0,0,0,0,0,0,0 };
175
176 /*
177 * pointers into the memory-mapped unorm.icu
178 */
179 static const uint16_t *extraData=NULL,
180 *combiningTable=NULL,
181 *canonStartSets=NULL;
182
183 static uint8_t formatVersion[4]={ 0, 0, 0, 0 };
184 static UBool formatVersion_2_1=FALSE, formatVersion_2_2=FALSE;
185
186 /* the Unicode version of the normalization data */
187 static UVersionInfo dataVersion={ 0, 0, 0, 0 };
188
189 /* cache UnicodeSets for each combination of exclusion flags */
190 static UnicodeSet *nxCache[_NORM_OPTIONS_SETS_MASK+1]={ NULL };
191
192 U_CDECL_BEGIN
193
194 static UBool U_CALLCONV
195 unorm_cleanup() {
196 int32_t i;
197
198 if(normData!=NULL) {
199 udata_close(normData);
200 normData=NULL;
201 }
202 dataErrorCode=U_ZERO_ERROR;
203 haveNormData=0;
204
205 for(i=0; i<(int32_t)LENGTHOF(nxCache); ++i) {
206 delete nxCache[i];
207 }
208 uprv_memset(nxCache, 0, sizeof(nxCache));
209
210 return TRUE;
211 }
212
213 /* normTrie: 32-bit trie result may contain a special extraData index with the folding offset */
214 static int32_t U_CALLCONV
215 getFoldingNormOffset(uint32_t norm32) {
216 if(isNorm32LeadSurrogate(norm32)) {
217 return
218 UTRIE_BMP_INDEX_LENGTH+
219 (((int32_t)norm32>>(_NORM_EXTRA_SHIFT-UTRIE_SURROGATE_BLOCK_BITS))&
220 (0x3ff<<UTRIE_SURROGATE_BLOCK_BITS));
221 } else {
222 return 0;
223 }
224 }
225
226 /* fcdTrie: the folding offset is the lead FCD value itself */
227 static int32_t U_CALLCONV
228 getFoldingFCDOffset(uint32_t data) {
229 return (int32_t)data;
230 }
231
232 /* auxTrie: the folding offset is in bits 9..0 of the 16-bit trie result */
233 static int32_t U_CALLCONV
234 getFoldingAuxOffset(uint32_t data) {
235 return (int32_t)(data&_NORM_AUX_FNC_MASK)<<UTRIE_SURROGATE_BLOCK_BITS;
236 }
237
238 static UBool U_CALLCONV
239 isAcceptable(void * /* context */,
240 const char * /* type */, const char * /* name */,
241 const UDataInfo *pInfo) {
242 if(
243 pInfo->size>=20 &&
244 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
245 pInfo->charsetFamily==U_CHARSET_FAMILY &&
246 pInfo->dataFormat[0]==0x4e && /* dataFormat="Norm" */
247 pInfo->dataFormat[1]==0x6f &&
248 pInfo->dataFormat[2]==0x72 &&
249 pInfo->dataFormat[3]==0x6d &&
250 pInfo->formatVersion[0]==2 &&
251 pInfo->formatVersion[2]==UTRIE_SHIFT &&
252 pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
253 ) {
254 uprv_memcpy(formatVersion, pInfo->formatVersion, 4);
255 uprv_memcpy(dataVersion, pInfo->dataVersion, 4);
256 return TRUE;
257 } else {
258 return FALSE;
259 }
260 }
261
262 static UBool U_CALLCONV
263 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*limit*/, uint32_t /*value*/) {
264 /* add the start code point to the USet */
265 USetAdder *sa=(USetAdder *)context;
266 sa->add(sa->set, start);
267 return TRUE;
268 }
269
270 U_CDECL_END
271
272 static int8_t
273 loadNormData(UErrorCode &errorCode) {
274 /* load Unicode normalization data from file */
275
276 /*
277 * This lazy intialization with double-checked locking (without mutex protection for
278 * haveNormData==0) is transiently unsafe under certain circumstances.
279 * Check the readme and use u_init() if necessary.
280 *
281 * While u_init() initializes the main normalization data via this functions,
282 * it does not do so for exclusion sets (which are fully mutexed).
283 * This is because
284 * - there can be many exclusion sets
285 * - they are rarely used
286 * - they are not usually used in execution paths that are
287 * as performance-sensitive as others
288 * (e.g., IDNA takes more time than unorm_quickCheck() anyway)
289 */
290 if(haveNormData==0) {
291 UTrie _normTrie={ 0,0,0,0,0,0,0 }, _fcdTrie={ 0,0,0,0,0,0,0 }, _auxTrie={ 0,0,0,0,0,0,0 };
292 UDataMemory *data;
293 const int32_t *p=NULL;
294 const uint8_t *pb;
295
296 if(&errorCode==NULL || U_FAILURE(errorCode)) {
297 return 0;
298 }
299
300 /* open the data outside the mutex block */
301 data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &errorCode);
302 dataErrorCode=errorCode;
303 if(U_FAILURE(errorCode)) {
304 return haveNormData=-1;
305 }
306
307 p=(const int32_t *)udata_getMemory(data);
308 pb=(const uint8_t *)(p+_NORM_INDEX_TOP);
309 utrie_unserialize(&_normTrie, pb, p[_NORM_INDEX_TRIE_SIZE], &errorCode);
310 _normTrie.getFoldingOffset=getFoldingNormOffset;
311
312 pb+=p[_NORM_INDEX_TRIE_SIZE]+p[_NORM_INDEX_UCHAR_COUNT]*2+p[_NORM_INDEX_COMBINE_DATA_COUNT]*2;
313 utrie_unserialize(&_fcdTrie, pb, p[_NORM_INDEX_FCD_TRIE_SIZE], &errorCode);
314 _fcdTrie.getFoldingOffset=getFoldingFCDOffset;
315
316 if(p[_NORM_INDEX_FCD_TRIE_SIZE]!=0) {
317 pb+=p[_NORM_INDEX_FCD_TRIE_SIZE];
318 utrie_unserialize(&_auxTrie, pb, p[_NORM_INDEX_AUX_TRIE_SIZE], &errorCode);
319 _auxTrie.getFoldingOffset=getFoldingAuxOffset;
320 }
321
322 if(U_FAILURE(errorCode)) {
323 dataErrorCode=errorCode;
324 udata_close(data);
325 return haveNormData=-1;
326 }
327
328 /* in the mutex block, set the data for this process */
329 umtx_lock(NULL);
330 if(normData==NULL) {
331 normData=data;
332 data=NULL;
333
334 uprv_memcpy(&indexes, p, sizeof(indexes));
335 uprv_memcpy(&normTrie, &_normTrie, sizeof(UTrie));
336 uprv_memcpy(&fcdTrie, &_fcdTrie, sizeof(UTrie));
337 uprv_memcpy(&auxTrie, &_auxTrie, sizeof(UTrie));
338 } else {
339 p=(const int32_t *)udata_getMemory(normData);
340 }
341
342 /* initialize some variables */
343 extraData=(uint16_t *)((uint8_t *)(p+_NORM_INDEX_TOP)+indexes[_NORM_INDEX_TRIE_SIZE]);
344 combiningTable=extraData+indexes[_NORM_INDEX_UCHAR_COUNT];
345 formatVersion_2_1=formatVersion[0]>2 || (formatVersion[0]==2 && formatVersion[1]>=1);
346 formatVersion_2_2=formatVersion[0]>2 || (formatVersion[0]==2 && formatVersion[1]>=2);
347 if(formatVersion_2_1) {
348 canonStartSets=combiningTable+
349 indexes[_NORM_INDEX_COMBINE_DATA_COUNT]+
350 (indexes[_NORM_INDEX_FCD_TRIE_SIZE]+indexes[_NORM_INDEX_AUX_TRIE_SIZE])/2;
351 }
352 haveNormData=1;
353 ucln_common_registerCleanup(UCLN_COMMON_UNORM, unorm_cleanup);
354 umtx_unlock(NULL);
355
356 /* if a different thread set it first, then close the extra data */
357 if(data!=NULL) {
358 udata_close(data); /* NULL if it was set correctly */
359 }
360 }
361
362 return haveNormData;
363 }
364
365 static inline UBool
366 _haveData(UErrorCode &errorCode) {
367 if(haveNormData!=0) {
368 errorCode=dataErrorCode;
369 return (UBool)(haveNormData>0);
370 } else {
371 return (UBool)(loadNormData(errorCode)>0);
372 }
373 }
374
375 U_CAPI UBool U_EXPORT2
376 unorm_haveData(UErrorCode *pErrorCode) {
377 return _haveData(*pErrorCode);
378 }
379
380 U_CAPI const uint16_t * U_EXPORT2
381 unorm_getFCDTrie(UErrorCode *pErrorCode) {
382 if(_haveData(*pErrorCode)) {
383 return fcdTrie.index;
384 } else {
385 return NULL;
386 }
387 }
388
389 /* data access primitives --------------------------------------------------- */
390
391 static inline uint32_t
392 _getNorm32(UChar c) {
393 return UTRIE_GET32_FROM_LEAD(&normTrie, c);
394 }
395
396 static inline uint32_t
397 _getNorm32FromSurrogatePair(uint32_t norm32, UChar c2) {
398 /*
399 * the surrogate index in norm32 stores only the number of the surrogate index block
400 * see gennorm/store.c/getFoldedNormValue()
401 */
402 norm32=
403 UTRIE_BMP_INDEX_LENGTH+
404 ((norm32>>(_NORM_EXTRA_SHIFT-UTRIE_SURROGATE_BLOCK_BITS))&
405 (0x3ff<<UTRIE_SURROGATE_BLOCK_BITS));
406 return UTRIE_GET32_FROM_OFFSET_TRAIL(&normTrie, norm32, c2);
407 }
408
409 /*
410 * get a norm32 from text with complete code points
411 * (like from decompositions)
412 */
413 static inline uint32_t
414 _getNorm32(const UChar *p, uint32_t mask) {
415 uint32_t norm32=_getNorm32(*p);
416 if((norm32&mask) && isNorm32LeadSurrogate(norm32)) {
417 /* *p is a lead surrogate, get the real norm32 */
418 norm32=_getNorm32FromSurrogatePair(norm32, *(p+1));
419 }
420 return norm32;
421 }
422
423 static inline uint16_t
424 _getFCD16(UChar c) {
425 return UTRIE_GET16_FROM_LEAD(&fcdTrie, c);
426 }
427
428 static inline uint16_t
429 _getFCD16FromSurrogatePair(uint16_t fcd16, UChar c2) {
430 /* the surrogate index in fcd16 is an absolute offset over the start of stage 1 */
431 return UTRIE_GET16_FROM_OFFSET_TRAIL(&fcdTrie, fcd16, c2);
432 }
433
434 static inline const uint16_t *
435 _getExtraData(uint32_t norm32) {
436 return extraData+(norm32>>_NORM_EXTRA_SHIFT);
437 }
438
439 /* normalization exclusion sets --------------------------------------------- */
440
441 /*
442 * Normalization exclusion UnicodeSets are used for tailored normalization;
443 * see the comment near the beginning of this file.
444 *
445 * By specifying one or several sets of code points,
446 * those code points become inert for normalization.
447 */
448
449 static const UnicodeSet *
450 internalGetNXHangul(UErrorCode &errorCode) {
451 /* internal function, does not check for incoming U_FAILURE */
452 UBool isCached;
453
454 UMTX_CHECK(NULL, (UBool)(nxCache[UNORM_NX_HANGUL]!=NULL), isCached);
455
456 if(!isCached) {
457 UnicodeSet *set=new UnicodeSet(0xac00, 0xd7a3);
458 if(set==NULL) {
459 errorCode=U_MEMORY_ALLOCATION_ERROR;
460 return NULL;
461 }
462
463 umtx_lock(NULL);
464 if(nxCache[UNORM_NX_HANGUL]==NULL) {
465 nxCache[UNORM_NX_HANGUL]=set;
466 set=NULL;
467 }
468 umtx_unlock(NULL);
469
470 delete set;
471 }
472
473 return nxCache[UNORM_NX_HANGUL];
474 }
475
476 /* unorm.cpp 1.116 had and used
477 static const UnicodeSet *
478 internalGetNXFromPattern(int32_t options, const char *pattern, UErrorCode &errorCode) {
479 ...
480 }
481 */
482
483 /* get and set an exclusion set from a serialized UnicodeSet */
484 static const UnicodeSet *
485 internalGetSerializedNX(int32_t options, int32_t nxIndex, UErrorCode &errorCode) {
486 /* internal function, does not check for incoming U_FAILURE */
487 UBool isCached;
488
489 UMTX_CHECK(NULL, (UBool)(nxCache[options]!=NULL), isCached);
490
491 if( !isCached &&
492 canonStartSets!=NULL &&
493 canonStartSets[nxIndex]!=0 && canonStartSets[nxIndex+1]>canonStartSets[nxIndex]
494 ) {
495 USerializedSet sset;
496 UnicodeSet *set;
497 UChar32 start, end;
498 int32_t i;
499
500 if( !uset_getSerializedSet(
501 &sset,
502 canonStartSets+canonStartSets[nxIndex],
503 canonStartSets[nxIndex+1]-canonStartSets[nxIndex])
504 ) {
505 errorCode=U_INVALID_FORMAT_ERROR;
506 return NULL;
507 }
508
509 /* turn the serialized set into a UnicodeSet */
510 set=new UnicodeSet();
511 if(set==NULL) {
512 errorCode=U_MEMORY_ALLOCATION_ERROR;
513 return NULL;
514 }
515 for(i=0; uset_getSerializedRange(&sset, i, &start, &end); ++i) {
516 set->add(start, end);
517 }
518
519 umtx_lock(NULL);
520 if(nxCache[options]==NULL) {
521 nxCache[options]=set;
522 set=NULL;
523 }
524 umtx_unlock(NULL);
525
526 delete set;
527 }
528
529 return nxCache[options];
530 }
531
532 static const UnicodeSet *
533 internalGetNXCJKCompat(UErrorCode &errorCode) {
534 /* build a set from [[:Ideographic:]&[:NFD_QC=No:]]=[CJK Ideographs]&[has canonical decomposition] */
535 return internalGetSerializedNX(
536 UNORM_NX_CJK_COMPAT,
537 _NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET,
538 errorCode);
539 }
540
541 static const UnicodeSet *
542 internalGetNXUnicode(uint32_t options, UErrorCode &errorCode) {
543 /* internal function, does not check for incoming U_FAILURE */
544 int32_t nxIndex;
545
546 options&=_NORM_OPTIONS_UNICODE_MASK;
547 switch(options) {
548 case 0:
549 return NULL;
550 case UNORM_UNICODE_3_2:
551 /* [:^Age=3.2:] */
552 nxIndex=_NORM_SET_INDEX_NX_UNICODE32_OFFSET;
553 break;
554 default:
555 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
556 return NULL;
557 }
558
559 /* build a set with all code points that were not designated by the specified Unicode version */
560 return internalGetSerializedNX(options, nxIndex, errorCode);
561 }
562
563 /* Get a decomposition exclusion set. The data must be loaded. */
564 static const UnicodeSet *
565 internalGetNX(int32_t options, UErrorCode &errorCode) {
566 options&=_NORM_OPTIONS_SETS_MASK;
567
568 UBool isCached;
569
570 UMTX_CHECK(NULL, (UBool)(nxCache[options]!=NULL), isCached);
571
572 if(!isCached) {
573 /* return basic sets */
574 if(options==UNORM_NX_HANGUL) {
575 return internalGetNXHangul(errorCode);
576 }
577 if(options==UNORM_NX_CJK_COMPAT) {
578 return internalGetNXCJKCompat(errorCode);
579 }
580 if((options&_NORM_OPTIONS_UNICODE_MASK)!=0 && (options&_NORM_OPTIONS_NX_MASK)==0) {
581 return internalGetNXUnicode(options, errorCode);
582 }
583
584 /* build a set from multiple subsets */
585 UnicodeSet *set;
586 const UnicodeSet *other;
587
588 set=new UnicodeSet();
589 if(set==NULL) {
590 errorCode=U_MEMORY_ALLOCATION_ERROR;
591 return NULL;
592 }
593
594 if((options&UNORM_NX_HANGUL)!=0 && NULL!=(other=internalGetNXHangul(errorCode))) {
595 set->addAll(*other);
596 }
597 if((options&UNORM_NX_CJK_COMPAT)!=0 && NULL!=(other=internalGetNXCJKCompat(errorCode))) {
598 set->addAll(*other);
599 }
600 if((options&_NORM_OPTIONS_UNICODE_MASK)!=0 && NULL!=(other=internalGetNXUnicode(options, errorCode))) {
601 set->addAll(*other);
602 }
603
604 if(U_FAILURE(errorCode)) {
605 delete set;
606 return NULL;
607 }
608
609 umtx_lock(NULL);
610 if(nxCache[options]==NULL) {
611 nxCache[options]=set;
612 set=NULL;
613 }
614 umtx_unlock(NULL);
615
616 delete set;
617 }
618
619 return nxCache[options];
620 }
621
622 static inline const UnicodeSet *
623 getNX(int32_t options, UErrorCode &errorCode) {
624 if(U_FAILURE(errorCode) || (options&=_NORM_OPTIONS_SETS_MASK)==0) {
625 /* incoming failure, or no decomposition exclusions requested */
626 return NULL;
627 } else {
628 return internalGetNX(options, errorCode);
629 }
630 }
631
632 U_CFUNC const UnicodeSet *
633 unorm_getNX(int32_t options, UErrorCode *pErrorCode) {
634 return getNX(options, *pErrorCode);
635 }
636
637 static inline UBool
638 nx_contains(const UnicodeSet *nx, UChar32 c) {
639 return nx!=NULL && nx->contains(c);
640 }
641
642 static inline UBool
643 nx_contains(const UnicodeSet *nx, UChar c, UChar c2) {
644 return nx!=NULL && nx->contains(c2==0 ? c : U16_GET_SUPPLEMENTARY(c, c2));
645 }
646
647 /* other normalization primitives ------------------------------------------- */
648
649 /* get the canonical or compatibility decomposition for one character */
650 static inline const UChar *
651 _decompose(uint32_t norm32, uint32_t qcMask, int32_t &length,
652 uint8_t &cc, uint8_t &trailCC) {
653 const UChar *p=(const UChar *)_getExtraData(norm32);
654 length=*p++;
655
656 if((norm32&qcMask&_NORM_QC_NFKD)!=0 && length>=0x100) {
657 /* use compatibility decomposition, skip canonical data */
658 p+=((length>>7)&1)+(length&_NORM_DECOMP_LENGTH_MASK);
659 length>>=8;
660 }
661
662 if(length&_NORM_DECOMP_FLAG_LENGTH_HAS_CC) {
663 /* get the lead and trail cc's */
664 UChar bothCCs=*p++;
665 cc=(uint8_t)(bothCCs>>8);
666 trailCC=(uint8_t)bothCCs;
667 } else {
668 /* lead and trail cc's are both 0 */
669 cc=trailCC=0;
670 }
671
672 length&=_NORM_DECOMP_LENGTH_MASK;
673 return p;
674 }
675
676 /* get the canonical decomposition for one character */
677 static inline const UChar *
678 _decompose(uint32_t norm32, int32_t &length,
679 uint8_t &cc, uint8_t &trailCC) {
680 const UChar *p=(const UChar *)_getExtraData(norm32);
681 length=*p++;
682
683 if(length&_NORM_DECOMP_FLAG_LENGTH_HAS_CC) {
684 /* get the lead and trail cc's */
685 UChar bothCCs=*p++;
686 cc=(uint8_t)(bothCCs>>8);
687 trailCC=(uint8_t)bothCCs;
688 } else {
689 /* lead and trail cc's are both 0 */
690 cc=trailCC=0;
691 }
692
693 length&=_NORM_DECOMP_LENGTH_MASK;
694 return p;
695 }
696
697 /**
698 * Get the canonical decomposition for one code point.
699 * @param c code point
700 * @param buffer out-only buffer for algorithmic decompositions of Hangul
701 * @param length out-only, takes the length of the decomposition, if any
702 * @return pointer to decomposition, or 0 if none
703 * @internal
704 */
705 U_CFUNC const UChar *
706 unorm_getCanonicalDecomposition(UChar32 c, UChar buffer[4], int32_t *pLength) {
707 uint32_t norm32;
708
709 if(c<indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]) {
710 /* trivial case */
711 return NULL;
712 }
713
714 UTRIE_GET32(&normTrie, c, norm32);
715 if(norm32&_NORM_QC_NFD) {
716 if(isNorm32HangulOrJamo(norm32)) {
717 /* Hangul syllable: decompose algorithmically */
718 UChar c2;
719
720 c-=HANGUL_BASE;
721
722 c2=(UChar)(c%JAMO_T_COUNT);
723 c/=JAMO_T_COUNT;
724 if(c2>0) {
725 buffer[2]=(UChar)(JAMO_T_BASE+c2);
726 *pLength=3;
727 } else {
728 *pLength=2;
729 }
730
731 buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
732 buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
733 return buffer;
734 } else {
735 /* normal decomposition */
736 uint8_t cc, trailCC;
737 return _decompose(norm32, *pLength, cc, trailCC);
738 }
739 } else {
740 return 0;
741 }
742 }
743
744 /*
745 * get the combining class of (c, c2)=*p++
746 * before: p<limit after: p<=limit
747 * if only one code unit is used, then c2==0
748 */
749 static inline uint8_t
750 _getNextCC(const UChar *&p, const UChar *limit, UChar &c, UChar &c2) {
751 uint32_t norm32;
752
753 c=*p++;
754 norm32=_getNorm32(c);
755 if((norm32&_NORM_CC_MASK)==0) {
756 c2=0;
757 return 0;
758 } else {
759 if(!isNorm32LeadSurrogate(norm32)) {
760 c2=0;
761 } else {
762 /* c is a lead surrogate, get the real norm32 */
763 if(p!=limit && UTF_IS_SECOND_SURROGATE(c2=*p)) {
764 ++p;
765 norm32=_getNorm32FromSurrogatePair(norm32, c2);
766 } else {
767 c2=0;
768 return 0;
769 }
770 }
771
772 return (uint8_t)(norm32>>_NORM_CC_SHIFT);
773 }
774 }
775
776 /*
777 * read backwards and get norm32
778 * return 0 if the character is <minC
779 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
780 */
781 static inline uint32_t
782 _getPrevNorm32(const UChar *start, const UChar *&src,
783 uint32_t minC, uint32_t mask,
784 UChar &c, UChar &c2) {
785 uint32_t norm32;
786
787 c=*--src;
788 c2=0;
789
790 /* check for a surrogate before getting norm32 to see if we need to predecrement further */
791 if(c<minC) {
792 return 0;
793 } else if(!UTF_IS_SURROGATE(c)) {
794 return _getNorm32(c);
795 } else if(UTF_IS_SURROGATE_FIRST(c)) {
796 /* unpaired first surrogate */
797 return 0;
798 } else if(src!=start && UTF_IS_FIRST_SURROGATE(c2=*(src-1))) {
799 --src;
800 norm32=_getNorm32(c2);
801
802 if((norm32&mask)==0) {
803 /* all surrogate pairs with this lead surrogate have only irrelevant data */
804 return 0;
805 } else {
806 /* norm32 must be a surrogate special */
807 return _getNorm32FromSurrogatePair(norm32, c);
808 }
809 } else {
810 /* unpaired second surrogate */
811 c2=0;
812 return 0;
813 }
814 }
815
816 /*
817 * get the combining class of (c, c2)=*--p
818 * before: start<p after: start<=p
819 */
820 static inline uint8_t
821 _getPrevCC(const UChar *start, const UChar *&p) {
822 UChar c, c2;
823
824 return (uint8_t)(_getPrevNorm32(start, p, _NORM_MIN_WITH_LEAD_CC, _NORM_CC_MASK, c, c2)>>_NORM_CC_SHIFT);
825 }
826
827 /*
828 * is this a safe boundary character for NF*D?
829 * (lead cc==0)
830 */
831 static inline UBool
832 _isNFDSafe(uint32_t norm32, uint32_t ccOrQCMask, uint32_t decompQCMask) {
833 if((norm32&ccOrQCMask)==0) {
834 return TRUE; /* cc==0 and no decomposition: this is NF*D safe */
835 }
836
837 /* inspect its decomposition - maybe a Hangul but not a surrogate here */
838 if(isNorm32Regular(norm32) && (norm32&decompQCMask)!=0) {
839 int32_t length;
840 uint8_t cc, trailCC;
841
842 /* decomposes, get everything from the variable-length extra data */
843 _decompose(norm32, decompQCMask, length, cc, trailCC);
844 return cc==0;
845 } else {
846 /* no decomposition (or Hangul), test the cc directly */
847 return (norm32&_NORM_CC_MASK)==0;
848 }
849 }
850
851 /*
852 * is this (or does its decomposition begin with) a "true starter"?
853 * (cc==0 and NF*C_YES)
854 */
855 static inline UBool
856 _isTrueStarter(uint32_t norm32, uint32_t ccOrQCMask, uint32_t decompQCMask) {
857 if((norm32&ccOrQCMask)==0) {
858 return TRUE; /* this is a true starter (could be Hangul or Jamo L) */
859 }
860
861 /* inspect its decomposition - not a Hangul or a surrogate here */
862 if((norm32&decompQCMask)!=0) {
863 const UChar *p;
864 int32_t length;
865 uint8_t cc, trailCC;
866
867 /* decomposes, get everything from the variable-length extra data */
868 p=_decompose(norm32, decompQCMask, length, cc, trailCC);
869 if(cc==0) {
870 uint32_t qcMask=ccOrQCMask&_NORM_QC_MASK;
871
872 /* does it begin with NFC_YES? */
873 if((_getNorm32(p, qcMask)&qcMask)==0) {
874 /* yes, the decomposition begins with a true starter */
875 return TRUE;
876 }
877 }
878 }
879 return FALSE;
880 }
881
882 /* uchar.h */
883 U_CAPI uint8_t U_EXPORT2
884 u_getCombiningClass(UChar32 c) {
885 UErrorCode errorCode=U_ZERO_ERROR;
886 if(_haveData(errorCode)) {
887 uint32_t norm32;
888
889 UTRIE_GET32(&normTrie, c, norm32);
890 return (uint8_t)(norm32>>_NORM_CC_SHIFT);
891 } else {
892 return 0;
893 }
894 }
895
896 U_CAPI UBool U_EXPORT2
897 unorm_internalIsFullCompositionExclusion(UChar32 c) {
898 UErrorCode errorCode=U_ZERO_ERROR;
899 if(_haveData(errorCode) && formatVersion_2_1) {
900 uint16_t aux;
901
902 UTRIE_GET16(&auxTrie, c, aux);
903 return (UBool)((aux&_NORM_AUX_COMP_EX_MASK)!=0);
904 } else {
905 return FALSE;
906 }
907 }
908
909 U_CAPI UBool U_EXPORT2
910 unorm_isCanonSafeStart(UChar32 c) {
911 UErrorCode errorCode=U_ZERO_ERROR;
912 if(_haveData(errorCode) && formatVersion_2_1) {
913 uint16_t aux;
914
915 UTRIE_GET16(&auxTrie, c, aux);
916 return (UBool)((aux&_NORM_AUX_UNSAFE_MASK)==0);
917 } else {
918 return FALSE;
919 }
920 }
921
922 U_CAPI void U_EXPORT2
923 unorm_getUnicodeVersion(UVersionInfo *versionInfo, UErrorCode *pErrorCode){
924 if(unorm_haveData(pErrorCode)){
925 uprv_memcpy(*versionInfo, dataVersion, 4);
926 }
927 }
928
929
930 U_CAPI UBool U_EXPORT2
931 unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet) {
932 UErrorCode errorCode=U_ZERO_ERROR;
933 if( fillSet!=NULL && (uint32_t)c<=0x10ffff &&
934 _haveData(errorCode) && canonStartSets!=NULL
935 ) {
936 const uint16_t *table;
937 int32_t i, start, limit;
938
939 /*
940 * binary search for c
941 *
942 * There are two search tables,
943 * one for BMP code points and one for supplementary ones.
944 * See unormimp.h for details.
945 */
946 if(c<=0xffff) {
947 table=canonStartSets+canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH];
948 start=0;
949 limit=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
950
951 /* each entry is a pair { c, result } */
952 while(start<limit-2) {
953 i=(uint16_t)(((start+limit)/4)*2); /* (start+limit)/2 and address pairs */
954 if(c<table[i]) {
955 limit=i;
956 } else {
957 start=i;
958 }
959 }
960
961 /* found? */
962 if(c==table[start]) {
963 i=table[start+1];
964 if((i&_NORM_CANON_SET_BMP_MASK)==_NORM_CANON_SET_BMP_IS_INDEX) {
965 /* result 01xxxxxx xxxxxx contains index x to a USerializedSet */
966 i&=(_NORM_MAX_CANON_SETS-1);
967 return uset_getSerializedSet(fillSet,
968 canonStartSets+i,
969 canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-i);
970 } else {
971 /* other result values are BMP code points for single-code point sets */
972 uset_setSerializedToOne(fillSet, (UChar32)i);
973 return TRUE;
974 }
975 }
976 } else {
977 uint16_t high, low, h;
978
979 table=canonStartSets+canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]+
980 canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
981 start=0;
982 limit=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];
983
984 high=(uint16_t)(c>>16);
985 low=(uint16_t)c;
986
987 /* each entry is a triplet { high(c), low(c), result } */
988 while(start<limit-3) {
989 i=(uint16_t)(((start+limit)/6)*3); /* (start+limit)/2 and address triplets */
990 h=table[i]&0x1f; /* high word */
991 if(high<h || (high==h && low<table[i+1])) {
992 limit=i;
993 } else {
994 start=i;
995 }
996 }
997
998 /* found? */
999 h=table[start];
1000 if(high==(h&0x1f) && low==table[start+1]) {
1001 i=table[start+2];
1002 if((h&0x8000)==0) {
1003 /* the result is an index to a USerializedSet */
1004 return uset_getSerializedSet(fillSet,
1005 canonStartSets+i,
1006 canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-i);
1007 } else {
1008 /*
1009 * single-code point set {x} in
1010 * triplet { 100xxxxx 000hhhhh llllllll llllllll xxxxxxxx xxxxxxxx }
1011 */
1012 i|=((int32_t)h&0x1f00)<<8; /* add high bits from high(c) */
1013 uset_setSerializedToOne(fillSet, (UChar32)i);
1014 return TRUE;
1015 }
1016 }
1017 }
1018 }
1019
1020 return FALSE; /* not found */
1021 }
1022
1023 U_CAPI int32_t U_EXPORT2
1024 u_getFC_NFKC_Closure(UChar32 c, UChar *dest, int32_t destCapacity, UErrorCode *pErrorCode) {
1025 uint16_t aux;
1026
1027 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1028 return 0;
1029 }
1030 if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
1031 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1032 return 0;
1033 }
1034 if(!_haveData(*pErrorCode) || !formatVersion_2_1) {
1035 return 0;
1036 }
1037
1038 UTRIE_GET16(&auxTrie, c, aux);
1039 aux&=_NORM_AUX_FNC_MASK;
1040 if(aux!=0) {
1041 const UChar *s;
1042 int32_t length;
1043
1044 s=(const UChar *)(extraData+aux);
1045 if(*s<0xff00) {
1046 /* s points to the single-unit string */
1047 length=1;
1048 } else {
1049 length=*s&0xff;
1050 ++s;
1051 }
1052 if(0<length && length<=destCapacity) {
1053 uprv_memcpy(dest, s, length*U_SIZEOF_UCHAR);
1054 }
1055 return u_terminateUChars(dest, destCapacity, length, pErrorCode);
1056 } else {
1057 return u_terminateUChars(dest, destCapacity, 0, pErrorCode);
1058 }
1059 }
1060
1061 /* Is c an NF<mode>-skippable code point? See unormimp.h. */
1062 U_CAPI UBool U_EXPORT2
1063 unorm_isNFSkippable(UChar32 c, UNormalizationMode mode) {
1064 UErrorCode errorCode;
1065 uint32_t norm32, mask;
1066 uint16_t aux, fcd;
1067
1068 errorCode=U_ZERO_ERROR;
1069 if(!_haveData(errorCode)) {
1070 return FALSE;
1071 }
1072
1073 /* handle trivial cases; set the comparison mask for the normal ones */
1074 switch(mode) {
1075 case UNORM_NONE:
1076 return TRUE;
1077 case UNORM_NFD:
1078 mask=_NORM_CC_MASK|_NORM_QC_NFD;
1079 break;
1080 case UNORM_NFKD:
1081 mask=_NORM_CC_MASK|_NORM_QC_NFKD;
1082 break;
1083 case UNORM_NFC:
1084 /* case UNORM_FCC: */
1085 mask=_NORM_CC_MASK|_NORM_COMBINES_ANY|(_NORM_QC_NFC&_NORM_QC_ANY_NO);
1086 break;
1087 case UNORM_NFKC:
1088 mask=_NORM_CC_MASK|_NORM_COMBINES_ANY|(_NORM_QC_NFKC&_NORM_QC_ANY_NO);
1089 break;
1090 case UNORM_FCD:
1091 /* FCD: skippable if lead cc==0 and trail cc<=1 */
1092 UTRIE_GET16(&fcdTrie, c, fcd);
1093 return fcd<=1;
1094 default:
1095 return FALSE;
1096 }
1097
1098 /* check conditions (a)..(e), see unormimp.h */
1099 UTRIE_GET32(&normTrie, c, norm32);
1100 if((norm32&mask)!=0) {
1101 return FALSE; /* fails (a)..(e), not skippable */
1102 }
1103
1104 if(mode<UNORM_NFC) {
1105 return TRUE; /* NF*D, passed (a)..(c), is skippable */
1106 }
1107
1108 /* NF*C/FCC, passed (a)..(e) */
1109 if((norm32&_NORM_QC_NFD)==0) {
1110 return TRUE; /* no canonical decomposition, is skippable */
1111 }
1112
1113 /* check Hangul syllables algorithmically */
1114 if(isNorm32HangulOrJamo(norm32)) {
1115 /* Jamo passed (a)..(e) above, must be Hangul */
1116 return !isHangulWithoutJamoT((UChar)c); /* LVT are skippable, LV are not */
1117 }
1118
1119 /* if(mode<=UNORM_NFKC) { -- enable when implementing FCC */
1120 /* NF*C, test (f) flag */
1121 if(!formatVersion_2_2) {
1122 return FALSE; /* no (f) data, say not skippable to be safe */
1123 }
1124
1125 UTRIE_GET16(&auxTrie, c, aux);
1126 return (aux&_NORM_AUX_NFC_SKIP_F_MASK)==0; /* TRUE=skippable if the (f) flag is not set */
1127
1128 /* } else { FCC, test fcd<=1 instead of the above } */
1129 }
1130
1131 U_CAPI void U_EXPORT2
1132 unorm_addPropertyStarts(USetAdder *sa, UErrorCode *pErrorCode) {
1133 UChar c;
1134
1135 if(U_FAILURE(*pErrorCode) || !_haveData(*pErrorCode)) {
1136 return;
1137 }
1138
1139 /* add the start code point of each same-value range of each trie */
1140 utrie_enum(&normTrie, NULL, _enumPropertyStartsRange, sa);
1141 utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, sa);
1142 if(formatVersion_2_1) {
1143 utrie_enum(&auxTrie, NULL, _enumPropertyStartsRange, sa);
1144 }
1145
1146 /* add Hangul LV syllables and LV+1 because of skippables */
1147 for(c=HANGUL_BASE; c<HANGUL_BASE+HANGUL_COUNT; c+=JAMO_T_COUNT) {
1148 sa->add(sa->set, c);
1149 sa->add(sa->set, c+1);
1150 }
1151 sa->add(sa->set, HANGUL_BASE+HANGUL_COUNT); /* add Hangul+1 to continue with other properties */
1152 }
1153
1154 U_CAPI UNormalizationCheckResult U_EXPORT2
1155 unorm_getQuickCheck(UChar32 c, UNormalizationMode mode) {
1156 static const uint32_t qcMask[UNORM_MODE_COUNT]={
1157 0, 0, _NORM_QC_NFD, _NORM_QC_NFKD, _NORM_QC_NFC, _NORM_QC_NFKC
1158 };
1159
1160 UErrorCode errorCode;
1161 uint32_t norm32;
1162
1163 errorCode=U_ZERO_ERROR;
1164 if(!_haveData(errorCode)) {
1165 return UNORM_YES;
1166 }
1167
1168 UTRIE_GET32(&normTrie, c, norm32);
1169 norm32&=qcMask[mode];
1170
1171 if(norm32==0) {
1172 return UNORM_YES;
1173 } else if(norm32&_NORM_QC_ANY_NO) {
1174 return UNORM_NO;
1175 } else /* _NORM_QC_ANY_MAYBE */ {
1176 return UNORM_MAYBE;
1177 }
1178 }
1179
1180 U_CAPI uint16_t U_EXPORT2
1181 unorm_getFCD16FromCodePoint(UChar32 c) {
1182 UErrorCode errorCode;
1183 uint16_t fcd;
1184
1185 errorCode=U_ZERO_ERROR;
1186 if(!_haveData(errorCode)) {
1187 return 0;
1188 }
1189
1190 UTRIE_GET16(&fcdTrie, c, fcd);
1191 return fcd;
1192 }
1193
1194 /* reorder UTF-16 in-place -------------------------------------------------- */
1195
1196 /*
1197 * simpler, single-character version of _mergeOrdered() -
1198 * bubble-insert one single code point into the preceding string
1199 * which is already canonically ordered
1200 * (c, c2) may or may not yet have been inserted at [current..p[
1201 *
1202 * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 1 : 2)
1203 *
1204 * before: [start..current[ is already ordered, and
1205 * [current..p[ may or may not hold (c, c2) but
1206 * must be exactly the same length as (c, c2)
1207 * after: [start..p[ is ordered
1208 *
1209 * returns the trailing combining class
1210 */
1211 static uint8_t
1212 _insertOrdered(const UChar *start, UChar *current, UChar *p,
1213 UChar c, UChar c2, uint8_t cc) {
1214 const UChar *pBack, *pPreBack;
1215 UChar *r;
1216 uint8_t prevCC, trailCC=cc;
1217
1218 if(start<current && cc!=0) {
1219 /* search for the insertion point where cc>=prevCC */
1220 pPreBack=pBack=current;
1221 prevCC=_getPrevCC(start, pPreBack);
1222 if(cc<prevCC) {
1223 /* this will be the last code point, so keep its cc */
1224 trailCC=prevCC;
1225 pBack=pPreBack;
1226 while(start<pPreBack) {
1227 prevCC=_getPrevCC(start, pPreBack);
1228 if(cc>=prevCC) {
1229 break;
1230 }
1231 pBack=pPreBack;
1232 }
1233
1234 /*
1235 * this is where we are right now with all these pointers:
1236 * [start..pPreBack[ 0..? code points that we can ignore
1237 * [pPreBack..pBack[ 0..1 code points with prevCC<=cc
1238 * [pBack..current[ 0..n code points with >cc, move up to insert (c, c2)
1239 * [current..p[ 1 code point (c, c2) with cc
1240 */
1241
1242 /* move the code units in between up */
1243 r=p;
1244 do {
1245 *--r=*--current;
1246 } while(pBack!=current);
1247 }
1248 }
1249
1250 /* insert (c, c2) */
1251 *current=c;
1252 if(c2!=0) {
1253 *(current+1)=c2;
1254 }
1255
1256 /* we know the cc of the last code point */
1257 return trailCC;
1258 }
1259
1260 /*
1261 * merge two UTF-16 string parts together
1262 * to canonically order (order by combining classes) their concatenation
1263 *
1264 * the two strings may already be adjacent, so that the merging is done in-place
1265 * if the two strings are not adjacent, then the buffer holding the first one
1266 * must be large enough
1267 * the second string may or may not be ordered in itself
1268 *
1269 * before: [start..current[ is already ordered, and
1270 * [next..limit[ may be ordered in itself, but
1271 * is not in relation to [start..current[
1272 * after: [start..current+(limit-next)[ is ordered
1273 *
1274 * the algorithm is a simple bubble-sort that takes the characters from *next++
1275 * and inserts them in correct combining class order into the preceding part
1276 * of the string
1277 *
1278 * since this function is called much less often than the single-code point
1279 * _insertOrdered(), it just uses that for easier maintenance
1280 * (see file version from before 2001aug31 for a more optimized version)
1281 *
1282 * returns the trailing combining class
1283 */
1284 static uint8_t
1285 _mergeOrdered(UChar *start, UChar *current,
1286 const UChar *next, const UChar *limit, UBool isOrdered=TRUE) {
1287 UChar *r;
1288 UChar c, c2;
1289 uint8_t cc, trailCC=0;
1290 UBool adjacent;
1291
1292 adjacent= current==next;
1293
1294 if(start!=current || !isOrdered) {
1295 while(next<limit) {
1296 cc=_getNextCC(next, limit, c, c2);
1297 if(cc==0) {
1298 /* does not bubble back */
1299 trailCC=0;
1300 if(adjacent) {
1301 current=(UChar *)next;
1302 } else {
1303 *current++=c;
1304 if(c2!=0) {
1305 *current++=c2;
1306 }
1307 }
1308 if(isOrdered) {
1309 break;
1310 } else {
1311 start=current;
1312 }
1313 } else {
1314 r=current+(c2==0 ? 1 : 2);
1315 trailCC=_insertOrdered(start, current, r, c, c2, cc);
1316 current=r;
1317 }
1318 }
1319 }
1320
1321 if(next==limit) {
1322 /* we know the cc of the last code point */
1323 return trailCC;
1324 } else {
1325 if(!adjacent) {
1326 /* copy the second string part */
1327 do {
1328 *current++=*next++;
1329 } while(next!=limit);
1330 limit=current;
1331 }
1332 return _getPrevCC(start, limit);
1333 }
1334 }
1335
1336 /* find the last true starter in [start..src[ and return the pointer to it */
1337 static const UChar *
1338 _findPreviousStarter(const UChar *start, const UChar *src,
1339 uint32_t ccOrQCMask, uint32_t decompQCMask, UChar minNoMaybe) {
1340 uint32_t norm32;
1341 UChar c, c2;
1342
1343 while(start<src) {
1344 norm32=_getPrevNorm32(start, src, minNoMaybe, ccOrQCMask|decompQCMask, c, c2);
1345 if(_isTrueStarter(norm32, ccOrQCMask, decompQCMask)) {
1346 break;
1347 }
1348 }
1349 return src;
1350 }
1351
1352 /* find the first true starter in [src..limit[ and return the pointer to it */
1353 static const UChar *
1354 _findNextStarter(const UChar *src, const UChar *limit,
1355 uint32_t qcMask, uint32_t decompQCMask, UChar minNoMaybe) {
1356 const UChar *p;
1357 uint32_t norm32, ccOrQCMask;
1358 int32_t length;
1359 UChar c, c2;
1360 uint8_t cc, trailCC;
1361
1362 ccOrQCMask=_NORM_CC_MASK|qcMask;
1363
1364 for(;;) {
1365 if(src==limit) {
1366 break; /* end of string */
1367 }
1368 c=*src;
1369 if(c<minNoMaybe) {
1370 break; /* catches NUL terminater, too */
1371 }
1372
1373 norm32=_getNorm32(c);
1374 if((norm32&ccOrQCMask)==0) {
1375 break; /* true starter */
1376 }
1377
1378 if(isNorm32LeadSurrogate(norm32)) {
1379 /* c is a lead surrogate, get the real norm32 */
1380 if((src+1)==limit || !UTF_IS_SECOND_SURROGATE(c2=*(src+1))) {
1381 break; /* unmatched first surrogate: counts as a true starter */
1382 }
1383 norm32=_getNorm32FromSurrogatePair(norm32, c2);
1384
1385 if((norm32&ccOrQCMask)==0) {
1386 break; /* true starter */
1387 }
1388 } else {
1389 c2=0;
1390 }
1391
1392 /* (c, c2) is not a true starter but its decomposition may be */
1393 if(norm32&decompQCMask) {
1394 /* (c, c2) decomposes, get everything from the variable-length extra data */
1395 p=_decompose(norm32, decompQCMask, length, cc, trailCC);
1396
1397 /* get the first character's norm32 to check if it is a true starter */
1398 if(cc==0 && (_getNorm32(p, qcMask)&qcMask)==0) {
1399 break; /* true starter */
1400 }
1401 }
1402
1403 src+= c2==0 ? 1 : 2; /* not a true starter, continue */
1404 }
1405
1406 return src;
1407 }
1408
1409 /* make NFD & NFKD ---------------------------------------------------------- */
1410
1411 U_CAPI int32_t U_EXPORT2
1412 unorm_getDecomposition(UChar32 c, UBool compat,
1413 UChar *dest, int32_t destCapacity) {
1414 UErrorCode errorCode=U_ZERO_ERROR;
1415 if( (uint32_t)c<=0x10ffff &&
1416 _haveData(errorCode) &&
1417 ((dest!=NULL && destCapacity>0) || destCapacity==0)
1418 ) {
1419 uint32_t norm32, qcMask;
1420 UChar32 minNoMaybe;
1421 int32_t length;
1422
1423 /* initialize */
1424 if(!compat) {
1425 minNoMaybe=(UChar32)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE];
1426 qcMask=_NORM_QC_NFD;
1427 } else {
1428 minNoMaybe=(UChar32)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE];
1429 qcMask=_NORM_QC_NFKD;
1430 }
1431
1432 if(c<minNoMaybe) {
1433 /* trivial case */
1434 if(destCapacity>0) {
1435 dest[0]=(UChar)c;
1436 }
1437 return -1;
1438 }
1439
1440 /* data lookup */
1441 UTRIE_GET32(&normTrie, c, norm32);
1442 if((norm32&qcMask)==0) {
1443 /* simple case: no decomposition */
1444 if(c<=0xffff) {
1445 if(destCapacity>0) {
1446 dest[0]=(UChar)c;
1447 }
1448 return -1;
1449 } else {
1450 if(destCapacity>=2) {
1451 dest[0]=UTF16_LEAD(c);
1452 dest[1]=UTF16_TRAIL(c);
1453 }
1454 return -2;
1455 }
1456 } else if(isNorm32HangulOrJamo(norm32)) {
1457 /* Hangul syllable: decompose algorithmically */
1458 UChar c2;
1459
1460 c-=HANGUL_BASE;
1461
1462 c2=(UChar)(c%JAMO_T_COUNT);
1463 c/=JAMO_T_COUNT;
1464 if(c2>0) {
1465 if(destCapacity>=3) {
1466 dest[2]=(UChar)(JAMO_T_BASE+c2);
1467 }
1468 length=3;
1469 } else {
1470 length=2;
1471 }
1472
1473 if(destCapacity>=2) {
1474 dest[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
1475 dest[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
1476 }
1477 return length;
1478 } else {
1479 /* c decomposes, get everything from the variable-length extra data */
1480 const UChar *p, *limit;
1481 uint8_t cc, trailCC;
1482
1483 p=_decompose(norm32, qcMask, length, cc, trailCC);
1484 if(length<=destCapacity) {
1485 limit=p+length;
1486 do {
1487 *dest++=*p++;
1488 } while(p<limit);
1489 }
1490 return length;
1491 }
1492 } else {
1493 return 0;
1494 }
1495 }
1496
1497 static int32_t
1498 _decompose(UChar *dest, int32_t destCapacity,
1499 const UChar *src, int32_t srcLength,
1500 UBool compat, const UnicodeSet *nx,
1501 uint8_t &outTrailCC) {
1502 UChar buffer[3];
1503 const UChar *limit, *prevSrc, *p;
1504 uint32_t norm32, ccOrQCMask, qcMask;
1505 int32_t destIndex, reorderStartIndex, length;
1506 UChar c, c2, minNoMaybe;
1507 uint8_t cc, prevCC, trailCC;
1508
1509 if(!compat) {
1510 minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE];
1511 qcMask=_NORM_QC_NFD;
1512 } else {
1513 minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE];
1514 qcMask=_NORM_QC_NFKD;
1515 }
1516
1517 /* initialize */
1518 ccOrQCMask=_NORM_CC_MASK|qcMask;
1519 destIndex=reorderStartIndex=0;
1520 prevCC=0;
1521
1522 /* avoid compiler warnings */
1523 norm32=0;
1524 c=0;
1525
1526 if(srcLength>=0) {
1527 /* string with length */
1528 limit=src+srcLength;
1529 } else /* srcLength==-1 */ {
1530 /* zero-terminated string */
1531 limit=NULL;
1532 }
1533
1534 U_ALIGN_CODE(16);
1535
1536 for(;;) {
1537 /* count code units below the minimum or with irrelevant data for the quick check */
1538 prevSrc=src;
1539 if(limit==NULL) {
1540 while((c=*src)<minNoMaybe ? c!=0 : ((norm32=_getNorm32(c))&ccOrQCMask)==0) {
1541 prevCC=0;
1542 ++src;
1543 }
1544 } else {
1545 while(src!=limit && ((c=*src)<minNoMaybe || ((norm32=_getNorm32(c))&ccOrQCMask)==0)) {
1546 prevCC=0;
1547 ++src;
1548 }
1549 }
1550
1551 /* copy these code units all at once */
1552 if(src!=prevSrc) {
1553 length=(int32_t)(src-prevSrc);
1554 if((destIndex+length)<=destCapacity) {
1555 uprv_memcpy(dest+destIndex, prevSrc, length*U_SIZEOF_UCHAR);
1556 }
1557 destIndex+=length;
1558 reorderStartIndex=destIndex;
1559 }
1560
1561 /* end of source reached? */
1562 if(limit==NULL ? c==0 : src==limit) {
1563 break;
1564 }
1565
1566 /* c already contains *src and norm32 is set for it, increment src */
1567 ++src;
1568
1569 /* check one above-minimum, relevant code unit */
1570 /*
1571 * generally, set p and length to the decomposition string
1572 * in simple cases, p==NULL and (c, c2) will hold the length code units to append
1573 * in all cases, set cc to the lead and trailCC to the trail combining class
1574 *
1575 * the following merge-sort of the current character into the preceding,
1576 * canonically ordered result text will use the optimized _insertOrdered()
1577 * if there is only one single code point to process;
1578 * this is indicated with p==NULL, and (c, c2) is the character to insert
1579 * ((c, 0) for a BMP character and (lead surrogate, trail surrogate)
1580 * for a supplementary character)
1581 * otherwise, p[length] is merged in with _mergeOrdered()
1582 */
1583 if(isNorm32HangulOrJamo(norm32)) {
1584 if(nx_contains(nx, c)) {
1585 c2=0;
1586 p=NULL;
1587 length=1;
1588 } else {
1589 /* Hangul syllable: decompose algorithmically */
1590 p=buffer;
1591 cc=trailCC=0;
1592
1593 c-=HANGUL_BASE;
1594
1595 c2=(UChar)(c%JAMO_T_COUNT);
1596 c/=JAMO_T_COUNT;
1597 if(c2>0) {
1598 buffer[2]=(UChar)(JAMO_T_BASE+c2);
1599 length=3;
1600 } else {
1601 length=2;
1602 }
1603
1604 buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
1605 buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
1606 }
1607 } else {
1608 if(isNorm32Regular(norm32)) {
1609 c2=0;
1610 length=1;
1611 } else {
1612 /* c is a lead surrogate, get the real norm32 */
1613 if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
1614 ++src;
1615 length=2;
1616 norm32=_getNorm32FromSurrogatePair(norm32, c2);
1617 } else {
1618 c2=0;
1619 length=1;
1620 norm32=0;
1621 }
1622 }
1623
1624 /* get the decomposition and the lead and trail cc's */
1625 if(nx_contains(nx, c, c2)) {
1626 /* excluded: norm32==0 */
1627 cc=trailCC=0;
1628 p=NULL;
1629 } else if((norm32&qcMask)==0) {
1630 /* c does not decompose */
1631 cc=trailCC=(uint8_t)(norm32>>_NORM_CC_SHIFT);
1632 p=NULL;
1633 } else {
1634 /* c decomposes, get everything from the variable-length extra data */
1635 p=_decompose(norm32, qcMask, length, cc, trailCC);
1636 if(length==1) {
1637 /* fastpath a single code unit from decomposition */
1638 c=*p;
1639 c2=0;
1640 p=NULL;
1641 }
1642 }
1643 }
1644
1645 /* append the decomposition to the destination buffer, assume length>0 */
1646 if((destIndex+length)<=destCapacity) {
1647 UChar *reorderSplit=dest+destIndex;
1648 if(p==NULL) {
1649 /* fastpath: single code point */
1650 if(cc!=0 && cc<prevCC) {
1651 /* (c, c2) is out of order with respect to the preceding text */
1652 destIndex+=length;
1653 trailCC=_insertOrdered(dest+reorderStartIndex, reorderSplit, dest+destIndex, c, c2, cc);
1654 } else {
1655 /* just append (c, c2) */
1656 dest[destIndex++]=c;
1657 if(c2!=0) {
1658 dest[destIndex++]=c2;
1659 }
1660 }
1661 } else {
1662 /* general: multiple code points (ordered by themselves) from decomposition */
1663 if(cc!=0 && cc<prevCC) {
1664 /* the decomposition is out of order with respect to the preceding text */
1665 destIndex+=length;
1666 trailCC=_mergeOrdered(dest+reorderStartIndex, reorderSplit, p, p+length);
1667 } else {
1668 /* just append the decomposition */
1669 do {
1670 dest[destIndex++]=*p++;
1671 } while(--length>0);
1672 }
1673 }
1674 } else {
1675 /* buffer overflow */
1676 /* keep incrementing the destIndex for preflighting */
1677 destIndex+=length;
1678 }
1679
1680 prevCC=trailCC;
1681 if(prevCC==0) {
1682 reorderStartIndex=destIndex;
1683 }
1684 }
1685
1686 outTrailCC=prevCC;
1687 return destIndex;
1688 }
1689
1690 U_CAPI int32_t U_EXPORT2
1691 unorm_decompose(UChar *dest, int32_t destCapacity,
1692 const UChar *src, int32_t srcLength,
1693 UBool compat, int32_t options,
1694 UErrorCode *pErrorCode) {
1695 const UnicodeSet *nx;
1696 int32_t destIndex;
1697 uint8_t trailCC;
1698
1699 if(!_haveData(*pErrorCode)) {
1700 return 0;
1701 }
1702
1703 nx=getNX(options, *pErrorCode);
1704 if(U_FAILURE(*pErrorCode)) {
1705 return 0;
1706 }
1707
1708 destIndex=_decompose(dest, destCapacity,
1709 src, srcLength,
1710 compat, nx,
1711 trailCC);
1712
1713 return u_terminateUChars(dest, destCapacity, destIndex, pErrorCode);
1714 }
1715
1716 /* make NFC & NFKC ---------------------------------------------------------- */
1717
1718 /* get the composition properties of the next character */
1719 static inline uint32_t
1720 _getNextCombining(UChar *&p, const UChar *limit,
1721 UChar &c, UChar &c2,
1722 uint16_t &combiningIndex, uint8_t &cc,
1723 const UnicodeSet *nx) {
1724 uint32_t norm32, combineFlags;
1725
1726 /* get properties */
1727 c=*p++;
1728 norm32=_getNorm32(c);
1729
1730 /* preset output values for most characters */
1731 c2=0;
1732 combiningIndex=0;
1733 cc=0;
1734
1735 if((norm32&(_NORM_CC_MASK|_NORM_COMBINES_ANY))==0) {
1736 return 0;
1737 } else {
1738 if(isNorm32Regular(norm32)) {
1739 /* set cc etc. below */
1740 } else if(isNorm32HangulOrJamo(norm32)) {
1741 /* a compatibility decomposition contained Jamos */
1742 combiningIndex=(uint16_t)(0xfff0|(norm32>>_NORM_EXTRA_SHIFT));
1743 return norm32&_NORM_COMBINES_ANY;
1744 } else {
1745 /* c is a lead surrogate, get the real norm32 */
1746 if(p!=limit && UTF_IS_SECOND_SURROGATE(c2=*p)) {
1747 ++p;
1748 norm32=_getNorm32FromSurrogatePair(norm32, c2);
1749 } else {
1750 c2=0;
1751 return 0;
1752 }
1753 }
1754
1755 if(nx_contains(nx, c, c2)) {
1756 return 0; /* excluded: norm32==0 */
1757 }
1758
1759 cc=(uint8_t)(norm32>>_NORM_CC_SHIFT);
1760
1761 combineFlags=norm32&_NORM_COMBINES_ANY;
1762 if(combineFlags!=0) {
1763 combiningIndex=*(_getExtraData(norm32)-1);
1764 }
1765 return combineFlags;
1766 }
1767 }
1768
1769 /*
1770 * given a composition-result starter (c, c2) - which means its cc==0,
1771 * it combines forward, it has extra data, its norm32!=0,
1772 * it is not a Hangul or Jamo,
1773 * get just its combineFwdIndex
1774 *
1775 * norm32(c) is special if and only if c2!=0
1776 */
1777 static inline uint16_t
1778 _getCombiningIndexFromStarter(UChar c, UChar c2) {
1779 uint32_t norm32;
1780
1781 norm32=_getNorm32(c);
1782 if(c2!=0) {
1783 norm32=_getNorm32FromSurrogatePair(norm32, c2);
1784 }
1785 return *(_getExtraData(norm32)-1);
1786 }
1787
/*
 * Look up whether a forward-combining character
 * (given as a pointer to its list in combiningTable[])
 * composes with a backward-combining character
 * (given as its combineBackIndex).
 *
 * On success, (value, value2) receive the code unit(s) of the composite;
 * value2 is 0 for a BMP composite. They are not modified otherwise.
 *
 * Return value:
 *    0  the two characters do not combine
 *    1  they combine
 *   >1  they combine, and the composite is itself a forward-combining starter
 *
 * The composition table format is described in unormimp.h.
 */
static inline uint16_t
_combine(const uint16_t *table, uint16_t combineBackIndex,
         uint16_t &value, uint16_t &value2) {
    uint16_t entryKey, result;

    /*
     * Walk the key-sorted list; the last key has bit 15 set,
     * so it always compares >=combineBackIndex and ends the walk.
     */
    entryKey=*table++;
    while(entryKey<combineBackIndex) {
        /* skip this entry's one- or two-unit result */
        table+= (*table&0x8000)!=0 ? 2 : 1;
        entryKey=*table++;
    }

    /* compare without bit 15, the last-entry-in-the-list flag */
    if((entryKey&0x7fff)!=combineBackIndex) {
        return 0;
    }

    value=*table;

    /* bit 13 of the result: the composite is a starter that combines forward */
    result=(uint16_t)((value&0x2000)+1);

    /* decode the variable-length result */
    if((value&0x8000)==0) {
        /* single unit: BMP composite U+0000..U+1fff */
        value&=0x1fff;
        value2=0;
    } else if((value&0x4000)==0) {
        /* two units: BMP composite U+2000..U+ffff in the second unit */
        value=table[1];
        value2=0;
    } else {
        /* two units: surrogate pair composite */
        value=(uint16_t)((value&0x3ff)|0xd800);
        value2=table[1];
    }

    return result;
}
1850
1851 static inline UBool
1852 _composeHangul(UChar prev, UChar c, uint32_t norm32, const UChar *&src, const UChar *limit,
1853 UBool compat, UChar *dest, const UnicodeSet *nx) {
1854 if(isJamoVTNorm32JamoV(norm32)) {
1855 /* c is a Jamo V, compose with previous Jamo L and following Jamo T */
1856 prev=(UChar)(prev-JAMO_L_BASE);
1857 if(prev<JAMO_L_COUNT) {
1858 c=(UChar)(HANGUL_BASE+(prev*JAMO_V_COUNT+(c-JAMO_V_BASE))*JAMO_T_COUNT);
1859
1860 /* check if the next character is a Jamo T (normal or compatibility) */
1861 if(src!=limit) {
1862 UChar next, t;
1863
1864 next=*src;
1865 if((t=(UChar)(next-JAMO_T_BASE))<JAMO_T_COUNT) {
1866 /* normal Jamo T */
1867 ++src;
1868 c+=t;
1869 } else if(compat) {
1870 /* if NFKC, then check for compatibility Jamo T (BMP only) */
1871 norm32=_getNorm32(next);
1872 if(isNorm32Regular(norm32) && (norm32&_NORM_QC_NFKD)) {
1873 const UChar *p;
1874 int32_t length;
1875 uint8_t cc, trailCC;
1876
1877 p=_decompose(norm32, _NORM_QC_NFKD, length, cc, trailCC);
1878 if(length==1 && (t=(UChar)(*p-JAMO_T_BASE))<JAMO_T_COUNT) {
1879 /* compatibility Jamo T */
1880 ++src;
1881 c+=t;
1882 }
1883 }
1884 }
1885 }
1886 if(nx_contains(nx, c)) {
1887 if(!isHangulWithoutJamoT(c)) {
1888 --src; /* undo ++src from reading the Jamo T */
1889 }
1890 return FALSE;
1891 }
1892 if(dest!=0) {
1893 *dest=c;
1894 }
1895 return TRUE;
1896 }
1897 } else if(isHangulWithoutJamoT(prev)) {
1898 /* c is a Jamo T, compose with previous Hangul LV that does not contain a Jamo T */
1899 c=(UChar)(prev+(c-JAMO_T_BASE));
1900 if(nx_contains(nx, c)) {
1901 return FALSE;
1902 }
1903 if(dest!=0) {
1904 *dest=c;
1905 }
1906 return TRUE;
1907 }
1908 return FALSE;
1909 }
1910
1911 /*
1912 * recompose the characters in [p..limit[
1913 * (which is in NFD - decomposed and canonically ordered),
1914 * adjust limit, and return the trailing cc
1915 *
1916 * since for NFKC we may get Jamos in decompositions, we need to
1917 * recompose those too
1918 *
1919 * note that recomposition never lengthens the text:
1920 * any character consists of either one or two code units;
1921 * a composition may contain at most one more code unit than the original starter,
1922 * while the combining mark that is removed has at least one code unit
1923 */
/*
 * Parameters:
 *   p        start of the text to recompose; modified in place
 *   limit    in/out: end of the text; moved down whenever a composition
 *            removes code units
 *   options  checked for UNORM_BEFORE_PRI_29 and _NORM_OPTIONS_COMPOSE_CONTIGUOUS
 *   nx       exclusion set; excluded characters and excluded composition
 *            results do not take part in compositions
 *
 * Every return statement is guarded by p==limit, so the text must not be
 * empty on entry: the first loop iteration reads *p unconditionally.
 */
1924 static uint8_t
1925 _recompose(UChar *p, UChar *&limit, int32_t options, const UnicodeSet *nx) {
1926 UChar *starter, *pRemove, *q, *r;
1927 uint32_t combineFlags;
1928 UChar c, c2;
1929 uint16_t combineFwdIndex, combineBackIndex;
1930 uint16_t result, value, value2;
1931 uint8_t cc, prevCC;
1932 UBool starterIsSupplementary;
1933
1934 starter=NULL; /* no starter */
1935 combineFwdIndex=0; /* will not be used until starter!=NULL - avoid compiler warnings */
1936 combineBackIndex=0; /* will always be set if combineFlags!=0 - avoid compiler warnings */
1937 value=value2=0; /* always set by _combine() before used - avoid compiler warnings */
1938 starterIsSupplementary=FALSE; /* will not be used until starter!=NULL - avoid compiler warnings */
1939 prevCC=0;
1940
/* each iteration reads one code point (c, c2) and tries to combine it with the current starter */
1941 for(;;) {
1942 combineFlags=_getNextCombining(p, limit, c, c2, combineBackIndex, cc, nx);
1943 if((combineFlags&_NORM_COMBINES_BACK) && starter!=NULL) {
/* bit 15 is only set in the 0xfff0|... index that _getNextCombining() returns for Hangul/Jamo */
1944 if(combineBackIndex&0x8000) {
1945 /* c is a Jamo V/T, see if we can compose it with the previous character */
1946 /* for the PRI #29 fix, check that there is no intervening combining mark */
1947 if((options&UNORM_BEFORE_PRI_29) || prevCC==0) {
1948 pRemove=NULL; /* NULL while no Hangul composition */
1949 combineFlags=0;
1950 c2=*starter;
1951 if(combineBackIndex==0xfff2) {
1952 /* Jamo V, compose with previous Jamo L and following Jamo T */
1953 c2=(UChar)(c2-JAMO_L_BASE);
1954 if(c2<JAMO_L_COUNT) {
1955 pRemove=p-1;
1956 c=(UChar)(HANGUL_BASE+(c2*JAMO_V_COUNT+(c-JAMO_V_BASE))*JAMO_T_COUNT);
1957 if(p!=limit && (c2=(UChar)(*p-JAMO_T_BASE))<JAMO_T_COUNT) {
1958 ++p;
1959 c+=c2;
1960 } else {
1961 /* the result is an LV syllable, which is a starter (unlike LVT) */
1962 combineFlags=_NORM_COMBINES_FWD;
1963 }
1964 if(!nx_contains(nx, c)) {
1965 *starter=c;
1966 } else {
1967 /* excluded */
1968 if(!isHangulWithoutJamoT(c)) {
1969 --p; /* undo the ++p from reading the Jamo T */
1970 }
1971 /* c is modified but not used any more -- c=*(p-1); -- re-read the Jamo V/T */
1972 pRemove=NULL;
1973 }
1974 }
1975
1976 /*
1977 * Normally, the following can not occur:
1978 * Since the input is in NFD, there are no Hangul LV syllables that
1979 * a Jamo T could combine with.
1980 * All Jamo Ts are combined above when handling Jamo Vs.
1981 *
1982 * However, before the PRI #29 fix, this can occur due to
1983 * an intervening combining mark between the Hangul LV and the Jamo T.
1984 */
1985 } else {
1986 /* Jamo T, compose with previous Hangul that does not have a Jamo T */
1987 if(isHangulWithoutJamoT(c2)) {
1988 c2+=(UChar)(c-JAMO_T_BASE);
1989 if(!nx_contains(nx, c2)) {
1990 pRemove=p-1;
1991 *starter=c2;
1992 }
1993 }
1994 }
1995
1996 if(pRemove!=NULL) {
1997 /* remove the Jamo(s) */
/* close the gap [pRemove..p[ by moving the rest of the text down, then shrink limit */
1998 q=pRemove;
1999 r=p;
2000 while(r<limit) {
2001 *q++=*r++;
2002 }
2003 p=pRemove;
2004 limit=q;
2005 }
2006
2007 c2=0; /* c2 held *starter temporarily */
2008
2009 if(combineFlags!=0) {
2010 /*
2011 * not starter=NULL because the composition is a Hangul LV syllable
2012 * and might combine once more (but only before the PRI #29 fix)
2013 */
2014
2015 /* done? */
2016 if(p==limit) {
2017 return prevCC;
2018 }
2019
2020 /* the composition is a Hangul LV syllable which is a starter that combines forward */
2021 combineFwdIndex=0xfff0;
2022
2023 /* we combined; continue with looking for compositions */
2024 continue;
2025 }
2026 }
2027
2028 /*
2029 * now: cc==0 and the combining index does not include "forward" ->
2030 * the rest of the loop body will reset starter to NULL;
2031 * technically, a composed Hangul syllable is a starter, but it
2032 * does not combine forward now that we have consumed all eligible Jamos;
2033 * for Jamo V/T, combineFlags does not contain _NORM_COMBINES_FWD
2034 */
2035
2036 } else if(
2037 /* the starter is not a Hangul LV or Jamo V/T and */
2038 !(combineFwdIndex&0x8000) &&
2039 /* the combining mark is not blocked and */
2040 ((options&UNORM_BEFORE_PRI_29) ?
2041 (prevCC!=cc || prevCC==0) :
2042 (prevCC<cc || prevCC==0)) &&
2043 /* the starter and the combining mark (c, c2) do combine and */
2044 0!=(result=_combine(combiningTable+combineFwdIndex, combineBackIndex, value, value2)) &&
2045 /* the composition result is not excluded */
2046 !nx_contains(nx, value, value2)
2047 ) {
2048 /* replace the starter with the composition, remove the combining mark */
2049 pRemove= c2==0 ? p-1 : p-2; /* pointer to the combining mark */
2050
2051 /* replace the starter with the composition */
2052 *starter=(UChar)value;
2053 if(starterIsSupplementary) {
2054 if(value2!=0) {
2055 /* both are supplementary */
2056 *(starter+1)=(UChar)value2;
2057 } else {
2058 /* the composition is shorter than the starter, move the intermediate characters forward one */
2059 starterIsSupplementary=FALSE;
2060 q=starter+1;
2061 r=q+1;
2062 while(r<pRemove) {
2063 *q++=*r++;
2064 }
/* the gap to remove now begins one unit earlier */
2065 --pRemove;
2066 }
2067 } else if(value2!=0) {
2068 /* the composition is longer than the starter, move the intermediate characters back one */
2069 starterIsSupplementary=TRUE;
2070 ++starter; /* temporarily increment for the loop boundary */
2071 q=pRemove;
2072 r=++pRemove;
2073 while(starter<q) {
2074 *--r=*--q;
2075 }
2076 *starter=(UChar)value2;
2077 --starter; /* undo the temporary increment */
2078 /* } else { both are on the BMP, nothing more to do */
2079 }
2080
2081 /* remove the combining mark by moving the following text over it */
2082 if(pRemove<p) {
2083 q=pRemove;
2084 r=p;
2085 while(r<limit) {
2086 *q++=*r++;
2087 }
2088 p=pRemove;
2089 limit=q;
2090 }
2091
2092 /* keep prevCC because we removed the combining mark */
2093
2094 /* done? */
2095 if(p==limit) {
2096 return prevCC;
2097 }
2098
2099 /* is the composition a starter that combines forward? */
2100 if(result>1) {
2101 combineFwdIndex=_getCombiningIndexFromStarter((UChar)value, (UChar)value2);
2102 } else {
2103 starter=NULL;
2104 }
2105
2106 /* we combined; continue with looking for compositions */
2107 continue;
2108 }
2109 }
2110
2111 /* no combination this time */
2112 prevCC=cc;
2113 if(p==limit) {
2114 return prevCC;
2115 }
2116
2117 /* if (c, c2) did not combine, then check if it is a starter */
2118 if(cc==0) {
2119 /* found a new starter; combineFlags==0 if (c, c2) is excluded */
2120 if(combineFlags&_NORM_COMBINES_FWD) {
2121 /* it may combine with something, prepare for it */
2122 if(c2==0) {
2123 starterIsSupplementary=FALSE;
2124 starter=p-1;
2125 } else {
2126 starterIsSupplementary=TRUE;
2127 starter=p-2;
2128 }
/* for this new starter, the index returned by _getNextCombining() is used as its forward index */
2129 combineFwdIndex=combineBackIndex;
2130 } else {
2131 /* it will not combine with anything */
2132 starter=NULL;
2133 }
2134 } else if(options&_NORM_OPTIONS_COMPOSE_CONTIGUOUS) {
2135 /* FCC: no discontiguous compositions; any intervening character blocks */
2136 starter=NULL;
2137 }
2138 }
2139 }
2140
2141 /* decompose and recompose [prevStarter..src[ */
2142 static const UChar *
2143 _composePart(UChar *stackBuffer, UChar *&buffer, int32_t &bufferCapacity, int32_t &length,
2144 const UChar *prevStarter, const UChar *src,
2145 uint8_t &prevCC,
2146 int32_t options, const UnicodeSet *nx,
2147 UErrorCode *pErrorCode) {
2148 UChar *recomposeLimit;
2149 uint8_t trailCC;
2150 UBool compat;
2151
2152 compat=(UBool)((options&_NORM_OPTIONS_COMPAT)!=0);
2153
2154 /* decompose [prevStarter..src[ */
2155 length=_decompose(buffer, bufferCapacity,
2156 prevStarter, src-prevStarter,
2157 compat, nx,
2158 trailCC);
2159 if(length>bufferCapacity) {
2160 if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*length, 0)) {
2161 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
2162 return NULL;
2163 }
2164 length=_decompose(buffer, bufferCapacity,
2165 prevStarter, src-prevStarter,
2166 compat, nx,
2167 trailCC);
2168 }
2169
2170 /* recompose the decomposition */
2171 recomposeLimit=buffer+length;
2172 if(length>=2) {
2173 prevCC=_recompose(buffer, recomposeLimit, options, nx);
2174 }
2175
2176 /* return with a pointer to the recomposition and its length */
2177 length=recomposeLimit-buffer;
2178 return buffer;
2179 }
2180
/*
 * Compose src into dest (NFC, or NFKC if _NORM_OPTIONS_COMPAT is set in options).
 *
 * srcLength>=0 gives the number of source UChars; otherwise src is read up to
 * a terminating NUL.
 * nx is consulted via nx_contains() for code points that must be treated as
 * having norm32==0; it is also forwarded to _composeHangul() and _composePart().
 *
 * Runs of code units that pass the quick check are copied with one memcpy;
 * a character with a "no" or "maybe" quick check flag causes the text between
 * the surrounding true starters to be decomposed and recomposed through
 * _composePart() in a side buffer.
 *
 * Returns the number of UChars of the complete result. Pieces that would not
 * fit into destCapacity are counted but not written (preflighting).
 * Returns 0 when _composePart() returned NULL (it sets *pErrorCode).
 */
2181 static int32_t
2182 _compose(UChar *dest, int32_t destCapacity,
2183 const UChar *src, int32_t srcLength,
2184 int32_t options, const UnicodeSet *nx,
2185 UErrorCode *pErrorCode) {
2186 UChar stackBuffer[_STACK_BUFFER_CAPACITY];
2187 UChar *buffer;
2188 int32_t bufferCapacity;
2189
2190 const UChar *limit, *prevSrc, *prevStarter;
2191 uint32_t norm32, ccOrQCMask, qcMask;
2192 int32_t destIndex, reorderStartIndex, length;
2193 UChar c, c2, minNoMaybe;
2194 uint8_t cc, prevCC;
2195
/* select the quick check threshold and flag mask for NFKC or NFC */
2196 if(options&_NORM_OPTIONS_COMPAT) {
2197 minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
2198 qcMask=_NORM_QC_NFKC;
2199 } else {
2200 minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
2201 qcMask=_NORM_QC_NFC;
2202 }
2203
2204 /* initialize */
2205 buffer=stackBuffer;
2206 bufferCapacity=_STACK_BUFFER_CAPACITY;
2207
2208 /*
2209 * prevStarter points to the last character before the current one
2210 * that is a "true" starter with cc==0 and quick check "yes".
2211 *
2212 * prevStarter will be used instead of looking for a true starter
2213 * while incrementally decomposing [prevStarter..prevSrc[
2214 * in _composePart(). Having a good prevStarter allows us to just decompose
2215 * the entire [prevStarter..prevSrc[.
2216 *
2217 * When _composePart() backs out from prevSrc back to prevStarter,
2218 * then it also backs out destIndex by the same amount.
2219 * Therefore, at all times, the (prevSrc-prevStarter) source units
2220 * must correspond 1:1 to destination units counted with destIndex,
2221 * except for reordering.
2222 * This is true for the qc "yes" characters copied in the fast loop,
2223 * and for pure reordering.
2224 * prevStarter must be set forward to src when this is not true:
2225 * In _composePart() and after composing a Hangul syllable.
2226 *
2227 * This mechanism relies on the assumption that the decomposition of a true starter
2228 * also begins with a true starter. gennorm/store.c checks for this.
2229 */
2230 prevStarter=src;
2231
2232 ccOrQCMask=_NORM_CC_MASK|qcMask;
2233 destIndex=reorderStartIndex=0;
2234 prevCC=0;
2235
2236 /* avoid compiler warnings */
2237 norm32=0;
2238 c=0;
2239
2240 if(srcLength>=0) {
2241 /* string with length */
2242 limit=src+srcLength;
2243 } else /* srcLength==-1 */ {
2244 /* zero-terminated string */
2245 limit=NULL;
2246 }
2247
2248 U_ALIGN_CODE(16);
2249
2250 for(;;) {
2251 /* count code units below the minimum or with irrelevant data for the quick check */
2252 prevSrc=src;
2253 if(limit==NULL) {
2254 while((c=*src)<minNoMaybe ? c!=0 : ((norm32=_getNorm32(c))&ccOrQCMask)==0) {
2255 prevCC=0;
2256 ++src;
2257 }
2258 } else {
2259 while(src!=limit && ((c=*src)<minNoMaybe || ((norm32=_getNorm32(c))&ccOrQCMask)==0)) {
2260 prevCC=0;
2261 ++src;
2262 }
2263 }
2264
2265 /* copy these code units all at once */
2266 if(src!=prevSrc) {
2267 length=(int32_t)(src-prevSrc);
2268 if((destIndex+length)<=destCapacity) {
2269 uprv_memcpy(dest+destIndex, prevSrc, length*U_SIZEOF_UCHAR);
2270 }
2271 destIndex+=length;
2272 reorderStartIndex=destIndex;
2273
2274 /* set prevStarter to the last character in the quick check loop */
2275 prevStarter=src-1;
/* if that last code unit is the trail of a surrogate pair, point to the pair's lead instead */
2276 if(UTF_IS_SECOND_SURROGATE(*prevStarter) && prevSrc<prevStarter && UTF_IS_FIRST_SURROGATE(*(prevStarter-1))) {
2277 --prevStarter;
2278 }
2279
2280 prevSrc=src;
2281 }
2282
2283 /* end of source reached? */
2284 if(limit==NULL ? c==0 : src==limit) {
2285 break;
2286 }
2287
2288 /* c already contains *src and norm32 is set for it, increment src */
2289 ++src;
2290
2291 /*
2292 * source buffer pointers:
2293 *
2294 * all done quick check current char not yet
2295 * "yes" but (c, c2) processed
2296 * may combine
2297 * forward
2298 * [-------------[-------------[-------------[-------------[
2299 * | | | | |
2300 * start prevStarter prevSrc src limit
2301 *
2302 *
2303 * destination buffer pointers and indexes:
2304 *
2305 * all done might take not filled yet
2306 * characters for
2307 * reordering
2308 * [-------------[-------------[-------------[
2309 * | | | |
2310 * dest reorderStartIndex destIndex destCapacity
2311 */
2312
2313 /* check one above-minimum, relevant code unit */
2314 /*
2315 * norm32 is for c=*(src-1), and the quick check flag is "no" or "maybe", and/or cc!=0
2316 * check for Jamo V/T, then for surrogates and regular characters
2317 * c is not a Hangul syllable or Jamo L because
2318 * they are not marked with no/maybe for NFC & NFKC (and their cc==0)
2319 */
2320 if(isNorm32HangulOrJamo(norm32)) {
2321 /*
2322 * c is a Jamo V/T:
2323 * try to compose with the previous character, Jamo V also with a following Jamo T,
2324 * and set values here right now in case we just continue with the main loop
2325 */
2326 prevCC=cc=0;
2327 reorderStartIndex=destIndex;
2328
/* the last argument before nx is the destination slot of the previous character, or 0 once dest has overflowed */
2329 if(
2330 destIndex>0 &&
2331 _composeHangul(
2332 *(prevSrc-1), c, norm32, src, limit, (UBool)((options&_NORM_OPTIONS_COMPAT)!=0),
2333 destIndex<=destCapacity ? dest+(destIndex-1) : 0,
2334 nx)
2335 ) {
/* source and destination units no longer correspond 1:1, so restart the starter tracking at src */
2336 prevStarter=src;
2337 continue;
2338 }
2339
2340 /* the Jamo V/T did not compose into a Hangul syllable, just append to dest */
2341 c2=0;
2342 length=1;
2343 prevStarter=prevSrc;
2344 } else {
2345 if(isNorm32Regular(norm32)) {
2346 c2=0;
2347 length=1;
2348 } else {
2349 /* c is a lead surrogate, get the real norm32 */
2350 if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
2351 ++src;
2352 length=2;
2353 norm32=_getNorm32FromSurrogatePair(norm32, c2);
2354 } else {
2355 /* c is an unpaired lead surrogate, nothing to do */
2356 c2=0;
2357 length=1;
2358 norm32=0;
2359 }
2360 }
2361
2362 /* we are looking at the character (c, c2) at [prevSrc..src[ */
2363 if(nx_contains(nx, c, c2)) {
2364 /* excluded: norm32==0 */
2365 cc=0;
2366 } else if((norm32&qcMask)==0) {
/* quick check "yes": only the combining class matters, the character is appended below */
2367 cc=(uint8_t)(norm32>>_NORM_CC_SHIFT);
2368 } else {
2369 const UChar *p;
2370 uint32_t decompQCMask;
2371
2372 /*
2373 * find appropriate boundaries around this character,
2374 * decompose the source text from between the boundaries,
2375 * and recompose it
2376 *
2377 * this puts the intermediate text into the side buffer because
2378 * it might be longer than the recomposition end result,
2379 * or the destination buffer may be too short or missing
2380 *
2381 * note that destIndex may be adjusted backwards to account
2382 * for source text that passed the quick check but needed to
2383 * take part in the recomposition
2384 */
2385 decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */
2386
2387 /*
2388 * find the last true starter in [prevStarter..src[
2389 * it is either the decomposition of the current character (at prevSrc),
2390 * or prevStarter
2391 */
2392 if(_isTrueStarter(norm32, ccOrQCMask, decompQCMask)) {
2393 prevStarter=prevSrc;
2394 } else {
2395 /* adjust destIndex: back out what had been copied with qc "yes" */
2396 destIndex-=(int32_t)(prevSrc-prevStarter);
2397 }
2398
2399 /* find the next true starter in [src..limit[ - modifies src to point to the next starter */
2400 src=_findNextStarter(src, limit, qcMask, decompQCMask, minNoMaybe);
2401
2402 /* compose [prevStarter..src[ */
2403 p=_composePart(stackBuffer, buffer, bufferCapacity,
2404 length, /* output */
2405 prevStarter, src,
2406 prevCC, /* output */
2407 options, nx,
2408 pErrorCode);
2409
2410 if(p==NULL) {
2411 destIndex=0; /* an error occurred (out of memory) */
2412 break;
2413 }
2414
2415 /* append the recomposed buffer contents to the destination buffer */
2416 if((destIndex+length)<=destCapacity) {
2417 while(length>0) {
2418 dest[destIndex++]=*p++;
2419 --length;
2420 }
2421 } else {
2422 /* buffer overflow */
2423 /* keep incrementing the destIndex for preflighting */
2424 destIndex+=length;
2425 }
2426
2427 /* set the next starter */
2428 prevStarter=src;
2429
2430 continue;
2431 }
2432 }
2433
2434 /* append the single code point (c, c2) to the destination buffer */
2435 if((destIndex+length)<=destCapacity) {
2436 if(cc!=0 && cc<prevCC) {
2437 /* (c, c2) is out of order with respect to the preceding text */
2438 UChar *reorderSplit=dest+destIndex;
2439 destIndex+=length;
2440 prevCC=_insertOrdered(dest+reorderStartIndex, reorderSplit, dest+destIndex, c, c2, cc);
2441 } else {
2442 /* just append (c, c2) */
2443 dest[destIndex++]=c;
2444 if(c2!=0) {
2445 dest[destIndex++]=c2;
2446 }
2447 prevCC=cc;
2448 }
2449 } else {
2450 /* buffer overflow */
2451 /* keep incrementing the destIndex for preflighting */
2452 destIndex+=length;
2453 prevCC=cc;
2454 }
2455 }
2456
2457 /* cleanup */
/* the side buffer is heap memory only if it no longer points at stackBuffer */
2458 if(buffer!=stackBuffer) {
2459 uprv_free(buffer);
2460 }
2461
2462 return destIndex;
2463 }
2464
2465 U_CAPI int32_t U_EXPORT2
2466 unorm_compose(UChar *dest, int32_t destCapacity,
2467 const UChar *src, int32_t srcLength,
2468 UBool compat, int32_t options,
2469 UErrorCode *pErrorCode) {
2470 const UnicodeSet *nx;
2471 int32_t destIndex;
2472
2473 if(!_haveData(*pErrorCode)) {
2474 return 0;
2475 }
2476
2477 nx=getNX(options, *pErrorCode);
2478 if(U_FAILURE(*pErrorCode)) {
2479 return 0;
2480 }
2481
2482 /* reset options bits that should only be set here or inside _compose() */
2483 options&=~(_NORM_OPTIONS_SETS_MASK|_NORM_OPTIONS_COMPAT|_NORM_OPTIONS_COMPOSE_CONTIGUOUS);
2484
2485 if(compat) {
2486 options|=_NORM_OPTIONS_COMPAT;
2487 }
2488
2489 destIndex=_compose(dest, destCapacity,
2490 src, srcLength,
2491 options, nx,
2492 pErrorCode);
2493
2494 return u_terminateUChars(dest, destCapacity, destIndex, pErrorCode);
2495 }
2496
2497 /* make FCD ----------------------------------------------------------------- */
2498
2499 static const UChar *
2500 _findSafeFCD(const UChar *src, const UChar *limit, uint16_t fcd16) {
2501 UChar c, c2;
2502
2503 /*
2504 * find the first position in [src..limit[ after some cc==0 according to FCD data
2505 *
2506 * at the beginning of the loop, we have fcd16 from before src
2507 *
2508 * stop at positions:
2509 * - after trail cc==0
2510 * - at the end of the source
2511 * - before lead cc==0
2512 */
2513 for(;;) {
2514 /* stop if trail cc==0 for the previous character */
2515 if((fcd16&0xff)==0) {
2516 break;
2517 }
2518
2519 /* get c=*src - stop at end of string */
2520 if(src==limit) {
2521 break;
2522 }
2523 c=*src;
2524
2525 /* stop if lead cc==0 for this character */
2526 if(c<_NORM_MIN_WITH_LEAD_CC || (fcd16=_getFCD16(c))==0) {
2527 break; /* catches terminating NUL, too */
2528 }
2529
2530 if(!UTF_IS_FIRST_SURROGATE(c)) {
2531 if(fcd16<=0xff) {
2532 break;
2533 }
2534 ++src;
2535 } else if((src+1)!=limit && (c2=*(src+1), UTF_IS_SECOND_SURROGATE(c2))) {
2536 /* c is a lead surrogate, get the real fcd16 */
2537 fcd16=_getFCD16FromSurrogatePair(fcd16, c2);
2538 if(fcd16<=0xff) {
2539 break;
2540 }
2541 src+=2;
2542 } else {
2543 /* c is an unpaired first surrogate, lead cc==0 */
2544 break;
2545 }
2546 }
2547
2548 return src;
2549 }
2550
/*
 * Canonically decompose [src..decompLimit[ and append the result to dest,
 * starting at destIndex, while keeping combining marks in order of their
 * combining classes.
 *
 * destIndex is advanced by the length of every appended piece, also when the
 * piece does not fit into destCapacity (then it is only counted, not written).
 * Code points for which nx_contains() is true are appended unchanged and
 * treated as cc==0.
 *
 * Returns the trail combining class of the last piece that was processed,
 * or 0 if the range is empty.
 */
2551 static uint8_t
2552 _decomposeFCD(const UChar *src, const UChar *decompLimit,
2553 UChar *dest, int32_t &destIndex, int32_t destCapacity,
2554 const UnicodeSet *nx) {
2555 const UChar *p;
2556 uint32_t norm32;
2557 int32_t reorderStartIndex, length;
2558 UChar c, c2;
2559 uint8_t cc, prevCC, trailCC;
2560
2561 /*
2562 * canonically decompose [src..decompLimit[
2563 *
2564 * all characters in this range have some non-zero cc,
2565 * directly or in decomposition,
2566 * so that we do not need to check in the following for quick-check limits etc.
2567 *
2568 * there _are_ _no_ Hangul syllables or Jamos in here because they are FCD-safe (cc==0)!
2569 *
2570 * we also do not need to check for c==0 because we have an established decompLimit
2571 */
2572 reorderStartIndex=destIndex;
2573 prevCC=0;
2574
2575 while(src<decompLimit) {
2576 c=*src++;
2577 norm32=_getNorm32(c);
2578 if(isNorm32Regular(norm32)) {
2579 c2=0;
2580 length=1;
2581 } else {
2582 /*
2583 * reminder: this function is called with [src..decompLimit[
2584 * not containing any Hangul/Jamo characters,
2585 * therefore the only specials are lead surrogates
2586 */
2587 /* c is a lead surrogate, get the real norm32 */
2588 if(src!=decompLimit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
2589 ++src;
2590 length=2;
2591 norm32=_getNorm32FromSurrogatePair(norm32, c2);
2592 } else {
/* unpaired lead surrogate: handle it as a single code unit without data */
2593 c2=0;
2594 length=1;
2595 norm32=0;
2596 }
2597 }
2598
2599 /* get the decomposition and the lead and trail cc's */
/* p==NULL from here on means: append the single code point (c, c2) */
2600 if(nx_contains(nx, c, c2)) {
2601 /* excluded: norm32==0 */
2602 cc=trailCC=0;
2603 p=NULL;
2604 } else if((norm32&_NORM_QC_NFD)==0) {
2605 /* c does not decompose */
2606 cc=trailCC=(uint8_t)(norm32>>_NORM_CC_SHIFT);
2607 p=NULL;
2608 } else {
2609 /* c decomposes, get everything from the variable-length extra data */
2610 p=_decompose(norm32, length, cc, trailCC);
2611 if(length==1) {
2612 /* fastpath a single code unit from decomposition */
2613 c=*p;
2614 c2=0;
2615 p=NULL;
2616 }
2617 }
2618
2619 /* append the decomposition to the destination buffer, assume length>0 */
2620 if((destIndex+length)<=destCapacity) {
2621 UChar *reorderSplit=dest+destIndex;
2622 if(p==NULL) {
2623 /* fastpath: single code point */
2624 if(cc!=0 && cc<prevCC) {
2625 /* (c, c2) is out of order with respect to the preceding text */
2626 destIndex+=length;
2627 trailCC=_insertOrdered(dest+reorderStartIndex, reorderSplit, dest+destIndex, c, c2, cc);
2628 } else {
2629 /* just append (c, c2) */
2630 dest[destIndex++]=c;
2631 if(c2!=0) {
2632 dest[destIndex++]=c2;
2633 }
2634 }
2635 } else {
2636 /* general: multiple code points (ordered by themselves) from decomposition */
2637 if(cc!=0 && cc<prevCC) {
2638 /* the decomposition is out of order with respect to the preceding text */
2639 destIndex+=length;
2640 trailCC=_mergeOrdered(dest+reorderStartIndex, reorderSplit, p, p+length);
2641 } else {
2642 /* just append the decomposition */
2643 do {
2644 dest[destIndex++]=*p++;
2645 } while(--length>0);
2646 }
2647 }
2648 } else {
2649 /* buffer overflow */
2650 /* keep incrementing the destIndex for preflighting */
2651 destIndex+=length;
2652 }
2653
/* a trail cc of 0 ends the reorderable run; later marks are not moved in front of this point */
2654 prevCC=trailCC;
2655 if(prevCC==0) {
2656 reorderStartIndex=destIndex;
2657 }
2658 }
2659
2660 return prevCC;
2661 }
2662
/*
 * Copy src to dest and make the result FCD: text whose combining classes are
 * already in order according to the FCD data is copied as is, and only pieces
 * where a lead cc is lower than the preceding trail cc are decomposed and
 * reordered via _decomposeFCD().
 *
 * srcLength>=0 gives the number of source UChars; otherwise src is read up to
 * a terminating NUL.
 * Characters for which nx_contains() is true are treated as fcd16==0.
 *
 * Returns 0 if _haveData() fails; otherwise the result of u_terminateUChars()
 * for the total output length, which keeps counting past destCapacity.
 */
2663 static int32_t
2664 unorm_makeFCD(UChar *dest, int32_t destCapacity,
2665 const UChar *src, int32_t srcLength,
2666 const UnicodeSet *nx,
2667 UErrorCode *pErrorCode) {
2668 const UChar *limit, *prevSrc, *decompStart;
2669 int32_t destIndex, length;
2670 UChar c, c2;
2671 uint16_t fcd16;
2672 int16_t prevCC, cc;
2673
2674 if(!_haveData(*pErrorCode)) {
2675 return 0;
2676 }
2677
2678 /* initialize */
2679 decompStart=src;
2680 destIndex=0;
2681 prevCC=0;
2682
2683 /* avoid compiler warnings */
2684 c=0;
2685 fcd16=0;
2686
2687 if(srcLength>=0) {
2688 /* string with length */
2689 limit=src+srcLength;
2690 } else /* srcLength==-1 */ {
2691 /* zero-terminated string */
2692 limit=NULL;
2693 }
2694
2695 U_ALIGN_CODE(16);
2696
2697 for(;;) {
2698 /* skip a run of code units below the minimum or with irrelevant data for the FCD check */
2699 prevSrc=src;
2700 if(limit==NULL) {
2701 for(;;) {
2702 c=*src;
2703 if(c<_NORM_MIN_WITH_LEAD_CC) {
2704 if(c==0) {
2705 break;
2706 }
/* remember the code unit as a negative value; its FCD lookup is deferred (see below) */
2707 prevCC=(int16_t)-c;
2708 } else if((fcd16=_getFCD16(c))==0) {
2709 prevCC=0;
2710 } else {
2711 break;
2712 }
2713 ++src;
2714 }
2715 } else {
2716 for(;;) {
2717 if(src==limit) {
2718 break;
2719 } else if((c=*src)<_NORM_MIN_WITH_LEAD_CC) {
2720 prevCC=(int16_t)-c;
2721 } else if((fcd16=_getFCD16(c))==0) {
2722 prevCC=0;
2723 } else {
2724 break;
2725 }
2726 ++src;
2727 }
2728 }
2729
2730 /*
2731 * prevCC has values from the following ranges:
2732 * 0..0xff - the previous trail combining class
2733 * <0 - the negative value of the previous code unit;
2734 * that code unit was <_NORM_MIN_WITH_LEAD_CC and its _getFCD16()
2735 * was deferred so that average text is checked faster
2736 */
2737
2738 /* copy these code units all at once */
2739 if(src!=prevSrc) {
2740 length=(int32_t)(src-prevSrc);
2741 if((destIndex+length)<=destCapacity) {
2742 uprv_memcpy(dest+destIndex, prevSrc, length*U_SIZEOF_UCHAR);
2743 }
2744 destIndex+=length;
2745 prevSrc=src;
2746
2747 /* prevCC<0 is only possible from the above loop, i.e., only if prevSrc<src */
2748 if(prevCC<0) {
2749 /* the previous character was <_NORM_MIN_WITH_LEAD_CC, we need to get its trail cc */
2750 if(!nx_contains(nx, (UChar32)-prevCC)) {
2751 prevCC=(int16_t)(_getFCD16((UChar)-prevCC)&0xff);
2752 } else {
2753 prevCC=0; /* excluded: fcd16==0 */
2754 }
2755
2756 /*
2757 * set a pointer to this below-U+0300 character;
2758 * if prevCC==0 then it will be moved to after this character below
2759 */
2760 decompStart=prevSrc-1;
2761 }
2762 }
2763 /*
2764 * now:
2765 * prevSrc==src - used later to adjust destIndex before decomposition
2766 * prevCC>=0
2767 */
2768
2769 /* end of source reached? */
2770 if(limit==NULL ? c==0 : src==limit) {
2771 break;
2772 }
2773
2774 /* set a pointer to after the last source position where prevCC==0 */
2775 if(prevCC==0) {
2776 decompStart=prevSrc;
2777 }
2778
2779 /* c already contains *src and fcd16 is set for it, increment src */
2780 ++src;
2781
2782 /* check one above-minimum, relevant code unit */
2783 if(UTF_IS_FIRST_SURROGATE(c)) {
2784 /* c is a lead surrogate, get the real fcd16 */
2785 if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
2786 ++src;
2787 fcd16=_getFCD16FromSurrogatePair(fcd16, c2);
2788 } else {
/* unpaired lead surrogate: no FCD data applies */
2789 c2=0;
2790 fcd16=0;
2791 }
2792 } else {
2793 c2=0;
2794 }
2795
2796 /* we are looking at the character (c, c2) at [prevSrc..src[ */
2797 if(nx_contains(nx, c, c2)) {
2798 fcd16=0; /* excluded: fcd16==0 */
2799 }
2800
2801 /* check the combining order, get the lead cc */
2802 cc=(int16_t)(fcd16>>8);
2803 if(cc==0 || cc>=prevCC) {
2804 /* the order is ok */
2805 if(cc==0) {
2806 decompStart=prevSrc;
2807 }
2808 prevCC=(int16_t)(fcd16&0xff);
2809
2810 /* just append (c, c2) */
2811 length= c2==0 ? 1 : 2;
2812 if((destIndex+length)<=destCapacity) {
2813 dest[destIndex++]=c;
2814 if(c2!=0) {
2815 dest[destIndex++]=c2;
2816 }
2817 } else {
/* does not fit: only count the length for preflighting */
2818 destIndex+=length;
2819 }
2820 } else {
2821 /*
2822 * back out the part of the source that we copied already but
2823 * is now going to be decomposed;
2824 * prevSrc is set to after what was copied
2825 */
2826 destIndex-=(int32_t)(prevSrc-decompStart);
2827
2828 /*
2829 * find the part of the source that needs to be decomposed;
2830 * to be safe and simple, decompose to before the next character with lead cc==0
2831 */
2832 src=_findSafeFCD(src, limit, fcd16);
2833
2834 /*
2835 * the source text does not fulfill the conditions for FCD;
2836 * decompose and reorder a limited piece of the text
2837 */
2838 prevCC=_decomposeFCD(decompStart, src,
2839 dest, destIndex, destCapacity,
2840 nx);
2841 decompStart=src;
2842 }
2843 }
2844
2845 return u_terminateUChars(dest, destCapacity, destIndex, pErrorCode);
2846 }
2847
2848 /* quick check functions ---------------------------------------------------- */
2849
/*
 * Check whether src is in FCD form according to the FCD data.
 *
 * srcLength>=0 gives the number of source UChars; otherwise src is read up to
 * a terminating NUL.
 * Characters for which nx_contains() is true are treated as fcd16==0.
 *
 * Returns FALSE as soon as a character is found whose lead combining class is
 * non-zero and lower than the trail combining class of the preceding
 * character; returns TRUE when the end of the text is reached.
 */
2850 static UBool
2851 unorm_checkFCD(const UChar *src, int32_t srcLength, const UnicodeSet *nx) {
2852 const UChar *limit;
2853 UChar c, c2;
2854 uint16_t fcd16;
2855 int16_t prevCC, cc;
2856
2857 /* initialize */
2858 prevCC=0;
2859
2860 if(srcLength>=0) {
2861 /* string with length */
2862 limit=src+srcLength;
2863 } else /* srcLength==-1 */ {
2864 /* zero-terminated string */
2865 limit=NULL;
2866 }
2867
2868 U_ALIGN_CODE(16);
2869
2870 for(;;) {
2871 /* skip a run of code units below the minimum or with irrelevant data for the FCD check */
2872 if(limit==NULL) {
2873 for(;;) {
2874 c=*src++;
2875 if(c<_NORM_MIN_WITH_LEAD_CC) {
2876 if(c==0) {
2877 return TRUE;
2878 }
2879 /*
2880 * delay _getFCD16(c) for any character <_NORM_MIN_WITH_LEAD_CC
2881 * because chances are good that the next one will have
2882 * a leading cc of 0;
2883 * _getFCD16(-prevCC) is later called when necessary -
2884 * -c fits into int16_t because it is <_NORM_MIN_WITH_LEAD_CC==0x300
2885 */
2886 prevCC=(int16_t)-c;
2887 } else if((fcd16=_getFCD16(c))==0) {
2888 prevCC=0;
2889 } else {
2890 break;
2891 }
2892 }
2893 } else {
2894 for(;;) {
2895 if(src==limit) {
2896 return TRUE;
2897 } else if((c=*src++)<_NORM_MIN_WITH_LEAD_CC) {
/* same deferred lookup as in the NUL-terminated loop above */
2898 prevCC=(int16_t)-c;
2899 } else if((fcd16=_getFCD16(c))==0) {
2900 prevCC=0;
2901 } else {
2902 break;
2903 }
2904 }
2905 }
2906
2907 /* check one above-minimum, relevant code unit */
2908 if(UTF_IS_FIRST_SURROGATE(c)) {
2909 /* c is a lead surrogate, get the real fcd16 */
2910 if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
2911 ++src;
2912 fcd16=_getFCD16FromSurrogatePair(fcd16, c2);
2913 } else {
/* unpaired lead surrogate: no FCD data applies */
2914 c2=0;
2915 fcd16=0;
2916 }
2917 } else {
2918 c2=0;
2919 }
2920
2921 if(nx_contains(nx, c, c2)) {
2922 prevCC=0; /* excluded: fcd16==0 */
2923 continue;
2924 }
2925
2926 /*
2927 * prevCC has values from the following ranges:
2928 * 0..0xff - the previous trail combining class
2929 * <0 - the negative value of the previous code unit;
2930 * that code unit was <_NORM_MIN_WITH_LEAD_CC and its _getFCD16()
2931 * was deferred so that average text is checked faster
2932 */
2933
2934 /* check the combining order */
2935 cc=(int16_t)(fcd16>>8);
2936 if(cc!=0) {
2937 if(prevCC<0) {
2938 /* the previous character was <_NORM_MIN_WITH_LEAD_CC, we need to get its trail cc */
2939 if(!nx_contains(nx, (UChar32)-prevCC)) {
2940 prevCC=(int16_t)(_getFCD16((UChar)-prevCC)&0xff);
2941 } else {
2942 prevCC=0; /* excluded: fcd16==0 */
2943 }
2944 }
2945
2946 if(cc<prevCC) {
2947 return FALSE;
2948 }
2949 }
/* the low byte of fcd16 is the trail cc that the next character is compared against */
2950 prevCC=(int16_t)(fcd16&0xff);
2951 }
2952 }
2953
/*
 * Common implementation of the quick check and isNormalized functions.
 *
 * src/srcLength: the text to check; srcLength==-1 means NUL-terminated.
 * mode: UNORM_NFC, UNORM_NFKC, UNORM_NFD, UNORM_NFKD or UNORM_FCD;
 *       UNORM_FCD is delegated to unorm_checkFCD(), any other value sets
 *       U_ILLEGAL_ARGUMENT_ERROR.
 * allowMaybe: if TRUE, a character with a "maybe" quick check flag turns the
 *       result into UNORM_MAYBE and scanning continues; if FALSE, the text
 *       between the surrounding starters is decomposed and recomposed with
 *       _composePart() and compared with the original to get a definite answer.
 * nx: characters for which nx_contains() is true are treated as norm32==0.
 *
 * Returns UNORM_YES, UNORM_NO or UNORM_MAYBE; UNORM_MAYBE is also returned
 * for a NULL or failing pErrorCode, for bad arguments, when _haveData() fails
 * and when _composePart() reports an error.
 */
2954 static UNormalizationCheckResult
2955 _quickCheck(const UChar *src,
2956 int32_t srcLength,
2957 UNormalizationMode mode,
2958 UBool allowMaybe,
2959 const UnicodeSet *nx,
2960 UErrorCode *pErrorCode) {
2961 UChar stackBuffer[_STACK_BUFFER_CAPACITY];
2962 UChar *buffer;
2963 int32_t bufferCapacity;
2964
2965 const UChar *start, *limit;
2966 uint32_t norm32, qcNorm32, ccOrQCMask, qcMask;
2967 int32_t options;
2968 UChar c, c2, minNoMaybe;
2969 uint8_t cc, prevCC;
2970 UNormalizationCheckResult result;
2971
2972 /* check arguments */
2973 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
2974 return UNORM_MAYBE;
2975 }
2976
2977 if(src==NULL || srcLength<-1) {
2978 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
2979 return UNORM_MAYBE;
2980 }
2981
2982 if(!_haveData(*pErrorCode)) {
2983 return UNORM_MAYBE;
2984 }
2985
2986 /* check for a valid mode and set the quick check minimum and mask */
2987 switch(mode) {
2988 case UNORM_NFC:
2989 minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
2990 qcMask=_NORM_QC_NFC;
2991 options=0;
2992 break;
2993 case UNORM_NFKC:
2994 minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
2995 qcMask=_NORM_QC_NFKC;
2996 options=_NORM_OPTIONS_COMPAT;
2997 break;
2998 case UNORM_NFD:
2999 minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE];
3000 qcMask=_NORM_QC_NFD;
3001 options=0;
3002 break;
3003 case UNORM_NFKD:
3004 minNoMaybe=(UChar)indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE];
3005 qcMask=_NORM_QC_NFKD;
3006 options=_NORM_OPTIONS_COMPAT;
3007 break;
3008 case UNORM_FCD:
/* FCD has only yes/no answers; this returns before the side buffer is set up */
3009 return unorm_checkFCD(src, srcLength, nx) ? UNORM_YES : UNORM_NO;
3010 default:
3011 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3012 return UNORM_MAYBE;
3013 }
3014
3015 /* initialize */
3016 buffer=stackBuffer;
3017 bufferCapacity=_STACK_BUFFER_CAPACITY;
3018
3019 ccOrQCMask=_NORM_CC_MASK|qcMask;
3020 result=UNORM_YES;
3021 prevCC=0;
3022
3023 start=src;
3024 if(srcLength>=0) {
3025 /* string with length */
3026 limit=src+srcLength;
3027 } else /* srcLength==-1 */ {
3028 /* zero-terminated string */
3029 limit=NULL;
3030 }
3031
3032 U_ALIGN_CODE(16);
3033
3034 for(;;) {
3035 /* skip a run of code units below the minimum or with irrelevant data for the quick check */
3036 if(limit==NULL) {
3037 for(;;) {
3038 c=*src++;
3039 if(c<minNoMaybe) {
3040 if(c==0) {
3041 goto endloop; /* break out of outer loop */
3042 }
3043 } else if(((norm32=_getNorm32(c))&ccOrQCMask)!=0) {
3044 break;
3045 }
3046 prevCC=0;
3047 }
3048 } else {
3049 for(;;) {
3050 if(src==limit) {
3051 goto endloop; /* break out of outer loop */
3052 } else if((c=*src++)>=minNoMaybe && ((norm32=_getNorm32(c))&ccOrQCMask)!=0) {
3053 break;
3054 }
3055 prevCC=0;
3056 }
3057 }
3058
3059 /* check one above-minimum, relevant code unit */
3060 if(isNorm32LeadSurrogate(norm32)) {
3061 /* c is a lead surrogate, get the real norm32 */
3062 if(src!=limit && UTF_IS_SECOND_SURROGATE(c2=*src)) {
3063 ++src;
3064 norm32=_getNorm32FromSurrogatePair(norm32, c2);
3065 } else {
/* unpaired lead surrogate: no normalization data applies */
3066 c2=0;
3067 norm32=0;
3068 }
3069 } else {
3070 c2=0;
3071 }
3072
3073 if(nx_contains(nx, c, c2)) {
3074 /* excluded: norm32==0 */
3075 norm32=0;
3076 }
3077
3078 /* check the combining order */
3079 cc=(uint8_t)(norm32>>_NORM_CC_SHIFT);
3080 if(cc!=0 && cc<prevCC) {
3081 result=UNORM_NO;
3082 break;
3083 }
3084 prevCC=cc;
3085
3086 /* check for "no" or "maybe" quick check flags */
3087 qcNorm32=norm32&qcMask;
3088 if(qcNorm32&_NORM_QC_ANY_NO) {
3089 result=UNORM_NO;
3090 break;
3091 } else if(qcNorm32!=0) {
3092 /* "maybe" can only occur for NFC and NFKC */
3093 if(allowMaybe) {
/* keep scanning: a later character can still turn the result into UNORM_NO */
3094 result=UNORM_MAYBE;
3095 } else {
3096 /* normalize a section around here to see if it is really normalized or not */
3097 const UChar *prevStarter;
3098 uint32_t decompQCMask;
3099 int32_t length;
3100
3101 decompQCMask=(qcMask<<2)&0xf; /* decomposition quick check mask */
3102
3103 /* find the previous starter */
3104 prevStarter=src-1; /* set prevStarter to the beginning of the current character */
3105 if(UTF_IS_TRAIL(*prevStarter)) {
3106 --prevStarter; /* safe because unpaired surrogates do not result in "maybe" */
3107 }
3108 prevStarter=_findPreviousStarter(start, prevStarter, ccOrQCMask, decompQCMask, minNoMaybe);
3109
3110 /* find the next true starter in [src..limit[ - modifies src to point to the next starter */
3111 src=_findNextStarter(src, limit, qcMask, decompQCMask, minNoMaybe);
3112
3113 /* decompose and recompose [prevStarter..src[ */
3114 _composePart(stackBuffer, buffer, bufferCapacity,
3115 length,
3116 prevStarter,
3117 src,
3118 prevCC,
3119 options, nx, pErrorCode);
3120 if(U_FAILURE(*pErrorCode)) {
3121 result=UNORM_MAYBE; /* error (out of memory) */
3122 break;
3123 }
3124
3125 /* compare the normalized version with the original */
3126 if(0!=uprv_strCompare(prevStarter, (int32_t)(src-prevStarter), buffer, length, FALSE, FALSE)) {
3127 result=UNORM_NO; /* normalization differs */
3128 break;
3129 }
3130
3131 /* continue after the next starter */
3132 }
3133 }
3134 }
3135 endloop:
3136
/* release the side buffer if _composePart() had to move it to the heap */
3137 if(buffer!=stackBuffer) {
3138 uprv_free(buffer);
3139 }
3140
3141 return result;
3142 }
3143
3144 U_CAPI UNormalizationCheckResult U_EXPORT2
3145 unorm_quickCheck(const UChar *src,
3146 int32_t srcLength,
3147 UNormalizationMode mode,
3148 UErrorCode *pErrorCode) {
3149 return _quickCheck(src, srcLength, mode, TRUE, NULL, pErrorCode);
3150 }
3151
3152 U_CAPI UNormalizationCheckResult U_EXPORT2
3153 unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength,
3154 UNormalizationMode mode, int32_t options,
3155 UErrorCode *pErrorCode) {
3156 return _quickCheck(src, srcLength, mode, TRUE, getNX(options, *pErrorCode), pErrorCode);
3157 }
3158
3159 U_CFUNC UNormalizationCheckResult
3160 unorm_internalQuickCheck(const UChar *src,
3161 int32_t srcLength,
3162 UNormalizationMode mode,
3163 UBool allowMaybe,
3164 const UnicodeSet *nx,
3165 UErrorCode *pErrorCode) {
3166 return _quickCheck(src, srcLength, mode, allowMaybe, nx, pErrorCode);
3167 }
3168
3169 U_CAPI UBool U_EXPORT2
3170 unorm_isNormalized(const UChar *src, int32_t srcLength,
3171 UNormalizationMode mode,
3172 UErrorCode *pErrorCode) {
3173 return (UBool)(UNORM_YES==_quickCheck(src, srcLength, mode, FALSE, NULL, pErrorCode));
3174 }
3175
3176 U_CAPI UBool U_EXPORT2
3177 unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength,
3178 UNormalizationMode mode, int32_t options,
3179 UErrorCode *pErrorCode) {
3180 return (UBool)(UNORM_YES==_quickCheck(src, srcLength, mode, FALSE, getNX(options, *pErrorCode), pErrorCode));
3181 }
3182
3183 /* normalize() API ---------------------------------------------------------- */
3184
3185 /**
3186 * Internal API for normalizing.
3187 * Does not check for bad input.
3188 * Requires _haveData() to be true.
3189 * @internal
3190 */
3191 U_CFUNC int32_t
3192 unorm_internalNormalizeWithNX(UChar *dest, int32_t destCapacity,
3193 const UChar *src, int32_t srcLength,
3194 UNormalizationMode mode, int32_t options, const UnicodeSet *nx,
3195 UErrorCode *pErrorCode) {
3196 int32_t destLength;
3197 uint8_t trailCC;
3198
3199 switch(mode) {
3200 case UNORM_NFD:
3201 destLength=_decompose(dest, destCapacity,
3202 src, srcLength,
3203 FALSE, nx, trailCC);
3204 break;
3205 case UNORM_NFKD:
3206 destLength=_decompose(dest, destCapacity,
3207 src, srcLength,
3208 TRUE, nx, trailCC);
3209 break;
3210 case UNORM_NFC:
3211 destLength=_compose(dest, destCapacity,
3212 src, srcLength,
3213 options, nx, pErrorCode);
3214 break;
3215 case UNORM_NFKC:
3216 destLength=_compose(dest, destCapacity,
3217 src, srcLength,
3218 options|_NORM_OPTIONS_COMPAT, nx, pErrorCode);
3219 break;
3220 case UNORM_FCD:
3221 return unorm_makeFCD(dest, destCapacity,
3222 src, srcLength,
3223 nx,
3224 pErrorCode);
3225 #if 0
3226 case UNORM_FCC:
3227 destLength=_compose(dest, destCapacity,
3228 src, srcLength,
3229 options|_NORM_OPTIONS_COMPOSE_CONTIGUOUS, nx, pErrorCode);
3230 break;
3231 #endif
3232 case UNORM_NONE:
3233 /* just copy the string */
3234 if(srcLength==-1) {
3235 srcLength=u_strlen(src);
3236 }
3237 if(srcLength>0 && srcLength<=destCapacity) {
3238 uprv_memcpy(dest, src, srcLength*U_SIZEOF_UCHAR);
3239 }
3240 destLength=srcLength;
3241 break;
3242 default:
3243 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3244 return 0;
3245 }
3246
3247 return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
3248 }
3249
3250 /**
3251 * Internal API for normalizing.
3252 * Does not check for bad input.
3253 * @internal
3254 */
3255 U_CAPI int32_t U_EXPORT2
3256 unorm_internalNormalize(UChar *dest, int32_t destCapacity,
3257 const UChar *src, int32_t srcLength,
3258 UNormalizationMode mode, int32_t options,
3259 UErrorCode *pErrorCode) {
3260 const UnicodeSet *nx;
3261
3262 if(!_haveData(*pErrorCode)) {
3263 return 0;
3264 }
3265
3266 nx=getNX(options, *pErrorCode);
3267 if(U_FAILURE(*pErrorCode)) {
3268 return 0;
3269 }
3270
3271 /* reset options bits that should only be set inside unorm_internalNormalizeWithNX() */
3272 options&=~(_NORM_OPTIONS_SETS_MASK|_NORM_OPTIONS_COMPAT|_NORM_OPTIONS_COMPOSE_CONTIGUOUS);
3273
3274 return unorm_internalNormalizeWithNX(dest, destCapacity,
3275 src, srcLength,
3276 mode, options, nx,
3277 pErrorCode);
3278 }
3279
3280 /** Public API for normalizing. */
3281 U_CAPI int32_t U_EXPORT2
3282 unorm_normalize(const UChar *src, int32_t srcLength,
3283 UNormalizationMode mode, int32_t options,
3284 UChar *dest, int32_t destCapacity,
3285 UErrorCode *pErrorCode) {
3286 /* check argument values */
3287 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3288 return 0;
3289 }
3290
3291 if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
3292 src==NULL || srcLength<-1
3293 ) {
3294 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3295 return 0;
3296 }
3297
3298 /* check for overlapping src and destination */
3299 if( dest!=NULL &&
3300 ((src>=dest && src<(dest+destCapacity)) ||
3301 (srcLength>0 && dest>=src && dest<(src+srcLength)))
3302 ) {
3303 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3304 return 0;
3305 }
3306
3307 return unorm_internalNormalize(dest, destCapacity,
3308 src, srcLength,
3309 mode, options,
3310 pErrorCode);
3311 }
3312
3313
3314 /* iteration functions ------------------------------------------------------ */
3315
3316 /*
3317 * These iteration functions are the core implementations of the
3318 * Normalizer class iteration API.
3319 * They read from a UCharIterator into their own buffer
3320 * and normalize into the Normalizer iteration buffer.
3321 * Normalizer itself then iterates over its buffer until that needs to be
3322 * filled again.
3323 */
3324
3325 /*
3326 * ### TODO:
3327 * Now that UCharIterator.next/previous return (int32_t)-1 not (UChar)0xffff
3328 * if iteration bounds are reached,
3329 * try to not call hasNext/hasPrevious and instead check for >=0.
3330 */
3331
3332 /* backward iteration ------------------------------------------------------- */
3333
3334 /*
3335 * read backwards and get norm32
3336 * return 0 if the character is <minC
3337 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3338 */
3339 static inline uint32_t
3340 _getPrevNorm32(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2) {
3341 uint32_t norm32;
3342
3343 /* need src.hasPrevious() */
3344 c=(UChar)src.previous(&src);
3345 c2=0;
3346
3347 /* check for a surrogate before getting norm32 to see if we need to predecrement further */
3348 if(c<minC) {
3349 return 0;
3350 } else if(!UTF_IS_SURROGATE(c)) {
3351 return _getNorm32(c);
3352 } else if(UTF_IS_SURROGATE_FIRST(c) || !src.hasPrevious(&src)) {
3353 /* unpaired surrogate */
3354 return 0;
3355 } else if(UTF_IS_FIRST_SURROGATE(c2=(UChar)src.previous(&src))) {
3356 norm32=_getNorm32(c2);
3357 if((norm32&mask)==0) {
3358 /* all surrogate pairs with this lead surrogate have irrelevant data */
3359 return 0;
3360 } else {
3361 /* norm32 must be a surrogate special */
3362 return _getNorm32FromSurrogatePair(norm32, c);
3363 }
3364 } else {
3365 /* unpaired second surrogate, undo the c2=src.previous() movement */
3366 src.move(&src, 1, UITER_CURRENT);
3367 c2=0;
3368 return 0;
3369 }
3370 }
3371
3372 /*
3373 * read backwards and check if the character is a previous-iteration boundary
3374 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3375 */
3376 typedef UBool
3377 IsPrevBoundaryFn(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2);
3378
3379 /*
3380 * for NF*D:
3381 * read backwards and check if the lead combining class is 0
3382 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3383 */
3384 static UBool
3385 _isPrevNFDSafe(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
3386 return _isNFDSafe(_getPrevNorm32(src, minC, ccOrQCMask, c, c2), ccOrQCMask, ccOrQCMask&_NORM_QC_MASK);
3387 }
3388
3389 /*
3390 * read backwards and check if the character is (or its decomposition begins with)
3391 * a "true starter" (cc==0 and NF*C_YES)
3392 * if c2!=0 then (c2, c) is a surrogate pair (reversed - c2 is first surrogate but read second!)
3393 */
3394 static UBool
3395 _isPrevTrueStarter(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
3396 uint32_t norm32, decompQCMask;
3397
3398 decompQCMask=(ccOrQCMask<<2)&0xf; /* decomposition quick check mask */
3399 norm32=_getPrevNorm32(src, minC, ccOrQCMask|decompQCMask, c, c2);
3400 return _isTrueStarter(norm32, ccOrQCMask, decompQCMask);
3401 }
3402
3403 static int32_t
3404 _findPreviousIterationBoundary(UCharIterator &src,
3405 IsPrevBoundaryFn *isPrevBoundary, uint32_t minC, uint32_t mask,
3406 UChar *&buffer, int32_t &bufferCapacity,
3407 int32_t &startIndex,
3408 UErrorCode *pErrorCode) {
3409 UChar *stackBuffer;
3410 UChar c, c2;
3411 UBool isBoundary;
3412
3413 /* initialize */
3414 stackBuffer=buffer;
3415 startIndex=bufferCapacity; /* fill the buffer from the end backwards */
3416
3417 while(src.hasPrevious(&src)) {
3418 isBoundary=isPrevBoundary(src, minC, mask, c, c2);
3419
3420 /* always write this character to the front of the buffer */
3421 /* make sure there is enough space in the buffer */
3422 if(startIndex < (c2==0 ? 1 : 2)) {
3423 int32_t bufferLength=bufferCapacity;
3424
3425 if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*bufferCapacity, bufferLength)) {
3426 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
3427 src.move(&src, 0, UITER_START);
3428 return 0;
3429 }
3430
3431 /* move the current buffer contents up */
3432 uprv_memmove(buffer+(bufferCapacity-bufferLength), buffer, bufferLength*U_SIZEOF_UCHAR);
3433 startIndex+=bufferCapacity-bufferLength;
3434 }
3435
3436 buffer[--startIndex]=c;
3437 if(c2!=0) {
3438 buffer[--startIndex]=c2;
3439 }
3440
3441 /* stop if this just-copied character is a boundary */
3442 if(isBoundary) {
3443 break;
3444 }
3445 }
3446
3447 /* return the length of the buffer contents */
3448 return bufferCapacity-startIndex;
3449 }
3450
3451 U_CAPI int32_t U_EXPORT2
3452 unorm_previous(UCharIterator *src,
3453 UChar *dest, int32_t destCapacity,
3454 UNormalizationMode mode, int32_t options,
3455 UBool doNormalize, UBool *pNeededToNormalize,
3456 UErrorCode *pErrorCode) {
3457 UChar stackBuffer[100];
3458 UChar *buffer=NULL;
3459 IsPrevBoundaryFn *isPreviousBoundary=NULL;
3460 uint32_t mask=0;
3461 int32_t startIndex=0, bufferLength=0, bufferCapacity=0, destLength=0;
3462 int32_t c=0, c2=0;
3463 UChar minC=0;
3464
3465 /* check argument values */
3466 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3467 return 0;
3468 }
3469
3470 if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
3471 src==NULL
3472 ) {
3473 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3474 return 0;
3475 }
3476
3477 if(!_haveData(*pErrorCode)) {
3478 return 0;
3479 }
3480
3481 if(pNeededToNormalize!=NULL) {
3482 *pNeededToNormalize=FALSE;
3483 }
3484
3485 switch(mode) {
3486 case UNORM_NFD:
3487 case UNORM_FCD:
3488 isPreviousBoundary=_isPrevNFDSafe;
3489 minC=_NORM_MIN_WITH_LEAD_CC;
3490 mask=_NORM_CC_MASK|_NORM_QC_NFD;
3491 break;
3492 case UNORM_NFKD:
3493 isPreviousBoundary=_isPrevNFDSafe;
3494 minC=_NORM_MIN_WITH_LEAD_CC;
3495 mask=_NORM_CC_MASK|_NORM_QC_NFKD;
3496 break;
3497 case UNORM_NFC:
3498 isPreviousBoundary=_isPrevTrueStarter;
3499 minC=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
3500 mask=_NORM_CC_MASK|_NORM_QC_NFC;
3501 break;
3502 case UNORM_NFKC:
3503 isPreviousBoundary=_isPrevTrueStarter;
3504 minC=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
3505 mask=_NORM_CC_MASK|_NORM_QC_NFKC;
3506 break;
3507 case UNORM_NONE:
3508 destLength=0;
3509 if((c=src->previous(src))>=0) {
3510 destLength=1;
3511 if(UTF_IS_TRAIL(c) && (c2=src->previous(src))>=0) {
3512 if(UTF_IS_LEAD(c2)) {
3513 if(destCapacity>=2) {
3514 dest[1]=(UChar)c; /* trail surrogate */
3515 destLength=2;
3516 }
3517 c=c2; /* lead surrogate to be written below */
3518 } else {
3519 src->move(src, 1, UITER_CURRENT);
3520 }
3521 }
3522
3523 if(destCapacity>0) {
3524 dest[0]=(UChar)c;
3525 }
3526 }
3527 return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
3528 default:
3529 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3530 return 0;
3531 }
3532
3533 buffer=stackBuffer;
3534 bufferCapacity=(int32_t)(sizeof(stackBuffer)/U_SIZEOF_UCHAR);
3535 bufferLength=_findPreviousIterationBoundary(*src,
3536 isPreviousBoundary, minC, mask,
3537 buffer, bufferCapacity,
3538 startIndex,
3539 pErrorCode);
3540 if(bufferLength>0) {
3541 if(doNormalize) {
3542 destLength=unorm_internalNormalize(dest, destCapacity,
3543 buffer+startIndex, bufferLength,
3544 mode, options,
3545 pErrorCode);
3546 if(pNeededToNormalize!=0 && U_SUCCESS(*pErrorCode)) {
3547 *pNeededToNormalize=
3548 (UBool)(destLength!=bufferLength ||
3549 0!=uprv_memcmp(dest, buffer+startIndex, destLength*U_SIZEOF_UCHAR));
3550 }
3551 } else {
3552 /* just copy the source characters */
3553 if(destCapacity>0) {
3554 uprv_memcpy(dest, buffer+startIndex, uprv_min(bufferLength, destCapacity)*U_SIZEOF_UCHAR);
3555 }
3556 destLength=u_terminateUChars(dest, destCapacity, bufferLength, pErrorCode);
3557 }
3558 } else {
3559 destLength=u_terminateUChars(dest, destCapacity, 0, pErrorCode);
3560 }
3561
3562 /* cleanup */
3563 if(buffer!=stackBuffer) {
3564 uprv_free(buffer);
3565 }
3566
3567 return destLength;
3568 }
3569
3570 /* forward iteration -------------------------------------------------------- */
3571
3572 /*
3573 * read forward and get norm32
3574 * return 0 if the character is <minC
3575 * if c2!=0 then (c2, c) is a surrogate pair
3576 * always reads complete characters
3577 */
3578 static inline uint32_t
3579 _getNextNorm32(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2) {
3580 uint32_t norm32;
3581
3582 /* need src.hasNext() to be true */
3583 c=(UChar)src.next(&src);
3584 c2=0;
3585
3586 if(c<minC) {
3587 return 0;
3588 }
3589
3590 norm32=_getNorm32(c);
3591 if(UTF_IS_FIRST_SURROGATE(c)) {
3592 if(src.hasNext(&src) && UTF_IS_SECOND_SURROGATE(c2=(UChar)src.current(&src))) {
3593 src.move(&src, 1, UITER_CURRENT); /* skip the c2 surrogate */
3594 if((norm32&mask)==0) {
3595 /* irrelevant data */
3596 return 0;
3597 } else {
3598 /* norm32 must be a surrogate special */
3599 return _getNorm32FromSurrogatePair(norm32, c2);
3600 }
3601 } else {
3602 /* unmatched surrogate */
3603 c2=0;
3604 return 0;
3605 }
3606 }
3607 return norm32;
3608 }
3609
3610 /*
3611 * read forward and check if the character is a next-iteration boundary
3612 * if c2!=0 then (c, c2) is a surrogate pair
3613 */
3614 typedef UBool
3615 IsNextBoundaryFn(UCharIterator &src, uint32_t minC, uint32_t mask, UChar &c, UChar &c2);
3616
3617 /*
3618 * for NF*D:
3619 * read forward and check if the lead combining class is 0
3620 * if c2!=0 then (c, c2) is a surrogate pair
3621 */
3622 static UBool
3623 _isNextNFDSafe(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
3624 return _isNFDSafe(_getNextNorm32(src, minC, ccOrQCMask, c, c2), ccOrQCMask, ccOrQCMask&_NORM_QC_MASK);
3625 }
3626
3627 /*
3628 * for NF*C:
3629 * read forward and check if the character is (or its decomposition begins with)
3630 * a "true starter" (cc==0 and NF*C_YES)
3631 * if c2!=0 then (c, c2) is a surrogate pair
3632 */
3633 static UBool
3634 _isNextTrueStarter(UCharIterator &src, uint32_t minC, uint32_t ccOrQCMask, UChar &c, UChar &c2) {
3635 uint32_t norm32, decompQCMask;
3636
3637 decompQCMask=(ccOrQCMask<<2)&0xf; /* decomposition quick check mask */
3638 norm32=_getNextNorm32(src, minC, ccOrQCMask|decompQCMask, c, c2);
3639 return _isTrueStarter(norm32, ccOrQCMask, decompQCMask);
3640 }
3641
3642 static int32_t
3643 _findNextIterationBoundary(UCharIterator &src,
3644 IsNextBoundaryFn *isNextBoundary, uint32_t minC, uint32_t mask,
3645 UChar *&buffer, int32_t &bufferCapacity,
3646 UErrorCode *pErrorCode) {
3647 UChar *stackBuffer;
3648 int32_t bufferIndex;
3649 UChar c, c2;
3650
3651 if(!src.hasNext(&src)) {
3652 return 0;
3653 }
3654
3655 /* initialize */
3656 stackBuffer=buffer;
3657
3658 /* get one character and ignore its properties */
3659 buffer[0]=c=(UChar)src.next(&src);
3660 bufferIndex=1;
3661 if(UTF_IS_FIRST_SURROGATE(c) && src.hasNext(&src)) {
3662 if(UTF_IS_SECOND_SURROGATE(c2=(UChar)src.next(&src))) {
3663 buffer[bufferIndex++]=c2;
3664 } else {
3665 src.move(&src, -1, UITER_CURRENT); /* back out the non-trail-surrogate */
3666 }
3667 }
3668
3669 /* get all following characters until we see a boundary */
3670 /* checking hasNext() instead of c!=DONE on the off-chance that U+ffff is part of the string */
3671 while(src.hasNext(&src)) {
3672 if(isNextBoundary(src, minC, mask, c, c2)) {
3673 /* back out the latest movement to stop at the boundary */
3674 src.move(&src, c2==0 ? -1 : -2, UITER_CURRENT);
3675 break;
3676 } else {
3677 if(bufferIndex+(c2==0 ? 1 : 2)<=bufferCapacity ||
3678 /* attempt to grow the buffer */
3679 u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity,
3680 2*bufferCapacity,
3681 bufferIndex)
3682 ) {
3683 buffer[bufferIndex++]=c;
3684 if(c2!=0) {
3685 buffer[bufferIndex++]=c2;
3686 }
3687 } else {
3688 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
3689 src.move(&src, 0, UITER_LIMIT);
3690 return 0;
3691 }
3692 }
3693 }
3694
3695 /* return the length of the buffer contents */
3696 return bufferIndex;
3697 }
3698
3699 U_CAPI int32_t U_EXPORT2
3700 unorm_next(UCharIterator *src,
3701 UChar *dest, int32_t destCapacity,
3702 UNormalizationMode mode, int32_t options,
3703 UBool doNormalize, UBool *pNeededToNormalize,
3704 UErrorCode *pErrorCode) {
3705 UChar stackBuffer[100];
3706 UChar *buffer;
3707 IsNextBoundaryFn *isNextBoundary;
3708 uint32_t mask;
3709 int32_t bufferLength, bufferCapacity, destLength;
3710 int32_t c, c2;
3711 UChar minC;
3712
3713 /* check argument values */
3714 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3715 return 0;
3716 }
3717
3718 if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
3719 src==NULL
3720 ) {
3721 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3722 return 0;
3723 }
3724
3725 if(!_haveData(*pErrorCode)) {
3726 return 0;
3727 }
3728
3729 if(pNeededToNormalize!=NULL) {
3730 *pNeededToNormalize=FALSE;
3731 }
3732
3733 switch(mode) {
3734 case UNORM_NFD:
3735 case UNORM_FCD:
3736 isNextBoundary=_isNextNFDSafe;
3737 minC=_NORM_MIN_WITH_LEAD_CC;
3738 mask=_NORM_CC_MASK|_NORM_QC_NFD;
3739 break;
3740 case UNORM_NFKD:
3741 isNextBoundary=_isNextNFDSafe;
3742 minC=_NORM_MIN_WITH_LEAD_CC;
3743 mask=_NORM_CC_MASK|_NORM_QC_NFKD;
3744 break;
3745 case UNORM_NFC:
3746 isNextBoundary=_isNextTrueStarter;
3747 minC=(UChar)indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE];
3748 mask=_NORM_CC_MASK|_NORM_QC_NFC;
3749 break;
3750 case UNORM_NFKC:
3751 isNextBoundary=_isNextTrueStarter;
3752 minC=(UChar)indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE];
3753 mask=_NORM_CC_MASK|_NORM_QC_NFKC;
3754 break;
3755 case UNORM_NONE:
3756 destLength=0;
3757 if((c=src->next(src))>=0) {
3758 destLength=1;
3759 if(UTF_IS_LEAD(c) && (c2=src->next(src))>=0) {
3760 if(UTF_IS_TRAIL(c2)) {
3761 if(destCapacity>=2) {
3762 dest[1]=(UChar)c2; /* trail surrogate */
3763 destLength=2;
3764 }
3765 /* lead surrogate to be written below */
3766 } else {
3767 src->move(src, -1, UITER_CURRENT);
3768 }
3769 }
3770
3771 if(destCapacity>0) {
3772 dest[0]=(UChar)c;
3773 }
3774 }
3775 return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
3776 default:
3777 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3778 return 0;
3779 }
3780
3781 buffer=stackBuffer;
3782 bufferCapacity=(int32_t)(sizeof(stackBuffer)/U_SIZEOF_UCHAR);
3783 bufferLength=_findNextIterationBoundary(*src,
3784 isNextBoundary, minC, mask,
3785 buffer, bufferCapacity,
3786 pErrorCode);
3787 if(bufferLength>0) {
3788 if(doNormalize) {
3789 destLength=unorm_internalNormalize(dest, destCapacity,
3790 buffer, bufferLength,
3791 mode, options,
3792 pErrorCode);
3793 if(pNeededToNormalize!=0 && U_SUCCESS(*pErrorCode)) {
3794 *pNeededToNormalize=
3795 (UBool)(destLength!=bufferLength ||
3796 0!=uprv_memcmp(dest, buffer, destLength*U_SIZEOF_UCHAR));
3797 }
3798 } else {
3799 /* just copy the source characters */
3800 if(destCapacity>0) {
3801 uprv_memcpy(dest, buffer, uprv_min(bufferLength, destCapacity)*U_SIZEOF_UCHAR);
3802 }
3803 destLength=u_terminateUChars(dest, destCapacity, bufferLength, pErrorCode);
3804 }
3805 } else {
3806 destLength=u_terminateUChars(dest, destCapacity, 0, pErrorCode);
3807 }
3808
3809 /* cleanup */
3810 if(buffer!=stackBuffer) {
3811 uprv_free(buffer);
3812 }
3813
3814 return destLength;
3815 }
3816
3817 /*
3818 * ### TODO: check if NF*D and FCD iteration finds optimal boundaries
3819 * and if not, how hard it would be to improve it.
3820 * For example, see _findSafeFCD().
3821 */
3822
3823 /* Concatenation of normalized strings -------------------------------------- */
3824
3825 U_CAPI int32_t U_EXPORT2
3826 unorm_concatenate(const UChar *left, int32_t leftLength,
3827 const UChar *right, int32_t rightLength,
3828 UChar *dest, int32_t destCapacity,
3829 UNormalizationMode mode, int32_t options,
3830 UErrorCode *pErrorCode) {
3831 UChar stackBuffer[100];
3832 UChar *buffer;
3833 int32_t bufferLength, bufferCapacity;
3834
3835 UCharIterator iter;
3836 int32_t leftBoundary, rightBoundary, destLength;
3837
3838 /* check argument values */
3839 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3840 return 0;
3841 }
3842
3843 if( destCapacity<0 || (dest==NULL && destCapacity>0) ||
3844 left==NULL || leftLength<-1 ||
3845 right==NULL || rightLength<-1
3846 ) {
3847 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3848 return 0;
3849 }
3850
3851 /* check for overlapping right and destination */
3852 if( dest!=NULL &&
3853 ((right>=dest && right<(dest+destCapacity)) ||
3854 (rightLength>0 && dest>=right && dest<(right+rightLength)))
3855 ) {
3856 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
3857 return 0;
3858 }
3859
3860 /* allow left==dest */
3861
3862 /* set up intermediate buffer */
3863 buffer=stackBuffer;
3864 bufferCapacity=(int32_t)(sizeof(stackBuffer)/U_SIZEOF_UCHAR);
3865
3866 /*
3867 * Input: left[0..leftLength[ + right[0..rightLength[
3868 *
3869 * Find normalization-safe boundaries leftBoundary and rightBoundary
3870 * and copy the end parts together:
3871 * buffer=left[leftBoundary..leftLength[ + right[0..rightBoundary[
3872 *
3873 * dest=left[0..leftBoundary[ +
3874 * normalize(buffer) +
3875 * right[rightBoundary..rightLength[
3876 */
3877
3878 /*
3879 * find a normalization boundary at the end of the left string
3880 * and copy the end part into the buffer
3881 */
3882 uiter_setString(&iter, left, leftLength);
3883 iter.index=leftLength=iter.length; /* end of left string */
3884
3885 bufferLength=unorm_previous(&iter, buffer, bufferCapacity,
3886 mode, options,
3887 FALSE, NULL,
3888 pErrorCode);
3889 leftBoundary=iter.index;
3890 if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
3891 *pErrorCode=U_ZERO_ERROR;
3892 if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, 2*bufferLength, 0)) {
3893 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
3894 /* dont need to cleanup here since
3895 * u_growBufferFromStatic frees buffer if(buffer!=stackBuffer)
3896 */
3897 return 0;
3898 }
3899
3900 /* just copy from the left string: we know the boundary already */
3901 uprv_memcpy(buffer, left+leftBoundary, bufferLength*U_SIZEOF_UCHAR);
3902 }
3903
3904 /*
3905 * find a normalization boundary at the beginning of the right string
3906 * and concatenate the beginning part to the buffer
3907 */
3908 uiter_setString(&iter, right, rightLength);
3909 rightLength=iter.length; /* in case it was -1 */
3910
3911 rightBoundary=unorm_next(&iter, buffer+bufferLength, bufferCapacity-bufferLength,
3912 mode, options,
3913 FALSE, NULL,
3914 pErrorCode);
3915 if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
3916 *pErrorCode=U_ZERO_ERROR;
3917 if(!u_growBufferFromStatic(stackBuffer, &buffer, &bufferCapacity, bufferLength+rightBoundary, 0)) {
3918 *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
3919 /* dont need to cleanup here since
3920 * u_growBufferFromStatic frees buffer if(buffer!=stackBuffer)
3921 */
3922 return 0;
3923 }
3924
3925 /* just copy from the right string: we know the boundary already */
3926 uprv_memcpy(buffer+bufferLength, right, rightBoundary*U_SIZEOF_UCHAR);
3927 }
3928
3929 bufferLength+=rightBoundary;
3930
3931 /* copy left[0..leftBoundary[ to dest */
3932 if(left!=dest && leftBoundary>0 && destCapacity>0) {
3933 uprv_memcpy(dest, left, uprv_min(leftBoundary, destCapacity)*U_SIZEOF_UCHAR);
3934 }
3935 destLength=leftBoundary;
3936
3937 /* concatenate the normalization of the buffer to dest */
3938 if(destCapacity>destLength) {
3939 destLength+=unorm_internalNormalize(dest+destLength, destCapacity-destLength,
3940 buffer, bufferLength,
3941 mode, options,
3942 pErrorCode);
3943 } else {
3944 destLength+=unorm_internalNormalize(NULL, 0,
3945 buffer, bufferLength,
3946 mode, options,
3947 pErrorCode);
3948 }
3949 /*
3950 * only errorCode that is expected is a U_BUFFER_OVERFLOW_ERROR
3951 * so we dont check for the error code here..just let it pass through
3952 */
3953 /* concatenate right[rightBoundary..rightLength[ to dest */
3954 right+=rightBoundary;
3955 rightLength-=rightBoundary;
3956 if(rightLength>0 && destCapacity>destLength) {
3957 uprv_memcpy(dest+destLength, right, uprv_min(rightLength, destCapacity-destLength)*U_SIZEOF_UCHAR);
3958 }
3959 destLength+=rightLength;
3960
3961 /* cleanup */
3962 if(buffer!=stackBuffer) {
3963 uprv_free(buffer);
3964 }
3965
3966 return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
3967 }
3968
3969 /* data swapping ------------------------------------------------------------ */
3970
3971 U_CAPI int32_t U_EXPORT2
3972 unorm_swap(const UDataSwapper *ds,
3973 const void *inData, int32_t length, void *outData,
3974 UErrorCode *pErrorCode) {
3975 const UDataInfo *pInfo;
3976 int32_t headerSize;
3977
3978 const uint8_t *inBytes;
3979 uint8_t *outBytes;
3980
3981 const int32_t *inIndexes;
3982 int32_t indexes[32];
3983
3984 int32_t i, offset, count, size;
3985
3986 /* udata_swapDataHeader checks the arguments */
3987 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
3988 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
3989 return 0;
3990 }
3991
3992 /* check data format and format version */
3993 pInfo=(const UDataInfo *)((const char *)inData+4);
3994 if(!(
3995 pInfo->dataFormat[0]==0x4e && /* dataFormat="Norm" */
3996 pInfo->dataFormat[1]==0x6f &&
3997 pInfo->dataFormat[2]==0x72 &&
3998 pInfo->dataFormat[3]==0x6d &&
3999 pInfo->formatVersion[0]==2
4000 )) {
4001 udata_printError(ds, "unorm_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unorm.icu\n",
4002 pInfo->dataFormat[0], pInfo->dataFormat[1],
4003 pInfo->dataFormat[2], pInfo->dataFormat[3],
4004 pInfo->formatVersion[0]);
4005 *pErrorCode=U_UNSUPPORTED_ERROR;
4006 return 0;
4007 }
4008
4009 inBytes=(const uint8_t *)inData+headerSize;
4010 outBytes=(uint8_t *)outData+headerSize;
4011
4012 inIndexes=(const int32_t *)inBytes;
4013
4014 if(length>=0) {
4015 length-=headerSize;
4016 if(length<32*4) {
4017 udata_printError(ds, "unorm_swap(): too few bytes (%d after header) for unorm.icu\n",
4018 length);
4019 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
4020 return 0;
4021 }
4022 }
4023
4024 /* read the first 32 indexes (ICU 2.8/format version 2.2: _NORM_INDEX_TOP==32, might grow) */
4025 for(i=0; i<32; ++i) {
4026 indexes[i]=udata_readInt32(ds, inIndexes[i]);
4027 }
4028
4029 /* calculate the total length of the data */
4030 size=
4031 32*4+ /* size of indexes[] */
4032 indexes[_NORM_INDEX_TRIE_SIZE]+
4033 indexes[_NORM_INDEX_UCHAR_COUNT]*2+
4034 indexes[_NORM_INDEX_COMBINE_DATA_COUNT]*2+
4035 indexes[_NORM_INDEX_FCD_TRIE_SIZE]+
4036 indexes[_NORM_INDEX_AUX_TRIE_SIZE]+
4037 indexes[_NORM_INDEX_CANON_SET_COUNT]*2;
4038
4039 if(length>=0) {
4040 if(length<size) {
4041 udata_printError(ds, "unorm_swap(): too few bytes (%d after header) for all of unorm.icu\n",
4042 length);
4043 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
4044 return 0;
4045 }
4046
4047 /* copy the data for inaccessible bytes */
4048 if(inBytes!=outBytes) {
4049 uprv_memcpy(outBytes, inBytes, size);
4050 }
4051
4052 offset=0;
4053
4054 /* swap the indexes[] */
4055 count=32*4;
4056 ds->swapArray32(ds, inBytes, count, outBytes, pErrorCode);
4057 offset+=count;
4058
4059 /* swap the main UTrie */
4060 count=indexes[_NORM_INDEX_TRIE_SIZE];
4061 utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
4062 offset+=count;
4063
4064 /* swap the uint16_t extraData[] and the uint16_t combiningTable[] */
4065 count=(indexes[_NORM_INDEX_UCHAR_COUNT]+indexes[_NORM_INDEX_COMBINE_DATA_COUNT])*2;
4066 ds->swapArray16(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
4067 offset+=count;
4068
4069 /* swap the FCD UTrie */
4070 count=indexes[_NORM_INDEX_FCD_TRIE_SIZE];
4071 if(count!=0) {
4072 utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
4073 offset+=count;
4074 }
4075
4076 /* swap the aux UTrie */
4077 count=indexes[_NORM_INDEX_AUX_TRIE_SIZE];
4078 if(count!=0) {
4079 utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
4080 offset+=count;
4081 }
4082
4083 /* swap the uint16_t combiningTable[] */
4084 count=indexes[_NORM_INDEX_CANON_SET_COUNT]*2;
4085 ds->swapArray16(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
4086 offset+=count;
4087 }
4088
4089 return headerSize+size;
4090 }
4091
4092 #endif /* #if !UCONFIG_NO_NORMALIZATION */