]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/normalizer2impl.cpp
ICU-491.11.3.tar.gz
[apple/icu.git] / icuSources / common / normalizer2impl.cpp
CommitLineData
729e4ab9
A
1/*
2*******************************************************************************
3*
4388f060 4* Copyright (C) 2009-2012, International Business Machines
729e4ab9
A
5* Corporation and others. All Rights Reserved.
6*
7*******************************************************************************
8* file name: normalizer2impl.cpp
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 2009nov22
14* created by: Markus W. Scherer
15*/
16
17#include "unicode/utypes.h"
18
19#if !UCONFIG_NO_NORMALIZATION
20
21#include "unicode/normalizer2.h"
22#include "unicode/udata.h"
23#include "unicode/ustring.h"
4388f060 24#include "unicode/utf16.h"
729e4ab9
A
25#include "cmemory.h"
26#include "mutex.h"
27#include "normalizer2impl.h"
4388f060 28#include "putilimp.h"
729e4ab9 29#include "uassert.h"
729e4ab9
A
30#include "uset_imp.h"
31#include "utrie2.h"
32#include "uvector.h"
33
34U_NAMESPACE_BEGIN
35
36// ReorderingBuffer -------------------------------------------------------- ***
37
38UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) {
39 int32_t length=str.length();
40 start=str.getBuffer(destCapacity);
41 if(start==NULL) {
42 // getBuffer() already did str.setToBogus()
43 errorCode=U_MEMORY_ALLOCATION_ERROR;
44 return FALSE;
45 }
46 limit=start+length;
47 remainingCapacity=str.getCapacity()-length;
48 reorderStart=start;
49 if(start==limit) {
50 lastCC=0;
51 } else {
52 setIterator();
53 lastCC=previousCC();
54 // Set reorderStart after the last code point with cc<=1 if there is one.
55 if(lastCC>1) {
56 while(previousCC()>1) {}
57 }
58 reorderStart=codePointLimit;
59 }
60 return TRUE;
61}
62
63UBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit) const {
64 int32_t length=(int32_t)(limit-start);
65 return
66 length==(int32_t)(otherLimit-otherStart) &&
67 0==u_memcmp(start, otherStart, length);
68}
69
70UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
71 if(remainingCapacity<2 && !resize(2, errorCode)) {
72 return FALSE;
73 }
74 if(lastCC<=cc || cc==0) {
75 limit[0]=U16_LEAD(c);
76 limit[1]=U16_TRAIL(c);
77 limit+=2;
78 lastCC=cc;
79 if(cc<=1) {
80 reorderStart=limit;
81 }
82 } else {
83 insert(c, cc);
84 }
85 remainingCapacity-=2;
86 return TRUE;
87}
88
89UBool ReorderingBuffer::append(const UChar *s, int32_t length,
90 uint8_t leadCC, uint8_t trailCC,
91 UErrorCode &errorCode) {
92 if(length==0) {
93 return TRUE;
94 }
95 if(remainingCapacity<length && !resize(length, errorCode)) {
96 return FALSE;
97 }
98 remainingCapacity-=length;
99 if(lastCC<=leadCC || leadCC==0) {
100 if(trailCC<=1) {
101 reorderStart=limit+length;
102 } else if(leadCC<=1) {
103 reorderStart=limit+1; // Ok if not a code point boundary.
104 }
105 const UChar *sLimit=s+length;
106 do { *limit++=*s++; } while(s!=sLimit);
107 lastCC=trailCC;
108 } else {
109 int32_t i=0;
110 UChar32 c;
111 U16_NEXT(s, i, length, c);
112 insert(c, leadCC); // insert first code point
113 while(i<length) {
114 U16_NEXT(s, i, length, c);
115 if(i<length) {
116 // s must be in NFD, otherwise we need to use getCC().
117 leadCC=Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
118 } else {
119 leadCC=trailCC;
120 }
121 append(c, leadCC, errorCode);
122 }
123 }
124 return TRUE;
125}
126
127UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) {
128 int32_t cpLength=U16_LENGTH(c);
129 if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) {
130 return FALSE;
131 }
132 remainingCapacity-=cpLength;
133 if(cpLength==1) {
134 *limit++=(UChar)c;
135 } else {
136 limit[0]=U16_LEAD(c);
137 limit[1]=U16_TRAIL(c);
138 limit+=2;
139 }
140 lastCC=0;
141 reorderStart=limit;
142 return TRUE;
143}
144
145UBool ReorderingBuffer::appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode) {
146 if(s==sLimit) {
147 return TRUE;
148 }
149 int32_t length=(int32_t)(sLimit-s);
150 if(remainingCapacity<length && !resize(length, errorCode)) {
151 return FALSE;
152 }
153 u_memcpy(limit, s, length);
154 limit+=length;
155 remainingCapacity-=length;
156 lastCC=0;
157 reorderStart=limit;
158 return TRUE;
159}
160
161void ReorderingBuffer::remove() {
162 reorderStart=limit=start;
163 remainingCapacity=str.getCapacity();
164 lastCC=0;
165}
166
167void ReorderingBuffer::removeSuffix(int32_t suffixLength) {
168 if(suffixLength<(limit-start)) {
169 limit-=suffixLength;
170 remainingCapacity+=suffixLength;
171 } else {
172 limit=start;
173 remainingCapacity=str.getCapacity();
174 }
175 lastCC=0;
176 reorderStart=limit;
177}
178
179UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) {
180 int32_t reorderStartIndex=(int32_t)(reorderStart-start);
181 int32_t length=(int32_t)(limit-start);
182 str.releaseBuffer(length);
183 int32_t newCapacity=length+appendLength;
184 int32_t doubleCapacity=2*str.getCapacity();
185 if(newCapacity<doubleCapacity) {
186 newCapacity=doubleCapacity;
187 }
188 if(newCapacity<256) {
189 newCapacity=256;
190 }
191 start=str.getBuffer(newCapacity);
192 if(start==NULL) {
193 // getBuffer() already did str.setToBogus()
194 errorCode=U_MEMORY_ALLOCATION_ERROR;
195 return FALSE;
196 }
197 reorderStart=start+reorderStartIndex;
198 limit=start+length;
199 remainingCapacity=str.getCapacity()-length;
200 return TRUE;
201}
202
203void ReorderingBuffer::skipPrevious() {
204 codePointLimit=codePointStart;
205 UChar c=*--codePointStart;
206 if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) {
207 --codePointStart;
208 }
209}
210
211uint8_t ReorderingBuffer::previousCC() {
212 codePointLimit=codePointStart;
213 if(reorderStart>=codePointStart) {
214 return 0;
215 }
216 UChar32 c=*--codePointStart;
217 if(c<Normalizer2Impl::MIN_CCC_LCCC_CP) {
218 return 0;
219 }
220
221 UChar c2;
222 if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) {
223 --codePointStart;
224 c=U16_GET_SUPPLEMENTARY(c2, c);
225 }
226 return Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
227}
228
229// Inserts c somewhere before the last character.
230// Requires 0<cc<lastCC which implies reorderStart<limit.
231void ReorderingBuffer::insert(UChar32 c, uint8_t cc) {
232 for(setIterator(), skipPrevious(); previousCC()>cc;) {}
233 // insert c at codePointLimit, after the character with prevCC<=cc
234 UChar *q=limit;
235 UChar *r=limit+=U16_LENGTH(c);
236 do {
237 *--r=*--q;
238 } while(codePointLimit!=q);
239 writeCodePoint(q, c);
240 if(cc<=1) {
241 reorderStart=r;
242 }
243}
244
245// Normalizer2Impl --------------------------------------------------------- ***
246
// Data used for canonical iteration. In this file an instance is reached
// through canonIterDataSingleton.fInstance (deleted by ~Normalizer2Impl()).
247struct CanonIterData : public UMemory {
248 CanonIterData(UErrorCode &errorCode);
249 ~CanonIterData();
250 void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode);
// Trie that addCanonIterPropertyStarts() enumerates for the SEGMENT_STARTER property.
251 UTrie2 *trie;
252 UVector canonStartSets; // contains UnicodeSet *
253};
254
255Normalizer2Impl::~Normalizer2Impl() {
256 udata_close(memory);
257 utrie2_close(normTrie);
729e4ab9
A
258 delete (CanonIterData *)canonIterDataSingleton.fInstance;
259}
260
261UBool U_CALLCONV
262Normalizer2Impl::isAcceptable(void *context,
263 const char * /* type */, const char * /*name*/,
264 const UDataInfo *pInfo) {
265 if(
266 pInfo->size>=20 &&
267 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
268 pInfo->charsetFamily==U_CHARSET_FAMILY &&
269 pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */
270 pInfo->dataFormat[1]==0x72 &&
271 pInfo->dataFormat[2]==0x6d &&
272 pInfo->dataFormat[3]==0x32 &&
4388f060 273 pInfo->formatVersion[0]==2
729e4ab9
A
274 ) {
275 Normalizer2Impl *me=(Normalizer2Impl *)context;
276 uprv_memcpy(me->dataVersion, pInfo->dataVersion, 4);
277 return TRUE;
278 } else {
279 return FALSE;
280 }
281}
282
283void
284Normalizer2Impl::load(const char *packageName, const char *name, UErrorCode &errorCode) {
285 if(U_FAILURE(errorCode)) {
286 return;
287 }
288 memory=udata_openChoice(packageName, "nrm", name, isAcceptable, this, &errorCode);
289 if(U_FAILURE(errorCode)) {
290 return;
291 }
292 const uint8_t *inBytes=(const uint8_t *)udata_getMemory(memory);
293 const int32_t *inIndexes=(const int32_t *)inBytes;
294 int32_t indexesLength=inIndexes[IX_NORM_TRIE_OFFSET]/4;
295 if(indexesLength<=IX_MIN_MAYBE_YES) {
296 errorCode=U_INVALID_FORMAT_ERROR; // Not enough indexes.
297 return;
298 }
299
300 minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
301 minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
302
303 minYesNo=inIndexes[IX_MIN_YES_NO];
4388f060 304 minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
729e4ab9
A
305 minNoNo=inIndexes[IX_MIN_NO_NO];
306 limitNoNo=inIndexes[IX_LIMIT_NO_NO];
307 minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
308
309 int32_t offset=inIndexes[IX_NORM_TRIE_OFFSET];
310 int32_t nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
311 normTrie=utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
312 inBytes+offset, nextOffset-offset, NULL,
313 &errorCode);
314 if(U_FAILURE(errorCode)) {
315 return;
316 }
317
318 offset=nextOffset;
4388f060 319 nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
729e4ab9
A
320 maybeYesCompositions=(const uint16_t *)(inBytes+offset);
321 extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes);
4388f060
A
322
323 // smallFCD: new in formatVersion 2
324 offset=nextOffset;
325 smallFCD=inBytes+offset;
326
327 // Build tccc180[].
328 // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300.
329 uint8_t bits=0;
330 for(UChar c=0; c<0x180; bits>>=1) {
331 if((c&0xff)==0) {
332 bits=smallFCD[c>>8]; // one byte per 0x100 code points
333 }
334 if(bits&1) {
335 for(int i=0; i<0x20; ++i, ++c) {
336 tccc180[c]=(uint8_t)getFCD16FromNormData(c);
337 }
338 } else {
339 uprv_memset(tccc180+c, 0, 0x20);
340 c+=0x20;
341 }
342 }
729e4ab9
A
343}
344
345uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const {
346 UChar32 c;
347 if(cpStart==(cpLimit-1)) {
348 c=*cpStart;
349 } else {
350 c=U16_GET_SUPPLEMENTARY(cpStart[0], cpStart[1]);
351 }
352 uint16_t prevNorm16=getNorm16(c);
353 if(prevNorm16<=minYesNo) {
354 return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0
355 } else {
356 return (uint8_t)(*getMapping(prevNorm16)>>8); // tccc from yesNo
357 }
358}
359
360U_CDECL_BEGIN
361
362static UBool U_CALLCONV
363enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
364 /* add the start code point to the USet */
365 const USetAdder *sa=(const USetAdder *)context;
366 sa->add(sa->set, start);
367 return TRUE;
368}
369
370static uint32_t U_CALLCONV
371segmentStarterMapper(const void * /*context*/, uint32_t value) {
372 return value&CANON_NOT_SEGMENT_STARTER;
373}
374
375U_CDECL_END
376
377void
378Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const {
379 /* add the start code point of each same-value range of each trie */
380 utrie2_enum(normTrie, NULL, enumPropertyStartsRange, sa);
381
382 /* add Hangul LV syllables and LV+1 because of skippables */
383 for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) {
384 sa->add(sa->set, c);
385 sa->add(sa->set, c+1);
386 }
387 sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
388}
389
390void
391Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const {
392 /* add the start code point of each same-value range of the canonical iterator data trie */
393 if(ensureCanonIterData(errorCode)) {
394 // currently only used for the SEGMENT_STARTER property
395 utrie2_enum(((CanonIterData *)canonIterDataSingleton.fInstance)->trie,
396 segmentStarterMapper, enumPropertyStartsRange, sa);
397 }
398}
399
400const UChar *
401Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src,
402 UChar32 minNeedDataCP,
403 ReorderingBuffer *buffer,
404 UErrorCode &errorCode) const {
405 // Make some effort to support NUL-terminated strings reasonably.
406 // Take the part of the fast quick check loop that does not look up
407 // data and check the first part of the string.
408 // After this prefix, determine the string length to simplify the rest
409 // of the code.
410 const UChar *prevSrc=src;
411 UChar c;
412 while((c=*src++)<minNeedDataCP && c!=0) {}
413 // Back out the last character for full processing.
414 // Copy this prefix.
415 if(--src!=prevSrc) {
416 if(buffer!=NULL) {
417 buffer->appendZeroCC(prevSrc, src, errorCode);
418 }
419 }
420 return src;
421}
422
// Decomposes, or quick-checks, the text [src, limit[; limit==NULL means
// that src is NUL-terminated.
423// Dual functionality:
424// buffer!=NULL: normalize
425// buffer==NULL: isNormalized/spanQuickCheckYes
// Return value, as visible in the code below:
// - buffer!=NULL: the position reached in the input; this is limit unless an
//   append/decompose call failed (or errorCode failed while copying the prefix).
// - buffer==NULL: limit if everything passed the quick check, otherwise
//   prevBoundary, the last recorded position after a code point with cc<=1.
426const UChar *
427Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
428 ReorderingBuffer *buffer,
429 UErrorCode &errorCode) const {
430 UChar32 minNoCP=minDecompNoCP;
431 if(limit==NULL) {
// NUL-terminated input: handle the low prefix first, then find the real limit.
432 src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode);
433 if(U_FAILURE(errorCode)) {
434 return src;
435 }
436 limit=u_strchr(src, 0);
437 }
438
439 const UChar *prevSrc;
440 UChar32 c=0;
441 uint16_t norm16=0;
442
443 // only for quick check
444 const UChar *prevBoundary=src;
445 uint8_t prevCC=0;
446
447 for(;;) {
448 // count code units below the minimum or with irrelevant data for the quick check
449 for(prevSrc=src; src!=limit;) {
450 if( (c=*src)<minNoCP ||
451 isMostDecompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
452 ) {
453 ++src;
454 } else if(!U16_IS_SURROGATE(c)) {
455 break;
456 } else {
// Surrogate code unit: try to assemble the supplementary code point and
// look up its data; an unpaired surrogate is looked up as is.
457 UChar c2;
458 if(U16_IS_SURROGATE_LEAD(c)) {
459 if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
460 c=U16_GET_SUPPLEMENTARY(c, c2);
461 }
462 } else /* trail surrogate */ {
// The matching lead surrogate was already passed in this run; back up to it.
463 if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
464 --src;
465 c=U16_GET_SUPPLEMENTARY(c2, c);
466 }
467 }
468 if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
469 src+=U16_LENGTH(c);
470 } else {
471 break;
472 }
473 }
474 }
475 // copy these code units all at once
476 if(src!=prevSrc) {
477 if(buffer!=NULL) {
478 if(!buffer->appendZeroCC(prevSrc, src, errorCode)) {
479 break;
480 }
481 } else {
482 prevCC=0;
483 prevBoundary=src;
484 }
485 }
486 if(src==limit) {
487 break;
488 }
489
490 // Check one above-minimum, relevant code point.
// c and norm16 were set by the loop above when it stopped at this code point.
491 src+=U16_LENGTH(c);
492 if(buffer!=NULL) {
493 if(!decompose(c, norm16, *buffer, errorCode)) {
494 break;
495 }
496 } else {
497 if(isDecompYes(norm16)) {
498 uint8_t cc=getCCFromYesOrMaybe(norm16);
499 if(prevCC<=cc || cc==0) {
500 prevCC=cc;
501 if(cc<=1) {
502 prevBoundary=src;
503 }
504 continue;
505 }
506 }
507 return prevBoundary; // "no" or cc out of order
508 }
509 }
510 return src;
511}
512
513// Decompose a short piece of text which is likely to contain characters that
514// fail the quick check loop and/or where the quick check loop's overhead
515// is unlikely to be amortized.
516// Called by the compose() and makeFCD() implementations.
517UBool Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit,
518 ReorderingBuffer &buffer,
519 UErrorCode &errorCode) const {
520 while(src<limit) {
521 UChar32 c;
522 uint16_t norm16;
523 UTRIE2_U16_NEXT16(normTrie, src, limit, c, norm16);
524 if(!decompose(c, norm16, buffer, errorCode)) {
525 return FALSE;
526 }
527 }
528 return TRUE;
529}
530
531UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
532 ReorderingBuffer &buffer,
533 UErrorCode &errorCode) const {
534 // Only loops for 1:1 algorithmic mappings.
535 for(;;) {
536 // get the decomposition and the lead and trail cc's
537 if(isDecompYes(norm16)) {
538 // c does not decompose
539 return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode);
540 } else if(isHangul(norm16)) {
541 // Hangul syllable: decompose algorithmically
542 UChar jamos[3];
543 return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode);
544 } else if(isDecompNoAlgorithmic(norm16)) {
545 c=mapAlgorithmic(c, norm16);
546 norm16=getNorm16(c);
547 } else {
548 // c decomposes, get everything from the variable-length extra data
549 const uint16_t *mapping=getMapping(norm16);
4388f060 550 uint16_t firstUnit=*mapping;
729e4ab9
A
551 int32_t length=firstUnit&MAPPING_LENGTH_MASK;
552 uint8_t leadCC, trailCC;
553 trailCC=(uint8_t)(firstUnit>>8);
554 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
4388f060 555 leadCC=(uint8_t)(*(mapping-1)>>8);
729e4ab9
A
556 } else {
557 leadCC=0;
558 }
4388f060 559 return buffer.append((const UChar *)mapping+1, length, leadCC, trailCC, errorCode);
729e4ab9
A
560 }
561 }
562}
563
564const UChar *
565Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const {
566 const UChar *decomp=NULL;
567 uint16_t norm16;
568 for(;;) {
569 if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
570 // c does not decompose
571 return decomp;
572 } else if(isHangul(norm16)) {
573 // Hangul syllable: decompose algorithmically
574 length=Hangul::decompose(c, buffer);
575 return buffer;
576 } else if(isDecompNoAlgorithmic(norm16)) {
577 c=mapAlgorithmic(c, norm16);
578 decomp=buffer;
579 length=0;
580 U16_APPEND_UNSAFE(buffer, length, c);
581 } else {
582 // c decomposes, get everything from the variable-length extra data
583 const uint16_t *mapping=getMapping(norm16);
4388f060
A
584 length=*mapping&MAPPING_LENGTH_MASK;
585 return (const UChar *)mapping+1;
586 }
587 }
588}
589
590// The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1
591// so that a raw mapping fits that consists of one unit ("rm0")
592// plus all but the first two code units of the normal mapping.
593// The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK.
594const UChar *
595Normalizer2Impl::getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const {
596 // We do not loop in this method because an algorithmic mapping itself
597 // becomes a final result rather than having to be decomposed recursively.
598 uint16_t norm16;
599 if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
600 // c does not decompose
601 return NULL;
602 } else if(isHangul(norm16)) {
603 // Hangul syllable: decompose algorithmically
604 Hangul::getRawDecomposition(c, buffer);
605 length=2;
606 return buffer;
607 } else if(isDecompNoAlgorithmic(norm16)) {
608 c=mapAlgorithmic(c, norm16);
609 length=0;
610 U16_APPEND_UNSAFE(buffer, length, c);
611 return buffer;
612 } else {
613 // c decomposes, get everything from the variable-length extra data
614 const uint16_t *mapping=getMapping(norm16);
615 uint16_t firstUnit=*mapping;
616 int32_t mLength=firstUnit&MAPPING_LENGTH_MASK; // length of normal mapping
617 if(firstUnit&MAPPING_HAS_RAW_MAPPING) {
618 // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.
619 // Bit 7=MAPPING_HAS_CCC_LCCC_WORD
620 const uint16_t *rawMapping=mapping-((firstUnit>>7)&1)-1;
621 uint16_t rm0=*rawMapping;
622 if(rm0<=MAPPING_LENGTH_MASK) {
623 length=rm0;
624 return (const UChar *)rawMapping-rm0;
625 } else {
626 // Copy the normal mapping and replace its first two code units with rm0.
627 buffer[0]=(UChar)rm0;
628 u_memcpy(buffer+1, (const UChar *)mapping+1+2, mLength-2);
629 length=mLength-1;
630 return buffer;
729e4ab9 631 }
4388f060
A
632 } else {
633 length=mLength;
634 return (const UChar *)mapping+1;
729e4ab9
A
635 }
636 }
637}
638
639void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit,
640 UBool doDecompose,
4388f060 641 UnicodeString &safeMiddle,
729e4ab9
A
642 ReorderingBuffer &buffer,
643 UErrorCode &errorCode) const {
4388f060 644 buffer.copyReorderableSuffixTo(safeMiddle);
729e4ab9
A
645 if(doDecompose) {
646 decompose(src, limit, &buffer, errorCode);
647 return;
648 }
649 // Just merge the strings at the boundary.
650 ForwardUTrie2StringIterator iter(normTrie, src, limit);
651 uint8_t firstCC, prevCC, cc;
652 firstCC=prevCC=cc=getCC(iter.next16());
653 while(cc!=0) {
654 prevCC=cc;
655 cc=getCC(iter.next16());
656 };
4388f060
A
657 if(limit==NULL) { // appendZeroCC() needs limit!=NULL
658 limit=u_strchr(iter.codePointStart, 0);
659 }
729e4ab9
A
660 buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode) &&
661 buffer.appendZeroCC(iter.codePointStart, limit, errorCode);
662}
663
664// Note: hasDecompBoundary() could be implemented as aliases to
665// hasFCDBoundaryBefore() and hasFCDBoundaryAfter()
666// at the cost of building the FCD trie for a decomposition normalizer.
667UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const {
668 for(;;) {
669 if(c<minDecompNoCP) {
670 return TRUE;
671 }
672 uint16_t norm16=getNorm16(c);
673 if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) {
674 return TRUE;
675 } else if(norm16>MIN_NORMAL_MAYBE_YES) {
676 return FALSE; // ccc!=0
677 } else if(isDecompNoAlgorithmic(norm16)) {
678 c=mapAlgorithmic(c, norm16);
679 } else {
680 // c decomposes, get everything from the variable-length extra data
681 const uint16_t *mapping=getMapping(norm16);
4388f060 682 uint16_t firstUnit=*mapping;
729e4ab9
A
683 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
684 return FALSE;
685 }
686 if(!before) {
687 // decomp after-boundary: same as hasFCDBoundaryAfter(),
688 // fcd16<=1 || trailCC==0
689 if(firstUnit>0x1ff) {
690 return FALSE; // trailCC>1
691 }
692 if(firstUnit<=0xff) {
693 return TRUE; // trailCC==0
694 }
695 // if(trailCC==1) test leadCC==0, same as checking for before-boundary
696 }
697 // TRUE if leadCC==0 (hasFCDBoundaryBefore())
4388f060 698 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0;
729e4ab9
A
699 }
700 }
701}
702
703/*
704 * Finds the recomposition result for
705 * a forward-combining "lead" character,
706 * specified with a pointer to its compositions list,
707 * and a backward-combining "trail" character.
708 *
709 * If the lead and trail characters combine, then this function returns
710 * the following "compositeAndFwd" value:
711 * Bits 21..1 composite character
712 * Bit 0 set if the composite is a forward-combining starter
713 * otherwise it returns -1.
714 *
715 * The compositions list has (trail, compositeAndFwd) pair entries,
716 * encoded as either pairs or triples of 16-bit units.
717 * The last entry has the high bit of its first unit set.
718 *
719 * The list is sorted by ascending trail characters (there are no duplicates).
720 * A linear search is used.
721 *
722 * See normalizer2impl.h for a more detailed description
723 * of the compositions list format.
724 */
725int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) {
726 uint16_t key1, firstUnit;
727 if(trail<COMP_1_TRAIL_LIMIT) {
728 // trail character is 0..33FF
729 // result entry may have 2 or 3 units
730 key1=(uint16_t)(trail<<1);
// Skip entries whose first unit is below key1. There is no explicit
// end-of-list test here; presumably the high bit in the last entry's
// first unit makes it compare >= key1 and stops the scan.
// NOTE(review): the step 2+(firstUnit&COMP_1_TRIPLE) assumes COMP_1_TRIPLE==1
// — confirm in normalizer2impl.h.
731 while(key1>(firstUnit=*list)) {
732 list+=2+(firstUnit&COMP_1_TRIPLE);
733 }
734 if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
735 if(firstUnit&COMP_1_TRIPLE) {
736 return ((int32_t)list[1]<<16)|list[2];
737 } else {
738 return list[1];
739 }
740 }
741 } else {
742 // trail character is 3400..10FFFF
743 // result entry has 3 units
// The trail is split across two keys: key1 is compared with the first unit,
// key2 with the second unit of an entry.
744 key1=(uint16_t)(COMP_1_TRAIL_LIMIT+
745 (((trail>>COMP_1_TRAIL_SHIFT))&
746 ~COMP_1_TRIPLE));
747 uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT);
748 uint16_t secondUnit;
749 for(;;) {
750 if(key1>(firstUnit=*list)) {
751 list+=2+(firstUnit&COMP_1_TRIPLE);
752 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
// First units match: compare the second part of the trail.
753 if(key2>(secondUnit=list[1])) {
754 if(firstUnit&COMP_1_LAST_TUPLE) {
755 break;
756 } else {
757 list+=3;
758 }
759 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
760 return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2];
761 } else {
762 break;
763 }
764 } else {
765 break;
766 }
767 }
768 }
// No entry for this trail character.
769 return -1;
770}
771
772/**
773 * @param list some character's compositions list
774 * @param set recursively receives the composites from these compositions
775 */
776void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const {
777 uint16_t firstUnit;
778 int32_t compositeAndFwd;
779 do {
780 firstUnit=*list;
781 if((firstUnit&COMP_1_TRIPLE)==0) {
782 compositeAndFwd=list[1];
783 list+=2;
784 } else {
785 compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)|list[2];
786 list+=3;
787 }
788 UChar32 composite=compositeAndFwd>>1;
789 if((compositeAndFwd&1)!=0) {
790 addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
791 }
792 set.add(composite);
793 } while((firstUnit&COMP_1_LAST_TUPLE)==0);
794}
795
796/*
797 * Recomposes the buffer text starting at recomposeStartIndex
798 * (which is in NFD - decomposed and canonically ordered),
799 * and truncates the buffer contents.
800 *
801 * Note that recomposition never lengthens the text:
802 * Any character consists of either one or two code units;
803 * a composition may contain at most one more code unit than the original starter,
804 * while the combining mark that is removed has at least one code unit.
805 */
// onlyContiguous: if set, a non-starter that did not combine clears the
// current compositions list, so later marks cannot combine across it
// (see the "FCC" branch near the end of the loop).
806void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
807 UBool onlyContiguous) const {
808 UChar *p=buffer.getStart()+recomposeStartIndex;
809 UChar *limit=buffer.getLimit();
810 if(p==limit) {
811 return;
812 }
813
814 UChar *starter, *pRemove, *q, *r;
815 const uint16_t *compositionsList;
816 UChar32 c, compositeAndFwd;
817 uint16_t norm16;
818 uint8_t cc, prevCC;
819 UBool starterIsSupplementary;
820
821 // Some of the following variables are not used until we have a forward-combining starter
822 // and are only initialized now to avoid compiler warnings.
823 compositionsList=NULL; // used as indicator for whether we have a forward-combining starter
824 starter=NULL;
825 starterIsSupplementary=FALSE;
826 prevCC=0;
827
// Each iteration reads one code point c (with its norm16) at p and advances p.
// The text is edited in place; limit shrinks whenever a character is removed.
828 for(;;) {
829 UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16);
830 cc=getCCFromYesOrMaybe(norm16);
831 if( // this character combines backward and
832 isMaybe(norm16) &&
833 // we have seen a starter that combines forward and
834 compositionsList!=NULL &&
835 // the backward-combining character is not blocked
836 (prevCC<cc || prevCC==0)
837 ) {
838 if(isJamoVT(norm16)) {
839 // c is a Jamo V/T, see if we can compose it with the previous character.
840 if(c<Hangul::JAMO_T_BASE) {
841 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
// prev is unsigned, so a starter below JAMO_L_BASE wraps to a large value
// and fails the range test below.
842 UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE);
843 if(prev<Hangul::JAMO_L_COUNT) {
// The Jamo V just read occupies the single code unit before p.
844 pRemove=p-1;
845 UChar syllable=(UChar)
846 (Hangul::HANGUL_BASE+
847 (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
848 Hangul::JAMO_T_COUNT);
849 UChar t;
850 if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
851 ++p;
852 syllable+=t; // The next character was a Jamo T.
853 }
854 *starter=syllable;
855 // remove the Jamo V/T
856 q=pRemove;
857 r=p;
858 while(r<limit) {
859 *q++=*r++;
860 }
861 limit=q;
862 p=pRemove;
863 }
864 }
865 /*
866 * No "else" for Jamo T:
867 * Since the input is in NFD, there are no Hangul LV syllables that
868 * a Jamo T could combine with.
869 * All Jamo Ts are combined above when handling Jamo Vs.
870 */
871 if(p==limit) {
872 break;
873 }
874 compositionsList=NULL;
875 continue;
876 } else if((compositeAndFwd=combine(compositionsList, c))>=0) {
877 // The starter and the combining mark (c) do combine.
878 UChar32 composite=compositeAndFwd>>1;
879
880 // Replace the starter with the composite, remove the combining mark.
881 pRemove=p-U16_LENGTH(c); // pRemove & p: start & limit of the combining mark
882 if(starterIsSupplementary) {
883 if(U_IS_SUPPLEMENTARY(composite)) {
884 // both are supplementary
885 starter[0]=U16_LEAD(composite);
886 starter[1]=U16_TRAIL(composite);
887 } else {
888 *starter=(UChar)composite;
889 // The composite is shorter than the starter,
890 // move the intermediate characters forward one.
891 starterIsSupplementary=FALSE;
892 q=starter+1;
893 r=q+1;
894 while(r<pRemove) {
895 *q++=*r++;
896 }
897 --pRemove;
898 }
899 } else if(U_IS_SUPPLEMENTARY(composite)) {
900 // The composite is longer than the starter,
901 // move the intermediate characters back one.
902 starterIsSupplementary=TRUE;
903 ++starter; // temporarily increment for the loop boundary
904 q=pRemove;
905 r=++pRemove;
906 while(starter<q) {
907 *--r=*--q;
908 }
909 *starter=U16_TRAIL(composite);
910 *--starter=U16_LEAD(composite); // undo the temporary increment
911 } else {
912 // both are on the BMP
913 *starter=(UChar)composite;
914 }
915
916 /* remove the combining mark by moving the following text over it */
// pRemove==p can happen when the longer composite used up the mark's code unit.
917 if(pRemove<p) {
918 q=pRemove;
919 r=p;
920 while(r<limit) {
921 *q++=*r++;
922 }
923 limit=q;
924 p=pRemove;
925 }
926 // Keep prevCC because we removed the combining mark.
927
928 if(p==limit) {
929 break;
930 }
931 // Is the composite a starter that combines forward?
932 if(compositeAndFwd&1) {
933 compositionsList=
934 getCompositionsListForComposite(getNorm16(composite));
935 } else {
936 compositionsList=NULL;
937 }
938
939 // We combined; continue with looking for compositions.
940 continue;
941 }
942 }
943
944 // no combination this time
945 prevCC=cc;
946 if(p==limit) {
947 break;
948 }
949
950 // If c did not combine, then check if it is a starter.
951 if(cc==0) {
952 // Found a new starter.
953 if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) {
954 // It may combine with something, prepare for it.
// p is already past c, so the starter begins one or two code units before p.
955 if(U_IS_BMP(c)) {
956 starterIsSupplementary=FALSE;
957 starter=p-1;
958 } else {
959 starterIsSupplementary=TRUE;
960 starter=p-2;
961 }
962 }
963 } else if(onlyContiguous) {
964 // FCC: no discontiguous compositions; any intervening character blocks.
965 compositionsList=NULL;
966 }
967 }
// Tell the buffer where the (possibly shortened) text now ends.
968 buffer.setReorderingLimit(limit);
969}
970
4388f060
A
971UChar32
972Normalizer2Impl::composePair(UChar32 a, UChar32 b) const {
973 uint16_t norm16=getNorm16(a); // maps an out-of-range 'a' to inert norm16=0
974 const uint16_t *list;
975 if(isInert(norm16)) {
976 return U_SENTINEL;
977 } else if(norm16<minYesNoMappingsOnly) {
978 if(isJamoL(norm16)) {
979 b-=Hangul::JAMO_V_BASE;
980 if(0<=b && b<Hangul::JAMO_V_COUNT) {
981 return
982 (Hangul::HANGUL_BASE+
983 ((a-Hangul::JAMO_L_BASE)*Hangul::JAMO_V_COUNT+b)*
984 Hangul::JAMO_T_COUNT);
985 } else {
986 return U_SENTINEL;
987 }
988 } else if(isHangul(norm16)) {
989 b-=Hangul::JAMO_T_BASE;
990 if(Hangul::isHangulWithoutJamoT(a) && 0<b && b<Hangul::JAMO_T_COUNT) { // not b==0!
991 return a+b;
992 } else {
993 return U_SENTINEL;
994 }
995 } else {
996 // 'a' has a compositions list in extraData
997 list=extraData+norm16;
998 if(norm16>minYesNo) { // composite 'a' has both mapping & compositions list
999 list+= // mapping pointer
1000 1+ // +1 to skip the first unit with the mapping lenth
1001 (*list&MAPPING_LENGTH_MASK); // + mapping length
1002 }
1003 }
1004 } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) {
1005 return U_SENTINEL;
1006 } else {
1007 list=maybeYesCompositions+norm16-minMaybeYes;
1008 }
1009 if(b<0 || 0x10ffff<b) { // combine(list, b) requires a valid code point b
1010 return U_SENTINEL;
1011 }
1012#if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC
1013 return combine(list, b)>>1;
1014#else
1015 int32_t compositeAndFwd=combine(list, b);
1016 return compositeAndFwd>=0 ? compositeAndFwd>>1 : U_SENTINEL;
1017#endif
1018}
1019
729e4ab9
A
// Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
// doCompose: normalize
// !doCompose: isNormalized (buffer must be empty and initialized)
//
// Parameters:
//   src, limit      Input text; limit==NULL means src is NUL-terminated
//                   (the limit is then located with u_strchr()).
//   onlyContiguous  Selects the FCC ("contiguous composition") checks below.
//   doCompose       TRUE: write the composed text into buffer.
//                   FALSE: only test whether the text is already normalized.
//   buffer          Destination (doCompose) or scratch space (!doCompose).
//   errorCode       Standard ICU error code.
//
// Returns FALSE if copying the initial low prefix fails, or, in isNormalized mode,
// as soon as the text is found not to be normalized.
// Note that a failed buffer append leaves the main loop with "break" and the
// function then still returns TRUE; callers need to check errorCode as well.
UBool
Normalizer2Impl::compose(const UChar *src, const UChar *limit,
                         UBool onlyContiguous,
                         UBool doCompose,
                         ReorderingBuffer &buffer,
                         UErrorCode &errorCode) const {
    /*
     * prevBoundary points to the last character before the current one
     * that has a composition boundary before it with ccc==0 and quick check "yes".
     * Keeping track of prevBoundary saves us looking for a composition boundary
     * when we find a "no" or "maybe".
     *
     * When we back out from prevSrc back to prevBoundary,
     * then we also remove those same characters (which had been simply copied
     * or canonically-order-inserted) from the ReorderingBuffer.
     * Therefore, at all times, the [prevBoundary..prevSrc[ source units
     * must correspond 1:1 to destination units at the end of the destination buffer.
     */
    const UChar *prevBoundary=src;
    UChar32 minNoMaybeCP=minCompNoMaybeCP;
    if(limit==NULL) {
        // NUL-terminated input: handle the leading run of code units below
        // minNoMaybeCP first, then determine the real limit.
        src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP,
                                           doCompose ? &buffer : NULL,
                                           errorCode);
        if(U_FAILURE(errorCode)) {
            return FALSE;
        }
        if(prevBoundary<src) {
            // Set prevBoundary to the last character in the prefix.
            prevBoundary=src-1;
        }
        limit=u_strchr(src, 0);
    }

    const UChar *prevSrc;
    UChar32 c=0;
    uint16_t norm16=0;

    // only for isNormalized
    uint8_t prevCC=0;

    // Each iteration first skips a run of characters that pass the quick check,
    // then deals with the single character c that ended the run.
    for(;;) {
        // count code units below the minimum or with irrelevant data for the quick check
        for(prevSrc=src; src!=limit;) {
            if( (c=*src)<minNoMaybeCP ||
                isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
            ) {
                ++src;
            } else if(!U16_IS_SURROGATE(c)) {
                break;
            } else {
                // c is a surrogate code unit: assemble the supplementary code point
                // if it is paired, then look up the data for the whole code point.
                UChar c2;
                if(U16_IS_SURROGATE_LEAD(c)) {
                    if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
                        c=U16_GET_SUPPLEMENTARY(c, c2);
                    }
                } else /* trail surrogate */ {
                    if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
                        --src;
                        c=U16_GET_SUPPLEMENTARY(c2, c);
                    }
                }
                if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
                    src+=U16_LENGTH(c);
                } else {
                    break;
                }
            }
        }
        // copy these code units all at once
        if(src!=prevSrc) {
            if(doCompose) {
                if(!buffer.appendZeroCC(prevSrc, src, errorCode)) {
                    break;
                }
            } else {
                prevCC=0;
            }
            if(src==limit) {
                break;
            }
            // Set prevBoundary to the last character in the quick check loop.
            prevBoundary=src-1;
            if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
                U16_IS_LEAD(*(prevBoundary-1))
            ) {
                --prevBoundary;
            }
            // The start of the current character (c).
            prevSrc=src;
        } else if(src==limit) {
            break;
        }

        src+=U16_LENGTH(c);
        /*
         * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
         * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
         * or has ccc!=0.
         * Check for Jamo V/T, then for regular characters.
         * c is not a Hangul syllable or Jamo L because those have "yes" properties.
         */
        if(isJamoVT(norm16) && prevBoundary!=prevSrc) {
            UChar prev=*(prevSrc-1);
            UBool needToDecompose=FALSE;
            if(c<Hangul::JAMO_T_BASE) {
                // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
                prev=(UChar)(prev-Hangul::JAMO_L_BASE);
                if(prev<Hangul::JAMO_L_COUNT) {
                    if(!doCompose) {
                        return FALSE;
                    }
                    UChar syllable=(UChar)
                        (Hangul::HANGUL_BASE+
                         (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
                         Hangul::JAMO_T_COUNT);
                    UChar t;
                    if(src!=limit && (t=(UChar)(*src-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
                        ++src;
                        syllable+=t;  // The next character was a Jamo T.
                        prevBoundary=src;
                        // Replaces the Jamo L that is currently last in the buffer.
                        buffer.setLastChar(syllable);
                        continue;
                    }
                    // If we see L+V+x where x!=T then we drop to the slow path,
                    // decompose and recompose.
                    // This is to deal with NFKC finding normal L and V but a
                    // compatibility variant of a T. We need to either fully compose that
                    // combination here (which would complicate the code and may not work
                    // with strange custom data) or use the slow path -- or else our replacing
                    // two input characters (L+V) with one output character (LV syllable)
                    // would violate the invariant that [prevBoundary..prevSrc[ has the same
                    // length as what we appended to the buffer since prevBoundary.
                    needToDecompose=TRUE;
                }
            } else if(Hangul::isHangulWithoutJamoT(prev)) {
                // c is a Jamo Trailing consonant,
                // compose with previous Hangul LV that does not contain a Jamo T.
                if(!doCompose) {
                    return FALSE;
                }
                buffer.setLastChar((UChar)(prev+c-Hangul::JAMO_T_BASE));
                prevBoundary=src;
                continue;
            }
            if(!needToDecompose) {
                // The Jamo V/T did not compose into a Hangul syllable.
                if(doCompose) {
                    if(!buffer.appendBMP((UChar)c, 0, errorCode)) {
                        break;
                    }
                } else {
                    prevCC=0;
                }
                continue;
            }
        }
        /*
         * Source buffer pointers:
         *
         *  all done      quick check   current char  not yet
         *                "yes" but     (c)           processed
         *                may combine
         *                forward
         * [-------------[-------------[-------------[-------------[
         * |             |             |             |             |
         * orig. src     prevBoundary  prevSrc       src           limit
         *
         *
         * Destination buffer pointers inside the ReorderingBuffer:
         *
         *  all done      might take    not filled yet
         *                characters for
         *                reordering
         * [-------------[-------------[-------------[
         * |             |             |             |
         * start         reorderStart  limit         |
         *                             +remainingCap.+
         */
        if(norm16>=MIN_YES_YES_WITH_CC) {
            uint8_t cc=(uint8_t)norm16;  // cc!=0
            if( onlyContiguous &&  // FCC
                (doCompose ? buffer.getLastCC() : prevCC)==0 &&
                prevBoundary<prevSrc &&
                // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that
                // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
                // passed the quick check "yes && ccc==0" test.
                // Check whether the last character was a "yesYes" or a "yesNo".
                // If a "yesNo", then we get its trailing ccc from its
                // mapping and check for canonical order.
                // All other cases are ok.
                getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
            ) {
                // Fails FCD test, need to decompose and contiguously recompose.
                if(!doCompose) {
                    return FALSE;
                }
            } else if(doCompose) {
                if(!buffer.append(c, cc, errorCode)) {
                    break;
                }
                continue;
            } else if(prevCC<=cc) {
                // isNormalized mode: combining classes are still in non-decreasing order.
                prevCC=cc;
                continue;
            } else {
                return FALSE;
            }
        } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {
            // isNormalized mode and c is neither "maybe" nor has ccc!=0: not normalized.
            return FALSE;
        }

        /*
         * Find appropriate boundaries around this character,
         * decompose the source text from between the boundaries,
         * and recompose it.
         *
         * We may need to remove the last few characters from the ReorderingBuffer
         * to account for source text that was copied or appended
         * but needs to take part in the recomposition.
         */

        /*
         * Find the last composition boundary in [prevBoundary..src[.
         * It is either the decomposition of the current character (at prevSrc),
         * or prevBoundary.
         */
        if(hasCompBoundaryBefore(c, norm16)) {
            prevBoundary=prevSrc;
        } else if(doCompose) {
            buffer.removeSuffix((int32_t)(prevSrc-prevBoundary));
        }

        // Find the next composition boundary in [src..limit[ -
        // modifies src to point to the next starter.
        src=(UChar *)findNextCompBoundary(src, limit);

        // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
        int32_t recomposeStartIndex=buffer.length();
        if(!decomposeShort(prevBoundary, src, buffer, errorCode)) {
            break;
        }
        recompose(buffer, recomposeStartIndex, onlyContiguous);
        if(!doCompose) {
            // isNormalized mode: the recomposed piece must equal the source piece;
            // then empty the scratch buffer again for the next piece.
            if(!buffer.equals(prevBoundary, src)) {
                return FALSE;
            }
            buffer.remove();
            prevCC=0;
        }

        // Move to the next starter. We never need to look back before this point again.
        prevBoundary=src;
    }
    return TRUE;
}
1279
// Very similar to compose(): Make the same changes in both places if relevant.
// pQCResult==NULL: spanQuickCheckYes
// pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES)
//
// Parameters:
//   src, limit      Input text; limit==NULL means src is NUL-terminated.
//   onlyContiguous  Selects the additional FCC test below.
//   pQCResult       See above; set to UNORM_MAYBE or UNORM_NO when such a character is found.
//
// Returns limit when the end of the text is reached without a "no"
// (and, in spanQuickCheckYes mode, without a "maybe").
// Otherwise returns prevBoundary, the start of the last character before the
// offending one that has a composition boundary before it.
const UChar *
Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit,
                                   UBool onlyContiguous,
                                   UNormalizationCheckResult *pQCResult) const {
    /*
     * prevBoundary points to the last character before the current one
     * that has a composition boundary before it with ccc==0 and quick check "yes".
     */
    const UChar *prevBoundary=src;
    UChar32 minNoMaybeCP=minCompNoMaybeCP;
    if(limit==NULL) {
        // No buffer is passed here, so this call only skips the low prefix;
        // the local errorCode is not inspected afterwards.
        UErrorCode errorCode=U_ZERO_ERROR;
        src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode);
        if(prevBoundary<src) {
            // Set prevBoundary to the last character in the prefix.
            prevBoundary=src-1;
        }
        limit=u_strchr(src, 0);
    }

    const UChar *prevSrc;
    UChar32 c=0;
    uint16_t norm16=0;
    uint8_t prevCC=0;

    for(;;) {
        // count code units below the minimum or with irrelevant data for the quick check
        for(prevSrc=src;;) {
            if(src==limit) {
                // Reached the end without finding a "no" (or a "maybe" in span mode).
                return src;
            }
            if( (c=*src)<minNoMaybeCP ||
                isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
            ) {
                ++src;
            } else if(!U16_IS_SURROGATE(c)) {
                break;
            } else {
                // c is a surrogate code unit: assemble the supplementary code point
                // if it is paired, then look up the data for the whole code point.
                UChar c2;
                if(U16_IS_SURROGATE_LEAD(c)) {
                    if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
                        c=U16_GET_SUPPLEMENTARY(c, c2);
                    }
                } else /* trail surrogate */ {
                    if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
                        --src;
                        c=U16_GET_SUPPLEMENTARY(c2, c);
                    }
                }
                if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
                    src+=U16_LENGTH(c);
                } else {
                    break;
                }
            }
        }
        if(src!=prevSrc) {
            // Set prevBoundary to the last character in the quick check loop.
            prevBoundary=src-1;
            if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
                U16_IS_LEAD(*(prevBoundary-1))
            ) {
                --prevBoundary;
            }
            prevCC=0;
            // The start of the current character (c).
            prevSrc=src;
        }

        src+=U16_LENGTH(c);
        /*
         * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
         * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
         * or has ccc!=0.
         */
        if(isMaybeOrNonZeroCC(norm16)) {
            uint8_t cc=getCCFromYesOrMaybe(norm16);
            if( onlyContiguous &&  // FCC
                cc!=0 &&
                prevCC==0 &&
                prevBoundary<prevSrc &&
                // prevCC==0 && prevBoundary<prevSrc tell us that
                // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
                // passed the quick check "yes && ccc==0" test.
                // Check whether the last character was a "yesYes" or a "yesNo".
                // If a "yesNo", then we get its trailing ccc from its
                // mapping and check for canonical order.
                // All other cases are ok.
                getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
            ) {
                // Fails FCD test.
                // Falls through to the "no" result below.
            } else if(prevCC<=cc || cc==0) {
                // Combining classes are in order (or c is a starter).
                prevCC=cc;
                if(norm16<MIN_YES_YES_WITH_CC) {
                    // Not a plain "yes with ccc!=0": c is a "maybe" character.
                    if(pQCResult!=NULL) {
                        *pQCResult=UNORM_MAYBE;
                    } else {
                        return prevBoundary;
                    }
                }
                continue;
            }
        }
        // Quick check "no": has a mapping, is out of canonical order, or fails the FCC test.
        if(pQCResult!=NULL) {
            *pQCResult=UNORM_NO;
        }
        return prevBoundary;
    }
}
1392
1393void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit,
1394 UBool doCompose,
1395 UBool onlyContiguous,
4388f060 1396 UnicodeString &safeMiddle,
729e4ab9
A
1397 ReorderingBuffer &buffer,
1398 UErrorCode &errorCode) const {
1399 if(!buffer.isEmpty()) {
1400 const UChar *firstStarterInSrc=findNextCompBoundary(src, limit);
1401 if(src!=firstStarterInSrc) {
1402 const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(),
1403 buffer.getLimit());
4388f060
A
1404 int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastStarterInDest);
1405 UnicodeString middle(lastStarterInDest, destSuffixLength);
1406 buffer.removeSuffix(destSuffixLength);
1407 safeMiddle=middle;
729e4ab9
A
1408 middle.append(src, (int32_t)(firstStarterInSrc-src));
1409 const UChar *middleStart=middle.getBuffer();
1410 compose(middleStart, middleStart+middle.length(), onlyContiguous,
1411 TRUE, buffer, errorCode);
1412 if(U_FAILURE(errorCode)) {
1413 return;
1414 }
1415 src=firstStarterInSrc;
1416 }
1417 }
1418 if(doCompose) {
1419 compose(src, limit, onlyContiguous, TRUE, buffer, errorCode);
1420 } else {
4388f060
A
1421 if(limit==NULL) { // appendZeroCC() needs limit!=NULL
1422 limit=u_strchr(src, 0);
1423 }
729e4ab9
A
1424 buffer.appendZeroCC(src, limit, errorCode);
1425 }
1426}
1427
1428/**
1429 * Does c have a composition boundary before it?
1430 * True if its decomposition begins with a character that has
1431 * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
1432 * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
1433 * (isCompYesAndZeroCC()) so we need not decompose.
1434 */
1435UBool Normalizer2Impl::hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const {
1436 for(;;) {
1437 if(isCompYesAndZeroCC(norm16)) {
1438 return TRUE;
1439 } else if(isMaybeOrNonZeroCC(norm16)) {
1440 return FALSE;
1441 } else if(isDecompNoAlgorithmic(norm16)) {
1442 c=mapAlgorithmic(c, norm16);
1443 norm16=getNorm16(c);
1444 } else {
1445 // c decomposes, get everything from the variable-length extra data
1446 const uint16_t *mapping=getMapping(norm16);
4388f060 1447 uint16_t firstUnit=*mapping;
729e4ab9
A
1448 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
1449 return FALSE;
1450 }
4388f060 1451 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD) && (*(mapping-1)&0xff00)) {
729e4ab9
A
1452 return FALSE; // non-zero leadCC
1453 }
4388f060 1454 int32_t i=1; // skip over the firstUnit
729e4ab9
A
1455 UChar32 c;
1456 U16_NEXT_UNSAFE(mapping, i, c);
1457 return isCompYesAndZeroCC(getNorm16(c));
1458 }
1459 }
1460}
1461
1462UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const {
1463 for(;;) {
1464 uint16_t norm16=getNorm16(c);
1465 if(isInert(norm16)) {
1466 return TRUE;
1467 } else if(norm16<=minYesNo) {
4388f060
A
1468 // Hangul: norm16==minYesNo
1469 // Hangul LVT has a boundary after it.
729e4ab9
A
1470 // Hangul LV and non-inert yesYes characters combine forward.
1471 return isHangul(norm16) && !Hangul::isHangulWithoutJamoT((UChar)c);
1472 } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) {
1473 return FALSE;
1474 } else if(isDecompNoAlgorithmic(norm16)) {
1475 c=mapAlgorithmic(c, norm16);
1476 } else {
1477 // c decomposes, get everything from the variable-length extra data.
1478 // If testInert, then c must be a yesNo character which has lccc=0,
1479 // otherwise it could be a noNo.
1480 const uint16_t *mapping=getMapping(norm16);
1481 uint16_t firstUnit=*mapping;
1482 // TRUE if
4388f060
A
1483 // not MAPPING_NO_COMP_BOUNDARY_AFTER
1484 // (which is set if
1485 // c is not deleted, and
1486 // it and its decomposition do not combine forward, and it has a starter)
1487 // and if FCC then trailCC<=1
729e4ab9 1488 return
4388f060 1489 (firstUnit&MAPPING_NO_COMP_BOUNDARY_AFTER)==0 &&
729e4ab9
A
1490 (!onlyContiguous || firstUnit<=0x1ff);
1491 }
1492 }
1493}
1494
1495const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p) const {
1496 BackwardUTrie2StringIterator iter(normTrie, start, p);
1497 uint16_t norm16;
1498 do {
1499 norm16=iter.previous16();
1500 } while(!hasCompBoundaryBefore(iter.codePoint, norm16));
1501 // We could also test hasCompBoundaryAfter() and return iter.codePointLimit,
1502 // but that's probably not worth the extra cost.
1503 return iter.codePointStart;
1504}
1505
1506const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit) const {
1507 ForwardUTrie2StringIterator iter(normTrie, p, limit);
1508 uint16_t norm16;
1509 do {
1510 norm16=iter.next16();
1511 } while(!hasCompBoundaryBefore(iter.codePoint, norm16));
1512 return iter.codePointStart;
1513}
1514
4388f060
A
1515// Note: normalizer2impl.cpp r30982 (2011-nov-27)
1516// still had getFCDTrie() which built and cached an FCD trie.
1517// That provided faster access to FCD data than getFCD16FromNormData()
1518// but required synchronization and consumed some 10kB of heap memory
1519// in any process that uses FCD (e.g., via collation).
1520// tccc180[] and smallFCD[] are intended to help with any loss of performance,
1521// at least for Latin & CJK.
729e4ab9 1522
4388f060
A
1523// Gets the FCD value from the regular normalization data.
1524uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const {
729e4ab9
A
1525 // Only loops for 1:1 algorithmic mappings.
1526 for(;;) {
4388f060
A
1527 uint16_t norm16=getNorm16(c);
1528 if(norm16<=minYesNo) {
729e4ab9 1529 // no decomposition or Hangul syllable, all zeros
4388f060
A
1530 return 0;
1531 } else if(norm16>=MIN_NORMAL_MAYBE_YES) {
1532 // combining mark
1533 norm16&=0xff;
1534 return norm16|(norm16<<8);
1535 } else if(norm16>=minMaybeYes) {
1536 return 0;
1537 } else if(isDecompNoAlgorithmic(norm16)) {
1538 c=mapAlgorithmic(c, norm16);
729e4ab9
A
1539 } else {
1540 // c decomposes, get everything from the variable-length extra data
1541 const uint16_t *mapping=getMapping(norm16);
1542 uint16_t firstUnit=*mapping;
1543 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
1544 // A character that is deleted (maps to an empty string) must
1545 // get the worst-case lccc and tccc values because arbitrary
1546 // characters on both sides will become adjacent.
4388f060 1547 return 0x1ff;
729e4ab9 1548 } else {
4388f060 1549 norm16=firstUnit>>8; // tccc
729e4ab9 1550 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
4388f060 1551 norm16|=*(mapping-1)&0xff00; // lccc
729e4ab9 1552 }
4388f060 1553 return norm16;
729e4ab9
A
1554 }
1555 }
729e4ab9
A
1556 }
1557}
1558
729e4ab9
A
// Dual functionality:
// buffer!=NULL: normalize
// buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
//
// Parameters:
//   src, limit  Input text; limit==NULL means src is NUL-terminated.
//   buffer      Destination for the FCD text, or NULL for checking only.
//   errorCode   Standard ICU error code.
//
// Returns the current src position when the loop ends (limit on normal completion,
// or wherever processing stopped after a failed buffer operation), or, when
// buffer==NULL, prevBoundary as soon as text is found that is not in FCD order.
const UChar *
Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
                         ReorderingBuffer *buffer,
                         UErrorCode &errorCode) const {
    // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
    // Similar to the prevBoundary in the compose() implementation.
    const UChar *prevBoundary=src;
    // FCD value of the previous character; a negative value is the bitwise
    // inverse (~c) of a low code point whose FCD lookup has been deferred.
    int32_t prevFCD16=0;
    if(limit==NULL) {
        src=copyLowPrefixFromNulTerminated(src, MIN_CCC_LCCC_CP, buffer, errorCode);
        if(U_FAILURE(errorCode)) {
            return src;
        }
        if(prevBoundary<src) {
            prevBoundary=src;
            // We know that the previous character's lccc==0.
            // Fetching the fcd16 value was deferred for this below-U+0300 code point.
            prevFCD16=getFCD16(*(src-1));
            if(prevFCD16>1) {
                --prevBoundary;
            }
        }
        limit=u_strchr(src, 0);
    }

    // Note: In this function we use buffer->appendZeroCC() because we track
    // the lead and trail combining classes here, rather than leaving it to
    // the ReorderingBuffer.
    // The exception is the call to decomposeShort() which uses the buffer
    // in the normal way.

    const UChar *prevSrc;
    UChar32 c=0;
    uint16_t fcd16=0;

    for(;;) {
        // count code units with lccc==0
        for(prevSrc=src; src!=limit;) {
            if((c=*src)<MIN_CCC_LCCC_CP) {
                // Defer the FCD lookup: remember the code point as a negative value.
                prevFCD16=~c;
                ++src;
            } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
                prevFCD16=0;
                ++src;
            } else {
                if(U16_IS_SURROGATE(c)) {
                    // Assemble the supplementary code point if the surrogate is paired.
                    UChar c2;
                    if(U16_IS_SURROGATE_LEAD(c)) {
                        if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
                            c=U16_GET_SUPPLEMENTARY(c, c2);
                        }
                    } else /* trail surrogate */ {
                        if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
                            --src;
                            c=U16_GET_SUPPLEMENTARY(c2, c);
                        }
                    }
                }
                // fcd16<=0xff means that the upper byte (lccc) is 0.
                if((fcd16=getFCD16FromNormData(c))<=0xff) {
                    prevFCD16=fcd16;
                    src+=U16_LENGTH(c);
                } else {
                    break;
                }
            }
        }
        // copy these code units all at once
        if(src!=prevSrc) {
            if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) {
                break;
            }
            if(src==limit) {
                break;
            }
            prevBoundary=src;
            // We know that the previous character's lccc==0.
            if(prevFCD16<0) {
                // Fetching the fcd16 value was deferred for this below-U+0300 code point.
                UChar32 prev=~prevFCD16;
                prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev);
                if(prevFCD16>1) {
                    --prevBoundary;
                }
            } else {
                const UChar *p=src-1;
                if(U16_IS_TRAIL(*p) && prevSrc<p && U16_IS_LEAD(*(p-1))) {
                    --p;
                    // Need to fetch the previous character's FCD value because
                    // prevFCD16 was just for the trail surrogate code point.
                    prevFCD16=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p[0], p[1]));
                    // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
                }
                if(prevFCD16>1) {
                    prevBoundary=p;
                }
            }
            // The start of the current character (c).
            prevSrc=src;
        } else if(src==limit) {
            break;
        }

        src+=U16_LENGTH(c);
        // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
        // Check for proper order, and decompose locally if necessary.
        if((prevFCD16&0xff)<=(fcd16>>8)) {
            // proper order: prev tccc <= current lccc
            if((fcd16&0xff)<=1) {
                prevBoundary=src;
            }
            if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) {
                break;
            }
            prevFCD16=fcd16;
            continue;
        } else if(buffer==NULL) {
            return prevBoundary;  // quick check "no"
        } else {
            /*
             * Back out the part of the source that we copied or appended
             * already but is now going to be decomposed.
             * prevSrc is set to after what was copied/appended.
             */
            buffer->removeSuffix((int32_t)(prevSrc-prevBoundary));
            /*
             * Find the part of the source that needs to be decomposed,
             * up to the next safe boundary.
             */
            src=findNextFCDBoundary(src, limit);
            /*
             * The source text does not fulfill the conditions for FCD.
             * Decompose and reorder a limited piece of the text.
             */
            if(!decomposeShort(prevBoundary, src, *buffer, errorCode)) {
                break;
            }
            prevBoundary=src;
            prevFCD16=0;
        }
    }
    return src;
}
1704
1705void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit,
1706 UBool doMakeFCD,
4388f060 1707 UnicodeString &safeMiddle,
729e4ab9
A
1708 ReorderingBuffer &buffer,
1709 UErrorCode &errorCode) const {
1710 if(!buffer.isEmpty()) {
1711 const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit);
1712 if(src!=firstBoundaryInSrc) {
1713 const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(),
1714 buffer.getLimit());
4388f060
A
1715 int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastBoundaryInDest);
1716 UnicodeString middle(lastBoundaryInDest, destSuffixLength);
1717 buffer.removeSuffix(destSuffixLength);
1718 safeMiddle=middle;
729e4ab9
A
1719 middle.append(src, (int32_t)(firstBoundaryInSrc-src));
1720 const UChar *middleStart=middle.getBuffer();
1721 makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode);
1722 if(U_FAILURE(errorCode)) {
1723 return;
1724 }
1725 src=firstBoundaryInSrc;
1726 }
1727 }
1728 if(doMakeFCD) {
1729 makeFCD(src, limit, &buffer, errorCode);
1730 } else {
4388f060
A
1731 if(limit==NULL) { // appendZeroCC() needs limit!=NULL
1732 limit=u_strchr(src, 0);
1733 }
729e4ab9
A
1734 buffer.appendZeroCC(src, limit, errorCode);
1735 }
1736}
1737
1738const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const {
4388f060
A
1739 while(start<p && previousFCD16(start, p)>0xff) {}
1740 return p;
729e4ab9
A
1741}
1742
1743const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const {
4388f060
A
1744 while(p<limit) {
1745 const UChar *codePointStart=p;
1746 if(nextFCD16(p, limit)<=0xff) {
1747 return codePointStart;
1748 }
1749 }
1750 return p;
729e4ab9
A
1751}
1752
1753// CanonicalIterator data -------------------------------------------------- ***
1754
1755CanonIterData::CanonIterData(UErrorCode &errorCode) :
1756 trie(utrie2_open(0, 0, &errorCode)),
4388f060 1757 canonStartSets(uprv_deleteUObject, NULL, errorCode) {}
729e4ab9
A
1758
1759CanonIterData::~CanonIterData() {
1760 utrie2_close(trie);
1761}
1762
1763void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) {
1764 uint32_t canonValue=utrie2_get32(trie, decompLead);
1765 if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {
1766 // origin is the first character whose decomposition starts with
1767 // the character for which we are setting the value.
1768 utrie2_set32(trie, decompLead, canonValue|origin, &errorCode);
1769 } else {
1770 // origin is not the first character, or it is U+0000.
1771 UnicodeSet *set;
1772 if((canonValue&CANON_HAS_SET)==0) {
1773 set=new UnicodeSet;
1774 if(set==NULL) {
1775 errorCode=U_MEMORY_ALLOCATION_ERROR;
1776 return;
1777 }
1778 UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK);
1779 canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size();
1780 utrie2_set32(trie, decompLead, canonValue, &errorCode);
1781 canonStartSets.addElement(set, errorCode);
1782 if(firstOrigin!=0) {
1783 set->add(firstOrigin);
1784 }
1785 } else {
1786 set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)];
1787 }
1788 set->add(origin);
1789 }
1790}
1791
1792class CanonIterDataSingleton {
1793public:
1794 CanonIterDataSingleton(SimpleSingleton &s, Normalizer2Impl &ni, UErrorCode &ec) :
1795 singleton(s), impl(ni), errorCode(ec) {}
1796 CanonIterData *getInstance(UErrorCode &errorCode) {
1797 void *duplicate;
1798 CanonIterData *instance=
1799 (CanonIterData *)singleton.getInstance(createInstance, this, duplicate, errorCode);
1800 delete (CanonIterData *)duplicate;
1801 return instance;
1802 }
1803 static void *createInstance(const void *context, UErrorCode &errorCode);
1804 UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
1805 if(value!=0) {
1806 impl.makeCanonIterDataFromNorm16(start, end, (uint16_t)value, *newData, errorCode);
1807 }
1808 return U_SUCCESS(errorCode);
1809 }
1810
1811private:
1812 SimpleSingleton &singleton;
1813 Normalizer2Impl &impl;
1814 CanonIterData *newData;
1815 UErrorCode &errorCode;
1816};
1817
1818U_CDECL_BEGIN
1819
1820// Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.
1821static UBool U_CALLCONV
1822enumCIDRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
1823 return ((CanonIterDataSingleton *)context)->rangeHandler(start, end, value);
1824}
1825
1826U_CDECL_END
1827
1828void *CanonIterDataSingleton::createInstance(const void *context, UErrorCode &errorCode) {
1829 CanonIterDataSingleton *me=(CanonIterDataSingleton *)context;
1830 me->newData=new CanonIterData(errorCode);
1831 if(me->newData==NULL) {
1832 errorCode=U_MEMORY_ALLOCATION_ERROR;
1833 return NULL;
1834 }
1835 if(U_SUCCESS(errorCode)) {
1836 utrie2_enum(me->impl.getNormTrie(), NULL, enumCIDRangeHandler, me);
1837 utrie2_freeze(me->newData->trie, UTRIE2_32_VALUE_BITS, &errorCode);
1838 if(U_SUCCESS(errorCode)) {
1839 return me->newData;
1840 }
1841 }
1842 delete me->newData;
1843 return NULL;
1844}
1845
/**
 * Adds the canonical-iterator data for all code points in [start..end],
 * which all share the same norm16 value, to newData.
 * For each code point this may set flag bits in its own trie value
 * (CANON_NOT_SEGMENT_STARTER, CANON_HAS_COMPOSITIONS), add it to the start set of
 * the first code point of its decomposition, and flag the remaining code points
 * of a one-way decomposition as CANON_NOT_SEGMENT_STARTER.
 * Errors from the trie and set operations are reported through errorCode.
 */
void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
                                                  CanonIterData &newData,
                                                  UErrorCode &errorCode) const {
    if(norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) {
        // Inert, or 2-way mapping (including Hangul syllable).
        // We do not write a canonStartSet for any yesNo character.
        // Composites from 2-way mappings are added at runtime from the
        // starter's compositions list, and the other characters in
        // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
        // "maybe" characters.
        return;
    }
    for(UChar32 c=start; c<=end; ++c) {
        // Collect the new flag bits in newValue and write back only if something changed.
        uint32_t oldValue=utrie2_get32(newData.trie, c);
        uint32_t newValue=oldValue;
        if(norm16>=minMaybeYes) {
            // not a segment starter if it occurs in a decomposition or has cc!=0
            newValue|=CANON_NOT_SEGMENT_STARTER;
            if(norm16<MIN_NORMAL_MAYBE_YES) {
                newValue|=CANON_HAS_COMPOSITIONS;
            }
        } else if(norm16<minYesNo) {
            newValue|=CANON_HAS_COMPOSITIONS;
        } else {
            // c has a one-way decomposition
            UChar32 c2=c;
            uint16_t norm16_2=norm16;
            // Follow algorithmic 1:1 mappings to the final target c2.
            while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) {
                c2=mapAlgorithmic(c2, norm16_2);
                norm16_2=getNorm16(c2);
            }
            if(minYesNo<=norm16_2 && norm16_2<limitNoNo) {
                // c decomposes, get everything from the variable-length extra data
                const uint16_t *mapping=getMapping(norm16_2);
                uint16_t firstUnit=*mapping;
                int32_t length=firstUnit&MAPPING_LENGTH_MASK;
                if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
                    // The unit before the mapping holds the ccc in its lower byte;
                    // it applies to c only if no algorithmic mapping was followed.
                    if(c==c2 && (*(mapping-1)&0xff)!=0) {
                        newValue|=CANON_NOT_SEGMENT_STARTER;  // original c has cc!=0
                    }
                }
                // Skip empty mappings (no characters in the decomposition).
                if(length!=0) {
                    ++mapping;  // skip over the firstUnit
                    // add c to first code point's start set
                    int32_t i=0;
                    U16_NEXT_UNSAFE(mapping, i, c2);
                    newData.addToStartSet(c, c2, errorCode);
                    // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
                    // one-way mapping. A 2-way mapping is possible here after
                    // intermediate algorithmic mapping.
                    if(norm16_2>=minNoNo) {
                        while(i<length) {
                            U16_NEXT_UNSAFE(mapping, i, c2);
                            uint32_t c2Value=utrie2_get32(newData.trie, c2);
                            if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
                                utrie2_set32(newData.trie, c2, c2Value|CANON_NOT_SEGMENT_STARTER,
                                             &errorCode);
                            }
                        }
                    }
                }
            } else {
                // c decomposed to c2 algorithmically; c has cc==0
                newData.addToStartSet(c, c2, errorCode);
            }
        }
        if(newValue!=oldValue) {
            utrie2_set32(newData.trie, c, newValue, &errorCode);
        }
    }
}
1918
1919UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const {
1920 // Logically const: Synchronized instantiation.
1921 Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this);
1922 CanonIterDataSingleton(me->canonIterDataSingleton, *me, errorCode).getInstance(errorCode);
1923 return U_SUCCESS(errorCode);
1924}
1925
1926int32_t Normalizer2Impl::getCanonValue(UChar32 c) const {
1927 return (int32_t)utrie2_get32(((CanonIterData *)canonIterDataSingleton.fInstance)->trie, c);
1928}
1929
1930const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const {
1931 return *(const UnicodeSet *)(
1932 ((CanonIterData *)canonIterDataSingleton.fInstance)->canonStartSets[n]);
1933}
1934
1935UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const {
1936 return getCanonValue(c)>=0;
1937}
1938
1939UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const {
1940 int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER;
1941 if(canonValue==0) {
1942 return FALSE;
1943 }
1944 set.clear();
1945 int32_t value=canonValue&CANON_VALUE_MASK;
1946 if((canonValue&CANON_HAS_SET)!=0) {
1947 set.addAll(getCanonStartSet(value));
1948 } else if(value!=0) {
1949 set.add(value);
1950 }
1951 if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
1952 uint16_t norm16=getNorm16(c);
1953 if(norm16==JAMO_L) {
1954 UChar32 syllable=
1955 (UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT);
1956 set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1);
1957 } else {
1958 addComposites(getCompositionsList(norm16), set);
1959 }
1960 }
1961 return TRUE;
1962}
1963
1964U_NAMESPACE_END
1965
1966// Normalizer2 data swapping ----------------------------------------------- ***
1967
1968U_NAMESPACE_USE
1969
1970U_CAPI int32_t U_EXPORT2
1971unorm2_swap(const UDataSwapper *ds,
1972 const void *inData, int32_t length, void *outData,
1973 UErrorCode *pErrorCode) {
1974 const UDataInfo *pInfo;
1975 int32_t headerSize;
1976
1977 const uint8_t *inBytes;
1978 uint8_t *outBytes;
1979
1980 const int32_t *inIndexes;
1981 int32_t indexes[Normalizer2Impl::IX_MIN_MAYBE_YES+1];
1982
1983 int32_t i, offset, nextOffset, size;
1984
1985 /* udata_swapDataHeader checks the arguments */
1986 headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
1987 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1988 return 0;
1989 }
1990
1991 /* check data format and format version */
1992 pInfo=(const UDataInfo *)((const char *)inData+4);
1993 if(!(
1994 pInfo->dataFormat[0]==0x4e && /* dataFormat="Nrm2" */
1995 pInfo->dataFormat[1]==0x72 &&
1996 pInfo->dataFormat[2]==0x6d &&
1997 pInfo->dataFormat[3]==0x32 &&
4388f060 1998 (pInfo->formatVersion[0]==1 || pInfo->formatVersion[0]==2)
729e4ab9
A
1999 )) {
2000 udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",
2001 pInfo->dataFormat[0], pInfo->dataFormat[1],
2002 pInfo->dataFormat[2], pInfo->dataFormat[3],
2003 pInfo->formatVersion[0]);
2004 *pErrorCode=U_UNSUPPORTED_ERROR;
2005 return 0;
2006 }
2007
2008 inBytes=(const uint8_t *)inData+headerSize;
2009 outBytes=(uint8_t *)outData+headerSize;
2010
2011 inIndexes=(const int32_t *)inBytes;
2012
2013 if(length>=0) {
2014 length-=headerSize;
2015 if(length<(int32_t)sizeof(indexes)) {
2016 udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n",
2017 length);
2018 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2019 return 0;
2020 }
2021 }
2022
2023 /* read the first few indexes */
2024 for(i=0; i<=Normalizer2Impl::IX_MIN_MAYBE_YES; ++i) {
2025 indexes[i]=udata_readInt32(ds, inIndexes[i]);
2026 }
2027
2028 /* get the total length of the data */
2029 size=indexes[Normalizer2Impl::IX_TOTAL_SIZE];
2030
2031 if(length>=0) {
2032 if(length<size) {
2033 udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n",
2034 length);
2035 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2036 return 0;
2037 }
2038
2039 /* copy the data for inaccessible bytes */
2040 if(inBytes!=outBytes) {
2041 uprv_memcpy(outBytes, inBytes, size);
2042 }
2043
2044 offset=0;
2045
2046 /* swap the int32_t indexes[] */
2047 nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET];
2048 ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode);
2049 offset=nextOffset;
2050
2051 /* swap the UTrie2 */
2052 nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET];
2053 utrie2_swap(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
2054 offset=nextOffset;
2055
2056 /* swap the uint16_t extraData[] */
4388f060 2057 nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET];
729e4ab9
A
2058 ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
2059 offset=nextOffset;
2060
4388f060
A
2061 /* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */
2062 nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1];
2063 offset=nextOffset;
2064
729e4ab9
A
2065 U_ASSERT(offset==size);
2066 }
2067
2068 return headerSize+size;
2069}
2070
2071#endif // !UCONFIG_NO_NORMALIZATION