2 *******************************************************************************
4 * Copyright (C) 2009-2010, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: normalizer2impl.cpp
10 * tab size: 8 (not used)
13 * created on: 2009nov22
14 * created by: Markus W. Scherer
17 #include "unicode/utypes.h"
19 #if !UCONFIG_NO_NORMALIZATION
21 #include "unicode/normalizer2.h"
22 #include "unicode/udata.h"
23 #include "unicode/ustring.h"
26 #include "normalizer2impl.h"
35 // ReorderingBuffer -------------------------------------------------------- ***
// Prepares this ReorderingBuffer for appending to str.
// Requests a writable buffer of at least destCapacity code units from str, records the
// capacity that is left after the existing contents (getCapacity() minus the old length),
// and moves reorderStart after the last code point with cc<=1 by walking backward
// with previousCC().
// errorCode is assigned U_MEMORY_ALLOCATION_ERROR on the getBuffer() failure path.
// NOTE(review): this listing omits source lines (its embedded numbering skips, e.g. 39 to 41),
// so the guarding condition and the return statements are not visible here.
37 UBool
ReorderingBuffer::init(int32_t destCapacity
, UErrorCode
&errorCode
) {
38 int32_t length
=str
.length();
39 start
=str
.getBuffer(destCapacity
);
41 // getBuffer() already did str.setToBogus()
42 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
46 remainingCapacity
=str
.getCapacity()-length
;
53 // Set reorderStart after the last code point with cc<=1 if there is one.
55 while(previousCC()>1) {}
57 reorderStart
=codePointLimit
;
// Compares the buffered text [start, limit) with the range [otherStart, otherLimit).
// The visible expression requires both lengths to agree and u_memcmp() over that many
// code units to return 0.
62 UBool
ReorderingBuffer::equals(const UChar
*otherStart
, const UChar
*otherLimit
) const {
63 int32_t length
=(int32_t)(limit
-start
);
65 length
==(int32_t)(otherLimit
-otherStart
) &&
66 0==u_memcmp(start
, otherStart
, length
);
// Appends the supplementary code point c with combining class cc.
// Two code units of room are needed; resize(2, errorCode) is called when fewer remain.
// In the visible branch (cc==0 or lastCC<=cc) the trail surrogate is stored at limit[1].
// NOTE(review): the other statements of that branch and the alternative branch are not
// visible in this listing.
69 UBool
ReorderingBuffer::appendSupplementary(UChar32 c
, uint8_t cc
, UErrorCode
&errorCode
) {
70 if(remainingCapacity
<2 && !resize(2, errorCode
)) {
73 if(lastCC
<=cc
|| cc
==0) {
75 limit
[1]=U16_TRAIL(c
);
// Appends the UTF-16 string s of the given length.
// leadCC is the combining class supplied for the first code point of s; trailCC is
// presumably the one for its last code point (its use is not visible in this listing).
// resize(length, errorCode) is called when remainingCapacity is smaller than length.
// Visible fast path (leadCC==0 or lastCC<=leadCC): reorderStart is moved forward and the
// code units are copied one by one to limit.
// Visible slow path: code points are decoded with U16_NEXT; the first one goes through
// insert(c, leadCC), later ones through append(c, leadCC, errorCode) with leadCC looked up
// again from impl.getNorm16(c).
// NOTE(review): several conditions and the return statements are missing from this listing.
88 UBool
ReorderingBuffer::append(const UChar
*s
, int32_t length
,
89 uint8_t leadCC
, uint8_t trailCC
,
90 UErrorCode
&errorCode
) {
94 if(remainingCapacity
<length
&& !resize(length
, errorCode
)) {
97 remainingCapacity
-=length
;
98 if(lastCC
<=leadCC
|| leadCC
==0) {
100 reorderStart
=limit
+length
;
101 } else if(leadCC
<=1) {
102 reorderStart
=limit
+1; // Ok if not a code point boundary.
104 const UChar
*sLimit
=s
+length
;
105 do { *limit
++=*s
++; } while(s
!=sLimit
);
110 U16_NEXT(s
, i
, length
, c
);
111 insert(c
, leadCC
); // insert first code point
113 U16_NEXT(s
, i
, length
, c
);
115 // s must be in NFD, otherwise we need to use getCC().
116 leadCC
=Normalizer2Impl::getCCFromYesOrMaybe(impl
.getNorm16(c
));
120 append(c
, leadCC
, errorCode
);
// Appends the single code point c.
// U16_LENGTH(c) code units are reserved (resize() is called when they do not fit) and
// remainingCapacity is reduced accordingly. For a two-unit code point the lead and trail
// surrogates are written to limit[0] and limit[1].
// NOTE(review): the single-unit store and the final pointer updates are not visible in
// this listing.
126 UBool
ReorderingBuffer::appendZeroCC(UChar32 c
, UErrorCode
&errorCode
) {
127 int32_t cpLength
=U16_LENGTH(c
);
128 if(remainingCapacity
<cpLength
&& !resize(cpLength
, errorCode
)) {
131 remainingCapacity
-=cpLength
;
135 limit
[0]=U16_LEAD(c
);
136 limit
[1]=U16_TRAIL(c
);
// Appends the code units [s, sLimit) in one piece.
// The length is sLimit-s; resize(length, errorCode) is called when it does not fit.
// The text is copied to limit with u_memcpy() and remainingCapacity shrinks by length.
// NOTE(review): the updates of limit and the return statements are not visible in this listing.
144 UBool
ReorderingBuffer::appendZeroCC(const UChar
*s
, const UChar
*sLimit
, UErrorCode
&errorCode
) {
148 int32_t length
=(int32_t)(sLimit
-s
);
149 if(remainingCapacity
<length
&& !resize(length
, errorCode
)) {
152 u_memcpy(limit
, s
, length
);
154 remainingCapacity
-=length
;
// Discards all buffered text: reorderStart and limit are reset to start, and
// remainingCapacity is set back to the full capacity reported by str.
160 void ReorderingBuffer::remove() {
161 reorderStart
=limit
=start
;
162 remainingCapacity
=str
.getCapacity();
// Drops suffixLength code units from the end of the buffer.
// When suffixLength is smaller than the current length (limit-start), remainingCapacity
// grows by suffixLength. The other visible assignment restores the full capacity of str,
// presumably for the case where the whole content is removed.
// NOTE(review): the pointer updates and the else structure are not visible in this listing.
166 void ReorderingBuffer::removeSuffix(int32_t suffixLength
) {
167 if(suffixLength
<(limit
-start
)) {
169 remainingCapacity
+=suffixLength
;
172 remainingCapacity
=str
.getCapacity();
// Enlarges the backing buffer of str so that appendLength more code units fit.
// reorderStart and the current length are first saved as offsets from start so that they
// can be re-based after str.getBuffer() returns the new start pointer; the old buffer is
// handed back with releaseBuffer(length).
// The requested capacity is length+appendLength, raised to twice the current capacity when
// that is larger, and additionally compared against 256 (the assignment for that case is
// not visible in this listing).
// errorCode is assigned U_MEMORY_ALLOCATION_ERROR on the getBuffer() failure path; the
// guarding condition and the return statements are not visible here either.
178 UBool
ReorderingBuffer::resize(int32_t appendLength
, UErrorCode
&errorCode
) {
179 int32_t reorderStartIndex
=(int32_t)(reorderStart
-start
);
180 int32_t length
=(int32_t)(limit
-start
);
181 str
.releaseBuffer(length
);
182 int32_t newCapacity
=length
+appendLength
;
183 int32_t doubleCapacity
=2*str
.getCapacity();
184 if(newCapacity
<doubleCapacity
) {
185 newCapacity
=doubleCapacity
;
187 if(newCapacity
<256) {
190 start
=str
.getBuffer(newCapacity
);
192 // getBuffer() already did str.setToBogus()
193 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
196 reorderStart
=start
+reorderStartIndex
;
198 remainingCapacity
=str
.getCapacity()-length
;
// Moves the backward iterator over one code point without looking up any data.
// codePointLimit takes the old codePointStart and codePointStart steps back one code unit.
// The visible test recognizes a trail surrogate that is preceded, inside the buffer, by a
// lead surrogate.
// NOTE(review): the body of that branch is not visible in this listing.
202 void ReorderingBuffer::skipPrevious() {
203 codePointLimit
=codePointStart
;
204 UChar c
=*--codePointStart
;
205 if(U16_IS_TRAIL(c
) && start
<codePointStart
&& U16_IS_LEAD(*(codePointStart
-1))) {
// Steps the backward iterator to the previous code point and returns its combining class,
// looked up as getCCFromYesOrMaybe(impl.getNorm16(c)).
// A trail surrogate that is preceded by a lead surrogate is merged into one supplementary
// code point with U16_GET_SUPPLEMENTARY before the lookup.
// Two earlier checks are visible: reorderStart>=codePointStart, and
// c<Normalizer2Impl::MIN_CCC_LCCC_CP.
// NOTE(review): what those two branches return is not visible in this listing.
210 uint8_t ReorderingBuffer::previousCC() {
211 codePointLimit
=codePointStart
;
212 if(reorderStart
>=codePointStart
) {
215 UChar32 c
=*--codePointStart
;
216 if(c
<Normalizer2Impl::MIN_CCC_LCCC_CP
) {
221 if(U16_IS_TRAIL(c
) && start
<codePointStart
&& U16_IS_LEAD(c2
=*(codePointStart
-1))) {
223 c
=U16_GET_SUPPLEMENTARY(c2
, c
);
225 return Normalizer2Impl::getCCFromYesOrMaybe(impl
.getNorm16(c
));
// Canonical-order insertion of code point c with combining class cc.
// After setIterator() and one skipPrevious(), previousCC() is called repeatedly until it
// reports a combining class that is not greater than cc. limit grows by U16_LENGTH(c),
// and c is finally stored with writeCodePoint(q, c).
// NOTE(review): the loop that shifts the following text (ending in the visible
// while(codePointLimit!=q)) is only partly present in this listing.
228 // Inserts c somewhere before the last character.
229 // Requires 0<cc<lastCC which implies reorderStart<limit.
230 void ReorderingBuffer::insert(UChar32 c
, uint8_t cc
) {
231 for(setIterator(), skipPrevious(); previousCC()>cc
;) {}
232 // insert c at codePointLimit, after the character with prevCC<=cc
234 UChar
*r
=limit
+=U16_LENGTH(c
);
237 } while(codePointLimit
!=q
);
238 writeCodePoint(q
, c
);
244 // Normalizer2Impl --------------------------------------------------------- ***
// Data object that other code in this file reaches through canonIterDataSingleton.fInstance.
// Visible members: a constructor taking a UErrorCode, addToStartSet(origin, decompLead,
// errorCode), and canonStartSets, a UVector that holds UnicodeSet pointers.
// NOTE(review): further members (for example the trie that addCanonIterPropertyStarts()
// enumerates) are not visible in this listing.
246 struct CanonIterData
: public UMemory
{
247 CanonIterData(UErrorCode
&errorCode
);
249 void addToStartSet(UChar32 origin
, UChar32 decompLead
, UErrorCode
&errorCode
);
251 UVector canonStartSets
; // contains UnicodeSet *
// Destructor. The visible statements close normTrie with utrie2_close(), delete the
// instance held by fcdTrieSingleton, and delete the CanonIterData object stored in
// canonIterDataSingleton.fInstance.
// NOTE(review): source line 255 is missing from this listing.
254 Normalizer2Impl::~Normalizer2Impl() {
256 utrie2_close(normTrie
);
257 UTrie2Singleton(fcdTrieSingleton
).deleteInstance();
258 delete (CanonIterData
*)canonIterDataSingleton
.fInstance
;
// Acceptance callback that load() passes to udata_openChoice().
// The visible condition requires the data header to match the platform endianness and
// charset family, to carry the data format bytes 0x4e 0x72 0x6d 0x32 (Nrm2), and to have
// formatVersion[0]==1.
// context is cast to the Normalizer2Impl being loaded, and four bytes of
// pInfo->dataVersion are copied into its dataVersion field.
// NOTE(review): the return type, the enclosing if and the return statements are not
// visible in this listing.
262 Normalizer2Impl::isAcceptable(void *context
,
263 const char * /* type */, const char * /*name*/,
264 const UDataInfo
*pInfo
) {
267 pInfo
->isBigEndian
==U_IS_BIG_ENDIAN
&&
268 pInfo
->charsetFamily
==U_CHARSET_FAMILY
&&
269 pInfo
->dataFormat
[0]==0x4e && /* dataFormat="Nrm2" */
270 pInfo
->dataFormat
[1]==0x72 &&
271 pInfo
->dataFormat
[2]==0x6d &&
272 pInfo
->dataFormat
[3]==0x32 &&
273 pInfo
->formatVersion
[0]==1
275 Normalizer2Impl
*me
=(Normalizer2Impl
*)context
;
276 uprv_memcpy(me
->dataVersion
, pInfo
->dataVersion
, 4);
// Loads the normalization data item identified by packageName and name.
// The item is opened with udata_openChoice() using the type nrm and isAcceptable() as the
// validator (with this as its context). The start of the data is read as an array of
// int32_t indexes; the number of indexes is taken as the trie offset divided by 4 and must
// be greater than IX_MIN_MAYBE_YES, otherwise errorCode becomes U_INVALID_FORMAT_ERROR.
// The threshold fields (minDecompNoCP, minCompNoMaybeCP, minYesNo, minNoNo, limitNoNo,
// minMaybeYes) are copied from the indexes, normTrie is opened from the bytes between the
// trie offset and the extra-data offset, and maybeYesCompositions/extraData are set up as
// pointers into the data.
// NOTE(review): the early-return bodies and some intermediate lines (for example how
// offset is advanced before source line 318) are not visible in this listing.
284 Normalizer2Impl::load(const char *packageName
, const char *name
, UErrorCode
&errorCode
) {
285 if(U_FAILURE(errorCode
)) {
288 memory
=udata_openChoice(packageName
, "nrm", name
, isAcceptable
, this, &errorCode
);
289 if(U_FAILURE(errorCode
)) {
292 const uint8_t *inBytes
=(const uint8_t *)udata_getMemory(memory
);
293 const int32_t *inIndexes
=(const int32_t *)inBytes
;
294 int32_t indexesLength
=inIndexes
[IX_NORM_TRIE_OFFSET
]/4;
295 if(indexesLength
<=IX_MIN_MAYBE_YES
) {
296 errorCode
=U_INVALID_FORMAT_ERROR
; // Not enough indexes.
300 minDecompNoCP
=inIndexes
[IX_MIN_DECOMP_NO_CP
];
301 minCompNoMaybeCP
=inIndexes
[IX_MIN_COMP_NO_MAYBE_CP
];
303 minYesNo
=inIndexes
[IX_MIN_YES_NO
];
304 minNoNo
=inIndexes
[IX_MIN_NO_NO
];
305 limitNoNo
=inIndexes
[IX_LIMIT_NO_NO
];
306 minMaybeYes
=inIndexes
[IX_MIN_MAYBE_YES
];
308 int32_t offset
=inIndexes
[IX_NORM_TRIE_OFFSET
];
309 int32_t nextOffset
=inIndexes
[IX_EXTRA_DATA_OFFSET
];
310 normTrie
=utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS
,
311 inBytes
+offset
, nextOffset
-offset
, NULL
,
313 if(U_FAILURE(errorCode
)) {
318 maybeYesCompositions
=(const uint16_t *)(inBytes
+offset
);
319 extraData
=maybeYesCompositions
+(MIN_NORMAL_MAYBE_YES
-minMaybeYes
);
// Returns the trailing combining class (tccc) of the one code point in [cpStart, cpLimit).
// A one-unit range is told apart from a surrogate pair, which is combined with
// U16_GET_SUPPLEMENTARY. If the norm16 value of the code point is not greater than
// minYesNo the result is 0; otherwise the result is the upper byte of the first unit of
// its mapping.
// NOTE(review): the declaration of c and the one-unit assignment are not visible in this listing.
322 uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar
*cpStart
, const UChar
*cpLimit
) const {
324 if(cpStart
==(cpLimit
-1)) {
327 c
=U16_GET_SUPPLEMENTARY(cpStart
[0], cpStart
[1]);
329 uint16_t prevNorm16
=getNorm16(c
);
330 if(prevNorm16
<=minYesNo
) {
331 return 0; // yesYes and Hangul LV/LVT have ccc=tccc=0
333 return (uint8_t)(*getMapping(prevNorm16
)>>8); // tccc from yesNo
// Range callback for utrie2_enum(): context is a USetAdder, and the start code point of
// each reported range is added to its set. The end and value arguments are unused.
// NOTE(review): the return statement is not visible in this listing.
339 static UBool U_CALLCONV
340 enumPropertyStartsRange(const void *context
, UChar32 start
, UChar32
/*end*/, uint32_t /*value*/) {
341 /* add the start code point to the USet */
342 const USetAdder
*sa
=(const USetAdder
*)context
;
343 sa
->add(sa
->set
, start
);
// Value-mapping callback for utrie2_enum(): reduces each trie value to its
// CANON_NOT_SEGMENT_STARTER bits so that ranges are split only where that property changes.
// The context argument is unused.
347 static uint32_t U_CALLCONV
348 segmentStarterMapper(const void * /*context*/, uint32_t value
) {
349 return value
&CANON_NOT_SEGMENT_STARTER
;
// Adds to the set behind sa the code points at which normalization property values may change.
// The start of every same-value range of normTrie is added through enumPropertyStartsRange().
// The Hangul loop steps from HANGUL_BASE to HANGUL_LIMIT in increments of JAMO_T_COUNT and
// adds c+1 for each step (the statement on source line 361 is not visible in this listing);
// finally HANGUL_LIMIT itself is added.
// The UErrorCode parameter is unused.
355 Normalizer2Impl::addPropertyStarts(const USetAdder
*sa
, UErrorCode
& /*errorCode*/) const {
356 /* add the start code point of each same-value range of each trie */
357 utrie2_enum(normTrie
, NULL
, enumPropertyStartsRange
, sa
);
359 /* add Hangul LV syllables and LV+1 because of skippables */
360 for(UChar c
=Hangul::HANGUL_BASE
; c
<Hangul::HANGUL_LIMIT
; c
+=Hangul::JAMO_T_COUNT
) {
362 sa
->add(sa
->set
, c
+1);
364 sa
->add(sa
->set
, Hangul::HANGUL_LIMIT
); /* add Hangul+1 to continue with other properties */
// Adds range starts from the canonical-iterator data trie to the set behind sa.
// Nothing is enumerated unless ensureCanonIterData(errorCode) succeeds. Values are passed
// through segmentStarterMapper() first, and each resulting range start is added through
// enumPropertyStartsRange().
368 Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder
*sa
, UErrorCode
&errorCode
) const {
369 /* add the start code point of each same-value range of the canonical iterator data trie */
370 if(ensureCanonIterData(errorCode
)) {
371 // currently only used for the SEGMENT_STARTER property
372 utrie2_enum(((CanonIterData
*)canonIterDataSingleton
.fInstance
)->trie
,
373 segmentStarterMapper
, enumPropertyStartsRange
, sa
);
// Handles the leading part of a NUL-terminated string that needs no data lookup.
// src is scanned while code units are below minNeedDataCP and not NUL. The scanned prefix
// [prevSrc, src) is appended to buffer with appendZeroCC().
// Callers in this file assign the result back to their src pointer, so the function
// presumably returns the position after the prefix.
// NOTE(review): the return type, the back-out step, the test guarding the use of buffer
// and the return statement are not visible in this listing.
378 Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar
*src
,
379 UChar32 minNeedDataCP
,
380 ReorderingBuffer
*buffer
,
381 UErrorCode
&errorCode
) const {
382 // Make some effort to support NUL-terminated strings reasonably.
383 // Take the part of the fast quick check loop that does not look up
384 // data and check the first part of the string.
385 // After this prefix, determine the string length to simplify the rest
387 const UChar
*prevSrc
=src
;
389 while((c
=*src
++)<minNeedDataCP
&& c
!=0) {}
390 // Back out the last character for full processing.
394 buffer
->appendZeroCC(prevSrc
, src
, errorCode
);
// Decomposes [src, limit) into buffer, or only quick-checks the text when buffer is NULL.
// Structure visible in this listing:
//  - copyLowPrefixFromNulTerminated() followed by u_strchr(src, 0) to find the limit of a
//    NUL-terminated input;
//  - a fast loop that skips code units below minDecompNoCP or whose trie value satisfies
//    isMostDecompYesAndZeroCC(), pairing surrogates where possible;
//  - the skipped run [prevSrc, src) is appended with appendZeroCC();
//  - the next code point is either decomposed into the buffer with decompose(c, norm16, ...)
//    or, on the quick-check side, tested for isDecompYes() and canonical order against prevCC;
//  - prevBoundary is returned for a code point that is not decomp-yes or is out of order.
// NOTE(review): the return type, the conditions selecting between these paths and several
// statements are missing from this listing.
400 // Dual functionality:
401 // buffer!=NULL: normalize
402 // buffer==NULL: isNormalized/spanQuickCheckYes
404 Normalizer2Impl::decompose(const UChar
*src
, const UChar
*limit
,
405 ReorderingBuffer
*buffer
,
406 UErrorCode
&errorCode
) const {
407 UChar32 minNoCP
=minDecompNoCP
;
409 src
=copyLowPrefixFromNulTerminated(src
, minNoCP
, buffer
, errorCode
);
410 if(U_FAILURE(errorCode
)) {
413 limit
=u_strchr(src
, 0);
416 const UChar
*prevSrc
;
420 // only for quick check
421 const UChar
*prevBoundary
=src
;
425 // count code units below the minimum or with irrelevant data for the quick check
426 for(prevSrc
=src
; src
!=limit
;) {
427 if( (c
=*src
)<minNoCP
||
428 isMostDecompYesAndZeroCC(norm16
=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie
, c
))
431 } else if(!U16_IS_SURROGATE(c
)) {
435 if(U16_IS_SURROGATE_LEAD(c
)) {
436 if((src
+1)!=limit
&& U16_IS_TRAIL(c2
=src
[1])) {
437 c
=U16_GET_SUPPLEMENTARY(c
, c2
);
439 } else /* trail surrogate */ {
440 if(prevSrc
<src
&& U16_IS_LEAD(c2
=*(src
-1))) {
442 c
=U16_GET_SUPPLEMENTARY(c2
, c
);
445 if(isMostDecompYesAndZeroCC(norm16
=getNorm16(c
))) {
452 // copy these code units all at once
455 if(!buffer
->appendZeroCC(prevSrc
, src
, errorCode
)) {
467 // Check one above-minimum, relevant code point.
470 if(!decompose(c
, norm16
, *buffer
, errorCode
)) {
474 if(isDecompYes(norm16
)) {
475 uint8_t cc
=getCCFromYesOrMaybe(norm16
);
476 if(prevCC
<=cc
|| cc
==0) {
484 return prevBoundary
; // "no" or cc out of order
// Implementation outline as visible here: each code point of [src, limit) is read together
// with its norm16 value by UTRIE2_U16_NEXT16 and handed to decompose(c, norm16, buffer,
// errorCode); a failed decompose() is tested for.
// NOTE(review): the loop header and the return statements are not visible in this listing.
490 // Decompose a short piece of text which is likely to contain characters that
491 // fail the quick check loop and/or where the quick check loop's overhead
492 // is unlikely to be amortized.
493 // Called by the compose() and makeFCD() implementations.
494 UBool
Normalizer2Impl::decomposeShort(const UChar
*src
, const UChar
*limit
,
495 ReorderingBuffer
&buffer
,
496 UErrorCode
&errorCode
) const {
500 UTRIE2_U16_NEXT16(normTrie
, src
, limit
, c
, norm16
);
501 if(!decompose(c
, norm16
, buffer
, errorCode
)) {
// Decomposes the single code point c, whose trie value is norm16, into buffer.
// Cases visible in this listing:
//  - isDecompYes(norm16): c is appended unchanged with its combining class;
//  - isHangul(norm16): the syllable is expanded by Hangul::decompose() into jamos, which
//    are appended with appendZeroCC();
//  - isDecompNoAlgorithmic(norm16): c is replaced by mapAlgorithmic(c, norm16);
//  - otherwise the mapping is read from the extra data: the first unit carries the length
//    (MAPPING_LENGTH_MASK) and the trail cc in its upper byte, and an optional following
//    word carries the lead cc in its upper byte; the mapped text is then appended.
// The result of the buffer append call is returned.
// NOTE(review): the loop mentioned by the first comment and the leadCC default are not
// visible in this listing.
508 UBool
Normalizer2Impl::decompose(UChar32 c
, uint16_t norm16
,
509 ReorderingBuffer
&buffer
,
510 UErrorCode
&errorCode
) const {
511 // Only loops for 1:1 algorithmic mappings.
513 // get the decomposition and the lead and trail cc's
514 if(isDecompYes(norm16
)) {
515 // c does not decompose
516 return buffer
.append(c
, getCCFromYesOrMaybe(norm16
), errorCode
);
517 } else if(isHangul(norm16
)) {
518 // Hangul syllable: decompose algorithmically
520 return buffer
.appendZeroCC(jamos
, jamos
+Hangul::decompose(c
, jamos
), errorCode
);
521 } else if(isDecompNoAlgorithmic(norm16
)) {
522 c
=mapAlgorithmic(c
, norm16
);
525 // c decomposes, get everything from the variable-length extra data
526 const uint16_t *mapping
=getMapping(norm16
);
527 uint16_t firstUnit
=*mapping
++;
528 int32_t length
=firstUnit
&MAPPING_LENGTH_MASK
;
529 uint8_t leadCC
, trailCC
;
530 trailCC
=(uint8_t)(firstUnit
>>8);
531 if(firstUnit
&MAPPING_HAS_CCC_LCCC_WORD
) {
532 leadCC
=(uint8_t)(*mapping
++>>8);
536 return buffer
.append((const UChar
*)mapping
, length
, leadCC
, trailCC
, errorCode
);
// Looks up the decomposition of code point c.
// buffer is caller-provided scratch space of four code units and length receives the
// number of code units of the result.
// Cases visible in this listing:
//  - c below minDecompNoCP or isDecompYes(): c does not decompose;
//  - Hangul syllable: Hangul::decompose() writes the jamos into buffer;
//  - algorithmic mapping: c is mapped and written into buffer with U16_APPEND_UNSAFE;
//  - otherwise a pointer into the extra data mapping is returned, after skipping the
//    first unit (which supplies length) and, when flagged, the ccc/lccc word.
// NOTE(review): the return type and the return statements of the first three cases are
// not visible in this listing.
542 Normalizer2Impl::getDecomposition(UChar32 c
, UChar buffer
[4], int32_t &length
) const {
543 const UChar
*decomp
=NULL
;
546 if(c
<minDecompNoCP
|| isDecompYes(norm16
=getNorm16(c
))) {
547 // c does not decompose
549 } else if(isHangul(norm16
)) {
550 // Hangul syllable: decompose algorithmically
551 length
=Hangul::decompose(c
, buffer
);
553 } else if(isDecompNoAlgorithmic(norm16
)) {
554 c
=mapAlgorithmic(c
, norm16
);
557 U16_APPEND_UNSAFE(buffer
, length
, c
);
559 // c decomposes, get everything from the variable-length extra data
560 const uint16_t *mapping
=getMapping(norm16
);
561 uint16_t firstUnit
=*mapping
++;
562 length
=firstUnit
&MAPPING_LENGTH_MASK
;
563 if(firstUnit
&MAPPING_HAS_CCC_LCCC_WORD
) {
566 return (const UChar
*)mapping
;
// Appends [src, limit) to buffer.
// Two paths are visible in this listing:
//  - a full decompose(src, limit, &buffer, errorCode);
//  - a merge-only path that reads leading code points with a ForwardUTrie2StringIterator,
//    tracking firstCC, prevCC and cc from getCC(); the leading run up to
//    iter.codePointStart is appended with append(..., firstCC, prevCC, ...) and the rest
//    with appendZeroCC().
// NOTE(review): the parameter declared on source line 572, the condition that selects the
// path and the scanning loop are not visible in this listing.
571 void Normalizer2Impl::decomposeAndAppend(const UChar
*src
, const UChar
*limit
,
573 ReorderingBuffer
&buffer
,
574 UErrorCode
&errorCode
) const {
576 decompose(src
, limit
, &buffer
, errorCode
);
579 // Just merge the strings at the boundary.
580 ForwardUTrie2StringIterator
iter(normTrie
, src
, limit
);
581 uint8_t firstCC
, prevCC
, cc
;
582 firstCC
=prevCC
=cc
=getCC(iter
.next16());
585 cc
=getCC(iter
.next16());
587 buffer
.append(src
, (int32_t)(iter
.codePointStart
-src
), firstCC
, prevCC
, errorCode
) &&
588 buffer
.appendZeroCC(iter
.codePointStart
, limit
, errorCode
);
// Tests whether code point c has a decomposition boundary; the before flag selects the
// boundary before c versus after c.
// Visible decisions: code points below minDecompNoCP, Hangul and isDecompYesAndZeroCC()
// values are handled first; norm16>MIN_NORMAL_MAYBE_YES means ccc!=0 and yields FALSE;
// algorithmic mappings replace c; otherwise the first mapping unit is inspected.
// For the after-boundary, firstUnit>0x1ff (trailCC>1) yields FALSE and firstUnit<=0xff
// (trailCC==0) yields TRUE. The final test is TRUE when leadCC==0, that is when there is
// no ccc/lccc word or its upper byte is zero.
// NOTE(review): the loop around these tests and some return statements are not visible in
// this listing.
591 // Note: hasDecompBoundary() could be implemented as aliases to
592 // hasFCDBoundaryBefore() and hasFCDBoundaryAfter()
593 // at the cost of building the FCD trie for a decomposition normalizer.
594 UBool
Normalizer2Impl::hasDecompBoundary(UChar32 c
, UBool before
) const {
596 if(c
<minDecompNoCP
) {
599 uint16_t norm16
=getNorm16(c
);
600 if(isHangul(norm16
) || isDecompYesAndZeroCC(norm16
)) {
602 } else if(norm16
>MIN_NORMAL_MAYBE_YES
) {
603 return FALSE
; // ccc!=0
604 } else if(isDecompNoAlgorithmic(norm16
)) {
605 c
=mapAlgorithmic(c
, norm16
);
607 // c decomposes, get everything from the variable-length extra data
608 const uint16_t *mapping
=getMapping(norm16
);
609 uint16_t firstUnit
=*mapping
++;
610 if((firstUnit
&MAPPING_LENGTH_MASK
)==0) {
614 // decomp after-boundary: same as hasFCDBoundaryAfter(),
615 // fcd16<=1 || trailCC==0
616 if(firstUnit
>0x1ff) {
617 return FALSE
; // trailCC>1
619 if(firstUnit
<=0xff) {
620 return TRUE
; // trailCC==0
622 // if(trailCC==1) test leadCC==0, same as checking for before-boundary
624 // TRUE if leadCC==0 (hasFCDBoundaryBefore())
625 return (firstUnit
&MAPPING_HAS_CCC_LCCC_WORD
)==0 || (*mapping
&0xff00)==0;
// Implementation outline as visible in this listing:
//  - trail below COMP_1_TRAIL_LIMIT: the search key is trail<<1; entries are skipped with
//    list+=2+(firstUnit&COMP_1_TRIPLE) while the key is greater than the first unit, and on
//    a match a triple entry yields (list[1]<<16)|list[2];
//  - larger trail: key1 is built from COMP_1_TRAIL_LIMIT and the shifted trail, key2 from
//    trail<<COMP_2_TRAIL_SHIFT; both units are compared, and a match yields the non-trail
//    bits of the second unit shifted left by 16, combined with list[2].
// NOTE(review): the pair-entry return, the -1 returns and the loop header of the second
// search are not visible in this listing.
631 * Finds the recomposition result for
632 * a forward-combining "lead" character,
633 * specified with a pointer to its compositions list,
634 * and a backward-combining "trail" character.
636 * If the lead and trail characters combine, then this function returns
637 * the following "compositeAndFwd" value:
638 * Bits 21..1 composite character
639 * Bit 0 set if the composite is a forward-combining starter
640 * otherwise it returns -1.
642 * The compositions list has (trail, compositeAndFwd) pair entries,
643 * encoded as either pairs or triples of 16-bit units.
644 * The last entry has the high bit of its first unit set.
646 * The list is sorted by ascending trail characters (there are no duplicates).
647 * A linear search is used.
649 * See normalizer2impl.h for a more detailed description
650 * of the compositions list format.
652 int32_t Normalizer2Impl::combine(const uint16_t *list
, UChar32 trail
) {
653 uint16_t key1
, firstUnit
;
654 if(trail
<COMP_1_TRAIL_LIMIT
) {
655 // trail character is 0..33FF
656 // result entry may have 2 or 3 units
657 key1
=(uint16_t)(trail
<<1);
658 while(key1
>(firstUnit
=*list
)) {
659 list
+=2+(firstUnit
&COMP_1_TRIPLE
);
661 if(key1
==(firstUnit
&COMP_1_TRAIL_MASK
)) {
662 if(firstUnit
&COMP_1_TRIPLE
) {
663 return ((int32_t)list
[1]<<16)|list
[2];
669 // trail character is 3400..10FFFF
670 // result entry has 3 units
671 key1
=(uint16_t)(COMP_1_TRAIL_LIMIT
+
672 (((trail
>>COMP_1_TRAIL_SHIFT
))&
674 uint16_t key2
=(uint16_t)(trail
<<COMP_2_TRAIL_SHIFT
);
677 if(key1
>(firstUnit
=*list
)) {
678 list
+=2+(firstUnit
&COMP_1_TRIPLE
);
679 } else if(key1
==(firstUnit
&COMP_1_TRAIL_MASK
)) {
680 if(key2
>(secondUnit
=list
[1])) {
681 if(firstUnit
&COMP_1_LAST_TUPLE
) {
686 } else if(key2
==(secondUnit
&COMP_2_TRAIL_MASK
)) {
687 return ((int32_t)(secondUnit
&~COMP_2_TRAIL_MASK
)<<16)|list
[2];
// Walks every entry of the compositions list until the entry flagged COMP_1_LAST_TUPLE.
// For a pair entry compositeAndFwd is list[1]; for a triple entry it is assembled from
// list[1] (without the COMP_2_TRAIL_MASK bits) and list[2]. The composite is
// compositeAndFwd>>1; when bit 0 is set the function recurses into the compositions list
// of that composite.
// NOTE(review): the statements that advance list and add the composite to the set are not
// visible in this listing.
700 * @param list some character's compositions list
701 * @param set recursively receives the composites from these compositions
703 void Normalizer2Impl::addComposites(const uint16_t *list
, UnicodeSet
&set
) const {
705 int32_t compositeAndFwd
;
708 if((firstUnit
&COMP_1_TRIPLE
)==0) {
709 compositeAndFwd
=list
[1];
712 compositeAndFwd
=(((int32_t)list
[1]&~COMP_2_TRAIL_MASK
)<<16)|list
[2];
715 UChar32 composite
=compositeAndFwd
>>1;
716 if((compositeAndFwd
&1)!=0) {
717 addComposites(getCompositionsListForComposite(getNorm16(composite
)), set
);
720 } while((firstUnit
&COMP_1_LAST_TUPLE
)==0);
// Implementation outline as visible in this listing:
//  - p iterates over the buffer from getStart()+recomposeStartIndex to getLimit(), reading
//    each code point and its norm16 with UTRIE2_U16_NEXT16;
//  - compositionsList!=NULL marks that a forward-combining starter has been seen;
//  - Jamo V/T are combined arithmetically with a preceding Jamo L (and a following Jamo T);
//  - other marks are looked up with combine(); on success the starter is overwritten by the
//    composite, with separate handling for BMP versus supplementary starter and composite;
//  - a new starter replaces compositionsList; with onlyContiguous set, a character that
//    does not combine clears compositionsList;
//  - at the end buffer.setReorderingLimit(limit) publishes the shortened text.
// NOTE(review): many statements (declarations of cc, prevCC, norm16 and t, the text-moving
// loops and several branch bodies) are missing from this listing.
724 * Recomposes the buffer text starting at recomposeStartIndex
725 * (which is in NFD - decomposed and canonically ordered),
726 * and truncates the buffer contents.
728 * Note that recomposition never lengthens the text:
729 * Any character consists of either one or two code units;
730 * a composition may contain at most one more code unit than the original starter,
731 * while the combining mark that is removed has at least one code unit.
733 void Normalizer2Impl::recompose(ReorderingBuffer
&buffer
, int32_t recomposeStartIndex
,
734 UBool onlyContiguous
) const {
735 UChar
*p
=buffer
.getStart()+recomposeStartIndex
;
736 UChar
*limit
=buffer
.getLimit();
741 UChar
*starter
, *pRemove
, *q
, *r
;
742 const uint16_t *compositionsList
;
743 UChar32 c
, compositeAndFwd
;
746 UBool starterIsSupplementary
;
748 // Some of the following variables are not used until we have a forward-combining starter
749 // and are only initialized now to avoid compiler warnings.
750 compositionsList
=NULL
; // used as indicator for whether we have a forward-combining starter
752 starterIsSupplementary
=FALSE
;
756 UTRIE2_U16_NEXT16(normTrie
, p
, limit
, c
, norm16
);
757 cc
=getCCFromYesOrMaybe(norm16
);
758 if( // this character combines backward and
760 // we have seen a starter that combines forward and
761 compositionsList
!=NULL
&&
762 // the backward-combining character is not blocked
763 (prevCC
<cc
|| prevCC
==0)
765 if(isJamoVT(norm16
)) {
766 // c is a Jamo V/T, see if we can compose it with the previous character.
767 if(c
<Hangul::JAMO_T_BASE
) {
768 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
769 UChar prev
=(UChar
)(*starter
-Hangul::JAMO_L_BASE
);
770 if(prev
<Hangul::JAMO_L_COUNT
) {
772 UChar syllable
=(UChar
)
773 (Hangul::HANGUL_BASE
+
774 (prev
*Hangul::JAMO_V_COUNT
+(c
-Hangul::JAMO_V_BASE
))*
775 Hangul::JAMO_T_COUNT
);
777 if(p
!=limit
&& (t
=(UChar
)(*p
-Hangul::JAMO_T_BASE
))<Hangul::JAMO_T_COUNT
) {
779 syllable
+=t
; // The next character was a Jamo T.
782 // remove the Jamo V/T
793 * No "else" for Jamo T:
794 * Since the input is in NFD, there are no Hangul LV syllables that
795 * a Jamo T could combine with.
796 * All Jamo Ts are combined above when handling Jamo Vs.
801 compositionsList
=NULL
;
803 } else if((compositeAndFwd
=combine(compositionsList
, c
))>=0) {
804 // The starter and the combining mark (c) do combine.
805 UChar32 composite
=compositeAndFwd
>>1;
807 // Replace the starter with the composite, remove the combining mark.
808 pRemove
=p
-U16_LENGTH(c
); // pRemove & p: start & limit of the combining mark
809 if(starterIsSupplementary
) {
810 if(U_IS_SUPPLEMENTARY(composite
)) {
811 // both are supplementary
812 starter
[0]=U16_LEAD(composite
);
813 starter
[1]=U16_TRAIL(composite
);
815 *starter
=(UChar
)composite
;
816 // The composite is shorter than the starter,
817 // move the intermediate characters forward one.
818 starterIsSupplementary
=FALSE
;
826 } else if(U_IS_SUPPLEMENTARY(composite
)) {
827 // The composite is longer than the starter,
828 // move the intermediate characters back one.
829 starterIsSupplementary
=TRUE
;
830 ++starter
; // temporarily increment for the loop boundary
836 *starter
=U16_TRAIL(composite
);
837 *--starter
=U16_LEAD(composite
); // undo the temporary increment
839 // both are on the BMP
840 *starter
=(UChar
)composite
;
843 /* remove the combining mark by moving the following text over it */
853 // Keep prevCC because we removed the combining mark.
858 // Is the composite a starter that combines forward?
859 if(compositeAndFwd
&1) {
861 getCompositionsListForComposite(getNorm16(composite
));
863 compositionsList
=NULL
;
866 // We combined; continue with looking for compositions.
871 // no combination this time
877 // If c did not combine, then check if it is a starter.
879 // Found a new starter.
880 if((compositionsList
=getCompositionsListForDecompYes(norm16
))!=NULL
) {
881 // It may combine with something, prepare for it.
883 starterIsSupplementary
=FALSE
;
886 starterIsSupplementary
=TRUE
;
890 } else if(onlyContiguous
) {
891 // FCC: no discontiguous compositions; any intervening character blocks.
892 compositionsList
=NULL
;
895 buffer
.setReorderingLimit(limit
);
// Composes [src, limit) into buffer, or only checks whether the text is already composed
// when doCompose is false. onlyContiguous selects the contiguous (FCC) variant.
// Structure visible in this listing:
//  - optional handling of NUL-terminated input through copyLowPrefixFromNulTerminated();
//  - a fast loop over code units below minCompNoMaybeCP or with isCompYesAndZeroCC() data,
//    whose run is appended with appendZeroCC() and which updates prevBoundary;
//  - Jamo V/T handling that rewrites the last buffered character with setLastChar() or
//    falls back to the slow path (needToDecompose);
//  - code points with norm16>=MIN_YES_YES_WITH_CC are appended with their combining class
//    unless the FCC order test via getTrailCCFromCompYesAndZeroCC() fails;
//  - the slow path removes the text since prevBoundary from the buffer, finds the next
//    boundary with findNextCompBoundary(), runs decomposeShort() and recompose() on that
//    piece, and in the check-only mode compares the result with the source using equals().
// NOTE(review): the return type, the doCompose parameter declaration (source line 904),
// all return statements and many branch bodies are missing from this listing.
898 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
899 // doCompose: normalize
900 // !doCompose: isNormalized (buffer must be empty and initialized)
902 Normalizer2Impl::compose(const UChar
*src
, const UChar
*limit
,
903 UBool onlyContiguous
,
905 ReorderingBuffer
&buffer
,
906 UErrorCode
&errorCode
) const {
908 * prevBoundary points to the last character before the current one
909 * that has a composition boundary before it with ccc==0 and quick check "yes".
910 * Keeping track of prevBoundary saves us looking for a composition boundary
911 * when we find a "no" or "maybe".
913 * When we back out from prevSrc back to prevBoundary,
914 * then we also remove those same characters (which had been simply copied
915 * or canonically-order-inserted) from the ReorderingBuffer.
916 * Therefore, at all times, the [prevBoundary..prevSrc[ source units
917 * must correspond 1:1 to destination units at the end of the destination buffer.
919 const UChar
*prevBoundary
=src
;
920 UChar32 minNoMaybeCP
=minCompNoMaybeCP
;
922 src
=copyLowPrefixFromNulTerminated(src
, minNoMaybeCP
,
923 doCompose
? &buffer
: NULL
,
925 if(U_FAILURE(errorCode
)) {
928 if(prevBoundary
<src
) {
929 // Set prevBoundary to the last character in the prefix.
932 limit
=u_strchr(src
, 0);
935 const UChar
*prevSrc
;
939 // only for isNormalized
943 // count code units below the minimum or with irrelevant data for the quick check
944 for(prevSrc
=src
; src
!=limit
;) {
945 if( (c
=*src
)<minNoMaybeCP
||
946 isCompYesAndZeroCC(norm16
=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie
, c
))
949 } else if(!U16_IS_SURROGATE(c
)) {
953 if(U16_IS_SURROGATE_LEAD(c
)) {
954 if((src
+1)!=limit
&& U16_IS_TRAIL(c2
=src
[1])) {
955 c
=U16_GET_SUPPLEMENTARY(c
, c2
);
957 } else /* trail surrogate */ {
958 if(prevSrc
<src
&& U16_IS_LEAD(c2
=*(src
-1))) {
960 c
=U16_GET_SUPPLEMENTARY(c2
, c
);
963 if(isCompYesAndZeroCC(norm16
=getNorm16(c
))) {
970 // copy these code units all at once
973 if(!buffer
.appendZeroCC(prevSrc
, src
, errorCode
)) {
982 // Set prevBoundary to the last character in the quick check loop.
984 if( U16_IS_TRAIL(*prevBoundary
) && prevSrc
<prevBoundary
&&
985 U16_IS_LEAD(*(prevBoundary
-1))
989 // The start of the current character (c).
991 } else if(src
==limit
) {
997 * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
998 * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
1000 * Check for Jamo V/T, then for regular characters.
1001 * c is not a Hangul syllable or Jamo L because those have "yes" properties.
1003 if(isJamoVT(norm16
) && prevBoundary
!=prevSrc
) {
1004 UChar prev
=*(prevSrc
-1);
1005 UBool needToDecompose
=FALSE
;
1006 if(c
<Hangul::JAMO_T_BASE
) {
1007 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
1008 prev
=(UChar
)(prev
-Hangul::JAMO_L_BASE
);
1009 if(prev
<Hangul::JAMO_L_COUNT
) {
1013 UChar syllable
=(UChar
)
1014 (Hangul::HANGUL_BASE
+
1015 (prev
*Hangul::JAMO_V_COUNT
+(c
-Hangul::JAMO_V_BASE
))*
1016 Hangul::JAMO_T_COUNT
);
1018 if(src
!=limit
&& (t
=(UChar
)(*src
-Hangul::JAMO_T_BASE
))<Hangul::JAMO_T_COUNT
) {
1020 syllable
+=t
; // The next character was a Jamo T.
1022 buffer
.setLastChar(syllable
);
1025 // If we see L+V+x where x!=T then we drop to the slow path,
1026 // decompose and recompose.
1027 // This is to deal with NFKC finding normal L and V but a
1028 // compatibility variant of a T. We need to either fully compose that
1029 // combination here (which would complicate the code and may not work
1030 // with strange custom data) or use the slow path -- or else our replacing
1031 // two input characters (L+V) with one output character (LV syllable)
1032 // would violate the invariant that [prevBoundary..prevSrc[ has the same
1033 // length as what we appended to the buffer since prevBoundary.
1034 needToDecompose
=TRUE
;
1036 } else if(Hangul::isHangulWithoutJamoT(prev
)) {
1037 // c is a Jamo Trailing consonant,
1038 // compose with previous Hangul LV that does not contain a Jamo T.
1042 buffer
.setLastChar((UChar
)(prev
+c
-Hangul::JAMO_T_BASE
));
1046 if(!needToDecompose
) {
1047 // The Jamo V/T did not compose into a Hangul syllable.
1049 if(!buffer
.appendBMP((UChar
)c
, 0, errorCode
)) {
1059 * Source buffer pointers:
1061 * all done quick check current char not yet
1062 * "yes" but (c) processed
1065 * [-------------[-------------[-------------[-------------[
1067 * orig. src prevBoundary prevSrc src limit
1070 * Destination buffer pointers inside the ReorderingBuffer:
1072 * all done might take not filled yet
1075 * [-------------[-------------[-------------[
1077 * start reorderStart limit |
1080 if(norm16
>=MIN_YES_YES_WITH_CC
) {
1081 uint8_t cc
=(uint8_t)norm16
; // cc!=0
1082 if( onlyContiguous
&& // FCC
1083 (doCompose
? buffer
.getLastCC() : prevCC
)==0 &&
1084 prevBoundary
<prevSrc
&&
1085 // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that
1086 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
1087 // passed the quick check "yes && ccc==0" test.
1088 // Check whether the last character was a "yesYes" or a "yesNo".
1089 // If a "yesNo", then we get its trailing ccc from its
1090 // mapping and check for canonical order.
1091 // All other cases are ok.
1092 getTrailCCFromCompYesAndZeroCC(prevBoundary
, prevSrc
)>cc
1094 // Fails FCD test, need to decompose and contiguously recompose.
1098 } else if(doCompose
) {
1099 if(!buffer
.append(c
, cc
, errorCode
)) {
1103 } else if(prevCC
<=cc
) {
1109 } else if(!doCompose
&& !isMaybeOrNonZeroCC(norm16
)) {
1114 * Find appropriate boundaries around this character,
1115 * decompose the source text from between the boundaries,
1118 * We may need to remove the last few characters from the ReorderingBuffer
1119 * to account for source text that was copied or appended
1120 * but needs to take part in the recomposition.
1124 * Find the last composition boundary in [prevBoundary..src[.
1125 * It is either the decomposition of the current character (at prevSrc),
1128 if(hasCompBoundaryBefore(c
, norm16
)) {
1129 prevBoundary
=prevSrc
;
1130 } else if(doCompose
) {
1131 buffer
.removeSuffix((int32_t)(prevSrc
-prevBoundary
));
1134 // Find the next composition boundary in [src..limit[ -
1135 // modifies src to point to the next starter.
1136 src
=(UChar
*)findNextCompBoundary(src
, limit
);
1138 // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
1139 int32_t recomposeStartIndex
=buffer
.length();
1140 if(!decomposeShort(prevBoundary
, src
, buffer
, errorCode
)) {
1143 recompose(buffer
, recomposeStartIndex
, onlyContiguous
);
1145 if(!buffer
.equals(prevBoundary
, src
)) {
1152 // Move to the next starter. We never need to look back before this point again.
// Quick-check counterpart of compose(): scans [src, limit) without writing any output.
// Structure visible in this listing:
//  - a local errorCode is used for copyLowPrefixFromNulTerminated(), which is called with a
//    NULL buffer; u_strchr(src, 0) supplies the limit for NUL-terminated input;
//  - the fast loop skips code units below minCompNoMaybeCP or with isCompYesAndZeroCC()
//    data and keeps prevBoundary up to date;
//  - for a code point with isMaybeOrNonZeroCC() data the combining class order is checked
//    (including the FCC test when onlyContiguous is set); if norm16 is below
//    MIN_YES_YES_WITH_CC, *pQCResult is set to UNORM_MAYBE when pQCResult is not NULL and
//    prevBoundary is returned;
//  - otherwise *pQCResult is set to UNORM_NO when pQCResult is not NULL and prevBoundary is
//    returned.
// NOTE(review): the return type, the loop exits and several branch bodies are missing from
// this listing.
1158 // Very similar to compose(): Make the same changes in both places if relevant.
1159 // pQCResult==NULL: spanQuickCheckYes
1160 // pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES)
1162 Normalizer2Impl::composeQuickCheck(const UChar
*src
, const UChar
*limit
,
1163 UBool onlyContiguous
,
1164 UNormalizationCheckResult
*pQCResult
) const {
1166 * prevBoundary points to the last character before the current one
1167 * that has a composition boundary before it with ccc==0 and quick check "yes".
1169 const UChar
*prevBoundary
=src
;
1170 UChar32 minNoMaybeCP
=minCompNoMaybeCP
;
1172 UErrorCode errorCode
=U_ZERO_ERROR
;
1173 src
=copyLowPrefixFromNulTerminated(src
, minNoMaybeCP
, NULL
, errorCode
);
1174 if(prevBoundary
<src
) {
1175 // Set prevBoundary to the last character in the prefix.
1178 limit
=u_strchr(src
, 0);
1181 const UChar
*prevSrc
;
1187 // count code units below the minimum or with irrelevant data for the quick check
1188 for(prevSrc
=src
;;) {
1192 if( (c
=*src
)<minNoMaybeCP
||
1193 isCompYesAndZeroCC(norm16
=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie
, c
))
1196 } else if(!U16_IS_SURROGATE(c
)) {
1200 if(U16_IS_SURROGATE_LEAD(c
)) {
1201 if((src
+1)!=limit
&& U16_IS_TRAIL(c2
=src
[1])) {
1202 c
=U16_GET_SUPPLEMENTARY(c
, c2
);
1204 } else /* trail surrogate */ {
1205 if(prevSrc
<src
&& U16_IS_LEAD(c2
=*(src
-1))) {
1207 c
=U16_GET_SUPPLEMENTARY(c2
, c
);
1210 if(isCompYesAndZeroCC(norm16
=getNorm16(c
))) {
1218 // Set prevBoundary to the last character in the quick check loop.
1220 if( U16_IS_TRAIL(*prevBoundary
) && prevSrc
<prevBoundary
&&
1221 U16_IS_LEAD(*(prevBoundary
-1))
1226 // The start of the current character (c).
1232 * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1233 * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
1236 if(isMaybeOrNonZeroCC(norm16
)) {
1237 uint8_t cc
=getCCFromYesOrMaybe(norm16
);
1238 if( onlyContiguous
&& // FCC
1241 prevBoundary
<prevSrc
&&
1242 // prevCC==0 && prevBoundary<prevSrc tell us that
1243 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
1244 // passed the quick check "yes && ccc==0" test.
1245 // Check whether the last character was a "yesYes" or a "yesNo".
1246 // If a "yesNo", then we get its trailing ccc from its
1247 // mapping and check for canonical order.
1248 // All other cases are ok.
1249 getTrailCCFromCompYesAndZeroCC(prevBoundary
, prevSrc
)>cc
1252 } else if(prevCC
<=cc
|| cc
==0) {
1254 if(norm16
<MIN_YES_YES_WITH_CC
) {
1255 if(pQCResult
!=NULL
) {
1256 *pQCResult
=UNORM_MAYBE
;
1258 return prevBoundary
;
1264 if(pQCResult
!=NULL
) {
1265 *pQCResult
=UNORM_NO
;
1267 return prevBoundary
;
// Appends [src, limit) to a buffer that may already contain text.
// When the buffer is not empty and src does not start at a composition boundary, the text
// from the last boundary in the buffer (findPreviousCompBoundary) up to the first boundary
// in src (findNextCompBoundary) is collected in the temporary string middle, removed from
// the buffer, and composed back into the buffer; src then continues at firstStarterInSrc.
// The remainder is handled either by compose(..., TRUE, buffer, errorCode) or by a plain
// appendZeroCC().
// NOTE(review): the parameter declared on source line 1272 and the condition choosing
// between the last two calls are not visible in this listing.
1271 void Normalizer2Impl::composeAndAppend(const UChar
*src
, const UChar
*limit
,
1273 UBool onlyContiguous
,
1274 ReorderingBuffer
&buffer
,
1275 UErrorCode
&errorCode
) const {
1276 if(!buffer
.isEmpty()) {
1277 const UChar
*firstStarterInSrc
=findNextCompBoundary(src
, limit
);
1278 if(src
!=firstStarterInSrc
) {
1279 const UChar
*lastStarterInDest
=findPreviousCompBoundary(buffer
.getStart(),
1281 UnicodeString
middle(lastStarterInDest
,
1282 (int32_t)(buffer
.getLimit()-lastStarterInDest
));
1283 buffer
.removeSuffix((int32_t)(buffer
.getLimit()-lastStarterInDest
));
1284 middle
.append(src
, (int32_t)(firstStarterInSrc
-src
));
1285 const UChar
*middleStart
=middle
.getBuffer();
1286 compose(middleStart
, middleStart
+middle
.length(), onlyContiguous
,
1287 TRUE
, buffer
, errorCode
);
1288 if(U_FAILURE(errorCode
)) {
1291 src
=firstStarterInSrc
;
1295 compose(src
, limit
, onlyContiguous
, TRUE
, buffer
, errorCode
);
1297 buffer
.appendZeroCC(src
, limit
, errorCode
);
1302 * Does c have a composition boundary before it?
1303 * True if its decomposition begins with a character that has
1304 * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
1305 * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
1306 * (isCompYesAndZeroCC()) so we need not decompose.
1308 UBool
Normalizer2Impl::hasCompBoundaryBefore(UChar32 c
, uint16_t norm16
) const {
1310 if(isCompYesAndZeroCC(norm16
)) {
1312 } else if(isMaybeOrNonZeroCC(norm16
)) {
1314 } else if(isDecompNoAlgorithmic(norm16
)) {
1315 c
=mapAlgorithmic(c
, norm16
);
1316 norm16
=getNorm16(c
);
1318 // c decomposes, get everything from the variable-length extra data
1319 const uint16_t *mapping
=getMapping(norm16
);
1320 uint16_t firstUnit
=*mapping
++;
1321 if((firstUnit
&MAPPING_LENGTH_MASK
)==0) {
1324 if((firstUnit
&MAPPING_HAS_CCC_LCCC_WORD
) && (*mapping
++&0xff00)) {
1325 return FALSE
; // non-zero leadCC
1329 U16_NEXT_UNSAFE(mapping
, i
, c
);
1330 return isCompYesAndZeroCC(getNorm16(c
));
1335 UBool
Normalizer2Impl::hasCompBoundaryAfter(UChar32 c
, UBool onlyContiguous
, UBool testInert
) const {
1337 uint16_t norm16
=getNorm16(c
);
1338 if(isInert(norm16
)) {
1340 } else if(norm16
<=minYesNo
) {
1341 // Hangul LVT (==minYesNo) has a boundary after it.
1342 // Hangul LV and non-inert yesYes characters combine forward.
1343 return isHangul(norm16
) && !Hangul::isHangulWithoutJamoT((UChar
)c
);
1344 } else if(norm16
>= (testInert
? minNoNo
: minMaybeYes
)) {
1346 } else if(isDecompNoAlgorithmic(norm16
)) {
1347 c
=mapAlgorithmic(c
, norm16
);
1349 // c decomposes, get everything from the variable-length extra data.
1350 // If testInert, then c must be a yesNo character which has lccc=0,
1351 // otherwise it could be a noNo.
1352 const uint16_t *mapping
=getMapping(norm16
);
1353 uint16_t firstUnit
=*mapping
;
1355 // c is not deleted, and
1356 // it and its decomposition do not combine forward, and it has a starter, and
1357 // if FCC then trailCC<=1
1359 (firstUnit
&MAPPING_LENGTH_MASK
)!=0 &&
1360 (firstUnit
&(MAPPING_PLUS_COMPOSITION_LIST
|MAPPING_NO_COMP_BOUNDARY_AFTER
))==0 &&
1361 (!onlyContiguous
|| firstUnit
<=0x1ff);
1366 const UChar
*Normalizer2Impl::findPreviousCompBoundary(const UChar
*start
, const UChar
*p
) const {
1367 BackwardUTrie2StringIterator
iter(normTrie
, start
, p
);
1370 norm16
=iter
.previous16();
1371 } while(!hasCompBoundaryBefore(iter
.codePoint
, norm16
));
1372 // We could also test hasCompBoundaryAfter() and return iter.codePointLimit,
1373 // but that's probably not worth the extra cost.
1374 return iter
.codePointStart
;
1377 const UChar
*Normalizer2Impl::findNextCompBoundary(const UChar
*p
, const UChar
*limit
) const {
1378 ForwardUTrie2StringIterator
iter(normTrie
, p
, limit
);
1381 norm16
=iter
.next16();
1382 } while(!hasCompBoundaryBefore(iter
.codePoint
, norm16
));
1383 return iter
.codePointStart
;
1386 class FCDTrieSingleton
: public UTrie2Singleton
{
1388 FCDTrieSingleton(SimpleSingleton
&s
, Normalizer2Impl
&ni
, UErrorCode
&ec
) :
1389 UTrie2Singleton(s
), impl(ni
), errorCode(ec
) {}
1390 UTrie2
*getInstance(UErrorCode
&errorCode
) {
1391 return UTrie2Singleton::getInstance(createInstance
, this, errorCode
);
1393 static void *createInstance(const void *context
, UErrorCode
&errorCode
);
1394 UBool
rangeHandler(UChar32 start
, UChar32 end
, uint32_t value
) {
1396 impl
.setFCD16FromNorm16(start
, end
, (uint16_t)value
, newFCDTrie
, errorCode
);
1398 return U_SUCCESS(errorCode
);
1401 Normalizer2Impl
&impl
;
1403 UErrorCode
&errorCode
;
1408 // Set the FCD value for a range of same-norm16 characters.
1409 static UBool U_CALLCONV
1410 enumRangeHandler(const void *context
, UChar32 start
, UChar32 end
, uint32_t value
) {
1411 return ((FCDTrieSingleton
*)context
)->rangeHandler(start
, end
, value
);
1414 // Collect (OR together) the FCD values for a range of supplementary characters,
1415 // for their lead surrogate code unit.
1416 static UBool U_CALLCONV
1417 enumRangeOrValue(const void *context
, UChar32
/*start*/, UChar32
/*end*/, uint32_t value
) {
1418 *((uint32_t *)context
)|=value
;
1424 void *FCDTrieSingleton::createInstance(const void *context
, UErrorCode
&errorCode
) {
1425 FCDTrieSingleton
*me
=(FCDTrieSingleton
*)context
;
1426 me
->newFCDTrie
=utrie2_open(0, 0, &errorCode
);
1427 if(U_SUCCESS(errorCode
)) {
1428 utrie2_enum(me
->impl
.getNormTrie(), NULL
, enumRangeHandler
, me
);
1429 for(UChar lead
=0xd800; lead
<0xdc00; ++lead
) {
1430 uint32_t oredValue
=utrie2_get32(me
->newFCDTrie
, lead
);
1431 utrie2_enumForLeadSurrogate(me
->newFCDTrie
, lead
, NULL
, enumRangeOrValue
, &oredValue
);
1433 // Set a "bad" value for makeFCD() to break the quick check loop
1434 // and look up the value for the supplementary code point.
1435 // If there is any lccc, then set the worst-case lccc of 1.
1436 // The ORed-together value's tccc is already the worst case.
1437 if(oredValue
>0xff) {
1438 oredValue
=0x100|(oredValue
&0xff);
1440 utrie2_set32ForLeadSurrogateCodeUnit(me
->newFCDTrie
, lead
, oredValue
, &errorCode
);
1443 utrie2_freeze(me
->newFCDTrie
, UTRIE2_16_VALUE_BITS
, &errorCode
);
1444 if(U_SUCCESS(errorCode
)) {
1445 return me
->newFCDTrie
;
1448 utrie2_close(me
->newFCDTrie
);
// Derives an FCD value from norm16 for the code points start..end and stores it
// in newFCDTrie via utrie2_setRange32() (last statement of this block).
// NOTE(review): this listing is damaged. Each source line carries a stray
// listing number, and several lines (the loop header, some branch bodies and
// the closing braces) are not present, so the comments added here describe
// only the statements that are visible.
1452 void Normalizer2Impl::setFCD16FromNorm16(UChar32 start
, UChar32 end
, uint16_t norm16
,
1453 UTrie2
*newFCDTrie
, UErrorCode
&errorCode
) const {
1454 // Only loops for 1:1 algorithmic mappings.
1456 if(norm16
>=MIN_NORMAL_MAYBE_YES
) {
1459 } else if(norm16
<=minYesNo
|| minMaybeYes
<=norm16
) {
1460 // no decomposition or Hangul syllable, all zeros
1462 } else if(limitNoNo
<=norm16
) {
// Algorithmic mapping: the delta is recovered from norm16 itself.
1463 int32_t delta
=norm16
-(minMaybeYes
-MAX_DELTA
-1);
1466 norm16
=getNorm16(start
);
1468 // the same delta leads from different original characters to different mappings
// NOTE(review): the recursive call below passes (c, c), i.e. the mapped code
// point start+delta, as the range to set, not the original code point.
// Verify that this is intended.
1470 UChar32 c
=start
+delta
;
1471 setFCD16FromNorm16(c
, c
, getNorm16(c
), newFCDTrie
, errorCode
);
1472 } while(++start
<=end
);
1476 // c decomposes, get everything from the variable-length extra data
1477 const uint16_t *mapping
=getMapping(norm16
);
1478 uint16_t firstUnit
=*mapping
;
1479 if((firstUnit
&MAPPING_LENGTH_MASK
)==0) {
1480 // A character that is deleted (maps to an empty string) must
1481 // get the worst-case lccc and tccc values because arbitrary
1482 // characters on both sides will become adjacent.
// Otherwise the FCD value is assembled from the mapping header:
// lccc from the high byte of the second unit (if that word is present),
// tccc from the high byte of the first unit.
1485 if(firstUnit
&MAPPING_HAS_CCC_LCCC_WORD
) {
1486 norm16
=mapping
[1]&0xff00; // lccc
1490 norm16
|=firstUnit
>>8; // tccc
// At this point norm16 holds the FCD value to store, no longer a norm16 value.
1493 utrie2_setRange32(newFCDTrie
, start
, end
, norm16
, TRUE
, &errorCode
);
1498 const UTrie2
*Normalizer2Impl::getFCDTrie(UErrorCode
&errorCode
) const {
1499 // Logically const: Synchronized instantiation.
1500 Normalizer2Impl
*me
=const_cast<Normalizer2Impl
*>(this);
1501 return FCDTrieSingleton(me
->fcdTrieSingleton
, *me
, errorCode
).getInstance(errorCode
);
// makeFCD() walks [src, limit[ and checks that adjacent characters are in FCD order
// (previous tccc <= current lccc, see the comparison further down).
// With a buffer it appends text that is already in order and locally decomposes
// text that is not; without a buffer it returns prevBoundary at the first
// out-of-order character.
// NOTE(review): this listing is damaged. Each source line carries a stray
// listing number, and many lines (declarations, loop headers, branch bodies,
// returns, closing braces) are not present. The comments added below describe
// only what the visible statements show.
1504 // Dual functionality:
1505 // buffer!=NULL: normalize
1506 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
1508 Normalizer2Impl::makeFCD(const UChar
*src
, const UChar
*limit
,
1509 ReorderingBuffer
*buffer
,
1510 UErrorCode
&errorCode
) const {
1511 // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
1512 // Similar to the prevBoundary in the compose() implementation.
1513 const UChar
*prevBoundary
=src
;
1514 int32_t prevFCD16
=0;
// NUL-terminated input: the limit is found with u_strchr() further down.
// NOTE(review): the condition guarding this section is not visible; presumably limit==NULL.
1516 src
=copyLowPrefixFromNulTerminated(src
, MIN_CCC_LCCC_CP
, buffer
, errorCode
);
1517 if(U_FAILURE(errorCode
)) {
1520 if(prevBoundary
<src
) {
1522 // We know that the previous character's lccc==0.
1523 // Fetching the fcd16 value was deferred for this below-U+0300 code point.
1524 prevFCD16
=getFCD16FromSingleLead(*(src
-1));
1529 limit
=u_strchr(src
, 0);
1532 // Note: In this function we use buffer->appendZeroCC() because we track
1533 // the lead and trail combining classes here, rather than leaving it to
1534 // the ReorderingBuffer.
1535 // The exception is the call to decomposeShort() which uses the buffer
1536 // in the normal way.
1538 const UTrie2
*trie
=fcdTrie();
1540 const UChar
*prevSrc
;
1545 // count code units with lccc==0
// Quick-check loop: prevSrc marks where this run started.
1546 for(prevSrc
=src
; src
!=limit
;) {
1547 if((c
=*src
)<MIN_CCC_LCCC_CP
) {
// fcd16<=0xff means the high byte (lccc) is zero.
1550 } else if((fcd16
=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(trie
, c
))<=0xff) {
1553 } else if(!U16_IS_SURROGATE(c
)) {
// Surrogate code unit: if it pairs with its neighbor, combine them into
// a supplementary code point before the getFCD16(c) lookup below.
1557 if(U16_IS_SURROGATE_LEAD(c
)) {
1558 if((src
+1)!=limit
&& U16_IS_TRAIL(c2
=src
[1])) {
1559 c
=U16_GET_SUPPLEMENTARY(c
, c2
);
1561 } else /* trail surrogate */ {
1562 if(prevSrc
<src
&& U16_IS_LEAD(c2
=*(src
-1))) {
1564 c
=U16_GET_SUPPLEMENTARY(c2
, c
);
1567 if((fcd16
=getFCD16(c
))<=0xff) {
1575 // copy these code units all at once
1577 if(buffer
!=NULL
&& !buffer
->appendZeroCC(prevSrc
, src
, errorCode
)) {
1584 // We know that the previous character's lccc==0.
1586 // Fetching the fcd16 value was deferred for this below-U+0300 code point.
// The deferred code unit is recovered as ~prevFCD16.
1587 prevFCD16
=getFCD16FromSingleLead((UChar
)~prevFCD16
);
1592 const UChar
*p
=src
-1;
1593 if(U16_IS_TRAIL(*p
) && prevSrc
<p
&& U16_IS_LEAD(*(p
-1))) {
1595 // Need to fetch the previous character's FCD value because
1596 // prevFCD16 was just for the trail surrogate code point.
1597 prevFCD16
=getFCD16FromSurrogatePair(p
[0], p
[1]);
1598 // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
1604 // The start of the current character (c).
1606 } else if(src
==limit
) {
1611 // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
1612 // Check for proper order, and decompose locally if necessary.
// Low byte of prevFCD16 = previous tccc; high byte of fcd16 = current lccc.
1613 if((prevFCD16
&0xff)<=(fcd16
>>8)) {
1614 // proper order: prev tccc <= current lccc
1615 if((fcd16
&0xff)<=1) {
1618 if(buffer
!=NULL
&& !buffer
->appendZeroCC(c
, errorCode
)) {
// Out of order and no buffer to write to: report the boundary and stop.
1623 } else if(buffer
==NULL
) {
1624 return prevBoundary
; // quick check "no"
1627 * Back out the part of the source that we copied or appended
1628 * already but is now going to be decomposed.
1629 * prevSrc is set to after what was copied/appended.
1631 buffer
->removeSuffix((int32_t)(prevSrc
-prevBoundary
));
1633 * Find the part of the source that needs to be decomposed,
1634 * up to the next safe boundary.
1636 src
=findNextFCDBoundary(src
, limit
);
1638 * The source text does not fulfill the conditions for FCD.
1639 * Decompose and reorder a limited piece of the text.
1641 if(!decomposeShort(prevBoundary
, src
, *buffer
, errorCode
)) {
1651 void Normalizer2Impl::makeFCDAndAppend(const UChar
*src
, const UChar
*limit
,
1653 ReorderingBuffer
&buffer
,
1654 UErrorCode
&errorCode
) const {
1655 if(!buffer
.isEmpty()) {
1656 const UChar
*firstBoundaryInSrc
=findNextFCDBoundary(src
, limit
);
1657 if(src
!=firstBoundaryInSrc
) {
1658 const UChar
*lastBoundaryInDest
=findPreviousFCDBoundary(buffer
.getStart(),
1660 UnicodeString
middle(lastBoundaryInDest
,
1661 (int32_t)(buffer
.getLimit()-lastBoundaryInDest
));
1662 buffer
.removeSuffix((int32_t)(buffer
.getLimit()-lastBoundaryInDest
));
1663 middle
.append(src
, (int32_t)(firstBoundaryInSrc
-src
));
1664 const UChar
*middleStart
=middle
.getBuffer();
1665 makeFCD(middleStart
, middleStart
+middle
.length(), &buffer
, errorCode
);
1666 if(U_FAILURE(errorCode
)) {
1669 src
=firstBoundaryInSrc
;
1673 makeFCD(src
, limit
, &buffer
, errorCode
);
1675 buffer
.appendZeroCC(src
, limit
, errorCode
);
1679 const UChar
*Normalizer2Impl::findPreviousFCDBoundary(const UChar
*start
, const UChar
*p
) const {
1680 BackwardUTrie2StringIterator
iter(fcdTrie(), start
, p
);
1683 fcd16
=iter
.previous16();
1684 } while(fcd16
>0xff);
1685 return iter
.codePointStart
;
1688 const UChar
*Normalizer2Impl::findNextFCDBoundary(const UChar
*p
, const UChar
*limit
) const {
1689 ForwardUTrie2StringIterator
iter(fcdTrie(), p
, limit
);
1692 fcd16
=iter
.next16();
1693 } while(fcd16
>0xff);
1694 return iter
.codePointStart
;
// CanonicalIterator data -------------------------------------------------- ***
1699 CanonIterData::CanonIterData(UErrorCode
&errorCode
) :
1700 trie(utrie2_open(0, 0, &errorCode
)),
1701 canonStartSets(uhash_deleteUObject
, NULL
, errorCode
) {}
1703 CanonIterData::~CanonIterData() {
1707 void CanonIterData::addToStartSet(UChar32 origin
, UChar32 decompLead
, UErrorCode
&errorCode
) {
1708 uint32_t canonValue
=utrie2_get32(trie
, decompLead
);
1709 if((canonValue
&(CANON_HAS_SET
|CANON_VALUE_MASK
))==0 && origin
!=0) {
1710 // origin is the first character whose decomposition starts with
1711 // the character for which we are setting the value.
1712 utrie2_set32(trie
, decompLead
, canonValue
|origin
, &errorCode
);
1714 // origin is not the first character, or it is U+0000.
1716 if((canonValue
&CANON_HAS_SET
)==0) {
1719 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
1722 UChar32 firstOrigin
=(UChar32
)(canonValue
&CANON_VALUE_MASK
);
1723 canonValue
=(canonValue
&~CANON_VALUE_MASK
)|CANON_HAS_SET
|(uint32_t)canonStartSets
.size();
1724 utrie2_set32(trie
, decompLead
, canonValue
, &errorCode
);
1725 canonStartSets
.addElement(set
, errorCode
);
1726 if(firstOrigin
!=0) {
1727 set
->add(firstOrigin
);
1730 set
=(UnicodeSet
*)canonStartSets
[(int32_t)(canonValue
&CANON_VALUE_MASK
)];
1736 class CanonIterDataSingleton
{
1738 CanonIterDataSingleton(SimpleSingleton
&s
, Normalizer2Impl
&ni
, UErrorCode
&ec
) :
1739 singleton(s
), impl(ni
), errorCode(ec
) {}
1740 CanonIterData
*getInstance(UErrorCode
&errorCode
) {
1742 CanonIterData
*instance
=
1743 (CanonIterData
*)singleton
.getInstance(createInstance
, this, duplicate
, errorCode
);
1744 delete (CanonIterData
*)duplicate
;
1747 static void *createInstance(const void *context
, UErrorCode
&errorCode
);
1748 UBool
rangeHandler(UChar32 start
, UChar32 end
, uint32_t value
) {
1750 impl
.makeCanonIterDataFromNorm16(start
, end
, (uint16_t)value
, *newData
, errorCode
);
1752 return U_SUCCESS(errorCode
);
1756 SimpleSingleton
&singleton
;
1757 Normalizer2Impl
&impl
;
1758 CanonIterData
*newData
;
1759 UErrorCode
&errorCode
;
1764 // Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.
1765 static UBool U_CALLCONV
1766 enumCIDRangeHandler(const void *context
, UChar32 start
, UChar32 end
, uint32_t value
) {
1767 return ((CanonIterDataSingleton
*)context
)->rangeHandler(start
, end
, value
);
1772 void *CanonIterDataSingleton::createInstance(const void *context
, UErrorCode
&errorCode
) {
1773 CanonIterDataSingleton
*me
=(CanonIterDataSingleton
*)context
;
1774 me
->newData
=new CanonIterData(errorCode
);
1775 if(me
->newData
==NULL
) {
1776 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
1779 if(U_SUCCESS(errorCode
)) {
1780 utrie2_enum(me
->impl
.getNormTrie(), NULL
, enumCIDRangeHandler
, me
);
1781 utrie2_freeze(me
->newData
->trie
, UTRIE2_32_VALUE_BITS
, &errorCode
);
1782 if(U_SUCCESS(errorCode
)) {
1790 void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start
, UChar32 end
, uint16_t norm16
,
1791 CanonIterData
&newData
,
1792 UErrorCode
&errorCode
) const {
1793 if(norm16
==0 || (minYesNo
<=norm16
&& norm16
<minNoNo
)) {
1794 // Inert, or 2-way mapping (including Hangul syllable).
1795 // We do not write a canonStartSet for any yesNo character.
1796 // Composites from 2-way mappings are added at runtime from the
1797 // starter's compositions list, and the other characters in
1798 // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
1799 // "maybe" characters.
1802 for(UChar32 c
=start
; c
<=end
; ++c
) {
1803 uint32_t oldValue
=utrie2_get32(newData
.trie
, c
);
1804 uint32_t newValue
=oldValue
;
1805 if(norm16
>=minMaybeYes
) {
1806 // not a segment starter if it occurs in a decomposition or has cc!=0
1807 newValue
|=CANON_NOT_SEGMENT_STARTER
;
1808 if(norm16
<MIN_NORMAL_MAYBE_YES
) {
1809 newValue
|=CANON_HAS_COMPOSITIONS
;
1811 } else if(norm16
<minYesNo
) {
1812 newValue
|=CANON_HAS_COMPOSITIONS
;
1814 // c has a one-way decomposition
1816 uint16_t norm16_2
=norm16
;
1817 while(limitNoNo
<=norm16_2
&& norm16_2
<minMaybeYes
) {
1818 c2
=mapAlgorithmic(c2
, norm16_2
);
1819 norm16_2
=getNorm16(c2
);
1821 if(minYesNo
<=norm16_2
&& norm16_2
<limitNoNo
) {
1822 // c decomposes, get everything from the variable-length extra data
1823 const uint16_t *mapping
=getMapping(norm16_2
);
1824 uint16_t firstUnit
=*mapping
++;
1825 int32_t length
=firstUnit
&MAPPING_LENGTH_MASK
;
1826 if((firstUnit
&MAPPING_HAS_CCC_LCCC_WORD
)!=0) {
1827 if(c
==c2
&& (*mapping
&0xff)!=0) {
1828 newValue
|=CANON_NOT_SEGMENT_STARTER
; // original c has cc!=0
1832 // Skip empty mappings (no characters in the decomposition).
1834 // add c to first code point's start set
1836 U16_NEXT_UNSAFE(mapping
, i
, c2
);
1837 newData
.addToStartSet(c
, c2
, errorCode
);
1838 // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
1839 // one-way mapping. A 2-way mapping is possible here after
1840 // intermediate algorithmic mapping.
1841 if(norm16_2
>=minNoNo
) {
1843 U16_NEXT_UNSAFE(mapping
, i
, c2
);
1844 uint32_t c2Value
=utrie2_get32(newData
.trie
, c2
);
1845 if((c2Value
&CANON_NOT_SEGMENT_STARTER
)==0) {
1846 utrie2_set32(newData
.trie
, c2
, c2Value
|CANON_NOT_SEGMENT_STARTER
,
1853 // c decomposed to c2 algorithmically; c has cc==0
1854 newData
.addToStartSet(c
, c2
, errorCode
);
1857 if(newValue
!=oldValue
) {
1858 utrie2_set32(newData
.trie
, c
, newValue
, &errorCode
);
1863 UBool
Normalizer2Impl::ensureCanonIterData(UErrorCode
&errorCode
) const {
1864 // Logically const: Synchronized instantiation.
1865 Normalizer2Impl
*me
=const_cast<Normalizer2Impl
*>(this);
1866 CanonIterDataSingleton(me
->canonIterDataSingleton
, *me
, errorCode
).getInstance(errorCode
);
1867 return U_SUCCESS(errorCode
);
1870 int32_t Normalizer2Impl::getCanonValue(UChar32 c
) const {
1871 return (int32_t)utrie2_get32(((CanonIterData
*)canonIterDataSingleton
.fInstance
)->trie
, c
);
1874 const UnicodeSet
&Normalizer2Impl::getCanonStartSet(int32_t n
) const {
1875 return *(const UnicodeSet
*)(
1876 ((CanonIterData
*)canonIterDataSingleton
.fInstance
)->canonStartSets
[n
]);
1879 UBool
Normalizer2Impl::isCanonSegmentStarter(UChar32 c
) const {
1880 return getCanonValue(c
)>=0;
1883 UBool
Normalizer2Impl::getCanonStartSet(UChar32 c
, UnicodeSet
&set
) const {
1884 int32_t canonValue
=getCanonValue(c
)&~CANON_NOT_SEGMENT_STARTER
;
1889 int32_t value
=canonValue
&CANON_VALUE_MASK
;
1890 if((canonValue
&CANON_HAS_SET
)!=0) {
1891 set
.addAll(getCanonStartSet(value
));
1892 } else if(value
!=0) {
1895 if((canonValue
&CANON_HAS_COMPOSITIONS
)!=0) {
1896 uint16_t norm16
=getNorm16(c
);
1897 if(norm16
==JAMO_L
) {
1899 (UChar32
)(Hangul::HANGUL_BASE
+(c
-Hangul::JAMO_L_BASE
)*Hangul::JAMO_VT_COUNT
);
1900 set
.add(syllable
, syllable
+Hangul::JAMO_VT_COUNT
-1);
1902 addComposites(getCompositionsList(norm16
), set
);
// Normalizer2 data swapping ----------------------------------------------- ***
1914 U_CAPI
int32_t U_EXPORT2
1915 unorm2_swap(const UDataSwapper
*ds
,
1916 const void *inData
, int32_t length
, void *outData
,
1917 UErrorCode
*pErrorCode
) {
1918 const UDataInfo
*pInfo
;
1921 const uint8_t *inBytes
;
1924 const int32_t *inIndexes
;
1925 int32_t indexes
[Normalizer2Impl::IX_MIN_MAYBE_YES
+1];
1927 int32_t i
, offset
, nextOffset
, size
;
1929 /* udata_swapDataHeader checks the arguments */
1930 headerSize
=udata_swapDataHeader(ds
, inData
, length
, outData
, pErrorCode
);
1931 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
1935 /* check data format and format version */
1936 pInfo
=(const UDataInfo
*)((const char *)inData
+4);
1938 pInfo
->dataFormat
[0]==0x4e && /* dataFormat="Nrm2" */
1939 pInfo
->dataFormat
[1]==0x72 &&
1940 pInfo
->dataFormat
[2]==0x6d &&
1941 pInfo
->dataFormat
[3]==0x32 &&
1942 pInfo
->formatVersion
[0]==1
1944 udata_printError(ds
, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",
1945 pInfo
->dataFormat
[0], pInfo
->dataFormat
[1],
1946 pInfo
->dataFormat
[2], pInfo
->dataFormat
[3],
1947 pInfo
->formatVersion
[0]);
1948 *pErrorCode
=U_UNSUPPORTED_ERROR
;
1952 inBytes
=(const uint8_t *)inData
+headerSize
;
1953 outBytes
=(uint8_t *)outData
+headerSize
;
1955 inIndexes
=(const int32_t *)inBytes
;
1959 if(length
<(int32_t)sizeof(indexes
)) {
1960 udata_printError(ds
, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n",
1962 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
1967 /* read the first few indexes */
1968 for(i
=0; i
<=Normalizer2Impl::IX_MIN_MAYBE_YES
; ++i
) {
1969 indexes
[i
]=udata_readInt32(ds
, inIndexes
[i
]);
1972 /* get the total length of the data */
1973 size
=indexes
[Normalizer2Impl::IX_TOTAL_SIZE
];
1977 udata_printError(ds
, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n",
1979 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
1983 /* copy the data for inaccessible bytes */
1984 if(inBytes
!=outBytes
) {
1985 uprv_memcpy(outBytes
, inBytes
, size
);
1990 /* swap the int32_t indexes[] */
1991 nextOffset
=indexes
[Normalizer2Impl::IX_NORM_TRIE_OFFSET
];
1992 ds
->swapArray32(ds
, inBytes
, nextOffset
-offset
, outBytes
, pErrorCode
);
1995 /* swap the UTrie2 */
1996 nextOffset
=indexes
[Normalizer2Impl::IX_EXTRA_DATA_OFFSET
];
1997 utrie2_swap(ds
, inBytes
+offset
, nextOffset
-offset
, outBytes
+offset
, pErrorCode
);
2000 /* swap the uint16_t extraData[] */
2001 nextOffset
=indexes
[Normalizer2Impl::IX_EXTRA_DATA_OFFSET
+1];
2002 ds
->swapArray16(ds
, inBytes
+offset
, nextOffset
-offset
, outBytes
+offset
, pErrorCode
);
2005 U_ASSERT(offset
==size
);
2008 return headerSize
+size
;
2011 #endif // !UCONFIG_NO_NORMALIZATION