1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
6 * Copyright (C) 2009-2014, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 *******************************************************************************
10 * file name: normalizer2impl.cpp
12 * tab size: 8 (not used)
15 * created on: 2009nov22
16 * created by: Markus W. Scherer
19 #include "unicode/utypes.h"
21 #if !UCONFIG_NO_NORMALIZATION
23 #include "unicode/bytestream.h"
24 #include "unicode/edits.h"
25 #include "unicode/normalizer2.h"
26 #include "unicode/stringoptions.h"
27 #include "unicode/udata.h"
28 #include "unicode/ustring.h"
29 #include "unicode/utf16.h"
30 #include "unicode/utf8.h"
31 #include "bytesinkutil.h"
34 #include "normalizer2impl.h"
46 * UTF-8 lead byte for minNoMaybeCP.
47 * Can be lower than the actual lead byte for c.
48 * Typically U+0300 for NFC/NFD, U+00A0 for NFKC/NFKD, U+0041 for NFKC_Casefold.
// Returns a UTF-8 lead byte for code point c.
// Visible branch: for c<=0x7ff the result is the two-byte lead byte 0xc0+(c>>6).
// NOTE(review): the branch before the "else if" and the fallback after the
// final comment are not present in this listing.
50 inline uint8_t leadByteForCP(UChar32 c
) {
53 } else if (c
<= 0x7ff) {
54 return (uint8_t)(0xc0+(c
>>6));
56 // Should not occur because ccc(U+0300)!=0.
62 * Returns the code point from one single well-formed UTF-8 byte sequence
63 * between cpStart and cpLimit.
65 * UTrie2 UTF-8 macros do not assemble whole code points (for efficiency).
66 * When we do need the code point, we call this function.
67 * We should not need it for normalization-inert data (norm16==0).
68 * Illegal sequences yield the error value norm16==0 just like real normalization-inert code points.
// Assembles the code point encoded by the single UTF-8 sequence [cpStart, cpLimit).
// The sequence must be non-empty (asserted); the switch dispatches on its byte length.
// Each visible return masks the trail bytes to their low six bits and shifts them into place.
// NOTE(review): c is presumably the lead byte *cpStart; its declaration and the
// case labels are not present in this listing.
70 UChar32
codePointFromValidUTF8(const uint8_t *cpStart
, const uint8_t *cpLimit
) {
71 // Similar to U8_NEXT_UNSAFE(s, i, c).
72 U_ASSERT(cpStart
< cpLimit
);
74 switch(cpLimit
-cpStart
) {
// One trail byte: five payload bits from c, six from cpStart[1].
78 return ((c
&0x1f)<<6) | (cpStart
[1]&0x3f);
80 // no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar)
81 return (UChar
)((c
<<12) | ((cpStart
[1]&0x3f)<<6) | (cpStart
[2]&0x3f));
// Three trail bytes: three payload bits from c, six from each trail byte.
83 return ((c
&7)<<18) | ((cpStart
[1]&0x3f)<<12) | ((cpStart
[2]&0x3f)<<6) | (cpStart
[3]&0x3f);
85 U_ASSERT(FALSE
); // Should not occur.
91 * Returns the last code point in [start, p[ if it is valid and in U+1000..U+D7FF.
92 * Otherwise returns a negative value.
// Decodes a three-byte UTF-8 sequence, provided at least three bytes lie between start and p.
// Accepts lead bytes 0xE1..0xED followed by two trail bytes in 0x80..0xBF.
// For lead byte 0xED the first trail byte is further limited to 0x80..0x9F (t1<=0x1f).
// NOTE(review): the lines that move p back and declare l/t1/t2, and the failure
// return, are not present in this listing.
94 UChar32
previousHangulOrJamo(const uint8_t *start
, const uint8_t *p
) {
95 if ((p
- start
) >= 3) {
// (uint8_t)(b - 0x80) <= 0x3f is a one-comparison test for 0x80<=b<=0xbf;
// t1 and t2 keep the six payload bits of each trail byte.
99 if (0xe1 <= l
&& l
<= 0xed &&
100 (t1
= (uint8_t)(p
[1] - 0x80)) <= 0x3f &&
101 (t2
= (uint8_t)(p
[2] - 0x80)) <= 0x3f &&
102 (l
< 0xed || t1
<= 0x1f)) {
103 return ((l
& 0xf) << 12) | (t1
<< 6) | t2
;
110 * Returns the offset from the Jamo T base if [src, limit[ starts with a single Jamo T code point.
111 * Otherwise returns a negative value.
// Recognizes a Jamo T at src, encoded as E1 86 xx or E1 87 xx, when at least
// three bytes are available before limit.
// NOTE(review): the declaration of t (presumably src[2]) and two of the return
// statements are not present in this listing.
113 int32_t getJamoTMinusBase(const uint8_t *src
, const uint8_t *limit
) {
114 // Jamo T: E1 86 A8..E1 87 82
115 if ((limit
- src
) >= 3 && *src
== 0xe1) {
116 if (src
[1] == 0x86) {
118 // The first Jamo T is U+11A8 but JAMO_T_BASE is 11A7.
119 // Offset 0 does not correspond to any conjoining Jamo.
120 if (0xa8 <= t
&& t
<= 0xbf) {
123 } else if (src
[1] == 0x87) {
// Signed comparison: as int8_t only the byte values 0x80..0x82 are <= 0x82,
// so this one test also rejects anything that is not in that range.
125 if ((int8_t)t
<= (int8_t)0x82) {
// The E1 87 block starts 0x40 code points after the E1 86 block,
// hence the 0x40 correction to the 0xa7 base byte.
126 return t
- (0xa7 - 0x40);
// Writes to sink the UTF-8 form of (code point in [cpStart, cpLimit) + delta).
// When edits is non-null, records that cpLength input bytes were replaced by length output bytes.
// NOTE(review): the return type line, the declaration of length and the branch
// conditions between the visible statements are not present in this listing.
134 appendCodePointDelta(const uint8_t *cpStart
, const uint8_t *cpLimit
, int32_t delta
,
135 ByteSink
&sink
, Edits
*edits
) {
136 char buffer
[U8_MAX_LENGTH
];
138 int32_t cpLength
= (int32_t)(cpLimit
- cpStart
);
140 // The builder makes ASCII map to ASCII.
141 buffer
[0] = (uint8_t)(*cpStart
+ delta
);
// Shortcut: if adding delta keeps the last byte inside the trail-byte range
// 0x80..0xbf, the earlier bytes can be copied and only the last one changes.
144 int32_t trail
= *(cpLimit
-1) + delta
;
145 if (0x80 <= trail
&& trail
<= 0xbf) {
146 // The delta only changes the last trail byte.
149 do { buffer
[length
++] = *cpStart
++; } while (cpStart
< cpLimit
);
150 buffer
[length
++] = (uint8_t)trail
;
152 // Decode the code point, add the delta, re-encode.
153 UChar32 c
= codePointFromValidUTF8(cpStart
, cpLimit
) + delta
;
155 U8_APPEND_UNSAFE(buffer
, length
, c
);
158 if (edits
!= nullptr) {
159 edits
->addReplace(cpLength
, length
);
161 sink
.Append(buffer
, length
);
166 // ReorderingBuffer -------------------------------------------------------- ***
// Constructor taking an error code: calls str.getBuffer(8), points start,
// reorderStart and limit at the returned buffer, and takes remainingCapacity from
// str.getCapacity(). If no buffer was returned and errorCode does not already
// hold a failure, errorCode becomes U_MEMORY_ALLOCATION_ERROR.
// NOTE(review): the initializer line that binds ni and dest is not present in this listing.
168 ReorderingBuffer::ReorderingBuffer(const Normalizer2Impl
&ni
, UnicodeString
&dest
,
169 UErrorCode
&errorCode
) :
171 start(str
.getBuffer(8)), reorderStart(start
), limit(start
),
172 remainingCapacity(str
.getCapacity()), lastCC(0) {
173 if (start
== nullptr && U_SUCCESS(errorCode
)) {
174 // getBuffer() already did str.setToBogus()
175 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
// Prepares the buffer for appending after str's current contents.
// Obtains a buffer with str.getBuffer(destCapacity) and sets errorCode to
// U_MEMORY_ALLOCATION_ERROR on the failure path. remainingCapacity is the
// capacity minus the existing length. reorderStart is then positioned by
// scanning backward with previousCC().
// NOTE(review): the failure test, the pointer setup and the return statements
// are not present in this listing.
179 UBool
ReorderingBuffer::init(int32_t destCapacity
, UErrorCode
&errorCode
) {
180 int32_t length
=str
.length();
181 start
=str
.getBuffer(destCapacity
);
183 // getBuffer() already did str.setToBogus()
184 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
188 remainingCapacity
=str
.getCapacity()-length
;
195 // Set reorderStart after the last code point with cc<=1 if there is one.
// Empty-bodied loop: each previousCC() call steps back one code point.
197 while(previousCC()>1) {}
199 reorderStart
=codePointLimit
;
// Compares the buffer contents [start, limit) with the UTF-16 range
// [otherStart, otherLimit): the lengths are compared first, then the code
// units with u_memcmp().
204 UBool
ReorderingBuffer::equals(const UChar
*otherStart
, const UChar
*otherLimit
) const {
205 int32_t length
=(int32_t)(limit
-start
);
207 length
==(int32_t)(otherLimit
-otherStart
) &&
208 0==u_memcmp(start
, otherStart
, length
);
// Compares the UTF-16 buffer contents [start, limit) with the UTF-8 range
// [otherStart, otherLimit), one code point at a time.
// A length pre-check first rejects pairs whose unit/byte counts cannot belong
// to equal strings. In the loop, i indexes UTF-16 units and j indexes UTF-8 bytes.
// NOTE(review): the loop's first condition, the declarations of c and other,
// and the comparison of the two decoded code points are not present in this listing.
211 UBool
ReorderingBuffer::equals(const uint8_t *otherStart
, const uint8_t *otherLimit
) const {
212 U_ASSERT((otherLimit
- otherStart
) <= INT32_MAX
); // ensured by caller
213 int32_t length
= (int32_t)(limit
- start
);
214 int32_t otherLength
= (int32_t)(otherLimit
- otherStart
);
215 // For equal strings, UTF-8 is at least as long as UTF-16, and at most three times as long.
216 if (otherLength
< length
|| (otherLength
/ 3) > length
) {
219 // Compare valid strings from between normalization boundaries.
220 // (Invalid sequences are normalization-inert.)
221 for (int32_t i
= 0, j
= 0;;) {
// Reached when one side is exhausted: the result is whether the UTF-8 side is exhausted too.
223 return j
>= otherLength
;
224 } else if (j
>= otherLength
) {
227 // Not at the end of either string yet.
229 U16_NEXT_UNSAFE(start
, i
, c
);
230 U8_NEXT_UNSAFE(otherStart
, j
, other
);
// Appends supplementary code point c (two UTF-16 units) with combining class cc.
// Grows the buffer through resize() when fewer than two units remain.
// When cc is zero or not lower than lastCC, the surrogate pair is written directly at limit.
// NOTE(review): the handling of the out-of-order case and the updates of limit
// and lastCC are not present in this listing.
237 UBool
ReorderingBuffer::appendSupplementary(UChar32 c
, uint8_t cc
, UErrorCode
&errorCode
) {
238 if(remainingCapacity
<2 && !resize(2, errorCode
)) {
241 if(lastCC
<=cc
|| cc
==0) {
242 limit
[0]=U16_LEAD(c
);
243 limit
[1]=U16_TRAIL(c
);
252 remainingCapacity
-=2;
// Appends the UTF-16 sequence s[0..length). leadCC and trailCC are the
// combining classes of its first and last code points.
// In order (leadCC is zero or not lower than lastCC): the units are copied in bulk.
// Out of order: the first code point is placed with insert(), and the rest are
// appended one code point at a time with combining classes looked up through impl.
// NOTE(review): the early returns, the declarations of i and c, the loop around
// the second U16_NEXT and the updates of lastCC are not present in this listing.
256 UBool
ReorderingBuffer::append(const UChar
*s
, int32_t length
,
257 uint8_t leadCC
, uint8_t trailCC
,
258 UErrorCode
&errorCode
) {
262 if(remainingCapacity
<length
&& !resize(length
, errorCode
)) {
265 remainingCapacity
-=length
;
266 if(lastCC
<=leadCC
|| leadCC
==0) {
268 reorderStart
=limit
+length
;
269 } else if(leadCC
<=1) {
270 reorderStart
=limit
+1; // Ok if not a code point boundary.
// Bulk copy of all units; limit ends up just past the copied text.
272 const UChar
*sLimit
=s
+length
;
273 do { *limit
++=*s
++; } while(s
!=sLimit
);
278 U16_NEXT(s
, i
, length
, c
);
279 insert(c
, leadCC
); // insert first code point
281 U16_NEXT(s
, i
, length
, c
);
283 // s must be in NFD, otherwise we need to use getCC().
284 leadCC
=Normalizer2Impl::getCCFromYesOrMaybe(impl
.getNorm16(c
));
288 append(c
, leadCC
, errorCode
);
// Appends the single code point c at limit. Grows the buffer through resize()
// when fewer than U16_LENGTH(c) units remain.
// NOTE(review): only the surrogate-pair write is visible; the one-unit case and
// the updates of limit, lastCC and reorderStart are not present in this listing.
294 UBool
ReorderingBuffer::appendZeroCC(UChar32 c
, UErrorCode
&errorCode
) {
295 int32_t cpLength
=U16_LENGTH(c
);
296 if(remainingCapacity
<cpLength
&& !resize(cpLength
, errorCode
)) {
299 remainingCapacity
-=cpLength
;
303 limit
[0]=U16_LEAD(c
);
304 limit
[1]=U16_TRAIL(c
);
// Appends the range [s, sLimit) verbatim with u_memcpy(), after making sure
// through resize() that sLimit-s units fit.
// NOTE(review): the early returns and the updates of limit, lastCC and
// reorderStart are not present in this listing.
312 UBool
ReorderingBuffer::appendZeroCC(const UChar
*s
, const UChar
*sLimit
, UErrorCode
&errorCode
) {
316 int32_t length
=(int32_t)(sLimit
-s
);
317 if(remainingCapacity
<length
&& !resize(length
, errorCode
)) {
320 u_memcpy(limit
, s
, length
);
322 remainingCapacity
-=length
;
// Empties the buffer: reorderStart and limit go back to start, and
// remainingCapacity is restored to str's full capacity.
// NOTE(review): the remaining lines of this function are not present in this listing.
328 void ReorderingBuffer::remove() {
329 reorderStart
=limit
=start
;
330 remainingCapacity
=str
.getCapacity();
// Drops the last suffixLength units from the buffer.
// If suffixLength is smaller than the current content length, the capacity is
// credited back by suffixLength; otherwise remainingCapacity is reset to the full capacity.
// NOTE(review): the matching updates of limit, lastCC and reorderStart are not
// present in this listing.
334 void ReorderingBuffer::removeSuffix(int32_t suffixLength
) {
335 if(suffixLength
<(limit
-start
)) {
337 remainingCapacity
+=suffixLength
;
340 remainingCapacity
=str
.getCapacity();
// Grows the underlying string buffer so that appendLength more units fit.
// reorderStart is saved as an index because start is reassigned from getBuffer().
// The requested capacity is the larger of length+appendLength and twice the
// current capacity. On the failure path errorCode becomes U_MEMORY_ALLOCATION_ERROR.
// NOTE(review): the body of the newCapacity<256 branch, the null check on start,
// the update of limit and the return statements are not present in this listing.
346 UBool
ReorderingBuffer::resize(int32_t appendLength
, UErrorCode
&errorCode
) {
347 int32_t reorderStartIndex
=(int32_t)(reorderStart
-start
);
348 int32_t length
=(int32_t)(limit
-start
);
// Hand the buffer back to str at its current length before asking for a larger one.
349 str
.releaseBuffer(length
);
350 int32_t newCapacity
=length
+appendLength
;
351 int32_t doubleCapacity
=2*str
.getCapacity();
352 if(newCapacity
<doubleCapacity
) {
353 newCapacity
=doubleCapacity
;
355 if(newCapacity
<256) {
358 start
=str
.getBuffer(newCapacity
);
360 // getBuffer() already did str.setToBogus()
361 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
// Rebase the saved index onto the newly obtained buffer.
364 reorderStart
=start
+reorderStartIndex
;
366 remainingCapacity
=str
.getCapacity()-length
;
// Moves the backward iterator over one code point without looking up data.
// codePointLimit takes the old codePointStart, and codePointStart steps back one unit.
// The condition detects a trail surrogate that is preceded, inside the buffer, by a lead surrogate.
// NOTE(review): the body of that condition is not present in this listing.
370 void ReorderingBuffer::skipPrevious() {
371 codePointLimit
=codePointStart
;
372 UChar c
=*--codePointStart
;
373 if(U16_IS_TRAIL(c
) && start
<codePointStart
&& U16_IS_LEAD(*(codePointStart
-1))) {
// Steps the backward iterator to the previous code point and returns its
// combining class from impl.getCCFromYesOrMaybeCP().
// A trail surrogate preceded by a lead surrogate is joined into one supplementary code point.
// NOTE(review): what is returned when reorderStart>=codePointStart, the
// declaration of c2 and the extra pointer step for surrogate pairs are not
// present in this listing.
378 uint8_t ReorderingBuffer::previousCC() {
379 codePointLimit
=codePointStart
;
380 if(reorderStart
>=codePointStart
) {
383 UChar32 c
=*--codePointStart
;
385 if(U16_IS_TRAIL(c
) && start
<codePointStart
&& U16_IS_LEAD(c2
=*(codePointStart
-1))) {
387 c
=U16_GET_SUPPLEMENTARY(c2
, c
);
389 return impl
.getCCFromYesOrMaybeCP(c
);
392 // Inserts c somewhere before the last character.
393 // Requires 0<cc<lastCC which implies reorderStart<limit.
// NOTE(review): the declaration of q and the body of the do-loop that ends at
// "while(codePointLimit!=q)" are not present in this listing.
394 void ReorderingBuffer::insert(UChar32 c
, uint8_t cc
) {
// Skip the last character, then walk backward past every character whose
// combining class is greater than cc.
395 for(setIterator(), skipPrevious(); previousCC()>cc
;) {}
396 // insert c at codePointLimit, after the character with prevCC<=cc
// limit grows by the size of c; r is the new end of the text.
398 UChar
*r
=limit
+=U16_LENGTH(c
);
401 } while(codePointLimit
!=q
);
402 writeCodePoint(q
, c
);
408 // Normalizer2Impl --------------------------------------------------------- ***
// Data for the canonical iterator, reached through Normalizer2Impl::fCanonIterData.
// Visible members: a constructor taking an error code, addToStartSet(), and a
// vector of UnicodeSet pointers.
// NOTE(review): further members (for example the trie read by
// addCanonIterPropertyStarts()) are not present in this listing.
410 struct CanonIterData
: public UMemory
{
411 CanonIterData(UErrorCode
&errorCode
);
413 void addToStartSet(UChar32 origin
, UChar32 decompLead
, UErrorCode
&errorCode
);
415 UVector canonStartSets
; // contains UnicodeSet *
// Destructor: deletes the canonical-iterator data held in fCanonIterData.
418 Normalizer2Impl::~Normalizer2Impl() {
419 delete fCanonIterData
;
// Initializes this instance from loaded data.
// Copies the code point minimums and the norm16 thresholds out of inIndexes,
// and derives the extra-data pointers from inExtraData.
// NOTE(review): the return type line and the statements that store inTrie and
// inSmallFCD are not present in this listing.
423 Normalizer2Impl::init(const int32_t *inIndexes
, const UTrie2
*inTrie
,
424 const uint16_t *inExtraData
, const uint8_t *inSmallFCD
) {
425 minDecompNoCP
=inIndexes
[IX_MIN_DECOMP_NO_CP
];
426 minCompNoMaybeCP
=inIndexes
[IX_MIN_COMP_NO_MAYBE_CP
];
427 minLcccCP
=inIndexes
[IX_MIN_LCCC_CP
];
429 minYesNo
=inIndexes
[IX_MIN_YES_NO
];
430 minYesNoMappingsOnly
=inIndexes
[IX_MIN_YES_NO_MAPPINGS_ONLY
];
431 minNoNo
=inIndexes
[IX_MIN_NO_NO
];
432 minNoNoCompBoundaryBefore
=inIndexes
[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE
];
433 minNoNoCompNoMaybeCC
=inIndexes
[IX_MIN_NO_NO_COMP_NO_MAYBE_CC
];
434 minNoNoEmpty
=inIndexes
[IX_MIN_NO_NO_EMPTY
];
435 limitNoNo
=inIndexes
[IX_LIMIT_NO_NO
];
436 minMaybeYes
=inIndexes
[IX_MIN_MAYBE_YES
];
437 U_ASSERT((minMaybeYes
&7)==0); // 8-aligned for noNoDelta bit fields
// Computed from minMaybeYes; presumably the delta-field value that stands for
// a delta of zero. Confirm against mapAlgorithmic().
438 centerNoNoDelta
=(minMaybeYes
>>DELTA_SHIFT
)-MAX_DELTA
-1;
// The compositions for "maybe yes" values come first in inExtraData; extraData
// starts after them, at an offset derived from MIN_NORMAL_MAYBE_YES-minMaybeYes.
442 maybeYesCompositions
=inExtraData
;
443 extraData
=maybeYesCompositions
+((MIN_NORMAL_MAYBE_YES
-minMaybeYes
)>>OFFSET_SHIFT
);
// Binds the normalizer data (ni) and the destination set (s) that handleRange() works with.
450 LcccContext(const Normalizer2Impl
&ni
, UnicodeSet
&s
) : impl(ni
), set(s
) {}
// Handles one [start, end] range of code points that share the value norm16.
// For norm16 in [minNoNoCompNoMaybeCC, limitNoNo) the whole range is added to
// the set when getFCD16(start) is above 0xff, i.e. its upper byte is non-zero.
// NOTE(review): the body of the first branch (norm16 above MIN_NORMAL_MAYBE_YES
// and not JAMO_VT) is not present in this listing.
452 void handleRange(UChar32 start
, UChar32 end
, uint16_t norm16
) {
453 if (norm16
> Normalizer2Impl::MIN_NORMAL_MAYBE_YES
&&
454 norm16
!= Normalizer2Impl::JAMO_VT
) {
456 } else if (impl
.minNoNoCompNoMaybeCC
<= norm16
&& norm16
< impl
.limitNoNo
) {
457 uint16_t fcd16
=impl
.getFCD16(start
);
458 if(fcd16
>0xff) { set
.add(start
, end
); }
463 const Normalizer2Impl
&impl
;
// Context object handed to utrie2_enum() for enumNorm16PropertyStartsRange():
// the normalizer data (impl) and the USetAdder (sa) that receives range starts.
// NOTE(review): the declaration of the sa member is not present in this listing.
469 struct PropertyStartsContext
{
470 PropertyStartsContext(const Normalizer2Impl
&ni
, const USetAdder
*adder
)
471 : impl(ni
), sa(adder
) {}
473 const Normalizer2Impl
&impl
;
// Range callback for utrie2_enum(): forwards each range to
// LcccContext::handleRange(), narrowing the trie value to 16 bits.
// NOTE(review): the return statement is not present in this listing.
481 static UBool U_CALLCONV
482 enumLcccRange(const void *context
, UChar32 start
, UChar32 end
, uint32_t value
) {
483 ((LcccContext
*)context
)->handleRange(start
, end
, (uint16_t)value
);
// Range callback for utrie2_enum(): adds the start of every same-value range to the set.
// Extra work is done for a range of more than one code point whose value is an
// algorithmic no-no mapping with a tccc field above DELTA_TCCC_1: every code
// point whose getFCD16() differs from prevFCD16 is added as well.
// NOTE(review): any update of prevFCD16 inside the loop and the return
// statement are not present in this listing.
487 static UBool U_CALLCONV
488 enumNorm16PropertyStartsRange(const void *context
, UChar32 start
, UChar32 end
, uint32_t value
) {
489 /* add the start code point to the USet */
490 const PropertyStartsContext
*ctx
=(const PropertyStartsContext
*)context
;
491 const USetAdder
*sa
=ctx
->sa
;
492 sa
->add(sa
->set
, start
);
493 if (start
!= end
&& ctx
->impl
.isAlgorithmicNoNo((uint16_t)value
) &&
494 (value
& Normalizer2Impl::DELTA_TCCC_MASK
) > Normalizer2Impl::DELTA_TCCC_1
) {
495 // Range of code points with same-norm16-value algorithmic decompositions.
496 // They might have different non-zero FCD16 values.
497 uint16_t prevFCD16
=ctx
->impl
.getFCD16(start
);
498 while(++start
<=end
) {
499 uint16_t fcd16
=ctx
->impl
.getFCD16(start
);
500 if(fcd16
!=prevFCD16
) {
501 sa
->add(sa
->set
, start
);
// Range callback for utrie2_enum(): ignores end and value, and adds the range
// start to the USetAdder that is passed as context.
// NOTE(review): the return statement is not present in this listing.
509 static UBool U_CALLCONV
510 enumPropertyStartsRange(const void *context
, UChar32 start
, UChar32
/*end*/, uint32_t /*value*/) {
511 /* add the start code point to the USet */
512 const USetAdder
*sa
=(const USetAdder
*)context
;
513 sa
->add(sa
->set
, start
);
// Value mapper passed to utrie2_enum() by addCanonIterPropertyStarts():
// reduces each trie value to its CANON_NOT_SEGMENT_STARTER bit. The context
// argument is unused.
517 static uint32_t U_CALLCONV
518 segmentStarterMapper(const void * /*context*/, uint32_t value
) {
519 return value
&CANON_NOT_SEGMENT_STARTER
;
// Enumerates normTrie with enumLcccRange; the ranges that
// LcccContext::handleRange() accepts are collected into set.
// NOTE(review): the return type line is not present in this listing.
525 Normalizer2Impl::addLcccChars(UnicodeSet
&set
) const {
526 LcccContext
context(*this, set
);
527 utrie2_enum(normTrie
, NULL
, enumLcccRange
, &context
);
// Adds to sa the code points at which the normalization data may change:
// the range starts reported while enumerating normTrie, then boundaries inside
// the Hangul syllable block, then HANGUL_LIMIT. The error code parameter is unused.
// NOTE(review): the return type line and one statement inside the Hangul loop
// are not present in this listing.
531 Normalizer2Impl::addPropertyStarts(const USetAdder
*sa
, UErrorCode
& /*errorCode*/) const {
532 /* add the start code point of each same-value range of each trie */
533 PropertyStartsContext
context(*this, sa
);
534 utrie2_enum(normTrie
, NULL
, enumNorm16PropertyStartsRange
, &context
);
536 /* add Hangul LV syllables and LV+1 because of skippables */
// The step of JAMO_T_COUNT visits one syllable per iteration, starting at HANGUL_BASE.
537 for(UChar c
=Hangul::HANGUL_BASE
; c
<Hangul::HANGUL_LIMIT
; c
+=Hangul::JAMO_T_COUNT
) {
539 sa
->add(sa
->set
, c
+1);
541 sa
->add(sa
->set
, Hangul::HANGUL_LIMIT
); /* add Hangul+1 to continue with other properties */
// Adds to sa the range starts of the canonical-iterator trie, with values
// reduced by segmentStarterMapper. Nothing is enumerated unless
// ensureCanonIterData() succeeds.
// NOTE(review): the return type line is not present in this listing.
545 Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder
*sa
, UErrorCode
&errorCode
) const {
546 /* add the start code point of each same-value range of the canonical iterator data trie */
547 if(ensureCanonIterData(errorCode
)) {
548 // currently only used for the SEGMENT_STARTER property
549 utrie2_enum(fCanonIterData
->trie
, segmentStarterMapper
, enumPropertyStartsRange
, sa
);
// Scans a NUL-terminated string for its leading run of non-NUL code units
// below minNeedDataCP, and appends a prefix to buffer with appendZeroCC().
// NOTE(review): the return type line, the declaration of c, the statements that
// back out the last character, the guard around the append and the return
// value are not present in this listing.
554 Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar
*src
,
555 UChar32 minNeedDataCP
,
556 ReorderingBuffer
*buffer
,
557 UErrorCode
&errorCode
) const {
558 // Make some effort to support NUL-terminated strings reasonably.
559 // Take the part of the fast quick check loop that does not look up
560 // data and check the first part of the string.
561 // After this prefix, determine the string length to simplify the rest
563 const UChar
*prevSrc
=src
;
// Empty-bodied loop: src ends one past the first unit that is NUL or not below minNeedDataCP.
565 while((c
=*src
++)<minNeedDataCP
&& c
!=0) {}
566 // Back out the last character for full processing.
570 buffer
->appendZeroCC(prevSrc
, src
, errorCode
);
// Decomposes src into dest.
// Sets U_ILLEGAL_ARGUMENT_ERROR when dest and src are the same object or when
// src has no buffer. Otherwise delegates to the pointer-range overload, passing
// src.length() as the destination length estimate.
// NOTE(review): the return type line and the early-exit statements are not
// present in this listing.
577 Normalizer2Impl::decompose(const UnicodeString
&src
, UnicodeString
&dest
,
578 UErrorCode
&errorCode
) const {
579 if(U_FAILURE(errorCode
)) {
583 const UChar
*sArray
=src
.getBuffer();
584 if(&dest
==&src
|| sArray
==NULL
) {
585 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
589 decompose(sArray
, sArray
+src
.length(), dest
, src
.length(), errorCode
);
// Decomposes [src, limit) into dest through a ReorderingBuffer.
// A negative destLengthEstimate is replaced by limit-src when limit is not NULL.
// The buffer is initialized with that estimate, and the buffer-based overload
// runs only if init() succeeds.
// NOTE(review): the dest parameter line of the signature is not present in this listing.
594 Normalizer2Impl::decompose(const UChar
*src
, const UChar
*limit
,
596 int32_t destLengthEstimate
,
597 UErrorCode
&errorCode
) const {
598 if(destLengthEstimate
<0 && limit
!=NULL
) {
599 destLengthEstimate
=(int32_t)(limit
-src
);
602 ReorderingBuffer
buffer(*this, dest
);
603 if(buffer
.init(destLengthEstimate
, errorCode
)) {
604 decompose(src
, limit
, &buffer
, errorCode
);
608 // Dual functionality:
609 // buffer!=NULL: normalize
610 // buffer==NULL: isNormalized/spanQuickCheckYes
// Core decomposition loop over [src, limit).
// The fast loop skips code units that are below minNoCP or whose data says
// isMostDecompYesAndZeroCC, joining surrogate pairs where needed. A skipped run
// is appended to buffer in one appendZeroCC() call. The one code point that
// stopped the run is then handled by the single-code-point decompose(), or
// checked with isDecompYes() and its combining class.
// NOTE(review): the return type line, the guards that choose between the
// buffer and quick-check paths, the declarations of c, c2, norm16 and prevCC,
// and most statement bodies are not present in this listing.
612 Normalizer2Impl::decompose(const UChar
*src
, const UChar
*limit
,
613 ReorderingBuffer
*buffer
,
614 UErrorCode
&errorCode
) const {
615 UChar32 minNoCP
=minDecompNoCP
;
617 src
=copyLowPrefixFromNulTerminated(src
, minNoCP
, buffer
, errorCode
);
618 if(U_FAILURE(errorCode
)) {
// Find the terminating NUL so that the rest can work with an explicit limit.
621 limit
=u_strchr(src
, 0);
624 const UChar
*prevSrc
;
628 // only for quick check
629 const UChar
*prevBoundary
=src
;
633 // count code units below the minimum or with irrelevant data for the quick check
634 for(prevSrc
=src
; src
!=limit
;) {
635 if( (c
=*src
)<minNoCP
||
636 isMostDecompYesAndZeroCC(norm16
=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie
, c
))
639 } else if(!U16_IS_SURROGATE(c
)) {
// Surrogate code unit: try to pair it with its neighbor to form a supplementary code point.
643 if(U16_IS_SURROGATE_LEAD(c
)) {
644 if((src
+1)!=limit
&& U16_IS_TRAIL(c2
=src
[1])) {
645 c
=U16_GET_SUPPLEMENTARY(c
, c2
);
647 } else /* trail surrogate */ {
648 if(prevSrc
<src
&& U16_IS_LEAD(c2
=*(src
-1))) {
650 c
=U16_GET_SUPPLEMENTARY(c2
, c
);
653 if(isMostDecompYesAndZeroCC(norm16
=getNorm16(c
))) {
660 // copy these code units all at once
663 if(!buffer
->appendZeroCC(prevSrc
, src
, errorCode
)) {
675 // Check one above-minimum, relevant code point.
678 if(!decompose(c
, norm16
, *buffer
, errorCode
)) {
682 if(isDecompYes(norm16
)) {
683 uint8_t cc
=getCCFromYesOrMaybe(norm16
);
684 if(prevCC
<=cc
|| cc
==0) {
692 return prevBoundary
; // "no" or cc out of order
698 // Decompose a short piece of text which is likely to contain characters that
699 // fail the quick check loop and/or where the quick check loop's overhead
700 // is unlikely to be amortized.
701 // Called by the compose() and makeFCD() implementations.
// Decomposes code points from the UTF-16 range [src, limit) into buffer, one at a time.
// With stopAtCompBoundary set, three checks are visible: a first unit below
// minCompNoMaybeCP, a composition boundary before the current code point, and
// a composition boundary after it (the latter takes onlyContiguous into account).
// NOTE(review): the return type line, the loop header, the declarations of c
// and norm16, and everything returned at the visible checks are not present in
// this listing.
703 Normalizer2Impl::decomposeShort(const UChar
*src
, const UChar
*limit
,
704 UBool stopAtCompBoundary
, UBool onlyContiguous
,
705 ReorderingBuffer
&buffer
, UErrorCode
&errorCode
) const {
706 if (U_FAILURE(errorCode
)) {
710 if (stopAtCompBoundary
&& *src
< minCompNoMaybeCP
) {
713 const UChar
*prevSrc
= src
;
// Reads the next code point into c and its trie value into norm16, advancing src.
716 UTRIE2_U16_NEXT16(normTrie
, src
, limit
, c
, norm16
);
717 if (stopAtCompBoundary
&& norm16HasCompBoundaryBefore(norm16
)) {
720 if(!decompose(c
, norm16
, buffer
, errorCode
)) {
723 if (stopAtCompBoundary
&& norm16HasCompBoundaryAfter(norm16
, onlyContiguous
)) {
// Appends the decomposition of the single code point c (trie value norm16) to buffer.
// Cases, in order:
//   - norm16>=limitNoNo with "maybe" or non-zero ccc: c is appended with its ccc;
//   - other norm16>=limitNoNo: c is first replaced by its algorithmic mapping;
//   - norm16<minYesNo: c does not decompose and is appended with ccc 0;
//   - Hangul LV/LVT: decomposed into Jamo by Hangul::decompose();
//   - otherwise: the mapping is read from the extra data. Its first unit holds
//     the length (low bits) and trailCC (upper byte); leadCC comes from the
//     unit before the mapping when MAPPING_HAS_CCC_LCCC_WORD is set.
// NOTE(review): the norm16 refresh after mapAlgorithmic(), the jamos array
// declaration and the leadCC value used when the flag is clear are not present
// in this listing.
730 UBool
Normalizer2Impl::decompose(UChar32 c
, uint16_t norm16
,
731 ReorderingBuffer
&buffer
,
732 UErrorCode
&errorCode
) const {
733 // get the decomposition and the lead and trail cc's
734 if (norm16
>= limitNoNo
) {
735 if (isMaybeOrNonZeroCC(norm16
)) {
736 return buffer
.append(c
, getCCFromYesOrMaybe(norm16
), errorCode
);
738 // Maps to an isCompYesAndZeroCC.
739 c
=mapAlgorithmic(c
, norm16
);
742 if (norm16
< minYesNo
) {
743 // c does not decompose
744 return buffer
.append(c
, 0, errorCode
);
745 } else if(isHangulLV(norm16
) || isHangulLVT(norm16
)) {
746 // Hangul syllable: decompose algorithmically
748 return buffer
.appendZeroCC(jamos
, jamos
+Hangul::decompose(c
, jamos
), errorCode
);
750 // c decomposes, get everything from the variable-length extra data
751 const uint16_t *mapping
=getMapping(norm16
);
752 uint16_t firstUnit
=*mapping
;
753 int32_t length
=firstUnit
&MAPPING_LENGTH_MASK
;
754 uint8_t leadCC
, trailCC
;
755 trailCC
=(uint8_t)(firstUnit
>>8);
756 if(firstUnit
&MAPPING_HAS_CCC_LCCC_WORD
) {
757 leadCC
=(uint8_t)(*(mapping
-1)>>8);
// The mapping text starts one unit after firstUnit.
761 return buffer
.append((const UChar
*)mapping
+1, length
, leadCC
, trailCC
, errorCode
);
// UTF-8 counterpart of the UTF-16 decomposeShort(): decomposes the code points
// in [src, limit) into the UTF-16 buffer.
// The trie lookup works on the bytes directly; the code point is decoded with
// codePointFromValidUTF8(prevSrc, src) only in the branches that need it.
// The per-code-point cases mirror the single-code-point decompose(): append
// with ccc, algorithmic mapping, no decomposition, Hangul, or an extra-data mapping.
// NOTE(review): the return type line, the declarations of norm16, jamos and
// leadCC, and everything returned inside the stopAtCompBoundary checks and
// after failed appends are not present in this listing.
765 Normalizer2Impl::decomposeShort(const uint8_t *src
, const uint8_t *limit
,
766 UBool stopAtCompBoundary
, UBool onlyContiguous
,
767 ReorderingBuffer
&buffer
, UErrorCode
&errorCode
) const {
768 if (U_FAILURE(errorCode
)) {
771 while (src
< limit
) {
772 const uint8_t *prevSrc
= src
;
// Looks up norm16 for the next byte sequence and advances src past it.
774 UTRIE2_U8_NEXT16(normTrie
, src
, limit
, norm16
);
775 // Get the decomposition and the lead and trail cc's.
776 UChar32 c
= U_SENTINEL
;
777 if (norm16
>= limitNoNo
) {
778 if (isMaybeOrNonZeroCC(norm16
)) {
779 // No boundaries around this character.
780 c
= codePointFromValidUTF8(prevSrc
, src
);
781 if (!buffer
.append(c
, getCCFromYesOrMaybe(norm16
), errorCode
)) {
786 // Maps to an isCompYesAndZeroCC.
787 if (stopAtCompBoundary
) {
// Replace c by its algorithmic mapping and continue with the mapping's norm16.
790 c
= codePointFromValidUTF8(prevSrc
, src
);
791 c
= mapAlgorithmic(c
, norm16
);
792 norm16
= getNorm16(c
);
793 } else if (stopAtCompBoundary
&& norm16
< minNoNoCompNoMaybeCC
) {
796 // norm16!=INERT guarantees that [prevSrc, src[ is valid UTF-8.
797 // We do not see invalid UTF-8 here because
798 // its norm16==INERT is normalization-inert,
799 // so it gets copied unchanged in the fast path,
800 // and we stop the slow path where invalid UTF-8 begins.
801 U_ASSERT(norm16
!= INERT
);
802 if (norm16
< minYesNo
) {
804 c
= codePointFromValidUTF8(prevSrc
, src
);
806 // does not decompose
807 if (!buffer
.append(c
, 0, errorCode
)) {
810 } else if (isHangulLV(norm16
) || isHangulLVT(norm16
)) {
811 // Hangul syllable: decompose algorithmically
813 c
= codePointFromValidUTF8(prevSrc
, src
);
816 if (!buffer
.appendZeroCC(jamos
, jamos
+Hangul::decompose(c
, jamos
), errorCode
)) {
820 // The character decomposes, get everything from the variable-length extra data.
821 const uint16_t *mapping
= getMapping(norm16
);
822 uint16_t firstUnit
= *mapping
;
823 int32_t length
= firstUnit
& MAPPING_LENGTH_MASK
;
824 uint8_t trailCC
= (uint8_t)(firstUnit
>> 8);
826 if (firstUnit
& MAPPING_HAS_CCC_LCCC_WORD
) {
827 leadCC
= (uint8_t)(*(mapping
-1) >> 8);
831 if (!buffer
.append((const char16_t *)mapping
+1, length
, leadCC
, trailCC
, errorCode
)) {
835 if (stopAtCompBoundary
&& norm16HasCompBoundaryAfter(norm16
, onlyContiguous
)) {
// Looks up the decomposition of c.
// No decomposition: c below minDecompNoCP, or "maybe"/non-zero-ccc data.
// Algorithmic mapping: the mapped code point is written to buffer with
// U16_APPEND_UNSAFE, and its own norm16 is then examined because the mapping
// may decompose further.
// Hangul LV/LVT: Hangul::decompose() fills buffer and sets length.
// Otherwise: length comes from the low bits of the mapping's first unit, and
// the returned pointer is to the unit after it, inside the extra data.
// NOTE(review): the return type line, the declaration of norm16, the use of
// decomp and the return values of the non-mapping cases are not present in this listing.
843 Normalizer2Impl::getDecomposition(UChar32 c
, UChar buffer
[4], int32_t &length
) const {
845 if(c
<minDecompNoCP
|| isMaybeOrNonZeroCC(norm16
=getNorm16(c
))) {
846 // c does not decompose
849 const UChar
*decomp
= nullptr;
850 if(isDecompNoAlgorithmic(norm16
)) {
851 // Maps to an isCompYesAndZeroCC.
852 c
=mapAlgorithmic(c
, norm16
);
855 U16_APPEND_UNSAFE(buffer
, length
, c
);
856 // The mapping might decompose further.
857 norm16
= getNorm16(c
);
859 if (norm16
< minYesNo
) {
861 } else if(isHangulLV(norm16
) || isHangulLVT(norm16
)) {
862 // Hangul syllable: decompose algorithmically
863 length
=Hangul::decompose(c
, buffer
);
866 // c decomposes, get everything from the variable-length extra data
867 const uint16_t *mapping
=getMapping(norm16
);
868 length
=*mapping
&MAPPING_LENGTH_MASK
;
869 return (const UChar
*)mapping
+1;
872 // The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1
873 // so that a raw mapping fits that consists of one unit ("rm0")
874 // plus all but the first two code units of the normal mapping.
875 // The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK.
// Looks up the raw decomposition mapping of c.
// No decomposition: c below minDecompNoCP, or isDecompYes data.
// Hangul LV/LVT: Hangul::getRawDecomposition() fills buffer.
// Algorithmic mapping: the mapped code point is written to buffer.
// Otherwise the extra data is consulted. If MAPPING_HAS_RAW_MAPPING is set, the
// raw-mapping word sits before firstUnit and before the optional ccc/lccc word.
// When that word rm0 is at most MAPPING_LENGTH_MASK, the raw mapping is the rm0
// units that precede it. Otherwise rm0 replaces the first two units of the
// normal mapping, and the result is assembled in buffer.
// Without a raw mapping, the pointer to the normal mapping is returned.
// NOTE(review): the return type line, the declaration of norm16, all
// assignments to length and several return statements are not present in this listing.
877 Normalizer2Impl::getRawDecomposition(UChar32 c
, UChar buffer
[30], int32_t &length
) const {
879 if(c
<minDecompNoCP
|| isDecompYes(norm16
=getNorm16(c
))) {
880 // c does not decompose
882 } else if(isHangulLV(norm16
) || isHangulLVT(norm16
)) {
883 // Hangul syllable: decompose algorithmically
884 Hangul::getRawDecomposition(c
, buffer
);
887 } else if(isDecompNoAlgorithmic(norm16
)) {
888 c
=mapAlgorithmic(c
, norm16
);
890 U16_APPEND_UNSAFE(buffer
, length
, c
);
893 // c decomposes, get everything from the variable-length extra data
894 const uint16_t *mapping
=getMapping(norm16
);
895 uint16_t firstUnit
=*mapping
;
896 int32_t mLength
=firstUnit
&MAPPING_LENGTH_MASK
; // length of normal mapping
897 if(firstUnit
&MAPPING_HAS_RAW_MAPPING
) {
898 // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.
899 // Bit 7=MAPPING_HAS_CCC_LCCC_WORD
// (firstUnit>>7)&1 is 1 exactly when the ccc/lccc word is present, so it is skipped too.
900 const uint16_t *rawMapping
=mapping
-((firstUnit
>>7)&1)-1;
901 uint16_t rm0
=*rawMapping
;
902 if(rm0
<=MAPPING_LENGTH_MASK
) {
904 return (const UChar
*)rawMapping
-rm0
;
906 // Copy the normal mapping and replace its first two code units with rm0.
907 buffer
[0]=(UChar
)rm0
;
908 u_memcpy(buffer
+1, (const UChar
*)mapping
+1+2, mLength
-2);
914 return (const UChar
*)mapping
+1;
// Appends [src, limit) to the text already in buffer.
// The reorderable suffix of the buffer is first copied to safeMiddle.
// One path decomposes the source with decompose(). The other merges at the
// boundary only: it iterates over leading code points with getCC(), appends
// that prefix with its first and last combining classes, and appends the rest
// with appendZeroCC().
// NOTE(review): one parameter line of the signature, the condition selecting
// between the two paths and the loop around the second getCC() call are not
// present in this listing.
918 void Normalizer2Impl::decomposeAndAppend(const UChar
*src
, const UChar
*limit
,
920 UnicodeString
&safeMiddle
,
921 ReorderingBuffer
&buffer
,
922 UErrorCode
&errorCode
) const {
923 buffer
.copyReorderableSuffixTo(safeMiddle
);
925 decompose(src
, limit
, &buffer
, errorCode
);
928 // Just merge the strings at the boundary.
929 ForwardUTrie2StringIterator
iter(normTrie
, src
, limit
);
930 uint8_t firstCC
, prevCC
, cc
;
931 firstCC
=prevCC
=cc
=getCC(iter
.next16());
934 cc
=getCC(iter
.next16());
936 if(limit
==NULL
) { // appendZeroCC() needs limit!=NULL
937 limit
=u_strchr(iter
.codePointStart
, 0);
// The prefix [src, iter.codePointStart) goes in with its combining classes;
// the remainder is appended unchanged.
940 if (buffer
.append(src
, (int32_t)(iter
.codePointStart
-src
), firstCC
, prevCC
, errorCode
)) {
941 buffer
.appendZeroCC(iter
.codePointStart
, limit
, errorCode
);
// Returns whether there is a decomposition boundary before c.
// Two cheap tests come first: c below minLcccCP, or a BMP code point for which
// singleLeadMightHaveNonZeroFCD16() is false. Only then is the norm16 value
// looked up and tested.
945 UBool
Normalizer2Impl::hasDecompBoundaryBefore(UChar32 c
) const {
946 return c
< minLcccCP
|| (c
<= 0xffff && !singleLeadMightHaveNonZeroFCD16(c
)) ||
947 norm16HasDecompBoundaryBefore(getNorm16(c
));
// Returns whether a code point with this norm16 value has a decomposition boundary before it.
// For norm16>=limitNoNo: true when norm16<=MIN_NORMAL_MAYBE_YES or norm16==JAMO_VT.
// For values with a mapping in the extra data: true when there is no ccc/lccc
// word, or when the upper byte of that word (the mapping's leadCC) is zero.
// NOTE(review): the result for norm16<minNoNoCompNoMaybeCC is not present in this listing.
950 UBool
Normalizer2Impl::norm16HasDecompBoundaryBefore(uint16_t norm16
) const {
951 if (norm16
< minNoNoCompNoMaybeCC
) {
954 if (norm16
>= limitNoNo
) {
955 return norm16
<= MIN_NORMAL_MAYBE_YES
|| norm16
== JAMO_VT
;
957 // c decomposes, get everything from the variable-length extra data
958 const uint16_t *mapping
=getMapping(norm16
);
959 uint16_t firstUnit
=*mapping
;
960 // TRUE if leadCC==0 (hasFCDBoundaryBefore())
961 return (firstUnit
&MAPPING_HAS_CCC_LCCC_WORD
)==0 || (*(mapping
-1)&0xff00)==0;
// Returns whether there is a decomposition boundary after c.
// Two cheap tests come first: c below minDecompNoCP, or a BMP code point for
// which singleLeadMightHaveNonZeroFCD16() is false. Otherwise the norm16 value decides.
// NOTE(review): the results returned by the two cheap tests are not present in this listing.
964 UBool
Normalizer2Impl::hasDecompBoundaryAfter(UChar32 c
) const {
965 if (c
< minDecompNoCP
) {
968 if (c
<= 0xffff && !singleLeadMightHaveNonZeroFCD16(c
)) {
971 return norm16HasDecompBoundaryAfter(getNorm16(c
));
// Returns whether a code point with this norm16 value has a decomposition boundary after it.
// For norm16>=limitNoNo:
//   - "maybe" or non-zero ccc: true when norm16<=MIN_NORMAL_MAYBE_YES or norm16==JAMO_VT;
//   - algorithmic mapping: true when the tccc field is at most DELTA_TCCC_1.
// For extra-data mappings, trailCC is the upper byte of firstUnit:
//   - trailCC>1 gives FALSE, trailCC==0 gives TRUE;
//   - trailCC==1 gives TRUE only if leadCC is zero.
// NOTE(review): the result for norm16<=minYesNo or Hangul LVT is not present in this listing.
974 UBool
Normalizer2Impl::norm16HasDecompBoundaryAfter(uint16_t norm16
) const {
975 if(norm16
<= minYesNo
|| isHangulLVT(norm16
)) {
978 if (norm16
>= limitNoNo
) {
979 if (isMaybeOrNonZeroCC(norm16
)) {
980 return norm16
<= MIN_NORMAL_MAYBE_YES
|| norm16
== JAMO_VT
;
982 // Maps to an isCompYesAndZeroCC.
983 return (norm16
& DELTA_TCCC_MASK
) <= DELTA_TCCC_1
;
985 // c decomposes, get everything from the variable-length extra data
986 const uint16_t *mapping
=getMapping(norm16
);
987 uint16_t firstUnit
=*mapping
;
988 // decomp after-boundary: same as hasFCDBoundaryAfter(),
989 // fcd16<=1 || trailCC==0
990 if(firstUnit
>0x1ff) {
991 return FALSE
; // trailCC>1
993 if(firstUnit
<=0xff) {
994 return TRUE
; // trailCC==0
996 // if(trailCC==1) test leadCC==0, same as checking for before-boundary
997 // TRUE if leadCC==0 (hasFCDBoundaryBefore())
998 return (firstUnit
&MAPPING_HAS_CCC_LCCC_WORD
)==0 || (*(mapping
-1)&0xff00)==0;
1002 * Finds the recomposition result for
1003 * a forward-combining "lead" character,
1004 * specified with a pointer to its compositions list,
1005 * and a backward-combining "trail" character.
1007 * If the lead and trail characters combine, then this function returns
1008 * the following "compositeAndFwd" value:
1009 * Bits 21..1 composite character
1010 * Bit 0 set if the composite is a forward-combining starter
1011 * otherwise it returns -1.
1013 * The compositions list has (trail, compositeAndFwd) pair entries,
1014 * encoded as either pairs or triples of 16-bit units.
1015 * The last entry has the high bit of its first unit set.
1017 * The list is sorted by ascending trail characters (there are no duplicates).
1018 * A linear search is used.
1020 * See normalizer2impl.h for a more detailed description
1021 * of the compositions list format.
// Searches a compositions list for the entry whose trail character is trail.
// Two key layouts are used:
//   - trail below COMP_1_TRAIL_LIMIT: one key unit (trail<<1). Entries have two
//     or three units; the list pointer advances by 2 plus the COMP_1_TRIPLE bit.
//   - larger trail: two key units (key1, key2). Entries have three units, and
//     the result combines the non-trail bits of the second unit with the third.
// NOTE(review): the not-found returns, the two-unit result of the first layout,
// the rest of the key1 expression and the loop around the second search are
// not present in this listing.
1023 int32_t Normalizer2Impl::combine(const uint16_t *list
, UChar32 trail
) {
1024 uint16_t key1
, firstUnit
;
1025 if(trail
<COMP_1_TRAIL_LIMIT
) {
1026 // trail character is 0..33FF
1027 // result entry may have 2 or 3 units
1028 key1
=(uint16_t)(trail
<<1);
// Linear scan: skip every entry whose first unit is still below the key.
1029 while(key1
>(firstUnit
=*list
)) {
1030 list
+=2+(firstUnit
&COMP_1_TRIPLE
);
1032 if(key1
==(firstUnit
&COMP_1_TRAIL_MASK
)) {
1033 if(firstUnit
&COMP_1_TRIPLE
) {
1034 return ((int32_t)list
[1]<<16)|list
[2];
1040 // trail character is 3400..10FFFF
1041 // result entry has 3 units
1042 key1
=(uint16_t)(COMP_1_TRAIL_LIMIT
+
1043 (((trail
>>COMP_1_TRAIL_SHIFT
))&
1045 uint16_t key2
=(uint16_t)(trail
<<COMP_2_TRAIL_SHIFT
);
1046 uint16_t secondUnit
;
1048 if(key1
>(firstUnit
=*list
)) {
1049 list
+=2+(firstUnit
&COMP_1_TRIPLE
);
1050 } else if(key1
==(firstUnit
&COMP_1_TRAIL_MASK
)) {
1051 if(key2
>(secondUnit
=list
[1])) {
1052 if(firstUnit
&COMP_1_LAST_TUPLE
) {
1057 } else if(key2
==(secondUnit
&COMP_2_TRAIL_MASK
)) {
1058 return ((int32_t)(secondUnit
&~COMP_2_TRAIL_MASK
)<<16)|list
[2];
1071 * @param list some character's compositions list
1072 * @param set recursively receives the composites from these compositions
// Walks a compositions list up to and including the entry marked
// COMP_1_LAST_TUPLE. The compositeAndFwd value is read from each entry: one
// unit for a pair, two units for a triple. The composite is compositeAndFwd>>1.
// When bit 0 is set, the function recurses into the composite's own
// compositions list.
// NOTE(review): the declaration of firstUnit, the advancing of list and the
// statement that adds the composite to set are not present in this listing.
1074 void Normalizer2Impl::addComposites(const uint16_t *list
, UnicodeSet
&set
) const {
1076 int32_t compositeAndFwd
;
1079 if((firstUnit
&COMP_1_TRIPLE
)==0) {
1080 compositeAndFwd
=list
[1];
1083 compositeAndFwd
=(((int32_t)list
[1]&~COMP_2_TRAIL_MASK
)<<16)|list
[2];
1086 UChar32 composite
=compositeAndFwd
>>1;
1087 if((compositeAndFwd
&1)!=0) {
1088 addComposites(getCompositionsListForComposite(getNorm16(composite
)), set
);
1091 } while((firstUnit
&COMP_1_LAST_TUPLE
)==0);
1095 * Recomposes the buffer text starting at recomposeStartIndex
1096 * (which is in NFD - decomposed and canonically ordered),
1097 * and truncates the buffer contents.
1099 * Note that recomposition never lengthens the text:
1100 * Any character consists of either one or two code units;
1101 * a composition may contain at most one more code unit than the original starter,
1102 * while the combining mark that is removed has at least one code unit.
// Recomposes the text in buffer from recomposeStartIndex onward, in place, and
// sets the buffer's reordering limit to the new end.
// State kept while scanning:
//   - compositionsList is non-NULL while a forward-combining starter is
//     remembered (the starter pointer and starterIsSupplementary describe it);
//   - prevCC is the combining class used for the blocking test.
// A character that combines backward is tried against the starter: Jamo V/T
// are composed arithmetically into a Hangul syllable, everything else through
// combine(). On success the starter is overwritten by the composite and the
// combining mark is removed. With onlyContiguous set, an intervening character
// drops the remembered starter.
// NOTE(review): the loop header, the declarations of norm16, cc and prevCC, the
// code that shifts text when lengths change, and most branch bodies are not
// present in this listing.
1104 void Normalizer2Impl::recompose(ReorderingBuffer
&buffer
, int32_t recomposeStartIndex
,
1105 UBool onlyContiguous
) const {
1106 UChar
*p
=buffer
.getStart()+recomposeStartIndex
;
1107 UChar
*limit
=buffer
.getLimit();
1112 UChar
*starter
, *pRemove
, *q
, *r
;
1113 const uint16_t *compositionsList
;
1114 UChar32 c
, compositeAndFwd
;
1117 UBool starterIsSupplementary
;
1119 // Some of the following variables are not used until we have a forward-combining starter
1120 // and are only initialized now to avoid compiler warnings.
1121 compositionsList
=NULL
; // used as indicator for whether we have a forward-combining starter
1123 starterIsSupplementary
=FALSE
;
// Reads the next code point into c and its trie value into norm16, advancing p.
1127 UTRIE2_U16_NEXT16(normTrie
, p
, limit
, c
, norm16
);
1128 cc
=getCCFromYesOrMaybe(norm16
);
1129 if( // this character combines backward and
1131 // we have seen a starter that combines forward and
1132 compositionsList
!=NULL
&&
1133 // the backward-combining character is not blocked
1134 (prevCC
<cc
|| prevCC
==0)
1136 if(isJamoVT(norm16
)) {
1137 // c is a Jamo V/T, see if we can compose it with the previous character.
1138 if(c
<Hangul::JAMO_T_BASE
) {
1139 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
// Unsigned subtraction: prev<JAMO_L_COUNT holds only when *starter is a Jamo L.
1140 UChar prev
=(UChar
)(*starter
-Hangul::JAMO_L_BASE
);
1141 if(prev
<Hangul::JAMO_L_COUNT
) {
1143 UChar syllable
=(UChar
)
1144 (Hangul::HANGUL_BASE
+
1145 (prev
*Hangul::JAMO_V_COUNT
+(c
-Hangul::JAMO_V_BASE
))*
1146 Hangul::JAMO_T_COUNT
);
1148 if(p
!=limit
&& (t
=(UChar
)(*p
-Hangul::JAMO_T_BASE
))<Hangul::JAMO_T_COUNT
) {
1150 syllable
+=t
; // The next character was a Jamo T.
1153 // remove the Jamo V/T
1164 * No "else" for Jamo T:
1165 * Since the input is in NFD, there are no Hangul LV syllables that
1166 * a Jamo T could combine with.
1167 * All Jamo Ts are combined above when handling Jamo Vs.
1172 compositionsList
=NULL
;
1174 } else if((compositeAndFwd
=combine(compositionsList
, c
))>=0) {
1175 // The starter and the combining mark (c) do combine.
1176 UChar32 composite
=compositeAndFwd
>>1;
1178 // Replace the starter with the composite, remove the combining mark.
1179 pRemove
=p
-U16_LENGTH(c
); // pRemove & p: start & limit of the combining mark
1180 if(starterIsSupplementary
) {
1181 if(U_IS_SUPPLEMENTARY(composite
)) {
1182 // both are supplementary
1183 starter
[0]=U16_LEAD(composite
);
1184 starter
[1]=U16_TRAIL(composite
);
1186 *starter
=(UChar
)composite
;
1187 // The composite is shorter than the starter,
1188 // move the intermediate characters forward one.
1189 starterIsSupplementary
=FALSE
;
1197 } else if(U_IS_SUPPLEMENTARY(composite
)) {
1198 // The composite is longer than the starter,
1199 // move the intermediate characters back one.
1200 starterIsSupplementary
=TRUE
;
1201 ++starter
; // temporarily increment for the loop boundary
1207 *starter
=U16_TRAIL(composite
);
1208 *--starter
=U16_LEAD(composite
); // undo the temporary increment
1210 // both are on the BMP
1211 *starter
=(UChar
)composite
;
1214 /* remove the combining mark by moving the following text over it */
1224 // Keep prevCC because we removed the combining mark.
1229 // Is the composite a starter that combines forward?
1230 if(compositeAndFwd
&1) {
1232 getCompositionsListForComposite(getNorm16(composite
));
1234 compositionsList
=NULL
;
1237 // We combined; continue with looking for compositions.
1242 // no combination this time
1248 // If c did not combine, then check if it is a starter.
1250 // Found a new starter.
1251 if((compositionsList
=getCompositionsListForDecompYes(norm16
))!=NULL
) {
1252 // It may combine with something, prepare for it.
1254 starterIsSupplementary
=FALSE
;
1257 starterIsSupplementary
=TRUE
;
1261 } else if(onlyContiguous
) {
1262 // FCC: no discontiguous compositions; any intervening character blocks.
1263 compositionsList
=NULL
;
1266 buffer
.setReorderingLimit(limit
);
// Tries to compose the pair (a, b) into one code point, driven by the norm16 data of 'a':
// Hangul L+V and LV+T are computed arithmetically; any other combining 'a' is resolved
// by looking b up with combine() in the compositions list that belongs to 'a'.
1270 Normalizer2Impl::composePair(UChar32 a
, UChar32 b
) const {
1271 uint16_t norm16
=getNorm16(a
); // maps an out-of-range 'a' to inert norm16=0
1272 const uint16_t *list
;
1273 if(isInert(norm16
)) {
1275 } else if(norm16
<minYesNoMappingsOnly
) {
1276 // a combines forward.
// Jamo L + Jamo V: after the subtraction b is the vowel index, range-checked against JAMO_V_COUNT.
1277 if(isJamoL(norm16
)) {
1278 b
-=Hangul::JAMO_V_BASE
;
1279 if(0<=b
&& b
<Hangul::JAMO_V_COUNT
) {
1281 (Hangul::HANGUL_BASE
+
1282 ((a
-Hangul::JAMO_L_BASE
)*Hangul::JAMO_V_COUNT
+b
)*
1283 Hangul::JAMO_T_COUNT
);
// Hangul LV + Jamo T: after the subtraction b is the trailing-consonant index.
1287 } else if(isHangulLV(norm16
)) {
1288 b
-=Hangul::JAMO_T_BASE
;
1289 if(0<b
&& b
<Hangul::JAMO_T_COUNT
) { // not b==0!
1295 // 'a' has a compositions list in extraData
1296 list
=getMapping(norm16
);
1297 if(norm16
>minYesNo
) { // composite 'a' has both mapping & compositions list
1298 list
+= // mapping pointer
1299 1+ // +1 to skip the first unit with the mapping length
1300 (*list
&MAPPING_LENGTH_MASK
); // + mapping length
1303 } else if(norm16
<minMaybeYes
|| MIN_NORMAL_MAYBE_YES
<=norm16
) {
1306 list
=getCompositionsListForMaybe(norm16
);
1308 if(b
<0 || 0x10ffff<b
) { // combine(list, b) requires a valid code point b
// The composite sits above bit 0 of the combine() result. With an arithmetic right shift
// a negative (no match) result stays negative, so only the other branch needs an explicit check.
1311 #if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC
1312 return combine(list
, b
)>>1;
1314 int32_t compositeAndFwd
=combine(list
, b
);
1315 return compositeAndFwd
>=0 ? compositeAndFwd
>>1 : U_SENTINEL
;
1319 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
1320 // doCompose: normalize
1321 // !doCompose: isNormalized (buffer must be empty and initialized)
// The scan has three tiers, matching the section comments below: a fast path that skips
// characters below minCompNoMaybeCP or with (compYes && ccc==0) data, a medium-fast path for
// simple mappings, Hangul and in-order combining marks, and a slow path that decomposes and
// recomposes the text between the nearest boundaries.
1323 Normalizer2Impl::compose(const UChar
*src
, const UChar
*limit
,
1324 UBool onlyContiguous
,
1326 ReorderingBuffer
&buffer
,
1327 UErrorCode
&errorCode
) const {
1328 const UChar
*prevBoundary
=src
;
1329 UChar32 minNoMaybeCP
=minCompNoMaybeCP
;
// NUL-terminated input: handle the low prefix first (copied into buffer only when doCompose),
// then locate the real limit with u_strchr().
1331 src
=copyLowPrefixFromNulTerminated(src
, minNoMaybeCP
,
1332 doCompose
? &buffer
: NULL
,
1334 if(U_FAILURE(errorCode
)) {
1337 limit
=u_strchr(src
, 0);
1338 if (prevBoundary
!= src
) {
1339 if (hasCompBoundaryAfter(*(src
-1), onlyContiguous
)) {
// Take the last copied code unit back out of the buffer and resume scanning from it.
1342 buffer
.removeSuffix(1);
1343 prevBoundary
= --src
;
1349 // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
1350 // or with (compYes && ccc==0) properties.
1351 const UChar
*prevSrc
;
1353 uint16_t norm16
= 0;
1356 if (prevBoundary
!= limit
&& doCompose
) {
1357 buffer
.appendZeroCC(prevBoundary
, limit
, errorCode
);
1361 if( (c
=*src
)<minNoMaybeCP
||
1362 isCompYesAndZeroCC(norm16
=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie
, c
))
1367 if(!U16_IS_SURROGATE(c
)) {
1371 if(U16_IS_SURROGATE_LEAD(c
)) {
1372 if(src
!=limit
&& U16_IS_TRAIL(c2
=*src
)) {
1374 c
=U16_GET_SUPPLEMENTARY(c
, c2
);
1376 } else /* trail surrogate */ {
1377 if(prevBoundary
<prevSrc
&& U16_IS_LEAD(c2
=*(prevSrc
-1))) {
1379 c
=U16_GET_SUPPLEMENTARY(c2
, c
);
1382 if(!isCompYesAndZeroCC(norm16
=getNorm16(c
))) {
1388 // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1389 // The current character is either a "noNo" (has a mapping)
1390 // or a "maybeYes" (combines backward)
1391 // or a "yesYes" with ccc!=0.
1392 // It is not a Hangul syllable or Jamo L because those have "yes" properties.
1394 // Medium-fast path: Handle cases that do not require full decomposition and recomposition.
1395 if (!isMaybeOrNonZeroCC(norm16
)) { // minNoNo <= norm16 < minMaybeYes
1399 // Fast path for mapping a character that is immediately surrounded by boundaries.
1400 // In this case, we need not decompose around the current character.
1401 if (isDecompNoAlgorithmic(norm16
)) {
1402 // Maps to a single isCompYesAndZeroCC character
1403 // which also implies hasCompBoundaryBefore.
1404 if (norm16HasCompBoundaryAfter(norm16
, onlyContiguous
) ||
1405 hasCompBoundaryBefore(src
, limit
)) {
1406 if (prevBoundary
!= prevSrc
&& !buffer
.appendZeroCC(prevBoundary
, prevSrc
, errorCode
)) {
1409 if(!buffer
.append(mapAlgorithmic(c
, norm16
), 0, errorCode
)) {
1415 } else if (norm16
< minNoNoCompBoundaryBefore
) {
1416 // The mapping is comp-normalized which also implies hasCompBoundaryBefore.
1417 if (norm16HasCompBoundaryAfter(norm16
, onlyContiguous
) ||
1418 hasCompBoundaryBefore(src
, limit
)) {
1419 if (prevBoundary
!= prevSrc
&& !buffer
.appendZeroCC(prevBoundary
, prevSrc
, errorCode
)) {
// The first unit of the mapping holds its length in the MAPPING_LENGTH_MASK bits;
// the mapped text follows that unit.
1422 const UChar
*mapping
= reinterpret_cast<const UChar
*>(getMapping(norm16
));
1423 int32_t length
= *mapping
++ & MAPPING_LENGTH_MASK
;
1424 if(!buffer
.appendZeroCC(mapping
, mapping
+ length
, errorCode
)) {
1430 } else if (norm16
>= minNoNoEmpty
) {
1431 // The current character maps to nothing.
1432 // Simply omit it from the output if there is a boundary before _or_ after it.
1433 // The character itself implies no boundaries.
1434 if (hasCompBoundaryBefore(src
, limit
) ||
1435 hasCompBoundaryAfter(prevBoundary
, prevSrc
, onlyContiguous
)) {
1436 if (prevBoundary
!= prevSrc
&& !buffer
.appendZeroCC(prevBoundary
, prevSrc
, errorCode
)) {
1443 // Other "noNo" type, or need to examine more text around this character:
1444 // Fall through to the slow path.
1445 } else if (isJamoVT(norm16
) && prevBoundary
!= prevSrc
) {
1446 UChar prev
=*(prevSrc
-1);
1447 if(c
<Hangul::JAMO_T_BASE
) {
1448 // The current character is a Jamo Vowel,
1449 // compose with previous Jamo L and following Jamo T.
1450 UChar l
= (UChar
)(prev
-Hangul::JAMO_L_BASE
);
1451 if(l
<Hangul::JAMO_L_COUNT
) {
1457 0 < (t
= ((int32_t)*src
- Hangul::JAMO_T_BASE
)) &&
1458 t
< Hangul::JAMO_T_COUNT
) {
1459 // The next character is a Jamo T.
1461 } else if (hasCompBoundaryBefore(src
, limit
)) {
1462 // No Jamo T follows, not even via decomposition.
1468 UChar32 syllable
= Hangul::HANGUL_BASE
+
1469 (l
*Hangul::JAMO_V_COUNT
+ (c
-Hangul::JAMO_V_BASE
)) *
1470 Hangul::JAMO_T_COUNT
+ t
;
1471 --prevSrc
; // Replace the Jamo L as well.
1472 if (prevBoundary
!= prevSrc
&& !buffer
.appendZeroCC(prevBoundary
, prevSrc
, errorCode
)) {
1475 if(!buffer
.appendBMP((UChar
)syllable
, 0, errorCode
)) {
1481 // If we see L+V+x where x!=T then we drop to the slow path,
1482 // decompose and recompose.
1483 // This is to deal with NFKC finding normal L and V but a
1484 // compatibility variant of a T.
1485 // We need to either fully compose that combination here
1486 // (which would complicate the code and may not work with strange custom data)
1487 // or use the slow path.
1489 } else if (Hangul::isHangulLV(prev
)) {
1490 // The current character is a Jamo Trailing consonant,
1491 // compose with previous Hangul LV that does not contain a Jamo T.
1495 UChar32 syllable
= prev
+ c
- Hangul::JAMO_T_BASE
;
1496 --prevSrc
; // Replace the Hangul LV as well.
1497 if (prevBoundary
!= prevSrc
&& !buffer
.appendZeroCC(prevBoundary
, prevSrc
, errorCode
)) {
1500 if(!buffer
.appendBMP((UChar
)syllable
, 0, errorCode
)) {
1506 // No matching context, or may need to decompose surrounding text first:
1507 // Fall through to the slow path.
1508 } else if (norm16
> JAMO_VT
) { // norm16 >= MIN_YES_YES_WITH_CC
1509 // One or more combining marks that do not combine-back:
1510 // Check for canonical order, copy unchanged if ok and
1511 // if followed by a character with a boundary-before.
1512 uint8_t cc
= getCCFromNormalYesOrMaybe(norm16
); // cc!=0
1513 if (onlyContiguous
/* FCC */ && getPreviousTrailCC(prevBoundary
, prevSrc
) > cc
) {
1514 // Fails FCD test, need to decompose and contiguously recompose.
1519 // If !onlyContiguous (not FCC), then we ignore the tccc of
1520 // the previous character which passed the quick check "yes && ccc==0" test.
1521 const UChar
*nextSrc
;
1526 buffer
.appendZeroCC(prevBoundary
, limit
, errorCode
);
1530 uint8_t prevCC
= cc
;
1532 UTRIE2_U16_NEXT16(normTrie
, nextSrc
, limit
, c
, n16
);
1533 if (n16
>= MIN_YES_YES_WITH_CC
) {
1534 cc
= getCCFromNormalYesOrMaybe(n16
);
1546 // src is after the last in-order combining mark.
1547 // If there is a boundary here, then we continue with no change.
1548 if (norm16HasCompBoundaryBefore(n16
)) {
1549 if (isCompYesAndZeroCC(n16
)) {
1554 // Use the slow path. There is no boundary in [prevSrc, src[.
1558 // Slow path: Find the nearest boundaries around the current character,
1559 // decompose and recompose.
1560 if (prevBoundary
!= prevSrc
&& !norm16HasCompBoundaryBefore(norm16
)) {
1561 const UChar
*p
= prevSrc
;
1562 UTRIE2_U16_PREV16(normTrie
, prevBoundary
, p
, c
, norm16
);
1563 if (!norm16HasCompBoundaryAfter(norm16
, onlyContiguous
)) {
1567 if (doCompose
&& prevBoundary
!= prevSrc
&& !buffer
.appendZeroCC(prevBoundary
, prevSrc
, errorCode
)) {
// Remember where the decomposed text starts so that recompose() below is applied
// only from this index on.
1570 int32_t recomposeStartIndex
=buffer
.length();
1571 // We know there is not a boundary here.
1572 decomposeShort(prevSrc
, src
, FALSE
/* !stopAtCompBoundary */, onlyContiguous
,
1574 // Decompose until the next boundary.
1575 src
= decomposeShort(src
, limit
, TRUE
/* stopAtCompBoundary */, onlyContiguous
,
1577 if (U_FAILURE(errorCode
)) {
1580 if ((src
- prevSrc
) > INT32_MAX
) { // guard before buffer.equals()
1581 errorCode
= U_INDEX_OUTOFBOUNDS_ERROR
;
1584 recompose(buffer
, recomposeStartIndex
, onlyContiguous
);
// Compare the buffer contents with the source segment [prevSrc, src[.
1586 if(!buffer
.equals(prevSrc
, src
)) {
1596 // Very similar to compose(): Make the same changes in both places if relevant.
1597 // pQCResult==NULL: spanQuickCheckYes
1598 // pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES)
// No output buffer is involved here: the result is the returned boundary pointer plus,
// when pQCResult is non-null, UNORM_MAYBE or UNORM_NO stored through it.
1600 Normalizer2Impl::composeQuickCheck(const UChar
*src
, const UChar
*limit
,
1601 UBool onlyContiguous
,
1602 UNormalizationCheckResult
*pQCResult
) const {
1603 const UChar
*prevBoundary
=src
;
1604 UChar32 minNoMaybeCP
=minCompNoMaybeCP
;
// Local error code for the helper call; NULL is passed where compose() passes its buffer.
1606 UErrorCode errorCode
=U_ZERO_ERROR
;
1607 src
=copyLowPrefixFromNulTerminated(src
, minNoMaybeCP
, NULL
, errorCode
);
1608 limit
=u_strchr(src
, 0);
1609 if (prevBoundary
!= src
) {
1610 if (hasCompBoundaryAfter(*(src
-1), onlyContiguous
)) {
1613 prevBoundary
= --src
;
1619 // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
1620 // or with (compYes && ccc==0) properties.
1621 const UChar
*prevSrc
;
1623 uint16_t norm16
= 0;
1628 if( (c
=*src
)<minNoMaybeCP
||
1629 isCompYesAndZeroCC(norm16
=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie
, c
))
1634 if(!U16_IS_SURROGATE(c
)) {
1638 if(U16_IS_SURROGATE_LEAD(c
)) {
1639 if(src
!=limit
&& U16_IS_TRAIL(c2
=*src
)) {
1641 c
=U16_GET_SUPPLEMENTARY(c
, c2
);
1643 } else /* trail surrogate */ {
1644 if(prevBoundary
<prevSrc
&& U16_IS_LEAD(c2
=*(prevSrc
-1))) {
1646 c
=U16_GET_SUPPLEMENTARY(c2
, c
);
1649 if(!isCompYesAndZeroCC(norm16
=getNorm16(c
))) {
1655 // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1656 // The current character is either a "noNo" (has a mapping)
1657 // or a "maybeYes" (combines backward)
1658 // or a "yesYes" with ccc!=0.
1659 // It is not a Hangul syllable or Jamo L because those have "yes" properties.
1661 uint16_t prevNorm16
= INERT
;
// Move prevBoundary up to prevSrc when the current character has a boundary before it
// or the character just before prevSrc has a boundary after it.
1662 if (prevBoundary
!= prevSrc
) {
1663 if (norm16HasCompBoundaryBefore(norm16
)) {
1664 prevBoundary
= prevSrc
;
1666 const UChar
*p
= prevSrc
;
1668 UTRIE2_U16_PREV16(normTrie
, prevBoundary
, p
, c
, n16
);
1669 if (norm16HasCompBoundaryAfter(n16
, onlyContiguous
)) {
1670 prevBoundary
= prevSrc
;
1678 if(isMaybeOrNonZeroCC(norm16
)) {
1679 uint8_t cc
=getCCFromYesOrMaybe(norm16
);
1680 if (onlyContiguous
/* FCC */ && cc
!= 0 &&
1681 getTrailCCFromCompYesAndZeroCC(prevNorm16
) > cc
) {
1682 // The [prevBoundary..prevSrc[ character
1683 // passed the quick check "yes && ccc==0" test
1684 // but is out of canonical order with the current combining mark.
1686 // If !onlyContiguous (not FCC), then we ignore the tccc of
1687 // the previous character which passed the quick check "yes && ccc==0" test.
1688 const UChar
*nextSrc
;
// norm16 below MIN_YES_YES_WITH_CC is reported as UNORM_MAYBE when a result slot exists.
// NOTE(review): the return of prevBoundary that follows is presumably the
// pQCResult==nullptr alternative (spanQuickCheckYes) - confirm.
1690 if (norm16
< MIN_YES_YES_WITH_CC
) {
1691 if (pQCResult
!= nullptr) {
1692 *pQCResult
= UNORM_MAYBE
;
1694 return prevBoundary
;
1700 uint8_t prevCC
= cc
;
1702 UTRIE2_U16_NEXT16(normTrie
, nextSrc
, limit
, c
, norm16
);
1703 if (isMaybeOrNonZeroCC(norm16
)) {
1704 cc
= getCCFromYesOrMaybe(norm16
);
1705 if (!(prevCC
<= cc
|| cc
== 0)) {
1713 // src is after the last in-order combining mark.
1714 if (isCompYesAndZeroCC(norm16
)) {
// Report UNORM_NO (if a result slot was supplied) and return the last boundary found.
1721 if(pQCResult
!=NULL
) {
1722 *pQCResult
=UNORM_NO
;
1724 return prevBoundary
;
// Appends [src, limit) to a buffer that may already hold text.
// When the buffer is not empty and src does not start at a composition boundary, the tail of
// the buffer (from its last boundary) and the head of src (up to its first boundary) are joined
// into one "middle" string and composed together, so that characters can combine across the seam.
// NOTE(review): the two tails at the end (compose() vs. plain appendZeroCC()) are presumably
// alternative branches selected by a flag - confirm which condition chooses between them.
1728 void Normalizer2Impl::composeAndAppend(const UChar
*src
, const UChar
*limit
,
1730 UBool onlyContiguous
,
1731 UnicodeString
&safeMiddle
,
1732 ReorderingBuffer
&buffer
,
1733 UErrorCode
&errorCode
) const {
1734 if(!buffer
.isEmpty()) {
1735 const UChar
*firstStarterInSrc
=findNextCompBoundary(src
, limit
, onlyContiguous
);
1736 if(src
!=firstStarterInSrc
) {
1737 const UChar
*lastStarterInDest
=findPreviousCompBoundary(buffer
.getStart(),
1738 buffer
.getLimit(), onlyContiguous
);
// Cut the text after the last boundary out of the buffer and keep it in `middle`.
1739 int32_t destSuffixLength
=(int32_t)(buffer
.getLimit()-lastStarterInDest
);
1740 UnicodeString
middle(lastStarterInDest
, destSuffixLength
);
1741 buffer
.removeSuffix(destSuffixLength
);
// Add the source text up to its first boundary, then compose the joined piece into the buffer.
1743 middle
.append(src
, (int32_t)(firstStarterInSrc
-src
));
1744 const UChar
*middleStart
=middle
.getBuffer();
1745 compose(middleStart
, middleStart
+middle
.length(), onlyContiguous
,
1746 TRUE
, buffer
, errorCode
);
1747 if(U_FAILURE(errorCode
)) {
// The remaining source text starts at the first boundary in src.
1750 src
=firstStarterInSrc
;
1754 compose(src
, limit
, onlyContiguous
, TRUE
, buffer
, errorCode
);
1756 if(limit
==NULL
) { // appendZeroCC() needs limit!=NULL
1757 limit
=u_strchr(src
, 0);
1759 buffer
.appendZeroCC(src
, limit
, errorCode
);
// UTF-8 counterpart of compose(): the same fast / medium-fast / slow path structure, but the
// input is read as bytes with the UTRIE2_U8_* macros and output goes through ByteSinkUtil
// to *sink, with changes passed on to edits.
// limit must not be null (asserted below).
// NOTE(review): each `sink == nullptr` test sits right before code that dereferences sink;
// presumably those branches leave the function early (check-only mode) - confirm.
1764 Normalizer2Impl::composeUTF8(uint32_t options
, UBool onlyContiguous
,
1765 const uint8_t *src
, const uint8_t *limit
,
1766 ByteSink
*sink
, Edits
*edits
, UErrorCode
&errorCode
) const {
1767 U_ASSERT(limit
!= nullptr);
// Bytes below this lead byte start code points below minCompNoMaybeCP.
1769 uint8_t minNoMaybeLead
= leadByteForCP(minCompNoMaybeCP
);
1770 const uint8_t *prevBoundary
= src
;
1773 // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
1774 // or with (compYes && ccc==0) properties.
1775 const uint8_t *prevSrc
;
1776 uint16_t norm16
= 0;
1779 if (prevBoundary
!= limit
&& sink
!= nullptr) {
1780 ByteSinkUtil::appendUnchanged(prevBoundary
, limit
,
1781 *sink
, options
, edits
, errorCode
);
1785 if (*src
< minNoMaybeLead
) {
1789 UTRIE2_U8_NEXT16(normTrie
, src
, limit
, norm16
);
1790 if (!isCompYesAndZeroCC(norm16
)) {
1795 // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1796 // The current character is either a "noNo" (has a mapping)
1797 // or a "maybeYes" (combines backward)
1798 // or a "yesYes" with ccc!=0.
1799 // It is not a Hangul syllable or Jamo L because those have "yes" properties.
1801 // Medium-fast path: Handle cases that do not require full decomposition and recomposition.
1802 if (!isMaybeOrNonZeroCC(norm16
)) { // minNoNo <= norm16 < minMaybeYes
1803 if (sink
== nullptr) {
1806 // Fast path for mapping a character that is immediately surrounded by boundaries.
1807 // In this case, we need not decompose around the current character.
1808 if (isDecompNoAlgorithmic(norm16
)) {
1809 // Maps to a single isCompYesAndZeroCC character
1810 // which also implies hasCompBoundaryBefore.
1811 if (norm16HasCompBoundaryAfter(norm16
, onlyContiguous
) ||
1812 hasCompBoundaryBefore(src
, limit
)) {
1813 if (prevBoundary
!= prevSrc
&&
1814 !ByteSinkUtil::appendUnchanged(prevBoundary
, prevSrc
,
1815 *sink
, options
, edits
, errorCode
)) {
1818 appendCodePointDelta(prevSrc
, src
, getAlgorithmicDelta(norm16
), *sink
, edits
);
1822 } else if (norm16
< minNoNoCompBoundaryBefore
) {
1823 // The mapping is comp-normalized which also implies hasCompBoundaryBefore.
1824 if (norm16HasCompBoundaryAfter(norm16
, onlyContiguous
) ||
1825 hasCompBoundaryBefore(src
, limit
)) {
1826 if (prevBoundary
!= prevSrc
&&
1827 !ByteSinkUtil::appendUnchanged(prevBoundary
, prevSrc
,
1828 *sink
, options
, edits
, errorCode
)) {
// The first unit of the mapping holds its length in the MAPPING_LENGTH_MASK bits;
// the mapped UTF-16 text follows and replaces the bytes [prevSrc, src[.
1831 const uint16_t *mapping
= getMapping(norm16
);
1832 int32_t length
= *mapping
++ & MAPPING_LENGTH_MASK
;
1833 if (!ByteSinkUtil::appendChange(prevSrc
, src
, (const UChar
*)mapping
, length
,
1834 *sink
, edits
, errorCode
)) {
1840 } else if (norm16
>= minNoNoEmpty
) {
1841 // The current character maps to nothing.
1842 // Simply omit it from the output if there is a boundary before _or_ after it.
1843 // The character itself implies no boundaries.
1844 if (hasCompBoundaryBefore(src
, limit
) ||
1845 hasCompBoundaryAfter(prevBoundary
, prevSrc
, onlyContiguous
)) {
1846 if (prevBoundary
!= prevSrc
&&
1847 !ByteSinkUtil::appendUnchanged(prevBoundary
, prevSrc
,
1848 *sink
, options
, edits
, errorCode
)) {
// Nothing is written for this character; only the edit (old length -> 0) is recorded.
1851 if (edits
!= nullptr) {
1852 edits
->addReplace((int32_t)(src
- prevSrc
), 0);
1858 // Other "noNo" type, or need to examine more text around this character:
1859 // Fall through to the slow path.
1860 } else if (isJamoVT(norm16
)) {
1861 // Jamo L: E1 84 80..92
1862 // Jamo V: E1 85 A1..B5
1863 // Jamo T: E1 86 A8..E1 87 82
1864 U_ASSERT((src
- prevSrc
) == 3 && *prevSrc
== 0xe1);
1865 UChar32 prev
= previousHangulOrJamo(prevBoundary
, prevSrc
);
// Second byte 0x85 identifies a Jamo V (see the byte ranges listed above).
1866 if (prevSrc
[1] == 0x85) {
1867 // The current character is a Jamo Vowel,
1868 // compose with previous Jamo L and following Jamo T.
1869 UChar32 l
= prev
- Hangul::JAMO_L_BASE
;
1870 if ((uint32_t)l
< Hangul::JAMO_L_COUNT
) {
1871 if (sink
== nullptr) {
1874 int32_t t
= getJamoTMinusBase(src
, limit
);
1876 // The next character is a Jamo T.
1878 } else if (hasCompBoundaryBefore(src
, limit
)) {
1879 // No Jamo T follows, not even via decomposition.
// prevSrc[2]-0xa1 is the vowel index: Jamo V trail bytes start at A1 (see above).
1883 UChar32 syllable
= Hangul::HANGUL_BASE
+
1884 (l
*Hangul::JAMO_V_COUNT
+ (prevSrc
[2]-0xa1)) *
1885 Hangul::JAMO_T_COUNT
+ t
;
1886 prevSrc
-= 3; // Replace the Jamo L as well.
1887 if (prevBoundary
!= prevSrc
&&
1888 !ByteSinkUtil::appendUnchanged(prevBoundary
, prevSrc
,
1889 *sink
, options
, edits
, errorCode
)) {
1892 ByteSinkUtil::appendCodePoint(prevSrc
, src
, syllable
, *sink
, edits
);
1896 // If we see L+V+x where x!=T then we drop to the slow path,
1897 // decompose and recompose.
1898 // This is to deal with NFKC finding normal L and V but a
1899 // compatibility variant of a T.
1900 // We need to either fully compose that combination here
1901 // (which would complicate the code and may not work with strange custom data)
1902 // or use the slow path.
1904 } else if (Hangul::isHangulLV(prev
)) {
1905 // The current character is a Jamo Trailing consonant,
1906 // compose with previous Hangul LV that does not contain a Jamo T.
1907 if (sink
== nullptr) {
1910 UChar32 syllable
= prev
+ getJamoTMinusBase(prevSrc
, src
);
1911 prevSrc
-= 3; // Replace the Hangul LV as well.
1912 if (prevBoundary
!= prevSrc
&&
1913 !ByteSinkUtil::appendUnchanged(prevBoundary
, prevSrc
,
1914 *sink
, options
, edits
, errorCode
)) {
1917 ByteSinkUtil::appendCodePoint(prevSrc
, src
, syllable
, *sink
, edits
);
1921 // No matching context, or may need to decompose surrounding text first:
1922 // Fall through to the slow path.
1923 } else if (norm16
> JAMO_VT
) { // norm16 >= MIN_YES_YES_WITH_CC
1924 // One or more combining marks that do not combine-back:
1925 // Check for canonical order, copy unchanged if ok and
1926 // if followed by a character with a boundary-before.
1927 uint8_t cc
= getCCFromNormalYesOrMaybe(norm16
); // cc!=0
1928 if (onlyContiguous
/* FCC */ && getPreviousTrailCC(prevBoundary
, prevSrc
) > cc
) {
1929 // Fails FCD test, need to decompose and contiguously recompose.
1930 if (sink
== nullptr) {
1934 // If !onlyContiguous (not FCC), then we ignore the tccc of
1935 // the previous character which passed the quick check "yes && ccc==0" test.
1936 const uint8_t *nextSrc
;
1940 if (sink
!= nullptr) {
1941 ByteSinkUtil::appendUnchanged(prevBoundary
, limit
,
1942 *sink
, options
, edits
, errorCode
);
1946 uint8_t prevCC
= cc
;
1948 UTRIE2_U8_NEXT16(normTrie
, nextSrc
, limit
, n16
);
1949 if (n16
>= MIN_YES_YES_WITH_CC
) {
1950 cc
= getCCFromNormalYesOrMaybe(n16
);
1952 if (sink
== nullptr) {
1962 // src is after the last in-order combining mark.
1963 // If there is a boundary here, then we continue with no change.
1964 if (norm16HasCompBoundaryBefore(n16
)) {
1965 if (isCompYesAndZeroCC(n16
)) {
1970 // Use the slow path. There is no boundary in [prevSrc, src[.
1974 // Slow path: Find the nearest boundaries around the current character,
1975 // decompose and recompose.
1976 if (prevBoundary
!= prevSrc
&& !norm16HasCompBoundaryBefore(norm16
)) {
1977 const uint8_t *p
= prevSrc
;
1978 UTRIE2_U8_PREV16(normTrie
, prevBoundary
, p
, norm16
);
1979 if (!norm16HasCompBoundaryAfter(norm16
, onlyContiguous
)) {
// The slow path works on a temporary ReorderingBuffer: decompose the segment into it,
// recompose from index 0, and emit a change only if the result differs from the input.
1983 ReorderingBuffer
buffer(*this, s16
, errorCode
);
1984 if (U_FAILURE(errorCode
)) {
1987 // We know there is not a boundary here.
1988 decomposeShort(prevSrc
, src
, FALSE
/* !stopAtCompBoundary */, onlyContiguous
,
1990 // Decompose until the next boundary.
1991 src
= decomposeShort(src
, limit
, TRUE
/* stopAtCompBoundary */, onlyContiguous
,
1993 if (U_FAILURE(errorCode
)) {
1996 if ((src
- prevSrc
) > INT32_MAX
) { // guard before buffer.equals()
1997 errorCode
= U_INDEX_OUTOFBOUNDS_ERROR
;
2000 recompose(buffer
, 0, onlyContiguous
);
2001 if (!buffer
.equals(prevSrc
, src
)) {
2002 if (sink
== nullptr) {
2005 if (prevBoundary
!= prevSrc
&&
2006 !ByteSinkUtil::appendUnchanged(prevBoundary
, prevSrc
,
2007 *sink
, options
, edits
, errorCode
)) {
2010 if (!ByteSinkUtil::appendChange(prevSrc
, src
, buffer
.getStart(), buffer
.length(),
2011 *sink
, edits
, errorCode
)) {
// UTF-16: tests for a composition boundary before the code point that starts at src.
// The first test covers the end of the text and code units below minCompNoMaybeCP;
// otherwise the answer comes from the norm16 value of the next code point.
2020 UBool
Normalizer2Impl::hasCompBoundaryBefore(const UChar
*src
, const UChar
*limit
) const {
2021 if (src
== limit
|| *src
< minCompNoMaybeCP
) {
// Reads one code point starting at src and fetches its norm16 from the trie.
2026 UTRIE2_U16_NEXT16(normTrie
, src
, limit
, c
, norm16
);
2027 return norm16HasCompBoundaryBefore(norm16
);
// UTF-8 overload: tests for a composition boundary before the byte sequence that starts
// at src, based on the norm16 value fetched with UTRIE2_U8_NEXT16().
2030 UBool
Normalizer2Impl::hasCompBoundaryBefore(const uint8_t *src
, const uint8_t *limit
) const {
2035 UTRIE2_U8_NEXT16(normTrie
, src
, limit
, norm16
);
2036 return norm16HasCompBoundaryBefore(norm16
);
// UTF-16: tests for a composition boundary after the code point that ends at p.
// The code point is read backward from p (not going before start); onlyContiguous is
// passed through to the norm16-based test.
2039 UBool
Normalizer2Impl::hasCompBoundaryAfter(const UChar
*start
, const UChar
*p
,
2040 UBool onlyContiguous
) const {
2046 UTRIE2_U16_PREV16(normTrie
, start
, p
, c
, norm16
);
2047 return norm16HasCompBoundaryAfter(norm16
, onlyContiguous
);
// UTF-8 overload: tests for a composition boundary after the byte sequence that ends at p,
// reading its norm16 backward from p (not going before start) with UTRIE2_U8_PREV16().
2050 UBool
Normalizer2Impl::hasCompBoundaryAfter(const uint8_t *start
, const uint8_t *p
,
2051 UBool onlyContiguous
) const {
2056 UTRIE2_U8_PREV16(normTrie
, start
, p
, norm16
);
2057 return norm16HasCompBoundaryAfter(norm16
, onlyContiguous
);
// Searches backward from p (not before start) for a composition boundary.
// For a code point stepped over with previous16(): if it has a boundary after it, the result
// is the position just after it; if it has a boundary before it, the result is its start.
2060 const UChar
*Normalizer2Impl::findPreviousCompBoundary(const UChar
*start
, const UChar
*p
,
2061 UBool onlyContiguous
) const {
2062 BackwardUTrie2StringIterator
iter(normTrie
, start
, p
);
2064 uint16_t norm16
=iter
.previous16();
2065 if (norm16HasCompBoundaryAfter(norm16
, onlyContiguous
)) {
2066 return iter
.codePointLimit
;
2068 if (hasCompBoundaryBefore(iter
.codePoint
, norm16
)) {
2069 return iter
.codePointStart
;
// Searches forward from p (up to limit) for a composition boundary.
// Mirror image of findPreviousCompBoundary(): here the boundary-before test comes first
// (result: start of the code point), then the boundary-after test (result: its limit).
2074 const UChar
*Normalizer2Impl::findNextCompBoundary(const UChar
*p
, const UChar
*limit
,
2075 UBool onlyContiguous
) const {
2076 ForwardUTrie2StringIterator
iter(normTrie
, p
, limit
);
2078 uint16_t norm16
=iter
.next16();
2079 if (hasCompBoundaryBefore(iter
.codePoint
, norm16
)) {
2080 return iter
.codePointStart
;
2082 if (norm16HasCompBoundaryAfter(norm16
, onlyContiguous
)) {
2083 return iter
.codePointLimit
;
// UTF-16: returns the low byte of getFCD16() for the code point that ends at p.
// In the FCD16 layout used in this file that byte is the trail combining class (tccc).
2088 uint8_t Normalizer2Impl::getPreviousTrailCC(const UChar
*start
, const UChar
*p
) const {
// U16_PREV works with an index relative to start, so convert the pointer distance first.
2092 int32_t i
= (int32_t)(p
- start
);
2094 U16_PREV(start
, 0, i
, c
);
2095 return (uint8_t)getFCD16(c
);
// UTF-8 overload: returns the low byte of getFCD16() (the tccc) for the code point whose
// byte sequence ends at p, read backward with U8_PREV.
2098 uint8_t Normalizer2Impl::getPreviousTrailCC(const uint8_t *start
, const uint8_t *p
) const {
// U8_PREV works with an index relative to start, so convert the pointer distance first.
2102 int32_t i
= (int32_t)(p
- start
);
2104 U8_PREV(start
, 0, i
, c
);
2105 return (uint8_t)getFCD16(c
);
2108 // Note: normalizer2impl.cpp r30982 (2011-nov-27)
2109 // still had getFCDTrie() which built and cached an FCD trie.
2110 // That provided faster access to FCD data than getFCD16FromNormData()
2111 // but required synchronization and consumed some 10kB of heap memory
2112 // in any process that uses FCD (e.g., via collation).
2113 // minDecompNoCP etc. and smallFCD[] are intended to help with any loss of performance,
2114 // at least for ASCII & CJK.
2116 // Gets the FCD value from the regular normalization data.
// The result carries the lead combining class (lccc) in bits 15..8 and the trail combining
// class (tccc) in bits 7..0, as the assignments marked "tccc" and "lccc" below show.
2117 uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c
) const {
2118 uint16_t norm16
=getNorm16(c
);
2119 if (norm16
>= limitNoNo
) {
2120 if(norm16
>=MIN_NORMAL_MAYBE_YES
) {
// A plain combining class: the same cc value goes into both the lccc and the tccc byte.
2122 norm16
=getCCFromNormalYesOrMaybe(norm16
);
2123 return norm16
|(norm16
<<8);
2124 } else if(norm16
>=minMaybeYes
) {
2126 } else { // isDecompNoAlgorithmic(norm16)
2127 uint16_t deltaTrailCC
= norm16
& DELTA_TCCC_MASK
;
2128 if (deltaTrailCC
<= DELTA_TCCC_1
) {
2129 return deltaTrailCC
>> OFFSET_SHIFT
;
2131 // Maps to an isCompYesAndZeroCC.
2132 c
=mapAlgorithmic(c
, norm16
);
2133 norm16
=getNorm16(c
);
2136 if(norm16
<=minYesNo
|| isHangulLVT(norm16
)) {
2137 // no decomposition or Hangul syllable, all zeros
2140 // c decomposes, get everything from the variable-length extra data
2141 const uint16_t *mapping
=getMapping(norm16
);
2142 uint16_t firstUnit
=*mapping
;
2143 norm16
=firstUnit
>>8; // tccc
// When the flag is set, the unit just before the mapping supplies the lccc in its high byte.
2144 if(firstUnit
&MAPPING_HAS_CCC_LCCC_WORD
) {
2145 norm16
|=*(mapping
-1)&0xff00; // lccc
2150 // Dual functionality:
2151 // buffer!=NULL: normalize
2152 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
// In FCD16 values the high byte is the lccc and the low byte is the tccc; the ordering test
// in the loop is "previous tccc <= current lccc".
// With buffer==NULL an ordering violation ends the scan by returning prevBoundary
// (the quick check "no" return below).
2154 Normalizer2Impl::makeFCD(const UChar
*src
, const UChar
*limit
,
2155 ReorderingBuffer
*buffer
,
2156 UErrorCode
&errorCode
) const {
2157 // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
2158 // Similar to the prevBoundary in the compose() implementation.
2159 const UChar
*prevBoundary
=src
;
2160 int32_t prevFCD16
=0;
2162 src
=copyLowPrefixFromNulTerminated(src
, minLcccCP
, buffer
, errorCode
);
2163 if(U_FAILURE(errorCode
)) {
2166 if(prevBoundary
<src
) {
2168 // We know that the previous character's lccc==0.
2169 // Fetching the fcd16 value was deferred for this below-U+0300 code point.
2170 prevFCD16
=getFCD16(*(src
-1));
2175 limit
=u_strchr(src
, 0);
2178 // Note: In this function we use buffer->appendZeroCC() because we track
2179 // the lead and trail combining classes here, rather than leaving it to
2180 // the ReorderingBuffer.
2181 // The exception is the call to decomposeShort() which uses the buffer
2182 // in the normal way.
2184 const UChar
*prevSrc
;
2189 // count code units with lccc==0
2190 for(prevSrc
=src
; src
!=limit
;) {
2191 if((c
=*src
)<minLcccCP
) {
2194 } else if(!singleLeadMightHaveNonZeroFCD16(c
)) {
2198 if(U16_IS_SURROGATE(c
)) {
2200 if(U16_IS_SURROGATE_LEAD(c
)) {
2201 if((src
+1)!=limit
&& U16_IS_TRAIL(c2
=src
[1])) {
2202 c
=U16_GET_SUPPLEMENTARY(c
, c2
);
2204 } else /* trail surrogate */ {
2205 if(prevSrc
<src
&& U16_IS_LEAD(c2
=*(src
-1))) {
2207 c
=U16_GET_SUPPLEMENTARY(c2
, c
);
// fcd16<=0xff means the high (lccc) byte is zero.
2211 if((fcd16
=getFCD16FromNormData(c
))<=0xff) {
2219 // copy these code units all at once
2221 if(buffer
!=NULL
&& !buffer
->appendZeroCC(prevSrc
, src
, errorCode
)) {
2228 // We know that the previous character's lccc==0.
2230 // Fetching the fcd16 value was deferred for this below-minLcccCP code point.
// Here prevFCD16 holds the bitwise complement of a code point rather than an FCD value;
// complement it again to get the code point back, then look it up if needed.
2231 UChar32 prev
=~prevFCD16
;
2232 if(prev
<minDecompNoCP
) {
2235 prevFCD16
=getFCD16FromNormData(prev
);
2241 const UChar
*p
=src
-1;
2242 if(U16_IS_TRAIL(*p
) && prevSrc
<p
&& U16_IS_LEAD(*(p
-1))) {
2244 // Need to fetch the previous character's FCD value because
2245 // prevFCD16 was just for the trail surrogate code point.
2246 prevFCD16
=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p
[0], p
[1]));
2247 // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
2253 // The start of the current character (c).
2255 } else if(src
==limit
) {
2260 // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
2261 // Check for proper order, and decompose locally if necessary.
2262 if((prevFCD16
&0xff)<=(fcd16
>>8)) {
2263 // proper order: prev tccc <= current lccc
2264 if((fcd16
&0xff)<=1) {
2267 if(buffer
!=NULL
&& !buffer
->appendZeroCC(c
, errorCode
)) {
2272 } else if(buffer
==NULL
) {
2273 return prevBoundary
; // quick check "no"
2276 * Back out the part of the source that we copied or appended
2277 * already but is now going to be decomposed.
2278 * prevSrc is set to after what was copied/appended.
2280 buffer
->removeSuffix((int32_t)(prevSrc
-prevBoundary
));
2282 * Find the part of the source that needs to be decomposed,
2283 * up to the next safe boundary.
2285 src
=findNextFCDBoundary(src
, limit
);
2287 * The source text does not fulfill the conditions for FCD.
2288 * Decompose and reorder a limited piece of the text.
2290 decomposeShort(prevBoundary
, src
, FALSE
, FALSE
, *buffer
, errorCode
);
2291 if (U_FAILURE(errorCode
)) {
// FCD counterpart of composeAndAppend(): appends [src, limit) to a buffer that may already
// hold text. When the buffer is not empty and src does not start at an FCD boundary, the tail
// of the buffer (from its last FCD boundary) and the head of src (up to its first FCD
// boundary) are joined into one "middle" string and run through makeFCD() together.
// NOTE(review): the two tails at the end (makeFCD() vs. plain appendZeroCC()) are presumably
// alternative branches selected by a flag - confirm which condition chooses between them.
2301 void Normalizer2Impl::makeFCDAndAppend(const UChar
*src
, const UChar
*limit
,
2303 UnicodeString
&safeMiddle
,
2304 ReorderingBuffer
&buffer
,
2305 UErrorCode
&errorCode
) const {
2306 if(!buffer
.isEmpty()) {
2307 const UChar
*firstBoundaryInSrc
=findNextFCDBoundary(src
, limit
);
2308 if(src
!=firstBoundaryInSrc
) {
2309 const UChar
*lastBoundaryInDest
=findPreviousFCDBoundary(buffer
.getStart(),
// Cut the text after the last boundary out of the buffer and keep it in `middle`.
2311 int32_t destSuffixLength
=(int32_t)(buffer
.getLimit()-lastBoundaryInDest
);
2312 UnicodeString
middle(lastBoundaryInDest
, destSuffixLength
);
2313 buffer
.removeSuffix(destSuffixLength
);
// Add the source text up to its first boundary, then process the joined piece.
2315 middle
.append(src
, (int32_t)(firstBoundaryInSrc
-src
));
2316 const UChar
*middleStart
=middle
.getBuffer();
2317 makeFCD(middleStart
, middleStart
+middle
.length(), &buffer
, errorCode
);
2318 if(U_FAILURE(errorCode
)) {
// The remaining source text starts at the first boundary in src.
2321 src
=firstBoundaryInSrc
;
2325 makeFCD(src
, limit
, &buffer
, errorCode
);
2327 if(limit
==NULL
) { // appendZeroCC() needs limit!=NULL
2328 limit
=u_strchr(src
, 0);
2330 buffer
.appendZeroCC(src
, limit
, errorCode
);
// Searches backward from p (not before start) for an FCD boundary.
// A code point below minDecompNoCP, or one with a decomposition boundary after it, puts the
// boundary at the saved codePointLimit, i.e. just after that code point.
2334 const UChar
*Normalizer2Impl::findPreviousFCDBoundary(const UChar
*start
, const UChar
*p
) const {
// Save the end of the code point before UTRIE2_U16_PREV16 moves p back over it.
2336 const UChar
*codePointLimit
= p
;
2339 UTRIE2_U16_PREV16(normTrie
, start
, p
, c
, norm16
);
2340 if (c
< minDecompNoCP
|| norm16HasDecompBoundaryAfter(norm16
)) {
2341 return codePointLimit
;
2343 if (norm16HasDecompBoundaryBefore(norm16
)) {
// Searches forward from p (up to limit) for an FCD boundary.
// A code point below minLcccCP, or one with a decomposition boundary before it, puts the
// boundary at the saved codePointStart, i.e. just before that code point.
2350 const UChar
*Normalizer2Impl::findNextFCDBoundary(const UChar
*p
, const UChar
*limit
) const {
// Save the start of the code point before UTRIE2_U16_NEXT16 advances p past it.
2352 const UChar
*codePointStart
=p
;
2355 UTRIE2_U16_NEXT16(normTrie
, p
, limit
, c
, norm16
);
2356 if (c
< minLcccCP
|| norm16HasDecompBoundaryBefore(norm16
)) {
2357 return codePointStart
;
2359 if (norm16HasDecompBoundaryAfter(norm16
)) {
2366 // CanonicalIterator data -------------------------------------------------- ***
// Creates the trie with utrie2_open() (both value arguments 0) and constructs canonStartSets
// with uprv_deleteUObject (presumably as its element deleter - confirm against UVector).
// Failures are reported through errorCode; the constructor body itself is empty.
2368 CanonIterData::CanonIterData(UErrorCode
&errorCode
) :
2369 trie(utrie2_open(0, 0, &errorCode
)),
2370 canonStartSets(uprv_deleteUObject
, NULL
, errorCode
) {}
// Destructor.
// NOTE(review): presumably releases the trie opened by the constructor - confirm.
2372 CanonIterData::~CanonIterData() {
// Records that the decomposition of `origin` starts with decompLead.
// The trie value for decompLead stores the first origin directly in its CANON_VALUE_MASK
// bits; once more origins are involved, CANON_HAS_SET is set and those bits instead hold an
// index into canonStartSets, where a UnicodeSet collects the origins.
// NOTE(review): check who owns `set` if canonStartSets.addElement() fails - it may leak.
2376 void CanonIterData::addToStartSet(UChar32 origin
, UChar32 decompLead
, UErrorCode
&errorCode
) {
2377 uint32_t canonValue
=utrie2_get32(trie
, decompLead
);
2378 if((canonValue
&(CANON_HAS_SET
|CANON_VALUE_MASK
))==0 && origin
!=0) {
2379 // origin is the first character whose decomposition starts with
2380 // the character for which we are setting the value.
2381 utrie2_set32(trie
, decompLead
, canonValue
|origin
, &errorCode
);
2383 // origin is not the first character, or it is U+0000.
2385 if((canonValue
&CANON_HAS_SET
)==0) {
2388 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
// Switch from "single origin" to "set index": save the origin stored so far, replace the
// value bits with the index the new set gets in canonStartSets, and flag CANON_HAS_SET.
2391 UChar32 firstOrigin
=(UChar32
)(canonValue
&CANON_VALUE_MASK
);
2392 canonValue
=(canonValue
&~CANON_VALUE_MASK
)|CANON_HAS_SET
|(uint32_t)canonStartSets
.size();
2393 utrie2_set32(trie
, decompLead
, canonValue
, &errorCode
);
2394 canonStartSets
.addElement(set
, errorCode
);
2395 if(firstOrigin
!=0) {
2396 set
->add(firstOrigin
);
// A set already exists: fetch it by the index stored in the value bits.
2399 set
=(UnicodeSet
*)canonStartSets
[(int32_t)(canonValue
&CANON_VALUE_MASK
)];
2405 // C++ class for friend access to private Normalizer2Impl members.
// Declares two static helpers:
//   doInit()      creates and fills impl->fCanonIterData.
//   handleRange() forwards one range of same-value code points to
//                 Normalizer2Impl::makeCanonIterDataFromNorm16().
// Both are called from non-member functions in this file, so they must be
// publicly accessible (the access specifier line is not visible in this excerpt).
2406 class InitCanonIterData
{
2408 static void doInit(Normalizer2Impl
*impl
, UErrorCode
&errorCode
);
2409 static void handleRange(Normalizer2Impl
*impl
, UChar32 start
, UChar32 end
, uint16_t value
, UErrorCode
&errorCode
);
2414 // UInitOnce instantiation function for CanonIterData
2415 static void U_CALLCONV
2416 initCanonIterData(Normalizer2Impl
*impl
, UErrorCode
&errorCode
) {
2417 InitCanonIterData::doInit(impl
, errorCode
);
2420 // Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.
2421 // context: the Normalizer2Impl
2422 static UBool U_CALLCONV
2423 enumCIDRangeHandler(const void *context
, UChar32 start
, UChar32 end
, uint32_t value
) {
2424 UErrorCode errorCode
= U_ZERO_ERROR
;
2425 if (value
!= Normalizer2Impl::INERT
) {
2426 Normalizer2Impl
*impl
= (Normalizer2Impl
*)context
;
2427 InitCanonIterData::handleRange(impl
, start
, end
, (uint16_t)value
, errorCode
);
2429 return U_SUCCESS(errorCode
);
2434 void InitCanonIterData::doInit(Normalizer2Impl
*impl
, UErrorCode
&errorCode
) {
2435 U_ASSERT(impl
->fCanonIterData
== NULL
);
2436 impl
->fCanonIterData
= new CanonIterData(errorCode
);
2437 if (impl
->fCanonIterData
== NULL
) {
2438 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
2440 if (U_SUCCESS(errorCode
)) {
2441 utrie2_enum(impl
->normTrie
, NULL
, enumCIDRangeHandler
, impl
);
2442 utrie2_freeze(impl
->fCanonIterData
->trie
, UTRIE2_32_VALUE_BITS
, &errorCode
);
2444 if (U_FAILURE(errorCode
)) {
2445 delete impl
->fCanonIterData
;
2446 impl
->fCanonIterData
= NULL
;
2450 void InitCanonIterData::handleRange(
2451 Normalizer2Impl
*impl
, UChar32 start
, UChar32 end
, uint16_t value
, UErrorCode
&errorCode
) {
2452 impl
->makeCanonIterDataFromNorm16(start
, end
, value
, *impl
->fCanonIterData
, errorCode
);
// Updates newData for the code points start..end, which all share the value norm16.
// For each code point c the current trie value is read, flag bits are ORed in,
// and the value is written back only if it changed:
//   - isMaybeOrNonZeroCC(norm16): CANON_NOT_SEGMENT_STARTER, plus
//     CANON_HAS_COMPOSITIONS when norm16<MIN_NORMAL_MAYBE_YES.
//   - norm16<minYesNo: CANON_HAS_COMPOSITIONS.
//   - otherwise (one-way decomposition): c is added to the start set of the first
//     code point of its decomposition, and the remaining code points of a one-way
//     mapping get CANON_NOT_SEGMENT_STARTER.
// Failures from the trie setters and addToStartSet() are reported through errorCode.
// NOTE(review): several statements (the early return, some else branches, the
// declarations of c2 and i, and the loop over the mapping) are not visible in this
// excerpt -- confirm against the full file.
2455 void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start
, UChar32 end
, const uint16_t norm16
,
2456 CanonIterData
&newData
,
2457 UErrorCode
&errorCode
) const {
2458 if(isInert(norm16
) || (minYesNo
<=norm16
&& norm16
<minNoNo
)) {
2459 // Inert, or 2-way mapping (including Hangul syllable).
2460 // We do not write a canonStartSet for any yesNo character.
2461 // Composites from 2-way mappings are added at runtime from the
2462 // starter's compositions list, and the other characters in
2463 // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
2464 // "maybe" characters.
2467 for(UChar32 c
=start
; c
<=end
; ++c
) {
2468 uint32_t oldValue
=utrie2_get32(newData
.trie
, c
);
2469 uint32_t newValue
=oldValue
;
2470 if(isMaybeOrNonZeroCC(norm16
)) {
2471 // not a segment starter if it occurs in a decomposition or has cc!=0
2472 newValue
|=CANON_NOT_SEGMENT_STARTER
;
2473 if(norm16
<MIN_NORMAL_MAYBE_YES
) {
2474 newValue
|=CANON_HAS_COMPOSITIONS
;
2476 } else if(norm16
<minYesNo
) {
2477 newValue
|=CANON_HAS_COMPOSITIONS
;
2479 // c has a one-way decomposition
2481 // Do not modify the whole-range norm16 value.
2482 uint16_t norm16_2
=norm16
;
2483 if (isDecompNoAlgorithmic(norm16_2
)) {
2484 // Maps to an isCompYesAndZeroCC.
2485 c2
= mapAlgorithmic(c2
, norm16_2
);
2486 norm16_2
= getNorm16(c2
);
2487 // No compatibility mappings for the CanonicalIterator.
2488 U_ASSERT(!(isHangulLV(norm16_2
) || isHangulLVT(norm16_2
)));
2490 if (norm16_2
> minYesNo
) {
2491 // c decomposes, get everything from the variable-length extra data
2492 const uint16_t *mapping
=getMapping(norm16_2
);
2493 uint16_t firstUnit
=*mapping
;
2494 int32_t length
=firstUnit
&MAPPING_LENGTH_MASK
;
// When the mapping has a ccc/lccc word, it is the unit just before firstUnit;
// its low byte is tested here as the cc of the original character.
2495 if((firstUnit
&MAPPING_HAS_CCC_LCCC_WORD
)!=0) {
2496 if(c
==c2
&& (*(mapping
-1)&0xff)!=0) {
2497 newValue
|=CANON_NOT_SEGMENT_STARTER
; // original c has cc!=0
2500 // Skip empty mappings (no characters in the decomposition).
2502 ++mapping
; // skip over the firstUnit
2503 // add c to first code point's start set
2505 U16_NEXT_UNSAFE(mapping
, i
, c2
);
2506 newData
.addToStartSet(c
, c2
, errorCode
);
2507 // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
2508 // one-way mapping. A 2-way mapping is possible here after
2509 // intermediate algorithmic mapping.
2510 if(norm16_2
>=minNoNo
) {
2512 U16_NEXT_UNSAFE(mapping
, i
, c2
);
2513 uint32_t c2Value
=utrie2_get32(newData
.trie
, c2
);
// Only write the trie when the flag is not already set.
2514 if((c2Value
&CANON_NOT_SEGMENT_STARTER
)==0) {
2515 utrie2_set32(newData
.trie
, c2
, c2Value
|CANON_NOT_SEGMENT_STARTER
,
2522 // c decomposed to c2 algorithmically; c has cc==0
2523 newData
.addToStartSet(c
, c2
, errorCode
);
// Write back only when flags were actually added for c.
2526 if(newValue
!=oldValue
) {
2527 utrie2_set32(newData
.trie
, c
, newValue
, &errorCode
);
2532 UBool
Normalizer2Impl::ensureCanonIterData(UErrorCode
&errorCode
) const {
2533 // Logically const: Synchronized instantiation.
2534 Normalizer2Impl
*me
=const_cast<Normalizer2Impl
*>(this);
2535 umtx_initOnce(me
->fCanonIterDataInitOnce
, &initCanonIterData
, me
, errorCode
);
2536 return U_SUCCESS(errorCode
);
2539 int32_t Normalizer2Impl::getCanonValue(UChar32 c
) const {
2540 return (int32_t)utrie2_get32(fCanonIterData
->trie
, c
);
2543 const UnicodeSet
&Normalizer2Impl::getCanonStartSet(int32_t n
) const {
2544 return *(const UnicodeSet
*)fCanonIterData
->canonStartSets
[n
];
2547 UBool
Normalizer2Impl::isCanonSegmentStarter(UChar32 c
) const {
2548 return getCanonValue(c
)>=0;
// Adds to `set` the characters recorded for c in the canonical-iterator data.
// The CANON_NOT_SEGMENT_STARTER flag is masked out of c's canon value first.
//   - CANON_HAS_SET: the CANON_VALUE_MASK bits are an index of a stored start set,
//     which is added in full.
//   - CANON_HAS_COMPOSITIONS: for a JAMO_L, a range of JAMO_VT_COUNT Hangul
//     syllables is added; composites from c's compositions list are added via
//     addComposites() (presumably in the non-JAMO_L branch; the else is not visible).
// NOTE(review): the handling of a zero canon value, the body of the value!=0
// branch, the declaration of `syllable` and the return statements are not visible
// in this excerpt -- confirm against the full file.
2551 UBool
Normalizer2Impl::getCanonStartSet(UChar32 c
, UnicodeSet
&set
) const {
2552 int32_t canonValue
=getCanonValue(c
)&~CANON_NOT_SEGMENT_STARTER
;
2557 int32_t value
=canonValue
&CANON_VALUE_MASK
;
2558 if((canonValue
&CANON_HAS_SET
)!=0) {
2559 set
.addAll(getCanonStartSet(value
));
2560 } else if(value
!=0) {
2563 if((canonValue
&CANON_HAS_COMPOSITIONS
)!=0) {
2564 uint16_t norm16
=getNorm16(c
);
2565 if(norm16
==JAMO_L
) {
// First syllable for this leading consonant: HANGUL_BASE plus JAMO_VT_COUNT per L index.
2567 (UChar32
)(Hangul::HANGUL_BASE
+(c
-Hangul::JAMO_L_BASE
)*Hangul::JAMO_VT_COUNT
);
2568 set
.add(syllable
, syllable
+Hangul::JAMO_VT_COUNT
-1);
2570 addComposites(getCompositionsList(norm16
), set
);
2578 // Normalizer2 data swapping ----------------------------------------------- ***
/*
 * Swaps Normalizer2 ("Nrm2") data with the given UDataSwapper.
 *
 * Steps visible below:
 *  - udata_swapDataHeader() swaps the header and validates the arguments.
 *  - The data format must be "Nrm2" with format version 1..3, otherwise
 *    U_UNSUPPORTED_ERROR is set.
 *  - The minimum number of indexes depends on the format version; too little
 *    data sets U_INDEX_OUTOFBOUNDS_ERROR.
 *  - The indexes are read, the total size is taken from IX_TOTAL_SIZE, and the
 *    sections are swapped in order: int32_t indexes[], the UTrie2,
 *    uint16_t extraData[]; the uint8_t smallFCD[] needs no swapping.
 *
 * Returns headerSize+size on the visible success path.
 *
 * NOTE(review): several declarations (headerSize, outBytes), conditions, offset
 * updates and early returns are not visible in this excerpt -- confirm against
 * the full file.
 */
2582 U_CAPI
int32_t U_EXPORT2
2583 unorm2_swap(const UDataSwapper
*ds
,
2584 const void *inData
, int32_t length
, void *outData
,
2585 UErrorCode
*pErrorCode
) {
2586 const UDataInfo
*pInfo
;
2589 const uint8_t *inBytes
;
2592 const int32_t *inIndexes
;
2593 int32_t indexes
[Normalizer2Impl::IX_TOTAL_SIZE
+1];
2595 int32_t i
, offset
, nextOffset
, size
;
2597 /* udata_swapDataHeader checks the arguments */
2598 headerSize
=udata_swapDataHeader(ds
, inData
, length
, outData
, pErrorCode
);
2599 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
2603 /* check data format and format version */
2604 pInfo
=(const UDataInfo
*)((const char *)inData
+4);
2605 uint8_t formatVersion0
=pInfo
->formatVersion
[0];
2607 pInfo
->dataFormat
[0]==0x4e && /* dataFormat="Nrm2" */
2608 pInfo
->dataFormat
[1]==0x72 &&
2609 pInfo
->dataFormat
[2]==0x6d &&
2610 pInfo
->dataFormat
[3]==0x32 &&
2611 (1<=formatVersion0
&& formatVersion0
<=3)
2613 udata_printError(ds
, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",
2614 pInfo
->dataFormat
[0], pInfo
->dataFormat
[1],
2615 pInfo
->dataFormat
[2], pInfo
->dataFormat
[3],
2616 pInfo
->formatVersion
[0]);
2617 *pErrorCode
=U_UNSUPPORTED_ERROR
;
/* the payload (indexes first) starts right after the swapped header */
2621 inBytes
=(const uint8_t *)inData
+headerSize
;
2622 outBytes
=(uint8_t *)outData
+headerSize
;
2624 inIndexes
=(const int32_t *)inBytes
;
/* number of int32_t indexes that must be present for this format version */
2625 int32_t minIndexesLength
;
2626 if(formatVersion0
==1) {
2627 minIndexesLength
=Normalizer2Impl::IX_MIN_MAYBE_YES
+1;
2628 } else if(formatVersion0
==2) {
2629 minIndexesLength
=Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY
+1;
2631 minIndexesLength
=Normalizer2Impl::IX_MIN_LCCC_CP
+1;
2636 if(length
<minIndexesLength
*4) {
2637 udata_printError(ds
, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n",
2639 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
2644 /* read the first few indexes */
2645 for(i
=0; i
<UPRV_LENGTHOF(indexes
); ++i
) {
2646 indexes
[i
]=udata_readInt32(ds
, inIndexes
[i
]);
2649 /* get the total length of the data */
2650 size
=indexes
[Normalizer2Impl::IX_TOTAL_SIZE
];
2654 udata_printError(ds
, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n",
2656 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
2660 /* copy the data for inaccessible bytes */
2661 if(inBytes
!=outBytes
) {
2662 uprv_memcpy(outBytes
, inBytes
, size
);
2667 /* swap the int32_t indexes[] */
2668 nextOffset
=indexes
[Normalizer2Impl::IX_NORM_TRIE_OFFSET
];
2669 ds
->swapArray32(ds
, inBytes
, nextOffset
-offset
, outBytes
, pErrorCode
);
2672 /* swap the UTrie2 */
2673 nextOffset
=indexes
[Normalizer2Impl::IX_EXTRA_DATA_OFFSET
];
2674 utrie2_swap(ds
, inBytes
+offset
, nextOffset
-offset
, outBytes
+offset
, pErrorCode
);
2677 /* swap the uint16_t extraData[] */
2678 nextOffset
=indexes
[Normalizer2Impl::IX_SMALL_FCD_OFFSET
];
2679 ds
->swapArray16(ds
, inBytes
+offset
, nextOffset
-offset
, outBytes
+offset
, pErrorCode
);
2682 /* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */
2683 nextOffset
=indexes
[Normalizer2Impl::IX_SMALL_FCD_OFFSET
+1];
/* all sections accounted for: the running offset must equal the total size */
2686 U_ASSERT(offset
==size
);
2689 return headerSize
+size
;
2692 #endif // !UCONFIG_NO_NORMALIZATION