1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
6 * Copyright (C) 2009-2014, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 *******************************************************************************
10 * file name: normalizer2impl.cpp
12 * tab size: 8 (not used)
15 * created on: 2009nov22
16 * created by: Markus W. Scherer
19 // #define UCPTRIE_DEBUG
21 #include "unicode/utypes.h"
23 #if !UCONFIG_NO_NORMALIZATION
25 #include "unicode/bytestream.h"
26 #include "unicode/edits.h"
27 #include "unicode/normalizer2.h"
28 #include "unicode/stringoptions.h"
29 #include "unicode/ucptrie.h"
30 #include "unicode/udata.h"
31 #include "unicode/umutablecptrie.h"
32 #include "unicode/ustring.h"
33 #include "unicode/utf16.h"
34 #include "unicode/utf8.h"
35 #include "bytesinkutil.h"
38 #include "normalizer2impl.h"
41 #include "ucptrie_impl.h"
50 * UTF-8 lead byte for minNoMaybeCP.
51 * Can be lower than the actual lead byte for c.
52 * Typically U+0300 for NFC/NFD, U+00A0 for NFKC/NFKD, U+0041 for NFKC_Casefold.
// Maps code point c to a UTF-8 lead byte value.
// Only the two-byte branch is visible in this view: for c<=0x7ff the result is
// 0xc0 plus the bits of c above its low six bits.
// NOTE(review): the branches for the other ranges of c are on lines not shown here.
54 inline uint8_t leadByteForCP(UChar32 c
) {
57 } else if (c
<= 0x7ff) {
58 return (uint8_t)(0xc0+(c
>>6));
60 // Should not occur because ccc(U+0300)!=0.
66 * Returns the code point from one single well-formed UTF-8 byte sequence
67 * between cpStart and cpLimit.
69 * Trie UTF-8 macros do not assemble whole code points (for efficiency).
70 * When we do need the code point, we call this function.
71 * We should not need it for normalization-inert data (norm16==0).
72 * Illegal sequences yield the error value norm16==0 just like real normalization-inert code points.
// Assembles a code point from the byte sequence [cpStart, cpLimit[.
// The sequence length (cpLimit-cpStart) selects which of the return
// expressions below is used; the sequence must be non-empty (asserted).
// No validation is done here: the trail bytes are only masked with 0x3f.
// NOTE(review): the declaration of c and the case labels are on lines not
// shown here; c is presumably the lead byte *cpStart -- confirm.
74 UChar32
codePointFromValidUTF8(const uint8_t *cpStart
, const uint8_t *cpLimit
) {
75 // Similar to U8_NEXT_UNSAFE(s, i, c).
76 U_ASSERT(cpStart
< cpLimit
);
78 switch(cpLimit
-cpStart
) {
// One trail byte: low 5 bits of c above the low 6 bits of cpStart[1].
82 return ((c
&0x1f)<<6) | (cpStart
[1]&0x3f);
// Two trail bytes: the (UChar) cast discards everything above bit 15,
// which is why c is shifted without being masked first.
84 // no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar)
85 return (UChar
)((c
<<12) | ((cpStart
[1]&0x3f)<<6) | (cpStart
[2]&0x3f));
// Three trail bytes: low 3 bits of c, then 6 bits from each trail byte.
87 return ((c
&7)<<18) | ((cpStart
[1]&0x3f)<<12) | ((cpStart
[2]&0x3f)<<6) | (cpStart
[3]&0x3f);
// Any other length violates the caller's contract.
89 UPRV_UNREACHABLE
; // Should not occur.
94 * Returns the last code point in [start, p[ if it is valid and in U+1000..U+D7FF.
95 * Otherwise returns a negative value.
// Decodes a three-byte UTF-8 sequence near the end of [start, p[ and returns
// its code point when the byte pattern matches the test below.
// NOTE(review): l, t1 and t2 are declared on lines not shown here, and the
// indexes p[1]/p[2] suggest p is first moved back by three bytes so that l is
// the lead byte -- confirm against the full source.
97 UChar32
previousHangulOrJamo(const uint8_t *start
, const uint8_t *p
) {
// A three-byte sequence needs at least three bytes before p.
98 if ((p
- start
) >= 3) {
// Accept lead bytes E1..ED with two trail bytes in 80..BF (each trail minus
// 0x80 must be <=0x3f). With lead byte ED the first trail is limited to
// 80..9F (t1<=0x1f), so the largest result is 0xD7FF; the smallest, with
// lead byte E1 and both trail bytes 80, is 0x1000.
102 if (0xe1 <= l
&& l
<= 0xed &&
103 (t1
= (uint8_t)(p
[1] - 0x80)) <= 0x3f &&
104 (t2
= (uint8_t)(p
[2] - 0x80)) <= 0x3f &&
105 (l
< 0xed || t1
<= 0x1f)) {
// Low 4 bits of the lead byte, then the two 6-bit trail values.
106 return ((l
& 0xf) << 12) | (t1
<< 6) | t2
;
113 * Returns the offset from the Jamo T base if [src, limit[ starts with a single Jamo T code point.
114 * Otherwise returns a negative value.
// Recognizes a Jamo T at src from its raw UTF-8 bytes, without decoding a
// full code point first.
// NOTE(review): t is declared on lines not shown here; it is presumably the
// third byte src[2]. The return for the E1 86 branch and the final "not a
// Jamo T" return are also not visible in this view.
116 int32_t getJamoTMinusBase(const uint8_t *src
, const uint8_t *limit
) {
117 // Jamo T: E1 86 A8..E1 87 82
// Need three bytes, and the first must be E1.
118 if ((limit
- src
) >= 3 && *src
== 0xe1) {
119 if (src
[1] == 0x86) {
121 // The first Jamo T is U+11A8 but JAMO_T_BASE is 11A7.
122 // Offset 0 does not correspond to any conjoining Jamo.
// Second byte 86: third byte A8..BF.
123 if (0xa8 <= t
&& t
<= 0xbf) {
126 } else if (src
[1] == 0x87) {
// Second byte 87: third byte 80..82. Comparing as int8_t makes the bytes
// 80..82 the only values <= (int8_t)0x82, because 00..7F are non-negative
// and 83..FF are larger negative values.
128 if ((int8_t)t
<= (int8_t)0x82u
) {
// Second byte 87 puts the code point 0x40 above the 86 block, hence the
// 0x40 correction to the 0xa7 base byte.
129 return t
- (0xa7 - 0x40);
// Writes the code point encoded in [cpStart, cpLimit[, shifted by delta, to
// sink as UTF-8, and records the replacement in edits when edits is non-null.
// Three paths are visible: a single byte computed directly, a shortcut that
// patches only the last trail byte, and a full decode/add/re-encode.
// NOTE(review): the return type, the declaration of length and the
// conditions selecting the first and third paths are on lines not shown here.
137 appendCodePointDelta(const uint8_t *cpStart
, const uint8_t *cpLimit
, int32_t delta
,
138 ByteSink
&sink
, Edits
*edits
) {
// Scratch space for one encoded code point.
139 char buffer
[U8_MAX_LENGTH
];
// Length of the original sequence, reported to edits below.
141 int32_t cpLength
= (int32_t)(cpLimit
- cpStart
);
143 // The builder makes ASCII map to ASCII.
144 buffer
[0] = (uint8_t)(*cpStart
+ delta
);
// Try adding delta to the last byte only.
147 int32_t trail
= *(cpLimit
-1) + delta
;
// Still a trail byte (80..BF): the other bytes can be copied unchanged.
148 if (0x80 <= trail
&& trail
<= 0xbf) {
149 // The delta only changes the last trail byte.
// NOTE(review): this loop copies up to cpLimit, so cpLimit has presumably
// been stepped back over the last byte on a line not shown here -- confirm.
152 do { buffer
[length
++] = *cpStart
++; } while (cpStart
< cpLimit
);
153 buffer
[length
++] = (uint8_t)trail
;
155 // Decode the code point, add the delta, re-encode.
156 UChar32 c
= codePointFromValidUTF8(cpStart
, cpLimit
) + delta
;
158 U8_APPEND_UNSAFE(buffer
, length
, c
);
// edits is optional; when present, record old length -> new length.
161 if (edits
!= nullptr) {
162 edits
->addReplace(cpLength
, length
);
164 sink
.Append(buffer
, length
);
169 // ReorderingBuffer -------------------------------------------------------- ***
// Constructs a buffer over a string, asking it for a writable buffer with
// getBuffer(8). start, reorderStart and limit all begin at that buffer's
// start, remainingCapacity is the string's capacity, and lastCC is 0.
// If no buffer could be obtained and errorCode does not already hold a
// failure, errorCode is set to U_MEMORY_ALLOCATION_ERROR; an earlier failure
// code is left untouched.
// NOTE(review): the initializers that bind ni and dest to members (impl and
// str, presumably) are on a line not shown here.
171 ReorderingBuffer::ReorderingBuffer(const Normalizer2Impl
&ni
, UnicodeString
&dest
,
172 UErrorCode
&errorCode
) :
174 start(str
.getBuffer(8)), reorderStart(start
), limit(start
),
175 remainingCapacity(str
.getCapacity()), lastCC(0) {
176 if (start
== nullptr && U_SUCCESS(errorCode
)) {
177 // getBuffer() already did str.setToBogus()
178 errorCode
= U_MEMORY_ALLOCATION_ERROR
;
// Prepares the buffer for appending after the string's existing contents.
// Requests a writable buffer of destCapacity units, reports
// U_MEMORY_ALLOCATION_ERROR through errorCode on failure, and sets
// remainingCapacity to the capacity beyond the existing length.
// NOTE(review): the failure test, the assignments to limit/lastCC and the
// return statements are on lines not shown here.
182 UBool
ReorderingBuffer::init(int32_t destCapacity
, UErrorCode
&errorCode
) {
// Length before getBuffer() is called; used for the capacity computation below.
183 int32_t length
=str
.length();
184 start
=str
.getBuffer(destCapacity
);
186 // getBuffer() already did str.setToBogus()
187 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
191 remainingCapacity
=str
.getCapacity()-length
;
198 // Set reorderStart after the last code point with cc<=1 if there is one.
// Walk backward while previousCC() reports cc>1; the loop body is empty
// because previousCC() itself moves the iterator.
200 while(previousCC()>1) {}
// codePointLimit is then the end of the code point that stopped the walk.
202 reorderStart
=codePointLimit
;
// Compares the buffer contents [start, limit[ with the UTF-16 text
// [otherStart, otherLimit[: the lengths must match, and then u_memcmp()
// must report the two ranges as equal.
207 UBool
ReorderingBuffer::equals(const UChar
*otherStart
, const UChar
*otherLimit
) const {
208 int32_t length
=(int32_t)(limit
-start
);
// The length test comes first, so u_memcmp() only runs on equal-length ranges.
210 length
==(int32_t)(otherLimit
-otherStart
) &&
211 0==u_memcmp(start
, otherStart
, length
);
// Compares the UTF-16 buffer contents [start, limit[ with the UTF-8 text
// [otherStart, otherLimit[, code point by code point.
// The caller must keep the UTF-8 length within int32_t range (asserted).
// NOTE(review): the declarations of c and other, the test guarding the first
// return inside the loop, the comparison of c with other and the mismatch
// returns are on lines not shown here.
214 UBool
ReorderingBuffer::equals(const uint8_t *otherStart
, const uint8_t *otherLimit
) const {
215 U_ASSERT((otherLimit
- otherStart
) <= INT32_MAX
); // ensured by caller
216 int32_t length
= (int32_t)(limit
- start
);
217 int32_t otherLength
= (int32_t)(otherLimit
- otherStart
);
// Cheap length screen before decoding anything.
218 // For equal strings, UTF-8 is at least as long as UTF-16, and at most three times as long.
219 if (otherLength
< length
|| (otherLength
/ 3) > length
) {
222 // Compare valid strings from between normalization boundaries.
223 // (Invalid sequences are normalization-inert.)
// i indexes the UTF-16 side, j the UTF-8 side.
224 for (int32_t i
= 0, j
= 0;;) {
// Presumably reached when the UTF-16 side is exhausted: the strings are then
// equal only if the UTF-8 side is exhausted too.
226 return j
>= otherLength
;
227 } else if (j
>= otherLength
) {
230 // Not at the end of either string yet.
// Decode one code point from each side; the _UNSAFE macros do no validation.
232 U16_NEXT_UNSAFE(start
, i
, c
);
233 U8_NEXT_UNSAFE(otherStart
, j
, other
);
// Appends supplementary code point c (two UTF-16 code units) with combining
// class cc. Grows the buffer first when fewer than two units remain.
// When c can simply go at the end (cc==0 or cc>=lastCC) its lead and trail
// surrogates are written at limit.
// NOTE(review): the updates of limit/lastCC/reorderStart, the branch for
// out-of-order cc and the return statements are on lines not shown here.
240 UBool
ReorderingBuffer::appendSupplementary(UChar32 c
, uint8_t cc
, UErrorCode
&errorCode
) {
// resize() is attempted only when there is not already room for two units.
241 if(remainingCapacity
<2 && !resize(2, errorCode
)) {
244 if(lastCC
<=cc
|| cc
==0) {
245 limit
[0]=U16_LEAD(c
);
246 limit
[1]=U16_TRAIL(c
);
// Two code units were consumed on either path.
255 remainingCapacity
-=2;
// Appends the UTF-16 sequence s[0..length[ whose first code point has
// combining class leadCC and whose last has trailCC.
// If the sequence is already in order relative to the buffer end
// (leadCC==0 or leadCC>=lastCC) it is copied in bulk; otherwise its code
// points are placed one at a time so that canonical order is kept.
// isNFD chooses how the combining class of each following code point is
// looked up (two lookups are visible below).
// NOTE(review): several lines of this function are not shown here, including
// the length==0 handling, the condition guarding the first reorderStart
// assignment, the loop around the second U16_NEXT, the isNFD test itself
// and the return statements.
259 UBool
ReorderingBuffer::append(const UChar
*s
, int32_t length
, UBool isNFD
,
260 uint8_t leadCC
, uint8_t trailCC
,
261 UErrorCode
&errorCode
) {
// Grow only when the sequence does not fit.
265 if(remainingCapacity
<length
&& !resize(length
, errorCode
)) {
268 remainingCapacity
-=length
;
// In-order case: no reordering against existing text is needed.
269 if(lastCC
<=leadCC
|| leadCC
==0) {
271 reorderStart
=limit
+length
;
272 } else if(leadCC
<=1) {
273 reorderStart
=limit
+1; // Ok if not a code point boundary.
// Bulk copy of all code units.
275 const UChar
*sLimit
=s
+length
;
276 do { *limit
++=*s
++; } while(s
!=sLimit
);
// Out-of-order case: the first code point goes through insert().
281 U16_NEXT(s
, i
, length
, c
);
282 insert(c
, leadCC
); // insert first code point
// Following code points: look up each one's combining class, then append it.
284 U16_NEXT(s
, i
, length
, c
);
287 leadCC
= Normalizer2Impl::getCCFromYesOrMaybe(impl
.getRawNorm16(c
));
289 leadCC
= impl
.getCC(impl
.getNorm16(c
));
294 append(c
, leadCC
, errorCode
);
// Appends one code point c, growing the buffer when U16_LENGTH(c) units
// do not fit.
// NOTE(review): the BMP branch, the updates of limit/lastCC/reorderStart and
// the return statements are on lines not shown here; only the surrogate-pair
// store is visible.
300 UBool
ReorderingBuffer::appendZeroCC(UChar32 c
, UErrorCode
&errorCode
) {
// Number of UTF-16 code units c needs.
301 int32_t cpLength
=U16_LENGTH(c
);
302 if(remainingCapacity
<cpLength
&& !resize(cpLength
, errorCode
)) {
305 remainingCapacity
-=cpLength
;
// Supplementary code point: store lead and trail surrogates.
309 limit
[0]=U16_LEAD(c
);
310 limit
[1]=U16_TRAIL(c
);
// Appends the UTF-16 range [s, sLimit[ with a single u_memcpy(), growing
// the buffer first when the range does not fit.
// NOTE(review): the empty-range check, the updates of limit/lastCC/
// reorderStart and the return statements are on lines not shown here.
318 UBool
ReorderingBuffer::appendZeroCC(const UChar
*s
, const UChar
*sLimit
, UErrorCode
&errorCode
) {
322 int32_t length
=(int32_t)(sLimit
-s
);
323 if(remainingCapacity
<length
&& !resize(length
, errorCode
)) {
326 u_memcpy(limit
, s
, length
);
328 remainingCapacity
-=length
;
// Empties the buffer: reorderStart and limit go back to start, and the
// whole string capacity becomes available again.
334 void ReorderingBuffer::remove() {
335 reorderStart
=limit
=start
;
336 remainingCapacity
=str
.getCapacity();
// Drops the last suffixLength code units from the buffer.
// When the suffix is shorter than the contents, the freed units are added
// back to remainingCapacity; otherwise the buffer becomes empty and the full
// capacity is available.
// NOTE(review): the adjustments of limit/reorderStart/lastCC are on lines
// not shown here.
340 void ReorderingBuffer::removeSuffix(int32_t suffixLength
) {
341 if(suffixLength
<(limit
-start
)) {
343 remainingCapacity
+=suffixLength
;
346 remainingCapacity
=str
.getCapacity();
// Grows the underlying string so that appendLength more code units fit,
// then re-bases the buffer pointers on the new storage.
// On allocation failure errorCode is set to U_MEMORY_ALLOCATION_ERROR.
// NOTE(review): the failure test, the re-basing of limit and the return
// statements are on lines not shown here.
352 UBool
ReorderingBuffer::resize(int32_t appendLength
, UErrorCode
&errorCode
) {
// Save positions as indexes; the raw pointers are re-derived after the
// new getBuffer() call below.
353 int32_t reorderStartIndex
=(int32_t)(reorderStart
-start
);
354 int32_t length
=(int32_t)(limit
-start
);
// Hand the current contents back to the string before asking for more room.
355 str
.releaseBuffer(length
);
// NOTE(review): length+appendLength and 2*getCapacity() are plain int32_t
// arithmetic with no overflow guard visible here.
356 int32_t newCapacity
=length
+appendLength
;
357 int32_t doubleCapacity
=2*str
.getCapacity();
// Grow at least geometrically.
358 if(newCapacity
<doubleCapacity
) {
359 newCapacity
=doubleCapacity
;
// Small requests are raised to a floor (presumably 256; the assignment is
// not visible here).
361 if(newCapacity
<256) {
364 start
=str
.getBuffer(newCapacity
);
366 // getBuffer() already did str.setToBogus()
367 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
370 reorderStart
=start
+reorderStartIndex
;
// Capacity left after the length units already in use.
372 remainingCapacity
=str
.getCapacity()-length
;
// Moves the backward iterator over one code point without looking up data:
// codePointLimit takes the old codePointStart, and codePointStart steps back
// one code unit.
// The test below detects a trail surrogate preceded, within the buffer, by a
// lead surrogate; the statement it guards is on a line not shown here
// (presumably a second step back over the lead surrogate).
376 void ReorderingBuffer::skipPrevious() {
377 codePointLimit
=codePointStart
;
378 UChar c
=*--codePointStart
;
379 if(U16_IS_TRAIL(c
) && start
<codePointStart
&& U16_IS_LEAD(*(codePointStart
-1))) {
// Steps the backward iterator over one code point and returns the value of
// impl.getCCFromYesOrMaybeCP() for it.
// NOTE(review): the result returned when the iterator has already reached
// reorderStart, the declaration of c2 and the step over the lead surrogate
// are on lines not shown here.
384 uint8_t ReorderingBuffer::previousCC() {
385 codePointLimit
=codePointStart
;
// Nothing before reorderStart is examined.
386 if(reorderStart
>=codePointStart
) {
389 UChar32 c
=*--codePointStart
;
// Trail surrogate with a lead surrogate before it inside the buffer:
// combine the pair into one supplementary code point.
391 if(U16_IS_TRAIL(c
) && start
<codePointStart
&& U16_IS_LEAD(c2
=*(codePointStart
-1))) {
393 c
=U16_GET_SUPPLEMENTARY(c2
, c
);
395 return impl
.getCCFromYesOrMaybeCP(c
);
398 // Inserts c somewhere before the last character.
399 // Requires 0<cc<lastCC which implies reorderStart<limit.
// Places c (combining class cc) at its canonical-order position before the
// end of the buffer and extends limit by U16_LENGTH(c).
// NOTE(review): the declaration of q, the body of the do-loop that shifts the
// following text, and the final bookkeeping are on lines not shown here.
400 void ReorderingBuffer::insert(UChar32 c
, uint8_t cc
) {
// Start at the end, skip the last code point unconditionally, then keep
// stepping back while the previous code point's class is greater than cc.
// The loop body is empty because previousCC() moves the iterator.
401 for(setIterator(), skipPrevious(); previousCC()>cc
;) {}
402 // insert c at codePointLimit, after the character with prevCC<=cc
// limit grows first; r is the new end, from which text is shifted.
404 UChar
*r
=limit
+=U16_LENGTH(c
);
// Shifting stops once q has come back to the insertion point.
407 } while(codePointLimit
!=q
);
408 writeCodePoint(q
, c
);
414 // Normalizer2Impl --------------------------------------------------------- ***
416 struct CanonIterData
: public UMemory
{
417 CanonIterData(UErrorCode
&errorCode
);
419 void addToStartSet(UChar32 origin
, UChar32 decompLead
, UErrorCode
&errorCode
);
420 UMutableCPTrie
*mutableTrie
;
422 UVector canonStartSets
; // contains UnicodeSet *
425 Normalizer2Impl::~Normalizer2Impl() {
426 delete fCanonIterData
;
// Loads the thresholds in inIndexes into member fields and sets up the
// extra-data pointers from inExtraData.
// NOTE(review): the return type and the uses of inTrie and inSmallFCD are on
// lines not shown here.
// NOTE(review): every index value is narrowed with static_cast (to UChar for
// code points, to uint16_t for norm16 thresholds); a value that does not fit
// would be truncated silently. Presumably the data builder guarantees the
// range -- confirm.
430 Normalizer2Impl::init(const int32_t *inIndexes
, const UCPTrie
*inTrie
,
431 const uint16_t *inExtraData
, const uint8_t *inSmallFCD
) {
// Code point thresholds.
432 minDecompNoCP
= static_cast<UChar
>(inIndexes
[IX_MIN_DECOMP_NO_CP
]);
433 minCompNoMaybeCP
= static_cast<UChar
>(inIndexes
[IX_MIN_COMP_NO_MAYBE_CP
]);
434 minLcccCP
= static_cast<UChar
>(inIndexes
[IX_MIN_LCCC_CP
]);
// norm16 value thresholds.
436 minYesNo
= static_cast<uint16_t>(inIndexes
[IX_MIN_YES_NO
]);
437 minYesNoMappingsOnly
= static_cast<uint16_t>(inIndexes
[IX_MIN_YES_NO_MAPPINGS_ONLY
]);
438 minNoNo
= static_cast<uint16_t>(inIndexes
[IX_MIN_NO_NO
]);
439 minNoNoCompBoundaryBefore
= static_cast<uint16_t>(inIndexes
[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE
]);
440 minNoNoCompNoMaybeCC
= static_cast<uint16_t>(inIndexes
[IX_MIN_NO_NO_COMP_NO_MAYBE_CC
]);
441 minNoNoEmpty
= static_cast<uint16_t>(inIndexes
[IX_MIN_NO_NO_EMPTY
]);
442 limitNoNo
= static_cast<uint16_t>(inIndexes
[IX_LIMIT_NO_NO
]);
443 minMaybeYes
= static_cast<uint16_t>(inIndexes
[IX_MIN_MAYBE_YES
]);
444 U_ASSERT((minMaybeYes
& 7) == 0); // 8-aligned for noNoDelta bit fields
// Derived from minMaybeYes, DELTA_SHIFT and MAX_DELTA.
445 centerNoNoDelta
= (minMaybeYes
>> DELTA_SHIFT
) - MAX_DELTA
- 1;
// maybeYesCompositions is the start of inExtraData; extraData begins
// (MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT units after it.
449 maybeYesCompositions
=inExtraData
;
450 extraData
=maybeYesCompositions
+((MIN_NORMAL_MAYBE_YES
-minMaybeYes
)>>OFFSET_SHIFT
);
// Value-mapping callback: reduces a trie value to its
// CANON_NOT_SEGMENT_STARTER bit so that ranges are compared on that bit
// alone. The context argument is unused.
457 static uint32_t U_CALLCONV
458 segmentStarterMapper(const void * /*context*/, uint32_t value
) {
459 return value
&CANON_NOT_SEGMENT_STARTER
;
465 Normalizer2Impl::addLcccChars(UnicodeSet
&set
) const {
466 UChar32 start
= 0, end
;
468 while ((end
= ucptrie_getRange(normTrie
, start
, UCPMAP_RANGE_FIXED_LEAD_SURROGATES
, INERT
,
469 nullptr, nullptr, &norm16
)) >= 0) {
470 if (norm16
> Normalizer2Impl::MIN_NORMAL_MAYBE_YES
&&
471 norm16
!= Normalizer2Impl::JAMO_VT
) {
473 } else if (minNoNoCompNoMaybeCC
<= norm16
&& norm16
< limitNoNo
) {
474 uint16_t fcd16
= getFCD16(start
);
475 if (fcd16
> 0xff) { set
.add(start
, end
); }
482 Normalizer2Impl::addPropertyStarts(const USetAdder
*sa
, UErrorCode
& /*errorCode*/) const {
483 // Add the start code point of each same-value range of the trie.
484 UChar32 start
= 0, end
;
486 while ((end
= ucptrie_getRange(normTrie
, start
, UCPMAP_RANGE_FIXED_LEAD_SURROGATES
, INERT
,
487 nullptr, nullptr, &value
)) >= 0) {
488 sa
->add(sa
->set
, start
);
489 if (start
!= end
&& isAlgorithmicNoNo((uint16_t)value
) &&
490 (value
& Normalizer2Impl::DELTA_TCCC_MASK
) > Normalizer2Impl::DELTA_TCCC_1
) {
491 // Range of code points with same-norm16-value algorithmic decompositions.
492 // They might have different non-zero FCD16 values.
493 uint16_t prevFCD16
= getFCD16(start
);
494 while (++start
<= end
) {
495 uint16_t fcd16
= getFCD16(start
);
496 if (fcd16
!= prevFCD16
) {
497 sa
->add(sa
->set
, start
);
505 /* add Hangul LV syllables and LV+1 because of skippables */
506 for(UChar c
=Hangul::HANGUL_BASE
; c
<Hangul::HANGUL_LIMIT
; c
+=Hangul::JAMO_T_COUNT
) {
508 sa
->add(sa
->set
, c
+1);
510 sa
->add(sa
->set
, Hangul::HANGUL_LIMIT
); /* add Hangul+1 to continue with other properties */
514 Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder
*sa
, UErrorCode
&errorCode
) const {
515 // Add the start code point of each same-value range of the canonical iterator data trie.
516 if (!ensureCanonIterData(errorCode
)) { return; }
517 // Currently only used for the SEGMENT_STARTER property.
518 UChar32 start
= 0, end
;
520 while ((end
= ucptrie_getRange(fCanonIterData
->trie
, start
, UCPMAP_RANGE_NORMAL
, 0,
521 segmentStarterMapper
, nullptr, &value
)) >= 0) {
522 sa
->add(sa
->set
, start
);
528 Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar
*src
,
529 UChar32 minNeedDataCP
,
530 ReorderingBuffer
*buffer
,
531 UErrorCode
&errorCode
) const {
532 // Make some effort to support NUL-terminated strings reasonably.
533 // Take the part of the fast quick check loop that does not look up
534 // data and check the first part of the string.
535 // After this prefix, determine the string length to simplify the rest
537 const UChar
*prevSrc
=src
;
539 while((c
=*src
++)<minNeedDataCP
&& c
!=0) {}
540 // Back out the last character for full processing.
544 buffer
->appendZeroCC(prevSrc
, src
, errorCode
);
551 Normalizer2Impl::decompose(const UnicodeString
&src
, UnicodeString
&dest
,
552 UErrorCode
&errorCode
) const {
553 if(U_FAILURE(errorCode
)) {
557 const UChar
*sArray
=src
.getBuffer();
558 if(&dest
==&src
|| sArray
==NULL
) {
559 errorCode
=U_ILLEGAL_ARGUMENT_ERROR
;
563 decompose(sArray
, sArray
+src
.length(), dest
, src
.length(), errorCode
);
568 Normalizer2Impl::decompose(const UChar
*src
, const UChar
*limit
,
570 int32_t destLengthEstimate
,
571 UErrorCode
&errorCode
) const {
572 if(destLengthEstimate
<0 && limit
!=NULL
) {
573 destLengthEstimate
=(int32_t)(limit
-src
);
576 ReorderingBuffer
buffer(*this, dest
);
577 if(buffer
.init(destLengthEstimate
, errorCode
)) {
578 decompose(src
, limit
, &buffer
, errorCode
);
582 // Dual functionality:
583 // buffer!=NULL: normalize
584 // buffer==NULL: isNormalized/spanQuickCheckYes
586 Normalizer2Impl::decompose(const UChar
*src
, const UChar
*limit
,
587 ReorderingBuffer
*buffer
,
588 UErrorCode
&errorCode
) const {
589 UChar32 minNoCP
=minDecompNoCP
;
591 src
=copyLowPrefixFromNulTerminated(src
, minNoCP
, buffer
, errorCode
);
592 if(U_FAILURE(errorCode
)) {
595 limit
=u_strchr(src
, 0);
598 const UChar
*prevSrc
;
602 // only for quick check
603 const UChar
*prevBoundary
=src
;
607 // count code units below the minimum or with irrelevant data for the quick check
608 for(prevSrc
=src
; src
!=limit
;) {
609 if( (c
=*src
)<minNoCP
||
610 isMostDecompYesAndZeroCC(norm16
=UCPTRIE_FAST_BMP_GET(normTrie
, UCPTRIE_16
, c
))
613 } else if(!U16_IS_LEAD(c
)) {
617 if((src
+1)!=limit
&& U16_IS_TRAIL(c2
=src
[1])) {
618 c
=U16_GET_SUPPLEMENTARY(c
, c2
);
619 norm16
=UCPTRIE_FAST_SUPP_GET(normTrie
, UCPTRIE_16
, c
);
620 if(isMostDecompYesAndZeroCC(norm16
)) {
626 ++src
; // unpaired lead surrogate: inert
630 // copy these code units all at once
633 if(!buffer
->appendZeroCC(prevSrc
, src
, errorCode
)) {
645 // Check one above-minimum, relevant code point.
648 if(!decompose(c
, norm16
, *buffer
, errorCode
)) {
652 if(isDecompYes(norm16
)) {
653 uint8_t cc
=getCCFromYesOrMaybe(norm16
);
654 if(prevCC
<=cc
|| cc
==0) {
662 return prevBoundary
; // "no" or cc out of order
668 // Decompose a short piece of text which is likely to contain characters that
669 // fail the quick check loop and/or where the quick check loop's overhead
670 // is unlikely to be amortized.
671 // Called by the compose() and makeFCD() implementations.
673 Normalizer2Impl::decomposeShort(const UChar
*src
, const UChar
*limit
,
674 UBool stopAtCompBoundary
, UBool onlyContiguous
,
675 ReorderingBuffer
&buffer
, UErrorCode
&errorCode
) const {
676 if (U_FAILURE(errorCode
)) {
680 if (stopAtCompBoundary
&& *src
< minCompNoMaybeCP
) {
683 const UChar
*prevSrc
= src
;
686 UCPTRIE_FAST_U16_NEXT(normTrie
, UCPTRIE_16
, src
, limit
, c
, norm16
);
687 if (stopAtCompBoundary
&& norm16HasCompBoundaryBefore(norm16
)) {
690 if(!decompose(c
, norm16
, buffer
, errorCode
)) {
693 if (stopAtCompBoundary
&& norm16HasCompBoundaryAfter(norm16
, onlyContiguous
)) {
700 UBool
Normalizer2Impl::decompose(UChar32 c
, uint16_t norm16
,
701 ReorderingBuffer
&buffer
,
702 UErrorCode
&errorCode
) const {
703 // get the decomposition and the lead and trail cc's
704 if (norm16
>= limitNoNo
) {
705 if (isMaybeOrNonZeroCC(norm16
)) {
706 return buffer
.append(c
, getCCFromYesOrMaybe(norm16
), errorCode
);
708 // Maps to an isCompYesAndZeroCC.
709 c
=mapAlgorithmic(c
, norm16
);
710 norm16
=getRawNorm16(c
);
712 if (norm16
< minYesNo
) {
713 // c does not decompose
714 return buffer
.append(c
, 0, errorCode
);
715 } else if(isHangulLV(norm16
) || isHangulLVT(norm16
)) {
716 // Hangul syllable: decompose algorithmically
718 return buffer
.appendZeroCC(jamos
, jamos
+Hangul::decompose(c
, jamos
), errorCode
);
720 // c decomposes, get everything from the variable-length extra data
721 const uint16_t *mapping
=getMapping(norm16
);
722 uint16_t firstUnit
=*mapping
;
723 int32_t length
=firstUnit
&MAPPING_LENGTH_MASK
;
724 uint8_t leadCC
, trailCC
;
725 trailCC
=(uint8_t)(firstUnit
>>8);
726 if(firstUnit
&MAPPING_HAS_CCC_LCCC_WORD
) {
727 leadCC
=(uint8_t)(*(mapping
-1)>>8);
731 return buffer
.append((const UChar
*)mapping
+1, length
, TRUE
, leadCC
, trailCC
, errorCode
);
735 Normalizer2Impl::decomposeShort(const uint8_t *src
, const uint8_t *limit
,
736 UBool stopAtCompBoundary
, UBool onlyContiguous
,
737 ReorderingBuffer
&buffer
, UErrorCode
&errorCode
) const {
738 if (U_FAILURE(errorCode
)) {
741 while (src
< limit
) {
742 const uint8_t *prevSrc
= src
;
744 UCPTRIE_FAST_U8_NEXT(normTrie
, UCPTRIE_16
, src
, limit
, norm16
);
745 // Get the decomposition and the lead and trail cc's.
746 UChar32 c
= U_SENTINEL
;
747 if (norm16
>= limitNoNo
) {
748 if (isMaybeOrNonZeroCC(norm16
)) {
749 // No boundaries around this character.
750 c
= codePointFromValidUTF8(prevSrc
, src
);
751 if (!buffer
.append(c
, getCCFromYesOrMaybe(norm16
), errorCode
)) {
756 // Maps to an isCompYesAndZeroCC.
757 if (stopAtCompBoundary
) {
760 c
= codePointFromValidUTF8(prevSrc
, src
);
761 c
= mapAlgorithmic(c
, norm16
);
762 norm16
= getRawNorm16(c
);
763 } else if (stopAtCompBoundary
&& norm16
< minNoNoCompNoMaybeCC
) {
766 // norm16!=INERT guarantees that [prevSrc, src[ is valid UTF-8.
767 // We do not see invalid UTF-8 here because
768 // its norm16==INERT is normalization-inert,
769 // so it gets copied unchanged in the fast path,
770 // and we stop the slow path where invalid UTF-8 begins.
771 U_ASSERT(norm16
!= INERT
);
772 if (norm16
< minYesNo
) {
774 c
= codePointFromValidUTF8(prevSrc
, src
);
776 // does not decompose
777 if (!buffer
.append(c
, 0, errorCode
)) {
780 } else if (isHangulLV(norm16
) || isHangulLVT(norm16
)) {
781 // Hangul syllable: decompose algorithmically
783 c
= codePointFromValidUTF8(prevSrc
, src
);
786 if (!buffer
.appendZeroCC(jamos
, jamos
+Hangul::decompose(c
, jamos
), errorCode
)) {
790 // The character decomposes, get everything from the variable-length extra data.
791 const uint16_t *mapping
= getMapping(norm16
);
792 uint16_t firstUnit
= *mapping
;
793 int32_t length
= firstUnit
& MAPPING_LENGTH_MASK
;
794 uint8_t trailCC
= (uint8_t)(firstUnit
>> 8);
796 if (firstUnit
& MAPPING_HAS_CCC_LCCC_WORD
) {
797 leadCC
= (uint8_t)(*(mapping
-1) >> 8);
801 if (!buffer
.append((const char16_t *)mapping
+1, length
, TRUE
, leadCC
, trailCC
, errorCode
)) {
805 if (stopAtCompBoundary
&& norm16HasCompBoundaryAfter(norm16
, onlyContiguous
)) {
813 Normalizer2Impl::getDecomposition(UChar32 c
, UChar buffer
[4], int32_t &length
) const {
815 if(c
<minDecompNoCP
|| isMaybeOrNonZeroCC(norm16
=getNorm16(c
))) {
816 // c does not decompose
819 const UChar
*decomp
= nullptr;
820 if(isDecompNoAlgorithmic(norm16
)) {
821 // Maps to an isCompYesAndZeroCC.
822 c
=mapAlgorithmic(c
, norm16
);
825 U16_APPEND_UNSAFE(buffer
, length
, c
);
826 // The mapping might decompose further.
827 norm16
= getRawNorm16(c
);
829 if (norm16
< minYesNo
) {
831 } else if(isHangulLV(norm16
) || isHangulLVT(norm16
)) {
832 // Hangul syllable: decompose algorithmically
833 length
=Hangul::decompose(c
, buffer
);
836 // c decomposes, get everything from the variable-length extra data
837 const uint16_t *mapping
=getMapping(norm16
);
838 length
=*mapping
&MAPPING_LENGTH_MASK
;
839 return (const UChar
*)mapping
+1;
842 // The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1
843 // so that a raw mapping fits that consists of one unit ("rm0")
844 // plus all but the first two code units of the normal mapping.
845 // The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK.
847 Normalizer2Impl::getRawDecomposition(UChar32 c
, UChar buffer
[30], int32_t &length
) const {
849 if(c
<minDecompNoCP
|| isDecompYes(norm16
=getNorm16(c
))) {
850 // c does not decompose
852 } else if(isHangulLV(norm16
) || isHangulLVT(norm16
)) {
853 // Hangul syllable: decompose algorithmically
854 Hangul::getRawDecomposition(c
, buffer
);
857 } else if(isDecompNoAlgorithmic(norm16
)) {
858 c
=mapAlgorithmic(c
, norm16
);
860 U16_APPEND_UNSAFE(buffer
, length
, c
);
863 // c decomposes, get everything from the variable-length extra data
864 const uint16_t *mapping
=getMapping(norm16
);
865 uint16_t firstUnit
=*mapping
;
866 int32_t mLength
=firstUnit
&MAPPING_LENGTH_MASK
; // length of normal mapping
867 if(firstUnit
&MAPPING_HAS_RAW_MAPPING
) {
868 // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.
869 // Bit 7=MAPPING_HAS_CCC_LCCC_WORD
870 const uint16_t *rawMapping
=mapping
-((firstUnit
>>7)&1)-1;
871 uint16_t rm0
=*rawMapping
;
872 if(rm0
<=MAPPING_LENGTH_MASK
) {
874 return (const UChar
*)rawMapping
-rm0
;
876 // Copy the normal mapping and replace its first two code units with rm0.
877 buffer
[0]=(UChar
)rm0
;
878 u_memcpy(buffer
+1, (const UChar
*)mapping
+1+2, mLength
-2);
884 return (const UChar
*)mapping
+1;
888 void Normalizer2Impl::decomposeAndAppend(const UChar
*src
, const UChar
*limit
,
890 UnicodeString
&safeMiddle
,
891 ReorderingBuffer
&buffer
,
892 UErrorCode
&errorCode
) const {
893 buffer
.copyReorderableSuffixTo(safeMiddle
);
895 decompose(src
, limit
, &buffer
, errorCode
);
898 // Just merge the strings at the boundary.
900 uint8_t firstCC
= 0, prevCC
= 0, cc
;
901 const UChar
*p
= src
;
903 const UChar
*codePointStart
= p
;
906 UCPTRIE_FAST_U16_NEXT(normTrie
, UCPTRIE_16
, p
, limit
, c
, norm16
);
907 if ((cc
= getCC(norm16
)) == 0) {
917 if(limit
==NULL
) { // appendZeroCC() needs limit!=NULL
918 limit
=u_strchr(p
, 0);
921 if (buffer
.append(src
, (int32_t)(p
- src
), FALSE
, firstCC
, prevCC
, errorCode
)) {
922 buffer
.appendZeroCC(p
, limit
, errorCode
);
// Returns whether there is a decomposition boundary before code point c.
// Two cheap tests run first: c below minLcccCP, or a BMP code point for
// which singleLeadMightHaveNonZeroFCD16() is false. Only when both fail is
// the norm16 value of c looked up and passed to
// norm16HasDecompBoundaryBefore().
926 UBool
Normalizer2Impl::hasDecompBoundaryBefore(UChar32 c
) const {
927 return c
< minLcccCP
|| (c
<= 0xffff && !singleLeadMightHaveNonZeroFCD16(c
)) ||
928 norm16HasDecompBoundaryBefore(getNorm16(c
));
// Returns whether a character with this norm16 value has a decomposition
// boundary before it.
// NOTE(review): the result for norm16<minNoNoCompNoMaybeCC is on a line not
// shown here (presumably TRUE).
931 UBool
Normalizer2Impl::norm16HasDecompBoundaryBefore(uint16_t norm16
) const {
932 if (norm16
< minNoNoCompNoMaybeCC
) {
// At or above limitNoNo: boundary iff norm16 is at most
// MIN_NORMAL_MAYBE_YES or equals JAMO_VT.
935 if (norm16
>= limitNoNo
) {
936 return norm16
<= MIN_NORMAL_MAYBE_YES
|| norm16
== JAMO_VT
;
938 // c decomposes, get everything from the variable-length extra data
939 const uint16_t *mapping
=getMapping(norm16
);
940 uint16_t firstUnit
=*mapping
;
// The unit before the mapping is consulted only when firstUnit has the
// MAPPING_HAS_CCC_LCCC_WORD flag; its upper byte must then be zero.
941 // TRUE if leadCC==0 (hasFCDBoundaryBefore())
942 return (firstUnit
&MAPPING_HAS_CCC_LCCC_WORD
)==0 || (*(mapping
-1)&0xff00)==0;
// Returns whether there is a decomposition boundary after code point c.
// Two cheap tests on c run before the norm16 lookup: c below minDecompNoCP,
// and a BMP code point for which singleLeadMightHaveNonZeroFCD16() is false.
// NOTE(review): the results returned by those two early exits are on lines
// not shown here (presumably TRUE).
945 UBool
Normalizer2Impl::hasDecompBoundaryAfter(UChar32 c
) const {
946 if (c
< minDecompNoCP
) {
949 if (c
<= 0xffff && !singleLeadMightHaveNonZeroFCD16(c
)) {
952 return norm16HasDecompBoundaryAfter(getNorm16(c
));
// Returns whether a character with this norm16 value has a decomposition
// boundary after it.
// NOTE(review): the result of the first early exit (norm16<=minYesNo or a
// Hangul LVT syllable) is on a line not shown here (presumably TRUE).
955 UBool
Normalizer2Impl::norm16HasDecompBoundaryAfter(uint16_t norm16
) const {
956 if(norm16
<= minYesNo
|| isHangulLVT(norm16
)) {
// At or above limitNoNo there are two sub-cases.
959 if (norm16
>= limitNoNo
) {
// Maybe or non-zero-cc value: same test as the before-boundary check.
960 if (isMaybeOrNonZeroCC(norm16
)) {
961 return norm16
<= MIN_NORMAL_MAYBE_YES
|| norm16
== JAMO_VT
;
// Otherwise the answer comes from the DELTA_TCCC bits of norm16.
963 // Maps to an isCompYesAndZeroCC.
964 return (norm16
& DELTA_TCCC_MASK
) <= DELTA_TCCC_1
;
966 // c decomposes, get everything from the variable-length extra data
967 const uint16_t *mapping
=getMapping(norm16
);
968 uint16_t firstUnit
=*mapping
;
// The comparisons below act on the upper byte of firstUnit, which the
// existing comments identify as trailCC: above 0x1ff means trailCC>1,
// at most 0xff means trailCC==0.
969 // decomp after-boundary: same as hasFCDBoundaryAfter(),
970 // fcd16<=1 || trailCC==0
971 if(firstUnit
>0x1ff) {
972 return FALSE
; // trailCC>1
974 if(firstUnit
<=0xff) {
975 return TRUE
; // trailCC==0
// Remaining case, 0x100..0x1ff: decided by the lead combining class.
977 // if(trailCC==1) test leadCC==0, same as checking for before-boundary
978 // TRUE if leadCC==0 (hasFCDBoundaryBefore())
979 return (firstUnit
&MAPPING_HAS_CCC_LCCC_WORD
)==0 || (*(mapping
-1)&0xff00)==0;
983 * Finds the recomposition result for
984 * a forward-combining "lead" character,
985 * specified with a pointer to its compositions list,
986 * and a backward-combining "trail" character.
988 * If the lead and trail characters combine, then this function returns
989 * the following "compositeAndFwd" value:
990 * Bits 21..1 composite character
991 * Bit 0 set if the composite is a forward-combining starter
992 * otherwise it returns -1.
994 * The compositions list has (trail, compositeAndFwd) pair entries,
995 * encoded as either pairs or triples of 16-bit units.
996 * The last entry has the high bit of its first unit set.
998 * The list is sorted by ascending trail characters (there are no duplicates).
999 * A linear search is used.
1001 * See normalizer2impl.h for a more detailed description
1002 * of the compositions list format.
1004 int32_t Normalizer2Impl::combine(const uint16_t *list
, UChar32 trail
) {
1005 uint16_t key1
, firstUnit
;
1006 if(trail
<COMP_1_TRAIL_LIMIT
) {
1007 // trail character is 0..33FF
1008 // result entry may have 2 or 3 units
1009 key1
=(uint16_t)(trail
<<1);
1010 while(key1
>(firstUnit
=*list
)) {
1011 list
+=2+(firstUnit
&COMP_1_TRIPLE
);
1013 if(key1
==(firstUnit
&COMP_1_TRAIL_MASK
)) {
1014 if(firstUnit
&COMP_1_TRIPLE
) {
1015 return ((int32_t)list
[1]<<16)|list
[2];
1021 // trail character is 3400..10FFFF
1022 // result entry has 3 units
1023 key1
=(uint16_t)(COMP_1_TRAIL_LIMIT
+
1024 (((trail
>>COMP_1_TRAIL_SHIFT
))&
1026 uint16_t key2
=(uint16_t)(trail
<<COMP_2_TRAIL_SHIFT
);
1027 uint16_t secondUnit
;
1029 if(key1
>(firstUnit
=*list
)) {
1030 list
+=2+(firstUnit
&COMP_1_TRIPLE
);
1031 } else if(key1
==(firstUnit
&COMP_1_TRAIL_MASK
)) {
1032 if(key2
>(secondUnit
=list
[1])) {
1033 if(firstUnit
&COMP_1_LAST_TUPLE
) {
1038 } else if(key2
==(secondUnit
&COMP_2_TRAIL_MASK
)) {
1039 return ((int32_t)(secondUnit
&~COMP_2_TRAIL_MASK
)<<16)|list
[2];
1052 * @param list some character's compositions list
1053 * @param set recursively receives the composites from these compositions
1055 void Normalizer2Impl::addComposites(const uint16_t *list
, UnicodeSet
&set
) const {
1057 int32_t compositeAndFwd
;
1060 if((firstUnit
&COMP_1_TRIPLE
)==0) {
1061 compositeAndFwd
=list
[1];
1064 compositeAndFwd
=(((int32_t)list
[1]&~COMP_2_TRAIL_MASK
)<<16)|list
[2];
1067 UChar32 composite
=compositeAndFwd
>>1;
1068 if((compositeAndFwd
&1)!=0) {
1069 addComposites(getCompositionsListForComposite(getRawNorm16(composite
)), set
);
1072 } while((firstUnit
&COMP_1_LAST_TUPLE
)==0);
1076 * Recomposes the buffer text starting at recomposeStartIndex
1077 * (which is in NFD - decomposed and canonically ordered),
1078 * and truncates the buffer contents.
1080 * Note that recomposition never lengthens the text:
1081 * Any character consists of either one or two code units;
1082 * a composition may contain at most one more code unit than the original starter,
1083 * while the combining mark that is removed has at least one code unit.
1085 void Normalizer2Impl::recompose(ReorderingBuffer
&buffer
, int32_t recomposeStartIndex
,
1086 UBool onlyContiguous
) const {
1087 UChar
*p
=buffer
.getStart()+recomposeStartIndex
;
1088 UChar
*limit
=buffer
.getLimit();
1093 UChar
*starter
, *pRemove
, *q
, *r
;
1094 const uint16_t *compositionsList
;
1095 UChar32 c
, compositeAndFwd
;
1098 UBool starterIsSupplementary
;
1100 // Some of the following variables are not used until we have a forward-combining starter
1101 // and are only initialized now to avoid compiler warnings.
1102 compositionsList
=NULL
; // used as indicator for whether we have a forward-combining starter
1104 starterIsSupplementary
=FALSE
;
1108 UCPTRIE_FAST_U16_NEXT(normTrie
, UCPTRIE_16
, p
, limit
, c
, norm16
);
1109 cc
=getCCFromYesOrMaybe(norm16
);
1110 if( // this character combines backward and
1112 // we have seen a starter that combines forward and
1113 compositionsList
!=NULL
&&
1114 // the backward-combining character is not blocked
1115 (prevCC
<cc
|| prevCC
==0)
1117 if(isJamoVT(norm16
)) {
1118 // c is a Jamo V/T, see if we can compose it with the previous character.
1119 if(c
<Hangul::JAMO_T_BASE
) {
1120 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
1121 UChar prev
=(UChar
)(*starter
-Hangul::JAMO_L_BASE
);
1122 if(prev
<Hangul::JAMO_L_COUNT
) {
1124 UChar syllable
=(UChar
)
1125 (Hangul::HANGUL_BASE
+
1126 (prev
*Hangul::JAMO_V_COUNT
+(c
-Hangul::JAMO_V_BASE
))*
1127 Hangul::JAMO_T_COUNT
);
1129 if(p
!=limit
&& (t
=(UChar
)(*p
-Hangul::JAMO_T_BASE
))<Hangul::JAMO_T_COUNT
) {
1131 syllable
+=t
; // The next character was a Jamo T.
1134 // remove the Jamo V/T
1145 * No "else" for Jamo T:
1146 * Since the input is in NFD, there are no Hangul LV syllables that
1147 * a Jamo T could combine with.
1148 * All Jamo Ts are combined above when handling Jamo Vs.
1153 compositionsList
=NULL
;
1155 } else if((compositeAndFwd
=combine(compositionsList
, c
))>=0) {
1156 // The starter and the combining mark (c) do combine.
1157 UChar32 composite
=compositeAndFwd
>>1;
1159 // Replace the starter with the composite, remove the combining mark.
1160 pRemove
=p
-U16_LENGTH(c
); // pRemove & p: start & limit of the combining mark
1161 if(starterIsSupplementary
) {
1162 if(U_IS_SUPPLEMENTARY(composite
)) {
1163 // both are supplementary
1164 starter
[0]=U16_LEAD(composite
);
1165 starter
[1]=U16_TRAIL(composite
);
1167 *starter
=(UChar
)composite
;
1168 // The composite is shorter than the starter,
1169 // move the intermediate characters forward one.
1170 starterIsSupplementary
=FALSE
;
1178 } else if(U_IS_SUPPLEMENTARY(composite
)) {
1179 // The composite is longer than the starter,
1180 // move the intermediate characters back one.
1181 starterIsSupplementary
=TRUE
;
1182 ++starter
; // temporarily increment for the loop boundary
1188 *starter
=U16_TRAIL(composite
);
1189 *--starter
=U16_LEAD(composite
); // undo the temporary increment
1191 // both are on the BMP
1192 *starter
=(UChar
)composite
;
1195 /* remove the combining mark by moving the following text over it */
1205 // Keep prevCC because we removed the combining mark.
1210 // Is the composite a starter that combines forward?
1211 if(compositeAndFwd
&1) {
1213 getCompositionsListForComposite(getRawNorm16(composite
));
1215 compositionsList
=NULL
;
1218 // We combined; continue with looking for compositions.
1223 // no combination this time
1229 // If c did not combine, then check if it is a starter.
1231 // Found a new starter.
1232 if((compositionsList
=getCompositionsListForDecompYes(norm16
))!=NULL
) {
1233 // It may combine with something, prepare for it.
1235 starterIsSupplementary
=FALSE
;
1238 starterIsSupplementary
=TRUE
;
1242 } else if(onlyContiguous
) {
1243 // FCC: no discontiguous compositions; any intervening character blocks.
1244 compositionsList
=NULL
;
1247 buffer
.setReorderingLimit(limit
);
1251 Normalizer2Impl::composePair(UChar32 a
, UChar32 b
) const {
1252 uint16_t norm16
=getNorm16(a
); // maps an out-of-range 'a' to inert norm16
1253 const uint16_t *list
;
1254 if(isInert(norm16
)) {
1256 } else if(norm16
<minYesNoMappingsOnly
) {
1257 // a combines forward.
1258 if(isJamoL(norm16
)) {
1259 b
-=Hangul::JAMO_V_BASE
;
1260 if(0<=b
&& b
<Hangul::JAMO_V_COUNT
) {
1262 (Hangul::HANGUL_BASE
+
1263 ((a
-Hangul::JAMO_L_BASE
)*Hangul::JAMO_V_COUNT
+b
)*
1264 Hangul::JAMO_T_COUNT
);
1268 } else if(isHangulLV(norm16
)) {
1269 b
-=Hangul::JAMO_T_BASE
;
1270 if(0<b
&& b
<Hangul::JAMO_T_COUNT
) { // not b==0!
1276 // 'a' has a compositions list in extraData
1277 list
=getMapping(norm16
);
1278 if(norm16
>minYesNo
) { // composite 'a' has both mapping & compositions list
1279 list
+= // mapping pointer
1280 1+ // +1 to skip the first unit with the mapping length
1281 (*list
&MAPPING_LENGTH_MASK
); // + mapping length
1284 } else if(norm16
<minMaybeYes
|| MIN_NORMAL_MAYBE_YES
<=norm16
) {
1287 list
=getCompositionsListForMaybe(norm16
);
1289 if(b
<0 || 0x10ffff<b
) { // combine(list, b) requires a valid code point b
1292 #if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC
1293 return combine(list
, b
)>>1;
1295 int32_t compositeAndFwd
=combine(list
, b
);
1296 return compositeAndFwd
>=0 ? compositeAndFwd
>>1 : U_SENTINEL
;
1300 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
1301 // doCompose: normalize
1302 // !doCompose: isNormalized (buffer must be empty and initialized)
1304 Normalizer2Impl::compose(const UChar
*src
, const UChar
*limit
,
1305 UBool onlyContiguous
,
1307 ReorderingBuffer
&buffer
,
1308 UErrorCode
&errorCode
) const {
1309 const UChar
*prevBoundary
=src
;
1310 UChar32 minNoMaybeCP
=minCompNoMaybeCP
;
1312 src
=copyLowPrefixFromNulTerminated(src
, minNoMaybeCP
,
1313 doCompose
? &buffer
: NULL
,
1315 if(U_FAILURE(errorCode
)) {
1318 limit
=u_strchr(src
, 0);
1319 if (prevBoundary
!= src
) {
1320 if (hasCompBoundaryAfter(*(src
-1), onlyContiguous
)) {
1323 buffer
.removeSuffix(1);
1324 prevBoundary
= --src
;
1330 // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
1331 // or with (compYes && ccc==0) properties.
1332 const UChar
*prevSrc
;
1334 uint16_t norm16
= 0;
1337 if (prevBoundary
!= limit
&& doCompose
) {
1338 buffer
.appendZeroCC(prevBoundary
, limit
, errorCode
);
1342 if( (c
=*src
)<minNoMaybeCP
||
1343 isCompYesAndZeroCC(norm16
=UCPTRIE_FAST_BMP_GET(normTrie
, UCPTRIE_16
, c
))
1348 if(!U16_IS_LEAD(c
)) {
1352 if(src
!=limit
&& U16_IS_TRAIL(c2
=*src
)) {
1354 c
=U16_GET_SUPPLEMENTARY(c
, c2
);
1355 norm16
=UCPTRIE_FAST_SUPP_GET(normTrie
, UCPTRIE_16
, c
);
1356 if(!isCompYesAndZeroCC(norm16
)) {
1363 // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1364 // The current character is either a "noNo" (has a mapping)
1365 // or a "maybeYes" (combines backward)
1366 // or a "yesYes" with ccc!=0.
1367 // It is not a Hangul syllable or Jamo L because those have "yes" properties.
1369 // Medium-fast path: Handle cases that do not require full decomposition and recomposition.
1370 if (!isMaybeOrNonZeroCC(norm16
)) { // minNoNo <= norm16 < minMaybeYes
1374 // Fast path for mapping a character that is immediately surrounded by boundaries.
1375 // In this case, we need not decompose around the current character.
1376 if (isDecompNoAlgorithmic(norm16
)) {
1377 // Maps to a single isCompYesAndZeroCC character
1378 // which also implies hasCompBoundaryBefore.
1379 if (norm16HasCompBoundaryAfter(norm16
, onlyContiguous
) ||
1380 hasCompBoundaryBefore(src
, limit
)) {
1381 if (prevBoundary
!= prevSrc
&& !buffer
.appendZeroCC(prevBoundary
, prevSrc
, errorCode
)) {
1384 if(!buffer
.append(mapAlgorithmic(c
, norm16
), 0, errorCode
)) {
1390 } else if (norm16
< minNoNoCompBoundaryBefore
) {
1391 // The mapping is comp-normalized which also implies hasCompBoundaryBefore.
1392 if (norm16HasCompBoundaryAfter(norm16
, onlyContiguous
) ||
1393 hasCompBoundaryBefore(src
, limit
)) {
1394 if (prevBoundary
!= prevSrc
&& !buffer
.appendZeroCC(prevBoundary
, prevSrc
, errorCode
)) {
1397 const UChar
*mapping
= reinterpret_cast<const UChar
*>(getMapping(norm16
));
1398 int32_t length
= *mapping
++ & MAPPING_LENGTH_MASK
;
1399 if(!buffer
.appendZeroCC(mapping
, mapping
+ length
, errorCode
)) {
1405 } else if (norm16
>= minNoNoEmpty
) {
1406 // The current character maps to nothing.
1407 // Simply omit it from the output if there is a boundary before _or_ after it.
1408 // The character itself implies no boundaries.
1409 if (hasCompBoundaryBefore(src
, limit
) ||
1410 hasCompBoundaryAfter(prevBoundary
, prevSrc
, onlyContiguous
)) {
1411 if (prevBoundary
!= prevSrc
&& !buffer
.appendZeroCC(prevBoundary
, prevSrc
, errorCode
)) {
1418 // Other "noNo" type, or need to examine more text around this character:
1419 // Fall through to the slow path.
1420 } else if (isJamoVT(norm16
) && prevBoundary
!= prevSrc
) {
1421 UChar prev
=*(prevSrc
-1);
1422 if(c
<Hangul::JAMO_T_BASE
) {
1423 // The current character is a Jamo Vowel,
1424 // compose with previous Jamo L and following Jamo T.
1425 UChar l
= (UChar
)(prev
-Hangul::JAMO_L_BASE
);
1426 if(l
<Hangul::JAMO_L_COUNT
) {
1432 0 < (t
= ((int32_t)*src
- Hangul::JAMO_T_BASE
)) &&
1433 t
< Hangul::JAMO_T_COUNT
) {
1434 // The next character is a Jamo T.
1436 } else if (hasCompBoundaryBefore(src
, limit
)) {
1437 // No Jamo T follows, not even via decomposition.
1443 UChar32 syllable
= Hangul::HANGUL_BASE
+
1444 (l
*Hangul::JAMO_V_COUNT
+ (c
-Hangul::JAMO_V_BASE
)) *
1445 Hangul::JAMO_T_COUNT
+ t
;
1446 --prevSrc
; // Replace the Jamo L as well.
1447 if (prevBoundary
!= prevSrc
&& !buffer
.appendZeroCC(prevBoundary
, prevSrc
, errorCode
)) {
1450 if(!buffer
.appendBMP((UChar
)syllable
, 0, errorCode
)) {
1456 // If we see L+V+x where x!=T then we drop to the slow path,
1457 // decompose and recompose.
1458 // This is to deal with NFKC finding normal L and V but a
1459 // compatibility variant of a T.
1460 // We need to either fully compose that combination here
1461 // (which would complicate the code and may not work with strange custom data)
1462 // or use the slow path.
1464 } else if (Hangul::isHangulLV(prev
)) {
1465 // The current character is a Jamo Trailing consonant,
1466 // compose with previous Hangul LV that does not contain a Jamo T.
1470 UChar32 syllable
= prev
+ c
- Hangul::JAMO_T_BASE
;
1471 --prevSrc
; // Replace the Hangul LV as well.
1472 if (prevBoundary
!= prevSrc
&& !buffer
.appendZeroCC(prevBoundary
, prevSrc
, errorCode
)) {
1475 if(!buffer
.appendBMP((UChar
)syllable
, 0, errorCode
)) {
1481 // No matching context, or may need to decompose surrounding text first:
1482 // Fall through to the slow path.
1483 } else if (norm16
> JAMO_VT
) { // norm16 >= MIN_YES_YES_WITH_CC
1484 // One or more combining marks that do not combine-back:
1485 // Check for canonical order, copy unchanged if ok and
1486 // if followed by a character with a boundary-before.
1487 uint8_t cc
= getCCFromNormalYesOrMaybe(norm16
); // cc!=0
1488 if (onlyContiguous
/* FCC */ && getPreviousTrailCC(prevBoundary
, prevSrc
) > cc
) {
1489 // Fails FCD test, need to decompose and contiguously recompose.
1494 // If !onlyContiguous (not FCC), then we ignore the tccc of
1495 // the previous character which passed the quick check "yes && ccc==0" test.
1496 const UChar
*nextSrc
;
1501 buffer
.appendZeroCC(prevBoundary
, limit
, errorCode
);
1505 uint8_t prevCC
= cc
;
1507 UCPTRIE_FAST_U16_NEXT(normTrie
, UCPTRIE_16
, nextSrc
, limit
, c
, n16
);
1508 if (n16
>= MIN_YES_YES_WITH_CC
) {
1509 cc
= getCCFromNormalYesOrMaybe(n16
);
1521 // src is after the last in-order combining mark.
1522 // If there is a boundary here, then we continue with no change.
1523 if (norm16HasCompBoundaryBefore(n16
)) {
1524 if (isCompYesAndZeroCC(n16
)) {
1529 // Use the slow path. There is no boundary in [prevSrc, src[.
1533 // Slow path: Find the nearest boundaries around the current character,
1534 // decompose and recompose.
1535 if (prevBoundary
!= prevSrc
&& !norm16HasCompBoundaryBefore(norm16
)) {
1536 const UChar
*p
= prevSrc
;
1537 UCPTRIE_FAST_U16_PREV(normTrie
, UCPTRIE_16
, prevBoundary
, p
, c
, norm16
);
1538 if (!norm16HasCompBoundaryAfter(norm16
, onlyContiguous
)) {
1542 if (doCompose
&& prevBoundary
!= prevSrc
&& !buffer
.appendZeroCC(prevBoundary
, prevSrc
, errorCode
)) {
1545 int32_t recomposeStartIndex
=buffer
.length();
1546 // We know there is not a boundary here.
1547 decomposeShort(prevSrc
, src
, FALSE
/* !stopAtCompBoundary */, onlyContiguous
,
1549 // Decompose until the next boundary.
1550 src
= decomposeShort(src
, limit
, TRUE
/* stopAtCompBoundary */, onlyContiguous
,
1552 if (U_FAILURE(errorCode
)) {
1555 if ((src
- prevSrc
) > INT32_MAX
) { // guard before buffer.equals()
1556 errorCode
= U_INDEX_OUTOFBOUNDS_ERROR
;
1559 recompose(buffer
, recomposeStartIndex
, onlyContiguous
);
1561 if(!buffer
.equals(prevSrc
, src
)) {
1571 // Very similar to compose(): Make the same changes in both places if relevant.
1572 // pQCResult==NULL: spanQuickCheckYes
1573 // pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES)
1575 Normalizer2Impl::composeQuickCheck(const UChar
*src
, const UChar
*limit
,
1576 UBool onlyContiguous
,
1577 UNormalizationCheckResult
*pQCResult
) const {
1578 const UChar
*prevBoundary
=src
;
1579 UChar32 minNoMaybeCP
=minCompNoMaybeCP
;
1581 UErrorCode errorCode
=U_ZERO_ERROR
;
1582 src
=copyLowPrefixFromNulTerminated(src
, minNoMaybeCP
, NULL
, errorCode
);
1583 limit
=u_strchr(src
, 0);
1584 if (prevBoundary
!= src
) {
1585 if (hasCompBoundaryAfter(*(src
-1), onlyContiguous
)) {
1588 prevBoundary
= --src
;
1594 // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
1595 // or with (compYes && ccc==0) properties.
1596 const UChar
*prevSrc
;
1598 uint16_t norm16
= 0;
1603 if( (c
=*src
)<minNoMaybeCP
||
1604 isCompYesAndZeroCC(norm16
=UCPTRIE_FAST_BMP_GET(normTrie
, UCPTRIE_16
, c
))
1609 if(!U16_IS_LEAD(c
)) {
1613 if(src
!=limit
&& U16_IS_TRAIL(c2
=*src
)) {
1615 c
=U16_GET_SUPPLEMENTARY(c
, c2
);
1616 norm16
=UCPTRIE_FAST_SUPP_GET(normTrie
, UCPTRIE_16
, c
);
1617 if(!isCompYesAndZeroCC(norm16
)) {
1624 // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1625 // The current character is either a "noNo" (has a mapping)
1626 // or a "maybeYes" (combines backward)
1627 // or a "yesYes" with ccc!=0.
1628 // It is not a Hangul syllable or Jamo L because those have "yes" properties.
1630 uint16_t prevNorm16
= INERT
;
1631 if (prevBoundary
!= prevSrc
) {
1632 if (norm16HasCompBoundaryBefore(norm16
)) {
1633 prevBoundary
= prevSrc
;
1635 const UChar
*p
= prevSrc
;
1637 UCPTRIE_FAST_U16_PREV(normTrie
, UCPTRIE_16
, prevBoundary
, p
, c
, n16
);
1638 if (norm16HasCompBoundaryAfter(n16
, onlyContiguous
)) {
1639 prevBoundary
= prevSrc
;
1647 if(isMaybeOrNonZeroCC(norm16
)) {
1648 uint8_t cc
=getCCFromYesOrMaybe(norm16
);
1649 if (onlyContiguous
/* FCC */ && cc
!= 0 &&
1650 getTrailCCFromCompYesAndZeroCC(prevNorm16
) > cc
) {
1651 // The [prevBoundary..prevSrc[ character
1652 // passed the quick check "yes && ccc==0" test
1653 // but is out of canonical order with the current combining mark.
1655 // If !onlyContiguous (not FCC), then we ignore the tccc of
1656 // the previous character which passed the quick check "yes && ccc==0" test.
1657 const UChar
*nextSrc
;
1659 if (norm16
< MIN_YES_YES_WITH_CC
) {
1660 if (pQCResult
!= nullptr) {
1661 *pQCResult
= UNORM_MAYBE
;
1663 return prevBoundary
;
1669 uint8_t prevCC
= cc
;
1671 UCPTRIE_FAST_U16_NEXT(normTrie
, UCPTRIE_16
, nextSrc
, limit
, c
, norm16
);
1672 if (isMaybeOrNonZeroCC(norm16
)) {
1673 cc
= getCCFromYesOrMaybe(norm16
);
1674 if (!(prevCC
<= cc
|| cc
== 0)) {
1682 // src is after the last in-order combining mark.
1683 if (isCompYesAndZeroCC(norm16
)) {
1690 if(pQCResult
!=NULL
) {
1691 *pQCResult
=UNORM_NO
;
1693 return prevBoundary
;
1697 void Normalizer2Impl::composeAndAppend(const UChar
*src
, const UChar
*limit
,
1699 UBool onlyContiguous
,
1700 UnicodeString
&safeMiddle
,
1701 ReorderingBuffer
&buffer
,
1702 UErrorCode
&errorCode
) const {
1703 if(!buffer
.isEmpty()) {
1704 const UChar
*firstStarterInSrc
=findNextCompBoundary(src
, limit
, onlyContiguous
);
1705 if(src
!=firstStarterInSrc
) {
1706 const UChar
*lastStarterInDest
=findPreviousCompBoundary(buffer
.getStart(),
1707 buffer
.getLimit(), onlyContiguous
);
1708 int32_t destSuffixLength
=(int32_t)(buffer
.getLimit()-lastStarterInDest
);
1709 UnicodeString
middle(lastStarterInDest
, destSuffixLength
);
1710 buffer
.removeSuffix(destSuffixLength
);
1712 middle
.append(src
, (int32_t)(firstStarterInSrc
-src
));
1713 const UChar
*middleStart
=middle
.getBuffer();
1714 compose(middleStart
, middleStart
+middle
.length(), onlyContiguous
,
1715 TRUE
, buffer
, errorCode
);
1716 if(U_FAILURE(errorCode
)) {
1719 src
=firstStarterInSrc
;
1723 compose(src
, limit
, onlyContiguous
, TRUE
, buffer
, errorCode
);
1725 if(limit
==NULL
) { // appendZeroCC() needs limit!=NULL
1726 limit
=u_strchr(src
, 0);
1728 buffer
.appendZeroCC(src
, limit
, errorCode
);
1733 Normalizer2Impl::composeUTF8(uint32_t options
, UBool onlyContiguous
,
1734 const uint8_t *src
, const uint8_t *limit
,
1735 ByteSink
*sink
, Edits
*edits
, UErrorCode
&errorCode
) const {
1736 U_ASSERT(limit
!= nullptr);
1738 uint8_t minNoMaybeLead
= leadByteForCP(minCompNoMaybeCP
);
1739 const uint8_t *prevBoundary
= src
;
1742 // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point,
1743 // or with (compYes && ccc==0) properties.
1744 const uint8_t *prevSrc
;
1745 uint16_t norm16
= 0;
1748 if (prevBoundary
!= limit
&& sink
!= nullptr) {
1749 ByteSinkUtil::appendUnchanged(prevBoundary
, limit
,
1750 *sink
, options
, edits
, errorCode
);
1754 if (*src
< minNoMaybeLead
) {
1758 UCPTRIE_FAST_U8_NEXT(normTrie
, UCPTRIE_16
, src
, limit
, norm16
);
1759 if (!isCompYesAndZeroCC(norm16
)) {
1764 // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1765 // The current character is either a "noNo" (has a mapping)
1766 // or a "maybeYes" (combines backward)
1767 // or a "yesYes" with ccc!=0.
1768 // It is not a Hangul syllable or Jamo L because those have "yes" properties.
1770 // Medium-fast path: Handle cases that do not require full decomposition and recomposition.
1771 if (!isMaybeOrNonZeroCC(norm16
)) { // minNoNo <= norm16 < minMaybeYes
1772 if (sink
== nullptr) {
1775 // Fast path for mapping a character that is immediately surrounded by boundaries.
1776 // In this case, we need not decompose around the current character.
1777 if (isDecompNoAlgorithmic(norm16
)) {
1778 // Maps to a single isCompYesAndZeroCC character
1779 // which also implies hasCompBoundaryBefore.
1780 if (norm16HasCompBoundaryAfter(norm16
, onlyContiguous
) ||
1781 hasCompBoundaryBefore(src
, limit
)) {
1782 if (prevBoundary
!= prevSrc
&&
1783 !ByteSinkUtil::appendUnchanged(prevBoundary
, prevSrc
,
1784 *sink
, options
, edits
, errorCode
)) {
1787 appendCodePointDelta(prevSrc
, src
, getAlgorithmicDelta(norm16
), *sink
, edits
);
1791 } else if (norm16
< minNoNoCompBoundaryBefore
) {
1792 // The mapping is comp-normalized which also implies hasCompBoundaryBefore.
1793 if (norm16HasCompBoundaryAfter(norm16
, onlyContiguous
) ||
1794 hasCompBoundaryBefore(src
, limit
)) {
1795 if (prevBoundary
!= prevSrc
&&
1796 !ByteSinkUtil::appendUnchanged(prevBoundary
, prevSrc
,
1797 *sink
, options
, edits
, errorCode
)) {
1800 const uint16_t *mapping
= getMapping(norm16
);
1801 int32_t length
= *mapping
++ & MAPPING_LENGTH_MASK
;
1802 if (!ByteSinkUtil::appendChange(prevSrc
, src
, (const UChar
*)mapping
, length
,
1803 *sink
, edits
, errorCode
)) {
1809 } else if (norm16
>= minNoNoEmpty
) {
1810 // The current character maps to nothing.
1811 // Simply omit it from the output if there is a boundary before _or_ after it.
1812 // The character itself implies no boundaries.
1813 if (hasCompBoundaryBefore(src
, limit
) ||
1814 hasCompBoundaryAfter(prevBoundary
, prevSrc
, onlyContiguous
)) {
1815 if (prevBoundary
!= prevSrc
&&
1816 !ByteSinkUtil::appendUnchanged(prevBoundary
, prevSrc
,
1817 *sink
, options
, edits
, errorCode
)) {
1820 if (edits
!= nullptr) {
1821 edits
->addReplace((int32_t)(src
- prevSrc
), 0);
1827 // Other "noNo" type, or need to examine more text around this character:
1828 // Fall through to the slow path.
1829 } else if (isJamoVT(norm16
)) {
1830 // Jamo L: E1 84 80..92
1831 // Jamo V: E1 85 A1..B5
1832 // Jamo T: E1 86 A8..E1 87 82
1833 U_ASSERT((src
- prevSrc
) == 3 && *prevSrc
== 0xe1);
1834 UChar32 prev
= previousHangulOrJamo(prevBoundary
, prevSrc
);
1835 if (prevSrc
[1] == 0x85) {
1836 // The current character is a Jamo Vowel,
1837 // compose with previous Jamo L and following Jamo T.
1838 UChar32 l
= prev
- Hangul::JAMO_L_BASE
;
1839 if ((uint32_t)l
< Hangul::JAMO_L_COUNT
) {
1840 if (sink
== nullptr) {
1843 int32_t t
= getJamoTMinusBase(src
, limit
);
1845 // The next character is a Jamo T.
1847 } else if (hasCompBoundaryBefore(src
, limit
)) {
1848 // No Jamo T follows, not even via decomposition.
1852 UChar32 syllable
= Hangul::HANGUL_BASE
+
1853 (l
*Hangul::JAMO_V_COUNT
+ (prevSrc
[2]-0xa1)) *
1854 Hangul::JAMO_T_COUNT
+ t
;
1855 prevSrc
-= 3; // Replace the Jamo L as well.
1856 if (prevBoundary
!= prevSrc
&&
1857 !ByteSinkUtil::appendUnchanged(prevBoundary
, prevSrc
,
1858 *sink
, options
, edits
, errorCode
)) {
1861 ByteSinkUtil::appendCodePoint(prevSrc
, src
, syllable
, *sink
, edits
);
1865 // If we see L+V+x where x!=T then we drop to the slow path,
1866 // decompose and recompose.
1867 // This is to deal with NFKC finding normal L and V but a
1868 // compatibility variant of a T.
1869 // We need to either fully compose that combination here
1870 // (which would complicate the code and may not work with strange custom data)
1871 // or use the slow path.
1873 } else if (Hangul::isHangulLV(prev
)) {
1874 // The current character is a Jamo Trailing consonant,
1875 // compose with previous Hangul LV that does not contain a Jamo T.
1876 if (sink
== nullptr) {
1879 UChar32 syllable
= prev
+ getJamoTMinusBase(prevSrc
, src
);
1880 prevSrc
-= 3; // Replace the Hangul LV as well.
1881 if (prevBoundary
!= prevSrc
&&
1882 !ByteSinkUtil::appendUnchanged(prevBoundary
, prevSrc
,
1883 *sink
, options
, edits
, errorCode
)) {
1886 ByteSinkUtil::appendCodePoint(prevSrc
, src
, syllable
, *sink
, edits
);
1890 // No matching context, or may need to decompose surrounding text first:
1891 // Fall through to the slow path.
1892 } else if (norm16
> JAMO_VT
) { // norm16 >= MIN_YES_YES_WITH_CC
1893 // One or more combining marks that do not combine-back:
1894 // Check for canonical order, copy unchanged if ok and
1895 // if followed by a character with a boundary-before.
1896 uint8_t cc
= getCCFromNormalYesOrMaybe(norm16
); // cc!=0
1897 if (onlyContiguous
/* FCC */ && getPreviousTrailCC(prevBoundary
, prevSrc
) > cc
) {
1898 // Fails FCD test, need to decompose and contiguously recompose.
1899 if (sink
== nullptr) {
1903 // If !onlyContiguous (not FCC), then we ignore the tccc of
1904 // the previous character which passed the quick check "yes && ccc==0" test.
1905 const uint8_t *nextSrc
;
1909 if (sink
!= nullptr) {
1910 ByteSinkUtil::appendUnchanged(prevBoundary
, limit
,
1911 *sink
, options
, edits
, errorCode
);
1915 uint8_t prevCC
= cc
;
1917 UCPTRIE_FAST_U8_NEXT(normTrie
, UCPTRIE_16
, nextSrc
, limit
, n16
);
1918 if (n16
>= MIN_YES_YES_WITH_CC
) {
1919 cc
= getCCFromNormalYesOrMaybe(n16
);
1921 if (sink
== nullptr) {
1931 // src is after the last in-order combining mark.
1932 // If there is a boundary here, then we continue with no change.
1933 if (norm16HasCompBoundaryBefore(n16
)) {
1934 if (isCompYesAndZeroCC(n16
)) {
1939 // Use the slow path. There is no boundary in [prevSrc, src[.
1943 // Slow path: Find the nearest boundaries around the current character,
1944 // decompose and recompose.
1945 if (prevBoundary
!= prevSrc
&& !norm16HasCompBoundaryBefore(norm16
)) {
1946 const uint8_t *p
= prevSrc
;
1947 UCPTRIE_FAST_U8_PREV(normTrie
, UCPTRIE_16
, prevBoundary
, p
, norm16
);
1948 if (!norm16HasCompBoundaryAfter(norm16
, onlyContiguous
)) {
1952 ReorderingBuffer
buffer(*this, s16
, errorCode
);
1953 if (U_FAILURE(errorCode
)) {
1956 // We know there is not a boundary here.
1957 decomposeShort(prevSrc
, src
, FALSE
/* !stopAtCompBoundary */, onlyContiguous
,
1959 // Decompose until the next boundary.
1960 src
= decomposeShort(src
, limit
, TRUE
/* stopAtCompBoundary */, onlyContiguous
,
1962 if (U_FAILURE(errorCode
)) {
1965 if ((src
- prevSrc
) > INT32_MAX
) { // guard before buffer.equals()
1966 errorCode
= U_INDEX_OUTOFBOUNDS_ERROR
;
1969 recompose(buffer
, 0, onlyContiguous
);
1970 if (!buffer
.equals(prevSrc
, src
)) {
1971 if (sink
== nullptr) {
1974 if (prevBoundary
!= prevSrc
&&
1975 !ByteSinkUtil::appendUnchanged(prevBoundary
, prevSrc
,
1976 *sink
, options
, edits
, errorCode
)) {
1979 if (!ByteSinkUtil::appendChange(prevSrc
, src
, buffer
.getStart(), buffer
.length(),
1980 *sink
, edits
, errorCode
)) {
1989 UBool
Normalizer2Impl::hasCompBoundaryBefore(const UChar
*src
, const UChar
*limit
) const {
1990 if (src
== limit
|| *src
< minCompNoMaybeCP
) {
1995 UCPTRIE_FAST_U16_NEXT(normTrie
, UCPTRIE_16
, src
, limit
, c
, norm16
);
1996 return norm16HasCompBoundaryBefore(norm16
);
1999 UBool
Normalizer2Impl::hasCompBoundaryBefore(const uint8_t *src
, const uint8_t *limit
) const {
2004 UCPTRIE_FAST_U8_NEXT(normTrie
, UCPTRIE_16
, src
, limit
, norm16
);
2005 return norm16HasCompBoundaryBefore(norm16
);
2008 UBool
Normalizer2Impl::hasCompBoundaryAfter(const UChar
*start
, const UChar
*p
,
2009 UBool onlyContiguous
) const {
2015 UCPTRIE_FAST_U16_PREV(normTrie
, UCPTRIE_16
, start
, p
, c
, norm16
);
2016 return norm16HasCompBoundaryAfter(norm16
, onlyContiguous
);
2019 UBool
Normalizer2Impl::hasCompBoundaryAfter(const uint8_t *start
, const uint8_t *p
,
2020 UBool onlyContiguous
) const {
2025 UCPTRIE_FAST_U8_PREV(normTrie
, UCPTRIE_16
, start
, p
, norm16
);
2026 return norm16HasCompBoundaryAfter(norm16
, onlyContiguous
);
2029 const UChar
*Normalizer2Impl::findPreviousCompBoundary(const UChar
*start
, const UChar
*p
,
2030 UBool onlyContiguous
) const {
2031 while (p
!= start
) {
2032 const UChar
*codePointLimit
= p
;
2035 UCPTRIE_FAST_U16_PREV(normTrie
, UCPTRIE_16
, start
, p
, c
, norm16
);
2036 if (norm16HasCompBoundaryAfter(norm16
, onlyContiguous
)) {
2037 return codePointLimit
;
2039 if (hasCompBoundaryBefore(c
, norm16
)) {
2046 const UChar
*Normalizer2Impl::findNextCompBoundary(const UChar
*p
, const UChar
*limit
,
2047 UBool onlyContiguous
) const {
2048 while (p
!= limit
) {
2049 const UChar
*codePointStart
= p
;
2052 UCPTRIE_FAST_U16_NEXT(normTrie
, UCPTRIE_16
, p
, limit
, c
, norm16
);
2053 if (hasCompBoundaryBefore(c
, norm16
)) {
2054 return codePointStart
;
2056 if (norm16HasCompBoundaryAfter(norm16
, onlyContiguous
)) {
2063 uint8_t Normalizer2Impl::getPreviousTrailCC(const UChar
*start
, const UChar
*p
) const {
2067 int32_t i
= (int32_t)(p
- start
);
2069 U16_PREV(start
, 0, i
, c
);
2070 return (uint8_t)getFCD16(c
);
2073 uint8_t Normalizer2Impl::getPreviousTrailCC(const uint8_t *start
, const uint8_t *p
) const {
2077 int32_t i
= (int32_t)(p
- start
);
2079 U8_PREV(start
, 0, i
, c
);
2080 return (uint8_t)getFCD16(c
);
2083 // Note: normalizer2impl.cpp r30982 (2011-nov-27)
2084 // still had getFCDTrie() which built and cached an FCD trie.
2085 // That provided faster access to FCD data than getFCD16FromNormData()
2086 // but required synchronization and consumed some 10kB of heap memory
2087 // in any process that uses FCD (e.g., via collation).
2088 // minDecompNoCP etc. and smallFCD[] are intended to help with any loss of performance,
2089 // at least for ASCII & CJK.
2091 // Ticket 20907 - The optimizer in MSVC/Visual Studio versions below 16.4 has trouble with this
2092 // function on Windows ARM64. As a work-around, we disable optimizations for this function.
2093 // This work-around could/should be removed once the following versions of Visual Studio are no
2094 // longer supported: All versions of VS2017, and versions of VS2019 below 16.4.
2095 #if (defined(_MSC_VER) && (defined(_M_ARM64)) && (_MSC_VER < 1924))
2096 #pragma optimize( "", off )
2098 // Gets the FCD value from the regular normalization data.
2099 uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c
) const {
2100 uint16_t norm16
=getNorm16(c
);
2101 if (norm16
>= limitNoNo
) {
2102 if(norm16
>=MIN_NORMAL_MAYBE_YES
) {
2104 norm16
=getCCFromNormalYesOrMaybe(norm16
);
2105 return norm16
|(norm16
<<8);
2106 } else if(norm16
>=minMaybeYes
) {
2108 } else { // isDecompNoAlgorithmic(norm16)
2109 uint16_t deltaTrailCC
= norm16
& DELTA_TCCC_MASK
;
2110 if (deltaTrailCC
<= DELTA_TCCC_1
) {
2111 return deltaTrailCC
>> OFFSET_SHIFT
;
2113 // Maps to an isCompYesAndZeroCC.
2114 c
=mapAlgorithmic(c
, norm16
);
2115 norm16
=getRawNorm16(c
);
2118 if(norm16
<=minYesNo
|| isHangulLVT(norm16
)) {
2119 // no decomposition or Hangul syllable, all zeros
2122 // c decomposes, get everything from the variable-length extra data
2123 const uint16_t *mapping
=getMapping(norm16
);
2124 uint16_t firstUnit
=*mapping
;
2125 norm16
=firstUnit
>>8; // tccc
2126 if(firstUnit
&MAPPING_HAS_CCC_LCCC_WORD
) {
2127 norm16
|=*(mapping
-1)&0xff00; // lccc
2131 #if (defined(_MSC_VER) && (defined(_M_ARM64)) && (_MSC_VER < 1924))
2132 #pragma optimize( "", on )
2135 // Dual functionality:
2136 // buffer!=NULL: normalize
2137 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
2139 Normalizer2Impl::makeFCD(const UChar
*src
, const UChar
*limit
,
2140 ReorderingBuffer
*buffer
,
2141 UErrorCode
&errorCode
) const {
2142 // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
2143 // Similar to the prevBoundary in the compose() implementation.
2144 const UChar
*prevBoundary
=src
;
2145 int32_t prevFCD16
=0;
2147 src
=copyLowPrefixFromNulTerminated(src
, minLcccCP
, buffer
, errorCode
);
2148 if(U_FAILURE(errorCode
)) {
2151 if(prevBoundary
<src
) {
2153 // We know that the previous character's lccc==0.
2154 // Fetching the fcd16 value was deferred for this below-U+0300 code point.
2155 prevFCD16
=getFCD16(*(src
-1));
2160 limit
=u_strchr(src
, 0);
2163 // Note: In this function we use buffer->appendZeroCC() because we track
2164 // the lead and trail combining classes here, rather than leaving it to
2165 // the ReorderingBuffer.
2166 // The exception is the call to decomposeShort() which uses the buffer
2167 // in the normal way.
2169 const UChar
*prevSrc
;
2174 // count code units with lccc==0
2175 for(prevSrc
=src
; src
!=limit
;) {
2176 if((c
=*src
)<minLcccCP
) {
2179 } else if(!singleLeadMightHaveNonZeroFCD16(c
)) {
2183 if(U16_IS_LEAD(c
)) {
2185 if((src
+1)!=limit
&& U16_IS_TRAIL(c2
=src
[1])) {
2186 c
=U16_GET_SUPPLEMENTARY(c
, c2
);
2189 if((fcd16
=getFCD16FromNormData(c
))<=0xff) {
2197 // copy these code units all at once
2199 if(buffer
!=NULL
&& !buffer
->appendZeroCC(prevSrc
, src
, errorCode
)) {
2206 // We know that the previous character's lccc==0.
2208 // Fetching the fcd16 value was deferred for this below-minLcccCP code point.
2209 UChar32 prev
=~prevFCD16
;
2210 if(prev
<minDecompNoCP
) {
2213 prevFCD16
=getFCD16FromNormData(prev
);
2219 const UChar
*p
=src
-1;
2220 if(U16_IS_TRAIL(*p
) && prevSrc
<p
&& U16_IS_LEAD(*(p
-1))) {
2222 // Need to fetch the previous character's FCD value because
2223 // prevFCD16 was just for the trail surrogate code point.
2224 prevFCD16
=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p
[0], p
[1]));
2225 // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
2231 // The start of the current character (c).
2233 } else if(src
==limit
) {
2238 // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
2239 // Check for proper order, and decompose locally if necessary.
2240 if((prevFCD16
&0xff)<=(fcd16
>>8)) {
2241 // proper order: prev tccc <= current lccc
2242 if((fcd16
&0xff)<=1) {
2245 if(buffer
!=NULL
&& !buffer
->appendZeroCC(c
, errorCode
)) {
2250 } else if(buffer
==NULL
) {
2251 return prevBoundary
; // quick check "no"
2254 * Back out the part of the source that we copied or appended
2255 * already but is now going to be decomposed.
2256 * prevSrc is set to after what was copied/appended.
2258 buffer
->removeSuffix((int32_t)(prevSrc
-prevBoundary
));
2260 * Find the part of the source that needs to be decomposed,
2261 * up to the next safe boundary.
2263 src
=findNextFCDBoundary(src
, limit
);
2265 * The source text does not fulfill the conditions for FCD.
2266 * Decompose and reorder a limited piece of the text.
2268 decomposeShort(prevBoundary
, src
, FALSE
, FALSE
, *buffer
, errorCode
);
2269 if (U_FAILURE(errorCode
)) {
2279 void Normalizer2Impl::makeFCDAndAppend(const UChar
*src
, const UChar
*limit
,
2281 UnicodeString
&safeMiddle
,
2282 ReorderingBuffer
&buffer
,
2283 UErrorCode
&errorCode
) const {
2284 if(!buffer
.isEmpty()) {
2285 const UChar
*firstBoundaryInSrc
=findNextFCDBoundary(src
, limit
);
2286 if(src
!=firstBoundaryInSrc
) {
2287 const UChar
*lastBoundaryInDest
=findPreviousFCDBoundary(buffer
.getStart(),
2289 int32_t destSuffixLength
=(int32_t)(buffer
.getLimit()-lastBoundaryInDest
);
2290 UnicodeString
middle(lastBoundaryInDest
, destSuffixLength
);
2291 buffer
.removeSuffix(destSuffixLength
);
2293 middle
.append(src
, (int32_t)(firstBoundaryInSrc
-src
));
2294 const UChar
*middleStart
=middle
.getBuffer();
2295 makeFCD(middleStart
, middleStart
+middle
.length(), &buffer
, errorCode
);
2296 if(U_FAILURE(errorCode
)) {
2299 src
=firstBoundaryInSrc
;
2303 makeFCD(src
, limit
, &buffer
, errorCode
);
2305 if(limit
==NULL
) { // appendZeroCC() needs limit!=NULL
2306 limit
=u_strchr(src
, 0);
2308 buffer
.appendZeroCC(src
, limit
, errorCode
);
2312 const UChar
*Normalizer2Impl::findPreviousFCDBoundary(const UChar
*start
, const UChar
*p
) const {
2314 const UChar
*codePointLimit
= p
;
2317 UCPTRIE_FAST_U16_PREV(normTrie
, UCPTRIE_16
, start
, p
, c
, norm16
);
2318 if (c
< minDecompNoCP
|| norm16HasDecompBoundaryAfter(norm16
)) {
2319 return codePointLimit
;
2321 if (norm16HasDecompBoundaryBefore(norm16
)) {
2328 const UChar
*Normalizer2Impl::findNextFCDBoundary(const UChar
*p
, const UChar
*limit
) const {
2330 const UChar
*codePointStart
=p
;
2333 UCPTRIE_FAST_U16_NEXT(normTrie
, UCPTRIE_16
, p
, limit
, c
, norm16
);
2334 if (c
< minLcccCP
|| norm16HasDecompBoundaryBefore(norm16
)) {
2335 return codePointStart
;
2337 if (norm16HasDecompBoundaryAfter(norm16
)) {
2344 // CanonicalIterator data -------------------------------------------------- ***
2346 CanonIterData::CanonIterData(UErrorCode
&errorCode
) :
2347 mutableTrie(umutablecptrie_open(0, 0, &errorCode
)), trie(nullptr),
2348 canonStartSets(uprv_deleteUObject
, NULL
, errorCode
) {}
2350 CanonIterData::~CanonIterData() {
2351 umutablecptrie_close(mutableTrie
);
2352 ucptrie_close(trie
);
2355 void CanonIterData::addToStartSet(UChar32 origin
, UChar32 decompLead
, UErrorCode
&errorCode
) {
2356 uint32_t canonValue
= umutablecptrie_get(mutableTrie
, decompLead
);
2357 if((canonValue
&(CANON_HAS_SET
|CANON_VALUE_MASK
))==0 && origin
!=0) {
2358 // origin is the first character whose decomposition starts with
2359 // the character for which we are setting the value.
2360 umutablecptrie_set(mutableTrie
, decompLead
, canonValue
|origin
, &errorCode
);
2362 // origin is not the first character, or it is U+0000.
2364 if((canonValue
&CANON_HAS_SET
)==0) {
2367 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
2370 UChar32 firstOrigin
=(UChar32
)(canonValue
&CANON_VALUE_MASK
);
2371 canonValue
=(canonValue
&~CANON_VALUE_MASK
)|CANON_HAS_SET
|(uint32_t)canonStartSets
.size();
2372 umutablecptrie_set(mutableTrie
, decompLead
, canonValue
, &errorCode
);
2373 canonStartSets
.addElement(set
, errorCode
);
2374 if(firstOrigin
!=0) {
2375 set
->add(firstOrigin
);
2378 set
=(UnicodeSet
*)canonStartSets
[(int32_t)(canonValue
&CANON_VALUE_MASK
)];
2384 // C++ class for friend access to private Normalizer2Impl members.
2385 class InitCanonIterData
{
2387 static void doInit(Normalizer2Impl
*impl
, UErrorCode
&errorCode
);
2392 // UInitOnce instantiation function for CanonIterData
2393 static void U_CALLCONV
2394 initCanonIterData(Normalizer2Impl
*impl
, UErrorCode
&errorCode
) {
2395 InitCanonIterData::doInit(impl
, errorCode
);
2400 void InitCanonIterData::doInit(Normalizer2Impl
*impl
, UErrorCode
&errorCode
) {
2401 U_ASSERT(impl
->fCanonIterData
== NULL
);
2402 impl
->fCanonIterData
= new CanonIterData(errorCode
);
2403 if (impl
->fCanonIterData
== NULL
) {
2404 errorCode
=U_MEMORY_ALLOCATION_ERROR
;
2406 if (U_SUCCESS(errorCode
)) {
2407 UChar32 start
= 0, end
;
2409 while ((end
= ucptrie_getRange(impl
->normTrie
, start
,
2410 UCPMAP_RANGE_FIXED_LEAD_SURROGATES
, Normalizer2Impl::INERT
,
2411 nullptr, nullptr, &value
)) >= 0) {
2412 // Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.
2413 if (value
!= Normalizer2Impl::INERT
) {
2414 impl
->makeCanonIterDataFromNorm16(start
, end
, value
, *impl
->fCanonIterData
, errorCode
);
2418 #ifdef UCPTRIE_DEBUG
2419 umutablecptrie_setName(impl
->fCanonIterData
->mutableTrie
, "CanonIterData");
2421 impl
->fCanonIterData
->trie
= umutablecptrie_buildImmutable(
2422 impl
->fCanonIterData
->mutableTrie
, UCPTRIE_TYPE_SMALL
, UCPTRIE_VALUE_BITS_32
, &errorCode
);
2423 umutablecptrie_close(impl
->fCanonIterData
->mutableTrie
);
2424 impl
->fCanonIterData
->mutableTrie
= nullptr;
2426 if (U_FAILURE(errorCode
)) {
2427 delete impl
->fCanonIterData
;
2428 impl
->fCanonIterData
= NULL
;
2432 void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start
, UChar32 end
, const uint16_t norm16
,
2433 CanonIterData
&newData
,
2434 UErrorCode
&errorCode
) const {
2435 if(isInert(norm16
) || (minYesNo
<=norm16
&& norm16
<minNoNo
)) {
2436 // Inert, or 2-way mapping (including Hangul syllable).
2437 // We do not write a canonStartSet for any yesNo character.
2438 // Composites from 2-way mappings are added at runtime from the
2439 // starter's compositions list, and the other characters in
2440 // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
2441 // "maybe" characters.
2444 for(UChar32 c
=start
; c
<=end
; ++c
) {
2445 uint32_t oldValue
= umutablecptrie_get(newData
.mutableTrie
, c
);
2446 uint32_t newValue
=oldValue
;
2447 if(isMaybeOrNonZeroCC(norm16
)) {
2448 // not a segment starter if it occurs in a decomposition or has cc!=0
2449 newValue
|=CANON_NOT_SEGMENT_STARTER
;
2450 if(norm16
<MIN_NORMAL_MAYBE_YES
) {
2451 newValue
|=CANON_HAS_COMPOSITIONS
;
2453 } else if(norm16
<minYesNo
) {
2454 newValue
|=CANON_HAS_COMPOSITIONS
;
2456 // c has a one-way decomposition
2458 // Do not modify the whole-range norm16 value.
2459 uint16_t norm16_2
=norm16
;
2460 if (isDecompNoAlgorithmic(norm16_2
)) {
2461 // Maps to an isCompYesAndZeroCC.
2462 c2
= mapAlgorithmic(c2
, norm16_2
);
2463 norm16_2
= getRawNorm16(c2
);
2464 // No compatibility mappings for the CanonicalIterator.
2465 U_ASSERT(!(isHangulLV(norm16_2
) || isHangulLVT(norm16_2
)));
2467 if (norm16_2
> minYesNo
) {
2468 // c decomposes, get everything from the variable-length extra data
2469 const uint16_t *mapping
=getMapping(norm16_2
);
2470 uint16_t firstUnit
=*mapping
;
2471 int32_t length
=firstUnit
&MAPPING_LENGTH_MASK
;
2472 if((firstUnit
&MAPPING_HAS_CCC_LCCC_WORD
)!=0) {
2473 if(c
==c2
&& (*(mapping
-1)&0xff)!=0) {
2474 newValue
|=CANON_NOT_SEGMENT_STARTER
; // original c has cc!=0
2477 // Skip empty mappings (no characters in the decomposition).
2479 ++mapping
; // skip over the firstUnit
2480 // add c to first code point's start set
2482 U16_NEXT_UNSAFE(mapping
, i
, c2
);
2483 newData
.addToStartSet(c
, c2
, errorCode
);
2484 // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
2485 // one-way mapping. A 2-way mapping is possible here after
2486 // intermediate algorithmic mapping.
2487 if(norm16_2
>=minNoNo
) {
2489 U16_NEXT_UNSAFE(mapping
, i
, c2
);
2490 uint32_t c2Value
= umutablecptrie_get(newData
.mutableTrie
, c2
);
2491 if((c2Value
&CANON_NOT_SEGMENT_STARTER
)==0) {
2492 umutablecptrie_set(newData
.mutableTrie
, c2
,
2493 c2Value
|CANON_NOT_SEGMENT_STARTER
, &errorCode
);
2499 // c decomposed to c2 algorithmically; c has cc==0
2500 newData
.addToStartSet(c
, c2
, errorCode
);
2503 if(newValue
!=oldValue
) {
2504 umutablecptrie_set(newData
.mutableTrie
, c
, newValue
, &errorCode
);
2509 UBool
Normalizer2Impl::ensureCanonIterData(UErrorCode
&errorCode
) const {
2510 // Logically const: Synchronized instantiation.
2511 Normalizer2Impl
*me
=const_cast<Normalizer2Impl
*>(this);
2512 umtx_initOnce(me
->fCanonIterDataInitOnce
, &initCanonIterData
, me
, errorCode
);
2513 return U_SUCCESS(errorCode
);
2516 int32_t Normalizer2Impl::getCanonValue(UChar32 c
) const {
2517 return (int32_t)ucptrie_get(fCanonIterData
->trie
, c
);
2520 const UnicodeSet
&Normalizer2Impl::getCanonStartSet(int32_t n
) const {
2521 return *(const UnicodeSet
*)fCanonIterData
->canonStartSets
[n
];
2524 UBool
Normalizer2Impl::isCanonSegmentStarter(UChar32 c
) const {
2525 return getCanonValue(c
)>=0;
2528 UBool
Normalizer2Impl::getCanonStartSet(UChar32 c
, UnicodeSet
&set
) const {
2529 int32_t canonValue
=getCanonValue(c
)&~CANON_NOT_SEGMENT_STARTER
;
2534 int32_t value
=canonValue
&CANON_VALUE_MASK
;
2535 if((canonValue
&CANON_HAS_SET
)!=0) {
2536 set
.addAll(getCanonStartSet(value
));
2537 } else if(value
!=0) {
2540 if((canonValue
&CANON_HAS_COMPOSITIONS
)!=0) {
2541 uint16_t norm16
=getRawNorm16(c
);
2542 if(norm16
==JAMO_L
) {
2544 (UChar32
)(Hangul::HANGUL_BASE
+(c
-Hangul::JAMO_L_BASE
)*Hangul::JAMO_VT_COUNT
);
2545 set
.add(syllable
, syllable
+Hangul::JAMO_VT_COUNT
-1);
2547 addComposites(getCompositionsList(norm16
), set
);
2555 // Normalizer2 data swapping ----------------------------------------------- ***
2559 U_CAPI
int32_t U_EXPORT2
2560 unorm2_swap(const UDataSwapper
*ds
,
2561 const void *inData
, int32_t length
, void *outData
,
2562 UErrorCode
*pErrorCode
) {
2563 const UDataInfo
*pInfo
;
2566 const uint8_t *inBytes
;
2569 const int32_t *inIndexes
;
2570 int32_t indexes
[Normalizer2Impl::IX_TOTAL_SIZE
+1];
2572 int32_t i
, offset
, nextOffset
, size
;
2574 /* udata_swapDataHeader checks the arguments */
2575 headerSize
=udata_swapDataHeader(ds
, inData
, length
, outData
, pErrorCode
);
2576 if(pErrorCode
==NULL
|| U_FAILURE(*pErrorCode
)) {
2580 /* check data format and format version */
2581 pInfo
=(const UDataInfo
*)((const char *)inData
+4);
2582 uint8_t formatVersion0
=pInfo
->formatVersion
[0];
2584 pInfo
->dataFormat
[0]==0x4e && /* dataFormat="Nrm2" */
2585 pInfo
->dataFormat
[1]==0x72 &&
2586 pInfo
->dataFormat
[2]==0x6d &&
2587 pInfo
->dataFormat
[3]==0x32 &&
2588 (1<=formatVersion0
&& formatVersion0
<=4)
2590 udata_printError(ds
, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",
2591 pInfo
->dataFormat
[0], pInfo
->dataFormat
[1],
2592 pInfo
->dataFormat
[2], pInfo
->dataFormat
[3],
2593 pInfo
->formatVersion
[0]);
2594 *pErrorCode
=U_UNSUPPORTED_ERROR
;
2598 inBytes
=(const uint8_t *)inData
+headerSize
;
2599 outBytes
=(uint8_t *)outData
+headerSize
;
2601 inIndexes
=(const int32_t *)inBytes
;
2602 int32_t minIndexesLength
;
2603 if(formatVersion0
==1) {
2604 minIndexesLength
=Normalizer2Impl::IX_MIN_MAYBE_YES
+1;
2605 } else if(formatVersion0
==2) {
2606 minIndexesLength
=Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY
+1;
2608 minIndexesLength
=Normalizer2Impl::IX_MIN_LCCC_CP
+1;
2613 if(length
<minIndexesLength
*4) {
2614 udata_printError(ds
, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n",
2616 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
2621 /* read the first few indexes */
2622 for(i
=0; i
<UPRV_LENGTHOF(indexes
); ++i
) {
2623 indexes
[i
]=udata_readInt32(ds
, inIndexes
[i
]);
2626 /* get the total length of the data */
2627 size
=indexes
[Normalizer2Impl::IX_TOTAL_SIZE
];
2631 udata_printError(ds
, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n",
2633 *pErrorCode
=U_INDEX_OUTOFBOUNDS_ERROR
;
2637 /* copy the data for inaccessible bytes */
2638 if(inBytes
!=outBytes
) {
2639 uprv_memcpy(outBytes
, inBytes
, size
);
2644 /* swap the int32_t indexes[] */
2645 nextOffset
=indexes
[Normalizer2Impl::IX_NORM_TRIE_OFFSET
];
2646 ds
->swapArray32(ds
, inBytes
, nextOffset
-offset
, outBytes
, pErrorCode
);
2650 nextOffset
=indexes
[Normalizer2Impl::IX_EXTRA_DATA_OFFSET
];
2651 utrie_swapAnyVersion(ds
, inBytes
+offset
, nextOffset
-offset
, outBytes
+offset
, pErrorCode
);
2654 /* swap the uint16_t extraData[] */
2655 nextOffset
=indexes
[Normalizer2Impl::IX_SMALL_FCD_OFFSET
];
2656 ds
->swapArray16(ds
, inBytes
+offset
, nextOffset
-offset
, outBytes
+offset
, pErrorCode
);
2659 /* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */
2660 nextOffset
=indexes
[Normalizer2Impl::IX_SMALL_FCD_OFFSET
+1];
2663 U_ASSERT(offset
==size
);
2666 return headerSize
+size
;
2669 #endif // !UCONFIG_NO_NORMALIZATION