1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
5 // created: 2017jun04 Markus W. Scherer
6 // (pulled out of n2builder.cpp)
8 #include "unicode/utypes.h"
10 #if !UCONFIG_NO_NORMALIZATION
14 #include "unicode/errorcode.h"
15 #include "unicode/unistr.h"
16 #include "unicode/utf16.h"
17 #include "extradata.h"
18 #include "normalizer2impl.h"
// Constructs the extra-data collector.
// The three string buffers initialized below start out with reserved leading
// units (see the per-line comments); index 0 of the two yesNo buffers is then
// overwritten via setCharAt().
// NOTE(review): the embedded source numbering jumps 26->28 and 30->32, so part of
// the initializer list and the opening brace appear to be missing from this
// listing. How `n` and `fast` are consumed is not visible here -- confirm
// against the complete file.
26 ExtraData::ExtraData(Norms
&n
, UBool fast
) :
// The three arguments are presumably (capacity, fill code point, fill count)
// -- TODO confirm against the UnicodeString constructor documentation.
28 yesYesCompositions(1000, (UChar32
)0xffff, 2), // 0=inert, 1=Jamo L, 2=start of compositions
29 yesNoMappingsAndCompositions(1000, (UChar32
)0, 1), // 0=Hangul LV, 1=start of normal data
30 yesNoMappingsOnly(1000, (UChar32
)0, 1), // 0=Hangul LVT, 1=start of normal data
32 // Hangul LV algorithmically decomposes to two Jamo.
33 // Some code may harmlessly read this firstUnit.
// The reserved unit at index 0 is set to 2, matching the "two Jamo" above.
34 yesNoMappingsAndCompositions
.setCharAt(0, 2);
35 // Hangul LVT algorithmically decomposes to three Jamo.
36 // Some code may harmlessly read this firstUnit.
// The reserved unit at index 0 is set to 3, matching the "three Jamo" above.
37 yesNoMappingsOnly
.setCharAt(0, 3);
// Appends the extra data for norm's mapping to dataString.
//
// c          - the code point being written; in the visible code it is only
//              used for the error message.
// norm       - supplies mapping, rawMapping, cc, leadCC and trailCC.
// dataString - output buffer that the units are appended to.
//
// Returns preMappingLength. The visible code sets it to rmLength+1 when the raw
// mapping is stored together with its length unit; callers add the return value
// to the buffer length to form norm.offset.
//
// Terminates the process with U_INVALID_FORMAT_ERROR when the raw mapping is
// longer than Normalizer2Impl::MAPPING_LENGTH_MASK.
//
// NOTE(review): the embedded source numbering skips several lines in this
// function (e.g. 50-51, 55, 61-62, 70-71, 76, 80, 86), so some branch headers,
// closing braces and statements are not visible in this listing. Comments below
// describe only what is visible -- confirm against the complete file.
40 int32_t ExtraData::writeMapping(UChar32 c
, const Norm
&norm
, UnicodeString
&dataString
) {
41 UnicodeString
&m
=*norm
.mapping
;
42 int32_t length
=m
.length();
43 // Write the mapping & raw mapping extraData.
// firstUnit: mapping length in the low bits, trailCC shifted into bits 8 and up;
// flag bits are ORed in further down.
44 int32_t firstUnit
=length
|(norm
.trailCC
<<8);
45 int32_t preMappingLength
=0;
46 if(norm
.rawMapping
!=NULL
) {
47 UnicodeString
&rm
=*norm
.rawMapping
;
48 int32_t rmLength
=rm
.length();
// A raw mapping longer than MAPPING_LENGTH_MASK is a fatal input error.
49 if(rmLength
>Normalizer2Impl::MAPPING_LENGTH_MASK
) {
52 "raw mapping for U+%04lX longer than maximum of %d\n",
53 (long)c
, Normalizer2Impl::MAPPING_LENGTH_MASK
);
54 exit(U_INVALID_FORMAT_ERROR
);
56 UChar rm0
=rm
.charAt(0);
// Compression test: the raw mapping is one unit shorter than the mapping,
// rm from index 1 equals m from index 2, and rm0 is greater than
// MAPPING_LENGTH_MASK.
// Presumably the last condition keeps a lone rm0 distinguishable from a
// stored raw-mapping length unit -- TODO confirm.
57 if( rmLength
==length
-1 &&
58 // 99: overlong substring lengths get pinned to remainder lengths anyway
59 0==rm
.compare(1, 99, m
, 2, 99) &&
60 rm0
>Normalizer2Impl::MAPPING_LENGTH_MASK
63 // rawMapping=rm0+mapping.substring(2) -> store only rm0
65 // The raw mapping is the same as the final mapping after replacing
66 // the final mapping's first two code units with the raw mapping's first one.
67 // In this case, we store only that first unit, rm0.
68 // This helps with a few hundred mappings.
69 dataString
.append(rm0
);
// Uncompressed form: the raw mapping followed by one unit holding its length.
72 // Store the raw mapping with its length.
73 dataString
.append(rm
);
74 dataString
.append((UChar
)rmLength
);
75 preMappingLength
=rmLength
+1;
// Flag in firstUnit that raw-mapping data was written.
77 firstUnit
|=Normalizer2Impl::MAPPING_HAS_RAW_MAPPING
;
// Combined word: cc in the low byte, leadCC shifted left by 8.
// NOTE(review): the condition guarding the append below is not visible here.
79 int32_t cccLccc
=norm
.cc
|(norm
.leadCC
<<8);
81 dataString
.append((UChar
)cccLccc
);
83 firstUnit
|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD
;
// firstUnit is appended after the optional raw-mapping and ccc/lccc units.
85 dataString
.append((UChar
)firstUnit
);
87 return preMappingLength
;
// Writes norm's mapping with de-duplication.
//
// The mapping is first written into a temporary string via writeMapping().
// If previousMappings already has an entry for that exact string, the stored
// offset is reused. Otherwise the temporary string is appended to dataString
// and its offset is recorded in previousMappings.
//
// c                - code point, forwarded to writeMapping().
// norm             - the Norm whose mapping is written.
// dataString       - destination buffer for new (non-duplicate) mappings.
// previousMappings - maps mapping strings to offset+1; 0 means "not found".
//
// NOTE(review): the embedded source numbering skips 99 and 106-108, so the
// else header, closing braces and the return statement are not visible in this
// listing. Callers assign the result to norm.offset, so this presumably returns
// `offset` -- confirm against the complete file.
90 int32_t ExtraData::writeNoNoMapping(UChar32 c
, const Norm
&norm
,
91 UnicodeString
&dataString
,
92 Hashtable
&previousMappings
) {
93 UnicodeString newMapping
;
// `offset` starts out as writeMapping()'s return value for the temporary string.
94 int32_t offset
=writeMapping(c
, norm
, newMapping
);
95 int32_t previousOffset
=previousMappings
.geti(newMapping
);
96 if(previousOffset
!=0) {
97 // Duplicate, point to the identical mapping that has already been stored.
// Stored values are offset+1, hence the -1.
98 offset
=previousOffset
-1;
100 // Append this new mapping and
101 // enter it into the hashtable, avoiding value 0 which is "not found".
// The buffer length is read before the append, so `offset` is relative to
// where the new mapping starts in dataString.
102 offset
=dataString
.length()+offset
;
103 dataString
.append(newMapping
);
104 IcuToolErrorCode
errorCode("gennorm2/writeExtraData()/Hashtable.puti()");
105 previousMappings
.puti(newMapping
, offset
+1, errorCode
);
// Tries to replace norm's mapping with a delta encoding.
//
// The visible code marks norm as Norm::NO_NO_DELTA when all of these hold:
//   - norm.mappingCP is non-negative,
//   - the mapping does not go from ASCII (c<=0x7f) to non-ASCII,
//   - the Norm of the mapping target has a type below Norm::NO_NO_COMP_YES,
//   - mappingCP-c lies within [-MAX_DELTA, MAX_DELTA].
//
// c    - the source code point.
// norm - the Norm to inspect; its type is modified on success.
//
// NOTE(review): the embedded source numbering stops at 118 (119-124 are not
// visible), so anything else stored on success and the return statements are
// missing from this listing. Callers use the result as a boolean "was
// converted" flag -- confirm the exact return values against the complete file.
110 UBool
ExtraData::setNoNoDelta(UChar32 c
, Norm
&norm
) const {
111 // Try a compact, algorithmic encoding to a single compYesAndZeroCC code point.
112 // Do not map from ASCII to non-ASCII.
113 if(norm
.mappingCP
>=0 &&
114 !(c
<=0x7f && norm
.mappingCP
>0x7f) &&
115 norms
.getNormRef(norm
.mappingCP
).type
<Norm::NO_NO_COMP_YES
) {
// Signed distance from the source code point to its mapping target.
116 int32_t delta
=norm
.mappingCP
-c
;
117 if(-Normalizer2Impl::MAX_DELTA
<=delta
&& delta
<=Normalizer2Impl::MAX_DELTA
) {
118 norm
.type
=Norm::NO_NO_DELTA
;
// Appends norm's composition pairs to dataString, each pair encoded in two or
// three 16-bit units.
//
// c          - the code point that combines forward; presumably used in the
//              error message, whose argument line is not visible here.
// norm       - supplies the composition pairs via getCompositionPairs().
// dataString - output buffer that the units are appended to.
//
// Terminates the process with U_INVALID_FORMAT_ERROR in the "combines-forward
// and has ccc!=0" error case.
//
// NOTE(review): the embedded source numbering skips many lines in this function
// (e.g. 127-129, 131, 133-134, 142, 149-150, 154-155, 162, 164, 166, 168,
// 170-172). The error guard condition, the declaration of `length`, the else
// headers, and the conditions around the last-tuple flag and the third-unit
// append are not visible in this listing -- confirm against the complete file.
126 void ExtraData::writeCompositions(UChar32 c
, const Norm
&norm
, UnicodeString
&dataString
) {
130 "U+%04lX combines-forward and has ccc!=0, not possible in Unicode normalization\n",
132 exit(U_INVALID_FORMAT_ERROR
);
// getCompositionPairs() takes `length` as an argument; it presumably fills in
// the number of pairs -- TODO confirm.
135 const CompositionPair
*pairs
=norm
.getCompositionPairs(length
);
136 for(int32_t i
=0; i
<length
; ++i
) {
137 const CompositionPair
&pair
=pairs
[i
];
138 // 22 bits for the composite character and whether it combines forward.
// The composite is shifted left by one so that bit 0 can carry the
// combines-forward flag.
139 UChar32 compositeAndFwd
=pair
.composite
<<1;
140 if(norms
.getNormRef(pair
.composite
).compositions
!=NULL
) {
141 compositeAndFwd
|=1; // The composite character also combines-forward.
143 // Encode most pairs in two units and some in three.
144 int32_t firstUnit
, secondUnit
, thirdUnit
;
// Trail below COMP_1_TRAIL_LIMIT: the shifted trail goes into firstUnit.
145 if(pair
.trail
<Normalizer2Impl::COMP_1_TRAIL_LIMIT
) {
// Two-unit form: compositeAndFwd fits into 16 bits.
146 if(compositeAndFwd
<=0xffff) {
147 firstUnit
=pair
.trail
<<1;
148 secondUnit
=compositeAndFwd
;
// Three-unit form with COMP_1_TRIPLE set: compositeAndFwd is split across the
// second unit (bits 16 and up) and the third unit.
151 firstUnit
=(pair
.trail
<<1)|Normalizer2Impl::COMP_1_TRIPLE
;
152 secondUnit
=compositeAndFwd
>>16;
153 thirdUnit
=compositeAndFwd
;
// Three-unit form for a larger trail: the trail is split between firstUnit
// (shifted right by COMP_1_TRAIL_SHIFT, offset by COMP_1_TRAIL_LIMIT) and
// secondUnit (shifted left by COMP_2_TRAIL_SHIFT).
156 firstUnit
=(Normalizer2Impl::COMP_1_TRAIL_LIMIT
+
157 (pair
.trail
>>Normalizer2Impl::COMP_1_TRAIL_SHIFT
))|
158 Normalizer2Impl::COMP_1_TRIPLE
;
159 secondUnit
=(pair
.trail
<<Normalizer2Impl::COMP_2_TRAIL_SHIFT
)|
160 (compositeAndFwd
>>16);
161 thirdUnit
=compositeAndFwd
;
163 // Set the high bit of the first unit if this is the last composition pair.
165 firstUnit
|=Normalizer2Impl::COMP_1_LAST_TUPLE
;
// Each value is cast to UChar (a 16-bit unit) when it is appended.
167 dataString
.append((UChar
)firstUnit
).append((UChar
)secondUnit
);
169 dataString
.append((UChar
)thirdUnit
);
// Handles one range [start, end] of code points that share `norm`.
//
// Visible behavior:
//   - reports "unexpected shared data for multiple code points" and exits with
//     U_INTERNAL_PROGRAM_ERROR,
//   - reports norm.error (when it is not nullptr) and exits with
//     U_INVALID_FORMAT_ERROR,
//   - otherwise writes the extra data for `start` via writeExtraData().
//
// NOTE(review): the embedded source numbering skips 175-176, 181, 185 and 187,
// so the condition guarding the first error (presumably start!=end, judging by
// the message) and the head of its fprintf call are not visible in this
// listing -- confirm against the complete file.
174 void ExtraData::rangeHandler(UChar32 start
, UChar32 end
, Norm
&norm
) {
177 "gennorm2 error: unexpected shared data for "
178 "multiple code points U+%04lX..U+%04lX\n",
179 (long)start
, (long)end
);
180 exit(U_INTERNAL_PROGRAM_ERROR
);
// An error message attached to the Norm is fatal for the tool.
182 if(norm
.error
!=nullptr) {
183 fprintf(stderr
, "gennorm2 error: U+%04lX %s\n", (long)start
, norm
.error
);
184 exit(U_INVALID_FORMAT_ERROR
);
186 writeExtraData(start
, norm
);
189 // Ticket #13342 - Disable optimizations on MSVC for this function as a workaround.
190 #if (defined(_MSC_VER) && (_MSC_VER >= 1900) && defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 190024210))
191 #pragma optimize( "", off )
// Writes the extra data for code point c according to its Norm type and stores
// the resulting position in norm.offset.
//
// Per type, as far as visible here:
//   - YES_YES_COMBINES_FWD / MAYBE_YES_COMBINES_FWD: compositions only; offset is
//     the buffer length before writing.
//   - YES_NO_COMBINES_FWD: mapping followed by compositions.
//   - YES_NO_MAPPING_ONLY: mapping only.
//   - NO_NO_COMP_YES / NO_NO_COMP_BOUNDARY_BEFORE: setNoNoDelta() is tried first
//     unless optimizeFast is set; the mapping is written via writeNoNoMapping().
//   - NO_NO_COMP_NO_MAYBE_CC / NO_NO_EMPTY: mapping via writeNoNoMapping().
//   - MAYBE_YES_SIMPLE / YES_YES_WITH_CC: no extra data.
//   - any other type: exit with U_INTERNAL_PROGRAM_ERROR.
//
// NOTE(review): the embedded source numbering skips many lines in this function
// (e.g. 195-196, 201, 206, 210, 213-214, 216, 219-220, 223, 227, 232, 236,
// 243-244), so the switch header, the first case label(s), the `break`
// statements and the bodies of the setNoNoDelta() branches are not visible in
// this listing -- confirm against the complete file.
//
// NOTE(review): in `x.length()+writeMapping(c, norm, x)` the two operands of `+`
// may be evaluated in either order (unspecified in C++), yet writeMapping()
// appends to x. The offset is only as intended when length() is evaluated first.
// This may be related to the MSVC optimization workaround (Ticket #13342) that
// surrounds this function -- worth confirming and sequencing explicitly.
194 void ExtraData::writeExtraData(UChar32 c
, Norm
&norm
) {
197 break; // no extra data
198 case Norm::YES_YES_COMBINES_FWD
:
// The offset is the start of this code point's composition list.
199 norm
.offset
=yesYesCompositions
.length();
200 writeCompositions(c
, norm
, yesYesCompositions
);
202 case Norm::YES_NO_COMBINES_FWD
:
// Offset = buffer length plus writeMapping()'s return value (see the
// evaluation-order note above); the compositions follow the mapping.
203 norm
.offset
=yesNoMappingsAndCompositions
.length()+
204 writeMapping(c
, norm
, yesNoMappingsAndCompositions
);
205 writeCompositions(c
, norm
, yesNoMappingsAndCompositions
);
207 case Norm::YES_NO_MAPPING_ONLY
:
// Same offset computation as above, into the mappings-only buffer.
208 norm
.offset
=yesNoMappingsOnly
.length()+
209 writeMapping(c
, norm
, yesNoMappingsOnly
);
211 case Norm::NO_NO_COMP_YES
:
// The delta encoding is only attempted when optimizeFast is not set.
212 if(!optimizeFast
&& setNoNoDelta(c
, norm
)) {
215 norm
.offset
=writeNoNoMapping(c
, norm
, noNoMappingsCompYes
, previousNoNoMappingsCompYes
);
217 case Norm::NO_NO_COMP_BOUNDARY_BEFORE
:
218 if(!optimizeFast
&& setNoNoDelta(c
, norm
)) {
221 norm
.offset
=writeNoNoMapping(
222 c
, norm
, noNoMappingsCompBoundaryBefore
, previousNoNoMappingsCompBoundaryBefore
);
224 case Norm::NO_NO_COMP_NO_MAYBE_CC
:
225 norm
.offset
=writeNoNoMapping(
226 c
, norm
, noNoMappingsCompNoMaybeCC
, previousNoNoMappingsCompNoMaybeCC
);
228 case Norm::NO_NO_EMPTY
:
229 // There can be multiple extra data entries for mappings to the empty string
230 // if they have different raw mappings.
231 norm
.offset
=writeNoNoMapping(c
, norm
, noNoMappingsEmpty
, previousNoNoMappingsEmpty
);
233 case Norm::MAYBE_YES_COMBINES_FWD
:
234 norm
.offset
=maybeYesCompositions
.length();
235 writeCompositions(c
, norm
, maybeYesCompositions
);
237 case Norm::MAYBE_YES_SIMPLE
:
238 break; // no extra data
239 case Norm::YES_YES_WITH_CC
:
240 break; // no extra data
241 default: // Should not occur.
242 exit(U_INTERNAL_PROGRAM_ERROR
);
246 // Ticket #13342 - Turn optimization back on.
247 #if (defined(_MSC_VER) && (_MSC_VER >= 1900) && defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 190024210))
248 #pragma optimize( "", on )
253 #endif // #if !UCONFIG_NO_NORMALIZATION