]>
Commit | Line | Data |
---|---|---|
1 | // © 2017 and later: Unicode, Inc. and others. | |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
3 | ||
4 | // extradata.cpp | |
5 | // created: 2017jun04 Markus W. Scherer | |
6 | // (pulled out of n2builder.cpp) | |
7 | ||
8 | #include "unicode/utypes.h" | |
9 | ||
10 | #if !UCONFIG_NO_NORMALIZATION | |
11 | ||
12 | #include <stdio.h> | |
13 | #include <stdlib.h> | |
14 | #include "unicode/errorcode.h" | |
15 | #include "unicode/unistr.h" | |
16 | #include "unicode/utf16.h" | |
17 | #include "extradata.h" | |
18 | #include "normalizer2impl.h" | |
19 | #include "norms.h" | |
20 | #include "toolutil.h" | |
21 | #include "utrie2.h" | |
22 | #include "uvectr32.h" | |
23 | ||
24 | U_NAMESPACE_BEGIN | |
25 | ||
26 | ExtraData::ExtraData(Norms &n, UBool fast) : | |
27 | Norms::Enumerator(n), | |
28 | yesYesCompositions(1000, (UChar32)0xffff, 2), // 0=inert, 1=Jamo L, 2=start of compositions | |
29 | yesNoMappingsAndCompositions(1000, (UChar32)0, 1), // 0=Hangul LV, 1=start of normal data | |
30 | yesNoMappingsOnly(1000, (UChar32)0, 1), // 0=Hangul LVT, 1=start of normal data | |
31 | optimizeFast(fast) { | |
32 | // Hangul LV algorithmically decomposes to two Jamo. | |
33 | // Some code may harmlessly read this firstUnit. | |
34 | yesNoMappingsAndCompositions.setCharAt(0, 2); | |
35 | // Hangul LVT algorithmically decomposes to three Jamo. | |
36 | // Some code may harmlessly read this firstUnit. | |
37 | yesNoMappingsOnly.setCharAt(0, 3); | |
38 | } | |
39 | ||
40 | int32_t ExtraData::writeMapping(UChar32 c, const Norm &norm, UnicodeString &dataString) { | |
41 | UnicodeString &m=*norm.mapping; | |
42 | int32_t length=m.length(); | |
43 | // Write the mapping & raw mapping extraData. | |
44 | int32_t firstUnit=length|(norm.trailCC<<8); | |
45 | int32_t preMappingLength=0; | |
46 | if(norm.rawMapping!=NULL) { | |
47 | UnicodeString &rm=*norm.rawMapping; | |
48 | int32_t rmLength=rm.length(); | |
49 | if(rmLength>Normalizer2Impl::MAPPING_LENGTH_MASK) { | |
50 | fprintf(stderr, | |
51 | "gennorm2 error: " | |
52 | "raw mapping for U+%04lX longer than maximum of %d\n", | |
53 | (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK); | |
54 | exit(U_INVALID_FORMAT_ERROR); | |
55 | } | |
56 | UChar rm0=rm.charAt(0); | |
57 | if( rmLength==length-1 && | |
58 | // 99: overlong substring lengths get pinned to remainder lengths anyway | |
59 | 0==rm.compare(1, 99, m, 2, 99) && | |
60 | rm0>Normalizer2Impl::MAPPING_LENGTH_MASK | |
61 | ) { | |
62 | // Compression: | |
63 | // rawMapping=rm0+mapping.substring(2) -> store only rm0 | |
64 | // | |
65 | // The raw mapping is the same as the final mapping after replacing | |
66 | // the final mapping's first two code units with the raw mapping's first one. | |
67 | // In this case, we store only that first unit, rm0. | |
68 | // This helps with a few hundred mappings. | |
69 | dataString.append(rm0); | |
70 | preMappingLength=1; | |
71 | } else { | |
72 | // Store the raw mapping with its length. | |
73 | dataString.append(rm); | |
74 | dataString.append((UChar)rmLength); | |
75 | preMappingLength=rmLength+1; | |
76 | } | |
77 | firstUnit|=Normalizer2Impl::MAPPING_HAS_RAW_MAPPING; | |
78 | } | |
79 | int32_t cccLccc=norm.cc|(norm.leadCC<<8); | |
80 | if(cccLccc!=0) { | |
81 | dataString.append((UChar)cccLccc); | |
82 | ++preMappingLength; | |
83 | firstUnit|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD; | |
84 | } | |
85 | dataString.append((UChar)firstUnit); | |
86 | dataString.append(m); | |
87 | return preMappingLength; | |
88 | } | |
89 | ||
90 | int32_t ExtraData::writeNoNoMapping(UChar32 c, const Norm &norm, | |
91 | UnicodeString &dataString, | |
92 | Hashtable &previousMappings) { | |
93 | UnicodeString newMapping; | |
94 | int32_t offset=writeMapping(c, norm, newMapping); | |
95 | int32_t previousOffset=previousMappings.geti(newMapping); | |
96 | if(previousOffset!=0) { | |
97 | // Duplicate, point to the identical mapping that has already been stored. | |
98 | offset=previousOffset-1; | |
99 | } else { | |
100 | // Append this new mapping and | |
101 | // enter it into the hashtable, avoiding value 0 which is "not found". | |
102 | offset=dataString.length()+offset; | |
103 | dataString.append(newMapping); | |
104 | IcuToolErrorCode errorCode("gennorm2/writeExtraData()/Hashtable.puti()"); | |
105 | previousMappings.puti(newMapping, offset+1, errorCode); | |
106 | } | |
107 | return offset; | |
108 | } | |
109 | ||
110 | UBool ExtraData::setNoNoDelta(UChar32 c, Norm &norm) const { | |
111 | // Try a compact, algorithmic encoding to a single compYesAndZeroCC code point. | |
112 | // Do not map from ASCII to non-ASCII. | |
113 | if(norm.mappingCP>=0 && | |
114 | !(c<=0x7f && norm.mappingCP>0x7f) && | |
115 | norms.getNormRef(norm.mappingCP).type<Norm::NO_NO_COMP_YES) { | |
116 | int32_t delta=norm.mappingCP-c; | |
117 | if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) { | |
118 | norm.type=Norm::NO_NO_DELTA; | |
119 | norm.offset=delta; | |
120 | return TRUE; | |
121 | } | |
122 | } | |
123 | return FALSE; | |
124 | } | |
125 | ||
126 | void ExtraData::writeCompositions(UChar32 c, const Norm &norm, UnicodeString &dataString) { | |
127 | if(norm.cc!=0) { | |
128 | fprintf(stderr, | |
129 | "gennorm2 error: " | |
130 | "U+%04lX combines-forward and has ccc!=0, not possible in Unicode normalization\n", | |
131 | (long)c); | |
132 | exit(U_INVALID_FORMAT_ERROR); | |
133 | } | |
134 | int32_t length; | |
135 | const CompositionPair *pairs=norm.getCompositionPairs(length); | |
136 | for(int32_t i=0; i<length; ++i) { | |
137 | const CompositionPair &pair=pairs[i]; | |
138 | // 22 bits for the composite character and whether it combines forward. | |
139 | UChar32 compositeAndFwd=pair.composite<<1; | |
140 | if(norms.getNormRef(pair.composite).compositions!=NULL) { | |
141 | compositeAndFwd|=1; // The composite character also combines-forward. | |
142 | } | |
143 | // Encode most pairs in two units and some in three. | |
144 | int32_t firstUnit, secondUnit, thirdUnit; | |
145 | if(pair.trail<Normalizer2Impl::COMP_1_TRAIL_LIMIT) { | |
146 | if(compositeAndFwd<=0xffff) { | |
147 | firstUnit=pair.trail<<1; | |
148 | secondUnit=compositeAndFwd; | |
149 | thirdUnit=-1; | |
150 | } else { | |
151 | firstUnit=(pair.trail<<1)|Normalizer2Impl::COMP_1_TRIPLE; | |
152 | secondUnit=compositeAndFwd>>16; | |
153 | thirdUnit=compositeAndFwd; | |
154 | } | |
155 | } else { | |
156 | firstUnit=(Normalizer2Impl::COMP_1_TRAIL_LIMIT+ | |
157 | (pair.trail>>Normalizer2Impl::COMP_1_TRAIL_SHIFT))| | |
158 | Normalizer2Impl::COMP_1_TRIPLE; | |
159 | secondUnit=(pair.trail<<Normalizer2Impl::COMP_2_TRAIL_SHIFT)| | |
160 | (compositeAndFwd>>16); | |
161 | thirdUnit=compositeAndFwd; | |
162 | } | |
163 | // Set the high bit of the first unit if this is the last composition pair. | |
164 | if(i==(length-1)) { | |
165 | firstUnit|=Normalizer2Impl::COMP_1_LAST_TUPLE; | |
166 | } | |
167 | dataString.append((UChar)firstUnit).append((UChar)secondUnit); | |
168 | if(thirdUnit>=0) { | |
169 | dataString.append((UChar)thirdUnit); | |
170 | } | |
171 | } | |
172 | } | |
173 | ||
174 | void ExtraData::rangeHandler(UChar32 start, UChar32 end, Norm &norm) { | |
175 | if(start!=end) { | |
176 | fprintf(stderr, | |
177 | "gennorm2 error: unexpected shared data for " | |
178 | "multiple code points U+%04lX..U+%04lX\n", | |
179 | (long)start, (long)end); | |
180 | exit(U_INTERNAL_PROGRAM_ERROR); | |
181 | } | |
182 | if(norm.error!=nullptr) { | |
183 | fprintf(stderr, "gennorm2 error: U+%04lX %s\n", (long)start, norm.error); | |
184 | exit(U_INVALID_FORMAT_ERROR); | |
185 | } | |
186 | writeExtraData(start, norm); | |
187 | } | |
188 | ||
189 | // Ticket #13342 - Disable optimizations on MSVC for this function as a workaround. | |
190 | #if (defined(_MSC_VER) && (_MSC_VER >= 1900) && defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 190024210)) | |
191 | #pragma optimize( "", off ) | |
192 | #endif | |
193 | ||
194 | void ExtraData::writeExtraData(UChar32 c, Norm &norm) { | |
195 | switch(norm.type) { | |
196 | case Norm::INERT: | |
197 | break; // no extra data | |
198 | case Norm::YES_YES_COMBINES_FWD: | |
199 | norm.offset=yesYesCompositions.length(); | |
200 | writeCompositions(c, norm, yesYesCompositions); | |
201 | break; | |
202 | case Norm::YES_NO_COMBINES_FWD: | |
203 | norm.offset=yesNoMappingsAndCompositions.length()+ | |
204 | writeMapping(c, norm, yesNoMappingsAndCompositions); | |
205 | writeCompositions(c, norm, yesNoMappingsAndCompositions); | |
206 | break; | |
207 | case Norm::YES_NO_MAPPING_ONLY: | |
208 | norm.offset=yesNoMappingsOnly.length()+ | |
209 | writeMapping(c, norm, yesNoMappingsOnly); | |
210 | break; | |
211 | case Norm::NO_NO_COMP_YES: | |
212 | if(!optimizeFast && setNoNoDelta(c, norm)) { | |
213 | break; | |
214 | } | |
215 | norm.offset=writeNoNoMapping(c, norm, noNoMappingsCompYes, previousNoNoMappingsCompYes); | |
216 | break; | |
217 | case Norm::NO_NO_COMP_BOUNDARY_BEFORE: | |
218 | if(!optimizeFast && setNoNoDelta(c, norm)) { | |
219 | break; | |
220 | } | |
221 | norm.offset=writeNoNoMapping( | |
222 | c, norm, noNoMappingsCompBoundaryBefore, previousNoNoMappingsCompBoundaryBefore); | |
223 | break; | |
224 | case Norm::NO_NO_COMP_NO_MAYBE_CC: | |
225 | norm.offset=writeNoNoMapping( | |
226 | c, norm, noNoMappingsCompNoMaybeCC, previousNoNoMappingsCompNoMaybeCC); | |
227 | break; | |
228 | case Norm::NO_NO_EMPTY: | |
229 | // There can be multiple extra data entries for mappings to the empty string | |
230 | // if they have different raw mappings. | |
231 | norm.offset=writeNoNoMapping(c, norm, noNoMappingsEmpty, previousNoNoMappingsEmpty); | |
232 | break; | |
233 | case Norm::MAYBE_YES_COMBINES_FWD: | |
234 | norm.offset=maybeYesCompositions.length(); | |
235 | writeCompositions(c, norm, maybeYesCompositions); | |
236 | break; | |
237 | case Norm::MAYBE_YES_SIMPLE: | |
238 | break; // no extra data | |
239 | case Norm::YES_YES_WITH_CC: | |
240 | break; // no extra data | |
241 | default: // Should not occur. | |
242 | exit(U_INTERNAL_PROGRAM_ERROR); | |
243 | } | |
244 | } | |
245 | ||
246 | // Ticket #13342 - Turn optimization back on. | |
247 | #if (defined(_MSC_VER) && (_MSC_VER >= 1900) && defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 190024210)) | |
248 | #pragma optimize( "", on ) | |
249 | #endif | |
250 | ||
251 | U_NAMESPACE_END | |
252 | ||
253 | #endif // #if !UCONFIG_NO_NORMALIZATION |