]>
git.saurik.com Git - apple/icu.git/blob - icuSources/tools/gennorm2/norms.cpp
1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
5 // created: 2017jun04 Markus W. Scherer
6 // (pulled out of n2builder.cpp)
8 #include "unicode/utypes.h"
10 #if !UCONFIG_NO_NORMALIZATION
14 #include "unicode/errorcode.h"
15 #include "unicode/umutablecptrie.h"
16 #include "unicode/unistr.h"
17 #include "unicode/utf16.h"
18 #include "normalizer2impl.h"
25 void BuilderReorderingBuffer::append(UChar32 c
, uint8_t cc
) {
26 if(cc
==0 || fLength
==0 || ccAt(fLength
-1)<=cc
) {
28 fLastStarterIndex
=fLength
;
30 fArray
[fLength
++]=(c
<<8)|cc
;
33 // Let this character bubble back to its canonical order.
35 while(i
>fLastStarterIndex
&& ccAt(i
)>cc
) {
38 ++i
; // after the last starter or prevCC<=cc
39 // Move this and the following characters forward one to make space.
40 for(int32_t j
=fLength
; i
<j
; --j
) {
41 fArray
[j
]=fArray
[j
-1];
48 void BuilderReorderingBuffer::toString(UnicodeString
&dest
) const {
50 for(int32_t i
=0; i
<fLength
; ++i
) {
51 dest
.append(charAt(i
));
55 UChar32
Norm::combine(UChar32 trail
) const {
57 const CompositionPair
*pairs
=getCompositionPairs(length
);
58 for(int32_t i
=0; i
<length
; ++i
) {
59 if(trail
==pairs
[i
].trail
) {
60 return pairs
[i
].composite
;
62 if(trail
<pairs
[i
].trail
) {
69 Norms::Norms(UErrorCode
&errorCode
) {
70 normTrie
= umutablecptrie_open(0, 0, &errorCode
);
71 normMem
=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm
));
72 // Default "inert" Norm struct at index 0. Practically immutable.
74 norms
->type
=Norm::INERT
;
78 umutablecptrie_close(normTrie
);
79 int32_t normsLength
=utm_countItems(normMem
);
80 for(int32_t i
=1; i
<normsLength
; ++i
) {
81 delete norms
[i
].mapping
;
82 delete norms
[i
].rawMapping
;
83 delete norms
[i
].compositions
;
88 Norm
*Norms::allocNorm() {
89 Norm
*p
=(Norm
*)utm_alloc(normMem
);
90 norms
=(Norm
*)utm_getStart(normMem
); // in case it got reallocated
94 Norm
*Norms::getNorm(UChar32 c
) {
95 uint32_t i
= umutablecptrie_get(normTrie
, c
);
102 const Norm
*Norms::getNorm(UChar32 c
) const {
103 uint32_t i
= umutablecptrie_get(normTrie
, c
);
110 const Norm
&Norms::getNormRef(UChar32 c
) const {
111 return norms
[umutablecptrie_get(normTrie
, c
)];
114 Norm
*Norms::createNorm(UChar32 c
) {
115 uint32_t i
=umutablecptrie_get(normTrie
, c
);
121 IcuToolErrorCode
errorCode("gennorm2/createNorm()");
122 umutablecptrie_set(normTrie
, c
, (uint32_t)(p
- norms
), errorCode
);
127 void Norms::reorder(UnicodeString
&mapping
, BuilderReorderingBuffer
&buffer
) const {
128 int32_t length
=mapping
.length();
129 U_ASSERT(length
<=Normalizer2Impl::MAPPING_LENGTH_MASK
);
130 const char16_t *s
=mapping
.getBuffer();
134 U16_NEXT(s
, i
, length
, c
);
135 buffer
.append(c
, getCC(c
));
137 if(buffer
.didReorder()) {
138 buffer
.toString(mapping
);
142 UBool
Norms::combinesWithCCBetween(const Norm
&norm
, uint8_t lowCC
, int32_t highCC
) const {
143 if((highCC
-lowCC
)>=2) {
145 const CompositionPair
*pairs
=norm
.getCompositionPairs(length
);
146 for(int32_t i
=0; i
<length
; ++i
) {
147 uint8_t trailCC
=getCC(pairs
[i
].trail
);
148 if(lowCC
<trailCC
&& trailCC
<highCC
) {
156 void Norms::enumRanges(Enumerator
&e
) {
157 UChar32 start
= 0, end
;
159 while ((end
= umutablecptrie_getRange(normTrie
, start
, UCPMAP_RANGE_NORMAL
, 0,
160 nullptr, nullptr, &i
)) >= 0) {
162 e
.rangeHandler(start
, end
, norms
[i
]);
168 Norms::Enumerator::~Enumerator() {}
170 void CompositionBuilder::rangeHandler(UChar32 start
, UChar32 end
, Norm
&norm
) {
171 if(norm
.mappingType
!=Norm::ROUND_TRIP
) { return; }
174 "gennorm2 error: same round-trip mapping for "
175 "more than 1 code point U+%04lX..U+%04lX\n",
176 (long)start
, (long)end
);
177 exit(U_INVALID_FORMAT_ERROR
);
182 "U+%04lX has a round-trip mapping and ccc!=0, "
183 "not possible in Unicode normalization\n",
185 exit(U_INVALID_FORMAT_ERROR
);
187 // setRoundTripMapping() ensured that there are exactly two code points.
188 const UnicodeString
&m
=*norm
.mapping
;
189 UChar32 lead
=m
.char32At(0);
190 UChar32 trail
=m
.char32At(m
.length()-1);
191 if(norms
.getCC(lead
)!=0) {
194 "U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0, "
195 "not possible in Unicode normalization\n",
196 (long)start
, (long)lead
);
197 exit(U_INVALID_FORMAT_ERROR
);
199 // Flag for trailing character.
200 norms
.createNorm(trail
)->combinesBack
=TRUE
;
201 // Insert (trail, composite) pair into compositions list for the lead character.
202 IcuToolErrorCode
errorCode("gennorm2/addComposition()");
203 Norm
*leadNorm
=norms
.createNorm(lead
);
204 UVector32
*compositions
=leadNorm
->compositions
;
206 if(compositions
==nullptr) {
207 compositions
=leadNorm
->compositions
=new UVector32(errorCode
);
208 i
=0; // "insert" the first pair at index 0
210 // Insertion sort, and check for duplicate trail characters.
212 const CompositionPair
*pairs
=leadNorm
->getCompositionPairs(length
);
213 for(i
=0; i
<length
; ++i
) {
214 if(trail
==pairs
[i
].trail
) {
216 "gennorm2 error: same round-trip mapping for "
217 "more than 1 code point (e.g., U+%04lX) to U+%04lX + U+%04lX\n",
218 (long)start
, (long)lead
, (long)trail
);
219 exit(U_INVALID_FORMAT_ERROR
);
221 if(trail
<pairs
[i
].trail
) {
226 compositions
->insertElementAt(trail
, 2*i
, errorCode
);
227 compositions
->insertElementAt(start
, 2*i
+1, errorCode
);
230 void Decomposer::rangeHandler(UChar32 start
, UChar32 end
, Norm
&norm
) {
231 if(!norm
.hasMapping()) { return; }
232 const UnicodeString
&m
=*norm
.mapping
;
233 UnicodeString
*decomposed
=nullptr;
234 const UChar
*s
=toUCharPtr(m
.getBuffer());
235 int32_t length
=m
.length();
240 U16_NEXT(s
, i
, length
, c
);
241 if(start
<=c
&& c
<=end
) {
243 "gennorm2 error: U+%04lX maps to itself directly or indirectly\n",
245 exit(U_INVALID_FORMAT_ERROR
);
247 const Norm
&cNorm
=norms
.getNormRef(c
);
248 if(cNorm
.hasMapping()) {
249 if(norm
.mappingType
==Norm::ROUND_TRIP
) {
251 if(cNorm
.mappingType
!=Norm::ROUND_TRIP
) {
254 "U+%04lX's round-trip mapping's starter "
255 "U+%04lX one-way-decomposes, "
256 "not possible in Unicode normalization\n",
257 (long)start
, (long)c
);
258 exit(U_INVALID_FORMAT_ERROR
);
260 uint8_t myTrailCC
=norms
.getCC(m
.char32At(i
));
261 UChar32 cTrailChar
=cNorm
.mapping
->char32At(cNorm
.mapping
->length()-1);
262 uint8_t cTrailCC
=norms
.getCC(cTrailChar
);
263 if(cTrailCC
>myTrailCC
) {
266 "U+%04lX's round-trip mapping's starter "
267 "U+%04lX decomposes and the "
268 "inner/earlier tccc=%hu > outer/following tccc=%hu, "
269 "not possible in Unicode normalization\n",
270 (long)start
, (long)c
,
271 (short)cTrailCC
, (short)myTrailCC
);
272 exit(U_INVALID_FORMAT_ERROR
);
277 "U+%04lX's round-trip mapping's non-starter "
278 "U+%04lX decomposes, "
279 "not possible in Unicode normalization\n",
280 (long)start
, (long)c
);
281 exit(U_INVALID_FORMAT_ERROR
);
284 if(decomposed
==nullptr) {
285 decomposed
=new UnicodeString(m
, 0, prev
);
287 decomposed
->append(*cNorm
.mapping
);
288 } else if(Hangul::isHangul(c
)) {
290 int32_t hangulLength
=Hangul::decompose(c
, buffer
);
291 if(norm
.mappingType
==Norm::ROUND_TRIP
&& prev
!=0) {
294 "U+%04lX's round-trip mapping's non-starter "
295 "U+%04lX decomposes, "
296 "not possible in Unicode normalization\n",
297 (long)start
, (long)c
);
298 exit(U_INVALID_FORMAT_ERROR
);
300 if(decomposed
==nullptr) {
301 decomposed
=new UnicodeString(m
, 0, prev
);
303 decomposed
->append(buffer
, hangulLength
);
304 } else if(decomposed
!=nullptr) {
305 decomposed
->append(m
, prev
, i
-prev
);
308 if(decomposed
!=nullptr) {
309 if(norm
.rawMapping
==nullptr) {
310 // Remember the original mapping when decomposing recursively.
311 norm
.rawMapping
=norm
.mapping
;
315 norm
.mapping
=decomposed
;
316 // Not norm.setMappingCP(); because the original mapping
317 // is most likely to be encodable as a delta.
324 #endif // #if !UCONFIG_NO_NORMALIZATION