]>
git.saurik.com Git - apple/icu.git/blob - icuSources/tools/gennorm2/norms.cpp
1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
5 // created: 2017jun04 Markus W. Scherer
6 // (pulled out of n2builder.cpp)
8 #include "unicode/utypes.h"
10 #if !UCONFIG_NO_NORMALIZATION
14 #include "unicode/errorcode.h"
15 #include "unicode/unistr.h"
16 #include "unicode/utf16.h"
17 #include "normalizer2impl.h"
25 void BuilderReorderingBuffer::append(UChar32 c
, uint8_t cc
) {
26 if(cc
==0 || fLength
==0 || ccAt(fLength
-1)<=cc
) {
28 fLastStarterIndex
=fLength
;
30 fArray
[fLength
++]=(c
<<8)|cc
;
33 // Let this character bubble back to its canonical order.
35 while(i
>fLastStarterIndex
&& ccAt(i
)>cc
) {
38 ++i
; // after the last starter or prevCC<=cc
39 // Move this and the following characters forward one to make space.
40 for(int32_t j
=fLength
; i
<j
; --j
) {
41 fArray
[j
]=fArray
[j
-1];
48 void BuilderReorderingBuffer::toString(UnicodeString
&dest
) const {
50 for(int32_t i
=0; i
<fLength
; ++i
) {
51 dest
.append(charAt(i
));
55 UChar32
Norm::combine(UChar32 trail
) const {
57 const CompositionPair
*pairs
=getCompositionPairs(length
);
58 for(int32_t i
=0; i
<length
; ++i
) {
59 if(trail
==pairs
[i
].trail
) {
60 return pairs
[i
].composite
;
62 if(trail
<pairs
[i
].trail
) {
69 Norms::Norms(UErrorCode
&errorCode
) {
70 normTrie
=utrie2_open(0, 0, &errorCode
);
71 normMem
=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm
));
72 // Default "inert" Norm struct at index 0. Practically immutable.
74 norms
->type
=Norm::INERT
;
78 utrie2_close(normTrie
);
79 int32_t normsLength
=utm_countItems(normMem
);
80 for(int32_t i
=1; i
<normsLength
; ++i
) {
81 delete norms
[i
].mapping
;
82 delete norms
[i
].rawMapping
;
83 delete norms
[i
].compositions
;
88 Norm
*Norms::allocNorm() {
89 Norm
*p
=(Norm
*)utm_alloc(normMem
);
90 norms
=(Norm
*)utm_getStart(normMem
); // in case it got reallocated
94 Norm
*Norms::getNorm(UChar32 c
) {
95 uint32_t i
=utrie2_get32(normTrie
, c
);
102 const Norm
*Norms::getNorm(UChar32 c
) const {
103 uint32_t i
=utrie2_get32(normTrie
, c
);
110 const Norm
&Norms::getNormRef(UChar32 c
) const {
111 return norms
[utrie2_get32(normTrie
, c
)];
114 Norm
*Norms::createNorm(UChar32 c
) {
115 uint32_t i
=utrie2_get32(normTrie
, c
);
121 IcuToolErrorCode
errorCode("gennorm2/createNorm()");
122 utrie2_set32(normTrie
, c
, (uint32_t)(p
-norms
), errorCode
);
127 void Norms::reorder(UnicodeString
&mapping
, BuilderReorderingBuffer
&buffer
) const {
128 int32_t length
=mapping
.length();
129 U_ASSERT(length
<=Normalizer2Impl::MAPPING_LENGTH_MASK
);
130 const char16_t *s
=mapping
.getBuffer();
134 U16_NEXT(s
, i
, length
, c
);
135 buffer
.append(c
, getCC(c
));
137 if(buffer
.didReorder()) {
138 buffer
.toString(mapping
);
142 UBool
Norms::combinesWithCCBetween(const Norm
&norm
, uint8_t lowCC
, int32_t highCC
) const {
143 if((highCC
-lowCC
)>=2) {
145 const CompositionPair
*pairs
=norm
.getCompositionPairs(length
);
146 for(int32_t i
=0; i
<length
; ++i
) {
147 uint8_t trailCC
=getCC(pairs
[i
].trail
);
148 if(lowCC
<trailCC
&& trailCC
<highCC
) {
158 static UBool U_CALLCONV
159 enumRangeHandler(const void *context
, UChar32 start
, UChar32 end
, uint32_t value
) {
160 return ((Norms::Enumerator
*)context
)->rangeHandler(start
, end
, value
);
165 void Norms::enumRanges(Enumerator
&e
) {
166 utrie2_enum(normTrie
, nullptr, enumRangeHandler
, &e
);
169 Norms::Enumerator::~Enumerator() {}
171 UBool
Norms::Enumerator::rangeHandler(UChar32 start
, UChar32 end
, uint32_t value
) {
173 rangeHandler(start
, end
, norms
.getNormRefByIndex(value
));
178 void CompositionBuilder::rangeHandler(UChar32 start
, UChar32 end
, Norm
&norm
) {
179 if(norm
.mappingType
!=Norm::ROUND_TRIP
) { return; }
182 "gennorm2 error: same round-trip mapping for "
183 "more than 1 code point U+%04lX..U+%04lX\n",
184 (long)start
, (long)end
);
185 exit(U_INVALID_FORMAT_ERROR
);
190 "U+%04lX has a round-trip mapping and ccc!=0, "
191 "not possible in Unicode normalization\n",
193 exit(U_INVALID_FORMAT_ERROR
);
195 // setRoundTripMapping() ensured that there are exactly two code points.
196 const UnicodeString
&m
=*norm
.mapping
;
197 UChar32 lead
=m
.char32At(0);
198 UChar32 trail
=m
.char32At(m
.length()-1);
199 if(norms
.getCC(lead
)!=0) {
202 "U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0, "
203 "not possible in Unicode normalization\n",
204 (long)start
, (long)lead
);
205 exit(U_INVALID_FORMAT_ERROR
);
207 // Flag for trailing character.
208 norms
.createNorm(trail
)->combinesBack
=TRUE
;
209 // Insert (trail, composite) pair into compositions list for the lead character.
210 IcuToolErrorCode
errorCode("gennorm2/addComposition()");
211 Norm
*leadNorm
=norms
.createNorm(lead
);
212 UVector32
*compositions
=leadNorm
->compositions
;
214 if(compositions
==nullptr) {
215 compositions
=leadNorm
->compositions
=new UVector32(errorCode
);
216 i
=0; // "insert" the first pair at index 0
218 // Insertion sort, and check for duplicate trail characters.
220 const CompositionPair
*pairs
=leadNorm
->getCompositionPairs(length
);
221 for(i
=0; i
<length
; ++i
) {
222 if(trail
==pairs
[i
].trail
) {
224 "gennorm2 error: same round-trip mapping for "
225 "more than 1 code point (e.g., U+%04lX) to U+%04lX + U+%04lX\n",
226 (long)start
, (long)lead
, (long)trail
);
227 exit(U_INVALID_FORMAT_ERROR
);
229 if(trail
<pairs
[i
].trail
) {
234 compositions
->insertElementAt(trail
, 2*i
, errorCode
);
235 compositions
->insertElementAt(start
, 2*i
+1, errorCode
);
238 void Decomposer::rangeHandler(UChar32 start
, UChar32 end
, Norm
&norm
) {
239 if(!norm
.hasMapping()) { return; }
240 const UnicodeString
&m
=*norm
.mapping
;
241 UnicodeString
*decomposed
=nullptr;
242 const UChar
*s
=toUCharPtr(m
.getBuffer());
243 int32_t length
=m
.length();
248 U16_NEXT(s
, i
, length
, c
);
249 if(start
<=c
&& c
<=end
) {
251 "gennorm2 error: U+%04lX maps to itself directly or indirectly\n",
253 exit(U_INVALID_FORMAT_ERROR
);
255 const Norm
&cNorm
=norms
.getNormRef(c
);
256 if(cNorm
.hasMapping()) {
257 if(norm
.mappingType
==Norm::ROUND_TRIP
) {
259 if(cNorm
.mappingType
!=Norm::ROUND_TRIP
) {
262 "U+%04lX's round-trip mapping's starter "
263 "U+%04lX one-way-decomposes, "
264 "not possible in Unicode normalization\n",
265 (long)start
, (long)c
);
266 exit(U_INVALID_FORMAT_ERROR
);
268 uint8_t myTrailCC
=norms
.getCC(m
.char32At(i
));
269 UChar32 cTrailChar
=cNorm
.mapping
->char32At(cNorm
.mapping
->length()-1);
270 uint8_t cTrailCC
=norms
.getCC(cTrailChar
);
271 if(cTrailCC
>myTrailCC
) {
274 "U+%04lX's round-trip mapping's starter "
275 "U+%04lX decomposes and the "
276 "inner/earlier tccc=%hu > outer/following tccc=%hu, "
277 "not possible in Unicode normalization\n",
278 (long)start
, (long)c
,
279 (short)cTrailCC
, (short)myTrailCC
);
280 exit(U_INVALID_FORMAT_ERROR
);
285 "U+%04lX's round-trip mapping's non-starter "
286 "U+%04lX decomposes, "
287 "not possible in Unicode normalization\n",
288 (long)start
, (long)c
);
289 exit(U_INVALID_FORMAT_ERROR
);
292 if(decomposed
==nullptr) {
293 decomposed
=new UnicodeString(m
, 0, prev
);
295 decomposed
->append(*cNorm
.mapping
);
296 } else if(Hangul::isHangul(c
)) {
298 int32_t hangulLength
=Hangul::decompose(c
, buffer
);
299 if(norm
.mappingType
==Norm::ROUND_TRIP
&& prev
!=0) {
302 "U+%04lX's round-trip mapping's non-starter "
303 "U+%04lX decomposes, "
304 "not possible in Unicode normalization\n",
305 (long)start
, (long)c
);
306 exit(U_INVALID_FORMAT_ERROR
);
308 if(decomposed
==nullptr) {
309 decomposed
=new UnicodeString(m
, 0, prev
);
311 decomposed
->append(buffer
, hangulLength
);
312 } else if(decomposed
!=nullptr) {
313 decomposed
->append(m
, prev
, i
-prev
);
316 if(decomposed
!=nullptr) {
317 if(norm
.rawMapping
==nullptr) {
318 // Remember the original mapping when decomposing recursively.
319 norm
.rawMapping
=norm
.mapping
;
323 norm
.mapping
=decomposed
;
324 // Not norm.setMappingCP(); because the original mapping
325 // is most likely to be encodable as a delta.
332 #endif // #if !UCONFIG_NO_NORMALIZATION