]>
git.saurik.com Git - apple/icu.git/blob - icuSources/tools/gennorm2/norms.cpp
   1 // © 2017 and later: Unicode, Inc. and others. 
   2 // License & terms of use: http://www.unicode.org/copyright.html 
   5 // created: 2017jun04 Markus W. Scherer 
   6 // (pulled out of n2builder.cpp) 
   8 #include "unicode/utypes.h" 
  10 #if !UCONFIG_NO_NORMALIZATION 
  14 #include "unicode/errorcode.h" 
  15 #include "unicode/umutablecptrie.h" 
  16 #include "unicode/unistr.h" 
  17 #include "unicode/utf16.h" 
  18 #include "normalizer2impl.h" 
  // Appends code point c with canonical combining class cc to the buffer.
  // Each entry is stored as (c<<8)|cc. If cc is 0, the buffer is empty, or the
  // last entry has a class <= cc, the entry is written at the end. Otherwise a
  // slot is opened further back, after the last starter or after the last entry
  // whose class is <= cc, by shifting the later entries up by one.
  // NOTE(review): this listing omits several lines of the function (the embedded
  // line numbers skip): the guard around the fLastStarterIndex update, the
  // initialization of i, the body of the while loop and the store into the
  // opened slot are not visible here.
  25 void BuilderReorderingBuffer::append(UChar32 c
, uint8_t cc
) { 
  26     if(cc
==0 || fLength
==0 || ccAt(fLength
-1)<=cc
) { 
  28             fLastStarterIndex
=fLength
; 
  30         fArray
[fLength
++]=(c
<<8)|cc
; 
  33     // Let this character bubble back to its canonical order. 
  35     while(i
>fLastStarterIndex 
&& ccAt(i
)>cc
) { 
  38     ++i
;  // after the last starter or prevCC<=cc 
  39     // Move this and the following characters forward one to make space. 
  // The copy runs from the top index down to i+1 so that no entry is
  // overwritten before it has been moved.
  40     for(int32_t j
=fLength
; i
<j
; --j
) { 
  41         fArray
[j
]=fArray
[j
-1]; 
  // Appends the code point of every buffer entry, in buffer order, to dest.
  // NOTE(review): the line between the signature and the loop is not shown in
  // this listing, so whether dest is emptied first cannot be confirmed here.
  48 void BuilderReorderingBuffer::toString(UnicodeString 
&dest
) const { 
  50     for(int32_t i
=0; i
<fLength
; ++i
) { 
  51         dest
.append(charAt(i
)); 
  // Looks up the composite for this Norm combined with the trailing code point
  // trail. Scans the composition pairs linearly and returns the composite of
  // the pair whose trail equals the argument.
  // NOTE(review): the declaration of length, the body of the trail<pairs[i].trail
  // branch and the return value for the no-match case are not shown in this
  // listing. The less-than test suggests the pairs are sorted by trail and the
  // scan stops early - confirm against the full file.
  55 UChar32 
Norm::combine(UChar32 trail
) const { 
  57     const CompositionPair 
*pairs
=getCompositionPairs(length
); 
  58     for(int32_t i
=0; i
<length
; ++i
) { 
  59         if(trail
==pairs
[i
].trail
) { 
  60             return pairs
[i
].composite
; 
  62         if(trail
<pairs
[i
].trail
) { 
  // Constructor: opens the mutable code point trie normTrie and the utm memory
  // block normMem that holds the Norm structs, then marks the struct that norms
  // points to as Norm::INERT.
  // errorCode receives any failure from umutablecptrie_open().
  // NOTE(review): the line that assigns norms (between the two comments below)
  // is not shown in this listing.
  69 Norms::Norms(UErrorCode 
&errorCode
) { 
  // Both trie values passed here are 0 (initial value and error value in the
  // umutablecptrie_open API), matching the inert Norm at index 0.
  70     normTrie 
= umutablecptrie_open(0, 0, &errorCode
); 
  // Items in this block have size sizeof(Norm); the numeric arguments are
  // presumably the initial and maximum item counts - TODO confirm against utm_open.
  71     normMem
=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm
)); 
  72     // Default "inert" Norm struct at index 0. Practically immutable. 
  74     norms
->type
=Norm::INERT
; 
  // Cleanup code: closes normTrie and deletes the mapping, rawMapping and
  // compositions objects of every Norm struct counted in normMem.
  // NOTE(review): the enclosing function header is not shown in this listing;
  // presumably this is the body of the Norms destructor - confirm.
  78     umutablecptrie_close(normTrie
); 
  79     int32_t normsLength
=utm_countItems(normMem
); 
  // The loop starts at 1 and so skips the struct at index 0.
  80     for(int32_t i
=1; i
<normsLength
; ++i
) { 
  81         delete norms
[i
].mapping
; 
  82         delete norms
[i
].rawMapping
; 
  83         delete norms
[i
].compositions
; 
  // Allocates one more Norm item from normMem and refreshes the norms base
  // pointer, because the allocation may have moved the memory block.
  // NOTE(review): the return statement is not shown in this listing;
  // presumably the new item p is returned.
  88 Norm 
*Norms::allocNorm() { 
  89     Norm 
*p
=(Norm 
*)utm_alloc(normMem
); 
  90     norms
=(Norm 
*)utm_getStart(normMem
);  // in case it got reallocated 
  // Non-const lookup of the Norm for code point c. Reads the trie value stored
  // for c into i.
  // NOTE(review): the rest of the function, which turns i into the returned
  // pointer, is not shown in this listing.
  94 Norm 
*Norms::getNorm(UChar32 c
) { 
  95     uint32_t i 
= umutablecptrie_get(normTrie
, c
); 
 // Const overload of getNorm(): reads the trie value stored for code point c
 // into i.
 // NOTE(review): the rest of the function, which turns i into the returned
 // pointer, is not shown in this listing.
 102 const Norm 
*Norms::getNorm(UChar32 c
) const { 
 103     uint32_t i 
= umutablecptrie_get(normTrie
, c
); 
 // Returns a reference to the Norm struct for code point c: the trie value
 // stored for c is used directly as an index into the norms array.
 110 const Norm 
&Norms::getNormRef(UChar32 c
) const { 
 111     return norms
[umutablecptrie_get(normTrie
, c
)]; 
 // Returns a Norm struct for code point c, registering one in the trie when
 // needed: the index of p within the norms array (p - norms) is stored as the
 // trie value for c.
 // NOTE(review): the lines between the lookup and the trie update are not
 // shown in this listing; presumably an existing nonzero index is returned
 // directly and p comes from allocNorm() otherwise - confirm.
 114 Norm 
*Norms::createNorm(UChar32 c
) { 
 115     uint32_t i
=umutablecptrie_get(normTrie
, c
); 
 121         IcuToolErrorCode 
errorCode("gennorm2/createNorm()"); 
 122         umutablecptrie_set(normTrie
, c
, (uint32_t)(p 
- norms
), errorCode
); 
 // Feeds every code point of mapping, together with its combining class from
 // getCC(), into buffer. If the buffer reports that it reordered something,
 // the buffer contents are written back into mapping via toString().
 // NOTE(review): the loop header and the declarations of i and c are not shown
 // in this listing.
 127 void Norms::reorder(UnicodeString 
&mapping
, BuilderReorderingBuffer 
&buffer
) const { 
 128     int32_t length
=mapping
.length(); 
 129     U_ASSERT(length
<=Normalizer2Impl::MAPPING_LENGTH_MASK
); 
 130     const char16_t *s
=mapping
.getBuffer(); 
 // U16_NEXT reads one code point from s at offset i and advances i.
 134         U16_NEXT(s
, i
, length
, c
); 
 135         buffer
.append(c
, getCC(c
)); 
 137     if(buffer
.didReorder()) { 
 138         buffer
.toString(mapping
); 
 // Checks whether norm has a composition pair whose trail code point has a
 // combining class strictly between lowCC and highCC. The pairs are scanned
 // only when highCC-lowCC is at least 2, since otherwise no value lies
 // strictly between the two bounds.
 // NOTE(review): the declaration of length and all return statements are not
 // shown in this listing; presumably a hit returns true and everything else
 // returns false - confirm.
 142 UBool 
Norms::combinesWithCCBetween(const Norm 
&norm
, uint8_t lowCC
, int32_t highCC
) const { 
 143     if((highCC
-lowCC
)>=2) { 
 145         const CompositionPair 
*pairs
=norm
.getCompositionPairs(length
); 
 146         for(int32_t i
=0; i
<length
; ++i
) { 
 147             uint8_t trailCC
=getCC(pairs
[i
].trail
); 
 148             if(lowCC
<trailCC 
&& trailCC
<highCC
) { 
 // Walks normTrie range by range, beginning at code point 0, and passes each
 // range start..end together with the Norm struct norms[i] to
 // e.rangeHandler(). The loop ends when umutablecptrie_getRange() returns a
 // negative end.
 // NOTE(review): the declaration of i, the condition guarding the handler
 // call and the statement that advances start are not shown in this listing.
 156 void Norms::enumRanges(Enumerator 
&e
) { 
 157     UChar32 start 
= 0, end
; 
 159     while ((end 
= umutablecptrie_getRange(normTrie
, start
, UCPMAP_RANGE_NORMAL
, 0, 
 160                                           nullptr, nullptr, &i
)) >= 0) { 
 162             e
.rangeHandler(start
, end
, norms
[i
]); 
 // Out-of-line definition of the Enumerator destructor; it does nothing.
 168 Norms::Enumerator::~Enumerator() {} 
 // Range callback that records round-trip mappings as composition data.
 // Ranges whose Norm is not of mappingType ROUND_TRIP are ignored. For a
 // round-trip mapping, the first code point of the mapping is the lead and the
 // last one is the trail; the trail is flagged as combinesBack and the pair
 // (trail, start) is inserted into the compositions list of the lead.
 // Inconsistent data is reported with an error message and the tool exits
 // with U_INVALID_FORMAT_ERROR.
 // NOTE(review): several lines are not shown in this listing, among them the
 // conditions that guard the first two error exits, the fprintf calls that
 // own the message strings, and the declarations of i and length.
 170 void CompositionBuilder::rangeHandler(UChar32 start
, UChar32 end
, Norm 
&norm
) { 
 171     if(norm
.mappingType
!=Norm::ROUND_TRIP
) { return; } 
 174                 "gennorm2 error: same round-trip mapping for " 
 175                 "more than 1 code point U+%04lX..U+%04lX\n", 
 176                 (long)start
, (long)end
); 
 177         exit(U_INVALID_FORMAT_ERROR
); 
 182                 "U+%04lX has a round-trip mapping and ccc!=0, " 
 183                 "not possible in Unicode normalization\n", 
 185         exit(U_INVALID_FORMAT_ERROR
); 
 187     // setRoundTripMapping() ensured that there are exactly two code points. 
 188     const UnicodeString 
&m
=*norm
.mapping
; 
 189     UChar32 lead
=m
.char32At(0); 
 190     UChar32 trail
=m
.char32At(m
.length()-1); 
 // The lead of a round-trip mapping must have combining class 0.
 191     if(norms
.getCC(lead
)!=0) { 
 194                 "U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0, " 
 195                 "not possible in Unicode normalization\n", 
 196                 (long)start
, (long)lead
); 
 197         exit(U_INVALID_FORMAT_ERROR
); 
 199     // Flag for trailing character. 
 200     norms
.createNorm(trail
)->combinesBack
=TRUE
; 
 201     // Insert (trail, composite) pair into compositions list for the lead character. 
 202     IcuToolErrorCode 
errorCode("gennorm2/addComposition()"); 
 203     Norm 
*leadNorm
=norms
.createNorm(lead
); 
 204     UVector32 
*compositions
=leadNorm
->compositions
; 
 // The list is created on first use for this lead.
 206     if(compositions
==nullptr) { 
 207         compositions
=leadNorm
->compositions
=new UVector32(errorCode
); 
 208         i
=0;  // "insert" the first pair at index 0 
 210         // Insertion sort, and check for duplicate trail characters. 
 212         const CompositionPair 
*pairs
=leadNorm
->getCompositionPairs(length
); 
 213         for(i
=0; i
<length
; ++i
) { 
 214             if(trail
==pairs
[i
].trail
) { 
 216                         "gennorm2 error: same round-trip mapping for " 
 217                         "more than 1 code point (e.g., U+%04lX) to U+%04lX + U+%04lX\n", 
 218                         (long)start
, (long)lead
, (long)trail
); 
 219                 exit(U_INVALID_FORMAT_ERROR
); 
 221             if(trail
<pairs
[i
].trail
) { 
 // Each pair takes two consecutive vector elements: the trail at 2*i and the
 // composite (start) at 2*i+1.
 226     compositions
->insertElementAt(trail
, 2*i
, errorCode
); 
 227     compositions
->insertElementAt(start
, 2*i
+1, errorCode
); 
 // Range callback that decomposes the mapping of norm one level further.
 // Ranges without a mapping are ignored. Each code point c of the mapping is
 // examined: if c has a mapping of its own, that mapping is appended to a
 // newly built string; if c is a Hangul syllable, its Hangul decomposition is
 // appended; otherwise the original code units are copied once a new string
 // exists. If a new string was built, it replaces norm.mapping, and the
 // previous mapping is kept in norm.rawMapping when that was still null.
 // Data that cannot occur in Unicode normalization is reported with an error
 // message and the tool exits with U_INVALID_FORMAT_ERROR.
 // NOTE(review): several lines are not shown in this listing, among them the
 // loop header, the declarations of i, c, prev and buffer, the fprintf calls
 // that own the message strings, and the else branches. prev appears to be
 // the offset where the current code point starts - confirm.
 230 void Decomposer::rangeHandler(UChar32 start
, UChar32 end
, Norm 
&norm
) { 
 231     if(!norm
.hasMapping()) { return; } 
 232     const UnicodeString 
&m
=*norm
.mapping
; 
 233     UnicodeString 
*decomposed
=nullptr; 
 234     const UChar 
*s
=toUCharPtr(m
.getBuffer()); 
 235     int32_t length
=m
.length(); 
 240         U16_NEXT(s
, i
, length
, c
); 
 // A mapping that contains a code point of its own range would map to itself.
 241         if(start
<=c 
&& c
<=end
) { 
 243                     "gennorm2 error: U+%04lX maps to itself directly or indirectly\n", 
 245             exit(U_INVALID_FORMAT_ERROR
); 
 247         const Norm 
&cNorm
=norms
.getNormRef(c
); 
 248         if(cNorm
.hasMapping()) { 
 249             if(norm
.mappingType
==Norm::ROUND_TRIP
) { 
 251                     if(cNorm
.mappingType
!=Norm::ROUND_TRIP
) { 
 254                                 "U+%04lX's round-trip mapping's starter " 
 255                                 "U+%04lX one-way-decomposes, " 
 256                                 "not possible in Unicode normalization\n", 
 257                                 (long)start
, (long)c
); 
 258                         exit(U_INVALID_FORMAT_ERROR
); 
 // Compare the combining class of the code point at offset i of this mapping
 // with that of the last code point of the mapping of c.
 260                     uint8_t myTrailCC
=norms
.getCC(m
.char32At(i
)); 
 261                     UChar32 cTrailChar
=cNorm
.mapping
->char32At(cNorm
.mapping
->length()-1); 
 262                     uint8_t cTrailCC
=norms
.getCC(cTrailChar
); 
 263                     if(cTrailCC
>myTrailCC
) { 
 266                                 "U+%04lX's round-trip mapping's starter " 
 267                                 "U+%04lX decomposes and the " 
 268                                 "inner/earlier tccc=%hu > outer/following tccc=%hu, " 
 269                                 "not possible in Unicode normalization\n", 
 270                                 (long)start
, (long)c
, 
 271                                 (short)cTrailCC
, (short)myTrailCC
); 
 272                         exit(U_INVALID_FORMAT_ERROR
); 
 277                             "U+%04lX's round-trip mapping's non-starter " 
 278                             "U+%04lX decomposes, " 
 279                             "not possible in Unicode normalization\n", 
 280                             (long)start
, (long)c
); 
 281                     exit(U_INVALID_FORMAT_ERROR
); 
 // Start the new string lazily with the first prev code units of the
 // original mapping.
 284             if(decomposed
==nullptr) { 
 285                 decomposed
=new UnicodeString(m
, 0, prev
); 
 287             decomposed
->append(*cNorm
.mapping
); 
 288         } else if(Hangul::isHangul(c
)) { 
 290             int32_t hangulLength
=Hangul::decompose(c
, buffer
); 
 291             if(norm
.mappingType
==Norm::ROUND_TRIP 
&& prev
!=0) { 
 294                         "U+%04lX's round-trip mapping's non-starter " 
 295                         "U+%04lX decomposes, " 
 296                         "not possible in Unicode normalization\n", 
 297                         (long)start
, (long)c
); 
 298                 exit(U_INVALID_FORMAT_ERROR
); 
 300             if(decomposed
==nullptr) { 
 301                 decomposed
=new UnicodeString(m
, 0, prev
); 
 303             decomposed
->append(buffer
, hangulLength
); 
 // c does not decompose: copy its code units unchanged, but only if a new
 // string is already being built.
 304         } else if(decomposed
!=nullptr) { 
 305             decomposed
->append(m
, prev
, i
-prev
); 
 308     if(decomposed
!=nullptr) { 
 309         if(norm
.rawMapping
==nullptr) { 
 310             // Remember the original mapping when decomposing recursively. 
 311             norm
.rawMapping
=norm
.mapping
; 
 315         norm
.mapping
=decomposed
; 
 316         // Not  norm.setMappingCP();  because the original mapping 
 317         // is most likely to be encodable as a delta. 
 324 #endif // #if !UCONFIG_NO_NORMALIZATION