]>
Commit | Line | Data |
---|---|---|
0f5d89e8 A |
1 | // © 2017 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
3 | ||
4 | // norms.h | |
5 | // created: 2017jun04 Markus W. Scherer | |
6 | // (pulled out of n2builder.cpp) | |
7 | ||
8 | // Storing & manipulating Normalizer2 builder data. | |
9 | ||
10 | #ifndef __NORMS_H__ | |
11 | #define __NORMS_H__ | |
12 | ||
13 | #include "unicode/utypes.h" | |
14 | ||
15 | #if !UCONFIG_NO_NORMALIZATION | |
16 | ||
17 | #include "unicode/errorcode.h" | |
3d1f044b | 18 | #include "unicode/umutablecptrie.h" |
0f5d89e8 A |
19 | #include "unicode/uniset.h" |
20 | #include "unicode/unistr.h" | |
21 | #include "unicode/utf16.h" | |
22 | #include "normalizer2impl.h" | |
23 | #include "toolutil.h" | |
0f5d89e8 A |
24 | #include "uvectr32.h" |
25 | ||
26 | U_NAMESPACE_BEGIN | |
27 | ||
28 | class BuilderReorderingBuffer { | |
29 | public: | |
30 | BuilderReorderingBuffer() : fLength(0), fLastStarterIndex(-1), fDidReorder(FALSE) {} | |
31 | void reset() { | |
32 | fLength=0; | |
33 | fLastStarterIndex=-1; | |
34 | fDidReorder=FALSE; | |
35 | } | |
36 | int32_t length() const { return fLength; } | |
37 | UBool isEmpty() const { return fLength==0; } | |
38 | int32_t lastStarterIndex() const { return fLastStarterIndex; } | |
39 | UChar32 charAt(int32_t i) const { return fArray[i]>>8; } | |
40 | uint8_t ccAt(int32_t i) const { return (uint8_t)fArray[i]; } | |
41 | UBool didReorder() const { return fDidReorder; } | |
42 | ||
43 | void append(UChar32 c, uint8_t cc); | |
44 | void toString(UnicodeString &dest) const; | |
45 | ||
46 | private: | |
47 | int32_t fArray[Normalizer2Impl::MAPPING_LENGTH_MASK]; | |
48 | int32_t fLength; | |
49 | int32_t fLastStarterIndex; | |
50 | UBool fDidReorder; | |
51 | }; | |
52 | ||
53 | struct CompositionPair { | |
54 | CompositionPair(UChar32 t, UChar32 c) : trail(t), composite(c) {} | |
55 | UChar32 trail, composite; | |
56 | }; | |
57 | ||
58 | struct Norm { | |
59 | enum MappingType { NONE, REMOVED, ROUND_TRIP, ONE_WAY }; | |
60 | ||
61 | UBool hasMapping() const { return mappingType>REMOVED; } | |
62 | ||
63 | // Requires hasMapping() and well-formed mapping. | |
64 | void setMappingCP() { | |
65 | UChar32 c; | |
66 | if(!mapping->isEmpty() && mapping->length()==U16_LENGTH(c=mapping->char32At(0))) { | |
67 | mappingCP=c; | |
68 | } else { | |
69 | mappingCP=U_SENTINEL; | |
70 | } | |
71 | } | |
72 | ||
73 | const CompositionPair *getCompositionPairs(int32_t &length) const { | |
74 | if(compositions==nullptr) { | |
75 | length=0; | |
76 | return nullptr; | |
77 | } else { | |
78 | length=compositions->size()/2; | |
79 | return reinterpret_cast<const CompositionPair *>(compositions->getBuffer()); | |
80 | } | |
81 | } | |
82 | UChar32 combine(UChar32 trail) const; | |
83 | ||
84 | UnicodeString *mapping; | |
85 | UnicodeString *rawMapping; // non-nullptr if the mapping is further decomposed | |
86 | UChar32 mappingCP; // >=0 if mapping to 1 code point | |
87 | int32_t mappingPhase; | |
88 | MappingType mappingType; | |
89 | ||
90 | UVector32 *compositions; // (trail, composite) pairs | |
91 | uint8_t cc, leadCC, trailCC; | |
92 | UBool combinesBack; | |
93 | UBool hasCompBoundaryBefore, hasCompBoundaryAfter; | |
94 | ||
95 | /** | |
96 | * Overall type of normalization properties. | |
97 | * Set after most processing is done. | |
98 | * | |
99 | * Corresponds to the rows in the chart on | |
100 | * http://site.icu-project.org/design/normalization/custom | |
101 | * in numerical (but reverse visual) order. | |
102 | * | |
103 | * YES_NO means composition quick check=yes, decomposition QC=no -- etc. | |
104 | */ | |
105 | enum Type { | |
106 | /** Initial value until most processing is done. */ | |
107 | UNKNOWN, | |
108 | /** No mapping, does not combine, ccc=0. */ | |
109 | INERT, | |
110 | /** Starter, no mapping, has compositions. */ | |
111 | YES_YES_COMBINES_FWD, | |
112 | /** Starter with a round-trip mapping and compositions. */ | |
113 | YES_NO_COMBINES_FWD, | |
114 | /** Starter with a round-trip mapping but no compositions. */ | |
115 | YES_NO_MAPPING_ONLY, | |
116 | /** Has a one-way mapping which is comp-normalized. */ | |
117 | NO_NO_COMP_YES, | |
118 | /** Has a one-way mapping which is not comp-normalized but has a comp boundary before. */ | |
119 | NO_NO_COMP_BOUNDARY_BEFORE, | |
120 | /** Has a one-way mapping which does not have a comp boundary before. */ | |
121 | NO_NO_COMP_NO_MAYBE_CC, | |
122 | /** Has a one-way mapping to the empty string. */ | |
123 | NO_NO_EMPTY, | |
124 | /** Has an algorithmic one-way mapping to a single code point. */ | |
125 | NO_NO_DELTA, | |
126 | /** | |
127 | * Combines both backward and forward, has compositions. | |
128 | * Allowed, but not normally used. | |
129 | */ | |
130 | MAYBE_YES_COMBINES_FWD, | |
131 | /** Combines only backward. */ | |
132 | MAYBE_YES_SIMPLE, | |
133 | /** Non-zero ccc but does not combine backward. */ | |
134 | YES_YES_WITH_CC | |
135 | } type; | |
136 | /** Offset into the type's part of the extra data, or the algorithmic-mapping delta. */ | |
137 | int32_t offset; | |
138 | ||
139 | /** | |
140 | * Error string set by processing functions that do not have access | |
141 | * to the code point, deferred for readable reporting. | |
142 | */ | |
143 | const char *error; | |
144 | }; | |
145 | ||
146 | class Norms { | |
147 | public: | |
148 | Norms(UErrorCode &errorCode); | |
149 | ~Norms(); | |
150 | ||
151 | int32_t length() const { return utm_countItems(normMem); } | |
152 | const Norm &getNormRefByIndex(int32_t i) const { return norms[i]; } | |
153 | Norm &getNormRefByIndex(int32_t i) { return norms[i]; } | |
154 | ||
155 | Norm *allocNorm(); | |
156 | /** Returns an existing Norm unit, or nullptr if c has no data. */ | |
157 | Norm *getNorm(UChar32 c); | |
158 | const Norm *getNorm(UChar32 c) const; | |
159 | /** Returns a Norm unit, creating a new one if necessary. */ | |
160 | Norm *createNorm(UChar32 c); | |
161 | /** Returns an existing Norm unit, or an immutable empty object if c has no data. */ | |
162 | const Norm &getNormRef(UChar32 c) const; | |
163 | uint8_t getCC(UChar32 c) const { return getNormRef(c).cc; } | |
164 | UBool combinesBack(UChar32 c) const { | |
165 | return Hangul::isJamoV(c) || Hangul::isJamoT(c) || getNormRef(c).combinesBack; | |
166 | } | |
167 | ||
168 | void reorder(UnicodeString &mapping, BuilderReorderingBuffer &buffer) const; | |
169 | ||
170 | // int32_t highCC not uint8_t so that we can pass in 256 as the upper limit. | |
171 | UBool combinesWithCCBetween(const Norm &norm, uint8_t lowCC, int32_t highCC) const; | |
172 | ||
173 | class Enumerator { | |
174 | public: | |
175 | Enumerator(Norms &n) : norms(n) {} | |
176 | virtual ~Enumerator(); | |
177 | /** Called for enumerated value!=0. */ | |
178 | virtual void rangeHandler(UChar32 start, UChar32 end, Norm &norm) = 0; | |
0f5d89e8 A |
179 | protected: |
180 | Norms &norms; | |
181 | }; | |
182 | ||
183 | void enumRanges(Enumerator &e); | |
184 | ||
185 | UnicodeSet ccSet, mappingSet; | |
186 | ||
187 | private: | |
188 | Norms(const Norms &other) = delete; | |
189 | Norms &operator=(const Norms &other) = delete; | |
190 | ||
3d1f044b | 191 | UMutableCPTrie *normTrie; |
0f5d89e8 A |
192 | UToolMemory *normMem; |
193 | Norm *norms; | |
194 | }; | |
195 | ||
196 | class CompositionBuilder : public Norms::Enumerator { | |
197 | public: | |
198 | CompositionBuilder(Norms &n) : Norms::Enumerator(n) {} | |
199 | /** Adds a composition mapping for the first character in a round-trip mapping. */ | |
200 | void rangeHandler(UChar32 start, UChar32 end, Norm &norm) U_OVERRIDE; | |
201 | }; | |
202 | ||
203 | class Decomposer : public Norms::Enumerator { | |
204 | public: | |
205 | Decomposer(Norms &n) : Norms::Enumerator(n), didDecompose(FALSE) {} | |
206 | /** Decomposes each character of the current mapping. Sets didDecompose if any. */ | |
207 | void rangeHandler(UChar32 start, UChar32 end, Norm &norm) U_OVERRIDE; | |
208 | UBool didDecompose; | |
209 | }; | |
210 | ||
211 | U_NAMESPACE_END | |
212 | ||
213 | #endif // #if !UCONFIG_NO_NORMALIZATION | |
214 | ||
215 | #endif // __NORMS_H__ |