]>
Commit | Line | Data |
---|---|---|
f3c0d7a5 A |
1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
73c04bcf A |
3 | /** |
4 | ******************************************************************************* | |
b331163b | 5 | * Copyright (C) 2006-2014, International Business Machines Corporation * |
4388f060 | 6 | * and others. All Rights Reserved. * |
73c04bcf A |
7 | ******************************************************************************* |
8 | */ | |
9 | ||
10 | #ifndef DICTBE_H | |
11 | #define DICTBE_H | |
12 | ||
13 | #include "unicode/utypes.h" | |
14 | #include "unicode/uniset.h" | |
15 | #include "unicode/utext.h" | |
16 | ||
17 | #include "brkeng.h" | |
18 | ||
19 | U_NAMESPACE_BEGIN | |
20 | ||
51004dcb | 21 | class DictionaryMatcher; |
b331163b | 22 | class Normalizer2; |
73c04bcf A |
23 | |
24 | /******************************************************************* | |
25 | * DictionaryBreakEngine | |
26 | */ | |
27 | ||
28 | /** | |
29 | * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a | |
30 | * dictionary to determine language-specific breaks.</p> | |
31 | * | |
32 | * <p>After it is constructed a DictionaryBreakEngine may be shared between | |
33 | * threads without synchronization.</p> | |
34 | */ | |
35 | class DictionaryBreakEngine : public LanguageBreakEngine { | |
36 | private: | |
37 | /** | |
38 | * The set of characters handled by this engine | |
39 | * @internal | |
40 | */ | |
41 | ||
42 | UnicodeSet fSet; | |
43 | ||
44 | /** | |
45 | * A bitmap of the break types handled by this engine | |
46 | * @internal | |
47 | */ | |
48 | ||
49 | uint32_t fTypes; | |
50 | ||
51 | /** | |
52 | * <p>Default constructor. Declared in the private section of the class.</p> | |
53 | * | |
54 | */ | |
55 | DictionaryBreakEngine(); | |
56 | ||
57 | public: | |
58 | ||
59 | /** | |
60 | * <p>Constructor setting the break types handled.</p> | |
61 | * | |
62 | * @param breakTypes A bitmap of types handled by the engine. | |
63 | */ | |
64 | DictionaryBreakEngine( uint32_t breakTypes ); | |
65 | ||
66 | /** | |
67 | * <p>Virtual destructor.</p> | |
68 | */ | |
69 | virtual ~DictionaryBreakEngine(); | |
70 | ||
51004dcb A |
71 | /** |
72 | * <p>Indicate whether this engine handles a particular character for | |
73 | * a particular kind of break.</p> | |
74 | * | |
75 | * @param c A character which begins a run that the engine might handle | |
76 | * @param breakType The type of text break which the caller wants to determine | |
77 | * @return TRUE if this engine handles the particular character and break | |
78 | * type. | |
79 | */ | |
73c04bcf A |
80 | virtual UBool handles( UChar32 c, int32_t breakType ) const; |
81 | ||
51004dcb A |
82 | /** |
83 | * <p>Find any breaks within a run in the supplied text.</p> | |
84 | * | |
85 | * @param text A UText representing the text. The iterator is left at | |
86 | * the end of the run of characters which the engine is capable of handling | |
87 | * that starts from the first (or last) character in the range. | |
88 | * @param startPos The start of the run within the supplied text. | |
89 | * @param endPos The end of the run within the supplied text. | |
90 | * @param reverse Whether the caller is looking for breaks in a reverse | |
91 | * direction. | |
92 | * @param breakType The type of break desired, or -1. | |
93 | * @param foundBreaks A UStack, passed by reference, for the breaks found, if any | |
94 | * @return The number of breaks found. | |
95 | */ | |
73c04bcf A |
96 | virtual int32_t findBreaks( UText *text, |
97 | int32_t startPos, | |
98 | int32_t endPos, | |
99 | UBool reverse, | |
100 | int32_t breakType, | |
101 | UStack &foundBreaks ) const; | |
102 | ||
103 | protected: | |
104 | ||
105 | /** | |
106 | * <p>Set the character set handled by this engine.</p> | |
107 | * | |
108 | * @param set A UnicodeSet of the set of characters handled by the engine | |
109 | */ | |
46f4442e | 110 | virtual void setCharacters( const UnicodeSet &set ); |
73c04bcf A |
111 | |
112 | /** | |
113 | * <p>Set the break types handled by this engine.</p> | |
114 | * NOTE: the setBreakTypes declaration below is currently commented out. | |
115 | * @param breakTypes A bitmap of types handled by the engine. | |
116 | */ | |
117 | // virtual void setBreakTypes( uint32_t breakTypes ); | |
118 | ||
119 | /** | |
51004dcb | 120 | * <p>Divide up a range of known dictionary characters handled by this break engine.</p> |
73c04bcf A |
121 | * Pure virtual; implemented by the concrete engine subclasses. |
122 | * @param text A UText representing the text | |
123 | * @param rangeStart The start of the range of dictionary characters | |
124 | * @param rangeEnd The end of the range of dictionary characters | |
125 | * @param foundBreaks Output: a UStack, passed by reference, of break positions | |
126 | * @return The number of breaks found | |
127 | */ | |
128 | virtual int32_t divideUpDictionaryRange( UText *text, | |
129 | int32_t rangeStart, | |
130 | int32_t rangeEnd, | |
131 | UStack &foundBreaks ) const = 0; | |
132 | ||
133 | }; | |
134 | ||
135 | /******************************************************************* | |
136 | * ThaiBreakEngine | |
137 | */ | |
138 | ||
139 | /** | |
140 | * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a | |
51004dcb | 141 | * dictionary and heuristics to determine Thai-specific breaks.</p> |
73c04bcf A |
142 | * |
143 | * <p>After it is constructed a ThaiBreakEngine may be shared between | |
144 | * threads without synchronization.</p> | |
145 | */ | |
146 | class ThaiBreakEngine : public DictionaryBreakEngine { | |
147 | private: | |
148 | /** | |
149 | * The set of characters handled by this engine | |
150 | * @internal | |
151 | */ | |
152 | ||
153 | UnicodeSet fThaiWordSet; | |
154 | UnicodeSet fEndWordSet; | |
155 | UnicodeSet fBeginWordSet; | |
156 | UnicodeSet fSuffixSet; | |
157 | UnicodeSet fMarkSet; | |
51004dcb | 158 | DictionaryMatcher *fDictionary; |
73c04bcf A |
159 | |
160 | public: | |
161 | ||
162 | /** | |
163 | * <p>Constructor.</p> | |
164 | * | |
51004dcb | 165 | * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted. |
73c04bcf A |
166 | * @param status A UErrorCode, passed by reference. NOTE(review): presumably set on failure - confirm. |
167 | */ | |
51004dcb | 168 | ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); |
73c04bcf A |
169 | |
170 | /** | |
171 | * <p>Virtual destructor.</p> | |
172 | */ | |
173 | virtual ~ThaiBreakEngine(); | |
174 | ||
175 | protected: | |
176 | /** | |
51004dcb | 177 | * <p>Divide up a range of known dictionary characters handled by this break engine.</p> |
73c04bcf A |
178 | * |
179 | * @param text A UText representing the text | |
180 | * @param rangeStart The start of the range of dictionary characters | |
181 | * @param rangeEnd The end of the range of dictionary characters | |
182 | * @param foundBreaks Output: a UStack, passed by reference, of break positions | |
183 | * @return The number of breaks found | |
184 | */ | |
185 | virtual int32_t divideUpDictionaryRange( UText *text, | |
186 | int32_t rangeStart, | |
187 | int32_t rangeEnd, | |
188 | UStack &foundBreaks ) const; | |
189 | ||
190 | }; | |
191 | ||
51004dcb | 192 | /******************************************************************* |
57a6839d | 193 | * LaoBreakEngine |
51004dcb A |
194 | */ |
195 | ||
51004dcb | 196 | /** |
57a6839d A |
197 | * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a |
198 | * dictionary and heuristics to determine Lao-specific breaks.</p> | |
199 | * | |
200 | * <p>After it is constructed a LaoBreakEngine may be shared between | |
201 | * threads without synchronization.</p> | |
51004dcb | 202 | */ |
57a6839d A |
203 | class LaoBreakEngine : public DictionaryBreakEngine { |
204 | private: | |
51004dcb A |
205 | /** |
206 | * The set of characters handled by this engine | |
207 | * @internal | |
208 | */ | |
51004dcb | 209 | |
57a6839d A |
210 | UnicodeSet fLaoWordSet; |
211 | UnicodeSet fEndWordSet; | |
212 | UnicodeSet fBeginWordSet; | |
213 | UnicodeSet fMarkSet; | |
51004dcb A |
214 | DictionaryMatcher *fDictionary; |
215 | ||
216 | public: | |
217 | ||
57a6839d A |
218 | /** |
219 | * <p>Constructor.</p> | |
220 | * | |
221 | * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted. | |
222 | * @param status A UErrorCode, passed by reference. NOTE(review): presumably set on failure - confirm. | |
223 | */ | |
224 | LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); | |
51004dcb | 225 | |
57a6839d A |
226 | /** |
227 | * <p>Virtual destructor.</p> | |
228 | */ | |
229 | virtual ~LaoBreakEngine(); | |
51004dcb A |
230 | |
231 | protected: | |
57a6839d A |
232 | /** |
233 | * <p>Divide up a range of known dictionary characters handled by this break engine.</p> | |
234 | * | |
235 | * @param text A UText representing the text | |
236 | * @param rangeStart The start of the range of dictionary characters | |
237 | * @param rangeEnd The end of the range of dictionary characters | |
238 | * @param foundBreaks Output: a UStack, passed by reference, of break positions | |
239 | * @return The number of breaks found | |
240 | */ | |
51004dcb | 241 | virtual int32_t divideUpDictionaryRange( UText *text, |
57a6839d A |
242 | int32_t rangeStart, |
243 | int32_t rangeEnd, | |
244 | UStack &foundBreaks ) const; | |
51004dcb A |
245 | |
246 | }; | |
247 | ||
b331163b A |
248 | /******************************************************************* |
249 | * BurmeseBreakEngine | |
250 | */ | |
251 | ||
252 | /** | |
253 | * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a | |
254 | * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p> | |
255 | * | |
256 | * <p>After it is constructed a BurmeseBreakEngine may be shared between | |
257 | * threads without synchronization.</p> | |
258 | */ | |
259 | class BurmeseBreakEngine : public DictionaryBreakEngine { | |
260 | private: | |
261 | /** | |
262 | * The set of characters handled by this engine | |
263 | * @internal | |
264 | */ | |
265 | ||
266 | UnicodeSet fBurmeseWordSet; | |
267 | UnicodeSet fEndWordSet; | |
268 | UnicodeSet fBeginWordSet; | |
269 | UnicodeSet fMarkSet; | |
270 | DictionaryMatcher *fDictionary; | |
271 | ||
272 | public: | |
273 | ||
274 | /** | |
275 | * <p>Constructor.</p> | |
276 | * | |
277 | * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted. | |
278 | * @param status A UErrorCode, passed by reference. NOTE(review): presumably set on failure - confirm. | |
279 | */ | |
280 | BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); | |
281 | ||
282 | /** | |
283 | * <p>Virtual destructor.</p> | |
284 | */ | |
285 | virtual ~BurmeseBreakEngine(); | |
286 | ||
287 | protected: | |
288 | /** | |
289 | * <p>Divide up a range of known dictionary characters handled by this break engine.</p> | |
290 | * | |
291 | * @param text A UText representing the text | |
292 | * @param rangeStart The start of the range of dictionary characters | |
293 | * @param rangeEnd The end of the range of dictionary characters | |
294 | * @param foundBreaks Output: a UStack, passed by reference, of break positions | |
295 | * @return The number of breaks found | |
296 | */ | |
297 | virtual int32_t divideUpDictionaryRange( UText *text, | |
298 | int32_t rangeStart, | |
299 | int32_t rangeEnd, | |
300 | UStack &foundBreaks ) const; | |
301 | ||
302 | }; | |
303 | ||
4388f060 A |
304 | /******************************************************************* |
305 | * KhmerBreakEngine | |
306 | */ | |
307 | ||
308 | /** | |
309 | * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a | |
51004dcb | 310 | * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p> |
4388f060 A |
311 | * |
312 | * <p>After it is constructed a KhmerBreakEngine may be shared between | |
313 | * threads without synchronization.</p> | |
314 | */ | |
315 | class KhmerBreakEngine : public DictionaryBreakEngine { | |
316 | private: | |
317 | /** | |
318 | * The set of characters handled by this engine | |
319 | * @internal | |
320 | */ | |
321 | ||
322 | UnicodeSet fKhmerWordSet; | |
323 | UnicodeSet fEndWordSet; | |
324 | UnicodeSet fBeginWordSet; | |
325 | UnicodeSet fMarkSet; | |
51004dcb | 326 | DictionaryMatcher *fDictionary; |
4388f060 A |
327 | |
328 | public: | |
329 | ||
330 | /** | |
331 | * <p>Constructor.</p> | |
332 | * | |
51004dcb | 333 | * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted. |
4388f060 A |
334 | * @param status A UErrorCode, passed by reference. NOTE(review): presumably set on failure - confirm. |
335 | */ | |
51004dcb | 336 | KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status); |
4388f060 A |
337 | |
338 | /** | |
339 | * <p>Virtual destructor.</p> | |
340 | */ | |
341 | virtual ~KhmerBreakEngine(); | |
342 | ||
343 | protected: | |
344 | /** | |
345 | * <p>Divide up a range of known dictionary characters handled by this break engine.</p> | |
346 | * | |
347 | * @param text A UText representing the text | |
348 | * @param rangeStart The start of the range of dictionary characters | |
349 | * @param rangeEnd The end of the range of dictionary characters | |
350 | * @param foundBreaks Output: a UStack, passed by reference, of break positions | |
351 | * @return The number of breaks found | |
352 | */ | |
353 | virtual int32_t divideUpDictionaryRange( UText *text, | |
354 | int32_t rangeStart, | |
355 | int32_t rangeEnd, | |
356 | UStack &foundBreaks ) const; | |
357 | ||
358 | }; | |
359 | ||
57a6839d A |
360 | #if !UCONFIG_NO_NORMALIZATION |
361 | ||
362 | /******************************************************************* | |
363 | * CjkBreakEngine | |
364 | */ | |
365 | ||
366 | // Indicates the language/script that a CjkBreakEngine will handle. | |
367 | enum LanguageType { | |
368 | kKorean, | |
369 | kChineseJapanese | |
370 | }; | |
371 | ||
372 | /** | |
373 | * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a | |
374 | * dictionary with costs associated with each word and | |
375 | * Viterbi decoding to determine CJK-specific breaks.</p> | |
376 | */ | |
377 | class CjkBreakEngine : public DictionaryBreakEngine { | |
378 | protected: | |
379 | /** | |
380 | * The set of characters handled by this engine | |
381 | * @internal | |
382 | */ | |
383 | UnicodeSet fHangulWordSet; | |
384 | UnicodeSet fHanWordSet; | |
385 | UnicodeSet fKatakanaWordSet; | |
386 | UnicodeSet fHiraganaWordSet; | |
387 | ||
b331163b A |
388 | DictionaryMatcher *fDictionary; |
389 | const Normalizer2 *nfkcNorm2; | |
57a6839d A |
390 | |
391 | public: | |
392 | ||
393 | /** | |
394 | * <p>Constructor.</p> | |
395 | * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted. | |
396 | * The DictionaryMatcher must contain costs for each word for the dictionary to work properly. | |
397 | * @param type The language/script (a LanguageType value) that this engine will handle. | |
398 | * @param status A UErrorCode, passed by reference. NOTE(review): presumably set on failure - confirm. | |
399 | */ | |
400 | CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status); | |
401 | ||
402 | /** | |
403 | * <p>Virtual destructor.</p> | |
404 | */ | |
405 | virtual ~CjkBreakEngine(); | |
406 | ||
407 | protected: | |
408 | /** | |
409 | * <p>Divide up a range of known dictionary characters handled by this break engine.</p> | |
410 | * | |
411 | * @param text A UText representing the text | |
412 | * @param rangeStart The start of the range of dictionary characters | |
413 | * @param rangeEnd The end of the range of dictionary characters | |
414 | * @param foundBreaks Output: a UStack, passed by reference, of break positions | |
415 | * @return The number of breaks found | |
416 | */ | |
417 | virtual int32_t divideUpDictionaryRange( UText *text, | |
418 | int32_t rangeStart, | |
419 | int32_t rangeEnd, | |
420 | UStack &foundBreaks ) const; | |
421 | ||
422 | }; | |
423 | ||
424 | #endif | |
425 | ||
73c04bcf A |
426 | U_NAMESPACE_END |
427 | ||
428 | /* DICTBE_H */ | |
429 | #endif |