]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/dictbe.h
ICU-57131.0.1.tar.gz
[apple/icu.git] / icuSources / common / dictbe.h
1 /**
2 *******************************************************************************
3 * Copyright (C) 2006-2014, International Business Machines Corporation *
4 * and others. All Rights Reserved. *
5 *******************************************************************************
6 */
7
8 #ifndef DICTBE_H
9 #define DICTBE_H
10
11 #include "unicode/utypes.h"
12 #include "unicode/uniset.h"
13 #include "unicode/utext.h"
14
15 #include "brkeng.h"
16
17 U_NAMESPACE_BEGIN
18
19 class DictionaryMatcher;
20 class Normalizer2;
21
22 /*******************************************************************
23 * DictionaryBreakEngine
24 */
25
26 /**
27 * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
28 * dictionary to determine language-specific breaks. It is abstract: divideUpDictionaryRange() is pure virtual.</p>
29 *
30 * <p>After it is constructed a DictionaryBreakEngine may be shared between
31 * threads without synchronization.</p>
32 */
33 class DictionaryBreakEngine : public LanguageBreakEngine {
34 private:
35 /**
36 * The set of characters handled by this engine (a UnicodeSet).
37 * @internal
38 */
39
40 UnicodeSet fSet;
41
42 /**
43 * Bitmap of the break types handled by this engine.
44 * @internal
45 */
46
47 uint32_t fTypes;
48
49 /**
50 * <p>Default constructor. Declared private, so it is not usable from outside this class.</p>
51 *
52 */
53 DictionaryBreakEngine();
54
55 public:
56
57 /**
58 * <p>Constructor setting the break types handled.</p>
59 *
60 * @param breakTypes A bitmap of types handled by the engine.
61 */
62 DictionaryBreakEngine( uint32_t breakTypes );
63
64 /**
65 * <p>Virtual destructor.</p>
66 */
67 virtual ~DictionaryBreakEngine();
68
69 /**
70 * <p>Indicate whether this engine handles a particular character for
71 * a particular kind of break.</p>
72 *
73 * @param c A character which begins a run that the engine might handle
74 * @param breakType The type of text break which the caller wants to determine
75 * @return TRUE if this engine handles the particular character and break
76 * type.
77 */
78 virtual UBool handles( UChar32 c, int32_t breakType ) const;
79
80 /**
81 * <p>Find any breaks within a run in the supplied text.</p>
82 *
83 * @param text A UText representing the text. The iterator is left at
84 * the end of the run of characters which the engine is capable of handling
85 * that starts from the first (or last) character in the range.
86 * @param startPos The start of the run within the supplied text.
87 * @param endPos The end of the run within the supplied text.
88 * @param reverse Whether the caller is looking for breaks in a reverse
89 * direction.
90 * @param breakType The type of break desired, or -1.
91 * @param foundBreaks A UStack, passed by reference, for the breaks found, if any
92 * @return The number of breaks found.
93 */
94 virtual int32_t findBreaks( UText *text,
95 int32_t startPos,
96 int32_t endPos,
97 UBool reverse,
98 int32_t breakType,
99 UStack &foundBreaks ) const;
100
101 protected:
102
103 /**
104 * <p>Set the character set handled by this engine.</p>
105 *
106 * @param set A UnicodeSet of the set of characters handled by the engine
107 */
108 virtual void setCharacters( const UnicodeSet &set );
109
110 /**
111 * <p>Set the break types handled by this engine. (The declaration below is currently commented out.)</p>
112 *
113 * @param breakTypes A bitmap of types handled by the engine.
114 */
115 // virtual void setBreakTypes( uint32_t breakTypes );
116
117 /**
118 * <p>Divide up a range of known dictionary characters handled by this break engine. Pure virtual here.</p>
119 *
120 * @param text A UText representing the text
121 * @param rangeStart The start of the range of dictionary characters
122 * @param rangeEnd The end of the range of dictionary characters
123 * @param foundBreaks A UStack, passed by reference, for the output break positions
124 * @return The number of breaks found
125 */
126 virtual int32_t divideUpDictionaryRange( UText *text,
127 int32_t rangeStart,
128 int32_t rangeEnd,
129 UStack &foundBreaks ) const = 0;
130
131 };
132
133 /*******************************************************************
134 * ThaiBreakEngine
135 */
136
137 /**
138 * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
139 * dictionary and heuristics to determine Thai-specific breaks.</p>
140 *
141 * <p>After it is constructed a ThaiBreakEngine may be shared between
142 * threads without synchronization.</p>
143 */
144 class ThaiBreakEngine : public DictionaryBreakEngine {
145 private:
146 /**
147 * Character sets used by this engine, followed by the pointer to its adopted dictionary.
148 * @internal
149 */
150
151 UnicodeSet fThaiWordSet;
152 UnicodeSet fEndWordSet;
153 UnicodeSet fBeginWordSet;
154 UnicodeSet fSuffixSet;
155 UnicodeSet fMarkSet;
156 DictionaryMatcher *fDictionary;
157
158 public:
159
160 /**
161 * <p>Constructor taking the dictionary to adopt and an error code.</p>
162 *
163 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
164 * @param status A UErrorCode passed by reference; presumably reports construction failure (confirm in the implementation).
165 */
166 ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
167
168 /**
169 * <p>Virtual destructor.</p>
170 */
171 virtual ~ThaiBreakEngine();
172
173 protected:
174 /**
175 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
176 *
177 * @param text A UText representing the text
178 * @param rangeStart The start of the range of dictionary characters
179 * @param rangeEnd The end of the range of dictionary characters
180 * @param foundBreaks A UStack, passed by reference, for the output break positions
181 * @return The number of breaks found
182 */
183 virtual int32_t divideUpDictionaryRange( UText *text,
184 int32_t rangeStart,
185 int32_t rangeEnd,
186 UStack &foundBreaks ) const;
187
188 };
189
190 /*******************************************************************
191 * LaoBreakEngine
192 */
193
194 /**
195 * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
196 * dictionary and heuristics to determine Lao-specific breaks.</p>
197 *
198 * <p>After it is constructed a LaoBreakEngine may be shared between
199 * threads without synchronization.</p>
200 */
201 class LaoBreakEngine : public DictionaryBreakEngine {
202 private:
203 /**
204 * Character sets used by this engine, followed by the pointer to its adopted dictionary.
205 * @internal
206 */
207
208 UnicodeSet fLaoWordSet;
209 UnicodeSet fEndWordSet;
210 UnicodeSet fBeginWordSet;
211 UnicodeSet fMarkSet;
212 DictionaryMatcher *fDictionary;
213
214 public:
215
216 /**
217 * <p>Constructor taking the dictionary to adopt and an error code.</p>
218 *
219 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
220 * @param status A UErrorCode passed by reference; presumably reports construction failure (confirm in the implementation).
221 */
222 LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
223
224 /**
225 * <p>Virtual destructor.</p>
226 */
227 virtual ~LaoBreakEngine();
228
229 protected:
230 /**
231 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
232 *
233 * @param text A UText representing the text
234 * @param rangeStart The start of the range of dictionary characters
235 * @param rangeEnd The end of the range of dictionary characters
236 * @param foundBreaks A UStack, passed by reference, for the output break positions
237 * @return The number of breaks found
238 */
239 virtual int32_t divideUpDictionaryRange( UText *text,
240 int32_t rangeStart,
241 int32_t rangeEnd,
242 UStack &foundBreaks ) const;
243
244 };
245
246 /*******************************************************************
247 * BurmeseBreakEngine
248 */
249
250 /**
251 * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
252 * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
253 *
254 * <p>After it is constructed a BurmeseBreakEngine may be shared between
255 * threads without synchronization.</p>
256 */
257 class BurmeseBreakEngine : public DictionaryBreakEngine {
258 private:
259 /**
260 * Character sets used by this engine, followed by the pointer to its adopted dictionary.
261 * @internal
262 */
263
264 UnicodeSet fBurmeseWordSet;
265 UnicodeSet fEndWordSet;
266 UnicodeSet fBeginWordSet;
267 UnicodeSet fMarkSet;
268 DictionaryMatcher *fDictionary;
269
270 public:
271
272 /**
273 * <p>Constructor taking the dictionary to adopt and an error code.</p>
274 *
275 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
276 * @param status A UErrorCode passed by reference; presumably reports construction failure (confirm in the implementation).
277 */
278 BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
279
280 /**
281 * <p>Virtual destructor.</p>
282 */
283 virtual ~BurmeseBreakEngine();
284
285 protected:
286 /**
287 * <p>Divide up a range of known dictionary characters.</p>
288 *
289 * @param text A UText representing the text
290 * @param rangeStart The start of the range of dictionary characters
291 * @param rangeEnd The end of the range of dictionary characters
292 * @param foundBreaks A UStack, passed by reference, for the output break positions
293 * @return The number of breaks found
294 */
295 virtual int32_t divideUpDictionaryRange( UText *text,
296 int32_t rangeStart,
297 int32_t rangeEnd,
298 UStack &foundBreaks ) const;
299
300 };
301
302 /*******************************************************************
303 * KhmerBreakEngine
304 */
305
306 /**
307 * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
308 * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
309 *
310 * <p>After it is constructed a KhmerBreakEngine may be shared between
311 * threads without synchronization.</p>
312 */
313 class KhmerBreakEngine : public DictionaryBreakEngine {
314 private:
315 /**
316 * Character sets used by this engine, followed by the pointer to its adopted dictionary.
317 * @internal
318 */
319
320 UnicodeSet fKhmerWordSet;
321 UnicodeSet fEndWordSet;
322 UnicodeSet fBeginWordSet;
323 UnicodeSet fMarkSet;
324 DictionaryMatcher *fDictionary;
325
326 public:
327
328 /**
329 * <p>Constructor taking the dictionary to adopt and an error code.</p>
330 *
331 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
332 * @param status A UErrorCode passed by reference; presumably reports construction failure (confirm in the implementation).
333 */
334 KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
335
336 /**
337 * <p>Virtual destructor.</p>
338 */
339 virtual ~KhmerBreakEngine();
340
341 protected:
342 /**
343 * <p>Divide up a range of known dictionary characters.</p>
344 *
345 * @param text A UText representing the text
346 * @param rangeStart The start of the range of dictionary characters
347 * @param rangeEnd The end of the range of dictionary characters
348 * @param foundBreaks A UStack, passed by reference, for the output break positions
349 * @return The number of breaks found
350 */
351 virtual int32_t divideUpDictionaryRange( UText *text,
352 int32_t rangeStart,
353 int32_t rangeEnd,
354 UStack &foundBreaks ) const;
355
356 };
357
358 #if !UCONFIG_NO_NORMALIZATION
359
360 /*******************************************************************
361 * CjkBreakEngine
362 */
363
364 // Indicates the language/script that a CjkBreakEngine will handle; passed as the `type` argument of its constructor.
365 enum LanguageType {
366 kKorean,
367 kChineseJapanese
368 };
369
370 /**
371 * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
372 * dictionary with costs associated with each word and
373 * Viterbi decoding to determine CJK-specific breaks.</p>
374 */
375 class CjkBreakEngine : public DictionaryBreakEngine {
376 protected:
377 /**
378 * Character sets used by this engine, followed by pointers to its adopted dictionary and a Normalizer2.
379 * @internal
380 */
381 UnicodeSet fHangulWordSet;
382 UnicodeSet fHanWordSet;
383 UnicodeSet fKatakanaWordSet;
384 UnicodeSet fHiraganaWordSet;
385
386 DictionaryMatcher *fDictionary;
387 const Normalizer2 *nfkcNorm2;
388
389 public:
390
391 /**
392 * <p>Constructor taking the dictionary to adopt, the language type and an error code.</p>
393 *
394 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted. The DictionaryMatcher must contain costs for each word in order for the dictionary to work properly.
395 * @param type The LanguageType (kKorean or kChineseJapanese) indicating the language/script this engine will handle.
396 * @param status A UErrorCode passed by reference; presumably reports construction failure (confirm in the implementation).
397 */
398 CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
399
400 /**
401 * <p>Virtual destructor.</p>
402 */
403 virtual ~CjkBreakEngine();
404
405 protected:
406 /**
407 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
408 *
409 * @param text A UText representing the text
410 * @param rangeStart The start of the range of dictionary characters
411 * @param rangeEnd The end of the range of dictionary characters
412 * @param foundBreaks A UStack, passed by reference, for the output break positions
413 * @return The number of breaks found
414 */
415 virtual int32_t divideUpDictionaryRange( UText *text,
416 int32_t rangeStart,
417 int32_t rangeEnd,
418 UStack &foundBreaks ) const;
419
420 };
421
422 #endif
423
424 U_NAMESPACE_END
425
426 /* DICTBE_H */
427 #endif