]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/dictbe.h
ICU-531.30.tar.gz
[apple/icu.git] / icuSources / common / dictbe.h
1 /**
2 *******************************************************************************
3 * Copyright (C) 2006,2012-2013, International Business Machines Corporation *
4 * and others. All Rights Reserved. *
5 *******************************************************************************
6 */
7
8 #ifndef DICTBE_H
9 #define DICTBE_H
10
11 #include "unicode/utypes.h"
12 #include "unicode/uniset.h"
13 #include "unicode/utext.h"
14
15 #include "brkeng.h"
16
17 U_NAMESPACE_BEGIN
18
19 class DictionaryMatcher;
20
21 /*******************************************************************
22 * DictionaryBreakEngine
23 */
24
25 /**
26 * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
27 * dictionary to determine language-specific breaks.</p>
28 *
29 * <p>After it is constructed a DictionaryBreakEngine may be shared between
30 * threads without synchronization.</p>
31 */
32 class DictionaryBreakEngine : public LanguageBreakEngine {
33 private:
34 /**
35 * The set of characters handled by this engine
36 * @internal
37 */
38
39 UnicodeSet fSet;
40
41 /**
42 * The set of break types handled by this engine
43 * @internal
44 */
45
46 uint32_t fTypes;
47
48 /**
49 * <p>Default constructor.</p>
50 *
51 */
52 DictionaryBreakEngine();
53
54 public:
55
56 /**
57 * <p>Constructor setting the break types handled.</p>
58 *
59 * @param breakTypes A bitmap of types handled by the engine.
60 */
61 DictionaryBreakEngine( uint32_t breakTypes );
62
63 /**
64 * <p>Virtual destructor.</p>
65 */
66 virtual ~DictionaryBreakEngine();
67
68 /**
69 * <p>Indicate whether this engine handles a particular character for
70 * a particular kind of break.</p>
71 *
72 * @param c A character which begins a run that the engine might handle
73 * @param breakType The type of text break which the caller wants to determine
74 * @return TRUE if this engine handles the particular character and break
75 * type.
76 */
77 virtual UBool handles( UChar32 c, int32_t breakType ) const;
78
79 /**
80 * <p>Find any breaks within a run in the supplied text.</p>
81 *
82 * @param text A UText representing the text. The iterator is left at
83 * the end of the run of characters which the engine is capable of handling
84 * that starts from the first (or last) character in the range.
85 * @param startPos The start of the run within the supplied text.
86 * @param endPos The end of the run within the supplied text.
87 * @param reverse Whether the caller is looking for breaks in a reverse
88 * direction.
89 * @param breakType The type of break desired, or -1.
90 * @param foundBreaks An allocated C array of the breaks found, if any
91 * @return The number of breaks found.
92 */
93 virtual int32_t findBreaks( UText *text,
94 int32_t startPos,
95 int32_t endPos,
96 UBool reverse,
97 int32_t breakType,
98 UStack &foundBreaks ) const;
99
100 protected:
101
102 /**
103 * <p>Set the character set handled by this engine.</p>
104 *
105 * @param set A UnicodeSet of the set of characters handled by the engine
106 */
107 virtual void setCharacters( const UnicodeSet &set );
108
109 /**
110 * <p>Set the break types handled by this engine.</p>
111 *
112 * @param breakTypes A bitmap of types handled by the engine.
113 */
114 // virtual void setBreakTypes( uint32_t breakTypes );
115
116 /**
117 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
118 *
119 * @param text A UText representing the text
120 * @param rangeStart The start of the range of dictionary characters
121 * @param rangeEnd The end of the range of dictionary characters
122 * @param foundBreaks Output of C array of int32_t break positions, or 0
123 * @return The number of breaks found
124 */
125 virtual int32_t divideUpDictionaryRange( UText *text,
126 int32_t rangeStart,
127 int32_t rangeEnd,
128 UStack &foundBreaks ) const = 0;
129
130 };
131
132 /*******************************************************************
133 * ThaiBreakEngine
134 */
135
136 /**
137 * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
138 * dictionary and heuristics to determine Thai-specific breaks.</p>
139 *
140 * <p>After it is constructed a ThaiBreakEngine may be shared between
141 * threads without synchronization.</p>
142 */
143 class ThaiBreakEngine : public DictionaryBreakEngine {
144 private:
145 /**
146 * The set of characters handled by this engine
147 * @internal
148 */
149
150 UnicodeSet fThaiWordSet;
151 UnicodeSet fEndWordSet;
152 UnicodeSet fBeginWordSet;
153 UnicodeSet fSuffixSet;
154 UnicodeSet fMarkSet;
155 DictionaryMatcher *fDictionary;
156
157 public:
158
159 /**
160 * <p>Default constructor.</p>
161 *
162 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
163 * engine is deleted.
164 */
165 ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
166
167 /**
168 * <p>Virtual destructor.</p>
169 */
170 virtual ~ThaiBreakEngine();
171
172 protected:
173 /**
174 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
175 *
176 * @param text A UText representing the text
177 * @param rangeStart The start of the range of dictionary characters
178 * @param rangeEnd The end of the range of dictionary characters
179 * @param foundBreaks Output of C array of int32_t break positions, or 0
180 * @return The number of breaks found
181 */
182 virtual int32_t divideUpDictionaryRange( UText *text,
183 int32_t rangeStart,
184 int32_t rangeEnd,
185 UStack &foundBreaks ) const;
186
187 };
188
189 /*******************************************************************
190 * LaoBreakEngine
191 */
192
193 /**
194 * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
195 * dictionary and heuristics to determine Lao-specific breaks.</p>
196 *
197 * <p>After it is constructed a LaoBreakEngine may be shared between
198 * threads without synchronization.</p>
199 */
200 class LaoBreakEngine : public DictionaryBreakEngine {
201 private:
202 /**
203 * The set of characters handled by this engine
204 * @internal
205 */
206
207 UnicodeSet fLaoWordSet;
208 UnicodeSet fEndWordSet;
209 UnicodeSet fBeginWordSet;
210 UnicodeSet fMarkSet;
211 DictionaryMatcher *fDictionary;
212
213 public:
214
215 /**
216 * <p>Default constructor.</p>
217 *
218 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
219 * engine is deleted.
220 */
221 LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
222
223 /**
224 * <p>Virtual destructor.</p>
225 */
226 virtual ~LaoBreakEngine();
227
228 protected:
229 /**
230 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
231 *
232 * @param text A UText representing the text
233 * @param rangeStart The start of the range of dictionary characters
234 * @param rangeEnd The end of the range of dictionary characters
235 * @param foundBreaks Output of C array of int32_t break positions, or 0
236 * @return The number of breaks found
237 */
238 virtual int32_t divideUpDictionaryRange( UText *text,
239 int32_t rangeStart,
240 int32_t rangeEnd,
241 UStack &foundBreaks ) const;
242
243 };
244
245 /*******************************************************************
246 * KhmerBreakEngine
247 */
248
249 /**
250 * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
251 * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
252 *
253 * <p>After it is constructed a KhmerBreakEngine may be shared between
254 * threads without synchronization.</p>
255 */
256 class KhmerBreakEngine : public DictionaryBreakEngine {
257 private:
258 /**
259 * The set of characters handled by this engine
260 * @internal
261 */
262
263 UnicodeSet fKhmerWordSet;
264 UnicodeSet fEndWordSet;
265 UnicodeSet fBeginWordSet;
266 UnicodeSet fMarkSet;
267 DictionaryMatcher *fDictionary;
268
269 public:
270
271 /**
272 * <p>Default constructor.</p>
273 *
274 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
275 * engine is deleted.
276 */
277 KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
278
279 /**
280 * <p>Virtual destructor.</p>
281 */
282 virtual ~KhmerBreakEngine();
283
284 protected:
285 /**
286 * <p>Divide up a range of known dictionary characters.</p>
287 *
288 * @param text A UText representing the text
289 * @param rangeStart The start of the range of dictionary characters
290 * @param rangeEnd The end of the range of dictionary characters
291 * @param foundBreaks Output of C array of int32_t break positions, or 0
292 * @return The number of breaks found
293 */
294 virtual int32_t divideUpDictionaryRange( UText *text,
295 int32_t rangeStart,
296 int32_t rangeEnd,
297 UStack &foundBreaks ) const;
298
299 };
300
301 #if !UCONFIG_NO_NORMALIZATION
302
/*******************************************************************
 * CjkBreakEngine
 */

/**
 * Indicates the language/script that a CjkBreakEngine will handle.
 */
enum LanguageType {
    kKorean,            /**< Korean */
    kChineseJapanese    /**< Chinese and Japanese */
};
312
313 /**
314 * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
315 * dictionary with costs associated with each word and
316 * Viterbi decoding to determine CJK-specific breaks.</p>
317 */
318 class CjkBreakEngine : public DictionaryBreakEngine {
319 protected:
320 /**
321 * The set of characters handled by this engine
322 * @internal
323 */
324 UnicodeSet fHangulWordSet;
325 UnicodeSet fHanWordSet;
326 UnicodeSet fKatakanaWordSet;
327 UnicodeSet fHiraganaWordSet;
328
329 DictionaryMatcher *fDictionary;
330
331 public:
332
333 /**
334 * <p>Default constructor.</p>
335 *
336 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
337 * engine is deleted. The DictionaryMatcher must contain costs for each word
338 * in order for the dictionary to work properly.
339 */
340 CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
341
342 /**
343 * <p>Virtual destructor.</p>
344 */
345 virtual ~CjkBreakEngine();
346
347 protected:
348 /**
349 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
350 *
351 * @param text A UText representing the text
352 * @param rangeStart The start of the range of dictionary characters
353 * @param rangeEnd The end of the range of dictionary characters
354 * @param foundBreaks Output of C array of int32_t break positions, or 0
355 * @return The number of breaks found
356 */
357 virtual int32_t divideUpDictionaryRange( UText *text,
358 int32_t rangeStart,
359 int32_t rangeEnd,
360 UStack &foundBreaks ) const;
361
362 };
363
364 #endif
365
366 U_NAMESPACE_END
367
368 /* DICTBE_H */
369 #endif