]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/dictbe.h
ICU-59173.0.1.tar.gz
[apple/icu.git] / icuSources / common / dictbe.h
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
73c04bcf
A
3/**
4 *******************************************************************************
b331163b 5 * Copyright (C) 2006-2014, International Business Machines Corporation *
4388f060 6 * and others. All Rights Reserved. *
73c04bcf
A
7 *******************************************************************************
8 */
9
10#ifndef DICTBE_H
11#define DICTBE_H
12
13#include "unicode/utypes.h"
14#include "unicode/uniset.h"
15#include "unicode/utext.h"
16
17#include "brkeng.h"
18
19U_NAMESPACE_BEGIN
20
51004dcb 21class DictionaryMatcher;
b331163b 22class Normalizer2;
73c04bcf
A
23
24/*******************************************************************
25 * DictionaryBreakEngine
26 */
27
28/**
29 * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
30 * dictionary to determine language-specific breaks.</p>
31 *
32 * <p>After it is constructed a DictionaryBreakEngine may be shared between
33 * threads without synchronization.</p>
34 */
35class DictionaryBreakEngine : public LanguageBreakEngine {
36 private:
37 /**
38 * The set of characters handled by this engine
39 * @internal
40 */
41
42 UnicodeSet fSet;
43
44 /**
45 * The set of break types handled by this engine
46 * @internal
47 */
48
49 uint32_t fTypes;
50
51 /**
52 * <p>Default constructor.</p>
53 *
54 */
55 DictionaryBreakEngine();
56
57 public:
58
59 /**
60 * <p>Constructor setting the break types handled.</p>
61 *
62 * @param breakTypes A bitmap of types handled by the engine.
63 */
64 DictionaryBreakEngine( uint32_t breakTypes );
65
66 /**
67 * <p>Virtual destructor.</p>
68 */
69 virtual ~DictionaryBreakEngine();
70
51004dcb
A
71 /**
72 * <p>Indicate whether this engine handles a particular character for
73 * a particular kind of break.</p>
74 *
75 * @param c A character which begins a run that the engine might handle
76 * @param breakType The type of text break which the caller wants to determine
77 * @return TRUE if this engine handles the particular character and break
78 * type.
79 */
73c04bcf
A
80 virtual UBool handles( UChar32 c, int32_t breakType ) const;
81
51004dcb
A
82 /**
83 * <p>Find any breaks within a run in the supplied text.</p>
84 *
85 * @param text A UText representing the text. The iterator is left at
86 * the end of the run of characters which the engine is capable of handling
87 * that starts from the first (or last) character in the range.
88 * @param startPos The start of the run within the supplied text.
89 * @param endPos The end of the run within the supplied text.
90 * @param reverse Whether the caller is looking for breaks in a reverse
91 * direction.
92 * @param breakType The type of break desired, or -1.
93 * @param foundBreaks An allocated C array of the breaks found, if any
94 * @return The number of breaks found.
95 */
73c04bcf
A
96 virtual int32_t findBreaks( UText *text,
97 int32_t startPos,
98 int32_t endPos,
99 UBool reverse,
100 int32_t breakType,
101 UStack &foundBreaks ) const;
102
103 protected:
104
105 /**
106 * <p>Set the character set handled by this engine.</p>
107 *
108 * @param set A UnicodeSet of the set of characters handled by the engine
109 */
46f4442e 110 virtual void setCharacters( const UnicodeSet &set );
73c04bcf
A
111
112 /**
113 * <p>Set the break types handled by this engine.</p>
114 *
115 * @param breakTypes A bitmap of types handled by the engine.
116 */
117// virtual void setBreakTypes( uint32_t breakTypes );
118
119 /**
51004dcb 120 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
73c04bcf
A
121 *
122 * @param text A UText representing the text
123 * @param rangeStart The start of the range of dictionary characters
124 * @param rangeEnd The end of the range of dictionary characters
125 * @param foundBreaks Output of C array of int32_t break positions, or 0
126 * @return The number of breaks found
127 */
128 virtual int32_t divideUpDictionaryRange( UText *text,
129 int32_t rangeStart,
130 int32_t rangeEnd,
131 UStack &foundBreaks ) const = 0;
132
133};
134
135/*******************************************************************
136 * ThaiBreakEngine
137 */
138
139/**
140 * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
51004dcb 141 * dictionary and heuristics to determine Thai-specific breaks.</p>
73c04bcf
A
142 *
143 * <p>After it is constructed a ThaiBreakEngine may be shared between
144 * threads without synchronization.</p>
145 */
146class ThaiBreakEngine : public DictionaryBreakEngine {
147 private:
148 /**
149 * The set of characters handled by this engine
150 * @internal
151 */
152
153 UnicodeSet fThaiWordSet;
154 UnicodeSet fEndWordSet;
155 UnicodeSet fBeginWordSet;
156 UnicodeSet fSuffixSet;
157 UnicodeSet fMarkSet;
51004dcb 158 DictionaryMatcher *fDictionary;
73c04bcf
A
159
160 public:
161
162 /**
163 * <p>Default constructor.</p>
164 *
51004dcb 165 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
73c04bcf
A
166 * engine is deleted.
167 */
51004dcb 168 ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
73c04bcf
A
169
170 /**
171 * <p>Virtual destructor.</p>
172 */
173 virtual ~ThaiBreakEngine();
174
175 protected:
176 /**
51004dcb 177 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
73c04bcf
A
178 *
179 * @param text A UText representing the text
180 * @param rangeStart The start of the range of dictionary characters
181 * @param rangeEnd The end of the range of dictionary characters
182 * @param foundBreaks Output of C array of int32_t break positions, or 0
183 * @return The number of breaks found
184 */
185 virtual int32_t divideUpDictionaryRange( UText *text,
186 int32_t rangeStart,
187 int32_t rangeEnd,
188 UStack &foundBreaks ) const;
189
190};
191
51004dcb 192/*******************************************************************
57a6839d 193 * LaoBreakEngine
51004dcb
A
194 */
195
51004dcb 196/**
57a6839d
A
197 * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
198 * dictionary and heuristics to determine Lao-specific breaks.</p>
199 *
200 * <p>After it is constructed a LaoBreakEngine may be shared between
201 * threads without synchronization.</p>
51004dcb 202 */
57a6839d
A
203class LaoBreakEngine : public DictionaryBreakEngine {
204 private:
51004dcb
A
205 /**
206 * The set of characters handled by this engine
207 * @internal
208 */
51004dcb 209
57a6839d
A
210 UnicodeSet fLaoWordSet;
211 UnicodeSet fEndWordSet;
212 UnicodeSet fBeginWordSet;
213 UnicodeSet fMarkSet;
51004dcb
A
214 DictionaryMatcher *fDictionary;
215
216 public:
217
57a6839d
A
218 /**
219 * <p>Default constructor.</p>
220 *
221 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
222 * engine is deleted.
223 */
224 LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
51004dcb 225
57a6839d
A
226 /**
227 * <p>Virtual destructor.</p>
228 */
229 virtual ~LaoBreakEngine();
51004dcb
A
230
231 protected:
57a6839d
A
232 /**
233 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
234 *
235 * @param text A UText representing the text
236 * @param rangeStart The start of the range of dictionary characters
237 * @param rangeEnd The end of the range of dictionary characters
238 * @param foundBreaks Output of C array of int32_t break positions, or 0
239 * @return The number of breaks found
240 */
51004dcb 241 virtual int32_t divideUpDictionaryRange( UText *text,
57a6839d
A
242 int32_t rangeStart,
243 int32_t rangeEnd,
244 UStack &foundBreaks ) const;
51004dcb
A
245
246};
247
b331163b
A
248/*******************************************************************
249 * BurmeseBreakEngine
250 */
251
252/**
253 * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
254 * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
255 *
256 * <p>After it is constructed a BurmeseBreakEngine may be shared between
257 * threads without synchronization.</p>
258 */
259class BurmeseBreakEngine : public DictionaryBreakEngine {
260 private:
261 /**
262 * The set of characters handled by this engine
263 * @internal
264 */
265
266 UnicodeSet fBurmeseWordSet;
267 UnicodeSet fEndWordSet;
268 UnicodeSet fBeginWordSet;
269 UnicodeSet fMarkSet;
270 DictionaryMatcher *fDictionary;
271
272 public:
273
274 /**
275 * <p>Default constructor.</p>
276 *
277 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
278 * engine is deleted.
279 */
280 BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
281
282 /**
283 * <p>Virtual destructor.</p>
284 */
285 virtual ~BurmeseBreakEngine();
286
287 protected:
288 /**
289 * <p>Divide up a range of known dictionary characters.</p>
290 *
291 * @param text A UText representing the text
292 * @param rangeStart The start of the range of dictionary characters
293 * @param rangeEnd The end of the range of dictionary characters
294 * @param foundBreaks Output of C array of int32_t break positions, or 0
295 * @return The number of breaks found
296 */
297 virtual int32_t divideUpDictionaryRange( UText *text,
298 int32_t rangeStart,
299 int32_t rangeEnd,
300 UStack &foundBreaks ) const;
301
302};
303
4388f060
A
304/*******************************************************************
305 * KhmerBreakEngine
306 */
307
308/**
309 * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
51004dcb 310 * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
4388f060
A
311 *
312 * <p>After it is constructed a KhmerBreakEngine may be shared between
313 * threads without synchronization.</p>
314 */
315class KhmerBreakEngine : public DictionaryBreakEngine {
316 private:
317 /**
318 * The set of characters handled by this engine
319 * @internal
320 */
321
322 UnicodeSet fKhmerWordSet;
323 UnicodeSet fEndWordSet;
324 UnicodeSet fBeginWordSet;
325 UnicodeSet fMarkSet;
51004dcb 326 DictionaryMatcher *fDictionary;
4388f060
A
327
328 public:
329
330 /**
331 * <p>Default constructor.</p>
332 *
51004dcb 333 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
4388f060
A
334 * engine is deleted.
335 */
51004dcb 336 KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
4388f060
A
337
338 /**
339 * <p>Virtual destructor.</p>
340 */
341 virtual ~KhmerBreakEngine();
342
343 protected:
344 /**
345 * <p>Divide up a range of known dictionary characters.</p>
346 *
347 * @param text A UText representing the text
348 * @param rangeStart The start of the range of dictionary characters
349 * @param rangeEnd The end of the range of dictionary characters
350 * @param foundBreaks Output of C array of int32_t break positions, or 0
351 * @return The number of breaks found
352 */
353 virtual int32_t divideUpDictionaryRange( UText *text,
354 int32_t rangeStart,
355 int32_t rangeEnd,
356 UStack &foundBreaks ) const;
357
358};
359
57a6839d
A
360#if !UCONFIG_NO_NORMALIZATION
361
362/*******************************************************************
363 * CjkBreakEngine
364 */
365
/**
 * Indicates the language/script that a CjkBreakEngine will handle.
 */
enum LanguageType {
    /** Korean. */
    kKorean = 0,
    /** Chinese and Japanese. */
    kChineseJapanese = 1
};
371
372/**
373 * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
374 * dictionary with costs associated with each word and
375 * Viterbi decoding to determine CJK-specific breaks.</p>
376 */
377class CjkBreakEngine : public DictionaryBreakEngine {
378 protected:
379 /**
380 * The set of characters handled by this engine
381 * @internal
382 */
383 UnicodeSet fHangulWordSet;
384 UnicodeSet fHanWordSet;
385 UnicodeSet fKatakanaWordSet;
386 UnicodeSet fHiraganaWordSet;
387
b331163b
A
388 DictionaryMatcher *fDictionary;
389 const Normalizer2 *nfkcNorm2;
57a6839d
A
390
391 public:
392
393 /**
394 * <p>Default constructor.</p>
395 *
396 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
397 * engine is deleted. The DictionaryMatcher must contain costs for each word
398 * in order for the dictionary to work properly.
399 */
400 CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
401
402 /**
403 * <p>Virtual destructor.</p>
404 */
405 virtual ~CjkBreakEngine();
406
407 protected:
408 /**
409 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
410 *
411 * @param text A UText representing the text
412 * @param rangeStart The start of the range of dictionary characters
413 * @param rangeEnd The end of the range of dictionary characters
414 * @param foundBreaks Output of C array of int32_t break positions, or 0
415 * @return The number of breaks found
416 */
417 virtual int32_t divideUpDictionaryRange( UText *text,
418 int32_t rangeStart,
419 int32_t rangeEnd,
420 UStack &foundBreaks ) const;
421
422};
423
424#endif
425
73c04bcf
A
426U_NAMESPACE_END
427
428 /* DICTBE_H */
429#endif