2 *******************************************************************************
3 * Copyright (C) 2006-2014, International Business Machines Corporation *
4 * and others. All Rights Reserved. *
5 *******************************************************************************
11 #include "unicode/utypes.h"
12 #include "unicode/uniset.h"
13 #include "unicode/utext.h"
19 class DictionaryMatcher
;
22 /*******************************************************************
23 * DictionaryBreakEngine
27 * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
28 * dictionary to determine language-specific breaks.</p>
30 * <p>After it is constructed a DictionaryBreakEngine may be shared between
31 * threads without synchronization.</p>
33 class DictionaryBreakEngine
: public LanguageBreakEngine
{
36 * The set of characters handled by this engine
43 * The set of break types handled by this engine
50 * <p>Default constructor.</p>
53 DictionaryBreakEngine();
58 * <p>Constructor setting the break types handled.</p>
60 * @param breakTypes A bitmap of types handled by the engine.
62 DictionaryBreakEngine( uint32_t breakTypes
);
65 * <p>Virtual destructor.</p>
67 virtual ~DictionaryBreakEngine();
70 * <p>Indicate whether this engine handles a particular character for
71 * a particular kind of break.</p>
73 * @param c A character which begins a run that the engine might handle
74 * @param breakType The type of text break which the caller wants to determine
75 * @return TRUE if this engine handles the particular character and break type.
78 virtual UBool
handles( UChar32 c
, int32_t breakType
) const;
81 * <p>Find any breaks within a run in the supplied text.</p>
83 * @param text A UText representing the text. The iterator is left at
84 * the end of the run of characters which the engine is capable of handling
85 * that starts from the first (or last) character in the range.
86 * @param startPos The start of the run within the supplied text.
87 * @param endPos The end of the run within the supplied text.
88 * @param reverse Whether the caller is looking for breaks in a reverse direction.
90 * @param breakType The type of break desired, or -1.
91 * @param foundBreaks A UStack that receives the breaks found, if any
92 * @return The number of breaks found.
94 virtual int32_t findBreaks( UText
*text
,
99 UStack
&foundBreaks
) const;
104 * <p>Set the character set handled by this engine.</p>
106 * @param set A UnicodeSet of the set of characters handled by the engine
108 virtual void setCharacters( const UnicodeSet
&set
);
111 * <p>Set the break types handled by this engine.</p>
113 * @param breakTypes A bitmap of types handled by the engine.
115 // virtual void setBreakTypes( uint32_t breakTypes );
118 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
120 * @param text A UText representing the text
121 * @param rangeStart The start of the range of dictionary characters
122 * @param rangeEnd The end of the range of dictionary characters
123 * @param foundBreaks Output UStack that receives the break positions found
124 * @return The number of breaks found
126 virtual int32_t divideUpDictionaryRange( UText
*text
,
129 UStack
&foundBreaks
) const = 0;
133 /*******************************************************************
138 * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
139 * dictionary and heuristics to determine Thai-specific breaks.</p>
141 * <p>After it is constructed a ThaiBreakEngine may be shared between
142 * threads without synchronization.</p>
144 class ThaiBreakEngine
: public DictionaryBreakEngine
{
147 * The set of characters handled by this engine
151 UnicodeSet fThaiWordSet
;
152 UnicodeSet fEndWordSet
;
153 UnicodeSet fBeginWordSet
;
154 UnicodeSet fSuffixSet
;
156 DictionaryMatcher
*fDictionary
;
161 * <p>Constructor that adopts the supplied dictionary.</p>
163 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
166 ThaiBreakEngine(DictionaryMatcher
*adoptDictionary
, UErrorCode
&status
);
169 * <p>Virtual destructor.</p>
171 virtual ~ThaiBreakEngine();
175 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
177 * @param text A UText representing the text
178 * @param rangeStart The start of the range of dictionary characters
179 * @param rangeEnd The end of the range of dictionary characters
180 * @param foundBreaks Output UStack that receives the break positions found
181 * @return The number of breaks found
183 virtual int32_t divideUpDictionaryRange( UText
*text
,
186 UStack
&foundBreaks
) const;
190 /*******************************************************************
195 * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
196 * dictionary and heuristics to determine Lao-specific breaks.</p>
198 * <p>After it is constructed a LaoBreakEngine may be shared between
199 * threads without synchronization.</p>
201 class LaoBreakEngine
: public DictionaryBreakEngine
{
204 * The set of characters handled by this engine
208 UnicodeSet fLaoWordSet
;
209 UnicodeSet fEndWordSet
;
210 UnicodeSet fBeginWordSet
;
212 DictionaryMatcher
*fDictionary
;
217 * <p>Constructor that adopts the supplied dictionary.</p>
219 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
222 LaoBreakEngine(DictionaryMatcher
*adoptDictionary
, UErrorCode
&status
);
225 * <p>Virtual destructor.</p>
227 virtual ~LaoBreakEngine();
231 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
233 * @param text A UText representing the text
234 * @param rangeStart The start of the range of dictionary characters
235 * @param rangeEnd The end of the range of dictionary characters
236 * @param foundBreaks Output UStack that receives the break positions found
237 * @return The number of breaks found
239 virtual int32_t divideUpDictionaryRange( UText
*text
,
242 UStack
&foundBreaks
) const;
246 /*******************************************************************
251 * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
252 * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
254 * <p>After it is constructed a BurmeseBreakEngine may be shared between
255 * threads without synchronization.</p>
257 class BurmeseBreakEngine
: public DictionaryBreakEngine
{
260 * The set of characters handled by this engine
264 UnicodeSet fBurmeseWordSet
;
265 UnicodeSet fEndWordSet
;
266 UnicodeSet fBeginWordSet
;
268 DictionaryMatcher
*fDictionary
;
273 * <p>Constructor that adopts the supplied dictionary.</p>
275 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
278 BurmeseBreakEngine(DictionaryMatcher
*adoptDictionary
, UErrorCode
&status
);
281 * <p>Virtual destructor.</p>
283 virtual ~BurmeseBreakEngine();
287 * <p>Divide up a range of known dictionary characters.</p>
289 * @param text A UText representing the text
290 * @param rangeStart The start of the range of dictionary characters
291 * @param rangeEnd The end of the range of dictionary characters
292 * @param foundBreaks Output UStack that receives the break positions found
293 * @return The number of breaks found
295 virtual int32_t divideUpDictionaryRange( UText
*text
,
298 UStack
&foundBreaks
) const;
302 /*******************************************************************
307 * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
308 * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
310 * <p>After it is constructed a KhmerBreakEngine may be shared between
311 * threads without synchronization.</p>
313 class KhmerBreakEngine
: public DictionaryBreakEngine
{
316 * The set of characters handled by this engine
320 UnicodeSet fKhmerWordSet
;
321 UnicodeSet fEndWordSet
;
322 UnicodeSet fBeginWordSet
;
324 DictionaryMatcher
*fDictionary
;
329 * <p>Constructor that adopts the supplied dictionary.</p>
331 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
334 KhmerBreakEngine(DictionaryMatcher
*adoptDictionary
, UErrorCode
&status
);
337 * <p>Virtual destructor.</p>
339 virtual ~KhmerBreakEngine();
343 * <p>Divide up a range of known dictionary characters.</p>
345 * @param text A UText representing the text
346 * @param rangeStart The start of the range of dictionary characters
347 * @param rangeEnd The end of the range of dictionary characters
348 * @param foundBreaks Output UStack that receives the break positions found
349 * @return The number of breaks found
351 virtual int32_t divideUpDictionaryRange( UText
*text
,
354 UStack
&foundBreaks
) const;
358 #if !UCONFIG_NO_NORMALIZATION
360 /*******************************************************************
364 //indicates language/script that the CjkBreakEngine will handle
371 * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
372 * dictionary with costs associated with each word and
373 * Viterbi decoding to determine CJK-specific breaks.</p>
375 class CjkBreakEngine
: public DictionaryBreakEngine
{
378 * The set of characters handled by this engine
381 UnicodeSet fHangulWordSet
;
382 UnicodeSet fHanWordSet
;
383 UnicodeSet fKatakanaWordSet
;
384 UnicodeSet fHiraganaWordSet
;
386 DictionaryMatcher
*fDictionary
;
387 const Normalizer2
*nfkcNorm2
;
392 * <p>Constructor that adopts the supplied dictionary.</p>
394 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
395 * engine is deleted. The DictionaryMatcher must contain costs for each word
396 * in order for the dictionary to work properly.
398 CjkBreakEngine(DictionaryMatcher
*adoptDictionary
, LanguageType type
, UErrorCode
&status
);
401 * <p>Virtual destructor.</p>
403 virtual ~CjkBreakEngine();
407 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
409 * @param text A UText representing the text
410 * @param rangeStart The start of the range of dictionary characters
411 * @param rangeEnd The end of the range of dictionary characters
412 * @param foundBreaks Output UStack that receives the break positions found
413 * @return The number of breaks found
415 virtual int32_t divideUpDictionaryRange( UText
*text
,
418 UStack
&foundBreaks
) const;