2 *******************************************************************************
3 * Copyright (C) 2006,2012-2013, International Business Machines Corporation *
4 * and others. All Rights Reserved. *
5 *******************************************************************************
11 #include "unicode/utypes.h"
12 #include "unicode/uniset.h"
13 #include "unicode/utext.h"
// Forward declaration only: in the visible lines DictionaryMatcher is used
// solely through pointers (the fDictionary members and the adoptDictionary
// constructor parameters), so its full definition is not needed here.
19 class DictionaryMatcher
;
21 /*******************************************************************
22 * DictionaryBreakEngine
26 * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
27 * dictionary to determine language-specific breaks.</p>
29 * <p>After it is constructed a DictionaryBreakEngine may be shared between
30 * threads without synchronization.</p>
// DictionaryBreakEngine: abstract base for dictionary-driven break engines.
// It derives publicly from LanguageBreakEngine and cannot be instantiated
// directly because divideUpDictionaryRange() is declared pure virtual (= 0)
// at the end of the class.
//
// NOTE(review): this text looks like it was recovered from a line-numbered
// listing. The leading numbers are fused into each line, and the comment
// delimiters, access specifiers, data-member declarations and the closing
// "};" are not present. Restore them from the original header before
// attempting to compile.
// NOTE(review): LanguageBreakEngine and UStack are not declared by any of the
// includes visible in this excerpt; presumably another header supplies them.
32 class DictionaryBreakEngine
: public LanguageBreakEngine
{
// NOTE(review): the two descriptions below refer to data members (a character
// set and a break-type bitmap) whose declarations are not visible here.
35 * The set of characters handled by this engine
42 * The set of break types handled by this engine
49 * <p>Default constructor.</p>
52 DictionaryBreakEngine();
57 * <p>Constructor setting the break types handled.</p>
59 * @param breakTypes A bitmap of types handled by the engine.
61 DictionaryBreakEngine( uint32_t breakTypes
);
64 * <p>Virtual destructor.</p>
66 virtual ~DictionaryBreakEngine();
69 * <p>Indicate whether this engine handles a particular character for
70 * a particular kind of break.</p>
72 * @param c A character which begins a run that the engine might handle
73 * @param breakType The type of text break which the caller wants to determine
74 * @return TRUE if this engine handles the particular character and break
// NOTE(review): the @return sentence above is cut off (the embedded numbering
// skips from 74 to 77); presumably it ends with "type." - confirm upstream.
// handles() is a const query: it cannot modify the engine.
77 virtual UBool
handles( UChar32 c
, int32_t breakType
) const;
80 * <p>Find any breaks within a run in the supplied text.</p>
82 * @param text A UText representing the text. The iterator is left at
83 * the end of the run of characters which the engine is capable of handling
84 * that starts from the first (or last) character in the range.
85 * @param startPos The start of the run within the supplied text.
86 * @param endPos The end of the run within the supplied text.
87 * @param reverse Whether the caller is looking for breaks in a reverse
89 * @param breakType The type of break desired, or -1.
90 * @param foundBreaks An allocated C array of the breaks found, if any
91 * @return The number of breaks found.
// NOTE(review): the documentation lists startPos, endPos, reverse and
// breakType, but only text and foundBreaks survive in the declaration below
// (the embedded numbering skips from 93 to 98), so the intervening parameter
// lines were presumably lost. Do not treat this as the real signature.
// NOTE(review): the @param reverse sentence is also cut off (87 -> 89);
// presumably "...in a reverse direction."
// NOTE(review): foundBreaks is described as "an allocated C array", yet the
// declared type is a UStack reference; the description looks stale.
93 virtual int32_t findBreaks( UText
*text
,
98 UStack
&foundBreaks
) const;
103 * <p>Set the character set handled by this engine.</p>
105 * @param set A UnicodeSet of the set of characters handled by the engine
// setCharacters() is the only non-const member function declared in this
// class, i.e. the only one permitted to modify the engine.
107 virtual void setCharacters( const UnicodeSet
&set
);
110 * <p>Set the break types handled by this engine.</p>
112 * @param breakTypes A bitmap of types handled by the engine.
// The setter documented above is commented out, so as shown the break types
// can only be supplied through the uint32_t constructor.
114 // virtual void setBreakTypes( uint32_t breakTypes );
117 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
119 * @param text A UText representing the text
120 * @param rangeStart The start of the range of dictionary characters
121 * @param rangeEnd The end of the range of dictionary characters
122 * @param foundBreaks Output of C array of int32_t break positions, or 0
123 * @return The number of breaks found
// Pure virtual: every concrete engine must provide this.
// NOTE(review): rangeStart and rangeEnd are documented but missing from the
// declaration below (embedded numbering skips from 125 to 128); presumably
// those parameter lines were lost. As in findBreaks, foundBreaks is a UStack
// reference rather than the "C array ... or 0" the text describes.
125 virtual int32_t divideUpDictionaryRange( UText
*text
,
128 UStack
&foundBreaks
) const = 0;
132 /*******************************************************************
137 * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
138 * dictionary and heuristics to determine Thai-specific breaks.</p>
140 * <p>After it is constructed a ThaiBreakEngine may be shared between
141 * threads without synchronization.</p>
// ThaiBreakEngine: DictionaryBreakEngine subclass for Thai text. It declares
// four UnicodeSet members, a DictionaryMatcher pointer, a two-argument
// constructor, a virtual destructor and divideUpDictionaryRange().
//
// NOTE(review): this text looks like it was recovered from a line-numbered
// listing. Leading numbers are fused into each line, and comment delimiters,
// access specifiers and the closing "};" are missing. Restore them from the
// original header before compiling.
143 class ThaiBreakEngine
: public DictionaryBreakEngine
{
146 * The set of characters handled by this engine
// Only the first set is covered by the description above; the roles of
// fEndWordSet, fBeginWordSet and fSuffixSet are not documented in this excerpt.
150 UnicodeSet fThaiWordSet
;
151 UnicodeSet fEndWordSet
;
152 UnicodeSet fBeginWordSet
;
153 UnicodeSet fSuffixSet
;
// Raw pointer to the dictionary. Per the constructor documentation below the
// engine adopts (takes ownership of) it; the destructor body that would
// release it is not visible here.
155 DictionaryMatcher
*fDictionary
;
160 * <p>Default constructor.</p>
162 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
// NOTE(review): labelled "Default constructor", but the constructor below
// requires two arguments, so it is not a default constructor.
// NOTE(review): the @param sentence is cut off (embedded numbering skips from
// 162 to 165); presumably it ends "...engine is deleted."
// status is passed by non-const reference; presumably it reports construction
// failures to the caller - confirm against the implementation.
165 ThaiBreakEngine(DictionaryMatcher
*adoptDictionary
, UErrorCode
&status
);
168 * <p>Virtual destructor.</p>
170 virtual ~ThaiBreakEngine();
174 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
176 * @param text A UText representing the text
177 * @param rangeStart The start of the range of dictionary characters
178 * @param rangeEnd The end of the range of dictionary characters
179 * @param foundBreaks Output of C array of int32_t break positions, or 0
180 * @return The number of breaks found
// Thai implementation of the hook that DictionaryBreakEngine declares pure
// virtual.
// NOTE(review): rangeStart and rangeEnd are documented but missing from the
// declaration below (embedded numbering skips from 182 to 185); presumably
// those parameter lines were lost. foundBreaks is a UStack reference, not the
// "C array ... or 0" the text describes.
182 virtual int32_t divideUpDictionaryRange( UText
*text
,
185 UStack
&foundBreaks
) const;
189 /*******************************************************************
194 * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
195 * dictionary and heuristics to determine Lao-specific breaks.</p>
197 * <p>After it is constructed a LaoBreakEngine may be shared between
198 * threads without synchronization.</p>
// LaoBreakEngine: DictionaryBreakEngine subclass for Lao text. It declares
// three UnicodeSet members, a DictionaryMatcher pointer, a two-argument
// constructor, a virtual destructor and divideUpDictionaryRange().
//
// NOTE(review): this text looks like it was recovered from a line-numbered
// listing. Leading numbers are fused into each line, and comment delimiters,
// access specifiers and the closing "};" are missing. Restore them from the
// original header before compiling.
200 class LaoBreakEngine
: public DictionaryBreakEngine
{
203 * The set of characters handled by this engine
// Only the first set is covered by the description above; the roles of
// fEndWordSet and fBeginWordSet are not documented in this excerpt.
207 UnicodeSet fLaoWordSet
;
208 UnicodeSet fEndWordSet
;
209 UnicodeSet fBeginWordSet
;
// Raw pointer to the dictionary. Per the constructor documentation below the
// engine adopts (takes ownership of) it; the destructor body that would
// release it is not visible here.
211 DictionaryMatcher
*fDictionary
;
216 * <p>Default constructor.</p>
218 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
// NOTE(review): labelled "Default constructor", but the constructor below
// requires two arguments, so it is not a default constructor.
// NOTE(review): the @param sentence is cut off (embedded numbering skips from
// 218 to 221); presumably it ends "...engine is deleted."
// status is passed by non-const reference; presumably it reports construction
// failures to the caller - confirm against the implementation.
221 LaoBreakEngine(DictionaryMatcher
*adoptDictionary
, UErrorCode
&status
);
224 * <p>Virtual destructor.</p>
226 virtual ~LaoBreakEngine();
230 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
232 * @param text A UText representing the text
233 * @param rangeStart The start of the range of dictionary characters
234 * @param rangeEnd The end of the range of dictionary characters
235 * @param foundBreaks Output of C array of int32_t break positions, or 0
236 * @return The number of breaks found
// Lao implementation of the hook that DictionaryBreakEngine declares pure
// virtual.
// NOTE(review): rangeStart and rangeEnd are documented but missing from the
// declaration below (embedded numbering skips from 238 to 241); presumably
// those parameter lines were lost. foundBreaks is a UStack reference, not the
// "C array ... or 0" the text describes.
238 virtual int32_t divideUpDictionaryRange( UText
*text
,
241 UStack
&foundBreaks
) const;
245 /*******************************************************************
250 * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
251 * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
253 * <p>After it is constructed a KhmerBreakEngine may be shared between
254 * threads without synchronization.</p>
// KhmerBreakEngine: DictionaryBreakEngine subclass for Khmer text. It declares
// three UnicodeSet members, a DictionaryMatcher pointer, a two-argument
// constructor, a virtual destructor and divideUpDictionaryRange().
//
// NOTE(review): this text looks like it was recovered from a line-numbered
// listing. Leading numbers are fused into each line, and comment delimiters,
// access specifiers and the closing "};" are missing. Restore them from the
// original header before compiling.
256 class KhmerBreakEngine
: public DictionaryBreakEngine
{
259 * The set of characters handled by this engine
// Only the first set is covered by the description above; the roles of
// fEndWordSet and fBeginWordSet are not documented in this excerpt.
263 UnicodeSet fKhmerWordSet
;
264 UnicodeSet fEndWordSet
;
265 UnicodeSet fBeginWordSet
;
// Raw pointer to the dictionary. Per the constructor documentation below the
// engine adopts (takes ownership of) it; the destructor body that would
// release it is not visible here.
267 DictionaryMatcher
*fDictionary
;
272 * <p>Default constructor.</p>
274 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
// NOTE(review): labelled "Default constructor", but the constructor below
// requires two arguments, so it is not a default constructor.
// NOTE(review): the @param sentence is cut off (embedded numbering skips from
// 274 to 277); presumably it ends "...engine is deleted."
// status is passed by non-const reference; presumably it reports construction
// failures to the caller - confirm against the implementation.
277 KhmerBreakEngine(DictionaryMatcher
*adoptDictionary
, UErrorCode
&status
);
280 * <p>Virtual destructor.</p>
282 virtual ~KhmerBreakEngine();
286 * <p>Divide up a range of known dictionary characters.</p>
288 * @param text A UText representing the text
289 * @param rangeStart The start of the range of dictionary characters
290 * @param rangeEnd The end of the range of dictionary characters
291 * @param foundBreaks Output of C array of int32_t break positions, or 0
292 * @return The number of breaks found
// Khmer implementation of the hook that DictionaryBreakEngine declares pure
// virtual.
// NOTE(review): rangeStart and rangeEnd are documented but missing from the
// declaration below (embedded numbering skips from 294 to 297); presumably
// those parameter lines were lost. foundBreaks is a UStack reference, not the
// "C array ... or 0" the text describes.
294 virtual int32_t divideUpDictionaryRange( UText
*text
,
297 UStack
&foundBreaks
) const;
301 #if !UCONFIG_NO_NORMALIZATION
303 /*******************************************************************
307 //indicates language/script that the CjkBreakEngine will handle
314 * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
315 * dictionary with costs associated with each word and
316 * Viterbi decoding to determine CJK-specific breaks.</p>
// CjkBreakEngine: DictionaryBreakEngine subclass for CJK text. It declares
// four UnicodeSet members (Hangul, Han, Katakana, Hiragana), a
// DictionaryMatcher pointer, a three-argument constructor, a virtual
// destructor and divideUpDictionaryRange().
//
// This class sits after an "#if !UCONFIG_NO_NORMALIZATION" directive, so it is
// only compiled when normalization support is enabled.
// NOTE(review): the matching #endif is not visible in this excerpt.
// NOTE(review): this text looks like it was recovered from a line-numbered
// listing. Leading numbers are fused into each line, and comment delimiters,
// access specifiers and the closing "};" are missing. Restore them from the
// original header before compiling.
318 class CjkBreakEngine
: public DictionaryBreakEngine
{
321 * The set of characters handled by this engine
// One UnicodeSet per script, as the member names indicate.
324 UnicodeSet fHangulWordSet
;
325 UnicodeSet fHanWordSet
;
326 UnicodeSet fKatakanaWordSet
;
327 UnicodeSet fHiraganaWordSet
;
// Raw pointer to the dictionary. Per the constructor documentation below the
// engine adopts it and it is deleted along with the engine; the destructor
// body that would do so is not visible here.
329 DictionaryMatcher
*fDictionary
;
334 * <p>Default constructor.</p>
336 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
337 * engine is deleted. The DictionaryMatcher must contain costs for each word
338 * in order for the dictionary to work properly.
// NOTE(review): labelled "Default constructor", but the constructor below
// requires three arguments, so it is not a default constructor.
// NOTE(review): the type and status parameters have no @param entries, and
// LanguageType is not declared in the visible lines; presumably it is the
// language/script selector mentioned in the comment above the class.
// status is passed by non-const reference; presumably it reports construction
// failures to the caller - confirm against the implementation.
340 CjkBreakEngine(DictionaryMatcher
*adoptDictionary
, LanguageType type
, UErrorCode
&status
);
343 * <p>Virtual destructor.</p>
345 virtual ~CjkBreakEngine();
349 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
351 * @param text A UText representing the text
352 * @param rangeStart The start of the range of dictionary characters
353 * @param rangeEnd The end of the range of dictionary characters
354 * @param foundBreaks Output of C array of int32_t break positions, or 0
355 * @return The number of breaks found
// CJK implementation of the hook that DictionaryBreakEngine declares pure
// virtual.
// NOTE(review): rangeStart and rangeEnd are documented but missing from the
// declaration below (embedded numbering skips from 357 to 360); presumably
// those parameter lines were lost. foundBreaks is a UStack reference, not the
// "C array ... or 0" the text describes.
357 virtual int32_t divideUpDictionaryRange( UText
*text
,
360 UStack
&foundBreaks
) const;