2 *******************************************************************************
3 * Copyright (C) 2006,2012, International Business Machines Corporation *
4 * and others. All Rights Reserved. *
5 *******************************************************************************
11 #include "unicode/utypes.h"
12 #include "unicode/uniset.h"
13 #include "unicode/utext.h"
// Forward declaration. In the declarations visible in this header the type is
// only used through pointers (DictionaryMatcher *), so no full definition is
// needed at this point.
19 class DictionaryMatcher
;
21 /*******************************************************************
22 * DictionaryBreakEngine
26 * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
27 * dictionary to determine language-specific breaks.</p>
29 * <p>After it is constructed a DictionaryBreakEngine may be shared between
30 * threads without synchronization.</p>
32 class DictionaryBreakEngine
: public LanguageBreakEngine
{
35 * The set of characters handled by this engine
42 * The set of break types handled by this engine
49 * <p>Default constructor.</p>
// Constructor taking no arguments; the overload below accepts the bitmap of
// break types handled.
52 DictionaryBreakEngine();
57 * <p>Constructor setting the break types handled.</p>
59 * @param breakTypes A bitmap of types handled by the engine.
// Constructor taking the bitmap of break types handled by the engine.
// NOTE(review): single-argument constructor that is not marked `explicit`, so a
// uint32_t is implicitly convertible to this type wherever a constructor call
// is otherwise viable — confirm that is intended.
61 DictionaryBreakEngine( uint32_t breakTypes
);
64 * <p>Virtual destructor.</p>
// Declared virtual so that destroying a derived engine through a
// DictionaryBreakEngine pointer runs the derived destructor.
66 virtual ~DictionaryBreakEngine();
69 * <p>Indicate whether this engine handles a particular character for
70 * a particular kind of break.</p>
72 * @param c A character which begins a run that the engine might handle
73 * @param breakType The type of text break which the caller wants to determine
74 * @return TRUE if this engine handles the particular character and break type.
// Query: does this engine handle character `c` for the given `breakType`?
// Const member function, so it can be called on a const engine.
77 virtual UBool
handles( UChar32 c
, int32_t breakType
) const;
80 * <p>Find any breaks within a run in the supplied text.</p>
82 * @param text A UText representing the text. The iterator is left at
83 * the end of the run of characters which the engine is capable of handling
84 * that starts from the first (or last) character in the range.
85 * @param startPos The start of the run within the supplied text.
86 * @param endPos The end of the run within the supplied text.
87 * @param reverse Whether the caller is looking for breaks in a reverse direction.
89 * @param breakType The type of break desired, or -1.
90 * @param foundBreaks A UStack that receives the breaks found, if any
91 * @return The number of breaks found.
// Finds breaks within a run of `text`; the int32_t return value is documented
// above as the number of breaks found, and `foundBreaks` is the output container.
// NOTE(review): the documentation above also lists startPos, endPos, reverse and
// breakType, but those parameters are not visible between `text` and
// `foundBreaks` in this view of the declaration — confirm against the full header.
93 virtual int32_t findBreaks( UText
*text
,
98 UStack
&foundBreaks
) const;
103 * <p>Set the character set handled by this engine.</p>
105 * @param set A UnicodeSet of the set of characters handled by the engine
107 virtual void setCharacters( const UnicodeSet
&set
);
110 * <p>Set the break types handled by this engine.</p>
112 * @param breakTypes A bitmap of types handled by the engine.
114 // virtual void setBreakTypes( uint32_t breakTypes );
117 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
119 * @param text A UText representing the text
120 * @param rangeStart The start of the range of dictionary characters
121 * @param rangeEnd The end of the range of dictionary characters
122 * @param foundBreaks Output UStack that receives the break positions found
123 * @return The number of breaks found
// Pure virtual (`= 0`): each concrete engine supplies its own way of dividing a
// range of dictionary characters into breaks.
// NOTE(review): the documentation above lists rangeStart and rangeEnd, but they
// are not visible between `text` and `foundBreaks` in this view — confirm against
// the full header.
125 virtual int32_t divideUpDictionaryRange( UText
*text
,
128 UStack
&foundBreaks
) const = 0;
132 /*******************************************************************
137 * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
138 * dictionary and heuristics to determine Thai-specific breaks.</p>
140 * <p>After it is constructed a ThaiBreakEngine may be shared between
141 * threads without synchronization.</p>
143 class ThaiBreakEngine
: public DictionaryBreakEngine
{
146 * The set of characters handled by this engine
// Character sets held by value by the Thai engine.
// NOTE(review): the roles suggested by the names (all Thai word characters,
// characters that may end/begin a word, suffix characters) are not shown in this
// header — confirm in the implementation.
150 UnicodeSet fThaiWordSet
;
151 UnicodeSet fEndWordSet
;
152 UnicodeSet fBeginWordSet
;
153 UnicodeSet fSuffixSet
;
// Pointer to the dictionary matcher; presumably the `adoptDictionary` argument of
// the constructor, which the documentation says is adopted — confirm.
155 DictionaryMatcher
*fDictionary
;
160 * <p>Constructor.</p>
162 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
// Constructs a Thai engine from a dictionary matcher, which the documentation
// above says is adopted.
// `status` is passed by non-const reference; presumably it reports construction
// errors to the caller — confirm in the implementation.
165 ThaiBreakEngine(DictionaryMatcher
*adoptDictionary
, UErrorCode
&status
);
168 * <p>Virtual destructor.</p>
// Virtual destructor. NOTE(review): the constructor documentation says the
// adopted dictionary is deleted with the engine; presumably that happens here —
// confirm in the implementation.
170 virtual ~ThaiBreakEngine();
174 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
176 * @param text A UText representing the text
177 * @param rangeStart The start of the range of dictionary characters
178 * @param rangeEnd The end of the range of dictionary characters
179 * @param foundBreaks Output UStack that receives the break positions found
180 * @return The number of breaks found
// Thai-specific declaration of the range-division routine that
// DictionaryBreakEngine declares as pure virtual; the visible parts of the two
// signatures match.
// NOTE(review): rangeStart and rangeEnd are documented above but not visible in
// this view of the declaration — confirm against the full header.
182 virtual int32_t divideUpDictionaryRange( UText
*text
,
185 UStack
&foundBreaks
) const;
189 #if !UCONFIG_NO_NORMALIZATION
191 /*******************************************************************
195 //indicates language/script that the CjkBreakEngine will handle
202 * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
203 * dictionary with costs associated with each word and
204 * Viterbi decoding to determine CJK-specific breaks.</p>
206 class CjkBreakEngine
: public DictionaryBreakEngine
{
209 * The set of characters handled by this engine
// Character sets held by value by the CJK engine, one per script named in the
// member identifiers (Hangul, Han, Katakana, Hiragana).
// NOTE(review): exact contents of each set are defined in the implementation,
// not in this header — confirm there.
212 UnicodeSet fHangulWordSet
;
213 UnicodeSet fHanWordSet
;
214 UnicodeSet fKatakanaWordSet
;
215 UnicodeSet fHiraganaWordSet
;
// Pointer to the dictionary matcher; presumably the `adoptDictionary` argument of
// the constructor, which the documentation says is adopted — confirm.
217 DictionaryMatcher
*fDictionary
;
222 * <p>Constructor.</p>
224 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
225 * engine is deleted. The DictionaryMatcher must contain costs for each word
226 * in order for the dictionary to work properly.
// Constructs a CJK engine from a dictionary matcher, which the documentation
// above says is adopted and must contain per-word costs.
// `type` is a LanguageType; its declaration is not visible in this view.
// NOTE(review): presumably it selects the language/script the engine handles —
// confirm against the LanguageType definition.
// `status` is passed by non-const reference; presumably it reports construction
// errors to the caller — confirm in the implementation.
228 CjkBreakEngine(DictionaryMatcher
*adoptDictionary
, LanguageType type
, UErrorCode
&status
);
231 * <p>Virtual destructor.</p>
// Virtual destructor. NOTE(review): the constructor documentation says the
// adopted dictionary is deleted with the engine; presumably that happens here —
// confirm in the implementation.
233 virtual ~CjkBreakEngine();
237 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
239 * @param text A UText representing the text
240 * @param rangeStart The start of the range of dictionary characters
241 * @param rangeEnd The end of the range of dictionary characters
242 * @param foundBreaks Output UStack that receives the break positions found
243 * @return The number of breaks found
// CJK-specific declaration of the range-division routine that
// DictionaryBreakEngine declares as pure virtual; the visible parts of the two
// signatures match.
// NOTE(review): rangeStart and rangeEnd are documented above but not visible in
// this view of the declaration — confirm against the full header.
245 virtual int32_t divideUpDictionaryRange( UText
*text
,
248 UStack
&foundBreaks
) const;
254 /*******************************************************************
259 * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
260 * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
262 * <p>After it is constructed a KhmerBreakEngine may be shared between
263 * threads without synchronization.</p>
265 class KhmerBreakEngine
: public DictionaryBreakEngine
{
268 * The set of characters handled by this engine
// Character sets held by value by the Khmer engine.
// NOTE(review): the roles suggested by the names (all Khmer word characters,
// characters that may end/begin a word) are not shown in this header — confirm
// in the implementation.
272 UnicodeSet fKhmerWordSet
;
273 UnicodeSet fEndWordSet
;
274 UnicodeSet fBeginWordSet
;
// Pointer to the dictionary matcher; presumably the `adoptDictionary` argument of
// the constructor, which the documentation says is adopted — confirm.
276 DictionaryMatcher
*fDictionary
;
281 * <p>Constructor.</p>
283 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
// Constructs a Khmer engine from a dictionary matcher, which the documentation
// above says is adopted.
// `status` is passed by non-const reference; presumably it reports construction
// errors to the caller — confirm in the implementation.
286 KhmerBreakEngine(DictionaryMatcher
*adoptDictionary
, UErrorCode
&status
);
289 * <p>Virtual destructor.</p>
// Virtual destructor. NOTE(review): the constructor documentation says the
// adopted dictionary is deleted with the engine; presumably that happens here —
// confirm in the implementation.
291 virtual ~KhmerBreakEngine();
295 * <p>Divide up a range of known dictionary characters.</p>
297 * @param text A UText representing the text
298 * @param rangeStart The start of the range of dictionary characters
299 * @param rangeEnd The end of the range of dictionary characters
300 * @param foundBreaks Output UStack that receives the break positions found
301 * @return The number of breaks found
// Khmer-specific declaration of the range-division routine that
// DictionaryBreakEngine declares as pure virtual; the visible parts of the two
// signatures match.
// NOTE(review): rangeStart and rangeEnd are documented above but not visible in
// this view of the declaration — confirm against the full header.
303 virtual int32_t divideUpDictionaryRange( UText
*text
,
306 UStack
&foundBreaks
) const;