1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
/*
 *******************************************************************************
 * Copyright (C) 2006-2014, International Business Machines Corporation *
 * and others. All Rights Reserved. *
 *******************************************************************************
 */
13 #include "unicode/utypes.h"
14 #include "unicode/uniset.h"
15 #include "unicode/utext.h"
// Forward declaration: this header only refers to DictionaryMatcher by pointer.
class DictionaryMatcher;
/*******************************************************************
 * DictionaryBreakEngine
 */

/**
 * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
 * dictionary to determine language-specific breaks.</p>
 *
 * <p>After it is constructed a DictionaryBreakEngine may be shared between
 * threads without synchronization.</p>
 */
36 class DictionaryBreakEngine
: public LanguageBreakEngine
{
// The set of characters handled by this engine.
// NOTE(review): the declaration this comment documents is not visible in this excerpt.
50 DictionaryBreakEngine();
53 * <p>Virtual destructor.</p>
55 virtual ~DictionaryBreakEngine();
58 * <p>Indicate whether this engine handles a particular character for
59 * a particular kind of break.</p>
61 * @param c A character which begins a run that the engine might handle
62 * @return TRUE if this engine handles the particular character and break
65 virtual UBool
handles(UChar32 c
) const;
68 * <p>Find any breaks within a run in the supplied text.</p>
70 * @param text A UText representing the text. The iterator is left at
71 * the end of the run of characters which the engine is capable of handling
72 * that starts from the first character in the range.
73 * @param startPos The start of the run within the supplied text.
74 * @param endPos The end of the run within the supplied text.
75 * @param foundBreaks vector of int32_t to receive the break positions
76 * @return The number of breaks found.
78 virtual int32_t findBreaks( UText
*text
,
81 UVector32
&foundBreaks
) const;
86 * <p>Set the character set handled by this engine.</p>
88 * @param set A UnicodeSet of the set of characters handled by the engine
90 virtual void setCharacters( const UnicodeSet
&set
);
93 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
95 * @param text A UText representing the text
96 * @param rangeStart The start of the range of dictionary characters
97 * @param rangeEnd The end of the range of dictionary characters
98 * @param foundBreaks Output of C array of int32_t break positions, or 0
99 * @return The number of breaks found
101 virtual int32_t divideUpDictionaryRange( UText
*text
,
104 UVector32
&foundBreaks
) const = 0;
/*******************************************************************
 * ThaiBreakEngine
 */

/**
 * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
 * dictionary and heuristics to determine Thai-specific breaks.</p>
 *
 * <p>After it is constructed a ThaiBreakEngine may be shared between
 * threads without synchronization.</p>
 */
119 class ThaiBreakEngine
: public DictionaryBreakEngine
{
122 * The set of characters handled by this engine
126 UnicodeSet fThaiWordSet
;
127 UnicodeSet fEndWordSet
;
128 UnicodeSet fBeginWordSet
;
129 UnicodeSet fSuffixSet
;
131 DictionaryMatcher
*fDictionary
;
136 * <p>Default constructor.</p>
138 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
141 ThaiBreakEngine(DictionaryMatcher
*adoptDictionary
, UErrorCode
&status
);
144 * <p>Virtual destructor.</p>
146 virtual ~ThaiBreakEngine();
150 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
152 * @param text A UText representing the text
153 * @param rangeStart The start of the range of dictionary characters
154 * @param rangeEnd The end of the range of dictionary characters
155 * @param foundBreaks Output of C array of int32_t break positions, or 0
156 * @return The number of breaks found
158 virtual int32_t divideUpDictionaryRange( UText
*text
,
161 UVector32
&foundBreaks
) const;
/*******************************************************************
 * LaoBreakEngine
 */

/**
 * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
 * dictionary and heuristics to determine Lao-specific breaks.</p>
 *
 * <p>After it is constructed a LaoBreakEngine may be shared between
 * threads without synchronization.</p>
 */
176 class LaoBreakEngine
: public DictionaryBreakEngine
{
179 * The set of characters handled by this engine
183 UnicodeSet fLaoWordSet
;
184 UnicodeSet fEndWordSet
;
185 UnicodeSet fBeginWordSet
;
187 DictionaryMatcher
*fDictionary
;
192 * <p>Default constructor.</p>
194 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
197 LaoBreakEngine(DictionaryMatcher
*adoptDictionary
, UErrorCode
&status
);
200 * <p>Virtual destructor.</p>
202 virtual ~LaoBreakEngine();
206 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
208 * @param text A UText representing the text
209 * @param rangeStart The start of the range of dictionary characters
210 * @param rangeEnd The end of the range of dictionary characters
211 * @param foundBreaks Output of C array of int32_t break positions, or 0
212 * @return The number of breaks found
214 virtual int32_t divideUpDictionaryRange( UText
*text
,
217 UVector32
&foundBreaks
) const;
/*******************************************************************
 * BurmeseBreakEngine
 */

/**
 * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
 * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
 *
 * <p>After it is constructed a BurmeseBreakEngine may be shared between
 * threads without synchronization.</p>
 */
232 class BurmeseBreakEngine
: public DictionaryBreakEngine
{
235 * The set of characters handled by this engine
239 UnicodeSet fBurmeseWordSet
;
240 UnicodeSet fEndWordSet
;
241 UnicodeSet fBeginWordSet
;
243 DictionaryMatcher
*fDictionary
;
248 * <p>Default constructor.</p>
250 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
253 BurmeseBreakEngine(DictionaryMatcher
*adoptDictionary
, UErrorCode
&status
);
256 * <p>Virtual destructor.</p>
258 virtual ~BurmeseBreakEngine();
262 * <p>Divide up a range of known dictionary characters.</p>
264 * @param text A UText representing the text
265 * @param rangeStart The start of the range of dictionary characters
266 * @param rangeEnd The end of the range of dictionary characters
267 * @param foundBreaks Output of C array of int32_t break positions, or 0
268 * @return The number of breaks found
270 virtual int32_t divideUpDictionaryRange( UText
*text
,
273 UVector32
&foundBreaks
) const;
/*******************************************************************
 * KhmerBreakEngine
 */

/**
 * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
 * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
 *
 * <p>After it is constructed a KhmerBreakEngine may be shared between
 * threads without synchronization.</p>
 */
288 class KhmerBreakEngine
: public DictionaryBreakEngine
{
291 * The set of characters handled by this engine
295 UnicodeSet fKhmerWordSet
;
296 UnicodeSet fEndWordSet
;
297 UnicodeSet fBeginWordSet
;
299 DictionaryMatcher
*fDictionary
;
304 * <p>Default constructor.</p>
306 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
309 KhmerBreakEngine(DictionaryMatcher
*adoptDictionary
, UErrorCode
&status
);
312 * <p>Virtual destructor.</p>
314 virtual ~KhmerBreakEngine();
318 * <p>Divide up a range of known dictionary characters.</p>
320 * @param text A UText representing the text
321 * @param rangeStart The start of the range of dictionary characters
322 * @param rangeEnd The end of the range of dictionary characters
323 * @param foundBreaks Output of C array of int32_t break positions, or 0
324 * @return The number of breaks found
326 virtual int32_t divideUpDictionaryRange( UText
*text
,
329 UVector32
&foundBreaks
) const;
333 #if !UCONFIG_NO_NORMALIZATION
/*******************************************************************
 * CjkBreakEngine
 */

// Indicates the language/script that the CjkBreakEngine will handle.

/**
 * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
 * dictionary with costs associated with each word and
 * Viterbi decoding to determine CJK-specific breaks.</p>
 */
350 class CjkBreakEngine
: public DictionaryBreakEngine
{
353 * The set of characters handled by this engine
356 UnicodeSet fHangulWordSet
;
357 UnicodeSet fHanWordSet
;
358 UnicodeSet fKatakanaWordSet
;
359 UnicodeSet fHiraganaWordSet
;
361 DictionaryMatcher
*fDictionary
;
362 const Normalizer2
*nfkcNorm2
;
367 * <p>Default constructor.</p>
369 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
370 * engine is deleted. The DictionaryMatcher must contain costs for each word
371 * in order for the dictionary to work properly.
373 CjkBreakEngine(DictionaryMatcher
*adoptDictionary
, LanguageType type
, UErrorCode
&status
);
376 * <p>Virtual destructor.</p>
378 virtual ~CjkBreakEngine();
382 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
384 * @param text A UText representing the text
385 * @param rangeStart The start of the range of dictionary characters
386 * @param rangeEnd The end of the range of dictionary characters
387 * @param foundBreaks Output of C array of int32_t break positions, or 0
388 * @return The number of breaks found
390 virtual int32_t divideUpDictionaryRange( UText
*text
,
393 UVector32
&foundBreaks
) const;