1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
5 * Copyright (C) 2006-2014, International Business Machines Corporation *
6 * and others. All Rights Reserved. *
7 *******************************************************************************
13 #include "unicode/utypes.h"
14 #include "unicode/uniset.h"
15 #include "unicode/utext.h"
21 class DictionaryMatcher
;
24 /*******************************************************************
25 * DictionaryBreakEngine
29 * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
30 * dictionary to determine language-specific breaks.</p>
32 * <p>After it is constructed a DictionaryBreakEngine may be shared between
33 * threads without synchronization.</p>
35 class DictionaryBreakEngine
: public LanguageBreakEngine
{
38 * The set of characters handled by this engine
45 * The set of break types handled by this engine
52 * <p>Default constructor.</p>
55 DictionaryBreakEngine();
60 * <p>Constructor setting the break types handled.</p>
62 * @param breakTypes A bitmap of types handled by the engine.
64 DictionaryBreakEngine( uint32_t breakTypes
);
67 * <p>Virtual destructor.</p>
69 virtual ~DictionaryBreakEngine();
72 * <p>Indicate whether this engine handles a particular character for
73 * a particular kind of break.</p>
75 * @param c A character which begins a run that the engine might handle
76 * @param breakType The type of text break which the caller wants to determine
77 * @return TRUE if this engine handles the particular character and break type.
80 virtual UBool
handles( UChar32 c
, int32_t breakType
) const;
83 * <p>Find any breaks within a run in the supplied text.</p>
85 * @param text A UText representing the text. The iterator is left at
86 * the end of the run of characters which the engine is capable of handling
87 * that starts from the first (or last) character in the range.
88 * @param startPos The start of the run within the supplied text.
89 * @param endPos The end of the run within the supplied text.
90 * @param reverse Whether the caller is looking for breaks in a reverse direction.
92 * @param breakType The type of break desired, or -1.
93 * @param foundBreaks A UStack, passed by reference, that receives the breaks found, if any
94 * @return The number of breaks found.
96 virtual int32_t findBreaks( UText
*text
,
101 UStack
&foundBreaks
) const;
106 * <p>Set the character set handled by this engine.</p>
108 * @param set A UnicodeSet of the set of characters handled by the engine
110 virtual void setCharacters( const UnicodeSet
&set
);
113 * <p>Set the break types handled by this engine.</p>
115 * @param breakTypes A bitmap of types handled by the engine.
117 // virtual void setBreakTypes( uint32_t breakTypes );
120 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
122 * @param text A UText representing the text
123 * @param rangeStart The start of the range of dictionary characters
124 * @param rangeEnd The end of the range of dictionary characters
125 * @param foundBreaks Output UStack, passed by reference, that receives the break positions found
126 * @return The number of breaks found
128 virtual int32_t divideUpDictionaryRange( UText
*text
,
131 UStack
&foundBreaks
) const = 0;
135 /*******************************************************************
140 * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
141 * dictionary and heuristics to determine Thai-specific breaks.</p>
143 * <p>After it is constructed a ThaiBreakEngine may be shared between
144 * threads without synchronization.</p>
146 class ThaiBreakEngine
: public DictionaryBreakEngine
{
149 * The set of characters handled by this engine
153 UnicodeSet fThaiWordSet
;
154 UnicodeSet fEndWordSet
;
155 UnicodeSet fBeginWordSet
;
156 UnicodeSet fSuffixSet
;
158 DictionaryMatcher
*fDictionary
;
163 * <p>Constructor; adopts the supplied dictionary.</p>
165 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
168 ThaiBreakEngine(DictionaryMatcher
*adoptDictionary
, UErrorCode
&status
);
171 * <p>Virtual destructor.</p>
173 virtual ~ThaiBreakEngine();
177 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
179 * @param text A UText representing the text
180 * @param rangeStart The start of the range of dictionary characters
181 * @param rangeEnd The end of the range of dictionary characters
182 * @param foundBreaks Output UStack, passed by reference, that receives the break positions found
183 * @return The number of breaks found
185 virtual int32_t divideUpDictionaryRange( UText
*text
,
188 UStack
&foundBreaks
) const;
192 /*******************************************************************
197 * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
198 * dictionary and heuristics to determine Lao-specific breaks.</p>
200 * <p>After it is constructed a LaoBreakEngine may be shared between
201 * threads without synchronization.</p>
203 class LaoBreakEngine
: public DictionaryBreakEngine
{
206 * The set of characters handled by this engine
210 UnicodeSet fLaoWordSet
;
211 UnicodeSet fEndWordSet
;
212 UnicodeSet fBeginWordSet
;
214 DictionaryMatcher
*fDictionary
;
219 * <p>Constructor; adopts the supplied dictionary.</p>
221 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
224 LaoBreakEngine(DictionaryMatcher
*adoptDictionary
, UErrorCode
&status
);
227 * <p>Virtual destructor.</p>
229 virtual ~LaoBreakEngine();
233 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
235 * @param text A UText representing the text
236 * @param rangeStart The start of the range of dictionary characters
237 * @param rangeEnd The end of the range of dictionary characters
238 * @param foundBreaks Output UStack, passed by reference, that receives the break positions found
239 * @return The number of breaks found
241 virtual int32_t divideUpDictionaryRange( UText
*text
,
244 UStack
&foundBreaks
) const;
248 /*******************************************************************
253 * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
254 * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
256 * <p>After it is constructed a BurmeseBreakEngine may be shared between
257 * threads without synchronization.</p>
259 class BurmeseBreakEngine
: public DictionaryBreakEngine
{
262 * The set of characters handled by this engine
266 UnicodeSet fBurmeseWordSet
;
267 UnicodeSet fEndWordSet
;
268 UnicodeSet fBeginWordSet
;
270 DictionaryMatcher
*fDictionary
;
275 * <p>Constructor; adopts the supplied dictionary.</p>
277 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
280 BurmeseBreakEngine(DictionaryMatcher
*adoptDictionary
, UErrorCode
&status
);
283 * <p>Virtual destructor.</p>
285 virtual ~BurmeseBreakEngine();
289 * <p>Divide up a range of known dictionary characters.</p>
291 * @param text A UText representing the text
292 * @param rangeStart The start of the range of dictionary characters
293 * @param rangeEnd The end of the range of dictionary characters
294 * @param foundBreaks Output UStack, passed by reference, that receives the break positions found
295 * @return The number of breaks found
297 virtual int32_t divideUpDictionaryRange( UText
*text
,
300 UStack
&foundBreaks
) const;
304 /*******************************************************************
309 * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
310 * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
312 * <p>After it is constructed a KhmerBreakEngine may be shared between
313 * threads without synchronization.</p>
315 class KhmerBreakEngine
: public DictionaryBreakEngine
{
318 * The set of characters handled by this engine
322 UnicodeSet fKhmerWordSet
;
323 UnicodeSet fEndWordSet
;
324 UnicodeSet fBeginWordSet
;
326 DictionaryMatcher
*fDictionary
;
331 * <p>Constructor; adopts the supplied dictionary.</p>
333 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
336 KhmerBreakEngine(DictionaryMatcher
*adoptDictionary
, UErrorCode
&status
);
339 * <p>Virtual destructor.</p>
341 virtual ~KhmerBreakEngine();
345 * <p>Divide up a range of known dictionary characters.</p>
347 * @param text A UText representing the text
348 * @param rangeStart The start of the range of dictionary characters
349 * @param rangeEnd The end of the range of dictionary characters
350 * @param foundBreaks Output UStack, passed by reference, that receives the break positions found
351 * @return The number of breaks found
353 virtual int32_t divideUpDictionaryRange( UText
*text
,
356 UStack
&foundBreaks
) const;
360 #if !UCONFIG_NO_NORMALIZATION
362 /*******************************************************************
366 // Indicates the language/script that the CjkBreakEngine will handle.
373 * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
374 * dictionary with costs associated with each word and
375 * Viterbi decoding to determine CJK-specific breaks.</p>
377 class CjkBreakEngine
: public DictionaryBreakEngine
{
380 * The set of characters handled by this engine
383 UnicodeSet fHangulWordSet
;
384 UnicodeSet fHanWordSet
;
385 UnicodeSet fKatakanaWordSet
;
386 UnicodeSet fHiraganaWordSet
;
388 DictionaryMatcher
*fDictionary
;
389 const Normalizer2
*nfkcNorm2
;
394 * <p>Constructor; adopts the supplied dictionary.</p>
396 * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
397 * engine is deleted. The DictionaryMatcher must contain costs for each word
398 * in order for the dictionary to work properly.
400 CjkBreakEngine(DictionaryMatcher
*adoptDictionary
, LanguageType type
, UErrorCode
&status
);
403 * <p>Virtual destructor.</p>
405 virtual ~CjkBreakEngine();
409 * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
411 * @param text A UText representing the text
412 * @param rangeStart The start of the range of dictionary characters
413 * @param rangeEnd The end of the range of dictionary characters
414 * @param foundBreaks Output UStack, passed by reference, that receives the break positions found
415 * @return The number of breaks found
417 virtual int32_t divideUpDictionaryRange( UText
*text
,
420 UStack
&foundBreaks
) const;