// Source: git.saurik.com Git - apple/icu.git/blob - icuSources/common/brkeng.h
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/**
 ************************************************************************************
 * Copyright (C) 2006-2012, International Business Machines Corporation and others. *
 * All Rights Reserved.                                                             *
 ************************************************************************************
 */
13 #include "unicode/utypes.h"
14 #include "unicode/uobject.h"
15 #include "unicode/utext.h"
16 #include "unicode/uscript.h"
// Forward declaration; this header only refers to DictionaryMatcher by pointer.
class DictionaryMatcher;
24 /*******************************************************************
29 * <p>LanguageBreakEngines implement language-specific knowledge for
30 * finding text boundaries within a run of characters belonging to a
31 * specific set. The boundaries will be of a specific kind, e.g. word,
34 * <p>LanguageBreakEngines should normally be implemented so as to
35 * be shared between threads without locking.</p>
37 class LanguageBreakEngine
: public UMemory
{
41 * <p>Default constructor.</p>
44 LanguageBreakEngine();
47 * <p>Virtual destructor.</p>
49 virtual ~LanguageBreakEngine();
52 * <p>Indicate whether this engine handles a particular character for
53 * a particular kind of break.</p>
55 * @param c A character which begins a run that the engine might handle
56 * @param breakType The type of text break which the caller wants to determine
57 * @return TRUE if this engine handles the particular character and break
60 virtual UBool
handles(UChar32 c
, int32_t breakType
) const = 0;
63 * <p>Find any breaks within a run in the supplied text.</p>
65 * @param text A UText representing the text. The
66 * iterator is left at the end of the run of characters which the engine
67 * is capable of handling.
68 * @param startPos The start of the run within the supplied text.
69 * @param endPos The end of the run within the supplied text.
70 * @param reverse Whether the caller is looking for breaks in a reverse
72 * @param breakType The type of break desired, or -1.
73 * @param foundBreaks An allocated C array of the breaks found, if any
74 * @return The number of breaks found.
76 virtual int32_t findBreaks( UText
*text
,
81 UStack
&foundBreaks
) const = 0;
85 /*******************************************************************
86 * LanguageBreakFactory
90 * <p>LanguageBreakFactorys find and return a LanguageBreakEngine
91 * that can determine breaks for characters in a specific set, if
92 * such an object can be found.</p>
94 * <p>If a LanguageBreakFactory is to be shared between threads,
95 * appropriate synchronization must be used; there is none internal
98 * <p>A LanguageBreakEngine returned by a LanguageBreakFactory can
99 * normally be shared between threads without synchronization, unless
100 * the specific subclass of LanguageBreakFactory indicates otherwise.</p>
102 * <p>A LanguageBreakFactory is responsible for deleting any LanguageBreakEngine
103 * it returns when it itself is deleted, unless the specific subclass of
104 * LanguageBreakFactory indicates otherwise. Naturally, the factory should
105 * not be deleted until the LanguageBreakEngines it has returned are no
108 class LanguageBreakFactory
: public UMemory
{
112 * <p>Default constructor.</p>
115 LanguageBreakFactory();
118 * <p>Virtual destructor.</p>
120 virtual ~LanguageBreakFactory();
123 * <p>Find and return a LanguageBreakEngine that can find the desired
124 * kind of break for the set of characters to which the supplied
125 * character belongs. It is up to the set of available engines to
126 * determine what the sets of characters are.</p>
128 * @param c A character that begins a run for which a LanguageBreakEngine is
130 * @param breakType The kind of text break for which a LanguageBreakEngine is
132 * @return A LanguageBreakEngine with the desired characteristics, or 0.
134 virtual const LanguageBreakEngine
*getEngineFor(UChar32 c
, int32_t breakType
) = 0;
138 /*******************************************************************
143 * <p>UnhandledEngine is a special subclass of LanguageBreakEngine that
144 * handles characters that no other LanguageBreakEngine is available to
145 * handle. It is told the character and the type of break; at its
146 * discretion it may handle more than the specified character (e.g.,
147 * the entire script to which that character belongs.</p>
149 * <p>UnhandledEngines may not be shared between threads without
150 * external synchronization.</p>
153 class UnhandledEngine
: public LanguageBreakEngine
{
157 * The sets of characters handled, for each break type
161 UnicodeSet
*fHandled
[4];
166 * <p>Default constructor.</p>
169 UnhandledEngine(UErrorCode
&status
);
172 * <p>Virtual destructor.</p>
174 virtual ~UnhandledEngine();
177 * <p>Indicate whether this engine handles a particular character for
178 * a particular kind of break.</p>
180 * @param c A character which begins a run that the engine might handle
181 * @param breakType The type of text break which the caller wants to determine
182 * @return TRUE if this engine handles the particular character and break
185 virtual UBool
handles(UChar32 c
, int32_t breakType
) const;
188 * <p>Find any breaks within a run in the supplied text.</p>
190 * @param text A UText representing the text (TODO: UText). The
191 * iterator is left at the end of the run of characters which the engine
192 * is capable of handling.
193 * @param startPos The start of the run within the supplied text.
194 * @param endPos The end of the run within the supplied text.
195 * @param reverse Whether the caller is looking for breaks in a reverse
197 * @param breakType The type of break desired, or -1.
198 * @param foundBreaks An allocated C array of the breaks found, if any
199 * @return The number of breaks found.
201 virtual int32_t findBreaks( UText
*text
,
206 UStack
&foundBreaks
) const;
209 * <p>Tell the engine to handle a particular character and break type.</p>
211 * @param c A character which the engine should handle
212 * @param breakType The type of text break for which the engine should handle c
214 virtual void handleCharacter(UChar32 c
, int32_t breakType
);
218 /*******************************************************************
219 * ICULanguageBreakFactory
223 * <p>ICULanguageBreakFactory is the default LanguageBreakFactory for
224 * ICU. It creates dictionary-based LanguageBreakEngines from dictionary
225 * data in the ICU data file.</p>
227 class ICULanguageBreakFactory
: public LanguageBreakFactory
{
231 * The stack of break engines created by this factory
240 * <p>Standard constructor.</p>
243 ICULanguageBreakFactory(UErrorCode
&status
);
246 * <p>Virtual destructor.</p>
248 virtual ~ICULanguageBreakFactory();
251 * <p>Find and return a LanguageBreakEngine that can find the desired
252 * kind of break for the set of characters to which the supplied
253 * character belongs. It is up to the set of available engines to
254 * determine what the sets of characters are.</p>
256 * @param c A character that begins a run for which a LanguageBreakEngine is
258 * @param breakType The kind of text break for which a LanguageBreakEngine is
260 * @return A LanguageBreakEngine with the desired characteristics, or 0.
262 virtual const LanguageBreakEngine
*getEngineFor(UChar32 c
, int32_t breakType
);
266 * <p>Create a LanguageBreakEngine for the set of characters to which
267 * the supplied character belongs, for the specified break type.</p>
269 * @param c A character that begins a run for which a LanguageBreakEngine is
271 * @param breakType The kind of text break for which a LanguageBreakEngine is
273 * @return A LanguageBreakEngine with the desired characteristics, or 0.
275 virtual const LanguageBreakEngine
*loadEngineFor(UChar32 c
, int32_t breakType
);
278 * <p>Create a DictionaryMatcher for the specified script and break type.</p>
279 * @param script An ISO 15924 script code that identifies the dictionary to be
281 * @param breakType The kind of text break for which a dictionary is
283 * @return A DictionaryMatcher with the desired characteristics, or NULL.
285 virtual DictionaryMatcher
*loadDictionaryMatcherFor(UScriptCode script
, int32_t breakType
);