]>
git.saurik.com Git - apple/icu.git/blob - icuSources/common/brkeng.h
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 ************************************************************************************
5 * Copyright (C) 2006-2012, International Business Machines Corporation and others. *
6 * All Rights Reserved. *
7 ************************************************************************************
13 #include "unicode/utypes.h"
14 #include "unicode/uobject.h"
15 #include "unicode/utext.h"
16 #include "unicode/uscript.h"
23 class DictionaryMatcher
;
25 /*******************************************************************
30 * <p>LanguageBreakEngines implement language-specific knowledge for
31 * finding text boundaries within a run of characters belonging to a
32 * specific set. The boundaries will be of a specific kind, e.g. word,
35 * <p>LanguageBreakEngines should normally be implemented so as to
36 * be shared between threads without locking.</p>
38 class LanguageBreakEngine
: public UMemory
{
42 * <p>Default constructor.</p>
45 LanguageBreakEngine();
48 * <p>Virtual destructor.</p>
50 virtual ~LanguageBreakEngine();
53 * <p>Indicate whether this engine handles a particular character for
54 * a particular kind of break.</p>
56 * @param c A character which begins a run that the engine might handle
57 * @return TRUE if this engine handles the particular character and break
60 virtual UBool
handles(UChar32 c
) const = 0;
63 * <p>Find any breaks within a run in the supplied text.</p>
65 * @param text A UText representing the text. The
66 * iterator is left at the end of the run of characters which the engine
67 * is capable of handling.
68 * @param startPos The start of the run within the supplied text.
69 * @param endPos The end of the run within the supplied text.
70 * @param foundBreaks A Vector of int32_t to receive the breaks.
71 * @return The number of breaks found.
73 virtual int32_t findBreaks( UText
*text
,
76 UVector32
&foundBreaks
) const = 0;
80 /*******************************************************************
81 * LanguageBreakFactory
85 * <p>LanguageBreakFactorys find and return a LanguageBreakEngine
86 * that can determine breaks for characters in a specific set, if
87 * such an object can be found.</p>
89 * <p>If a LanguageBreakFactory is to be shared between threads,
90 * appropriate synchronization must be used; there is none internal
93 * <p>A LanguageBreakEngine returned by a LanguageBreakFactory can
94 * normally be shared between threads without synchronization, unless
95 * the specific subclass of LanguageBreakFactory indicates otherwise.</p>
97 * <p>A LanguageBreakFactory is responsible for deleting any LanguageBreakEngine
98 * it returns when it itself is deleted, unless the specific subclass of
99 * LanguageBreakFactory indicates otherwise. Naturally, the factory should
100 * not be deleted until the LanguageBreakEngines it has returned are no
103 class LanguageBreakFactory
: public UMemory
{
107 * <p>Default constructor.</p>
110 LanguageBreakFactory();
113 * <p>Virtual destructor.</p>
115 virtual ~LanguageBreakFactory();
118 * <p>Find and return a LanguageBreakEngine that can find the desired
119 * kind of break for the set of characters to which the supplied
120 * character belongs. It is up to the set of available engines to
121 * determine what the sets of characters are.</p>
123 * @param c A character that begins a run for which a LanguageBreakEngine is
125 * @return A LanguageBreakEngine with the desired characteristics, or 0.
127 virtual const LanguageBreakEngine
*getEngineFor(UChar32 c
) = 0;
131 /*******************************************************************
136 * <p>UnhandledEngine is a special subclass of LanguageBreakEngine that
137 * handles characters that no other LanguageBreakEngine is available to
138 * handle. It is told the character to handle; at its
139 * discretion it may handle more than the specified character (e.g.,
140 * the entire script to which that character belongs).</p>
142 * <p>UnhandledEngines may not be shared between threads without
143 * external synchronization.</p>
146 class UnhandledEngine
: public LanguageBreakEngine
{
150 * The sets of characters handled.
154 UnicodeSet
*fHandled
;
159 * <p>Default constructor.</p>
162 UnhandledEngine(UErrorCode
&status
);
165 * <p>Virtual destructor.</p>
167 virtual ~UnhandledEngine();
170 * <p>Indicate whether this engine handles a particular character for
171 * a particular kind of break.</p>
173 * @param c A character which begins a run that the engine might handle
174 * @return TRUE if this engine handles the particular character and break
177 virtual UBool
handles(UChar32 c
) const;
180 * <p>Find any breaks within a run in the supplied text.</p>
182 * @param text A UText representing the text. The
183 * iterator is left at the end of the run of characters which the engine
184 * is capable of handling.
185 * @param startPos The start of the run within the supplied text.
186 * @param endPos The end of the run within the supplied text.
187 * @param foundBreaks A UVector32 to receive the breaks found, if any
188 * @return The number of breaks found.
190 virtual int32_t findBreaks( UText
*text
,
193 UVector32
&foundBreaks
) const;
196 * <p>Tell the engine to handle a particular character.</p>
198 * @param c A character which the engine should handle
200 virtual void handleCharacter(UChar32 c
);
204 /*******************************************************************
205 * ICULanguageBreakFactory
209 * <p>ICULanguageBreakFactory is the default LanguageBreakFactory for
210 * ICU. It creates dictionary-based LanguageBreakEngines from dictionary
211 * data in the ICU data file.</p>
213 class ICULanguageBreakFactory
: public LanguageBreakFactory
{
217 * The stack of break engines created by this factory
226 * <p>Standard constructor.</p>
229 ICULanguageBreakFactory(UErrorCode
&status
);
232 * <p>Virtual destructor.</p>
234 virtual ~ICULanguageBreakFactory();
237 * <p>Find and return a LanguageBreakEngine that can find the desired
238 * kind of break for the set of characters to which the supplied
239 * character belongs. It is up to the set of available engines to
240 * determine what the sets of characters are.</p>
242 * @param c A character that begins a run for which a LanguageBreakEngine is
244 * @return A LanguageBreakEngine with the desired characteristics, or 0.
246 virtual const LanguageBreakEngine
*getEngineFor(UChar32 c
);
250 * <p>Create a LanguageBreakEngine for the set of characters to which
251 * the supplied character belongs.</p>
253 * @param c A character that begins a run for which a LanguageBreakEngine is
255 * @return A LanguageBreakEngine with the desired characteristics, or 0.
257 virtual const LanguageBreakEngine
*loadEngineFor(UChar32 c
);
260 * <p>Create a DictionaryMatcher for the specified script.</p>
261 * @param script An ISO 15924 script code that identifies the dictionary to be
263 * @return A DictionaryMatcher with the desired characteristics, or NULL.
265 virtual DictionaryMatcher
*loadDictionaryMatcherFor(UScriptCode script
);