]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/brkeng.h
ICU-59173.0.1.tar.gz
[apple/icu.git] / icuSources / common / brkeng.h
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /**
4 ************************************************************************************
5 * Copyright (C) 2006-2012, International Business Machines Corporation and others. *
6 * All Rights Reserved. *
7 ************************************************************************************
8 */
9
10 #ifndef BRKENG_H
11 #define BRKENG_H
12
13 #include "unicode/utypes.h"
14 #include "unicode/uobject.h"
15 #include "unicode/utext.h"
16 #include "unicode/uscript.h"
17
18 U_NAMESPACE_BEGIN
19
20 class UnicodeSet;
21 class UStack;
22 class DictionaryMatcher;
23
24 /*******************************************************************
25 * LanguageBreakEngine
26 */
27
28 /**
29 * <p>LanguageBreakEngines implement language-specific knowledge for
30 * finding text boundaries within a run of characters belonging to a
31 * specific set. The boundaries will be of a specific kind, e.g. word,
32 * line, etc.</p>
33 *
34 * <p>LanguageBreakEngines should normally be implemented so as to
35 * be shared between threads without locking.</p>
36 */
37 class LanguageBreakEngine : public UMemory {
38 public:
39
40 /**
41 * <p>Default constructor; takes no arguments.</p>
42 * <p>Only declared in this header; the definition lives elsewhere.</p>
43 */
44 LanguageBreakEngine();
45
46 /**
47 * <p>Virtual destructor, so an engine can be destroyed through a pointer to this base class.</p>
48 */
49 virtual ~LanguageBreakEngine();
50
51 /**
52 * <p>Indicate whether this engine handles a particular character for
53 * a particular kind of break.</p>
54 * <p>Pure virtual: every concrete engine must implement it.</p>
55 * @param c A character which begins a run that the engine might handle
56 * @param breakType The type of text break which the caller wants to determine
57 * @return TRUE if this engine handles the particular character and break
58 * type.
59 */
60 virtual UBool handles(UChar32 c, int32_t breakType) const = 0;
61
62 /**
63 * <p>Find any breaks within a run in the supplied text.</p>
64 * <p>Pure virtual: every concrete engine must implement it.</p>
65 * @param text A UText representing the text. The
66 * iterator is left at the end of the run of characters which the engine
67 * is capable of handling.
68 * @param startPos The start of the run within the supplied text.
69 * @param endPos The end of the run within the supplied text.
70 * @param reverse Whether the caller is looking for breaks in a reverse
71 * direction.
72 * @param breakType The type of break desired, or -1.
73 * @param foundBreaks A UStack (passed by reference, not a C array) of the breaks found, if any
74 * @return The number of breaks found.
75 */
76 virtual int32_t findBreaks( UText *text,
77 int32_t startPos,
78 int32_t endPos,
79 UBool reverse,
80 int32_t breakType,
81 UStack &foundBreaks ) const = 0;
82
83 };
84
85 /*******************************************************************
86 * LanguageBreakFactory
87 */
88
89 /**
90 * <p>A LanguageBreakFactory finds and returns a LanguageBreakEngine
91 * that can determine breaks for characters in a specific set, if
92 * such an object can be found.</p>
93 *
94 * <p>If a LanguageBreakFactory is to be shared between threads,
95 * appropriate synchronization must be used; there is none internal
96 * to the factory.</p>
97 *
98 * <p>A LanguageBreakEngine returned by a LanguageBreakFactory can
99 * normally be shared between threads without synchronization, unless
100 * the specific subclass of LanguageBreakFactory indicates otherwise.</p>
101 *
102 * <p>A LanguageBreakFactory is responsible for deleting any LanguageBreakEngine
103 * it returns when it itself is deleted, unless the specific subclass of
104 * LanguageBreakFactory indicates otherwise. Naturally, the factory should
105 * not be deleted until the LanguageBreakEngines it has returned are no
106 * longer needed.</p>
107 */
108 class LanguageBreakFactory : public UMemory {
109 public:
110
111 /**
112 * <p>Default constructor; takes no arguments.</p>
113 * <p>Only declared in this header; the definition lives elsewhere.</p>
114 */
115 LanguageBreakFactory();
116
117 /**
118 * <p>Virtual destructor, so a factory can be destroyed through a pointer to this base class.</p>
119 */
120 virtual ~LanguageBreakFactory();
121
122 /**
123 * <p>Find and return a LanguageBreakEngine that can find the desired
124 * kind of break for the set of characters to which the supplied
125 * character belongs. It is up to the set of available engines to
126 * determine what the sets of characters are.</p>
127 * <p>Pure virtual and non-const: every concrete factory must implement it.</p>
128 * @param c A character that begins a run for which a LanguageBreakEngine is
129 * sought.
130 * @param breakType The kind of text break for which a LanguageBreakEngine is
131 * sought.
132 * @return A pointer to a const LanguageBreakEngine with the desired characteristics, or 0 (a null pointer).
133 */
134 virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType) = 0;
135
136 };
137
138 /*******************************************************************
139 * UnhandledEngine
140 */
141
142 /**
143 * <p>UnhandledEngine is a special subclass of LanguageBreakEngine that
144 * handles characters that no other LanguageBreakEngine is available to
145 * handle. It is told the character and the type of break; at its
146 * discretion it may handle more than the specified character (e.g.,
147 * the entire script to which that character belongs).</p>
148 *
149 * <p>UnhandledEngines may not be shared between threads without
150 * external synchronization.</p>
151 */
152
153 class UnhandledEngine : public LanguageBreakEngine {
154 private:
155
156 /**
157 * The sets of characters handled, for each break type: a fixed array of 4 UnicodeSet pointers.
158 * NOTE(review): presumably indexed by breakType; confirm in the implementation. @internal
159 */
160
161 UnicodeSet *fHandled[4];
162
163 public:
164
165 /**
166 * <p>Constructor. It takes a status argument, so it is not a default constructor.</p>
167 * @param status A UErrorCode passed by reference; presumably reports construction errors (TODO confirm in the implementation).
168 */
169 UnhandledEngine(UErrorCode &status);
170
171 /**
172 * <p>Virtual destructor.</p>
173 */
174 virtual ~UnhandledEngine();
175
176 /**
177 * <p>Indicate whether this engine handles a particular character for
178 * a particular kind of break.</p>
179 * <p>Implements the pure virtual handles() declared in LanguageBreakEngine.</p>
180 * @param c A character which begins a run that the engine might handle
181 * @param breakType The type of text break which the caller wants to determine
182 * @return TRUE if this engine handles the particular character and break
183 * type.
184 */
185 virtual UBool handles(UChar32 c, int32_t breakType) const;
186
187 /**
188 * <p>Find any breaks within a run in the supplied text.</p>
189 * <p>Implements the pure virtual findBreaks() declared in LanguageBreakEngine.</p>
190 * @param text A UText representing the text. The
191 * iterator is left at the end of the run of characters which the engine
192 * is capable of handling.
193 * @param startPos The start of the run within the supplied text.
194 * @param endPos The end of the run within the supplied text.
195 * @param reverse Whether the caller is looking for breaks in a reverse
196 * direction.
197 * @param breakType The type of break desired, or -1.
198 * @param foundBreaks A UStack (passed by reference, not a C array) of the breaks found, if any
199 * @return The number of breaks found.
200 */
201 virtual int32_t findBreaks( UText *text,
202 int32_t startPos,
203 int32_t endPos,
204 UBool reverse,
205 int32_t breakType,
206 UStack &foundBreaks ) const;
207
208 /**
209 * <p>Tell the engine to handle a particular character and break type.</p>
210 * <p>Non-const: unlike handles() and findBreaks(), this call may modify the engine.</p>
211 * @param c A character which the engine should handle
212 * @param breakType The type of text break for which the engine should handle c
213 */
214 virtual void handleCharacter(UChar32 c, int32_t breakType);
215
216 };
217
218 /*******************************************************************
219 * ICULanguageBreakFactory
220 */
221
222 /**
223 * <p>ICULanguageBreakFactory is the default LanguageBreakFactory for
224 * ICU. It creates dictionary-based LanguageBreakEngines from dictionary
225 * data in the ICU data file.</p>
226 */
227 class ICULanguageBreakFactory : public LanguageBreakFactory {
228 private:
229
230 /**
231 * The stack of break engines created by this factory, held through a UStack pointer.
232 * @internal
233 */
234
235 UStack *fEngines;
236
237 public:
238
239 /**
240 * <p>Standard constructor.</p>
241 * @param status A UErrorCode passed by reference; presumably reports construction errors (TODO confirm in the implementation).
242 */
243 ICULanguageBreakFactory(UErrorCode &status);
244
245 /**
246 * <p>Virtual destructor.</p>
247 */
248 virtual ~ICULanguageBreakFactory();
249
250 /**
251 * <p>Find and return a LanguageBreakEngine that can find the desired
252 * kind of break for the set of characters to which the supplied
253 * character belongs. It is up to the set of available engines to
254 * determine what the sets of characters are.</p>
255 * <p>Implements the pure virtual getEngineFor() declared in LanguageBreakFactory.</p>
256 * @param c A character that begins a run for which a LanguageBreakEngine is
257 * sought.
258 * @param breakType The kind of text break for which a LanguageBreakEngine is
259 * sought.
260 * @return A pointer to a const LanguageBreakEngine with the desired characteristics, or 0 (a null pointer).
261 */
262 virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType);
263
264 protected:
265 /**
266 * <p>Create a LanguageBreakEngine for the set of characters to which
267 * the supplied character belongs, for the specified break type.</p>
268 * <p>Protected and virtual, so a subclass can override it. NOTE(review): presumably called from getEngineFor(); confirm in the implementation.</p>
269 * @param c A character that begins a run for which a LanguageBreakEngine is
270 * sought.
271 * @param breakType The kind of text break for which a LanguageBreakEngine is
272 * sought.
273 * @return A pointer to a const LanguageBreakEngine with the desired characteristics, or 0 (a null pointer).
274 */
275 virtual const LanguageBreakEngine *loadEngineFor(UChar32 c, int32_t breakType);
276
277 /**
278 * <p>Create a DictionaryMatcher for the specified script and break type. Protected and virtual, so a subclass can override it.</p>
279 * @param script An ISO 15924 script code (as a UScriptCode) that identifies the dictionary to be
280 * created.
281 * @param breakType The kind of text break for which a dictionary is
282 * sought.
283 * @return A pointer to a DictionaryMatcher with the desired characteristics, or NULL.
284 */
285 virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script, int32_t breakType);
286 };
287
288 U_NAMESPACE_END
289
290 /* BRKENG_H */
291 #endif