]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/brkeng.h
ICU-400.42.tar.gz
[apple/icu.git] / icuSources / common / brkeng.h
CommitLineData
/**
 ************************************************************************************
 * Copyright (C) 2006-2007, International Business Machines Corporation and others. *
 * All Rights Reserved.                                                             *
 ************************************************************************************
 */

8#ifndef BRKENG_H
9#define BRKENG_H
10
11#include "unicode/utypes.h"
12#include "unicode/uobject.h"
13#include "unicode/utext.h"
14#include "unicode/uscript.h"
15
16U_NAMESPACE_BEGIN
17
18class UnicodeSet;
19class UStack;
20class CompactTrieDictionary;
21
22/*******************************************************************
23 * LanguageBreakEngine
24 */
25
26/**
27 * <p>LanguageBreakEngines implement language-specific knowledge for
28 * finding text boundaries within a run of characters belonging to a
29 * specific set. The boundaries will be of a specific kind, e.g. word,
30 * line, etc.</p>
31 *
32 * <p>LanguageBreakEngines should normally be implemented so as to
33 * be shared between threads without locking.</p>
34 */
35class LanguageBreakEngine : public UMemory {
36 public:
37
38 /**
39 * <p>Default constructor.</p>
40 *
41 */
42 LanguageBreakEngine();
43
44 /**
45 * <p>Virtual destructor.</p>
46 */
47 virtual ~LanguageBreakEngine();
48
49 /**
50 * <p>Indicate whether this engine handles a particular character for
51 * a particular kind of break.</p>
52 *
53 * @param c A character which begins a run that the engine might handle
54 * @param breakType The type of text break which the caller wants to determine
55 * @return TRUE if this engine handles the particular character and break
56 * type.
57 */
58 virtual UBool handles(UChar32 c, int32_t breakType) const = 0;
59
60 /**
61 * <p>Find any breaks within a run in the supplied text.</p>
62 *
63 * @param text A UText representing the text. The
64 * iterator is left at the end of the run of characters which the engine
65 * is capable of handling.
66 * @param startPos The start of the run within the supplied text.
67 * @param endPos The end of the run within the supplied text.
68 * @param reverse Whether the caller is looking for breaks in a reverse
69 * direction.
70 * @param breakType The type of break desired, or -1.
71 * @param foundBreaks An allocated C array of the breaks found, if any
72 * @return The number of breaks found.
73 */
74 virtual int32_t findBreaks( UText *text,
75 int32_t startPos,
76 int32_t endPos,
77 UBool reverse,
78 int32_t breakType,
79 UStack &foundBreaks ) const = 0;
80
81};
82
83/*******************************************************************
84 * LanguageBreakFactory
85 */
86
87/**
88 * <p>LanguageBreakFactorys find and return a LanguageBreakEngine
89 * that can determine breaks for characters in a specific set, if
90 * such an object can be found.</p>
91 *
92 * <p>If a LanguageBreakFactory is to be shared between threads,
93 * appropriate synchronization must be used; there is none internal
94 * to the factory.</p>
95 *
96 * <p>A LanguageBreakEngine returned by a LanguageBreakFactory can
97 * normally be shared between threads without synchronization, unless
98 * the specific subclass of LanguageBreakFactory indicates otherwise.</p>
99 *
100 * <p>A LanguageBreakFactory is responsible for deleting any LanguageBreakEngine
101 * it returns when it itself is deleted, unless the specific subclass of
102 * LanguageBreakFactory indicates otherwise. Naturally, the factory should
103 * not be deleted until the LanguageBreakEngines it has returned are no
104 * longer needed.</p>
105 */
106class LanguageBreakFactory : public UMemory {
107 public:
108
109 /**
110 * <p>Default constructor.</p>
111 *
112 */
113 LanguageBreakFactory();
114
115 /**
116 * <p>Virtual destructor.</p>
117 */
118 virtual ~LanguageBreakFactory();
119
120 /**
121 * <p>Find and return a LanguageBreakEngine that can find the desired
122 * kind of break for the set of characters to which the supplied
123 * character belongs. It is up to the set of available engines to
124 * determine what the sets of characters are.</p>
125 *
126 * @param c A character that begins a run for which a LanguageBreakEngine is
127 * sought.
128 * @param breakType The kind of text break for which a LanguageBreakEngine is
129 * sought.
130 * @return A LanguageBreakEngine with the desired characteristics, or 0.
131 */
132 virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType) = 0;
133
134};
135
136/*******************************************************************
137 * UnhandledEngine
138 */
139
140/**
141 * <p>UnhandledEngine is a special subclass of LanguageBreakEngine that
142 * handles characters that no other LanguageBreakEngine is available to
143 * handle. It is told the character and the type of break; at its
144 * discretion it may handle more than the specified character (e.g.,
145 * the entire script to which that character belongs.</p>
146 *
147 * <p>UnhandledEngines may not be shared between threads without
148 * external synchronization.</p>
149 */
150
151class UnhandledEngine : public LanguageBreakEngine {
152 private:
153
154 /**
155 * The sets of characters handled, for each break type
156 * @internal
157 */
158
159 UnicodeSet *fHandled[4];
160
161 public:
162
163 /**
164 * <p>Default constructor.</p>
165 *
166 */
167 UnhandledEngine(UErrorCode &status);
168
169 /**
170 * <p>Virtual destructor.</p>
171 */
172 virtual ~UnhandledEngine();
173
174 /**
175 * <p>Indicate whether this engine handles a particular character for
176 * a particular kind of break.</p>
177 *
178 * @param c A character which begins a run that the engine might handle
179 * @param breakType The type of text break which the caller wants to determine
180 * @return TRUE if this engine handles the particular character and break
181 * type.
182 */
183 virtual UBool handles(UChar32 c, int32_t breakType) const;
184
185 /**
186 * <p>Find any breaks within a run in the supplied text.</p>
187 *
188 * @param text A UText representing the text (TODO: UText). The
189 * iterator is left at the end of the run of characters which the engine
190 * is capable of handling.
191 * @param startPos The start of the run within the supplied text.
192 * @param endPos The end of the run within the supplied text.
193 * @param reverse Whether the caller is looking for breaks in a reverse
194 * direction.
195 * @param breakType The type of break desired, or -1.
196 * @param foundBreaks An allocated C array of the breaks found, if any
197 * @return The number of breaks found.
198 */
199 virtual int32_t findBreaks( UText *text,
200 int32_t startPos,
201 int32_t endPos,
202 UBool reverse,
203 int32_t breakType,
204 UStack &foundBreaks ) const;
205
206 /**
207 * <p>Tell the engine to handle a particular character and break type.</p>
208 *
209 * @param c A character which the engine should handle
210 * @param breakType The type of text break for which the engine should handle c
211 */
212 virtual void handleCharacter(UChar32 c, int32_t breakType);
213
214};
215
216/*******************************************************************
217 * ICULanguageBreakFactory
218 */
219
220/**
221 * <p>ICULanguageBreakFactory is the default LanguageBreakFactory for
222 * ICU. It creates dictionary-based LanguageBreakEngines from dictionary
223 * data in the ICU data file.</p>
224 */
225class ICULanguageBreakFactory : public LanguageBreakFactory {
226 private:
227
228 /**
229 * The stack of break engines created by this factory
230 * @internal
231 */
232
233 UStack *fEngines;
234
235 public:
236
237 /**
238 * <p>Standard constructor.</p>
239 *
240 */
241 ICULanguageBreakFactory(UErrorCode &status);
242
243 /**
244 * <p>Virtual destructor.</p>
245 */
246 virtual ~ICULanguageBreakFactory();
247
248 /**
249 * <p>Find and return a LanguageBreakEngine that can find the desired
250 * kind of break for the set of characters to which the supplied
251 * character belongs. It is up to the set of available engines to
252 * determine what the sets of characters are.</p>
253 *
254 * @param c A character that begins a run for which a LanguageBreakEngine is
255 * sought.
256 * @param breakType The kind of text break for which a LanguageBreakEngine is
257 * sought.
258 * @return A LanguageBreakEngine with the desired characteristics, or 0.
259 */
260 virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType);
261
262 protected:
263
264 /**
265 * <p>Create a LanguageBreakEngine for the set of characters to which
266 * the supplied character belongs, for the specified break type.</p>
267 *
268 * @param c A character that begins a run for which a LanguageBreakEngine is
269 * sought.
270 * @param breakType The kind of text break for which a LanguageBreakEngine is
271 * sought.
272 * @return A LanguageBreakEngine with the desired characteristics, or 0.
273 */
274 virtual const LanguageBreakEngine *loadEngineFor(UChar32 c, int32_t breakType);
275
276 /**
277 * <p>Create a CompactTrieDictionary for the specified script and break type.</p>
278 *
279 * @param script An ISO 15924 script code that identifies the dictionary to be
280 * created.
281 * @param breakType The kind of text break for which a dictionary is
282 * sought.
283 * @return A CompactTrieDictionary with the desired characteristics, or 0.
284 */
285 virtual const CompactTrieDictionary *loadDictionaryFor(UScriptCode script, int32_t breakType);
286
287};
288
289U_NAMESPACE_END
290
291 /* BRKENG_H */
292#endif