Commit | Line | Data |
---|---|---|
73c04bcf | 1 | /** |
46f4442e | 2 | ************************************************************************************ |
51004dcb | 3 | * Copyright (C) 2006-2012, International Business Machines Corporation and others. * |
46f4442e A |
4 | * All Rights Reserved. * |
5 | ************************************************************************************ | |
73c04bcf A |
6 | */ |
7 | ||
8 | #ifndef BRKENG_H | |
9 | #define BRKENG_H | |
10 | ||
11 | #include "unicode/utypes.h" | |
12 | #include "unicode/uobject.h" | |
13 | #include "unicode/utext.h" | |
14 | #include "unicode/uscript.h" | |
15 | ||
16 | U_NAMESPACE_BEGIN | |
17 | ||
18 | class UnicodeSet; | |
19 | class UStack; | |
51004dcb | 20 | class DictionaryMatcher; |
73c04bcf A |
21 | |
22 | /******************************************************************* | |
23 | * LanguageBreakEngine | |
24 | */ | |
25 | ||
26 | /** | |
27 | * <p>LanguageBreakEngines implement language-specific knowledge for | |
28 | * finding text boundaries within a run of characters belonging to a | |
29 | * specific set. The boundaries will be of a specific kind, e.g. word, | |
30 | * line, etc.</p> | |
31 | * | |
32 | * <p>LanguageBreakEngines should normally be implemented so as to | |
33 | * be shared between threads without locking.</p> | |
34 | */ | |
35 | class LanguageBreakEngine : public UMemory { | |
36 | public: | |
37 | ||
38 | /** | |
39 | * <p>Default constructor.</p> | |
40 | * | |
41 | */ | |
42 | LanguageBreakEngine(); | |
43 | ||
44 | /** | |
45 | * <p>Virtual destructor.</p> | |
46 | */ | |
47 | virtual ~LanguageBreakEngine(); | |
48 | ||
49 | /** | |
50 | * <p>Indicate whether this engine handles a particular character for | |
51 | * a particular kind of break.</p> | |
52 | * | |
53 | * @param c A character which begins a run that the engine might handle | |
54 | * @param breakType The type of text break which the caller wants to determine | |
55 | * @return TRUE if this engine handles the particular character and break | |
56 | * type. | |
57 | */ | |
58 | virtual UBool handles(UChar32 c, int32_t breakType) const = 0; | |
59 | ||
60 | /** | |
61 | * <p>Find any breaks within a run in the supplied text.</p> | |
62 | * | |
63 | * @param text A UText representing the text. The | |
64 | * iterator is left at the end of the run of characters which the engine | |
65 | * is capable of handling. | |
66 | * @param startPos The start of the run within the supplied text. | |
67 | * @param endPos The end of the run within the supplied text. | |
68 | * @param reverse Whether the caller is looking for breaks in a reverse | |
69 | * direction. | |
70 | * @param breakType The type of break desired, or -1. | |
71 | * @param foundBreaks An allocated C array of the breaks found, if any | |
72 | * @return The number of breaks found. | |
73 | */ | |
74 | virtual int32_t findBreaks( UText *text, | |
75 | int32_t startPos, | |
76 | int32_t endPos, | |
77 | UBool reverse, | |
78 | int32_t breakType, | |
79 | UStack &foundBreaks ) const = 0; | |
80 | ||
81 | }; | |
82 | ||
83 | /******************************************************************* | |
84 | * LanguageBreakFactory | |
85 | */ | |
86 | ||
87 | /** | |
88 | * <p>A LanguageBreakFactory finds and returns a LanguageBreakEngine | |
89 | * that can determine breaks for characters in a specific set, if | |
90 | * such an object can be found.</p> | |
91 | * | |
92 | * <p>If a LanguageBreakFactory is to be shared between threads, | |
93 | * appropriate synchronization must be used; there is none internal | |
94 | * to the factory.</p> | |
95 | * | |
96 | * <p>A LanguageBreakEngine returned by a LanguageBreakFactory can | |
97 | * normally be shared between threads without synchronization, unless | |
98 | * the specific subclass of LanguageBreakFactory indicates otherwise.</p> | |
99 | * | |
100 | * <p>A LanguageBreakFactory is responsible for deleting any LanguageBreakEngine | |
101 | * it returns when it itself is deleted, unless the specific subclass of | |
102 | * LanguageBreakFactory indicates otherwise. Naturally, the factory should | |
103 | * not be deleted until the LanguageBreakEngines it has returned are no | |
104 | * longer needed.</p> | |
105 | */ | |
106 | class LanguageBreakFactory : public UMemory { | |
107 | public: | |
108 | ||
109 | /** | |
110 | * <p>Default constructor.</p> | |
111 | * | |
112 | */ | |
113 | LanguageBreakFactory(); | |
114 | ||
115 | /** | |
116 | * <p>Virtual destructor.</p> | |
117 | */ | |
118 | virtual ~LanguageBreakFactory(); | |
119 | ||
120 | /** | |
121 | * <p>Find and return a LanguageBreakEngine that can find the desired | |
122 | * kind of break for the set of characters to which the supplied | |
123 | * character belongs. It is up to the set of available engines to | |
124 | * determine what the sets of characters are.</p> | |
125 | * | |
126 | * @param c A character that begins a run for which a LanguageBreakEngine is | |
127 | * sought. | |
128 | * @param breakType The kind of text break for which a LanguageBreakEngine is | |
129 | * sought. | |
130 | * @return A LanguageBreakEngine with the desired characteristics, or 0. | |
131 | */ | |
132 | virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType) = 0; | |
133 | ||
134 | }; | |
135 | ||
136 | /******************************************************************* | |
137 | * UnhandledEngine | |
138 | */ | |
139 | ||
140 | /** | |
141 | * <p>UnhandledEngine is a special subclass of LanguageBreakEngine that | |
142 | * handles characters that no other LanguageBreakEngine is available to | |
143 | * handle. It is told the character and the type of break; at its | |
144 | * discretion it may handle more than the specified character (e.g., | |
145 | * the entire script to which that character belongs).</p> | |
146 | * | |
147 | * <p>UnhandledEngines may not be shared between threads without | |
148 | * external synchronization.</p> | |
149 | */ | |
150 | ||
151 | class UnhandledEngine : public LanguageBreakEngine { | |
152 | private: | |
153 | ||
154 | /** | |
155 | * The sets of characters handled, for each break type | |
156 | * @internal | |
157 | */ | |
158 | ||
159 | UnicodeSet *fHandled[4]; | |
160 | ||
161 | public: | |
162 | ||
163 | /** | |
164 | * <p>Default constructor.</p> | |
165 | * | |
166 | */ | |
167 | UnhandledEngine(UErrorCode &status); | |
168 | ||
169 | /** | |
170 | * <p>Virtual destructor.</p> | |
171 | */ | |
172 | virtual ~UnhandledEngine(); | |
173 | ||
174 | /** | |
175 | * <p>Indicate whether this engine handles a particular character for | |
176 | * a particular kind of break.</p> | |
177 | * | |
178 | * @param c A character which begins a run that the engine might handle | |
179 | * @param breakType The type of text break which the caller wants to determine | |
180 | * @return TRUE if this engine handles the particular character and break | |
181 | * type. | |
182 | */ | |
183 | virtual UBool handles(UChar32 c, int32_t breakType) const; | |
184 | ||
185 | /** | |
186 | * <p>Find any breaks within a run in the supplied text.</p> | |
187 | * | |
188 | * @param text A UText representing the text (TODO: UText). The | |
189 | * iterator is left at the end of the run of characters which the engine | |
190 | * is capable of handling. | |
191 | * @param startPos The start of the run within the supplied text. | |
192 | * @param endPos The end of the run within the supplied text. | |
193 | * @param reverse Whether the caller is looking for breaks in a reverse | |
194 | * direction. | |
195 | * @param breakType The type of break desired, or -1. | |
196 | * @param foundBreaks An allocated C array of the breaks found, if any | |
197 | * @return The number of breaks found. | |
198 | */ | |
199 | virtual int32_t findBreaks( UText *text, | |
200 | int32_t startPos, | |
201 | int32_t endPos, | |
202 | UBool reverse, | |
203 | int32_t breakType, | |
204 | UStack &foundBreaks ) const; | |
205 | ||
206 | /** | |
207 | * <p>Tell the engine to handle a particular character and break type.</p> | |
208 | * | |
209 | * @param c A character which the engine should handle | |
210 | * @param breakType The type of text break for which the engine should handle c | |
211 | */ | |
212 | virtual void handleCharacter(UChar32 c, int32_t breakType); | |
213 | ||
214 | }; | |
215 | ||
216 | /******************************************************************* | |
217 | * ICULanguageBreakFactory | |
218 | */ | |
219 | ||
220 | /** | |
221 | * <p>ICULanguageBreakFactory is the default LanguageBreakFactory for | |
222 | * ICU. It creates dictionary-based LanguageBreakEngines from dictionary | |
223 | * data in the ICU data file.</p> | |
224 | */ | |
225 | class ICULanguageBreakFactory : public LanguageBreakFactory { | |
226 | private: | |
227 | ||
228 | /** | |
229 | * The stack of break engines created by this factory | |
230 | * @internal | |
231 | */ | |
232 | ||
233 | UStack *fEngines; | |
234 | ||
235 | public: | |
236 | ||
237 | /** | |
238 | * <p>Standard constructor.</p> | |
239 | * | |
240 | */ | |
241 | ICULanguageBreakFactory(UErrorCode &status); | |
242 | ||
243 | /** | |
244 | * <p>Virtual destructor.</p> | |
245 | */ | |
246 | virtual ~ICULanguageBreakFactory(); | |
247 | ||
248 | /** | |
249 | * <p>Find and return a LanguageBreakEngine that can find the desired | |
250 | * kind of break for the set of characters to which the supplied | |
251 | * character belongs. It is up to the set of available engines to | |
252 | * determine what the sets of characters are.</p> | |
253 | * | |
254 | * @param c A character that begins a run for which a LanguageBreakEngine is | |
255 | * sought. | |
256 | * @param breakType The kind of text break for which a LanguageBreakEngine is | |
257 | * sought. | |
258 | * @return A LanguageBreakEngine with the desired characteristics, or 0. | |
259 | */ | |
260 | virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType); | |
261 | ||
51004dcb | 262 | protected: |
73c04bcf A |
263 | /** |
264 | * <p>Create a LanguageBreakEngine for the set of characters to which | |
265 | * the supplied character belongs, for the specified break type.</p> | |
266 | * | |
267 | * @param c A character that begins a run for which a LanguageBreakEngine is | |
268 | * sought. | |
269 | * @param breakType The kind of text break for which a LanguageBreakEngine is | |
270 | * sought. | |
271 | * @return A LanguageBreakEngine with the desired characteristics, or 0. | |
272 | */ | |
273 | virtual const LanguageBreakEngine *loadEngineFor(UChar32 c, int32_t breakType); | |
274 | ||
51004dcb A |
275 | /** |
276 | * <p>Create a DictionaryMatcher for the specified script and break type.</p> | |
277 | * @param script An ISO 15924 script code that identifies the dictionary to be | |
278 | * created. | |
279 | * @param breakType The kind of text break for which a dictionary is | |
280 | * sought. | |
281 | * @return A DictionaryMatcher with the desired characteristics, or NULL. | |
282 | */ | |
283 | virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script, int32_t breakType); | |
73c04bcf A |
284 | }; |
285 | ||
286 | U_NAMESPACE_END | |
287 | ||
288 | /* BRKENG_H */ | |
289 | #endif |