]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/dictbe.h
ICU-491.11.3.tar.gz
[apple/icu.git] / icuSources / common / dictbe.h
1 /**
2 *******************************************************************************
3 * Copyright (C) 2006,2011, International Business Machines Corporation *
4 * and others. All Rights Reserved. *
5 *******************************************************************************
6 */
7
8 #ifndef DICTBE_H
9 #define DICTBE_H
10
11 #include "unicode/utypes.h"
12 #include "unicode/uniset.h"
13 #include "unicode/utext.h"
14
15 #include "brkeng.h"
16
17 U_NAMESPACE_BEGIN
18
19 class TrieWordDictionary;
20
21 /*******************************************************************
22 * DictionaryBreakEngine
23 */
24
25 /**
26 * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
27 * dictionary to determine language-specific breaks.</p>
28 *
29 * <p>After it is constructed a DictionaryBreakEngine may be shared between
30 * threads without synchronization.</p>
31 */
32 class DictionaryBreakEngine : public LanguageBreakEngine {
33 private:
34 /**
35 * The set of characters handled by this engine
36 * @internal
37 */
38
39 UnicodeSet fSet;
40
41 /**
42 * The set of break types handled by this engine
43 * @internal
44 */
45
46 uint32_t fTypes;
47
48 /**
49 * <p>Default constructor.</p>
50 *
51 */
52 DictionaryBreakEngine();
53
54 public:
55
56 /**
57 * <p>Constructor setting the break types handled.</p>
58 *
59 * @param breakTypes A bitmap of types handled by the engine.
60 */
61 DictionaryBreakEngine( uint32_t breakTypes );
62
63 /**
64 * <p>Virtual destructor.</p>
65 */
66 virtual ~DictionaryBreakEngine();
67
68 /**
69 * <p>Indicate whether this engine handles a particular character for
70 * a particular kind of break.</p>
71 *
72 * @param c A character which begins a run that the engine might handle
73 * @param breakType The type of text break which the caller wants to determine
74 * @return TRUE if this engine handles the particular character and break
75 * type.
76 */
77 virtual UBool handles( UChar32 c, int32_t breakType ) const;
78
79 /**
80 * <p>Find any breaks within a run in the supplied text.</p>
81 *
82 * @param text A UText representing the text. The
83 * iterator is left at the end of the run of characters which the engine
84 * is capable of handling.
85 * @param startPos The start of the run within the supplied text.
86 * @param endPos The end of the run within the supplied text.
87 * @param reverse Whether the caller is looking for breaks in a reverse
88 * direction.
89 * @param breakType The type of break desired, or -1.
90 * @param foundBreaks An allocated C array of the breaks found, if any
91 * @return The number of breaks found.
92 */
93 virtual int32_t findBreaks( UText *text,
94 int32_t startPos,
95 int32_t endPos,
96 UBool reverse,
97 int32_t breakType,
98 UStack &foundBreaks ) const;
99
100 protected:
101
102 /**
103 * <p>Set the character set handled by this engine.</p>
104 *
105 * @param set A UnicodeSet of the set of characters handled by the engine
106 */
107 virtual void setCharacters( const UnicodeSet &set );
108
109 /**
110 * <p>Set the break types handled by this engine.</p>
111 *
112 * @param breakTypes A bitmap of types handled by the engine.
113 */
114 // virtual void setBreakTypes( uint32_t breakTypes );
115
116 /**
117 * <p>Divide up a range of known dictionary characters.</p>
118 *
119 * @param text A UText representing the text
120 * @param rangeStart The start of the range of dictionary characters
121 * @param rangeEnd The end of the range of dictionary characters
122 * @param foundBreaks Output of C array of int32_t break positions, or 0
123 * @return The number of breaks found
124 */
125 virtual int32_t divideUpDictionaryRange( UText *text,
126 int32_t rangeStart,
127 int32_t rangeEnd,
128 UStack &foundBreaks ) const = 0;
129
130 };
131
132 /*******************************************************************
133 * ThaiBreakEngine
134 */
135
136 /**
137 * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
138 * TrieWordDictionary and heuristics to determine Thai-specific breaks.</p>
139 *
140 * <p>After it is constructed a ThaiBreakEngine may be shared between
141 * threads without synchronization.</p>
142 */
143 class ThaiBreakEngine : public DictionaryBreakEngine {
144 private:
145 /**
146 * The set of characters handled by this engine
147 * @internal
148 */
149
150 UnicodeSet fThaiWordSet;
151 UnicodeSet fEndWordSet;
152 UnicodeSet fBeginWordSet;
153 UnicodeSet fSuffixSet;
154 UnicodeSet fMarkSet;
155 const TrieWordDictionary *fDictionary;
156
157 public:
158
159 /**
160 * <p>Default constructor.</p>
161 *
162 * @param adoptDictionary A TrieWordDictionary to adopt. Deleted when the
163 * engine is deleted.
164 */
165 ThaiBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status);
166
167 /**
168 * <p>Virtual destructor.</p>
169 */
170 virtual ~ThaiBreakEngine();
171
172 protected:
173 /**
174 * <p>Divide up a range of known dictionary characters.</p>
175 *
176 * @param text A UText representing the text
177 * @param rangeStart The start of the range of dictionary characters
178 * @param rangeEnd The end of the range of dictionary characters
179 * @param foundBreaks Output of C array of int32_t break positions, or 0
180 * @return The number of breaks found
181 */
182 virtual int32_t divideUpDictionaryRange( UText *text,
183 int32_t rangeStart,
184 int32_t rangeEnd,
185 UStack &foundBreaks ) const;
186
187 };
188
189
190 /*******************************************************************
191 * KhmerBreakEngine
192 */
193
194 /**
195 * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
196 * TrieWordDictionary and heuristics to determine Khmer-specific breaks.</p>
197 *
198 * <p>After it is constructed a KhmerBreakEngine may be shared between
199 * threads without synchronization.</p>
200 */
201 class KhmerBreakEngine : public DictionaryBreakEngine {
202 private:
203 /**
204 * The set of characters handled by this engine
205 * @internal
206 */
207
208 UnicodeSet fKhmerWordSet;
209 UnicodeSet fEndWordSet;
210 UnicodeSet fBeginWordSet;
211 UnicodeSet fMarkSet;
212 const TrieWordDictionary *fDictionary;
213
214 public:
215
216 /**
217 * <p>Default constructor.</p>
218 *
219 * @param adoptDictionary A TrieWordDictionary to adopt. Deleted when the
220 * engine is deleted.
221 */
222 KhmerBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status);
223
224 /**
225 * <p>Virtual destructor.</p>
226 */
227 virtual ~KhmerBreakEngine();
228
229 protected:
230 /**
231 * <p>Divide up a range of known dictionary characters.</p>
232 *
233 * @param text A UText representing the text
234 * @param rangeStart The start of the range of dictionary characters
235 * @param rangeEnd The end of the range of dictionary characters
236 * @param foundBreaks Output of C array of int32_t break positions, or 0
237 * @return The number of breaks found
238 */
239 virtual int32_t divideUpDictionaryRange( UText *text,
240 int32_t rangeStart,
241 int32_t rangeEnd,
242 UStack &foundBreaks ) const;
243
244 };
245
246
247 U_NAMESPACE_END
248
249 /* DICTBE_H */
250 #endif