]>
Commit | Line | Data |
---|---|---|
73c04bcf A |
1 | /** |
2 | ******************************************************************************* | |
46f4442e | 3 | * Copyright (C) 2006-2008, International Business Machines Corporation and others. * |
73c04bcf A |
4 | * All Rights Reserved. * |
5 | ******************************************************************************* | |
6 | */ | |
7 | ||
8 | #include "unicode/utypes.h" | |
9 | ||
10 | #if !UCONFIG_NO_BREAK_ITERATION | |
11 | ||
12 | #include "brkeng.h" | |
13 | #include "dictbe.h" | |
14 | #include "unicode/uniset.h" | |
15 | #include "unicode/chariter.h" | |
16 | #include "unicode/ubrk.h" | |
17 | #include "uvector.h" | |
18 | #include "triedict.h" | |
19 | ||
20 | U_NAMESPACE_BEGIN | |
21 | ||
22 | /* | |
23 | ****************************************************************** | |
24 | */ | |
25 | ||
26 | /*DictionaryBreakEngine::DictionaryBreakEngine() { | |
27 | fTypes = 0; | |
28 | }*/ | |
29 | ||
30 | DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) { | |
31 | fTypes = breakTypes; | |
32 | } | |
33 | ||
34 | DictionaryBreakEngine::~DictionaryBreakEngine() { | |
35 | } | |
36 | ||
37 | UBool | |
38 | DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const { | |
39 | return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes) | |
40 | && fSet.contains(c)); | |
41 | } | |
42 | ||
43 | int32_t | |
44 | DictionaryBreakEngine::findBreaks( UText *text, | |
45 | int32_t startPos, | |
46 | int32_t endPos, | |
47 | UBool reverse, | |
48 | int32_t breakType, | |
49 | UStack &foundBreaks ) const { | |
50 | int32_t result = 0; | |
51 | ||
52 | // Find the span of characters included in the set. | |
53 | int32_t start = (int32_t)utext_getNativeIndex(text); | |
54 | int32_t current; | |
55 | int32_t rangeStart; | |
56 | int32_t rangeEnd; | |
57 | UChar32 c = utext_current32(text); | |
58 | if (reverse) { | |
59 | UBool isDict = fSet.contains(c); | |
60 | while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) { | |
61 | c = utext_previous32(text); | |
62 | isDict = fSet.contains(c); | |
63 | } | |
64 | rangeStart = (current < startPos) ? startPos : current+(isDict ? 0 : 1); | |
65 | rangeEnd = start + 1; | |
66 | } | |
67 | else { | |
68 | while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) { | |
69 | utext_next32(text); // TODO: recast loop for postincrement | |
70 | c = utext_current32(text); | |
71 | } | |
72 | rangeStart = start; | |
73 | rangeEnd = current; | |
74 | } | |
75 | if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) { | |
76 | result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks); | |
77 | utext_setNativeIndex(text, current); | |
78 | } | |
79 | ||
80 | return result; | |
81 | } | |
82 | ||
83 | void | |
46f4442e | 84 | DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) { |
73c04bcf | 85 | fSet = set; |
46f4442e A |
86 | // Compact for caching |
87 | fSet.compact(); | |
73c04bcf A |
88 | } |
89 | ||
90 | /*void | |
91 | DictionaryBreakEngine::setBreakTypes( uint32_t breakTypes ) { | |
92 | fTypes = breakTypes; | |
93 | }*/ | |
94 | ||
95 | /* | |
96 | ****************************************************************** | |
97 | */ | |
98 | ||
99 | ||
100 | // Helper class for improving readability of the Thai word break | |
101 | // algorithm. The implementation is completely inline. | |
102 | ||
103 | // List size, limited by the maximum number of words in the dictionary | |
104 | // that form a nested sequence. | |
105 | #define POSSIBLE_WORD_LIST_MAX 20 | |
106 | ||
107 | class PossibleWord { | |
108 | private: | |
109 | // list of word candidate lengths, in increasing length order | |
110 | int32_t lengths[POSSIBLE_WORD_LIST_MAX]; | |
111 | int count; // Count of candidates | |
112 | int32_t prefix; // The longest match with a dictionary word | |
113 | int32_t offset; // Offset in the text of these candidates | |
114 | int mark; // The preferred candidate's offset | |
115 | int current; // The candidate we're currently looking at | |
116 | ||
117 | public: | |
118 | PossibleWord(); | |
119 | ~PossibleWord(); | |
120 | ||
121 | // Fill the list of candidates if needed, select the longest, and return the number found | |
122 | int candidates( UText *text, const TrieWordDictionary *dict, int32_t rangeEnd ); | |
123 | ||
124 | // Select the currently marked candidate, point after it in the text, and invalidate self | |
125 | int32_t acceptMarked( UText *text ); | |
126 | ||
127 | // Back up from the current candidate to the next shorter one; return TRUE if that exists | |
128 | // and point the text after it | |
129 | UBool backUp( UText *text ); | |
130 | ||
131 | // Return the longest prefix this candidate location shares with a dictionary word | |
132 | int32_t longestPrefix(); | |
133 | ||
134 | // Mark the current candidate as the one we like | |
135 | void markCurrent(); | |
136 | }; | |
137 | ||
138 | inline | |
139 | PossibleWord::PossibleWord() { | |
140 | offset = -1; | |
141 | } | |
142 | ||
143 | inline | |
144 | PossibleWord::~PossibleWord() { | |
145 | } | |
146 | ||
147 | inline int | |
148 | PossibleWord::candidates( UText *text, const TrieWordDictionary *dict, int32_t rangeEnd ) { | |
149 | // TODO: If getIndex is too slow, use offset < 0 and add discardAll() | |
150 | int32_t start = (int32_t)utext_getNativeIndex(text); | |
151 | if (start != offset) { | |
152 | offset = start; | |
153 | prefix = dict->matches(text, rangeEnd-start, lengths, count, sizeof(lengths)/sizeof(lengths[0])); | |
154 | // Dictionary leaves text after longest prefix, not longest word. Back up. | |
155 | if (count <= 0) { | |
156 | utext_setNativeIndex(text, start); | |
157 | } | |
158 | } | |
159 | if (count > 0) { | |
160 | utext_setNativeIndex(text, start+lengths[count-1]); | |
161 | } | |
162 | current = count-1; | |
163 | mark = current; | |
164 | return count; | |
165 | } | |
166 | ||
167 | inline int32_t | |
168 | PossibleWord::acceptMarked( UText *text ) { | |
169 | utext_setNativeIndex(text, offset + lengths[mark]); | |
170 | return lengths[mark]; | |
171 | } | |
172 | ||
173 | inline UBool | |
174 | PossibleWord::backUp( UText *text ) { | |
175 | if (current > 0) { | |
176 | utext_setNativeIndex(text, offset + lengths[--current]); | |
177 | return TRUE; | |
178 | } | |
179 | return FALSE; | |
180 | } | |
181 | ||
182 | inline int32_t | |
183 | PossibleWord::longestPrefix() { | |
184 | return prefix; | |
185 | } | |
186 | ||
187 | inline void | |
188 | PossibleWord::markCurrent() { | |
189 | mark = current; | |
190 | } | |
191 | ||
192 | // How many words in a row are "good enough"? | |
193 | #define THAI_LOOKAHEAD 3 | |
194 | ||
195 | // Will not combine a non-word with a preceding dictionary word longer than this | |
196 | #define THAI_ROOT_COMBINE_THRESHOLD 3 | |
197 | ||
198 | // Will not combine a non-word that shares at least this much prefix with a | |
199 | // dictionary word, with a preceding word | |
200 | #define THAI_PREFIX_COMBINE_THRESHOLD 3 | |
201 | ||
202 | // Ellision character | |
203 | #define THAI_PAIYANNOI 0x0E2F | |
204 | ||
205 | // Repeat character | |
206 | #define THAI_MAIYAMOK 0x0E46 | |
207 | ||
208 | // Minimum word size | |
209 | #define THAI_MIN_WORD 2 | |
210 | ||
211 | // Minimum number of characters for two words | |
212 | #define THAI_MIN_WORD_SPAN (THAI_MIN_WORD * 2) | |
213 | ||
214 | ThaiBreakEngine::ThaiBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status) | |
215 | : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)), | |
216 | fDictionary(adoptDictionary) | |
217 | { | |
218 | fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status); | |
219 | if (U_SUCCESS(status)) { | |
220 | setCharacters(fThaiWordSet); | |
221 | } | |
222 | fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status); | |
46f4442e | 223 | fMarkSet.add(0x0020); |
73c04bcf A |
224 | fEndWordSet = fThaiWordSet; |
225 | fEndWordSet.remove(0x0E31); // MAI HAN-AKAT | |
226 | fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI | |
227 | fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK | |
228 | fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI | |
229 | fSuffixSet.add(THAI_PAIYANNOI); | |
230 | fSuffixSet.add(THAI_MAIYAMOK); | |
46f4442e A |
231 | |
232 | // Compact for caching. | |
233 | fMarkSet.compact(); | |
234 | fEndWordSet.compact(); | |
235 | fBeginWordSet.compact(); | |
236 | fSuffixSet.compact(); | |
73c04bcf A |
237 | } |
238 | ||
239 | ThaiBreakEngine::~ThaiBreakEngine() { | |
240 | delete fDictionary; | |
241 | } | |
242 | ||
243 | int32_t | |
244 | ThaiBreakEngine::divideUpDictionaryRange( UText *text, | |
245 | int32_t rangeStart, | |
246 | int32_t rangeEnd, | |
247 | UStack &foundBreaks ) const { | |
248 | if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) { | |
249 | return 0; // Not enough characters for two words | |
250 | } | |
251 | ||
252 | uint32_t wordsFound = 0; | |
253 | int32_t wordLength; | |
254 | int32_t current; | |
255 | UErrorCode status = U_ZERO_ERROR; | |
256 | PossibleWord words[THAI_LOOKAHEAD]; | |
257 | UChar32 uc; | |
258 | ||
259 | utext_setNativeIndex(text, rangeStart); | |
260 | ||
261 | while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) { | |
262 | wordLength = 0; | |
263 | ||
264 | // Look for candidate words at the current position | |
265 | int candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); | |
266 | ||
267 | // If we found exactly one, use that | |
268 | if (candidates == 1) { | |
269 | wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(text); | |
270 | wordsFound += 1; | |
271 | } | |
272 | ||
273 | // If there was more than one, see which one can take us forward the most words | |
274 | else if (candidates > 1) { | |
275 | // If we're already at the end of the range, we're done | |
276 | if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { | |
277 | goto foundBest; | |
278 | } | |
279 | do { | |
280 | int wordsMatched = 1; | |
281 | if (words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { | |
282 | if (wordsMatched < 2) { | |
283 | // Followed by another dictionary word; mark first word as a good candidate | |
284 | words[wordsFound%THAI_LOOKAHEAD].markCurrent(); | |
285 | wordsMatched = 2; | |
286 | } | |
287 | ||
288 | // If we're already at the end of the range, we're done | |
289 | if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { | |
290 | goto foundBest; | |
291 | } | |
292 | ||
293 | // See if any of the possible second words is followed by a third word | |
294 | do { | |
295 | // If we find a third word, stop right away | |
296 | if (words[(wordsFound+2)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { | |
297 | words[wordsFound%THAI_LOOKAHEAD].markCurrent(); | |
298 | goto foundBest; | |
299 | } | |
300 | } | |
301 | while (words[(wordsFound+1)%THAI_LOOKAHEAD].backUp(text)); | |
302 | } | |
303 | } | |
304 | while (words[wordsFound%THAI_LOOKAHEAD].backUp(text)); | |
305 | foundBest: | |
306 | wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(text); | |
307 | wordsFound += 1; | |
308 | } | |
309 | ||
310 | // We come here after having either found a word or not. We look ahead to the | |
311 | // next word. If it's not a dictionary word, we will combine it withe the word we | |
312 | // just found (if there is one), but only if the preceding word does not exceed | |
313 | // the threshold. | |
314 | // The text iterator should now be positioned at the end of the word we found. | |
315 | if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) { | |
316 | // if it is a dictionary word, do nothing. If it isn't, then if there is | |
317 | // no preceding word, or the non-word shares less than the minimum threshold | |
318 | // of characters with a dictionary word, then scan to resynchronize | |
319 | if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 | |
320 | && (wordLength == 0 | |
321 | || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) { | |
322 | // Look for a plausible word boundary | |
323 | //TODO: This section will need a rework for UText. | |
324 | int32_t remaining = rangeEnd - (current+wordLength); | |
325 | UChar32 pc = utext_current32(text); | |
326 | int32_t chars = 0; | |
46f4442e | 327 | for (;;) { |
73c04bcf A |
328 | utext_next32(text); |
329 | uc = utext_current32(text); | |
330 | // TODO: Here we're counting on the fact that the SA languages are all | |
331 | // in the BMP. This should get fixed with the UText rework. | |
332 | chars += 1; | |
333 | if (--remaining <= 0) { | |
334 | break; | |
335 | } | |
336 | if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { | |
337 | // Maybe. See if it's in the dictionary. | |
338 | // NOTE: In the original Apple code, checked that the next | |
339 | // two characters after uc were not 0x0E4C THANTHAKHAT before | |
340 | // checking the dictionary. That is just a performance filter, | |
341 | // but it's not clear it's faster than checking the trie. | |
342 | int candidates = words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); | |
343 | utext_setNativeIndex(text, current+wordLength+chars); | |
344 | if (candidates > 0) { | |
345 | break; | |
346 | } | |
347 | } | |
348 | pc = uc; | |
349 | } | |
350 | ||
351 | // Bump the word count if there wasn't already one | |
352 | if (wordLength <= 0) { | |
353 | wordsFound += 1; | |
354 | } | |
355 | ||
356 | // Update the length with the passed-over characters | |
357 | wordLength += chars; | |
358 | } | |
359 | else { | |
360 | // Back up to where we were for next iteration | |
361 | utext_setNativeIndex(text, current+wordLength); | |
362 | } | |
363 | } | |
364 | ||
365 | // Never stop before a combining mark. | |
366 | int32_t currPos; | |
367 | while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) { | |
368 | utext_next32(text); | |
369 | wordLength += (int32_t)utext_getNativeIndex(text) - currPos; | |
370 | } | |
371 | ||
372 | // Look ahead for possible suffixes if a dictionary word does not follow. | |
373 | // We do this in code rather than using a rule so that the heuristic | |
374 | // resynch continues to function. For example, one of the suffix characters | |
375 | // could be a typo in the middle of a word. | |
376 | if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) { | |
377 | if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 | |
378 | && fSuffixSet.contains(uc = utext_current32(text))) { | |
379 | if (uc == THAI_PAIYANNOI) { | |
380 | if (!fSuffixSet.contains(utext_previous32(text))) { | |
381 | // Skip over previous end and PAIYANNOI | |
382 | utext_next32(text); | |
383 | utext_next32(text); | |
384 | wordLength += 1; // Add PAIYANNOI to word | |
385 | uc = utext_current32(text); // Fetch next character | |
386 | } | |
387 | else { | |
388 | // Restore prior position | |
389 | utext_next32(text); | |
390 | } | |
391 | } | |
392 | if (uc == THAI_MAIYAMOK) { | |
393 | if (utext_previous32(text) != THAI_MAIYAMOK) { | |
394 | // Skip over previous end and MAIYAMOK | |
395 | utext_next32(text); | |
396 | utext_next32(text); | |
397 | wordLength += 1; // Add MAIYAMOK to word | |
398 | } | |
399 | else { | |
400 | // Restore prior position | |
401 | utext_next32(text); | |
402 | } | |
403 | } | |
404 | } | |
405 | else { | |
406 | utext_setNativeIndex(text, current+wordLength); | |
407 | } | |
408 | } | |
409 | ||
410 | // Did we find a word on this iteration? If so, push it on the break stack | |
411 | if (wordLength > 0) { | |
412 | foundBreaks.push((current+wordLength), status); | |
413 | } | |
414 | } | |
415 | ||
416 | // Don't return a break for the end of the dictionary range if there is one there. | |
417 | if (foundBreaks.peeki() >= rangeEnd) { | |
418 | (void) foundBreaks.popi(); | |
419 | wordsFound -= 1; | |
420 | } | |
421 | ||
422 | return wordsFound; | |
423 | } | |
424 | ||
425 | U_NAMESPACE_END | |
426 | ||
427 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |