]>
Commit | Line | Data |
---|---|---|
73c04bcf A |
1 | /** |
2 | ******************************************************************************* | |
3 | * Copyright (C) 2006, International Business Machines Corporation and others. * | |
4 | * All Rights Reserved. * | |
5 | ******************************************************************************* | |
6 | */ | |
7 | ||
8 | #include "unicode/utypes.h" | |
9 | ||
10 | #if !UCONFIG_NO_BREAK_ITERATION | |
11 | ||
12 | #include "brkeng.h" | |
13 | #include "dictbe.h" | |
14 | #include "unicode/uniset.h" | |
15 | #include "unicode/chariter.h" | |
16 | #include "unicode/ubrk.h" | |
17 | #include "uvector.h" | |
18 | #include "triedict.h" | |
19 | ||
20 | U_NAMESPACE_BEGIN | |
21 | ||
22 | /* | |
23 | ****************************************************************** | |
24 | */ | |
25 | ||
26 | /*DictionaryBreakEngine::DictionaryBreakEngine() { | |
27 | fTypes = 0; | |
28 | }*/ | |
29 | ||
30 | DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) { | |
31 | fTypes = breakTypes; | |
32 | } | |
33 | ||
34 | DictionaryBreakEngine::~DictionaryBreakEngine() { | |
35 | } | |
36 | ||
37 | UBool | |
38 | DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const { | |
39 | return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes) | |
40 | && fSet.contains(c)); | |
41 | } | |
42 | ||
43 | int32_t | |
44 | DictionaryBreakEngine::findBreaks( UText *text, | |
45 | int32_t startPos, | |
46 | int32_t endPos, | |
47 | UBool reverse, | |
48 | int32_t breakType, | |
49 | UStack &foundBreaks ) const { | |
50 | int32_t result = 0; | |
51 | ||
52 | // Find the span of characters included in the set. | |
53 | int32_t start = (int32_t)utext_getNativeIndex(text); | |
54 | int32_t current; | |
55 | int32_t rangeStart; | |
56 | int32_t rangeEnd; | |
57 | UChar32 c = utext_current32(text); | |
58 | if (reverse) { | |
59 | UBool isDict = fSet.contains(c); | |
60 | while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) { | |
61 | c = utext_previous32(text); | |
62 | isDict = fSet.contains(c); | |
63 | } | |
64 | rangeStart = (current < startPos) ? startPos : current+(isDict ? 0 : 1); | |
65 | rangeEnd = start + 1; | |
66 | } | |
67 | else { | |
68 | while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) { | |
69 | utext_next32(text); // TODO: recast loop for postincrement | |
70 | c = utext_current32(text); | |
71 | } | |
72 | rangeStart = start; | |
73 | rangeEnd = current; | |
74 | } | |
75 | if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) { | |
76 | result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks); | |
77 | utext_setNativeIndex(text, current); | |
78 | } | |
79 | ||
80 | return result; | |
81 | } | |
82 | ||
83 | void | |
84 | DictionaryBreakEngine::setCharacters( UnicodeSet &set ) { | |
85 | fSet = set; | |
86 | } | |
87 | ||
88 | /*void | |
89 | DictionaryBreakEngine::setBreakTypes( uint32_t breakTypes ) { | |
90 | fTypes = breakTypes; | |
91 | }*/ | |
92 | ||
93 | /* | |
94 | ****************************************************************** | |
95 | */ | |
96 | ||
97 | ||
98 | // Helper class for improving readability of the Thai word break | |
99 | // algorithm. The implementation is completely inline. | |
100 | ||
101 | // List size, limited by the maximum number of words in the dictionary | |
102 | // that form a nested sequence. | |
103 | #define POSSIBLE_WORD_LIST_MAX 20 | |
104 | ||
105 | class PossibleWord { | |
106 | private: | |
107 | // list of word candidate lengths, in increasing length order | |
108 | int32_t lengths[POSSIBLE_WORD_LIST_MAX]; | |
109 | int count; // Count of candidates | |
110 | int32_t prefix; // The longest match with a dictionary word | |
111 | int32_t offset; // Offset in the text of these candidates | |
112 | int mark; // The preferred candidate's offset | |
113 | int current; // The candidate we're currently looking at | |
114 | ||
115 | public: | |
116 | PossibleWord(); | |
117 | ~PossibleWord(); | |
118 | ||
119 | // Fill the list of candidates if needed, select the longest, and return the number found | |
120 | int candidates( UText *text, const TrieWordDictionary *dict, int32_t rangeEnd ); | |
121 | ||
122 | // Select the currently marked candidate, point after it in the text, and invalidate self | |
123 | int32_t acceptMarked( UText *text ); | |
124 | ||
125 | // Back up from the current candidate to the next shorter one; return TRUE if that exists | |
126 | // and point the text after it | |
127 | UBool backUp( UText *text ); | |
128 | ||
129 | // Return the longest prefix this candidate location shares with a dictionary word | |
130 | int32_t longestPrefix(); | |
131 | ||
132 | // Mark the current candidate as the one we like | |
133 | void markCurrent(); | |
134 | }; | |
135 | ||
136 | inline | |
137 | PossibleWord::PossibleWord() { | |
138 | offset = -1; | |
139 | } | |
140 | ||
141 | inline | |
142 | PossibleWord::~PossibleWord() { | |
143 | } | |
144 | ||
145 | inline int | |
146 | PossibleWord::candidates( UText *text, const TrieWordDictionary *dict, int32_t rangeEnd ) { | |
147 | // TODO: If getIndex is too slow, use offset < 0 and add discardAll() | |
148 | int32_t start = (int32_t)utext_getNativeIndex(text); | |
149 | if (start != offset) { | |
150 | offset = start; | |
151 | prefix = dict->matches(text, rangeEnd-start, lengths, count, sizeof(lengths)/sizeof(lengths[0])); | |
152 | // Dictionary leaves text after longest prefix, not longest word. Back up. | |
153 | if (count <= 0) { | |
154 | utext_setNativeIndex(text, start); | |
155 | } | |
156 | } | |
157 | if (count > 0) { | |
158 | utext_setNativeIndex(text, start+lengths[count-1]); | |
159 | } | |
160 | current = count-1; | |
161 | mark = current; | |
162 | return count; | |
163 | } | |
164 | ||
165 | inline int32_t | |
166 | PossibleWord::acceptMarked( UText *text ) { | |
167 | utext_setNativeIndex(text, offset + lengths[mark]); | |
168 | return lengths[mark]; | |
169 | } | |
170 | ||
171 | inline UBool | |
172 | PossibleWord::backUp( UText *text ) { | |
173 | if (current > 0) { | |
174 | utext_setNativeIndex(text, offset + lengths[--current]); | |
175 | return TRUE; | |
176 | } | |
177 | return FALSE; | |
178 | } | |
179 | ||
180 | inline int32_t | |
181 | PossibleWord::longestPrefix() { | |
182 | return prefix; | |
183 | } | |
184 | ||
185 | inline void | |
186 | PossibleWord::markCurrent() { | |
187 | mark = current; | |
188 | } | |
189 | ||
190 | // How many words in a row are "good enough"? | |
191 | #define THAI_LOOKAHEAD 3 | |
192 | ||
193 | // Will not combine a non-word with a preceding dictionary word longer than this | |
194 | #define THAI_ROOT_COMBINE_THRESHOLD 3 | |
195 | ||
196 | // Will not combine a non-word that shares at least this much prefix with a | |
197 | // dictionary word, with a preceding word | |
198 | #define THAI_PREFIX_COMBINE_THRESHOLD 3 | |
199 | ||
200 | // Ellision character | |
201 | #define THAI_PAIYANNOI 0x0E2F | |
202 | ||
203 | // Repeat character | |
204 | #define THAI_MAIYAMOK 0x0E46 | |
205 | ||
206 | // Minimum word size | |
207 | #define THAI_MIN_WORD 2 | |
208 | ||
209 | // Minimum number of characters for two words | |
210 | #define THAI_MIN_WORD_SPAN (THAI_MIN_WORD * 2) | |
211 | ||
212 | ThaiBreakEngine::ThaiBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status) | |
213 | : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)), | |
214 | fDictionary(adoptDictionary) | |
215 | { | |
216 | fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status); | |
217 | if (U_SUCCESS(status)) { | |
218 | setCharacters(fThaiWordSet); | |
219 | } | |
220 | fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status); | |
221 | fEndWordSet = fThaiWordSet; | |
222 | fEndWordSet.remove(0x0E31); // MAI HAN-AKAT | |
223 | fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI | |
224 | fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK | |
225 | fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI | |
226 | fSuffixSet.add(THAI_PAIYANNOI); | |
227 | fSuffixSet.add(THAI_MAIYAMOK); | |
228 | } | |
229 | ||
230 | ThaiBreakEngine::~ThaiBreakEngine() { | |
231 | delete fDictionary; | |
232 | } | |
233 | ||
234 | int32_t | |
235 | ThaiBreakEngine::divideUpDictionaryRange( UText *text, | |
236 | int32_t rangeStart, | |
237 | int32_t rangeEnd, | |
238 | UStack &foundBreaks ) const { | |
239 | if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) { | |
240 | return 0; // Not enough characters for two words | |
241 | } | |
242 | ||
243 | uint32_t wordsFound = 0; | |
244 | int32_t wordLength; | |
245 | int32_t current; | |
246 | UErrorCode status = U_ZERO_ERROR; | |
247 | PossibleWord words[THAI_LOOKAHEAD]; | |
248 | UChar32 uc; | |
249 | ||
250 | utext_setNativeIndex(text, rangeStart); | |
251 | ||
252 | while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) { | |
253 | wordLength = 0; | |
254 | ||
255 | // Look for candidate words at the current position | |
256 | int candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); | |
257 | ||
258 | // If we found exactly one, use that | |
259 | if (candidates == 1) { | |
260 | wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(text); | |
261 | wordsFound += 1; | |
262 | } | |
263 | ||
264 | // If there was more than one, see which one can take us forward the most words | |
265 | else if (candidates > 1) { | |
266 | // If we're already at the end of the range, we're done | |
267 | if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { | |
268 | goto foundBest; | |
269 | } | |
270 | do { | |
271 | int wordsMatched = 1; | |
272 | if (words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { | |
273 | if (wordsMatched < 2) { | |
274 | // Followed by another dictionary word; mark first word as a good candidate | |
275 | words[wordsFound%THAI_LOOKAHEAD].markCurrent(); | |
276 | wordsMatched = 2; | |
277 | } | |
278 | ||
279 | // If we're already at the end of the range, we're done | |
280 | if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { | |
281 | goto foundBest; | |
282 | } | |
283 | ||
284 | // See if any of the possible second words is followed by a third word | |
285 | do { | |
286 | // If we find a third word, stop right away | |
287 | if (words[(wordsFound+2)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { | |
288 | words[wordsFound%THAI_LOOKAHEAD].markCurrent(); | |
289 | goto foundBest; | |
290 | } | |
291 | } | |
292 | while (words[(wordsFound+1)%THAI_LOOKAHEAD].backUp(text)); | |
293 | } | |
294 | } | |
295 | while (words[wordsFound%THAI_LOOKAHEAD].backUp(text)); | |
296 | foundBest: | |
297 | wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(text); | |
298 | wordsFound += 1; | |
299 | } | |
300 | ||
301 | // We come here after having either found a word or not. We look ahead to the | |
302 | // next word. If it's not a dictionary word, we will combine it withe the word we | |
303 | // just found (if there is one), but only if the preceding word does not exceed | |
304 | // the threshold. | |
305 | // The text iterator should now be positioned at the end of the word we found. | |
306 | if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) { | |
307 | // if it is a dictionary word, do nothing. If it isn't, then if there is | |
308 | // no preceding word, or the non-word shares less than the minimum threshold | |
309 | // of characters with a dictionary word, then scan to resynchronize | |
310 | if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 | |
311 | && (wordLength == 0 | |
312 | || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) { | |
313 | // Look for a plausible word boundary | |
314 | //TODO: This section will need a rework for UText. | |
315 | int32_t remaining = rangeEnd - (current+wordLength); | |
316 | UChar32 pc = utext_current32(text); | |
317 | int32_t chars = 0; | |
318 | while (TRUE) { | |
319 | utext_next32(text); | |
320 | uc = utext_current32(text); | |
321 | // TODO: Here we're counting on the fact that the SA languages are all | |
322 | // in the BMP. This should get fixed with the UText rework. | |
323 | chars += 1; | |
324 | if (--remaining <= 0) { | |
325 | break; | |
326 | } | |
327 | if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { | |
328 | // Maybe. See if it's in the dictionary. | |
329 | // NOTE: In the original Apple code, checked that the next | |
330 | // two characters after uc were not 0x0E4C THANTHAKHAT before | |
331 | // checking the dictionary. That is just a performance filter, | |
332 | // but it's not clear it's faster than checking the trie. | |
333 | int candidates = words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); | |
334 | utext_setNativeIndex(text, current+wordLength+chars); | |
335 | if (candidates > 0) { | |
336 | break; | |
337 | } | |
338 | } | |
339 | pc = uc; | |
340 | } | |
341 | ||
342 | // Bump the word count if there wasn't already one | |
343 | if (wordLength <= 0) { | |
344 | wordsFound += 1; | |
345 | } | |
346 | ||
347 | // Update the length with the passed-over characters | |
348 | wordLength += chars; | |
349 | } | |
350 | else { | |
351 | // Back up to where we were for next iteration | |
352 | utext_setNativeIndex(text, current+wordLength); | |
353 | } | |
354 | } | |
355 | ||
356 | // Never stop before a combining mark. | |
357 | int32_t currPos; | |
358 | while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) { | |
359 | utext_next32(text); | |
360 | wordLength += (int32_t)utext_getNativeIndex(text) - currPos; | |
361 | } | |
362 | ||
363 | // Look ahead for possible suffixes if a dictionary word does not follow. | |
364 | // We do this in code rather than using a rule so that the heuristic | |
365 | // resynch continues to function. For example, one of the suffix characters | |
366 | // could be a typo in the middle of a word. | |
367 | if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) { | |
368 | if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 | |
369 | && fSuffixSet.contains(uc = utext_current32(text))) { | |
370 | if (uc == THAI_PAIYANNOI) { | |
371 | if (!fSuffixSet.contains(utext_previous32(text))) { | |
372 | // Skip over previous end and PAIYANNOI | |
373 | utext_next32(text); | |
374 | utext_next32(text); | |
375 | wordLength += 1; // Add PAIYANNOI to word | |
376 | uc = utext_current32(text); // Fetch next character | |
377 | } | |
378 | else { | |
379 | // Restore prior position | |
380 | utext_next32(text); | |
381 | } | |
382 | } | |
383 | if (uc == THAI_MAIYAMOK) { | |
384 | if (utext_previous32(text) != THAI_MAIYAMOK) { | |
385 | // Skip over previous end and MAIYAMOK | |
386 | utext_next32(text); | |
387 | utext_next32(text); | |
388 | wordLength += 1; // Add MAIYAMOK to word | |
389 | } | |
390 | else { | |
391 | // Restore prior position | |
392 | utext_next32(text); | |
393 | } | |
394 | } | |
395 | } | |
396 | else { | |
397 | utext_setNativeIndex(text, current+wordLength); | |
398 | } | |
399 | } | |
400 | ||
401 | // Did we find a word on this iteration? If so, push it on the break stack | |
402 | if (wordLength > 0) { | |
403 | foundBreaks.push((current+wordLength), status); | |
404 | } | |
405 | } | |
406 | ||
407 | // Don't return a break for the end of the dictionary range if there is one there. | |
408 | if (foundBreaks.peeki() >= rangeEnd) { | |
409 | (void) foundBreaks.popi(); | |
410 | wordsFound -= 1; | |
411 | } | |
412 | ||
413 | return wordsFound; | |
414 | } | |
415 | ||
416 | U_NAMESPACE_END | |
417 | ||
418 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |