]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/dictbe.cpp
ICU-8.11.4.tar.gz
[apple/icu.git] / icuSources / common / dictbe.cpp
CommitLineData
73c04bcf
A
1/**
2 *******************************************************************************
3 * Copyright (C) 2006, International Business Machines Corporation and others. *
4 * All Rights Reserved. *
5 *******************************************************************************
6 */
7
8#include "unicode/utypes.h"
9
10#if !UCONFIG_NO_BREAK_ITERATION
11
12#include "brkeng.h"
13#include "dictbe.h"
14#include "unicode/uniset.h"
15#include "unicode/chariter.h"
16#include "unicode/ubrk.h"
17#include "uvector.h"
18#include "triedict.h"
19
20U_NAMESPACE_BEGIN
21
22/*
23 ******************************************************************
24 */
25
26/*DictionaryBreakEngine::DictionaryBreakEngine() {
27 fTypes = 0;
28}*/
29
30DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) {
31 fTypes = breakTypes;
32}
33
34DictionaryBreakEngine::~DictionaryBreakEngine() {
35}
36
37UBool
38DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const {
39 return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)
40 && fSet.contains(c));
41}
42
43int32_t
44DictionaryBreakEngine::findBreaks( UText *text,
45 int32_t startPos,
46 int32_t endPos,
47 UBool reverse,
48 int32_t breakType,
49 UStack &foundBreaks ) const {
50 int32_t result = 0;
51
52 // Find the span of characters included in the set.
53 int32_t start = (int32_t)utext_getNativeIndex(text);
54 int32_t current;
55 int32_t rangeStart;
56 int32_t rangeEnd;
57 UChar32 c = utext_current32(text);
58 if (reverse) {
59 UBool isDict = fSet.contains(c);
60 while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) {
61 c = utext_previous32(text);
62 isDict = fSet.contains(c);
63 }
64 rangeStart = (current < startPos) ? startPos : current+(isDict ? 0 : 1);
65 rangeEnd = start + 1;
66 }
67 else {
68 while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
69 utext_next32(text); // TODO: recast loop for postincrement
70 c = utext_current32(text);
71 }
72 rangeStart = start;
73 rangeEnd = current;
74 }
75 if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
76 result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
77 utext_setNativeIndex(text, current);
78 }
79
80 return result;
81}
82
83void
84DictionaryBreakEngine::setCharacters( UnicodeSet &set ) {
85 fSet = set;
86}
87
88/*void
89DictionaryBreakEngine::setBreakTypes( uint32_t breakTypes ) {
90 fTypes = breakTypes;
91}*/
92
93/*
94 ******************************************************************
95 */
96
97
98// Helper class for improving readability of the Thai word break
99// algorithm. The implementation is completely inline.
100
101// List size, limited by the maximum number of words in the dictionary
102// that form a nested sequence.
103#define POSSIBLE_WORD_LIST_MAX 20
104
105class PossibleWord {
106 private:
107 // list of word candidate lengths, in increasing length order
108 int32_t lengths[POSSIBLE_WORD_LIST_MAX];
109 int count; // Count of candidates
110 int32_t prefix; // The longest match with a dictionary word
111 int32_t offset; // Offset in the text of these candidates
112 int mark; // The preferred candidate's offset
113 int current; // The candidate we're currently looking at
114
115 public:
116 PossibleWord();
117 ~PossibleWord();
118
119 // Fill the list of candidates if needed, select the longest, and return the number found
120 int candidates( UText *text, const TrieWordDictionary *dict, int32_t rangeEnd );
121
122 // Select the currently marked candidate, point after it in the text, and invalidate self
123 int32_t acceptMarked( UText *text );
124
125 // Back up from the current candidate to the next shorter one; return TRUE if that exists
126 // and point the text after it
127 UBool backUp( UText *text );
128
129 // Return the longest prefix this candidate location shares with a dictionary word
130 int32_t longestPrefix();
131
132 // Mark the current candidate as the one we like
133 void markCurrent();
134};
135
136inline
137PossibleWord::PossibleWord() {
138 offset = -1;
139}
140
141inline
142PossibleWord::~PossibleWord() {
143}
144
145inline int
146PossibleWord::candidates( UText *text, const TrieWordDictionary *dict, int32_t rangeEnd ) {
147 // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
148 int32_t start = (int32_t)utext_getNativeIndex(text);
149 if (start != offset) {
150 offset = start;
151 prefix = dict->matches(text, rangeEnd-start, lengths, count, sizeof(lengths)/sizeof(lengths[0]));
152 // Dictionary leaves text after longest prefix, not longest word. Back up.
153 if (count <= 0) {
154 utext_setNativeIndex(text, start);
155 }
156 }
157 if (count > 0) {
158 utext_setNativeIndex(text, start+lengths[count-1]);
159 }
160 current = count-1;
161 mark = current;
162 return count;
163}
164
165inline int32_t
166PossibleWord::acceptMarked( UText *text ) {
167 utext_setNativeIndex(text, offset + lengths[mark]);
168 return lengths[mark];
169}
170
171inline UBool
172PossibleWord::backUp( UText *text ) {
173 if (current > 0) {
174 utext_setNativeIndex(text, offset + lengths[--current]);
175 return TRUE;
176 }
177 return FALSE;
178}
179
180inline int32_t
181PossibleWord::longestPrefix() {
182 return prefix;
183}
184
185inline void
186PossibleWord::markCurrent() {
187 mark = current;
188}
189
190// How many words in a row are "good enough"?
191#define THAI_LOOKAHEAD 3
192
193// Will not combine a non-word with a preceding dictionary word longer than this
194#define THAI_ROOT_COMBINE_THRESHOLD 3
195
196// Will not combine a non-word that shares at least this much prefix with a
197// dictionary word, with a preceding word
198#define THAI_PREFIX_COMBINE_THRESHOLD 3
199
200// Ellision character
201#define THAI_PAIYANNOI 0x0E2F
202
203// Repeat character
204#define THAI_MAIYAMOK 0x0E46
205
206// Minimum word size
207#define THAI_MIN_WORD 2
208
209// Minimum number of characters for two words
210#define THAI_MIN_WORD_SPAN (THAI_MIN_WORD * 2)
211
212ThaiBreakEngine::ThaiBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status)
213 : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
214 fDictionary(adoptDictionary)
215{
216 fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
217 if (U_SUCCESS(status)) {
218 setCharacters(fThaiWordSet);
219 }
220 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
221 fEndWordSet = fThaiWordSet;
222 fEndWordSet.remove(0x0E31); // MAI HAN-AKAT
223 fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
224 fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK
225 fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
226 fSuffixSet.add(THAI_PAIYANNOI);
227 fSuffixSet.add(THAI_MAIYAMOK);
228}
229
230ThaiBreakEngine::~ThaiBreakEngine() {
231 delete fDictionary;
232}
233
234int32_t
235ThaiBreakEngine::divideUpDictionaryRange( UText *text,
236 int32_t rangeStart,
237 int32_t rangeEnd,
238 UStack &foundBreaks ) const {
239 if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) {
240 return 0; // Not enough characters for two words
241 }
242
243 uint32_t wordsFound = 0;
244 int32_t wordLength;
245 int32_t current;
246 UErrorCode status = U_ZERO_ERROR;
247 PossibleWord words[THAI_LOOKAHEAD];
248 UChar32 uc;
249
250 utext_setNativeIndex(text, rangeStart);
251
252 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
253 wordLength = 0;
254
255 // Look for candidate words at the current position
256 int candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
257
258 // If we found exactly one, use that
259 if (candidates == 1) {
260 wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(text);
261 wordsFound += 1;
262 }
263
264 // If there was more than one, see which one can take us forward the most words
265 else if (candidates > 1) {
266 // If we're already at the end of the range, we're done
267 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
268 goto foundBest;
269 }
270 do {
271 int wordsMatched = 1;
272 if (words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
273 if (wordsMatched < 2) {
274 // Followed by another dictionary word; mark first word as a good candidate
275 words[wordsFound%THAI_LOOKAHEAD].markCurrent();
276 wordsMatched = 2;
277 }
278
279 // If we're already at the end of the range, we're done
280 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
281 goto foundBest;
282 }
283
284 // See if any of the possible second words is followed by a third word
285 do {
286 // If we find a third word, stop right away
287 if (words[(wordsFound+2)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
288 words[wordsFound%THAI_LOOKAHEAD].markCurrent();
289 goto foundBest;
290 }
291 }
292 while (words[(wordsFound+1)%THAI_LOOKAHEAD].backUp(text));
293 }
294 }
295 while (words[wordsFound%THAI_LOOKAHEAD].backUp(text));
296foundBest:
297 wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(text);
298 wordsFound += 1;
299 }
300
301 // We come here after having either found a word or not. We look ahead to the
302 // next word. If it's not a dictionary word, we will combine it withe the word we
303 // just found (if there is one), but only if the preceding word does not exceed
304 // the threshold.
305 // The text iterator should now be positioned at the end of the word we found.
306 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) {
307 // if it is a dictionary word, do nothing. If it isn't, then if there is
308 // no preceding word, or the non-word shares less than the minimum threshold
309 // of characters with a dictionary word, then scan to resynchronize
310 if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
311 && (wordLength == 0
312 || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
313 // Look for a plausible word boundary
314 //TODO: This section will need a rework for UText.
315 int32_t remaining = rangeEnd - (current+wordLength);
316 UChar32 pc = utext_current32(text);
317 int32_t chars = 0;
318 while (TRUE) {
319 utext_next32(text);
320 uc = utext_current32(text);
321 // TODO: Here we're counting on the fact that the SA languages are all
322 // in the BMP. This should get fixed with the UText rework.
323 chars += 1;
324 if (--remaining <= 0) {
325 break;
326 }
327 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
328 // Maybe. See if it's in the dictionary.
329 // NOTE: In the original Apple code, checked that the next
330 // two characters after uc were not 0x0E4C THANTHAKHAT before
331 // checking the dictionary. That is just a performance filter,
332 // but it's not clear it's faster than checking the trie.
333 int candidates = words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
334 utext_setNativeIndex(text, current+wordLength+chars);
335 if (candidates > 0) {
336 break;
337 }
338 }
339 pc = uc;
340 }
341
342 // Bump the word count if there wasn't already one
343 if (wordLength <= 0) {
344 wordsFound += 1;
345 }
346
347 // Update the length with the passed-over characters
348 wordLength += chars;
349 }
350 else {
351 // Back up to where we were for next iteration
352 utext_setNativeIndex(text, current+wordLength);
353 }
354 }
355
356 // Never stop before a combining mark.
357 int32_t currPos;
358 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
359 utext_next32(text);
360 wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
361 }
362
363 // Look ahead for possible suffixes if a dictionary word does not follow.
364 // We do this in code rather than using a rule so that the heuristic
365 // resynch continues to function. For example, one of the suffix characters
366 // could be a typo in the middle of a word.
367 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
368 if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
369 && fSuffixSet.contains(uc = utext_current32(text))) {
370 if (uc == THAI_PAIYANNOI) {
371 if (!fSuffixSet.contains(utext_previous32(text))) {
372 // Skip over previous end and PAIYANNOI
373 utext_next32(text);
374 utext_next32(text);
375 wordLength += 1; // Add PAIYANNOI to word
376 uc = utext_current32(text); // Fetch next character
377 }
378 else {
379 // Restore prior position
380 utext_next32(text);
381 }
382 }
383 if (uc == THAI_MAIYAMOK) {
384 if (utext_previous32(text) != THAI_MAIYAMOK) {
385 // Skip over previous end and MAIYAMOK
386 utext_next32(text);
387 utext_next32(text);
388 wordLength += 1; // Add MAIYAMOK to word
389 }
390 else {
391 // Restore prior position
392 utext_next32(text);
393 }
394 }
395 }
396 else {
397 utext_setNativeIndex(text, current+wordLength);
398 }
399 }
400
401 // Did we find a word on this iteration? If so, push it on the break stack
402 if (wordLength > 0) {
403 foundBreaks.push((current+wordLength), status);
404 }
405 }
406
407 // Don't return a break for the end of the dictionary range if there is one there.
408 if (foundBreaks.peeki() >= rangeEnd) {
409 (void) foundBreaks.popi();
410 wordsFound -= 1;
411 }
412
413 return wordsFound;
414}
415
416U_NAMESPACE_END
417
418#endif /* #if !UCONFIG_NO_BREAK_ITERATION */