2 *******************************************************************************
3 * Copyright (C) 2006-2008, International Business Machines Corporation and others. *
4 * All Rights Reserved. *
5 *******************************************************************************
8 #include "unicode/utypes.h"
10 #if !UCONFIG_NO_BREAK_ITERATION
14 #include "unicode/uniset.h"
15 #include "unicode/chariter.h"
16 #include "unicode/ubrk.h"
23 ******************************************************************
26 /*DictionaryBreakEngine::DictionaryBreakEngine() {
30 DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes
) {
34 DictionaryBreakEngine::~DictionaryBreakEngine() {
38 DictionaryBreakEngine::handles(UChar32 c
, int32_t breakType
) const {
39 return (breakType
>= 0 && breakType
< 32 && (((uint32_t)1 << breakType
) & fTypes
)
44 DictionaryBreakEngine::findBreaks( UText
*text
,
49 UStack
&foundBreaks
) const {
52 // Find the span of characters included in the set.
53 int32_t start
= (int32_t)utext_getNativeIndex(text
);
57 UChar32 c
= utext_current32(text
);
59 UBool isDict
= fSet
.contains(c
);
60 while((current
= (int32_t)utext_getNativeIndex(text
)) > startPos
&& isDict
) {
61 c
= utext_previous32(text
);
62 isDict
= fSet
.contains(c
);
64 rangeStart
= (current
< startPos
) ? startPos
: current
+(isDict
? 0 : 1);
68 while((current
= (int32_t)utext_getNativeIndex(text
)) < endPos
&& fSet
.contains(c
)) {
69 utext_next32(text
); // TODO: recast loop for postincrement
70 c
= utext_current32(text
);
75 if (breakType
>= 0 && breakType
< 32 && (((uint32_t)1 << breakType
) & fTypes
)) {
76 result
= divideUpDictionaryRange(text
, rangeStart
, rangeEnd
, foundBreaks
);
77 utext_setNativeIndex(text
, current
);
84 DictionaryBreakEngine::setCharacters( const UnicodeSet
&set
) {
86 // Compact for caching
91 DictionaryBreakEngine::setBreakTypes( uint32_t breakTypes ) {
96 ******************************************************************
100 // Helper class for improving readability of the Thai word break
101 // algorithm. The implementation is completely inline.
103 // List size, limited by the maximum number of words in the dictionary
104 // that form a nested sequence.
105 #define POSSIBLE_WORD_LIST_MAX 20
109 // list of word candidate lengths, in increasing length order
110 int32_t lengths
[POSSIBLE_WORD_LIST_MAX
];
111 int count
; // Count of candidates
112 int32_t prefix
; // The longest match with a dictionary word
113 int32_t offset
; // Offset in the text of these candidates
114 int mark
; // The preferred candidate's offset
115 int current
; // The candidate we're currently looking at
121 // Fill the list of candidates if needed, select the longest, and return the number found
122 int candidates( UText
*text
, const TrieWordDictionary
*dict
, int32_t rangeEnd
);
124 // Select the currently marked candidate, point after it in the text, and invalidate self
125 int32_t acceptMarked( UText
*text
);
127 // Back up from the current candidate to the next shorter one; return TRUE if that exists
128 // and point the text after it
129 UBool
backUp( UText
*text
);
131 // Return the longest prefix this candidate location shares with a dictionary word
132 int32_t longestPrefix();
134 // Mark the current candidate as the one we like
139 PossibleWord::PossibleWord() {
144 PossibleWord::~PossibleWord() {
148 PossibleWord::candidates( UText
*text
, const TrieWordDictionary
*dict
, int32_t rangeEnd
) {
149 // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
150 int32_t start
= (int32_t)utext_getNativeIndex(text
);
151 if (start
!= offset
) {
153 prefix
= dict
->matches(text
, rangeEnd
-start
, lengths
, count
, sizeof(lengths
)/sizeof(lengths
[0]));
154 // Dictionary leaves text after longest prefix, not longest word. Back up.
156 utext_setNativeIndex(text
, start
);
160 utext_setNativeIndex(text
, start
+lengths
[count
-1]);
168 PossibleWord::acceptMarked( UText
*text
) {
169 utext_setNativeIndex(text
, offset
+ lengths
[mark
]);
170 return lengths
[mark
];
174 PossibleWord::backUp( UText
*text
) {
176 utext_setNativeIndex(text
, offset
+ lengths
[--current
]);
183 PossibleWord::longestPrefix() {
188 PossibleWord::markCurrent() {
192 // How many words in a row are "good enough"?
193 #define THAI_LOOKAHEAD 3
195 // Will not combine a non-word with a preceding dictionary word longer than this
196 #define THAI_ROOT_COMBINE_THRESHOLD 3
198 // Will not combine a non-word that shares at least this much prefix with a
199 // dictionary word, with a preceding word
200 #define THAI_PREFIX_COMBINE_THRESHOLD 3
202 // Ellision character
203 #define THAI_PAIYANNOI 0x0E2F
206 #define THAI_MAIYAMOK 0x0E46
209 #define THAI_MIN_WORD 2
211 // Minimum number of characters for two words
212 #define THAI_MIN_WORD_SPAN (THAI_MIN_WORD * 2)
214 ThaiBreakEngine::ThaiBreakEngine(const TrieWordDictionary
*adoptDictionary
, UErrorCode
&status
)
215 : DictionaryBreakEngine((1<<UBRK_WORD
) | (1<<UBRK_LINE
)),
216 fDictionary(adoptDictionary
)
218 fThaiWordSet
.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status
);
219 if (U_SUCCESS(status
)) {
220 setCharacters(fThaiWordSet
);
222 fMarkSet
.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status
);
223 fMarkSet
.add(0x0020);
224 fEndWordSet
= fThaiWordSet
;
225 fEndWordSet
.remove(0x0E31); // MAI HAN-AKAT
226 fEndWordSet
.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
227 fBeginWordSet
.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK
228 fBeginWordSet
.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
229 fSuffixSet
.add(THAI_PAIYANNOI
);
230 fSuffixSet
.add(THAI_MAIYAMOK
);
232 // Compact for caching.
234 fEndWordSet
.compact();
235 fBeginWordSet
.compact();
236 fSuffixSet
.compact();
239 ThaiBreakEngine::~ThaiBreakEngine() {
244 ThaiBreakEngine::divideUpDictionaryRange( UText
*text
,
247 UStack
&foundBreaks
) const {
248 if ((rangeEnd
- rangeStart
) < THAI_MIN_WORD_SPAN
) {
249 return 0; // Not enough characters for two words
252 uint32_t wordsFound
= 0;
255 UErrorCode status
= U_ZERO_ERROR
;
256 PossibleWord words
[THAI_LOOKAHEAD
];
259 utext_setNativeIndex(text
, rangeStart
);
261 while (U_SUCCESS(status
) && (current
= (int32_t)utext_getNativeIndex(text
)) < rangeEnd
) {
264 // Look for candidate words at the current position
265 int candidates
= words
[wordsFound%THAI_LOOKAHEAD
].candidates(text
, fDictionary
, rangeEnd
);
267 // If we found exactly one, use that
268 if (candidates
== 1) {
269 wordLength
= words
[wordsFound%THAI_LOOKAHEAD
].acceptMarked(text
);
273 // If there was more than one, see which one can take us forward the most words
274 else if (candidates
> 1) {
275 // If we're already at the end of the range, we're done
276 if ((int32_t)utext_getNativeIndex(text
) >= rangeEnd
) {
280 int wordsMatched
= 1;
281 if (words
[(wordsFound
+1)%THAI_LOOKAHEAD
].candidates(text
, fDictionary
, rangeEnd
) > 0) {
282 if (wordsMatched
< 2) {
283 // Followed by another dictionary word; mark first word as a good candidate
284 words
[wordsFound%THAI_LOOKAHEAD
].markCurrent();
288 // If we're already at the end of the range, we're done
289 if ((int32_t)utext_getNativeIndex(text
) >= rangeEnd
) {
293 // See if any of the possible second words is followed by a third word
295 // If we find a third word, stop right away
296 if (words
[(wordsFound
+2)%THAI_LOOKAHEAD
].candidates(text
, fDictionary
, rangeEnd
)) {
297 words
[wordsFound%THAI_LOOKAHEAD
].markCurrent();
301 while (words
[(wordsFound
+1)%THAI_LOOKAHEAD
].backUp(text
));
304 while (words
[wordsFound%THAI_LOOKAHEAD
].backUp(text
));
306 wordLength
= words
[wordsFound%THAI_LOOKAHEAD
].acceptMarked(text
);
310 // We come here after having either found a word or not. We look ahead to the
311 // next word. If it's not a dictionary word, we will combine it withe the word we
312 // just found (if there is one), but only if the preceding word does not exceed
314 // The text iterator should now be positioned at the end of the word we found.
315 if ((int32_t)utext_getNativeIndex(text
) < rangeEnd
&& wordLength
< THAI_ROOT_COMBINE_THRESHOLD
) {
316 // if it is a dictionary word, do nothing. If it isn't, then if there is
317 // no preceding word, or the non-word shares less than the minimum threshold
318 // of characters with a dictionary word, then scan to resynchronize
319 if (words
[wordsFound%THAI_LOOKAHEAD
].candidates(text
, fDictionary
, rangeEnd
) <= 0
321 || words
[wordsFound%THAI_LOOKAHEAD
].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD
)) {
322 // Look for a plausible word boundary
323 //TODO: This section will need a rework for UText.
324 int32_t remaining
= rangeEnd
- (current
+wordLength
);
325 UChar32 pc
= utext_current32(text
);
329 uc
= utext_current32(text
);
330 // TODO: Here we're counting on the fact that the SA languages are all
331 // in the BMP. This should get fixed with the UText rework.
333 if (--remaining
<= 0) {
336 if (fEndWordSet
.contains(pc
) && fBeginWordSet
.contains(uc
)) {
337 // Maybe. See if it's in the dictionary.
338 // NOTE: In the original Apple code, checked that the next
339 // two characters after uc were not 0x0E4C THANTHAKHAT before
340 // checking the dictionary. That is just a performance filter,
341 // but it's not clear it's faster than checking the trie.
342 int candidates
= words
[(wordsFound
+1)%THAI_LOOKAHEAD
].candidates(text
, fDictionary
, rangeEnd
);
343 utext_setNativeIndex(text
, current
+wordLength
+chars
);
344 if (candidates
> 0) {
351 // Bump the word count if there wasn't already one
352 if (wordLength
<= 0) {
356 // Update the length with the passed-over characters
360 // Back up to where we were for next iteration
361 utext_setNativeIndex(text
, current
+wordLength
);
365 // Never stop before a combining mark.
367 while ((currPos
= (int32_t)utext_getNativeIndex(text
)) < rangeEnd
&& fMarkSet
.contains(utext_current32(text
))) {
369 wordLength
+= (int32_t)utext_getNativeIndex(text
) - currPos
;
372 // Look ahead for possible suffixes if a dictionary word does not follow.
373 // We do this in code rather than using a rule so that the heuristic
374 // resynch continues to function. For example, one of the suffix characters
375 // could be a typo in the middle of a word.
376 if ((int32_t)utext_getNativeIndex(text
) < rangeEnd
&& wordLength
> 0) {
377 if (words
[wordsFound%THAI_LOOKAHEAD
].candidates(text
, fDictionary
, rangeEnd
) <= 0
378 && fSuffixSet
.contains(uc
= utext_current32(text
))) {
379 if (uc
== THAI_PAIYANNOI
) {
380 if (!fSuffixSet
.contains(utext_previous32(text
))) {
381 // Skip over previous end and PAIYANNOI
384 wordLength
+= 1; // Add PAIYANNOI to word
385 uc
= utext_current32(text
); // Fetch next character
388 // Restore prior position
392 if (uc
== THAI_MAIYAMOK
) {
393 if (utext_previous32(text
) != THAI_MAIYAMOK
) {
394 // Skip over previous end and MAIYAMOK
397 wordLength
+= 1; // Add MAIYAMOK to word
400 // Restore prior position
406 utext_setNativeIndex(text
, current
+wordLength
);
410 // Did we find a word on this iteration? If so, push it on the break stack
411 if (wordLength
> 0) {
412 foundBreaks
.push((current
+wordLength
), status
);
416 // Don't return a break for the end of the dictionary range if there is one there.
417 if (foundBreaks
.peeki() >= rangeEnd
) {
418 (void) foundBreaks
.popi();
427 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */