2 **********************************************************************
3 * Copyright (C) 1999-2004 IBM Corp. All rights reserved.
4 **********************************************************************
5 * Date Name Description
6 * 12/1/99 rgillam Complete port from Java.
7 * 01/13/2000 helena Added UErrorCode to ctors.
8 **********************************************************************
11 #include "unicode/utypes.h"
13 #if !UCONFIG_NO_BREAK_ITERATION
15 #include "unicode/dbbi.h"
16 #include "unicode/schriter.h"
24 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(DictionaryBasedBreakIterator
)
27 //------------------------------------------------------------------------------
31 //------------------------------------------------------------------------------
33 DictionaryBasedBreakIterator::DictionaryBasedBreakIterator() :
34 RuleBasedBreakIterator() {
39 DictionaryBasedBreakIterator::DictionaryBasedBreakIterator(UDataMemory
* rbbiData
,
40 const char* dictionaryFilename
,
42 : RuleBasedBreakIterator(rbbiData
, status
)
45 if (U_FAILURE(status
)) {return;};
46 fTables
= new DictionaryBasedBreakIteratorTables(dictionaryFilename
, status
);
47 if (U_FAILURE(status
)) {
48 if (fTables
!= NULL
) {
49 fTables
->removeReference();
56 status
= U_MEMORY_ALLOCATION_ERROR
;
62 DictionaryBasedBreakIterator::DictionaryBasedBreakIterator(const DictionaryBasedBreakIterator
&other
) :
63 RuleBasedBreakIterator(other
)
66 if (other
.fTables
!= NULL
) {
67 fTables
= other
.fTables
;
68 fTables
->addReference();
75 //------------------------------------------------------------------------------
79 //------------------------------------------------------------------------------
80 DictionaryBasedBreakIterator::~DictionaryBasedBreakIterator()
82 uprv_free(cachedBreakPositions
);
83 cachedBreakPositions
= NULL
;
84 if (fTables
!= NULL
) {fTables
->removeReference();};
87 //------------------------------------------------------------------------------
89 // Assignment operator. Sets this iterator to have the same behavior,
90 // and iterate over the same text, as the one passed in.
92 //------------------------------------------------------------------------------
93 DictionaryBasedBreakIterator
&
94 DictionaryBasedBreakIterator::operator=(const DictionaryBasedBreakIterator
& that
) {
98 reset(); // clears out cached break positions.
99 RuleBasedBreakIterator::operator=(that
);
100 if (this->fTables
!= that
.fTables
) {
101 if (this->fTables
!= NULL
) {this->fTables
->removeReference();};
102 this->fTables
= that
.fTables
;
103 if (this->fTables
!= NULL
) {this->fTables
->addReference();};
108 //------------------------------------------------------------------------------
110 // Clone() Returns a newly-constructed RuleBasedBreakIterator with the same
111 // behavior, and iterating over the same text, as this one.
113 //------------------------------------------------------------------------------
115 DictionaryBasedBreakIterator::clone() const {
116 return new DictionaryBasedBreakIterator(*this);
119 //=======================================================================
120 // BreakIterator overrides
121 //=======================================================================
124 * Advances the iterator one step backwards.
125 * @return The position of the last boundary position before the
126 * current iteration position
129 DictionaryBasedBreakIterator::previous()
131 // if we have cached break positions and we're still in the range
132 // covered by them, just move one step backward in the cache
133 if (cachedBreakPositions
!= NULL
&& positionInCache
> 0) {
135 fText
->setIndex(cachedBreakPositions
[positionInCache
]);
136 return cachedBreakPositions
[positionInCache
];
139 // otherwise, dump the cache and use the inherited previous() method to move
140 // backward. This may fill up the cache with new break positions, in which
141 // case we have to mark our position in the cache
144 int32_t result
= RuleBasedBreakIterator::previous();
145 if (cachedBreakPositions
!= NULL
) {
146 for (positionInCache
=0;
147 cachedBreakPositions
[positionInCache
] != result
;
149 U_ASSERT(positionInCache
< numCachedBreakPositions
);
150 if (positionInCache
>= numCachedBreakPositions
) {
151 // Something has gone wrong. Dump the cache.
160 * Sets the current iteration position to the last boundary position
161 * before the specified position.
162 * @param offset The position to begin searching from
163 * @return The position of the last boundary before "offset"
166 DictionaryBasedBreakIterator::preceding(int32_t offset
)
168 // if the offset passed in is already past the end of the text,
169 // just return DONE; if it's before the beginning, return the
170 // text's starting offset
171 if (fText
== NULL
|| offset
> fText
->endIndex()) {
172 return BreakIterator::DONE
;
174 else if (offset
< fText
->startIndex()) {
175 return fText
->startIndex();
178 // if we have no cached break positions, or "offset" is outside the
179 // range covered by the cache, we can just call the inherited routine
180 // (which will eventually call other routines in this class that may
181 // refresh the cache)
182 if (cachedBreakPositions
== NULL
|| offset
<= cachedBreakPositions
[0] ||
183 offset
> cachedBreakPositions
[numCachedBreakPositions
- 1]) {
185 return RuleBasedBreakIterator::preceding(offset
);
188 // on the other hand, if "offset" is within the range covered by the cache,
189 // then all we have to do is search the cache for the last break position
193 while (positionInCache
< numCachedBreakPositions
194 && offset
> cachedBreakPositions
[positionInCache
])
197 fText
->setIndex(cachedBreakPositions
[positionInCache
]);
198 return fText
->getIndex();
203 * Sets the current iteration position to the first boundary position after
204 * the specified position.
205 * @param offset The position to begin searching forward from
206 * @return The position of the first boundary after "offset"
209 DictionaryBasedBreakIterator::following(int32_t offset
)
211 // if the offset passed in is already past the end of the text,
212 // just return DONE; if it's before the beginning, return the
213 // text's starting offset
214 if (fText
== NULL
|| offset
> fText
->endIndex()) {
215 return BreakIterator::DONE
;
217 else if (offset
< fText
->startIndex()) {
218 return fText
->startIndex();
221 // if we have no cached break positions, or if "offset" is outside the
222 // range covered by the cache, then dump the cache and call our
223 // inherited following() method. This will call other methods in this
224 // class that may refresh the cache.
225 if (cachedBreakPositions
== NULL
|| offset
< cachedBreakPositions
[0] ||
226 offset
>= cachedBreakPositions
[numCachedBreakPositions
- 1]) {
228 return RuleBasedBreakIterator::following(offset
);
231 // on the other hand, if "offset" is within the range covered by the
232 // cache, then just search the cache for the first break position
236 while (positionInCache
< numCachedBreakPositions
237 && offset
>= cachedBreakPositions
[positionInCache
])
239 fText
->setIndex(cachedBreakPositions
[positionInCache
]);
240 return fText
->getIndex();
245 * This is the implementation function for next().
248 DictionaryBasedBreakIterator::handleNext()
250 UErrorCode status
= U_ZERO_ERROR
;
251 // if there are no cached break positions, or if we've just moved
252 // off the end of the range covered by the cache, we have to dump
253 // and possibly regenerate the cache
254 if (cachedBreakPositions
== NULL
|| positionInCache
== numCachedBreakPositions
- 1) {
256 // start by using the inherited handleNext() to find a tentative return
257 // value. dictionaryCharCount tells us how many dictionary characters
258 // we passed over on our way to the tentative return value
259 int32_t startPos
= fText
->getIndex();
260 fDictionaryCharCount
= 0;
261 int32_t result
= RuleBasedBreakIterator::handleNext();
263 // if we passed over more than one dictionary character, then we use
264 // divideUpDictionaryRange() to regenerate the cached break positions
266 if (fDictionaryCharCount
> 1 && result
- startPos
> 1) {
267 divideUpDictionaryRange(startPos
, result
, status
);
268 U_ASSERT(U_SUCCESS(status
));
269 if (U_FAILURE(status
)) {
270 // Something went badly wrong, an internal error.
271 // We have no way from here to report it to caller.
272 // Treat as if this is if the dictionary did not apply to range.
278 // otherwise, the value we got back from the inherited fuction
279 // is our return value, and we can dump the cache
286 // if the cache of break positions has been regenerated (or existed all
287 // along), then just advance to the next break position in the cache
289 if (cachedBreakPositions
!= NULL
) {
291 fText
->setIndex(cachedBreakPositions
[positionInCache
]);
292 return cachedBreakPositions
[positionInCache
];
294 return -9999; // SHOULD NEVER GET HERE!
298 DictionaryBasedBreakIterator::reset()
300 uprv_free(cachedBreakPositions
);
301 cachedBreakPositions
= NULL
;
302 numCachedBreakPositions
= 0;
303 fDictionaryCharCount
= 0;
309 //------------------------------------------------------------------------------
311 // init() Common initialization routine, for use by constructors, etc.
313 //------------------------------------------------------------------------------
314 void DictionaryBasedBreakIterator::init() {
315 cachedBreakPositions
= NULL
;
317 numCachedBreakPositions
= 0;
318 fDictionaryCharCount
= 0;
323 //------------------------------------------------------------------------------
327 //------------------------------------------------------------------------------
328 BreakIterator
* DictionaryBasedBreakIterator::createBufferClone(void *stackBuffer
,
332 if (U_FAILURE(status
)){
337 // If user buffer size is zero this is a preflight operation to
338 // obtain the needed buffer size, allowing for worst case misalignment.
340 if (bufferSize
== 0) {
341 bufferSize
= sizeof(DictionaryBasedBreakIterator
) + U_ALIGNMENT_OFFSET_UP(0);
346 // Check the alignment and size of the user supplied buffer.
347 // Allocate heap memory if the user supplied memory is insufficient.
349 char *buf
= (char *)stackBuffer
;
350 uint32_t s
= bufferSize
;
352 if (stackBuffer
== NULL
) {
353 s
= 0; // Ignore size, force allocation if user didn't give us a buffer.
355 if (U_ALIGNMENT_OFFSET(stackBuffer
) != 0) {
356 int32_t offsetUp
= (int32_t)U_ALIGNMENT_OFFSET_UP(buf
);
360 if (s
< sizeof(DictionaryBasedBreakIterator
)) {
361 buf
= (char *) new DictionaryBasedBreakIterator();
363 status
= U_MEMORY_ALLOCATION_ERROR
;
366 status
= U_SAFECLONE_ALLOCATED_WARNING
;
370 // Initialize the clone object.
371 // TODO: using an overloaded C++ "operator new" to directly initialize the
372 // copy in the user's buffer would be better, but it doesn't seem
373 // to get along with namespaces. Investigate why.
375 // The memcpy is only safe with an empty (default constructed)
376 // break iterator. Use on others can screw up reference counts
377 // to data. memcpy-ing objects is not really a good idea...
379 DictionaryBasedBreakIterator localIter
; // Empty break iterator, source for memcpy
380 DictionaryBasedBreakIterator
*clone
= (DictionaryBasedBreakIterator
*)buf
;
381 uprv_memcpy(clone
, &localIter
, sizeof(DictionaryBasedBreakIterator
)); // clone = empty, but initialized, iterator.
382 *clone
= *this; // clone = the real one we want.
383 if (status
!= U_SAFECLONE_ALLOCATED_WARNING
) {
384 clone
->fBufferClone
= TRUE
;
393 * This is the function that actually implements the dictionary-based
394 * algorithm. Given the endpoints of a range of text, it uses the
395 * dictionary to determine the positions of any boundaries in this
396 * range. It stores all the boundary positions it discovers in
397 * cachedBreakPositions so that we only have to do this work once
398 * for each time we enter the range.
401 DictionaryBasedBreakIterator::divideUpDictionaryRange(int32_t startPos
, int32_t endPos
, UErrorCode
&status
)
403 // the range we're dividing may begin or end with non-dictionary characters
404 // (i.e., for line breaking, we may have leading or trailing punctuation
405 // that needs to be kept with the word). Seek from the beginning of the
406 // range to the first dictionary character
407 fText
->setIndex(startPos
);
408 UChar c
= fText
->current();
409 while (isDictionaryChar(c
) == FALSE
) {
413 if (U_FAILURE(status
)) {
414 return; // UStack below overwrites the status error codes
417 // initialize. We maintain two stacks: currentBreakPositions contains
418 // the list of break positions that will be returned if we successfully
419 // finish traversing the whole range now. possibleBreakPositions lists
420 // all other possible word ends we've passed along the way. (Whenever
421 // we reach an error [a sequence of characters that can't begin any word
422 // in the dictionary], we back up, possibly delete some breaks from
423 // currentBreakPositions, move a break from possibleBreakPositions
424 // to currentBreakPositions, and start over from there. This process
425 // continues in this way until we either successfully make it all the way
426 // across the range, or exhaust all of our combinations of break
427 // positions.) wrongBreakPositions is used to keep track of paths we've
428 // tried on previous iterations. As the iterator backs up further and
429 // further, this saves us from having to follow each possible path
430 // through the text all the way to the error (hopefully avoiding many
431 // future recursive calls as well).
432 // there can be only one kind of error in UStack and UVector, so we'll
433 // just let the error fall through
434 UStack
currentBreakPositions(status
);
435 UStack
possibleBreakPositions(status
);
436 UVector
wrongBreakPositions(status
);
438 // the dictionary is implemented as a trie, which is treated as a state
439 // machine. -1 represents the end of a legal word. Every word in the
440 // dictionary is represented by a path from the root node to -1. A path
441 // that ends in state 0 is an illegal combination of characters.
444 // these two variables are used for error handling. We keep track of the
445 // farthest we've gotten through the range being divided, and the combination
446 // of breaks that got us that far. If we use up all possible break
447 // combinations, the text contains an error or a word that's not in the
448 // dictionary. In this case, we "bless" the break positions that got us the
449 // farthest as real break positions, and then start over from scratch with
450 // the character where the error occurred.
451 int32_t farthestEndPoint
= fText
->getIndex();
452 UStack
bestBreakPositions(status
);
453 UBool bestBreakPositionsInitialized
= FALSE
;
455 if (U_FAILURE(status
)) {
458 // initialize (we always exit the loop with a break statement)
459 c
= fText
->current();
462 // if we can transition to state "-1" from our current state, we're
463 // on the last character of a legal word. Push that position onto
464 // the possible-break-positions stack
465 if (fTables
->fDictionary
->at(state
, (int32_t)0) == -1) {
466 possibleBreakPositions
.push(fText
->getIndex(), status
);
467 if (U_FAILURE(status
)) {
472 // look up the new state to transition to in the dictionary
473 state
= fTables
->fDictionary
->at(state
, c
);
475 // if the character we're sitting on causes us to transition to
476 // the "end of word" state, then it was a non-dictionary character
477 // and we've successfully traversed the whole range. Drop out
480 currentBreakPositions
.push(fText
->getIndex(), status
);
481 if (U_FAILURE(status
)) {
487 // if the character we're sitting on causes us to transition to
488 // the error state, or if we've gone off the end of the range
489 // without transitioning to the "end of word" state, we've hit
491 else if (state
== 0 || fText
->getIndex() >= endPos
) {
493 // if this is the farthest we've gotten, take note of it in
494 // case there's an error in the text
495 if (fText
->getIndex() > farthestEndPoint
) {
496 farthestEndPoint
= fText
->getIndex();
497 bestBreakPositions
.removeAllElements();
498 bestBreakPositionsInitialized
= TRUE
;
499 for (int32_t i
= 0; i
< currentBreakPositions
.size(); i
++) {
500 bestBreakPositions
.push(currentBreakPositions
.elementAti(i
), status
);
504 // wrongBreakPositions is a list of all break positions we've tried starting
505 // that didn't allow us to traverse all the way through the text. Every time
506 // we pop a break position off of currentBreakPositions, we put it into
507 // wrongBreakPositions to avoid trying it again later. If we make it to this
508 // spot, we're either going to back up to a break in possibleBreakPositions
509 // and try starting over from there, or we've exhausted all possible break
510 // positions and are going to do the fallback procedure. This loop prevents
511 // us from messing with anything in possibleBreakPositions that didn't work as
512 // a starting point the last time we tried it (this is to prevent a bunch of
513 // repetitive checks from slowing down some extreme cases)
514 while (!possibleBreakPositions
.isEmpty() && wrongBreakPositions
.contains(
515 possibleBreakPositions
.peeki())) {
516 possibleBreakPositions
.popi();
519 // if we've used up all possible break-position combinations, there's
520 // an error or an unknown word in the text. In this case, we start
521 // over, treating the farthest character we've reached as the beginning
522 // of the range, and "blessing" the break positions that got us that
523 // far as real break positions
524 if (possibleBreakPositions
.isEmpty()) {
525 if (bestBreakPositionsInitialized
) {
526 currentBreakPositions
.removeAllElements();
527 for (int32_t i
= 0; i
< bestBreakPositions
.size(); i
++) {
528 currentBreakPositions
.push(bestBreakPositions
.elementAti(i
), status
);
529 if (U_FAILURE(status
)) {
533 bestBreakPositions
.removeAllElements();
534 if (farthestEndPoint
< endPos
) {
535 fText
->setIndex(farthestEndPoint
+ 1);
542 if ((currentBreakPositions
.isEmpty()
543 || currentBreakPositions
.peeki() != fText
->getIndex())
544 && fText
->getIndex() != startPos
) {
545 currentBreakPositions
.push(fText
->getIndex(), status
);
546 if (U_FAILURE(status
)) {
551 currentBreakPositions
.push(fText
->getIndex(), status
);
552 if (U_FAILURE(status
)) {
558 // if we still have more break positions we can try, then promote the
559 // last break in possibleBreakPositions into currentBreakPositions,
560 // and get rid of all entries in currentBreakPositions that come after
561 // it. Then back up to that position and start over from there (i.e.,
562 // treat that position as the beginning of a new word)
564 int32_t temp
= possibleBreakPositions
.popi();
566 while (!currentBreakPositions
.isEmpty() && temp
<
567 currentBreakPositions
.peeki()) {
568 temp2
= currentBreakPositions
.popi();
569 wrongBreakPositions
.addElement(temp2
, status
);
571 currentBreakPositions
.push(temp
, status
);
572 fText
->setIndex(currentBreakPositions
.peeki());
575 // re-sync "c" for the next go-round, and drop out of the loop if
576 // we've made it off the end of the range
577 c
= fText
->current();
578 if (fText
->getIndex() >= endPos
) {
583 // if we didn't hit any exceptional conditions on this last iteration,
584 // just advance to the next character and loop
590 // dump the last break position in the list, and replace it with the actual
591 // end of the range (which may be the same character, or may be further on
592 // because the range actually ended with non-dictionary characters we want to
593 // keep with the word)
594 if (!currentBreakPositions
.isEmpty()) {
595 currentBreakPositions
.popi();
597 currentBreakPositions
.push(endPos
, status
);
598 if (U_FAILURE(status
)) {
602 // create a regular array to hold the break positions and copy
603 // the break positions from the stack to the array (in addition,
604 // our starting position goes into this array as a break position).
605 // This array becomes the cache of break positions used by next()
606 // and previous(), so this is where we actually refresh the cache.
607 if (cachedBreakPositions
!= NULL
) {
608 uprv_free(cachedBreakPositions
);
610 cachedBreakPositions
= (int32_t *)uprv_malloc((currentBreakPositions
.size() + 1) * sizeof(int32_t));
612 if(cachedBreakPositions
== NULL
) {
613 status
= U_MEMORY_ALLOCATION_ERROR
;
616 numCachedBreakPositions
= currentBreakPositions
.size() + 1;
617 cachedBreakPositions
[0] = startPos
;
619 for (int32_t i
= 0; i
< currentBreakPositions
.size(); i
++) {
620 cachedBreakPositions
[i
+ 1] = currentBreakPositions
.elementAti(i
);
627 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */