]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/dbbi.cpp
ICU-6.2.22.tar.gz
[apple/icu.git] / icuSources / common / dbbi.cpp
1 /*
2 **********************************************************************
3 * Copyright (C) 1999-2004 IBM Corp. All rights reserved.
4 **********************************************************************
5 * Date Name Description
6 * 12/1/99 rgillam Complete port from Java.
7 * 01/13/2000 helena Added UErrorCode to ctors.
8 **********************************************************************
9 */
10
11 #include "unicode/utypes.h"
12
13 #if !UCONFIG_NO_BREAK_ITERATION
14
15 #include "unicode/dbbi.h"
16 #include "unicode/schriter.h"
17 #include "dbbi_tbl.h"
18 #include "uvector.h"
19 #include "cmemory.h"
20 #include "uassert.h"
21
22 U_NAMESPACE_BEGIN
23
24 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(DictionaryBasedBreakIterator)
25
26
27 //------------------------------------------------------------------------------
28 //
29 // constructors
30 //
31 //------------------------------------------------------------------------------
32
33 DictionaryBasedBreakIterator::DictionaryBasedBreakIterator() :
34 RuleBasedBreakIterator() {
35 init();
36 }
37
38
39 DictionaryBasedBreakIterator::DictionaryBasedBreakIterator(UDataMemory* rbbiData,
40 const char* dictionaryFilename,
41 UErrorCode& status)
42 : RuleBasedBreakIterator(rbbiData, status)
43 {
44 init();
45 if (U_FAILURE(status)) {return;};
46 fTables = new DictionaryBasedBreakIteratorTables(dictionaryFilename, status);
47 if (U_FAILURE(status)) {
48 if (fTables != NULL) {
49 fTables->removeReference();
50 fTables = NULL;
51 }
52 return;
53 }
54 /* test for NULL */
55 if(fTables == 0) {
56 status = U_MEMORY_ALLOCATION_ERROR;
57 return;
58 }
59 }
60
61
62 DictionaryBasedBreakIterator::DictionaryBasedBreakIterator(const DictionaryBasedBreakIterator &other) :
63 RuleBasedBreakIterator(other)
64 {
65 init();
66 if (other.fTables != NULL) {
67 fTables = other.fTables;
68 fTables->addReference();
69 }
70 }
71
72
73
74
75 //------------------------------------------------------------------------------
76 //
77 // Destructor
78 //
79 //------------------------------------------------------------------------------
80 DictionaryBasedBreakIterator::~DictionaryBasedBreakIterator()
81 {
82 uprv_free(cachedBreakPositions);
83 cachedBreakPositions = NULL;
84 if (fTables != NULL) {fTables->removeReference();};
85 }
86
87 //------------------------------------------------------------------------------
88 //
89 // Assignment operator. Sets this iterator to have the same behavior,
90 // and iterate over the same text, as the one passed in.
91 //
92 //------------------------------------------------------------------------------
93 DictionaryBasedBreakIterator&
94 DictionaryBasedBreakIterator::operator=(const DictionaryBasedBreakIterator& that) {
95 if (this == &that) {
96 return *this;
97 }
98 reset(); // clears out cached break positions.
99 RuleBasedBreakIterator::operator=(that);
100 if (this->fTables != that.fTables) {
101 if (this->fTables != NULL) {this->fTables->removeReference();};
102 this->fTables = that.fTables;
103 if (this->fTables != NULL) {this->fTables->addReference();};
104 }
105 return *this;
106 }
107
108 //------------------------------------------------------------------------------
109 //
110 // Clone() Returns a newly-constructed RuleBasedBreakIterator with the same
111 // behavior, and iterating over the same text, as this one.
112 //
113 //------------------------------------------------------------------------------
114 BreakIterator*
115 DictionaryBasedBreakIterator::clone() const {
116 return new DictionaryBasedBreakIterator(*this);
117 }
118
119 //=======================================================================
120 // BreakIterator overrides
121 //=======================================================================
122
123 /**
124 * Advances the iterator one step backwards.
125 * @return The position of the last boundary position before the
126 * current iteration position
127 */
128 int32_t
129 DictionaryBasedBreakIterator::previous()
130 {
131 // if we have cached break positions and we're still in the range
132 // covered by them, just move one step backward in the cache
133 if (cachedBreakPositions != NULL && positionInCache > 0) {
134 --positionInCache;
135 fText->setIndex(cachedBreakPositions[positionInCache]);
136 return cachedBreakPositions[positionInCache];
137 }
138
139 // otherwise, dump the cache and use the inherited previous() method to move
140 // backward. This may fill up the cache with new break positions, in which
141 // case we have to mark our position in the cache
142 else {
143 reset();
144 int32_t result = RuleBasedBreakIterator::previous();
145 if (cachedBreakPositions != NULL) {
146 for (positionInCache=0;
147 cachedBreakPositions[positionInCache] != result;
148 positionInCache++);
149 U_ASSERT(positionInCache < numCachedBreakPositions);
150 if (positionInCache >= numCachedBreakPositions) {
151 // Something has gone wrong. Dump the cache.
152 reset();
153 }
154 }
155 return result;
156 }
157 }
158
159 /**
160 * Sets the current iteration position to the last boundary position
161 * before the specified position.
162 * @param offset The position to begin searching from
163 * @return The position of the last boundary before "offset"
164 */
165 int32_t
166 DictionaryBasedBreakIterator::preceding(int32_t offset)
167 {
168 // if the offset passed in is already past the end of the text,
169 // just return DONE; if it's before the beginning, return the
170 // text's starting offset
171 if (fText == NULL || offset > fText->endIndex()) {
172 return BreakIterator::DONE;
173 }
174 else if (offset < fText->startIndex()) {
175 return fText->startIndex();
176 }
177
178 // if we have no cached break positions, or "offset" is outside the
179 // range covered by the cache, we can just call the inherited routine
180 // (which will eventually call other routines in this class that may
181 // refresh the cache)
182 if (cachedBreakPositions == NULL || offset <= cachedBreakPositions[0] ||
183 offset > cachedBreakPositions[numCachedBreakPositions - 1]) {
184 reset();
185 return RuleBasedBreakIterator::preceding(offset);
186 }
187
188 // on the other hand, if "offset" is within the range covered by the cache,
189 // then all we have to do is search the cache for the last break position
190 // before "offset"
191 else {
192 positionInCache = 0;
193 while (positionInCache < numCachedBreakPositions
194 && offset > cachedBreakPositions[positionInCache])
195 ++positionInCache;
196 --positionInCache;
197 fText->setIndex(cachedBreakPositions[positionInCache]);
198 return fText->getIndex();
199 }
200 }
201
202 /**
203 * Sets the current iteration position to the first boundary position after
204 * the specified position.
205 * @param offset The position to begin searching forward from
206 * @return The position of the first boundary after "offset"
207 */
208 int32_t
209 DictionaryBasedBreakIterator::following(int32_t offset)
210 {
211 // if the offset passed in is already past the end of the text,
212 // just return DONE; if it's before the beginning, return the
213 // text's starting offset
214 if (fText == NULL || offset > fText->endIndex()) {
215 return BreakIterator::DONE;
216 }
217 else if (offset < fText->startIndex()) {
218 return fText->startIndex();
219 }
220
221 // if we have no cached break positions, or if "offset" is outside the
222 // range covered by the cache, then dump the cache and call our
223 // inherited following() method. This will call other methods in this
224 // class that may refresh the cache.
225 if (cachedBreakPositions == NULL || offset < cachedBreakPositions[0] ||
226 offset >= cachedBreakPositions[numCachedBreakPositions - 1]) {
227 reset();
228 return RuleBasedBreakIterator::following(offset);
229 }
230
231 // on the other hand, if "offset" is within the range covered by the
232 // cache, then just search the cache for the first break position
233 // after "offset"
234 else {
235 positionInCache = 0;
236 while (positionInCache < numCachedBreakPositions
237 && offset >= cachedBreakPositions[positionInCache])
238 ++positionInCache;
239 fText->setIndex(cachedBreakPositions[positionInCache]);
240 return fText->getIndex();
241 }
242 }
243
244 /**
245 * This is the implementation function for next().
246 */
247 int32_t
248 DictionaryBasedBreakIterator::handleNext()
249 {
250 UErrorCode status = U_ZERO_ERROR;
251 // if there are no cached break positions, or if we've just moved
252 // off the end of the range covered by the cache, we have to dump
253 // and possibly regenerate the cache
254 if (cachedBreakPositions == NULL || positionInCache == numCachedBreakPositions - 1) {
255
256 // start by using the inherited handleNext() to find a tentative return
257 // value. dictionaryCharCount tells us how many dictionary characters
258 // we passed over on our way to the tentative return value
259 int32_t startPos = fText->getIndex();
260 fDictionaryCharCount = 0;
261 int32_t result = RuleBasedBreakIterator::handleNext();
262
263 // if we passed over more than one dictionary character, then we use
264 // divideUpDictionaryRange() to regenerate the cached break positions
265 // for the new range
266 if (fDictionaryCharCount > 1 && result - startPos > 1) {
267 divideUpDictionaryRange(startPos, result, status);
268 U_ASSERT(U_SUCCESS(status));
269 if (U_FAILURE(status)) {
270 // Something went badly wrong, an internal error.
271 // We have no way from here to report it to caller.
272 // Treat as if this is if the dictionary did not apply to range.
273 reset();
274 return result;
275 }
276 }
277
278 // otherwise, the value we got back from the inherited fuction
279 // is our return value, and we can dump the cache
280 else {
281 reset();
282 return result;
283 }
284 }
285
286 // if the cache of break positions has been regenerated (or existed all
287 // along), then just advance to the next break position in the cache
288 // and return it
289 if (cachedBreakPositions != NULL) {
290 ++positionInCache;
291 fText->setIndex(cachedBreakPositions[positionInCache]);
292 return cachedBreakPositions[positionInCache];
293 }
294 return -9999; // SHOULD NEVER GET HERE!
295 }
296
297 void
298 DictionaryBasedBreakIterator::reset()
299 {
300 uprv_free(cachedBreakPositions);
301 cachedBreakPositions = NULL;
302 numCachedBreakPositions = 0;
303 fDictionaryCharCount = 0;
304 positionInCache = 0;
305 }
306
307
308
309 //------------------------------------------------------------------------------
310 //
311 // init() Common initialization routine, for use by constructors, etc.
312 //
313 //------------------------------------------------------------------------------
314 void DictionaryBasedBreakIterator::init() {
315 cachedBreakPositions = NULL;
316 fTables = NULL;
317 numCachedBreakPositions = 0;
318 fDictionaryCharCount = 0;
319 positionInCache = 0;
320 }
321
322
323 //------------------------------------------------------------------------------
324 //
325 // BufferClone
326 //
327 //------------------------------------------------------------------------------
328 BreakIterator * DictionaryBasedBreakIterator::createBufferClone(void *stackBuffer,
329 int32_t &bufferSize,
330 UErrorCode &status)
331 {
332 if (U_FAILURE(status)){
333 return NULL;
334 }
335
336 //
337 // If user buffer size is zero this is a preflight operation to
338 // obtain the needed buffer size, allowing for worst case misalignment.
339 //
340 if (bufferSize == 0) {
341 bufferSize = sizeof(DictionaryBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0);
342 return NULL;
343 }
344
345 //
346 // Check the alignment and size of the user supplied buffer.
347 // Allocate heap memory if the user supplied memory is insufficient.
348 //
349 char *buf = (char *)stackBuffer;
350 uint32_t s = bufferSize;
351
352 if (stackBuffer == NULL) {
353 s = 0; // Ignore size, force allocation if user didn't give us a buffer.
354 }
355 if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
356 int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(buf);
357 s -= offsetUp;
358 buf += offsetUp;
359 }
360 if (s < sizeof(DictionaryBasedBreakIterator)) {
361 buf = (char *) new DictionaryBasedBreakIterator();
362 if (buf == 0) {
363 status = U_MEMORY_ALLOCATION_ERROR;
364 return NULL;
365 }
366 status = U_SAFECLONE_ALLOCATED_WARNING;
367 }
368
369 //
370 // Initialize the clone object.
371 // TODO: using an overloaded C++ "operator new" to directly initialize the
372 // copy in the user's buffer would be better, but it doesn't seem
373 // to get along with namespaces. Investigate why.
374 //
375 // The memcpy is only safe with an empty (default constructed)
376 // break iterator. Use on others can screw up reference counts
377 // to data. memcpy-ing objects is not really a good idea...
378 //
379 DictionaryBasedBreakIterator localIter; // Empty break iterator, source for memcpy
380 DictionaryBasedBreakIterator *clone = (DictionaryBasedBreakIterator *)buf;
381 uprv_memcpy(clone, &localIter, sizeof(DictionaryBasedBreakIterator)); // clone = empty, but initialized, iterator.
382 *clone = *this; // clone = the real one we want.
383 if (status != U_SAFECLONE_ALLOCATED_WARNING) {
384 clone->fBufferClone = TRUE;
385 }
386 return clone;
387 }
388
389
390
391
392 /**
393 * This is the function that actually implements the dictionary-based
394 * algorithm. Given the endpoints of a range of text, it uses the
395 * dictionary to determine the positions of any boundaries in this
396 * range. It stores all the boundary positions it discovers in
397 * cachedBreakPositions so that we only have to do this work once
398 * for each time we enter the range.
399 */
400 void
401 DictionaryBasedBreakIterator::divideUpDictionaryRange(int32_t startPos, int32_t endPos, UErrorCode &status)
402 {
403 // the range we're dividing may begin or end with non-dictionary characters
404 // (i.e., for line breaking, we may have leading or trailing punctuation
405 // that needs to be kept with the word). Seek from the beginning of the
406 // range to the first dictionary character
407 fText->setIndex(startPos);
408 UChar c = fText->current();
409 while (isDictionaryChar(c) == FALSE) {
410 c = fText->next();
411 }
412
413 if (U_FAILURE(status)) {
414 return; // UStack below overwrites the status error codes
415 }
416
417 // initialize. We maintain two stacks: currentBreakPositions contains
418 // the list of break positions that will be returned if we successfully
419 // finish traversing the whole range now. possibleBreakPositions lists
420 // all other possible word ends we've passed along the way. (Whenever
421 // we reach an error [a sequence of characters that can't begin any word
422 // in the dictionary], we back up, possibly delete some breaks from
423 // currentBreakPositions, move a break from possibleBreakPositions
424 // to currentBreakPositions, and start over from there. This process
425 // continues in this way until we either successfully make it all the way
426 // across the range, or exhaust all of our combinations of break
427 // positions.) wrongBreakPositions is used to keep track of paths we've
428 // tried on previous iterations. As the iterator backs up further and
429 // further, this saves us from having to follow each possible path
430 // through the text all the way to the error (hopefully avoiding many
431 // future recursive calls as well).
432 // there can be only one kind of error in UStack and UVector, so we'll
433 // just let the error fall through
434 UStack currentBreakPositions(status);
435 UStack possibleBreakPositions(status);
436 UVector wrongBreakPositions(status);
437
438 // the dictionary is implemented as a trie, which is treated as a state
439 // machine. -1 represents the end of a legal word. Every word in the
440 // dictionary is represented by a path from the root node to -1. A path
441 // that ends in state 0 is an illegal combination of characters.
442 int16_t state = 0;
443
444 // these two variables are used for error handling. We keep track of the
445 // farthest we've gotten through the range being divided, and the combination
446 // of breaks that got us that far. If we use up all possible break
447 // combinations, the text contains an error or a word that's not in the
448 // dictionary. In this case, we "bless" the break positions that got us the
449 // farthest as real break positions, and then start over from scratch with
450 // the character where the error occurred.
451 int32_t farthestEndPoint = fText->getIndex();
452 UStack bestBreakPositions(status);
453 UBool bestBreakPositionsInitialized = FALSE;
454
455 if (U_FAILURE(status)) {
456 return;
457 }
458 // initialize (we always exit the loop with a break statement)
459 c = fText->current();
460 for (;;) {
461
462 // if we can transition to state "-1" from our current state, we're
463 // on the last character of a legal word. Push that position onto
464 // the possible-break-positions stack
465 if (fTables->fDictionary->at(state, (int32_t)0) == -1) {
466 possibleBreakPositions.push(fText->getIndex(), status);
467 if (U_FAILURE(status)) {
468 return;
469 }
470 }
471
472 // look up the new state to transition to in the dictionary
473 state = fTables->fDictionary->at(state, c);
474
475 // if the character we're sitting on causes us to transition to
476 // the "end of word" state, then it was a non-dictionary character
477 // and we've successfully traversed the whole range. Drop out
478 // of the loop.
479 if (state == -1) {
480 currentBreakPositions.push(fText->getIndex(), status);
481 if (U_FAILURE(status)) {
482 return;
483 }
484 break;
485 }
486
487 // if the character we're sitting on causes us to transition to
488 // the error state, or if we've gone off the end of the range
489 // without transitioning to the "end of word" state, we've hit
490 // an error...
491 else if (state == 0 || fText->getIndex() >= endPos) {
492
493 // if this is the farthest we've gotten, take note of it in
494 // case there's an error in the text
495 if (fText->getIndex() > farthestEndPoint) {
496 farthestEndPoint = fText->getIndex();
497 bestBreakPositions.removeAllElements();
498 bestBreakPositionsInitialized = TRUE;
499 for (int32_t i = 0; i < currentBreakPositions.size(); i++) {
500 bestBreakPositions.push(currentBreakPositions.elementAti(i), status);
501 }
502 }
503
504 // wrongBreakPositions is a list of all break positions we've tried starting
505 // that didn't allow us to traverse all the way through the text. Every time
506 // we pop a break position off of currentBreakPositions, we put it into
507 // wrongBreakPositions to avoid trying it again later. If we make it to this
508 // spot, we're either going to back up to a break in possibleBreakPositions
509 // and try starting over from there, or we've exhausted all possible break
510 // positions and are going to do the fallback procedure. This loop prevents
511 // us from messing with anything in possibleBreakPositions that didn't work as
512 // a starting point the last time we tried it (this is to prevent a bunch of
513 // repetitive checks from slowing down some extreme cases)
514 while (!possibleBreakPositions.isEmpty() && wrongBreakPositions.contains(
515 possibleBreakPositions.peeki())) {
516 possibleBreakPositions.popi();
517 }
518
519 // if we've used up all possible break-position combinations, there's
520 // an error or an unknown word in the text. In this case, we start
521 // over, treating the farthest character we've reached as the beginning
522 // of the range, and "blessing" the break positions that got us that
523 // far as real break positions
524 if (possibleBreakPositions.isEmpty()) {
525 if (bestBreakPositionsInitialized) {
526 currentBreakPositions.removeAllElements();
527 for (int32_t i = 0; i < bestBreakPositions.size(); i++) {
528 currentBreakPositions.push(bestBreakPositions.elementAti(i), status);
529 if (U_FAILURE(status)) {
530 return;
531 }
532 }
533 bestBreakPositions.removeAllElements();
534 if (farthestEndPoint < endPos) {
535 fText->setIndex(farthestEndPoint + 1);
536 }
537 else {
538 break;
539 }
540 }
541 else {
542 if ((currentBreakPositions.isEmpty()
543 || currentBreakPositions.peeki() != fText->getIndex())
544 && fText->getIndex() != startPos) {
545 currentBreakPositions.push(fText->getIndex(), status);
546 if (U_FAILURE(status)) {
547 return;
548 }
549 }
550 fText->next();
551 currentBreakPositions.push(fText->getIndex(), status);
552 if (U_FAILURE(status)) {
553 return;
554 }
555 }
556 }
557
558 // if we still have more break positions we can try, then promote the
559 // last break in possibleBreakPositions into currentBreakPositions,
560 // and get rid of all entries in currentBreakPositions that come after
561 // it. Then back up to that position and start over from there (i.e.,
562 // treat that position as the beginning of a new word)
563 else {
564 int32_t temp = possibleBreakPositions.popi();
565 int32_t temp2 = 0;
566 while (!currentBreakPositions.isEmpty() && temp <
567 currentBreakPositions.peeki()) {
568 temp2 = currentBreakPositions.popi();
569 wrongBreakPositions.addElement(temp2, status);
570 }
571 currentBreakPositions.push(temp, status);
572 fText->setIndex(currentBreakPositions.peeki());
573 }
574
575 // re-sync "c" for the next go-round, and drop out of the loop if
576 // we've made it off the end of the range
577 c = fText->current();
578 if (fText->getIndex() >= endPos) {
579 break;
580 }
581 }
582
583 // if we didn't hit any exceptional conditions on this last iteration,
584 // just advance to the next character and loop
585 else {
586 c = fText->next();
587 }
588 }
589
590 // dump the last break position in the list, and replace it with the actual
591 // end of the range (which may be the same character, or may be further on
592 // because the range actually ended with non-dictionary characters we want to
593 // keep with the word)
594 if (!currentBreakPositions.isEmpty()) {
595 currentBreakPositions.popi();
596 }
597 currentBreakPositions.push(endPos, status);
598 if (U_FAILURE(status)) {
599 return;
600 }
601
602 // create a regular array to hold the break positions and copy
603 // the break positions from the stack to the array (in addition,
604 // our starting position goes into this array as a break position).
605 // This array becomes the cache of break positions used by next()
606 // and previous(), so this is where we actually refresh the cache.
607 if (cachedBreakPositions != NULL) {
608 uprv_free(cachedBreakPositions);
609 }
610 cachedBreakPositions = (int32_t *)uprv_malloc((currentBreakPositions.size() + 1) * sizeof(int32_t));
611 /* Test for NULL */
612 if(cachedBreakPositions == NULL) {
613 status = U_MEMORY_ALLOCATION_ERROR;
614 return;
615 }
616 numCachedBreakPositions = currentBreakPositions.size() + 1;
617 cachedBreakPositions[0] = startPos;
618
619 for (int32_t i = 0; i < currentBreakPositions.size(); i++) {
620 cachedBreakPositions[i + 1] = currentBreakPositions.elementAti(i);
621 }
622 positionInCache = 0;
623 }
624
625 U_NAMESPACE_END
626
627 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
628
629 /* eof */