]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/rbbi.cpp
ICU-400.37.tar.gz
[apple/icu.git] / icuSources / common / rbbi.cpp
1 /*
2 ***************************************************************************
3 * Copyright (C) 1999-2008 International Business Machines Corporation *
4 * and others. All rights reserved. *
5 ***************************************************************************
6 */
7 //
8 // file: rbbi.c Contains the implementation of the rule based break iterator
9 // runtime engine and the API implementation for
10 // class RuleBasedBreakIterator
11 //
12
13 #include "unicode/utypes.h"
14
15 #if !UCONFIG_NO_BREAK_ITERATION
16
17 #include "unicode/rbbi.h"
18 #include "unicode/schriter.h"
19 #include "unicode/uchriter.h"
20 #include "unicode/udata.h"
21 #include "unicode/uclean.h"
22 #include "rbbidata.h"
23 #include "rbbirb.h"
24 #include "cmemory.h"
25 #include "cstring.h"
26 #include "umutex.h"
27 #include "ucln_cmn.h"
28 #include "brkeng.h"
29
30 #include "uassert.h"
31 #include "uvector.h"
32
33 // if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included.
34 #if U_LOCAL_SERVICE_HOOK
35 #include "localsvc.h"
36 #endif
37
38 #ifdef RBBI_DEBUG
39 static UBool fTrace = FALSE;
40 #endif
41
42 U_NAMESPACE_BEGIN
43
44 // The state number of the starting state
45 #define START_STATE 1
46
47 // The state-transition value indicating "stop"
48 #define STOP_STATE 0
49
50
51 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
52
53
54 //=======================================================================
55 // constructors
56 //=======================================================================
57
58 /**
59 * Constructs a RuleBasedBreakIterator that uses the already-created
60 * tables object that is passed in as a parameter.
61 */
62 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
63 {
64 init();
65 fData = new RBBIDataWrapper(data, status); // status checked in constructor
66 if (U_FAILURE(status)) {return;}
67 if(fData == 0) {
68 status = U_MEMORY_ALLOCATION_ERROR;
69 return;
70 }
71 }
72
73 /**
74 * Same as above but does not adopt memory
75 */
76 RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status)
77 {
78 init();
79 fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // status checked in constructor
80 if (U_FAILURE(status)) {return;}
81 if(fData == 0) {
82 status = U_MEMORY_ALLOCATION_ERROR;
83 return;
84 }
85 }
86
87 //-------------------------------------------------------------------------------
88 //
89 // Constructor from a UDataMemory handle to precompiled break rules
90 // stored in an ICU data file.
91 //
92 //-------------------------------------------------------------------------------
93 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
94 {
95 init();
96 fData = new RBBIDataWrapper(udm, status); // status checked in constructor
97 if (U_FAILURE(status)) {return;}
98 if(fData == 0) {
99 status = U_MEMORY_ALLOCATION_ERROR;
100 return;
101 }
102 }
103
104
105
106 //-------------------------------------------------------------------------------
107 //
108 // Constructor from a set of rules supplied as a string.
109 //
110 //-------------------------------------------------------------------------------
111 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules,
112 UParseError &parseError,
113 UErrorCode &status)
114 {
115 init();
116 if (U_FAILURE(status)) {return;}
117 RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
118 RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status);
119 // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that
120 // creates and returns a complete RBBI. From here, in a constructor, we
121 // can't just return the object created by the builder factory, hence
122 // the assignment of the factory created object to "this".
123 if (U_SUCCESS(status)) {
124 *this = *bi;
125 delete bi;
126 }
127 }
128
129
130 //-------------------------------------------------------------------------------
131 //
132 // Default Constructor. Create an empty shell that can be set up later.
133 // Used when creating a RuleBasedBreakIterator from a set
134 // of rules.
135 //-------------------------------------------------------------------------------
136 RuleBasedBreakIterator::RuleBasedBreakIterator() {
137 init();
138 }
139
140
141 //-------------------------------------------------------------------------------
142 //
143 // Copy constructor. Will produce a break iterator with the same behavior,
144 // and which iterates over the same text, as the one passed in.
145 //
146 //-------------------------------------------------------------------------------
147 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
148 : BreakIterator(other)
149 {
150 this->init();
151 *this = other;
152 }
153
154
155 /**
156 * Destructor
157 */
158 RuleBasedBreakIterator::~RuleBasedBreakIterator() {
159 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
160 // fCharIter was adopted from the outside.
161 delete fCharIter;
162 }
163 fCharIter = NULL;
164 delete fSCharIter;
165 fCharIter = NULL;
166 delete fDCharIter;
167 fDCharIter = NULL;
168
169 utext_close(fText);
170
171 if (fData != NULL) {
172 fData->removeReference();
173 fData = NULL;
174 }
175 if (fCachedBreakPositions) {
176 uprv_free(fCachedBreakPositions);
177 fCachedBreakPositions = NULL;
178 }
179 if (fLanguageBreakEngines) {
180 delete fLanguageBreakEngines;
181 fLanguageBreakEngines = NULL;
182 }
183 if (fUnhandledBreakEngine) {
184 delete fUnhandledBreakEngine;
185 fUnhandledBreakEngine = NULL;
186 }
187 }
188
189 /**
190 * Assignment operator. Sets this iterator to have the same behavior,
191 * and iterate over the same text, as the one passed in.
192 */
193 RuleBasedBreakIterator&
194 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
195 if (this == &that) {
196 return *this;
197 }
198 reset(); // Delete break cache information
199 fBreakType = that.fBreakType;
200 if (fLanguageBreakEngines != NULL) {
201 delete fLanguageBreakEngines;
202 fLanguageBreakEngines = NULL; // Just rebuild for now
203 }
204 // TODO: clone fLanguageBreakEngines from "that"
205 UErrorCode status = U_ZERO_ERROR;
206 fText = utext_clone(fText, that.fText, FALSE, TRUE, &status);
207
208 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
209 delete fCharIter;
210 }
211 fCharIter = NULL;
212
213 if (that.fCharIter != NULL ) {
214 // This is a little bit tricky - it will intially appear that
215 // this->fCharIter is adopted, even if that->fCharIter was
216 // not adopted. That's ok.
217 fCharIter = that.fCharIter->clone();
218 }
219
220 if (fData != NULL) {
221 fData->removeReference();
222 fData = NULL;
223 }
224 if (that.fData != NULL) {
225 fData = that.fData->addReference();
226 }
227
228 return *this;
229 }
230
231
232
233 //-----------------------------------------------------------------------------
234 //
235 // init() Shared initialization routine. Used by all the constructors.
236 // Initializes all fields, leaving the object in a consistent state.
237 //
238 //-----------------------------------------------------------------------------
239 void RuleBasedBreakIterator::init() {
240 UErrorCode status = U_ZERO_ERROR;
241 fBufferClone = FALSE;
242 fText = utext_openUChars(NULL, NULL, 0, &status);
243 fCharIter = NULL;
244 fSCharIter = NULL;
245 fDCharIter = NULL;
246 fData = NULL;
247 fLastRuleStatusIndex = 0;
248 fLastStatusIndexValid = TRUE;
249 fDictionaryCharCount = 0;
250 fBreakType = -1;
251
252 fCachedBreakPositions = NULL;
253 fLanguageBreakEngines = NULL;
254 fUnhandledBreakEngine = NULL;
255 fNumCachedBreakPositions = 0;
256 fPositionInCache = 0;
257
258 #ifdef RBBI_DEBUG
259 static UBool debugInitDone = FALSE;
260 if (debugInitDone == FALSE) {
261 char *debugEnv = getenv("U_RBBIDEBUG");
262 if (debugEnv && uprv_strstr(debugEnv, "trace")) {
263 fTrace = TRUE;
264 }
265 debugInitDone = TRUE;
266 }
267 #endif
268 }
269
270
271
272 //-----------------------------------------------------------------------------
273 //
274 // clone - Returns a newly-constructed RuleBasedBreakIterator with the same
275 // behavior, and iterating over the same text, as this one.
276 // Virtual function: does the right thing with subclasses.
277 //
278 //-----------------------------------------------------------------------------
279 BreakIterator*
280 RuleBasedBreakIterator::clone(void) const {
281 return new RuleBasedBreakIterator(*this);
282 }
283
284 /**
285 * Equality operator. Returns TRUE if both BreakIterators are of the
286 * same class, have the same behavior, and iterate over the same text.
287 */
288 UBool
289 RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
290 if (that.getDynamicClassID() != getDynamicClassID()) {
291 return FALSE;
292 }
293
294 const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;
295
296 if (!utext_equals(fText, that2.fText)) {
297 // The two break iterators are operating on different text,
298 // or have a different interation position.
299 return FALSE;
300 };
301
302 // TODO: need a check for when in a dictionary region at different offsets.
303
304 if (that2.fData == fData ||
305 (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) {
306 // The two break iterators are using the same rules.
307 return TRUE;
308 }
309 return FALSE;
310 }
311
312 /**
313 * Compute a hash code for this BreakIterator
314 * @return A hash code
315 */
316 int32_t
317 RuleBasedBreakIterator::hashCode(void) const {
318 int32_t hash = 0;
319 if (fData != NULL) {
320 hash = fData->hashCode();
321 }
322 return hash;
323 }
324
325
326 void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
327 if (U_FAILURE(status)) {
328 return;
329 }
330 reset();
331 fText = utext_clone(fText, ut, FALSE, TRUE, &status);
332
333 // Set up a dummy CharacterIterator to be returned if anyone
334 // calls getText(). With input from UText, there is no reasonable
335 // way to return a characterIterator over the actual input text.
336 // Return one over an empty string instead - this is the closest
337 // we can come to signaling a failure.
338 // (GetText() is obsolete, this failure is sort of OK)
339 if (fDCharIter == NULL) {
340 static const UChar c = 0;
341 fDCharIter = new UCharCharacterIterator(&c, 0);
342 if (fDCharIter == NULL) {
343 status = U_MEMORY_ALLOCATION_ERROR;
344 return;
345 }
346 }
347
348 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
349 // existing fCharIter was adopted from the outside. Delete it now.
350 delete fCharIter;
351 }
352 fCharIter = fDCharIter;
353
354 this->first();
355 }
356
357
358 UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
359 UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status);
360 return result;
361 }
362
363
364
365 /**
366 * Returns the description used to create this iterator
367 */
368 const UnicodeString&
369 RuleBasedBreakIterator::getRules() const {
370 if (fData != NULL) {
371 return fData->getRuleSourceString();
372 } else {
373 static const UnicodeString *s;
374 if (s == NULL) {
375 // TODO: something more elegant here.
376 // perhaps API should return the string by value.
377 // Note: thread unsafe init & leak are semi-ok, better than
378 // what was before. Sould be cleaned up, though.
379 s = new UnicodeString;
380 }
381 return *s;
382 }
383 }
384
385 //=======================================================================
386 // BreakIterator overrides
387 //=======================================================================
388
389 /**
390 * Return a CharacterIterator over the text being analyzed.
391 */
392 CharacterIterator&
393 RuleBasedBreakIterator::getText() const {
394 return *fCharIter;
395 }
396
397 /**
398 * Set the iterator to analyze a new piece of text. This function resets
399 * the current iteration position to the beginning of the text.
400 * @param newText An iterator over the text to analyze.
401 */
402 void
403 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
404 // If we are holding a CharacterIterator adopted from a
405 // previous call to this function, delete it now.
406 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
407 delete fCharIter;
408 }
409
410 fCharIter = newText;
411 UErrorCode status = U_ZERO_ERROR;
412 reset();
413 if (newText==NULL || newText->startIndex() != 0) {
414 // startIndex !=0 wants to be an error, but there's no way to report it.
415 // Make the iterator text be an empty string.
416 fText = utext_openUChars(fText, NULL, 0, &status);
417 } else {
418 fText = utext_openCharacterIterator(fText, newText, &status);
419 }
420 this->first();
421 }
422
423 /**
424 * Set the iterator to analyze a new piece of text. This function resets
425 * the current iteration position to the beginning of the text.
426 * @param newText An iterator over the text to analyze.
427 */
428 void
429 RuleBasedBreakIterator::setText(const UnicodeString& newText) {
430 UErrorCode status = U_ZERO_ERROR;
431 reset();
432 fText = utext_openConstUnicodeString(fText, &newText, &status);
433
434 // Set up a character iterator on the string.
435 // Needed in case someone calls getText().
436 // Can not, unfortunately, do this lazily on the (probably never)
437 // call to getText(), because getText is const.
438 if (fSCharIter == NULL) {
439 fSCharIter = new StringCharacterIterator(newText);
440 } else {
441 fSCharIter->setText(newText);
442 }
443
444 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
445 // old fCharIter was adopted from the outside. Delete it.
446 delete fCharIter;
447 }
448 fCharIter = fSCharIter;
449
450 this->first();
451 }
452
453
454
455 /**
456 * Sets the current iteration position to the beginning of the text.
457 * @return The offset of the beginning of the text.
458 */
459 int32_t RuleBasedBreakIterator::first(void) {
460 reset();
461 fLastRuleStatusIndex = 0;
462 fLastStatusIndexValid = TRUE;
463 //if (fText == NULL)
464 // return BreakIterator::DONE;
465
466 utext_setNativeIndex(fText, 0);
467 return 0;
468 }
469
470 /**
471 * Sets the current iteration position to the end of the text.
472 * @return The text's past-the-end offset.
473 */
474 int32_t RuleBasedBreakIterator::last(void) {
475 reset();
476 if (fText == NULL) {
477 fLastRuleStatusIndex = 0;
478 fLastStatusIndexValid = TRUE;
479 return BreakIterator::DONE;
480 }
481
482 fLastStatusIndexValid = FALSE;
483 int32_t pos = (int32_t)utext_nativeLength(fText);
484 utext_setNativeIndex(fText, pos);
485 return pos;
486 }
487
488 /**
489 * Advances the iterator either forward or backward the specified number of steps.
490 * Negative values move backward, and positive values move forward. This is
491 * equivalent to repeatedly calling next() or previous().
492 * @param n The number of steps to move. The sign indicates the direction
493 * (negative is backwards, and positive is forwards).
494 * @return The character offset of the boundary position n boundaries away from
495 * the current one.
496 */
497 int32_t RuleBasedBreakIterator::next(int32_t n) {
498 int32_t result = current();
499 while (n > 0) {
500 result = next();
501 --n;
502 }
503 while (n < 0) {
504 result = previous();
505 ++n;
506 }
507 return result;
508 }
509
510 /**
511 * Advances the iterator to the next boundary position.
512 * @return The position of the first boundary after this one.
513 */
514 int32_t RuleBasedBreakIterator::next(void) {
515 // if we have cached break positions and we're still in the range
516 // covered by them, just move one step forward in the cache
517 if (fCachedBreakPositions != NULL) {
518 if (fPositionInCache < fNumCachedBreakPositions - 1) {
519 ++fPositionInCache;
520 int32_t pos = fCachedBreakPositions[fPositionInCache];
521 utext_setNativeIndex(fText, pos);
522 return pos;
523 }
524 else {
525 reset();
526 }
527 }
528
529 int32_t startPos = current();
530 int32_t result = handleNext(fData->fForwardTable);
531 if (fDictionaryCharCount > 0) {
532 result = checkDictionary(startPos, result, FALSE);
533 }
534 return result;
535 }
536
537 /**
538 * Advances the iterator backwards, to the last boundary preceding this one.
539 * @return The position of the last boundary position preceding this one.
540 */
541 int32_t RuleBasedBreakIterator::previous(void) {
542 int32_t result;
543 int32_t startPos;
544
545 // if we have cached break positions and we're still in the range
546 // covered by them, just move one step backward in the cache
547 if (fCachedBreakPositions != NULL) {
548 if (fPositionInCache > 0) {
549 --fPositionInCache;
550 // If we're at the beginning of the cache, need to reevaluate the
551 // rule status
552 if (fPositionInCache <= 0) {
553 fLastStatusIndexValid = FALSE;
554 }
555 int32_t pos = fCachedBreakPositions[fPositionInCache];
556 utext_setNativeIndex(fText, pos);
557 return pos;
558 }
559 else {
560 reset();
561 }
562 }
563
564 // if we're already sitting at the beginning of the text, return DONE
565 if (fText == NULL || (startPos = current()) == 0) {
566 fLastRuleStatusIndex = 0;
567 fLastStatusIndexValid = TRUE;
568 return BreakIterator::DONE;
569 }
570
571 if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) {
572 result = handlePrevious(fData->fReverseTable);
573 if (fDictionaryCharCount > 0) {
574 result = checkDictionary(result, startPos, TRUE);
575 }
576 return result;
577 }
578
579 // old rule syntax
580 // set things up. handlePrevious() will back us up to some valid
581 // break position before the current position (we back our internal
582 // iterator up one step to prevent handlePrevious() from returning
583 // the current position), but not necessarily the last one before
584
585 // where we started
586
587 int32_t start = current();
588
589 UTEXT_PREVIOUS32(fText);
590 int32_t lastResult = handlePrevious(fData->fReverseTable);
591 if (lastResult == UBRK_DONE) {
592 lastResult = 0;
593 utext_setNativeIndex(fText, 0);
594 }
595 result = lastResult;
596 int32_t lastTag = 0;
597 UBool breakTagValid = FALSE;
598
599 // iterate forward from the known break position until we pass our
600 // starting point. The last break position before the starting
601 // point is our return value
602
603 for (;;) {
604 result = next();
605 if (result == BreakIterator::DONE || result >= start) {
606 break;
607 }
608 lastResult = result;
609 lastTag = fLastRuleStatusIndex;
610 breakTagValid = TRUE;
611 }
612
613 // fLastBreakTag wants to have the value for section of text preceding
614 // the result position that we are to return (in lastResult.) If
615 // the backwards rules overshot and the above loop had to do two or more
616 // next()s to move up to the desired return position, we will have a valid
617 // tag value. But, if handlePrevious() took us to exactly the correct result positon,
618 // we wont have a tag value for that position, which is only set by handleNext().
619
620 // set the current iteration position to be the last break position
621 // before where we started, and then return that value
622 utext_setNativeIndex(fText, lastResult);
623 fLastRuleStatusIndex = lastTag; // for use by getRuleStatus()
624 fLastStatusIndexValid = breakTagValid;
625
626 // No need to check the dictionary; it will have been handled by
627 // next()
628
629 return lastResult;
630 }
631
632 /**
633 * Sets the iterator to refer to the first boundary position following
634 * the specified position.
635 * @offset The position from which to begin searching for a break position.
636 * @return The position of the first break after the current position.
637 */
638 int32_t RuleBasedBreakIterator::following(int32_t offset) {
639 // if we have cached break positions and offset is in the range
640 // covered by them, use them
641 // TODO: could use binary search
642 // TODO: what if offset is outside range, but break is not?
643 if (fCachedBreakPositions != NULL) {
644 if (offset >= fCachedBreakPositions[0]
645 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
646 fPositionInCache = 0;
647 // We are guaranteed not to leave the array due to range test above
648 while (offset >= fCachedBreakPositions[fPositionInCache]) {
649 ++fPositionInCache;
650 }
651 int32_t pos = fCachedBreakPositions[fPositionInCache];
652 utext_setNativeIndex(fText, pos);
653 return pos;
654 }
655 else {
656 reset();
657 }
658 }
659
660 // if the offset passed in is already past the end of the text,
661 // just return DONE; if it's before the beginning, return the
662 // text's starting offset
663 fLastRuleStatusIndex = 0;
664 fLastStatusIndexValid = TRUE;
665 if (fText == NULL || offset >= utext_nativeLength(fText)) {
666 last();
667 return next();
668 }
669 else if (offset < 0) {
670 return first();
671 }
672
673 // otherwise, set our internal iteration position (temporarily)
674 // to the position passed in. If this is the _beginning_ position,
675 // then we can just use next() to get our return value
676
677 int32_t result = 0;
678
679 if (fData->fSafeRevTable != NULL) {
680 // new rule syntax
681 utext_setNativeIndex(fText, offset);
682 // move forward one codepoint to prepare for moving back to a
683 // safe point.
684 // this handles offset being between a supplementary character
685 UTEXT_NEXT32(fText);
686 // handlePrevious will move most of the time to < 1 boundary away
687 handlePrevious(fData->fSafeRevTable);
688 int32_t result = next();
689 while (result <= offset) {
690 result = next();
691 }
692 return result;
693 }
694 if (fData->fSafeFwdTable != NULL) {
695 // backup plan if forward safe table is not available
696 utext_setNativeIndex(fText, offset);
697 UTEXT_PREVIOUS32(fText);
698 // handle next will give result >= offset
699 handleNext(fData->fSafeFwdTable);
700 // previous will give result 0 or 1 boundary away from offset,
701 // most of the time
702 // we have to
703 int32_t oldresult = previous();
704 while (oldresult > offset) {
705 int32_t result = previous();
706 if (result <= offset) {
707 return oldresult;
708 }
709 oldresult = result;
710 }
711 int32_t result = next();
712 if (result <= offset) {
713 return next();
714 }
715 return result;
716 }
717 // otherwise, we have to sync up first. Use handlePrevious() to back
718 // up to a known break position before the specified position (if
719 // we can determine that the specified position is a break position,
720 // we don't back up at all). This may or may not be the last break
721 // position at or before our starting position. Advance forward
722 // from here until we've passed the starting position. The position
723 // we stop on will be the first break position after the specified one.
724 // old rule syntax
725
726 utext_setNativeIndex(fText, offset);
727 if (offset==0 ||
728 offset==1 && utext_getNativeIndex(fText)==0) {
729 return next();
730 }
731 result = previous();
732
733 while (result != BreakIterator::DONE && result <= offset) {
734 result = next();
735 }
736
737 return result;
738 }
739
740 /**
741 * Sets the iterator to refer to the last boundary position before the
742 * specified position.
743 * @offset The position to begin searching for a break from.
744 * @return The position of the last boundary before the starting position.
745 */
746 int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
747 // if we have cached break positions and offset is in the range
748 // covered by them, use them
749 if (fCachedBreakPositions != NULL) {
750 // TODO: binary search?
751 // TODO: What if offset is outside range, but break is not?
752 if (offset > fCachedBreakPositions[0]
753 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
754 fPositionInCache = 0;
755 while (fPositionInCache < fNumCachedBreakPositions
756 && offset > fCachedBreakPositions[fPositionInCache])
757 ++fPositionInCache;
758 --fPositionInCache;
759 // If we're at the beginning of the cache, need to reevaluate the
760 // rule status
761 if (fPositionInCache <= 0) {
762 fLastStatusIndexValid = FALSE;
763 }
764 utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]);
765 return fCachedBreakPositions[fPositionInCache];
766 }
767 else {
768 reset();
769 }
770 }
771
772 // if the offset passed in is already past the end of the text,
773 // just return DONE; if it's before the beginning, return the
774 // text's starting offset
775 if (fText == NULL || offset > utext_nativeLength(fText)) {
776 // return BreakIterator::DONE;
777 return last();
778 }
779 else if (offset < 0) {
780 return first();
781 }
782
783 // if we start by updating the current iteration position to the
784 // position specified by the caller, we can just use previous()
785 // to carry out this operation
786
787 if (fData->fSafeFwdTable != NULL) {
788 // new rule syntax
789 utext_setNativeIndex(fText, offset);
790 int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
791 if (newOffset != offset) {
792 // Will come here if specified offset was not a code point boundary AND
793 // the underlying implmentation is using UText, which snaps any non-code-point-boundary
794 // indices to the containing code point.
795 // For breakitereator::preceding only, these non-code-point indices need to be moved
796 // up to refer to the following codepoint.
797 UTEXT_NEXT32(fText);
798 offset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
799 }
800
801 // TODO: (synwee) would it be better to just check for being in the middle of a surrogate pair,
802 // rather than adjusting the position unconditionally?
803 // (Change would interact with safe rules.)
804 // TODO: change RBBI behavior for off-boundary indices to match that of UText?
805 // affects only preceding(), seems cleaner, but is slightly different.
806 UTEXT_PREVIOUS32(fText);
807 handleNext(fData->fSafeFwdTable);
808 int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
809 while (result >= offset) {
810 result = previous();
811 }
812 return result;
813 }
814 if (fData->fSafeRevTable != NULL) {
815 // backup plan if forward safe table is not available
816 // TODO: check whether this path can be discarded
817 // It's probably OK to say that rules must supply both safe tables
818 // if they use safe tables at all. We have certainly never described
819 // to anyone how to work with just one safe table.
820 utext_setNativeIndex(fText, offset);
821 UTEXT_NEXT32(fText);
822
823 // handle previous will give result <= offset
824 handlePrevious(fData->fSafeRevTable);
825
826 // next will give result 0 or 1 boundary away from offset,
827 // most of the time
828 // we have to
829 int32_t oldresult = next();
830 while (oldresult < offset) {
831 int32_t result = next();
832 if (result >= offset) {
833 return oldresult;
834 }
835 oldresult = result;
836 }
837 int32_t result = previous();
838 if (result >= offset) {
839 return previous();
840 }
841 return result;
842 }
843
844 // old rule syntax
845 utext_setNativeIndex(fText, offset);
846 return previous();
847 }
848
849 /**
850 * Returns true if the specfied position is a boundary position. As a side
851 * effect, leaves the iterator pointing to the first boundary position at
852 * or after "offset".
853 * @param offset the offset to check.
854 * @return True if "offset" is a boundary position.
855 */
856 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
857 // the beginning index of the iterator is always a boundary position by definition
858 if (offset == 0) {
859 first(); // For side effects on current position, tag values.
860 return TRUE;
861 }
862
863 if (offset == (int32_t)utext_nativeLength(fText)) {
864 last(); // For side effects on current position, tag values.
865 return TRUE;
866 }
867
868 // out-of-range indexes are never boundary positions
869 if (offset < 0) {
870 first(); // For side effects on current position, tag values.
871 return FALSE;
872 }
873
874 if (offset > utext_nativeLength(fText)) {
875 last(); // For side effects on current position, tag values.
876 return FALSE;
877 }
878
879 // otherwise, we can use following() on the position before the specified
880 // one and return true if the position we get back is the one the user
881 // specified
882 utext_previous32From(fText, offset);
883 int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText);
884 UBool result = following(backOne) == offset;
885 return result;
886 }
887
888 /**
889 * Returns the current iteration position.
890 * @return The current iteration position.
891 */
892 int32_t RuleBasedBreakIterator::current(void) const {
893 int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
894 return pos;
895 }
896
897 //=======================================================================
898 // implementation
899 //=======================================================================
900
901 //
902 // RBBIRunMode - the state machine runs an extra iteration at the beginning and end
903 // of user text. A variable with this enum type keeps track of where we
904 // are. The state machine only fetches user input while in the RUN mode.
905 //
906 enum RBBIRunMode {
907 RBBI_START, // state machine processing is before first char of input
908 RBBI_RUN, // state machine processing is in the user text
909 RBBI_END // state machine processing is after end of user text.
910 };
911
912
913 //-----------------------------------------------------------------------------------
914 //
915 // handleNext(stateTable)
916 // This method is the actual implementation of the rbbi next() method.
917 // This method initializes the state machine to state 1
918 // and advances through the text character by character until we reach the end
919 // of the text or the state machine transitions to state 0. We update our return
920 // value every time the state machine passes through an accepting state.
921 //
922 //-----------------------------------------------------------------------------------
923 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
924 int32_t state;
925 int16_t category = 0;
926 RBBIRunMode mode;
927
928 RBBIStateTableRow *row;
929 UChar32 c;
930 int32_t lookaheadStatus = 0;
931 int32_t lookaheadTagIdx = 0;
932 int32_t result = 0;
933 int32_t initialPosition = 0;
934 int32_t lookaheadResult = 0;
935 UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
936 const char *tableData = statetable->fTableData;
937 uint32_t tableRowLen = statetable->fRowLen;
938
939 #ifdef RBBI_DEBUG
940 if (fTrace) {
941 RBBIDebugPuts("Handle Next pos char state category");
942 }
943 #endif
944
945 // No matter what, handleNext alway correctly sets the break tag value.
946 fLastStatusIndexValid = TRUE;
947 fLastRuleStatusIndex = 0;
948
949 // if we're already at the end of the text, return DONE.
950 initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
951 result = initialPosition;
952 c = UTEXT_NEXT32(fText);
953 if (fData == NULL || c==U_SENTINEL) {
954 return BreakIterator::DONE;
955 }
956
957 // Set the initial state for the state machine
958 state = START_STATE;
959 row = (RBBIStateTableRow *)
960 //(statetable->fTableData + (statetable->fRowLen * state));
961 (tableData + tableRowLen * state);
962
963
964 mode = RBBI_RUN;
965 if (statetable->fFlags & RBBI_BOF_REQUIRED) {
966 category = 2;
967 mode = RBBI_START;
968 }
969
970
971 // loop until we reach the end of the text or transition to state 0
972 //
973 for (;;) {
974 if (c == U_SENTINEL) {
975 // Reached end of input string.
976 if (mode == RBBI_END) {
977 // We have already run the loop one last time with the
978 // character set to the psueudo {eof} value. Now it is time
979 // to unconditionally bail out.
980 if (lookaheadResult > result) {
981 // We ran off the end of the string with a pending look-ahead match.
982 // Treat this as if the look-ahead condition had been met, and return
983 // the match at the / position from the look-ahead rule.
984 result = lookaheadResult;
985 fLastRuleStatusIndex = lookaheadTagIdx;
986 lookaheadStatus = 0;
987 }
988 break;
989 }
990 // Run the loop one last time with the fake end-of-input character category.
991 mode = RBBI_END;
992 category = 1;
993 }
994
995 //
996 // Get the char category. An incoming category of 1 or 2 means that
997 // we are preset for doing the beginning or end of input, and
998 // that we shouldn't get a category from an actual text input character.
999 //
1000 if (mode == RBBI_RUN) {
1001 // look up the current character's character category, which tells us
1002 // which column in the state table to look at.
1003 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
1004 // not the size of the character going in, which is a UChar32.
1005 //
1006 UTRIE_GET16(&fData->fTrie, c, category);
1007
1008 // Check the dictionary bit in the character's category.
1009 // Counter is only used by dictionary based iterators (subclasses).
1010 // Chars that need to be handled by a dictionary have a flag bit set
1011 // in their category values.
1012 //
1013 if ((category & 0x4000) != 0) {
1014 fDictionaryCharCount++;
1015 // And off the dictionary flag bit.
1016 category &= ~0x4000;
1017 }
1018 }
1019
1020 #ifdef RBBI_DEBUG
1021 if (fTrace) {
1022 RBBIDebugPrintf(" %4d ", utext_getNativeIndex(fText));
1023 if (0x20<=c && c<0x7f) {
1024 RBBIDebugPrintf("\"%c\" ", c);
1025 } else {
1026 RBBIDebugPrintf("%5x ", c);
1027 }
1028 RBBIDebugPrintf("%3d %3d\n", state, category);
1029 }
1030 #endif
1031
1032 // State Transition - move machine to its next state
1033 //
1034 state = row->fNextState[category];
1035 row = (RBBIStateTableRow *)
1036 // (statetable->fTableData + (statetable->fRowLen * state));
1037 (tableData + tableRowLen * state);
1038
1039
1040 if (row->fAccepting == -1) {
1041 // Match found, common case.
1042 if (mode != RBBI_START) {
1043 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1044 }
1045 fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values.
1046 }
1047
1048 if (row->fLookAhead != 0) {
1049 if (lookaheadStatus != 0
1050 && row->fAccepting == lookaheadStatus) {
1051 // Lookahead match is completed.
1052 result = lookaheadResult;
1053 fLastRuleStatusIndex = lookaheadTagIdx;
1054 lookaheadStatus = 0;
1055 // TODO: make a standalone hard break in a rule work.
1056 if (lookAheadHardBreak) {
1057 UTEXT_SETNATIVEINDEX(fText, result);
1058 return result;
1059 }
1060 // Look-ahead completed, but other rules may match further. Continue on
1061 // TODO: junk this feature? I don't think it's used anywhwere.
1062 goto continueOn;
1063 }
1064
1065 int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1066 lookaheadResult = r;
1067 lookaheadStatus = row->fLookAhead;
1068 lookaheadTagIdx = row->fTagIdx;
1069 goto continueOn;
1070 }
1071
1072
1073 if (row->fAccepting != 0) {
1074 // Because this is an accepting state, any in-progress look-ahead match
1075 // is no longer relavant. Clear out the pending lookahead status.
1076 lookaheadStatus = 0; // clear out any pending look-ahead match.
1077 }
1078
1079 continueOn:
1080 if (state == STOP_STATE) {
1081 // This is the normal exit from the lookup state machine.
1082 // We have advanced through the string until it is certain that no
1083 // longer match is possible, no matter what characters follow.
1084 break;
1085 }
1086
1087 // Advance to the next character.
1088 // If this is a beginning-of-input loop iteration, don't advance
1089 // the input position. The next iteration will be processing the
1090 // first real input character.
1091 if (mode == RBBI_RUN) {
1092 c = UTEXT_NEXT32(fText);
1093 } else {
1094 if (mode == RBBI_START) {
1095 mode = RBBI_RUN;
1096 }
1097 }
1098
1099
1100 }
1101
1102 // The state machine is done. Check whether it found a match...
1103
1104 // If the iterator failed to advance in the match engine, force it ahead by one.
1105 // (This really indicates a defect in the break rules. They should always match
1106 // at least one character.)
1107 if (result == initialPosition) {
1108 UTEXT_SETNATIVEINDEX(fText, initialPosition);
1109 UTEXT_NEXT32(fText);
1110 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1111 }
1112
1113 // Leave the iterator at our result position.
1114 UTEXT_SETNATIVEINDEX(fText, result);
1115 #ifdef RBBI_DEBUG
1116 if (fTrace) {
1117 RBBIDebugPrintf("result = %d\n\n", result);
1118 }
1119 #endif
1120 return result;
1121 }
1122
1123
1124
1125 //-----------------------------------------------------------------------------------
1126 //
1127 // handlePrevious()
1128 //
1129 // Iterate backwards, according to the logic of the reverse rules.
1130 // This version handles the exact style backwards rules.
1131 //
1132 // The logic of this function is very similar to handleNext(), above.
1133 //
1134 //-----------------------------------------------------------------------------------
1135 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) {
1136 int32_t state;
1137 int16_t category = 0;
1138 RBBIRunMode mode;
1139 RBBIStateTableRow *row;
1140 UChar32 c;
1141 int32_t lookaheadStatus = 0;
1142 int32_t result = 0;
1143 int32_t initialPosition = 0;
1144 int32_t lookaheadResult = 0;
1145 UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
1146
1147 #ifdef RBBI_DEBUG
1148 if (fTrace) {
1149 RBBIDebugPuts("Handle Previous pos char state category");
1150 }
1151 #endif
1152
1153 // handlePrevious() never gets the rule status.
1154 // Flag the status as invalid; if the user ever asks for status, we will need
1155 // to back up, then re-find the break position using handleNext(), which does
1156 // get the status value.
1157 fLastStatusIndexValid = FALSE;
1158 fLastRuleStatusIndex = 0;
1159
1160 // if we're already at the start of the text, return DONE.
1161 if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) {
1162 return BreakIterator::DONE;
1163 }
1164
1165 // Set up the starting char.
1166 initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1167 result = initialPosition;
1168 c = UTEXT_PREVIOUS32(fText);
1169
1170 // Set the initial state for the state machine
1171 state = START_STATE;
1172 row = (RBBIStateTableRow *)
1173 (statetable->fTableData + (statetable->fRowLen * state));
1174 category = 3;
1175 mode = RBBI_RUN;
1176 if (statetable->fFlags & RBBI_BOF_REQUIRED) {
1177 category = 2;
1178 mode = RBBI_START;
1179 }
1180
1181
1182 // loop until we reach the start of the text or transition to state 0
1183 //
1184 for (;;) {
1185 if (c == U_SENTINEL) {
1186 // Reached end of input string.
1187 if (mode == RBBI_END ||
1188 *(int32_t *)fData->fHeader->fFormatVersion == 1 ) {
1189 // We have already run the loop one last time with the
1190 // character set to the psueudo {eof} value. Now it is time
1191 // to unconditionally bail out.
1192 // (Or we have an old format binary rule file that does not support {eof}.)
1193 if (lookaheadResult < result) {
1194 // We ran off the end of the string with a pending look-ahead match.
1195 // Treat this as if the look-ahead condition had been met, and return
1196 // the match at the / position from the look-ahead rule.
1197 result = lookaheadResult;
1198 lookaheadStatus = 0;
1199 } else if (result == initialPosition) {
1200 // Ran off start, no match found.
1201 // move one index one (towards the start, since we are doing a previous())
1202 UTEXT_SETNATIVEINDEX(fText, initialPosition);
1203 UTEXT_PREVIOUS32(fText); // TODO: shouldn't be necessary. We're already at beginning. Check.
1204 }
1205 break;
1206 }
1207 // Run the loop one last time with the fake end-of-input character category.
1208 mode = RBBI_END;
1209 category = 1;
1210 }
1211
1212 //
1213 // Get the char category. An incoming category of 1 or 2 means that
1214 // we are preset for doing the beginning or end of input, and
1215 // that we shouldn't get a category from an actual text input character.
1216 //
1217 if (mode == RBBI_RUN) {
1218 // look up the current character's character category, which tells us
1219 // which column in the state table to look at.
1220 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
1221 // not the size of the character going in, which is a UChar32.
1222 //
1223 UTRIE_GET16(&fData->fTrie, c, category);
1224
1225 // Check the dictionary bit in the character's category.
1226 // Counter is only used by dictionary based iterators (subclasses).
1227 // Chars that need to be handled by a dictionary have a flag bit set
1228 // in their category values.
1229 //
1230 if ((category & 0x4000) != 0) {
1231 fDictionaryCharCount++;
1232 // And off the dictionary flag bit.
1233 category &= ~0x4000;
1234 }
1235 }
1236
1237 #ifdef RBBI_DEBUG
1238 if (fTrace) {
1239 RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(fText));
1240 if (0x20<=c && c<0x7f) {
1241 RBBIDebugPrintf("\"%c\" ", c);
1242 } else {
1243 RBBIDebugPrintf("%5x ", c);
1244 }
1245 RBBIDebugPrintf("%3d %3d\n", state, category);
1246 }
1247 #endif
1248
1249 // State Transition - move machine to its next state
1250 //
1251 state = row->fNextState[category];
1252 row = (RBBIStateTableRow *)
1253 (statetable->fTableData + (statetable->fRowLen * state));
1254
1255 if (row->fAccepting == -1) {
1256 // Match found, common case.
1257 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1258 }
1259
1260 if (row->fLookAhead != 0) {
1261 if (lookaheadStatus != 0
1262 && row->fAccepting == lookaheadStatus) {
1263 // Lookahead match is completed.
1264 result = lookaheadResult;
1265 lookaheadStatus = 0;
1266 // TODO: make a standalone hard break in a rule work.
1267 if (lookAheadHardBreak) {
1268 UTEXT_SETNATIVEINDEX(fText, result);
1269 return result;
1270 }
1271 // Look-ahead completed, but other rules may match further. Continue on
1272 // TODO: junk this feature? I don't think it's used anywhwere.
1273 goto continueOn;
1274 }
1275
1276 int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1277 lookaheadResult = r;
1278 lookaheadStatus = row->fLookAhead;
1279 goto continueOn;
1280 }
1281
1282
1283 if (row->fAccepting != 0) {
1284 // Because this is an accepting state, any in-progress look-ahead match
1285 // is no longer relavant. Clear out the pending lookahead status.
1286 lookaheadStatus = 0;
1287 }
1288
1289 continueOn:
1290 if (state == STOP_STATE) {
1291 // This is the normal exit from the lookup state machine.
1292 // We have advanced through the string until it is certain that no
1293 // longer match is possible, no matter what characters follow.
1294 break;
1295 }
1296
1297 // Move (backwards) to the next character to process.
1298 // If this is a beginning-of-input loop iteration, don't advance
1299 // the input position. The next iteration will be processing the
1300 // first real input character.
1301 if (mode == RBBI_RUN) {
1302 c = UTEXT_PREVIOUS32(fText);
1303 } else {
1304 if (mode == RBBI_START) {
1305 mode = RBBI_RUN;
1306 }
1307 }
1308 }
1309
1310 // The state machine is done. Check whether it found a match...
1311
1312 // If the iterator failed to advance in the match engine, force it ahead by one.
1313 // (This really indicates a defect in the break rules. They should always match
1314 // at least one character.)
1315 if (result == initialPosition) {
1316 UTEXT_SETNATIVEINDEX(fText, initialPosition);
1317 UTEXT_PREVIOUS32(fText);
1318 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1319 }
1320
1321 // Leave the iterator at our result position.
1322 UTEXT_SETNATIVEINDEX(fText, result);
1323 #ifdef RBBI_DEBUG
1324 if (fTrace) {
1325 RBBIDebugPrintf("result = %d\n\n", result);
1326 }
1327 #endif
1328 return result;
1329 }
1330
1331
1332 void
1333 RuleBasedBreakIterator::reset()
1334 {
1335 if (fCachedBreakPositions) {
1336 uprv_free(fCachedBreakPositions);
1337 }
1338 fCachedBreakPositions = NULL;
1339 fNumCachedBreakPositions = 0;
1340 fDictionaryCharCount = 0;
1341 fPositionInCache = 0;
1342 }
1343
1344
1345
1346 //-------------------------------------------------------------------------------
1347 //
1348 // getRuleStatus() Return the break rule tag associated with the current
1349 // iterator position. If the iterator arrived at its current
1350 // position by iterating forwards, the value will have been
1351 // cached by the handleNext() function.
1352 //
1353 // If no cached status value is available, the status is
1354 // found by doing a previous() followed by a next(), which
1355 // leaves the iterator where it started, and computes the
1356 // status while doing the next().
1357 //
1358 //-------------------------------------------------------------------------------
1359 void RuleBasedBreakIterator::makeRuleStatusValid() {
1360 if (fLastStatusIndexValid == FALSE) {
1361 // No cached status is available.
1362 if (fText == NULL || current() == 0) {
1363 // At start of text, or there is no text. Status is always zero.
1364 fLastRuleStatusIndex = 0;
1365 fLastStatusIndexValid = TRUE;
1366 } else {
1367 // Not at start of text. Find status the tedious way.
1368 int32_t pa = current();
1369 previous();
1370 if (fNumCachedBreakPositions > 0) {
1371 reset(); // Blow off the dictionary cache
1372 }
1373 int32_t pb = next();
1374 if (pa != pb) {
1375 // note: the if (pa != pb) test is here only to eliminate warnings for
1376 // unused local variables on gcc. Logically, it isn't needed.
1377 U_ASSERT(pa == pb);
1378 }
1379 }
1380 }
1381 U_ASSERT(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fData->fStatusMaxIdx);
1382 }
1383
1384
1385 int32_t RuleBasedBreakIterator::getRuleStatus() const {
1386 RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this;
1387 nonConstThis->makeRuleStatusValid();
1388
1389 // fLastRuleStatusIndex indexes to the start of the appropriate status record
1390 // (the number of status values.)
1391 // This function returns the last (largest) of the array of status values.
1392 int32_t idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex];
1393 int32_t tagVal = fData->fRuleStatusTable[idx];
1394
1395 return tagVal;
1396 }
1397
1398
1399
1400
1401 int32_t RuleBasedBreakIterator::getRuleStatusVec(
1402 int32_t *fillInVec, int32_t capacity, UErrorCode &status)
1403 {
1404 if (U_FAILURE(status)) {
1405 return 0;
1406 }
1407
1408 RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this;
1409 nonConstThis->makeRuleStatusValid();
1410 int32_t numVals = fData->fRuleStatusTable[fLastRuleStatusIndex];
1411 int32_t numValsToCopy = numVals;
1412 if (numVals > capacity) {
1413 status = U_BUFFER_OVERFLOW_ERROR;
1414 numValsToCopy = capacity;
1415 }
1416 int i;
1417 for (i=0; i<numValsToCopy; i++) {
1418 fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1];
1419 }
1420 return numVals;
1421 }
1422
1423
1424
1425 //-------------------------------------------------------------------------------
1426 //
1427 // getBinaryRules Access to the compiled form of the rules,
1428 // for use by build system tools that save the data
1429 // for standard iterator types.
1430 //
1431 //-------------------------------------------------------------------------------
1432 const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
1433 const uint8_t *retPtr = NULL;
1434 length = 0;
1435
1436 if (fData != NULL) {
1437 retPtr = (const uint8_t *)fData->fHeader;
1438 length = fData->fHeader->fLength;
1439 }
1440 return retPtr;
1441 }
1442
1443
1444
1445
1446 //-------------------------------------------------------------------------------
1447 //
1448 // BufferClone TODO: In my (Andy) opinion, this function should be deprecated.
1449 // Saving one heap allocation isn't worth the trouble.
1450 // Cloning shouldn't be done in tight loops, and
1451 // making the clone copy involves other heap operations anyway.
1452 // And the application code for correctly dealing with buffer
1453 // size problems and the eventual object destruction is ugly.
1454 //
1455 //-------------------------------------------------------------------------------
1456 BreakIterator * RuleBasedBreakIterator::createBufferClone(void *stackBuffer,
1457 int32_t &bufferSize,
1458 UErrorCode &status)
1459 {
1460 if (U_FAILURE(status)){
1461 return NULL;
1462 }
1463
1464 //
1465 // If user buffer size is zero this is a preflight operation to
1466 // obtain the needed buffer size, allowing for worst case misalignment.
1467 //
1468 if (bufferSize == 0) {
1469 bufferSize = sizeof(RuleBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0);
1470 return NULL;
1471 }
1472
1473
1474 //
1475 // Check the alignment and size of the user supplied buffer.
1476 // Allocate heap memory if the user supplied memory is insufficient.
1477 //
1478 char *buf = (char *)stackBuffer;
1479 uint32_t s = bufferSize;
1480
1481 if (stackBuffer == NULL) {
1482 s = 0; // Ignore size, force allocation if user didn't give us a buffer.
1483 }
1484 if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
1485 uint32_t offsetUp = (uint32_t)U_ALIGNMENT_OFFSET_UP(buf);
1486 s -= offsetUp;
1487 buf += offsetUp;
1488 }
1489 if (s < sizeof(RuleBasedBreakIterator)) {
1490 // Not enough room in the caller-supplied buffer.
1491 // Do a plain-vanilla heap based clone and return that, along with
1492 // a warning that the clone was allocated.
1493 RuleBasedBreakIterator *clonedBI = new RuleBasedBreakIterator(*this);
1494 if (clonedBI == 0) {
1495 status = U_MEMORY_ALLOCATION_ERROR;
1496 } else {
1497 status = U_SAFECLONE_ALLOCATED_WARNING;
1498 }
1499 return clonedBI;
1500 }
1501
1502 //
1503 // Clone the source BI into the caller-supplied buffer.
1504 // TODO: using an overloaded operator new to directly initialize the
1505 // copy in the user's buffer would be better, but it doesn't seem
1506 // to get along with namespaces. Investigate why.
1507 //
1508 // The memcpy is only safe with an empty (default constructed)
1509 // break iterator. Use on others can screw up reference counts
1510 // to data. memcpy-ing objects is not really a good idea...
1511 //
1512 RuleBasedBreakIterator localIter; // Empty break iterator, source for memcpy
1513 RuleBasedBreakIterator *clone = (RuleBasedBreakIterator *)buf;
1514 uprv_memcpy(clone, &localIter, sizeof(RuleBasedBreakIterator)); // init C++ gorp, BreakIterator base class part
1515 clone->init(); // Init RuleBasedBreakIterator part, (user default constructor)
1516 *clone = *this; // clone = the real BI we want.
1517 clone->fBufferClone = TRUE; // Flag to prevent deleting storage on close (From C code)
1518
1519 return clone;
1520 }
1521
1522
1523 //-------------------------------------------------------------------------------
1524 //
1525 // isDictionaryChar Return true if the category lookup for this char
1526 // indicates that it is in the set of dictionary lookup
1527 // chars.
1528 //
1529 // This function is intended for use by dictionary based
1530 // break iterators.
1531 //
1532 //-------------------------------------------------------------------------------
1533 /*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32 c) {
1534 if (fData == NULL) {
1535 return FALSE;
1536 }
1537 uint16_t category;
1538 UTRIE_GET16(&fData->fTrie, c, category);
1539 return (category & 0x4000) != 0;
1540 }*/
1541
1542
1543 //-------------------------------------------------------------------------------
1544 //
1545 // checkDictionary This function handles all processing of characters in
1546 // the "dictionary" set. It will determine the appropriate
1547 // course of action, and possibly set up a cache in the
1548 // process.
1549 //
1550 //-------------------------------------------------------------------------------
1551 int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
1552 int32_t endPos,
1553 UBool reverse) {
1554 // Reset the old break cache first.
1555 uint32_t dictionaryCount = fDictionaryCharCount;
1556 reset();
1557
1558 if (dictionaryCount <= 1 || (endPos - startPos) <= 1) {
1559 return (reverse ? startPos : endPos);
1560 }
1561
1562 // Starting from the starting point, scan towards the proposed result,
1563 // looking for the first dictionary character (which may be the one
1564 // we're on, if we're starting in the middle of a range).
1565 utext_setNativeIndex(fText, reverse ? endPos : startPos);
1566 if (reverse) {
1567 UTEXT_PREVIOUS32(fText);
1568 }
1569
1570 int32_t rangeStart = startPos;
1571 int32_t rangeEnd = endPos;
1572
1573 uint16_t category;
1574 int32_t current;
1575 UErrorCode status = U_ZERO_ERROR;
1576 UStack breaks(status);
1577 int32_t foundBreakCount = 0;
1578 UChar32 c = utext_current32(fText);
1579
1580 UTRIE_GET16(&fData->fTrie, c, category);
1581
1582 // Is the character we're starting on a dictionary character? If so, we
1583 // need to back up to include the entire run; otherwise the results of
1584 // the break algorithm will differ depending on where we start. Since
1585 // the result is cached and there is typically a non-dictionary break
1586 // within a small number of words, there should be little performance impact.
1587 if (category & 0x4000) {
1588 if (reverse) {
1589 do {
1590 utext_next32(fText); // TODO: recast to work directly with postincrement.
1591 c = utext_current32(fText);
1592 UTRIE_GET16(&fData->fTrie, c, category);
1593 } while (c != U_SENTINEL && (category & 0x4000));
1594 // Back up to the last dictionary character
1595 rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1596 if (c == U_SENTINEL) {
1597 // c = fText->last32();
1598 // TODO: why was this if needed?
1599 c = UTEXT_PREVIOUS32(fText);
1600 }
1601 else {
1602 c = UTEXT_PREVIOUS32(fText);
1603 }
1604 }
1605 else {
1606 do {
1607 c = UTEXT_PREVIOUS32(fText);
1608 UTRIE_GET16(&fData->fTrie, c, category);
1609 }
1610 while (c != U_SENTINEL && (category & 0x4000));
1611 // Back up to the last dictionary character
1612 if (c == U_SENTINEL) {
1613 // c = fText->first32();
1614 c = utext_current32(fText);
1615 }
1616 else {
1617 utext_next32(fText);
1618 c = utext_current32(fText);
1619 }
1620 rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);;
1621 }
1622 UTRIE_GET16(&fData->fTrie, c, category);
1623 }
1624
1625 // Loop through the text, looking for ranges of dictionary characters.
1626 // For each span, find the appropriate break engine, and ask it to find
1627 // any breaks within the span.
1628 // Note: we always do this in the forward direction, so that the break
1629 // cache is built in the right order.
1630 if (reverse) {
1631 utext_setNativeIndex(fText, rangeStart);
1632 c = utext_current32(fText);
1633 UTRIE_GET16(&fData->fTrie, c, category);
1634 }
1635 while(U_SUCCESS(status)) {
1636 while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
1637 utext_next32(fText); // TODO: tweak for post-increment operation
1638 c = utext_current32(fText);
1639 UTRIE_GET16(&fData->fTrie, c, category);
1640 }
1641 if (current >= rangeEnd) {
1642 break;
1643 }
1644
1645 // We now have a dictionary character. Get the appropriate language object
1646 // to deal with it.
1647 const LanguageBreakEngine *lbe = getLanguageBreakEngine(c);
1648
1649 // Ask the language object if there are any breaks. It will leave the text
1650 // pointer on the other side of its range, ready to search for the next one.
1651 if (lbe != NULL) {
1652 foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks);
1653 }
1654
1655 // Reload the loop variables for the next go-round
1656 c = utext_current32(fText);
1657 UTRIE_GET16(&fData->fTrie, c, category);
1658 }
1659
1660 // If we found breaks, build a new break cache. The first and last entries must
1661 // be the original starting and ending position.
1662 if (foundBreakCount > 0) {
1663 int32_t totalBreaks = foundBreakCount;
1664 if (startPos < breaks.elementAti(0)) {
1665 totalBreaks += 1;
1666 }
1667 if (endPos > breaks.peeki()) {
1668 totalBreaks += 1;
1669 }
1670 fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t));
1671 if (fCachedBreakPositions != NULL) {
1672 int32_t out = 0;
1673 fNumCachedBreakPositions = totalBreaks;
1674 if (startPos < breaks.elementAti(0)) {
1675 fCachedBreakPositions[out++] = startPos;
1676 }
1677 for (int32_t i = 0; i < foundBreakCount; ++i) {
1678 fCachedBreakPositions[out++] = breaks.elementAti(i);
1679 }
1680 if (endPos > fCachedBreakPositions[out-1]) {
1681 fCachedBreakPositions[out] = endPos;
1682 }
1683 // If there are breaks, then by definition, we are replacing the original
1684 // proposed break by one of the breaks we found. Use following() and
1685 // preceding() to do the work. They should never recurse in this case.
1686 if (reverse) {
1687 return preceding(endPos - 1);
1688 }
1689 else {
1690 return following(startPos);
1691 }
1692 }
1693 // If the allocation failed, just fall through to the "no breaks found" case.
1694 }
1695
1696 // If we get here, there were no language-based breaks. Set the text pointer
1697 // to the original proposed break.
1698 utext_setNativeIndex(fText, reverse ? startPos : endPos);
1699 return (reverse ? startPos : endPos);
1700 }
1701
1702 U_NAMESPACE_END
1703
1704 // defined in ucln_cmn.h
1705
1706 static U_NAMESPACE_QUALIFIER UStack *gLanguageBreakFactories = NULL;
1707
1708 /**
1709 * Release all static memory held by breakiterator.
1710 */
1711 U_CDECL_BEGIN
1712 static UBool U_CALLCONV breakiterator_cleanup_dict(void) {
1713 if (gLanguageBreakFactories) {
1714 delete gLanguageBreakFactories;
1715 gLanguageBreakFactories = NULL;
1716 }
1717 return TRUE;
1718 }
1719 U_CDECL_END
1720
1721 U_CDECL_BEGIN
1722 static void U_CALLCONV _deleteFactory(void *obj) {
1723 delete (U_NAMESPACE_QUALIFIER LanguageBreakFactory *) obj;
1724 }
1725 U_CDECL_END
1726 U_NAMESPACE_BEGIN
1727
1728 static const LanguageBreakEngine*
1729 getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
1730 {
1731 UBool needsInit;
1732 UErrorCode status = U_ZERO_ERROR;
1733 UMTX_CHECK(NULL, (UBool)(gLanguageBreakFactories == NULL), needsInit);
1734
1735 if (needsInit) {
1736 UStack *factories = new UStack(_deleteFactory, NULL, status);
1737 if (factories != NULL && U_SUCCESS(status)) {
1738 ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
1739 factories->push(builtIn, status);
1740 #ifdef U_LOCAL_SERVICE_HOOK
1741 LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
1742 if (extra != NULL) {
1743 factories->push(extra, status);
1744 }
1745 #endif
1746 }
1747 umtx_lock(NULL);
1748 if (gLanguageBreakFactories == NULL) {
1749 gLanguageBreakFactories = factories;
1750 factories = NULL;
1751 ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict);
1752 }
1753 umtx_unlock(NULL);
1754 delete factories;
1755 }
1756
1757 if (gLanguageBreakFactories == NULL) {
1758 return NULL;
1759 }
1760
1761 int32_t i = gLanguageBreakFactories->size();
1762 const LanguageBreakEngine *lbe = NULL;
1763 while (--i >= 0) {
1764 LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
1765 lbe = factory->getEngineFor(c, breakType);
1766 if (lbe != NULL) {
1767 break;
1768 }
1769 }
1770 return lbe;
1771 }
1772
1773
1774 //-------------------------------------------------------------------------------
1775 //
1776 // getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the
1777 // the characer c.
1778 //
1779 //-------------------------------------------------------------------------------
1780 const LanguageBreakEngine *
1781 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
1782 const LanguageBreakEngine *lbe = NULL;
1783 UErrorCode status = U_ZERO_ERROR;
1784
1785 if (fLanguageBreakEngines == NULL) {
1786 fLanguageBreakEngines = new UStack(status);
1787 if (fLanguageBreakEngines == NULL || U_FAILURE(status)) {
1788 delete fLanguageBreakEngines;
1789 fLanguageBreakEngines = 0;
1790 return NULL;
1791 }
1792 }
1793
1794 int32_t i = fLanguageBreakEngines->size();
1795 while (--i >= 0) {
1796 lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
1797 if (lbe->handles(c, fBreakType)) {
1798 return lbe;
1799 }
1800 }
1801
1802 // No existing dictionary took the character. See if a factory wants to
1803 // give us a new LanguageBreakEngine for this character.
1804 lbe = getLanguageBreakEngineFromFactory(c, fBreakType);
1805
1806 // If we got one, use it and push it on our stack.
1807 if (lbe != NULL) {
1808 fLanguageBreakEngines->push((void *)lbe, status);
1809 // Even if we can't remember it, we can keep looking it up, so
1810 // return it even if the push fails.
1811 return lbe;
1812 }
1813
1814 // No engine is forthcoming for this character. Add it to the
1815 // reject set. Create the reject break engine if needed.
1816 if (fUnhandledBreakEngine == NULL) {
1817 fUnhandledBreakEngine = new UnhandledEngine(status);
1818 if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) {
1819 status = U_MEMORY_ALLOCATION_ERROR;
1820 }
1821 // Put it last so that scripts for which we have an engine get tried
1822 // first.
1823 fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);
1824 // If we can't insert it, or creation failed, get rid of it
1825 if (U_FAILURE(status)) {
1826 delete fUnhandledBreakEngine;
1827 fUnhandledBreakEngine = 0;
1828 return NULL;
1829 }
1830 }
1831
1832 // Tell the reject engine about the character; at its discretion, it may
1833 // add more than just the one character.
1834 fUnhandledBreakEngine->handleCharacter(c, fBreakType);
1835
1836 return fUnhandledBreakEngine;
1837 }
1838
1839
1840
1841 /*int32_t RuleBasedBreakIterator::getBreakType() const {
1842 return fBreakType;
1843 }*/
1844
1845 void RuleBasedBreakIterator::setBreakType(int32_t type) {
1846 fBreakType = type;
1847 reset();
1848 }
1849
1850 U_NAMESPACE_END
1851
1852 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */