]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/rbbi.cpp
ICU-6.2.22.tar.gz
[apple/icu.git] / icuSources / common / rbbi.cpp
1 /*
2 ***************************************************************************
3 * Copyright (C) 1999-2004 International Business Machines Corporation *
4 * and others. All rights reserved. *
5 ***************************************************************************
6 */
7 //
8 // file: rbbi.c Contains the implementation of the rule based break iterator
9 // runtime engine and the API implementation for
10 // class RuleBasedBreakIterator
11 //
12
13 #include "unicode/utypes.h"
14
15 #if !UCONFIG_NO_BREAK_ITERATION
16
17 #include "unicode/rbbi.h"
18 #include "unicode/schriter.h"
19 #include "unicode/udata.h"
20 #include "unicode/uclean.h"
21 #include "rbbidata.h"
22 #include "rbbirb.h"
23 #include "cmemory.h"
24 #include "cstring.h"
25
26 #include "uassert.h"
27
28 U_NAMESPACE_BEGIN
29
30
31 static const int16_t START_STATE = 1; // The state number of the starting state
32 static const int16_t STOP_STATE = 0; // The state-transition value indicating "stop"
33
34
35 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
36
37
38 //=======================================================================
39 // constructors
40 //=======================================================================
41
42 /**
43 * Constructs a RuleBasedBreakIterator that uses the already-created
44 * tables object that is passed in as a parameter.
45 */
46 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
47 {
48 init();
49 fData = new RBBIDataWrapper(data, status); // status checked in constructor
50 if (U_FAILURE(status)) {return;}
51 if(fData == 0) {
52 status = U_MEMORY_ALLOCATION_ERROR;
53 return;
54 }
55 }
56
57 //-------------------------------------------------------------------------------
58 //
59 // Constructor from a UDataMemory handle to precompiled break rules
60 // stored in an ICU data file.
61 //
62 //-------------------------------------------------------------------------------
63 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
64 {
65 init();
66 fData = new RBBIDataWrapper(udm, status); // status checked in constructor
67 if (U_FAILURE(status)) {return;}
68 if(fData == 0) {
69 status = U_MEMORY_ALLOCATION_ERROR;
70 return;
71 }
72 }
73
74
75
76 //-------------------------------------------------------------------------------
77 //
78 // Constructor from a set of rules supplied as a string.
79 //
80 //-------------------------------------------------------------------------------
81 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules,
82 UParseError &parseError,
83 UErrorCode &status)
84 {
85 u_init(&status); // Just in case ICU is not yet initialized
86 init();
87 if (U_FAILURE(status)) {return;}
88 RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
89 RBBIRuleBuilder::createRuleBasedBreakIterator(rules, parseError, status);
90 // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that
91 // creates and returns a complete RBBI. From here, in a constructor, we
92 // can't just return the object created by the builder factory, hence
93 // the assignment of the factory created object to "this".
94 if (U_SUCCESS(status)) {
95 *this = *bi;
96 delete bi;
97 }
98 }
99
100
101 //-------------------------------------------------------------------------------
102 //
103 // Default Constructor. Create an empty shell that can be set up later.
104 // Used when creating a RuleBasedBreakIterator from a set
105 // of rules.
106 //-------------------------------------------------------------------------------
107 RuleBasedBreakIterator::RuleBasedBreakIterator() {
108 init();
109 }
110
111
112 //-------------------------------------------------------------------------------
113 //
114 // Copy constructor. Will produce a break iterator with the same behavior,
115 // and which iterates over the same text, as the one passed in.
116 //
117 //-------------------------------------------------------------------------------
118 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
119 : BreakIterator(other)
120 {
121 this->init();
122 *this = other;
123 }
124
125
126 /**
127 * Destructor
128 */
129 RuleBasedBreakIterator::~RuleBasedBreakIterator() {
130 delete fText;
131 fText = NULL;
132 if (fData != NULL) {
133 fData->removeReference();
134 fData = NULL;
135 }
136 }
137
138 /**
139 * Assignment operator. Sets this iterator to have the same behavior,
140 * and iterate over the same text, as the one passed in.
141 */
142 RuleBasedBreakIterator&
143 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
144 if (this == &that) {
145 return *this;
146 }
147 delete fText;
148 fText = NULL;
149 if (that.fText != NULL) {
150 fText = that.fText->clone();
151 }
152
153 if (fData != NULL) {
154 fData->removeReference();
155 fData = NULL;
156 }
157 if (that.fData != NULL) {
158 fData = that.fData->addReference();
159 }
160 fTrace = that.fTrace;
161
162 return *this;
163 }
164
165
166
167 //-----------------------------------------------------------------------------
168 //
169 // init() Shared initialization routine. Used by all the constructors.
170 // Initializes all fields, leaving the object in a consistent state.
171 //
172 //-----------------------------------------------------------------------------
173 UBool RuleBasedBreakIterator::fTrace = FALSE;
174 void RuleBasedBreakIterator::init() {
175
176 fText = NULL;
177 fData = NULL;
178 fLastRuleStatusIndex = 0;
179 fLastStatusIndexValid = TRUE;
180 fDictionaryCharCount = 0;
181
182 #ifdef RBBI_DEBUG
183 static UBool debugInitDone = FALSE;
184 if (debugInitDone == FALSE) {
185 char *debugEnv = getenv("U_RBBIDEBUG");
186 if (debugEnv && uprv_strstr(debugEnv, "trace")) {
187 fTrace = TRUE;
188 }
189 debugInitDone = TRUE;
190 }
191 #endif
192 }
193
194
195
196 //-----------------------------------------------------------------------------
197 //
198 // clone - Returns a newly-constructed RuleBasedBreakIterator with the same
199 // behavior, and iterating over the same text, as this one.
200 // Virtual function: does the right thing with subclasses.
201 //
202 //-----------------------------------------------------------------------------
203 BreakIterator*
204 RuleBasedBreakIterator::clone(void) const {
205 return new RuleBasedBreakIterator(*this);
206 }
207
208 /**
209 * Equality operator. Returns TRUE if both BreakIterators are of the
210 * same class, have the same behavior, and iterate over the same text.
211 */
212 UBool
213 RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
214 UBool r = FALSE;
215 if (that.getDynamicClassID() != getDynamicClassID()) {
216 return r;
217 }
218
219 const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;
220 if (fText == that2.fText ||
221 (fText != NULL && that2.fText != NULL && *that2.fText == *fText)) {
222 if (that2.fData == fData ||
223 (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) {
224 r = TRUE;
225 }
226 }
227 return r;
228 }
229
230 /**
231 * Compute a hash code for this BreakIterator
232 * @return A hash code
233 */
234 int32_t
235 RuleBasedBreakIterator::hashCode(void) const {
236 int32_t hash = 0;
237 if (fData != NULL) {
238 hash = fData->hashCode();
239 }
240 return hash;
241 }
242
243 /**
244 * Returns the description used to create this iterator
245 */
246 const UnicodeString&
247 RuleBasedBreakIterator::getRules() const {
248 if (fData != NULL) {
249 return fData->getRuleSourceString();
250 } else {
251 static const UnicodeString *s;
252 if (s == NULL) {
253 // TODO: something more elegant here.
254 // perhaps API should return the string by value.
255 // Note: thread unsafe init & leak are semi-ok, better than
256 // what was before. Sould be cleaned up, though.
257 s = new UnicodeString;
258 }
259 return *s;
260 }
261 }
262
263 //=======================================================================
264 // BreakIterator overrides
265 //=======================================================================
266
267 /**
268 * Return a CharacterIterator over the text being analyzed. This version
269 * of this method returns the actual CharacterIterator we're using internally.
270 * Changing the state of this iterator can have undefined consequences. If
271 * you need to change it, clone it first.
272 * @return An iterator over the text being analyzed.
273 */
274 const CharacterIterator&
275 RuleBasedBreakIterator::getText() const {
276 RuleBasedBreakIterator* nonConstThis = (RuleBasedBreakIterator*)this;
277
278 // The iterator is initialized pointing to no text at all, so if this
279 // function is called while we're in that state, we have to fudge an
280 // an iterator to return.
281 if (nonConstThis->fText == NULL) {
282 nonConstThis->fText = new StringCharacterIterator(UnicodeString());
283 }
284 return *nonConstThis->fText;
285 }
286
287 /**
288 * Set the iterator to analyze a new piece of text. This function resets
289 * the current iteration position to the beginning of the text.
290 * @param newText An iterator over the text to analyze.
291 */
292 void
293 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
294 reset();
295 delete fText;
296 fText = newText;
297 this->first();
298 }
299
300 /**
301 * Set the iterator to analyze a new piece of text. This function resets
302 * the current iteration position to the beginning of the text.
303 * @param newText An iterator over the text to analyze.
304 */
305 void
306 RuleBasedBreakIterator::setText(const UnicodeString& newText) {
307 reset();
308 if (fText != NULL && fText->getDynamicClassID()
309 == StringCharacterIterator::getStaticClassID()) {
310 ((StringCharacterIterator*)fText)->setText(newText);
311 }
312 else {
313 delete fText;
314 fText = new StringCharacterIterator(newText);
315 }
316 this->first();
317 }
318
319
320
321 /**
322 * Sets the current iteration position to the beginning of the text.
323 * (i.e., the CharacterIterator's starting offset).
324 * @return The offset of the beginning of the text.
325 */
326 int32_t RuleBasedBreakIterator::first(void) {
327 reset();
328 fLastRuleStatusIndex = 0;
329 fLastStatusIndexValid = TRUE;
330 if (fText == NULL)
331 return BreakIterator::DONE;
332
333 fText->first();
334 return fText->getIndex();
335 }
336
337 /**
338 * Sets the current iteration position to the end of the text.
339 * (i.e., the CharacterIterator's ending offset).
340 * @return The text's past-the-end offset.
341 */
342 int32_t RuleBasedBreakIterator::last(void) {
343 reset();
344 if (fText == NULL) {
345 fLastRuleStatusIndex = 0;
346 fLastStatusIndexValid = TRUE;
347 return BreakIterator::DONE;
348 }
349
350 // I'm not sure why, but t.last() returns the offset of the last character,
351 // rather than the past-the-end offset
352 //
353 // (It's so a loop like for(p=it.last(); p!=DONE; p=it.previous()) ...
354 // will work correctly.)
355
356
357 fLastStatusIndexValid = FALSE;
358 int32_t pos = fText->endIndex();
359 fText->setIndex(pos);
360
361 return pos;
362 }
363
364 /**
365 * Advances the iterator either forward or backward the specified number of steps.
366 * Negative values move backward, and positive values move forward. This is
367 * equivalent to repeatedly calling next() or previous().
368 * @param n The number of steps to move. The sign indicates the direction
369 * (negative is backwards, and positive is forwards).
370 * @return The character offset of the boundary position n boundaries away from
371 * the current one.
372 */
373 int32_t RuleBasedBreakIterator::next(int32_t n) {
374 int32_t result = current();
375 while (n > 0) {
376 result = handleNext();
377 --n;
378 }
379 while (n < 0) {
380 result = previous();
381 ++n;
382 }
383 return result;
384 }
385
386 /**
387 * Advances the iterator to the next boundary position.
388 * @return The position of the first boundary after this one.
389 */
390 int32_t RuleBasedBreakIterator::next(void) {
391 return handleNext();
392 }
393
394 /**
395 * Advances the iterator backwards, to the last boundary preceding this one.
396 * @return The position of the last boundary position preceding this one.
397 */
398 int32_t RuleBasedBreakIterator::previous(void) {
399 // if we're already sitting at the beginning of the text, return DONE
400 if (fText == NULL || current() == fText->startIndex()) {
401 fLastRuleStatusIndex = 0;
402 fLastStatusIndexValid = TRUE;
403 return BreakIterator::DONE;
404 }
405
406 if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) {
407 return handlePrevious(fData->fReverseTable);
408 }
409
410 // old rule syntax
411 // set things up. handlePrevious() will back us up to some valid
412 // break position before the current position (we back our internal
413 // iterator up one step to prevent handlePrevious() from returning
414 // the current position), but not necessarily the last one before
415 // where we started
416
417 int32_t start = current();
418
419 fText->previous32();
420 int32_t lastResult = handlePrevious();
421 int32_t result = lastResult;
422 int32_t lastTag = 0;
423 UBool breakTagValid = FALSE;
424
425 // iterate forward from the known break position until we pass our
426 // starting point. The last break position before the starting
427 // point is our return value
428
429 for (;;) {
430 result = handleNext();
431 if (result == BreakIterator::DONE || result >= start) {
432 break;
433 }
434 lastResult = result;
435 lastTag = fLastRuleStatusIndex;
436 breakTagValid = TRUE;
437 }
438
439 // fLastBreakTag wants to have the value for section of text preceding
440 // the result position that we are to return (in lastResult.) If
441 // the backwards rules overshot and the above loop had to do two or more
442 // handleNext()s to move up to the desired return position, we will have a valid
443 // tag value. But, if handlePrevious() took us to exactly the correct result positon,
444 // we wont have a tag value for that position, which is only set by handleNext().
445
446 // set the current iteration position to be the last break position
447 // before where we started, and then return that value
448 fText->setIndex(lastResult);
449 fLastRuleStatusIndex = lastTag; // for use by getRuleStatus()
450 fLastStatusIndexValid = breakTagValid;
451 return lastResult;
452 }
453
454 /**
455 * Sets the iterator to refer to the first boundary position following
456 * the specified position.
457 * @offset The position from which to begin searching for a break position.
458 * @return The position of the first break after the current position.
459 */
460 int32_t RuleBasedBreakIterator::following(int32_t offset) {
461 // if the offset passed in is already past the end of the text,
462 // just return DONE; if it's before the beginning, return the
463 // text's starting offset
464 fLastRuleStatusIndex = 0;
465 fLastStatusIndexValid = TRUE;
466 if (fText == NULL || offset >= fText->endIndex()) {
467 last();
468 return next();
469 }
470 else if (offset < fText->startIndex()) {
471 return first();
472 }
473
474 // otherwise, set our internal iteration position (temporarily)
475 // to the position passed in. If this is the _beginning_ position,
476 // then we can just use next() to get our return value
477
478 int32_t result = 0;
479
480 if (fData->fSafeRevTable != NULL) {
481 // new rule syntax
482 /// todo synwee
483 fText->setIndex(offset);
484 // move forward one codepoint to prepare for moving back to a
485 // safe point.
486 // this handles offset being between a supplementary character
487 fText->next32();
488 // handlePrevious will move most of the time to < 1 boundary away
489 handlePrevious(fData->fSafeRevTable);
490 int32_t result = next();
491 while (result <= offset) {
492 result = next();
493 }
494 return result;
495 }
496 if (fData->fSafeFwdTable != NULL) {
497 // backup plan if forward safe table is not available
498 fText->setIndex(offset);
499 fText->previous32();
500 // handle next will give result >= offset
501 handleNext(fData->fSafeFwdTable);
502 // previous will give result 0 or 1 boundary away from offset,
503 // most of the time
504 // we have to
505 int32_t oldresult = previous();
506 while (oldresult > offset) {
507 int32_t result = previous();
508 if (result <= offset) {
509 return oldresult;
510 }
511 oldresult = result;
512 }
513 int32_t result = next();
514 if (result <= offset) {
515 return next();
516 }
517 return result;
518 }
519 // otherwise, we have to sync up first. Use handlePrevious() to back
520 // us up to a known break position before the specified position (if
521 // we can determine that the specified position is a break position,
522 // we don't back up at all). This may or may not be the last break
523 // position at or before our starting position. Advance forward
524 // from here until we've passed the starting position. The position
525 // we stop on will be the first break position after the specified one.
526 // old rule syntax
527
528 fText->setIndex(offset);
529 if (offset == fText->startIndex()) {
530 return handleNext();
531 }
532 result = previous();
533
534 while (result != BreakIterator::DONE && result <= offset) {
535 result = next();
536 }
537
538 return result;
539 }
540
541 /**
542 * Sets the iterator to refer to the last boundary position before the
543 * specified position.
544 * @offset The position to begin searching for a break from.
545 * @return The position of the last boundary before the starting position.
546 */
547 int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
548 // if the offset passed in is already past the end of the text,
549 // just return DONE; if it's before the beginning, return the
550
551 // text's starting offset
552 if (fText == NULL || offset > fText->endIndex()) {
553 // return BreakIterator::DONE;
554 return last();
555 }
556 else if (offset < fText->startIndex()) {
557 return first();
558 }
559
560 // if we start by updating the current iteration position to the
561 // position specified by the caller, we can just use previous()
562 // to carry out this operation
563
564 if (fData->fSafeFwdTable != NULL) {
565 /// todo synwee
566 // new rule syntax
567 fText->setIndex(offset);
568 // move backwards one codepoint to prepare for moving forwards to a
569 // safe point.
570 // this handles offset being between a supplementary character
571 // TODO: would it be better to just check for being in the middle of a surrogate pair,
572 // rather than adjusting the position unconditionally?
573 // (Change would interact with safe rules.)
574 fText->previous32();
575 handleNext(fData->fSafeFwdTable);
576 int32_t result = fText->getIndex();
577 while (result >= offset) {
578 result = previous();
579 }
580 return result;
581 }
582 if (fData->fSafeRevTable != NULL) {
583 // backup plan if forward safe table is not available
584 fText->setIndex(offset);
585 fText->next32();
586 // handle previous will give result <= offset
587 handlePrevious(fData->fSafeRevTable);
588
589 // next will give result 0 or 1 boundary away from offset,
590 // most of the time
591 // we have to
592 int32_t oldresult = next();
593 while (oldresult < offset) {
594 int32_t result = next();
595 if (result >= offset) {
596 return oldresult;
597 }
598 oldresult = result;
599 }
600 int32_t result = previous();
601 if (result >= offset) {
602 return previous();
603 }
604 return result;
605 }
606
607 // old rule syntax
608 fText->setIndex(offset);
609 return previous();
610 }
611
612 /**
613 * Returns true if the specfied position is a boundary position. As a side
614 * effect, leaves the iterator pointing to the first boundary position at
615 * or after "offset".
616 * @param offset the offset to check.
617 * @return True if "offset" is a boundary position.
618 */
619 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
620 // the beginning index of the iterator is always a boundary position by definition
621 if (fText == NULL || offset == fText->startIndex()) {
622 first(); // For side effects on current position, tag values.
623 return TRUE;
624 }
625
626 if (offset == fText->endIndex()) {
627 last(); // For side effects on current position, tag values.
628 return TRUE;
629 }
630
631 // out-of-range indexes are never boundary positions
632 if (offset < fText->startIndex()) {
633 first(); // For side effects on current position, tag values.
634 return FALSE;
635 }
636
637 if (offset > fText->endIndex()) {
638 last(); // For side effects on current position, tag values.
639 return FALSE;
640 }
641
642 // otherwise, we can use following() on the position before the specified
643 // one and return true if the position we get back is the one the user
644 // specified
645 return following(offset - 1) == offset;
646 }
647
648 /**
649 * Returns the current iteration position.
650 * @return The current iteration position.
651 */
652 int32_t RuleBasedBreakIterator::current(void) const {
653 return (fText != NULL) ? fText->getIndex() : BreakIterator::DONE;
654 }
655
656 //=======================================================================
657 // implementation
658 //=======================================================================
659
660
661 //-----------------------------------------------------------------------------------
662 //
663 // handleNext()
664 // This method is the actual implementation of the next() method. All iteration
665 // vectors through here. This method initializes the state machine to state 1
666 // and advances through the text character by character until we reach the end
667 // of the text or the state machine transitions to state 0. We update our return
668 // value every time the state machine passes through an accepting state.
669 //
670 //-----------------------------------------------------------------------------------
671 int32_t RuleBasedBreakIterator::handleNext() {
672 return handleNext(fData->fForwardTable);
673 }
674
675 int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
676 if (fTrace) {
677 RBBIDebugPuts("Handle Next pos char state category");
678 }
679
680 // No matter what, handleNext alway correctly sets the break tag value.
681 fLastStatusIndexValid = TRUE;
682
683 // if we're already at the end of the text, return DONE.
684 if (fText == NULL || fData == NULL || fText->hasNext() == FALSE) {
685 fLastRuleStatusIndex = 0;
686 return BreakIterator::DONE;
687 }
688
689 int32_t initialPosition = fText->getIndex();
690 int32_t result = initialPosition;
691 int32_t lookaheadResult = 0;
692
693 // Initialize the state machine. Begin in state 1
694 int32_t state = START_STATE;
695 int16_t category;
696 UChar32 c = fText->current32();
697 RBBIStateTableRow *row;
698 int32_t lookaheadStatus = 0;
699 int32_t lookaheadTagIdx = 0;
700
701 fLastRuleStatusIndex = 0;
702
703 row = (RBBIStateTableRow *) // Point to starting row of state table.
704 (statetable->fTableData + (statetable->fRowLen * state));
705
706 // Character Category fetch for starting character.
707 // See comments on character category code within loop, below.
708 UTRIE_GET16(&fData->fTrie, c, category);
709 if ((category & 0x4000) != 0) {
710 fDictionaryCharCount++;
711 category &= ~0x4000;
712 }
713
714 // loop until we reach the end of the text or transition to state 0
715 for (;;) {
716 if (c == CharacterIterator::DONE && fText->hasNext()==FALSE) {
717 // Reached end of input string.
718 // Note: CharacterIterator::DONE is 0xffff, which is also a legal
719 // character value. Check for DONE first, because it's quicker,
720 // but also need to check fText->hasNext() to be certain.
721
722 if (lookaheadResult > result) {
723 // We ran off the end of the string with a pending look-ahead match.
724 // Treat this as if the look-ahead condition had been met, and return
725 // the match at the / position from the look-ahead rule.
726 result = lookaheadResult;
727 fLastRuleStatusIndex = lookaheadTagIdx;
728 lookaheadStatus = 0;
729 } else if (result == initialPosition) {
730 // Ran off end, no match found.
731 // move forward one
732 fText->setIndex(initialPosition);
733 fText->next32();
734 fText->getIndex();
735 }
736 break;
737 }
738 // look up the current character's character category, which tells us
739 // which column in the state table to look at.
740 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
741 // not the size of the character going in, which is a UChar32.
742 //
743 UTRIE_GET16(&fData->fTrie, c, category);
744
745 // Check the dictionary bit in the character's category.
746 // Counter is only used by dictionary based iterators (subclasses).
747 // Chars that need to be handled by a dictionary have a flag bit set
748 // in their category values.
749 //
750 if ((category & 0x4000) != 0) {
751 fDictionaryCharCount++;
752 // And off the dictionary flag bit.
753 category &= ~0x4000;
754 }
755
756 #ifdef RBBI_DEBUG
757 if (fTrace) {
758 RBBIDebugPrintf(" %4d ", fText->getIndex());
759 if (0x20<=c && c<0x7f) {
760 RBBIDebugPrintf("\"%c\" ", c);
761 } else {
762 RBBIDebugPrintf("%5x ", c);
763 }
764 RBBIDebugPrintf("%3d %3d\n", state, category);
765 }
766 #endif
767
768 // look up a state transition in the state table
769 state = row->fNextState[category];
770 row = (RBBIStateTableRow *)
771 (statetable->fTableData + (statetable->fRowLen * state));
772
773 // Get the next character. Doing it here positions the iterator
774 // to the correct position for recording matches in the code that
775 // follows.
776 c = fText->next32();
777
778 if (row->fAccepting == -1) {
779 // Match found, common case, could have lookahead so we move on to check it
780 result = fText->getIndex();
781 /// added
782 fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values.
783 }
784
785 if (row->fLookAhead != 0) {
786 if (lookaheadStatus != 0
787 && row->fAccepting == lookaheadStatus) {
788 // Lookahead match is completed. Set the result accordingly, but only
789 // if no other rule has matched further in the mean time.
790 result = lookaheadResult;
791 fLastRuleStatusIndex = lookaheadTagIdx;
792 lookaheadStatus = 0;
793 /// i think we have to back up to read the lookahead character again
794 /// fText->setIndex(lookaheadResult);
795 /// TODO: this is a simple hack since reverse rules only have simple
796 /// lookahead rules that we can definitely break out from.
797 /// we need to make the lookahead rules not chain eventually.
798 /// return result;
799 /// this is going to be the longest match again
800 goto continueOn;
801 }
802
803 int32_t r = fText->getIndex();
804 lookaheadResult = r;
805 lookaheadStatus = row->fLookAhead;
806 lookaheadTagIdx = row->fTagIdx;
807 goto continueOn;
808 }
809
810
811 if (row->fAccepting == 0) {
812 // No match, nothing of interest happening, common case.
813 goto continueOn;
814 }
815
816 lookaheadStatus = 0; // clear out any pending look-ahead matches.
817
818 continueOn:
819 if (state == STOP_STATE) {
820 // This is the normal exit from the lookup state machine.
821 // We have advanced through the string until it is certain that no
822 // longer match is possible, no matter what characters follow.
823 break;
824 }
825 }
826
827 // The state machine is done. Check whether it found a match...
828
829 // If the iterator failed to advance in the match engine, force it ahead by one.
830 // (This really indicates a defect in the break rules. They should always match
831 // at least one character.)
832 if (result == initialPosition) {
833 result = fText->setIndex(initialPosition);
834 fText ->next32();
835 result = fText->getIndex();
836 }
837
838 // Leave the iterator at our result position.
839 fText->setIndex(result);
840 if (fTrace) {
841 RBBIDebugPrintf("result = %d\n\n", result);
842 }
843 return result;
844 }
845
846
847 //----------------------------------------------------------------
848 //
849 // handlePrevious(void) This is the variant used with old style rules
850 // (Overshoot to a safe point, then move forward)
851 //
852 //----------------------------------------------------------------
853 int32_t RuleBasedBreakIterator::handlePrevious(void) {
854 if (fText == NULL || fData == NULL) {
855 return 0;
856 }
857 if (fData->fReverseTable == NULL) {
858 return fText->setToStart();
859 }
860
861 int32_t state = START_STATE;
862 int32_t category;
863 int32_t lastCategory = 0;
864 int32_t result = fText->getIndex();
865 int32_t lookaheadStatus = 0;
866 int32_t lookaheadResult = 0;
867 int32_t lookaheadTagIdx = 0;
868 UChar32 c = fText->current32();
869 RBBIStateTableRow *row;
870
871 row = (RBBIStateTableRow *)
872 (this->fData->fReverseTable->fTableData + (state * fData->fReverseTable->fRowLen));
873 UTRIE_GET16(&fData->fTrie, c, category);
874 if ((category & 0x4000) != 0) {
875 fDictionaryCharCount++;
876 category &= ~0x4000;
877 }
878
879 if (fTrace) {
880 RBBIDebugPuts("Handle Prev pos char state category");
881 }
882
883 // loop until we reach the beginning of the text or transition to state 0
884 for (;;) {
885 if (c == CharacterIterator::DONE && fText->hasPrevious()==FALSE) {
886 break;
887 }
888
889 // save the last character's category and look up the current
890 // character's category
891 lastCategory = category;
892 UTRIE_GET16(&fData->fTrie, c, category);
893
894 // Check the dictionary bit in the character's category.
895 // Counter is only used by dictionary based iterators.
896 //
897 if ((category & 0x4000) != 0) {
898 fDictionaryCharCount++;
899 category &= ~0x4000;
900 }
901
902 #ifdef RBBI_DEBUG
903 if (fTrace) {
904 RBBIDebugPrintf(" %4d ", fText->getIndex());
905 if (0x20<=c && c<0x7f) {
906 RBBIDebugPrintf("\"%c\" ", c);
907 } else {
908 RBBIDebugPrintf("%5x ", c);
909 }
910 RBBIDebugPrintf("%3d %3d\n", state, category);
911 }
912 #endif
913
914 // look up a state transition in the backwards state table
915 state = row->fNextState[category];
916 row = (RBBIStateTableRow *)
917 (this->fData->fReverseTable->fTableData + (state * fData->fReverseTable->fRowLen));
918
919 if (row->fAccepting == 0 && row->fLookAhead == 0) {
920 // No match, nothing of interest happening, common case.
921 goto continueOn;
922 }
923
924 if (row->fAccepting == -1) {
925 // Match found, common case, no lookahead involved.
926 result = fText->getIndex();
927 lookaheadStatus = 0; // clear out any pending look-ahead matches.
928 goto continueOn;
929 }
930
931 if (row->fAccepting == 0 && row->fLookAhead != 0) {
932 // Lookahead match point. Remember it, but only if no other rule
933 // has unconditionally matched to this point.
934 // TODO: handle case where there's a pending match from a different rule
935 // where lookaheadStatus != 0 && lookaheadStatus != row->fLookAhead.
936 int32_t r = fText->getIndex();
937 if (r > result) {
938 lookaheadResult = r;
939 lookaheadStatus = row->fLookAhead;
940 lookaheadTagIdx = row->fTagIdx;
941 }
942 goto continueOn;
943 }
944
945 if (row->fAccepting != 0 && row->fLookAhead != 0) {
946 // Lookahead match is completed. Set the result accordingly, but only
947 // if no other rule has matched further in the mean time.
948 if (lookaheadResult > result) {
949 U_ASSERT(row->fAccepting == lookaheadStatus); // TODO: handle this case
950 // of overlapping lookahead matches.
951 result = lookaheadResult;
952 fLastRuleStatusIndex = lookaheadTagIdx;
953 lookaheadStatus = 0;
954 }
955 goto continueOn;
956 }
957
958 continueOn:
959 if (state == STOP_STATE) {
960 break;
961 }
962
963 // then advance one character backwards
964 c = fText->previous32();
965 }
966
967 // Note: the result postion isn't what is returned to the user by previous(),
968 // but where the implementation of previous() turns around and
969 // starts iterating forward again.
970 if (c == CharacterIterator::DONE && fText->hasPrevious()==FALSE) {
971 result = fText->startIndex();
972 }
973 fText->setIndex(result);
974
975 return result;
976 }
977
978
979 //-----------------------------------------------------------------------------------
980 //
981 // handlePrevious()
982 //
983 // This method backs the iterator back up to a "safe position" in the text.
984 // This is a position that we know, without any context, may be any position
985 // not more than 2 breaks away. Occasionally, the position may be less than
986 // one break away.
987 // The various calling methods then iterate forward from this safe position to
988 // the appropriate position to return.
989 //
990 // The logic of this function is very similar to handleNext(), above.
991 //
992 //-----------------------------------------------------------------------------------
993 int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) {
994 if (fText == NULL || statetable == NULL) {
995 return 0;
996 }
997 // break tag is no longer valid after icu switched to exact backwards
998 // positioning.
999 fLastStatusIndexValid = FALSE;
1000 if (statetable == NULL) {
1001 return fText->setToStart();
1002 }
1003
1004 int32_t state = START_STATE;
1005 int32_t category;
1006 int32_t lastCategory = 0;
1007 UBool hasPassedStartText = !fText->hasPrevious();
1008 UChar32 c = fText->previous32();
1009 // previous character
1010 int32_t result = fText->getIndex();
1011 int32_t lookaheadStatus = 0;
1012 int32_t lookaheadResult = 0;
1013 int32_t lookaheadTagIdx = 0;
1014 UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
1015
1016 RBBIStateTableRow *row;
1017
1018 row = (RBBIStateTableRow *)
1019 (statetable->fTableData + (state * statetable->fRowLen));
1020 UTRIE_GET16(&fData->fTrie, c, category);
1021 if ((category & 0x4000) != 0) {
1022 fDictionaryCharCount++;
1023 category &= ~0x4000;
1024 }
1025
1026 if (fTrace) {
1027 RBBIDebugPuts("Handle Prev pos char state category");
1028 }
1029
1030 // loop until we reach the beginning of the text or transition to state 0
1031 for (;;) {
1032 // if (c == CharacterIterator::DONE && fText->hasPrevious()==FALSE) {
1033 if (hasPassedStartText) {
1034 // if we have already considered the start of the text
1035 if (row->fLookAhead != 0 && lookaheadResult == 0) {
1036 result = 0;
1037 }
1038 break;
1039 }
1040
1041 // save the last character's category and look up the current
1042 // character's category
1043 lastCategory = category;
1044 UTRIE_GET16(&fData->fTrie, c, category);
1045
1046 // Check the dictionary bit in the character's category.
1047 // Counter is only used by dictionary based iterators.
1048 //
1049 if ((category & 0x4000) != 0) {
1050 fDictionaryCharCount++;
1051 category &= ~0x4000;
1052 }
1053
1054 #ifdef RBBI_DEBUG
1055 if (fTrace) {
1056 RBBIDebugPrintf(" %4d ", fText->getIndex());
1057 if (0x20<=c && c<0x7f) {
1058 RBBIDebugPrintf("\"%c\" ", c);
1059 } else {
1060 RBBIDebugPrintf("%5x ", c);
1061 }
1062 RBBIDebugPrintf("%3d %3d\n", state, category);
1063 }
1064 #endif
1065
1066 // look up a state transition in the backwards state table
1067 state = row->fNextState[category];
1068 row = (RBBIStateTableRow *)
1069 (statetable->fTableData + (state * statetable->fRowLen));
1070
1071 if (row->fAccepting == -1) {
1072 // Match found, common case, could have lookahead so we move on to check it
1073 result = fText->getIndex();
1074 /// added
1075 fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) value.
1076 }
1077
1078 if (row->fLookAhead != 0) {
1079 if (lookaheadStatus != 0
1080 && row->fAccepting == lookaheadStatus) {
1081 // Lookahead match is completed. Set the result accordingly, but only
1082 // if no other rule has matched further in the mean time.
1083 result = lookaheadResult;
1084 fLastRuleStatusIndex = lookaheadTagIdx;
1085 lookaheadStatus = 0;
1086 /// i think we have to back up to read the lookahead character again
1087 /// fText->setIndex(lookaheadResult);
1088 /// TODO: this is a simple hack since reverse rules only have simple
1089 /// lookahead rules that we can definitely break out from.
1090 /// we need to make the lookahead rules not chain eventually.
1091 /// return result;
1092 /// this is going to be the longest match again
1093
1094 /// syn wee todo hard coded for line breaks stuff
1095 /// needs to provide a tag in rules to ensure a stop.
1096
1097 if (lookAheadHardBreak) {
1098 fText->setIndex(result);
1099 return result;
1100 }
1101 category = lastCategory;
1102 fText->setIndex(result);
1103
1104 goto continueOn;
1105 }
1106
1107 int32_t r = fText->getIndex();
1108 lookaheadResult = r;
1109 lookaheadStatus = row->fLookAhead;
1110 fLastRuleStatusIndex = row->fTagIdx;
1111 goto continueOn;
1112 }
1113
1114 // not lookahead
1115 if (row->fAccepting == 0) {
1116 // No match, nothing of interest happening, common case.
1117 goto continueOn;
1118 }
1119
1120 lookaheadStatus = 0; // clear out any pending look-ahead matches.
1121
1122 continueOn:
1123 if (state == STOP_STATE) {
1124 break;
1125 }
1126
1127 // then advance one character backwards
1128 hasPassedStartText = !fText->hasPrevious();
1129 c = fText->previous32();
1130 }
1131
1132 // Note: the result postion isn't what is returned to the user by previous(),
1133 // but where the implementation of previous() turns around and
1134 // starts iterating forward again.
1135 fText->setIndex(result);
1136
1137 return result;
1138 }
1139
1140
1141 void
1142 RuleBasedBreakIterator::reset()
1143 {
1144 // Base-class version of this function is a no-op.
1145 // Subclasses may override with their own reset behavior.
1146 }
1147
1148
1149
1150 //-------------------------------------------------------------------------------
1151 //
1152 // getRuleStatus() Return the break rule tag associated with the current
1153 // iterator position. If the iterator arrived at its current
1154 // position by iterating forwards, the value will have been
1155 // cached by the handleNext() function.
1156 //
1157 // If no cached status value is available, the status is
1158 // found by doing a previous() followed by a next(), which
1159 // leaves the iterator where it started, and computes the
1160 // status while doing the next().
1161 //
1162 //-------------------------------------------------------------------------------
1163 void RuleBasedBreakIterator::makeRuleStatusValid() {
1164 if (fLastStatusIndexValid == FALSE) {
1165 // No cached status is available.
1166 if (fText == NULL || current() == fText->startIndex()) {
1167 // At start of text, or there is no text. Status is always zero.
1168 fLastRuleStatusIndex = 0;
1169 fLastStatusIndexValid = TRUE;
1170 } else {
1171 // Not at start of text. Find status the tedious way.
1172 int32_t pa = current();
1173 previous();
1174 int32_t pb = next();
1175 if (pa != pb) {
1176 // note: the if (pa != pb) test is here only to eliminate warnings for
1177 // unused local variables on gcc. Logically, it isn't needed.
1178 U_ASSERT(pa == pb);
1179 }
1180 }
1181 }
1182 U_ASSERT(fLastStatusIndexValid == TRUE);
1183 U_ASSERT(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fData->fStatusMaxIdx);
1184 }
1185
1186
1187 int32_t RuleBasedBreakIterator::getRuleStatus() const {
1188 RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this;
1189 nonConstThis->makeRuleStatusValid();
1190
1191 // fLastRuleStatusIndex indexes to the start of the appropriate status record
1192 // (the number of status values.)
1193 // This function returns the last (largest) of the array of status values.
1194 int32_t idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex];
1195 int32_t tagVal = fData->fRuleStatusTable[idx];
1196
1197 return tagVal;
1198 }
1199
1200
1201
1202
1203 int32_t RuleBasedBreakIterator::getRuleStatusVec(
1204 int32_t *fillInVec, int32_t capacity, UErrorCode &status)
1205 {
1206 if (U_FAILURE(status)) {
1207 return 0;
1208 }
1209
1210 RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this;
1211 nonConstThis->makeRuleStatusValid();
1212 int32_t numVals = fData->fRuleStatusTable[fLastRuleStatusIndex];
1213 int32_t numValsToCopy = numVals;
1214 if (numVals > capacity) {
1215 status = U_BUFFER_OVERFLOW_ERROR;
1216 numValsToCopy = capacity;
1217 }
1218 int i;
1219 for (i=0; i<numValsToCopy; i++) {
1220 fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1];
1221 }
1222 return numVals;
1223 }
1224
1225
1226
1227 //-------------------------------------------------------------------------------
1228 //
1229 // getBinaryRules Access to the compiled form of the rules,
1230 // for use by build system tools that save the data
1231 // for standard iterator types.
1232 //
1233 //-------------------------------------------------------------------------------
1234 const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
1235 const uint8_t *retPtr = NULL;
1236 length = 0;
1237
1238 if (fData != NULL) {
1239 retPtr = (const uint8_t *)fData->fHeader;
1240 length = fData->fHeader->fLength;
1241 }
1242 return retPtr;
1243 }
1244
1245
1246
1247
1248 //-------------------------------------------------------------------------------
1249 //
1250 // BufferClone TODO: In my (Andy) opinion, this function should be deprecated.
1251 // Saving one heap allocation isn't worth the trouble.
1252 // Cloning shouldn't be done in tight loops, and
1253 // making the clone copy involves other heap operations anyway.
1254 // And the application code for correctly dealing with buffer
1255 // size problems and the eventual object destruction is ugly.
1256 //
1257 //-------------------------------------------------------------------------------
1258 BreakIterator * RuleBasedBreakIterator::createBufferClone(void *stackBuffer,
1259 int32_t &bufferSize,
1260 UErrorCode &status)
1261 {
1262 if (U_FAILURE(status)){
1263 return NULL;
1264 }
1265
1266 //
1267 // If user buffer size is zero this is a preflight operation to
1268 // obtain the needed buffer size, allowing for worst case misalignment.
1269 //
1270 if (bufferSize == 0) {
1271 bufferSize = sizeof(RuleBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0);
1272 return NULL;
1273 }
1274
1275
1276 //
1277 // Check the alignment and size of the user supplied buffer.
1278 // Allocate heap memory if the user supplied memory is insufficient.
1279 //
1280 char *buf = (char *)stackBuffer;
1281 uint32_t s = bufferSize;
1282
1283 if (stackBuffer == NULL) {
1284 s = 0; // Ignore size, force allocation if user didn't give us a buffer.
1285 }
1286 if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
1287 uint32_t offsetUp = (uint32_t)U_ALIGNMENT_OFFSET_UP(buf);
1288 s -= offsetUp;
1289 buf += offsetUp;
1290 }
1291 if (s < sizeof(RuleBasedBreakIterator)) {
1292 buf = (char *) new RuleBasedBreakIterator;
1293 if (buf == 0) {
1294 status = U_MEMORY_ALLOCATION_ERROR;
1295 return NULL;
1296 }
1297 status = U_SAFECLONE_ALLOCATED_WARNING;
1298 }
1299
1300 //
1301 // Clone the object.
1302 // TODO: using an overloaded operator new to directly initialize the
1303 // copy in the user's buffer would be better, but it doesn't seem
1304 // to get along with namespaces. Investigate why.
1305 //
1306 // The memcpy is only safe with an empty (default constructed)
1307 // break iterator. Use on others can screw up reference counts
1308 // to data. memcpy-ing objects is not really a good idea...
1309 //
1310 RuleBasedBreakIterator localIter; // Empty break iterator, source for memcpy
1311 RuleBasedBreakIterator *clone = (RuleBasedBreakIterator *)buf;
1312 uprv_memcpy(clone, &localIter, sizeof(RuleBasedBreakIterator)); // clone = empty, but initialized, iterator.
1313 *clone = *this; // clone = the real one we want.
1314 if (status != U_SAFECLONE_ALLOCATED_WARNING) {
1315 clone->fBufferClone = TRUE;
1316 }
1317
1318 return clone;
1319 }
1320
1321
1322
1323 //-------------------------------------------------------------------------------
1324 //
1325 // isDictionaryChar Return true if the category lookup for this char
1326 // indicates that it is in the set of dictionary lookup
1327 // chars.
1328 //
1329 // This function is intended for use by dictionary based
1330 // break iterators.
1331 //
1332 //-------------------------------------------------------------------------------
1333 UBool RuleBasedBreakIterator::isDictionaryChar(UChar32 c) {
1334 if (fData == NULL) {
1335 return FALSE;
1336 }
1337 uint16_t category;
1338 UTRIE_GET16(&fData->fTrie, c, category);
1339 return (category & 0x4000) != 0;
1340 }
1341
1342
1343
1344 U_NAMESPACE_END
1345
1346 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */