]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/rbbi.cpp
ICU-8.11.4.tar.gz
[apple/icu.git] / icuSources / common / rbbi.cpp
CommitLineData
b75a7d8f
A
1/*
2***************************************************************************
73c04bcf 3* Copyright (C) 1999-2006 International Business Machines Corporation *
b75a7d8f
A
4* and others. All rights reserved. *
5***************************************************************************
6*/
374ca955
A
7//
8// file: rbbi.c Contains the implementation of the rule based break iterator
9// runtime engine and the API implementation for
10// class RuleBasedBreakIterator
11//
b75a7d8f
A
12
13#include "unicode/utypes.h"
14
15#if !UCONFIG_NO_BREAK_ITERATION
16
17#include "unicode/rbbi.h"
18#include "unicode/schriter.h"
73c04bcf 19#include "unicode/uchriter.h"
b75a7d8f 20#include "unicode/udata.h"
374ca955 21#include "unicode/uclean.h"
b75a7d8f
A
22#include "rbbidata.h"
23#include "rbbirb.h"
24#include "cmemory.h"
25#include "cstring.h"
73c04bcf
A
26#include "mutex.h"
27#include "ucln_cmn.h"
28#include "brkeng.h"
b75a7d8f
A
29
30#include "uassert.h"
73c04bcf
A
31#include "uvector.h"
32
33// if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included.
34#if U_LOCAL_SERVICE_HOOK
35#include "localsvc.h"
36#endif
37
38#ifdef RBBI_DEBUG
39static UBool fTrace = FALSE;
40#endif
b75a7d8f
A
41
42U_NAMESPACE_BEGIN
43
44
45static const int16_t START_STATE = 1; // The state number of the starting state
46static const int16_t STOP_STATE = 0; // The state-transition value indicating "stop"
47
374ca955
A
48
49UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
b75a7d8f
A
50
51
52//=======================================================================
53// constructors
54//=======================================================================
55
56/**
57 * Constructs a RuleBasedBreakIterator that uses the already-created
58 * tables object that is passed in as a parameter.
59 */
60RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
61{
62 init();
374ca955 63 fData = new RBBIDataWrapper(data, status); // status checked in constructor
b75a7d8f 64 if (U_FAILURE(status)) {return;}
b75a7d8f
A
65 if(fData == 0) {
66 status = U_MEMORY_ALLOCATION_ERROR;
67 return;
68 }
69}
70
71//-------------------------------------------------------------------------------
72//
73// Constructor from a UDataMemory handle to precompiled break rules
74// stored in an ICU data file.
75//
76//-------------------------------------------------------------------------------
77RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
78{
79 init();
374ca955 80 fData = new RBBIDataWrapper(udm, status); // status checked in constructor
b75a7d8f 81 if (U_FAILURE(status)) {return;}
b75a7d8f
A
82 if(fData == 0) {
83 status = U_MEMORY_ALLOCATION_ERROR;
84 return;
85 }
86}
87
88
89
90//-------------------------------------------------------------------------------
91//
92// Constructor from a set of rules supplied as a string.
93//
94//-------------------------------------------------------------------------------
95RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules,
96 UParseError &parseError,
97 UErrorCode &status)
98{
99 init();
100 if (U_FAILURE(status)) {return;}
101 RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
102 RBBIRuleBuilder::createRuleBasedBreakIterator(rules, parseError, status);
103 // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that
104 // creates and returns a complete RBBI. From here, in a constructor, we
105 // can't just return the object created by the builder factory, hence
106 // the assignment of the factory created object to "this".
107 if (U_SUCCESS(status)) {
108 *this = *bi;
109 delete bi;
110 }
111}
112
113
114//-------------------------------------------------------------------------------
115//
116// Default Constructor. Create an empty shell that can be set up later.
117// Used when creating a RuleBasedBreakIterator from a set
118// of rules.
119//-------------------------------------------------------------------------------
120RuleBasedBreakIterator::RuleBasedBreakIterator() {
121 init();
122}
123
124
125//-------------------------------------------------------------------------------
126//
127// Copy constructor. Will produce a break iterator with the same behavior,
128// and which iterates over the same text, as the one passed in.
129//
130//-------------------------------------------------------------------------------
131RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
132: BreakIterator(other)
133{
134 this->init();
135 *this = other;
136}
137
138
139/**
140 * Destructor
141 */
142RuleBasedBreakIterator::~RuleBasedBreakIterator() {
73c04bcf
A
143 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
144 // fCharIter was adopted from the outside.
145 delete fCharIter;
146 }
147 fCharIter = NULL;
148 delete fSCharIter;
149 fCharIter = NULL;
150 delete fDCharIter;
151 fDCharIter = NULL;
152
153 utext_close(fText);
154
b75a7d8f
A
155 if (fData != NULL) {
156 fData->removeReference();
157 fData = NULL;
158 }
73c04bcf
A
159 if (fCachedBreakPositions) {
160 uprv_free(fCachedBreakPositions);
161 fCachedBreakPositions = NULL;
162 }
163 if (fLanguageBreakEngines) {
164 delete fLanguageBreakEngines;
165 fLanguageBreakEngines = NULL;
166 }
167 if (fUnhandledBreakEngine) {
168 delete fUnhandledBreakEngine;
169 fUnhandledBreakEngine = NULL;
170 }
b75a7d8f
A
171}
172
173/**
174 * Assignment operator. Sets this iterator to have the same behavior,
175 * and iterate over the same text, as the one passed in.
176 */
177RuleBasedBreakIterator&
178RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
179 if (this == &that) {
180 return *this;
181 }
73c04bcf
A
182 reset(); // Delete break cache information
183 fBreakType = that.fBreakType;
184 if (fLanguageBreakEngines != NULL) {
185 delete fLanguageBreakEngines;
186 fLanguageBreakEngines = NULL; // Just rebuild for now
187 }
188 // TODO: clone fLanguageBreakEngines from "that"
189 UErrorCode status = U_ZERO_ERROR;
190 fText = utext_clone(fText, that.fText, FALSE, TRUE, &status);
191
192 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
193 delete fCharIter;
194 }
195 fCharIter = NULL;
196
197 if (that.fCharIter != NULL ) {
198 // This is a little bit tricky - it will intially appear that
199 // this->fCharIter is adopted, even if that->fCharIter was
200 // not adopted. That's ok.
201 fCharIter = that.fCharIter->clone();
b75a7d8f
A
202 }
203
204 if (fData != NULL) {
205 fData->removeReference();
206 fData = NULL;
207 }
208 if (that.fData != NULL) {
209 fData = that.fData->addReference();
210 }
b75a7d8f
A
211
212 return *this;
213}
214
215
216
217//-----------------------------------------------------------------------------
218//
219// init() Shared initialization routine. Used by all the constructors.
220// Initializes all fields, leaving the object in a consistent state.
221//
222//-----------------------------------------------------------------------------
b75a7d8f 223void RuleBasedBreakIterator::init() {
73c04bcf
A
224 UErrorCode status = U_ZERO_ERROR;
225 fBufferClone = FALSE;
226 fText = utext_openUChars(NULL, NULL, 0, &status);
227 fCharIter = NULL;
228 fSCharIter = NULL;
229 fDCharIter = NULL;
374ca955
A
230 fData = NULL;
231 fLastRuleStatusIndex = 0;
232 fLastStatusIndexValid = TRUE;
233 fDictionaryCharCount = 0;
73c04bcf
A
234 fBreakType = -1;
235
236 fCachedBreakPositions = NULL;
237 fLanguageBreakEngines = NULL;
238 fUnhandledBreakEngine = NULL;
239 fNumCachedBreakPositions = 0;
240 fPositionInCache = 0;
b75a7d8f
A
241
242#ifdef RBBI_DEBUG
243 static UBool debugInitDone = FALSE;
244 if (debugInitDone == FALSE) {
245 char *debugEnv = getenv("U_RBBIDEBUG");
246 if (debugEnv && uprv_strstr(debugEnv, "trace")) {
247 fTrace = TRUE;
248 }
249 debugInitDone = TRUE;
250 }
251#endif
252}
253
254
255
256//-----------------------------------------------------------------------------
257//
258// clone - Returns a newly-constructed RuleBasedBreakIterator with the same
259// behavior, and iterating over the same text, as this one.
260// Virtual function: does the right thing with subclasses.
261//
262//-----------------------------------------------------------------------------
263BreakIterator*
264RuleBasedBreakIterator::clone(void) const {
265 return new RuleBasedBreakIterator(*this);
266}
267
268/**
269 * Equality operator. Returns TRUE if both BreakIterators are of the
270 * same class, have the same behavior, and iterate over the same text.
271 */
272UBool
273RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
b75a7d8f 274 if (that.getDynamicClassID() != getDynamicClassID()) {
73c04bcf 275 return FALSE;
b75a7d8f
A
276 }
277
278 const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;
73c04bcf
A
279
280 if (!utext_equals(fText, that2.fText)) {
281 // The two break iterators are operating on different text,
282 // or have a different interation position.
283 return FALSE;
284 };
285
286 // TODO: need a check for when in a dictionary region at different offsets.
287
288 if (that2.fData == fData ||
289 (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) {
290 // The two break iterators are using the same rules.
291 return TRUE;
b75a7d8f 292 }
73c04bcf 293 return FALSE;
b75a7d8f
A
294}
295
296/**
297 * Compute a hash code for this BreakIterator
298 * @return A hash code
299 */
300int32_t
301RuleBasedBreakIterator::hashCode(void) const {
302 int32_t hash = 0;
303 if (fData != NULL) {
304 hash = fData->hashCode();
305 }
306 return hash;
307}
308
73c04bcf
A
309
310void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
311 if (U_FAILURE(status)) {
312 return;
313 }
314 reset();
315 fText = utext_clone(fText, ut, FALSE, TRUE, &status);
316
317 // Set up a dummy CharacterIterator to be returned if anyone
318 // calls getText(). With input from UText, there is no reasonable
319 // way to return a characterIterator over the actual input text.
320 // Return one over an empty string instead - this is the closest
321 // we can come to signaling a failure.
322 // (GetText() is obsolete, this failure is sort of OK)
323 if (fDCharIter == NULL) {
324 static UChar c = 0;
325 fDCharIter = new UCharCharacterIterator(&c, 0);
326 }
327
328 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
329 // existing fCharIter was adopted from the outside. Delete it now.
330 delete fCharIter;
331 }
332 fCharIter = fDCharIter;
333
334 this->first();
335}
336
337
338UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
339 UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status);
340 return result;
341}
342
343
344
b75a7d8f
A
345/**
346 * Returns the description used to create this iterator
347 */
348const UnicodeString&
349RuleBasedBreakIterator::getRules() const {
350 if (fData != NULL) {
351 return fData->getRuleSourceString();
352 } else {
353 static const UnicodeString *s;
354 if (s == NULL) {
355 // TODO: something more elegant here.
356 // perhaps API should return the string by value.
357 // Note: thread unsafe init & leak are semi-ok, better than
358 // what was before. Sould be cleaned up, though.
359 s = new UnicodeString;
360 }
361 return *s;
362 }
363}
364
365//=======================================================================
366// BreakIterator overrides
367//=======================================================================
368
369/**
73c04bcf 370 * Return a CharacterIterator over the text being analyzed.
b75a7d8f 371 */
73c04bcf 372CharacterIterator&
b75a7d8f 373RuleBasedBreakIterator::getText() const {
73c04bcf 374 return *fCharIter;
b75a7d8f
A
375}
376
377/**
378 * Set the iterator to analyze a new piece of text. This function resets
379 * the current iteration position to the beginning of the text.
380 * @param newText An iterator over the text to analyze.
381 */
382void
383RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
73c04bcf
A
384 // If we are holding a CharacterIterator adopted from a
385 // previous call to this function, delete it now.
386 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
387 delete fCharIter;
388 }
389
390 fCharIter = newText;
391 UErrorCode status = U_ZERO_ERROR;
b75a7d8f 392 reset();
73c04bcf
A
393 if (newText==NULL || newText->startIndex() != 0) {
394 // startIndex !=0 wants to be an error, but there's no way to report it.
395 // Make the iterator text be an empty string.
396 fText = utext_openUChars(fText, NULL, 0, &status);
397 } else {
398 fText = utext_openCharacterIterator(fText, newText, &status);
399 }
b75a7d8f
A
400 this->first();
401}
402
403/**
404 * Set the iterator to analyze a new piece of text. This function resets
405 * the current iteration position to the beginning of the text.
406 * @param newText An iterator over the text to analyze.
407 */
408void
409RuleBasedBreakIterator::setText(const UnicodeString& newText) {
73c04bcf 410 UErrorCode status = U_ZERO_ERROR;
b75a7d8f 411 reset();
73c04bcf
A
412 fText = utext_openConstUnicodeString(fText, &newText, &status);
413
414 // Set up a character iterator on the string.
415 // Needed in case someone calls getText().
416 // Can not, unfortunately, do this lazily on the (probably never)
417 // call to getText(), because getText is const.
418 if (fSCharIter == NULL) {
419 fSCharIter = new StringCharacterIterator(newText);
420 } else {
421 fSCharIter->setText(newText);
b75a7d8f 422 }
73c04bcf
A
423
424 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
425 // old fCharIter was adopted from the outside. Delete it.
426 delete fCharIter;
b75a7d8f 427 }
73c04bcf
A
428 fCharIter = fSCharIter;
429
b75a7d8f
A
430 this->first();
431}
432
433
434
435/**
436 * Sets the current iteration position to the beginning of the text.
b75a7d8f
A
437 * @return The offset of the beginning of the text.
438 */
439int32_t RuleBasedBreakIterator::first(void) {
440 reset();
374ca955
A
441 fLastRuleStatusIndex = 0;
442 fLastStatusIndexValid = TRUE;
73c04bcf
A
443 //if (fText == NULL)
444 // return BreakIterator::DONE;
b75a7d8f 445
73c04bcf
A
446 utext_setNativeIndex(fText, 0);
447 return 0;
b75a7d8f
A
448}
449
450/**
451 * Sets the current iteration position to the end of the text.
b75a7d8f
A
452 * @return The text's past-the-end offset.
453 */
454int32_t RuleBasedBreakIterator::last(void) {
455 reset();
456 if (fText == NULL) {
374ca955
A
457 fLastRuleStatusIndex = 0;
458 fLastStatusIndexValid = TRUE;
b75a7d8f
A
459 return BreakIterator::DONE;
460 }
461
374ca955 462 fLastStatusIndexValid = FALSE;
73c04bcf
A
463 int32_t pos = (int32_t)utext_nativeLength(fText);
464 utext_setNativeIndex(fText, pos);
b75a7d8f
A
465 return pos;
466}
467
468/**
469 * Advances the iterator either forward or backward the specified number of steps.
470 * Negative values move backward, and positive values move forward. This is
471 * equivalent to repeatedly calling next() or previous().
472 * @param n The number of steps to move. The sign indicates the direction
473 * (negative is backwards, and positive is forwards).
474 * @return The character offset of the boundary position n boundaries away from
475 * the current one.
476 */
477int32_t RuleBasedBreakIterator::next(int32_t n) {
478 int32_t result = current();
479 while (n > 0) {
73c04bcf 480 result = next();
b75a7d8f
A
481 --n;
482 }
483 while (n < 0) {
484 result = previous();
485 ++n;
486 }
487 return result;
488}
489
490/**
491 * Advances the iterator to the next boundary position.
492 * @return The position of the first boundary after this one.
493 */
494int32_t RuleBasedBreakIterator::next(void) {
73c04bcf
A
495 // if we have cached break positions and we're still in the range
496 // covered by them, just move one step forward in the cache
497 if (fCachedBreakPositions != NULL) {
498 if (fPositionInCache < fNumCachedBreakPositions - 1) {
499 ++fPositionInCache;
500 int32_t pos = fCachedBreakPositions[fPositionInCache];
501 utext_setNativeIndex(fText, pos);
502 return pos;
503 }
504 else {
505 reset();
506 }
507 }
508
509 int32_t startPos = current();
510 int32_t result = handleNext(fData->fForwardTable);
511 if (fDictionaryCharCount > 0) {
512 result = checkDictionary(startPos, result, FALSE);
513 }
514 return result;
b75a7d8f
A
515}
516
517/**
518 * Advances the iterator backwards, to the last boundary preceding this one.
519 * @return The position of the last boundary position preceding this one.
520 */
521int32_t RuleBasedBreakIterator::previous(void) {
73c04bcf
A
522 int32_t result;
523 int32_t startPos;
524
525 // if we have cached break positions and we're still in the range
526 // covered by them, just move one step backward in the cache
527 if (fCachedBreakPositions != NULL) {
528 if (fPositionInCache > 0) {
529 --fPositionInCache;
530 // If we're at the beginning of the cache, need to reevaluate the
531 // rule status
532 if (fPositionInCache <= 0) {
533 fLastStatusIndexValid = FALSE;
534 }
535 int32_t pos = fCachedBreakPositions[fPositionInCache];
536 utext_setNativeIndex(fText, pos);
537 return pos;
538 }
539 else {
540 reset();
541 }
542 }
543
b75a7d8f 544 // if we're already sitting at the beginning of the text, return DONE
73c04bcf 545 if (fText == NULL || (startPos = current()) == 0) {
374ca955
A
546 fLastRuleStatusIndex = 0;
547 fLastStatusIndexValid = TRUE;
b75a7d8f
A
548 return BreakIterator::DONE;
549 }
550
374ca955 551 if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) {
73c04bcf
A
552 result = handlePrevious(fData->fReverseTable);
553 if (fDictionaryCharCount > 0) {
554 result = checkDictionary(result, startPos, TRUE);
555 }
556 return result;
374ca955
A
557 }
558
559 // old rule syntax
b75a7d8f
A
560 // set things up. handlePrevious() will back us up to some valid
561 // break position before the current position (we back our internal
562 // iterator up one step to prevent handlePrevious() from returning
563 // the current position), but not necessarily the last one before
73c04bcf 564
b75a7d8f 565 // where we started
374ca955 566
b75a7d8f 567 int32_t start = current();
374ca955 568
73c04bcf
A
569 UTEXT_PREVIOUS32(fText);
570 int32_t lastResult = handlePrevious(fData->fReverseTable);
571 if (lastResult == UBRK_DONE) {
572 lastResult = 0;
573 utext_setNativeIndex(fText, 0);
574 }
575 result = lastResult;
b75a7d8f
A
576 int32_t lastTag = 0;
577 UBool breakTagValid = FALSE;
578
579 // iterate forward from the known break position until we pass our
580 // starting point. The last break position before the starting
581 // point is our return value
374ca955 582
b75a7d8f 583 for (;;) {
73c04bcf 584 result = next();
b75a7d8f
A
585 if (result == BreakIterator::DONE || result >= start) {
586 break;
587 }
588 lastResult = result;
374ca955 589 lastTag = fLastRuleStatusIndex;
b75a7d8f
A
590 breakTagValid = TRUE;
591 }
592
593 // fLastBreakTag wants to have the value for section of text preceding
594 // the result position that we are to return (in lastResult.) If
595 // the backwards rules overshot and the above loop had to do two or more
73c04bcf 596 // next()s to move up to the desired return position, we will have a valid
374ca955
A
597 // tag value. But, if handlePrevious() took us to exactly the correct result positon,
598 // we wont have a tag value for that position, which is only set by handleNext().
b75a7d8f
A
599
600 // set the current iteration position to be the last break position
601 // before where we started, and then return that value
73c04bcf 602 utext_setNativeIndex(fText, lastResult);
374ca955
A
603 fLastRuleStatusIndex = lastTag; // for use by getRuleStatus()
604 fLastStatusIndexValid = breakTagValid;
73c04bcf
A
605
606 // No need to check the dictionary; it will have been handled by
607 // next()
608
b75a7d8f
A
609 return lastResult;
610}
611
b75a7d8f
A
612/**
613 * Sets the iterator to refer to the first boundary position following
614 * the specified position.
615 * @offset The position from which to begin searching for a break position.
616 * @return The position of the first break after the current position.
617 */
618int32_t RuleBasedBreakIterator::following(int32_t offset) {
73c04bcf
A
619 // if we have cached break positions and offset is in the range
620 // covered by them, use them
621 // TODO: could use binary search
622 // TODO: what if offset is outside range, but break is not?
623 if (fCachedBreakPositions != NULL) {
624 if (offset >= fCachedBreakPositions[0]
625 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
626 fPositionInCache = 0;
627 // We are guaranteed not to leave the array due to range test above
628 while (offset >= fCachedBreakPositions[fPositionInCache]) {
629 ++fPositionInCache;
630 }
631 int32_t pos = fCachedBreakPositions[fPositionInCache];
632 utext_setNativeIndex(fText, pos);
633 return pos;
634 }
635 else {
636 reset();
637 }
638 }
639
b75a7d8f
A
640 // if the offset passed in is already past the end of the text,
641 // just return DONE; if it's before the beginning, return the
642 // text's starting offset
374ca955
A
643 fLastRuleStatusIndex = 0;
644 fLastStatusIndexValid = TRUE;
73c04bcf 645 if (fText == NULL || offset >= utext_nativeLength(fText)) {
b75a7d8f
A
646 last();
647 return next();
648 }
73c04bcf 649 else if (offset < 0) {
b75a7d8f
A
650 return first();
651 }
652
653 // otherwise, set our internal iteration position (temporarily)
654 // to the position passed in. If this is the _beginning_ position,
655 // then we can just use next() to get our return value
b75a7d8f 656
374ca955
A
657 int32_t result = 0;
658
659 if (fData->fSafeRevTable != NULL) {
660 // new rule syntax
73c04bcf 661 utext_setNativeIndex(fText, offset);
374ca955
A
662 // move forward one codepoint to prepare for moving back to a
663 // safe point.
664 // this handles offset being between a supplementary character
73c04bcf 665 UTEXT_NEXT32(fText);
374ca955
A
666 // handlePrevious will move most of the time to < 1 boundary away
667 handlePrevious(fData->fSafeRevTable);
668 int32_t result = next();
669 while (result <= offset) {
670 result = next();
671 }
672 return result;
673 }
674 if (fData->fSafeFwdTable != NULL) {
675 // backup plan if forward safe table is not available
73c04bcf
A
676 utext_setNativeIndex(fText, offset);
677 UTEXT_PREVIOUS32(fText);
374ca955
A
678 // handle next will give result >= offset
679 handleNext(fData->fSafeFwdTable);
680 // previous will give result 0 or 1 boundary away from offset,
681 // most of the time
682 // we have to
683 int32_t oldresult = previous();
684 while (oldresult > offset) {
685 int32_t result = previous();
686 if (result <= offset) {
687 return oldresult;
688 }
689 oldresult = result;
690 }
691 int32_t result = next();
692 if (result <= offset) {
693 return next();
694 }
695 return result;
696 }
b75a7d8f 697 // otherwise, we have to sync up first. Use handlePrevious() to back
73c04bcf 698 // up to a known break position before the specified position (if
b75a7d8f
A
699 // we can determine that the specified position is a break position,
700 // we don't back up at all). This may or may not be the last break
701 // position at or before our starting position. Advance forward
702 // from here until we've passed the starting position. The position
703 // we stop on will be the first break position after the specified one.
374ca955
A
704 // old rule syntax
705
73c04bcf
A
706 utext_setNativeIndex(fText, offset);
707 if (offset==0 ||
708 offset==1 && utext_getNativeIndex(fText)==0) {
709 return next();
374ca955
A
710 }
711 result = previous();
b75a7d8f 712
b75a7d8f
A
713 while (result != BreakIterator::DONE && result <= offset) {
714 result = next();
715 }
716
717 return result;
718}
719
720/**
721 * Sets the iterator to refer to the last boundary position before the
722 * specified position.
723 * @offset The position to begin searching for a break from.
724 * @return The position of the last boundary before the starting position.
725 */
726int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
73c04bcf
A
727 // if we have cached break positions and offset is in the range
728 // covered by them, use them
729 if (fCachedBreakPositions != NULL) {
730 // TODO: binary search?
731 // TODO: What if offset is outside range, but break is not?
732 if (offset > fCachedBreakPositions[0]
733 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
734 fPositionInCache = 0;
735 while (fPositionInCache < fNumCachedBreakPositions
736 && offset > fCachedBreakPositions[fPositionInCache])
737 ++fPositionInCache;
738 --fPositionInCache;
739 // If we're at the beginning of the cache, need to reevaluate the
740 // rule status
741 if (fPositionInCache <= 0) {
742 fLastStatusIndexValid = FALSE;
743 }
744 utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]);
745 return fCachedBreakPositions[fPositionInCache];
746 }
747 else {
748 reset();
749 }
750 }
751
b75a7d8f
A
752 // if the offset passed in is already past the end of the text,
753 // just return DONE; if it's before the beginning, return the
754 // text's starting offset
73c04bcf 755 if (fText == NULL || offset > utext_nativeLength(fText)) {
b75a7d8f
A
756 // return BreakIterator::DONE;
757 return last();
758 }
73c04bcf 759 else if (offset < 0) {
b75a7d8f
A
760 return first();
761 }
762
763 // if we start by updating the current iteration position to the
764 // position specified by the caller, we can just use previous()
765 // to carry out this operation
374ca955
A
766
767 if (fData->fSafeFwdTable != NULL) {
374ca955 768 // new rule syntax
73c04bcf
A
769 utext_setNativeIndex(fText, offset);
770 int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
771 if (newOffset != offset) {
772 // Will come here if specified offset was not a code point boundary AND
773 // the underlying implmentation is using UText, which snaps any non-code-point-boundary
774 // indices to the containing code point.
775 // For breakitereator::preceding only, these non-code-point indices need to be moved
776 // up to refer to the following codepoint.
777 UTEXT_NEXT32(fText);
778 offset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
779 }
780
781 // TODO: (synwee) would it be better to just check for being in the middle of a surrogate pair,
374ca955
A
782 // rather than adjusting the position unconditionally?
783 // (Change would interact with safe rules.)
73c04bcf
A
784 // TODO: change RBBI behavior for off-boundary indices to match that of UText?
785 // affects only preceding(), seems cleaner, but is slightly different.
786 UTEXT_PREVIOUS32(fText);
374ca955 787 handleNext(fData->fSafeFwdTable);
73c04bcf 788 int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
374ca955
A
789 while (result >= offset) {
790 result = previous();
791 }
792 return result;
793 }
794 if (fData->fSafeRevTable != NULL) {
795 // backup plan if forward safe table is not available
73c04bcf
A
796 // TODO: check whether this path can be discarded
797 // It's probably OK to say that rules must supply both safe tables
798 // if they use safe tables at all. We have certainly never described
799 // to anyone how to work with just one safe table.
800 utext_setNativeIndex(fText, offset);
801 UTEXT_NEXT32(fText);
802
374ca955
A
803 // handle previous will give result <= offset
804 handlePrevious(fData->fSafeRevTable);
805
806 // next will give result 0 or 1 boundary away from offset,
807 // most of the time
808 // we have to
809 int32_t oldresult = next();
810 while (oldresult < offset) {
811 int32_t result = next();
812 if (result >= offset) {
813 return oldresult;
814 }
815 oldresult = result;
816 }
817 int32_t result = previous();
818 if (result >= offset) {
819 return previous();
820 }
821 return result;
822 }
823
824 // old rule syntax
73c04bcf 825 utext_setNativeIndex(fText, offset);
b75a7d8f
A
826 return previous();
827}
828
829/**
830 * Returns true if the specfied position is a boundary position. As a side
831 * effect, leaves the iterator pointing to the first boundary position at
832 * or after "offset".
833 * @param offset the offset to check.
834 * @return True if "offset" is a boundary position.
835 */
836UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
837 // the beginning index of the iterator is always a boundary position by definition
73c04bcf 838 if (offset == 0) {
b75a7d8f
A
839 first(); // For side effects on current position, tag values.
840 return TRUE;
841 }
842
73c04bcf 843 if (offset == (int32_t)utext_nativeLength(fText)) {
374ca955
A
844 last(); // For side effects on current position, tag values.
845 return TRUE;
846 }
847
b75a7d8f 848 // out-of-range indexes are never boundary positions
73c04bcf 849 if (offset < 0) {
b75a7d8f
A
850 first(); // For side effects on current position, tag values.
851 return FALSE;
852 }
853
73c04bcf 854 if (offset > utext_nativeLength(fText)) {
b75a7d8f
A
855 last(); // For side effects on current position, tag values.
856 return FALSE;
857 }
858
859 // otherwise, we can use following() on the position before the specified
860 // one and return true if the position we get back is the one the user
861 // specified
73c04bcf
A
862 utext_previous32From(fText, offset);
863 int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText);
864 UBool result = following(backOne) == offset;
865 return result;
b75a7d8f
A
866}
867
868/**
869 * Returns the current iteration position.
870 * @return The current iteration position.
871 */
872int32_t RuleBasedBreakIterator::current(void) const {
73c04bcf
A
873 int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
874 return pos;
b75a7d8f 875}
73c04bcf 876
b75a7d8f
A
877//=======================================================================
878// implementation
879//=======================================================================
880
73c04bcf
A
881//
882// RBBIRunMode - the state machine runs an extra iteration at the beginning and end
883// of user text. A variable with this enum type keeps track of where we
884// are. The state machine only fetches user input while in the RUN mode.
885//
886enum RBBIRunMode {
887 RBBI_START, // state machine processing is before first char of input
888 RBBI_RUN, // state machine processing is in the user text
889 RBBI_END // state machine processing is after end of user text.
890};
891
b75a7d8f
A
892
893//-----------------------------------------------------------------------------------
894//
73c04bcf
A
895// handleNext(stateTable)
896// This method is the actual implementation of the rbbi next() method.
897// This method initializes the state machine to state 1
b75a7d8f
A
898// and advances through the text character by character until we reach the end
899// of the text or the state machine transitions to state 0. We update our return
374ca955 900// value every time the state machine passes through an accepting state.
b75a7d8f
A
901//
902//-----------------------------------------------------------------------------------
374ca955 903int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
73c04bcf
A
904 int32_t state;
905 int16_t category = 0;
906 RBBIRunMode mode;
907
908 RBBIStateTableRow *row;
909 UChar32 c;
910 int32_t lookaheadStatus = 0;
911 int32_t lookaheadTagIdx = 0;
912 int32_t result = 0;
913 int32_t initialPosition = 0;
914 int32_t lookaheadResult = 0;
915 UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
916 const char *tableData = statetable->fTableData;
917 uint32_t tableRowLen = statetable->fRowLen;
918
919 #ifdef RBBI_DEBUG
920 if (fTrace) {
921 RBBIDebugPuts("Handle Next pos char state category");
922 }
923 #endif
b75a7d8f
A
924
925 // No matter what, handleNext alway correctly sets the break tag value.
374ca955 926 fLastStatusIndexValid = TRUE;
73c04bcf 927 fLastRuleStatusIndex = 0;
b75a7d8f
A
928
929 // if we're already at the end of the text, return DONE.
73c04bcf
A
930 initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
931 result = initialPosition;
932 c = UTEXT_NEXT32(fText);
933 if (fData == NULL || c==U_SENTINEL) {
b75a7d8f
A
934 return BreakIterator::DONE;
935 }
936
73c04bcf
A
937 // Set the initial state for the state machine
938 state = START_STATE;
939 row = (RBBIStateTableRow *)
940 //(statetable->fTableData + (statetable->fRowLen * state));
941 (tableData + tableRowLen * state);
942
943
944 mode = RBBI_RUN;
945 if (statetable->fFlags & RBBI_BOF_REQUIRED) {
946 category = 2;
947 mode = RBBI_START;
948 }
b75a7d8f 949
b75a7d8f
A
950
951 // loop until we reach the end of the text or transition to state 0
73c04bcf 952 //
b75a7d8f 953 for (;;) {
73c04bcf 954 if (c == U_SENTINEL) {
374ca955 955 // Reached end of input string.
73c04bcf
A
956 if (mode == RBBI_END) {
957 // We have already run the loop one last time with the
958 // character set to the psueudo {eof} value. Now it is time
959 // to unconditionally bail out.
960 if (lookaheadResult > result) {
961 // We ran off the end of the string with a pending look-ahead match.
962 // Treat this as if the look-ahead condition had been met, and return
963 // the match at the / position from the look-ahead rule.
964 result = lookaheadResult;
965 fLastRuleStatusIndex = lookaheadTagIdx;
966 lookaheadStatus = 0;
967 }
968 break;
374ca955 969 }
73c04bcf
A
970 // Run the loop one last time with the fake end-of-input character category.
971 mode = RBBI_END;
972 category = 1;
b75a7d8f 973 }
b75a7d8f 974
b75a7d8f 975 //
73c04bcf
A
976 // Get the char category. An incoming category of 1 or 2 means that
977 // we are preset for doing the beginning or end of input, and
978 // that we shouldn't get a category from an actual text input character.
979 //
980 if (mode == RBBI_RUN) {
981 // look up the current character's character category, which tells us
982 // which column in the state table to look at.
983 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
984 // not the size of the character going in, which is a UChar32.
985 //
986 UTRIE_GET16(&fData->fTrie, c, category);
987
988 // Check the dictionary bit in the character's category.
989 // Counter is only used by dictionary based iterators (subclasses).
990 // Chars that need to be handled by a dictionary have a flag bit set
991 // in their category values.
992 //
993 if ((category & 0x4000) != 0) {
994 fDictionaryCharCount++;
995 // And off the dictionary flag bit.
996 category &= ~0x4000;
997 }
b75a7d8f
A
998 }
999
374ca955
A
1000 #ifdef RBBI_DEBUG
1001 if (fTrace) {
73c04bcf 1002 RBBIDebugPrintf(" %4d ", utext_getNativeIndex(fText));
374ca955
A
1003 if (0x20<=c && c<0x7f) {
1004 RBBIDebugPrintf("\"%c\" ", c);
1005 } else {
1006 RBBIDebugPrintf("%5x ", c);
1007 }
1008 RBBIDebugPrintf("%3d %3d\n", state, category);
b75a7d8f 1009 }
374ca955 1010 #endif
b75a7d8f 1011
73c04bcf
A
1012 // State Transition - move machine to its next state
1013 //
b75a7d8f
A
1014 state = row->fNextState[category];
1015 row = (RBBIStateTableRow *)
73c04bcf
A
1016 // (statetable->fTableData + (statetable->fRowLen * state));
1017 (tableData + tableRowLen * state);
b75a7d8f 1018
b75a7d8f 1019
b75a7d8f 1020 if (row->fAccepting == -1) {
73c04bcf
A
1021 // Match found, common case.
1022 if (mode != RBBI_START) {
1023 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1024 }
374ca955 1025 fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values.
b75a7d8f
A
1026 }
1027
374ca955
A
1028 if (row->fLookAhead != 0) {
1029 if (lookaheadStatus != 0
1030 && row->fAccepting == lookaheadStatus) {
73c04bcf 1031 // Lookahead match is completed.
374ca955
A
1032 result = lookaheadResult;
1033 fLastRuleStatusIndex = lookaheadTagIdx;
1034 lookaheadStatus = 0;
73c04bcf
A
1035 // TODO: make a standalone hard break in a rule work.
1036 if (lookAheadHardBreak) {
1037 utext_setNativeIndex(fText, result);
1038 return result;
1039 }
1040 // Look-ahead completed, but other rules may match further. Continue on
1041 // TODO: junk this feature? I don't think it's used anywhwere.
374ca955 1042 goto continueOn;
b75a7d8f 1043 }
374ca955 1044
73c04bcf 1045 int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
374ca955
A
1046 lookaheadResult = r;
1047 lookaheadStatus = row->fLookAhead;
1048 lookaheadTagIdx = row->fTagIdx;
b75a7d8f
A
1049 goto continueOn;
1050 }
1051
374ca955 1052
73c04bcf
A
1053 if (row->fAccepting != 0) {
1054 // Because this is an accepting state, any in-progress look-ahead match
1055 // is no longer relavant. Clear out the pending lookahead status.
1056 lookaheadStatus = 0; // clear out any pending look-ahead match.
b75a7d8f
A
1057 }
1058
1059continueOn:
1060 if (state == STOP_STATE) {
374ca955
A
1061 // This is the normal exit from the lookup state machine.
1062 // We have advanced through the string until it is certain that no
1063 // longer match is possible, no matter what characters follow.
b75a7d8f
A
1064 break;
1065 }
73c04bcf
A
1066
1067 // Advance to the next character.
1068 // If this is a beginning-of-input loop iteration, don't advance
1069 // the input position. The next iteration will be processing the
1070 // first real input character.
1071 if (mode == RBBI_RUN) {
1072 c = UTEXT_NEXT32(fText);
1073 } else {
1074 if (mode == RBBI_START) {
1075 mode = RBBI_RUN;
1076 }
1077 }
1078
1079
b75a7d8f
A
1080 }
1081
374ca955 1082 // The state machine is done. Check whether it found a match...
b75a7d8f 1083
374ca955
A
1084 // If the iterator failed to advance in the match engine, force it ahead by one.
1085 // (This really indicates a defect in the break rules. They should always match
1086 // at least one character.)
1087 if (result == initialPosition) {
73c04bcf
A
1088 utext_setNativeIndex(fText, initialPosition);
1089 UTEXT_NEXT32(fText);
1090 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
374ca955 1091 }
b75a7d8f 1092
374ca955 1093 // Leave the iterator at our result position.
73c04bcf
A
1094 utext_setNativeIndex(fText, result);
1095 #ifdef RBBI_DEBUG
1096 if (fTrace) {
1097 RBBIDebugPrintf("result = %d\n\n", result);
b75a7d8f 1098 }
73c04bcf 1099 #endif
b75a7d8f
A
1100 return result;
1101}
1102
1103
73c04bcf 1104
374ca955
A
1105//-----------------------------------------------------------------------------------
1106//
1107// handlePrevious()
1108//
73c04bcf
A
1109// Iterate backwards, according to the logic of the reverse rules.
1110// This version handles the exact style backwards rules.
374ca955
A
1111//
1112// The logic of this function is very similar to handleNext(), above.
1113//
1114//-----------------------------------------------------------------------------------
1115int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) {
73c04bcf
A
1116 int32_t state;
1117 int16_t category = 0;
1118 RBBIRunMode mode;
1119 RBBIStateTableRow *row;
1120 UChar32 c;
1121 int32_t lookaheadStatus = 0;
1122 int32_t result = 0;
1123 int32_t initialPosition = 0;
1124 int32_t lookaheadResult = 0;
1125 UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
1126
1127 #ifdef RBBI_DEBUG
1128 if (fTrace) {
1129 RBBIDebugPuts("Handle Previous pos char state category");
1130 }
1131 #endif
1132
1133 // handlePrevious() never gets the rule status.
1134 // Flag the status as invalid; if the user ever asks for status, we will need
1135 // to back up, then re-find the break position using handleNext(), which does
1136 // get the status value.
374ca955 1137 fLastStatusIndexValid = FALSE;
73c04bcf 1138 fLastRuleStatusIndex = 0;
374ca955 1139
73c04bcf
A
1140 // if we're already at the start of the text, return DONE.
1141 if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) {
1142 return BreakIterator::DONE;
1143 }
374ca955 1144
73c04bcf
A
1145 // Set up the starting char.
1146 initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1147 result = initialPosition;
1148 c = UTEXT_PREVIOUS32(fText);
374ca955 1149
73c04bcf
A
1150 // Set the initial state for the state machine
1151 state = START_STATE;
374ca955 1152 row = (RBBIStateTableRow *)
73c04bcf
A
1153 (statetable->fTableData + (statetable->fRowLen * state));
1154 category = 3;
1155 mode = RBBI_RUN;
1156 if (statetable->fFlags & RBBI_BOF_REQUIRED) {
1157 category = 2;
1158 mode = RBBI_START;
374ca955
A
1159 }
1160
374ca955 1161
73c04bcf
A
1162 // loop until we reach the start of the text or transition to state 0
1163 //
374ca955 1164 for (;;) {
73c04bcf
A
1165 if (c == U_SENTINEL) {
1166 // Reached end of input string.
1167 if (mode == RBBI_END ||
1168 *(int32_t *)fData->fHeader->fFormatVersion == 1 ) {
1169 // We have already run the loop one last time with the
1170 // character set to the psueudo {eof} value. Now it is time
1171 // to unconditionally bail out.
1172 // (Or we have an old format binary rule file that does not support {eof}.)
1173 if (lookaheadResult < result) {
1174 // We ran off the end of the string with a pending look-ahead match.
1175 // Treat this as if the look-ahead condition had been met, and return
1176 // the match at the / position from the look-ahead rule.
1177 result = lookaheadResult;
1178 lookaheadStatus = 0;
1179 } else if (result == initialPosition) {
1180 // Ran off start, no match found.
1181 // move one index one (towards the start, since we are doing a previous())
1182 utext_setNativeIndex(fText, initialPosition);
1183 UTEXT_PREVIOUS32(fText); // TODO: shouldn't be necessary. We're already at beginning. Check.
1184 }
1185 break;
374ca955 1186 }
73c04bcf
A
1187 // Run the loop one last time with the fake end-of-input character category.
1188 mode = RBBI_END;
1189 category = 1;
374ca955
A
1190 }
1191
374ca955 1192 //
73c04bcf
A
1193 // Get the char category. An incoming category of 1 or 2 means that
1194 // we are preset for doing the beginning or end of input, and
1195 // that we shouldn't get a category from an actual text input character.
1196 //
1197 if (mode == RBBI_RUN) {
1198 // look up the current character's character category, which tells us
1199 // which column in the state table to look at.
1200 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
1201 // not the size of the character going in, which is a UChar32.
1202 //
1203 UTRIE_GET16(&fData->fTrie, c, category);
1204
1205 // Check the dictionary bit in the character's category.
1206 // Counter is only used by dictionary based iterators (subclasses).
1207 // Chars that need to be handled by a dictionary have a flag bit set
1208 // in their category values.
1209 //
1210 if ((category & 0x4000) != 0) {
1211 fDictionaryCharCount++;
1212 // And off the dictionary flag bit.
1213 category &= ~0x4000;
1214 }
374ca955
A
1215 }
1216
1217 #ifdef RBBI_DEBUG
1218 if (fTrace) {
73c04bcf 1219 RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(fText));
374ca955
A
1220 if (0x20<=c && c<0x7f) {
1221 RBBIDebugPrintf("\"%c\" ", c);
1222 } else {
1223 RBBIDebugPrintf("%5x ", c);
1224 }
1225 RBBIDebugPrintf("%3d %3d\n", state, category);
1226 }
1227 #endif
1228
73c04bcf
A
1229 // State Transition - move machine to its next state
1230 //
374ca955
A
1231 state = row->fNextState[category];
1232 row = (RBBIStateTableRow *)
73c04bcf 1233 (statetable->fTableData + (statetable->fRowLen * state));
374ca955
A
1234
1235 if (row->fAccepting == -1) {
73c04bcf
A
1236 // Match found, common case.
1237 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
374ca955
A
1238 }
1239
1240 if (row->fLookAhead != 0) {
1241 if (lookaheadStatus != 0
1242 && row->fAccepting == lookaheadStatus) {
73c04bcf 1243 // Lookahead match is completed.
374ca955 1244 result = lookaheadResult;
374ca955 1245 lookaheadStatus = 0;
73c04bcf 1246 // TODO: make a standalone hard break in a rule work.
374ca955 1247 if (lookAheadHardBreak) {
73c04bcf 1248 utext_setNativeIndex(fText, result);
374ca955
A
1249 return result;
1250 }
73c04bcf
A
1251 // Look-ahead completed, but other rules may match further. Continue on
1252 // TODO: junk this feature? I don't think it's used anywhwere.
374ca955
A
1253 goto continueOn;
1254 }
1255
73c04bcf
A
1256 int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1257 lookaheadResult = r;
1258 lookaheadStatus = row->fLookAhead;
374ca955
A
1259 goto continueOn;
1260 }
1261
374ca955 1262
73c04bcf
A
1263 if (row->fAccepting != 0) {
1264 // Because this is an accepting state, any in-progress look-ahead match
1265 // is no longer relavant. Clear out the pending lookahead status.
1266 lookaheadStatus = 0;
1267 }
374ca955
A
1268
1269continueOn:
1270 if (state == STOP_STATE) {
73c04bcf
A
1271 // This is the normal exit from the lookup state machine.
1272 // We have advanced through the string until it is certain that no
1273 // longer match is possible, no matter what characters follow.
374ca955
A
1274 break;
1275 }
1276
73c04bcf
A
1277 // Move (backwards) to the next character to process.
1278 // If this is a beginning-of-input loop iteration, don't advance
1279 // the input position. The next iteration will be processing the
1280 // first real input character.
1281 if (mode == RBBI_RUN) {
1282 c = UTEXT_PREVIOUS32(fText);
1283 } else {
1284 if (mode == RBBI_START) {
1285 mode = RBBI_RUN;
1286 }
1287 }
374ca955
A
1288 }
1289
73c04bcf
A
1290 // The state machine is done. Check whether it found a match...
1291
1292 // If the iterator failed to advance in the match engine, force it ahead by one.
1293 // (This really indicates a defect in the break rules. They should always match
1294 // at least one character.)
1295 if (result == initialPosition) {
1296 utext_setNativeIndex(fText, initialPosition);
1297 UTEXT_PREVIOUS32(fText);
1298 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1299 }
374ca955 1300
73c04bcf
A
1301 // Leave the iterator at our result position.
1302 utext_setNativeIndex(fText, result);
1303 #ifdef RBBI_DEBUG
1304 if (fTrace) {
1305 RBBIDebugPrintf("result = %d\n\n", result);
1306 }
1307 #endif
374ca955
A
1308 return result;
1309}
1310
1311
b75a7d8f
A
1312void
1313RuleBasedBreakIterator::reset()
1314{
73c04bcf
A
1315 if (fCachedBreakPositions) {
1316 uprv_free(fCachedBreakPositions);
1317 }
1318 fCachedBreakPositions = NULL;
1319 fNumCachedBreakPositions = 0;
1320 fDictionaryCharCount = 0;
1321 fPositionInCache = 0;
b75a7d8f
A
1322}
1323
1324
1325
1326//-------------------------------------------------------------------------------
1327//
1328// getRuleStatus() Return the break rule tag associated with the current
1329// iterator position. If the iterator arrived at its current
1330// position by iterating forwards, the value will have been
1331// cached by the handleNext() function.
1332//
1333// If no cached status value is available, the status is
1334// found by doing a previous() followed by a next(), which
1335// leaves the iterator where it started, and computes the
1336// status while doing the next().
1337//
1338//-------------------------------------------------------------------------------
374ca955
A
1339void RuleBasedBreakIterator::makeRuleStatusValid() {
1340 if (fLastStatusIndexValid == FALSE) {
b75a7d8f 1341 // No cached status is available.
73c04bcf 1342 if (fText == NULL || current() == 0) {
b75a7d8f 1343 // At start of text, or there is no text. Status is always zero.
374ca955
A
1344 fLastRuleStatusIndex = 0;
1345 fLastStatusIndexValid = TRUE;
b75a7d8f
A
1346 } else {
1347 // Not at start of text. Find status the tedious way.
1348 int32_t pa = current();
374ca955 1349 previous();
73c04bcf
A
1350 if (fNumCachedBreakPositions > 0) {
1351 reset(); // Blow off the dictionary cache
1352 }
374ca955
A
1353 int32_t pb = next();
1354 if (pa != pb) {
1355 // note: the if (pa != pb) test is here only to eliminate warnings for
1356 // unused local variables on gcc. Logically, it isn't needed.
1357 U_ASSERT(pa == pb);
1358 }
b75a7d8f
A
1359 }
1360 }
374ca955 1361 U_ASSERT(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fData->fStatusMaxIdx);
b75a7d8f
A
1362}
1363
1364
374ca955
A
1365int32_t RuleBasedBreakIterator::getRuleStatus() const {
1366 RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this;
1367 nonConstThis->makeRuleStatusValid();
1368
1369 // fLastRuleStatusIndex indexes to the start of the appropriate status record
1370 // (the number of status values.)
1371 // This function returns the last (largest) of the array of status values.
1372 int32_t idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex];
1373 int32_t tagVal = fData->fRuleStatusTable[idx];
1374
1375 return tagVal;
1376}
1377
1378
1379
1380
1381int32_t RuleBasedBreakIterator::getRuleStatusVec(
1382 int32_t *fillInVec, int32_t capacity, UErrorCode &status)
1383{
1384 if (U_FAILURE(status)) {
1385 return 0;
1386 }
1387
1388 RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this;
1389 nonConstThis->makeRuleStatusValid();
1390 int32_t numVals = fData->fRuleStatusTable[fLastRuleStatusIndex];
1391 int32_t numValsToCopy = numVals;
1392 if (numVals > capacity) {
1393 status = U_BUFFER_OVERFLOW_ERROR;
1394 numValsToCopy = capacity;
1395 }
1396 int i;
1397 for (i=0; i<numValsToCopy; i++) {
1398 fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1];
1399 }
1400 return numVals;
1401}
1402
1403
1404
b75a7d8f
A
1405//-------------------------------------------------------------------------------
1406//
1407// getBinaryRules Access to the compiled form of the rules,
1408// for use by build system tools that save the data
1409// for standard iterator types.
1410//
1411//-------------------------------------------------------------------------------
1412const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
1413 const uint8_t *retPtr = NULL;
1414 length = 0;
1415
1416 if (fData != NULL) {
1417 retPtr = (const uint8_t *)fData->fHeader;
1418 length = fData->fHeader->fLength;
1419 }
1420 return retPtr;
1421}
1422
1423
1424
1425
1426//-------------------------------------------------------------------------------
1427//
1428// BufferClone TODO: In my (Andy) opinion, this function should be deprecated.
1429// Saving one heap allocation isn't worth the trouble.
1430// Cloning shouldn't be done in tight loops, and
1431// making the clone copy involves other heap operations anyway.
1432// And the application code for correctly dealing with buffer
1433// size problems and the eventual object destruction is ugly.
1434//
1435//-------------------------------------------------------------------------------
1436BreakIterator * RuleBasedBreakIterator::createBufferClone(void *stackBuffer,
1437 int32_t &bufferSize,
1438 UErrorCode &status)
1439{
1440 if (U_FAILURE(status)){
1441 return NULL;
1442 }
1443
1444 //
1445 // If user buffer size is zero this is a preflight operation to
1446 // obtain the needed buffer size, allowing for worst case misalignment.
1447 //
1448 if (bufferSize == 0) {
1449 bufferSize = sizeof(RuleBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0);
1450 return NULL;
1451 }
1452
1453
1454 //
1455 // Check the alignment and size of the user supplied buffer.
1456 // Allocate heap memory if the user supplied memory is insufficient.
1457 //
1458 char *buf = (char *)stackBuffer;
1459 uint32_t s = bufferSize;
1460
1461 if (stackBuffer == NULL) {
1462 s = 0; // Ignore size, force allocation if user didn't give us a buffer.
1463 }
1464 if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) {
1465 uint32_t offsetUp = (uint32_t)U_ALIGNMENT_OFFSET_UP(buf);
1466 s -= offsetUp;
1467 buf += offsetUp;
1468 }
1469 if (s < sizeof(RuleBasedBreakIterator)) {
73c04bcf
A
1470 // Not enough room in the caller-supplied buffer.
1471 // Do a plain-vanilla heap based clone and return that, along with
1472 // a warning that the clone was allocated.
1473 RuleBasedBreakIterator *clonedBI = new RuleBasedBreakIterator(*this);
1474 if (clonedBI == 0) {
b75a7d8f 1475 status = U_MEMORY_ALLOCATION_ERROR;
73c04bcf
A
1476 } else {
1477 status = U_SAFECLONE_ALLOCATED_WARNING;
b75a7d8f 1478 }
73c04bcf 1479 return clonedBI;
b75a7d8f
A
1480 }
1481
1482 //
73c04bcf 1483 // Clone the source BI into the caller-supplied buffer.
b75a7d8f
A
1484 // TODO: using an overloaded operator new to directly initialize the
1485 // copy in the user's buffer would be better, but it doesn't seem
1486 // to get along with namespaces. Investigate why.
1487 //
1488 // The memcpy is only safe with an empty (default constructed)
1489 // break iterator. Use on others can screw up reference counts
1490 // to data. memcpy-ing objects is not really a good idea...
1491 //
1492 RuleBasedBreakIterator localIter; // Empty break iterator, source for memcpy
1493 RuleBasedBreakIterator *clone = (RuleBasedBreakIterator *)buf;
73c04bcf
A
1494 uprv_memcpy(clone, &localIter, sizeof(RuleBasedBreakIterator)); // init C++ gorp, BreakIterator base class part
1495 clone->init(); // Init RuleBasedBreakIterator part, (user default constructor)
1496 *clone = *this; // clone = the real BI we want.
1497 clone->fBufferClone = TRUE; // Flag to prevent deleting storage on close (From C code)
b75a7d8f
A
1498
1499 return clone;
1500}
1501
1502
b75a7d8f
A
1503//-------------------------------------------------------------------------------
1504//
1505// isDictionaryChar Return true if the category lookup for this char
1506// indicates that it is in the set of dictionary lookup
1507// chars.
1508//
1509// This function is intended for use by dictionary based
1510// break iterators.
1511//
1512//-------------------------------------------------------------------------------
73c04bcf 1513/*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32 c) {
b75a7d8f
A
1514 if (fData == NULL) {
1515 return FALSE;
1516 }
1517 uint16_t category;
1518 UTRIE_GET16(&fData->fTrie, c, category);
1519 return (category & 0x4000) != 0;
73c04bcf
A
1520}*/
1521
1522
1523//-------------------------------------------------------------------------------
1524//
1525// checkDictionary This function handles all processing of characters in
1526// the "dictionary" set. It will determine the appropriate
1527// course of action, and possibly set up a cache in the
1528// process.
1529//
1530//-------------------------------------------------------------------------------
1531int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
1532 int32_t endPos,
1533 UBool reverse) {
1534 // Reset the old break cache first.
1535 uint32_t dictionaryCount = fDictionaryCharCount;
1536 reset();
1537
1538 if (dictionaryCount <= 1 || (endPos - startPos) <= 1) {
1539 return (reverse ? startPos : endPos);
1540 }
1541
1542 // Starting from the starting point, scan towards the proposed result,
1543 // looking for the first dictionary character (which may be the one
1544 // we're on, if we're starting in the middle of a range).
1545 utext_setNativeIndex(fText, reverse ? endPos : startPos);
1546 if (reverse) {
1547 UTEXT_PREVIOUS32(fText);
1548 }
1549
1550 int32_t rangeStart = startPos;
1551 int32_t rangeEnd = endPos;
1552
1553 uint16_t category;
1554 int32_t current;
1555 UErrorCode status = U_ZERO_ERROR;
1556 UStack breaks(status);
1557 int32_t foundBreakCount = 0;
1558 UChar32 c = utext_current32(fText);
1559
1560 UTRIE_GET16(&fData->fTrie, c, category);
1561
1562 // Is the character we're starting on a dictionary character? If so, we
1563 // need to back up to include the entire run; otherwise the results of
1564 // the break algorithm will differ depending on where we start. Since
1565 // the result is cached and there is typically a non-dictionary break
1566 // within a small number of words, there should be little performance impact.
1567 if (category & 0x4000) {
1568 if (reverse) {
1569 do {
1570 utext_next32(fText); // TODO: recast to work directly with postincrement.
1571 c = utext_current32(fText);
1572 UTRIE_GET16(&fData->fTrie, c, category);
1573 } while (c != U_SENTINEL && (category & 0x4000));
1574 // Back up to the last dictionary character
1575 rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1576 if (c == U_SENTINEL) {
1577 // c = fText->last32();
1578 // TODO: why was this if needed?
1579 c = UTEXT_PREVIOUS32(fText);
1580 }
1581 else {
1582 c = UTEXT_PREVIOUS32(fText);
1583 }
1584 }
1585 else {
1586 do {
1587 c = UTEXT_PREVIOUS32(fText);
1588 UTRIE_GET16(&fData->fTrie, c, category);
1589 }
1590 while (c != U_SENTINEL && (category & 0x4000));
1591 // Back up to the last dictionary character
1592 if (c == U_SENTINEL) {
1593 // c = fText->first32();
1594 c = utext_current32(fText);
1595 }
1596 else {
1597 utext_next32(fText);
1598 c = utext_current32(fText);
1599 }
1600 rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);;
1601 }
1602 UTRIE_GET16(&fData->fTrie, c, category);
1603 }
1604
1605 // Loop through the text, looking for ranges of dictionary characters.
1606 // For each span, find the appropriate break engine, and ask it to find
1607 // any breaks within the span.
1608 // Note: we always do this in the forward direction, so that the break
1609 // cache is built in the right order.
1610 if (reverse) {
1611 utext_setNativeIndex(fText, rangeStart);
1612 c = utext_current32(fText);
1613 UTRIE_GET16(&fData->fTrie, c, category);
1614 }
1615 while(U_SUCCESS(status)) {
1616 while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
1617 utext_next32(fText); // TODO: tweak for post-increment operation
1618 c = utext_current32(fText);
1619 UTRIE_GET16(&fData->fTrie, c, category);
1620 }
1621 if (current >= rangeEnd) {
1622 break;
1623 }
1624
1625 // We now have a dictionary character. Get the appropriate language object
1626 // to deal with it.
1627 const LanguageBreakEngine *lbe = getLanguageBreakEngine(c);
1628
1629 // Ask the language object if there are any breaks. It will leave the text
1630 // pointer on the other side of its range, ready to search for the next one.
1631 if (lbe != NULL) {
1632 foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks);
1633 }
1634
1635 // Reload the loop variables for the next go-round
1636 c = utext_current32(fText);
1637 UTRIE_GET16(&fData->fTrie, c, category);
1638 }
1639
1640 // If we found breaks, build a new break cache. The first and last entries must
1641 // be the original starting and ending position.
1642 if (foundBreakCount > 0) {
1643 int32_t totalBreaks = foundBreakCount;
1644 if (startPos < breaks.elementAti(0)) {
1645 totalBreaks += 1;
1646 }
1647 if (endPos > breaks.peeki()) {
1648 totalBreaks += 1;
1649 }
1650 fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t));
1651 if (fCachedBreakPositions != NULL) {
1652 int32_t out = 0;
1653 fNumCachedBreakPositions = totalBreaks;
1654 if (startPos < breaks.elementAti(0)) {
1655 fCachedBreakPositions[out++] = startPos;
1656 }
1657 for (int32_t i = 0; i < foundBreakCount; ++i) {
1658 fCachedBreakPositions[out++] = breaks.elementAti(i);
1659 }
1660 if (endPos > fCachedBreakPositions[out-1]) {
1661 fCachedBreakPositions[out] = endPos;
1662 }
1663 // If there are breaks, then by definition, we are replacing the original
1664 // proposed break by one of the breaks we found. Use following() and
1665 // preceding() to do the work. They should never recurse in this case.
1666 if (reverse) {
1667 return preceding(endPos - 1);
1668 }
1669 else {
1670 return following(startPos);
1671 }
1672 }
1673 // If the allocation failed, just fall through to the "no breaks found" case.
1674 }
1675
1676 // If we get here, there were no language-based breaks. Set the text pointer
1677 // to the original proposed break.
1678 utext_setNativeIndex(fText, reverse ? startPos : endPos);
1679 return (reverse ? startPos : endPos);
1680}
1681
1682static UStack *gLanguageBreakFactories = NULL;
1683
1684U_NAMESPACE_END
1685
1686// defined in ucln_cmn.h
1687
1688/**
1689 * Release all static memory held by breakiterator.
1690 */
1691U_CDECL_BEGIN
1692static UBool U_CALLCONV breakiterator_cleanup_dict(void) {
1693 if (gLanguageBreakFactories) {
1694 delete gLanguageBreakFactories;
1695 gLanguageBreakFactories = NULL;
1696 }
1697 return TRUE;
b75a7d8f 1698}
73c04bcf 1699U_CDECL_END
b75a7d8f 1700
73c04bcf
A
1701U_CDECL_BEGIN
1702static void U_CALLCONV _deleteFactory(void *obj) {
1703 delete (LanguageBreakFactory *) obj;
1704}
1705U_CDECL_END
1706U_NAMESPACE_BEGIN
b75a7d8f 1707
73c04bcf
A
1708static const LanguageBreakEngine*
1709getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
1710{
1711 UBool needsInit;
1712 UErrorCode status = U_ZERO_ERROR;
1713 umtx_lock(NULL);
1714 needsInit = (UBool)(gLanguageBreakFactories == NULL);
1715 umtx_unlock(NULL);
1716
1717 if (needsInit) {
1718 UStack *factories = new UStack(_deleteFactory, NULL, status);
1719 if (U_SUCCESS(status)) {
1720 ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
1721 factories->push(builtIn, status);
1722#ifdef U_LOCAL_SERVICE_HOOK
1723 LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
1724 if (extra != NULL) {
1725 factories->push(extra, status);
1726 }
1727#endif
1728 }
1729 umtx_lock(NULL);
1730 if (gLanguageBreakFactories == NULL) {
1731 gLanguageBreakFactories = factories;
1732 factories = NULL;
1733 ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict);
1734 }
1735 umtx_unlock(NULL);
1736 delete factories;
1737 }
1738
1739 if (gLanguageBreakFactories == NULL) {
1740 return NULL;
1741 }
1742
1743 int32_t i = gLanguageBreakFactories->size();
1744 const LanguageBreakEngine *lbe = NULL;
1745 while (--i >= 0) {
1746 LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
1747 lbe = factory->getEngineFor(c, breakType);
1748 if (lbe != NULL) {
1749 break;
1750 }
1751 }
1752 return lbe;
1753}
1754
1755
1756//-------------------------------------------------------------------------------
1757//
1758// getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the
1759// the characer c.
1760//
1761//-------------------------------------------------------------------------------
1762const LanguageBreakEngine *
1763RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
1764 const LanguageBreakEngine *lbe = NULL;
1765 UErrorCode status = U_ZERO_ERROR;
1766
1767 if (fLanguageBreakEngines == NULL) {
1768 fLanguageBreakEngines = new UStack(status);
1769 if (U_FAILURE(status)) {
1770 delete fLanguageBreakEngines;
1771 fLanguageBreakEngines = 0;
1772 return NULL;
1773 }
1774 }
1775
1776 int32_t i = fLanguageBreakEngines->size();
1777 while (--i >= 0) {
1778 lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
1779 if (lbe->handles(c, fBreakType)) {
1780 return lbe;
1781 }
1782 }
1783
1784 // No existing dictionary took the character. See if a factory wants to
1785 // give us a new LanguageBreakEngine for this character.
1786 lbe = getLanguageBreakEngineFromFactory(c, fBreakType);
1787
1788 // If we got one, use it and push it on our stack.
1789 if (lbe != NULL) {
1790 fLanguageBreakEngines->push((void *)lbe, status);
1791 // Even if we can't remember it, we can keep looking it up, so
1792 // return it even if the push fails.
1793 return lbe;
1794 }
1795
1796 // No engine is forthcoming for this character. Add it to the
1797 // reject set. Create the reject break engine if needed.
1798 if (fUnhandledBreakEngine == NULL) {
1799 fUnhandledBreakEngine = new UnhandledEngine(status);
1800 if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) {
1801 status = U_MEMORY_ALLOCATION_ERROR;
1802 }
1803 // Put it last so that scripts for which we have an engine get tried
1804 // first.
1805 fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);
1806 // If we can't insert it, or creation failed, get rid of it
1807 if (U_FAILURE(status)) {
1808 delete fUnhandledBreakEngine;
1809 fUnhandledBreakEngine = 0;
1810 return NULL;
1811 }
1812 }
1813
1814 // Tell the reject engine about the character; at its discretion, it may
1815 // add more than just the one character.
1816 fUnhandledBreakEngine->handleCharacter(c, fBreakType);
1817
1818 return fUnhandledBreakEngine;
1819}
1820
1821
1822
1823/*int32_t RuleBasedBreakIterator::getBreakType() const {
1824 return fBreakType;
1825}*/
1826
1827void RuleBasedBreakIterator::setBreakType(int32_t type) {
1828 fBreakType = type;
1829 reset();
1830}
b75a7d8f
A
1831
1832U_NAMESPACE_END
1833
1834#endif /* #if !UCONFIG_NO_BREAK_ITERATION */