]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/rbbi57.cpp
ICU-64260.0.1.tar.gz
[apple/icu.git] / icuSources / common / rbbi57.cpp
CommitLineData
0f5d89e8
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4***************************************************************************
5* Copyright (C) 1999-2016 International Business Machines Corporation
6* and others. All rights reserved.
7***************************************************************************
8
9**********************************************************************
10* Legacy version of RuleBasedBreakIterator from ICU 57,
11* only for use by Apple RuleBasedTokenizer
12**********************************************************************
13*/
14
15#include "utypeinfo.h" // for 'typeid' to work
16
17#include "unicode/utypes.h"
18
19#if !UCONFIG_NO_BREAK_ITERATION
20
21#include "unicode/schriter.h"
22#include "unicode/uchriter.h"
23#include "unicode/udata.h"
24#include "unicode/uclean.h"
25#include "unicode/utext.h"
26#include "rbbidata57.h"
27#include "rbbirb57.h"
28#include "rbbi57.h"
29#include "cmemory.h"
30#include "cstring.h"
31#include "umutex.h"
32#include "ucln_cmn.h"
33#include "brkeng.h"
34#include "utrie.h"
35
36#include "uassert.h"
37#include "uvectr32.h"
38
39// if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included.
40#if U_LOCAL_SERVICE_HOOK
41#include "localsvc.h"
42#endif
43
44#ifdef RBBI_DEBUG
45static UBool fTrace = FALSE;
46#endif
47
48U_NAMESPACE_BEGIN
49
50// The state number of the starting state
51#define START_STATE 1
52
53// The state-transition value indicating "stop"
54#define STOP_STATE 0
55
56
57UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator57)
58
59
60//=======================================================================
61// constructors
62//=======================================================================
63
64/**
65 * Constructs a RuleBasedBreakIterator57 that uses the already-created
66 * tables object that is passed in as a parameter.
67 */
68RuleBasedBreakIterator57::RuleBasedBreakIterator57(RBBIDataHeader57* data, UErrorCode &status)
69{
70 init();
71 fData = new RBBIDataWrapper57(data, status); // status checked in constructor
72 if (U_FAILURE(status)) {return;}
73 if(fData == 0) {
74 status = U_MEMORY_ALLOCATION_ERROR;
75 return;
76 }
77}
78
79/**
80 * Same as above but does not adopt memory
81 */
82RuleBasedBreakIterator57::RuleBasedBreakIterator57(const RBBIDataHeader57* data, enum EDontAdopt, UErrorCode &status)
83{
84 init();
85 fData = new RBBIDataWrapper57(data, RBBIDataWrapper57::kDontAdopt, status); // status checked in constructor
86 if (U_FAILURE(status)) {return;}
87 if(fData == 0) {
88 status = U_MEMORY_ALLOCATION_ERROR;
89 return;
90 }
91}
92
93
94#if 0
95// not used by rbtok.cpp
96
97//
98// Construct from precompiled binary rules (tables). This constructor is public API,
99// taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules().
100//
101RuleBasedBreakIterator57::RuleBasedBreakIterator57(const uint8_t *compiledRules,
102 uint32_t ruleLength,
103 UErrorCode &status) {
104 init();
105 if (U_FAILURE(status)) {
106 return;
107 }
108 if (compiledRules == NULL || ruleLength < sizeof(RBBIDataHeader57)) {
109 status = U_ILLEGAL_ARGUMENT_ERROR;
110 return;
111 }
112 const RBBIDataHeader57 *data = (const RBBIDataHeader57 *)compiledRules;
113 if (data->fLength > ruleLength) {
114 status = U_ILLEGAL_ARGUMENT_ERROR;
115 return;
116 }
117 fData = new RBBIDataWrapper57(data, RBBIDataWrapper57::kDontAdopt, status);
118 if (U_FAILURE(status)) {return;}
119 if(fData == 0) {
120 status = U_MEMORY_ALLOCATION_ERROR;
121 return;
122 }
123}
124
125
126//-------------------------------------------------------------------------------
127//
128// Constructor from a UDataMemory handle to precompiled break rules
129// stored in an ICU data file.
130//
131//-------------------------------------------------------------------------------
132RuleBasedBreakIterator57::RuleBasedBreakIterator57(UDataMemory* udm, UErrorCode &status)
133{
134 init();
135 fData = new RBBIDataWrapper57(udm, status); // status checked in constructor
136 if (U_FAILURE(status)) {return;}
137 if(fData == 0) {
138 status = U_MEMORY_ALLOCATION_ERROR;
139 return;
140 }
141}
142#endif
143
144
145
146//-------------------------------------------------------------------------------
147//
148// Constructor from a set of rules supplied as a string.
149//
150//-------------------------------------------------------------------------------
151RuleBasedBreakIterator57::RuleBasedBreakIterator57( const UnicodeString &rules,
152 UParseError &parseError,
153 UErrorCode &status)
154{
155 init();
156 if (U_FAILURE(status)) {return;}
157 RuleBasedBreakIterator57 *bi = (RuleBasedBreakIterator57 *)
158 RBBIRuleBuilder57::createRuleBasedBreakIterator(rules, &parseError, status);
159 // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that
160 // creates and returns a complete RBBI. From here, in a constructor, we
161 // can't just return the object created by the builder factory, hence
162 // the assignment of the factory created object to "this".
163 if (U_SUCCESS(status)) {
164 *this = *bi;
165 delete bi;
166 }
167}
168
169
170//-------------------------------------------------------------------------------
171//
172// Default Constructor. Create an empty shell that can be set up later.
173// Used when creating a RuleBasedBreakIterator57 from a set
174// of rules.
175//-------------------------------------------------------------------------------
176RuleBasedBreakIterator57::RuleBasedBreakIterator57() {
177 init();
178}
179
180
181//-------------------------------------------------------------------------------
182//
183// Copy constructor. Will produce a break iterator with the same behavior,
184// and which iterates over the same text, as the one passed in.
185//
186//-------------------------------------------------------------------------------
187RuleBasedBreakIterator57::RuleBasedBreakIterator57(const RuleBasedBreakIterator57& other)
188: BreakIterator(other)
189{
190 this->init();
191 *this = other;
192}
193
194
195/**
196 * Destructor
197 */
198RuleBasedBreakIterator57::~RuleBasedBreakIterator57() {
199 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
200 // fCharIter was adopted from the outside.
201 delete fCharIter;
202 }
203 fCharIter = NULL;
204 delete fSCharIter;
205 fCharIter = NULL;
206 delete fDCharIter;
207 fDCharIter = NULL;
208
209 utext_close(fText);
210
211 if (fData != NULL) {
212 fData->removeReference();
213 fData = NULL;
214 }
215 if (fCachedBreakPositions) {
216 uprv_free(fCachedBreakPositions);
217 fCachedBreakPositions = NULL;
218 }
219 if (fLanguageBreakEngines) {
220 delete fLanguageBreakEngines;
221 fLanguageBreakEngines = NULL;
222 }
223 if (fUnhandledBreakEngine) {
224 delete fUnhandledBreakEngine;
225 fUnhandledBreakEngine = NULL;
226 }
227}
228
229/**
230 * Assignment operator. Sets this iterator to have the same behavior,
231 * and iterate over the same text, as the one passed in.
232 */
233RuleBasedBreakIterator57&
234RuleBasedBreakIterator57::operator=(const RuleBasedBreakIterator57& that) {
235 if (this == &that) {
236 return *this;
237 }
238 fLineWordOpts = that.fLineWordOpts;
239 reset(); // Delete break cache information
240 fBreakType = that.fBreakType;
241 if (fLanguageBreakEngines != NULL) {
242 delete fLanguageBreakEngines;
243 fLanguageBreakEngines = NULL; // Just rebuild for now
244 }
245 // TODO: clone fLanguageBreakEngines from "that"
246 UErrorCode status = U_ZERO_ERROR;
247 fText = utext_clone(fText, that.fText, FALSE, TRUE, &status);
248
249 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
250 delete fCharIter;
251 }
252 fCharIter = NULL;
253
254 if (that.fCharIter != NULL ) {
255 // This is a little bit tricky - it will intially appear that
256 // this->fCharIter is adopted, even if that->fCharIter was
257 // not adopted. That's ok.
258 fCharIter = that.fCharIter->clone();
259 }
260
261 if (fData != NULL) {
262 fData->removeReference();
263 fData = NULL;
264 }
265 if (that.fData != NULL) {
266 fData = that.fData->addReference();
267 }
268
269 return *this;
270}
271
272
273
274//-----------------------------------------------------------------------------
275//
276// init() Shared initialization routine. Used by all the constructors.
277// Initializes all fields, leaving the object in a consistent state.
278//
279//-----------------------------------------------------------------------------
280void RuleBasedBreakIterator57::init() {
281 UErrorCode status = U_ZERO_ERROR;
282 fText = utext_openUChars(NULL, NULL, 0, &status);
283 fCharIter = NULL;
284 fSCharIter = NULL;
285 fDCharIter = NULL;
286 fData = NULL;
287 fLastRuleStatusIndex = 0;
288 fLastStatusIndexValid = TRUE;
289 fDictionaryCharCount = 0;
290 fBreakType = UBRK_WORD; // Defaulting BreakType to word gives reasonable
291 // dictionary behavior for Break Iterators that are
292 // built from rules. Even better would be the ability to
293 // declare the type in the rules.
294
295 fCachedBreakPositions = NULL;
296 fLanguageBreakEngines = NULL;
297 fUnhandledBreakEngine = NULL;
298 fNumCachedBreakPositions = 0;
299 fPositionInCache = 0;
300
301#ifdef RBBI_DEBUG
302 static UBool debugInitDone = FALSE;
303 if (debugInitDone == FALSE) {
304 char *debugEnv = getenv("U_RBBIDEBUG");
305 if (debugEnv && uprv_strstr(debugEnv, "trace")) {
306 fTrace = TRUE;
307 }
308 debugInitDone = TRUE;
309 }
310#endif
311}
312
313
314
315//-----------------------------------------------------------------------------
316//
317// clone - Returns a newly-constructed RuleBasedBreakIterator57 with the same
318// behavior, and iterating over the same text, as this one.
319// Virtual function: does the right thing with subclasses.
320//
321//-----------------------------------------------------------------------------
322BreakIterator*
323RuleBasedBreakIterator57::clone(void) const {
324 return new RuleBasedBreakIterator57(*this);
325}
326
327/**
328 * Equality operator. Returns TRUE if both BreakIterators are of the
329 * same class, have the same behavior, and iterate over the same text.
330 */
331UBool
332RuleBasedBreakIterator57::operator==(const BreakIterator& that) const {
333 if (typeid(*this) != typeid(that)) {
334 return FALSE;
335 }
336
337 const RuleBasedBreakIterator57& that2 = (const RuleBasedBreakIterator57&) that;
338 if (that2.fLineWordOpts != fLineWordOpts) {
339 return FALSE;
340 }
341
342 if (!utext_equals(fText, that2.fText)) {
343 // The two break iterators are operating on different text,
344 // or have a different interation position.
345 return FALSE;
346 };
347
348 // TODO: need a check for when in a dictionary region at different offsets.
349
350 if (that2.fData == fData ||
351 (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) {
352 // The two break iterators are using the same rules.
353 return TRUE;
354 }
355 return FALSE;
356}
357
358/**
359 * Compute a hash code for this BreakIterator
360 * @return A hash code
361 */
362int32_t
363RuleBasedBreakIterator57::hashCode(void) const {
364 int32_t hash = 0;
365 if (fData != NULL) {
366 hash = fData->hashCode();
367 }
368 return hash;
369}
370
371
372void RuleBasedBreakIterator57::setText(UText *ut, UErrorCode &status) {
373 if (U_FAILURE(status)) {
374 return;
375 }
376 reset();
377 fText = utext_clone(fText, ut, FALSE, TRUE, &status);
378
379 // Set up a dummy CharacterIterator to be returned if anyone
380 // calls getText(). With input from UText, there is no reasonable
381 // way to return a characterIterator over the actual input text.
382 // Return one over an empty string instead - this is the closest
383 // we can come to signaling a failure.
384 // (GetText() is obsolete, this failure is sort of OK)
385 if (fDCharIter == NULL) {
386 static const UChar c = 0;
387 fDCharIter = new UCharCharacterIterator(&c, 0);
388 if (fDCharIter == NULL) {
389 status = U_MEMORY_ALLOCATION_ERROR;
390 return;
391 }
392 }
393
394 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
395 // existing fCharIter was adopted from the outside. Delete it now.
396 delete fCharIter;
397 }
398 fCharIter = fDCharIter;
399
400 this->first();
401}
402
403
404UText *RuleBasedBreakIterator57::getUText(UText *fillIn, UErrorCode &status) const {
405 UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status);
406 return result;
407}
408
409
410
411#if 0
412// not used by rbtok.cpp
413/**
414 * Returns the description used to create this iterator
415 */
416const UnicodeString&
417RuleBasedBreakIterator57::getRules() const {
418 if (fData != NULL) {
419 return fData->getRuleSourceString();
420 } else {
421 static const UnicodeString *s;
422 if (s == NULL) {
423 // TODO: something more elegant here.
424 // perhaps API should return the string by value.
425 // Note: thread unsafe init & leak are semi-ok, better than
426 // what was before. Sould be cleaned up, though.
427 s = new UnicodeString;
428 }
429 return *s;
430 }
431}
432#endif
433
434//=======================================================================
435// BreakIterator overrides
436//=======================================================================
437
438/**
439 * Return a CharacterIterator over the text being analyzed.
440 */
441CharacterIterator&
442RuleBasedBreakIterator57::getText() const {
443 return *fCharIter;
444}
445
446/**
447 * Set the iterator to analyze a new piece of text. This function resets
448 * the current iteration position to the beginning of the text.
449 * @param newText An iterator over the text to analyze.
450 */
451void
452RuleBasedBreakIterator57::adoptText(CharacterIterator* newText) {
453 // If we are holding a CharacterIterator adopted from a
454 // previous call to this function, delete it now.
455 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
456 delete fCharIter;
457 }
458
459 fCharIter = newText;
460 UErrorCode status = U_ZERO_ERROR;
461 reset();
462 if (newText==NULL || newText->startIndex() != 0) {
463 // startIndex !=0 wants to be an error, but there's no way to report it.
464 // Make the iterator text be an empty string.
465 fText = utext_openUChars(fText, NULL, 0, &status);
466 } else {
467 fText = utext_openCharacterIterator(fText, newText, &status);
468 }
469 this->first();
470}
471
472/**
473 * Set the iterator to analyze a new piece of text. This function resets
474 * the current iteration position to the beginning of the text.
475 * @param newText An iterator over the text to analyze.
476 */
477void
478RuleBasedBreakIterator57::setText(const UnicodeString& newText) {
479 UErrorCode status = U_ZERO_ERROR;
480 reset();
481 fText = utext_openConstUnicodeString(fText, &newText, &status);
482
483 // Set up a character iterator on the string.
484 // Needed in case someone calls getText().
485 // Can not, unfortunately, do this lazily on the (probably never)
486 // call to getText(), because getText is const.
487 if (fSCharIter == NULL) {
488 fSCharIter = new StringCharacterIterator(newText);
489 } else {
490 fSCharIter->setText(newText);
491 }
492
493 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) {
494 // old fCharIter was adopted from the outside. Delete it.
495 delete fCharIter;
496 }
497 fCharIter = fSCharIter;
498
499 this->first();
500}
501
502
503/**
504 * Provide a new UText for the input text. Must reference text with contents identical
505 * to the original.
506 * Intended for use with text data originating in Java (garbage collected) environments
507 * where the data may be moved in memory at arbitrary times.
508 */
509RuleBasedBreakIterator57 &RuleBasedBreakIterator57::refreshInputText(UText *input, UErrorCode &status) {
510 if (U_FAILURE(status)) {
511 return *this;
512 }
513 if (input == NULL) {
514 status = U_ILLEGAL_ARGUMENT_ERROR;
515 return *this;
516 }
517 int64_t pos = utext_getNativeIndex(fText);
518 // Shallow read-only clone of the new UText into the existing input UText
519 fText = utext_clone(fText, input, FALSE, TRUE, &status);
520 if (U_FAILURE(status)) {
521 return *this;
522 }
523 utext_setNativeIndex(fText, pos);
524 if (utext_getNativeIndex(fText) != pos) {
525 // Sanity check. The new input utext is supposed to have the exact same
526 // contents as the old. If we can't set to the same position, it doesn't.
527 // The contents underlying the old utext might be invalid at this point,
528 // so it's not safe to check directly.
529 status = U_ILLEGAL_ARGUMENT_ERROR;
530 }
531 return *this;
532}
533
534
535/**
536 * Sets the current iteration position to the beginning of the text, position zero.
537 * @return The new iterator position, which is zero.
538 */
539int32_t RuleBasedBreakIterator57::first(void) {
540 reset();
541 fLastRuleStatusIndex = 0;
542 fLastStatusIndexValid = TRUE;
543 //if (fText == NULL)
544 // return BreakIterator::DONE;
545
546 utext_setNativeIndex(fText, 0);
547 return 0;
548}
549
550/**
551 * Sets the current iteration position to the end of the text.
552 * @return The text's past-the-end offset.
553 */
554int32_t RuleBasedBreakIterator57::last(void) {
555 reset();
556 if (fText == NULL) {
557 fLastRuleStatusIndex = 0;
558 fLastStatusIndexValid = TRUE;
559 return BreakIterator::DONE;
560 }
561
562 fLastStatusIndexValid = FALSE;
563 int32_t pos = (int32_t)utext_nativeLength(fText);
564 utext_setNativeIndex(fText, pos);
565 return pos;
566}
567
568/**
569 * Advances the iterator either forward or backward the specified number of steps.
570 * Negative values move backward, and positive values move forward. This is
571 * equivalent to repeatedly calling next() or previous().
572 * @param n The number of steps to move. The sign indicates the direction
573 * (negative is backwards, and positive is forwards).
574 * @return The character offset of the boundary position n boundaries away from
575 * the current one.
576 */
577int32_t RuleBasedBreakIterator57::next(int32_t n) {
578 int32_t result = current();
579 while (n > 0) {
580 result = next();
581 --n;
582 }
583 while (n < 0) {
584 result = previous();
585 ++n;
586 }
587 return result;
588}
589
590/**
591 * Advances the iterator to the next boundary position.
592 * @return The position of the first boundary after this one.
593 */
594int32_t RuleBasedBreakIterator57::next(void) {
595 // if we have cached break positions and we're still in the range
596 // covered by them, just move one step forward in the cache
597 if (fCachedBreakPositions != NULL) {
598 if (fPositionInCache < fNumCachedBreakPositions - 1) {
599 ++fPositionInCache;
600 int32_t pos = fCachedBreakPositions[fPositionInCache];
601 utext_setNativeIndex(fText, pos);
602 return pos;
603 }
604 else {
605 reset();
606 }
607 }
608
609 int32_t startPos = current();
610 fDictionaryCharCount = 0;
611 int32_t result = handleNext(fData->fForwardTable);
612 while (fLineWordOpts != UBRK_LINEWORD_NORMAL) {
613 UChar32 prevChr = utext_char32At(fText, result-1);
614 UChar32 currChr = utext_char32At(fText, result);
615 if (currChr == U_SENTINEL || prevChr == U_SENTINEL || !u_isalpha(currChr) || !u_isalpha(prevChr)) {
616 break;
617 }
618 int32_t nextResult = handleNext(fData->fForwardTable);
619 if (nextResult <= result) {
620 break;
621 }
622 result = nextResult;
623 }
624 if (fDictionaryCharCount > 0) {
625 result = checkDictionary(startPos, result, FALSE);
626 }
627 return result;
628}
629
630/**
631 * Advances the iterator backwards, to the last boundary preceding this one.
632 * @return The position of the last boundary position preceding this one.
633 */
634int32_t RuleBasedBreakIterator57::previous(void) {
635 int32_t result;
636 int32_t startPos;
637
638 // if we have cached break positions and we're still in the range
639 // covered by them, just move one step backward in the cache
640 if (fCachedBreakPositions != NULL) {
641 if (fPositionInCache > 0) {
642 --fPositionInCache;
643 // If we're at the beginning of the cache, need to reevaluate the
644 // rule status
645 if (fPositionInCache <= 0) {
646 fLastStatusIndexValid = FALSE;
647 }
648 int32_t pos = fCachedBreakPositions[fPositionInCache];
649 utext_setNativeIndex(fText, pos);
650 return pos;
651 }
652 else {
653 reset();
654 }
655 }
656
657 // if we're already sitting at the beginning of the text, return DONE
658 if (fText == NULL || (startPos = current()) == 0) {
659 fLastRuleStatusIndex = 0;
660 fLastStatusIndexValid = TRUE;
661 return BreakIterator::DONE;
662 }
663
664 if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) {
665 result = handlePrevious(fData->fReverseTable);
666 while (fLineWordOpts != UBRK_LINEWORD_NORMAL) {
667 UChar32 prevChr = utext_char32At(fText, result-1);
668 UChar32 currChr = utext_char32At(fText, result);
669 if (currChr == U_SENTINEL || prevChr == U_SENTINEL || !u_isalpha(currChr) || !u_isalpha(prevChr)) {
670 break;
671 }
672 int32_t prevResult = handlePrevious(fData->fReverseTable);
673 if (prevResult >= result) {
674 break;
675 }
676 result = prevResult;
677 }
678 if (fDictionaryCharCount > 0) {
679 result = checkDictionary(result, startPos, TRUE);
680 }
681 return result;
682 }
683
684 // old rule syntax
685 // set things up. handlePrevious() will back us up to some valid
686 // break position before the current position (we back our internal
687 // iterator up one step to prevent handlePrevious() from returning
688 // the current position), but not necessarily the last one before
689 // where we started
690
691 int32_t start = current();
692
693 (void)UTEXT_PREVIOUS32(fText);
694 int32_t lastResult = handlePrevious(fData->fReverseTable);
695 if (lastResult == UBRK_DONE) {
696 lastResult = 0;
697 utext_setNativeIndex(fText, 0);
698 }
699 result = lastResult;
700 int32_t lastTag = 0;
701 UBool breakTagValid = FALSE;
702
703 // iterate forward from the known break position until we pass our
704 // starting point. The last break position before the starting
705 // point is our return value
706
707 for (;;) {
708 result = next();
709 if (result == BreakIterator::DONE || result >= start) {
710 break;
711 }
712 lastResult = result;
713 lastTag = fLastRuleStatusIndex;
714 breakTagValid = TRUE;
715 }
716
717 // fLastBreakTag wants to have the value for section of text preceding
718 // the result position that we are to return (in lastResult.) If
719 // the backwards rules overshot and the above loop had to do two or more
720 // next()s to move up to the desired return position, we will have a valid
721 // tag value. But, if handlePrevious() took us to exactly the correct result position,
722 // we wont have a tag value for that position, which is only set by handleNext().
723
724 // Set the current iteration position to be the last break position
725 // before where we started, and then return that value.
726 utext_setNativeIndex(fText, lastResult);
727 fLastRuleStatusIndex = lastTag; // for use by getRuleStatus()
728 fLastStatusIndexValid = breakTagValid;
729
730 // No need to check the dictionary; it will have been handled by
731 // next()
732
733 return lastResult;
734}
735
736/**
737 * Sets the iterator to refer to the first boundary position following
738 * the specified position.
739 * @offset The position from which to begin searching for a break position.
740 * @return The position of the first break after the current position.
741 */
742int32_t RuleBasedBreakIterator57::following(int32_t offset) {
743 // if the offset passed in is already past the end of the text,
744 // just return DONE; if it's before the beginning, return the
745 // text's starting offset
746 if (fText == NULL || offset >= utext_nativeLength(fText)) {
747 last();
748 return next();
749 }
750 else if (offset < 0) {
751 return first();
752 }
753
754 // Move requested offset to a code point start. It might be on a trail surrogate,
755 // or on a trail byte if the input is UTF-8.
756 utext_setNativeIndex(fText, offset);
757 offset = (int32_t)utext_getNativeIndex(fText);
758
759 // if we have cached break positions and offset is in the range
760 // covered by them, use them
761 // TODO: could use binary search
762 // TODO: what if offset is outside range, but break is not?
763 if (fCachedBreakPositions != NULL) {
764 if (offset >= fCachedBreakPositions[0]
765 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
766 fPositionInCache = 0;
767 // We are guaranteed not to leave the array due to range test above
768 while (offset >= fCachedBreakPositions[fPositionInCache]) {
769 ++fPositionInCache;
770 }
771 int32_t pos = fCachedBreakPositions[fPositionInCache];
772 utext_setNativeIndex(fText, pos);
773 return pos;
774 }
775 else {
776 reset();
777 }
778 }
779
780 // Set our internal iteration position (temporarily)
781 // to the position passed in. If this is the _beginning_ position,
782 // then we can just use next() to get our return value
783
784 int32_t result = 0;
785
786 if (fData->fSafeRevTable != NULL) {
787 // new rule syntax
788 utext_setNativeIndex(fText, offset);
789 // move forward one codepoint to prepare for moving back to a
790 // safe point.
791 // this handles offset being between a supplementary character
792 // TODO: is this still needed, with move to code point boundary handled above?
793 (void)UTEXT_NEXT32(fText);
794 // handlePrevious will move most of the time to < 1 boundary away
795 handlePrevious(fData->fSafeRevTable);
796 int32_t result = next();
797 while (result <= offset) {
798 result = next();
799 }
800 return result;
801 }
802 if (fData->fSafeFwdTable != NULL) {
803 // backup plan if forward safe table is not available
804 utext_setNativeIndex(fText, offset);
805 (void)UTEXT_PREVIOUS32(fText);
806 // handle next will give result >= offset
807 handleNext(fData->fSafeFwdTable);
808 // previous will give result 0 or 1 boundary away from offset,
809 // most of the time
810 // we have to
811 int32_t oldresult = previous();
812 while (oldresult > offset) {
813 int32_t result = previous();
814 if (result <= offset) {
815 return oldresult;
816 }
817 oldresult = result;
818 }
819 int32_t result = next();
820 if (result <= offset) {
821 return next();
822 }
823 return result;
824 }
825 // otherwise, we have to sync up first. Use handlePrevious() to back
826 // up to a known break position before the specified position (if
827 // we can determine that the specified position is a break position,
828 // we don't back up at all). This may or may not be the last break
829 // position at or before our starting position. Advance forward
830 // from here until we've passed the starting position. The position
831 // we stop on will be the first break position after the specified one.
832 // old rule syntax
833
834 utext_setNativeIndex(fText, offset);
835 if (offset==0 ||
836 (offset==1 && utext_getNativeIndex(fText)==0)) {
837 return next();
838 }
839 result = previous();
840
841 while (result != BreakIterator::DONE && result <= offset) {
842 result = next();
843 }
844
845 return result;
846}
847
848/**
849 * Sets the iterator to refer to the last boundary position before the
850 * specified position.
851 * @offset The position to begin searching for a break from.
852 * @return The position of the last boundary before the starting position.
853 */
854int32_t RuleBasedBreakIterator57::preceding(int32_t offset) {
855 // if the offset passed in is already past the end of the text,
856 // just return DONE; if it's before the beginning, return the
857 // text's starting offset
858 if (fText == NULL || offset > utext_nativeLength(fText)) {
859 return last();
860 }
861 else if (offset < 0) {
862 return first();
863 }
864
865 // Move requested offset to a code point start. It might be on a trail surrogate,
866 // or on a trail byte if the input is UTF-8.
867 utext_setNativeIndex(fText, offset);
868 offset = (int32_t)utext_getNativeIndex(fText);
869
870 // if we have cached break positions and offset is in the range
871 // covered by them, use them
872 if (fCachedBreakPositions != NULL) {
873 // TODO: binary search?
874 // TODO: What if offset is outside range, but break is not?
875 if (offset > fCachedBreakPositions[0]
876 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
877 fPositionInCache = 0;
878 while (fPositionInCache < fNumCachedBreakPositions
879 && offset > fCachedBreakPositions[fPositionInCache])
880 ++fPositionInCache;
881 --fPositionInCache;
882 // If we're at the beginning of the cache, need to reevaluate the
883 // rule status
884 if (fPositionInCache <= 0) {
885 fLastStatusIndexValid = FALSE;
886 }
887 utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]);
888 return fCachedBreakPositions[fPositionInCache];
889 }
890 else {
891 reset();
892 }
893 }
894
895 // if we start by updating the current iteration position to the
896 // position specified by the caller, we can just use previous()
897 // to carry out this operation
898
899 if (fData->fSafeFwdTable != NULL) {
900 // new rule syntax
901 utext_setNativeIndex(fText, offset);
902 int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
903 if (newOffset != offset) {
904 // Will come here if specified offset was not a code point boundary AND
905 // the underlying implmentation is using UText, which snaps any non-code-point-boundary
906 // indices to the containing code point.
907 // For breakitereator::preceding only, these non-code-point indices need to be moved
908 // up to refer to the following codepoint.
909 (void)UTEXT_NEXT32(fText);
910 offset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
911 }
912
913 // TODO: (synwee) would it be better to just check for being in the middle of a surrogate pair,
914 // rather than adjusting the position unconditionally?
915 // (Change would interact with safe rules.)
916 // TODO: change RBBI behavior for off-boundary indices to match that of UText?
917 // affects only preceding(), seems cleaner, but is slightly different.
918 (void)UTEXT_PREVIOUS32(fText);
919 handleNext(fData->fSafeFwdTable);
920 int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
921 while (result >= offset) {
922 result = previous();
923 }
924 return result;
925 }
926 if (fData->fSafeRevTable != NULL) {
927 // backup plan if forward safe table is not available
928 // TODO: check whether this path can be discarded
929 // It's probably OK to say that rules must supply both safe tables
930 // if they use safe tables at all. We have certainly never described
931 // to anyone how to work with just one safe table.
932 utext_setNativeIndex(fText, offset);
933 (void)UTEXT_NEXT32(fText);
934
935 // handle previous will give result <= offset
936 handlePrevious(fData->fSafeRevTable);
937
938 // next will give result 0 or 1 boundary away from offset,
939 // most of the time
940 // we have to
941 int32_t oldresult = next();
942 while (oldresult < offset) {
943 int32_t result = next();
944 if (result >= offset) {
945 return oldresult;
946 }
947 oldresult = result;
948 }
949 int32_t result = previous();
950 if (result >= offset) {
951 return previous();
952 }
953 return result;
954 }
955
956 // old rule syntax
957 utext_setNativeIndex(fText, offset);
958 return previous();
959}
960
961/**
962 * Returns true if the specfied position is a boundary position. As a side
963 * effect, leaves the iterator pointing to the first boundary position at
964 * or after "offset".
965 * @param offset the offset to check.
966 * @return True if "offset" is a boundary position.
967 */
968UBool RuleBasedBreakIterator57::isBoundary(int32_t offset) {
969 // the beginning index of the iterator is always a boundary position by definition
970 if (offset == 0) {
971 first(); // For side effects on current position, tag values.
972 return TRUE;
973 }
974
975 if (offset == (int32_t)utext_nativeLength(fText)) {
976 last(); // For side effects on current position, tag values.
977 return TRUE;
978 }
979
980 // out-of-range indexes are never boundary positions
981 if (offset < 0) {
982 first(); // For side effects on current position, tag values.
983 return FALSE;
984 }
985
986 if (offset > utext_nativeLength(fText)) {
987 last(); // For side effects on current position, tag values.
988 return FALSE;
989 }
990
991 // otherwise, we can use following() on the position before the specified
992 // one and return true if the position we get back is the one the user
993 // specified
994 utext_previous32From(fText, offset);
995 int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText);
996 UBool result = following(backOne) == offset;
997 return result;
998}
999
1000/**
1001 * Returns the current iteration position.
1002 * @return The current iteration position.
1003 */
1004int32_t RuleBasedBreakIterator57::current(void) const {
1005 int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1006 return pos;
1007}
1008
1009//=======================================================================
1010// implementation
1011//=======================================================================
1012
//
//  RBBIRunMode - phase of the state machine relative to the user text.
//                The machine runs one extra iteration before the first and
//                after the last character of input; user input is fetched
//                only while in RBBI_RUN.
//
enum RBBIRunMode {
    RBBI_START = 0,   // processing is before the first char of input
    RBBI_RUN   = 1,   // processing is inside the user text
    RBBI_END   = 2    // processing is after the end of the user text
};
1023
1024
1025// Map from look-ahead break states (corresponds to rules) to boundary positions.
1026// Allows multiple lookahead break rules to be in flight at the same time.
1027//
1028// This is a temporary approach for ICU 57. A better fix is to make the look-ahead numbers
1029// in the state table be sequential, then we can just index an array. And the
1030// table could also tell us in advance how big that array needs to be.
1031//
1032// Before ICU 57 there was just a single simple variable for a look-ahead match that
1033// was in progress. Two rules at once did not work.
1034
1035static const int32_t kMaxLookaheads = 8;
1036struct LookAheadResults {
1037 int32_t fUsedSlotLimit;
1038 int32_t fPositions[8];
1039 int16_t fKeys[8];
1040
1041 LookAheadResults() : fUsedSlotLimit(0), fPositions(), fKeys() {};
1042
1043 int32_t getPosition(int16_t key) {
1044 for (int32_t i=0; i<fUsedSlotLimit; ++i) {
1045 if (fKeys[i] == key) {
1046 return fPositions[i];
1047 }
1048 }
1049 U_ASSERT(FALSE);
1050 return -1;
1051 }
1052
1053 void setPosition(int16_t key, int32_t position) {
1054 int32_t i;
1055 for (i=0; i<fUsedSlotLimit; ++i) {
1056 if (fKeys[i] == key) {
1057 fPositions[i] = position;
1058 return;
1059 }
1060 }
1061 if (i >= kMaxLookaheads) {
1062 U_ASSERT(FALSE);
1063 i = kMaxLookaheads - 1;
1064 }
1065 fKeys[i] = key;
1066 fPositions[i] = position;
1067 U_ASSERT(fUsedSlotLimit == i);
1068 fUsedSlotLimit = i + 1;
1069 }
1070};
1071
1072
1073//-----------------------------------------------------------------------------------
1074//
1075// handleNext(stateTable)
1076// This method is the actual implementation of the rbbi next() method.
1077// This method initializes the state machine to state 1
1078// and advances through the text character by character until we reach the end
1079// of the text or the state machine transitions to state 0. We update our return
1080// value every time the state machine passes through an accepting state.
1081//
// @param statetable  the state table to run; its fTableData, fRowLen and fFlags are read.
// @return  the boundary position found (the text index is left there), or
//          BreakIterator::DONE when fData is NULL or the first character
//          fetched is U_SENTINEL.
//
// Side effects: sets fLastStatusIndexValid to TRUE and updates fLastRuleStatusIndex;
// increments fDictionaryCharCount once for every character whose category
// has the 0x4000 bit set.
//
1082//-----------------------------------------------------------------------------------
1083int32_t RuleBasedBreakIterator57::handleNext(const RBBIStateTable *statetable) {
1084 int32_t state;
1085 uint16_t category = 0;
1086 RBBIRunMode mode;
1087
1088 RBBIStateTableRow *row;
1089 UChar32 c;
1090 LookAheadResults lookAheadMatches;
1091 int32_t result = 0;
1092 int32_t initialPosition = 0;
1093 const char *tableData = statetable->fTableData;
1094 uint32_t tableRowLen = statetable->fRowLen;
1095
1096 #ifdef RBBI_DEBUG
1097 if (fTrace) {
1098 RBBIDebugPuts("Handle Next pos char state category");
1099 }
1100 #endif
1101
1102 // No matter what, handleNext always correctly sets the break tag value.
1103 fLastStatusIndexValid = TRUE;
1104 fLastRuleStatusIndex = 0;
1105
1106 // if we're already at the end of the text, return DONE.
1107 initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1108 result = initialPosition;
1109 c = UTEXT_NEXT32(fText);
1110 if (fData == NULL || c==U_SENTINEL) {
1111 return BreakIterator::DONE;
1112 }
1113
1114 // Set the initial state for the state machine
1115 state = START_STATE;
1116 row = (RBBIStateTableRow *)
1117 //(statetable->fTableData + (statetable->fRowLen * state));
1118 (tableData + tableRowLen * state);
1119
1120
1121 mode = RBBI_RUN;
// When the table is flagged RBBI_BOF_REQUIRED, the first loop iteration is run
// with the preset category 2 instead of the category of a real character.
1122 if (statetable->fFlags & RBBI_BOF_REQUIRED) {
1123 category = 2;
1124 mode = RBBI_START;
1125 }
1126
1127
1128 // loop until we reach the end of the text or transition to state 0
1129 //
1130 for (;;) {
1131 if (c == U_SENTINEL) {
1132 // Reached end of input string.
1133 if (mode == RBBI_END) {
1134 // We have already run the loop one last time with the
1135 // character set to the pseudo {eof} value. Now it is time
1136 // to unconditionally bail out.
1137 break;
1138 }
1139 // Run the loop one last time with the fake end-of-input character category.
1140 mode = RBBI_END;
1141 category = 1;
1142 }
1143
1144 //
1145 // Get the char category. An incoming category of 1 or 2 means that
1146 // we are preset for doing the beginning or end of input, and
1147 // that we shouldn't get a category from an actual text input character.
1148 //
1149 if (mode == RBBI_RUN) {
1150 // look up the current character's character category, which tells us
1151 // which column in the state table to look at.
1152 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
1153 // not the size of the character going in, which is a UChar32.
1154 //
1155 UTRIE_GET16(&fData->fTrie, c, category);
1156
1157 // Check the dictionary bit in the character's category.
1158 // Counter is only used by dictionary based iterators (subclasses).
1159 // Chars that need to be handled by a dictionary have a flag bit set
1160 // in their category values.
1161 //
1162 if ((category & 0x4000) != 0) {
1163 fDictionaryCharCount++;
1164 // And off the dictionary flag bit.
1165 category &= ~0x4000;
1166 }
1167 }
1168
1169 #ifdef RBBI_DEBUG
1170 if (fTrace) {
1171 RBBIDebugPrintf(" %4lld ", utext_getNativeIndex(fText));
1172 if (0x20<=c && c<0x7f) {
1173 RBBIDebugPrintf("\"%c\" ", c);
1174 } else {
1175 RBBIDebugPrintf("%5x ", c);
1176 }
1177 RBBIDebugPrintf("%3d %3d\n", state, category);
1178 }
1179 #endif
1180
1181 // State Transition - move machine to its next state
1182 //
1183
1184 // Note: fNextState is defined as uint16_t[2], but we are casting
1185 // a generated RBBI table to RBBIStateTableRow and some tables
1186 // actually have more than 2 categories.
1187 U_ASSERT(category<fData->fHeader->fCatCount);
1188 state = row->fNextState[category]; /*Not accessing beyond memory*/
1189 row = (RBBIStateTableRow *)
1190 // (statetable->fTableData + (statetable->fRowLen * state));
1191 (tableData + tableRowLen * state);
1192
1193
1194 if (row->fAccepting == -1) {
1195 // Match found, common case.
// The position is not recorded during the RBBI_START iteration; only the tag index is.
1196 if (mode != RBBI_START) {
1197 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1198 }
1199 fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values.
1200 }
1201
1202 int16_t completedRule = row->fAccepting;
1203 if (completedRule > 0) {
1204 // Lookahead match is completed.
// A negative value from getPosition() means no position was recorded for this
// rule; in that case the completion is ignored and the machine keeps running.
1205 int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule);
1206 if (lookaheadResult >= 0) {
1207 fLastRuleStatusIndex = row->fTagIdx;
1208 UTEXT_SETNATIVEINDEX(fText, lookaheadResult);
1209 return lookaheadResult;
1210 }
1211 }
1212 int16_t rule = row->fLookAhead;
1213 if (rule != 0) {
1214 // At the position of a '/' in a look-ahead match. Record it.
1215 int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1216 lookAheadMatches.setPosition(rule, pos);
1217 }
1218
1219 if (state == STOP_STATE) {
1220 // This is the normal exit from the lookup state machine.
1221 // We have advanced through the string until it is certain that no
1222 // longer match is possible, no matter what characters follow.
1223 break;
1224 }
1225
1226 // Advance to the next character.
1227 // If this is a beginning-of-input loop iteration, don't advance
1228 // the input position. The next iteration will be processing the
1229 // first real input character.
1230 if (mode == RBBI_RUN) {
1231 c = UTEXT_NEXT32(fText);
1232 } else {
1233 if (mode == RBBI_START) {
1234 mode = RBBI_RUN;
1235 }
1236 }
1237
1238
1239 }
1240
1241 // The state machine is done. Check whether it found a match...
1242
1243 // If the iterator failed to advance in the match engine, force it ahead by one.
1244 // (This really indicates a defect in the break rules. They should always match
1245 // at least one character.)
1246 if (result == initialPosition) {
1247 UTEXT_SETNATIVEINDEX(fText, initialPosition);
1248 UTEXT_NEXT32(fText);
1249 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1250 }
1251
1252 // Leave the iterator at our result position.
1253 UTEXT_SETNATIVEINDEX(fText, result);
1254 #ifdef RBBI_DEBUG
1255 if (fTrace) {
1256 RBBIDebugPrintf("result = %d\n\n", result);
1257 }
1258 #endif
1259 return result;
1260}
1261
1262
1263
1264//-----------------------------------------------------------------------------------
1265//
1266// handlePrevious()
1267//
1268// Iterate backwards, according to the logic of the reverse rules.
1269// This version handles the exact style backwards rules.
1270//
1271// The logic of this function is very similar to handleNext(), above.
1272//
// @param statetable  the state table to run; its fTableData, fRowLen and fFlags are read.
// @return  the position found (the text index is left there), or
//          BreakIterator::DONE when fText or fData is NULL or the text
//          index is already 0.
//
// Side effects: sets fLastStatusIndexValid to FALSE and fLastRuleStatusIndex to 0;
// increments fDictionaryCharCount once for every character whose category
// has the 0x4000 bit set.
//
1273//-----------------------------------------------------------------------------------
1274int32_t RuleBasedBreakIterator57::handlePrevious(const RBBIStateTable *statetable) {
1275 int32_t state;
1276 uint16_t category = 0;
1277 RBBIRunMode mode;
1278 RBBIStateTableRow *row;
1279 UChar32 c;
1280 LookAheadResults lookAheadMatches;
1281 int32_t result = 0;
1282 int32_t initialPosition = 0;
1283
1284 #ifdef RBBI_DEBUG
1285 if (fTrace) {
1286 RBBIDebugPuts("Handle Previous pos char state category");
1287 }
1288 #endif
1289
1290 // handlePrevious() never gets the rule status.
1291 // Flag the status as invalid; if the user ever asks for status, we will need
1292 // to back up, then re-find the break position using handleNext(), which does
1293 // get the status value.
1294 fLastStatusIndexValid = FALSE;
1295 fLastRuleStatusIndex = 0;
1296
1297 // if we're already at the start of the text, return DONE.
1298 if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) {
1299 return BreakIterator::DONE;
1300 }
1301
1302 // Set up the starting char.
1303 initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1304 result = initialPosition;
1305 c = UTEXT_PREVIOUS32(fText);
1306
1307 // Set the initial state for the state machine
1308 state = START_STATE;
1309 row = (RBBIStateTableRow *)
1310 (statetable->fTableData + (statetable->fRowLen * state));
// NOTE(review): this preset of 3 is overwritten before first use on every path
// visible here (by 2 just below, by 1 at end of input, or by UTRIE_GET16 in RUN mode).
1311 category = 3;
1312 mode = RBBI_RUN;
1313 if (statetable->fFlags & RBBI_BOF_REQUIRED) {
1314 category = 2;
1315 mode = RBBI_START;
1316 }
1317
1318
1319 // loop until we reach the start of the text or transition to state 0
1320 //
1321 for (;;) {
1322 if (c == U_SENTINEL) {
1323 // Ran out of input (iterating backwards, so this is the start of the text).
1324 if (mode == RBBI_END) {
1325 // We have already run the loop one last time with the
1326 // character set to the pseudo {eof} value. Now it is time
1327 // to unconditionally bail out.
1328 if (result == initialPosition) {
1329 // Ran off start, no match found.
1330 // move one index one (towards the start, since we are doing a previous())
1331 UTEXT_SETNATIVEINDEX(fText, initialPosition);
1332 (void)UTEXT_PREVIOUS32(fText); // TODO: shouldn't be necessary. We're already at beginning. Check.
1333 }
1334 break;
1335 }
1336 // Run the loop one last time with the fake end-of-input character category.
1337 mode = RBBI_END;
1338 category = 1;
1339 }
1340
1341 //
1342 // Get the char category. An incoming category of 1 or 2 means that
1343 // we are preset for doing the beginning or end of input, and
1344 // that we shouldn't get a category from an actual text input character.
1345 //
1346 if (mode == RBBI_RUN) {
1347 // look up the current character's character category, which tells us
1348 // which column in the state table to look at.
1349 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
1350 // not the size of the character going in, which is a UChar32.
1351 //
1352 UTRIE_GET16(&fData->fTrie, c, category);
1353
1354 // Check the dictionary bit in the character's category.
1355 // Counter is only used by dictionary based iterators (subclasses).
1356 // Chars that need to be handled by a dictionary have a flag bit set
1357 // in their category values.
1358 //
1359 if ((category & 0x4000) != 0) {
1360 fDictionaryCharCount++;
1361 // And off the dictionary flag bit.
1362 category &= ~0x4000;
1363 }
1364 }
1365
1366 #ifdef RBBI_DEBUG
1367 if (fTrace) {
1368 RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(fText));
1369 if (0x20<=c && c<0x7f) {
1370 RBBIDebugPrintf("\"%c\" ", c);
1371 } else {
1372 RBBIDebugPrintf("%5x ", c);
1373 }
1374 RBBIDebugPrintf("%3d %3d\n", state, category);
1375 }
1376 #endif
1377
1378 // State Transition - move machine to its next state
1379 //
1380
1381 // Note: fNextState is defined as uint16_t[2], but we are casting
1382 // a generated RBBI table to RBBIStateTableRow and some tables
1383 // actually have more than 2 categories.
1384 U_ASSERT(category<fData->fHeader->fCatCount);
1385 state = row->fNextState[category]; /*Not accessing beyond memory*/
1386 row = (RBBIStateTableRow *)
1387 (statetable->fTableData + (statetable->fRowLen * state));
1388
1389 if (row->fAccepting == -1) {
1390 // Match found, common case.
1391 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1392 }
1393
1394 int16_t completedRule = row->fAccepting;
1395 if (completedRule > 0) {
1396 // Lookahead match is completed.
// A negative value from getPosition() means no position was recorded for this
// rule; in that case the completion is ignored and the machine keeps running.
1397 int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule);
1398 if (lookaheadResult >= 0) {
1399 UTEXT_SETNATIVEINDEX(fText, lookaheadResult);
1400 return lookaheadResult;
1401 }
1402 }
1403 int16_t rule = row->fLookAhead;
1404 if (rule != 0) {
1405 // At the position of a '/' in a look-ahead match. Record it.
1406 int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1407 lookAheadMatches.setPosition(rule, pos);
1408 }
1409
1410 if (state == STOP_STATE) {
1411 // This is the normal exit from the lookup state machine.
1412 // We have advanced through the string until it is certain that no
1413 // longer match is possible, no matter what characters follow.
1414 break;
1415 }
1416
1417 // Move (backwards) to the next character to process.
1418 // If this is a beginning-of-input loop iteration, don't advance
1419 // the input position. The next iteration will be processing the
1420 // first real input character.
1421 if (mode == RBBI_RUN) {
1422 c = UTEXT_PREVIOUS32(fText);
1423 } else {
1424 if (mode == RBBI_START) {
1425 mode = RBBI_RUN;
1426 }
1427 }
1428 }
1429
1430 // The state machine is done. Check whether it found a match...
1431
1432 // If the iterator failed to move in the match engine, force it back by one code point.
1433 // (This really indicates a defect in the break rules. They should always match
1434 // at least one character.)
1435 if (result == initialPosition) {
1436 UTEXT_SETNATIVEINDEX(fText, initialPosition);
1437 UTEXT_PREVIOUS32(fText);
1438 result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1439 }
1440
1441 // Leave the iterator at our result position.
1442 UTEXT_SETNATIVEINDEX(fText, result);
1443 #ifdef RBBI_DEBUG
1444 if (fTrace) {
1445 RBBIDebugPrintf("result = %d\n\n", result);
1446 }
1447 #endif
1448 return result;
1449}
1450
1451
1452void
1453RuleBasedBreakIterator57::reset()
1454{
1455 if (fCachedBreakPositions) {
1456 uprv_free(fCachedBreakPositions);
1457 }
1458 fCachedBreakPositions = NULL;
1459 fNumCachedBreakPositions = 0;
1460 fDictionaryCharCount = 0;
1461 fPositionInCache = 0;
1462}
1463
1464
1465
1466//-------------------------------------------------------------------------------
1467//
1468// getRuleStatus() Return the break rule tag associated with the current
1469// iterator position. If the iterator arrived at its current
1470// position by iterating forwards, the value will have been
1471// cached by the handleNext() function.
1472//
1473// If no cached status value is available, the status is
1474// found by doing a previous() followed by a next(), which
1475// leaves the iterator where it started, and computes the
1476// status while doing the next().
1477//
1478//-------------------------------------------------------------------------------
1479void RuleBasedBreakIterator57::makeRuleStatusValid() {
1480 if (fLastStatusIndexValid == FALSE) {
1481 // No cached status is available.
1482 if (fText == NULL || current() == 0) {
1483 // At start of text, or there is no text. Status is always zero.
1484 fLastRuleStatusIndex = 0;
1485 fLastStatusIndexValid = TRUE;
1486 } else {
1487 // Not at start of text. Find status the tedious way.
1488 int32_t pa = current();
1489 previous();
1490 if (fNumCachedBreakPositions > 0) {
1491 reset(); // Blow off the dictionary cache
1492 }
1493 int32_t pb = next();
1494 if (pa != pb) {
1495 // note: the if (pa != pb) test is here only to eliminate warnings for
1496 // unused local variables on gcc. Logically, it isn't needed.
1497 U_ASSERT(pa == pb);
1498 }
1499 }
1500 }
1501 U_ASSERT(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fData->fStatusMaxIdx);
1502}
1503
1504
1505int32_t RuleBasedBreakIterator57::getRuleStatus() const {
1506 RuleBasedBreakIterator57 *nonConstThis = (RuleBasedBreakIterator57 *)this;
1507 nonConstThis->makeRuleStatusValid();
1508
1509 // fLastRuleStatusIndex indexes to the start of the appropriate status record
1510 // (the number of status values.)
1511 // This function returns the last (largest) of the array of status values.
1512 int32_t idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex];
1513 int32_t tagVal = fData->fRuleStatusTable[idx];
1514
1515 return tagVal;
1516}
1517
1518
1519
1520
1521int32_t RuleBasedBreakIterator57::getRuleStatusVec(
1522 int32_t *fillInVec, int32_t capacity, UErrorCode &status)
1523{
1524 if (U_FAILURE(status)) {
1525 return 0;
1526 }
1527
1528 RuleBasedBreakIterator57 *nonConstThis = (RuleBasedBreakIterator57 *)this;
1529 nonConstThis->makeRuleStatusValid();
1530 int32_t numVals = fData->fRuleStatusTable[fLastRuleStatusIndex];
1531 int32_t numValsToCopy = numVals;
1532 if (numVals > capacity) {
1533 status = U_BUFFER_OVERFLOW_ERROR;
1534 numValsToCopy = capacity;
1535 }
1536 int i;
1537 for (i=0; i<numValsToCopy; i++) {
1538 fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1];
1539 }
1540 return numVals;
1541}
1542
1543
1544
1545//-------------------------------------------------------------------------------
1546//
1547// getBinaryRules Access to the compiled form of the rules,
1548// for use by build system tools that save the data
1549// for standard iterator types.
1550//
1551//-------------------------------------------------------------------------------
1552const uint8_t *RuleBasedBreakIterator57::getBinaryRules(uint32_t &length) {
1553 const uint8_t *retPtr = NULL;
1554 length = 0;
1555
1556 if (fData != NULL) {
1557 retPtr = (const uint8_t *)fData->fHeader;
1558 length = fData->fHeader->fLength;
1559 }
1560 return retPtr;
1561}
1562
1563
1564BreakIterator * RuleBasedBreakIterator57::createBufferClone(void * /*stackBuffer*/,
1565 int32_t &bufferSize,
1566 UErrorCode &status)
1567{
1568 if (U_FAILURE(status)){
1569 return NULL;
1570 }
1571
1572 if (bufferSize == 0) {
1573 bufferSize = 1; // preflighting for deprecated functionality
1574 return NULL;
1575 }
1576
1577 BreakIterator *clonedBI = clone();
1578 if (clonedBI == NULL) {
1579 status = U_MEMORY_ALLOCATION_ERROR;
1580 } else {
1581 status = U_SAFECLONE_ALLOCATED_WARNING;
1582 }
1583 return (RuleBasedBreakIterator57 *)clonedBI;
1584}
1585
1586//-------------------------------------------------------------------------------
1587//
1588// checkDictionary This function handles all processing of characters in
1589// the "dictionary" set. It will determine the appropriate
1590// course of action, and possibly set up a cache in the
1591// process.
1592//
1593//-------------------------------------------------------------------------------
1594int32_t RuleBasedBreakIterator57::checkDictionary(int32_t startPos,
1595 int32_t endPos,
1596 UBool reverse) {
1597 // Reset the old break cache first.
1598 reset();
1599
1600 // note: code segment below assumes that dictionary chars are in the
1601 // startPos-endPos range
1602 // value returned should be next character in sequence
1603 if ((endPos - startPos) <= 1) {
1604 return (reverse ? startPos : endPos);
1605 }
1606
1607 // Starting from the starting point, scan towards the proposed result,
1608 // looking for the first dictionary character (which may be the one
1609 // we're on, if we're starting in the middle of a range).
1610 utext_setNativeIndex(fText, reverse ? endPos : startPos);
1611 if (reverse) {
1612 UTEXT_PREVIOUS32(fText);
1613 }
1614
1615 int32_t rangeStart = startPos;
1616 int32_t rangeEnd = endPos;
1617
1618 uint16_t category;
1619 int32_t current;
1620 UErrorCode status = U_ZERO_ERROR;
1621 UVector32 breaks(status); // changed from UStack in ICU 57
1622 int32_t foundBreakCount = 0;
1623 UChar32 c = utext_current32(fText);
1624
1625 UTRIE_GET16(&fData->fTrie, c, category);
1626
1627 // Is the character we're starting on a dictionary character? If so, we
1628 // need to back up to include the entire run; otherwise the results of
1629 // the break algorithm will differ depending on where we start. Since
1630 // the result is cached and there is typically a non-dictionary break
1631 // within a small number of words, there should be little performance impact.
1632 if (category & 0x4000) {
1633 if (reverse) {
1634 do {
1635 utext_next32(fText); // TODO: recast to work directly with postincrement.
1636 c = utext_current32(fText);
1637 UTRIE_GET16(&fData->fTrie, c, category);
1638 } while (c != U_SENTINEL && (category & 0x4000));
1639 // Back up to the last dictionary character
1640 rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);
1641 if (c == U_SENTINEL) {
1642 // c = fText->last32();
1643 // TODO: why was this if needed?
1644 c = UTEXT_PREVIOUS32(fText);
1645 }
1646 else {
1647 c = UTEXT_PREVIOUS32(fText);
1648 }
1649 }
1650 else {
1651 do {
1652 c = UTEXT_PREVIOUS32(fText);
1653 UTRIE_GET16(&fData->fTrie, c, category);
1654 }
1655 while (c != U_SENTINEL && (category & 0x4000));
1656 // Back up to the last dictionary character
1657 if (c == U_SENTINEL) {
1658 // c = fText->first32();
1659 c = utext_current32(fText);
1660 }
1661 else {
1662 utext_next32(fText);
1663 c = utext_current32(fText);
1664 }
1665 rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);;
1666 }
1667 UTRIE_GET16(&fData->fTrie, c, category);
1668 }
1669
1670 // Loop through the text, looking for ranges of dictionary characters.
1671 // For each span, find the appropriate break engine, and ask it to find
1672 // any breaks within the span.
1673 // Note: we always do this in the forward direction, so that the break
1674 // cache is built in the right order.
1675 if (reverse) {
1676 utext_setNativeIndex(fText, rangeStart);
1677 c = utext_current32(fText);
1678 UTRIE_GET16(&fData->fTrie, c, category);
1679 }
1680 while(U_SUCCESS(status)) {
1681 while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
1682 utext_next32(fText); // TODO: tweak for post-increment operation
1683 c = utext_current32(fText);
1684 UTRIE_GET16(&fData->fTrie, c, category);
1685 }
1686 if (current >= rangeEnd) {
1687 break;
1688 }
1689
1690 // We now have a dictionary character. Get the appropriate language object
1691 // to deal with it.
1692 const LanguageBreakEngine *lbe = getLanguageBreakEngine(c);
1693
1694 // Ask the language object if there are any breaks. It will leave the text
1695 // pointer on the other side of its range, ready to search for the next one.
1696 if (lbe != NULL) {
1697 foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, breaks);
1698 }
1699
1700 // Reload the loop variables for the next go-round
1701 c = utext_current32(fText);
1702 UTRIE_GET16(&fData->fTrie, c, category);
1703 }
1704
1705 // If we found breaks, build a new break cache. The first and last entries must
1706 // be the original starting and ending position.
1707 if (foundBreakCount > 0) {
1708 U_ASSERT(foundBreakCount == breaks.size());
1709 int32_t totalBreaks = foundBreakCount;
1710 if (startPos < breaks.elementAti(0)) {
1711 totalBreaks += 1;
1712 }
1713 if (endPos > breaks.peeki()) {
1714 totalBreaks += 1;
1715 }
1716 fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t));
1717 if (fCachedBreakPositions != NULL) {
1718 int32_t out = 0;
1719 fNumCachedBreakPositions = totalBreaks;
1720 if (startPos < breaks.elementAti(0)) {
1721 fCachedBreakPositions[out++] = startPos;
1722 }
1723 for (int32_t i = 0; i < foundBreakCount; ++i) {
1724 fCachedBreakPositions[out++] = breaks.elementAti(i);
1725 }
1726 if (endPos > fCachedBreakPositions[out-1]) {
1727 fCachedBreakPositions[out] = endPos;
1728 }
1729 // If there are breaks, then by definition, we are replacing the original
1730 // proposed break by one of the breaks we found. Use following() and
1731 // preceding() to do the work. They should never recurse in this case.
1732 if (reverse) {
1733 return preceding(endPos);
1734 }
1735 else {
1736 return following(startPos);
1737 }
1738 }
1739 // If the allocation failed, just fall through to the "no breaks found" case.
1740 }
1741
1742 // If we get here, there were no language-based breaks. Set the text pointer
1743 // to the original proposed break.
1744 utext_setNativeIndex(fText, reverse ? startPos : endPos);
1745 return (reverse ? startPos : endPos);
1746}
1747
1748U_NAMESPACE_END
1749
1750
1751static icu::UStack *gLanguageBreakFactories = NULL;
1752static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER;
1753
1754/**
1755 * Release all static memory held by breakiterator.
1756 */
1757U_CDECL_BEGIN
1758static UBool U_CALLCONV breakiterator_cleanup_dict(void) {
1759 if (gLanguageBreakFactories) {
1760 delete gLanguageBreakFactories;
1761 gLanguageBreakFactories = NULL;
1762 }
1763 gLanguageBreakFactoriesInitOnce.reset();
1764 return TRUE;
1765}
1766U_CDECL_END
1767
1768U_CDECL_BEGIN
1769static void U_CALLCONV _deleteFactory(void *obj) {
1770 delete (icu::LanguageBreakFactory *) obj;
1771}
1772U_CDECL_END
1773U_NAMESPACE_BEGIN
1774
1775static void U_CALLCONV initLanguageFactories() {
1776 UErrorCode status = U_ZERO_ERROR;
1777 U_ASSERT(gLanguageBreakFactories == NULL);
1778 gLanguageBreakFactories = new UStack(_deleteFactory, NULL, status);
1779 if (gLanguageBreakFactories != NULL && U_SUCCESS(status)) {
1780 ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
1781 gLanguageBreakFactories->push(builtIn, status);
1782#ifdef U_LOCAL_SERVICE_HOOK
1783 LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
1784 if (extra != NULL) {
1785 gLanguageBreakFactories->push(extra, status);
1786 }
1787#endif
1788 }
1789 ucln_common_registerCleanup(UCLN_COMMON_RBBI57, breakiterator_cleanup_dict);
1790}
1791
1792
1793static const LanguageBreakEngine*
3d1f044b 1794getLanguageBreakEngineFromFactory(UChar32 c)
0f5d89e8
A
1795{
1796 umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories);
1797 if (gLanguageBreakFactories == NULL) {
1798 return NULL;
1799 }
1800
1801 int32_t i = gLanguageBreakFactories->size();
1802 const LanguageBreakEngine *lbe = NULL;
1803 while (--i >= 0) {
1804 LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
1805 lbe = factory->getEngineFor(c);
1806 if (lbe != NULL) {
1807 break;
1808 }
1809 }
1810 return lbe;
1811}
1812
1813
1814//-------------------------------------------------------------------------------
1815//
1816// getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the
1817// the character c.
1818//
1819//-------------------------------------------------------------------------------
1820const LanguageBreakEngine *
1821RuleBasedBreakIterator57::getLanguageBreakEngine(UChar32 c) {
1822 const LanguageBreakEngine *lbe = NULL;
1823 UErrorCode status = U_ZERO_ERROR;
1824
1825 if (fLanguageBreakEngines == NULL) {
1826 fLanguageBreakEngines = new UStack(status);
1827 if (fLanguageBreakEngines == NULL || U_FAILURE(status)) {
1828 delete fLanguageBreakEngines;
1829 fLanguageBreakEngines = 0;
1830 return NULL;
1831 }
1832 }
1833
1834 int32_t i = fLanguageBreakEngines->size();
1835 while (--i >= 0) {
1836 lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
1837 if (lbe->handles(c)) {
1838 return lbe;
1839 }
1840 }
1841
1842 // No existing dictionary took the character. See if a factory wants to
1843 // give us a new LanguageBreakEngine for this character.
3d1f044b 1844 lbe = getLanguageBreakEngineFromFactory(c);
0f5d89e8
A
1845
1846 // If we got one, use it and push it on our stack.
1847 if (lbe != NULL) {
1848 fLanguageBreakEngines->push((void *)lbe, status);
1849 // Even if we can't remember it, we can keep looking it up, so
1850 // return it even if the push fails.
1851 return lbe;
1852 }
1853
1854 // No engine is forthcoming for this character. Add it to the
1855 // reject set. Create the reject break engine if needed.
1856 if (fUnhandledBreakEngine == NULL) {
1857 fUnhandledBreakEngine = new UnhandledEngine(status);
1858 if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) {
1859 status = U_MEMORY_ALLOCATION_ERROR;
1860 }
1861 // Put it last so that scripts for which we have an engine get tried
1862 // first.
1863 fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);
1864 // If we can't insert it, or creation failed, get rid of it
1865 if (U_FAILURE(status)) {
1866 delete fUnhandledBreakEngine;
1867 fUnhandledBreakEngine = 0;
1868 return NULL;
1869 }
1870 }
1871
1872 // Tell the reject engine about the character; at its discretion, it may
1873 // add more than just the one character.
1874 fUnhandledBreakEngine->handleCharacter(c);
1875
1876 return fUnhandledBreakEngine;
1877}
1878
1879void RuleBasedBreakIterator57::setBreakType(int32_t type) {
1880 fBreakType = type;
1881 reset();
1882}
1883
1884U_NAMESPACE_END
1885
1886#endif /* #if !UCONFIG_NO_BREAK_ITERATION */