]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/rbbi.cpp
ICU-64260.0.1.tar.gz
[apple/icu.git] / icuSources / common / rbbi.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ***************************************************************************
5 * Copyright (C) 1999-2016 International Business Machines Corporation
6 * and others. All rights reserved.
7 ***************************************************************************
8 */
9 //
10 // file: rbbi.cpp Contains the implementation of the rule based break iterator
11 // runtime engine and the API implementation for
12 // class RuleBasedBreakIterator
13 //
14
15 #include "utypeinfo.h" // for 'typeid' to work
16
17 #include "unicode/utypes.h"
18
19 #if !UCONFIG_NO_BREAK_ITERATION
20
21 #include <cinttypes>
22
23 #include "unicode/rbbi.h"
24 #include "unicode/schriter.h"
25 #include "unicode/uchriter.h"
26 #include "unicode/uclean.h"
27 #include "unicode/udata.h"
28
29 #include "brkeng.h"
30 #include "ucln_cmn.h"
31 #include "cmemory.h"
32 #include "cstring.h"
33 #include "localsvc.h"
34 #include "rbbidata.h"
35 #include "rbbi_cache.h"
36 #include "rbbirb.h"
37 #include "uassert.h"
38 #include "umutex.h"
39 #include "uvectr32.h"
40
#ifdef RBBI_DEBUG
// Debug builds only. When TRUE, the state machine functions in this file print
// a trace of every step they take. init() switches it on when the U_RBBIDEBUG
// environment variable contains the word "trace".
static UBool gTrace = FALSE;
#endif
44
45 U_NAMESPACE_BEGIN
46
// Index of the state table row in which every run of the state machine begins.
constexpr int32_t START_STATE{1};

// Next-state value meaning "stop": no further transition is possible.
constexpr int32_t STOP_STATE{0};
52
53
// Supplies the run-time type identification implementation for this class.
// NOTE(review): the exact members generated are defined by the macro, which is
// declared outside this file - confirm against its definition.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator)
55
56
57 //=======================================================================
58 // constructors
59 //=======================================================================
60
61 /**
62 * Constructs a RuleBasedBreakIterator that uses the already-created
63 * tables object that is passed in as a parameter.
64 */
65 RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status)
66 : fSCharIter(UnicodeString())
67 {
68 init(status);
69 fData = new RBBIDataWrapper(data, status); // status checked in constructor
70 if (U_FAILURE(status)) {return;}
71 if(fData == 0) {
72 status = U_MEMORY_ALLOCATION_ERROR;
73 return;
74 }
75 }
76
77 //
78 // Construct from precompiled binary rules (tables). This constructor is public API,
79 // taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules().
80 //
81 RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules,
82 uint32_t ruleLength,
83 UErrorCode &status)
84 : fSCharIter(UnicodeString())
85 {
86 init(status);
87 if (U_FAILURE(status)) {
88 return;
89 }
90 if (compiledRules == NULL || ruleLength < sizeof(RBBIDataHeader)) {
91 status = U_ILLEGAL_ARGUMENT_ERROR;
92 return;
93 }
94 const RBBIDataHeader *data = (const RBBIDataHeader *)compiledRules;
95 if (data->fLength > ruleLength) {
96 status = U_ILLEGAL_ARGUMENT_ERROR;
97 return;
98 }
99 fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status);
100 if (U_FAILURE(status)) {return;}
101 if(fData == 0) {
102 status = U_MEMORY_ALLOCATION_ERROR;
103 return;
104 }
105 }
106
107
108 //-------------------------------------------------------------------------------
109 //
110 // Constructor from a UDataMemory handle to precompiled break rules
111 // stored in an ICU data file.
112 //
113 //-------------------------------------------------------------------------------
114 RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status)
115 : fSCharIter(UnicodeString())
116 {
117 init(status);
118 fData = new RBBIDataWrapper(udm, status); // status checked in constructor
119 if (U_FAILURE(status)) {return;}
120 if(fData == 0) {
121 status = U_MEMORY_ALLOCATION_ERROR;
122 return;
123 }
124 }
125
126
127
128 //-------------------------------------------------------------------------------
129 //
130 // Constructor from a set of rules supplied as a string.
131 //
132 //-------------------------------------------------------------------------------
133 RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules,
134 UParseError &parseError,
135 UErrorCode &status)
136 : fSCharIter(UnicodeString())
137 {
138 init(status);
139 if (U_FAILURE(status)) {return;}
140 RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
141 RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status);
142 // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that
143 // creates and returns a complete RBBI. From here, in a constructor, we
144 // can't just return the object created by the builder factory, hence
145 // the assignment of the factory created object to "this".
146 if (U_SUCCESS(status)) {
147 *this = *bi;
148 delete bi;
149 }
150 }
151
152
153 //-------------------------------------------------------------------------------
154 //
155 // Default Constructor. Create an empty shell that can be set up later.
156 // Used when creating a RuleBasedBreakIterator from a set
157 // of rules.
158 //-------------------------------------------------------------------------------
159 RuleBasedBreakIterator::RuleBasedBreakIterator()
160 : fSCharIter(UnicodeString())
161 {
162 UErrorCode status = U_ZERO_ERROR;
163 init(status);
164 }
165
166
167 //-------------------------------------------------------------------------------
168 //
169 // Copy constructor. Will produce a break iterator with the same behavior,
170 // and which iterates over the same text, as the one passed in.
171 //
172 //-------------------------------------------------------------------------------
173 RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other)
174 : BreakIterator(other),
175 fSCharIter(UnicodeString())
176 {
177 UErrorCode status = U_ZERO_ERROR;
178 this->init(status);
179 *this = other;
180 }
181
182
183 /**
184 * Destructor
185 */
186 RuleBasedBreakIterator::~RuleBasedBreakIterator() {
187 if (fCharIter != &fSCharIter) {
188 // fCharIter was adopted from the outside.
189 delete fCharIter;
190 }
191 fCharIter = NULL;
192
193 utext_close(&fText);
194
195 if (fData != NULL) {
196 fData->removeReference();
197 fData = NULL;
198 }
199 delete fBreakCache;
200 fBreakCache = NULL;
201
202 delete fDictionaryCache;
203 fDictionaryCache = NULL;
204
205 delete fLanguageBreakEngines;
206 fLanguageBreakEngines = NULL;
207
208 delete fUnhandledBreakEngine;
209 fUnhandledBreakEngine = NULL;
210
211 delete [] fLatin1Cat;
212 fLatin1Cat = NULL;
213 }
214
215 /**
216 * Assignment operator. Sets this iterator to have the same behavior,
217 * and iterate over the same text, as the one passed in.
218 */
219 RuleBasedBreakIterator&
220 RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
221 if (this == &that) {
222 return *this;
223 }
224 BreakIterator::operator=(that);
225 fLineWordOpts = that.fLineWordOpts;
226
227 if (fLanguageBreakEngines != NULL) {
228 delete fLanguageBreakEngines;
229 fLanguageBreakEngines = NULL; // Just rebuild for now
230 }
231 // TODO: clone fLanguageBreakEngines from "that"
232 UErrorCode status = U_ZERO_ERROR;
233 utext_clone(&fText, &that.fText, FALSE, TRUE, &status);
234
235 if (fCharIter != &fSCharIter) {
236 delete fCharIter;
237 }
238 fCharIter = &fSCharIter;
239
240 if (that.fCharIter != NULL && that.fCharIter != &that.fSCharIter) {
241 // This is a little bit tricky - it will intially appear that
242 // this->fCharIter is adopted, even if that->fCharIter was
243 // not adopted. That's ok.
244 fCharIter = that.fCharIter->clone();
245 }
246 fSCharIter = that.fSCharIter;
247 if (fCharIter == NULL) {
248 fCharIter = &fSCharIter;
249 }
250
251 if (fData != NULL) {
252 fData->removeReference();
253 fData = NULL;
254 }
255 if (that.fData != NULL) {
256 fData = that.fData->addReference();
257 }
258
259 delete [] fLatin1Cat;
260 fLatin1Cat = NULL;
261
262 fPosition = that.fPosition;
263 fRuleStatusIndex = that.fRuleStatusIndex;
264 fDone = that.fDone;
265
266 // TODO: both the dictionary and the main cache need to be copied.
267 // Current position could be within a dictionary range. Trying to continue
268 // the iteration without the caches present would go to the rules, with
269 // the assumption that the current position is on a rule boundary.
270 fBreakCache->reset(fPosition, fRuleStatusIndex);
271 fDictionaryCache->reset();
272
273 return *this;
274 }
275
276
277
278 //-----------------------------------------------------------------------------
279 //
280 // init() Shared initialization routine. Used by all the constructors.
281 // Initializes all fields, leaving the object in a consistent state.
282 //
283 //-----------------------------------------------------------------------------
284 void RuleBasedBreakIterator::init(UErrorCode &status) {
285 fCharIter = NULL;
286 fData = NULL;
287 fLatin1Cat = NULL;
288 fPosition = 0;
289 fRuleStatusIndex = 0;
290 fDone = false;
291 fDictionaryCharCount = 0;
292 fLanguageBreakEngines = NULL;
293 fUnhandledBreakEngine = NULL;
294 fBreakCache = NULL;
295 fDictionaryCache = NULL;
296
297 // Note: IBM xlC is unable to assign or initialize member fText from UTEXT_INITIALIZER.
298 // fText = UTEXT_INITIALIZER;
299 static const UText initializedUText = UTEXT_INITIALIZER;
300 uprv_memcpy(&fText, &initializedUText, sizeof(UText));
301
302 if (U_FAILURE(status)) {
303 return;
304 }
305
306 utext_openUChars(&fText, NULL, 0, &status);
307 fDictionaryCache = new DictionaryCache(this, status);
308 fBreakCache = new BreakCache(this, status);
309 if (U_SUCCESS(status) && (fDictionaryCache == NULL || fBreakCache == NULL)) {
310 status = U_MEMORY_ALLOCATION_ERROR;
311 }
312
313 #ifdef RBBI_DEBUG
314 static UBool debugInitDone = FALSE;
315 if (debugInitDone == FALSE) {
316 char *debugEnv = getenv("U_RBBIDEBUG");
317 if (debugEnv && uprv_strstr(debugEnv, "trace")) {
318 gTrace = TRUE;
319 }
320 debugInitDone = TRUE;
321 }
322 #endif
323 }
324
325
326 void RuleBasedBreakIterator::initLatin1Cat(void) {
327 fLatin1Cat = new uint16_t[256];
328 for (UChar32 c = 0; c < 256; ++c) {
329 fLatin1Cat[c] = UTRIE2_GET16(fData->fTrie, c);
330 }
331 }
332
333 //-----------------------------------------------------------------------------
334 //
335 // clone - Returns a newly-constructed RuleBasedBreakIterator with the same
336 // behavior, and iterating over the same text, as this one.
337 // Virtual function: does the right thing with subclasses.
338 //
339 //-----------------------------------------------------------------------------
340 BreakIterator*
341 RuleBasedBreakIterator::clone(void) const {
342 return new RuleBasedBreakIterator(*this);
343 }
344
345 /**
346 * Equality operator. Returns TRUE if both BreakIterators are of the
347 * same class, have the same behavior, and iterate over the same text.
348 */
349 UBool
350 RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
351 if (typeid(*this) != typeid(that)) {
352 return FALSE;
353 }
354 if (this == &that) {
355 return TRUE;
356 }
357
358 // The base class BreakIterator carries no state that participates in equality,
359 // and does not implement an equality function that would otherwise be
360 // checked at this point.
361
362 const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;
363 if (that2.fLineWordOpts != fLineWordOpts) {
364 return FALSE;
365 }
366
367 if (!utext_equals(&fText, &that2.fText)) {
368 // The two break iterators are operating on different text,
369 // or have a different iteration position.
370 // Note that fText's position is always the same as the break iterator's position.
371 return FALSE;
372 };
373
374 if (!(fPosition == that2.fPosition &&
375 fRuleStatusIndex == that2.fRuleStatusIndex &&
376 fDone == that2.fDone)) {
377 return FALSE;
378 }
379
380 if (that2.fData == fData ||
381 (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) {
382 // The two break iterators are using the same rules.
383 return TRUE;
384 }
385 return FALSE;
386 }
387
388 /**
389 * Compute a hash code for this BreakIterator
390 * @return A hash code
391 */
392 int32_t
393 RuleBasedBreakIterator::hashCode(void) const {
394 int32_t hash = 0;
395 if (fData != NULL) {
396 hash = fData->hashCode();
397 }
398 return hash;
399 }
400
401
402 void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
403 if (U_FAILURE(status)) {
404 return;
405 }
406 fBreakCache->reset();
407 fDictionaryCache->reset();
408 utext_clone(&fText, ut, FALSE, TRUE, &status);
409
410 // Set up a dummy CharacterIterator to be returned if anyone
411 // calls getText(). With input from UText, there is no reasonable
412 // way to return a characterIterator over the actual input text.
413 // Return one over an empty string instead - this is the closest
414 // we can come to signaling a failure.
415 // (GetText() is obsolete, this failure is sort of OK)
416 fSCharIter.setText(UnicodeString());
417
418 if (fCharIter != &fSCharIter) {
419 // existing fCharIter was adopted from the outside. Delete it now.
420 delete fCharIter;
421 }
422 fCharIter = &fSCharIter;
423
424 this->first();
425 }
426
427
428 UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
429 UText *result = utext_clone(fillIn, &fText, FALSE, TRUE, &status);
430 return result;
431 }
432
433
434 //=======================================================================
435 // BreakIterator overrides
436 //=======================================================================
437
438 /**
439 * Return a CharacterIterator over the text being analyzed.
440 */
441 CharacterIterator&
442 RuleBasedBreakIterator::getText() const {
443 return *fCharIter;
444 }
445
446 /**
447 * Set the iterator to analyze a new piece of text. This function resets
448 * the current iteration position to the beginning of the text.
449 * @param newText An iterator over the text to analyze.
450 */
451 void
452 RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
453 // If we are holding a CharacterIterator adopted from a
454 // previous call to this function, delete it now.
455 if (fCharIter != &fSCharIter) {
456 delete fCharIter;
457 }
458
459 fCharIter = newText;
460 UErrorCode status = U_ZERO_ERROR;
461 fBreakCache->reset();
462 fDictionaryCache->reset();
463 if (newText==NULL || newText->startIndex() != 0) {
464 // startIndex !=0 wants to be an error, but there's no way to report it.
465 // Make the iterator text be an empty string.
466 utext_openUChars(&fText, NULL, 0, &status);
467 } else {
468 utext_openCharacterIterator(&fText, newText, &status);
469 }
470 this->first();
471 }
472
473 /**
474 * Set the iterator to analyze a new piece of text. This function resets
475 * the current iteration position to the beginning of the text.
476 * @param newText An iterator over the text to analyze.
477 */
478 void
479 RuleBasedBreakIterator::setText(const UnicodeString& newText) {
480 UErrorCode status = U_ZERO_ERROR;
481 fBreakCache->reset();
482 fDictionaryCache->reset();
483 utext_openConstUnicodeString(&fText, &newText, &status);
484
485 // Set up a character iterator on the string.
486 // Needed in case someone calls getText().
487 // Can not, unfortunately, do this lazily on the (probably never)
488 // call to getText(), because getText is const.
489 fSCharIter.setText(newText);
490
491 if (fCharIter != &fSCharIter) {
492 // old fCharIter was adopted from the outside. Delete it.
493 delete fCharIter;
494 }
495 fCharIter = &fSCharIter;
496
497 this->first();
498 }
499
500
501 /**
502 * Provide a new UText for the input text. Must reference text with contents identical
503 * to the original.
504 * Intended for use with text data originating in Java (garbage collected) environments
505 * where the data may be moved in memory at arbitrary times.
506 */
507 RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) {
508 if (U_FAILURE(status)) {
509 return *this;
510 }
511 if (input == NULL) {
512 status = U_ILLEGAL_ARGUMENT_ERROR;
513 return *this;
514 }
515 int64_t pos = utext_getNativeIndex(&fText);
516 // Shallow read-only clone of the new UText into the existing input UText
517 utext_clone(&fText, input, FALSE, TRUE, &status);
518 if (U_FAILURE(status)) {
519 return *this;
520 }
521 utext_setNativeIndex(&fText, pos);
522 if (utext_getNativeIndex(&fText) != pos) {
523 // Sanity check. The new input utext is supposed to have the exact same
524 // contents as the old. If we can't set to the same position, it doesn't.
525 // The contents underlying the old utext might be invalid at this point,
526 // so it's not safe to check directly.
527 status = U_ILLEGAL_ARGUMENT_ERROR;
528 }
529 return *this;
530 }
531
532
533 /**
534 * Sets the current iteration position to the beginning of the text, position zero.
535 * @return The new iterator position, which is zero.
536 */
537 int32_t RuleBasedBreakIterator::first(void) {
538 UErrorCode status = U_ZERO_ERROR;
539 if (!fBreakCache->seek(0)) {
540 fBreakCache->populateNear(0, status);
541 }
542 fBreakCache->current();
543 U_ASSERT(fPosition == 0);
544 return 0;
545 }
546
547 /**
548 * Sets the current iteration position to the end of the text.
549 * @return The text's past-the-end offset.
550 */
551 int32_t RuleBasedBreakIterator::last(void) {
552 int32_t endPos = (int32_t)utext_nativeLength(&fText);
553 UBool endShouldBeBoundary = isBoundary(endPos); // Has side effect of setting iterator position.
554 (void)endShouldBeBoundary;
555 U_ASSERT(endShouldBeBoundary);
556 U_ASSERT(fPosition == endPos);
557 return endPos;
558 }
559
560 /**
561 * Advances the iterator either forward or backward the specified number of steps.
562 * Negative values move backward, and positive values move forward. This is
563 * equivalent to repeatedly calling next() or previous().
564 * @param n The number of steps to move. The sign indicates the direction
565 * (negative is backwards, and positive is forwards).
566 * @return The character offset of the boundary position n boundaries away from
567 * the current one.
568 */
569 int32_t RuleBasedBreakIterator::next(int32_t n) {
570 int32_t result = 0;
571 if (n > 0) {
572 for (; n > 0 && result != UBRK_DONE; --n) {
573 result = next();
574 }
575 } else if (n < 0) {
576 for (; n < 0 && result != UBRK_DONE; ++n) {
577 result = previous();
578 }
579 } else {
580 result = current();
581 }
582 return result;
583 }
584
585 /**
586 * Advances the iterator to the next boundary position.
587 * @return The position of the first boundary after this one.
588 */
589 int32_t RuleBasedBreakIterator::next(void) {
590 fBreakCache->next();
591 return fDone ? UBRK_DONE : fPosition;
592 }
593
594 /**
595 * Move the iterator backwards, to the boundary preceding the current one.
596 *
597 * Starts from the current position within fText.
598 * Starting position need not be on a boundary.
599 *
600 * @return The position of the boundary position immediately preceding the starting position.
601 */
602 int32_t RuleBasedBreakIterator::previous(void) {
603 UErrorCode status = U_ZERO_ERROR;
604 fBreakCache->previous(status);
605 return fDone ? UBRK_DONE : fPosition;
606 }
607
608 /**
609 * Sets the iterator to refer to the first boundary position following
610 * the specified position.
611 * @param startPos The position from which to begin searching for a break position.
612 * @return The position of the first break after the current position.
613 */
614 int32_t RuleBasedBreakIterator::following(int32_t startPos) {
615 // if the supplied position is before the beginning, return the
616 // text's starting offset
617 if (startPos < 0) {
618 return first();
619 }
620
621 // Move requested offset to a code point start. It might be on a trail surrogate,
622 // or on a trail byte if the input is UTF-8. Or it may be beyond the end of the text.
623 utext_setNativeIndex(&fText, startPos);
624 startPos = (int32_t)utext_getNativeIndex(&fText);
625
626 UErrorCode status = U_ZERO_ERROR;
627 fBreakCache->following(startPos, status);
628 return fDone ? UBRK_DONE : fPosition;
629 }
630
631 /**
632 * Sets the iterator to refer to the last boundary position before the
633 * specified position.
634 * @param offset The position to begin searching for a break from.
635 * @return The position of the last boundary before the starting position.
636 */
637 int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
638 if (offset > utext_nativeLength(&fText)) {
639 return last();
640 }
641
642 // Move requested offset to a code point start. It might be on a trail surrogate,
643 // or on a trail byte if the input is UTF-8.
644
645 utext_setNativeIndex(&fText, offset);
646 int32_t adjustedOffset = static_cast<int32_t>(utext_getNativeIndex(&fText));
647
648 UErrorCode status = U_ZERO_ERROR;
649 fBreakCache->preceding(adjustedOffset, status);
650 return fDone ? UBRK_DONE : fPosition;
651 }
652
653 /**
654 * Returns true if the specfied position is a boundary position. As a side
655 * effect, leaves the iterator pointing to the first boundary position at
656 * or after "offset".
657 *
658 * @param offset the offset to check.
659 * @return True if "offset" is a boundary position.
660 */
661 UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
662 // out-of-range indexes are never boundary positions
663 if (offset < 0) {
664 first(); // For side effects on current position, tag values.
665 return FALSE;
666 }
667
668 // Adjust offset to be on a code point boundary and not beyond the end of the text.
669 // Note that isBoundary() is always false for offsets that are not on code point boundaries.
670 // But we still need the side effect of leaving iteration at the following boundary.
671
672 utext_setNativeIndex(&fText, offset);
673 int32_t adjustedOffset = static_cast<int32_t>(utext_getNativeIndex(&fText));
674
675 bool result = false;
676 UErrorCode status = U_ZERO_ERROR;
677 if (fBreakCache->seek(adjustedOffset) || fBreakCache->populateNear(adjustedOffset, status)) {
678 result = (fBreakCache->current() == offset);
679 }
680
681 if (result && adjustedOffset < offset && utext_char32At(&fText, offset) == U_SENTINEL) {
682 // Original offset is beyond the end of the text. Return FALSE, it's not a boundary,
683 // but the iteration position remains set to the end of the text, which is a boundary.
684 return FALSE;
685 }
686 if (!result) {
687 // Not on a boundary. isBoundary() must leave iterator on the following boundary.
688 // Cache->seek(), above, left us on the preceding boundary, so advance one.
689 next();
690 }
691 return result;
692 }
693
694
695 /**
696 * Returns the current iteration position.
697 * @return The current iteration position.
698 */
699 int32_t RuleBasedBreakIterator::current(void) const {
700 return fPosition;
701 }
702
703
704 //=======================================================================
705 // implementation
706 //=======================================================================
707
//
//  RBBIRunMode - the state machine runs one extra iteration at the beginning
//                and one at the end of user text. A variable of this enum type
//                records which phase is active. User input is fetched only
//                while in the RUN mode.
//
enum RBBIRunMode {
    RBBI_START = 0,   // processing is before the first char of input
    RBBI_RUN   = 1,   // processing is inside the user text
    RBBI_END   = 2    // processing is after the end of the user text
};
718
719
720 // Map from look-ahead break states (corresponds to rules) to boundary positions.
721 // Allows multiple lookahead break rules to be in flight at the same time.
722 //
723 // This is a temporary approach for ICU 57. A better fix is to make the look-ahead numbers
724 // in the state table be sequential, then we can just index an array. And the
725 // table could also tell us in advance how big that array needs to be.
726 //
727 // Before ICU 57 there was just a single simple variable for a look-ahead match that
728 // was in progress. Two rules at once did not work.
729
730 static const int32_t kMaxLookaheads = 8;
731 struct LookAheadResults {
732 int32_t fUsedSlotLimit;
733 int32_t fPositions[8];
734 int16_t fKeys[8];
735
736 LookAheadResults() : fUsedSlotLimit(0), fPositions(), fKeys() {}
737
738 int32_t getPosition(int16_t key) {
739 for (int32_t i=0; i<fUsedSlotLimit; ++i) {
740 if (fKeys[i] == key) {
741 return fPositions[i];
742 }
743 }
744 // with NLLT source rules, Latn sample and ubrk_next, we see a request for key 79 here
745 // near the end of text, when setPosition has only ever set positions for key 80 or 82.
746 //UPRV_UNREACHABLE;
747 return -1;
748 }
749
750 void setPosition(int16_t key, int32_t position) {
751 int32_t i;
752 for (i=0; i<fUsedSlotLimit; ++i) {
753 if (fKeys[i] == key) {
754 fPositions[i] = position;
755 return;
756 }
757 }
758 if (i >= kMaxLookaheads) {
759 UPRV_UNREACHABLE;
760 i = kMaxLookaheads - 1; // Apple addition
761 }
762 fKeys[i] = key;
763 fPositions[i] = position;
764 U_ASSERT(fUsedSlotLimit == i);
765 fUsedSlotLimit = i + 1;
766 }
767 };
768
769
770 //-----------------------------------------------------------------------------------
771 //
772 // handleNext()
773 // Run the state machine to find a boundary
774 //
775 //-----------------------------------------------------------------------------------
776 // Route handleNext calls through the following to handleNextInternal,
777 // in order to handle fLineWordOpts.
778 int32_t RuleBasedBreakIterator::handleNext() {
779 int32_t result = handleNextInternal();
780 while (fLineWordOpts != UBRK_LINEWORD_NORMAL) {
781 UChar32 prevChr = utext_char32At(&fText, result-1);
782 UChar32 currChr = utext_char32At(&fText, result);
783 if (currChr == U_SENTINEL || prevChr == U_SENTINEL) {
784 break;
785 }
786 if (fLineWordOpts == UBRK_LINEWORD_KEEP_HANGUL) {
787 UErrorCode status = U_ZERO_ERROR;
788 if (uscript_getScript(currChr, &status) != USCRIPT_HANGUL || uscript_getScript(prevChr, &status) != USCRIPT_HANGUL) {
789 break;
790 }
791 } else {
792 if (!u_isalpha(currChr) || !u_isalpha(prevChr)) {
793 break;
794 }
795 }
796 int32_t nextResult = handleNextInternal();
797 if (nextResult <= result) {
798 break;
799 }
800 result = nextResult;
801 }
802 return result;
803 }
804
805 int32_t RuleBasedBreakIterator::handleNextInternal() {
806 int32_t state;
807 uint16_t category = 0;
808 RBBIRunMode mode;
809
810 RBBIStateTableRow *row;
811 UChar32 c;
812 LookAheadResults lookAheadMatches;
813 int32_t result = 0;
814 int32_t initialPosition = 0;
815 const RBBIStateTable *statetable = fData->fForwardTable;
816 const char *tableData = statetable->fTableData;
817 uint32_t tableRowLen = statetable->fRowLen;
818 #ifdef RBBI_DEBUG
819 if (gTrace) {
820 RBBIDebugPuts("Handle Next pos char state category");
821 }
822 #endif
823
824 // handleNext alway sets the break tag value.
825 // Set the default for it.
826 fRuleStatusIndex = 0;
827
828 fDictionaryCharCount = 0;
829
830 // if we're already at the end of the text, return DONE.
831 initialPosition = fPosition;
832 UTEXT_SETNATIVEINDEX(&fText, initialPosition);
833 result = initialPosition;
834 c = UTEXT_NEXT32(&fText);
835 if (c==U_SENTINEL) {
836 fDone = TRUE;
837 return UBRK_DONE;
838 }
839
840 // Set the initial state for the state machine
841 state = START_STATE;
842 row = (RBBIStateTableRow *)
843 //(statetable->fTableData + (statetable->fRowLen * state));
844 (tableData + tableRowLen * state);
845
846
847 mode = RBBI_RUN;
848 if (statetable->fFlags & RBBI_BOF_REQUIRED) {
849 category = 2;
850 mode = RBBI_START;
851 }
852
853
854 // loop until we reach the end of the text or transition to state 0
855 //
856 for (;;) {
857 if (c == U_SENTINEL) {
858 // Reached end of input string.
859 if (mode == RBBI_END) {
860 // We have already run the loop one last time with the
861 // character set to the psueudo {eof} value. Now it is time
862 // to unconditionally bail out.
863 break;
864 }
865 // Run the loop one last time with the fake end-of-input character category.
866 mode = RBBI_END;
867 category = 1;
868 }
869
870 //
871 // Get the char category. An incoming category of 1 or 2 means that
872 // we are preset for doing the beginning or end of input, and
873 // that we shouldn't get a category from an actual text input character.
874 //
875 if (mode == RBBI_RUN) {
876 // look up the current character's character category, which tells us
877 // which column in the state table to look at.
878 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
879 // not the size of the character going in, which is a UChar32.
880 //
881 category = (fLatin1Cat!=NULL && c<0x100)? fLatin1Cat[c]: UTRIE2_GET16(fData->fTrie, c);
882
883 // Check the dictionary bit in the character's category.
884 // Counter is only used by dictionary based iteration.
885 // Chars that need to be handled by a dictionary have a flag bit set
886 // in their category values.
887 //
888 if ((category & 0x4000) != 0) {
889 fDictionaryCharCount++;
890 // And off the dictionary flag bit.
891 category &= ~0x4000;
892 }
893 }
894
895 #ifdef RBBI_DEBUG
896 if (gTrace) {
897 RBBIDebugPrintf(" %4" PRId64 " ", utext_getNativeIndex(&fText));
898 if (0x20<=c && c<0x7f) {
899 RBBIDebugPrintf("\"%c\" ", c);
900 } else {
901 RBBIDebugPrintf("%5x ", c);
902 }
903 RBBIDebugPrintf("%3d %3d\n", state, category);
904 }
905 #endif
906
907 // State Transition - move machine to its next state
908 //
909
910 // fNextState is a variable-length array.
911 U_ASSERT(category<fData->fHeader->fCatCount);
912 state = row->fNextState[category]; /*Not accessing beyond memory*/
913 row = (RBBIStateTableRow *)
914 // (statetable->fTableData + (statetable->fRowLen * state));
915 (tableData + tableRowLen * state);
916
917
918 if (row->fAccepting == -1) {
919 // Match found, common case.
920 if (mode != RBBI_START) {
921 result = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
922 }
923 fRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values.
924 }
925
926 int16_t completedRule = row->fAccepting;
927 if (completedRule > 0) {
928 // Lookahead match is completed.
929 int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule);
930 if (lookaheadResult >= 0) {
931 fRuleStatusIndex = row->fTagIdx;
932 fPosition = lookaheadResult;
933 return lookaheadResult;
934 }
935 }
936 int16_t rule = row->fLookAhead;
937 if (rule != 0) {
938 // At the position of a '/' in a look-ahead match. Record it.
939 int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
940 lookAheadMatches.setPosition(rule, pos);
941 }
942
943 if (state == STOP_STATE) {
944 // This is the normal exit from the lookup state machine.
945 // We have advanced through the string until it is certain that no
946 // longer match is possible, no matter what characters follow.
947 break;
948 }
949
950 // Advance to the next character.
951 // If this is a beginning-of-input loop iteration, don't advance
952 // the input position. The next iteration will be processing the
953 // first real input character.
954 if (mode == RBBI_RUN) {
955 c = UTEXT_NEXT32(&fText);
956 } else {
957 if (mode == RBBI_START) {
958 mode = RBBI_RUN;
959 }
960 }
961 }
962
963 // The state machine is done. Check whether it found a match...
964
965 // If the iterator failed to advance in the match engine, force it ahead by one.
966 // (This really indicates a defect in the break rules. They should always match
967 // at least one character.)
968 if (result == initialPosition) {
969 utext_setNativeIndex(&fText, initialPosition);
970 utext_next32(&fText);
971 result = (int32_t)utext_getNativeIndex(&fText);
972 fRuleStatusIndex = 0;
973 }
974
975 // Leave the iterator at our result position.
976 fPosition = result;
977 #ifdef RBBI_DEBUG
978 if (gTrace) {
979 RBBIDebugPrintf("result = %d\n\n", result);
980 }
981 #endif
982 return result;
983 }
984
985
986 //-----------------------------------------------------------------------------------
987 //
988 // handleSafePrevious()
989 //
990 // Iterate backwards using the safe reverse rules.
991 // The logic of this function is similar to handleNext(), but simpler
992 // because the safe table does not require as many options.
993 //
994 //-----------------------------------------------------------------------------------
995 int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
996 int32_t state;
997 uint16_t category = 0;
998 RBBIStateTableRow *row;
999 UChar32 c;
1000 int32_t result = 0;
1001
1002 const RBBIStateTable *stateTable = fData->fReverseTable;
1003 UTEXT_SETNATIVEINDEX(&fText, fromPosition);
1004 #ifdef RBBI_DEBUG
1005 if (gTrace) {
1006 RBBIDebugPuts("Handle Previous pos char state category");
1007 }
1008 #endif
1009
1010 // if we're already at the start of the text, return DONE.
1011 if (fData == NULL || UTEXT_GETNATIVEINDEX(&fText)==0) {
1012 return BreakIterator::DONE;
1013 }
1014
1015 // Set the initial state for the state machine
1016 c = UTEXT_PREVIOUS32(&fText);
1017 state = START_STATE;
1018 row = (RBBIStateTableRow *)
1019 (stateTable->fTableData + (stateTable->fRowLen * state));
1020
1021 // loop until we reach the start of the text or transition to state 0
1022 //
1023 for (; c != U_SENTINEL; c = UTEXT_PREVIOUS32(&fText)) {
1024
1025 // look up the current character's character category, which tells us
1026 // which column in the state table to look at.
1027 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
1028 // not the size of the character going in, which is a UChar32.
1029 //
1030 // And off the dictionary flag bit. For reverse iteration it is not used.
1031 category = UTRIE2_GET16(fData->fTrie, c);
1032 category &= ~0x4000;
1033
1034 #ifdef RBBI_DEBUG
1035 if (gTrace) {
1036 RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(&fText));
1037 if (0x20<=c && c<0x7f) {
1038 RBBIDebugPrintf("\"%c\" ", c);
1039 } else {
1040 RBBIDebugPrintf("%5x ", c);
1041 }
1042 RBBIDebugPrintf("%3d %3d\n", state, category);
1043 }
1044 #endif
1045
1046 // State Transition - move machine to its next state
1047 //
1048 // fNextState is a variable-length array.
1049 U_ASSERT(category<fData->fHeader->fCatCount);
1050 state = row->fNextState[category]; /*Not accessing beyond memory*/
1051 row = (RBBIStateTableRow *)
1052 (stateTable->fTableData + (stateTable->fRowLen * state));
1053
1054 if (state == STOP_STATE) {
1055 // This is the normal exit from the lookup state machine.
1056 // Transistion to state zero means we have found a safe point.
1057 break;
1058 }
1059 }
1060
1061 // The state machine is done. Check whether it found a match...
1062 result = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
1063 #ifdef RBBI_DEBUG
1064 if (gTrace) {
1065 RBBIDebugPrintf("result = %d\n\n", result);
1066 }
1067 #endif
1068 return result;
1069 }
1070
1071 //-------------------------------------------------------------------------------
1072 //
1073 // getRuleStatus() Return the break rule tag associated with the current
1074 // iterator position. If the iterator arrived at its current
1075 // position by iterating forwards, the value will have been
1076 // cached by the handleNext() function.
1077 //
1078 //-------------------------------------------------------------------------------
1079
1080 int32_t RuleBasedBreakIterator::getRuleStatus() const {
1081
1082 // fLastRuleStatusIndex indexes to the start of the appropriate status record
1083 // (the number of status values.)
1084 // This function returns the last (largest) of the array of status values.
1085 int32_t idx = fRuleStatusIndex + fData->fRuleStatusTable[fRuleStatusIndex];
1086 int32_t tagVal = fData->fRuleStatusTable[idx];
1087
1088 return tagVal;
1089 }
1090
1091
1092 int32_t RuleBasedBreakIterator::getRuleStatusVec(
1093 int32_t *fillInVec, int32_t capacity, UErrorCode &status) {
1094 if (U_FAILURE(status)) {
1095 return 0;
1096 }
1097
1098 int32_t numVals = fData->fRuleStatusTable[fRuleStatusIndex];
1099 int32_t numValsToCopy = numVals;
1100 if (numVals > capacity) {
1101 status = U_BUFFER_OVERFLOW_ERROR;
1102 numValsToCopy = capacity;
1103 }
1104 int i;
1105 for (i=0; i<numValsToCopy; i++) {
1106 fillInVec[i] = fData->fRuleStatusTable[fRuleStatusIndex + i + 1];
1107 }
1108 return numVals;
1109 }
1110
1111 // Apple custom addition
1112 int32_t RuleBasedBreakIterator::tokenize(int32_t maxTokens, RuleBasedTokenRange *outTokenRanges, unsigned long *outTokenFlags)
1113 {
1114 if (fDone) {
1115 return 0;
1116 }
1117 RuleBasedTokenRange *outTokenLimit = outTokenRanges + maxTokens;
1118 RuleBasedTokenRange *outTokenP = outTokenRanges;
1119 int32_t lastOffset = fPosition;
1120 while (outTokenP < outTokenLimit) {
1121 // start portion from inlining populateFollowing()
1122 int32_t pos = 0;
1123 int32_t ruleStatusIdx = 0;
1124 int32_t startPos = fPosition;
1125
1126 if (fDictionaryCache->following(startPos, &pos, &ruleStatusIdx)) {
1127 fPosition = pos;
1128 fRuleStatusIndex = ruleStatusIdx;
1129 } else {
1130 pos = handleNextInternal(); // sets fRuleStatusIndex for the pos it returns, updates fPosition
1131 if (pos == UBRK_DONE) {
1132 // fDone = TRUE; already set by handleNextInternal
1133 break;
1134 }
1135 // Use current result from handleNextInternal(), including fRuleStatusIndex,
1136 // unless overridden by dictionary subdivisions
1137 fPosition = pos;
1138 if (fDictionaryCharCount > 0) {
1139 // The text segment obtained from the rules includes dictionary characters.
1140 // Subdivide it, with subdivided results going into the dictionary cache.
1141 fDictionaryCache->populateDictionary(startPos, pos, fRuleStatusIndex, fRuleStatusIndex);
1142 if (fDictionaryCache->following(startPos, &pos, &ruleStatusIdx)) {
1143 fPosition = pos;
1144 fRuleStatusIndex = ruleStatusIdx;
1145 }
1146 }
1147 }
1148 // end portion from inlining populateFollowing()
1149 int32_t flagCount = fData->fRuleStatusTable[fRuleStatusIndex];
1150 const int32_t* flagPtr = fData->fRuleStatusTable + fRuleStatusIndex + flagCount;
1151 int32_t flagSet = *flagPtr; // if -1 then skip token
1152 if (flagSet != -1) {
1153 outTokenP->location = lastOffset;
1154 outTokenP++->length = fPosition - lastOffset;
1155 if (outTokenFlags) {
1156 // flagSet should be the OR of all flags returned by getRuleStatusVec;
1157 // here we collect from high-order to low-order.
1158 while (--flagCount > 0) {
1159 flagSet |= *--flagPtr;
1160 }
1161 *outTokenFlags++ = (unsigned long)flagSet;
1162 }
1163 }
1164 lastOffset = fPosition;
1165 }
1166 return (outTokenP - outTokenRanges);
1167 }
1168
1169 //-------------------------------------------------------------------------------
1170 //
1171 // getBinaryRules Access to the compiled form of the rules,
1172 // for use by build system tools that save the data
1173 // for standard iterator types.
1174 //
1175 //-------------------------------------------------------------------------------
1176 const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
1177 const uint8_t *retPtr = NULL;
1178 length = 0;
1179
1180 if (fData != NULL) {
1181 retPtr = (const uint8_t *)fData->fHeader;
1182 length = fData->fHeader->fLength;
1183 }
1184 return retPtr;
1185 }
1186
1187
1188 BreakIterator * RuleBasedBreakIterator::createBufferClone(void * /*stackBuffer*/,
1189 int32_t &bufferSize,
1190 UErrorCode &status)
1191 {
1192 if (U_FAILURE(status)){
1193 return NULL;
1194 }
1195
1196 if (bufferSize == 0) {
1197 bufferSize = 1; // preflighting for deprecated functionality
1198 return NULL;
1199 }
1200
1201 BreakIterator *clonedBI = clone();
1202 if (clonedBI == NULL) {
1203 status = U_MEMORY_ALLOCATION_ERROR;
1204 } else {
1205 status = U_SAFECLONE_ALLOCATED_WARNING;
1206 }
1207 return (RuleBasedBreakIterator *)clonedBI;
1208 }
1209
1210 U_NAMESPACE_END
1211
1212
1213 static icu::UStack *gLanguageBreakFactories = nullptr;
1214 static const icu::UnicodeString *gEmptyString = nullptr;
1215 static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER;
1216 static icu::UInitOnce gRBBIInitOnce = U_INITONCE_INITIALIZER;
1217
1218 /**
1219 * Release all static memory held by breakiterator.
1220 */
1221 U_CDECL_BEGIN
1222 static UBool U_CALLCONV rbbi_cleanup(void) {
1223 delete gLanguageBreakFactories;
1224 gLanguageBreakFactories = nullptr;
1225 delete gEmptyString;
1226 gEmptyString = nullptr;
1227 gLanguageBreakFactoriesInitOnce.reset();
1228 gRBBIInitOnce.reset();
1229 return TRUE;
1230 }
1231 U_CDECL_END
1232
1233 U_CDECL_BEGIN
1234 static void U_CALLCONV _deleteFactory(void *obj) {
1235 delete (icu::LanguageBreakFactory *) obj;
1236 }
1237 U_CDECL_END
1238 U_NAMESPACE_BEGIN
1239
1240 static void U_CALLCONV rbbiInit() {
1241 gEmptyString = new UnicodeString();
1242 ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup);
1243 }
1244
1245 static void U_CALLCONV initLanguageFactories() {
1246 UErrorCode status = U_ZERO_ERROR;
1247 U_ASSERT(gLanguageBreakFactories == NULL);
1248 gLanguageBreakFactories = new UStack(_deleteFactory, NULL, status);
1249 if (gLanguageBreakFactories != NULL && U_SUCCESS(status)) {
1250 ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status);
1251 gLanguageBreakFactories->push(builtIn, status);
1252 #ifdef U_LOCAL_SERVICE_HOOK
1253 LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status);
1254 if (extra != NULL) {
1255 gLanguageBreakFactories->push(extra, status);
1256 }
1257 #endif
1258 }
1259 ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup);
1260 }
1261
1262
1263 static const LanguageBreakEngine*
1264 getLanguageBreakEngineFromFactory(UChar32 c)
1265 {
1266 umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories);
1267 if (gLanguageBreakFactories == NULL) {
1268 return NULL;
1269 }
1270
1271 int32_t i = gLanguageBreakFactories->size();
1272 const LanguageBreakEngine *lbe = NULL;
1273 while (--i >= 0) {
1274 LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
1275 lbe = factory->getEngineFor(c);
1276 if (lbe != NULL) {
1277 break;
1278 }
1279 }
1280 return lbe;
1281 }
1282
1283
1284 //-------------------------------------------------------------------------------
1285 //
1286 // getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the
1287 // the character c.
1288 //
1289 //-------------------------------------------------------------------------------
1290 const LanguageBreakEngine *
1291 RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
1292 const LanguageBreakEngine *lbe = NULL;
1293 UErrorCode status = U_ZERO_ERROR;
1294
1295 if (fLanguageBreakEngines == NULL) {
1296 fLanguageBreakEngines = new UStack(status);
1297 if (fLanguageBreakEngines == NULL || U_FAILURE(status)) {
1298 delete fLanguageBreakEngines;
1299 fLanguageBreakEngines = 0;
1300 return NULL;
1301 }
1302 }
1303
1304 int32_t i = fLanguageBreakEngines->size();
1305 while (--i >= 0) {
1306 lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
1307 if (lbe->handles(c)) {
1308 return lbe;
1309 }
1310 }
1311
1312 // No existing dictionary took the character. See if a factory wants to
1313 // give us a new LanguageBreakEngine for this character.
1314 lbe = getLanguageBreakEngineFromFactory(c);
1315
1316 // If we got one, use it and push it on our stack.
1317 if (lbe != NULL) {
1318 fLanguageBreakEngines->push((void *)lbe, status);
1319 // Even if we can't remember it, we can keep looking it up, so
1320 // return it even if the push fails.
1321 return lbe;
1322 }
1323
1324 // No engine is forthcoming for this character. Add it to the
1325 // reject set. Create the reject break engine if needed.
1326 if (fUnhandledBreakEngine == NULL) {
1327 fUnhandledBreakEngine = new UnhandledEngine(status);
1328 if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) {
1329 status = U_MEMORY_ALLOCATION_ERROR;
1330 return nullptr;
1331 }
1332 // Put it last so that scripts for which we have an engine get tried
1333 // first.
1334 fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status);
1335 // If we can't insert it, or creation failed, get rid of it
1336 if (U_FAILURE(status)) {
1337 delete fUnhandledBreakEngine;
1338 fUnhandledBreakEngine = 0;
1339 return NULL;
1340 }
1341 }
1342
1343 // Tell the reject engine about the character; at its discretion, it may
1344 // add more than just the one character.
1345 fUnhandledBreakEngine->handleCharacter(c);
1346
1347 return fUnhandledBreakEngine;
1348 }
1349
1350 void RuleBasedBreakIterator::dumpCache() {
1351 fBreakCache->dumpCache();
1352 }
1353
1354 void RuleBasedBreakIterator::dumpTables() {
1355 fData->printData();
1356 }
1357
1358 /**
1359 * Returns the description used to create this iterator
1360 */
1361
1362 const UnicodeString&
1363 RuleBasedBreakIterator::getRules() const {
1364 if (fData != NULL) {
1365 return fData->getRuleSourceString();
1366 } else {
1367 umtx_initOnce(gRBBIInitOnce, &rbbiInit);
1368 return *gEmptyString;
1369 }
1370 }
1371
1372 U_NAMESPACE_END
1373
1374 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */