1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 // file: rbbi_cache.cpp
6 #include "unicode/utypes.h"
8 #if !UCONFIG_NO_BREAK_ITERATION
10 #include "unicode/ubrk.h"
11 #include "unicode/rbbi.h"
13 #include "rbbi_cache.h"
25 * DictionaryCache implementation
28 RuleBasedBreakIterator::DictionaryCache::DictionaryCache(RuleBasedBreakIterator
*bi
, UErrorCode
&status
) :
29 fBI(bi
), fBreaks(status
), fPositionInCache(-1),
30 fStart(0), fLimit(0), fFirstRuleStatusIndex(0), fOtherRuleStatusIndex(0) {
33 RuleBasedBreakIterator::DictionaryCache::~DictionaryCache() {
36 void RuleBasedBreakIterator::DictionaryCache::reset() {
37 fPositionInCache
= -1;
40 fFirstRuleStatusIndex
= 0;
41 fOtherRuleStatusIndex
= 0;
42 fBreaks
.removeAllElements();
45 UBool
RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos
, int32_t *result
, int32_t *statusIndex
) {
46 if (fromPos
>= fLimit
|| fromPos
< fStart
) {
47 fPositionInCache
= -1;
51 // Sequential iteration, move from previous boundary to the following
54 if (fPositionInCache
>= 0 && fPositionInCache
< fBreaks
.size() && fBreaks
.elementAti(fPositionInCache
) == fromPos
) {
56 if (fPositionInCache
>= fBreaks
.size()) {
57 fPositionInCache
= -1;
60 r
= fBreaks
.elementAti(fPositionInCache
);
61 U_ASSERT(r
> fromPos
);
63 *statusIndex
= fOtherRuleStatusIndex
;
67 // Random indexing. Linear search for the boundary following the given position.
69 for (fPositionInCache
= 0; fPositionInCache
< fBreaks
.size(); ++fPositionInCache
) {
70 r
= fBreaks
.elementAti(fPositionInCache
);
73 *statusIndex
= fOtherRuleStatusIndex
;
81 UBool
RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos
, int32_t *result
, int32_t *statusIndex
) {
82 if (fromPos
<= fStart
|| fromPos
> fLimit
) {
83 fPositionInCache
= -1;
87 if (fromPos
== fLimit
) {
88 fPositionInCache
= fBreaks
.size() - 1;
89 if (fPositionInCache
>= 0) {
90 U_ASSERT(fBreaks
.elementAti(fPositionInCache
) == fromPos
);
95 if (fPositionInCache
> 0 && fPositionInCache
< fBreaks
.size() && fBreaks
.elementAti(fPositionInCache
) == fromPos
) {
97 r
= fBreaks
.elementAti(fPositionInCache
);
98 U_ASSERT(r
< fromPos
);
100 *statusIndex
= ( r
== fStart
) ? fFirstRuleStatusIndex
: fOtherRuleStatusIndex
;
104 if (fPositionInCache
== 0) {
105 fPositionInCache
= -1;
109 for (fPositionInCache
= fBreaks
.size()-1; fPositionInCache
>= 0; --fPositionInCache
) {
110 r
= fBreaks
.elementAti(fPositionInCache
);
113 *statusIndex
= ( r
== fStart
) ? fFirstRuleStatusIndex
: fOtherRuleStatusIndex
;
120 void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPos
, int32_t endPos
,
121 int32_t firstRuleStatus
, int32_t otherRuleStatus
) {
122 if ((endPos
- startPos
) <= 1) {
127 fFirstRuleStatusIndex
= firstRuleStatus
;
128 fOtherRuleStatusIndex
= otherRuleStatus
;
130 int32_t rangeStart
= startPos
;
131 int32_t rangeEnd
= endPos
;
135 UErrorCode status
= U_ZERO_ERROR
;
136 int32_t foundBreakCount
= 0;
137 UText
*text
= &fBI
->fText
;
139 // Loop through the text, looking for ranges of dictionary characters.
140 // For each span, find the appropriate break engine, and ask it to find
141 // any breaks within the span.
143 utext_setNativeIndex(text
, rangeStart
);
144 UChar32 c
= utext_current32(text
);
145 category
= UTRIE2_GET16(fBI
->fData
->fTrie
, c
);
147 while(U_SUCCESS(status
)) {
148 while((current
= (int32_t)UTEXT_GETNATIVEINDEX(text
)) < rangeEnd
&& (category
& 0x4000) == 0) {
149 utext_next32(text
); // TODO: cleaner loop structure.
150 c
= utext_current32(text
);
151 category
= UTRIE2_GET16(fBI
->fData
->fTrie
, c
);
153 if (current
>= rangeEnd
) {
157 // We now have a dictionary character. Get the appropriate language object
159 const LanguageBreakEngine
*lbe
= fBI
->getLanguageBreakEngine(c
);
161 // Ask the language object if there are any breaks. It will add them to the cache and
162 // leave the text pointer on the other side of its range, ready to search for the next one.
164 foundBreakCount
+= lbe
->findBreaks(text
, rangeStart
, rangeEnd
, fBreaks
);
167 // Reload the loop variables for the next go-round
168 c
= utext_current32(text
);
169 category
= UTRIE2_GET16(fBI
->fData
->fTrie
, c
);
172 // If we found breaks, ensure that the first and last entries are
173 // the original starting and ending position. And initialize the
174 // cache iteration position to the first entry.
176 // printf("foundBreakCount = %d\n", foundBreakCount);
177 if (foundBreakCount
> 0) {
178 U_ASSERT(foundBreakCount
== fBreaks
.size());
179 if (startPos
< fBreaks
.elementAti(0)) {
180 // The dictionary did not place a boundary at the start of the segment of text.
181 // Add one now. This should not commonly happen, but it would be easy for interactions
182 // of the rules for dictionary segments and the break engine implementations to
183 // inadvertently cause it. Cover it here, just in case.
184 fBreaks
.insertElementAt(startPos
, 0, status
);
186 if (endPos
> fBreaks
.peeki()) {
187 fBreaks
.push(endPos
, status
);
189 fPositionInCache
= 0;
190 // Note: Dictionary matching may extend beyond the original limit.
191 fStart
= fBreaks
.elementAti(0);
192 fLimit
= fBreaks
.peeki();
194 // there were no language-based breaks, even though the segment contained
195 // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache
196 // for this range will fail, and the calling code will fall back to the rule based boundaries.
202 * BreakCache implemetation
205 RuleBasedBreakIterator::BreakCache::BreakCache(RuleBasedBreakIterator
*bi
, UErrorCode
&status
) :
206 fBI(bi
), fSideBuffer(status
) {
211 RuleBasedBreakIterator::BreakCache::~BreakCache() {
215 void RuleBasedBreakIterator::BreakCache::reset(int32_t pos
, int32_t ruleStatus
) {
220 fBoundaries
[0] = pos
;
221 fStatuses
[0] = (uint16_t)ruleStatus
;
225 int32_t RuleBasedBreakIterator::BreakCache::current() {
226 fBI
->fPosition
= fTextIdx
;
227 fBI
->fRuleStatusIndex
= fStatuses
[fBufIdx
];
233 void RuleBasedBreakIterator::BreakCache::following(int32_t startPos
, UErrorCode
&status
) {
234 if (U_FAILURE(status
)) {
237 if (startPos
== fTextIdx
|| seek(startPos
) || populateNear(startPos
, status
)) {
238 // startPos is in the cache. Do a next() from that position.
239 // TODO: an awkward set of interactions with bi->fDone
240 // seek() does not clear it; it can't because of interactions with populateNear().
241 // next() does not clear it in the fast-path case, where everything matters. Maybe it should.
242 // So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end.
250 void RuleBasedBreakIterator::BreakCache::preceding(int32_t startPos
, UErrorCode
&status
) {
251 if (U_FAILURE(status
)) {
254 if (startPos
== fTextIdx
|| seek(startPos
) || populateNear(startPos
, status
)) {
255 if (startPos
== fTextIdx
) {
258 // seek() leaves the BreakCache positioned at the preceding boundary
259 // if the requested position is between two bounaries.
260 // current() pushes the BreakCache position out to the BreakIterator itself.
261 U_ASSERT(startPos
> fTextIdx
);
270 * Out-of-line code for BreakCache::next().
271 * Cache does not already contain the boundary
273 void RuleBasedBreakIterator::BreakCache::nextOL() {
274 fBI
->fDone
= !populateFollowing();
275 fBI
->fPosition
= fTextIdx
;
276 fBI
->fRuleStatusIndex
= fStatuses
[fBufIdx
];
281 void RuleBasedBreakIterator::BreakCache::previous(UErrorCode
&status
) {
282 if (U_FAILURE(status
)) {
285 int32_t initialBufIdx
= fBufIdx
;
286 if (fBufIdx
== fStartBufIdx
) {
287 // At start of cache. Prepend to it.
288 populatePreceding(status
);
290 // Cache already holds the next boundary
291 fBufIdx
= modChunkSize(fBufIdx
- 1);
292 fTextIdx
= fBoundaries
[fBufIdx
];
294 fBI
->fDone
= (fBufIdx
== initialBufIdx
);
295 fBI
->fPosition
= fTextIdx
;
296 fBI
->fRuleStatusIndex
= fStatuses
[fBufIdx
];
301 UBool
RuleBasedBreakIterator::BreakCache::seek(int32_t pos
) {
302 if (pos
< fBoundaries
[fStartBufIdx
] || pos
> fBoundaries
[fEndBufIdx
]) {
305 if (pos
== fBoundaries
[fStartBufIdx
]) {
306 // Common case: seek(0), from BreakIterator::first()
307 fBufIdx
= fStartBufIdx
;
308 fTextIdx
= fBoundaries
[fBufIdx
];
311 if (pos
== fBoundaries
[fEndBufIdx
]) {
312 fBufIdx
= fEndBufIdx
;
313 fTextIdx
= fBoundaries
[fBufIdx
];
317 int32_t min
= fStartBufIdx
;
318 int32_t max
= fEndBufIdx
;
320 int32_t probe
= (min
+ max
+ (min
>max
? CACHE_SIZE
: 0)) / 2;
321 probe
= modChunkSize(probe
);
322 if (fBoundaries
[probe
] > pos
) {
325 min
= modChunkSize(probe
+ 1);
328 U_ASSERT(fBoundaries
[max
] > pos
);
329 fBufIdx
= modChunkSize(max
- 1);
330 fTextIdx
= fBoundaries
[fBufIdx
];
331 U_ASSERT(fTextIdx
<= pos
);
336 UBool
RuleBasedBreakIterator::BreakCache::populateNear(int32_t position
, UErrorCode
&status
) {
337 if (U_FAILURE(status
)) {
340 U_ASSERT(position
< fBoundaries
[fStartBufIdx
] || position
> fBoundaries
[fEndBufIdx
]);
342 // Find a boundary somewhere in the vicinity of the requested position.
343 // Depending on the safe rules and the text data, it could be either before, at, or after
344 // the requested position.
347 // If the requested position is not near already cached positions, clear the existing cache,
348 // find a near-by boundary and begin new cache contents there.
350 if ((position
< fBoundaries
[fStartBufIdx
] - 15) || position
> (fBoundaries
[fEndBufIdx
] + 15)) {
351 int32_t aBoundary
= 0;
352 int32_t ruleStatusIndex
= 0;
354 int32_t backupPos
= fBI
->handleSafePrevious(position
);
357 // Advance to the boundary following the backup position.
358 // There is a complication: the safe reverse rules identify pairs of code points
359 // that are safe. If advancing from the safe point moves forwards by less than
360 // two code points, we need to advance one more time to ensure that the boundary
361 // is good, including a correct rules status value.
363 fBI
->fPosition
= backupPos
;
364 aBoundary
= fBI
->handleNext();
365 if (aBoundary
<= backupPos
+ 4) {
366 // +4 is a quick test for possibly having advanced only one codepoint.
367 // Four being the length of the longest potential code point, a supplementary in UTF-8
368 utext_setNativeIndex(&fBI
->fText
, aBoundary
);
369 if (backupPos
== utext_getPreviousNativeIndex(&fBI
->fText
)) {
370 // The initial handleNext() only advanced by a single code point. Go again.
371 aBoundary
= fBI
->handleNext(); // Safe rules identify safe pairs.
374 ruleStatusIndex
= fBI
->fRuleStatusIndex
;
377 reset(aBoundary
, ruleStatusIndex
); // Reset cache to hold aBoundary as a single starting point.
380 // Fill in boundaries between existing cache content and the new requested position.
382 if (fBoundaries
[fEndBufIdx
] < position
) {
383 // The last position in the cache precedes the requested position.
384 // Add following position(s) to the cache.
385 while (fBoundaries
[fEndBufIdx
] < position
) {
386 if (!populateFollowing()) {
390 fBufIdx
= fEndBufIdx
; // Set iterator position to the end of the buffer.
391 fTextIdx
= fBoundaries
[fBufIdx
]; // Required because populateFollowing may add extra boundaries.
392 while (fTextIdx
> position
) { // Move backwards to a position at or preceding the requested pos.
398 if (fBoundaries
[fStartBufIdx
] > position
) {
399 // The first position in the cache is beyond the requested position.
400 // back up more until we get a boundary <= the requested position.
401 while (fBoundaries
[fStartBufIdx
] > position
) {
402 populatePreceding(status
);
404 fBufIdx
= fStartBufIdx
; // Set iterator position to the start of the buffer.
405 fTextIdx
= fBoundaries
[fBufIdx
]; // Required because populatePreceding may add extra boundaries.
406 while (fTextIdx
< position
) { // Move forwards to a position at or following the requested pos.
409 if (fTextIdx
> position
) {
410 // If position is not itself a boundary, the next() loop above will overshoot.
411 // Back up one, leaving cache position at the boundary preceding the requested position.
417 U_ASSERT(fTextIdx
== position
);
423 UBool
RuleBasedBreakIterator::BreakCache::populateFollowing() {
424 int32_t fromPosition
= fBoundaries
[fEndBufIdx
];
425 int32_t fromRuleStatusIdx
= fStatuses
[fEndBufIdx
];
427 int32_t ruleStatusIdx
= 0;
429 if (fBI
->fDictionaryCache
->following(fromPosition
, &pos
, &ruleStatusIdx
)) {
430 addFollowing(pos
, ruleStatusIdx
, UpdateCachePosition
);
434 fBI
->fPosition
= fromPosition
;
435 pos
= fBI
->handleNext();
436 if (pos
== UBRK_DONE
) {
440 ruleStatusIdx
= fBI
->fRuleStatusIndex
;
441 if (fBI
->fDictionaryCharCount
> 0) {
442 // The text segment obtained from the rules includes dictionary characters.
443 // Subdivide it, with subdivided results going into the dictionary cache.
444 fBI
->fDictionaryCache
->populateDictionary(fromPosition
, pos
, fromRuleStatusIdx
, ruleStatusIdx
);
445 if (fBI
->fDictionaryCache
->following(fromPosition
, &pos
, &ruleStatusIdx
)) {
446 addFollowing(pos
, ruleStatusIdx
, UpdateCachePosition
);
448 // TODO: may want to move a sizable chunk of dictionary cache to break cache at this point.
449 // But be careful with interactions with populateNear().
453 // Rule based segment did not include dictionary characters.
454 // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them,
455 // meaning that we didn't take the return, above.
456 // Add its end point to the cache.
457 addFollowing(pos
, ruleStatusIdx
, UpdateCachePosition
);
459 // Add several non-dictionary boundaries at this point, to optimize straight forward iteration.
460 // (subsequent calls to BreakIterator::next() will take the fast path, getting cached results.
462 for (int count
=0; count
<6; ++count
) {
463 pos
= fBI
->handleNext();
464 if (pos
== UBRK_DONE
|| fBI
->fDictionaryCharCount
> 0) {
467 addFollowing(pos
, fBI
->fRuleStatusIndex
, RetainCachePosition
);
474 UBool
RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode
&status
) {
475 if (U_FAILURE(status
)) {
479 int32_t fromPosition
= fBoundaries
[fStartBufIdx
];
480 if (fromPosition
== 0) {
484 int32_t position
= 0;
485 int32_t positionStatusIdx
= 0;
487 if (fBI
->fDictionaryCache
->preceding(fromPosition
, &position
, &positionStatusIdx
)) {
488 addPreceding(position
, positionStatusIdx
, UpdateCachePosition
);
492 int32_t backupPosition
= fromPosition
;
494 // Find a boundary somewhere preceding the first already-cached boundary
496 backupPosition
= backupPosition
- 30;
497 if (backupPosition
<= 0) {
500 backupPosition
= fBI
->handleSafePrevious(backupPosition
);
502 if (backupPosition
== UBRK_DONE
|| backupPosition
== 0) {
504 positionStatusIdx
= 0;
506 // Advance to the boundary following the backup position.
507 // There is a complication: the safe reverse rules identify pairs of code points
508 // that are safe. If advancing from the safe point moves forwards by less than
509 // two code points, we need to advance one more time to ensure that the boundary
510 // is good, including a correct rules status value.
512 fBI
->fPosition
= backupPosition
;
513 position
= fBI
->handleNext();
514 if (position
<= backupPosition
+ 4) {
515 // +4 is a quick test for possibly having advanced only one codepoint.
516 // Four being the length of the longest potential code point, a supplementary in UTF-8
517 utext_setNativeIndex(&fBI
->fText
, position
);
518 if (backupPosition
== utext_getPreviousNativeIndex(&fBI
->fText
)) {
519 // The initial handleNext() only advanced by a single code point. Go again.
520 position
= fBI
->handleNext(); // Safe rules identify safe pairs.
523 positionStatusIdx
= fBI
->fRuleStatusIndex
;
525 } while (position
>= fromPosition
);
527 // Find boundaries between the one we just located and the first already-cached boundary
528 // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer..
530 fSideBuffer
.removeAllElements();
531 fSideBuffer
.addElement(position
, status
);
532 fSideBuffer
.addElement(positionStatusIdx
, status
);
535 int32_t prevPosition
= fBI
->fPosition
= position
;
536 int32_t prevStatusIdx
= positionStatusIdx
;
537 position
= fBI
->handleNext();
538 positionStatusIdx
= fBI
->fRuleStatusIndex
;
539 if (position
== UBRK_DONE
) {
543 UBool segmentHandledByDictionary
= FALSE
;
544 if (fBI
->fDictionaryCharCount
!= 0) {
545 // Segment from the rules includes dictionary characters.
546 // Subdivide it, with subdivided results going into the dictionary cache.
547 int32_t dictSegEndPosition
= position
;
548 fBI
->fDictionaryCache
->populateDictionary(prevPosition
, dictSegEndPosition
, prevStatusIdx
, positionStatusIdx
);
549 while (fBI
->fDictionaryCache
->following(prevPosition
, &position
, &positionStatusIdx
)) {
550 segmentHandledByDictionary
= true;
551 U_ASSERT(position
> prevPosition
);
552 if (position
>= fromPosition
) {
555 U_ASSERT(position
<= dictSegEndPosition
);
556 fSideBuffer
.addElement(position
, status
);
557 fSideBuffer
.addElement(positionStatusIdx
, status
);
558 prevPosition
= position
;
560 U_ASSERT(position
==dictSegEndPosition
|| position
>=fromPosition
);
563 if (!segmentHandledByDictionary
&& position
< fromPosition
) {
564 fSideBuffer
.addElement(position
, status
);
565 fSideBuffer
.addElement(positionStatusIdx
, status
);
567 } while (position
< fromPosition
);
569 // Move boundaries from the side buffer to the main circular buffer.
570 UBool success
= FALSE
;
571 if (!fSideBuffer
.isEmpty()) {
572 positionStatusIdx
= fSideBuffer
.popi();
573 position
= fSideBuffer
.popi();
574 addPreceding(position
, positionStatusIdx
, UpdateCachePosition
);
578 while (!fSideBuffer
.isEmpty()) {
579 positionStatusIdx
= fSideBuffer
.popi();
580 position
= fSideBuffer
.popi();
581 if (!addPreceding(position
, positionStatusIdx
, RetainCachePosition
)) {
582 // No space in circular buffer to hold a new preceding result while
583 // also retaining the current cache (iteration) position.
584 // Bailing out is safe; the cache will refill again if needed.
593 void RuleBasedBreakIterator::BreakCache::addFollowing(int32_t position
, int32_t ruleStatusIdx
, UpdatePositionValues update
) {
594 U_ASSERT(position
> fBoundaries
[fEndBufIdx
]);
595 U_ASSERT(ruleStatusIdx
<= UINT16_MAX
);
596 int32_t nextIdx
= modChunkSize(fEndBufIdx
+ 1);
597 if (nextIdx
== fStartBufIdx
) {
598 fStartBufIdx
= modChunkSize(fStartBufIdx
+ 6); // TODO: experiment. Probably revert to 1.
600 fBoundaries
[nextIdx
] = position
;
601 fStatuses
[nextIdx
] = static_cast<uint16_t>(ruleStatusIdx
);
602 fEndBufIdx
= nextIdx
;
603 if (update
== UpdateCachePosition
) {
604 // Set current position to the newly added boundary.
608 // Retaining the original cache position.
609 // Check if the added boundary wraps around the buffer, and would over-write the original position.
610 // It's the responsibility of callers of this function to not add too many.
611 U_ASSERT(nextIdx
!= fBufIdx
);
615 bool RuleBasedBreakIterator::BreakCache::addPreceding(int32_t position
, int32_t ruleStatusIdx
, UpdatePositionValues update
) {
616 U_ASSERT(position
< fBoundaries
[fStartBufIdx
]);
617 U_ASSERT(ruleStatusIdx
<= UINT16_MAX
);
618 int32_t nextIdx
= modChunkSize(fStartBufIdx
- 1);
619 if (nextIdx
== fEndBufIdx
) {
620 if (fBufIdx
== fEndBufIdx
&& update
== RetainCachePosition
) {
621 // Failure. The insertion of the new boundary would claim the buffer position that is the
622 // current iteration position. And we also want to retain the current iteration position.
623 // (The buffer is already completely full of entries that precede the iteration position.)
626 fEndBufIdx
= modChunkSize(fEndBufIdx
- 1);
628 fBoundaries
[nextIdx
] = position
;
629 fStatuses
[nextIdx
] = static_cast<uint16_t>(ruleStatusIdx
);
630 fStartBufIdx
= nextIdx
;
631 if (update
== UpdateCachePosition
) {
639 void RuleBasedBreakIterator::BreakCache::dumpCache() {
641 RBBIDebugPrintf("fTextIdx:%d fBufIdx:%d\n", fTextIdx
, fBufIdx
);
642 for (int32_t i
=fStartBufIdx
; ; i
=modChunkSize(i
+1)) {
643 RBBIDebugPrintf("%d %d\n", i
, fBoundaries
[i
]);
644 if (i
== fEndBufIdx
) {
653 #endif // #if !UCONFIG_NO_BREAK_ITERATION