1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 // file: rbbi_cache.cpp
6 #include "unicode/utypes.h"
8 #if !UCONFIG_NO_BREAK_ITERATION
10 #include "unicode/ubrk.h"
11 #include "unicode/rbbi.h"
13 #include "rbbi_cache.h"
25 * DictionaryCache implementation
28 RuleBasedBreakIterator::DictionaryCache::DictionaryCache(RuleBasedBreakIterator
*bi
, UErrorCode
&status
) :
29 fBI(bi
), fBreaks(status
), fPositionInCache(-1),
30 fStart(0), fLimit(0), fFirstRuleStatusIndex(0), fOtherRuleStatusIndex(0) {
33 RuleBasedBreakIterator::DictionaryCache::~DictionaryCache() {
36 void RuleBasedBreakIterator::DictionaryCache::reset() {
37 fPositionInCache
= -1;
40 fFirstRuleStatusIndex
= 0;
41 fOtherRuleStatusIndex
= 0;
42 fBreaks
.removeAllElements();
45 UBool
RuleBasedBreakIterator::DictionaryCache::following(int32_t fromPos
, int32_t *result
, int32_t *statusIndex
) {
46 if (fromPos
>= fLimit
|| fromPos
< fStart
) {
47 fPositionInCache
= -1;
51 // Sequential iteration, move from previous boundary to the following
54 if (fPositionInCache
>= 0 && fPositionInCache
< fBreaks
.size() && fBreaks
.elementAti(fPositionInCache
) == fromPos
) {
56 if (fPositionInCache
>= fBreaks
.size()) {
57 fPositionInCache
= -1;
60 r
= fBreaks
.elementAti(fPositionInCache
);
61 U_ASSERT(r
> fromPos
);
63 *statusIndex
= fOtherRuleStatusIndex
;
67 // Random indexing. Linear search for the boundary following the given position.
69 for (fPositionInCache
= 0; fPositionInCache
< fBreaks
.size(); ++fPositionInCache
) {
70 r
= fBreaks
.elementAti(fPositionInCache
);
73 *statusIndex
= fOtherRuleStatusIndex
;
78 fPositionInCache
= -1;
83 UBool
RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos
, int32_t *result
, int32_t *statusIndex
) {
84 if (fromPos
<= fStart
|| fromPos
> fLimit
) {
85 fPositionInCache
= -1;
89 if (fromPos
== fLimit
) {
90 fPositionInCache
= fBreaks
.size() - 1;
91 if (fPositionInCache
>= 0) {
92 U_ASSERT(fBreaks
.elementAti(fPositionInCache
) == fromPos
);
97 if (fPositionInCache
> 0 && fPositionInCache
< fBreaks
.size() && fBreaks
.elementAti(fPositionInCache
) == fromPos
) {
99 r
= fBreaks
.elementAti(fPositionInCache
);
100 U_ASSERT(r
< fromPos
);
102 *statusIndex
= ( r
== fStart
) ? fFirstRuleStatusIndex
: fOtherRuleStatusIndex
;
106 if (fPositionInCache
== 0) {
107 fPositionInCache
= -1;
111 for (fPositionInCache
= fBreaks
.size()-1; fPositionInCache
>= 0; --fPositionInCache
) {
112 r
= fBreaks
.elementAti(fPositionInCache
);
115 *statusIndex
= ( r
== fStart
) ? fFirstRuleStatusIndex
: fOtherRuleStatusIndex
;
120 fPositionInCache
= -1;
124 void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPos
, int32_t endPos
,
125 int32_t firstRuleStatus
, int32_t otherRuleStatus
) {
126 if ((endPos
- startPos
) <= 1) {
131 fFirstRuleStatusIndex
= firstRuleStatus
;
132 fOtherRuleStatusIndex
= otherRuleStatus
;
134 int32_t rangeStart
= startPos
;
135 int32_t rangeEnd
= endPos
;
139 UErrorCode status
= U_ZERO_ERROR
;
140 int32_t foundBreakCount
= 0;
141 UText
*text
= &fBI
->fText
;
143 // Loop through the text, looking for ranges of dictionary characters.
144 // For each span, find the appropriate break engine, and ask it to find
145 // any breaks within the span.
147 utext_setNativeIndex(text
, rangeStart
);
148 UChar32 c
= utext_current32(text
);
149 category
= UTRIE2_GET16(fBI
->fData
->fTrie
, c
);
151 while(U_SUCCESS(status
)) {
152 while((current
= (int32_t)UTEXT_GETNATIVEINDEX(text
)) < rangeEnd
&& (category
& 0x4000) == 0) {
153 utext_next32(text
); // TODO: cleaner loop structure.
154 c
= utext_current32(text
);
155 category
= UTRIE2_GET16(fBI
->fData
->fTrie
, c
);
157 if (current
>= rangeEnd
) {
161 // We now have a dictionary character. Get the appropriate language object
163 const LanguageBreakEngine
*lbe
= fBI
->getLanguageBreakEngine(c
);
165 // Ask the language object if there are any breaks. It will add them to the cache and
166 // leave the text pointer on the other side of its range, ready to search for the next one.
168 foundBreakCount
+= lbe
->findBreaks(text
, rangeStart
, rangeEnd
, fBreaks
);
171 // Reload the loop variables for the next go-round
172 c
= utext_current32(text
);
173 category
= UTRIE2_GET16(fBI
->fData
->fTrie
, c
);
176 // If we found breaks, ensure that the first and last entries are
177 // the original starting and ending position. And initialize the
178 // cache iteration position to the first entry.
180 // printf("foundBreakCount = %d\n", foundBreakCount);
181 if (foundBreakCount
> 0) {
182 U_ASSERT(foundBreakCount
== fBreaks
.size());
183 if (startPos
< fBreaks
.elementAti(0)) {
184 // The dictionary did not place a boundary at the start of the segment of text.
185 // Add one now. This should not commonly happen, but it would be easy for interactions
186 // of the rules for dictionary segments and the break engine implementations to
187 // inadvertently cause it. Cover it here, just in case.
188 fBreaks
.insertElementAt(startPos
, 0, status
);
190 if (endPos
> fBreaks
.peeki()) {
191 fBreaks
.push(endPos
, status
);
193 fPositionInCache
= 0;
194 // Note: Dictionary matching may extend beyond the original limit.
195 fStart
= fBreaks
.elementAti(0);
196 fLimit
= fBreaks
.peeki();
198 // there were no language-based breaks, even though the segment contained
199 // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache
200 // for this range will fail, and the calling code will fall back to the rule based boundaries.
206 * BreakCache implemetation
209 RuleBasedBreakIterator::BreakCache::BreakCache(RuleBasedBreakIterator
*bi
, UErrorCode
&status
) :
210 fBI(bi
), fSideBuffer(status
) {
215 RuleBasedBreakIterator::BreakCache::~BreakCache() {
219 void RuleBasedBreakIterator::BreakCache::reset(int32_t pos
, int32_t ruleStatus
) {
224 fBoundaries
[0] = pos
;
225 fStatuses
[0] = (uint16_t)ruleStatus
;
229 int32_t RuleBasedBreakIterator::BreakCache::current() {
230 fBI
->fPosition
= fTextIdx
;
231 fBI
->fRuleStatusIndex
= fStatuses
[fBufIdx
];
237 void RuleBasedBreakIterator::BreakCache::following(int32_t startPos
, UErrorCode
&status
) {
238 if (U_FAILURE(status
)) {
241 if (startPos
== fTextIdx
|| seek(startPos
) || populateNear(startPos
, status
)) {
242 // startPos is in the cache. Do a next() from that position.
243 // TODO: an awkward set of interactions with bi->fDone
244 // seek() does not clear it; it can't because of interactions with populateNear().
245 // next() does not clear it in the fast-path case, where everything matters. Maybe it should.
246 // So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end.
254 void RuleBasedBreakIterator::BreakCache::preceding(int32_t startPos
, UErrorCode
&status
) {
255 if (U_FAILURE(status
)) {
258 if (startPos
== fTextIdx
|| seek(startPos
) || populateNear(startPos
, status
)) {
259 if (startPos
== fTextIdx
) {
262 // seek() leaves the BreakCache positioned at the preceding boundary
263 // if the requested position is between two bounaries.
264 // current() pushes the BreakCache position out to the BreakIterator itself.
265 U_ASSERT(startPos
> fTextIdx
);
274 * Out-of-line code for BreakCache::next().
275 * Cache does not already contain the boundary
277 void RuleBasedBreakIterator::BreakCache::nextOL() {
278 fBI
->fDone
= !populateFollowing();
279 fBI
->fPosition
= fTextIdx
;
280 fBI
->fRuleStatusIndex
= fStatuses
[fBufIdx
];
285 void RuleBasedBreakIterator::BreakCache::previous(UErrorCode
&status
) {
286 if (U_FAILURE(status
)) {
289 int32_t initialBufIdx
= fBufIdx
;
290 if (fBufIdx
== fStartBufIdx
) {
291 // At start of cache. Prepend to it.
292 populatePreceding(status
);
294 // Cache already holds the next boundary
295 fBufIdx
= modChunkSize(fBufIdx
- 1);
296 fTextIdx
= fBoundaries
[fBufIdx
];
298 fBI
->fDone
= (fBufIdx
== initialBufIdx
);
299 fBI
->fPosition
= fTextIdx
;
300 fBI
->fRuleStatusIndex
= fStatuses
[fBufIdx
];
305 UBool
RuleBasedBreakIterator::BreakCache::seek(int32_t pos
) {
306 if (pos
< fBoundaries
[fStartBufIdx
] || pos
> fBoundaries
[fEndBufIdx
]) {
309 if (pos
== fBoundaries
[fStartBufIdx
]) {
310 // Common case: seek(0), from BreakIterator::first()
311 fBufIdx
= fStartBufIdx
;
312 fTextIdx
= fBoundaries
[fBufIdx
];
315 if (pos
== fBoundaries
[fEndBufIdx
]) {
316 fBufIdx
= fEndBufIdx
;
317 fTextIdx
= fBoundaries
[fBufIdx
];
321 int32_t min
= fStartBufIdx
;
322 int32_t max
= fEndBufIdx
;
324 int32_t probe
= (min
+ max
+ (min
>max
? CACHE_SIZE
: 0)) / 2;
325 probe
= modChunkSize(probe
);
326 if (fBoundaries
[probe
] > pos
) {
329 min
= modChunkSize(probe
+ 1);
332 U_ASSERT(fBoundaries
[max
] > pos
);
333 fBufIdx
= modChunkSize(max
- 1);
334 fTextIdx
= fBoundaries
[fBufIdx
];
335 U_ASSERT(fTextIdx
<= pos
);
340 UBool
RuleBasedBreakIterator::BreakCache::populateNear(int32_t position
, UErrorCode
&status
) {
341 if (U_FAILURE(status
)) {
344 U_ASSERT(position
< fBoundaries
[fStartBufIdx
] || position
> fBoundaries
[fEndBufIdx
]);
346 // Find a boundary somewhere in the vicinity of the requested position.
347 // Depending on the safe rules and the text data, it could be either before, at, or after
348 // the requested position.
351 // If the requested position is not near already cached positions, clear the existing cache,
352 // find a near-by boundary and begin new cache contents there.
354 if ((position
< fBoundaries
[fStartBufIdx
] - 15) || position
> (fBoundaries
[fEndBufIdx
] + 15)) {
355 int32_t aBoundary
= 0;
356 int32_t ruleStatusIndex
= 0;
358 int32_t backupPos
= fBI
->handleSafePrevious(position
);
361 // Advance to the boundary following the backup position.
362 // There is a complication: the safe reverse rules identify pairs of code points
363 // that are safe. If advancing from the safe point moves forwards by less than
364 // two code points, we need to advance one more time to ensure that the boundary
365 // is good, including a correct rules status value.
367 fBI
->fPosition
= backupPos
;
368 aBoundary
= fBI
->handleNext();
369 if (aBoundary
<= backupPos
+ 4) {
370 // +4 is a quick test for possibly having advanced only one codepoint.
371 // Four being the length of the longest potential code point, a supplementary in UTF-8
372 utext_setNativeIndex(&fBI
->fText
, aBoundary
);
373 if (backupPos
== utext_getPreviousNativeIndex(&fBI
->fText
)) {
374 // The initial handleNext() only advanced by a single code point. Go again.
375 aBoundary
= fBI
->handleNext(); // Safe rules identify safe pairs.
378 ruleStatusIndex
= fBI
->fRuleStatusIndex
;
381 reset(aBoundary
, ruleStatusIndex
); // Reset cache to hold aBoundary as a single starting point.
384 // Fill in boundaries between existing cache content and the new requested position.
386 if (fBoundaries
[fEndBufIdx
] < position
) {
387 // The last position in the cache precedes the requested position.
388 // Add following position(s) to the cache.
389 while (fBoundaries
[fEndBufIdx
] < position
) {
390 if (!populateFollowing()) {
395 fBufIdx
= fEndBufIdx
; // Set iterator position to the end of the buffer.
396 fTextIdx
= fBoundaries
[fBufIdx
]; // Required because populateFollowing may add extra boundaries.
397 while (fTextIdx
> position
) { // Move backwards to a position at or preceding the requested pos.
403 if (fBoundaries
[fStartBufIdx
] > position
) {
404 // The first position in the cache is beyond the requested position.
405 // back up more until we get a boundary <= the requested position.
406 while (fBoundaries
[fStartBufIdx
] > position
) {
407 populatePreceding(status
);
409 fBufIdx
= fStartBufIdx
; // Set iterator position to the start of the buffer.
410 fTextIdx
= fBoundaries
[fBufIdx
]; // Required because populatePreceding may add extra boundaries.
411 while (fTextIdx
< position
) { // Move forwards to a position at or following the requested pos.
414 if (fTextIdx
> position
) {
415 // If position is not itself a boundary, the next() loop above will overshoot.
416 // Back up one, leaving cache position at the boundary preceding the requested position.
422 U_ASSERT(fTextIdx
== position
);
428 UBool
RuleBasedBreakIterator::BreakCache::populateFollowing() {
429 int32_t fromPosition
= fBoundaries
[fEndBufIdx
];
430 int32_t fromRuleStatusIdx
= fStatuses
[fEndBufIdx
];
432 int32_t ruleStatusIdx
= 0;
434 if (fBI
->fDictionaryCache
->following(fromPosition
, &pos
, &ruleStatusIdx
)) {
435 addFollowing(pos
, ruleStatusIdx
, UpdateCachePosition
);
439 fBI
->fPosition
= fromPosition
;
440 pos
= fBI
->handleNext();
441 if (pos
== UBRK_DONE
) {
445 ruleStatusIdx
= fBI
->fRuleStatusIndex
;
446 if (fBI
->fDictionaryCharCount
> 0) {
447 // The text segment obtained from the rules includes dictionary characters.
448 // Subdivide it, with subdivided results going into the dictionary cache.
449 fBI
->fDictionaryCache
->populateDictionary(fromPosition
, pos
, fromRuleStatusIdx
, ruleStatusIdx
);
450 if (fBI
->fDictionaryCache
->following(fromPosition
, &pos
, &ruleStatusIdx
)) {
451 addFollowing(pos
, ruleStatusIdx
, UpdateCachePosition
);
453 // TODO: may want to move a sizable chunk of dictionary cache to break cache at this point.
454 // But be careful with interactions with populateNear().
458 // Rule based segment did not include dictionary characters.
459 // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them,
460 // meaning that we didn't take the return, above.
461 // Add its end point to the cache.
462 addFollowing(pos
, ruleStatusIdx
, UpdateCachePosition
);
464 // Add several non-dictionary boundaries at this point, to optimize straight forward iteration.
465 // (subsequent calls to BreakIterator::next() will take the fast path, getting cached results.
467 for (int count
=0; count
<6; ++count
) {
468 pos
= fBI
->handleNext();
469 if (pos
== UBRK_DONE
|| fBI
->fDictionaryCharCount
> 0) {
472 addFollowing(pos
, fBI
->fRuleStatusIndex
, RetainCachePosition
);
479 UBool
RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode
&status
) {
480 if (U_FAILURE(status
)) {
484 int32_t fromPosition
= fBoundaries
[fStartBufIdx
];
485 if (fromPosition
== 0) {
489 int32_t position
= 0;
490 int32_t positionStatusIdx
= 0;
492 if (fBI
->fDictionaryCache
->preceding(fromPosition
, &position
, &positionStatusIdx
)) {
493 addPreceding(position
, positionStatusIdx
, UpdateCachePosition
);
497 int32_t backupPosition
= fromPosition
;
499 // Find a boundary somewhere preceding the first already-cached boundary
501 backupPosition
= backupPosition
- 30;
502 if (backupPosition
<= 0) {
505 backupPosition
= fBI
->handleSafePrevious(backupPosition
);
507 if (backupPosition
== UBRK_DONE
|| backupPosition
== 0) {
509 positionStatusIdx
= 0;
511 // Advance to the boundary following the backup position.
512 // There is a complication: the safe reverse rules identify pairs of code points
513 // that are safe. If advancing from the safe point moves forwards by less than
514 // two code points, we need to advance one more time to ensure that the boundary
515 // is good, including a correct rules status value.
517 fBI
->fPosition
= backupPosition
;
518 position
= fBI
->handleNext();
519 if (position
<= backupPosition
+ 4) {
520 // +4 is a quick test for possibly having advanced only one codepoint.
521 // Four being the length of the longest potential code point, a supplementary in UTF-8
522 utext_setNativeIndex(&fBI
->fText
, position
);
523 if (backupPosition
== utext_getPreviousNativeIndex(&fBI
->fText
)) {
524 // The initial handleNext() only advanced by a single code point. Go again.
525 position
= fBI
->handleNext(); // Safe rules identify safe pairs.
528 positionStatusIdx
= fBI
->fRuleStatusIndex
;
530 } while (position
>= fromPosition
);
532 // Find boundaries between the one we just located and the first already-cached boundary
533 // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer..
535 fSideBuffer
.removeAllElements();
536 fSideBuffer
.addElement(position
, status
);
537 fSideBuffer
.addElement(positionStatusIdx
, status
);
540 int32_t prevPosition
= fBI
->fPosition
= position
;
541 int32_t prevStatusIdx
= positionStatusIdx
;
542 position
= fBI
->handleNext();
543 positionStatusIdx
= fBI
->fRuleStatusIndex
;
544 if (position
== UBRK_DONE
) {
548 UBool segmentHandledByDictionary
= FALSE
;
549 if (fBI
->fDictionaryCharCount
!= 0) {
550 // Segment from the rules includes dictionary characters.
551 // Subdivide it, with subdivided results going into the dictionary cache.
552 int32_t dictSegEndPosition
= position
;
553 fBI
->fDictionaryCache
->populateDictionary(prevPosition
, dictSegEndPosition
, prevStatusIdx
, positionStatusIdx
);
554 while (fBI
->fDictionaryCache
->following(prevPosition
, &position
, &positionStatusIdx
)) {
555 segmentHandledByDictionary
= true;
556 U_ASSERT(position
> prevPosition
);
557 if (position
>= fromPosition
) {
560 U_ASSERT(position
<= dictSegEndPosition
);
561 fSideBuffer
.addElement(position
, status
);
562 fSideBuffer
.addElement(positionStatusIdx
, status
);
563 prevPosition
= position
;
565 U_ASSERT(position
==dictSegEndPosition
|| position
>=fromPosition
);
568 if (!segmentHandledByDictionary
&& position
< fromPosition
) {
569 fSideBuffer
.addElement(position
, status
);
570 fSideBuffer
.addElement(positionStatusIdx
, status
);
572 } while (position
< fromPosition
);
574 // Move boundaries from the side buffer to the main circular buffer.
575 UBool success
= FALSE
;
576 if (!fSideBuffer
.isEmpty()) {
577 positionStatusIdx
= fSideBuffer
.popi();
578 position
= fSideBuffer
.popi();
579 addPreceding(position
, positionStatusIdx
, UpdateCachePosition
);
583 while (!fSideBuffer
.isEmpty()) {
584 positionStatusIdx
= fSideBuffer
.popi();
585 position
= fSideBuffer
.popi();
586 if (!addPreceding(position
, positionStatusIdx
, RetainCachePosition
)) {
587 // No space in circular buffer to hold a new preceding result while
588 // also retaining the current cache (iteration) position.
589 // Bailing out is safe; the cache will refill again if needed.
598 void RuleBasedBreakIterator::BreakCache::addFollowing(int32_t position
, int32_t ruleStatusIdx
, UpdatePositionValues update
) {
599 U_ASSERT(position
> fBoundaries
[fEndBufIdx
]);
600 U_ASSERT(ruleStatusIdx
<= UINT16_MAX
);
601 int32_t nextIdx
= modChunkSize(fEndBufIdx
+ 1);
602 if (nextIdx
== fStartBufIdx
) {
603 fStartBufIdx
= modChunkSize(fStartBufIdx
+ 6); // TODO: experiment. Probably revert to 1.
605 fBoundaries
[nextIdx
] = position
;
606 fStatuses
[nextIdx
] = ruleStatusIdx
;
607 fEndBufIdx
= nextIdx
;
608 if (update
== UpdateCachePosition
) {
609 // Set current position to the newly added boundary.
613 // Retaining the original cache position.
614 // Check if the added boundary wraps around the buffer, and would over-write the original position.
615 // It's the responsibility of callers of this function to not add too many.
616 U_ASSERT(nextIdx
!= fBufIdx
);
620 bool RuleBasedBreakIterator::BreakCache::addPreceding(int32_t position
, int32_t ruleStatusIdx
, UpdatePositionValues update
) {
621 U_ASSERT(position
< fBoundaries
[fStartBufIdx
]);
622 U_ASSERT(ruleStatusIdx
<= UINT16_MAX
);
623 int32_t nextIdx
= modChunkSize(fStartBufIdx
- 1);
624 if (nextIdx
== fEndBufIdx
) {
625 if (fBufIdx
== fEndBufIdx
&& update
== RetainCachePosition
) {
626 // Failure. The insertion of the new boundary would claim the buffer position that is the
627 // current iteration position. And we also want to retain the current iteration position.
628 // (The buffer is already completely full of entries that precede the iteration position.)
631 fEndBufIdx
= modChunkSize(fEndBufIdx
- 1);
633 fBoundaries
[nextIdx
] = position
;
634 fStatuses
[nextIdx
] = ruleStatusIdx
;
635 fStartBufIdx
= nextIdx
;
636 if (update
== UpdateCachePosition
) {
644 void RuleBasedBreakIterator::BreakCache::dumpCache() {
646 RBBIDebugPrintf("fTextIdx:%d fBufIdx:%d\n", fTextIdx
, fBufIdx
);
647 for (int32_t i
=fStartBufIdx
; ; i
=modChunkSize(i
+1)) {
648 RBBIDebugPrintf("%d %d\n", i
, fBoundaries
[i
]);
649 if (i
== fEndBufIdx
) {
658 #endif // #if !UCONFIG_NO_BREAK_ITERATION