1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
9 #include "unicode/utypes.h"
11 #if !UCONFIG_NO_BREAK_ITERATION
13 #include "unicode/rbbi.h"
14 #include "unicode/uobject.h"
20 /* DictionaryCache stores the boundaries obtained from a run of dictionary characters.
21 * Dictionary boundaries are moved first to this cache, then from here
22 * to the main BreakCache, where they may interleave with non-dictionary
23 * boundaries. The public BreakIterator API always fetches directly
24 * from the main BreakCache, not from here.
26 * In common situations, the number of boundaries in a single dictionary run
27 * should be quite small; the run will be terminated by punctuation, spaces,
28 * or any other non-dictionary characters. The main BreakCache may end
29 * up with boundaries from multiple dictionary based runs.
31 * The boundaries are stored in a simple vector (UVector32), with the
32 * assumption that they will be accessed sequentially.
34 class RuleBasedBreakIterator::DictionaryCache
: public UMemory
{
36 DictionaryCache(RuleBasedBreakIterator
*bi
, UErrorCode
&status
);
41 UBool
following(int32_t fromPos
, int32_t *pos
, int32_t *statusIndex
);
42 UBool
preceding(int32_t fromPos
, int32_t *pos
, int32_t *statusIndex
);
45 * Populate the cache with the dictionary based boundaries within a region of text.
46 * @param startPos The start position of a range of text
47 * @param endPos The end position of a range of text
48 * @param firstRuleStatus The rule status index that applies to the break at startPos
49 * @param otherRuleStatus The rule status index that applies to boundaries other than startPos
52 void populateDictionary(int32_t startPos
, int32_t endPos
,
53 int32_t firstRuleStatus
, int32_t otherRuleStatus
);
57 RuleBasedBreakIterator
*fBI
;
59 UVector32 fBreaks
; // A vector containing the boundaries.
60 int32_t fPositionInCache
; // Index in fBreaks of last boundary returned by following()
61 // or preceding(). Optimizes sequential access.
62 int32_t fStart
; // Text position of first boundary in cache.
63 int32_t fLimit
; // Last boundary in cache. Which is the limit of the
64 // text segment being handled by the dictionary.
65 int32_t fFirstRuleStatusIndex
; // Rule status info for first boundary.
66 int32_t fOtherRuleStatusIndex
; // Rule status info for 2nd through last boundaries.
73 * Cache of break boundary positions and rule status values.
74 * Break iterator API functions, next(), previous(), etc., will use cached results
75 * when possible, and otherwise cache new results as they are obtained.
77 * Uniformly caches both dictionary and rule based (non-dictionary) boundaries.
79 * The cache is implemented as a single circular buffer.
83 * size of the circular cache buffer.
86 class RuleBasedBreakIterator::BreakCache
: public UMemory
{
88 BreakCache(RuleBasedBreakIterator
*bi
, UErrorCode
&status
);
89 virtual ~BreakCache();
90 void reset(int32_t pos
= 0, int32_t ruleStatus
= 0);
91 void next() { if (fBufIdx
== fEndBufIdx
) {
94 fBufIdx
= modChunkSize(fBufIdx
+ 1);
95 fTextIdx
= fBI
->fPosition
= fBoundaries
[fBufIdx
];
96 fBI
->fRuleStatusIndex
= fStatuses
[fBufIdx
];
102 void previous(UErrorCode
&status
);
104 // Move the iteration state to the position following the startPosition.
105 // Input position must be pinned to the input length.
106 void following(int32_t startPosition
, UErrorCode
&status
);
108 void preceding(int32_t startPosition
, UErrorCode
&status
);
111 * Update the state of the public BreakIterator (fBI) to reflect the
112 * current state of the break iterator cache (this).
117 * Add boundaries to the cache near the specified position.
118 * The given position need not be a boundary itself.
119 * The input position must be within the range of the text, and
120 * on a code point boundary.
121 * If the requested position is a break boundary, leave the iteration
123 * If the requested position is not a boundary, leave the iteration
124 * position on the preceding boundary and include both the
125 * preceding and following boundaries in the cache.
126 * Additional boundaries, either preceding or following, may be added
127 * to the cache as a side effect.
129 * Return FALSE if the operation failed.
131 UBool
populateNear(int32_t position
, UErrorCode
&status
);
134 * Add boundary(s) to the cache following the current last boundary.
135 * Return FALSE if at the end of the text, and no more boundaries can be added.
136 * Leave iteration position at the first newly added boundary, or unchanged if no boundary was added.
138 UBool
populateFollowing();
141 * Add one or more boundaries to the cache preceding the first currently cached boundary.
142 * Leave the iteration position on the first added boundary.
143 * Return false if no boundaries could be added (if at the start of the text.)
145 UBool
populatePreceding(UErrorCode
&status
);
147 enum UpdatePositionValues
{
148 RetainCachePosition
= 0,
149 UpdateCachePosition
= 1
153 * Add the boundary following the current position.
154 * The current position can be left as it was, or changed to the newly added boundary,
155 * as specified by the update parameter.
157 void addFollowing(int32_t position
, int32_t ruleStatusIdx
, UpdatePositionValues update
);
161 * Add the boundary preceding the current position.
162 * The current position can be left as it was, or changed to the newly added boundary,
163 * as specified by the update parameter.
165 bool addPreceding(int32_t position
, int32_t ruleStatusIdx
, UpdatePositionValues update
);
168 * Set the cache position to the specified position, or, if the position
169 * falls between two cached boundaries, to the preceding boundary.
170 * Fails if the requested position is outside of the range of boundaries currently held by the cache.
171 * The startPosition must be on a code point boundary.
173 * Return TRUE if successful, FALSE if the specified position is after
174 * the last cached boundary or before the first.
176 UBool
seek(int32_t startPosition
);
181 static inline int32_t modChunkSize(int index
) { return index
& (CACHE_SIZE
- 1); }
183 static constexpr int32_t CACHE_SIZE
= 128;
184 static_assert((CACHE_SIZE
& (CACHE_SIZE
-1)) == 0, "CACHE_SIZE must be power of two.");
186 RuleBasedBreakIterator
*fBI
;
187 int32_t fStartBufIdx
;
188 int32_t fEndBufIdx
; // inclusive
193 int32_t fBoundaries
[CACHE_SIZE
];
194 uint16_t fStatuses
[CACHE_SIZE
];
196 UVector32 fSideBuffer
;
201 #endif // #if !UCONFIG_NO_BREAK_ITERATION
203 #endif // RBBI_CACHE_H