]>
Commit | Line | Data |
---|---|---|
1 | // Copyright (C) 2016 and later: Unicode, Inc. and others. | |
2 | // License & terms of use: http://www.unicode.org/copyright.html | |
3 | ||
4 | // file: rbbi_cache.h | |
5 | // | |
6 | #ifndef RBBI_CACHE_H | |
7 | #define RBBI_CACHE_H | |
8 | ||
9 | #include "unicode/utypes.h" | |
10 | ||
11 | #if !UCONFIG_NO_BREAK_ITERATION | |
12 | ||
13 | #include "unicode/rbbi.h" | |
14 | #include "unicode/uobject.h" | |
15 | ||
16 | #include "uvectr32.h" | |
17 | ||
18 | U_NAMESPACE_BEGIN | |
19 | ||
20 | /* DictionaryCache stores the boundaries obtained from a run of dictionary characters. | |
21 | * Dictionary boundaries are moved first to this cache, then from here | |
22 | * to the main BreakCache, where they may inter-leave with non-dictionary | |
23 | * boundaries. The public BreakIterator API always fetches directly | |
24 | * from the main BreakCache, not from here. | |
25 | * | |
26 | * In common situations, the number of boundaries in a single dictionary run | |
27 | * should be quite small, it will be terminated by punctuation, spaces, | |
28 | * or any other non-dictionary characters. The main BreakCache may end | |
29 | * up with boundaries from multiple dictionary based runs. | |
30 | * | |
31 | * The boundaries are stored in a simple ArrayList (vector), with the | |
32 | * assumption that they will be accessed sequentially. | |
33 | */ | |
34 | class RuleBasedBreakIterator::DictionaryCache: public UMemory { | |
35 | public: | |
36 | DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status); | |
37 | ~DictionaryCache(); | |
38 | ||
39 | void reset(); | |
40 | ||
41 | UBool following(int32_t fromPos, int32_t *pos, int32_t *statusIndex); | |
42 | UBool preceding(int32_t fromPos, int32_t *pos, int32_t *statusIndex); | |
43 | ||
44 | /** | |
45 | * Populate the cache with the dictionary based boundaries within a region of text. | |
46 | * @param startPos The start position of a range of text | |
47 | * @param endPos The end position of a range of text | |
48 | * @param firstRuleStatus The rule status index that applies to the break at startPos | |
49 | * @param otherRuleStatus The rule status index that applies to boundaries other than startPos | |
50 | * @internal | |
51 | */ | |
52 | void populateDictionary(int32_t startPos, int32_t endPos, | |
53 | int32_t firstRuleStatus, int32_t otherRuleStatus); | |
54 | ||
55 | ||
56 | ||
57 | RuleBasedBreakIterator *fBI; | |
58 | ||
59 | UVector32 fBreaks; // A vector containing the boundaries. | |
60 | int32_t fPositionInCache; // Index in fBreaks of last boundary returned by following() | |
61 | // or preceding(). Optimizes sequential access. | |
62 | int32_t fStart; // Text position of first boundary in cache. | |
63 | int32_t fLimit; // Last boundary in cache. Which is the limit of the | |
64 | // text segment being handled by the dictionary. | |
65 | int32_t fFirstRuleStatusIndex; // Rule status info for first boundary. | |
66 | int32_t fOtherRuleStatusIndex; // Rule status info for 2nd through last boundaries. | |
67 | }; | |
68 | ||
69 | ||
70 | /* | |
71 | * class BreakCache | |
72 | * | |
73 | * Cache of break boundary positions and rule status values. | |
74 | * Break iterator API functions, next(), previous(), etc., will use cached results | |
75 | * when possible, and otherwise cache new results as they are obtained. | |
76 | * | |
77 | * Uniformly caches both dictionary and rule based (non-dictionary) boundaries. | |
78 | * | |
79 | * The cache is implemented as a single circular buffer. | |
80 | */ | |
81 | ||
82 | /* | |
83 | * size of the circular cache buffer. | |
84 | */ | |
85 | ||
86 | class RuleBasedBreakIterator::BreakCache: public UMemory { | |
87 | public: | |
88 | BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status); | |
89 | virtual ~BreakCache(); | |
90 | void reset(int32_t pos = 0, int32_t ruleStatus = 0); | |
91 | void next() { if (fBufIdx == fEndBufIdx) { | |
92 | nextOL(); | |
93 | } else { | |
94 | fBufIdx = modChunkSize(fBufIdx + 1); | |
95 | fTextIdx = fBI->fPosition = fBoundaries[fBufIdx]; | |
96 | fBI->fRuleStatusIndex = fStatuses[fBufIdx]; | |
97 | } | |
98 | }; | |
99 | ||
100 | ||
101 | void nextOL(); | |
102 | void previous(UErrorCode &status); | |
103 | ||
104 | // Move the iteration state to the position following the startPosition. | |
105 | // Input position must be pinned to the input length. | |
106 | void following(int32_t startPosition, UErrorCode &status); | |
107 | ||
108 | void preceding(int32_t startPosition, UErrorCode &status); | |
109 | ||
110 | /* | |
111 | * Update the state of the public BreakIterator (fBI) to reflect the | |
112 | * current state of the break iterator cache (this). | |
113 | */ | |
114 | int32_t current(); | |
115 | ||
116 | /** | |
117 | * Add boundaries to the cache near the specified position. | |
118 | * The given position need not be a boundary itself. | |
119 | * The input position must be within the range of the text, and | |
120 | * on a code point boundary. | |
121 | * If the requested position is a break boundary, leave the iteration | |
122 | * position on it. | |
123 | * If the requested position is not a boundary, leave the iteration | |
124 | * position on the preceding boundary and include both the | |
125 | * preceding and following boundaries in the cache. | |
126 | * Additional boundaries, either preceding or following, may be added | |
127 | * to the cache as a side effect. | |
128 | * | |
129 | * Return FALSE if the operation failed. | |
130 | */ | |
131 | UBool populateNear(int32_t position, UErrorCode &status); | |
132 | ||
133 | /** | |
134 | * Add boundary(s) to the cache following the current last boundary. | |
135 | * Return FALSE if at the end of the text, and no more boundaries can be added. | |
136 | * Leave iteration position at the first newly added boundary, or unchanged if no boundary was added. | |
137 | */ | |
138 | UBool populateFollowing(); | |
139 | ||
140 | /** | |
141 | * Add one or more boundaries to the cache preceding the first currently cached boundary. | |
142 | * Leave the iteration position on the first added boundary. | |
143 | * Return false if no boundaries could be added (if at the start of the text.) | |
144 | */ | |
145 | UBool populatePreceding(UErrorCode &status); | |
146 | ||
147 | enum UpdatePositionValues { | |
148 | RetainCachePosition = 0, | |
149 | UpdateCachePosition = 1 | |
150 | }; | |
151 | ||
152 | /* | |
153 | * Add the boundary following the current position. | |
154 | * The current position can be left as it was, or changed to the newly added boundary, | |
155 | * as specified by the update parameter. | |
156 | */ | |
157 | void addFollowing(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update); | |
158 | ||
159 | ||
160 | /* | |
161 | * Add the boundary preceding the current position. | |
162 | * The current position can be left as it was, or changed to the newly added boundary, | |
163 | * as specified by the update parameter. | |
164 | */ | |
165 | bool addPreceding(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update); | |
166 | ||
167 | /** | |
168 | * Set the cache position to the specified position, or, if the position | |
169 | * falls between to cached boundaries, to the preceding boundary. | |
170 | * Fails if the requested position is outside of the range of boundaries currently held by the cache. | |
171 | * The startPosition must be on a code point boundary. | |
172 | * | |
173 | * Return TRUE if successful, FALSE if the specified position is after | |
174 | * the last cached boundary or before the first. | |
175 | */ | |
176 | UBool seek(int32_t startPosition); | |
177 | ||
178 | void dumpCache(); | |
179 | ||
180 | private: | |
181 | static inline int32_t modChunkSize(int index) { return index & (CACHE_SIZE - 1); }; | |
182 | ||
183 | static constexpr int32_t CACHE_SIZE = 128; | |
184 | static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two."); | |
185 | ||
186 | RuleBasedBreakIterator *fBI; | |
187 | int32_t fStartBufIdx; | |
188 | int32_t fEndBufIdx; // inclusive | |
189 | ||
190 | int32_t fTextIdx; | |
191 | int32_t fBufIdx; | |
192 | ||
193 | int32_t fBoundaries[CACHE_SIZE]; | |
194 | uint16_t fStatuses[CACHE_SIZE]; | |
195 | ||
196 | UVector32 fSideBuffer; | |
197 | }; | |
198 | ||
199 | U_NAMESPACE_END | |
200 | ||
201 | #endif // #if !UCONFIG_NO_BREAK_ITERATION | |
202 | ||
203 | #endif // RBBI_CACHE_H |