]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/rbtok.cpp
ICU-59173.0.1.tar.gz
[apple/icu.git] / icuSources / common / rbtok.cpp
1 /*
2 ***************************************************************************
3 * Copyright (C) 2006-2008,2017 Apple Inc. All Rights Reserved. *
4 ***************************************************************************
5 */
6
7 #include "unicode/utypes.h"
8
9 #if !UCONFIG_NO_BREAK_ITERATION
10
11 #include "rbtok.h"
12 #include "unicode/ustring.h"
13 #include "unicode/utext.h"
14 #include "rbbidata.h"
15 #include "rbbirb.h"
16 #include "uassert.h"
17
18 #ifdef RBBI_DEBUG
19 // The following is now static in rbbi.cpp and gets set dynamically.
20 // For now duplicate here to build, and force to TRUE if desired.
21 static UBool fTrace = FALSE;
22 #endif
23
24 U_NAMESPACE_BEGIN
25
26
// State-transition value that means "stop": the state machine has finished.
static const int16_t STOP_STATE = 0;
// State number in which the state machine begins each match.
static const int16_t START_STATE = 1;
29
30 int32_t RuleBasedTokenizer::tokenize(int32_t maxTokens, RuleBasedTokenRange *outTokenRanges, unsigned long *outTokenFlags)
31 {
32 RuleBasedTokenRange *outTokenLimit = outTokenRanges + maxTokens;
33 RuleBasedTokenRange *outTokenP = outTokenRanges;
34 int32_t state;
35 uint16_t category = 0;
36
37 const RBBIStateTableRow *row;
38 const RBBIStateTableRow *const startRow = fStartRow;
39
40 int32_t lastAcceptingState = 0;
41 UChar32 c = 0;
42 signed long prev;
43 signed long result;
44 const char *const tableData = fData->fForwardTable->fTableData;
45 const uint32_t tableRowLen = fData->fForwardTable->fRowLen;
46 UText *text = fText;
47
48 #ifdef RBBI_DEBUG
49 if (fTrace) {
50 RBBIDebugPuts("Handle Next pos char state category");
51 }
52 #endif
53
54 fLastStatusIndexValid = FALSE;
55
56 // if we're already at the end of the text, return DONE.
57 prev = (signed long)UTEXT_GETNATIVEINDEX(text);
58
59 // loop until we reach the end of the text or transition to state 0
60 //
61 const UTrie *trie = &fData->fTrie;
62 while (outTokenP < outTokenLimit) {
63 // LookAheadResults lookAheadMatches; // added in RBBI, #12081/r38387
64 c = UTEXT_NEXT32(text);
65 if (c == U_SENTINEL)
66 {
67 goto exitTokenizer;
68 }
69 // Set the initial state for the state machine
70 state = START_STATE;
71 row = startRow;
72
73 // if we have cached break positions and we're still in the range
74 // covered by them, just move one step forward in the cache
75 if (fCachedBreakPositions != NULL) {
76 if (fPositionInCache < fNumCachedBreakPositions - 1) {
77 ++fPositionInCache;
78 result = fCachedBreakPositions[fPositionInCache];
79 goto emitToken;
80 }
81 else {
82 reset();
83 }
84 }
85
86 while (c != U_SENTINEL) {
87 //
88 // Get the char category. An incoming category of 1 or 2 means that
89 // we are preset for doing the beginning or end of input, and
90 // that we shouldn't get a category from an actual text input character.
91 //
92 // look up the current character's character category, which tells us
93 // which column in the state table to look at.
94 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
95 // not the size of the character going in, which is a UChar32.
96 //
97 if (c < 0x100)
98 category = fLatin1Cat[c];
99 else
100 UTRIE_GET16(trie, c, category);
101
102 // Check the dictionary bit in the character's category.
103 // Counter is only used by dictionary based iterators (subclasses).
104 // Chars that need to be handled by a dictionary have a flag bit set
105 // in their category values.
106 //
107 if ((category & 0x4000) != 0) {
108 fDictionaryCharCount++;
109 // And off the dictionary flag bit.
110 category &= ~0x4000;
111 }
112
113 #ifdef RBBI_DEBUG
114 if (fTrace) {
115 RBBIDebugPrintf(" %4lld ", utext_getNativeIndex(fText));
116 if (0x20<=c && c<0x7f) {
117 RBBIDebugPrintf("\"%c\" ", c);
118 } else {
119 RBBIDebugPrintf("%5x ", c);
120 }
121 RBBIDebugPrintf("%3d %3d\n", state, category);
122 }
123 #endif
124
125 // State Transition - move machine to its next state
126 //
127
128 // Note: fNextState is defined as uint16_t[2], but we are casting
129 // a generated RBBI table to RBBIStateTableRow and some tables
130 // actually have more than 2 categories.
131 U_ASSERT(category<fData->fHeader->fCatCount);
132 state = row->fNextState[category];
133 row = (const RBBIStateTableRow *) (tableData + tableRowLen * state);
134
135 if (row->fAccepting == -1) {
136 // Match found, common case.
137 result = (signed long)UTEXT_GETNATIVEINDEX(text);
138 //fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values.
139 //lastStatusRow = row;
140 lastAcceptingState = state;
141 }
142
143 // rbbi has added code here to check lookAheadMatches and
144 // set lookAheadMatches, per open-source ICU #12081/r38387
145
146 if (state == STOP_STATE) {
147 // This is the normal exit from the lookup state machine.
148 // We have advanced through the string until it is certain that no
149 // longer match is possible, no matter what characters follow.
150 break;
151 }
152
153 // Advance to the next character.
154 // If this is a beginning-of-input loop iteration, don't advance
155 // the input position. The next iteration will be processing the
156 // first real input character.
157 c = UTEXT_NEXT32(text);
158 }
159
160 if (fDictionaryCharCount > 0) {
161 result = (signed long) checkDictionary(prev, (int32_t) result, FALSE);
162 }
163
164 emitToken:
165 // The state machine is done. Check whether it found a match...
166
167 // If the iterator failed to advance in the match engine, force it ahead by one.
168 // (This really indicates a defect in the break rules. They should always match
169 // at least one character.). Added in open-source ICU r13469
170 UBool setFlagsZero = FALSE;
171 if (result == prev) {
172 UTEXT_SETNATIVEINDEX(text, prev);
173 UTEXT_NEXT32(text);
174 result = (int32_t)UTEXT_GETNATIVEINDEX(text);
175 setFlagsZero = TRUE;
176 }
177
178 // Leave the iterator at our result position.
179 UTEXT_SETNATIVEINDEX(text, result);
180
181 RuleBasedTokenRange range = {(signed long)prev, (signed long) (result-prev)};
182 int32_t flags = (!setFlagsZero)? fStateFlags[lastAcceptingState]: 0;
183
184 if (flags == -1) {
185 goto skipToken;
186 }
187
188 #ifdef RBBI_DEBUG
189 if (fTrace) {
190 RBBIDebugPrintf("Emit location %3ld length %2ld flags %08X\n", range.location, range.length, flags);
191 }
192 #endif
193 *outTokenP++ = range;
194 if (outTokenFlags)
195 {
196 *outTokenFlags++ = (unsigned long) flags;
197 }
198
199 if (flags & 0x40000000) {
200 goto exitTokenizer;
201 }
202
203 skipToken:
204 prev = result;
205 }
206
207 exitTokenizer:
208 return (outTokenP - outTokenRanges);
209 }
210
211 void
212 RuleBasedTokenizer::init()
213 {
214 const RBBIStateTable *statetable = fData->fForwardTable;
215 setBreakType(UBRK_WORD);
216 fStartRow = (const RBBIStateTableRow *)
217 (statetable->fTableData + (statetable->fRowLen * START_STATE));
218 UChar i;
219 const UTrie *trie = &fData->fTrie;
220 //int16_t category;
221 fLatin1Cat = new int16_t[256];
222 for (i = 0; i < 256; ++i)
223 {
224 //UTRIE_GET16(trie, i, category);
225 //fLatin1Cat[i] = category;
226 fLatin1Cat[i] = _UTRIE_GET_RAW(trie, index, 0, i);
227 }
228 fStateFlags = new int32_t[statetable->fNumStates];
229 for (i = 0; i < statetable->fNumStates; ++i)
230 {
231 const RBBIStateTableRow *row = (const RBBIStateTableRow *)
232 (statetable->fTableData + (statetable->fRowLen * i));
233 int32_t flags = 0;
234 if (row->fAccepting == -1 && row->fTagIdx != 0)
235 {
236 const int32_t *vals = (fData->fRuleStatusTable) + (row->fTagIdx);
237 const int32_t *valLimit = vals + 1;
238 valLimit += *vals++;
239 while (vals < valLimit)
240 {
241 int32_t val = *vals++;
242 if (val == 0)
243 {
244 break;
245 }
246 else if (val > 0)
247 {
248 flags |= val;
249 }
250 else
251 {
252 flags = val;
253 break;
254 }
255 }
256 }
257 fStateFlags[i] = flags;
258 }
259 }
260
261 RuleBasedTokenizer::RuleBasedTokenizer(const UnicodeString &rules, UParseError &parseErr, UErrorCode &err)
262 : RuleBasedBreakIterator(rules, parseErr, err)
263 {
264 if (U_SUCCESS(err)) {
265 init();
266 }
267 }
268
269 RuleBasedTokenizer::RuleBasedTokenizer(uint8_t *data, UErrorCode &status)
270 : RuleBasedBreakIterator((RBBIDataHeader *)data, status)
271 {
272 if (U_SUCCESS(status)) {
273 init();
274 }
275 }
276
277 RuleBasedTokenizer::RuleBasedTokenizer(const uint8_t *data, enum EDontAdopt, UErrorCode &status)
278 : RuleBasedBreakIterator((const RBBIDataHeader *)data, RuleBasedBreakIterator::kDontAdopt, status)
279 {
280 if (U_SUCCESS(status)) {
281 init();
282 }
283 }
284
285 RuleBasedTokenizer::~RuleBasedTokenizer() {
286 delete [] fStateFlags;
287 delete [] fLatin1Cat;
288 }
289
290 U_NAMESPACE_END
291
292 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */