]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/rbtok.cpp
ICU-59180.0.1.tar.gz
[apple/icu.git] / icuSources / common / rbtok.cpp
1 /*
2 ***************************************************************************
3 * Copyright (C) 2006-2008,2017 Apple Inc. All Rights Reserved. *
4 ***************************************************************************
5 */
6
7 #include "unicode/utypes.h"
8
9 #if !UCONFIG_NO_BREAK_ITERATION
10
11 #include "rbtok.h"
12 #include "unicode/ustring.h"
13 #include "unicode/utext.h"
14 #include "rbbidata.h"
15 #include "rbbirb.h"
16 #include "uassert.h"
17
#ifdef RBBI_DEBUG
// Debug-trace switch consulted by the RBBIDebug* calls in this file.
// The equivalent flag is now static in rbbi.cpp, where it gets set dynamically.
// For now it is duplicated here so this file builds; force it to TRUE if
// trace output is desired.
static UBool fTrace = FALSE;
#endif
23
24 U_NAMESPACE_BEGIN
25
26
// Sentinel state numbers for the forward state-table walk.
static const int16_t STOP_STATE  = 0;   // Transition value meaning "stop": the walk for this token ends here
static const int16_t START_STATE = 1;   // State (table row) in which every token's walk begins
29
30 int32_t RuleBasedTokenizer::tokenize(int32_t maxTokens, RuleBasedTokenRange *outTokenRanges, unsigned long *outTokenFlags)
31 {
32 RuleBasedTokenRange *outTokenLimit = outTokenRanges + maxTokens;
33 RuleBasedTokenRange *outTokenP = outTokenRanges;
34 int32_t state;
35 uint16_t category = 0;
36
37 const RBBIStateTableRow *row;
38 const RBBIStateTableRow *const startRow = fStartRow;
39
40 int32_t lastAcceptingState = 0;
41 UChar32 c = 0;
42 signed long prev;
43 signed long result;
44 const char *const tableData = fData->fForwardTable->fTableData;
45 const uint32_t tableRowLen = fData->fForwardTable->fRowLen;
46 UText *text = fText;
47
48 #ifdef RBBI_DEBUG
49 if (fTrace) {
50 RBBIDebugPuts("Handle Next pos char state category");
51 }
52 #endif
53
54 fLastStatusIndexValid = FALSE;
55
56 // if we're already at the end of the text, return DONE.
57 prev = (signed long)UTEXT_GETNATIVEINDEX(text);
58
59 // loop until we reach the end of the text or transition to state 0
60 //
61 const UTrie *trie = &fData->fTrie;
62 while (outTokenP < outTokenLimit) {
63 // LookAheadResults lookAheadMatches; // added in RBBI, #12081/r38387
64 result = prev; // fallback initialization, prevent uninitialized use
65 c = UTEXT_NEXT32(text);
66 if (c == U_SENTINEL)
67 {
68 goto exitTokenizer;
69 }
70 // Set the initial state for the state machine
71 state = START_STATE;
72 row = startRow;
73
74 // if we have cached break positions and we're still in the range
75 // covered by them, just move one step forward in the cache
76 if (fCachedBreakPositions != NULL) {
77 if (fPositionInCache < fNumCachedBreakPositions - 1) {
78 ++fPositionInCache;
79 result = fCachedBreakPositions[fPositionInCache];
80 goto emitToken;
81 }
82 else {
83 reset();
84 }
85 }
86
87 while (c != U_SENTINEL) {
88 //
89 // Get the char category. An incoming category of 1 or 2 means that
90 // we are preset for doing the beginning or end of input, and
91 // that we shouldn't get a category from an actual text input character.
92 //
93 // look up the current character's character category, which tells us
94 // which column in the state table to look at.
95 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
96 // not the size of the character going in, which is a UChar32.
97 //
98 if (c < 0x100)
99 category = fLatin1Cat[c];
100 else
101 UTRIE_GET16(trie, c, category);
102
103 // Check the dictionary bit in the character's category.
104 // Counter is only used by dictionary based iterators (subclasses).
105 // Chars that need to be handled by a dictionary have a flag bit set
106 // in their category values.
107 //
108 if ((category & 0x4000) != 0) {
109 fDictionaryCharCount++;
110 // And off the dictionary flag bit.
111 category &= ~0x4000;
112 }
113
114 #ifdef RBBI_DEBUG
115 if (fTrace) {
116 RBBIDebugPrintf(" %4lld ", utext_getNativeIndex(fText));
117 if (0x20<=c && c<0x7f) {
118 RBBIDebugPrintf("\"%c\" ", c);
119 } else {
120 RBBIDebugPrintf("%5x ", c);
121 }
122 RBBIDebugPrintf("%3d %3d\n", state, category);
123 }
124 #endif
125
126 // State Transition - move machine to its next state
127 //
128
129 // Note: fNextState is defined as uint16_t[2], but we are casting
130 // a generated RBBI table to RBBIStateTableRow and some tables
131 // actually have more than 2 categories.
132 U_ASSERT(category<fData->fHeader->fCatCount);
133 state = row->fNextState[category];
134 row = (const RBBIStateTableRow *) (tableData + tableRowLen * state);
135
136 if (row->fAccepting == -1) {
137 // Match found, common case.
138 result = (signed long)UTEXT_GETNATIVEINDEX(text);
139 //fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values.
140 //lastStatusRow = row;
141 lastAcceptingState = state;
142 }
143
144 // rbbi has added code here to check lookAheadMatches and
145 // set lookAheadMatches, per open-source ICU #12081/r38387
146
147 if (state == STOP_STATE) {
148 // This is the normal exit from the lookup state machine.
149 // We have advanced through the string until it is certain that no
150 // longer match is possible, no matter what characters follow.
151 break;
152 }
153
154 // Advance to the next character.
155 // If this is a beginning-of-input loop iteration, don't advance
156 // the input position. The next iteration will be processing the
157 // first real input character.
158 c = UTEXT_NEXT32(text);
159 }
160
161 if (fDictionaryCharCount > 0) {
162 result = (signed long) checkDictionary(prev, (int32_t) result, FALSE);
163 }
164
165 emitToken:
166 // The state machine is done. Check whether it found a match...
167
168 // If the iterator failed to advance in the match engine, force it ahead by one.
169 // (This really indicates a defect in the break rules. They should always match
170 // at least one character.). Added in open-source ICU r13469
171 UBool setFlagsZero = FALSE;
172 if (result == prev) {
173 UTEXT_SETNATIVEINDEX(text, prev);
174 UTEXT_NEXT32(text);
175 result = (int32_t)UTEXT_GETNATIVEINDEX(text);
176 setFlagsZero = TRUE;
177 }
178
179 // Leave the iterator at our result position.
180 UTEXT_SETNATIVEINDEX(text, result);
181
182 RuleBasedTokenRange range = {(signed long)prev, (signed long) (result-prev)};
183 int32_t flags = (!setFlagsZero)? fStateFlags[lastAcceptingState]: 0;
184
185 if (flags == -1) {
186 goto skipToken;
187 }
188
189 #ifdef RBBI_DEBUG
190 if (fTrace) {
191 RBBIDebugPrintf("Emit location %3ld length %2ld flags %08X\n", range.location, range.length, flags);
192 }
193 #endif
194 *outTokenP++ = range;
195 if (outTokenFlags)
196 {
197 *outTokenFlags++ = (unsigned long) flags;
198 }
199
200 if (flags & 0x40000000) {
201 goto exitTokenizer;
202 }
203
204 skipToken:
205 prev = result;
206 }
207
208 exitTokenizer:
209 return (outTokenP - outTokenRanges);
210 }
211
212 void
213 RuleBasedTokenizer::init()
214 {
215 const RBBIStateTable *statetable = fData->fForwardTable;
216 setBreakType(UBRK_WORD);
217 fStartRow = (const RBBIStateTableRow *)
218 (statetable->fTableData + (statetable->fRowLen * START_STATE));
219 UChar i;
220 const UTrie *trie = &fData->fTrie;
221 //int16_t category;
222 fLatin1Cat = new int16_t[256];
223 for (i = 0; i < 256; ++i)
224 {
225 //UTRIE_GET16(trie, i, category);
226 //fLatin1Cat[i] = category;
227 fLatin1Cat[i] = _UTRIE_GET_RAW(trie, index, 0, i);
228 }
229 fStateFlags = new int32_t[statetable->fNumStates];
230 for (i = 0; i < statetable->fNumStates; ++i)
231 {
232 const RBBIStateTableRow *row = (const RBBIStateTableRow *)
233 (statetable->fTableData + (statetable->fRowLen * i));
234 int32_t flags = 0;
235 if (row->fAccepting == -1 && row->fTagIdx != 0)
236 {
237 const int32_t *vals = (fData->fRuleStatusTable) + (row->fTagIdx);
238 const int32_t *valLimit = vals + 1;
239 valLimit += *vals++;
240 while (vals < valLimit)
241 {
242 int32_t val = *vals++;
243 if (val == 0)
244 {
245 break;
246 }
247 else if (val > 0)
248 {
249 flags |= val;
250 }
251 else
252 {
253 flags = val;
254 break;
255 }
256 }
257 }
258 fStateFlags[i] = flags;
259 }
260 }
261
262 RuleBasedTokenizer::RuleBasedTokenizer(const UnicodeString &rules, UParseError &parseErr, UErrorCode &err)
263 : RuleBasedBreakIterator(rules, parseErr, err)
264 {
265 if (U_SUCCESS(err)) {
266 init();
267 }
268 }
269
270 RuleBasedTokenizer::RuleBasedTokenizer(uint8_t *data, UErrorCode &status)
271 : RuleBasedBreakIterator((RBBIDataHeader *)data, status)
272 {
273 if (U_SUCCESS(status)) {
274 init();
275 }
276 }
277
278 RuleBasedTokenizer::RuleBasedTokenizer(const uint8_t *data, enum EDontAdopt, UErrorCode &status)
279 : RuleBasedBreakIterator((const RBBIDataHeader *)data, RuleBasedBreakIterator::kDontAdopt, status)
280 {
281 if (U_SUCCESS(status)) {
282 init();
283 }
284 }
285
286 RuleBasedTokenizer::~RuleBasedTokenizer() {
287 delete [] fStateFlags;
288 delete [] fLatin1Cat;
289 }
290
291 U_NAMESPACE_END
292
293 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */