]>
Commit | Line | Data |
---|---|---|
73c04bcf A |
1 | /* |
2 | *************************************************************************** | |
0f5d89e8 | 3 | * Copyright (C) 2006-2008,2017-2018 Apple Inc. All Rights Reserved. * |
73c04bcf | 4 | *************************************************************************** |
73c04bcf A |
5 | */ |
6 | ||
7 | #include "unicode/utypes.h" | |
8 | ||
9 | #if !UCONFIG_NO_BREAK_ITERATION | |
10 | ||
73c04bcf A |
11 | #include "unicode/ustring.h" |
12 | #include "unicode/utext.h" | |
0f5d89e8 A |
13 | #include "rbbidata57.h" |
14 | #include "rbbi57.h" | |
15 | #include "rbtok.h" | |
f3c0d7a5 | 16 | #include "uassert.h" |
73c04bcf | 17 | |
f3c0d7a5 A |
#ifdef RBBI_DEBUG
// The following is now static in rbbi.cpp, gets set dynamically.
// For now duplicate here to build, and force to TRUE if desired.
// When TRUE, tokenize() prints a trace of each state transition and each
// emitted token through RBBIDebugPuts / RBBIDebugPrintf.
static UBool fTrace = FALSE;
#endif
73c04bcf | 23 | |
f3c0d7a5 | 24 | U_NAMESPACE_BEGIN |
73c04bcf | 25 | |
73c04bcf A |
26 | |
// State numbers with a fixed meaning in the forward state table.
// START_STATE is also used by init() to locate the cached start row.
static const int16_t START_STATE = 1;     // The state number of the starting state
static const int16_t STOP_STATE  = 0;     // The state-transition value indicating "stop"
29 | ||
30 | int32_t RuleBasedTokenizer::tokenize(int32_t maxTokens, RuleBasedTokenRange *outTokenRanges, unsigned long *outTokenFlags) | |
31 | { | |
32 | RuleBasedTokenRange *outTokenLimit = outTokenRanges + maxTokens; | |
33 | RuleBasedTokenRange *outTokenP = outTokenRanges; | |
34 | int32_t state; | |
f3c0d7a5 A |
35 | uint16_t category = 0; |
36 | ||
73c04bcf A |
37 | const RBBIStateTableRow *row; |
38 | const RBBIStateTableRow *const startRow = fStartRow; | |
39 | ||
40 | int32_t lastAcceptingState = 0; | |
41 | UChar32 c = 0; | |
42 | signed long prev; | |
43 | signed long result; | |
44 | const char *const tableData = fData->fForwardTable->fTableData; | |
45 | const uint32_t tableRowLen = fData->fForwardTable->fRowLen; | |
46 | UText *text = fText; | |
47 | ||
48 | #ifdef RBBI_DEBUG | |
49 | if (fTrace) { | |
50 | RBBIDebugPuts("Handle Next pos char state category"); | |
51 | } | |
52 | #endif | |
53 | ||
54 | fLastStatusIndexValid = FALSE; | |
55 | ||
56 | // if we're already at the end of the text, return DONE. | |
57 | prev = (signed long)UTEXT_GETNATIVEINDEX(text); | |
f3c0d7a5 | 58 | |
73c04bcf A |
59 | // loop until we reach the end of the text or transition to state 0 |
60 | // | |
f3c0d7a5 | 61 | const UTrie *trie = &fData->fTrie; |
73c04bcf | 62 | while (outTokenP < outTokenLimit) { |
0f5d89e8 | 63 | result = prev; // fallback initialization |
f3c0d7a5 | 64 | c = UTEXT_NEXT32(text); |
73c04bcf A |
65 | if (c == U_SENTINEL) |
66 | { | |
67 | goto exitTokenizer; | |
68 | } | |
69 | // Set the initial state for the state machine | |
70 | state = START_STATE; | |
71 | row = startRow; | |
f3c0d7a5 | 72 | |
73c04bcf A |
73 | // if we have cached break positions and we're still in the range |
74 | // covered by them, just move one step forward in the cache | |
75 | if (fCachedBreakPositions != NULL) { | |
76 | if (fPositionInCache < fNumCachedBreakPositions - 1) { | |
77 | ++fPositionInCache; | |
78 | result = fCachedBreakPositions[fPositionInCache]; | |
79 | goto emitToken; | |
80 | } | |
81 | else { | |
82 | reset(); | |
83 | } | |
84 | } | |
85 | ||
86 | while (c != U_SENTINEL) { | |
87 | // | |
88 | // Get the char category. An incoming category of 1 or 2 means that | |
89 | // we are preset for doing the beginning or end of input, and | |
90 | // that we shouldn't get a category from an actual text input character. | |
91 | // | |
92 | // look up the current character's character category, which tells us | |
93 | // which column in the state table to look at. | |
94 | // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, | |
95 | // not the size of the character going in, which is a UChar32. | |
96 | // | |
2ca993e8 | 97 | if (c < 0x100) |
73c04bcf A |
98 | category = fLatin1Cat[c]; |
99 | else | |
100 | UTRIE_GET16(trie, c, category); | |
f3c0d7a5 | 101 | |
73c04bcf A |
102 | // Check the dictionary bit in the character's category. |
103 | // Counter is only used by dictionary based iterators (subclasses). | |
104 | // Chars that need to be handled by a dictionary have a flag bit set | |
105 | // in their category values. | |
106 | // | |
2ca993e8 | 107 | if ((category & 0x4000) != 0) { |
73c04bcf A |
108 | fDictionaryCharCount++; |
109 | // And off the dictionary flag bit. | |
110 | category &= ~0x4000; | |
111 | } | |
f3c0d7a5 | 112 | |
73c04bcf A |
113 | #ifdef RBBI_DEBUG |
114 | if (fTrace) { | |
f3c0d7a5 | 115 | RBBIDebugPrintf(" %4lld ", utext_getNativeIndex(fText)); |
73c04bcf A |
116 | if (0x20<=c && c<0x7f) { |
117 | RBBIDebugPrintf("\"%c\" ", c); | |
118 | } else { | |
119 | RBBIDebugPrintf("%5x ", c); | |
120 | } | |
121 | RBBIDebugPrintf("%3d %3d\n", state, category); | |
122 | } | |
123 | #endif | |
f3c0d7a5 | 124 | |
73c04bcf A |
125 | // State Transition - move machine to its next state |
126 | // | |
f3c0d7a5 A |
127 | |
128 | // Note: fNextState is defined as uint16_t[2], but we are casting | |
129 | // a generated RBBI table to RBBIStateTableRow and some tables | |
130 | // actually have more than 2 categories. | |
131 | U_ASSERT(category<fData->fHeader->fCatCount); | |
73c04bcf A |
132 | state = row->fNextState[category]; |
133 | row = (const RBBIStateTableRow *) (tableData + tableRowLen * state); | |
f3c0d7a5 | 134 | |
73c04bcf A |
135 | if (row->fAccepting == -1) { |
136 | // Match found, common case. | |
137 | result = (signed long)UTEXT_GETNATIVEINDEX(text); | |
138 | //fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values. | |
139 | //lastStatusRow = row; | |
140 | lastAcceptingState = state; | |
141 | } | |
0f5d89e8 | 142 | |
73c04bcf A |
143 | if (state == STOP_STATE) { |
144 | // This is the normal exit from the lookup state machine. | |
145 | // We have advanced through the string until it is certain that no | |
146 | // longer match is possible, no matter what characters follow. | |
147 | break; | |
148 | } | |
f3c0d7a5 A |
149 | |
150 | // Advance to the next character. | |
73c04bcf A |
151 | // If this is a beginning-of-input loop iteration, don't advance |
152 | // the input position. The next iteration will be processing the | |
153 | // first real input character. | |
154 | c = UTEXT_NEXT32(text); | |
155 | } | |
156 | ||
157 | if (fDictionaryCharCount > 0) { | |
158 | result = (signed long) checkDictionary(prev, (int32_t) result, FALSE); | |
159 | } | |
160 | ||
161 | emitToken: | |
162 | // The state machine is done. Check whether it found a match... | |
f3c0d7a5 A |
163 | |
164 | // If the iterator failed to advance in the match engine, force it ahead by one. | |
165 | // (This really indicates a defect in the break rules. They should always match | |
166 | // at least one character.). Added in open-source ICU r13469 | |
167 | UBool setFlagsZero = FALSE; | |
168 | if (result == prev) { | |
169 | UTEXT_SETNATIVEINDEX(text, prev); | |
170 | UTEXT_NEXT32(text); | |
171 | result = (int32_t)UTEXT_GETNATIVEINDEX(text); | |
172 | setFlagsZero = TRUE; | |
173 | } | |
174 | ||
73c04bcf A |
175 | // Leave the iterator at our result position. |
176 | UTEXT_SETNATIVEINDEX(text, result); | |
177 | ||
178 | RuleBasedTokenRange range = {(signed long)prev, (signed long) (result-prev)}; | |
f3c0d7a5 | 179 | int32_t flags = (!setFlagsZero)? fStateFlags[lastAcceptingState]: 0; |
73c04bcf | 180 | |
f3c0d7a5 | 181 | if (flags == -1) { |
73c04bcf | 182 | goto skipToken; |
f3c0d7a5 | 183 | } |
73c04bcf | 184 | |
f3c0d7a5 A |
185 | #ifdef RBBI_DEBUG |
186 | if (fTrace) { | |
187 | RBBIDebugPrintf("Emit location %3ld length %2ld flags %08X\n", range.location, range.length, flags); | |
188 | } | |
189 | #endif | |
73c04bcf A |
190 | *outTokenP++ = range; |
191 | if (outTokenFlags) | |
192 | { | |
193 | *outTokenFlags++ = (unsigned long) flags; | |
194 | } | |
195 | ||
f3c0d7a5 | 196 | if (flags & 0x40000000) { |
73c04bcf | 197 | goto exitTokenizer; |
f3c0d7a5 | 198 | } |
73c04bcf A |
199 | |
200 | skipToken: | |
201 | prev = result; | |
202 | } | |
f3c0d7a5 | 203 | |
73c04bcf A |
204 | exitTokenizer: |
205 | return (outTokenP - outTokenRanges); | |
206 | } | |
207 | ||
73c04bcf A |
208 | void |
209 | RuleBasedTokenizer::init() | |
210 | { | |
211 | const RBBIStateTable *statetable = fData->fForwardTable; | |
212 | setBreakType(UBRK_WORD); | |
213 | fStartRow = (const RBBIStateTableRow *) | |
214 | (statetable->fTableData + (statetable->fRowLen * START_STATE)); | |
215 | UChar i; | |
216 | const UTrie *trie = &fData->fTrie; | |
f3c0d7a5 | 217 | //int16_t category; |
73c04bcf A |
218 | fLatin1Cat = new int16_t[256]; |
219 | for (i = 0; i < 256; ++i) | |
220 | { | |
221 | //UTRIE_GET16(trie, i, category); | |
222 | //fLatin1Cat[i] = category; | |
223 | fLatin1Cat[i] = _UTRIE_GET_RAW(trie, index, 0, i); | |
224 | } | |
225 | fStateFlags = new int32_t[statetable->fNumStates]; | |
226 | for (i = 0; i < statetable->fNumStates; ++i) | |
227 | { | |
228 | const RBBIStateTableRow *row = (const RBBIStateTableRow *) | |
229 | (statetable->fTableData + (statetable->fRowLen * i)); | |
230 | int32_t flags = 0; | |
f3c0d7a5 | 231 | if (row->fAccepting == -1 && row->fTagIdx != 0) |
73c04bcf A |
232 | { |
233 | const int32_t *vals = (fData->fRuleStatusTable) + (row->fTagIdx); | |
234 | const int32_t *valLimit = vals + 1; | |
235 | valLimit += *vals++; | |
236 | while (vals < valLimit) | |
237 | { | |
238 | int32_t val = *vals++; | |
239 | if (val == 0) | |
240 | { | |
241 | break; | |
242 | } | |
243 | else if (val > 0) | |
244 | { | |
245 | flags |= val; | |
246 | } | |
247 | else | |
248 | { | |
249 | flags = val; | |
250 | break; | |
251 | } | |
252 | } | |
253 | } | |
254 | fStateFlags[i] = flags; | |
255 | } | |
256 | } | |
257 | ||
258 | RuleBasedTokenizer::RuleBasedTokenizer(const UnicodeString &rules, UParseError &parseErr, UErrorCode &err) | |
0f5d89e8 | 259 | : RuleBasedBreakIterator57(rules, parseErr, err) |
73c04bcf | 260 | { |
2ca993e8 A |
261 | if (U_SUCCESS(err)) { |
262 | init(); | |
263 | } | |
73c04bcf A |
264 | } |
265 | ||
266 | RuleBasedTokenizer::RuleBasedTokenizer(uint8_t *data, UErrorCode &status) | |
0f5d89e8 | 267 | : RuleBasedBreakIterator57((RBBIDataHeader57 *)data, status) |
73c04bcf | 268 | { |
2ca993e8 A |
269 | if (U_SUCCESS(status)) { |
270 | init(); | |
271 | } | |
73c04bcf A |
272 | } |
273 | ||
46f4442e | 274 | RuleBasedTokenizer::RuleBasedTokenizer(const uint8_t *data, enum EDontAdopt, UErrorCode &status) |
0f5d89e8 | 275 | : RuleBasedBreakIterator57((const RBBIDataHeader57 *)data, RuleBasedBreakIterator57::kDontAdopt, status) |
46f4442e | 276 | { |
2ca993e8 A |
277 | if (U_SUCCESS(status)) { |
278 | init(); | |
279 | } | |
46f4442e A |
280 | } |
281 | ||
73c04bcf A |
282 | RuleBasedTokenizer::~RuleBasedTokenizer() { |
283 | delete [] fStateFlags; | |
284 | delete [] fLatin1Cat; | |
285 | } | |
286 | ||
287 | U_NAMESPACE_END | |
288 | ||
289 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |