]>
Commit | Line | Data |
---|---|---|
73c04bcf A |
1 | /* |
2 | *************************************************************************** | |
46f4442e | 3 | * Copyright (C) 2006-2008 Apple Inc. All Rights Reserved. * |
73c04bcf | 4 | *************************************************************************** |
73c04bcf A |
5 | */ |
6 | ||
7 | #include "unicode/utypes.h" | |
8 | ||
9 | #if !UCONFIG_NO_BREAK_ITERATION | |
10 | ||
11 | #include "rbtok.h" | |
12 | #include "unicode/ustring.h" | |
13 | #include "unicode/utext.h" | |
14 | #include "rbbidata.h" | |
15 | ||
16 | U_NAMESPACE_BEGIN | |
17 | ||
18 | ||
19 | #if defined(__GNUC__) && (__GNUC__ >= 4) | |
20 | #pragma GCC optimization_level 3 | |
21 | #endif | |
22 | ||
23 | static const int16_t START_STATE = 1; // State in which matching of every token begins; also selects the table row cached in fStartRow | |
24 | static const int16_t STOP_STATE = 0; // Next-state value that ends the per-token match loop ("stop": no longer match is possible) | |
25 | ||
26 | int32_t RuleBasedTokenizer::tokenize(int32_t maxTokens, RuleBasedTokenRange *outTokenRanges, unsigned long *outTokenFlags) | |
27 | { | |
28 | RuleBasedTokenRange *outTokenLimit = outTokenRanges + maxTokens; | |
29 | RuleBasedTokenRange *outTokenP = outTokenRanges; | |
30 | int32_t state; | |
31 | int16_t category; | |
32 | ||
33 | const RBBIStateTableRow *row; | |
34 | const RBBIStateTableRow *const startRow = fStartRow; | |
35 | ||
36 | int32_t lastAcceptingState = 0; | |
37 | UChar32 c = 0; | |
38 | signed long prev; | |
39 | signed long result; | |
40 | const char *const tableData = fData->fForwardTable->fTableData; | |
41 | const uint32_t tableRowLen = fData->fForwardTable->fRowLen; | |
42 | UText *text = fText; | |
43 | ||
44 | #ifdef RBBI_DEBUG | |
45 | if (fTrace) { | |
46 | RBBIDebugPuts("Handle Next pos char state category"); | |
47 | } | |
48 | #endif | |
49 | ||
50 | fLastStatusIndexValid = FALSE; | |
51 | ||
52 | // if we're already at the end of the text, return DONE. | |
53 | prev = (signed long)UTEXT_GETNATIVEINDEX(text); | |
54 | ||
55 | // loop until we reach the end of the text or transition to state 0 | |
56 | // | |
57 | const UTrie *trie = &fData->fTrie; | |
58 | while (outTokenP < outTokenLimit) { | |
59 | c = UTEXT_NEXT32(text); | |
60 | if (c == U_SENTINEL) | |
61 | { | |
62 | goto exitTokenizer; | |
63 | } | |
64 | // Set the initial state for the state machine | |
65 | state = START_STATE; | |
66 | row = startRow; | |
67 | ||
68 | // if we have cached break positions and we're still in the range | |
69 | // covered by them, just move one step forward in the cache | |
70 | if (fCachedBreakPositions != NULL) { | |
71 | if (fPositionInCache < fNumCachedBreakPositions - 1) { | |
72 | ++fPositionInCache; | |
73 | result = fCachedBreakPositions[fPositionInCache]; | |
74 | goto emitToken; | |
75 | } | |
76 | else { | |
77 | reset(); | |
78 | } | |
79 | } | |
80 | ||
81 | while (c != U_SENTINEL) { | |
82 | // | |
83 | // Get the char category. An incoming category of 1 or 2 means that | |
84 | // we are preset for doing the beginning or end of input, and | |
85 | // that we shouldn't get a category from an actual text input character. | |
86 | // | |
87 | // look up the current character's character category, which tells us | |
88 | // which column in the state table to look at. | |
89 | // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, | |
90 | // not the size of the character going in, which is a UChar32. | |
91 | // | |
92 | if (__builtin_expect((c < 0x100), 1)) | |
93 | category = fLatin1Cat[c]; | |
94 | else | |
95 | UTRIE_GET16(trie, c, category); | |
96 | ||
97 | // Check the dictionary bit in the character's category. | |
98 | // Counter is only used by dictionary based iterators (subclasses). | |
99 | // Chars that need to be handled by a dictionary have a flag bit set | |
100 | // in their category values. | |
101 | // | |
102 | if (__builtin_expect((category & 0x4000) != 0, 0)) { | |
103 | fDictionaryCharCount++; | |
104 | // And off the dictionary flag bit. | |
105 | category &= ~0x4000; | |
106 | } | |
107 | ||
108 | #ifdef RBBI_DEBUG | |
109 | if (fTrace) { | |
110 | RBBIDebugPrintf(" %4d ", utext_getNativeIndex(fText)); | |
111 | if (0x20<=c && c<0x7f) { | |
112 | RBBIDebugPrintf("\"%c\" ", c); | |
113 | } else { | |
114 | RBBIDebugPrintf("%5x ", c); | |
115 | } | |
116 | RBBIDebugPrintf("%3d %3d\n", state, category); | |
117 | } | |
118 | #endif | |
119 | ||
120 | // State Transition - move machine to its next state | |
121 | // | |
122 | state = row->fNextState[category]; | |
123 | row = (const RBBIStateTableRow *) (tableData + tableRowLen * state); | |
124 | ||
125 | if (row->fAccepting == -1) { | |
126 | // Match found, common case. | |
127 | result = (signed long)UTEXT_GETNATIVEINDEX(text); | |
128 | //fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values. | |
129 | //lastStatusRow = row; | |
130 | lastAcceptingState = state; | |
131 | } | |
132 | ||
133 | if (state == STOP_STATE) { | |
134 | // This is the normal exit from the lookup state machine. | |
135 | // We have advanced through the string until it is certain that no | |
136 | // longer match is possible, no matter what characters follow. | |
137 | break; | |
138 | } | |
139 | ||
140 | // Advance to the next character. | |
141 | // If this is a beginning-of-input loop iteration, don't advance | |
142 | // the input position. The next iteration will be processing the | |
143 | // first real input character. | |
144 | c = UTEXT_NEXT32(text); | |
145 | } | |
146 | ||
147 | if (fDictionaryCharCount > 0) { | |
148 | result = (signed long) checkDictionary(prev, (int32_t) result, FALSE); | |
149 | } | |
150 | ||
151 | emitToken: | |
152 | // The state machine is done. Check whether it found a match... | |
153 | ||
154 | // Leave the iterator at our result position. | |
155 | UTEXT_SETNATIVEINDEX(text, result); | |
156 | ||
157 | RuleBasedTokenRange range = {(signed long)prev, (signed long) (result-prev)}; | |
158 | int32_t flags = fStateFlags[lastAcceptingState]; | |
159 | ||
160 | if (flags == -1) | |
161 | goto skipToken; | |
162 | ||
163 | *outTokenP++ = range; | |
164 | if (outTokenFlags) | |
165 | { | |
166 | *outTokenFlags++ = (unsigned long) flags; | |
167 | } | |
168 | ||
169 | if (flags & 0x40000000) | |
170 | goto exitTokenizer; | |
171 | ||
172 | skipToken: | |
173 | prev = result; | |
174 | } | |
175 | ||
176 | exitTokenizer: | |
177 | return (outTokenP - outTokenRanges); | |
178 | } | |
179 | ||
180 | #if defined (__GNUC__) && (__GNUC__ >= 4) | |
181 | #pragma GCC optimization_level reset | |
182 | #endif | |
183 | ||
184 | void | |
185 | RuleBasedTokenizer::init() | |
186 | { | |
187 | const RBBIStateTable *statetable = fData->fForwardTable; | |
188 | setBreakType(UBRK_WORD); | |
189 | fStartRow = (const RBBIStateTableRow *) | |
190 | (statetable->fTableData + (statetable->fRowLen * START_STATE)); | |
191 | UChar i; | |
192 | const UTrie *trie = &fData->fTrie; | |
193 | int16_t category; | |
194 | fLatin1Cat = new int16_t[256]; | |
195 | for (i = 0; i < 256; ++i) | |
196 | { | |
197 | //UTRIE_GET16(trie, i, category); | |
198 | //fLatin1Cat[i] = category; | |
199 | fLatin1Cat[i] = _UTRIE_GET_RAW(trie, index, 0, i); | |
200 | } | |
201 | fStateFlags = new int32_t[statetable->fNumStates]; | |
202 | for (i = 0; i < statetable->fNumStates; ++i) | |
203 | { | |
204 | const RBBIStateTableRow *row = (const RBBIStateTableRow *) | |
205 | (statetable->fTableData + (statetable->fRowLen * i)); | |
206 | int32_t flags = 0; | |
207 | if (row->fAccepting == -1) | |
208 | { | |
209 | const int32_t *vals = (fData->fRuleStatusTable) + (row->fTagIdx); | |
210 | const int32_t *valLimit = vals + 1; | |
211 | valLimit += *vals++; | |
212 | while (vals < valLimit) | |
213 | { | |
214 | int32_t val = *vals++; | |
215 | if (val == 0) | |
216 | { | |
217 | break; | |
218 | } | |
219 | else if (val > 0) | |
220 | { | |
221 | flags |= val; | |
222 | } | |
223 | else | |
224 | { | |
225 | flags = val; | |
226 | break; | |
227 | } | |
228 | } | |
229 | } | |
230 | fStateFlags[i] = flags; | |
231 | } | |
232 | } | |
233 | ||
234 | RuleBasedTokenizer::RuleBasedTokenizer(const UnicodeString &rules, UParseError &parseErr, UErrorCode &err) | |
235 | : RuleBasedBreakIterator(rules, parseErr, err) | |
236 | { | |
237 | init(); | |
238 | } | |
239 | ||
240 | RuleBasedTokenizer::RuleBasedTokenizer(uint8_t *data, UErrorCode &status) | |
241 | : RuleBasedBreakIterator((RBBIDataHeader *)data, status) | |
242 | { | |
243 | init(); | |
244 | } | |
245 | ||
46f4442e A |
246 | RuleBasedTokenizer::RuleBasedTokenizer(const uint8_t *data, enum EDontAdopt, UErrorCode &status) |
247 | : RuleBasedBreakIterator((const RBBIDataHeader *)data, RuleBasedBreakIterator::kDontAdopt, status) | |
248 | { | |
249 | init(); | |
250 | } | |
251 | ||
73c04bcf A |
252 | RuleBasedTokenizer::~RuleBasedTokenizer() { |
253 | delete [] fStateFlags; | |
254 | delete [] fLatin1Cat; | |
255 | } | |
256 | ||
257 | U_NAMESPACE_END | |
258 | ||
259 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |