]>
Commit | Line | Data |
---|---|---|
73c04bcf A |
1 | /* |
2 | *************************************************************************** | |
3 | * Copyright (C) 2006 Apple Computer, Inc. All rights reserved. * | |
4 | *************************************************************************** | |
5 | ||
6 | */ | |
7 | ||
8 | #include "unicode/utypes.h" | |
9 | ||
10 | #if !UCONFIG_NO_BREAK_ITERATION | |
11 | ||
12 | #include "rbtok.h" | |
13 | #include "unicode/ustring.h" | |
14 | #include "unicode/utext.h" | |
15 | #include "rbbidata.h" | |
16 | ||
17 | U_NAMESPACE_BEGIN | |
18 | ||
19 | ||
20 | #if defined(__GNUC__) && (__GNUC__ >= 4) | |
21 | #pragma GCC optimization_level 3 | |
22 | #endif | |
23 | ||
24 | static const int16_t START_STATE = 1; // State number every token scan starts in; init() also uses it to locate fStartRow | |
25 | static const int16_t STOP_STATE = 0; // Next-state value that ends a token scan: no longer match is possible | |
26 | ||
27 | int32_t RuleBasedTokenizer::tokenize(int32_t maxTokens, RuleBasedTokenRange *outTokenRanges, unsigned long *outTokenFlags) | |
28 | { | |
29 | RuleBasedTokenRange *outTokenLimit = outTokenRanges + maxTokens; | |
30 | RuleBasedTokenRange *outTokenP = outTokenRanges; | |
31 | int32_t state; | |
32 | int16_t category; | |
33 | ||
34 | const RBBIStateTableRow *row; | |
35 | const RBBIStateTableRow *const startRow = fStartRow; | |
36 | ||
37 | int32_t lastAcceptingState = 0; | |
38 | UChar32 c = 0; | |
39 | signed long prev; | |
40 | signed long result; | |
41 | const char *const tableData = fData->fForwardTable->fTableData; | |
42 | const uint32_t tableRowLen = fData->fForwardTable->fRowLen; | |
43 | UText *text = fText; | |
44 | ||
45 | #ifdef RBBI_DEBUG | |
46 | if (fTrace) { | |
47 | RBBIDebugPuts("Handle Next pos char state category"); | |
48 | } | |
49 | #endif | |
50 | ||
51 | fLastStatusIndexValid = FALSE; | |
52 | ||
53 | // if we're already at the end of the text, return DONE. | |
54 | prev = (signed long)UTEXT_GETNATIVEINDEX(text); | |
55 | ||
56 | // loop until we reach the end of the text or transition to state 0 | |
57 | // | |
58 | const UTrie *trie = &fData->fTrie; | |
59 | while (outTokenP < outTokenLimit) { | |
60 | c = UTEXT_NEXT32(text); | |
61 | if (c == U_SENTINEL) | |
62 | { | |
63 | goto exitTokenizer; | |
64 | } | |
65 | // Set the initial state for the state machine | |
66 | state = START_STATE; | |
67 | row = startRow; | |
68 | ||
69 | // if we have cached break positions and we're still in the range | |
70 | // covered by them, just move one step forward in the cache | |
71 | if (fCachedBreakPositions != NULL) { | |
72 | if (fPositionInCache < fNumCachedBreakPositions - 1) { | |
73 | ++fPositionInCache; | |
74 | result = fCachedBreakPositions[fPositionInCache]; | |
75 | goto emitToken; | |
76 | } | |
77 | else { | |
78 | reset(); | |
79 | } | |
80 | } | |
81 | ||
82 | while (c != U_SENTINEL) { | |
83 | // | |
84 | // Get the char category. An incoming category of 1 or 2 means that | |
85 | // we are preset for doing the beginning or end of input, and | |
86 | // that we shouldn't get a category from an actual text input character. | |
87 | // | |
88 | // look up the current character's character category, which tells us | |
89 | // which column in the state table to look at. | |
90 | // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, | |
91 | // not the size of the character going in, which is a UChar32. | |
92 | // | |
93 | if (__builtin_expect((c < 0x100), 1)) | |
94 | category = fLatin1Cat[c]; | |
95 | else | |
96 | UTRIE_GET16(trie, c, category); | |
97 | ||
98 | // Check the dictionary bit in the character's category. | |
99 | // Counter is only used by dictionary based iterators (subclasses). | |
100 | // Chars that need to be handled by a dictionary have a flag bit set | |
101 | // in their category values. | |
102 | // | |
103 | if (__builtin_expect((category & 0x4000) != 0, 0)) { | |
104 | fDictionaryCharCount++; | |
105 | // And off the dictionary flag bit. | |
106 | category &= ~0x4000; | |
107 | } | |
108 | ||
109 | #ifdef RBBI_DEBUG | |
110 | if (fTrace) { | |
111 | RBBIDebugPrintf(" %4d ", utext_getNativeIndex(fText)); | |
112 | if (0x20<=c && c<0x7f) { | |
113 | RBBIDebugPrintf("\"%c\" ", c); | |
114 | } else { | |
115 | RBBIDebugPrintf("%5x ", c); | |
116 | } | |
117 | RBBIDebugPrintf("%3d %3d\n", state, category); | |
118 | } | |
119 | #endif | |
120 | ||
121 | // State Transition - move machine to its next state | |
122 | // | |
123 | state = row->fNextState[category]; | |
124 | row = (const RBBIStateTableRow *) (tableData + tableRowLen * state); | |
125 | ||
126 | if (row->fAccepting == -1) { | |
127 | // Match found, common case. | |
128 | result = (signed long)UTEXT_GETNATIVEINDEX(text); | |
129 | //fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values. | |
130 | //lastStatusRow = row; | |
131 | lastAcceptingState = state; | |
132 | } | |
133 | ||
134 | if (state == STOP_STATE) { | |
135 | // This is the normal exit from the lookup state machine. | |
136 | // We have advanced through the string until it is certain that no | |
137 | // longer match is possible, no matter what characters follow. | |
138 | break; | |
139 | } | |
140 | ||
141 | // Advance to the next character. | |
142 | // If this is a beginning-of-input loop iteration, don't advance | |
143 | // the input position. The next iteration will be processing the | |
144 | // first real input character. | |
145 | c = UTEXT_NEXT32(text); | |
146 | } | |
147 | ||
148 | if (fDictionaryCharCount > 0) { | |
149 | result = (signed long) checkDictionary(prev, (int32_t) result, FALSE); | |
150 | } | |
151 | ||
152 | emitToken: | |
153 | // The state machine is done. Check whether it found a match... | |
154 | ||
155 | // Leave the iterator at our result position. | |
156 | UTEXT_SETNATIVEINDEX(text, result); | |
157 | ||
158 | RuleBasedTokenRange range = {(signed long)prev, (signed long) (result-prev)}; | |
159 | int32_t flags = fStateFlags[lastAcceptingState]; | |
160 | ||
161 | if (flags == -1) | |
162 | goto skipToken; | |
163 | ||
164 | *outTokenP++ = range; | |
165 | if (outTokenFlags) | |
166 | { | |
167 | *outTokenFlags++ = (unsigned long) flags; | |
168 | } | |
169 | ||
170 | if (flags & 0x40000000) | |
171 | goto exitTokenizer; | |
172 | ||
173 | skipToken: | |
174 | prev = result; | |
175 | } | |
176 | ||
177 | exitTokenizer: | |
178 | return (outTokenP - outTokenRanges); | |
179 | } | |
180 | ||
181 | #if defined (__GNUC__) && (__GNUC__ >= 4) | |
182 | #pragma GCC optimization_level reset | |
183 | #endif | |
184 | ||
185 | void | |
186 | RuleBasedTokenizer::init() | |
187 | { | |
188 | const RBBIStateTable *statetable = fData->fForwardTable; | |
189 | setBreakType(UBRK_WORD); | |
190 | fStartRow = (const RBBIStateTableRow *) | |
191 | (statetable->fTableData + (statetable->fRowLen * START_STATE)); | |
192 | UChar i; | |
193 | const UTrie *trie = &fData->fTrie; | |
194 | int16_t category; | |
195 | fLatin1Cat = new int16_t[256]; | |
196 | for (i = 0; i < 256; ++i) | |
197 | { | |
198 | //UTRIE_GET16(trie, i, category); | |
199 | //fLatin1Cat[i] = category; | |
200 | fLatin1Cat[i] = _UTRIE_GET_RAW(trie, index, 0, i); | |
201 | } | |
202 | fStateFlags = new int32_t[statetable->fNumStates]; | |
203 | for (i = 0; i < statetable->fNumStates; ++i) | |
204 | { | |
205 | const RBBIStateTableRow *row = (const RBBIStateTableRow *) | |
206 | (statetable->fTableData + (statetable->fRowLen * i)); | |
207 | int32_t flags = 0; | |
208 | if (row->fAccepting == -1) | |
209 | { | |
210 | const int32_t *vals = (fData->fRuleStatusTable) + (row->fTagIdx); | |
211 | const int32_t *valLimit = vals + 1; | |
212 | valLimit += *vals++; | |
213 | while (vals < valLimit) | |
214 | { | |
215 | int32_t val = *vals++; | |
216 | if (val == 0) | |
217 | { | |
218 | break; | |
219 | } | |
220 | else if (val > 0) | |
221 | { | |
222 | flags |= val; | |
223 | } | |
224 | else | |
225 | { | |
226 | flags = val; | |
227 | break; | |
228 | } | |
229 | } | |
230 | } | |
231 | fStateFlags[i] = flags; | |
232 | } | |
233 | } | |
234 | ||
235 | RuleBasedTokenizer::RuleBasedTokenizer(const UnicodeString &rules, UParseError &parseErr, UErrorCode &err) | |
236 | : RuleBasedBreakIterator(rules, parseErr, err) | |
237 | { | |
238 | init(); | |
239 | } | |
240 | ||
241 | RuleBasedTokenizer::RuleBasedTokenizer(uint8_t *data, UErrorCode &status) | |
242 | : RuleBasedBreakIterator((RBBIDataHeader *)data, status) | |
243 | { | |
244 | init(); | |
245 | } | |
246 | ||
247 | RuleBasedTokenizer::~RuleBasedTokenizer() { | |
248 | delete [] fStateFlags; | |
249 | delete [] fLatin1Cat; | |
250 | } | |
251 | ||
252 | U_NAMESPACE_END | |
253 | ||
254 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |