]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/rbtok.cpp
ICU-551.30.tar.gz
[apple/icu.git] / icuSources / common / rbtok.cpp
1 /*
2 ***************************************************************************
3 * Copyright (C) 2006-2008 Apple Inc. All Rights Reserved. *
4 ***************************************************************************
5 */
6
7 #include "unicode/utypes.h"
8
9 #if !UCONFIG_NO_BREAK_ITERATION
10
11 #include "rbtok.h"
12 #include "unicode/ustring.h"
13 #include "unicode/utext.h"
14 #include "rbbidata.h"
15
16 U_NAMESPACE_BEGIN
17
18
19 #if defined(__GNUC__) && (__GNUC__ >= 4)
20 #pragma GCC optimization_level 3
21 #endif
22
/* State-transition value that means "stop": the state machine has nothing further to match. */
static const int16_t STOP_STATE = 0;

/* State number in which the state machine begins every new token. */
static const int16_t START_STATE = 1;
25
//
//  tokenize()   Run the forward state table over fText, beginning at the text's current
//               native index, and record up to maxTokens token ranges.
//
//      maxTokens        capacity of outTokenRanges (and of outTokenFlags when supplied);
//                       the outer loop stops once this many ranges have been written.
//      outTokenRanges   receives one {start index, length} entry per emitted token.
//      outTokenFlags    optional (may be NULL); when non-NULL it receives, in parallel with
//                       outTokenRanges, the fStateFlags[] value of the state that accepted
//                       each token.
//
//      Returns the number of entries written to outTokenRanges.
//
//      Flag handling, as implemented below:
//          flags == -1            the token is consumed but not emitted.
//          flags & 0x40000000     the token is emitted and tokenizing then stops.
//
int32_t RuleBasedTokenizer::tokenize(int32_t maxTokens, RuleBasedTokenRange *outTokenRanges, unsigned long *outTokenFlags)
{
    RuleBasedTokenRange *outTokenLimit = outTokenRanges + maxTokens;   // one past the last writable slot
    RuleBasedTokenRange *outTokenP = outTokenRanges;                   // next slot to write
    int32_t             state;
    int16_t             category;

    const RBBIStateTableRow *row;
    const RBBIStateTableRow *const startRow = fStartRow;               // row for START_STATE, set up by init()

    // NOTE(review): lastAcceptingState is initialized once here and is not reset for
    // each token, so a token taken from the break cache (or one for which no accepting
    // row is reached) uses the value left behind by an earlier token - confirm intended.
    int32_t             lastAcceptingState = 0;
    UChar32             c = 0;
    signed long         prev;       // native index at which the current token starts
    // NOTE(review): result is assigned only when an accepting row is reached or a cached
    // break position is used; if the very first token does neither, it is read
    // uninitialized at emitToken - confirm the rule data rules this out.
    signed long         result;     // native index at which the current token ends
    const char *const   tableData = fData->fForwardTable->fTableData;
    const uint32_t      tableRowLen = fData->fForwardTable->fRowLen;
    UText               *text = fText;

#ifdef RBBI_DEBUG
    if (fTrace) {
        RBBIDebugPuts("Handle Next pos char state category");
    }
#endif

    fLastStatusIndexValid = FALSE;

    // Remember where the first token starts: the text's current native index.
    prev = (signed long)UTEXT_GETNATIVEINDEX(text);

    // Outer loop: one iteration per token, until the output buffer is full, the end of
    // the text is reached, or a token carrying the 0x40000000 flag has been emitted.
    //
    const UTrie *trie = &fData->fTrie;
    while (outTokenP < outTokenLimit) {
        // Fetch the first character of the next token; running out of text ends tokenizing.
        c = UTEXT_NEXT32(text);
        if (c == U_SENTINEL)
        {
            goto exitTokenizer;
        }
        // Set the initial state for the state machine
        state = START_STATE;
        row = startRow;

        // if we have cached break positions and we're still in the range
        // covered by them, just move one step forward in the cache
        // (the character fetched above is not examined on this path; emitToken
        //  repositions the text at the cached boundary).
        if (fCachedBreakPositions != NULL) {
            if (fPositionInCache < fNumCachedBreakPositions - 1) {
                ++fPositionInCache;
                result = fCachedBreakPositions[fPositionInCache];
                goto emitToken;
            }
            else {
                reset();
            }
        }

        // Inner loop: feed characters to the state machine until it transitions to
        // STOP_STATE or the text is exhausted.
        while (c != U_SENTINEL) {
            //
            // look up the current character's character category, which tells us
            // which column in the state table to look at.  Code points below 0x100
            // come from the fLatin1Cat table; all others go through the trie.
            // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
            //        not the size of the character going in, which is a UChar32.
            //
            if (__builtin_expect((c < 0x100), 1))
                category = fLatin1Cat[c];
            else
                UTRIE_GET16(trie, c, category);

            // Check the dictionary bit in the character's category.
            //    Counter is only used by dictionary based iterators (subclasses).
            //    Chars that need to be handled by a dictionary have a flag bit set
            //    in their category values.
            //
            if (__builtin_expect((category & 0x4000) != 0, 0)) {
                fDictionaryCharCount++;
                //  And off the dictionary flag bit.
                category &= ~0x4000;
            }

#ifdef RBBI_DEBUG
            if (fTrace) {
                RBBIDebugPrintf(" %4d ", utext_getNativeIndex(fText));
                if (0x20<=c && c<0x7f) {
                    RBBIDebugPrintf("\"%c\" ", c);
                } else {
                    RBBIDebugPrintf("%5x ", c);
                }
                RBBIDebugPrintf("%3d %3d\n", state, category);
            }
#endif

            // State Transition - move machine to its next state.
            // Rows are tableRowLen bytes apart, starting at tableData.
            //
            state = row->fNextState[category];
            row = (const RBBIStateTableRow *) (tableData + tableRowLen * state);

            if (row->fAccepting == -1) {
                // Match found, common case.  Record the text position just past the
                // character consumed so far, and the state that accepted it.
                result = (signed long)UTEXT_GETNATIVEINDEX(text);
                //fLastRuleStatusIndex = row->fTagIdx;   // Remember the break status (tag) values.
                //lastStatusRow = row;
                lastAcceptingState = state;
            }

            if (state == STOP_STATE) {
                // This is the normal exit from the lookup state machine.
                // We have advanced through the string until it is certain that no
                //   longer match is possible, no matter what characters follow.
                break;
            }

            // Advance to the next character; U_SENTINEL (end of text) ends the inner loop.
            c = UTEXT_NEXT32(text);
        }

        // If any dictionary-flagged characters have been counted, the boundary is
        // replaced by whatever checkDictionary() returns for the span [prev, result).
        if (fDictionaryCharCount > 0) {
            result = (signed long) checkDictionary(prev, (int32_t) result, FALSE);
        }

emitToken:
        // The state machine is done (or a cached boundary was used).

        // Leave the iterator at our result position.
        UTEXT_SETNATIVEINDEX(text, result);

        // The token spans [prev, result): stored as its start index and its length.
        RuleBasedTokenRange range = {(signed long)prev, (signed long) (result-prev)};
        int32_t flags = fStateFlags[lastAcceptingState];

        // -1 marks a token that is consumed without being reported.
        if (flags == -1)
            goto skipToken;

        *outTokenP++ = range;
        if (outTokenFlags)
        {
            *outTokenFlags++ = (unsigned long) flags;
        }

        // Bit 0x40000000 asks the tokenizer to stop right after emitting this token.
        if (flags & 0x40000000)
            goto exitTokenizer;

skipToken:
        // The next token starts where this one ended.
        prev = result;
    }

exitTokenizer:
    // Number of ranges actually written.
    return (outTokenP - outTokenRanges);
}
179
180 #if defined (__GNUC__) && (__GNUC__ >= 4)
181 #pragma GCC optimization_level reset
182 #endif
183
184 void
185 RuleBasedTokenizer::init()
186 {
187 const RBBIStateTable *statetable = fData->fForwardTable;
188 setBreakType(UBRK_WORD);
189 fStartRow = (const RBBIStateTableRow *)
190 (statetable->fTableData + (statetable->fRowLen * START_STATE));
191 UChar i;
192 const UTrie *trie = &fData->fTrie;
193 int16_t category;
194 fLatin1Cat = new int16_t[256];
195 for (i = 0; i < 256; ++i)
196 {
197 //UTRIE_GET16(trie, i, category);
198 //fLatin1Cat[i] = category;
199 fLatin1Cat[i] = _UTRIE_GET_RAW(trie, index, 0, i);
200 }
201 fStateFlags = new int32_t[statetable->fNumStates];
202 for (i = 0; i < statetable->fNumStates; ++i)
203 {
204 const RBBIStateTableRow *row = (const RBBIStateTableRow *)
205 (statetable->fTableData + (statetable->fRowLen * i));
206 int32_t flags = 0;
207 if (row->fAccepting == -1)
208 {
209 const int32_t *vals = (fData->fRuleStatusTable) + (row->fTagIdx);
210 const int32_t *valLimit = vals + 1;
211 valLimit += *vals++;
212 while (vals < valLimit)
213 {
214 int32_t val = *vals++;
215 if (val == 0)
216 {
217 break;
218 }
219 else if (val > 0)
220 {
221 flags |= val;
222 }
223 else
224 {
225 flags = val;
226 break;
227 }
228 }
229 }
230 fStateFlags[i] = flags;
231 }
232 }
233
234 RuleBasedTokenizer::RuleBasedTokenizer(const UnicodeString &rules, UParseError &parseErr, UErrorCode &err)
235 : RuleBasedBreakIterator(rules, parseErr, err)
236 {
237 init();
238 }
239
240 RuleBasedTokenizer::RuleBasedTokenizer(uint8_t *data, UErrorCode &status)
241 : RuleBasedBreakIterator((RBBIDataHeader *)data, status)
242 {
243 init();
244 }
245
246 RuleBasedTokenizer::RuleBasedTokenizer(const uint8_t *data, enum EDontAdopt, UErrorCode &status)
247 : RuleBasedBreakIterator((const RBBIDataHeader *)data, RuleBasedBreakIterator::kDontAdopt, status)
248 {
249 init();
250 }
251
252 RuleBasedTokenizer::~RuleBasedTokenizer() {
253 delete [] fStateFlags;
254 delete [] fLatin1Cat;
255 }
256
257 U_NAMESPACE_END
258
259 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */