]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/rbtok.cpp
ICU-8.11.1.tar.gz
[apple/icu.git] / icuSources / common / rbtok.cpp
CommitLineData
73c04bcf
A
1/*
2***************************************************************************
3* Copyright (C) 2006 Apple Computer, Inc. All rights reserved. *
4***************************************************************************
5
6*/
7
8#include "unicode/utypes.h"
9
10#if !UCONFIG_NO_BREAK_ITERATION
11
12#include "rbtok.h"
13#include "unicode/ustring.h"
14#include "unicode/utext.h"
15#include "rbbidata.h"
16
17U_NAMESPACE_BEGIN
18
19
20#if defined(__GNUC__) && (__GNUC__ >= 4)
21#pragma GCC optimization_level 3
22#endif
23
// Well-known state numbers in the RBBI state table.
static const int16_t START_STATE = 1;   // State in which every token scan begins.
static const int16_t STOP_STATE  = 0;   // Transition target meaning "stop scanning".
26
27int32_t RuleBasedTokenizer::tokenize(int32_t maxTokens, RuleBasedTokenRange *outTokenRanges, unsigned long *outTokenFlags)
28{
29 RuleBasedTokenRange *outTokenLimit = outTokenRanges + maxTokens;
30 RuleBasedTokenRange *outTokenP = outTokenRanges;
31 int32_t state;
32 int16_t category;
33
34 const RBBIStateTableRow *row;
35 const RBBIStateTableRow *const startRow = fStartRow;
36
37 int32_t lastAcceptingState = 0;
38 UChar32 c = 0;
39 signed long prev;
40 signed long result;
41 const char *const tableData = fData->fForwardTable->fTableData;
42 const uint32_t tableRowLen = fData->fForwardTable->fRowLen;
43 UText *text = fText;
44
45 #ifdef RBBI_DEBUG
46 if (fTrace) {
47 RBBIDebugPuts("Handle Next pos char state category");
48 }
49 #endif
50
51 fLastStatusIndexValid = FALSE;
52
53 // if we're already at the end of the text, return DONE.
54 prev = (signed long)UTEXT_GETNATIVEINDEX(text);
55
56 // loop until we reach the end of the text or transition to state 0
57 //
58 const UTrie *trie = &fData->fTrie;
59 while (outTokenP < outTokenLimit) {
60 c = UTEXT_NEXT32(text);
61 if (c == U_SENTINEL)
62 {
63 goto exitTokenizer;
64 }
65 // Set the initial state for the state machine
66 state = START_STATE;
67 row = startRow;
68
69 // if we have cached break positions and we're still in the range
70 // covered by them, just move one step forward in the cache
71 if (fCachedBreakPositions != NULL) {
72 if (fPositionInCache < fNumCachedBreakPositions - 1) {
73 ++fPositionInCache;
74 result = fCachedBreakPositions[fPositionInCache];
75 goto emitToken;
76 }
77 else {
78 reset();
79 }
80 }
81
82 while (c != U_SENTINEL) {
83 //
84 // Get the char category. An incoming category of 1 or 2 means that
85 // we are preset for doing the beginning or end of input, and
86 // that we shouldn't get a category from an actual text input character.
87 //
88 // look up the current character's character category, which tells us
89 // which column in the state table to look at.
90 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
91 // not the size of the character going in, which is a UChar32.
92 //
93 if (__builtin_expect((c < 0x100), 1))
94 category = fLatin1Cat[c];
95 else
96 UTRIE_GET16(trie, c, category);
97
98 // Check the dictionary bit in the character's category.
99 // Counter is only used by dictionary based iterators (subclasses).
100 // Chars that need to be handled by a dictionary have a flag bit set
101 // in their category values.
102 //
103 if (__builtin_expect((category & 0x4000) != 0, 0)) {
104 fDictionaryCharCount++;
105 // And off the dictionary flag bit.
106 category &= ~0x4000;
107 }
108
109 #ifdef RBBI_DEBUG
110 if (fTrace) {
111 RBBIDebugPrintf(" %4d ", utext_getNativeIndex(fText));
112 if (0x20<=c && c<0x7f) {
113 RBBIDebugPrintf("\"%c\" ", c);
114 } else {
115 RBBIDebugPrintf("%5x ", c);
116 }
117 RBBIDebugPrintf("%3d %3d\n", state, category);
118 }
119 #endif
120
121 // State Transition - move machine to its next state
122 //
123 state = row->fNextState[category];
124 row = (const RBBIStateTableRow *) (tableData + tableRowLen * state);
125
126 if (row->fAccepting == -1) {
127 // Match found, common case.
128 result = (signed long)UTEXT_GETNATIVEINDEX(text);
129 //fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values.
130 //lastStatusRow = row;
131 lastAcceptingState = state;
132 }
133
134 if (state == STOP_STATE) {
135 // This is the normal exit from the lookup state machine.
136 // We have advanced through the string until it is certain that no
137 // longer match is possible, no matter what characters follow.
138 break;
139 }
140
141 // Advance to the next character.
142 // If this is a beginning-of-input loop iteration, don't advance
143 // the input position. The next iteration will be processing the
144 // first real input character.
145 c = UTEXT_NEXT32(text);
146 }
147
148 if (fDictionaryCharCount > 0) {
149 result = (signed long) checkDictionary(prev, (int32_t) result, FALSE);
150 }
151
152emitToken:
153 // The state machine is done. Check whether it found a match...
154
155 // Leave the iterator at our result position.
156 UTEXT_SETNATIVEINDEX(text, result);
157
158 RuleBasedTokenRange range = {(signed long)prev, (signed long) (result-prev)};
159 int32_t flags = fStateFlags[lastAcceptingState];
160
161 if (flags == -1)
162 goto skipToken;
163
164 *outTokenP++ = range;
165 if (outTokenFlags)
166 {
167 *outTokenFlags++ = (unsigned long) flags;
168 }
169
170 if (flags & 0x40000000)
171 goto exitTokenizer;
172
173skipToken:
174 prev = result;
175 }
176
177exitTokenizer:
178 return (outTokenP - outTokenRanges);
179}
180
181#if defined (__GNUC__) && (__GNUC__ >= 4)
182#pragma GCC optimization_level reset
183#endif
184
185void
186RuleBasedTokenizer::init()
187{
188 const RBBIStateTable *statetable = fData->fForwardTable;
189 setBreakType(UBRK_WORD);
190 fStartRow = (const RBBIStateTableRow *)
191 (statetable->fTableData + (statetable->fRowLen * START_STATE));
192 UChar i;
193 const UTrie *trie = &fData->fTrie;
194 int16_t category;
195 fLatin1Cat = new int16_t[256];
196 for (i = 0; i < 256; ++i)
197 {
198 //UTRIE_GET16(trie, i, category);
199 //fLatin1Cat[i] = category;
200 fLatin1Cat[i] = _UTRIE_GET_RAW(trie, index, 0, i);
201 }
202 fStateFlags = new int32_t[statetable->fNumStates];
203 for (i = 0; i < statetable->fNumStates; ++i)
204 {
205 const RBBIStateTableRow *row = (const RBBIStateTableRow *)
206 (statetable->fTableData + (statetable->fRowLen * i));
207 int32_t flags = 0;
208 if (row->fAccepting == -1)
209 {
210 const int32_t *vals = (fData->fRuleStatusTable) + (row->fTagIdx);
211 const int32_t *valLimit = vals + 1;
212 valLimit += *vals++;
213 while (vals < valLimit)
214 {
215 int32_t val = *vals++;
216 if (val == 0)
217 {
218 break;
219 }
220 else if (val > 0)
221 {
222 flags |= val;
223 }
224 else
225 {
226 flags = val;
227 break;
228 }
229 }
230 }
231 fStateFlags[i] = flags;
232 }
233}
234
235RuleBasedTokenizer::RuleBasedTokenizer(const UnicodeString &rules, UParseError &parseErr, UErrorCode &err)
236 : RuleBasedBreakIterator(rules, parseErr, err)
237{
238 init();
239}
240
241RuleBasedTokenizer::RuleBasedTokenizer(uint8_t *data, UErrorCode &status)
242 : RuleBasedBreakIterator((RBBIDataHeader *)data, status)
243{
244 init();
245}
246
247RuleBasedTokenizer::~RuleBasedTokenizer() {
248 delete [] fStateFlags;
249 delete [] fLatin1Cat;
250}
251
252U_NAMESPACE_END
253
254#endif /* #if !UCONFIG_NO_BREAK_ITERATION */