]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/rbtok.cpp
ICU-62135.0.1.tar.gz
[apple/icu.git] / icuSources / common / rbtok.cpp
CommitLineData
73c04bcf
A
1/*
2***************************************************************************
0f5d89e8 3* Copyright (C) 2006-2008,2017-2018 Apple Inc. All Rights Reserved. *
73c04bcf 4***************************************************************************
73c04bcf
A
5*/
6
7#include "unicode/utypes.h"
8
9#if !UCONFIG_NO_BREAK_ITERATION
10
73c04bcf
A
11#include "unicode/ustring.h"
12#include "unicode/utext.h"
0f5d89e8
A
13#include "rbbidata57.h"
14#include "rbbi57.h"
15#include "rbtok.h"
f3c0d7a5 16#include "uassert.h"
73c04bcf 17
f3c0d7a5
A
18#ifdef RBBI_DEBUG
19// The following is now static in rbbi.cpp and gets set dynamically.
20// For now duplicate here to build, and force to TRUE if desired.
21static UBool fTrace = FALSE;
22#endif
73c04bcf 23
f3c0d7a5 24U_NAMESPACE_BEGIN
73c04bcf 25
73c04bcf
A
26
// State-table constants used by the tokenizer's state machine.
//   STOP_STATE  - transition value that ends the current match attempt.
//   START_STATE - state number every match attempt begins in.
static const int16_t STOP_STATE = 0;
static const int16_t START_STATE = 1;
29
30int32_t RuleBasedTokenizer::tokenize(int32_t maxTokens, RuleBasedTokenRange *outTokenRanges, unsigned long *outTokenFlags)
31{
32 RuleBasedTokenRange *outTokenLimit = outTokenRanges + maxTokens;
33 RuleBasedTokenRange *outTokenP = outTokenRanges;
34 int32_t state;
f3c0d7a5
A
35 uint16_t category = 0;
36
73c04bcf
A
37 const RBBIStateTableRow *row;
38 const RBBIStateTableRow *const startRow = fStartRow;
39
40 int32_t lastAcceptingState = 0;
41 UChar32 c = 0;
42 signed long prev;
43 signed long result;
44 const char *const tableData = fData->fForwardTable->fTableData;
45 const uint32_t tableRowLen = fData->fForwardTable->fRowLen;
46 UText *text = fText;
47
48 #ifdef RBBI_DEBUG
49 if (fTrace) {
50 RBBIDebugPuts("Handle Next pos char state category");
51 }
52 #endif
53
54 fLastStatusIndexValid = FALSE;
55
56 // if we're already at the end of the text, return DONE.
57 prev = (signed long)UTEXT_GETNATIVEINDEX(text);
f3c0d7a5 58
73c04bcf
A
59 // loop until we reach the end of the text or transition to state 0
60 //
f3c0d7a5 61 const UTrie *trie = &fData->fTrie;
73c04bcf 62 while (outTokenP < outTokenLimit) {
0f5d89e8 63 result = prev; // fallback initialization
f3c0d7a5 64 c = UTEXT_NEXT32(text);
73c04bcf
A
65 if (c == U_SENTINEL)
66 {
67 goto exitTokenizer;
68 }
69 // Set the initial state for the state machine
70 state = START_STATE;
71 row = startRow;
f3c0d7a5 72
73c04bcf
A
73 // if we have cached break positions and we're still in the range
74 // covered by them, just move one step forward in the cache
75 if (fCachedBreakPositions != NULL) {
76 if (fPositionInCache < fNumCachedBreakPositions - 1) {
77 ++fPositionInCache;
78 result = fCachedBreakPositions[fPositionInCache];
79 goto emitToken;
80 }
81 else {
82 reset();
83 }
84 }
85
86 while (c != U_SENTINEL) {
87 //
88 // Get the char category. An incoming category of 1 or 2 means that
89 // we are preset for doing the beginning or end of input, and
90 // that we shouldn't get a category from an actual text input character.
91 //
92 // look up the current character's character category, which tells us
93 // which column in the state table to look at.
94 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
95 // not the size of the character going in, which is a UChar32.
96 //
2ca993e8 97 if (c < 0x100)
73c04bcf
A
98 category = fLatin1Cat[c];
99 else
100 UTRIE_GET16(trie, c, category);
f3c0d7a5 101
73c04bcf
A
102 // Check the dictionary bit in the character's category.
103 // Counter is only used by dictionary based iterators (subclasses).
104 // Chars that need to be handled by a dictionary have a flag bit set
105 // in their category values.
106 //
2ca993e8 107 if ((category & 0x4000) != 0) {
73c04bcf
A
108 fDictionaryCharCount++;
109 // And off the dictionary flag bit.
110 category &= ~0x4000;
111 }
f3c0d7a5 112
73c04bcf
A
113 #ifdef RBBI_DEBUG
114 if (fTrace) {
f3c0d7a5 115 RBBIDebugPrintf(" %4lld ", utext_getNativeIndex(fText));
73c04bcf
A
116 if (0x20<=c && c<0x7f) {
117 RBBIDebugPrintf("\"%c\" ", c);
118 } else {
119 RBBIDebugPrintf("%5x ", c);
120 }
121 RBBIDebugPrintf("%3d %3d\n", state, category);
122 }
123 #endif
f3c0d7a5 124
73c04bcf
A
125 // State Transition - move machine to its next state
126 //
f3c0d7a5
A
127
128 // Note: fNextState is defined as uint16_t[2], but we are casting
129 // a generated RBBI table to RBBIStateTableRow and some tables
130 // actually have more than 2 categories.
131 U_ASSERT(category<fData->fHeader->fCatCount);
73c04bcf
A
132 state = row->fNextState[category];
133 row = (const RBBIStateTableRow *) (tableData + tableRowLen * state);
f3c0d7a5 134
73c04bcf
A
135 if (row->fAccepting == -1) {
136 // Match found, common case.
137 result = (signed long)UTEXT_GETNATIVEINDEX(text);
138 //fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values.
139 //lastStatusRow = row;
140 lastAcceptingState = state;
141 }
0f5d89e8 142
73c04bcf
A
143 if (state == STOP_STATE) {
144 // This is the normal exit from the lookup state machine.
145 // We have advanced through the string until it is certain that no
146 // longer match is possible, no matter what characters follow.
147 break;
148 }
f3c0d7a5
A
149
150 // Advance to the next character.
73c04bcf
A
151 // If this is a beginning-of-input loop iteration, don't advance
152 // the input position. The next iteration will be processing the
153 // first real input character.
154 c = UTEXT_NEXT32(text);
155 }
156
157 if (fDictionaryCharCount > 0) {
158 result = (signed long) checkDictionary(prev, (int32_t) result, FALSE);
159 }
160
161emitToken:
162 // The state machine is done. Check whether it found a match...
f3c0d7a5
A
163
164 // If the iterator failed to advance in the match engine, force it ahead by one.
165 // (This really indicates a defect in the break rules. They should always match
166 // at least one character.). Added in open-source ICU r13469
167 UBool setFlagsZero = FALSE;
168 if (result == prev) {
169 UTEXT_SETNATIVEINDEX(text, prev);
170 UTEXT_NEXT32(text);
171 result = (int32_t)UTEXT_GETNATIVEINDEX(text);
172 setFlagsZero = TRUE;
173 }
174
73c04bcf
A
175 // Leave the iterator at our result position.
176 UTEXT_SETNATIVEINDEX(text, result);
177
178 RuleBasedTokenRange range = {(signed long)prev, (signed long) (result-prev)};
f3c0d7a5 179 int32_t flags = (!setFlagsZero)? fStateFlags[lastAcceptingState]: 0;
73c04bcf 180
f3c0d7a5 181 if (flags == -1) {
73c04bcf 182 goto skipToken;
f3c0d7a5 183 }
73c04bcf 184
f3c0d7a5
A
185 #ifdef RBBI_DEBUG
186 if (fTrace) {
187 RBBIDebugPrintf("Emit location %3ld length %2ld flags %08X\n", range.location, range.length, flags);
188 }
189 #endif
73c04bcf
A
190 *outTokenP++ = range;
191 if (outTokenFlags)
192 {
193 *outTokenFlags++ = (unsigned long) flags;
194 }
195
f3c0d7a5 196 if (flags & 0x40000000) {
73c04bcf 197 goto exitTokenizer;
f3c0d7a5 198 }
73c04bcf
A
199
200skipToken:
201 prev = result;
202 }
f3c0d7a5 203
73c04bcf
A
204exitTokenizer:
205 return (outTokenP - outTokenRanges);
206}
207
73c04bcf
A
208void
209RuleBasedTokenizer::init()
210{
211 const RBBIStateTable *statetable = fData->fForwardTable;
212 setBreakType(UBRK_WORD);
213 fStartRow = (const RBBIStateTableRow *)
214 (statetable->fTableData + (statetable->fRowLen * START_STATE));
215 UChar i;
216 const UTrie *trie = &fData->fTrie;
f3c0d7a5 217 //int16_t category;
73c04bcf
A
218 fLatin1Cat = new int16_t[256];
219 for (i = 0; i < 256; ++i)
220 {
221 //UTRIE_GET16(trie, i, category);
222 //fLatin1Cat[i] = category;
223 fLatin1Cat[i] = _UTRIE_GET_RAW(trie, index, 0, i);
224 }
225 fStateFlags = new int32_t[statetable->fNumStates];
226 for (i = 0; i < statetable->fNumStates; ++i)
227 {
228 const RBBIStateTableRow *row = (const RBBIStateTableRow *)
229 (statetable->fTableData + (statetable->fRowLen * i));
230 int32_t flags = 0;
f3c0d7a5 231 if (row->fAccepting == -1 && row->fTagIdx != 0)
73c04bcf
A
232 {
233 const int32_t *vals = (fData->fRuleStatusTable) + (row->fTagIdx);
234 const int32_t *valLimit = vals + 1;
235 valLimit += *vals++;
236 while (vals < valLimit)
237 {
238 int32_t val = *vals++;
239 if (val == 0)
240 {
241 break;
242 }
243 else if (val > 0)
244 {
245 flags |= val;
246 }
247 else
248 {
249 flags = val;
250 break;
251 }
252 }
253 }
254 fStateFlags[i] = flags;
255 }
256}
257
258RuleBasedTokenizer::RuleBasedTokenizer(const UnicodeString &rules, UParseError &parseErr, UErrorCode &err)
0f5d89e8 259 : RuleBasedBreakIterator57(rules, parseErr, err)
73c04bcf 260{
2ca993e8
A
261 if (U_SUCCESS(err)) {
262 init();
263 }
73c04bcf
A
264}
265
266RuleBasedTokenizer::RuleBasedTokenizer(uint8_t *data, UErrorCode &status)
0f5d89e8 267 : RuleBasedBreakIterator57((RBBIDataHeader57 *)data, status)
73c04bcf 268{
2ca993e8
A
269 if (U_SUCCESS(status)) {
270 init();
271 }
73c04bcf
A
272}
273
46f4442e 274RuleBasedTokenizer::RuleBasedTokenizer(const uint8_t *data, enum EDontAdopt, UErrorCode &status)
0f5d89e8 275 : RuleBasedBreakIterator57((const RBBIDataHeader57 *)data, RuleBasedBreakIterator57::kDontAdopt, status)
46f4442e 276{
2ca993e8
A
277 if (U_SUCCESS(status)) {
278 init();
279 }
46f4442e
A
280}
281
73c04bcf
A
282RuleBasedTokenizer::~RuleBasedTokenizer() {
283 delete [] fStateFlags;
284 delete [] fLatin1Cat;
285}
286
287U_NAMESPACE_END
288
289#endif /* #if !UCONFIG_NO_BREAK_ITERATION */