2 ***************************************************************************
3 * Copyright (C) 2006-2008 Apple Inc. All Rights Reserved. *
4 ***************************************************************************
7 #include "unicode/utypes.h"
9 #if !UCONFIG_NO_BREAK_ITERATION
12 #include "unicode/ustring.h"
13 #include "unicode/utext.h"
// When compiling with GCC 4 or newer, request optimization level 3 for the
// code that follows; a matching "optimization_level reset" pragma appears
// after tokenize().
19 #if defined(__GNUC__) && (__GNUC__ >= 4)
20 #pragma GCC optimization_level 3
// State-machine constants: init() uses START_STATE to locate the start row
// of the forward state table, and tokenize() compares the current state
// against STOP_STATE.
23 static const int16_t START_STATE
= 1; // The state number of the starting state
24 static const int16_t STOP_STATE
= 0; // The state-transition value indicating "stop"
// Runs the forward state table over the text and writes token ranges to
// outTokenRanges, with one flags word per token written to outTokenFlags.
//
// Parameters:
//   maxTokens      - capacity of outTokenRanges; the main loop stops once
//                    this many ranges have been written.
//   outTokenRanges - receives one range per token, initialized from the
//                    token's start index and its length (end - start).
//   outTokenFlags  - receives, for each token, the fStateFlags entry of the
//                    last accepting state reached while matching it.
// Returns the number of ranges written (outTokenP - outTokenRanges).
//
// NOTE(review): several lines of this function (for example the declarations
// of state, category, c, prev, result and text, and the bodies of some
// branches) are not present in this listing; the comments below describe
// only the statements that are visible.
26 int32_t RuleBasedTokenizer::tokenize(int32_t maxTokens
, RuleBasedTokenRange
*outTokenRanges
, unsigned long *outTokenFlags
)
// outTokenLimit is one past the last writable slot; outTokenP is the write cursor.
28 RuleBasedTokenRange
*outTokenLimit
= outTokenRanges
+ maxTokens
;
29 RuleBasedTokenRange
*outTokenP
= outTokenRanges
;
33 const RBBIStateTableRow
*row
;
34 const RBBIStateTableRow
*const startRow
= fStartRow
;
36 int32_t lastAcceptingState
= 0;
// Local copies of the forward table's base address and row length, used
// below to compute the address of a state's row.
40 const char *const tableData
= fData
->fForwardTable
->fTableData
;
41 const uint32_t tableRowLen
= fData
->fForwardTable
->fRowLen
;
46 RBBIDebugPuts("Handle Next pos char state category");
50 fLastStatusIndexValid
= FALSE
;
52 // Record the current native index of the text; it is used below as the start of the token range.
53 prev
= (signed long)UTEXT_GETNATIVEINDEX(text
);
55 // loop until we reach the end of the text or transition to state 0
57 const UTrie
*trie
= &fData
->fTrie
;
// Keep producing tokens while there is room left in the output array.
58 while (outTokenP
< outTokenLimit
) {
59 c
= UTEXT_NEXT32(text
);
64 // Set the initial state for the state machine
68 // if we have cached break positions and we're still in the range
69 // covered by them, just move one step forward in the cache
70 if (fCachedBreakPositions
!= NULL
) {
71 if (fPositionInCache
< fNumCachedBreakPositions
- 1) {
73 result
= fCachedBreakPositions
[fPositionInCache
];
// Feed characters to the state machine until the text yields U_SENTINEL.
81 while (c
!= U_SENTINEL
) {
83 // Get the char category. An incoming category of 1 or 2 means that
84 // we are preset for doing the beginning or end of input, and
85 // that we shouldn't get a category from an actual text input character.
87 // look up the current character's character category, which tells us
88 // which column in the state table to look at.
89 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
90 // not the size of the character going in, which is a UChar32.
// NOTE(review): the condition that selects between the fLatin1Cat table
// lookup and the UTRIE_GET16 lookup is not visible in this listing;
// presumably the table is used only for c < 256, since init() fills
// 256 entries -- confirm.
93 category
= fLatin1Cat
[c
];
95 UTRIE_GET16(trie
, c
, category
);
97 // Check the dictionary bit in the character's category.
98 // Counter is only used by dictionary based iterators (subclasses).
99 // Chars that need to be handled by a dictionary have a flag bit set
100 // in their category values.
102 if ((category
& 0x4000) != 0) {
103 fDictionaryCharCount
++;
104 // And off the dictionary flag bit.
// Debug trace: position, the character (printed literally when it is in
// the range 0x20..0x7e, otherwise as hex per the visible format strings),
// then state and category.
110 RBBIDebugPrintf(" %4d ", utext_getNativeIndex(fText
));
111 if (0x20<=c
&& c
<0x7f) {
112 RBBIDebugPrintf("\"%c\" ", c
);
114 RBBIDebugPrintf("%5x ", c
);
116 RBBIDebugPrintf("%3d %3d\n", state
, category
);
120 // State Transition - move machine to its next state
122 state
= row
->fNextState
[category
];
// The row for a state lives at tableData + tableRowLen * state.
123 row
= (const RBBIStateTableRow
*) (tableData
+ tableRowLen
* state
);
// A row whose fAccepting is -1 is an accepting row: remember the current
// text position as the candidate token end, and the state that accepted.
125 if (row
->fAccepting
== -1) {
126 // Match found, common case.
127 result
= (signed long)UTEXT_GETNATIVEINDEX(text
);
128 //fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values.
129 //lastStatusRow = row;
130 lastAcceptingState
= state
;
133 if (state
== STOP_STATE
) {
134 // This is the normal exit from the lookup state machine.
135 // We have advanced through the string until it is certain that no
136 // longer match is possible, no matter what characters follow.
140 // Advance to the next character.
141 // If this is a beginning-of-input loop iteration, don't advance
142 // the input position. The next iteration will be processing the
143 // first real input character.
144 c
= UTEXT_NEXT32(text
);
// If any dictionary characters were counted, result is replaced by the
// value checkDictionary() returns for the span [prev, result].
147 if (fDictionaryCharCount
> 0) {
148 result
= (signed long) checkDictionary(prev
, (int32_t) result
, FALSE
);
152 // The state machine is done. Check whether it found a match...
154 // Leave the iterator at our result position.
155 UTEXT_SETNATIVEINDEX(text
, result
);
// Build the token's range from the start index and the distance to the end
// position, and look up the flags recorded for the last accepting state.
157 RuleBasedTokenRange range
= {(signed long)prev
, (signed long) (result
-prev
)};
158 int32_t flags
= fStateFlags
[lastAcceptingState
];
// Store the range and its flags, advancing both output cursors.
163 *outTokenP
++ = range
;
166 *outTokenFlags
++ = (unsigned long) flags
;
// NOTE(review): the action taken when bit 0x40000000 of flags is set is not
// visible in this listing.
169 if (flags
& 0x40000000)
// Number of ranges written to outTokenRanges.
177 return (outTokenP
- outTokenRanges
);
// Undo the "optimization_level 3" request made before tokenize(); applies
// only when compiling with GCC 4 or newer.
180 #if defined (__GNUC__) && (__GNUC__ >= 4)
181 #pragma GCC optimization_level reset
// Precomputes the lookup data that tokenize() reads: the start row of the
// forward state table (fStartRow), a 256-entry category table (fLatin1Cat)
// and one flags word per state (fStateFlags). Also sets the break type to
// UBRK_WORD.
//
// NOTE(review): parts of this function (including its return type, the
// declarations of i, index and flags, and the code that derives flags from
// the rule status values) are not present in this listing.
185 RuleBasedTokenizer::init()
187 const RBBIStateTable
*statetable
= fData
->fForwardTable
;
188 setBreakType(UBRK_WORD
);
// Address of the START_STATE row: table base + row length * state number.
189 fStartRow
= (const RBBIStateTableRow
*)
190 (statetable
->fTableData
+ (statetable
->fRowLen
* START_STATE
));
192 const UTrie
*trie
= &fData
->fTrie
;
// Fill a 256-entry table with the raw trie lookup result for each i in
// [0, 256). The array is released with delete[] in the destructor.
194 fLatin1Cat
= new int16_t[256];
195 for (i
= 0; i
< 256; ++i
)
197 //UTRIE_GET16(trie, i, category);
198 //fLatin1Cat[i] = category;
199 fLatin1Cat
[i
] = _UTRIE_GET_RAW(trie
, index
, 0, i
);
// One flags entry per state in the forward table; also released with
// delete[] in the destructor.
201 fStateFlags
= new int32_t[statetable
->fNumStates
];
202 for (i
= 0; i
< statetable
->fNumStates
; ++i
)
204 const RBBIStateTableRow
*row
= (const RBBIStateTableRow
*)
205 (statetable
->fTableData
+ (statetable
->fRowLen
* i
));
// Only rows whose fAccepting is -1 have their rule status values read.
207 if (row
->fAccepting
== -1)
// vals points into fRuleStatusTable at this row's fTagIdx.
// NOTE(review): any adjustment of valLimit between its initialization and
// the loop below is not visible in this listing.
209 const int32_t *vals
= (fData
->fRuleStatusTable
) + (row
->fTagIdx
);
210 const int32_t *valLimit
= vals
+ 1;
212 while (vals
< valLimit
)
214 int32_t val
= *vals
++;
// Record the flags computed for state i (the computation itself is not
// visible in this listing).
230 fStateFlags
[i
] = flags
;
// Constructs a tokenizer from rule source text by forwarding rules, parseErr
// and err to the RuleBasedBreakIterator base-class constructor.
// NOTE(review): the statements executed when U_SUCCESS(err) holds are not
// visible in this listing.
234 RuleBasedTokenizer::RuleBasedTokenizer(const UnicodeString
&rules
, UParseError
&parseErr
, UErrorCode
&err
)
235 : RuleBasedBreakIterator(rules
, parseErr
, err
)
237 if (U_SUCCESS(err
)) {
// Constructs a tokenizer from a data buffer: data is cast to
// RBBIDataHeader* and passed, with status, to the RuleBasedBreakIterator
// base-class constructor.
// NOTE(review): the statements executed when U_SUCCESS(status) holds are not
// visible in this listing; whether the base class takes ownership of data
// should be confirmed against the base-class documentation.
242 RuleBasedTokenizer::RuleBasedTokenizer(uint8_t *data
, UErrorCode
&status
)
243 : RuleBasedBreakIterator((RBBIDataHeader
*)data
, status
)
245 if (U_SUCCESS(status
)) {
// Constructs a tokenizer from a const data buffer: data is cast to
// const RBBIDataHeader* and passed to the RuleBasedBreakIterator base-class
// constructor together with RuleBasedBreakIterator::kDontAdopt and status.
// The unnamed EDontAdopt parameter only selects this overload.
// NOTE(review): the statements executed when U_SUCCESS(status) holds are not
// visible in this listing.
250 RuleBasedTokenizer::RuleBasedTokenizer(const uint8_t *data
, enum EDontAdopt
, UErrorCode
&status
)
251 : RuleBasedBreakIterator((const RBBIDataHeader
*)data
, RuleBasedBreakIterator::kDontAdopt
, status
)
253 if (U_SUCCESS(status
)) {
// Releases the two arrays that init() allocates with new[]:
// fStateFlags and fLatin1Cat.
258 RuleBasedTokenizer::~RuleBasedTokenizer() {
259 delete [] fStateFlags
;
260 delete [] fLatin1Cat
;
265 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */