2 ***************************************************************************
3 * Copyright (C) 2006 Apple Computer, Inc. All rights reserved. *
4 ***************************************************************************
8 #include "unicode/utypes.h"
10 #if !UCONFIG_NO_BREAK_ITERATION
13 #include "unicode/ustring.h"
14 #include "unicode/utext.h"
// When building with GCC 4 or later, request optimization level 3 for the
// code that follows.
20 #if defined(__GNUC__) && (__GNUC__ >= 4)
21 #pragma GCC optimization_level 3
// State-machine constants: init() uses START_STATE to locate the starting row,
// and tokenize() compares the current state against STOP_STATE.
24 static const int16_t START_STATE
= 1; // The state number of the starting state
25 static const int16_t STOP_STATE
= 0; // The state-transition value indicating "stop"
// Runs the forward state table over the text, starting at the text's current
// native index, and records one token per match.
//
// maxTokens       capacity of the output arrays; the outer loop stops once
//                 this many tokens have been written.
// outTokenRanges  receives one RuleBasedTokenRange per token, built from
//                 {start offset, end offset - start offset}.
// outTokenFlags   receives, per token, the fStateFlags entry of the last
//                 accepting state reached for that token.
//
// Returns the number of tokens written (outTokenP - outTokenRanges).
//
// NOTE(review): this listing omits some of the original lines (local
// declarations, braces, branch bodies). The comments below describe only the
// statements that are visible here.
27 int32_t RuleBasedTokenizer::tokenize(int32_t maxTokens
, RuleBasedTokenRange
*outTokenRanges
, unsigned long *outTokenFlags
)
// One past the last usable output slot; bounds the outer loop.
29 RuleBasedTokenRange
*outTokenLimit
= outTokenRanges
+ maxTokens
;
30 RuleBasedTokenRange
*outTokenP
= outTokenRanges
;
34 const RBBIStateTableRow
*row
;
35 const RBBIStateTableRow
*const startRow
= fStartRow
;
37 int32_t lastAcceptingState
= 0;
41 const char *const tableData
= fData
->fForwardTable
->fTableData
;
42 const uint32_t tableRowLen
= fData
->fForwardTable
->fRowLen
;
47 RBBIDebugPuts("Handle Next pos char state category");
51 fLastStatusIndexValid
= FALSE
;
53 // if we're already at the end of the text, return DONE.
// prev is the start offset of the token being scanned; it is later used as
// the first member of the emitted range.
54 prev
= (signed long)UTEXT_GETNATIVEINDEX(text
);
56 // loop until we reach the end of the text or transition to state 0
58 const UTrie
*trie
= &fData
->fTrie
;
59 while (outTokenP
< outTokenLimit
) {
60 c
= UTEXT_NEXT32(text
);
65 // Set the initial state for the state machine
69 // if we have cached break positions and we're still in the range
70 // covered by them, just move one step forward in the cache
71 if (fCachedBreakPositions
!= NULL
) {
72 if (fPositionInCache
< fNumCachedBreakPositions
- 1) {
74 result
= fCachedBreakPositions
[fPositionInCache
];
82 while (c
!= U_SENTINEL
) {
84 // Get the char category. An incoming category of 1 or 2 means that
85 // we are preset for doing the beginning or end of input, and
86 // that we shouldn't get a category from an actual text input character.
88 // look up the current character's character category, which tells us
89 // which column in the state table to look at.
90 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
91 // not the size of the character going in, which is a UChar32.
// Code points below 0x100 (hinted as the expected case) read their category
// from the fLatin1Cat table filled in by init(); the UTRIE_GET16 lookup is
// also visible below.
93 if (__builtin_expect((c
< 0x100), 1))
94 category
= fLatin1Cat
[c
];
96 UTRIE_GET16(trie
, c
, category
);
98 // Check the dictionary bit in the character's category.
99 // Counter is only used by dictionary based iterators (subclasses).
100 // Chars that need to be handled by a dictionary have a flag bit set
101 // in their category values.
103 if (__builtin_expect((category
& 0x4000) != 0, 0)) {
104 fDictionaryCharCount
++;
105 // And off the dictionary flag bit.
111 RBBIDebugPrintf(" %4d ", utext_getNativeIndex(fText
));
112 if (0x20<=c
&& c
<0x7f) {
113 RBBIDebugPrintf("\"%c\" ", c
);
115 RBBIDebugPrintf("%5x ", c
);
117 RBBIDebugPrintf("%3d %3d\n", state
, category
);
121 // State Transition - move machine to its next state
123 state
= row
->fNextState
[category
];
124 row
= (const RBBIStateTableRow
*) (tableData
+ tableRowLen
* state
);
126 if (row
->fAccepting
== -1) {
127 // Match found, common case.
128 result
= (signed long)UTEXT_GETNATIVEINDEX(text
);
129 //fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values.
130 //lastStatusRow = row;
131 lastAcceptingState
= state
;
134 if (state
== STOP_STATE
) {
135 // This is the normal exit from the lookup state machine.
136 // We have advanced through the string until it is certain that no
137 // longer match is possible, no matter what characters follow.
141 // Advance to the next character.
142 // If this is a beginning-of-input loop iteration, don't advance
143 // the input position. The next iteration will be processing the
144 // first real input character.
145 c
= UTEXT_NEXT32(text
);
// If any dictionary characters were counted, result is replaced with the
// value returned by checkDictionary() for the span [prev, result].
148 if (fDictionaryCharCount
> 0) {
149 result
= (signed long) checkDictionary(prev
, (int32_t) result
, FALSE
);
153 // The state machine is done. Check whether it found a match...
155 // Leave the iterator at our result position.
156 UTEXT_SETNATIVEINDEX(text
, result
);
// Build the token as {start offset, length}, then look up the flags recorded
// for the last accepting state.
158 RuleBasedTokenRange range
= {(signed long)prev
, (signed long) (result
-prev
)};
159 int32_t flags
= fStateFlags
[lastAcceptingState
];
164 *outTokenP
++ = range
;
167 *outTokenFlags
++ = (unsigned long) flags
;
// Bit 0x40000000 of the flags is tested here; the statement it guards is not
// visible in this listing.
170 if (flags
& 0x40000000)
178 return (outTokenP
- outTokenRanges
);
// Undo the optimization_level 3 request made before tokenize() (GCC 4 or later only).
181 #if defined (__GNUC__) && (__GNUC__ >= 4)
182 #pragma GCC optimization_level reset
// Builds the lookup data that tokenize() reads: the starting state row
// (fStartRow), a 256-entry category table for code points below 0x100
// (fLatin1Cat), and one flags word per state (fStateFlags). Both arrays are
// allocated with new[] here and released in the destructor.
//
// NOTE(review): this listing omits some of the original lines (the return
// type, braces, the declarations of i / index / flags, and the code that
// derives flags from the rule status values). The comments below describe
// only the statements that are visible here.
186 RuleBasedTokenizer::init()
188 const RBBIStateTable
*statetable
= fData
->fForwardTable
;
189 setBreakType(UBRK_WORD
);
// Cache the address of the START_STATE row; tokenize() copies it into startRow.
190 fStartRow
= (const RBBIStateTableRow
*)
191 (statetable
->fTableData
+ (statetable
->fRowLen
* START_STATE
));
193 const UTrie
*trie
= &fData
->fTrie
;
// Category table indexed by code point 0..255; tokenize() reads it when c < 0x100.
195 fLatin1Cat
= new int16_t[256];
196 for (i
= 0; i
< 256; ++i
)
198 //UTRIE_GET16(trie, i, category);
199 //fLatin1Cat[i] = category;
200 fLatin1Cat
[i
] = _UTRIE_GET_RAW(trie
, index
, 0, i
);
// One flags entry per state, indexed by state number (fNumStates entries).
202 fStateFlags
= new int32_t[statetable
->fNumStates
];
203 for (i
= 0; i
< statetable
->fNumStates
; ++i
)
205 const RBBIStateTableRow
*row
= (const RBBIStateTableRow
*)
206 (statetable
->fTableData
+ (statetable
->fRowLen
* i
));
// For rows with fAccepting == -1, the rule status values at the row's
// fTagIdx are read.
208 if (row
->fAccepting
== -1)
210 const int32_t *vals
= (fData
->fRuleStatusTable
) + (row
->fTagIdx
);
211 const int32_t *valLimit
= vals
+ 1;
213 while (vals
< valLimit
)
215 int32_t val
= *vals
++;
// Store the flags computed for this state (the computation is not visible in
// this listing).
231 fStateFlags
[i
] = flags
;
// Constructs a tokenizer from rule source text. rules, parseErr and err are
// passed unchanged to the RuleBasedBreakIterator base constructor.
// NOTE(review): the constructor body is not visible in this listing.
235 RuleBasedTokenizer::RuleBasedTokenizer(const UnicodeString
&rules
, UParseError
&parseErr
, UErrorCode
&err
)
236 : RuleBasedBreakIterator(rules
, parseErr
, err
)
// Constructs a tokenizer from a data buffer. data is cast to RBBIDataHeader*
// and passed, together with status, to the RuleBasedBreakIterator base
// constructor.
// NOTE(review): the constructor body is not visible in this listing.
241 RuleBasedTokenizer::RuleBasedTokenizer(uint8_t *data
, UErrorCode
&status
)
242 : RuleBasedBreakIterator((RBBIDataHeader
*)data
, status
)
// Releases the two arrays that init() allocates with new[].
247 RuleBasedTokenizer::~RuleBasedTokenizer() {
248 delete [] fStateFlags
;
249 delete [] fLatin1Cat
;
254 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */