2 ***************************************************************************
3 * Copyright (C) 2006-2008,2017 Apple Inc. All Rights Reserved. *
4 ***************************************************************************
7 #include "unicode/utypes.h"
9 #if !UCONFIG_NO_BREAK_ITERATION
12 #include "unicode/ustring.h"
13 #include "unicode/utext.h"
19 // The following is now static in rbbi.cpp, gets set dynamically.
20 // For now duplicate here to build, and force to TRUE if desired.
// Debug-trace switch local to this file; it starts out FALSE.
// NOTE(review): fTrace is not referenced by name anywhere else in the visible
// lines - presumably it is read by the RBBIDebug* macros; confirm.
21 static UBool fTrace
= FALSE
;
// State numbers used when walking the forward state table in this file.
27 static const int16_t START_STATE
= 1; // The state number of the starting state
28 static const int16_t STOP_STATE
= 0; // The state-transition value indicating "stop"
// Walks the forward state table over the text, beginning at the text's
// current native index, and writes RuleBasedTokenRange entries to the
// caller's output array.
//
// maxTokens      - capacity of outTokenRanges; the outer loop runs while the
//                  output pointer is below outTokenRanges + maxTokens.
// outTokenRanges - receives {location, length} entries, where location is the
//                  start index (prev) and length is result - prev.
// outTokenFlags  - receives the flags value computed for a range:
//                  fStateFlags[lastAcceptingState], or 0 when setFlagsZero
//                  is set.
// Returns outTokenP - outTokenRanges, i.e. how far the output pointer advanced.
//
// NOTE(review): result, prev, c, state and text are used below without a
// declaration in the visible lines, and the enclosing braces and several
// branch bodies are not visible either; confirm against the complete file
// before changing any logic here.
30 int32_t RuleBasedTokenizer::tokenize(int32_t maxTokens
, RuleBasedTokenRange
*outTokenRanges
, unsigned long *outTokenFlags
)
// One past the last writable slot of the output array.
32 RuleBasedTokenRange
*outTokenLimit
= outTokenRanges
+ maxTokens
;
// Write cursor into the output array.
33 RuleBasedTokenRange
*outTokenP
= outTokenRanges
;
35 uint16_t category
= 0;
37 const RBBIStateTableRow
*row
;
// Row for START_STATE, precomputed in init().
38 const RBBIStateTableRow
*const startRow
= fStartRow
;
40 int32_t lastAcceptingState
= 0;
// Raw bytes of the forward state table and the byte length of one row;
// a row address is computed below as tableData + tableRowLen * state.
44 const char *const tableData
= fData
->fForwardTable
->fTableData
;
45 const uint32_t tableRowLen
= fData
->fForwardTable
->fRowLen
;
50 RBBIDebugPuts("Handle Next pos char state category");
54 fLastStatusIndexValid
= FALSE
;
56 // if we're already at the end of the text, return DONE.
// prev holds the native index at which the next range starts.
57 prev
= (signed long)UTEXT_GETNATIVEINDEX(text
);
59 // loop until we reach the end of the text or transition to state 0
61 const UTrie
*trie
= &fData
->fTrie
;
62 while (outTokenP
< outTokenLimit
) {
63 // LookAheadResults lookAheadMatches; // added in RBBI, #12081/r38387
64 result
= prev
; // fallback initialization, prevent uninitialized use
65 c
= UTEXT_NEXT32(text
);
70 // Set the initial state for the state machine
74 // if we have cached break positions and we're still in the range
75 // covered by them, just move one step forward in the cache
76 if (fCachedBreakPositions
!= NULL
) {
77 if (fPositionInCache
< fNumCachedBreakPositions
- 1) {
79 result
= fCachedBreakPositions
[fPositionInCache
];
// Inner loop: consume code points until the text is exhausted (U_SENTINEL).
87 while (c
!= U_SENTINEL
) {
89 // Get the char category. An incoming category of 1 or 2 means that
90 // we are preset for doing the beginning or end of input, and
91 // that we shouldn't get a category from an actual text input character.
93 // look up the current character's character category, which tells us
94 // which column in the state table to look at.
95 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
96 // not the size of the character going in, which is a UChar32.
// NOTE(review): both the fLatin1Cat[] lookup and the UTRIE_GET16 lookup
// appear here; the condition that selects between them is not visible.
99 category
= fLatin1Cat
[c
];
101 UTRIE_GET16(trie
, c
, category
);
103 // Check the dictionary bit in the character's category.
104 // Counter is only used by dictionary based iterators (subclasses).
105 // Chars that need to be handled by a dictionary have a flag bit set
106 // in their category values.
108 if ((category
& 0x4000) != 0) {
109 fDictionaryCharCount
++;
110 // And off the dictionary flag bit.
116 RBBIDebugPrintf(" %4lld ", utext_getNativeIndex(fText
));
117 if (0x20<=c
&& c
<0x7f) {
118 RBBIDebugPrintf("\"%c\" ", c
);
120 RBBIDebugPrintf("%5x ", c
);
122 RBBIDebugPrintf("%3d %3d\n", state
, category
);
126 // State Transition - move machine to its next state
129 // Note: fNextState is defined as uint16_t[2], but we are casting
130 // a generated RBBI table to RBBIStateTableRow and some tables
131 // actually have more than 2 categories.
132 U_ASSERT(category
<fData
->fHeader
->fCatCount
);
133 state
= row
->fNextState
[category
];
134 row
= (const RBBIStateTableRow
*) (tableData
+ tableRowLen
* state
);
136 if (row
->fAccepting
== -1) {
137 // Match found, common case.
138 result
= (signed long)UTEXT_GETNATIVEINDEX(text
);
139 //fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values.
140 //lastStatusRow = row;
141 lastAcceptingState
= state
;
144 // rbbi has added code here to check lookAheadMatches and
145 // set lookAheadMatches, per open-source ICU #12081/r38387
147 if (state
== STOP_STATE
) {
148 // This is the normal exit from the lookup state machine.
149 // We have advanced through the string until it is certain that no
150 // longer match is possible, no matter what characters follow.
154 // Advance to the next character.
155 // If this is a beginning-of-input loop iteration, don't advance
156 // the input position. The next iteration will be processing the
157 // first real input character.
158 c
= UTEXT_NEXT32(text
);
// If any dictionary-flagged characters were counted, the candidate end
// position is replaced by the value checkDictionary() returns.
161 if (fDictionaryCharCount
> 0) {
162 result
= (signed long) checkDictionary(prev
, (int32_t) result
, FALSE
);
166 // The state machine is done. Check whether it found a match...
168 // If the iterator failed to advance in the match engine, force it ahead by one.
169 // (This really indicates a defect in the break rules. They should always match
170 // at least one character.). Added in open-source ICU r13469
// NOTE(review): setFlagsZero is only initialized in the visible lines; where
// it becomes TRUE is not shown.
171 UBool setFlagsZero
= FALSE
;
172 if (result
== prev
) {
173 UTEXT_SETNATIVEINDEX(text
, prev
);
175 result
= (int32_t)UTEXT_GETNATIVEINDEX(text
);
179 // Leave the iterator at our result position.
180 UTEXT_SETNATIVEINDEX(text
, result
);
// The range covers [prev, result): location = prev, length = result - prev.
182 RuleBasedTokenRange range
= {(signed long)prev
, (signed long) (result
-prev
)};
// Flags come from the per-state table filled in by init(), unless suppressed.
183 int32_t flags
= (!setFlagsZero
)? fStateFlags
[lastAcceptingState
]: 0;
191 RBBIDebugPrintf("Emit location %3ld length %2ld flags %08X\n", range
.location
, range
.length
, flags
);
194 *outTokenP
++ = range
;
197 *outTokenFlags
++ = (unsigned long) flags
;
// NOTE(review): what happens when bit 0x40000000 is set in flags is not
// visible in this view.
200 if (flags
& 0x40000000) {
// Number of entries the output cursor advanced past.
209 return (outTokenP
- outTokenRanges
);
// Builds the lookup tables used by tokenize():
//   - sets the break type to UBRK_WORD,
//   - stores in fStartRow the forward-table row for START_STATE,
//   - allocates fLatin1Cat (256 int16_t entries) and fills it from the trie,
//   - allocates fStateFlags (one int32_t per state) and stores a flags value
//     for each state index.
//
// NOTE(review): the return type, the declarations of i, index and flags, the
// loop/if braces, and the statements that derive flags from val are not
// visible in this view; confirm against the complete file.
// NOTE(review): no allocation-failure check is visible after either new[].
213 RuleBasedTokenizer::init()
215 const RBBIStateTable
*statetable
= fData
->fForwardTable
;
216 setBreakType(UBRK_WORD
);
// Row address = table base + row length * state number.
217 fStartRow
= (const RBBIStateTableRow
*)
218 (statetable
->fTableData
+ (statetable
->fRowLen
* START_STATE
));
220 const UTrie
*trie
= &fData
->fTrie
;
// Per-code-point category table for values 0..255; released in the destructor.
222 fLatin1Cat
= new int16_t[256];
223 for (i
= 0; i
< 256; ++i
)
225 //UTRIE_GET16(trie, i, category);
226 //fLatin1Cat[i] = category;
227 fLatin1Cat
[i
] = _UTRIE_GET_RAW(trie
, index
, 0, i
);
// One flags entry per state in the forward table; released in the destructor.
229 fStateFlags
= new int32_t[statetable
->fNumStates
];
230 for (i
= 0; i
< statetable
->fNumStates
; ++i
)
232 const RBBIStateTableRow
*row
= (const RBBIStateTableRow
*)
233 (statetable
->fTableData
+ (statetable
->fRowLen
* i
));
// Only rows with fAccepting == -1 and a non-zero tag index read values from
// the rule status table.
235 if (row
->fAccepting
== -1 && row
->fTagIdx
!= 0)
237 const int32_t *vals
= (fData
->fRuleStatusTable
) + (row
->fTagIdx
);
238 const int32_t *valLimit
= vals
+ 1;
240 while (vals
< valLimit
)
242 int32_t val
= *vals
++;
258 fStateFlags
[i
] = flags
;
// Constructs a tokenizer from a UnicodeString of rules. rules, parseErr and
// err are forwarded unchanged to the RuleBasedBreakIterator base constructor;
// the remaining setup runs only when err reports success.
// NOTE(review): the body guarded by U_SUCCESS(err) is not visible in this view.
262 RuleBasedTokenizer::RuleBasedTokenizer(const UnicodeString
&rules
, UParseError
&parseErr
, UErrorCode
&err
)
263 : RuleBasedBreakIterator(rules
, parseErr
, err
)
265 if (U_SUCCESS(err
)) {
// Constructs a tokenizer from a data buffer. data is cast to RBBIDataHeader*
// and passed with status to the RuleBasedBreakIterator base constructor; the
// remaining setup runs only when status reports success.
// NOTE(review): presumably the base class takes ownership of data here (the
// sibling overload passes kDontAdopt explicitly) - confirm.
// NOTE(review): the body guarded by U_SUCCESS(status) is not visible in this view.
270 RuleBasedTokenizer::RuleBasedTokenizer(uint8_t *data
, UErrorCode
&status
)
271 : RuleBasedBreakIterator((RBBIDataHeader
*)data
, status
)
273 if (U_SUCCESS(status
)) {
// Constructs a tokenizer from a const data buffer. data is cast to
// const RBBIDataHeader* and passed to the RuleBasedBreakIterator base
// constructor together with RuleBasedBreakIterator::kDontAdopt and status.
// The EDontAdopt parameter is unnamed, so it is not read in this constructor;
// it only distinguishes this overload. The remaining setup runs only when
// status reports success.
// NOTE(review): the body guarded by U_SUCCESS(status) is not visible in this view.
278 RuleBasedTokenizer::RuleBasedTokenizer(const uint8_t *data
, enum EDontAdopt
, UErrorCode
&status
)
279 : RuleBasedBreakIterator((const RBBIDataHeader
*)data
, RuleBasedBreakIterator::kDontAdopt
, status
)
281 if (U_SUCCESS(status
)) {
// Releases the two arrays that init() allocates with new[]:
// fStateFlags and fLatin1Cat.
// NOTE(review): the constructors reach their guarded setup only on success, and
// no other initialization of these pointers is visible - confirm they are
// null (or otherwise safe to delete[]) when construction fails.
286 RuleBasedTokenizer::~RuleBasedTokenizer() {
287 delete [] fStateFlags
;
288 delete [] fLatin1Cat
;
293 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */