2 ***************************************************************************
3 * Copyright (C) 2006 Apple Computer, Inc. All rights reserved. *
4 ***************************************************************************
11 #include "unicode/utypes.h"
15 * \brief C++ API: Rule Based Tokenizer
18 #if !UCONFIG_NO_BREAK_ITERATION
20 #include "unicode/urbtok.h"
21 #include "unicode/rbbi.h"
22 #include "unicode/parseerr.h"
/** Forward declaration only; the structure's definition is not part of this section of the header. */
struct RBBIDataHeader;
/**
 * Forward declaration only; the structure's definition is not part of this
 * section of the header. An incomplete type is sufficient here because the
 * tokenizer only stores a pointer to a row (see fStartRow).
 */
struct RBBIStateTableRow;
34 * A subclass of RuleBasedBreakIterator that adds tokenization functionality.
36 * <p>This class is for internal use only by Apple Computer, Inc.</p>
39 class U_COMMON_API RuleBasedTokenizer
: public RuleBasedBreakIterator
{
43 * The row corresponding to the start state
46 const RBBIStateTableRow
*fStartRow
;
49 * The merged flag results for accepting states
55 * Character categories for the Latin1 subset of Unicode
62 * Construct a RuleBasedTokenizer from a set of rules supplied as a string.
63 * @param rules The break rules to be used.
64 * @param parseErr In the event of a syntax error in the rules, provides the location
65 * within the rules of the problem.
66 * @param status Information on any errors encountered.
69 RuleBasedTokenizer(const UnicodeString
&rules
, UParseError
&parseErr
, UErrorCode
&status
);
72 * Constructor from a flattened set of RBBI data in uprv_malloc'd memory.
73 * RuleBasedBreakIterators built from a custom set of rules
74 * are created via this constructor; the rules are compiled
75 * into memory, then the break iterator is constructed here.
77 * The break iterator adopts the memory, and will free it when done.
81 RuleBasedTokenizer(uint8_t *data
, UErrorCode
&status
);
87 virtual ~RuleBasedTokenizer();
90 * Fetch the next set of tokens.
91 * @param maxTokens The maximum number of tokens to return.
92 * @param outTokenRanges Pointer to output array of token ranges.
93 * @param outTokenFlags (optional) pointer to output array of token flags.
96 int32_t tokenize(int32_t maxTokens
, RuleBasedTokenRange
*outTokenRanges
, unsigned long *outTokenFlags
);
100 * Common initialization function, used by constructors.
108 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */