2 ***************************************************************************
3 * Copyright (C) 2006-2008, 2018 Apple Inc. All Rights Reserved. *
4 ***************************************************************************
6 ***************************************************************************
7 * This uses the ICU 57 legacy version of RuleBasedBreakIterator for
8 * performance reasons, does not support the RuleBasedBreakIterator rule
9 * syntax updates from ICU 60 and later, and requires both forward and
10 * reverse rules (as in ICU 57).
11 ***************************************************************************
17 #include "unicode/utypes.h"
21 * \brief C++ API: Rule Based Tokenizer
24 #if !UCONFIG_NO_BREAK_ITERATION
26 #include "unicode/urbtok.h"
27 #include "unicode/parseerr.h"
28 #include "rbbidata57.h"
// Forward declarations: only the type names are introduced here, which is
// sufficient for declaring pointers to them further down in this header.
// NOTE(review): the full definitions presumably come from rbbidata57.h,
// which is included above -- confirm.
35 struct RBBIDataHeader57
;
36 struct RBBIStateTableRow
;
41 * A subclass of RuleBasedBreakIterator57 that adds tokenization functionality.
43 * <p>This class is for internal use only by Apple Inc.</p>
46 class U_COMMON_API RuleBasedTokenizer
: public RuleBasedBreakIterator57
{
50 * The row corresponding to the start state
// Pointer to a read-only RBBIStateTableRow; per the member description,
// this is the state-table row that corresponds to the start state.
// NOTE(review): where this pointer is set is not visible in this header --
// presumably in the common initialization function; confirm.
53 const RBBIStateTableRow
*fStartRow
;
56 * The merged flag results for accepting states
62 * Character categories for the Latin1 subset of Unicode
69 * Construct a RuleBasedTokenizer from a set of rules supplied as a string.
70 * @param rules The break rules to be used.
71 * @param parseErr In the event of a syntax error in the rules, provides the location
72 * within the rules of the problem.
73 * @param status Information on any errors encountered.
74 * @internal, used by urbtok57.cpp
// Constructor: builds a tokenizer from break rules given as source text.
//   rules    - the break rules to be used, as a UnicodeString.
//   parseErr - output; on a syntax error in the rules, receives the location
//              of the problem within the rules.
//   status   - receives information on any errors encountered.
// NOTE(review): presumably follows the usual ICU UErrorCode convention
// (no-op if status already indicates failure on entry) -- confirm.
76 RuleBasedTokenizer(const UnicodeString
&rules
, UParseError
&parseErr
, UErrorCode
&status
);
79 * Constructor from a flattened set of RBBI data in uprv_malloc'd memory.
80 * RuleBasedBreakIterators built from a custom set of rules
81 * are created via this constructor; the rules are compiled
82 * into memory, then the break iterator is constructed here.
84 * The break iterator adopts the memory, and will free it when done.
86 * @internal, used by urbtok57.cpp
// Constructor: builds a tokenizer from an already-compiled ("flattened")
// block of RBBI data, taking ownership of that block.
//   data   - flattened RBBI data in uprv_malloc'd memory; the object adopts
//            this memory, so the caller must not free it afterwards.
//   status - receives information on any errors encountered.
// NOTE(review): whether `data` is released when construction fails is not
// visible from this declaration -- confirm against the implementation.
88 RuleBasedTokenizer(uint8_t *data
, UErrorCode
&status
);
91 * Constructor from a flattened set of RBBI data in memory which need not
92 * be malloced (e.g. it may be a memory-mapped file, etc.).
94 * This version does not adopt the memory, and does not free it when done.
96 * @internal, used by urbtok57.cpp
// Constructor: builds a tokenizer from an already-compiled ("flattened")
// block of RBBI data WITHOUT taking ownership of it.
//   data      - read-only flattened RBBI data; it need not be malloc'd
//               (e.g. it may be a memory-mapped file). The memory is not
//               adopted by this object.
//   dontAdopt - NOTE(review): presumably only a tag value that selects this
//               overload over the adopting constructor -- confirm.
//   status    - receives information on any errors encountered.
// NOTE(review): since the memory is not adopted, the caller presumably must
// keep `data` valid for the lifetime of this object -- confirm.
101 RuleBasedTokenizer(const uint8_t *data
, enum EDontAdopt dontAdopt
, UErrorCode
&status
);
// Destructor. Declared virtual, so destroying a RuleBasedTokenizer through a
// pointer to its base class dispatches to this destructor.
107 virtual ~RuleBasedTokenizer();
110 * Fetch the next set of tokens.
111 * @param maxTokens The maximum number of tokens to return.
112 * @param outTokenRanges Pointer to output array of token ranges.
113 * @param outTokenFlags (optional) pointer to output array of token flags.
114 * @internal, used by urbtok57.cpp
// Fetches the next set of tokens (declaration only; the definition lives
// outside this header).
//   maxTokens      - the maximum number of tokens to return.
//   outTokenRanges - output array that receives the token ranges.
//   outTokenFlags  - output array that receives the token flags; documented
//                    as optional.
// NOTE(review): the meaning of the int32_t return value is not documented
// here; presumably the number of tokens produced -- confirm.
// NOTE(review): "optional" presumably means a null pointer may be passed for
// outTokenFlags, and the output arrays presumably need room for at least
// maxTokens entries -- confirm against the implementation.
116 int32_t tokenize(int32_t maxTokens
, RuleBasedTokenRange
*outTokenRanges
, unsigned long *outTokenFlags
);
120 * Common initialization function, used by constructors.
128 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */