]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/rbtok.h
ICU-8.11.4.tar.gz
[apple/icu.git] / icuSources / common / rbtok.h
1 /*
2 ***************************************************************************
3 * Copyright (C) 2006 Apple Computer, Inc. All rights reserved. *
4 ***************************************************************************
5
6 */
7
8 #ifndef RBTOK_H
9 #define RBTOK_H
10
11 #include "unicode/utypes.h"
12
13 /**
14 * \file
15 * \brief C++ API: Rule Based Tokenizer
16 */
17
18 #if !UCONFIG_NO_BREAK_ITERATION
19
20 #include "unicode/urbtok.h"
21 #include "unicode/rbbi.h"
22 #include "unicode/parseerr.h"
23
24
25 U_NAMESPACE_BEGIN
26
27 /** @internal Forward declarations of RBBI types not defined in this header; RBBIStateTableRow is used below only through a pointer. */
28 struct RBBIDataHeader;
29 struct RBBIStateTableRow;
30
31
32 /**
33 *
34 * A subclass of RuleBasedBreakIterator that adds tokenization functionality.
35 *
36 * <p>This class is for internal use only by Apple Computer, Inc.</p>
37 *
38 */
39 class U_COMMON_API RuleBasedTokenizer : public RuleBasedBreakIterator {
40
41 private:
42 /**
43 * The row corresponding to the start state (held as a pointer to const).
44 * @internal
45 */
46 const RBBIStateTableRow *fStartRow;
47
48 /**
49 * The merged flag results for accepting states. NOTE(review): ownership is not visible in this header - confirm who frees it.
50 * @internal
51 */
52 int32_t *fStateFlags;
53
54 /**
55 * Character categories for the Latin1 subset of Unicode. NOTE(review): ownership is not visible in this header - confirm who frees it.
56 * @internal
57 */
58 int16_t *fLatin1Cat;
59
60 public:
61 /**
62 * Construct a RuleBasedTokenizer from a set of rules supplied as a string.
63 * @param rules The break rules to be used.
64 * @param parseErr In the event of a syntax error in the rules, provides the location
65 * within the rules of the problem.
66 * @param status Information on any errors encountered.
67 * @internal
68 */
69 RuleBasedTokenizer(const UnicodeString &rules, UParseError &parseErr, UErrorCode &status);
70
71 /**
72 * Constructor from a flattened set of RBBI data in uprv_malloc'd memory.
73 * RuleBasedBreakIterators built from a custom set of rules are created
74 * via this constructor; the rules are compiled into memory, then the
75 * break iterator is constructed here. The break iterator adopts the
76 * memory, and will free it when done.
77 * @param data Flattened RBBI data in uprv_malloc'd memory; ownership is adopted.
78 * @param status Information on any errors encountered.
79 * @internal
80 */
81 RuleBasedTokenizer(uint8_t *data, UErrorCode &status);
82
83 /**
84 * Destructor (declared virtual).
85 * @internal
86 */
87 virtual ~RuleBasedTokenizer();
88
89 /**
90 * Fetch the next set of tokens. NOTE(review): the int32_t result is presumably the number of tokens produced - confirm.
91 * @param maxTokens The maximum number of tokens to return.
92 * @param outTokenRanges Pointer to output array of token ranges (presumably at least maxTokens entries - confirm).
93 * @param outTokenFlags (optional) pointer to output array of token flags.
94 * @internal
95 */
96 int32_t tokenize(int32_t maxTokens, RuleBasedTokenRange *outTokenRanges, unsigned long *outTokenFlags);
97
98 private:
99 /**
100 * Common initialization function, used by constructors. Takes no arguments and returns nothing.
101 * @internal
102 */
103 void init();
104 };
105
106 U_NAMESPACE_END
107
108 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
109
110 #endif