]>
Commit | Line | Data |
---|---|---|
73c04bcf A |
1 | /* |
2 | *************************************************************************** | |
0f5d89e8 A |
3 | * Copyright (C) 2006-2008, 2018 Apple Inc. All Rights Reserved. * |
4 | *************************************************************************** | |
5 | * | |
6 | *************************************************************************** | |
7 | * This uses the ICU 57 legacy version of RuleBasedBreakIterator for | |
8 | * performance reasons, does not support the RuleBasedBreakIterator rule | |
9 | * syntax updates from ICU 60 and later, and requires both forward and | |
10 | * reverse rules (as in ICU 57). | |
73c04bcf | 11 | *************************************************************************** |
73c04bcf A |
12 | */ |
13 | ||
14 | #ifndef RBTOK_H | |
15 | #define RBTOK_H | |
16 | ||
17 | #include "unicode/utypes.h" | |
18 | ||
19 | /** | |
20 | * \file | |
21 | * \brief C++ API: Rule Based Tokenizer | |
22 | */ | |
23 | ||
24 | #if !UCONFIG_NO_BREAK_ITERATION | |
25 | ||
26 | #include "unicode/urbtok.h" | |
73c04bcf | 27 | #include "unicode/parseerr.h" |
0f5d89e8 A |
28 | #include "rbbidata57.h" |
29 | #include "rbbi57.h" | |
73c04bcf A |
30 | |
31 | ||
32 | U_NAMESPACE_BEGIN | |
33 | ||
34 | /** @internal */ | |
0f5d89e8 | 35 | struct RBBIDataHeader57; |
73c04bcf A |
36 | struct RBBIStateTableRow; |
37 | ||
38 | ||
39 | /** | |
40 | * | |
0f5d89e8 | 41 | * A subclass of RuleBasedBreakIterator57 that adds tokenization functionality. |
73c04bcf | 42 | |
0f5d89e8 | 43 | * <p>This class is for internal use only by Apple Inc.</p> |
73c04bcf A |
44 | * |
45 | */ | |
0f5d89e8 | 46 | class U_COMMON_API RuleBasedTokenizer : public RuleBasedBreakIterator57 { |
73c04bcf A |
47 | |
48 | private: | |
49 | /** | |
50 | * The row corresponding to the start state | |
51 | * @internal | |
52 | */ | |
53 | const RBBIStateTableRow *fStartRow; | |
54 | ||
55 | /** | |
56 | * The merged flag results for accepting states | |
57 | * @internal | |
58 | */ | |
59 | int32_t *fStateFlags; | |
60 | ||
61 | /** | |
62 | * Character categories for the Latin1 subset of Unicode | |
63 | * @internal | |
64 | */ | |
65 | int16_t *fLatin1Cat; | |
66 | ||
67 | public: | |
68 | /** | |
69 | * Construct a RuleBasedTokenizer from a set of rules supplied as a string. | |
70 | * @param rules The break rules to be used. | |
71 | * @param parseErr In the event of a syntax error in the rules, provides the location | |
72 | * within the rules of the problem. | |
73 | * @param status Information on any errors encountered. | |
0f5d89e8 | 74 | * @internal, used by urbtok57.cpp |
73c04bcf A |
75 | */ |
76 | RuleBasedTokenizer(const UnicodeString &rules, UParseError &parseErr, UErrorCode &status); | |
77 | ||
78 | /** | |
79 | * Constructor from a flattened set of RBBI data in uprv_malloc'd memory. | |
80 | * RuleBasedBreakIterators built from a custom set of rules | |
81 | * are created via this constructor; the rules are compiled | |
82 | * into memory, then the break iterator is constructed here. | |
83 | * | |
84 | * The break iterator adopts the memory, and will | |
85 | * free it when done. | |
0f5d89e8 | 86 | * @internal, used by urbtok57.cpp |
73c04bcf A |
87 | */ |
88 | RuleBasedTokenizer(uint8_t *data, UErrorCode &status); | |
89 | ||
46f4442e A |
90 | /** |
91 | * Constructor from a flattened set of RBBI data in memory which need not | |
92 | * be malloced (e.g. it may be a memory-mapped file, etc.). | |
93 | * | |
94 | * This version does not adopt the memory, and does not | |
95 | * free it when done. | |
0f5d89e8 | 96 | * @internal, used by urbtok57.cpp |
46f4442e A |
97 | */ |
98 | enum EDontAdopt { | |
99 | kDontAdopt | |
100 | }; | |
101 | RuleBasedTokenizer(const uint8_t *data, enum EDontAdopt dontAdopt, UErrorCode &status); | |
102 | ||
73c04bcf A |
103 | /** |
104 | * Destructor | |
105 | * @internal | |
106 | */ | |
107 | virtual ~RuleBasedTokenizer(); | |
108 | ||
109 | /** | |
110 | * Fetch the next set of tokens. | |
111 | * @param maxTokens The maximum number of tokens to return. | |
112 | * @param outTokenRanges Pointer to output array of token ranges. | |
113 | * @param outTokenFlags (optional) pointer to output array of token flags. | |
0f5d89e8 | 114 | * @internal, used by urbtok57.cpp |
73c04bcf A |
115 | */ |
116 | int32_t tokenize(int32_t maxTokens, RuleBasedTokenRange *outTokenRanges, unsigned long *outTokenFlags); | |
117 | ||
118 | private: | |
119 | /** | |
120 | * Common initialization function, used by constructors. | |
121 | * @internal | |
122 | */ | |
123 | void init(); | |
124 | }; | |
125 | ||
126 | U_NAMESPACE_END | |
127 | ||
128 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ | |
129 | ||
130 | #endif |