]>
Commit | Line | Data |
---|---|---|
1 | /* | |
2 | *************************************************************************** | |
3 | * Copyright (C) 2006-2008 Apple Inc. All Rights Reserved. * | |
4 | *************************************************************************** | |
5 | */ | |
6 | ||
7 | #ifndef RBTOK_H | |
8 | #define RBTOK_H | |
9 | ||
10 | #include "unicode/utypes.h" | |
11 | ||
12 | /** | |
13 | * \file | |
14 | * \brief C++ API: Rule Based Tokenizer | |
15 | */ | |
16 | ||
17 | #if !UCONFIG_NO_BREAK_ITERATION | |
18 | ||
19 | #include "unicode/urbtok.h" | |
20 | #include "unicode/rbbi.h" | |
21 | #include "unicode/parseerr.h" | |
22 | ||
23 | ||
24 | U_NAMESPACE_BEGIN | |
25 | ||
26 | /** @internal */ | |
27 | struct RBBIDataHeader; | |
28 | struct RBBIStateTableRow; | |
29 | ||
30 | ||
31 | /** | |
32 | * | |
33 | * A subclass of RuleBasedBreakIterator that adds tokenization functionality. | |
34 | * | |
35 | * <p>This class is for internal use only by Apple Computer, Inc.</p> | |
36 | * | |
37 | */ | |
38 | class U_COMMON_API RuleBasedTokenizer : public RuleBasedBreakIterator { | |
39 | ||
40 | private: | |
41 | /** | |
42 | * The row corresponding to the start state | |
43 | * @internal | |
44 | */ | |
45 | const RBBIStateTableRow *fStartRow; | |
46 | ||
47 | /** | |
48 | * The merged flag results for accepting states | |
49 | * @internal | |
50 | */ | |
51 | int32_t *fStateFlags; | |
52 | ||
53 | /** | |
54 | * Character categories for the Latin1 subset of Unicode | |
55 | * @internal | |
56 | */ | |
57 | int16_t *fLatin1Cat; | |
58 | ||
59 | public: | |
60 | /** | |
61 | * Construct a RuleBasedTokenizer from a set of rules supplied as a string. | |
62 | * @param rules The break rules to be used. | |
63 | * @param parseErr In the event of a syntax error in the rules, provides the location | |
64 | * within the rules of the problem. | |
65 | * @param status Information on any errors encountered. | |
66 | * @internal | |
67 | */ | |
68 | RuleBasedTokenizer(const UnicodeString &rules, UParseError &parseErr, UErrorCode &status); | |
69 | ||
70 | /** | |
71 | * Constructor from a flattened set of RBBI data in uprv_malloc'd memory. | |
72 | * RuleBasedBreakIterators built from a custom set of rules | |
73 | * are created via this constructor; the rules are compiled | |
74 | * into memory, then the break iterator is constructed here. | |
75 | * | |
76 | * The break iterator adopts the memory, and will | |
77 | * free it when done. | |
78 | * @internal | |
79 | */ | |
80 | RuleBasedTokenizer(uint8_t *data, UErrorCode &status); | |
81 | ||
82 | /** | |
83 | * Constructor from a flattened set of RBBI data in memory which need not | |
84 | * be malloced (e.g. it may be a memory-mapped file, etc.). | |
85 | * | |
86 | * This version does not adopt the memory, and does not free it when done; | |
87 | * its EDontAdopt argument (enum declared just below) sets it apart from the adopting constructor. | |
88 | * @internal | |
89 | */ | |
90 | enum EDontAdopt { | |
91 | kDontAdopt | |
92 | }; | |
93 | RuleBasedTokenizer(const uint8_t *data, enum EDontAdopt dontAdopt, UErrorCode &status); | |
94 | ||
95 | /** | |
96 | * Destructor | |
97 | * @internal | |
98 | */ | |
99 | virtual ~RuleBasedTokenizer(); | |
100 | ||
101 | /** | |
102 | * Fetch the next set of tokens. NOTE(review): return value is presumably the token count - confirm. | |
103 | * @param maxTokens The maximum number of tokens to return. | |
104 | * @param outTokenRanges Pointer to output array of token ranges. | |
105 | * @param outTokenFlags (optional) pointer to output array of token flags. | |
106 | * @internal | |
107 | */ | |
108 | int32_t tokenize(int32_t maxTokens, RuleBasedTokenRange *outTokenRanges, unsigned long *outTokenFlags); | |
109 | ||
110 | private: | |
111 | /** | |
112 | * Common initialization function, used by constructors. | |
113 | * @internal | |
114 | */ | |
115 | void init(); | |
116 | }; | |
117 | ||
118 | U_NAMESPACE_END | |
119 | ||
120 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ | |
121 | ||
122 | #endif |