]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/rbtok.h
ICU-400.37.tar.gz
[apple/icu.git] / icuSources / common / rbtok.h
1 /*
2 ***************************************************************************
3 * Copyright (C) 2006-2008 Apple Inc. All Rights Reserved. *
4 ***************************************************************************
5 */
6
7 #ifndef RBTOK_H
8 #define RBTOK_H
9
10 #include "unicode/utypes.h"
11
12 /**
13 * \file
14 * \brief C++ API: Rule Based Tokenizer
15 */
16
17 #if !UCONFIG_NO_BREAK_ITERATION
18
19 #include "unicode/urbtok.h"
20 #include "unicode/rbbi.h"
21 #include "unicode/parseerr.h"
22
23
24 U_NAMESPACE_BEGIN
25
26 /** Forward declarations of RBBI data structures defined outside this header; within this header only RBBIStateTableRow is referenced, and only through a pointer (RuleBasedTokenizer::fStartRow). @internal */
27 struct RBBIDataHeader;
28 struct RBBIStateTableRow;
29
30
31 /**
32 *
33 * A subclass of RuleBasedBreakIterator that adds tokenization functionality.
34 *
35 * <p>This class is for internal use only by Apple Computer, Inc.</p>
36 * <p>It declares three constructors (from rule text, from adopted flattened data, and from non-adopted flattened data) plus tokenize().</p>
37 */
38 class U_COMMON_API RuleBasedTokenizer : public RuleBasedBreakIterator {
39
40 private:
41 /**
42 * The row corresponding to the start state
43 * @internal
44 */
45 const RBBIStateTableRow *fStartRow;
46
47 /**
48 * The merged flag results for accepting states. NOTE(review): allocation and ownership of this array are not visible in this header; confirm in the implementation.
49 * @internal
50 */
51 int32_t *fStateFlags;
52
53 /**
54 * Character categories for the Latin1 subset of Unicode. NOTE(review): allocation and ownership of this array are not visible in this header; confirm in the implementation.
55 * @internal
56 */
57 int16_t *fLatin1Cat;
58
59 public:
60 /**
61 * Construct a RuleBasedTokenizer from a set of rules supplied as a string.
62 * @param rules The break rules to be used.
63 * @param parseErr In the event of a syntax error in the rules, provides the location
64 * within the rules of the problem.
65 * @param status Information on any errors encountered.
66 * @internal
67 */
68 RuleBasedTokenizer(const UnicodeString &rules, UParseError &parseErr, UErrorCode &status);
69
70 /**
71 * Constructor from a flattened set of RBBI data in uprv_malloc'd memory (the data argument).
72 * RuleBasedBreakIterators built from a custom set of rules
73 * are created via this constructor; the rules are compiled
74 * into memory, then the break iterator is constructed here.
75 *
76 * The break iterator adopts the memory, and will
77 * free it when done.
78 * @internal
79 */
80 RuleBasedTokenizer(uint8_t *data, UErrorCode &status);
81
82 /**
83 * Constructor from a flattened set of RBBI data in memory which need not
84 * be malloced (e.g. it may be a memory-mapped file, etc.).
85 * (This comment describes the constructor declared after the EDontAdopt enum below; kDontAdopt is the tag value passed as dontAdopt to select that overload.)
86 * This version does not adopt the memory, and does not
87 * free it when done.
88 * @internal
89 */
90 enum EDontAdopt {
91 kDontAdopt
92 };
93 RuleBasedTokenizer(const uint8_t *data, enum EDontAdopt dontAdopt, UErrorCode &status);
94
95 /**
96 * Destructor
97 * @internal
98 */
99 virtual ~RuleBasedTokenizer();
100
101 /**
102 * Fetch the next set of tokens. NOTE(review): the int32_t result is presumably the number of tokens written to outTokenRanges; confirm against the implementation.
103 * @param maxTokens The maximum number of tokens to return.
104 * @param outTokenRanges Pointer to output array of token ranges.
105 * @param outTokenFlags (optional) pointer to output array of token flags.
106 * @internal
107 */
108 int32_t tokenize(int32_t maxTokens, RuleBasedTokenRange *outTokenRanges, unsigned long *outTokenFlags);
109
110 private:
111 /**
112 * Common initialization function, used by constructors.
113 * @internal
114 */
115 void init();
116 };
117
118 U_NAMESPACE_END
119
120 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
121
122 #endif