]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/rbtok.h
ICU-64252.0.1.tar.gz
[apple/icu.git] / icuSources / common / rbtok.h
CommitLineData
73c04bcf
A
1/*
2***************************************************************************
0f5d89e8
A
3* Copyright (C) 2006-2008, 2018 Apple Inc. All Rights Reserved. *
4***************************************************************************
5*
6***************************************************************************
7* This uses the ICU 57 legacy version of RuleBasedBreakIterator for
8* performance reasons, does not support the RuleBasedBreakIterator rule
9* syntax updates from ICU 60 and later, and requires both forward and
10* reverse rules (as in ICU 57).
73c04bcf 11***************************************************************************
73c04bcf
A
12*/
13
14#ifndef RBTOK_H
15#define RBTOK_H
16
17#include "unicode/utypes.h"
18
19/**
20 * \file
21 * \brief C++ API: Rule Based Tokenizer
22 */
23
24#if !UCONFIG_NO_BREAK_ITERATION
25
26#include "unicode/urbtok.h"
73c04bcf 27#include "unicode/parseerr.h"
0f5d89e8
A
28#include "rbbidata57.h"
29#include "rbbi57.h"
73c04bcf
A
30
31
32U_NAMESPACE_BEGIN
33
34/** @internal Forward declarations of structs referenced by this header. */
0f5d89e8 35struct RBBIDataHeader57;
73c04bcf
A
36struct RBBIStateTableRow;
37
38
39/**
40 *
0f5d89e8 41 * A subclass of RuleBasedBreakIterator57 that adds tokenization functionality.
73c04bcf 42
0f5d89e8 43 * <p>This class is for internal use only by Apple Inc.</p>
73c04bcf
A
44 *
45 */
0f5d89e8 46class U_COMMON_API RuleBasedTokenizer : public RuleBasedBreakIterator57 {
73c04bcf
A
47
48private:
49 /**
50 * The row corresponding to the start state
51 * @internal
52 */
53 const RBBIStateTableRow *fStartRow;
54
55 /**
56 * The merged flag results for accepting states
57 * @internal
58 */
59 int32_t *fStateFlags;
60
61 /**
62 * Character categories for the Latin1 subset of Unicode
63 * @internal
64 */
65 int16_t *fLatin1Cat;
66
67public:
68 /**
69 * Construct a RuleBasedTokenizer from a set of rules supplied as a string.
70 * @param rules The break rules to be used.
71 * @param parseErr In the event of a syntax error in the rules, provides the location
72 * within the rules of the problem.
73 * @param status Information on any errors encountered.
0f5d89e8 74 * @internal, used by urbtok57.cpp
73c04bcf
A
75 */
76 RuleBasedTokenizer(const UnicodeString &rules, UParseError &parseErr, UErrorCode &status);
77
78 /**
79 * Constructor from a flattened set of RBBI data in uprv_malloc'd memory.
80 * RuleBasedBreakIterators built from a custom set of rules
81 * are created via this constructor; the rules are compiled
82 * into memory, then the break iterator is constructed here.
83 *
84 * The break iterator adopts the memory, and will
85 * free it when done.
0f5d89e8 86 * @internal, used by urbtok57.cpp
73c04bcf
A
87 */
88 RuleBasedTokenizer(uint8_t *data, UErrorCode &status);
89
46f4442e
A
90 /**
91 * Constructor from a flattened set of RBBI data in memory which need not
92 * be malloced (e.g. it may be a memory-mapped file, etc.).
93 *
94 * This version does not adopt the memory, and does not
95 * free it when done.
0f5d89e8 96 * @internal, used by urbtok57.cpp
46f4442e
A
97 */
98 enum EDontAdopt {
99 kDontAdopt
100 };
101 RuleBasedTokenizer(const uint8_t *data, enum EDontAdopt dontAdopt, UErrorCode &status);
102
73c04bcf
A
103 /**
104 * Destructor
105 * @internal
106 */
107 virtual ~RuleBasedTokenizer();
108
109 /**
110 * Fetch the next set of tokens.
111 * @param maxTokens The maximum number of tokens to return.
112 * @param outTokenRanges Pointer to output array of token ranges.
113 * @param outTokenFlags (optional) pointer to output array of token flags.
0f5d89e8 114 * @internal, used by urbtok57.cpp
73c04bcf
A
115 */
116 int32_t tokenize(int32_t maxTokens, RuleBasedTokenRange *outTokenRanges, unsigned long *outTokenFlags);
117
118private:
119 /**
120 * Common initialization function, used by constructors.
121 * @internal
122 */
123 void init();
124};
125
126U_NAMESPACE_END
127
128#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
129
130#endif