]> git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/test/intltest/colldata.h
ICU-64260.0.1.tar.gz
[apple/icu.git] / icuSources / test / intltest / colldata.h
... / ...
CommitLineData
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4 ******************************************************************************
5 * Copyright (C) 1996-2012, International Business Machines *
6 * Corporation and others. All Rights Reserved. *
7 ******************************************************************************
8 */
9
10/**
11 * \file
12 * \brief Originally, added as C++ API for Collation data used to compute minLengthInChars
13 * \internal
14 */
15
16/*
17 * Note: This module was incldued in ICU 4.0.1 as @internal technology preview for supporting
18 * Boyer-Moore string search API. For now, only SSearchTest depends on this module. I temporaly
19 * moved the module from i18n directory to intltest, because we have no plan to publish this
20 * as public API. (2012-12-18 yoshito)
21 */
22
23#ifndef COLL_DATA_H
24#define COLL_DATA_H
25
26#include "unicode/utypes.h"
27
28#if !UCONFIG_NO_COLLATION
29
30#include "unicode/ucol.h"
31#include "unicode/unistr.h"
32
33 /**
34 * The size of the internal CE buffer in a <code>CEList</code> object
35 */
36#define CELIST_BUFFER_SIZE 4
37
38/**
39 * \def INSTRUMENT_CELIST
40 * Define this to enable the <code>CEList</code> objects to collect
41 * statistics.
42 */
43
44 /**
45 * The size of the initial list in a <code>StringList</code> object.
46 */
47#define STRING_LIST_BUFFER_SIZE 16
48
49U_NAMESPACE_USE
50
51 /**
52 * This object holds a list of CEs generated from a particular
53 * <code>UnicodeString</code>
54 *
55 */
56class CEList
57{
58public:
59 /**
60 * Construct a <code>CEList</code> object.
61 *
62 * @param coll - the Collator used to collect the CEs.
63 * @param string - the string for which to collect the CEs.
64 * @param status - will be set if any errors occur.
65 *
66 * Note: if on return, status is set to an error code,
67 * the only safe thing to do with this object is to call
68 * the destructor.
69 */
70 CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status);
71
72 /**
73 * The destructor.
74 */
75 ~CEList();
76
77 /**
78 * Return the number of CEs in the list.
79 *
80 * @return the number of CEs in the list.
81 */
82 int32_t size() const;
83
84 /**
85 * Get a particular CE from the list.
86 *
87 * @param index - the index of the CE to return
88 *
89 * @return the CE, or <code>0</code> if <code>index</code> is out of range
90 */
91 uint32_t get(int32_t index) const;
92
93 /**
94 * Check if the CEs in another <code>CEList</code> match the
95 * suffix of this list starting at a give offset.
96 *
97 * @param offset - the offset of the suffix
98 * @param other - the other <code>CEList</code>
99 *
100 * @return <code>TRUE</code> if the CEs match, <code>FALSE</code> otherwise.
101 */
102 UBool matchesAt(int32_t offset, const CEList *other) const;
103
104 /**
105 * The index operator.
106 *
107 * @param index - the index
108 *
109 * @return a reference to the given CE in the list
110 */
111 uint32_t &operator[](int32_t index) const;
112
113private:
114 void add(uint32_t ce, UErrorCode &status);
115
116 uint32_t ceBuffer[CELIST_BUFFER_SIZE];
117 uint32_t *ces;
118 int32_t listMax;
119 int32_t listSize;
120};
121
122/**
123 * StringList
124 *
125 * This object holds a list of <code>UnicodeString</code> objects.
126 */
127class StringList
128{
129public:
130 /**
131 * Construct an empty <code>StringList</code>
132 *
133 * @param status - will be set if any errors occur.
134 *
135 * Note: if on return, status is set to an error code,
136 * the only safe thing to do with this object is to call
137 * the destructor.
138 */
139 StringList(UErrorCode &status);
140
141 /**
142 * The destructor.
143 */
144 ~StringList();
145
146 /**
147 * Add a string to the list.
148 *
149 * @param string - the string to add
150 * @param status - will be set if any errors occur.
151 */
152 void add(const UnicodeString *string, UErrorCode &status);
153
154 /**
155 * Add an array of Unicode code points to the list.
156 *
157 * @param chars - the address of the array of code points
158 * @param count - the number of code points in the array
159 * @param status - will be set if any errors occur.
160 */
161 void add(const UChar *chars, int32_t count, UErrorCode &status);
162
163 /**
164 * Get a particular string from the list.
165 *
166 * @param index - the index of the string
167 *
168 * @return a pointer to the <code>UnicodeString</code> or <code>NULL</code>
169 * if <code>index</code> is out of bounds.
170 */
171 const UnicodeString *get(int32_t index) const;
172
173 /**
174 * Get the number of stings in the list.
175 *
176 * @return the number of strings in the list.
177 */
178 int32_t size() const;
179
180private:
181 UnicodeString *strings;
182 int32_t listMax;
183 int32_t listSize;
184};
185
186
187/*
188 * Forward references to internal classes.
189 */
190class StringToCEsMap;
191class CEToStringsMap;
192
193/**
194 * CollData
195 *
196 * This class holds the Collator-specific data needed to
197 * compute the length of the shortest string that can
198 * generate a partcular list of CEs.
199 *
200 * <code>CollData</code> objects are quite expensive to compute. Because
201 * of this, they are cached. When you call <code>CollData::open</code> it
202 * returns a reference counted cached object. When you call <code>CollData::close</code>
203 * the reference count on the object is decremented but the object is not deleted.
204 *
205 * If you do not need to reuse any unreferenced objects in the cache, you can call
206 * <code>CollData::flushCollDataCache</code>. If you no longer need any <code>CollData</code>
207 * objects, you can call <code>CollData::freeCollDataCache</code>
208 */
209class CollData
210{
211public:
212 /**
213 * Construct a <code>CollData</code> object.
214 *
215 * @param collator - the collator
216 * @param status - will be set if any errors occur.
217 */
218 CollData(UCollator *collator, UErrorCode &status);
219
220 /**
221 * The destructor.
222 */
223 ~CollData();
224
225 /**
226 * Get the <code>UCollator</code> object used to create this object.
227 * The object returned may not be the exact object that was used to
228 * create this object, but it will have the same behavior.
229 */
230 UCollator *getCollator() const;
231
232 /**
233 * Get a list of all the strings which generate a list
234 * of CEs starting with a given CE.
235 *
236 * @param ce - the CE
237 *
238 * return a <code>StringList</code> object containing all
239 * the stirngs, or <code>NULL</code> if there are
240 * no such strings.
241 */
242 const StringList *getStringList(int32_t ce) const;
243
244 /**
245 * Get a list of the CEs generated by a partcular stirng.
246 *
247 * @param string - the string
248 *
249 * @return a <code>CEList</code> object containt the CEs. You
250 * must call <code>freeCEList</code> when you are finished
251 * using the <code>CEList</code>/
252 */
253 const CEList *getCEList(const UnicodeString *string) const;
254
255 /**
256 * Release a <code>CEList</code> returned by <code>getCEList</code>.
257 *
258 * @param list - the <code>CEList</code> to free.
259 */
260 void freeCEList(const CEList *list);
261
262 /**
263 * Return the length of the shortest string that will generate
264 * the given list of CEs.
265 *
266 * @param ces - the CEs
267 * @param offset - the offset of the first CE in the list to use.
268 *
269 * @return the length of the shortest string.
270 */
271 int32_t minLengthInChars(const CEList *ces, int32_t offset) const;
272
273
274 /**
275 * Return the length of the shortest string that will generate
276 * the given list of CEs.
277 *
278 * Note: the algorithm used to do this computation is recursive. To
279 * limit the amount of recursion, a "history" list is used to record
280 * the best answer starting at a particular offset in the list of CEs.
281 * If the same offset is visited again during the recursion, the answer
282 * in the history list is used.
283 *
284 * @param ces - the CEs
285 * @param offset - the offset of the first CE in the list to use.
286 * @param history - the history list. Must be at least as long as
287 * the number of cEs in the <code>CEList</code>
288 *
289 * @return the length of the shortest string.
290 */
291 int32_t minLengthInChars(const CEList *ces, int32_t offset, int32_t *history) const;
292
293private:
294 UCollator *coll;
295 CEToStringsMap *ceToCharsStartingWith;
296
297 uint32_t minHan;
298 uint32_t maxHan;
299
300 uint32_t jamoLimits[4];
301};
302
303#endif // #if !UCONFIG_NO_COLLATION
304#endif // #ifndef COLL_DATA_H