]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/brkdict.cpp
ICU-6.2.4.tar.gz
[apple/icu.git] / icuSources / common / brkdict.cpp
1 /*
2 **********************************************************************
3 * Copyright (C) 1999-2004 IBM and others. All rights reserved.
4 **********************************************************************
5 * Date Name Description
6 * 12/1/99 rtg Ported from Java
7 * 01/13/2000 helena Added UErrorCode to ctors.
8 **********************************************************************
9 */
10
11 #include "unicode/utypes.h"
12
13 #if !UCONFIG_NO_BREAK_ITERATION
14
15 #include "unicode/ures.h"
16 #include "brkdict.h"
17 #include "cmemory.h"
18
19 U_NAMESPACE_BEGIN
20
21 //=================================================================================
22 // deserialization
23 //=================================================================================
24
25 BreakDictionary::BreakDictionary(const char* /*dictionaryFilename*/, UErrorCode& status)
26 : columnMap(NULL),
27 table(NULL),
28 rowIndex(NULL),
29 rowIndexFlags(NULL),
30 rowIndexFlagsIndex(NULL),
31 rowIndexShifts(NULL)
32 {
33 if (U_FAILURE(status)) return;
34
35 UResourceBundle *th_dict = ures_open(NULL, "th", &status);
36 th_dict = ures_getByKey(th_dict, "BreakDictionaryData", th_dict, &status);
37 if (U_FAILURE(status)) return;
38
39 int32_t len;
40 const uint8_t * data = ures_getBinary(th_dict, &len, &status);
41 ures_close(th_dict);
42 if (U_FAILURE(status)) return;
43
44 readDictionaryFile(data);
45 }
46
47 BreakDictionary::~BreakDictionary()
48 {
49 ucmp8_close(columnMap);
50 uprv_free(table);
51 uprv_free(rowIndex);
52 uprv_free(rowIndexFlags);
53 uprv_free(rowIndexFlagsIndex);
54 uprv_free(rowIndexShifts);
55 }
56
// Macros supporting readDictionaryFile().  The data files originated from a
// Java program, and Java always writes data out in big-endian format, so on
// little-endian platforms every multi-byte value has to be byte-swapped after
// it has been copied out of the file image.  On big-endian platforms the swap
// macros expand to a no-op.
//
// SWAP32(x) / SWAP16(x) byte-swap the lvalue x in place.  x is evaluated more
// than once, so it must not have side effects.  The value is converted to an
// unsigned type before shifting so that no signed value is ever shifted.

#if U_IS_BIG_ENDIAN
#define SWAP32(x) ((void)0)
#define SWAP16(x) ((void)0)
#else
#define SWAP32(x) \
    ((x) = (uint32_t)((((uint32_t)(x) >> 24) & 0x000000ffU) | \
                      (((uint32_t)(x) >>  8) & 0x0000ff00U) | \
                      (((uint32_t)(x) <<  8) & 0x00ff0000U) | \
                      (((uint32_t)(x) << 24) & 0xff000000U)))
#define SWAP16(x) \
    ((x) = (uint16_t)((((uint16_t)(x) << 8) & 0xff00U) | \
                      (((uint16_t)(x) >> 8) & 0x00ffU)))
#endif

// DICTIONARY_READ(source, destAddr, len) copies len bytes from the read
// position 'source' to destAddr and then advances 'source' by len.  The two
// steps are wrapped in do { } while (0) so that the macro behaves as one
// statement, e.g. as the body of an unbraced if.
#define DICTIONARY_READ(source, destAddr, len) \
    do { \
        uprv_memcpy((destAddr), (source), (len)); \
        (source) += (len); \
    } while (0)
72
73
74 void
75 BreakDictionary::readDictionaryFile(const uint8_t * in)
76 {
77 int32_t l;
78 int32_t version;
79
80 int i;
81
82 // read in the version number (right now we just ignore it)
83 DICTIONARY_READ(in, &version, 4);
84
85 // read in the column map (this is serialized in its internal form:
86 // an index array followed by a data array)
87 DICTIONARY_READ(in, &l, 4);
88 SWAP32(l);
89 uint16_t* temp = (uint16_t*) uprv_malloc(sizeof(uint16_t)*l);
90 DICTIONARY_READ(in, temp, l * sizeof (int16_t) );
91 for (i = 0; i < l; i++) {
92 SWAP16(temp[i]);
93 }
94 DICTIONARY_READ(in, &l, 4);
95 SWAP32(l);
96 int8_t* temp2 = (int8_t*) uprv_malloc(sizeof(int8_t)*l);
97 DICTIONARY_READ(in, temp2, l);
98 columnMap = ucmp8_openAdopt(temp, temp2, l);
99
100 // read in numCols and numColGroups
101 DICTIONARY_READ(in, &numCols, 4);
102 SWAP32(numCols);
103 DICTIONARY_READ(in, &numColGroups, 4);
104 SWAP32(numColGroups);
105
106 // read in the row-number index
107 DICTIONARY_READ(in, &l, 4);
108 SWAP32(l);
109 rowIndex = (int16_t *)uprv_malloc(l*2);
110 DICTIONARY_READ(in, rowIndex, l * sizeof (int16_t) );
111 for (i = 0; i < l; i++) {
112 SWAP16(rowIndex[i]);
113 }
114
115 // load in the populated-cells bitmap: index first, then bitmap list
116 DICTIONARY_READ(in, &l, 4);
117 SWAP32(l);
118 rowIndexFlagsIndex = (int16_t *)uprv_malloc(l*2);
119 DICTIONARY_READ(in, rowIndexFlagsIndex, l * sizeof(int16_t) );
120 for (i = 0; i < l; i++) {
121 SWAP16(rowIndexFlagsIndex[i]);
122 }
123 DICTIONARY_READ(in, &l, 4);
124 SWAP32(l);
125 rowIndexFlags = (int32_t *)uprv_malloc(l*4);
126 DICTIONARY_READ(in, rowIndexFlags, l * sizeof(int32_t));
127 for (i = 0; i < l; i++) {
128 SWAP32(rowIndexFlags[i]);
129 }
130
131 // load in the row-shift index
132 DICTIONARY_READ(in, &l, 4);
133 SWAP32(l);
134 rowIndexShifts = (int8_t *)uprv_malloc(l);
135 DICTIONARY_READ(in, rowIndexShifts, l);
136
137 // finally, load in the actual state table
138 DICTIONARY_READ(in, &l, 4);
139 SWAP32(l);
140 table = (int16_t *)uprv_malloc(l*2);
141 DICTIONARY_READ(in, table, l * sizeof(int16_t) );
142 for (i = 0; i < l; i++) {
143 SWAP16(table[i]);
144 }
145
146 // the reverse column map occurs next in the file. In the C/C++ code, for the
147 // time being, we're not going to worry about that.
148 }
149
150 //=================================================================================
151 // access to the words
152 //=================================================================================
153
154 /**
155 * Uses the column map to map the character to a column number, then
156 * passes the row and column number to the other version of at()
157 * @param row The current state
158 * @param ch The character whose column we're interested in
159 * @return The new state to transition to
160 */
161 int16_t
162 BreakDictionary::at(int32_t row, UChar ch) const
163 {
164 int16_t col = ucmp8_get(columnMap, ch);
165 return at(row, (int32_t)col);
166 }
167
168 /**
169 * Returns the value in the cell with the specified (logical) row and
170 * column numbers. In DictionaryBasedBreakIterator, the row number is
171 * a state number, the column number is an input, and the return value
172 * is the row number of the new state to transition to. (0 is the
173 * "error" state, and -1 is the "end of word" state in a dictionary)
174 * @param row The row number of the current state
175 * @param col The column number of the input character (0 means "not a
176 * dictionary character")
177 * @return The row number of the new state to transition to
178 */
179 int16_t
180 BreakDictionary::at(int32_t row, int32_t col) const
181 {
182 if (cellIsPopulated(row, col)) {
183 // we map from logical to physical row number by looking up the
184 // mapping in rowIndex; we map from logical column number to
185 // physical column number by looking up a shift value for this
186 // logical row and offsetting the logical column number by
187 // the shift amount. Then we can use internalAt() to actually
188 // get the value out of the table.
189 return internalAt(rowIndex[row], col + rowIndexShifts[row]);
190 }
191 else {
192 return 0;
193 }
194 }
195
196 //=================================================================================
197 // implementation
198 //=================================================================================
199 /**
200 * Given (logical) row and column numbers, returns true if the
201 * cell in that position is populated
202 */
203 UBool
204 BreakDictionary::cellIsPopulated(int32_t row, int32_t col) const
205 {
206 // look up the entry in the bitmap index for the specified row.
207 // If it's a negative number, it's the column number of the only
208 // populated cell in the row
209 if (rowIndexFlagsIndex[row] < 0) {
210 return col == -rowIndexFlagsIndex[row];
211 }
212
213 // if it's a positive number, it's the offset of an entry in the bitmap
214 // list. If the table is more than 32 columns wide, the bitmap is stored
215 // successive entries in the bitmap list, so we have to divide the column
216 // number by 32 and offset the number we got out of the index by the result.
217 // Once we have the appropriate piece of the bitmap, test the appropriate
218 // bit and return the result.
219 else {
220 int32_t flags = rowIndexFlags[rowIndexFlagsIndex[row] + (col >> 5)];
221 return (flags & (1 << (col & 0x1f))) != 0;
222 }
223 }
224
225 /**
226 * Implementation of at() when we know the specified cell is populated.
227 * @param row The PHYSICAL row number of the cell
228 * @param col The PHYSICAL column number of the cell
229 * @return The value stored in the cell
230 */
231 int16_t
232 BreakDictionary::internalAt(int32_t row, int32_t col) const
233 {
234 // the table is a one-dimensional array, so this just does the math necessary
235 // to treat it as a two-dimensional array (we don't just use a two-dimensional
236 // array because two-dimensional arrays are inefficient in Java)
237 return table[row * numCols + col];
238 }
239
240 U_NAMESPACE_END
241
242 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */