]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ********************************************************************** | |
374ca955 | 3 | * Copyright (C) 1999-2004 IBM and others. All rights reserved. |
b75a7d8f A |
4 | ********************************************************************** |
5 | * Date Name Description | |
6 | * 12/1/99 rtg Ported from Java | |
7 | * 01/13/2000 helena Added UErrorCode to ctors. | |
8 | ********************************************************************** | |
9 | */ | |
10 | ||
11 | #ifndef BRKDICT_H | |
12 | #define BRKDICT_H | |
13 | ||
14 | #include "unicode/utypes.h" | |
15 | #include "unicode/uobject.h" | |
16 | #include "ucmp8.h" | |
b75a7d8f A |
17 | |
18 | U_NAMESPACE_BEGIN | |
19 | ||
20 | /** | |
21 | * This is the class that represents the list of known words used by | |
22 | * DictionaryBasedBreakIterator. The conceptual data structure used | |
23 | * here is a trie: there is a node hanging off the root node for every | |
24 | * letter that can start a word. Each of these nodes has a node hanging | |
25 | * off of it for every letter that can be the second letter of a word | |
26 | * if this node is the first letter, and so on. The trie is represented | |
27 | * as a two-dimensional array that can be treated as a table of state | |
28 | * transitions. Indexes are used to compress this array, taking | |
29 | * advantage of the fact that this array will always be very sparse. | |
30 | */ | |
31 | class BreakDictionary : public UMemory { | |
32 | //================================================================================= | |
33 | // data members | |
34 | //================================================================================= | |
35 | private: | |
36 | ||
37 | /** | |
38 | * Maps from characters to column numbers. The main use of this is to | |
39 | * avoid making room in the array for empty columns. | |
40 | */ | |
41 | CompactByteArray* columnMap; | |
42 | ||
43 | /** | |
44 | * The number of actual columns in the table | |
45 | */ | |
46 | int32_t numCols; | |
47 | ||
48 | /** | |
49 | * Columns are organized into groups of 32. This says how many | |
50 | * column groups. (We could calculate this, but we store the | |
51 | * value to avoid having to repeatedly calculate it.) | |
52 | */ | |
53 | int32_t numColGroups; | |
54 | ||
55 | /** | |
56 | * The actual compressed state table. Each conceptual row represents | |
57 | * a state, and the cells in it contain the row numbers of the states | |
58 | * to transition to for each possible letter. 0 is used to indicate | |
59 | * an illegal combination of letters (i.e., the error state). The | |
60 | * table is compressed by eliminating all the unpopulated (i.e., zero) | |
61 | * cells. Multiple conceptual rows can then be doubled up in a single | |
62 | * physical row by sliding them up and possibly shifting them to one | |
63 | * side or the other so the populated cells don't collide. Indexes | |
64 | * are used to identify unpopulated cells and to locate populated cells. | |
65 | */ | |
66 | int16_t* table; | |
67 | ||
68 | /** | |
69 | * This index maps logical row numbers to physical row numbers | |
70 | */ | |
71 | int16_t* rowIndex; | |
72 | ||
73 | /** | |
74 | * A bitmap is used to tell which cells in the comceptual table are | |
75 | * populated. This array contains all the unique bit combinations | |
76 | * in that bitmap. If the table is more than 32 columns wide, | |
77 | * successive entries in this array are used for a single row. | |
78 | */ | |
79 | int32_t* rowIndexFlags; | |
80 | ||
81 | /** | |
82 | * This index maps from a logical row number into the bitmap table above. | |
83 | * (This keeps us from storing duplicate bitmap combinations.) Since there | |
84 | * are a lot of rows with only one populated cell, instead of wasting space | |
85 | * in the bitmap table, we just store a negative number in this index for | |
86 | * rows with one populated cell. The absolute value of that number is | |
87 | * the column number of the populated cell. | |
88 | */ | |
89 | int16_t* rowIndexFlagsIndex; | |
90 | ||
91 | /** | |
92 | * For each logical row, this index contains a constant that is added to | |
93 | * the logical column number to get the physical column number | |
94 | */ | |
95 | int8_t* rowIndexShifts; | |
96 | ||
97 | //================================================================================= | |
98 | // deserialization | |
99 | //================================================================================= | |
100 | ||
101 | public: | |
102 | /** | |
103 | * Constructor. Creates the BreakDictionary by using readDictionaryFile() to | |
104 | * load the dictionary tables from the disk. | |
105 | * @param dictionaryFilename The name of the dictionary file | |
106 | * @param status for errors if it occurs | |
107 | */ | |
108 | BreakDictionary(const char* dictionaryFilename, UErrorCode& status); | |
109 | ||
110 | /** | |
111 | * Destructor. | |
112 | */ | |
113 | ~BreakDictionary(); | |
114 | ||
115 | /** | |
116 | * Reads the dictionary file on the disk and constructs the appropriate in-memory | |
117 | * representation. | |
118 | * @param in The given memory stream | |
119 | */ | |
374ca955 | 120 | void readDictionaryFile(const uint8_t * in); |
b75a7d8f A |
121 | |
122 | //================================================================================= | |
123 | // access to the words | |
124 | //================================================================================= | |
125 | ||
126 | /** | |
127 | * Uses the column map to map the character to a column number, then | |
128 | * passes the row and column number to the other version of at() | |
129 | * @param row The current state | |
130 | * @param ch The character whose column we're interested in | |
131 | * @return The new state to transition to | |
132 | */ | |
133 | int16_t at(int32_t row, UChar ch) const; | |
134 | ||
135 | /** | |
136 | * Returns the value in the cell with the specified (logical) row and | |
137 | * column numbers. In DictionaryBasedBreakIterator, the row number is | |
138 | * a state number, the column number is an input, and the return value | |
139 | * is the row number of the new state to transition to. (0 is the | |
140 | * "error" state, and -1 is the "end of word" state in a dictionary) | |
141 | * @param row The row number of the current state | |
142 | * @param col The column number of the input character (0 means "not a | |
143 | * dictionary character") | |
144 | * @return The row number of the new state to transition to | |
145 | */ | |
146 | int16_t at(int32_t row, int32_t col) const; | |
147 | ||
148 | private: | |
149 | /** | |
150 | * Given (logical) row and column numbers, returns true if the | |
151 | * cell in that position is populated | |
152 | * @param row The LOGICAL row number of the cell | |
153 | * @param col The PHYSICAL row number of the cell | |
154 | * @return true if the cell in that position is populated | |
155 | */ | |
156 | UBool cellIsPopulated(int32_t row, int32_t col) const; | |
157 | ||
158 | /** | |
159 | * Implementation of at() when we know the specified cell is populated. | |
160 | * @param row The PHYSICAL row number of the cell | |
161 | * @param col The PHYSICAL column number of the cell | |
162 | * @return The value stored in the cell | |
163 | */ | |
164 | int16_t internalAt(int32_t row, int32_t col) const; | |
165 | ||
166 | // the following methods are never meant to be called and so are not defined | |
167 | // (if you don't declare them, you get default implementations) | |
168 | BreakDictionary(const BreakDictionary& that); | |
169 | BreakDictionary& operator=(const BreakDictionary& that); | |
170 | }; | |
171 | ||
172 | U_NAMESPACE_END | |
173 | ||
174 | #endif |