]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ********************************************************************** | |
3 | * Copyright (C) 1999-2000 IBM and others. All rights reserved. | |
4 | ********************************************************************** | |
5 | * Date Name Description | |
6 | * 12/1/99 rtg Ported from Java | |
7 | * 01/13/2000 helena Added UErrorCode to ctors. | |
8 | ********************************************************************** | |
9 | */ | |
10 | ||
11 | #ifndef BRKDICT_H | |
12 | #define BRKDICT_H | |
13 | ||
14 | #include "unicode/utypes.h" | |
15 | #include "unicode/uobject.h" | |
16 | #include "ucmp8.h" | |
17 | #include "umemstrm.h" | |
18 | ||
19 | U_NAMESPACE_BEGIN | |
20 | ||
21 | /** | |
22 | * This is the class that represents the list of known words used by | |
23 | * DictionaryBasedBreakIterator. The conceptual data structure used | |
24 | * here is a trie: there is a node hanging off the root node for every | |
25 | * letter that can start a word. Each of these nodes has a node hanging | |
26 | * off of it for every letter that can be the second letter of a word | |
27 | * if this node is the first letter, and so on. The trie is represented | |
28 | * as a two-dimensional array that can be treated as a table of state | |
29 | * transitions. Indexes are used to compress this array, taking | |
30 | * advantage of the fact that this array will always be very sparse. | |
31 | */ | |
32 | class BreakDictionary : public UMemory { | |
33 | //================================================================================= | |
34 | // data members | |
35 | //================================================================================= | |
36 | private: | |
37 | ||
38 | /** | |
39 | * Maps from characters to column numbers. The main use of this is to | |
40 | * avoid making room in the array for empty columns. | |
41 | */ | |
42 | CompactByteArray* columnMap; | |
43 | ||
44 | /** | |
45 | * The number of actual columns in the table | |
46 | */ | |
47 | int32_t numCols; | |
48 | ||
49 | /** | |
50 | * Columns are organized into groups of 32. This says how many | |
51 | * column groups. (We could calculate this, but we store the | |
52 | * value to avoid having to repeatedly calculate it.) | |
53 | */ | |
54 | int32_t numColGroups; | |
55 | ||
56 | /** | |
57 | * The actual compressed state table. Each conceptual row represents | |
58 | * a state, and the cells in it contain the row numbers of the states | |
59 | * to transition to for each possible letter. 0 is used to indicate | |
60 | * an illegal combination of letters (i.e., the error state). The | |
61 | * table is compressed by eliminating all the unpopulated (i.e., zero) | |
62 | * cells. Multiple conceptual rows can then be doubled up in a single | |
63 | * physical row by sliding them up and possibly shifting them to one | |
64 | * side or the other so the populated cells don't collide. Indexes | |
65 | * are used to identify unpopulated cells and to locate populated cells. | |
66 | */ | |
67 | int16_t* table; | |
68 | ||
69 | /** | |
70 | * This index maps logical row numbers to physical row numbers | |
71 | */ | |
72 | int16_t* rowIndex; | |
73 | ||
74 | /** | |
75 | * A bitmap is used to tell which cells in the comceptual table are | |
76 | * populated. This array contains all the unique bit combinations | |
77 | * in that bitmap. If the table is more than 32 columns wide, | |
78 | * successive entries in this array are used for a single row. | |
79 | */ | |
80 | int32_t* rowIndexFlags; | |
81 | ||
82 | /** | |
83 | * This index maps from a logical row number into the bitmap table above. | |
84 | * (This keeps us from storing duplicate bitmap combinations.) Since there | |
85 | * are a lot of rows with only one populated cell, instead of wasting space | |
86 | * in the bitmap table, we just store a negative number in this index for | |
87 | * rows with one populated cell. The absolute value of that number is | |
88 | * the column number of the populated cell. | |
89 | */ | |
90 | int16_t* rowIndexFlagsIndex; | |
91 | ||
92 | /** | |
93 | * For each logical row, this index contains a constant that is added to | |
94 | * the logical column number to get the physical column number | |
95 | */ | |
96 | int8_t* rowIndexShifts; | |
97 | ||
98 | //================================================================================= | |
99 | // deserialization | |
100 | //================================================================================= | |
101 | ||
102 | public: | |
103 | /** | |
104 | * Constructor. Creates the BreakDictionary by using readDictionaryFile() to | |
105 | * load the dictionary tables from the disk. | |
106 | * @param dictionaryFilename The name of the dictionary file | |
107 | * @param status for errors if it occurs | |
108 | */ | |
109 | BreakDictionary(const char* dictionaryFilename, UErrorCode& status); | |
110 | ||
111 | /** | |
112 | * Destructor. | |
113 | */ | |
114 | ~BreakDictionary(); | |
115 | ||
116 | /** | |
117 | * Reads the dictionary file on the disk and constructs the appropriate in-memory | |
118 | * representation. | |
119 | * @param in The given memory stream | |
120 | */ | |
121 | void readDictionaryFile(UMemoryStream* in); | |
122 | ||
123 | //================================================================================= | |
124 | // access to the words | |
125 | //================================================================================= | |
126 | ||
127 | /** | |
128 | * Uses the column map to map the character to a column number, then | |
129 | * passes the row and column number to the other version of at() | |
130 | * @param row The current state | |
131 | * @param ch The character whose column we're interested in | |
132 | * @return The new state to transition to | |
133 | */ | |
134 | int16_t at(int32_t row, UChar ch) const; | |
135 | ||
136 | /** | |
137 | * Returns the value in the cell with the specified (logical) row and | |
138 | * column numbers. In DictionaryBasedBreakIterator, the row number is | |
139 | * a state number, the column number is an input, and the return value | |
140 | * is the row number of the new state to transition to. (0 is the | |
141 | * "error" state, and -1 is the "end of word" state in a dictionary) | |
142 | * @param row The row number of the current state | |
143 | * @param col The column number of the input character (0 means "not a | |
144 | * dictionary character") | |
145 | * @return The row number of the new state to transition to | |
146 | */ | |
147 | int16_t at(int32_t row, int32_t col) const; | |
148 | ||
149 | private: | |
150 | /** | |
151 | * Given (logical) row and column numbers, returns true if the | |
152 | * cell in that position is populated | |
153 | * @param row The LOGICAL row number of the cell | |
154 | * @param col The PHYSICAL row number of the cell | |
155 | * @return true if the cell in that position is populated | |
156 | */ | |
157 | UBool cellIsPopulated(int32_t row, int32_t col) const; | |
158 | ||
159 | /** | |
160 | * Implementation of at() when we know the specified cell is populated. | |
161 | * @param row The PHYSICAL row number of the cell | |
162 | * @param col The PHYSICAL column number of the cell | |
163 | * @return The value stored in the cell | |
164 | */ | |
165 | int16_t internalAt(int32_t row, int32_t col) const; | |
166 | ||
167 | // the following methods are never meant to be called and so are not defined | |
168 | // (if you don't declare them, you get default implementations) | |
169 | BreakDictionary(const BreakDictionary& that); | |
170 | BreakDictionary& operator=(const BreakDictionary& that); | |
171 | }; | |
172 | ||
173 | U_NAMESPACE_END | |
174 | ||
175 | #endif |