]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/brkdict.h
ICU-3.13.tar.gz
[apple/icu.git] / icuSources / common / brkdict.h
CommitLineData
b75a7d8f
A
1/*
2**********************************************************************
3* Copyright (C) 1999-2000 IBM and others. All rights reserved.
4**********************************************************************
5* Date Name Description
6* 12/1/99 rtg Ported from Java
7* 01/13/2000 helena Added UErrorCode to ctors.
8**********************************************************************
9*/
10
11#ifndef BRKDICT_H
12#define BRKDICT_H
13
14#include "unicode/utypes.h"
15#include "unicode/uobject.h"
16#include "ucmp8.h"
17#include "umemstrm.h"
18
19U_NAMESPACE_BEGIN
20
21/**
22 * This is the class that represents the list of known words used by
23 * DictionaryBasedBreakIterator. The conceptual data structure used
24 * here is a trie: there is a node hanging off the root node for every
25 * letter that can start a word. Each of these nodes has a node hanging
26 * off of it for every letter that can be the second letter of a word
27 * if this node is the first letter, and so on. The trie is represented
28 * as a two-dimensional array that can be treated as a table of state
29 * transitions. Indexes are used to compress this array, taking
30 * advantage of the fact that this array will always be very sparse.
31 */
32class BreakDictionary : public UMemory {
33 //=================================================================================
34 // data members
35 //=================================================================================
36private:
37
38 /**
39 * Maps from characters to column numbers. The main use of this is to
40 * avoid making room in the array for empty columns.
41 */
42 CompactByteArray* columnMap;
43
44 /**
45 * The number of actual columns in the table
46 */
47 int32_t numCols;
48
49 /**
50 * Columns are organized into groups of 32. This says how many
51 * column groups. (We could calculate this, but we store the
52 * value to avoid having to repeatedly calculate it.)
53 */
54 int32_t numColGroups;
55
56 /**
57 * The actual compressed state table. Each conceptual row represents
58 * a state, and the cells in it contain the row numbers of the states
59 * to transition to for each possible letter. 0 is used to indicate
60 * an illegal combination of letters (i.e., the error state). The
61 * table is compressed by eliminating all the unpopulated (i.e., zero)
62 * cells. Multiple conceptual rows can then be doubled up in a single
63 * physical row by sliding them up and possibly shifting them to one
64 * side or the other so the populated cells don't collide. Indexes
65 * are used to identify unpopulated cells and to locate populated cells.
66 */
67 int16_t* table;
68
69 /**
70 * This index maps logical row numbers to physical row numbers
71 */
72 int16_t* rowIndex;
73
74 /**
75 * A bitmap is used to tell which cells in the comceptual table are
76 * populated. This array contains all the unique bit combinations
77 * in that bitmap. If the table is more than 32 columns wide,
78 * successive entries in this array are used for a single row.
79 */
80 int32_t* rowIndexFlags;
81
82 /**
83 * This index maps from a logical row number into the bitmap table above.
84 * (This keeps us from storing duplicate bitmap combinations.) Since there
85 * are a lot of rows with only one populated cell, instead of wasting space
86 * in the bitmap table, we just store a negative number in this index for
87 * rows with one populated cell. The absolute value of that number is
88 * the column number of the populated cell.
89 */
90 int16_t* rowIndexFlagsIndex;
91
92 /**
93 * For each logical row, this index contains a constant that is added to
94 * the logical column number to get the physical column number
95 */
96 int8_t* rowIndexShifts;
97
98 //=================================================================================
99 // deserialization
100 //=================================================================================
101
102public:
103 /**
104 * Constructor. Creates the BreakDictionary by using readDictionaryFile() to
105 * load the dictionary tables from the disk.
106 * @param dictionaryFilename The name of the dictionary file
107 * @param status for errors if it occurs
108 */
109 BreakDictionary(const char* dictionaryFilename, UErrorCode& status);
110
111 /**
112 * Destructor.
113 */
114 ~BreakDictionary();
115
116 /**
117 * Reads the dictionary file on the disk and constructs the appropriate in-memory
118 * representation.
119 * @param in The given memory stream
120 */
121 void readDictionaryFile(UMemoryStream* in);
122
123 //=================================================================================
124 // access to the words
125 //=================================================================================
126
127 /**
128 * Uses the column map to map the character to a column number, then
129 * passes the row and column number to the other version of at()
130 * @param row The current state
131 * @param ch The character whose column we're interested in
132 * @return The new state to transition to
133 */
134 int16_t at(int32_t row, UChar ch) const;
135
136 /**
137 * Returns the value in the cell with the specified (logical) row and
138 * column numbers. In DictionaryBasedBreakIterator, the row number is
139 * a state number, the column number is an input, and the return value
140 * is the row number of the new state to transition to. (0 is the
141 * "error" state, and -1 is the "end of word" state in a dictionary)
142 * @param row The row number of the current state
143 * @param col The column number of the input character (0 means "not a
144 * dictionary character")
145 * @return The row number of the new state to transition to
146 */
147 int16_t at(int32_t row, int32_t col) const;
148
149private:
150 /**
151 * Given (logical) row and column numbers, returns true if the
152 * cell in that position is populated
153 * @param row The LOGICAL row number of the cell
154 * @param col The PHYSICAL row number of the cell
155 * @return true if the cell in that position is populated
156 */
157 UBool cellIsPopulated(int32_t row, int32_t col) const;
158
159 /**
160 * Implementation of at() when we know the specified cell is populated.
161 * @param row The PHYSICAL row number of the cell
162 * @param col The PHYSICAL column number of the cell
163 * @return The value stored in the cell
164 */
165 int16_t internalAt(int32_t row, int32_t col) const;
166
167 // the following methods are never meant to be called and so are not defined
168 // (if you don't declare them, you get default implementations)
169 BreakDictionary(const BreakDictionary& that);
170 BreakDictionary& operator=(const BreakDictionary& that);
171};
172
173U_NAMESPACE_END
174
175#endif