]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/brkdict.h
ICU-6.2.4.tar.gz
[apple/icu.git] / icuSources / common / brkdict.h
CommitLineData
b75a7d8f
A
1/*
2**********************************************************************
374ca955 3* Copyright (C) 1999-2004 IBM and others. All rights reserved.
b75a7d8f
A
4**********************************************************************
5* Date Name Description
6* 12/1/99 rtg Ported from Java
7* 01/13/2000 helena Added UErrorCode to ctors.
8**********************************************************************
9*/
10
11#ifndef BRKDICT_H
12#define BRKDICT_H
13
14#include "unicode/utypes.h"
15#include "unicode/uobject.h"
16#include "ucmp8.h"
b75a7d8f
A
17
18U_NAMESPACE_BEGIN
19
20/**
21 * This is the class that represents the list of known words used by
22 * DictionaryBasedBreakIterator. The conceptual data structure used
23 * here is a trie: there is a node hanging off the root node for every
24 * letter that can start a word. Each of these nodes has a node hanging
25 * off of it for every letter that can be the second letter of a word
26 * if this node is the first letter, and so on. The trie is represented
27 * as a two-dimensional array that can be treated as a table of state
28 * transitions. Indexes are used to compress this array, taking
29 * advantage of the fact that this array will always be very sparse.
30 */
31class BreakDictionary : public UMemory {
32 //=================================================================================
33 // data members
34 //=================================================================================
35private:
36
37 /**
38 * Maps from characters to column numbers. The main use of this is to
39 * avoid making room in the array for empty columns.
40 */
41 CompactByteArray* columnMap;
42
43 /**
44 * The number of actual columns in the table
45 */
46 int32_t numCols;
47
48 /**
49 * Columns are organized into groups of 32. This says how many
50 * column groups. (We could calculate this, but we store the
51 * value to avoid having to repeatedly calculate it.)
52 */
53 int32_t numColGroups;
54
55 /**
56 * The actual compressed state table. Each conceptual row represents
57 * a state, and the cells in it contain the row numbers of the states
58 * to transition to for each possible letter. 0 is used to indicate
59 * an illegal combination of letters (i.e., the error state). The
60 * table is compressed by eliminating all the unpopulated (i.e., zero)
61 * cells. Multiple conceptual rows can then be doubled up in a single
62 * physical row by sliding them up and possibly shifting them to one
63 * side or the other so the populated cells don't collide. Indexes
64 * are used to identify unpopulated cells and to locate populated cells.
65 */
66 int16_t* table;
67
68 /**
69 * This index maps logical row numbers to physical row numbers
70 */
71 int16_t* rowIndex;
72
73 /**
74 * A bitmap is used to tell which cells in the comceptual table are
75 * populated. This array contains all the unique bit combinations
76 * in that bitmap. If the table is more than 32 columns wide,
77 * successive entries in this array are used for a single row.
78 */
79 int32_t* rowIndexFlags;
80
81 /**
82 * This index maps from a logical row number into the bitmap table above.
83 * (This keeps us from storing duplicate bitmap combinations.) Since there
84 * are a lot of rows with only one populated cell, instead of wasting space
85 * in the bitmap table, we just store a negative number in this index for
86 * rows with one populated cell. The absolute value of that number is
87 * the column number of the populated cell.
88 */
89 int16_t* rowIndexFlagsIndex;
90
91 /**
92 * For each logical row, this index contains a constant that is added to
93 * the logical column number to get the physical column number
94 */
95 int8_t* rowIndexShifts;
96
97 //=================================================================================
98 // deserialization
99 //=================================================================================
100
101public:
102 /**
103 * Constructor. Creates the BreakDictionary by using readDictionaryFile() to
104 * load the dictionary tables from the disk.
105 * @param dictionaryFilename The name of the dictionary file
106 * @param status for errors if it occurs
107 */
108 BreakDictionary(const char* dictionaryFilename, UErrorCode& status);
109
110 /**
111 * Destructor.
112 */
113 ~BreakDictionary();
114
115 /**
116 * Reads the dictionary file on the disk and constructs the appropriate in-memory
117 * representation.
118 * @param in The given memory stream
119 */
374ca955 120 void readDictionaryFile(const uint8_t * in);
b75a7d8f
A
121
122 //=================================================================================
123 // access to the words
124 //=================================================================================
125
126 /**
127 * Uses the column map to map the character to a column number, then
128 * passes the row and column number to the other version of at()
129 * @param row The current state
130 * @param ch The character whose column we're interested in
131 * @return The new state to transition to
132 */
133 int16_t at(int32_t row, UChar ch) const;
134
135 /**
136 * Returns the value in the cell with the specified (logical) row and
137 * column numbers. In DictionaryBasedBreakIterator, the row number is
138 * a state number, the column number is an input, and the return value
139 * is the row number of the new state to transition to. (0 is the
140 * "error" state, and -1 is the "end of word" state in a dictionary)
141 * @param row The row number of the current state
142 * @param col The column number of the input character (0 means "not a
143 * dictionary character")
144 * @return The row number of the new state to transition to
145 */
146 int16_t at(int32_t row, int32_t col) const;
147
148private:
149 /**
150 * Given (logical) row and column numbers, returns true if the
151 * cell in that position is populated
152 * @param row The LOGICAL row number of the cell
153 * @param col The PHYSICAL row number of the cell
154 * @return true if the cell in that position is populated
155 */
156 UBool cellIsPopulated(int32_t row, int32_t col) const;
157
158 /**
159 * Implementation of at() when we know the specified cell is populated.
160 * @param row The PHYSICAL row number of the cell
161 * @param col The PHYSICAL column number of the cell
162 * @return The value stored in the cell
163 */
164 int16_t internalAt(int32_t row, int32_t col) const;
165
166 // the following methods are never meant to be called and so are not defined
167 // (if you don't declare them, you get default implementations)
168 BreakDictionary(const BreakDictionary& that);
169 BreakDictionary& operator=(const BreakDictionary& that);
170};
171
172U_NAMESPACE_END
173
174#endif