[apple/icu.git] / icuSources / common / brkdict.h

/*
**********************************************************************
*   Copyright (C) 1999-2000 IBM and others. All rights reserved.
**********************************************************************
*   Date        Name        Description
*   12/1/99     rtg         Ported from Java
*   01/13/2000  helena      Added UErrorCode to ctors.
**********************************************************************
*/

#ifndef BRKDICT_H
#define BRKDICT_H

#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "ucmp8.h"
#include "umemstrm.h"

U_NAMESPACE_BEGIN

/**
 * This is the class that represents the list of known words used by
 * DictionaryBasedBreakIterator.  The conceptual data structure used
 * here is a trie: there is a node hanging off the root node for every
 * letter that can start a word.  Each of these nodes has a node hanging
 * off of it for every letter that can be the second letter of a word
 * if this node is the first letter, and so on.  The trie is represented
 * as a two-dimensional array that can be treated as a table of state
 * transitions.  Indexes are used to compress this array, taking
 * advantage of the fact that this array will always be very sparse.
 */
class BreakDictionary : public UMemory {
    //=================================================================================
    // data members
    //=================================================================================
private:

    /**
     * Maps from characters to column numbers.  The main use of this is to
     * avoid making room in the array for empty columns.
     */
    CompactByteArray* columnMap;

    /**
     * The number of actual columns in the table
     */
    int32_t numCols;

    /**
     * Columns are organized into groups of 32.  This says how many
     * column groups.  (We could calculate this, but we store the
     * value to avoid having to repeatedly calculate it.)
     */
    int32_t numColGroups;

    /**
     * The actual compressed state table.  Each conceptual row represents
     * a state, and the cells in it contain the row numbers of the states
     * to transition to for each possible letter.  0 is used to indicate
     * an illegal combination of letters (i.e., the error state).  The
     * table is compressed by eliminating all the unpopulated (i.e., zero)
     * cells.  Multiple conceptual rows can then be doubled up in a single
     * physical row by sliding them up and possibly shifting them to one
     * side or the other so the populated cells don't collide.  Indexes
     * are used to identify unpopulated cells and to locate populated cells.
     */
    int16_t* table;

    /**
     * This index maps logical row numbers to physical row numbers
     */
    int16_t* rowIndex;

    /**
     * A bitmap is used to tell which cells in the comceptual table are
     * populated.  This array contains all the unique bit combinations
     * in that bitmap.  If the table is more than 32 columns wide,
     * successive entries in this array are used for a single row.
     */
    int32_t* rowIndexFlags;

    /**
     * This index maps from a logical row number into the bitmap table above.
     * (This keeps us from storing duplicate bitmap combinations.)  Since there
     * are a lot of rows with only one populated cell, instead of wasting space
     * in the bitmap table, we just store a negative number in this index for
     * rows with one populated cell.  The absolute value of that number is
     * the column number of the populated cell.
     */
    int16_t* rowIndexFlagsIndex;

    /**
     * For each logical row, this index contains a constant that is added to
     * the logical column number to get the physical column number
     */
    int8_t* rowIndexShifts;

    //=================================================================================
    // deserialization
    //=================================================================================

public:
    /**
     * Constructor.  Creates the BreakDictionary by using readDictionaryFile() to
     * load the dictionary tables from the disk.
     * @param dictionaryFilename The name of the dictionary file
     * @param status for errors if it occurs
     */
    BreakDictionary(const char* dictionaryFilename, UErrorCode& status);

    /**
     * Destructor.
     */
    ~BreakDictionary();

    /**
     * Reads the dictionary file on the disk and constructs the appropriate in-memory
     * representation.
     * @param in The given memory stream
     */
    void readDictionaryFile(UMemoryStream* in);

    //=================================================================================
    // access to the words
    //=================================================================================

    /**
     * Uses the column map to map the character to a column number, then
     * passes the row and column number to the other version of at()
     * @param row The current state
     * @param ch The character whose column we're interested in
     * @return The new state to transition to
     */
    int16_t at(int32_t row, UChar ch) const;

    /**
     * Returns the value in the cell with the specified (logical) row and
     * column numbers.  In DictionaryBasedBreakIterator, the row number is
     * a state number, the column number is an input, and the return value
     * is the row number of the new state to transition to.  (0 is the
     * "error" state, and -1 is the "end of word" state in a dictionary)
     * @param row The row number of the current state
     * @param col The column number of the input character (0 means "not a
     * dictionary character")
     * @return The row number of the new state to transition to
     */
    int16_t at(int32_t row, int32_t col) const;

private:
    /**
     * Given (logical) row and column numbers, returns true if the
     * cell in that position is populated
     * @param row The LOGICAL row number of the cell
     * @param col The PHYSICAL row number of the cell
     * @return true if the cell in that position is populated
     */
    UBool cellIsPopulated(int32_t row, int32_t col) const;

    /**
     * Implementation of at() when we know the specified cell is populated.
     * @param row The PHYSICAL row number of the cell
     * @param col The PHYSICAL column number of the cell
     * @return The value stored in the cell
     */
    int16_t internalAt(int32_t row, int32_t col) const;

    // the following methods are never meant to be called and so are not defined
    // (if you don't declare them, you get default implementations)
    BreakDictionary(const BreakDictionary& that);
    BreakDictionary& operator=(const BreakDictionary& that);
};

U_NAMESPACE_END

#endif
Commit	Line	Data
b75a7d8f A	1	/*
	2	**********************************************************************
	3	* Copyright (C) 1999-2000 IBM and others. All rights reserved.
	4	**********************************************************************
	5	* Date Name Description
	6	* 12/1/99 rtg Ported from Java
	7	* 01/13/2000 helena Added UErrorCode to ctors.
	8	**********************************************************************
	9	*/
	10
	11	#ifndef BRKDICT_H
	12	#define BRKDICT_H
	13
	14	#include "unicode/utypes.h"
	15	#include "unicode/uobject.h"
	16	#include "ucmp8.h"
	17	#include "umemstrm.h"
	18
	19	U_NAMESPACE_BEGIN
	20
	21	/**
	22	* This is the class that represents the list of known words used by
	23	* DictionaryBasedBreakIterator. The conceptual data structure used
	24	* here is a trie: there is a node hanging off the root node for every
	25	* letter that can start a word. Each of these nodes has a node hanging
	26	* off of it for every letter that can be the second letter of a word
	27	* if this node is the first letter, and so on. The trie is represented
	28	* as a two-dimensional array that can be treated as a table of state
	29	* transitions. Indexes are used to compress this array, taking
	30	* advantage of the fact that this array will always be very sparse.
	31	*/
	32	class BreakDictionary : public UMemory {
	33	//=================================================================================
	34	// data members
	35	//=================================================================================
	36	private:
	37
	38	/**
	39	* Maps from characters to column numbers. The main use of this is to
	40	* avoid making room in the array for empty columns.
	41	*/
	42	CompactByteArray* columnMap;
	43
	44	/**
	45	* The number of actual columns in the table
	46	*/
	47	int32_t numCols;
	48
	49	/**
	50	* Columns are organized into groups of 32. This says how many
	51	* column groups. (We could calculate this, but we store the
	52	* value to avoid having to repeatedly calculate it.)
	53	*/
	54	int32_t numColGroups;
	55
	56	/**
	57	* The actual compressed state table. Each conceptual row represents
	58	* a state, and the cells in it contain the row numbers of the states
	59	* to transition to for each possible letter. 0 is used to indicate
	60	* an illegal combination of letters (i.e., the error state). The
	61	* table is compressed by eliminating all the unpopulated (i.e., zero)
	62	* cells. Multiple conceptual rows can then be doubled up in a single
	63	* physical row by sliding them up and possibly shifting them to one
	64	* side or the other so the populated cells don't collide. Indexes
65	* are used to identify unpopulated cells and to locate populated cells.
66	*/
67	int16_t* table;
68
69	/**
70	* This index maps logical row numbers to physical row numbers
71	*/
72	int16_t* rowIndex;
73
74	/**
75	* A bitmap is used to tell which cells in the comceptual table are
76	* populated. This array contains all the unique bit combinations
77	* in that bitmap. If the table is more than 32 columns wide,
78	* successive entries in this array are used for a single row.
79	*/
80	int32_t* rowIndexFlags;
81
82	/**
83	* This index maps from a logical row number into the bitmap table above.
84	* (This keeps us from storing duplicate bitmap combinations.) Since there
85	* are a lot of rows with only one populated cell, instead of wasting space
86	* in the bitmap table, we just store a negative number in this index for
87	* rows with one populated cell. The absolute value of that number is
88	* the column number of the populated cell.
89	*/
90	int16_t* rowIndexFlagsIndex;
91
92	/**
93	* For each logical row, this index contains a constant that is added to
94	* the logical column number to get the physical column number
95	*/
96	int8_t* rowIndexShifts;
97
98	//=================================================================================
99	// deserialization
100	//=================================================================================
101
102	public:
103	/**
104	* Constructor. Creates the BreakDictionary by using readDictionaryFile() to
105	* load the dictionary tables from the disk.
106	* @param dictionaryFilename The name of the dictionary file
107	* @param status for errors if it occurs
108	*/
109	BreakDictionary(const char* dictionaryFilename, UErrorCode& status);
110
111	/**
112	* Destructor.
113	*/
114	~BreakDictionary();
115
116	/**
117	* Reads the dictionary file on the disk and constructs the appropriate in-memory
118	* representation.
119	* @param in The given memory stream
120	*/
121	void readDictionaryFile(UMemoryStream* in);
122
123	//=================================================================================
124	// access to the words
125	//=================================================================================
126
127	/**
128	* Uses the column map to map the character to a column number, then
129	* passes the row and column number to the other version of at()
130	* @param row The current state
131	* @param ch The character whose column we're interested in
132	* @return The new state to transition to
133	*/
134	int16_t at(int32_t row, UChar ch) const;
135
136	/**
137	* Returns the value in the cell with the specified (logical) row and
138	* column numbers. In DictionaryBasedBreakIterator, the row number is
139	* a state number, the column number is an input, and the return value
140	* is the row number of the new state to transition to. (0 is the
141	* "error" state, and -1 is the "end of word" state in a dictionary)
142	* @param row The row number of the current state
143	* @param col The column number of the input character (0 means "not a
144	* dictionary character")
145	* @return The row number of the new state to transition to
146	*/
147	int16_t at(int32_t row, int32_t col) const;
148
149	private:
150	/**
151	* Given (logical) row and column numbers, returns true if the
152	* cell in that position is populated
153	* @param row The LOGICAL row number of the cell
154	* @param col The PHYSICAL row number of the cell
155	* @return true if the cell in that position is populated
156	*/
157	UBool cellIsPopulated(int32_t row, int32_t col) const;
158
159	/**
160	* Implementation of at() when we know the specified cell is populated.
161	* @param row The PHYSICAL row number of the cell
162	* @param col The PHYSICAL column number of the cell
163	* @return The value stored in the cell
164	*/
165	int16_t internalAt(int32_t row, int32_t col) const;
166
167	// the following methods are never meant to be called and so are not defined
168	// (if you don't declare them, you get default implementations)
169	BreakDictionary(const BreakDictionary& that);
170	BreakDictionary& operator=(const BreakDictionary& that);
171	};
172
173	U_NAMESPACE_END
174
175	#endif