[apple/icu.git] / icuSources / common / brkdict.h

/*
**********************************************************************
*   Copyright (C) 1999-2004 IBM and others. All rights reserved.
**********************************************************************
*   Date        Name        Description
*   12/1/99     rtg         Ported from Java
*   01/13/2000  helena      Added UErrorCode to ctors.
**********************************************************************
*/

#ifndef BRKDICT_H
#define BRKDICT_H

#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "ucmp8.h"

U_NAMESPACE_BEGIN

/**
 * This is the class that represents the list of known words used by
 * DictionaryBasedBreakIterator.  The conceptual data structure used
 * here is a trie: there is a node hanging off the root node for every
 * letter that can start a word.  Each of these nodes has a node hanging
 * off of it for every letter that can be the second letter of a word
 * if this node is the first letter, and so on.  The trie is represented
 * as a two-dimensional array that can be treated as a table of state
 * transitions.  Indexes are used to compress this array, taking
 * advantage of the fact that this array will always be very sparse.
 */
class BreakDictionary : public UMemory {
    //=================================================================================
    // data members
    //=================================================================================
private:

    /**
     * Maps from characters to column numbers.  The main use of this is to
     * avoid making room in the array for empty columns.
     */
    CompactByteArray* columnMap;

    /**
     * The number of actual columns in the table
     */
    int32_t numCols;

    /**
     * Columns are organized into groups of 32.  This says how many
     * column groups.  (We could calculate this, but we store the
     * value to avoid having to repeatedly calculate it.)
     */
    int32_t numColGroups;

    /**
     * The actual compressed state table.  Each conceptual row represents
     * a state, and the cells in it contain the row numbers of the states
     * to transition to for each possible letter.  0 is used to indicate
     * an illegal combination of letters (i.e., the error state).  The
     * table is compressed by eliminating all the unpopulated (i.e., zero)
     * cells.  Multiple conceptual rows can then be doubled up in a single
     * physical row by sliding them up and possibly shifting them to one
     * side or the other so the populated cells don't collide.  Indexes
     * are used to identify unpopulated cells and to locate populated cells.
     */
    int16_t* table;

    /**
     * This index maps logical row numbers to physical row numbers
     */
    int16_t* rowIndex;

    /**
     * A bitmap is used to tell which cells in the comceptual table are
     * populated.  This array contains all the unique bit combinations
     * in that bitmap.  If the table is more than 32 columns wide,
     * successive entries in this array are used for a single row.
     */
    int32_t* rowIndexFlags;

    /**
     * This index maps from a logical row number into the bitmap table above.
     * (This keeps us from storing duplicate bitmap combinations.)  Since there
     * are a lot of rows with only one populated cell, instead of wasting space
     * in the bitmap table, we just store a negative number in this index for
     * rows with one populated cell.  The absolute value of that number is
     * the column number of the populated cell.
     */
    int16_t* rowIndexFlagsIndex;

    /**
     * For each logical row, this index contains a constant that is added to
     * the logical column number to get the physical column number
     */
    int8_t* rowIndexShifts;

    //=================================================================================
    // deserialization
    //=================================================================================

public:
    /**
     * Constructor.  Creates the BreakDictionary by using readDictionaryFile() to
     * load the dictionary tables from the disk.
     * @param dictionaryFilename The name of the dictionary file
     * @param status for errors if it occurs
     */
    BreakDictionary(const char* dictionaryFilename, UErrorCode& status);

    /**
     * Destructor.
     */
    ~BreakDictionary();

    /**
     * Reads the dictionary file on the disk and constructs the appropriate in-memory
     * representation.
     * @param in The given memory stream
     */
    void readDictionaryFile(const uint8_t * in);

    //=================================================================================
    // access to the words
    //=================================================================================

    /**
     * Uses the column map to map the character to a column number, then
     * passes the row and column number to the other version of at()
     * @param row The current state
     * @param ch The character whose column we're interested in
     * @return The new state to transition to
     */
    int16_t at(int32_t row, UChar ch) const;

    /**
     * Returns the value in the cell with the specified (logical) row and
     * column numbers.  In DictionaryBasedBreakIterator, the row number is
     * a state number, the column number is an input, and the return value
     * is the row number of the new state to transition to.  (0 is the
     * "error" state, and -1 is the "end of word" state in a dictionary)
     * @param row The row number of the current state
     * @param col The column number of the input character (0 means "not a
     * dictionary character")
     * @return The row number of the new state to transition to
     */
    int16_t at(int32_t row, int32_t col) const;

private:
    /**
     * Given (logical) row and column numbers, returns true if the
     * cell in that position is populated
     * @param row The LOGICAL row number of the cell
     * @param col The PHYSICAL row number of the cell
     * @return true if the cell in that position is populated
     */
    UBool cellIsPopulated(int32_t row, int32_t col) const;

    /**
     * Implementation of at() when we know the specified cell is populated.
     * @param row The PHYSICAL row number of the cell
     * @param col The PHYSICAL column number of the cell
     * @return The value stored in the cell
     */
    int16_t internalAt(int32_t row, int32_t col) const;

    // the following methods are never meant to be called and so are not defined
    // (if you don't declare them, you get default implementations)
    BreakDictionary(const BreakDictionary& that);
    BreakDictionary& operator=(const BreakDictionary& that);
};

U_NAMESPACE_END

#endif
Commit	Line	Data
b75a7d8f A	1	/*
b75a7d8f A	2	**********************************************************************
374ca955	3	* Copyright (C) 1999-2004 IBM and others. All rights reserved.
b75a7d8f A	4	**********************************************************************
	5	* Date Name Description
	6	* 12/1/99 rtg Ported from Java
	7	* 01/13/2000 helena Added UErrorCode to ctors.
	8	**********************************************************************
	9	*/
	10
	11	#ifndef BRKDICT_H
	12	#define BRKDICT_H
	13
	14	#include "unicode/utypes.h"
	15	#include "unicode/uobject.h"
	16	#include "ucmp8.h"
b75a7d8f A	17
	18	U_NAMESPACE_BEGIN
	19
	20	/**
	21	* This is the class that represents the list of known words used by
	22	* DictionaryBasedBreakIterator. The conceptual data structure used
	23	* here is a trie: there is a node hanging off the root node for every
	24	* letter that can start a word. Each of these nodes has a node hanging
	25	* off of it for every letter that can be the second letter of a word
	26	* if this node is the first letter, and so on. The trie is represented
	27	* as a two-dimensional array that can be treated as a table of state
	28	* transitions. Indexes are used to compress this array, taking
	29	* advantage of the fact that this array will always be very sparse.
	30	*/
	31	class BreakDictionary : public UMemory {
	32	//=================================================================================
	33	// data members
	34	//=================================================================================
	35	private:
	36
	37	/**
	38	* Maps from characters to column numbers. The main use of this is to
	39	* avoid making room in the array for empty columns.
	40	*/
	41	CompactByteArray* columnMap;
	42
	43	/**
	44	* The number of actual columns in the table
	45	*/
	46	int32_t numCols;
	47
	48	/**
	49	* Columns are organized into groups of 32. This says how many
	50	* column groups. (We could calculate this, but we store the
	51	* value to avoid having to repeatedly calculate it.)
	52	*/
	53	int32_t numColGroups;
	54
	55	/**
	56	* The actual compressed state table. Each conceptual row represents
	57	* a state, and the cells in it contain the row numbers of the states
	58	* to transition to for each possible letter. 0 is used to indicate
	59	* an illegal combination of letters (i.e., the error state). The
	60	* table is compressed by eliminating all the unpopulated (i.e., zero)
	61	* cells. Multiple conceptual rows can then be doubled up in a single
	62	* physical row by sliding them up and possibly shifting them to one
	63	* side or the other so the populated cells don't collide. Indexes
	64	* are used to identify unpopulated cells and to locate populated cells.
	65	*/
	66	int16_t* table;
	67
	68	/**
	69	* This index maps logical row numbers to physical row numbers
	70	*/
	71	int16_t* rowIndex;
	72
	73	/**
	74	* A bitmap is used to tell which cells in the comceptual table are
	75	* populated. This array contains all the unique bit combinations
	76	* in that bitmap. If the table is more than 32 columns wide,
	77	* successive entries in this array are used for a single row.
	78	*/
	79	int32_t* rowIndexFlags;
	80
81	/**
82	* This index maps from a logical row number into the bitmap table above.
83	* (This keeps us from storing duplicate bitmap combinations.) Since there
84	* are a lot of rows with only one populated cell, instead of wasting space
85	* in the bitmap table, we just store a negative number in this index for
86	* rows with one populated cell. The absolute value of that number is
87	* the column number of the populated cell.
88	*/
89	int16_t* rowIndexFlagsIndex;
90
91	/**
92	* For each logical row, this index contains a constant that is added to
93	* the logical column number to get the physical column number
94	*/
95	int8_t* rowIndexShifts;
96
97	//=================================================================================
98	// deserialization
99	//=================================================================================
100
101	public:
102	/**
103	* Constructor. Creates the BreakDictionary by using readDictionaryFile() to
104	* load the dictionary tables from the disk.
105	* @param dictionaryFilename The name of the dictionary file
106	* @param status for errors if it occurs
107	*/
108	BreakDictionary(const char* dictionaryFilename, UErrorCode& status);
109
110	/**
111	* Destructor.
112	*/
113	~BreakDictionary();
114
115	/**
116	* Reads the dictionary file on the disk and constructs the appropriate in-memory
117	* representation.
118	* @param in The given memory stream
119	*/
374ca955	120	void readDictionaryFile(const uint8_t * in);
b75a7d8f A	121
	122	//=================================================================================
	123	// access to the words
	124	//=================================================================================
	125
	126	/**
	127	* Uses the column map to map the character to a column number, then
	128	* passes the row and column number to the other version of at()
	129	* @param row The current state
	130	* @param ch The character whose column we're interested in
	131	* @return The new state to transition to
	132	*/
	133	int16_t at(int32_t row, UChar ch) const;
	134
	135	/**
	136	* Returns the value in the cell with the specified (logical) row and
	137	* column numbers. In DictionaryBasedBreakIterator, the row number is
	138	* a state number, the column number is an input, and the return value
	139	* is the row number of the new state to transition to. (0 is the
	140	* "error" state, and -1 is the "end of word" state in a dictionary)
	141	* @param row The row number of the current state
	142	* @param col The column number of the input character (0 means "not a
	143	* dictionary character")
	144	* @return The row number of the new state to transition to
	145	*/
	146	int16_t at(int32_t row, int32_t col) const;
	147
	148	private:
	149	/**
	150	* Given (logical) row and column numbers, returns true if the
	151	* cell in that position is populated
	152	* @param row The LOGICAL row number of the cell
	153	* @param col The PHYSICAL row number of the cell
	154	* @return true if the cell in that position is populated
	155	*/
	156	UBool cellIsPopulated(int32_t row, int32_t col) const;
	157
	158	/**
	159	* Implementation of at() when we know the specified cell is populated.
	160	* @param row The PHYSICAL row number of the cell
	161	* @param col The PHYSICAL column number of the cell
	162	* @return The value stored in the cell
	163	*/
	164	int16_t internalAt(int32_t row, int32_t col) const;
	165
	166	// the following methods are never meant to be called and so are not defined
	167	// (if you don't declare them, you get default implementations)
	168	BreakDictionary(const BreakDictionary& that);
	169	BreakDictionary& operator=(const BreakDictionary& that);
	170	};
	171
	172	U_NAMESPACE_END
	173
	174	#endif