icuSources/common/brkdict.cpp

   1 /*
   2 **********************************************************************
   3 *   Copyright (C) 1999-2004 IBM and others. All rights reserved.
   4 **********************************************************************
   5 *   Date        Name        Description
   6 *   12/1/99     rtg         Ported from Java
   7 *   01/13/2000  helena      Added UErrorCode to ctors.
   8 **********************************************************************
   9 */
  10
  11 #include "unicode/utypes.h"
  12
  13 #if !UCONFIG_NO_BREAK_ITERATION
  14
  15 #include "unicode/ures.h"
  16 #include "brkdict.h"
  17 #include "cmemory.h"
  18
  19 U_NAMESPACE_BEGIN
  20
  21 //=================================================================================
  22 // deserialization
  23 //=================================================================================
  24
  25 BreakDictionary::BreakDictionary(const char* /*dictionaryFilename*/, UErrorCode& status)
  26  :  columnMap(NULL),
  27     table(NULL),
  28     rowIndex(NULL),
  29     rowIndexFlags(NULL),
  30     rowIndexFlagsIndex(NULL),
  31     rowIndexShifts(NULL)
  32 {
  33     if (U_FAILURE(status)) return;
  34
  35     UResourceBundle *th_dict = ures_open(NULL, "th", &status);
  36     th_dict = ures_getByKey(th_dict, "BreakDictionaryData", th_dict, &status);
  37     if (U_FAILURE(status)) return;
  38
  39     int32_t len;
  40     const uint8_t * data = ures_getBinary(th_dict, &len, &status);
  41     ures_close(th_dict);
  42     if (U_FAILURE(status)) return;
  43
  44     readDictionaryFile(data);
  45 }
  46
  47 BreakDictionary::~BreakDictionary()
  48 {
  49     ucmp8_close(columnMap);
  50     uprv_free(table);
  51     uprv_free(rowIndex);
  52     uprv_free(rowIndexFlags);
  53     uprv_free(rowIndexFlagsIndex);
  54     uprv_free(rowIndexShifts);
  55 }
  56
  // Macros to support readDictionaryFile.  The data files originated from a Java
  // program, and Java always writes data out in big-endian format.  On
  // little-endian platforms SWAP32/SWAP16 byte-swap their argument in place; on
  // big-endian platforms they do nothing.
  //
  // Every macro expands to exactly one statement, so it behaves like a function
  // call under an unbraced if/else or loop.  The swaps work on an unsigned copy
  // of the value, so a signed (possibly negative) operand is never shifted.

  #if U_IS_BIG_ENDIAN
  #define SWAP32(x) ((void)0)
  #define SWAP16(x) ((void)0)
  #else
  #define SWAP32(x) \
          do { \
              const uint32_t swap32Bits_ = (uint32_t)(x); \
              (x) = ((swap32Bits_ >> 24) & 0xffu) | ((swap32Bits_ >> 8) & 0xff00u) | \
                    ((swap32Bits_ << 8) & 0xff0000u) | ((swap32Bits_ << 24) & 0xff000000u); \
          } while (0)
  #define SWAP16(x) \
          do { \
              const uint16_t swap16Bits_ = (uint16_t)(x); \
              (x) = (uint16_t)(((swap16Bits_ << 8) & 0xff00) | ((swap16Bits_ >> 8) & 0xff)); \
          } while (0)
  #endif

  // Copies len bytes from the read cursor "source" to destAddr, then advances
  // the cursor by len.  Note that len is evaluated twice.
  #define DICTIONARY_READ(source, destAddr, len) \
          do { \
              uprv_memcpy(destAddr, source, (len)); \
              (source) += (len); \
          } while (0)
  72
  73
  74 void
  75 BreakDictionary::readDictionaryFile(const uint8_t * in)
  76 {
  77     int32_t l;
  78     int32_t version;
  79
  80     int i;
  81
  82     // read in the version number (right now we just ignore it)
  83     DICTIONARY_READ(in, &version, 4);
  84
  85     // read in the column map (this is serialized in its internal form:
  86     // an index array followed by a data array)
  87     DICTIONARY_READ(in, &l, 4);
  88     SWAP32(l);
  89     uint16_t* temp = (uint16_t*) uprv_malloc(sizeof(uint16_t)*l);
  90     DICTIONARY_READ(in, temp, l * sizeof (int16_t) );
  91     for (i = 0; i < l; i++) {
  92         SWAP16(temp[i]);
  93     }
  94     DICTIONARY_READ(in, &l, 4);
  95     SWAP32(l);
  96     int8_t* temp2 = (int8_t*) uprv_malloc(sizeof(int8_t)*l);
  97     DICTIONARY_READ(in, temp2, l);
  98     columnMap = ucmp8_openAdopt(temp, temp2, l);
  99
 100     // read in numCols and numColGroups
 101     DICTIONARY_READ(in, &numCols, 4);
 102     SWAP32(numCols);
 103     DICTIONARY_READ(in, &numColGroups, 4);
 104     SWAP32(numColGroups);
 105
 106     // read in the row-number index
 107     DICTIONARY_READ(in, &l, 4);
 108     SWAP32(l);
 109     rowIndex = (int16_t *)uprv_malloc(l*2);
 110     DICTIONARY_READ(in, rowIndex, l * sizeof (int16_t) );
 111     for (i = 0; i < l; i++) {
 112         SWAP16(rowIndex[i]);
 113     }
 114
 115     // load in the populated-cells bitmap: index first, then bitmap list
 116     DICTIONARY_READ(in, &l, 4);
 117     SWAP32(l);
 118     rowIndexFlagsIndex = (int16_t *)uprv_malloc(l*2);
 119     DICTIONARY_READ(in, rowIndexFlagsIndex, l * sizeof(int16_t) );
 120     for (i = 0; i < l; i++) {
 121         SWAP16(rowIndexFlagsIndex[i]);
 122     }
 123     DICTIONARY_READ(in, &l, 4);
 124     SWAP32(l);
 125     rowIndexFlags = (int32_t *)uprv_malloc(l*4);
 126     DICTIONARY_READ(in, rowIndexFlags, l * sizeof(int32_t));
 127     for (i = 0; i < l; i++) {
 128         SWAP32(rowIndexFlags[i]);
 129     }
 130
 131     // load in the row-shift index
 132     DICTIONARY_READ(in, &l, 4);
 133     SWAP32(l);
 134     rowIndexShifts = (int8_t *)uprv_malloc(l);
 135     DICTIONARY_READ(in, rowIndexShifts, l);
 136
 137     // finally, load in the actual state table
 138     DICTIONARY_READ(in, &l, 4);
 139     SWAP32(l);
 140     table = (int16_t *)uprv_malloc(l*2);
 141     DICTIONARY_READ(in, table, l * sizeof(int16_t) );
 142     for (i = 0; i < l; i++) {
 143         SWAP16(table[i]);
 144     }
 145
 146     // the reverse column map occurs next in the file.  In the C/C++ code, for the
 147     // time being, we're not going to worry about that.
 148 }
 149
 150 //=================================================================================
 151 // access to the words
 152 //=================================================================================
 153
 154 /**
 155  * Uses the column map to map the character to a column number, then
 156  * passes the row and column number to the other version of at()
 157  * @param row The current state
 158  * @param ch The character whose column we're interested in
 159  * @return The new state to transition to
 160  */
 161 int16_t
 162 BreakDictionary::at(int32_t row, UChar ch) const
 163 {
 164     int16_t col = ucmp8_get(columnMap, ch);
 165     return at(row, (int32_t)col);
 166 }
 167
 168 /**
 169  * Returns the value in the cell with the specified (logical) row and
 170  * column numbers.  In DictionaryBasedBreakIterator, the row number is
 171  * a state number, the column number is an input, and the return value
 172  * is the row number of the new state to transition to.  (0 is the
 173  * "error" state, and -1 is the "end of word" state in a dictionary)
 174  * @param row The row number of the current state
 175  * @param col The column number of the input character (0 means "not a
 176  * dictionary character")
 177  * @return The row number of the new state to transition to
 178  */
 179 int16_t
 180 BreakDictionary::at(int32_t row, int32_t col) const
 181 {
 182     if (cellIsPopulated(row, col)) {
 183         // we map from logical to physical row number by looking up the
 184         // mapping in rowIndex; we map from logical column number to
 185         // physical column number by looking up a shift value for this
 186         // logical row and offsetting the logical column number by
 187         // the shift amount.  Then we can use internalAt() to actually
 188         // get the value out of the table.
 189         return internalAt(rowIndex[row], col + rowIndexShifts[row]);
 190     }
 191     else {
 192         return 0;
 193     }
 194 }
 195
 196 //=================================================================================
 197 // implementation
 198 //=================================================================================
 199 /**
 200  * Given (logical) row and column numbers, returns true if the
 201  * cell in that position is populated
 202  */
 203 UBool
 204 BreakDictionary::cellIsPopulated(int32_t row, int32_t col) const
 205 {
 206     // look up the entry in the bitmap index for the specified row.
 207     // If it's a negative number, it's the column number of the only
 208     // populated cell in the row
 209     if (rowIndexFlagsIndex[row] < 0) {
 210         return col == -rowIndexFlagsIndex[row];
 211     }
 212
 213     // if it's a positive number, it's the offset of an entry in the bitmap
 214     // list.  If the table is more than 32 columns wide, the bitmap is stored
 215     // successive entries in the bitmap list, so we have to divide the column
 216     // number by 32 and offset the number we got out of the index by the result.
 217     // Once we have the appropriate piece of the bitmap, test the appropriate
 218     // bit and return the result.
 219     else {
 220         int32_t flags = rowIndexFlags[rowIndexFlagsIndex[row] + (col >> 5)];
 221         return (flags & (1 << (col & 0x1f))) != 0;
 222     }
 223 }
 224
 225 /**
 226  * Implementation of at() when we know the specified cell is populated.
 227  * @param row The PHYSICAL row number of the cell
 228  * @param col The PHYSICAL column number of the cell
 229  * @return The value stored in the cell
 230  */
 231 int16_t
 232 BreakDictionary::internalAt(int32_t row, int32_t col) const
 233 {
 234     // the table is a one-dimensional array, so this just does the math necessary
 235     // to treat it as a two-dimensional array (we don't just use a two-dimensional
 236     // array because two-dimensional arrays are inefficient in Java)
 237     return table[row * numCols + col];
 238 }
 239
 240 U_NAMESPACE_END
 241
 242 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */