icuSources/common/dictionarydata.cpp

   1 /*
   2 *******************************************************************************
   3 * Copyright (C) 2014, International Business Machines
   4 * Corporation and others.  All Rights Reserved.
   5 *******************************************************************************
   6 * dictionarydata.cpp
   7 *
   8 * created on: 2012may31
   9 * created by: Markus W. Scherer & Maxime Serrano
  10 */
  11
  12 #include "dictionarydata.h"
  13 #include "unicode/ucharstrie.h"
  14 #include "unicode/bytestrie.h"
  15 #include "unicode/udata.h"
  16 #include "cmemory.h"
  17
  18 #if !UCONFIG_NO_BREAK_ITERATION
  19
  20 U_NAMESPACE_BEGIN
  21
  22 const int32_t  DictionaryData::TRIE_TYPE_BYTES = 0;
  23 const int32_t  DictionaryData::TRIE_TYPE_UCHARS = 1;
  24 const int32_t  DictionaryData::TRIE_TYPE_MASK = 7;
  25 const int32_t  DictionaryData::TRIE_HAS_VALUES = 8;
  26
  27 const int32_t  DictionaryData::TRANSFORM_NONE = 0;
  28 const int32_t  DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000;
  29 const int32_t  DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000;
  30 const int32_t  DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff;
  31
  32 DictionaryMatcher::~DictionaryMatcher() {
  33 }
  34
  35 UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
  36     udata_close(file);
  37 }
  38
  39 int32_t UCharsDictionaryMatcher::getType() const {
  40     return DictionaryData::TRIE_TYPE_UCHARS;
  41 }
  42
  43 int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const {
  44     UCharsTrie uct(characters);
  45     UChar32 c = utext_next32(text);
  46     if (c < 0) {
  47         return 0;
  48     }
  49     UStringTrieResult result = uct.first(c);
  50     int32_t numChars = 1;
  51     count = 0;
  52     for (;;) {
  53         if (USTRINGTRIE_HAS_VALUE(result)) {
  54             if (count < limit) {
  55                 if (values != NULL) {
  56                     values[count] = uct.getValue();
  57                 }
  58                 lengths[count++] = numChars;
  59             }
  60             if (result == USTRINGTRIE_FINAL_VALUE) {
  61                 break;
  62             }
  63         }
  64         else if (result == USTRINGTRIE_NO_MATCH) {
  65             break;
  66         }
  67
  68         // TODO: why do we have a text limit if the UText knows its length?
  69         if (numChars >= maxLength) {
  70             break;
  71         }
  72
  73         c = utext_next32(text);
  74         if (c < 0) {
  75             break;
  76         }
  77         ++numChars;
  78         result = uct.next(c);
  79     }
  80     return numChars;
  81 }
  82
  83 BytesDictionaryMatcher::~BytesDictionaryMatcher() {
  84     udata_close(file);
  85 }
  86
  87 UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
  88     if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) {
  89         if (c == 0x200D) {
  90             return 0xFF;
  91         } else if (c == 0x200C) {
  92             return 0xFE;
  93         }
  94         int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
  95         if (delta < 0 || 0xFD < delta) {
  96             return U_SENTINEL;
  97         }
  98         return (UChar32)delta;
  99     }
 100     return c;
 101 }
 102
 103 int32_t BytesDictionaryMatcher::getType() const {
 104     return DictionaryData::TRIE_TYPE_BYTES;
 105 }
 106
 107 int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const {
 108     BytesTrie bt(characters);
 109     UChar32 c = utext_next32(text);
 110     if (c < 0) {
 111         return 0;
 112     }
 113     UStringTrieResult result = bt.first(transform(c));
 114     int32_t numChars = 1;
 115     count = 0;
 116     for (;;) {
 117         if (USTRINGTRIE_HAS_VALUE(result)) {
 118             if (count < limit) {
 119                 if (values != NULL) {
 120                     values[count] = bt.getValue();
 121                 }
 122                 lengths[count++] = numChars;
 123             }
 124             if (result == USTRINGTRIE_FINAL_VALUE) {
 125                 break;
 126             }
 127         }
 128         else if (result == USTRINGTRIE_NO_MATCH) {
 129             break;
 130         }
 131
 132         // TODO: why do we have a text limit if the UText knows its length?
 133         if (numChars >= maxLength) {
 134             break;
 135         }
 136
 137         c = utext_next32(text);
 138         if (c < 0) {
 139             break;
 140         }
 141         ++numChars;
 142         result = bt.next(transform(c));
 143     }
 144     return numChars;
 145 }
 146
 147
 148 U_NAMESPACE_END
 149
 150 U_NAMESPACE_USE
 151
 152 U_CAPI int32_t U_EXPORT2
 153 udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
 154            void *outData, UErrorCode *pErrorCode) {
 155     const UDataInfo *pInfo;
 156     int32_t headerSize;
 157     const uint8_t *inBytes;
 158     uint8_t *outBytes;
 159     const int32_t *inIndexes;
 160     int32_t indexes[DictionaryData::IX_COUNT];
 161     int32_t i, offset, size;
 162
 163     headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
 164     if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0;
 165     pInfo = (const UDataInfo *)((const char *)inData + 4);
 166     if (!(pInfo->dataFormat[0] == 0x44 &&
 167           pInfo->dataFormat[1] == 0x69 &&
 168           pInfo->dataFormat[2] == 0x63 &&
 169           pInfo->dataFormat[3] == 0x74 &&
 170           pInfo->formatVersion[0] == 1)) {
 171         udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
 172                          pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
 173         *pErrorCode = U_UNSUPPORTED_ERROR;
 174         return 0;
 175     }
 176
 177     inBytes = (const uint8_t *)inData + headerSize;
 178     outBytes = (uint8_t *)outData + headerSize;
 179
 180     inIndexes = (const int32_t *)inBytes;
 181     if (length >= 0) {
 182         length -= headerSize;
 183         if (length < (int32_t)(sizeof(indexes))) {
 184             udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
 185             *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
 186             return 0;
 187         }
 188     }
 189
 190     for (i = 0; i < DictionaryData::IX_COUNT; i++) {
 191         indexes[i] = udata_readInt32(ds, inIndexes[i]);
 192     }
 193
 194     size = indexes[DictionaryData::IX_TOTAL_SIZE];
 195
 196     if (length >= 0) {
 197         if (length < size) {
 198             udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
 199             *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
 200             return 0;
 201         }
 202
 203         if (inBytes != outBytes) {
 204             uprv_memcpy(outBytes, inBytes, size);
 205         }
 206
 207         offset = 0;
 208         ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);
 209         offset = (int32_t)sizeof(indexes);
 210         int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
 211         int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET];
 212
 213         if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
 214             ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode);
 215         } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
 216             // nothing to do
 217         } else {
 218             udata_printError(ds, "udict_swap(): unknown trie type!\n");
 219             *pErrorCode = U_UNSUPPORTED_ERROR;
 220             return 0;
 221         }
 222
 223         // these next two sections are empty in the current format,
 224         // but may be used later.
 225         offset = nextOffset;
 226         nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET];
 227         offset = nextOffset;
 228         nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE];
 229         offset = nextOffset;
 230     }
 231     return headerSize + size;
 232 }
 233 #endif