icuSources/common/dictionarydata.cpp

   1 /*
   2 *******************************************************************************
   3 * Copyright (C) 2012, International Business Machines
   4 * Corporation and others.  All Rights Reserved.
   5 *******************************************************************************
   6 * dictionarydata.h
   7 *
   8 * created on: 2012may31
   9 * created by: Markus W. Scherer & Maxime Serrano
  10 */
  11
  12 #include "dictionarydata.h"
  13 #include "unicode/ucharstrie.h"
  14 #include "unicode/bytestrie.h"
  15 #include "unicode/udata.h"
  16 #include "cmemory.h"
  17
  18 #if !UCONFIG_NO_BREAK_ITERATION
  19
  20 U_NAMESPACE_BEGIN
  21
  22 #ifndef CYGWINMSVC /* These definitions cause a symbol-redefinition error on Cygwin/MSVC, so they are skipped there. */
  23 const int32_t DictionaryData::TRIE_TYPE_BYTES;
  24 const int32_t DictionaryData::TRIE_TYPE_UCHARS;
  25 #endif
  26
  27 DictionaryMatcher::~DictionaryMatcher() {
  28 }
  29
  30 UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
  31     udata_close(file);
  32 }
  33
  34 int32_t UCharsDictionaryMatcher::getType() const {
  35     return DictionaryData::TRIE_TYPE_UCHARS;
  36 }
  37
  38 int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const {
  39     UCharsTrie uct(characters);
  40     UChar32 c = utext_next32(text);
  41     if (c < 0) {
  42         return 0;
  43     }
  44     UStringTrieResult result = uct.first(c);
  45     int32_t numChars = 1;
  46     count = 0;
  47     for (;;) {
  48         if (USTRINGTRIE_HAS_VALUE(result)) {
  49             if (count < limit) {
  50                 if (values != NULL) {
  51                     values[count] = uct.getValue();
  52                 }
  53                 lengths[count++] = numChars;
  54             }
  55             if (result == USTRINGTRIE_FINAL_VALUE) {
  56                 break;
  57             }
  58         }
  59         else if (result == USTRINGTRIE_NO_MATCH) {
  60             break;
  61         }
  62
  63         // TODO: why do we have a text limit if the UText knows its length?
  64         if (numChars >= maxLength) {
  65             break;
  66         }
  67
  68         c = utext_next32(text);
  69         if (c < 0) {
  70             break;
  71         }
  72         ++numChars;
  73         result = uct.next(c);
  74     }
  75     return numChars;
  76 }
  77
  78 BytesDictionaryMatcher::~BytesDictionaryMatcher() {
  79     udata_close(file);
  80 }
  81
  82 UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
  83     if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) {
  84         if (c == 0x200D) {
  85             return 0xFF;
  86         } else if (c == 0x200C) {
  87             return 0xFE;
  88         }
  89         int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
  90         if (delta < 0 || 0xFD < delta) {
  91             return U_SENTINEL;
  92         }
  93         return (UChar32)delta;
  94     }
  95     return c;
  96 }
  97
  98 int32_t BytesDictionaryMatcher::getType() const {
  99     return DictionaryData::TRIE_TYPE_BYTES;
 100 }
 101
 102 int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const {
 103     BytesTrie bt(characters);
 104     UChar32 c = utext_next32(text);
 105     if (c < 0) {
 106         return 0;
 107     }
 108     UStringTrieResult result = bt.first(transform(c));
 109     int32_t numChars = 1;
 110     count = 0;
 111     for (;;) {
 112         if (USTRINGTRIE_HAS_VALUE(result)) {
 113             if (count < limit) {
 114                 if (values != NULL) {
 115                     values[count] = bt.getValue();
 116             }
 117                 lengths[count++] = numChars;
 118             }
 119             if (result == USTRINGTRIE_FINAL_VALUE) {
 120                 break;
 121             }
 122         }
 123         else if (result == USTRINGTRIE_NO_MATCH) {
 124             break;
 125         }
 126
 127         // TODO: why do we have a text limit if the UText knows its length?
 128         if (numChars >= maxLength) {
 129             break;
 130         }
 131
 132         c = utext_next32(text);
 133         if (c < 0) {
 134             break;
 135         }
 136         ++numChars;
 137         result = bt.next(transform(c));
 138     }
 139     return numChars;
 140 }
 141
 142
 143 U_NAMESPACE_END
 144
 145 U_NAMESPACE_USE
 146
 147 U_CAPI int32_t U_EXPORT2
 148 udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
 149            void *outData, UErrorCode *pErrorCode) {
 150     const UDataInfo *pInfo;
 151     int32_t headerSize;
 152     const uint8_t *inBytes;
 153     uint8_t *outBytes;
 154     const int32_t *inIndexes;
 155     int32_t indexes[DictionaryData::IX_COUNT];
 156     int32_t i, offset, size;
 157
 158     headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
 159     if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0;
 160     pInfo = (const UDataInfo *)((const char *)inData + 4);
 161     if (!(pInfo->dataFormat[0] == 0x44 &&
 162           pInfo->dataFormat[1] == 0x69 &&
 163           pInfo->dataFormat[2] == 0x63 &&
 164           pInfo->dataFormat[3] == 0x74 &&
 165           pInfo->formatVersion[0] == 1)) {
 166         udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
 167                          pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
 168         *pErrorCode = U_UNSUPPORTED_ERROR;
 169         return 0;
 170     }
 171
 172     inBytes = (const uint8_t *)inData + headerSize;
 173     outBytes = (uint8_t *)outData + headerSize;
 174
 175     inIndexes = (const int32_t *)inBytes;
 176     if (length >= 0) {
 177         length -= headerSize;
 178         if (length < (int32_t)(sizeof(indexes))) {
 179             udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
 180             *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
 181             return 0;
 182         }
 183     }
 184
 185     for (i = 0; i < DictionaryData::IX_COUNT; i++) {
 186         indexes[i] = udata_readInt32(ds, inIndexes[i]);
 187     }
 188
 189     size = indexes[DictionaryData::IX_TOTAL_SIZE];
 190
 191     if (length >= 0) {
 192         if (length < size) {
 193             udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
 194             *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
 195             return 0;
 196         }
 197
 198         if (inBytes != outBytes) {
 199             uprv_memcpy(outBytes, inBytes, size);
 200         }
 201
 202         offset = 0;
 203         ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);
 204         offset = (int32_t)sizeof(indexes);
 205         int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
 206         int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET];
 207
 208         if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
 209             ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode);
 210         } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
 211             // nothing to do
 212         } else {
 213             udata_printError(ds, "udict_swap(): unknown trie type!\n");
 214             *pErrorCode = U_UNSUPPORTED_ERROR;
 215             return 0;
 216         }
 217
 218         // these next two sections are empty in the current format,
 219         // but may be used later.
 220         offset = nextOffset;
 221         nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET];
 222         offset = nextOffset;
 223         nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE];
 224         offset = nextOffset;
 225     }
 226     return headerSize + size;
 227 }
 228 #endif