icuSources/common/dictionarydata.cpp

   1 /*
   2 *******************************************************************************
   3 * Copyright (C) 2014, International Business Machines
   4 * Corporation and others.  All Rights Reserved.
   5 *******************************************************************************
   6 * dictionarydata.cpp
   7 *
   8 * created on: 2012may31
   9 * created by: Markus W. Scherer & Maxime Serrano
  10 */
  11
  12 #include "dictionarydata.h"
  13 #include "unicode/ucharstrie.h"
  14 #include "unicode/bytestrie.h"
  15 #include "unicode/udata.h"
  16 #include "cmemory.h"
  17
  18 #if !UCONFIG_NO_BREAK_ITERATION
  19
  20 U_NAMESPACE_BEGIN
  21
  22 const int32_t  DictionaryData::TRIE_TYPE_BYTES = 0;   // type reported by BytesDictionaryMatcher::getType(); udict_swap() does no extra swapping for this trie type
  23 const int32_t  DictionaryData::TRIE_TYPE_UCHARS = 1;  // type reported by UCharsDictionaryMatcher::getType(); udict_swap() swaps this trie as 16-bit units
  24 const int32_t  DictionaryData::TRIE_TYPE_MASK = 7;    // udict_swap() ANDs this with indexes[IX_TRIE_TYPE] to extract the trie type
  25 const int32_t  DictionaryData::TRIE_HAS_VALUES = 8;   // not referenced in this file; presumably a flag bit next to the trie type -- TODO confirm against dictionarydata.h
  26
  27 const int32_t  DictionaryData::TRANSFORM_NONE = 0;                // not referenced in this file; presumably "no transform" -- TODO confirm
  28 const int32_t  DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000; // transform type for which BytesDictionaryMatcher::transform() subtracts an offset
  29 const int32_t  DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000;  // bits of transformConstant that select the transform type
  30 const int32_t  DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff;  // bits of transformConstant holding the offset subtracted from each code point
  31
  32 DictionaryMatcher::~DictionaryMatcher() {
  33 }
  34
  35 UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
  36     udata_close(file);
  37 }
  38
  39 int32_t UCharsDictionaryMatcher::getType() const {
  40     return DictionaryData::TRIE_TYPE_UCHARS;
  41 }
  42
  43 int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
  44                             int32_t *lengths, int32_t *cpLengths, int32_t *values,
  45                             int32_t *prefix) const {
  46
  47     UCharsTrie uct(characters);
  48     int32_t startingTextIndex = utext_getNativeIndex(text);
  49     int32_t wordCount = 0;
  50     int32_t codePointsMatched = 0;
  51
  52     for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
  53         UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c);
  54         int32_t lengthMatched = utext_getNativeIndex(text) - startingTextIndex;
  55         codePointsMatched += 1;
  56         if (USTRINGTRIE_HAS_VALUE(result)) {
  57             if (wordCount < limit) {
  58                 if (values != NULL) {
  59                     values[wordCount] = uct.getValue();
  60                 }
  61                 if (lengths != NULL) {
  62                     lengths[wordCount] = lengthMatched;
  63                 }
  64                 if (cpLengths != NULL) {
  65                     cpLengths[wordCount] = codePointsMatched;
  66                 }
  67                 ++wordCount;
  68             }
  69             if (result == USTRINGTRIE_FINAL_VALUE) {
  70                 break;
  71             }
  72         }
  73         else if (result == USTRINGTRIE_NO_MATCH) {
  74             break;
  75         }
  76         if (lengthMatched >= maxLength) {
  77             break;
  78         }
  79     }
  80
  81     if (prefix != NULL) {
  82         *prefix = codePointsMatched;
  83     }
  84     return wordCount;
  85 }
  86
  87 BytesDictionaryMatcher::~BytesDictionaryMatcher() {
  88     udata_close(file);
  89 }
  90
  91 UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
  92     if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) {
  93         if (c == 0x200D) {
  94             return 0xFF;
  95         } else if (c == 0x200C) {
  96             return 0xFE;
  97         }
  98         int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
  99         if (delta < 0 || 0xFD < delta) {
 100             return U_SENTINEL;
 101         }
 102         return (UChar32)delta;
 103     }
 104     return c;
 105 }
 106
 107 int32_t BytesDictionaryMatcher::getType() const {
 108     return DictionaryData::TRIE_TYPE_BYTES;
 109 }
 110
 111 int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
 112                             int32_t *lengths, int32_t *cpLengths, int32_t *values,
 113                             int32_t *prefix) const {
 114     BytesTrie bt(characters);
 115     int32_t startingTextIndex = utext_getNativeIndex(text);
 116     int32_t wordCount = 0;
 117     int32_t codePointsMatched = 0;
 118
 119     for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
 120         UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c));
 121         int32_t lengthMatched = utext_getNativeIndex(text) - startingTextIndex;
 122         codePointsMatched += 1;
 123         if (USTRINGTRIE_HAS_VALUE(result)) {
 124             if (wordCount < limit) {
 125                 if (values != NULL) {
 126                     values[wordCount] = bt.getValue();
 127                 }
 128                 if (lengths != NULL) {
 129                     lengths[wordCount] = lengthMatched;
 130                 }
 131                 if (cpLengths != NULL) {
 132                     cpLengths[wordCount] = codePointsMatched;
 133                 }
 134                 ++wordCount;
 135             }
 136             if (result == USTRINGTRIE_FINAL_VALUE) {
 137                 break;
 138             }
 139         }
 140         else if (result == USTRINGTRIE_NO_MATCH) {
 141             break;
 142         }
 143         if (lengthMatched >= maxLength) {
 144             break;
 145         }
 146     }
 147
 148     if (prefix != NULL) {
 149         *prefix = codePointsMatched;
 150     }
 151     return wordCount;
 152 }
 153
 154
 155 U_NAMESPACE_END
 156
 157 U_NAMESPACE_USE
 158
 159 U_CAPI int32_t U_EXPORT2
 160 udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
 161            void *outData, UErrorCode *pErrorCode) {
 162     const UDataInfo *pInfo;
 163     int32_t headerSize;
 164     const uint8_t *inBytes;
 165     uint8_t *outBytes;
 166     const int32_t *inIndexes;
 167     int32_t indexes[DictionaryData::IX_COUNT];
 168     int32_t i, offset, size;
 169
 170     headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
 171     if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0;
 172     pInfo = (const UDataInfo *)((const char *)inData + 4);
 173     if (!(pInfo->dataFormat[0] == 0x44 &&
 174           pInfo->dataFormat[1] == 0x69 &&
 175           pInfo->dataFormat[2] == 0x63 &&
 176           pInfo->dataFormat[3] == 0x74 &&
 177           pInfo->formatVersion[0] == 1)) {
 178         udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
 179                          pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
 180         *pErrorCode = U_UNSUPPORTED_ERROR;
 181         return 0;
 182     }
 183
 184     inBytes = (const uint8_t *)inData + headerSize;
 185     outBytes = (uint8_t *)outData + headerSize;
 186
 187     inIndexes = (const int32_t *)inBytes;
 188     if (length >= 0) {
 189         length -= headerSize;
 190         if (length < (int32_t)(sizeof(indexes))) {
 191             udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
 192             *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
 193             return 0;
 194         }
 195     }
 196
 197     for (i = 0; i < DictionaryData::IX_COUNT; i++) {
 198         indexes[i] = udata_readInt32(ds, inIndexes[i]);
 199     }
 200
 201     size = indexes[DictionaryData::IX_TOTAL_SIZE];
 202
 203     if (length >= 0) {
 204         if (length < size) {
 205             udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
 206             *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
 207             return 0;
 208         }
 209
 210         if (inBytes != outBytes) {
 211             uprv_memcpy(outBytes, inBytes, size);
 212         }
 213
 214         offset = 0;
 215         ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);
 216         offset = (int32_t)sizeof(indexes);
 217         int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
 218         int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET];
 219
 220         if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
 221             ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode);
 222         } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
 223             // nothing to do
 224         } else {
 225             udata_printError(ds, "udict_swap(): unknown trie type!\n");
 226             *pErrorCode = U_UNSUPPORTED_ERROR;
 227             return 0;
 228         }
 229
 230         // these next two sections are empty in the current format,
 231         // but may be used later.
 232         offset = nextOffset;
 233         nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET];
 234         offset = nextOffset;
 235         nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE];
 236         offset = nextOffset;
 237     }
 238     return headerSize + size;
 239 }
 240 #endif