icuSources/common/dictionarydata.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4 *******************************************************************************
   5 * Copyright (C) 2014-2016, International Business Machines
   6 * Corporation and others.  All Rights Reserved.
   7 *******************************************************************************
    8 * dictionarydata.cpp
   9 *
  10 * created on: 2012may31
  11 * created by: Markus W. Scherer & Maxime Serrano
  12 */
  13
  14 #include "dictionarydata.h"
  15 #include "unicode/ucharstrie.h"
  16 #include "unicode/bytestrie.h"
  17 #include "unicode/udata.h"
  18 #include "cmemory.h"
  19
  20 #if !UCONFIG_NO_BREAK_ITERATION
  21
  22 U_NAMESPACE_BEGIN
  23
  24 const int32_t  DictionaryData::TRIE_TYPE_BYTES = 0;
  25 const int32_t  DictionaryData::TRIE_TYPE_UCHARS = 1;
  26 const int32_t  DictionaryData::TRIE_TYPE_MASK = 7;
  27 const int32_t  DictionaryData::TRIE_HAS_VALUES = 8;
  28
  29 const int32_t  DictionaryData::TRANSFORM_NONE = 0;
  30 const int32_t  DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000;
  31 const int32_t  DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000;
  32 const int32_t  DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff;
  33
  34 DictionaryMatcher::~DictionaryMatcher() {
  35 }
  36
  37 UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
  38     udata_close(file);
  39 }
  40
  41 int32_t UCharsDictionaryMatcher::getType() const {
  42     return DictionaryData::TRIE_TYPE_UCHARS;
  43 }
  44
  45 int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
  46                             int32_t *lengths, int32_t *cpLengths, int32_t *values,
  47                             int32_t *prefix) const {
  48
  49     UCharsTrie uct(characters);
  50     int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
  51     int32_t wordCount = 0;
  52     int32_t codePointsMatched = 0;
  53
  54     for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
  55         UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c);
  56         int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
  57         codePointsMatched += 1;
  58         if (USTRINGTRIE_HAS_VALUE(result)) {
  59             if (wordCount < limit) {
  60                 if (values != NULL) {
  61                     values[wordCount] = uct.getValue();
  62                 }
  63                 if (lengths != NULL) {
  64                     lengths[wordCount] = lengthMatched;
  65                 }
  66                 if (cpLengths != NULL) {
  67                     cpLengths[wordCount] = codePointsMatched;
  68                 }
  69                 ++wordCount;
  70             }
  71             if (result == USTRINGTRIE_FINAL_VALUE) {
  72                 break;
  73             }
  74         }
  75         else if (result == USTRINGTRIE_NO_MATCH) {
  76             break;
  77         }
  78         if (lengthMatched >= maxLength) {
  79             break;
  80         }
  81     }
  82
  83     if (prefix != NULL) {
  84         *prefix = codePointsMatched;
  85     }
  86     return wordCount;
  87 }
  88
  89 BytesDictionaryMatcher::~BytesDictionaryMatcher() {
  90     udata_close(file);
  91 }
  92
  93 UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
  94     if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) {
  95         if (c == 0x200D) {
  96             return 0xFF;
  97         } else if (c == 0x200C) {
  98             return 0xFE;
  99         }
 100         int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
 101         if (delta < 0 || 0xFD < delta) {
 102             return U_SENTINEL;
 103         }
 104         return (UChar32)delta;
 105     }
 106     return c;
 107 }
 108
 109 int32_t BytesDictionaryMatcher::getType() const {
 110     return DictionaryData::TRIE_TYPE_BYTES;
 111 }
 112
 113 int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
 114                             int32_t *lengths, int32_t *cpLengths, int32_t *values,
 115                             int32_t *prefix) const {
 116     BytesTrie bt(characters);
 117     int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
 118     int32_t wordCount = 0;
 119     int32_t codePointsMatched = 0;
 120
 121     for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
 122         UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c));
 123         int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
 124         codePointsMatched += 1;
 125         if (USTRINGTRIE_HAS_VALUE(result)) {
 126             if (wordCount < limit) {
 127                 if (values != NULL) {
 128                     values[wordCount] = bt.getValue();
 129                 }
 130                 if (lengths != NULL) {
 131                     lengths[wordCount] = lengthMatched;
 132                 }
 133                 if (cpLengths != NULL) {
 134                     cpLengths[wordCount] = codePointsMatched;
 135                 }
 136                 ++wordCount;
 137             }
 138             if (result == USTRINGTRIE_FINAL_VALUE) {
 139                 break;
 140             }
 141         }
 142         else if (result == USTRINGTRIE_NO_MATCH) {
 143             break;
 144         }
 145         if (lengthMatched >= maxLength) {
 146             break;
 147         }
 148     }
 149
 150     if (prefix != NULL) {
 151         *prefix = codePointsMatched;
 152     }
 153     return wordCount;
 154 }
 155
 156
 157 U_NAMESPACE_END
 158
 159 U_NAMESPACE_USE
 160
 161 U_CAPI int32_t U_EXPORT2
 162 udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
 163            void *outData, UErrorCode *pErrorCode) {
 164     const UDataInfo *pInfo;
 165     int32_t headerSize;
 166     const uint8_t *inBytes;
 167     uint8_t *outBytes;
 168     const int32_t *inIndexes;
 169     int32_t indexes[DictionaryData::IX_COUNT];
 170     int32_t i, offset, size;
 171
 172     headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
 173     if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0;
 174     pInfo = (const UDataInfo *)((const char *)inData + 4);
 175     if (!(pInfo->dataFormat[0] == 0x44 &&
 176           pInfo->dataFormat[1] == 0x69 &&
 177           pInfo->dataFormat[2] == 0x63 &&
 178           pInfo->dataFormat[3] == 0x74 &&
 179           pInfo->formatVersion[0] == 1)) {
 180         udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
 181                          pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
 182         *pErrorCode = U_UNSUPPORTED_ERROR;
 183         return 0;
 184     }
 185
 186     inBytes = (const uint8_t *)inData + headerSize;
 187     outBytes = (uint8_t *)outData + headerSize;
 188
 189     inIndexes = (const int32_t *)inBytes;
 190     if (length >= 0) {
 191         length -= headerSize;
 192         if (length < (int32_t)(sizeof(indexes))) {
 193             udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
 194             *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
 195             return 0;
 196         }
 197     }
 198
 199     for (i = 0; i < DictionaryData::IX_COUNT; i++) {
 200         indexes[i] = udata_readInt32(ds, inIndexes[i]);
 201     }
 202
 203     size = indexes[DictionaryData::IX_TOTAL_SIZE];
 204
 205     if (length >= 0) {
 206         if (length < size) {
 207             udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
 208             *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
 209             return 0;
 210         }
 211
 212         if (inBytes != outBytes) {
 213             uprv_memcpy(outBytes, inBytes, size);
 214         }
 215
 216         offset = 0;
 217         ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);
 218         offset = (int32_t)sizeof(indexes);
 219         int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
 220         int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET];
 221
 222         if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
 223             ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode);
 224         } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
 225             // nothing to do
 226         } else {
 227             udata_printError(ds, "udict_swap(): unknown trie type!\n");
 228             *pErrorCode = U_UNSUPPORTED_ERROR;
 229             return 0;
 230         }
 231
 232         // these next two sections are empty in the current format,
 233         // but may be used later.
 234         offset = nextOffset;
 235         nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET];
 236         offset = nextOffset;
 237         nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE];
 238         offset = nextOffset;
 239     }
 240     return headerSize + size;
 241 }
 242 #endif