ICU-511.25.tar.gz

[apple/icu.git] / icuSources / common / dictionarydata.cpp
diff --git a/icuSources/common/dictionarydata.cpp b/icuSources/common/dictionarydata.cpp

new file mode 100644 (file)

index 0000000..a29e509
--- /dev/null
+++ b/icuSources/common/dictionarydata.cpp
@@ -0,0 +1,228 @@
+/*
+*******************************************************************************
+* Copyright (C) 2012, International Business Machines
+* Corporation and others.  All Rights Reserved.
+*******************************************************************************
+* dictionarydata.h
+*
+* created on: 2012may31
+* created by: Markus W. Scherer & Maxime Serrano
+*/
+
+#include "dictionarydata.h"
+#include "unicode/ucharstrie.h"
+#include "unicode/bytestrie.h"
+#include "unicode/udata.h"
+#include "cmemory.h"
+
+#if !UCONFIG_NO_BREAK_ITERATION
+
+U_NAMESPACE_BEGIN
+
+#ifndef CYGWINMSVC /* On Cygwin/MSVC, the error redefinition of symbols occurs.*/
+const int32_t DictionaryData::TRIE_TYPE_BYTES;
+const int32_t DictionaryData::TRIE_TYPE_UCHARS;
+#endif
+
+DictionaryMatcher::~DictionaryMatcher() {
+}
+
+UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
+    udata_close(file);
+}
+
+int32_t UCharsDictionaryMatcher::getType() const {
+    return DictionaryData::TRIE_TYPE_UCHARS;
+}
+
+int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const {
+    UCharsTrie uct(characters);
+    UChar32 c = utext_next32(text);
+    if (c < 0) {
+        return 0;
+    }
+    UStringTrieResult result = uct.first(c);
+    int32_t numChars = 1;
+    count = 0;
+    for (;;) {
+        if (USTRINGTRIE_HAS_VALUE(result)) {
+            if (count < limit) {
+                if (values != NULL) {
+                    values[count] = uct.getValue();
+                }
+                lengths[count++] = numChars;
+            }
+            if (result == USTRINGTRIE_FINAL_VALUE) {
+                break;
+            }
+        }
+        else if (result == USTRINGTRIE_NO_MATCH) {
+            break;
+        }
+
+        // TODO: why do we have a text limit if the UText knows its length?
+        if (numChars >= maxLength) {
+            break;
+        }
+
+        c = utext_next32(text);
+        if (c < 0) {
+            break;
+        }
+        ++numChars;
+        result = uct.next(c);
+    }
+    return numChars;
+}
+
+BytesDictionaryMatcher::~BytesDictionaryMatcher() {
+    udata_close(file);
+}
+
+UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
+    if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) {
+        if (c == 0x200D) {
+            return 0xFF;
+        } else if (c == 0x200C) {
+            return 0xFE;
+        }
+        int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
+        if (delta < 0 || 0xFD < delta) {
+            return U_SENTINEL;
+        }
+        return (UChar32)delta;
+    }
+    return c;
+}
+
+int32_t BytesDictionaryMatcher::getType() const {
+    return DictionaryData::TRIE_TYPE_BYTES;
+}
+
+int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const {
+    BytesTrie bt(characters);
+    UChar32 c = utext_next32(text);
+    if (c < 0) {
+        return 0;
+    }
+    UStringTrieResult result = bt.first(transform(c));
+    int32_t numChars = 1;
+    count = 0;
+    for (;;) {
+        if (USTRINGTRIE_HAS_VALUE(result)) {
+            if (count < limit) {
+                if (values != NULL) {
+                    values[count] = bt.getValue();
+            }
+                lengths[count++] = numChars;
+            }
+            if (result == USTRINGTRIE_FINAL_VALUE) {
+                break;
+            }
+        }
+        else if (result == USTRINGTRIE_NO_MATCH) {
+            break;
+        }
+
+        // TODO: why do we have a text limit if the UText knows its length?
+        if (numChars >= maxLength) {
+            break;
+        }
+
+        c = utext_next32(text);
+        if (c < 0) {
+            break;
+        }
+        ++numChars;
+        result = bt.next(transform(c));
+    }
+    return numChars;
+}
+
+
+U_NAMESPACE_END
+
+U_NAMESPACE_USE
+
+U_CAPI int32_t U_EXPORT2
+udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
+           void *outData, UErrorCode *pErrorCode) {
+    const UDataInfo *pInfo;
+    int32_t headerSize;
+    const uint8_t *inBytes;
+    uint8_t *outBytes;
+    const int32_t *inIndexes;
+    int32_t indexes[DictionaryData::IX_COUNT];
+    int32_t i, offset, size;
+
+    headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
+    if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0;
+    pInfo = (const UDataInfo *)((const char *)inData + 4);
+    if (!(pInfo->dataFormat[0] == 0x44 && 
+          pInfo->dataFormat[1] == 0x69 && 
+          pInfo->dataFormat[2] == 0x63 && 
+          pInfo->dataFormat[3] == 0x74 && 
+          pInfo->formatVersion[0] == 1)) {
+        udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
+                         pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
+        *pErrorCode = U_UNSUPPORTED_ERROR;
+        return 0;
+    }
+
+    inBytes = (const uint8_t *)inData + headerSize;
+    outBytes = (uint8_t *)outData + headerSize;
+
+    inIndexes = (const int32_t *)inBytes;
+    if (length >= 0) {
+        length -= headerSize;
+        if (length < (int32_t)(sizeof(indexes))) {
+            udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
+            *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
+            return 0;
+        }
+    }
+
+    for (i = 0; i < DictionaryData::IX_COUNT; i++) {
+        indexes[i] = udata_readInt32(ds, inIndexes[i]);
+    }
+
+    size = indexes[DictionaryData::IX_TOTAL_SIZE];
+
+    if (length >= 0) {
+        if (length < size) {
+            udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
+            *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
+            return 0;
+        }
+
+        if (inBytes != outBytes) {
+            uprv_memcpy(outBytes, inBytes, size);
+        }
+
+        offset = 0;
+        ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);
+        offset = (int32_t)sizeof(indexes);
+        int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
+        int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET];
+
+        if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
+            ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode);
+        } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
+            // nothing to do
+        } else {
+            udata_printError(ds, "udict_swap(): unknown trie type!\n");
+            *pErrorCode = U_UNSUPPORTED_ERROR;
+            return 0;
+        }
+
+        // these next two sections are empty in the current format,
+        // but may be used later.
+        offset = nextOffset;
+        nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET];
+        offset = nextOffset;
+        nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE];
+        offset = nextOffset;
+    }
+    return headerSize + size;
+}
+#endif