[apple/icu.git] / icuSources / common / dictionarydata.cpp

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2014-2016, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* dictionarydata.cpp
*
* created on: 2012may31
* created by: Markus W. Scherer & Maxime Serrano
*/

#include "dictionarydata.h"
#include "unicode/ucharstrie.h"
#include "unicode/bytestrie.h"
#include "unicode/udata.h"
#include "cmemory.h"

#if !UCONFIG_NO_BREAK_ITERATION

U_NAMESPACE_BEGIN

// Trie type codes. udict_swap() extracts the type by masking
// indexes[IX_TRIE_TYPE] with TRIE_TYPE_MASK and compares it against these.
// TRIE_TYPE_BYTES is what BytesDictionaryMatcher::getType() returns;
// TRIE_TYPE_UCHARS is what UCharsDictionaryMatcher::getType() returns.
const int32_t  DictionaryData::TRIE_TYPE_BYTES = 0;
const int32_t  DictionaryData::TRIE_TYPE_UCHARS = 1;
const int32_t  DictionaryData::TRIE_TYPE_MASK = 7;
// NOTE(review): presumably a flag bit stored alongside the trie type (it lies
// outside TRIE_TYPE_MASK); it is not used by the code below -- confirm.
const int32_t  DictionaryData::TRIE_HAS_VALUES = 8;

// Transform constants. BytesDictionaryMatcher::transform() applies the offset
// mapping only when (transformConstant & TRANSFORM_TYPE_MASK) equals
// TRANSFORM_TYPE_OFFSET; in that case (transformConstant & TRANSFORM_OFFSET_MASK)
// is the base value subtracted from each code point.
const int32_t  DictionaryData::TRANSFORM_NONE = 0;
const int32_t  DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000;
const int32_t  DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000;
const int32_t  DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff;
    
// Out-of-line definition of the DictionaryMatcher destructor; the body is empty.
DictionaryMatcher::~DictionaryMatcher() {
}

// Closes the data handle held in `file` by passing it to udata_close().
UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
    udata_close(file);
}

// Reports the trie type of this matcher: always DictionaryData::TRIE_TYPE_UCHARS.
int32_t UCharsDictionaryMatcher::getType() const {
    return DictionaryData::TRIE_TYPE_UCHARS;
}

/**
 * Walks the UChars trie along the text, beginning at the text's current
 * native index, and records each point at which the trie reports a value.
 *
 * @param text      Text to read; every utext_next32() call advances it.
 * @param maxLength The walk stops once the span consumed so far, measured in
 *                  native index units, is at least this long.
 * @param limit     Matches are written to the output arrays only while fewer
 *                  than this many have been recorded.
 * @param lengths   If not NULL, receives the native-index length of each match.
 * @param cpLengths If not NULL, receives the code point count of each match.
 * @param values    If not NULL, receives the trie value of each match.
 * @param prefix    If not NULL, receives the number of code points that were
 *                  fed to the trie, including one that ended the walk.
 * @return The number of matches recorded.
 */
int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
                            int32_t *lengths, int32_t *cpLengths, int32_t *values,
                            int32_t *prefix) const {
    UCharsTrie trie(characters);
    const int32_t startIndex = static_cast<int32_t>(utext_getNativeIndex(text));
    int32_t numMatches = 0;
    int32_t numCodePoints = 0;

    for (;;) {
        const UChar32 cp = utext_next32(text);
        if (cp < 0) {
            // A negative result from utext_next32() ends the walk.
            break;
        }

        // Only the very first code point uses first(); later ones continue the walk.
        const UStringTrieResult step =
            (numCodePoints != 0) ? trie.next(cp) : trie.first(cp);
        const int32_t nativeLength =
            static_cast<int32_t>(utext_getNativeIndex(text)) - startIndex;
        ++numCodePoints;

        const bool atWord = USTRINGTRIE_HAS_VALUE(step);
        if (atWord && numMatches < limit) {
            if (values != NULL) {
                values[numMatches] = trie.getValue();
            }
            if (lengths != NULL) {
                lengths[numMatches] = nativeLength;
            }
            if (cpLengths != NULL) {
                cpLengths[numMatches] = numCodePoints;
            }
            ++numMatches;
        }

        // The trie cannot be followed further after a final value or a mismatch.
        const bool walkEnded = atWord ? (step == USTRINGTRIE_FINAL_VALUE)
                                      : (step == USTRINGTRIE_NO_MATCH);
        if (walkEnded || nativeLength >= maxLength) {
            break;
        }
    }

    if (prefix != NULL) {
        *prefix = numCodePoints;
    }
    return numMatches;
}

// Closes the data handle held in `file` by passing it to udata_close().
BytesDictionaryMatcher::~BytesDictionaryMatcher() {
    udata_close(file);
}

/**
 * Maps a code point to the value that is fed to the bytes trie.
 *
 * When the transform type encoded in transformConstant is not
 * TRANSFORM_TYPE_OFFSET, the code point is returned unchanged. Otherwise:
 *   - U+200D (ZERO WIDTH JOINER) maps to 0xFF,
 *   - U+200C (ZERO WIDTH NON-JOINER) maps to 0xFE,
 *   - any other code point maps to its distance from the base offset stored in
 *     transformConstant, provided that distance lies in 0..0xFD.
 *
 * @param c The code point to map.
 * @return The mapped value, or U_SENTINEL if c cannot be represented.
 */
UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
    if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) != DictionaryData::TRANSFORM_TYPE_OFFSET) {
        return c;
    }

    // The two joiner characters get the top two byte values.
    switch (c) {
    case 0x200D:
        return 0xFF;
    case 0x200C:
        return 0xFE;
    default:
        break;
    }

    const int32_t distance = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
    return (0 <= distance && distance <= 0xFD) ? static_cast<UChar32>(distance)
                                               : U_SENTINEL;
}

// Reports the trie type of this matcher: always DictionaryData::TRIE_TYPE_BYTES.
int32_t BytesDictionaryMatcher::getType() const {
    return DictionaryData::TRIE_TYPE_BYTES;
}

/**
 * Walks the bytes trie along the text, beginning at the text's current native
 * index, and records each point at which the trie reports a value. Every code
 * point is passed through transform() before it is fed to the trie.
 *
 * @param text      Text to read; every utext_next32() call advances it.
 * @param maxLength The walk stops once the span consumed so far, measured in
 *                  native index units, is at least this long.
 * @param limit     Matches are written to the output arrays only while fewer
 *                  than this many have been recorded.
 * @param lengths   If not NULL, receives the native-index length of each match.
 * @param cpLengths If not NULL, receives the code point count of each match.
 * @param values    If not NULL, receives the trie value of each match.
 * @param prefix    If not NULL, receives the number of code points that were
 *                  fed to the trie, including one that ended the walk.
 * @return The number of matches recorded.
 */
int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
                            int32_t *lengths, int32_t *cpLengths, int32_t *values,
                            int32_t *prefix) const {
    BytesTrie trie(characters);
    const int32_t startIndex = static_cast<int32_t>(utext_getNativeIndex(text));
    int32_t numMatches = 0;
    int32_t numCodePoints = 0;

    for (;;) {
        const UChar32 cp = utext_next32(text);
        if (cp < 0) {
            // A negative result from utext_next32() ends the walk.
            break;
        }

        // Map the code point first, then take the first or a follow-up trie step.
        const UChar32 mapped = transform(cp);
        const UStringTrieResult step =
            (numCodePoints != 0) ? trie.next(mapped) : trie.first(mapped);
        const int32_t nativeLength =
            static_cast<int32_t>(utext_getNativeIndex(text)) - startIndex;
        ++numCodePoints;

        const bool atWord = USTRINGTRIE_HAS_VALUE(step);
        if (atWord && numMatches < limit) {
            if (values != NULL) {
                values[numMatches] = trie.getValue();
            }
            if (lengths != NULL) {
                lengths[numMatches] = nativeLength;
            }
            if (cpLengths != NULL) {
                cpLengths[numMatches] = numCodePoints;
            }
            ++numMatches;
        }

        // The trie cannot be followed further after a final value or a mismatch.
        const bool walkEnded = atWord ? (step == USTRINGTRIE_FINAL_VALUE)
                                      : (step == USTRINGTRIE_NO_MATCH);
        if (walkEnded || nativeLength >= maxLength) {
            break;
        }
    }

    if (prefix != NULL) {
        *prefix = numCodePoints;
    }
    return numMatches;
}


U_NAMESPACE_END

U_NAMESPACE_USE

/**
 * Swaps dictionary data for a different platform using the given swapper.
 *
 * The data header is handled by udata_swapDataHeader(). After checking that
 * the format bytes spell "Dict" with format version 1, the indexes array is
 * read. When length is negative, this function copies and swaps nothing of the
 * body and only reports the size. Otherwise the body is copied to the output
 * (unless input and output are the same buffer), the indexes are swapped as
 * 32-bit units, and a UChars trie is swapped as 16-bit units; a bytes trie
 * needs no swapping.
 *
 * @param ds         Swapper providing the read/swap functions.
 * @param inData     Input data, starting at the data header.
 * @param length     Number of input bytes, or negative to only compute the size.
 * @param outData    Output buffer; may be the same as inData.
 * @param pErrorCode In/out error code; set on unsupported format, unknown trie
 *                   type, or too little input.
 * @return Header size plus the total size from the indexes, or 0 on error.
 */
U_CAPI int32_t U_EXPORT2
udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
           void *outData, UErrorCode *pErrorCode) {
    const int32_t headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
    if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    // The UDataInfo structure is read from 4 bytes into the input data.
    const UDataInfo *info =
        reinterpret_cast<const UDataInfo *>(static_cast<const char *>(inData) + 4);
    // Expected format bytes: 0x44 0x69 0x63 0x74 ("Dict"), format version 1.
    if (info->dataFormat[0] != 0x44 ||
        info->dataFormat[1] != 0x69 ||
        info->dataFormat[2] != 0x63 ||
        info->dataFormat[3] != 0x74 ||
        info->formatVersion[0] != 1) {
        udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
                         info->dataFormat[0], info->dataFormat[1], info->dataFormat[2], info->dataFormat[3], info->formatVersion[0]);
        *pErrorCode = U_UNSUPPORTED_ERROR;
        return 0;
    }

    const uint8_t *inBytes = static_cast<const uint8_t *>(inData) + headerSize;
    uint8_t *outBytes = static_cast<uint8_t *>(outData) + headerSize;

    int32_t indexes[DictionaryData::IX_COUNT];
    const int32_t indexesSize = static_cast<int32_t>(sizeof(indexes));

    if (length >= 0) {
        // From here on, length counts only the bytes that follow the header.
        length -= headerSize;
        if (length < indexesSize) {
            udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
            *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }
    }

    // Read the indexes in the input's byte order into a local, native copy.
    const int32_t *rawIndexes = reinterpret_cast<const int32_t *>(inBytes);
    for (int32_t idx = 0; idx < DictionaryData::IX_COUNT; ++idx) {
        indexes[idx] = udata_readInt32(ds, rawIndexes[idx]);
    }
    const int32_t size = indexes[DictionaryData::IX_TOTAL_SIZE];

    if (length < 0) {
        // Size-only request: nothing is copied or swapped here.
        return headerSize + size;
    }

    if (length < size) {
        udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
        *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
        return 0;
    }

    // Copy everything first so that sections which are not swapped are still present.
    if (inBytes != outBytes) {
        uprv_memcpy(outBytes, inBytes, size);
    }

    // The indexes array is a run of 32-bit integers.
    ds->swapArray32(ds, inBytes, indexesSize, outBytes, pErrorCode);

    // The trie follows the indexes and extends up to IX_RESERVED1_OFFSET.
    const int32_t trieStart = indexesSize;
    const int32_t trieLimit = indexes[DictionaryData::IX_RESERVED1_OFFSET];
    const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;

    if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
        ds->swapArray16(ds, inBytes + trieStart, trieLimit - trieStart, outBytes + trieStart, pErrorCode);
    } else if (trieType != DictionaryData::TRIE_TYPE_BYTES) {
        udata_printError(ds, "udict_swap(): unknown trie type!\n");
        *pErrorCode = U_UNSUPPORTED_ERROR;
        return 0;
    }
    // A bytes trie needs no swapping.

    // The two sections after the trie (up to IX_RESERVED2_OFFSET and then up to
    // IX_TOTAL_SIZE) are empty in the current format, but may be used later.

    return headerSize + size;
}
#endif
Commit	Line	Data
f3c0d7a5 A	1	// © 2016 and later: Unicode, Inc. and others.
f3c0d7a5 A	2	// License & terms of use: http://www.unicode.org/copyright.html
51004dcb A	3	/*
51004dcb A	4	*******************************************************************************
2ca993e8	5	* Copyright (C) 2014-2016, International Business Machines
51004dcb A	6	* Corporation and others. All Rights Reserved.
	7	*******************************************************************************
	8	* dictionarydata.h
	9	*
	10	* created on: 2012may31
	11	* created by: Markus W. Scherer & Maxime Serrano
	12	*/
	13
	14	#include "dictionarydata.h"
	15	#include "unicode/ucharstrie.h"
	16	#include "unicode/bytestrie.h"
	17	#include "unicode/udata.h"
	18	#include "cmemory.h"
	19
	20	#if !UCONFIG_NO_BREAK_ITERATION
	21
	22	U_NAMESPACE_BEGIN
	23
57a6839d A	24	const int32_t DictionaryData::TRIE_TYPE_BYTES = 0;
	25	const int32_t DictionaryData::TRIE_TYPE_UCHARS = 1;
	26	const int32_t DictionaryData::TRIE_TYPE_MASK = 7;
	27	const int32_t DictionaryData::TRIE_HAS_VALUES = 8;
51004dcb	28
57a6839d A	29	const int32_t DictionaryData::TRANSFORM_NONE = 0;
	30	const int32_t DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000;
	31	const int32_t DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000;
	32	const int32_t DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff;
	33
51004dcb A	34	DictionaryMatcher::~DictionaryMatcher() {
	35	}
	36
	37	UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
	38	udata_close(file);
	39	}
	40
	41	int32_t UCharsDictionaryMatcher::getType() const {
	42	return DictionaryData::TRIE_TYPE_UCHARS;
	43	}
	44
b331163b A	45	int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
	46	int32_t lengths, int32_t cpLengths, int32_t *values,
	47	int32_t *prefix) const {
	48
51004dcb	49	UCharsTrie uct(characters);
2ca993e8	50	int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
b331163b A	51	int32_t wordCount = 0;
	52	int32_t codePointsMatched = 0;
	53
	54	for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
	55	UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c);
2ca993e8	56	int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
b331163b	57	codePointsMatched += 1;
51004dcb	58	if (USTRINGTRIE_HAS_VALUE(result)) {
b331163b	59	if (wordCount < limit) {
51004dcb	60	if (values != NULL) {
b331163b A	61	values[wordCount] = uct.getValue();
	62	}
	63	if (lengths != NULL) {
	64	lengths[wordCount] = lengthMatched;
	65	}
	66	if (cpLengths != NULL) {
	67	cpLengths[wordCount] = codePointsMatched;
51004dcb	68	}
b331163b	69	++wordCount;
51004dcb A	70	}
	71	if (result == USTRINGTRIE_FINAL_VALUE) {
	72	break;
	73	}
	74	}
	75	else if (result == USTRINGTRIE_NO_MATCH) {
	76	break;
	77	}
b331163b	78	if (lengthMatched >= maxLength) {
51004dcb A	79	break;
51004dcb A	80	}
b331163b	81	}
51004dcb	82
b331163b A	83	if (prefix != NULL) {
b331163b A	84	*prefix = codePointsMatched;
51004dcb	85	}
b331163b	86	return wordCount;
51004dcb A	87	}
	88
	89	BytesDictionaryMatcher::~BytesDictionaryMatcher() {
	90	udata_close(file);
	91	}
	92
	93	UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
	94	if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) {
	95	if (c == 0x200D) {
	96	return 0xFF;
	97	} else if (c == 0x200C) {
	98	return 0xFE;
	99	}
	100	int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
	101	if (delta < 0 \|\| 0xFD < delta) {
	102	return U_SENTINEL;
	103	}
	104	return (UChar32)delta;
	105	}
	106	return c;
	107	}
	108
	109	int32_t BytesDictionaryMatcher::getType() const {
	110	return DictionaryData::TRIE_TYPE_BYTES;
	111	}
	112
b331163b A	113	int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit,
	114	int32_t lengths, int32_t cpLengths, int32_t *values,
	115	int32_t *prefix) const {
51004dcb	116	BytesTrie bt(characters);
2ca993e8	117	int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text);
b331163b A	118	int32_t wordCount = 0;
	119	int32_t codePointsMatched = 0;
	120
	121	for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) {
	122	UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c));
2ca993e8	123	int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex;
b331163b	124	codePointsMatched += 1;
51004dcb	125	if (USTRINGTRIE_HAS_VALUE(result)) {
b331163b	126	if (wordCount < limit) {
51004dcb	127	if (values != NULL) {
b331163b A	128	values[wordCount] = bt.getValue();
	129	}
	130	if (lengths != NULL) {
	131	lengths[wordCount] = lengthMatched;
57a6839d	132	}
b331163b A	133	if (cpLengths != NULL) {
	134	cpLengths[wordCount] = codePointsMatched;
	135	}
	136	++wordCount;
51004dcb A	137	}
	138	if (result == USTRINGTRIE_FINAL_VALUE) {
	139	break;
	140	}
	141	}
	142	else if (result == USTRINGTRIE_NO_MATCH) {
	143	break;
	144	}
b331163b	145	if (lengthMatched >= maxLength) {
51004dcb A	146	break;
51004dcb A	147	}
b331163b	148	}
51004dcb	149
b331163b A	150	if (prefix != NULL) {
b331163b A	151	*prefix = codePointsMatched;
51004dcb	152	}
b331163b	153	return wordCount;
51004dcb A	154	}
	155
	156
	157	U_NAMESPACE_END
	158
	159	U_NAMESPACE_USE
	160
	161	U_CAPI int32_t U_EXPORT2
	162	udict_swap(const UDataSwapper ds, const void inData, int32_t length,
	163	void outData, UErrorCode pErrorCode) {
	164	const UDataInfo *pInfo;
	165	int32_t headerSize;
	166	const uint8_t *inBytes;
	167	uint8_t *outBytes;
	168	const int32_t *inIndexes;
	169	int32_t indexes[DictionaryData::IX_COUNT];
	170	int32_t i, offset, size;
	171
	172	headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
	173	if (pErrorCode == NULL \|\| U_FAILURE(*pErrorCode)) return 0;
	174	pInfo = (const UDataInfo )((const char )inData + 4);
	175	if (!(pInfo->dataFormat[0] == 0x44 &&
	176	pInfo->dataFormat[1] == 0x69 &&
	177	pInfo->dataFormat[2] == 0x63 &&
	178	pInfo->dataFormat[3] == 0x74 &&
	179	pInfo->formatVersion[0] == 1)) {
	180	udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
	181	pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
	182	*pErrorCode = U_UNSUPPORTED_ERROR;
	183	return 0;
	184	}
	185
	186	inBytes = (const uint8_t *)inData + headerSize;
	187	outBytes = (uint8_t *)outData + headerSize;
	188
	189	inIndexes = (const int32_t *)inBytes;
	190	if (length >= 0) {
	191	length -= headerSize;
	192	if (length < (int32_t)(sizeof(indexes))) {
	193	udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
	194	*pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
	195	return 0;
	196	}
	197	}
	198
	199	for (i = 0; i < DictionaryData::IX_COUNT; i++) {
	200	indexes[i] = udata_readInt32(ds, inIndexes[i]);
	201	}
	202
	203	size = indexes[DictionaryData::IX_TOTAL_SIZE];
	204
	205	if (length >= 0) {
	206	if (length < size) {
	207	udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
	208	*pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
	209	return 0;
	210	}
	211
	212	if (inBytes != outBytes) {
	213	uprv_memcpy(outBytes, inBytes, size);
	214	}
	215
	216	offset = 0;
	217	ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);
218	offset = (int32_t)sizeof(indexes);
219	int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
220	int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET];
221
222	if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
223	ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode);
224	} else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
225	// nothing to do
226	} else {
227	udata_printError(ds, "udict_swap(): unknown trie type!\n");
228	*pErrorCode = U_UNSUPPORTED_ERROR;
229	return 0;
230	}
231
232	// these next two sections are empty in the current format,
233	// but may be used later.
234	offset = nextOffset;
235	nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET];
236	offset = nextOffset;
237	nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE];
238	offset = nextOffset;
239	}
240	return headerSize + size;
241	}
242	#endif