[apple/icu.git] / icuSources / common / dictionarydata.cpp

/*
*******************************************************************************
* Copyright (C) 2014, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* dictionarydata.cpp
*
* created on: 2012may31
* created by: Markus W. Scherer & Maxime Serrano
*/

#include "dictionarydata.h"
#include "unicode/ucharstrie.h"
#include "unicode/bytestrie.h"
#include "unicode/udata.h"
#include "cmemory.h"

#if !UCONFIG_NO_BREAK_ITERATION

U_NAMESPACE_BEGIN

// Out-of-class definitions of the DictionaryData constants.

// Trie type codes. udict_swap() extracts the type from
// indexes[IX_TRIE_TYPE] by masking with TRIE_TYPE_MASK, and the matchers'
// getType() methods return TRIE_TYPE_UCHARS / TRIE_TYPE_BYTES.
const int32_t  DictionaryData::TRIE_TYPE_BYTES = 0;
const int32_t  DictionaryData::TRIE_TYPE_UCHARS = 1;
const int32_t  DictionaryData::TRIE_TYPE_MASK = 7;
// Bit outside TRIE_TYPE_MASK; not referenced by the code in this file.
// NOTE(review): presumably flags that the trie stores values -- confirm
// against dictionarydata.h.
const int32_t  DictionaryData::TRIE_HAS_VALUES = 8;

// Transform constants used by BytesDictionaryMatcher::transform():
// (transformConstant & TRANSFORM_TYPE_MASK) selects the transform type, and
// for TRANSFORM_TYPE_OFFSET the bits under TRANSFORM_OFFSET_MASK hold the
// value that is subtracted from each input code point. Any other type
// (including TRANSFORM_NONE) leaves the code point unchanged there.
const int32_t  DictionaryData::TRANSFORM_NONE = 0;
const int32_t  DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000;
const int32_t  DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000;
const int32_t  DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff;
    
// Out-of-line destructor definition; this class releases nothing here.
DictionaryMatcher::~DictionaryMatcher() {
}

// Closes the data handle stored in `file` via udata_close().
// NOTE(review): presumably this is the data item backing `characters`, so
// the trie must not be used after destruction -- confirm in dictionarydata.h.
UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
    udata_close(file);
}

// Identifies this matcher as the UChars-trie variant.
// @return DictionaryData::TRIE_TYPE_UCHARS
int32_t UCharsDictionaryMatcher::getType() const {
    return DictionaryData::TRIE_TYPE_UCHARS;
}

/**
 * Walks the UChars trie along the code points read from text and records
 * every prefix of that run for which the trie holds a value.
 *
 * @param text      Text to read from, starting at its current position.
 *                  Code points are consumed with utext_next32().
 * @param maxLength Stop reading once this many code points have been read.
 *                  The first code point is read regardless of this value.
 * @param lengths   Output array; lengths[i] receives the number of code
 *                  points in the i-th recorded match.
 * @param count     Output; number of entries written to lengths (and values).
 *                  It is always set, including when no text is available.
 * @param limit     Maximum number of matches to record; matches found after
 *                  that are not stored.
 * @param values    Optional output array for the trie value of each recorded
 *                  match; may be NULL.
 * @return The number of code points read from text (0 if none was available).
 *         This can exceed the longest recorded match because the code point
 *         that ended the walk is counted as well.
 */
int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const {
    // Reset the output before any early return so that callers never observe
    // a stale or uninitialized count when there is no text left to read.
    count = 0;
    UCharsTrie uct(characters);
    UChar32 c = utext_next32(text);
    if (c < 0) {
        return 0;
    }
    UStringTrieResult result = uct.first(c);
    int32_t numChars = 1;
    for (;;) {
        if (USTRINGTRIE_HAS_VALUE(result)) {
            if (count < limit) {
                if (values != NULL) {
                    values[count] = uct.getValue();
                }
                lengths[count++] = numChars;
            }
            // A final value means the trie has no longer entries on this path.
            if (result == USTRINGTRIE_FINAL_VALUE) {
                break;
            }
        }
        else if (result == USTRINGTRIE_NO_MATCH) {
            break;
        }

        // TODO: why do we have a text limit if the UText knows its length?
        if (numChars >= maxLength) {
            break;
        }

        c = utext_next32(text);
        if (c < 0) {
            break;
        }
        ++numChars;
        result = uct.next(c);
    }
    return numChars;
}

// Closes the data handle stored in `file` via udata_close().
// NOTE(review): presumably this is the data item backing `characters`, so
// the trie must not be used after destruction -- confirm in dictionarydata.h.
BytesDictionaryMatcher::~BytesDictionaryMatcher() {
    udata_close(file);
}

/**
 * Maps a code point to the value that is fed to the bytes trie.
 *
 * When the transform type encoded in transformConstant is
 * TRANSFORM_TYPE_OFFSET:
 *   - U+200D maps to 0xFF and U+200C maps to 0xFE;
 *   - any other code point has the offset (the bits of transformConstant
 *     under TRANSFORM_OFFSET_MASK) subtracted from it, and the difference is
 *     returned if it lies in 0..0xFD, otherwise U_SENTINEL is returned.
 * For every other transform type the code point is returned unchanged.
 *
 * @param c The code point to map.
 * @return The mapped value, or U_SENTINEL if c is outside the offset window.
 */
UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
    const int32_t transformType = transformConstant & DictionaryData::TRANSFORM_TYPE_MASK;
    if (transformType != DictionaryData::TRANSFORM_TYPE_OFFSET) {
        // No offset transform configured: pass the code point through.
        return c;
    }

    // Two code points get fixed slots at the top of the byte range.
    switch (c) {
    case 0x200D:
        return 0xFF;
    case 0x200C:
        return 0xFE;
    default:
        break;
    }

    int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
    if (0 <= delta && delta <= 0xFD) {
        return (UChar32)delta;
    }
    return U_SENTINEL;
}

// Identifies this matcher as the bytes-trie variant.
// @return DictionaryData::TRIE_TYPE_BYTES
int32_t BytesDictionaryMatcher::getType() const {
    return DictionaryData::TRIE_TYPE_BYTES;
}

/**
 * Walks the bytes trie along the code points read from text, passing each
 * one through transform() first, and records every prefix of that run for
 * which the trie holds a value.
 *
 * @param text      Text to read from, starting at its current position.
 *                  Code points are consumed with utext_next32().
 * @param maxLength Stop reading once this many code points have been read.
 *                  The first code point is read regardless of this value.
 * @param lengths   Output array; lengths[i] receives the number of code
 *                  points in the i-th recorded match.
 * @param count     Output; number of entries written to lengths (and values).
 *                  It is always set, including when no text is available.
 * @param limit     Maximum number of matches to record; matches found after
 *                  that are not stored.
 * @param values    Optional output array for the trie value of each recorded
 *                  match; may be NULL.
 * @return The number of code points read from text (0 if none was available).
 *         This can exceed the longest recorded match because the code point
 *         that ended the walk is counted as well.
 */
int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const {
    // Reset the output before any early return so that callers never observe
    // a stale or uninitialized count when there is no text left to read.
    count = 0;
    BytesTrie bt(characters);
    UChar32 c = utext_next32(text);
    if (c < 0) {
        return 0;
    }
    UStringTrieResult result = bt.first(transform(c));
    int32_t numChars = 1;
    for (;;) {
        if (USTRINGTRIE_HAS_VALUE(result)) {
            if (count < limit) {
                if (values != NULL) {
                    values[count] = bt.getValue();
                }
                lengths[count++] = numChars;
            }
            // A final value means the trie has no longer entries on this path.
            if (result == USTRINGTRIE_FINAL_VALUE) {
                break;
            }
        }
        else if (result == USTRINGTRIE_NO_MATCH) {
            break;
        }

        // TODO: why do we have a text limit if the UText knows its length?
        if (numChars >= maxLength) {
            break;
        }

        c = utext_next32(text);
        if (c < 0) {
            break;
        }
        ++numChars;
        result = bt.next(transform(c));
    }
    return numChars;
}


U_NAMESPACE_END

U_NAMESPACE_USE

/**
 * Swaps a dictionary data item (data format bytes 0x44 0x69 0x63 0x74,
 * i.e. "Dict" in ASCII, format version 1) using the given swapper.
 *
 * The data header is handled by udata_swapDataHeader(). After the header
 * come IX_COUNT 32-bit indexes, then the trie, then two reserved sections
 * that are empty in the current format. The indexes are swapped as 32-bit
 * units; a UChars trie is swapped as 16-bit units; a bytes trie is copied
 * unchanged.
 *
 * @param ds         Swapper supplying the read/swap/print callbacks.
 * @param inData     Start of the input data item (including its header).
 * @param length     Number of input bytes, or a negative value to only
 *                   compute the size: in that case the indexes are read but
 *                   no dictionary bytes are copied or swapped here.
 * @param outData    Start of the output buffer; may be the same memory as
 *                   inData.
 * @param pErrorCode In/out error code. Set to U_UNSUPPORTED_ERROR for an
 *                   unrecognized format or trie type and to
 *                   U_INDEX_OUTOFBOUNDS_ERROR when the input is too short or
 *                   the sizes/offsets stored in the indexes are out of range.
 * @return Header size plus the total size recorded in the indexes, or 0 on
 *         failure.
 */
U_CAPI int32_t U_EXPORT2
udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
           void *outData, UErrorCode *pErrorCode) {
    const UDataInfo *pInfo;
    int32_t headerSize;
    const uint8_t *inBytes;
    uint8_t *outBytes;
    const int32_t *inIndexes;
    int32_t indexes[DictionaryData::IX_COUNT];
    int32_t i, size;

    headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
    if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0;

    // The UDataInfo structure is read from 4 bytes into the data item.
    pInfo = (const UDataInfo *)((const char *)inData + 4);
    if (!(pInfo->dataFormat[0] == 0x44 && 
          pInfo->dataFormat[1] == 0x69 && 
          pInfo->dataFormat[2] == 0x63 && 
          pInfo->dataFormat[3] == 0x74 && 
          pInfo->formatVersion[0] == 1)) {
        udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
                         pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
        *pErrorCode = U_UNSUPPORTED_ERROR;
        return 0;
    }

    inBytes = (const uint8_t *)inData + headerSize;
    outBytes = (uint8_t *)outData + headerSize;

    inIndexes = (const int32_t *)inBytes;
    if (length >= 0) {
        // From here on, length counts only the bytes after the header.
        length -= headerSize;
        if (length < (int32_t)(sizeof(indexes))) {
            udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
            *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }
    }

    // Read the indexes in the input's byte order so they can be used below.
    for (i = 0; i < DictionaryData::IX_COUNT; i++) {
        indexes[i] = udata_readInt32(ds, inIndexes[i]);
    }

    size = indexes[DictionaryData::IX_TOTAL_SIZE];

    if (length >= 0) {
        const int32_t indexesSize = (int32_t)sizeof(indexes);

        if (length < size) {
            udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
            *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }

        // The total size comes from the (untrusted) file. It must at least
        // cover the indexes; this also keeps a negative value from reaching
        // the copy below, where it would become a huge unsigned byte count.
        if (size < indexesSize) {
            udata_printError(ds, "udict_swap(): total size %d is too small for dictionary data\n", size);
            *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }

        // Copy everything first; sections that need swapping are then
        // rewritten in the output.
        if (inBytes != outBytes) {
            uprv_memcpy(outBytes, inBytes, size);
        }

        ds->swapArray32(ds, inBytes, indexesSize, outBytes, pErrorCode);

        int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
        // The trie occupies the bytes from the end of the indexes up to the
        // start of the first reserved section.
        const int32_t trieStart = indexesSize;
        const int32_t trieLimit = indexes[DictionaryData::IX_RESERVED1_OFFSET];

        if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
            // Reject a trie range that is inverted or runs past the data
            // before handing it to the 16-bit swapper.
            if (trieLimit < trieStart || size < trieLimit) {
                udata_printError(ds, "udict_swap(): trie end offset %d is out of range for dictionary data\n", trieLimit);
                *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
                return 0;
            }
            ds->swapArray16(ds, inBytes + trieStart, trieLimit - trieStart, outBytes + trieStart, pErrorCode);
        } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
            // nothing to do
        } else {
            udata_printError(ds, "udict_swap(): unknown trie type!\n");
            *pErrorCode = U_UNSUPPORTED_ERROR;
            return 0;
        }

        // The two reserved sections that follow the trie (up to
        // IX_RESERVED2_OFFSET and IX_TOTAL_SIZE) are empty in the current
        // format, so there is nothing further to swap.
    }
    return headerSize + size;
}
#endif
Commit	Line	Data
51004dcb A	1	/*
51004dcb A	2	*******************************************************************************
57a6839d	3	* Copyright (C) 2014, International Business Machines
51004dcb A	4	* Corporation and others. All Rights Reserved.
	5	*******************************************************************************
	6	* dictionarydata.h
	7	*
	8	* created on: 2012may31
	9	* created by: Markus W. Scherer & Maxime Serrano
	10	*/
	11
	12	#include "dictionarydata.h"
	13	#include "unicode/ucharstrie.h"
	14	#include "unicode/bytestrie.h"
	15	#include "unicode/udata.h"
	16	#include "cmemory.h"
	17
	18	#if !UCONFIG_NO_BREAK_ITERATION
	19
	20	U_NAMESPACE_BEGIN
	21
57a6839d A	22	const int32_t DictionaryData::TRIE_TYPE_BYTES = 0;
	23	const int32_t DictionaryData::TRIE_TYPE_UCHARS = 1;
	24	const int32_t DictionaryData::TRIE_TYPE_MASK = 7;
	25	const int32_t DictionaryData::TRIE_HAS_VALUES = 8;
51004dcb	26
57a6839d A	27	const int32_t DictionaryData::TRANSFORM_NONE = 0;
	28	const int32_t DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000;
	29	const int32_t DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000;
	30	const int32_t DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff;
	31
51004dcb A	32	DictionaryMatcher::~DictionaryMatcher() {
	33	}
	34
	35	UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
	36	udata_close(file);
	37	}
	38
	39	int32_t UCharsDictionaryMatcher::getType() const {
	40	return DictionaryData::TRIE_TYPE_UCHARS;
	41	}
	42
	43	int32_t UCharsDictionaryMatcher::matches(UText text, int32_t maxLength, int32_t lengths, int32_t &count, int32_t limit, int32_t *values) const {
	44	UCharsTrie uct(characters);
	45	UChar32 c = utext_next32(text);
	46	if (c < 0) {
	47	return 0;
	48	}
	49	UStringTrieResult result = uct.first(c);
	50	int32_t numChars = 1;
	51	count = 0;
	52	for (;;) {
	53	if (USTRINGTRIE_HAS_VALUE(result)) {
	54	if (count < limit) {
	55	if (values != NULL) {
	56	values[count] = uct.getValue();
	57	}
	58	lengths[count++] = numChars;
	59	}
	60	if (result == USTRINGTRIE_FINAL_VALUE) {
	61	break;
	62	}
	63	}
	64	else if (result == USTRINGTRIE_NO_MATCH) {
	65	break;
	66	}
	67
	68	// TODO: why do we have a text limit if the UText knows its length?
	69	if (numChars >= maxLength) {
	70	break;
	71	}
	72
	73	c = utext_next32(text);
	74	if (c < 0) {
	75	break;
	76	}
	77	++numChars;
	78	result = uct.next(c);
	79	}
	80	return numChars;
	81	}
	82
	83	BytesDictionaryMatcher::~BytesDictionaryMatcher() {
	84	udata_close(file);
	85	}
	86
	87	UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
	88	if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) {
	89	if (c == 0x200D) {
	90	return 0xFF;
	91	} else if (c == 0x200C) {
	92	return 0xFE;
	93	}
	94	int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
	95	if (delta < 0 \|\| 0xFD < delta) {
96	return U_SENTINEL;
97	}
98	return (UChar32)delta;
99	}
100	return c;
101	}
102
103	int32_t BytesDictionaryMatcher::getType() const {
104	return DictionaryData::TRIE_TYPE_BYTES;
105	}
106
107	int32_t BytesDictionaryMatcher::matches(UText text, int32_t maxLength, int32_t lengths, int32_t &count, int32_t limit, int32_t *values) const {
108	BytesTrie bt(characters);
109	UChar32 c = utext_next32(text);
110	if (c < 0) {
111	return 0;
112	}
113	UStringTrieResult result = bt.first(transform(c));
114	int32_t numChars = 1;
115	count = 0;
116	for (;;) {
117	if (USTRINGTRIE_HAS_VALUE(result)) {
118	if (count < limit) {
119	if (values != NULL) {
120	values[count] = bt.getValue();
57a6839d	121	}
51004dcb A	122	lengths[count++] = numChars;
	123	}
	124	if (result == USTRINGTRIE_FINAL_VALUE) {
	125	break;
	126	}
	127	}
	128	else if (result == USTRINGTRIE_NO_MATCH) {
	129	break;
	130	}
	131
	132	// TODO: why do we have a text limit if the UText knows its length?
	133	if (numChars >= maxLength) {
	134	break;
	135	}
	136
	137	c = utext_next32(text);
	138	if (c < 0) {
	139	break;
	140	}
	141	++numChars;
	142	result = bt.next(transform(c));
	143	}
	144	return numChars;
	145	}
	146
	147
	148	U_NAMESPACE_END
	149
	150	U_NAMESPACE_USE
	151
	152	U_CAPI int32_t U_EXPORT2
	153	udict_swap(const UDataSwapper ds, const void inData, int32_t length,
	154	void outData, UErrorCode pErrorCode) {
	155	const UDataInfo *pInfo;
	156	int32_t headerSize;
	157	const uint8_t *inBytes;
	158	uint8_t *outBytes;
	159	const int32_t *inIndexes;
	160	int32_t indexes[DictionaryData::IX_COUNT];
	161	int32_t i, offset, size;
	162
	163	headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
	164	if (pErrorCode == NULL \|\| U_FAILURE(*pErrorCode)) return 0;
	165	pInfo = (const UDataInfo )((const char )inData + 4);
	166	if (!(pInfo->dataFormat[0] == 0x44 &&
	167	pInfo->dataFormat[1] == 0x69 &&
	168	pInfo->dataFormat[2] == 0x63 &&
	169	pInfo->dataFormat[3] == 0x74 &&
	170	pInfo->formatVersion[0] == 1)) {
	171	udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
	172	pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
	173	*pErrorCode = U_UNSUPPORTED_ERROR;
	174	return 0;
	175	}
	176
	177	inBytes = (const uint8_t *)inData + headerSize;
	178	outBytes = (uint8_t *)outData + headerSize;
	179
	180	inIndexes = (const int32_t *)inBytes;
	181	if (length >= 0) {
	182	length -= headerSize;
	183	if (length < (int32_t)(sizeof(indexes))) {
	184	udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
	185	*pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
186	return 0;
187	}
188	}
189
190	for (i = 0; i < DictionaryData::IX_COUNT; i++) {
191	indexes[i] = udata_readInt32(ds, inIndexes[i]);
192	}
193
194	size = indexes[DictionaryData::IX_TOTAL_SIZE];
195
196	if (length >= 0) {
197	if (length < size) {
198	udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
199	*pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
200	return 0;
201	}
202
203	if (inBytes != outBytes) {
204	uprv_memcpy(outBytes, inBytes, size);
205	}
206
207	offset = 0;
208	ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);
209	offset = (int32_t)sizeof(indexes);
210	int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
211	int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET];
212
213	if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
214	ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode);
215	} else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
216	// nothing to do
217	} else {
218	udata_printError(ds, "udict_swap(): unknown trie type!\n");
219	*pErrorCode = U_UNSUPPORTED_ERROR;
220	return 0;
221	}
222
223	// these next two sections are empty in the current format,
224	// but may be used later.
225	offset = nextOffset;
226	nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET];
227	offset = nextOffset;
228	nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE];
229	offset = nextOffset;
230	}
231	return headerSize + size;
232	}
233	#endif