[apple/icu.git] / icuSources / common / dictionarydata.cpp

/*
*******************************************************************************
* Copyright (C) 2012, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* dictionarydata.cpp
*
* created on: 2012may31
* created by: Markus W. Scherer & Maxime Serrano
*/

#include "dictionarydata.h"
#include "unicode/ucharstrie.h"
#include "unicode/bytestrie.h"
#include "unicode/udata.h"
#include "cmemory.h"

#if !UCONFIG_NO_BREAK_ITERATION

U_NAMESPACE_BEGIN

// Out-of-class definitions of DictionaryData's trie-type constants; no
// initializer is given here.
#ifndef CYGWINMSVC /* Skipped on Cygwin/MSVC, where these definitions cause a symbol-redefinition error. */
const int32_t DictionaryData::TRIE_TYPE_BYTES;
const int32_t DictionaryData::TRIE_TYPE_UCHARS;
#endif

// Out-of-line destructor; the body is empty, so nothing is released here.
DictionaryMatcher::~DictionaryMatcher() {
}

// Closes the data handle stored in `file` by passing it to udata_close().
UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
    udata_close(file);
}

// Reports the trie kind of this matcher: always DictionaryData::TRIE_TYPE_UCHARS.
int32_t UCharsDictionaryMatcher::getType() const {
    return DictionaryData::TRIE_TYPE_UCHARS;
}

/**
 * Feeds code points read from text into a UCharsTrie built over `characters`
 * and records every prefix for which the trie reports a value.
 *
 * @param text      Input; read forward with utext_next32() from its current
 *                  position. The iterator is left wherever reading stopped.
 * @param maxLength Upper bound on the number of code points to consume. The
 *                  first code point is always read before this is checked.
 * @param lengths   Output array; lengths[k] receives the number of code points
 *                  in the k-th recorded match.
 * @param count     Output; number of matches recorded. Always set, including
 *                  when text has no more input.
 * @param limit     Maximum number of matches to record in lengths (and values).
 *                  Further matches are still walked but not stored.
 * @param values    Optional output array parallel to lengths; receives the
 *                  trie value of each recorded match. May be NULL.
 * @return Number of code points successfully read from text (0 if text was
 *         already exhausted).
 */
int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const {
    // Reset the out-parameter before any early return so the caller never
    // observes a stale or uninitialized match count.
    count = 0;
    UCharsTrie uct(characters);
    UChar32 c = utext_next32(text);
    if (c < 0) {
        // No input available: nothing consumed, nothing matched.
        return 0;
    }
    UStringTrieResult result = uct.first(c);
    int32_t numChars = 1;
    for (;;) {
        if (USTRINGTRIE_HAS_VALUE(result)) {
            // Record the match only while there is room in the output arrays.
            if (count < limit) {
                if (values != NULL) {
                    values[count] = uct.getValue();
                }
                lengths[count++] = numChars;
            }
            if (result == USTRINGTRIE_FINAL_VALUE) {
                break;
            }
        }
        else if (result == USTRINGTRIE_NO_MATCH) {
            break;
        }

        // TODO: why do we have a text limit if the UText knows its length?
        if (numChars >= maxLength) {
            break;
        }

        c = utext_next32(text);
        if (c < 0) {
            // Ran out of text; numChars is not incremented for the failed read.
            break;
        }
        ++numChars;
        result = uct.next(c);
    }
    return numChars;
}

// Closes the data handle stored in `file` by passing it to udata_close().
BytesDictionaryMatcher::~BytesDictionaryMatcher() {
    udata_close(file);
}

/**
 * Maps a code point to the unit that is fed to the bytes trie.
 *
 * When the transform type stored in transformConstant is not
 * TRANSFORM_TYPE_OFFSET, the code point is returned unchanged. Otherwise:
 *   - U+200D maps to 0xFF and U+200C maps to 0xFE;
 *   - any other code point maps to its distance from the offset stored in
 *     transformConstant, provided that distance lies in 0..0xFD;
 *   - everything else yields U_SENTINEL.
 *
 * @param c Code point to map.
 * @return The mapped unit, c itself, or U_SENTINEL as described above.
 */
UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
    // Any transform type other than "offset" is a pass-through.
    if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) != DictionaryData::TRANSFORM_TYPE_OFFSET) {
        return c;
    }

    // Two code points get fixed slots at the top of the byte range.
    switch (c) {
    case 0x200D:
        return 0xFF;
    case 0x200C:
        return 0xFE;
    default:
        break;
    }

    // Everything else must land within 0..0xFD relative to the stored offset.
    int32_t shifted = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
    return (0 <= shifted && shifted <= 0xFD) ? (UChar32)shifted : U_SENTINEL;
}

// Reports the trie kind of this matcher: always DictionaryData::TRIE_TYPE_BYTES.
int32_t BytesDictionaryMatcher::getType() const {
    return DictionaryData::TRIE_TYPE_BYTES;
}

/**
 * Feeds code points read from text, each mapped through transform(), into a
 * BytesTrie built over `characters` and records every prefix for which the
 * trie reports a value.
 *
 * @param text      Input; read forward with utext_next32() from its current
 *                  position. The iterator is left wherever reading stopped.
 * @param maxLength Upper bound on the number of code points to consume. The
 *                  first code point is always read before this is checked.
 * @param lengths   Output array; lengths[k] receives the number of code points
 *                  in the k-th recorded match.
 * @param count     Output; number of matches recorded. Always set, including
 *                  when text has no more input.
 * @param limit     Maximum number of matches to record in lengths (and values).
 *                  Further matches are still walked but not stored.
 * @param values    Optional output array parallel to lengths; receives the
 *                  trie value of each recorded match. May be NULL.
 * @return Number of code points successfully read from text (0 if text was
 *         already exhausted).
 */
int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, int32_t limit, int32_t *values) const {
    // Reset the out-parameter before any early return so the caller never
    // observes a stale or uninitialized match count.
    count = 0;
    BytesTrie bt(characters);
    UChar32 c = utext_next32(text);
    if (c < 0) {
        // No input available: nothing consumed, nothing matched.
        return 0;
    }
    UStringTrieResult result = bt.first(transform(c));
    int32_t numChars = 1;
    for (;;) {
        if (USTRINGTRIE_HAS_VALUE(result)) {
            // Record the match only while there is room in the output arrays.
            if (count < limit) {
                if (values != NULL) {
                    values[count] = bt.getValue();
                }
                lengths[count++] = numChars;
            }
            if (result == USTRINGTRIE_FINAL_VALUE) {
                break;
            }
        }
        else if (result == USTRINGTRIE_NO_MATCH) {
            break;
        }

        // TODO: why do we have a text limit if the UText knows its length?
        if (numChars >= maxLength) {
            break;
        }

        c = utext_next32(text);
        if (c < 0) {
            // Ran out of text; numChars is not incremented for the failed read.
            break;
        }
        ++numChars;
        result = bt.next(transform(c));
    }
    return numChars;
}


U_NAMESPACE_END

U_NAMESPACE_USE

/**
 * Swaps a dictionary data item using the conversions configured in ds.
 *
 * The data header is swapped first via udata_swapDataHeader(). The payload
 * after the header starts with DictionaryData::IX_COUNT int32 indexes,
 * followed by the trie; offsets stored in the indexes are measured in bytes
 * from the start of the indexes array.
 *
 * @param ds         Swapper supplying the read/swap/print callbacks.
 * @param inData     Start of the input data item, header included.
 * @param length     Number of input bytes, or negative to only compute the
 *                   size (no output is written in that case).
 * @param outData    Destination; may be the same buffer as inData.
 * @param pErrorCode In/out error code. Set to U_UNSUPPORTED_ERROR for an
 *                   unrecognized data format or trie type, and to
 *                   U_INDEX_OUTOFBOUNDS_ERROR when the input is too short or
 *                   its stored sizes/offsets are inconsistent.
 * @return Header size plus the total payload size read from the indexes, or 0
 *         on failure.
 */
U_CAPI int32_t U_EXPORT2
udict_swap(const UDataSwapper *ds, const void *inData, int32_t length,
           void *outData, UErrorCode *pErrorCode) {
    const UDataInfo *pInfo;
    int32_t headerSize;
    const uint8_t *inBytes;
    uint8_t *outBytes;
    const int32_t *inIndexes;
    int32_t indexes[DictionaryData::IX_COUNT];
    int32_t i, size;

    headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
    if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0;

    // The UDataInfo structure sits 4 bytes into the header.
    // The expected dataFormat bytes spell "Dict" in ASCII; only format version 1 is handled.
    pInfo = (const UDataInfo *)((const char *)inData + 4);
    if (!(pInfo->dataFormat[0] == 0x44 && 
          pInfo->dataFormat[1] == 0x69 && 
          pInfo->dataFormat[2] == 0x63 && 
          pInfo->dataFormat[3] == 0x74 && 
          pInfo->formatVersion[0] == 1)) {
        udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
                         pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
        *pErrorCode = U_UNSUPPORTED_ERROR;
        return 0;
    }

    inBytes = (const uint8_t *)inData + headerSize;
    outBytes = (uint8_t *)outData + headerSize;

    inIndexes = (const int32_t *)inBytes;
    if (length >= 0) {
        // From here on, length counts only the bytes after the header.
        length -= headerSize;
        if (length < (int32_t)(sizeof(indexes))) {
            udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
            *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }
    }

    // Read the indexes in the input's byte order so they can be interpreted here.
    for (i = 0; i < DictionaryData::IX_COUNT; i++) {
        indexes[i] = udata_readInt32(ds, inIndexes[i]);
    }

    size = indexes[DictionaryData::IX_TOTAL_SIZE];

    // The total size covers the indexes themselves. A smaller (or negative)
    // value is corrupt and must not reach the copy or swap calls below.
    if (size < (int32_t)sizeof(indexes)) {
        udata_printError(ds, "udict_swap(): total size %d is smaller than the dictionary indexes\n", size);
        *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
        return 0;
    }

    if (length >= 0) {
        if (length < size) {
            udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
            *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }

        const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
        const int32_t trieStart = (int32_t)sizeof(indexes);
        const int32_t trieLimit = indexes[DictionaryData::IX_RESERVED1_OFFSET];

        // Validate everything before touching the output, so that a rejected
        // item never leaves a partially swapped (possibly in-place) buffer.
        if (trieType != DictionaryData::TRIE_TYPE_UCHARS && trieType != DictionaryData::TRIE_TYPE_BYTES) {
            udata_printError(ds, "udict_swap(): unknown trie type!\n");
            *pErrorCode = U_UNSUPPORTED_ERROR;
            return 0;
        }
        if (trieType == DictionaryData::TRIE_TYPE_UCHARS && (trieLimit < trieStart || size < trieLimit)) {
            udata_printError(ds, "udict_swap(): trie end offset %d is outside the dictionary data (size %d)\n", trieLimit, size);
            *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }

        // Copy everything first; the pieces that need swapping are then
        // overwritten in the output.
        if (inBytes != outBytes) {
            uprv_memcpy(outBytes, inBytes, size);
        }

        // The indexes are an array of 32-bit integers.
        ds->swapArray32(ds, inBytes, trieStart, outBytes, pErrorCode);

        // A UChars trie is an array of 16-bit units; a bytes trie needs no swapping.
        if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
            ds->swapArray16(ds, inBytes + trieStart, trieLimit - trieStart, outBytes + trieStart, pErrorCode);
        }

        // The two sections after the trie (ending at IX_RESERVED2_OFFSET and
        // IX_TOTAL_SIZE) are empty in the current format, but may be used later.
    }
    return headerSize + size;
}
#endif
Commit	Line	Data
51004dcb A	1	/*
	2	*******************************************************************************
	3	* Copyright (C) 2012, International Business Machines
	4	* Corporation and others. All Rights Reserved.
	5	*******************************************************************************
	6	* dictionarydata.h
	7	*
	8	* created on: 2012may31
	9	* created by: Markus W. Scherer & Maxime Serrano
	10	*/
	11
	12	#include "dictionarydata.h"
	13	#include "unicode/ucharstrie.h"
	14	#include "unicode/bytestrie.h"
	15	#include "unicode/udata.h"
	16	#include "cmemory.h"
	17
	18	#if !UCONFIG_NO_BREAK_ITERATION
	19
	20	U_NAMESPACE_BEGIN
	21
	22	#ifndef CYGWINMSVC /* On Cygwin/MSVC, the error redefinition of symbols occurs.*/
	23	const int32_t DictionaryData::TRIE_TYPE_BYTES;
	24	const int32_t DictionaryData::TRIE_TYPE_UCHARS;
	25	#endif
	26
	27	DictionaryMatcher::~DictionaryMatcher() {
	28	}
	29
	30	UCharsDictionaryMatcher::~UCharsDictionaryMatcher() {
	31	udata_close(file);
	32	}
	33
	34	int32_t UCharsDictionaryMatcher::getType() const {
	35	return DictionaryData::TRIE_TYPE_UCHARS;
	36	}
	37
	38	int32_t UCharsDictionaryMatcher::matches(UText text, int32_t maxLength, int32_t lengths, int32_t &count, int32_t limit, int32_t *values) const {
	39	UCharsTrie uct(characters);
	40	UChar32 c = utext_next32(text);
	41	if (c < 0) {
	42	return 0;
	43	}
	44	UStringTrieResult result = uct.first(c);
	45	int32_t numChars = 1;
	46	count = 0;
	47	for (;;) {
	48	if (USTRINGTRIE_HAS_VALUE(result)) {
	49	if (count < limit) {
	50	if (values != NULL) {
	51	values[count] = uct.getValue();
	52	}
	53	lengths[count++] = numChars;
	54	}
	55	if (result == USTRINGTRIE_FINAL_VALUE) {
	56	break;
	57	}
	58	}
	59	else if (result == USTRINGTRIE_NO_MATCH) {
	60	break;
	61	}
	62
	63	// TODO: why do we have a text limit if the UText knows its length?
	64	if (numChars >= maxLength) {
65	break;
66	}
67
68	c = utext_next32(text);
69	if (c < 0) {
70	break;
71	}
72	++numChars;
73	result = uct.next(c);
74	}
75	return numChars;
76	}
77
78	BytesDictionaryMatcher::~BytesDictionaryMatcher() {
79	udata_close(file);
80	}
81
82	UChar32 BytesDictionaryMatcher::transform(UChar32 c) const {
83	if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) {
84	if (c == 0x200D) {
85	return 0xFF;
86	} else if (c == 0x200C) {
87	return 0xFE;
88	}
89	int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK);
90	if (delta < 0 \|\| 0xFD < delta) {
91	return U_SENTINEL;
92	}
93	return (UChar32)delta;
94	}
95	return c;
96	}
97
98	int32_t BytesDictionaryMatcher::getType() const {
99	return DictionaryData::TRIE_TYPE_BYTES;
100	}
101
102	int32_t BytesDictionaryMatcher::matches(UText text, int32_t maxLength, int32_t lengths, int32_t &count, int32_t limit, int32_t *values) const {
103	BytesTrie bt(characters);
104	UChar32 c = utext_next32(text);
105	if (c < 0) {
106	return 0;
107	}
108	UStringTrieResult result = bt.first(transform(c));
109	int32_t numChars = 1;
110	count = 0;
111	for (;;) {
112	if (USTRINGTRIE_HAS_VALUE(result)) {
113	if (count < limit) {
114	if (values != NULL) {
115	values[count] = bt.getValue();
116	}
117	lengths[count++] = numChars;
118	}
119	if (result == USTRINGTRIE_FINAL_VALUE) {
120	break;
121	}
122	}
123	else if (result == USTRINGTRIE_NO_MATCH) {
124	break;
125	}
126
127	// TODO: why do we have a text limit if the UText knows its length?
128	if (numChars >= maxLength) {
129	break;
130	}
131
132	c = utext_next32(text);
133	if (c < 0) {
134	break;
135	}
136	++numChars;
137	result = bt.next(transform(c));
138	}
139	return numChars;
140	}
141
142
143	U_NAMESPACE_END
144
145	U_NAMESPACE_USE
146
147	U_CAPI int32_t U_EXPORT2
148	udict_swap(const UDataSwapper ds, const void inData, int32_t length,
149	void outData, UErrorCode pErrorCode) {
150	const UDataInfo *pInfo;
151	int32_t headerSize;
152	const uint8_t *inBytes;
153	uint8_t *outBytes;
154	const int32_t *inIndexes;
155	int32_t indexes[DictionaryData::IX_COUNT];
156	int32_t i, offset, size;
157
158	headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
159	if (pErrorCode == NULL \|\| U_FAILURE(*pErrorCode)) return 0;
160	pInfo = (const UDataInfo )((const char )inData + 4);
161	if (!(pInfo->dataFormat[0] == 0x44 &&
162	pInfo->dataFormat[1] == 0x69 &&
163	pInfo->dataFormat[2] == 0x63 &&
164	pInfo->dataFormat[3] == 0x74 &&
165	pInfo->formatVersion[0] == 1)) {
166	udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n",
167	pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]);
168	*pErrorCode = U_UNSUPPORTED_ERROR;
169	return 0;
170	}
171
172	inBytes = (const uint8_t *)inData + headerSize;
173	outBytes = (uint8_t *)outData + headerSize;
174
175	inIndexes = (const int32_t *)inBytes;
176	if (length >= 0) {
177	length -= headerSize;
178	if (length < (int32_t)(sizeof(indexes))) {
179	udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length);
180	*pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
181	return 0;
182	}
183	}
184
185	for (i = 0; i < DictionaryData::IX_COUNT; i++) {
186	indexes[i] = udata_readInt32(ds, inIndexes[i]);
187	}
188
189	size = indexes[DictionaryData::IX_TOTAL_SIZE];
190
191	if (length >= 0) {
192	if (length < size) {
193	udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length);
194	*pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
195	return 0;
196	}
197
198	if (inBytes != outBytes) {
199	uprv_memcpy(outBytes, inBytes, size);
200	}
201
202	offset = 0;
203	ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode);
204	offset = (int32_t)sizeof(indexes);
205	int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;
206	int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET];
207
208	if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
209	ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode);
210	} else if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
211	// nothing to do
212	} else {
213	udata_printError(ds, "udict_swap(): unknown trie type!\n");
214	*pErrorCode = U_UNSUPPORTED_ERROR;
215	return 0;
216	}
217
218	// these next two sections are empty in the current format,
219	// but may be used later.
220	offset = nextOffset;
221	nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET];
222	offset = nextOffset;
223	nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE];
224	offset = nextOffset;
225	}
226	return headerSize + size;
227	}
228	#endif