]>
Commit | Line | Data |
---|---|---|
729e4ab9 A |
1 | /* |
2 | ********************************************************************** | |
3 | * Copyright (C) 2008-2011, International Business Machines | |
4 | * Corporation and others. All Rights Reserved. | |
5 | ********************************************************************** | |
6 | */ | |
7 | ||
8 | #include "unicode/utypes.h" | |
9 | #include "unicode/uspoof.h" | |
10 | #include "unicode/unorm.h" | |
11 | #include "unicode/uchar.h" | |
12 | #include "unicode/uniset.h" | |
13 | #include "utrie2.h" | |
14 | #include "cmemory.h" | |
15 | #include "cstring.h" | |
16 | #include "udatamem.h" | |
17 | #include "umutex.h" | |
18 | #include "udataswp.h" | |
19 | #include "uassert.h" | |
20 | #include "uspoof_impl.h" | |
21 | ||
22 | #if !UCONFIG_NO_NORMALIZATION | |
23 | ||
24 | ||
25 | U_NAMESPACE_BEGIN | |
26 | ||
27 | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl) | |
28 | ||
29 | SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode &status) : | |
30 | fMagic(0), fSpoofData(NULL), fAllowedCharsSet(NULL) , fAllowedLocales(NULL) { | |
31 | if (U_FAILURE(status)) { | |
32 | return; | |
33 | } | |
34 | fMagic = USPOOF_MAGIC; | |
35 | fSpoofData = data; | |
36 | fChecks = USPOOF_ALL_CHECKS; | |
37 | UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff); | |
38 | if (allowedCharsSet == NULL) { | |
39 | status = U_MEMORY_ALLOCATION_ERROR; | |
40 | } | |
41 | allowedCharsSet->freeze(); | |
42 | fAllowedCharsSet = allowedCharsSet; | |
43 | fAllowedLocales = uprv_strdup(""); | |
44 | } | |
45 | ||
46 | ||
47 | SpoofImpl::SpoofImpl() { | |
48 | fMagic = USPOOF_MAGIC; | |
49 | fSpoofData = NULL; | |
50 | fChecks = USPOOF_ALL_CHECKS; | |
51 | UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff); | |
52 | allowedCharsSet->freeze(); | |
53 | fAllowedCharsSet = allowedCharsSet; | |
54 | fAllowedLocales = uprv_strdup(""); | |
55 | } | |
56 | ||
57 | ||
58 | // Copy Constructor, used by the user level clone() function. | |
59 | SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) : | |
60 | fMagic(0), fSpoofData(NULL), fAllowedCharsSet(NULL) { | |
61 | if (U_FAILURE(status)) { | |
62 | return; | |
63 | } | |
64 | fMagic = src.fMagic; | |
65 | fChecks = src.fChecks; | |
66 | if (src.fSpoofData != NULL) { | |
67 | fSpoofData = src.fSpoofData->addReference(); | |
68 | } | |
69 | fCheckMask = src.fCheckMask; | |
70 | fAllowedCharsSet = static_cast<const UnicodeSet *>(src.fAllowedCharsSet->clone()); | |
71 | if (fAllowedCharsSet == NULL) { | |
72 | status = U_MEMORY_ALLOCATION_ERROR; | |
73 | } | |
74 | fAllowedLocales = uprv_strdup(src.fAllowedLocales); | |
75 | } | |
76 | ||
77 | SpoofImpl::~SpoofImpl() { | |
78 | fMagic = 0; // head off application errors by preventing use of | |
79 | // of deleted objects. | |
80 | if (fSpoofData != NULL) { | |
81 | fSpoofData->removeReference(); // Will delete if refCount goes to zero. | |
82 | } | |
83 | delete fAllowedCharsSet; | |
84 | uprv_free((void *)fAllowedLocales); | |
85 | } | |
86 | ||
87 | // | |
88 | // Incoming parameter check on Status and the SpoofChecker object | |
89 | // received from the C API. | |
90 | // | |
91 | const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &status) { | |
92 | if (U_FAILURE(status)) { | |
93 | return NULL; | |
94 | } | |
95 | if (sc == NULL) { | |
96 | status = U_ILLEGAL_ARGUMENT_ERROR; | |
97 | return NULL; | |
98 | }; | |
99 | SpoofImpl *This = (SpoofImpl *)sc; | |
100 | if (This->fMagic != USPOOF_MAGIC || | |
101 | This->fSpoofData == NULL) { | |
102 | status = U_INVALID_FORMAT_ERROR; | |
103 | return NULL; | |
104 | } | |
105 | if (!SpoofData::validateDataVersion(This->fSpoofData->fRawData, status)) { | |
106 | return NULL; | |
107 | } | |
108 | return This; | |
109 | } | |
110 | ||
111 | SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) { | |
112 | return const_cast<SpoofImpl *> | |
113 | (SpoofImpl::validateThis(const_cast<const USpoofChecker *>(sc), status)); | |
114 | } | |
115 | ||
116 | ||
117 | ||
118 | //-------------------------------------------------------------------------------------- | |
119 | // | |
120 | // confusableLookup() This is the heart of the confusable skeleton generation | |
121 | // implementation. | |
122 | // | |
123 | // Given a source character, produce the corresponding | |
124 | // replacement character(s) | |
125 | // | |
126 | //--------------------------------------------------------------------------------------- | |
127 | int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UChar *destBuf) const { | |
128 | ||
129 | // Binary search the spoof data key table for the inChar | |
130 | int32_t *low = fSpoofData->fCFUKeys; | |
131 | int32_t *mid = NULL; | |
132 | int32_t *limit = low + fSpoofData->fRawData->fCFUKeysSize; | |
133 | UChar32 midc; | |
134 | do { | |
135 | int32_t delta = ((int32_t)(limit-low))/2; | |
136 | mid = low + delta; | |
137 | midc = *mid & 0x1fffff; | |
138 | if (inChar == midc) { | |
139 | goto foundChar; | |
140 | } else if (inChar < midc) { | |
141 | limit = mid; | |
142 | } else { | |
143 | low = mid; | |
144 | } | |
145 | } while (low < limit-1); | |
146 | mid = low; | |
147 | midc = *mid & 0x1fffff; | |
148 | if (inChar != midc) { | |
149 | // Char not found. It maps to itself. | |
150 | int i = 0; | |
151 | U16_APPEND_UNSAFE(destBuf, i, inChar) | |
152 | return i; | |
153 | } | |
154 | foundChar: | |
155 | int32_t keyFlags = *mid & 0xff000000; | |
156 | if ((keyFlags & tableMask) == 0) { | |
157 | // We found the right key char, but the entry doesn't pertain to the | |
158 | // table we need. See if there is an adjacent key that does | |
159 | if (keyFlags & USPOOF_KEY_MULTIPLE_VALUES) { | |
160 | int32_t *altMid; | |
161 | for (altMid = mid-1; (*altMid&0x00ffffff) == inChar; altMid--) { | |
162 | keyFlags = *altMid & 0xff000000; | |
163 | if (keyFlags & tableMask) { | |
164 | mid = altMid; | |
165 | goto foundKey; | |
166 | } | |
167 | } | |
168 | for (altMid = mid+1; (*altMid&0x00ffffff) == inChar; altMid++) { | |
169 | keyFlags = *altMid & 0xff000000; | |
170 | if (keyFlags & tableMask) { | |
171 | mid = altMid; | |
172 | goto foundKey; | |
173 | } | |
174 | } | |
175 | } | |
176 | // No key entry for this char & table. | |
177 | // The input char maps to itself. | |
178 | int i = 0; | |
179 | U16_APPEND_UNSAFE(destBuf, i, inChar) | |
180 | return i; | |
181 | } | |
182 | ||
183 | foundKey: | |
184 | int32_t stringLen = USPOOF_KEY_LENGTH_FIELD(keyFlags) + 1; | |
185 | int32_t keyTableIndex = (int32_t)(mid - fSpoofData->fCFUKeys); | |
186 | ||
187 | // Value is either a UChar (for strings of length 1) or | |
188 | // an index into the string table (for longer strings) | |
189 | uint16_t value = fSpoofData->fCFUValues[keyTableIndex]; | |
190 | if (stringLen == 1) { | |
191 | destBuf[0] = value; | |
192 | return 1; | |
193 | } | |
194 | ||
195 | // String length of 4 from the above lookup is used for all strings of length >= 4. | |
196 | // For these, get the real length from the string lengths table, | |
197 | // which maps string table indexes to lengths. | |
198 | // All strings of the same length are stored contiguously in the string table. | |
199 | // 'value' from the lookup above is the starting index for the desired string. | |
200 | ||
201 | int32_t ix; | |
202 | if (stringLen == 4) { | |
203 | int32_t stringLengthsLimit = fSpoofData->fRawData->fCFUStringLengthsSize; | |
204 | for (ix = 0; ix < stringLengthsLimit; ix++) { | |
205 | if (fSpoofData->fCFUStringLengths[ix].fLastString >= value) { | |
206 | stringLen = fSpoofData->fCFUStringLengths[ix].fStrLength; | |
207 | break; | |
208 | } | |
209 | } | |
210 | U_ASSERT(ix < stringLengthsLimit); | |
211 | } | |
212 | ||
213 | U_ASSERT(value + stringLen <= fSpoofData->fRawData->fCFUStringTableLen); | |
214 | UChar *src = &fSpoofData->fCFUStrings[value]; | |
215 | for (ix=0; ix<stringLen; ix++) { | |
216 | destBuf[ix] = src[ix]; | |
217 | } | |
218 | return stringLen; | |
219 | } | |
220 | ||
221 | ||
222 | //--------------------------------------------------------------------------------------- | |
223 | // | |
224 | // wholeScriptCheck() | |
225 | // | |
226 | // Input text is already normalized to NFD | |
227 | // Return the set of scripts, each of which can represent something that is | |
228 | // confusable with the input text. The script of the input text | |
229 | // is included; input consisting of characters from a single script will | |
230 | // always produce a result consisting of a set containing that script. | |
231 | // | |
232 | //--------------------------------------------------------------------------------------- | |
233 | void SpoofImpl::wholeScriptCheck( | |
234 | const UChar *text, int32_t length, ScriptSet *result, UErrorCode &status) const { | |
235 | ||
236 | int32_t inputIdx = 0; | |
237 | UChar32 c; | |
238 | ||
239 | UTrie2 *table = | |
240 | (fChecks & USPOOF_ANY_CASE) ? fSpoofData->fAnyCaseTrie : fSpoofData->fLowerCaseTrie; | |
241 | result->setAll(); | |
242 | while (inputIdx < length) { | |
243 | U16_NEXT(text, inputIdx, length, c); | |
244 | uint32_t index = utrie2_get32(table, c); | |
245 | if (index == 0) { | |
246 | // No confusables in another script for this char. | |
247 | // TODO: we should change the data to have sets with just the single script | |
248 | // bit for the script of this char. Gets rid of this special case. | |
249 | // Until then, grab the script from the char and intersect it with the set. | |
250 | UScriptCode cpScript = uscript_getScript(c, &status); | |
251 | U_ASSERT(cpScript > USCRIPT_INHERITED); | |
252 | result->intersect(cpScript); | |
253 | } else if (index == 1) { | |
254 | // Script == Common or Inherited. Nothing to do. | |
255 | } else { | |
256 | result->intersect(fSpoofData->fScriptSets[index]); | |
257 | } | |
258 | } | |
259 | } | |
260 | ||
261 | ||
262 | void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) { | |
263 | UnicodeSet allowedChars; | |
264 | UnicodeSet *tmpSet = NULL; | |
265 | const char *locStart = localesList; | |
266 | const char *locEnd = NULL; | |
267 | const char *localesListEnd = localesList + uprv_strlen(localesList); | |
268 | int32_t localeListCount = 0; // Number of locales provided by caller. | |
269 | ||
270 | // Loop runs once per locale from the localesList, a comma separated list of locales. | |
271 | do { | |
272 | locEnd = uprv_strchr(locStart, ','); | |
273 | if (locEnd == NULL) { | |
274 | locEnd = localesListEnd; | |
275 | } | |
276 | while (*locStart == ' ') { | |
277 | locStart++; | |
278 | } | |
279 | const char *trimmedEnd = locEnd-1; | |
280 | while (trimmedEnd > locStart && *trimmedEnd == ' ') { | |
281 | trimmedEnd--; | |
282 | } | |
283 | if (trimmedEnd <= locStart) { | |
284 | break; | |
285 | } | |
286 | const char *locale = uprv_strndup(locStart, (int32_t)(trimmedEnd + 1 - locStart)); | |
287 | localeListCount++; | |
288 | ||
289 | // We have one locale from the locales list. | |
290 | // Add the script chars for this locale to the accumulating set of allowed chars. | |
291 | // If the locale is no good, we will be notified back via status. | |
292 | addScriptChars(locale, &allowedChars, status); | |
293 | uprv_free((void *)locale); | |
294 | if (U_FAILURE(status)) { | |
295 | break; | |
296 | } | |
297 | locStart = locEnd + 1; | |
298 | } while (locStart < localesListEnd); | |
299 | ||
300 | // If our caller provided an empty list of locales, we disable the allowed characters checking | |
301 | if (localeListCount == 0) { | |
302 | uprv_free((void *)fAllowedLocales); | |
303 | fAllowedLocales = uprv_strdup(""); | |
304 | tmpSet = new UnicodeSet(0, 0x10ffff); | |
305 | if (fAllowedLocales == NULL || tmpSet == NULL) { | |
306 | status = U_MEMORY_ALLOCATION_ERROR; | |
307 | return; | |
308 | } | |
309 | tmpSet->freeze(); | |
310 | delete fAllowedCharsSet; | |
311 | fAllowedCharsSet = tmpSet; | |
312 | fCheckMask &= ~USPOOF_CHAR_LIMIT; | |
313 | return; | |
314 | } | |
315 | ||
316 | ||
317 | // Add all common and inherited characters to the set of allowed chars. | |
318 | UnicodeSet tempSet; | |
319 | tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status); | |
320 | allowedChars.addAll(tempSet); | |
321 | tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status); | |
322 | allowedChars.addAll(tempSet); | |
323 | ||
324 | // If anything went wrong, we bail out without changing | |
325 | // the state of the spoof checker. | |
326 | if (U_FAILURE(status)) { | |
327 | return; | |
328 | } | |
329 | ||
330 | // Store the updated spoof checker state. | |
331 | tmpSet = static_cast<UnicodeSet *>(allowedChars.clone()); | |
332 | const char *tmpLocalesList = uprv_strdup(localesList); | |
333 | if (tmpSet == NULL || tmpLocalesList == NULL) { | |
334 | status = U_MEMORY_ALLOCATION_ERROR; | |
335 | return; | |
336 | } | |
337 | uprv_free((void *)fAllowedLocales); | |
338 | fAllowedLocales = tmpLocalesList; | |
339 | tmpSet->freeze(); | |
340 | delete fAllowedCharsSet; | |
341 | fAllowedCharsSet = tmpSet; | |
342 | fCheckMask |= USPOOF_CHAR_LIMIT; | |
343 | } | |
344 | ||
345 | ||
346 | const char * SpoofImpl::getAllowedLocales(UErrorCode &/*status*/) { | |
347 | return fAllowedLocales; | |
348 | } | |
349 | ||
350 | ||
351 | // Given a locale (a language), add all the characters from all of the scripts used with that language | |
352 | // to the allowedChars UnicodeSet | |
353 | ||
354 | void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) { | |
355 | UScriptCode scripts[30]; | |
356 | ||
357 | int32_t numScripts = uscript_getCode(locale, scripts, sizeof(scripts)/sizeof(UScriptCode), &status); | |
358 | if (U_FAILURE(status)) { | |
359 | return; | |
360 | } | |
361 | if (status == U_USING_DEFAULT_WARNING) { | |
362 | status = U_ILLEGAL_ARGUMENT_ERROR; | |
363 | return; | |
364 | } | |
365 | UnicodeSet tmpSet; | |
366 | int32_t i; | |
367 | for (i=0; i<numScripts; i++) { | |
368 | tmpSet.applyIntPropertyValue(UCHAR_SCRIPT, scripts[i], status); | |
369 | allowedChars->addAll(tmpSet); | |
370 | } | |
371 | } | |
372 | ||
373 | ||
374 | int32_t SpoofImpl::scriptScan | |
375 | (const UChar *text, int32_t length, int32_t &pos, UErrorCode &status) const { | |
376 | if (U_FAILURE(status)) { | |
377 | return 0; | |
378 | } | |
379 | int32_t inputIdx = 0; | |
380 | UChar32 c; | |
381 | int32_t scriptCount = 0; | |
382 | UScriptCode lastScript = USCRIPT_INVALID_CODE; | |
383 | UScriptCode sc = USCRIPT_INVALID_CODE; | |
384 | while ((inputIdx < length || length == -1) && scriptCount < 2) { | |
385 | U16_NEXT(text, inputIdx, length, c); | |
386 | if (c == 0 && length == -1) { | |
387 | break; | |
388 | } | |
389 | sc = uscript_getScript(c, &status); | |
390 | if (sc == USCRIPT_COMMON || sc == USCRIPT_INHERITED || sc == USCRIPT_UNKNOWN) { | |
391 | continue; | |
392 | } | |
393 | if (sc != lastScript) { | |
394 | scriptCount++; | |
395 | lastScript = sc; | |
396 | } | |
397 | } | |
398 | if (scriptCount == 2) { | |
399 | pos = inputIdx; | |
400 | } | |
401 | return scriptCount; | |
402 | } | |
403 | ||
404 | ||
405 | // Convert a text format hex number. Utility function used by builder code. Static. | |
406 | // Input: UChar *string text. Output: a UChar32 | |
407 | // Input has been pre-checked, and will have no non-hex chars. | |
408 | // The number must fall in the code point range of 0..0x10ffff | |
409 | // Static Function. | |
410 | UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status) { | |
411 | if (U_FAILURE(status)) { | |
412 | return 0; | |
413 | } | |
414 | U_ASSERT(limit-start > 0); | |
415 | uint32_t val = 0; | |
416 | int i; | |
417 | for (i=start; i<limit; i++) { | |
418 | int digitVal = s[i] - 0x30; | |
419 | if (digitVal>9) { | |
420 | digitVal = 0xa + (s[i] - 0x41); // Upper Case 'A' | |
421 | } | |
422 | if (digitVal>15) { | |
423 | digitVal = 0xa + (s[i] - 0x61); // Lower Case 'a' | |
424 | } | |
425 | U_ASSERT(digitVal <= 0xf); | |
426 | val <<= 4; | |
427 | val += digitVal; | |
428 | } | |
429 | if (val > 0x10ffff) { | |
430 | status = U_PARSE_ERROR; | |
431 | val = 0; | |
432 | } | |
433 | return (UChar32)val; | |
434 | } | |
435 | ||
436 | ||
437 | ||
438 | //---------------------------------------------------------------------------------------------- | |
439 | // | |
440 | // class SpoofData Implementation | |
441 | // | |
442 | //---------------------------------------------------------------------------------------------- | |
443 | ||
444 | ||
445 | UBool SpoofData::validateDataVersion(const SpoofDataHeader *rawData, UErrorCode &status) { | |
446 | if (U_FAILURE(status) || | |
447 | rawData == NULL || | |
448 | rawData->fMagic != USPOOF_MAGIC || | |
449 | rawData->fFormatVersion[0] > 1 || | |
450 | rawData->fFormatVersion[1] > 0) { | |
451 | status = U_INVALID_FORMAT_ERROR; | |
452 | return FALSE; | |
453 | } | |
454 | return TRUE; | |
455 | } | |
456 | ||
457 | // | |
458 | // SpoofData::getDefault() - return a wrapper around the spoof data that is | |
459 | // baked into the default ICU data. | |
460 | // | |
461 | SpoofData *SpoofData::getDefault(UErrorCode &status) { | |
462 | // TODO: Cache it. Lazy create, keep until cleanup. | |
463 | ||
464 | UDataMemory *udm = udata_open(NULL, "cfu", "confusables", &status); | |
465 | if (U_FAILURE(status)) { | |
466 | return NULL; | |
467 | } | |
468 | SpoofData *This = new SpoofData(udm, status); | |
469 | if (U_FAILURE(status)) { | |
470 | delete This; | |
471 | return NULL; | |
472 | } | |
473 | if (This == NULL) { | |
474 | status = U_MEMORY_ALLOCATION_ERROR; | |
475 | } | |
476 | return This; | |
477 | } | |
478 | ||
479 | ||
480 | SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status) | |
481 | { | |
482 | reset(); | |
483 | if (U_FAILURE(status)) { | |
484 | return; | |
485 | } | |
486 | fRawData = reinterpret_cast<SpoofDataHeader *> | |
487 | ((char *)(udm->pHeader) + udm->pHeader->dataHeader.headerSize); | |
488 | fUDM = udm; | |
489 | validateDataVersion(fRawData, status); | |
490 | initPtrs(status); | |
491 | } | |
492 | ||
493 | ||
494 | SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status) | |
495 | { | |
496 | reset(); | |
497 | if (U_FAILURE(status)) { | |
498 | return; | |
499 | } | |
500 | if ((size_t)length < sizeof(SpoofDataHeader)) { | |
501 | status = U_INVALID_FORMAT_ERROR; | |
502 | return; | |
503 | } | |
504 | void *ncData = const_cast<void *>(data); | |
505 | fRawData = static_cast<SpoofDataHeader *>(ncData); | |
506 | if (length < fRawData->fLength) { | |
507 | status = U_INVALID_FORMAT_ERROR; | |
508 | return; | |
509 | } | |
510 | validateDataVersion(fRawData, status); | |
511 | initPtrs(status); | |
512 | } | |
513 | ||
514 | ||
515 | // Spoof Data constructor for use from data builder. | |
516 | // Initializes a new, empty data area that will be populated later. | |
517 | SpoofData::SpoofData(UErrorCode &status) { | |
518 | reset(); | |
519 | if (U_FAILURE(status)) { | |
520 | return; | |
521 | } | |
522 | fDataOwned = true; | |
523 | fRefCount = 1; | |
524 | ||
525 | // The spoof header should already be sized to be a multiple of 16 bytes. | |
526 | // Just in case it's not, round it up. | |
527 | uint32_t initialSize = (sizeof(SpoofDataHeader) + 15) & ~15; | |
528 | U_ASSERT(initialSize == sizeof(SpoofDataHeader)); | |
529 | ||
530 | fRawData = static_cast<SpoofDataHeader *>(uprv_malloc(initialSize)); | |
531 | fMemLimit = initialSize; | |
532 | if (fRawData == NULL) { | |
533 | status = U_MEMORY_ALLOCATION_ERROR; | |
534 | return; | |
535 | } | |
536 | uprv_memset(fRawData, 0, initialSize); | |
537 | ||
538 | fRawData->fMagic = USPOOF_MAGIC; | |
539 | fRawData->fFormatVersion[0] = 1; | |
540 | fRawData->fFormatVersion[1] = 0; | |
541 | fRawData->fFormatVersion[2] = 0; | |
542 | fRawData->fFormatVersion[3] = 0; | |
543 | initPtrs(status); | |
544 | } | |
545 | ||
546 | // reset() - initialize all fields. | |
547 | // Should be updated if any new fields are added. | |
548 | // Called by constructors to put things in a known initial state. | |
549 | void SpoofData::reset() { | |
550 | fRawData = NULL; | |
551 | fDataOwned = FALSE; | |
552 | fUDM = NULL; | |
553 | fMemLimit = 0; | |
554 | fRefCount = 1; | |
555 | fCFUKeys = NULL; | |
556 | fCFUValues = NULL; | |
557 | fCFUStringLengths = NULL; | |
558 | fCFUStrings = NULL; | |
559 | fAnyCaseTrie = NULL; | |
560 | fLowerCaseTrie = NULL; | |
561 | fScriptSets = NULL; | |
562 | } | |
563 | ||
564 | ||
565 | // SpoofData::initPtrs() | |
566 | // Initialize the pointers to the various sections of the raw data. | |
567 | // | |
568 | // This function is used both during the Trie building process (multiple | |
569 | // times, as the individual data sections are added), and | |
570 | // during the opening of a Spoof Checker from prebuilt data. | |
571 | // | |
572 | // The pointers for non-existent data sections (identified by an offset of 0) | |
573 | // are set to NULL. | |
574 | // | |
575 | // Note: During building the data, adding each new data section | |
576 | // reallocs the raw data area, which likely relocates it, which | |
577 | // in turn requires reinitializing all of the pointers into it, hence | |
578 | // multiple calls to this function during building. | |
579 | // | |
580 | void SpoofData::initPtrs(UErrorCode &status) { | |
581 | fCFUKeys = NULL; | |
582 | fCFUValues = NULL; | |
583 | fCFUStringLengths = NULL; | |
584 | fCFUStrings = NULL; | |
585 | if (U_FAILURE(status)) { | |
586 | return; | |
587 | } | |
588 | if (fRawData->fCFUKeys != 0) { | |
589 | fCFUKeys = (int32_t *)((char *)fRawData + fRawData->fCFUKeys); | |
590 | } | |
591 | if (fRawData->fCFUStringIndex != 0) { | |
592 | fCFUValues = (uint16_t *)((char *)fRawData + fRawData->fCFUStringIndex); | |
593 | } | |
594 | if (fRawData->fCFUStringLengths != 0) { | |
595 | fCFUStringLengths = (SpoofStringLengthsElement *)((char *)fRawData + fRawData->fCFUStringLengths); | |
596 | } | |
597 | if (fRawData->fCFUStringTable != 0) { | |
598 | fCFUStrings = (UChar *)((char *)fRawData + fRawData->fCFUStringTable); | |
599 | } | |
600 | ||
601 | if (fAnyCaseTrie == NULL && fRawData->fAnyCaseTrie != 0) { | |
602 | fAnyCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, | |
603 | (char *)fRawData + fRawData->fAnyCaseTrie, fRawData->fAnyCaseTrieLength, NULL, &status); | |
604 | } | |
605 | if (fLowerCaseTrie == NULL && fRawData->fLowerCaseTrie != 0) { | |
606 | fLowerCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, | |
607 | (char *)fRawData + fRawData->fLowerCaseTrie, fRawData->fLowerCaseTrieLength, NULL, &status); | |
608 | } | |
609 | ||
610 | if (fRawData->fScriptSets != 0) { | |
611 | fScriptSets = (ScriptSet *)((char *)fRawData + fRawData->fScriptSets); | |
612 | } | |
613 | } | |
614 | ||
615 | ||
616 | SpoofData::~SpoofData() { | |
617 | utrie2_close(fAnyCaseTrie); | |
618 | fAnyCaseTrie = NULL; | |
619 | utrie2_close(fLowerCaseTrie); | |
620 | fLowerCaseTrie = NULL; | |
621 | if (fDataOwned) { | |
622 | uprv_free(fRawData); | |
623 | } | |
624 | fRawData = NULL; | |
625 | if (fUDM != NULL) { | |
626 | udata_close(fUDM); | |
627 | } | |
628 | fUDM = NULL; | |
629 | } | |
630 | ||
631 | ||
632 | void SpoofData::removeReference() { | |
633 | if (umtx_atomic_dec(&fRefCount) == 0) { | |
634 | delete this; | |
635 | } | |
636 | } | |
637 | ||
638 | ||
639 | SpoofData *SpoofData::addReference() { | |
640 | umtx_atomic_inc(&fRefCount); | |
641 | return this; | |
642 | } | |
643 | ||
644 | ||
645 | void *SpoofData::reserveSpace(int32_t numBytes, UErrorCode &status) { | |
646 | if (U_FAILURE(status)) { | |
647 | return NULL; | |
648 | } | |
649 | if (!fDataOwned) { | |
650 | U_ASSERT(FALSE); | |
651 | status = U_INTERNAL_PROGRAM_ERROR; | |
652 | return NULL; | |
653 | } | |
654 | ||
655 | numBytes = (numBytes + 15) & ~15; // Round up to a multiple of 16 | |
656 | uint32_t returnOffset = fMemLimit; | |
657 | fMemLimit += numBytes; | |
658 | fRawData = static_cast<SpoofDataHeader *>(uprv_realloc(fRawData, fMemLimit)); | |
659 | fRawData->fLength = fMemLimit; | |
660 | uprv_memset((char *)fRawData + returnOffset, 0, numBytes); | |
661 | initPtrs(status); | |
662 | return (char *)fRawData + returnOffset; | |
663 | } | |
664 | ||
665 | ||
666 | //---------------------------------------------------------------------------- | |
667 | // | |
668 | // ScriptSet implementation | |
669 | // | |
670 | //---------------------------------------------------------------------------- | |
671 | ScriptSet::ScriptSet() { | |
672 | for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) { | |
673 | bits[i] = 0; | |
674 | } | |
675 | } | |
676 | ||
677 | ScriptSet::~ScriptSet() { | |
678 | } | |
679 | ||
680 | UBool ScriptSet::operator == (const ScriptSet &other) { | |
681 | for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) { | |
682 | if (bits[i] != other.bits[i]) { | |
683 | return FALSE; | |
684 | } | |
685 | } | |
686 | return TRUE; | |
687 | } | |
688 | ||
689 | void ScriptSet::Union(UScriptCode script) { | |
690 | uint32_t index = script / 32; | |
691 | uint32_t bit = 1 << (script & 31); | |
692 | U_ASSERT(index < sizeof(bits)*4); | |
693 | bits[index] |= bit; | |
694 | } | |
695 | ||
696 | ||
697 | void ScriptSet::Union(const ScriptSet &other) { | |
698 | for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) { | |
699 | bits[i] |= other.bits[i]; | |
700 | } | |
701 | } | |
702 | ||
703 | void ScriptSet::intersect(const ScriptSet &other) { | |
704 | for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) { | |
705 | bits[i] &= other.bits[i]; | |
706 | } | |
707 | } | |
708 | ||
709 | void ScriptSet::intersect(UScriptCode script) { | |
710 | uint32_t index = script / 32; | |
711 | uint32_t bit = 1 << (script & 31); | |
712 | U_ASSERT(index < sizeof(bits)*4); | |
713 | uint32_t i; | |
714 | for (i=0; i<index; i++) { | |
715 | bits[i] = 0; | |
716 | } | |
717 | bits[index] &= bit; | |
718 | for (i=index+1; i<sizeof(bits)/sizeof(uint32_t); i++) { | |
719 | bits[i] = 0; | |
720 | } | |
721 | } | |
722 | ||
723 | ||
724 | ScriptSet & ScriptSet::operator =(const ScriptSet &other) { | |
725 | for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) { | |
726 | bits[i] = other.bits[i]; | |
727 | } | |
728 | return *this; | |
729 | } | |
730 | ||
731 | ||
732 | void ScriptSet::setAll() { | |
733 | for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) { | |
734 | bits[i] = 0xffffffffu; | |
735 | } | |
736 | } | |
737 | ||
738 | ||
739 | void ScriptSet::resetAll() { | |
740 | for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) { | |
741 | bits[i] = 0; | |
742 | } | |
743 | } | |
744 | ||
745 | int32_t ScriptSet::countMembers() { | |
746 | // This bit counter is good for sparse numbers of '1's, which is | |
747 | // very much the case that we will usually have. | |
748 | int32_t count = 0; | |
749 | for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) { | |
750 | uint32_t x = bits[i]; | |
751 | while (x > 0) { | |
752 | count++; | |
753 | x &= (x - 1); // and off the least significant one bit. | |
754 | } | |
755 | } | |
756 | return count; | |
757 | } | |
758 | ||
759 | ||
760 | ||
761 | //----------------------------------------------------------------------------- | |
762 | // | |
763 | // NFDBuffer Implementation. | |
764 | // | |
765 | //----------------------------------------------------------------------------- | |
766 | ||
767 | NFDBuffer::NFDBuffer(const UChar *text, int32_t length, UErrorCode &status) { | |
768 | fNormalizedText = NULL; | |
769 | fNormalizedTextLength = 0; | |
770 | fOriginalText = text; | |
771 | if (U_FAILURE(status)) { | |
772 | return; | |
773 | } | |
774 | fNormalizedText = fSmallBuf; | |
775 | fNormalizedTextLength = unorm_normalize( | |
776 | text, length, UNORM_NFD, 0, fNormalizedText, USPOOF_STACK_BUFFER_SIZE, &status); | |
777 | if (status == U_BUFFER_OVERFLOW_ERROR) { | |
778 | status = U_ZERO_ERROR; | |
779 | fNormalizedText = (UChar *)uprv_malloc((fNormalizedTextLength+1)*sizeof(UChar)); | |
780 | if (fNormalizedText == NULL) { | |
781 | status = U_MEMORY_ALLOCATION_ERROR; | |
782 | } else { | |
783 | fNormalizedTextLength = unorm_normalize(text, length, UNORM_NFD, 0, | |
784 | fNormalizedText, fNormalizedTextLength+1, &status); | |
785 | } | |
786 | } | |
787 | } | |
788 | ||
789 | ||
790 | NFDBuffer::~NFDBuffer() { | |
791 | if (fNormalizedText != fSmallBuf) { | |
792 | uprv_free(fNormalizedText); | |
793 | } | |
794 | fNormalizedText = 0; | |
795 | } | |
796 | ||
797 | const UChar *NFDBuffer::getBuffer() { | |
798 | return fNormalizedText; | |
799 | } | |
800 | ||
801 | int32_t NFDBuffer::getLength() { | |
802 | return fNormalizedTextLength; | |
803 | } | |
804 | ||
805 | ||
806 | ||
807 | ||
808 | ||
809 | U_NAMESPACE_END | |
810 | ||
811 | U_NAMESPACE_USE | |
812 | ||
813 | //----------------------------------------------------------------------------- | |
814 | // | |
815 | // uspoof_swap - byte swap and char encoding swap of spoof data | |
816 | // | |
817 | //----------------------------------------------------------------------------- | |
818 | U_CAPI int32_t U_EXPORT2 | |
819 | uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, | |
820 | UErrorCode *status) { | |
821 | ||
822 | if (status == NULL || U_FAILURE(*status)) { | |
823 | return 0; | |
824 | } | |
825 | if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) { | |
826 | *status=U_ILLEGAL_ARGUMENT_ERROR; | |
827 | return 0; | |
828 | } | |
829 | ||
830 | // | |
831 | // Check that the data header is for spoof data. | |
832 | // (Header contents are defined in gencfu.cpp) | |
833 | // | |
834 | const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4); | |
835 | if(!( pInfo->dataFormat[0]==0x43 && /* dataFormat="Cfu " */ | |
836 | pInfo->dataFormat[1]==0x66 && | |
837 | pInfo->dataFormat[2]==0x75 && | |
838 | pInfo->dataFormat[3]==0x20 && | |
839 | pInfo->formatVersion[0]==1 )) { | |
840 | udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x " | |
841 | "(format version %02x %02x %02x %02x) is not recognized\n", | |
842 | pInfo->dataFormat[0], pInfo->dataFormat[1], | |
843 | pInfo->dataFormat[2], pInfo->dataFormat[3], | |
844 | pInfo->formatVersion[0], pInfo->formatVersion[1], | |
845 | pInfo->formatVersion[2], pInfo->formatVersion[3]); | |
846 | *status=U_UNSUPPORTED_ERROR; | |
847 | return 0; | |
848 | } | |
849 | ||
850 | // | |
851 | // Swap the data header. (This is the generic ICU Data Header, not the uspoof Specific | |
852 | // header). This swap also conveniently gets us | |
853 | // the size of the ICU d.h., which lets us locate the start | |
854 | // of the uspoof specific data. | |
855 | // | |
856 | int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status); | |
857 | ||
858 | ||
859 | // | |
860 | // Get the Spoof Data Header, and check that it appears to be OK. | |
861 | // | |
862 | // | |
863 | const uint8_t *inBytes =(const uint8_t *)inData+headerSize; | |
864 | SpoofDataHeader *spoofDH = (SpoofDataHeader *)inBytes; | |
865 | if (ds->readUInt32(spoofDH->fMagic) != USPOOF_MAGIC || | |
866 | ds->readUInt32(spoofDH->fLength) < sizeof(SpoofDataHeader)) | |
867 | { | |
868 | udata_printError(ds, "uspoof_swap(): Spoof Data header is invalid.\n"); | |
869 | *status=U_UNSUPPORTED_ERROR; | |
870 | return 0; | |
871 | } | |
872 | ||
873 | // | |
874 | // Prefight operation? Just return the size | |
875 | // | |
876 | int32_t spoofDataLength = ds->readUInt32(spoofDH->fLength); | |
877 | int32_t totalSize = headerSize + spoofDataLength; | |
878 | if (length < 0) { | |
879 | return totalSize; | |
880 | } | |
881 | ||
882 | // | |
883 | // Check that length passed in is consistent with length from Spoof data header. | |
884 | // | |
885 | if (length < totalSize) { | |
886 | udata_printError(ds, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n", | |
887 | spoofDataLength); | |
888 | *status=U_INDEX_OUTOFBOUNDS_ERROR; | |
889 | return 0; | |
890 | } | |
891 | ||
892 | ||
893 | // | |
894 | // Swap the Data. Do the data itself first, then the Spoof Data Header, because | |
895 | // we need to reference the header to locate the data, and an | |
896 | // inplace swap of the header leaves it unusable. | |
897 | // | |
898 | uint8_t *outBytes = (uint8_t *)outData + headerSize; | |
899 | SpoofDataHeader *outputDH = (SpoofDataHeader *)outBytes; | |
900 | ||
901 | int32_t sectionStart; | |
902 | int32_t sectionLength; | |
903 | ||
904 | // | |
905 | // If not swapping in place, zero out the output buffer before starting. | |
906 | // Gaps may exist between the individual sections, and these must be zeroed in | |
907 | // the output buffer. The simplest way to do that is to just zero the whole thing. | |
908 | // | |
909 | if (inBytes != outBytes) { | |
910 | uprv_memset(outBytes, 0, spoofDataLength); | |
911 | } | |
912 | ||
913 | // Confusables Keys Section (fCFUKeys) | |
914 | sectionStart = ds->readUInt32(spoofDH->fCFUKeys); | |
915 | sectionLength = ds->readUInt32(spoofDH->fCFUKeysSize) * 4; | |
916 | ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); | |
917 | ||
918 | // String Index Section | |
919 | sectionStart = ds->readUInt32(spoofDH->fCFUStringIndex); | |
920 | sectionLength = ds->readUInt32(spoofDH->fCFUStringIndexSize) * 2; | |
921 | ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); | |
922 | ||
923 | // String Table Section | |
924 | sectionStart = ds->readUInt32(spoofDH->fCFUStringTable); | |
925 | sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2; | |
926 | ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); | |
927 | ||
928 | // String Lengths Section | |
929 | sectionStart = ds->readUInt32(spoofDH->fCFUStringLengths); | |
930 | sectionLength = ds->readUInt32(spoofDH->fCFUStringLengthsSize) * 4; | |
931 | ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); | |
932 | ||
933 | // Any Case Trie | |
934 | sectionStart = ds->readUInt32(spoofDH->fAnyCaseTrie); | |
935 | sectionLength = ds->readUInt32(spoofDH->fAnyCaseTrieLength); | |
936 | utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); | |
937 | ||
938 | // Lower Case Trie | |
939 | sectionStart = ds->readUInt32(spoofDH->fLowerCaseTrie); | |
940 | sectionLength = ds->readUInt32(spoofDH->fLowerCaseTrieLength); | |
941 | utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); | |
942 | ||
943 | // Script Sets. The data is an array of int32_t | |
944 | sectionStart = ds->readUInt32(spoofDH->fScriptSets); | |
945 | sectionLength = ds->readUInt32(spoofDH->fScriptSetsLength) * sizeof(ScriptSet); | |
946 | ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); | |
947 | ||
948 | // And, last, swap the header itself. | |
949 | // int32_t fMagic // swap this | |
950 | // uint8_t fFormatVersion[4] // Do not swap this, just copy | |
951 | // int32_t fLength and all the rest // Swap the rest, all is 32 bit stuff. | |
952 | // | |
953 | uint32_t magic = ds->readUInt32(spoofDH->fMagic); | |
954 | ds->writeUInt32((uint32_t *)&outputDH->fMagic, magic); | |
955 | uprv_memcpy(outputDH->fFormatVersion, spoofDH->fFormatVersion, sizeof(spoofDH->fFormatVersion)); | |
956 | // swap starting at fLength | |
957 | ds->swapArray32(ds, &spoofDH->fLength, sizeof(SpoofDataHeader)-8 /* minus magic and fFormatVersion[4] */, &outputDH->fLength, status); | |
958 | ||
959 | return totalSize; | |
960 | } | |
961 | ||
962 | #endif | |
963 | ||
964 |