]>
Commit | Line | Data |
---|---|---|
1 | /* | |
2 | ****************************************************************************** | |
3 | * | |
4 | * Copyright (C) 2008-2013, International Business Machines | |
5 | * Corporation and others. All Rights Reserved. | |
6 | * | |
7 | ****************************************************************************** | |
8 | * file name: uspoof_wsconf.cpp | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 2009Jan05 (refactoring earlier files) | |
14 | * created by: Andy Heninger | |
15 | * | |
16 | * Internal functions for compiling Whole Script confusable source data | |
17 | * into its binary (runtime) form. The binary data format is described | |
18 | * in uspoof_impl.h | |
19 | */ | |
20 | ||
21 | #include "unicode/utypes.h" | |
22 | #include "unicode/uspoof.h" | |
23 | ||
24 | #if !UCONFIG_NO_NORMALIZATION | |
25 | ||
26 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS | |
27 | ||
28 | #include "unicode/unorm.h" | |
29 | #include "unicode/uregex.h" | |
30 | #include "unicode/ustring.h" | |
31 | #include "cmemory.h" | |
32 | #include "scriptset.h" | |
33 | #include "uspoof_impl.h" | |
34 | #include "uhash.h" | |
35 | #include "uvector.h" | |
36 | #include "uassert.h" | |
37 | #include "uspoof_wsconf.h" | |
38 | ||
39 | U_NAMESPACE_USE | |
40 | ||
41 | ||
42 | // Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt | |
43 | // Example Lines: | |
44 | // 006F ; Latn; Deva; A # (o) LATIN SMALL LETTER O | |
45 | // 0048..0049 ; Latn; Grek; A # [2] (H..I) LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I | |
46 | // | | | | | |
47 | // | | | |---- Which table, Any Case or Lower Case (A or L) | |
48 | // | | |----------Target script. We need this. | |
49 | // | |----------------Src script. Should match the script of the source | |
50 | // | code points. Beyond checking that, we don't keep it. | |
51 | // |--------------------------------Source code points or range. | |
52 | // | |
53 | // The expression will match _all_ lines, including erroneous lines. | |
54 | // The result of the parse is returned via the contents of the (match) groups. | |
// Pattern applied to the whole input in multi-line mode; each match consumes one line.
// Exactly one of the alternatives matches any given line:
//   Group 1       - blank or comment line
//   Groups 2..7   - a well-formed data line
//   Group 8       - anything else (reported as a syntax error by the caller)
//
// The ".." range separator is escaped so that only two literal dots are accepted
// between the start and end code points; a malformed separator falls through to
// the error alternative (Group 8).
static const char *parseExp =
        "(?m)"                                         // Multi-line mode
        "^([ \\t]*(?:#.*?)?)$"                         // A blank or comment line.  Matches Group 1.
        "|^(?:"                                        //   OR
        "\\s*([0-9A-F]{4,})(?:\\.\\.([0-9A-F]{4,}))?\\s*;"  // Code point range.  Groups 2 and 3.
        "\\s*([A-Za-z]+)\\s*;"                         // The source script.  Group 4.
        "\\s*([A-Za-z]+)\\s*;"                         // The target script.  Group 5.
        "\\s*(?:(A)|(L))"                              // The table A or L.   Group 6 or 7
        "[ \\t]*(?:#.*?)?"                             // Trailing comment
        ")$|"                                          //   OR
        "^(.*?)$";                                     // An error line.      Group 8.
                                                       //    Any line not matching the preceding
                                                       //    parts of the expression will match
                                                       //    this, and thus be flagged as an error
69 | ||
70 | ||
71 | // Extract a regular expression match group into a char * string. | |
72 | // The group must contain only invariant characters. | |
73 | // Used for script names | |
74 | // | |
75 | static void extractGroup( | |
76 | URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) { | |
77 | ||
78 | UChar ubuf[50]; | |
79 | ubuf[0] = 0; | |
80 | destBuf[0] = 0; | |
81 | int32_t len = uregex_group(e, group, ubuf, 50, &status); | |
82 | if (U_FAILURE(status) || len == -1 || len >= destCapacity) { | |
83 | return; | |
84 | } | |
85 | UnicodeString s(FALSE, ubuf, len); // Aliasing constructor | |
86 | s.extract(0, len, destBuf, destCapacity, US_INV); | |
87 | } | |
88 | ||
89 | ||
90 | ||
91 | U_NAMESPACE_BEGIN | |
92 | ||
93 | // Build the Whole Script Confusable data | |
94 | // | |
95 | // TODO: Reorganize. Either get rid of the WSConfusableDataBuilder class, | |
96 | // because everything is local to this one build function anyhow, | |
97 | // OR | |
98 | // break this function into more reasonably sized pieces, with | |
99 | // state in WSConfusableDataBuilder. | |
100 | // | |
101 | void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS, | |
102 | int32_t confusablesWSLen, UParseError *pe, UErrorCode &status) | |
103 | { | |
104 | if (U_FAILURE(status)) { | |
105 | return; | |
106 | } | |
107 | URegularExpression *parseRegexp = NULL; | |
108 | int32_t inputLen = 0; | |
109 | UChar *input = NULL; | |
110 | int32_t lineNum = 0; | |
111 | ||
112 | UVector *scriptSets = NULL; | |
113 | uint32_t rtScriptSetsCount = 2; | |
114 | ||
115 | UTrie2 *anyCaseTrie = NULL; | |
116 | UTrie2 *lowerCaseTrie = NULL; | |
117 | ||
118 | anyCaseTrie = utrie2_open(0, 0, &status); | |
119 | lowerCaseTrie = utrie2_open(0, 0, &status); | |
120 | ||
121 | UnicodeString pattern(parseExp, -1, US_INV); | |
122 | ||
123 | // The scriptSets vector provides a mapping from TRIE values to the set of scripts. | |
124 | // | |
125 | // Reserved TRIE values: | |
126 | // 0: Code point has no whole script confusables. | |
127 | // 1: Code point is of script Common or Inherited. | |
128 | // These code points do not participate in whole script confusable detection. | |
129 | // (This is logically equivalent to saying that they contain confusables in | |
130 | // all scripts) | |
131 | // | |
132 | // Because Trie values are indexes into the ScriptSets vector, pre-fill | |
133 | // vector positions 0 and 1 to avoid conflicts with the reserved values. | |
134 | ||
135 | scriptSets = new UVector(status); | |
136 | if (scriptSets == NULL) { | |
137 | status = U_MEMORY_ALLOCATION_ERROR; | |
138 | goto cleanup; | |
139 | } | |
140 | scriptSets->addElement((void *)NULL, status); | |
141 | scriptSets->addElement((void *)NULL, status); | |
142 | ||
143 | // Convert the user input data from UTF-8 to UChar (UTF-16) | |
144 | u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status); | |
145 | if (status != U_BUFFER_OVERFLOW_ERROR) { | |
146 | goto cleanup; | |
147 | } | |
148 | status = U_ZERO_ERROR; | |
149 | input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar))); | |
150 | if (input == NULL) { | |
151 | status = U_MEMORY_ALLOCATION_ERROR; | |
152 | goto cleanup; | |
153 | } | |
154 | u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status); | |
155 | ||
156 | parseRegexp = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status); | |
157 | ||
158 | // Zap any Byte Order Mark at the start of input. Changing it to a space is benign | |
159 | // given the syntax of the input. | |
160 | if (*input == 0xfeff) { | |
161 | *input = 0x20; | |
162 | } | |
163 | ||
164 | // Parse the input, one line per iteration of this loop. | |
165 | uregex_setText(parseRegexp, input, inputLen, &status); | |
166 | while (uregex_findNext(parseRegexp, &status)) { | |
167 | lineNum++; | |
168 | if (uregex_start(parseRegexp, 1, &status) >= 0) { | |
169 | // this was a blank or comment line. | |
170 | continue; | |
171 | } | |
172 | if (uregex_start(parseRegexp, 8, &status) >= 0) { | |
173 | // input file syntax error. | |
174 | status = U_PARSE_ERROR; | |
175 | goto cleanup; | |
176 | } | |
177 | if (U_FAILURE(status)) { | |
178 | goto cleanup; | |
179 | } | |
180 | ||
181 | // Pick up the start and optional range end code points from the parsed line. | |
182 | UChar32 startCodePoint = SpoofImpl::ScanHex( | |
183 | input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status); | |
184 | UChar32 endCodePoint = startCodePoint; | |
185 | if (uregex_start(parseRegexp, 3, &status) >=0) { | |
186 | endCodePoint = SpoofImpl::ScanHex( | |
187 | input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status); | |
188 | } | |
189 | ||
190 | // Extract the two script names from the source line. We need these in an 8 bit | |
191 | // default encoding (will be EBCDIC on IBM mainframes) in order to pass them on | |
192 | // to the ICU u_getPropertyValueEnum() function. Ugh. | |
193 | char srcScriptName[20]; | |
194 | char targScriptName[20]; | |
195 | extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status); | |
196 | extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status); | |
197 | UScriptCode srcScript = | |
198 | static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName)); | |
199 | UScriptCode targScript = | |
200 | static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName)); | |
201 | if (U_FAILURE(status)) { | |
202 | goto cleanup; | |
203 | } | |
204 | if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) { | |
205 | status = U_INVALID_FORMAT_ERROR; | |
206 | goto cleanup; | |
207 | } | |
208 | ||
209 | // select the table - (A) any case or (L) lower case only | |
210 | UTrie2 *table = anyCaseTrie; | |
211 | if (uregex_start(parseRegexp, 7, &status) >= 0) { | |
212 | table = lowerCaseTrie; | |
213 | } | |
214 | ||
215 | // Build the set of scripts containing confusable characters for | |
216 | // the code point(s) specified in this input line. | |
217 | // Sanity check that the script of the source code point is the same | |
218 | // as the source script indicated in the input file. Failure of this check is | |
219 | // an error in the input file. | |
220 | // Include the source script in the set (needed for Mixed Script Confusable detection). | |
221 | // | |
222 | UChar32 cp; | |
223 | for (cp=startCodePoint; cp<=endCodePoint; cp++) { | |
224 | int32_t setIndex = utrie2_get32(table, cp); | |
225 | BuilderScriptSet *bsset = NULL; | |
226 | if (setIndex > 0) { | |
227 | U_ASSERT(setIndex < scriptSets->size()); | |
228 | bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex)); | |
229 | } else { | |
230 | bsset = new BuilderScriptSet(); | |
231 | if (bsset == NULL) { | |
232 | status = U_MEMORY_ALLOCATION_ERROR; | |
233 | goto cleanup; | |
234 | } | |
235 | bsset->codePoint = cp; | |
236 | bsset->trie = table; | |
237 | bsset->sset = new ScriptSet(); | |
238 | setIndex = scriptSets->size(); | |
239 | bsset->index = setIndex; | |
240 | bsset->rindex = 0; | |
241 | if (bsset->sset == NULL) { | |
242 | status = U_MEMORY_ALLOCATION_ERROR; | |
243 | goto cleanup; | |
244 | } | |
245 | scriptSets->addElement(bsset, status); | |
246 | utrie2_set32(table, cp, setIndex, &status); | |
247 | } | |
248 | bsset->sset->set(targScript, status); | |
249 | bsset->sset->set(srcScript, status); | |
250 | ||
251 | if (U_FAILURE(status)) { | |
252 | goto cleanup; | |
253 | } | |
254 | UScriptCode cpScript = uscript_getScript(cp, &status); | |
255 | if (cpScript != srcScript) { | |
256 | status = U_INVALID_FORMAT_ERROR; | |
257 | goto cleanup; | |
258 | } | |
259 | } | |
260 | } | |
261 | ||
262 | // Eliminate duplicate script sets. At this point we have a separate | |
263 | // script set for every code point that had data in the input file. | |
264 | // | |
265 | // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them | |
266 | // | |
267 | // printf("Number of scriptSets: %d\n", scriptSets->size()); | |
268 | { | |
269 | int32_t duplicateCount = 0; | |
270 | rtScriptSetsCount = 2; | |
271 | for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) { | |
272 | BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri)); | |
273 | if (outerSet->index != static_cast<uint32_t>(outeri)) { | |
274 | // This set was already identified as a duplicate. | |
275 | // It will not be allocated a position in the runtime array of ScriptSets. | |
276 | continue; | |
277 | } | |
278 | outerSet->rindex = rtScriptSetsCount++; | |
279 | for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) { | |
280 | BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri)); | |
281 | if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) { | |
282 | delete innerSet->sset; | |
283 | innerSet->scriptSetOwned = FALSE; | |
284 | innerSet->sset = outerSet->sset; | |
285 | innerSet->index = outeri; | |
286 | innerSet->rindex = outerSet->rindex; | |
287 | duplicateCount++; | |
288 | } | |
289 | // But this doesn't get all. We need to fix the TRIE. | |
290 | } | |
291 | } | |
292 | // printf("Number of distinct script sets: %d\n", rtScriptSetsCount); | |
293 | } | |
294 | ||
295 | ||
296 | ||
297 | // Update the Trie values to be reflect the run time script indexes (after duplicate merging). | |
298 | // (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets | |
299 | // are unused, which is why the loop index starts at 2.) | |
300 | { | |
301 | for (int32_t i=2; i<scriptSets->size(); i++) { | |
302 | BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); | |
303 | if (bSet->rindex != (uint32_t)i) { | |
304 | utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status); | |
305 | } | |
306 | } | |
307 | } | |
308 | ||
309 | // For code points with script==Common or script==Inherited, | |
310 | // Set the reserved value of 1 into both Tries. These characters do not participate | |
311 | // in Whole Script Confusable detection; this reserved value is the means | |
312 | // by which they are detected. | |
313 | { | |
314 | UnicodeSet ignoreSet; | |
315 | ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status); | |
316 | UnicodeSet inheritedSet; | |
317 | inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status); | |
318 | ignoreSet.addAll(inheritedSet); | |
319 | for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) { | |
320 | UChar32 rangeStart = ignoreSet.getRangeStart(rn); | |
321 | UChar32 rangeEnd = ignoreSet.getRangeEnd(rn); | |
322 | utrie2_setRange32(anyCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status); | |
323 | utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status); | |
324 | } | |
325 | } | |
326 | ||
327 | // Serialize the data to the Spoof Detector | |
328 | { | |
329 | utrie2_freeze(anyCaseTrie, UTRIE2_16_VALUE_BITS, &status); | |
330 | int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status); | |
331 | // printf("Any case Trie size: %d\n", size); | |
332 | if (status != U_BUFFER_OVERFLOW_ERROR) { | |
333 | goto cleanup; | |
334 | } | |
335 | status = U_ZERO_ERROR; | |
336 | spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit; | |
337 | spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size; | |
338 | spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie; | |
339 | void *where = spImpl->fSpoofData->reserveSpace(size, status); | |
340 | utrie2_serialize(anyCaseTrie, where, size, &status); | |
341 | ||
342 | utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status); | |
343 | size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status); | |
344 | // printf("Lower case Trie size: %d\n", size); | |
345 | if (status != U_BUFFER_OVERFLOW_ERROR) { | |
346 | goto cleanup; | |
347 | } | |
348 | status = U_ZERO_ERROR; | |
349 | spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit; | |
350 | spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size; | |
351 | spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie; | |
352 | where = spImpl->fSpoofData->reserveSpace(size, status); | |
353 | utrie2_serialize(lowerCaseTrie, where, size, &status); | |
354 | ||
355 | spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit; | |
356 | spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount; | |
357 | ScriptSet *rtScriptSets = static_cast<ScriptSet *> | |
358 | (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status)); | |
359 | uint32_t rindex = 2; | |
360 | for (int32_t i=2; i<scriptSets->size(); i++) { | |
361 | BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); | |
362 | if (bSet->rindex < rindex) { | |
363 | // We have already copied this script set to the serialized data. | |
364 | continue; | |
365 | } | |
366 | U_ASSERT(rindex == bSet->rindex); | |
367 | rtScriptSets[rindex] = *bSet->sset; // Assignment of a ScriptSet just copies the bits. | |
368 | rindex++; | |
369 | } | |
370 | } | |
371 | ||
372 | // Open new utrie2s from the serialized data. We don't want to keep the ones | |
373 | // we just built because we would then have two copies of the data, one internal to | |
374 | // the utries that we have already constructed, and one in the serialized data area. | |
375 | // An alternative would be to not pre-serialize the Trie data, but that makes the | |
376 | // spoof detector data different, depending on how the detector was constructed. | |
377 | // It's simpler to keep the data always the same. | |
378 | ||
379 | spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized( | |
380 | UTRIE2_16_VALUE_BITS, | |
381 | (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie, | |
382 | spImpl->fSpoofData->fRawData->fAnyCaseTrieLength, | |
383 | NULL, | |
384 | &status); | |
385 | ||
386 | spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized( | |
387 | UTRIE2_16_VALUE_BITS, | |
388 | (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie, | |
389 | spImpl->fSpoofData->fRawData->fAnyCaseTrieLength, | |
390 | NULL, | |
391 | &status); | |
392 | ||
393 | ||
394 | ||
395 | cleanup: | |
396 | if (U_FAILURE(status)) { | |
397 | pe->line = lineNum; | |
398 | } | |
399 | uregex_close(parseRegexp); | |
400 | uprv_free(input); | |
401 | ||
402 | int32_t i; | |
403 | if (scriptSets != NULL) { | |
404 | for (i=0; i<scriptSets->size(); i++) { | |
405 | BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); | |
406 | delete bsset; | |
407 | } | |
408 | delete scriptSets; | |
409 | } | |
410 | utrie2_close(anyCaseTrie); | |
411 | utrie2_close(lowerCaseTrie); | |
412 | return; | |
413 | } | |
414 | ||
415 | U_NAMESPACE_END | |
416 | ||
417 | ||
418 | ||
419 | BuilderScriptSet::BuilderScriptSet() { | |
420 | codePoint = -1; | |
421 | trie = NULL; | |
422 | sset = NULL; | |
423 | index = 0; | |
424 | rindex = 0; | |
425 | scriptSetOwned = TRUE; | |
426 | } | |
427 | ||
428 | BuilderScriptSet::~BuilderScriptSet() { | |
429 | if (scriptSetOwned) { | |
430 | delete sset; | |
431 | } | |
432 | } | |
433 | ||
434 | #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS | |
435 | #endif // !UCONFIG_NO_NORMALIZATION | |
436 |