]>
Commit | Line | Data |
---|---|---|
729e4ab9 A |
1 | /* |
2 | ****************************************************************************** | |
3 | * | |
4388f060 | 4 | * Copyright (C) 2008-2012, International Business Machines |
729e4ab9 A |
5 | * Corporation and others. All Rights Reserved. |
6 | * | |
7 | ****************************************************************************** | |
8 | * file name: uspoof_wsconf.cpp | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 2009Jan05 (refactoring earlier files) | |
14 | * created by: Andy Heninger | |
15 | * | |
16 | * Internal functions for compiling Whole Script confusable source data | |
17 | * into its binary (runtime) form. The binary data format is described | |
18 | * in uspoof_impl.h | |
19 | */ | |
20 | ||
21 | #include "unicode/utypes.h" | |
22 | #include "unicode/uspoof.h" | |
23 | ||
24 | #if !UCONFIG_NO_NORMALIZATION | |
25 | ||
26 | #if !UCONFIG_NO_REGULAR_EXPRESSIONS | |
27 | ||
28 | #include "unicode/unorm.h" | |
29 | #include "unicode/uregex.h" | |
30 | #include "unicode/ustring.h" | |
31 | #include "cmemory.h" | |
32 | #include "uspoof_impl.h" | |
33 | #include "uhash.h" | |
34 | #include "uvector.h" | |
35 | #include "uassert.h" | |
36 | #include "uspoof_wsconf.h" | |
37 | ||
38 | U_NAMESPACE_USE | |
39 | ||
40 | ||
41 | // Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt | |
42 | // Example Lines: | |
43 | // 006F ; Latn; Deva; A # (o) LATIN SMALL LETTER O | |
44 | // 0048..0049 ; Latn; Grek; A # [2] (H..I) LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I | |
45 | // | | | | | |
46 | // | | | |---- Which table, Any Case or Lower Case (A or L) | |
47 | // | | |----------Target script. We need this. | |
48 | // | |----------------Src script. Should match the script of the source | |
49 | // | code points. Beyond checking that, we don't keep it. | |
50 | // |--------------------------------Source code points or range. | |
51 | // | |
52 | // The expression will match _all_ lines, including erroneous lines. | |
53 | // The result of the parse is returned via the contents of the (match) groups. | |
static const char *parseExp =
        "(?m)"                                              // Multi-line mode
        "^([ \\t]*(?:#.*?)?)$"                              // A blank or comment line.  Matches Group 1.
        "|^(?:"                                             //   OR
        "\\s*([0-9A-F]{4,})(?:\\.\\.([0-9A-F]{4,}))?\\s*;"  // Code point range.  Groups 2 and 3.
                                                            //   The ".." separator is escaped so that only
                                                            //   two literal dots are accepted between the
                                                            //   start and end of a range.
        "\\s*([A-Za-z]+)\\s*;"                              // The source script.  Group 4.
        "\\s*([A-Za-z]+)\\s*;"                              // The target script.  Group 5.
        "\\s*(?:(A)|(L))"                                   // The table A or L.   Group 6 or 7
        "[ \\t]*(?:#.*?)?"                                  // Trailing comment
        ")$|"                                               //   OR
        "^(.*?)$";                                          // An error line.      Group 8.
                                                            //    Any line not matching the preceding
                                                            //    parts of the expression will match
                                                            //    this, and thus be flagged as an error
68 | ||
69 | ||
70 | // Extract a regular expression match group into a char * string. | |
71 | // The group must contain only invariant characters. | |
72 | // Used for script names | |
73 | // | |
74 | static void extractGroup( | |
75 | URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) { | |
76 | ||
77 | UChar ubuf[50]; | |
78 | ubuf[0] = 0; | |
79 | destBuf[0] = 0; | |
80 | int32_t len = uregex_group(e, group, ubuf, 50, &status); | |
81 | if (U_FAILURE(status) || len == -1 || len >= destCapacity) { | |
82 | return; | |
83 | } | |
84 | UnicodeString s(FALSE, ubuf, len); // Aliasing constructor | |
85 | s.extract(0, len, destBuf, destCapacity, US_INV); | |
86 | } | |
87 | ||
88 | ||
89 | ||
4388f060 A |
90 | U_NAMESPACE_BEGIN |
91 | ||
729e4ab9 A |
92 | // Build the Whole Script Confusable data |
93 | // | |
94 | // TODO: Reorganize. Either get rid of the WSConfusableDataBuilder class, | |
95 | // because everything is local to this one build function anyhow, | |
96 | // OR | |
97 | // break this function into more reasonably sized pieces, with | |
98 | // state in WSConfusableDataBuilder. | |
99 | // | |
100 | void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS, | |
101 | int32_t confusablesWSLen, UParseError *pe, UErrorCode &status) | |
102 | { | |
103 | if (U_FAILURE(status)) { | |
104 | return; | |
105 | } | |
106 | URegularExpression *parseRegexp = NULL; | |
107 | int32_t inputLen = 0; | |
108 | UChar *input = NULL; | |
109 | int32_t lineNum = 0; | |
110 | ||
111 | UVector *scriptSets = NULL; | |
112 | uint32_t rtScriptSetsCount = 2; | |
113 | ||
114 | UTrie2 *anyCaseTrie = NULL; | |
115 | UTrie2 *lowerCaseTrie = NULL; | |
116 | ||
117 | anyCaseTrie = utrie2_open(0, 0, &status); | |
118 | lowerCaseTrie = utrie2_open(0, 0, &status); | |
4388f060 A |
119 | |
120 | UnicodeString pattern(parseExp, -1, US_INV); | |
729e4ab9 A |
121 | |
122 | // The scriptSets vector provides a mapping from TRIE values to the set of scripts. | |
123 | // | |
124 | // Reserved TRIE values: | |
125 | // 0: Code point has no whole script confusables. | |
126 | // 1: Code point is of script Common or Inherited. | |
127 | // These code points do not participate in whole script confusable detection. | |
128 | // (This is logically equivalent to saying that they contain confusables in | |
129 | // all scripts) | |
130 | // | |
131 | // Because Trie values are indexes into the ScriptSets vector, pre-fill | |
132 | // vector positions 0 and 1 to avoid conflicts with the reserved values. | |
133 | ||
134 | scriptSets = new UVector(status); | |
135 | if (scriptSets == NULL) { | |
136 | status = U_MEMORY_ALLOCATION_ERROR; | |
137 | goto cleanup; | |
138 | } | |
139 | scriptSets->addElement((void *)NULL, status); | |
140 | scriptSets->addElement((void *)NULL, status); | |
141 | ||
142 | // Convert the user input data from UTF-8 to UChar (UTF-16) | |
143 | u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status); | |
144 | if (status != U_BUFFER_OVERFLOW_ERROR) { | |
145 | goto cleanup; | |
146 | } | |
147 | status = U_ZERO_ERROR; | |
148 | input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar))); | |
149 | if (input == NULL) { | |
150 | status = U_MEMORY_ALLOCATION_ERROR; | |
151 | goto cleanup; | |
152 | } | |
153 | u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status); | |
154 | ||
4388f060 | 155 | parseRegexp = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status); |
729e4ab9 | 156 | |
729e4ab9 A |
157 | // Zap any Byte Order Mark at the start of input. Changing it to a space is benign |
158 | // given the syntax of the input. | |
159 | if (*input == 0xfeff) { | |
160 | *input = 0x20; | |
161 | } | |
162 | ||
163 | // Parse the input, one line per iteration of this loop. | |
164 | uregex_setText(parseRegexp, input, inputLen, &status); | |
165 | while (uregex_findNext(parseRegexp, &status)) { | |
166 | lineNum++; | |
729e4ab9 A |
167 | if (uregex_start(parseRegexp, 1, &status) >= 0) { |
168 | // this was a blank or comment line. | |
169 | continue; | |
170 | } | |
171 | if (uregex_start(parseRegexp, 8, &status) >= 0) { | |
172 | // input file syntax error. | |
173 | status = U_PARSE_ERROR; | |
174 | goto cleanup; | |
175 | } | |
176 | if (U_FAILURE(status)) { | |
177 | goto cleanup; | |
178 | } | |
179 | ||
180 | // Pick up the start and optional range end code points from the parsed line. | |
181 | UChar32 startCodePoint = SpoofImpl::ScanHex( | |
182 | input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status); | |
183 | UChar32 endCodePoint = startCodePoint; | |
184 | if (uregex_start(parseRegexp, 3, &status) >=0) { | |
185 | endCodePoint = SpoofImpl::ScanHex( | |
186 | input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status); | |
187 | } | |
188 | ||
189 | // Extract the two script names from the source line. We need these in an 8 bit | |
190 | // default encoding (will be EBCDIC on IBM mainframes) in order to pass them on | |
191 | // to the ICU u_getPropertyValueEnum() function. Ugh. | |
192 | char srcScriptName[20]; | |
193 | char targScriptName[20]; | |
194 | extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status); | |
195 | extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status); | |
196 | UScriptCode srcScript = | |
197 | static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName)); | |
198 | UScriptCode targScript = | |
199 | static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName)); | |
200 | if (U_FAILURE(status)) { | |
201 | goto cleanup; | |
202 | } | |
203 | if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) { | |
204 | status = U_INVALID_FORMAT_ERROR; | |
205 | goto cleanup; | |
206 | } | |
207 | ||
208 | // select the table - (A) any case or (L) lower case only | |
209 | UTrie2 *table = anyCaseTrie; | |
210 | if (uregex_start(parseRegexp, 7, &status) >= 0) { | |
211 | table = lowerCaseTrie; | |
212 | } | |
213 | ||
214 | // Build the set of scripts containing confusable characters for | |
215 | // the code point(s) specified in this input line. | |
216 | // Sanity check that the script of the source code point is the same | |
217 | // as the source script indicated in the input file. Failure of this check is | |
218 | // an error in the input file. | |
219 | // Include the source script in the set (needed for Mixed Script Confusable detection). | |
220 | // | |
221 | UChar32 cp; | |
222 | for (cp=startCodePoint; cp<=endCodePoint; cp++) { | |
223 | int32_t setIndex = utrie2_get32(table, cp); | |
224 | BuilderScriptSet *bsset = NULL; | |
225 | if (setIndex > 0) { | |
226 | U_ASSERT(setIndex < scriptSets->size()); | |
227 | bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex)); | |
228 | } else { | |
229 | bsset = new BuilderScriptSet(); | |
230 | if (bsset == NULL) { | |
231 | status = U_MEMORY_ALLOCATION_ERROR; | |
232 | goto cleanup; | |
233 | } | |
234 | bsset->codePoint = cp; | |
235 | bsset->trie = table; | |
236 | bsset->sset = new ScriptSet(); | |
237 | setIndex = scriptSets->size(); | |
238 | bsset->index = setIndex; | |
239 | bsset->rindex = 0; | |
240 | if (bsset->sset == NULL) { | |
241 | status = U_MEMORY_ALLOCATION_ERROR; | |
242 | goto cleanup; | |
243 | } | |
244 | scriptSets->addElement(bsset, status); | |
245 | utrie2_set32(table, cp, setIndex, &status); | |
246 | } | |
247 | bsset->sset->Union(targScript); | |
248 | bsset->sset->Union(srcScript); | |
249 | ||
250 | if (U_FAILURE(status)) { | |
251 | goto cleanup; | |
252 | } | |
253 | UScriptCode cpScript = uscript_getScript(cp, &status); | |
254 | if (cpScript != srcScript) { | |
255 | status = U_INVALID_FORMAT_ERROR; | |
256 | goto cleanup; | |
257 | } | |
258 | } | |
259 | } | |
260 | ||
261 | // Eliminate duplicate script sets. At this point we have a separate | |
262 | // script set for every code point that had data in the input file. | |
263 | // | |
264 | // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them | |
265 | // | |
266 | // printf("Number of scriptSets: %d\n", scriptSets->size()); | |
267 | { | |
268 | int32_t duplicateCount = 0; | |
269 | rtScriptSetsCount = 2; | |
270 | for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) { | |
271 | BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri)); | |
272 | if (outerSet->index != static_cast<uint32_t>(outeri)) { | |
273 | // This set was already identified as a duplicate. | |
274 | // It will not be allocated a position in the runtime array of ScriptSets. | |
275 | continue; | |
276 | } | |
277 | outerSet->rindex = rtScriptSetsCount++; | |
278 | for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) { | |
279 | BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri)); | |
280 | if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) { | |
281 | delete innerSet->sset; | |
282 | innerSet->scriptSetOwned = FALSE; | |
283 | innerSet->sset = outerSet->sset; | |
284 | innerSet->index = outeri; | |
285 | innerSet->rindex = outerSet->rindex; | |
286 | duplicateCount++; | |
287 | } | |
288 | // But this doesn't get all. We need to fix the TRIE. | |
289 | } | |
290 | } | |
291 | // printf("Number of distinct script sets: %d\n", rtScriptSetsCount); | |
292 | } | |
293 | ||
294 | ||
295 | ||
296 | // Update the Trie values to be reflect the run time script indexes (after duplicate merging). | |
297 | // (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets | |
298 | // are unused, which is why the loop index starts at 2.) | |
299 | { | |
300 | for (int32_t i=2; i<scriptSets->size(); i++) { | |
301 | BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); | |
302 | if (bSet->rindex != (uint32_t)i) { | |
303 | utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status); | |
304 | } | |
305 | } | |
306 | } | |
307 | ||
308 | // For code points with script==Common or script==Inherited, | |
309 | // Set the reserved value of 1 into both Tries. These characters do not participate | |
310 | // in Whole Script Confusable detection; this reserved value is the means | |
311 | // by which they are detected. | |
312 | { | |
313 | UnicodeSet ignoreSet; | |
314 | ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status); | |
315 | UnicodeSet inheritedSet; | |
316 | inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status); | |
317 | ignoreSet.addAll(inheritedSet); | |
318 | for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) { | |
319 | UChar32 rangeStart = ignoreSet.getRangeStart(rn); | |
320 | UChar32 rangeEnd = ignoreSet.getRangeEnd(rn); | |
321 | utrie2_setRange32(anyCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status); | |
322 | utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status); | |
323 | } | |
324 | } | |
325 | ||
326 | // Serialize the data to the Spoof Detector | |
327 | { | |
328 | utrie2_freeze(anyCaseTrie, UTRIE2_16_VALUE_BITS, &status); | |
329 | int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status); | |
330 | // printf("Any case Trie size: %d\n", size); | |
331 | if (status != U_BUFFER_OVERFLOW_ERROR) { | |
332 | goto cleanup; | |
333 | } | |
334 | status = U_ZERO_ERROR; | |
335 | spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit; | |
336 | spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size; | |
337 | spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie; | |
338 | void *where = spImpl->fSpoofData->reserveSpace(size, status); | |
339 | utrie2_serialize(anyCaseTrie, where, size, &status); | |
340 | ||
341 | utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status); | |
342 | size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status); | |
343 | // printf("Lower case Trie size: %d\n", size); | |
344 | if (status != U_BUFFER_OVERFLOW_ERROR) { | |
345 | goto cleanup; | |
346 | } | |
347 | status = U_ZERO_ERROR; | |
348 | spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit; | |
349 | spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size; | |
350 | spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie; | |
351 | where = spImpl->fSpoofData->reserveSpace(size, status); | |
352 | utrie2_serialize(lowerCaseTrie, where, size, &status); | |
353 | ||
354 | spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit; | |
355 | spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount; | |
356 | ScriptSet *rtScriptSets = static_cast<ScriptSet *> | |
357 | (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status)); | |
358 | uint32_t rindex = 2; | |
359 | for (int32_t i=2; i<scriptSets->size(); i++) { | |
360 | BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); | |
361 | if (bSet->rindex < rindex) { | |
362 | // We have already copied this script set to the serialized data. | |
363 | continue; | |
364 | } | |
365 | U_ASSERT(rindex == bSet->rindex); | |
366 | rtScriptSets[rindex] = *bSet->sset; // Assignment of a ScriptSet just copies the bits. | |
367 | rindex++; | |
368 | } | |
369 | } | |
370 | ||
371 | // Open new utrie2s from the serialized data. We don't want to keep the ones | |
372 | // we just built because we would then have two copies of the data, one internal to | |
373 | // the utries that we have already constructed, and one in the serialized data area. | |
374 | // An alternative would be to not pre-serialize the Trie data, but that makes the | |
375 | // spoof detector data different, depending on how the detector was constructed. | |
376 | // It's simpler to keep the data always the same. | |
377 | ||
378 | spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized( | |
379 | UTRIE2_16_VALUE_BITS, | |
380 | (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie, | |
381 | spImpl->fSpoofData->fRawData->fAnyCaseTrieLength, | |
382 | NULL, | |
383 | &status); | |
384 | ||
385 | spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized( | |
386 | UTRIE2_16_VALUE_BITS, | |
387 | (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie, | |
388 | spImpl->fSpoofData->fRawData->fAnyCaseTrieLength, | |
389 | NULL, | |
390 | &status); | |
391 | ||
392 | ||
393 | ||
394 | cleanup: | |
395 | if (U_FAILURE(status)) { | |
396 | pe->line = lineNum; | |
397 | } | |
398 | uregex_close(parseRegexp); | |
399 | uprv_free(input); | |
400 | ||
401 | int32_t i; | |
4388f060 A |
402 | if (scriptSets != NULL) { |
403 | for (i=0; i<scriptSets->size(); i++) { | |
404 | BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); | |
405 | delete bsset; | |
406 | } | |
407 | delete scriptSets; | |
729e4ab9 | 408 | } |
729e4ab9 A |
409 | utrie2_close(anyCaseTrie); |
410 | utrie2_close(lowerCaseTrie); | |
411 | return; | |
412 | } | |
413 | ||
4388f060 | 414 | U_NAMESPACE_END |
729e4ab9 A |
415 | |
416 | ||
417 | ||
418 | BuilderScriptSet::BuilderScriptSet() { | |
419 | codePoint = -1; | |
420 | trie = NULL; | |
421 | sset = NULL; | |
422 | index = 0; | |
423 | rindex = 0; | |
424 | scriptSetOwned = TRUE; | |
425 | } | |
426 | ||
427 | BuilderScriptSet::~BuilderScriptSet() { | |
428 | if (scriptSetOwned) { | |
429 | delete sset; | |
430 | } | |
431 | } | |
432 | ||
433 | #endif | |
434 | #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS | |
435 |