]>
Commit | Line | Data |
---|---|---|
b75a7d8f A |
1 | /* |
2 | ******************************************************************************* | |
3 | * | |
4 | * Copyright (C) 2003, International Business Machines | |
5 | * Corporation and others. All Rights Reserved. | |
6 | * | |
7 | ******************************************************************************* | |
8 | * file name: strprep.cpp | |
9 | * encoding: US-ASCII | |
10 | * tab size: 8 (not used) | |
11 | * indentation:4 | |
12 | * | |
13 | * created on: 2003feb1 | |
14 | * created by: Ram Viswanadha | |
15 | */ | |
16 | ||
17 | #include "unicode/utypes.h" | |
18 | ||
19 | #if !UCONFIG_NO_IDNA | |
20 | ||
21 | #include "strprep.h" | |
22 | #include "utrie.h" | |
23 | #include "umutex.h" | |
24 | #include "cmemory.h" | |
25 | #include "sprpimpl.h" | |
26 | #include "nameprep.h" | |
27 | #include "ustr_imp.h" | |
28 | #include "unicode/unorm.h" | |
29 | #include "unicode/udata.h" | |
30 | #include "unicode/ustring.h" | |
31 | ||
32 | static const uint16_t* mappingData = NULL; | |
33 | static int32_t indexes[_IDNA_INDEX_TOP]={ 0 }; | |
34 | static UBool _isDataLoaded = FALSE; | |
35 | static UTrie idnTrie={ 0,0,0,0,0,0,0 }; | |
36 | static UDataMemory* idnData=NULL; | |
37 | static UErrorCode dataErrorCode =U_ZERO_ERROR; | |
38 | /* file definitions */ | |
39 | static const char DATA_NAME[] = "uidna"; | |
40 | static const char DATA_TYPE[] = "icu"; | |
41 | ||
42 | U_CFUNC UBool | |
43 | ustrprep_cleanup() { | |
44 | if(idnData!=NULL) { | |
45 | udata_close(idnData); | |
46 | idnData=NULL; | |
47 | } | |
48 | dataErrorCode=U_ZERO_ERROR; | |
49 | _isDataLoaded=FALSE; | |
50 | ||
51 | return TRUE; | |
52 | } | |
53 | ||
54 | U_CDECL_BEGIN | |
55 | static UBool U_CALLCONV | |
56 | isAcceptable(void * /* context */, | |
57 | const char * /* type */, | |
58 | const char * /* name */, | |
59 | const UDataInfo *pInfo) { | |
60 | if( | |
61 | pInfo->size>=20 && | |
62 | pInfo->isBigEndian==U_IS_BIG_ENDIAN && | |
63 | pInfo->charsetFamily==U_CHARSET_FAMILY && | |
64 | pInfo->dataFormat[0]==0x49 && /* dataFormat="IDNA" 0x49, 0x44, 0x4e, 0x41 */ | |
65 | pInfo->dataFormat[1]==0x44 && | |
66 | pInfo->dataFormat[2]==0x4e && | |
67 | pInfo->dataFormat[3]==0x41 && | |
68 | pInfo->formatVersion[0]==2 && | |
69 | pInfo->formatVersion[2]==UTRIE_SHIFT && | |
70 | pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT | |
71 | ) { | |
72 | return TRUE; | |
73 | } else { | |
74 | return FALSE; | |
75 | } | |
76 | } | |
77 | ||
78 | ||
79 | ||
80 | static int32_t U_CALLCONV | |
81 | getFoldingOffset(uint32_t data) { | |
82 | if(data&0x8000) { | |
83 | return (int32_t)(data&0x7fff); | |
84 | } else { | |
85 | return 0; | |
86 | } | |
87 | } | |
88 | ||
89 | U_CDECL_END | |
90 | ||
91 | static UBool U_CALLCONV | |
92 | loadData(UErrorCode &errorCode) { | |
93 | /* load Unicode IDNA data from file */ | |
94 | UBool isCached; | |
95 | ||
96 | /* do this because double-checked locking is broken */ | |
97 | umtx_lock(NULL); | |
98 | isCached=_isDataLoaded; | |
99 | umtx_unlock(NULL); | |
100 | ||
101 | if(!isCached) { | |
102 | UTrie _idnTrie={ 0,0,0,0,0,0,0 }; | |
103 | UDataMemory *data; | |
104 | const int32_t *p=NULL; | |
105 | const uint8_t *pb; | |
106 | ||
107 | if(&errorCode==NULL || U_FAILURE(errorCode)) { | |
108 | return 0; | |
109 | } | |
110 | ||
111 | /* open the data outside the mutex block */ | |
112 | //TODO: change the path | |
113 | data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &errorCode); | |
114 | dataErrorCode=errorCode; | |
115 | if(U_FAILURE(errorCode)) { | |
116 | return _isDataLoaded=FALSE; | |
117 | } | |
118 | ||
119 | p=(const int32_t *)udata_getMemory(data); | |
120 | pb=(const uint8_t *)(p+_IDNA_INDEX_TOP); | |
121 | utrie_unserialize(&_idnTrie, pb, p[_IDNA_INDEX_TRIE_SIZE], &errorCode); | |
122 | _idnTrie.getFoldingOffset=getFoldingOffset; | |
123 | ||
124 | ||
125 | if(U_FAILURE(errorCode)) { | |
126 | dataErrorCode=errorCode; | |
127 | udata_close(data); | |
128 | return _isDataLoaded=FALSE; | |
129 | } | |
130 | ||
131 | /* in the mutex block, set the data for this process */ | |
132 | umtx_lock(NULL); | |
133 | if(idnData==NULL) { | |
134 | idnData=data; | |
135 | data=NULL; | |
136 | uprv_memcpy(&indexes, p, sizeof(indexes)); | |
137 | uprv_memcpy(&idnTrie, &_idnTrie, sizeof(UTrie)); | |
138 | } else { | |
139 | p=(const int32_t *)udata_getMemory(idnData); | |
140 | } | |
141 | umtx_unlock(NULL); | |
142 | /* initialize some variables */ | |
143 | mappingData=(uint16_t *)((uint8_t *)(p+_IDNA_INDEX_TOP)+indexes[_IDNA_INDEX_TRIE_SIZE]); | |
144 | ||
145 | _isDataLoaded = TRUE; | |
146 | ||
147 | /* if a different thread set it first, then close the extra data */ | |
148 | if(data!=NULL) { | |
149 | udata_close(data); /* NULL if it was set correctly */ | |
150 | } | |
151 | } | |
152 | ||
153 | return _isDataLoaded; | |
154 | } | |
155 | ||
156 | // ***************************************************************************** | |
157 | // class StringPrep | |
158 | // ***************************************************************************** | |
159 | ||
160 | U_NAMESPACE_BEGIN | |
161 | ||
162 | const char StringPrep::fgClassID=0; | |
163 | ||
164 | UBool StringPrep::isDataLoaded(UErrorCode& status){ | |
165 | if(U_FAILURE(status)){ | |
166 | return FALSE; | |
167 | } | |
168 | if(_isDataLoaded==FALSE && U_FAILURE(dataErrorCode)){ | |
169 | status = dataErrorCode; | |
170 | return FALSE; | |
171 | } | |
172 | loadData(dataErrorCode); | |
173 | if(U_FAILURE(dataErrorCode)){ | |
174 | status = dataErrorCode; | |
175 | return FALSE; | |
176 | } | |
177 | return TRUE; | |
178 | } | |
179 | ||
180 | ||
181 | StringPrep* StringPrep::createDefaultInstance(UErrorCode& status){ | |
182 | StringPrep* strprep = new StringPrep(); | |
183 | if(!isDataLoaded(status)){ | |
184 | delete strprep; | |
185 | return NULL; | |
186 | } | |
187 | return strprep; | |
188 | } | |
189 | ||
190 | StringPrep* StringPrep::createNameprepInstance(UErrorCode& status){ | |
191 | StringPrep* strprep = new NamePrep(status); | |
192 | if(!isDataLoaded(status)){ | |
193 | delete strprep; | |
194 | return NULL; | |
195 | } | |
196 | return strprep; | |
197 | } | |
198 | ||
199 | UBool StringPrep::isNotProhibited(UChar32 /*ch*/){ | |
200 | return FALSE; | |
201 | } | |
202 | UBool StringPrep::isUnassigned(UChar32 ch){ | |
203 | ||
204 | uint32_t result; | |
205 | UTRIE_GET16(&idnTrie,ch,result); | |
206 | return (result == UIDNA_UNASSIGNED); | |
207 | ||
208 | } | |
209 | ||
210 | ||
/**
 * Splits a trie value into its three packed fields.
 *
 * @param result trie value to decode
 * @param flag   receives bits 0..2
 * @param length receives bits 3..4
 * @param index  receives everything above bit 4
 */
static inline void getValues(uint32_t result, int8_t& flag,
                             int8_t& length, int32_t& index){
    /* everything from bit 5 upwards is the index */
    index  = (int32_t)(result >> 5);
    /* the two bits below that are the length */
    length = (int8_t)((result >> 3) & 0x03);
    /* the lowest three bits are the flag */
    flag   = (int8_t)(result & 0x07);
}
220 | ||
221 | ||
222 | int32_t StringPrep::map(const UChar* src, int32_t srcLength, | |
223 | UChar* dest, int32_t destCapacity, | |
224 | UBool allowUnassigned, | |
225 | UParseError* parseError, | |
226 | UErrorCode& status ){ | |
227 | ||
228 | uint32_t result; | |
229 | int8_t flag; | |
230 | int8_t length; | |
231 | int32_t index; | |
232 | int32_t destIndex=0; | |
233 | int32_t srcIndex=0; | |
234 | ||
235 | // check error status | |
236 | if(U_FAILURE(status)){ | |
237 | return 0; | |
238 | } | |
239 | ||
240 | //check arguments | |
241 | if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) { | |
242 | status=U_ILLEGAL_ARGUMENT_ERROR; | |
243 | return 0; | |
244 | } | |
245 | if(srcLength == -1){ | |
246 | srcLength = u_strlen(src); | |
247 | } | |
248 | ||
249 | for(;srcIndex<srcLength;){ | |
250 | UChar32 ch; | |
251 | ||
252 | U16_NEXT(src,srcIndex,srcLength,ch); | |
253 | ||
254 | UTRIE_GET16(&idnTrie,ch,result); | |
255 | ||
256 | getValues(result,flag,length,index); | |
257 | ||
258 | // check if the source codepoint is unassigned | |
259 | if(flag == UIDNA_UNASSIGNED){ | |
260 | if(allowUnassigned == TRUE){ | |
261 | //copy the ch to destination | |
262 | if(ch <= 0xFFFF){ | |
263 | if(destIndex < destCapacity ){ | |
264 | dest[destIndex] = (UChar)ch; | |
265 | } | |
266 | destIndex++; | |
267 | }else{ | |
268 | if(destIndex+1 < destCapacity ){ | |
269 | dest[destIndex] = U16_LEAD(ch); | |
270 | dest[destIndex+1] = U16_TRAIL(ch); | |
271 | } | |
272 | destIndex +=2; | |
273 | } | |
274 | }else{ | |
275 | uprv_syntaxError(src,srcIndex-U16_LENGTH(ch), srcLength,parseError); | |
276 | status = U_IDNA_UNASSIGNED_CODEPOINT_FOUND_ERROR; | |
277 | return 0; | |
278 | } | |
279 | }else if((flag == UIDNA_MAP_NFKC && doNFKC == TRUE) || | |
280 | (index == _IDNA_MAP_TO_NOTHING && doNFKC == FALSE)){ | |
281 | ||
282 | if(length == _IDNA_LENGTH_IN_MAPPING_TABLE){ | |
283 | length = (int8_t) mappingData[index++]; | |
284 | } | |
285 | ||
286 | for(int8_t i =0; i< length; i++){ | |
287 | if(destIndex < destCapacity ){ | |
288 | dest[destIndex] = mappingData[index+i]; | |
289 | } | |
290 | destIndex++; /* for pre-flighting */ | |
291 | } | |
292 | }else{ | |
293 | //copy the source into destination | |
294 | if(ch <= 0xFFFF){ | |
295 | if(destIndex < destCapacity ){ | |
296 | dest[destIndex] = (UChar)ch; | |
297 | } | |
298 | destIndex++; | |
299 | }else{ | |
300 | if(destIndex+1 < destCapacity ){ | |
301 | dest[destIndex] = U16_LEAD(ch); | |
302 | dest[destIndex+1] = U16_TRAIL(ch); | |
303 | } | |
304 | destIndex +=2; | |
305 | } | |
306 | } | |
307 | } | |
308 | ||
309 | return u_terminateUChars(dest, destCapacity, destIndex, &status); | |
310 | } | |
311 | ||
312 | ||
313 | int32_t StringPrep::normalize( const UChar* src, int32_t srcLength, | |
314 | UChar* dest, int32_t destCapacity, | |
315 | UErrorCode& status ){ | |
316 | ||
317 | return unorm_normalize(src,srcLength,UNORM_NFKC,UNORM_UNICODE_3_2,dest,destCapacity,&status); | |
318 | } | |
319 | ||
320 | ||
321 | /* | |
322 | 1) Map -- For each character in the input, check if it has a mapping | |
323 | and, if so, replace it with its mapping. | |
324 | ||
325 | 2) Normalize -- Possibly normalize the result of step 1 using Unicode | |
326 | normalization. | |
327 | ||
328 | 3) Prohibit -- Check for any characters that are not allowed in the | |
329 | output. If any are found, return an error. | |
330 | ||
331 | 4) Check bidi -- Possibly check for right-to-left characters, and if | |
332 | any are found, make sure that the whole string satisfies the | |
333 | requirements for bidirectional strings. If the string does not | |
334 | satisfy the requirements for bidirectional strings, return an | |
335 | error. | |
336 | [Unicode3.2] defines several bidirectional categories; each character | |
337 | has one bidirectional category assigned to it. For the purposes of | |
338 | the requirements below, an "RandALCat character" is a character that | |
339 | has Unicode bidirectional categories "R" or "AL"; an "LCat character" | |
340 | is a character that has Unicode bidirectional category "L". Note | |
341 | ||
342 | ||
343 | that there are many characters which fall in neither of the above | |
344 | definitions; Latin digits (<U+0030> through <U+0039>) are examples of | |
345 | this because they have bidirectional category "EN". | |
346 | ||
347 | In any profile that specifies bidirectional character handling, all | |
348 | three of the following requirements MUST be met: | |
349 | ||
350 | 1) The characters in section 5.8 MUST be prohibited. | |
351 | ||
352 | 2) If a string contains any RandALCat character, the string MUST NOT | |
353 | contain any LCat character. | |
354 | ||
355 | 3) If a string contains any RandALCat character, a RandALCat | |
356 | character MUST be the first character of the string, and a | |
357 | RandALCat character MUST be the last character of the string. | |
358 | */ | |
359 | ||
360 | #define MAX_STACK_BUFFER_SIZE 300 | |
361 | ||
362 | int32_t StringPrep::process(const UChar* src, int32_t srcLength, | |
363 | UChar* dest, int32_t destCapacity, | |
364 | UBool allowUnassigned, | |
365 | UParseError* parseError, | |
366 | UErrorCode& status ){ | |
367 | // check error status | |
368 | if(U_FAILURE(status)){ | |
369 | return 0; | |
370 | } | |
371 | ||
372 | //check arguments | |
373 | if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) { | |
374 | status=U_ILLEGAL_ARGUMENT_ERROR; | |
375 | return 0; | |
376 | } | |
377 | ||
378 | UChar b1Stack[MAX_STACK_BUFFER_SIZE], b2Stack[MAX_STACK_BUFFER_SIZE]; | |
379 | UChar *b1 = b1Stack, *b2 = b2Stack; | |
380 | int32_t b1Len, b2Len=0, | |
381 | b1Capacity = MAX_STACK_BUFFER_SIZE , | |
382 | b2Capacity = MAX_STACK_BUFFER_SIZE; | |
383 | uint32_t result; | |
384 | int32_t b2Index = 0; | |
385 | int8_t flag; | |
386 | int8_t length; | |
387 | int32_t index; | |
388 | UCharDirection direction=U_CHAR_DIRECTION_COUNT, firstCharDir=U_CHAR_DIRECTION_COUNT; | |
389 | UBool leftToRight=FALSE, rightToLeft=FALSE; | |
390 | int32_t rtlPos =-1, ltrPos =-1; | |
391 | ||
392 | b1Len = map(src,srcLength, b1, b1Capacity,allowUnassigned, parseError, status); | |
393 | ||
394 | if(status == U_BUFFER_OVERFLOW_ERROR){ | |
395 | // redo processing of string | |
396 | /* we do not have enough room so grow the buffer*/ | |
397 | b1 = (UChar*) uprv_malloc(b1Len * U_SIZEOF_UCHAR); | |
398 | if(b1==NULL){ | |
399 | status = U_MEMORY_ALLOCATION_ERROR; | |
400 | goto CLEANUP; | |
401 | } | |
402 | ||
403 | status = U_ZERO_ERROR; // reset error | |
404 | ||
405 | b1Len = map(src,srcLength, b1, b1Len,allowUnassigned, parseError, status); | |
406 | ||
407 | } | |
408 | ||
409 | b2Len = normalize(b1,b1Len, b2,b2Capacity,status); | |
410 | ||
411 | if(status == U_BUFFER_OVERFLOW_ERROR){ | |
412 | // redo processing of string | |
413 | /* we do not have enough room so grow the buffer*/ | |
414 | b2 = (UChar*) uprv_malloc(b2Len * U_SIZEOF_UCHAR); | |
415 | if(b2==NULL){ | |
416 | status = U_MEMORY_ALLOCATION_ERROR; | |
417 | goto CLEANUP; | |
418 | } | |
419 | ||
420 | status = U_ZERO_ERROR; // reset error | |
421 | ||
422 | b2Len = normalize(b2,b2Len, b2,b2Len,status); | |
423 | ||
424 | } | |
425 | ||
426 | if(U_FAILURE(status)){ | |
427 | goto CLEANUP; | |
428 | } | |
429 | ||
430 | UChar32 ch; | |
431 | ||
432 | for(; b2Index<b2Len;){ | |
433 | ||
434 | ch = 0; | |
435 | ||
436 | U16_NEXT(b2, b2Index, b2Len, ch); | |
437 | ||
438 | UTRIE_GET16(&idnTrie,ch,result); | |
439 | ||
440 | getValues(result,flag,length,index); | |
441 | ||
442 | if(flag == UIDNA_PROHIBITED | |
443 | && isNotProhibited(ch) == FALSE){ | |
444 | status = U_IDNA_PROHIBITED_CODEPOINT_FOUND_ERROR; | |
445 | uprv_syntaxError(b1, b2Index-U16_LENGTH(ch), b2Len, parseError); | |
446 | goto CLEANUP; | |
447 | } | |
448 | ||
449 | direction = u_charDirection(ch); | |
450 | if(firstCharDir == U_CHAR_DIRECTION_COUNT){ | |
451 | firstCharDir = direction; | |
452 | } | |
453 | if(direction == U_LEFT_TO_RIGHT){ | |
454 | leftToRight = TRUE; | |
455 | ltrPos = b2Index-1; | |
456 | } | |
457 | if(direction == U_RIGHT_TO_LEFT || direction == U_RIGHT_TO_LEFT_ARABIC){ | |
458 | rightToLeft = TRUE; | |
459 | rtlPos = b2Index-1; | |
460 | } | |
461 | } | |
462 | ||
463 | // satisfy 2 | |
464 | if( leftToRight == TRUE && rightToLeft == TRUE){ | |
465 | status = U_IDNA_CHECK_BIDI_ERROR; | |
466 | uprv_syntaxError(b2,(rtlPos>ltrPos) ? rtlPos : ltrPos, b2Len, parseError); | |
467 | goto CLEANUP; | |
468 | } | |
469 | ||
470 | //satisfy 3 | |
471 | if( rightToLeft == TRUE && | |
472 | !((firstCharDir == U_RIGHT_TO_LEFT || firstCharDir == U_RIGHT_TO_LEFT_ARABIC) && | |
473 | (direction == U_RIGHT_TO_LEFT || direction == U_RIGHT_TO_LEFT_ARABIC)) | |
474 | ){ | |
475 | status = U_IDNA_CHECK_BIDI_ERROR; | |
476 | uprv_syntaxError(b2, rtlPos, b2Len, parseError); | |
477 | return FALSE; | |
478 | } | |
479 | ||
480 | if(b2Len <= destCapacity){ | |
481 | uprv_memmove(dest,b2, b2Len*U_SIZEOF_UCHAR); | |
482 | } | |
483 | ||
484 | CLEANUP: | |
485 | if(b1!=b1Stack){ | |
486 | uprv_free(b1); | |
487 | } | |
488 | if(b2!=b2Stack){ | |
489 | uprv_free(b2); | |
490 | } | |
491 | return u_terminateUChars(dest, destCapacity, b2Len, &status); | |
492 | } | |
493 | ||
494 | ||
495 | UBool StringPrep::isLabelSeparator(UChar32 ch, UErrorCode& status){ | |
496 | // check error status | |
497 | if(U_FAILURE(status)){ | |
498 | return FALSE; | |
499 | } | |
500 | ||
501 | if(isDataLoaded(status)){ | |
502 | int32_t result; | |
503 | UTRIE_GET16(&idnTrie,ch, result); | |
504 | if( (result & 0x07) == UIDNA_LABEL_SEPARATOR){ | |
505 | return TRUE; | |
506 | } | |
507 | } | |
508 | return FALSE; | |
509 | } | |
510 | ||
511 | U_NAMESPACE_END | |
512 | ||
513 | #endif /* #if !UCONFIG_NO_IDNA */ |